The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/local/bin/perl 
#!/usr/bin/env   perl 
#!/bin/sh

######################################################################
# unichars - list characters for one or more properties
#
# Tom Christiansen <tchrist@perl.com>
#  v1.0: Fri Oct 22 23:05:16 MDT 2010
#  v1.2: Tue Oct 26 08:28:25 MDT 2010
#   better 5.10 support and simpler evals
#
################################################################
#
#   This is an sh wrapper to run the script under
#   whichever perl occurs first in your path.  See
#   CHOICEs 1 and 2 below for alternate strategies.
#   The -x will throw off your line numbers otherwise.
#
######################################################################
#
#   The next line is legal in both shell and perl,
#   but perl sees the if 0 so doesn't execute it.
#

eval 'exec perl -x -S $0 ${1+"$@"}'
      if 0;

### CHOICE 1:
######################################################################
### MAKE FOLLOWING #! line THE TOP LINE, REPLACING /usr/local/bin  ###
###   with wherever you have a late enough version of Perl is      ###
###   installed.  Will run under 5.10, but prefers 5.12 or better. ###
######################################################################
#!/usr/local/bin/perl 
# ^^^^^^^^^^^^^^  <=== CHANGE ME                                   ###
######################################################################

### CHOICE 2:
######################################################################
### ALTERNATELY, the following #! line does the same thing as      ###
###   the tricksy sh eval exec line:  it finds whichever Perl is   ###
###   first in your path.  However, it works only on BSD systems   ###
###   (including MacOS), but breaks under Solaris and Linux.       ###
######################################################################
#!/usr/bin/env perl -CLA
######################################################################

use strict;
use warnings;     # qw[ FATAL all ];
use charnames qw[ :full :short latin greek ];

use 5.10.1;

use File::Basename  qw[ basename    ];
use Getopt::Long    qw[ GetOptions  ];
use File::Spec;
use Carp;

use Pod::Usage      qw[ pod2usage   ];
use Encode          qw[ decode      ];

use Unicode::UCD qw(charinfo casefold);

use if $^V >= v5.11.3, qw[ feature unicode_strings ];

# don't need to import this
sub utf::is_utf8($);

################################################################

sub ARGCOUNT;
sub CF();
sub IT();
sub NAME();
sub NOT_REACHED;
sub NUM();

sub am_running_perldb;
sub check_options();
sub compile_filter();
sub deQ($);
sub deQQ($);
sub debug($);
sub dequeue($$);
sub display;
sub fork_pager;
sub genfuncs;
sub is_runnable;
sub locate_program;
sub main();
sub panic;
sub run_filter();
sub start_pager;
sub stupid_evil_and_wrong;
sub titlecase;
sub underscore;

################################################################

our $VERSION = "1.4 (2011-04-11)";

$| = 1;             # command buffering quick-feeds piped stdout
$0 = basename($0);  # shorten up warnings/errors

our %Opt;
our $CF;
our $CI;
our $Shown_Count = 0;

main();
exit;

################################################################

sub   IT() { $_ }
sub NAME() { charnames::viacode(ord $_) || "" }

sub genfuncs  { 

    for my $nf ( qw< NFD NFC NFKD NFKC FCD FCC > ) {
	no strict "refs";
	*$nf = sub(_) {
	    require Unicode::Normalize;
	    "Unicode::Normalize::$nf"->($_);
	};
    } 

    for my $check ( qw< checkNFD checkNFC checkNFKD checkNFKC checkFCD checkFCC > ) {
	no strict "refs";
	*$check = sub(_) {
	    require Unicode::Normalize;
	    my $stat = "Unicode::Normalize::$check"->($_);
	    if (defined $stat) {
		return $stat || "0 but true";
	    } else {
		# trick to quiet zero-conversion under -w
		return 0 == 1;
	    }
	}
    } 

    for my $nf ( qw< Singleton Exclusion NonStDecomp Comp_Ex 
		     NFD_NO  NFC_NO  NFC_MAYBE 
		    NFKD_NO NFKC_NO NFKC_MAYBE > 
		) 
    {
	no strict "refs";
	*$nf = sub() {
	    require Unicode::Normalize;
	    "Unicode::Normalize::is$nf"->(ord);
	};
    } 


    for my $nl ( 1 .. 4 ) {
	no strict "refs";
	*{ "UCA$nl" } = sub(_) {
	    require Unicode::Collate;
	    my $class = Unicode::Collate:: ;
	    my @args = (level => $nl, variable => "Non-Ignorable");
	    if ($Opt{locale}) {
		require Unicode::Collate::Locale;
		$class = Unicode::Collate::Locale:: ;
		push @args, locale => $Opt{locale};
	    } 
	    state $coll = $class->new(@args);
	    return $coll->getSortKey($_[0]);
	};
    } 

    no warnings "once";
    *UCA = \&UCA1;

}

sub CF() {
    $CF = casefold(ord);
    return ($CF && $CF->{status}) || "";
} 

sub NUM() {
    require Unicode::UCD;
    Unicode::UCD->VERSION(0.32);
    my $n = Unicode::UCD::num($_);
    if (defined $n) {
	return $n || "0 but true";
    } else {
	# trick to quiet zero-conversion under -w
	return 0 == 1;
    } 
} 

################################################################

sub main() {

    for my $fh ( qw[STDOUT STDERR] ) {
        binmode($fh, ":utf8")
            || die "can't binmode($fh) to :utf8 encoding: $!";
    }

    check_options();
    genfuncs();
    compile_filter();

    $SIG{PIPE} = sub {exit 0};

    run_filter();

    if ($Opt{verbose}) {
	print STDERR "$0: $Shown_Count code points matched.\n";
    } 

    close(STDOUT) || warn "$0: close stdout failed: $!\n";

    if ($Shown_Count) {
	exit 0;
    } else {
	exit 1;
    } 
}

################################################################

sub debug($) {
    return unless $Opt{debug};
    my $msg = shift();
    print STDERR "$msg\n";
}

sub check_options() {

    Getopt::Long::Configure qw[ bundling auto_version ];

    if (@ARGV == 0) {
	@ARGV = qw{
		    --all
		    --category
		    --script
		};
    } 

    GetOptions(\%Opt, qw[

        help|h|?
        man|m
        debug|d

	unnamed|u
        bmp
        smp
        astral|all|a

	casefold|f

	decimal|d

	category|general|c|g
	combining|C
	script|s
	block|b
	bidi|B
	numeric|n

	locale|l=s

	nopager
	verbose

    ]) || pod2usage(2);

    pod2usage(0)                                if $Opt{help};
    pod2usage(-exitstatus => 0, -verbose => 2)  if $Opt{man};

    @ARGV = (1) unless @ARGV;

    #$Opt{smp}++;
    #$Opt{bmp}++;

    pod2usage("$0: missing arguments")          if @ARGV == 0;

    if (grep /\P{ASCII}/ => @ARGV) {
        @ARGV = map { decode("UTF-8", $_) } @ARGV;
    }

}

sub compile_filter() {

    my @criteria;

    for my $i ( 0 .. $#ARGV ) {
        my $snippet = $ARGV[$i];
        $snippet =~ s/^\s+//;

        # args starting with a backslash or which are a bracketed
        # espression are interpreted as pattern matches
        if ($snippet =~ m{ ^ \\ | ^ \[ .* \] $ }x) {
            $snippet = "/$snippet/";
        }

        my $test_compile  = deQ <<'START_TEST';
            |Q|    use warnings qw[FATAL all];
            |Q|    my $ignore  =
START_TEST
           $test_compile .= deQQ(<<"END_TEST");
            |QQ|                  sub { $snippet };
            |QQ|
            |QQ|    # so eval returns true
            |QQ|    1;
            |QQ|
END_TEST

        # debug("test compile:\n$test_compile");

        eval($test_compile) ||
            die "$0: invalid criterion in '$snippet': $@\n";

        $criteria[$i] = "do { $snippet }";
    }

    my $real_code  = deQ(<<'START_CODE') . "\t";
            |Q|    use warnings;
            |Q|    #use warnings qw[FATAL all];
            |Q|    #no warnings qw[deprecated];
            |Q|
            |Q|    sub filter {
            |Q|
            |Q|	        debug(sprintf("testing code point %X", ord()));
            |Q|
            |Q|		my $result = 
            |Q|
START_CODE

       $real_code .= join("\n     &&\n\t" => @criteria)
                  .  deQ(<<'END_CODE');
            |Q|
            |Q|	    ;
            |Q|
            |Q|	        debug("result of " . join(" && ",@criteria) . " is $result");
            |Q|	        return $result;
            |Q|    }
            |Q|
            |Q|    # so eval returns true
            |Q|    1;
END_CODE

    debug("CRITERIA are\n$real_code");
    eval($real_code) || die;
}

sub run_filter() {

    my $first_codepoint = 0x00_0000;
    my $last_codepoint  = 0x10_FFFF;

    unless ($Opt{astral} || $Opt{smp}) {
        $last_codepoint = 0x00_FFFF;
    }

    if ($Opt{bmp}) {
        $first_codepoint = 0x00_0000;
        $last_codepoint  = 0x00_FFFF;
    }

    if ($Opt{smp}) {
        $first_codepoint = 0x01_0000 unless $Opt{bmp};
        $last_codepoint  = 0x01_FFFF;
    }

    if ($Opt{astral}) {
        $last_codepoint  = 0x10_FFFF;
    }

    my $hex_width = length(sprintf("%x", $last_codepoint));
    my $dec_width = length(sprintf("%d", $last_codepoint));

    --$hex_width if $last_codepoint == 0x10_FFFF;

    debug(sprintf("checking codepoints %0${hex_width}X .. %0${hex_width}X",
                    $first_codepoint, $last_codepoint));

CODEPOINT:
    for my $codepoint ( $first_codepoint .. $last_codepoint ) {

        # gaggy UTF-16 surrogates are invalid UTF-8 code points
        next if $codepoint >= 0xD800 && $codepoint <= 0xDFFF;

        # from utf8.c in perl src; must avoid fatals in 5.10
        next if $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF;

        next if 0xFFFE == ($codepoint & 0xFFFE); # both FFFE and FFFF

        # debug("testing codepoint $codepoint");

        # see "Unicode non-character %s is illegal for interchange" in perldiag(1)
        $_ = do { no warnings "utf8"; chr($codepoint) };

        # fixes "the Unicode bug"
        unless (utf8::is_utf8($_)) {
            $_ = decode("iso-8859-1", $_);
        }

	unless ($Opt{unnamed}) {
	    # won't find string names for any of these, so don't bother printing
	    next if m{ \p{Unassigned}           }x;
	    next if m{ \p{PrivateUse}           }x;
	    next if m{ \p{Han}                  }x;
	    next if m{ \p{InHangulSyllables}    }x;
	}

        next unless &filter;

	$Shown_Count++;

	$CI = charinfo(ord);

        if (/[\pC\pZ]/) {
            display " ---- ";
        } else {
	    display "\N{LEFT-TO-RIGHT OVERRIDE}" ;# if /[\p{BC=R}\p{BC=AL}\p{BC=AN}\p{BC=ON}]/;
	    # display " " if /[\p{BC=R}\p{BC=AL}\p{BC=AN}]/;
	    display " ";
	    display "\N{DOTTED CIRCLE}" if /\p{BC=NSM}/;
            # display " \N{LEFT-TO-RIGHT MARK}$_\N{LEFT-TO-RIGHT MARK} ";
            display "$_ ";
	    # display " " unless /[\p{BC=R}\p{BC=AL}\p{BC=AN}]/;
	    display " " unless /[\p{EA=F}\p{EA=W}]/;
        }


        display sprintf "%${dec_width}d %0${hex_width}X ", ($codepoint) x 2
	    if $Opt{decimal};

        display sprintf "U+%0${hex_width}X ", $codepoint;

	if ($Opt{category}) {
	    display sprintf("GC=%2s ", $CI->{category});
	} 

	if ($Opt{casefold}) {
	    display sprintf("CF=%1s ", CF());
	} 

	if ($Opt{bidi}) {
	    display sprintf("BC=%-3s ", $CI->{bidi});
	} 

	if ($Opt{numeric}) {
	    display sprintf("%6s=NV  ", $CI->{numeric});
	} 

	if ($Opt{block}) {
	    display sprintf("BLK=%-22s ", underscore($CI->{block}));
	} 

	if ($Opt{script}) {
	    display sprintf("SC=%-12s ", titlecase($CI->{script}));
	} 

	if ($Opt{combining}) {
	    display sprintf("CC=%-3s ", $CI->{combining});
	} 

        display sprintf "%s\n", charnames::viacode($codepoint) || "<unnamed codepoint>";
    }

}

sub underscore {
    local $_ = shift();
    y/ /_/;
    return $_;
} 

sub titlecase {
    local $_ = shift();
    s/[-_]\K(\p{Ll})/\u$1/g;
    return $_;
} 


sub display {
    ARGCOUNT() unless @_ == 1;

    my $string = $_[0];

    state $begun_pager;
    start_pager() unless $begun_pager++;

    print $string;

}

sub am_running_perldb {
    no warnings "once";
    return keys(%DB::sub) > 0;
}

sub locate_program {
    ARGCOUNT() unless @_ == 1;

    my $program = $_[0];

    return unless defined $program
               && length  $program;

    if (File::Spec->file_name_is_absolute($program)) {
        return is_runnable($program);
    }

    my @path_dirs = File::Spec->path();

    for my $dir (@path_dirs) {
        my $pathname = File::Spec->catfile($dir, $program);
        my $runpath;
        return $runpath if $runpath = is_runnable($pathname);
    }

    return;
}


sub is_runnable {
    ARGCOUNT() unless @_ == 1;
    my $fullpath = $_[0];

    if (-x $fullpath && ! -d _) {
        return $fullpath;
    }
    elsif (stupid_evil_and_wrong()  &&  $fullpath !~ /\.exe\z/i) {
        return is_runnable("$fullpath.exe")
    }
    else {
        return ();
    }

    NOT_REACHED();
}

sub stupid_evil_and_wrong {
    return lc $^O ~~ [ qw<dos os2 netware symbian mswin32> ];
}

sub panic {
    confess "$0: INTERNAL ERROR: @_";
}

sub NOT_REACHED {
    panic("NOT REACHED");
}

sub ARGCOUNT {
    panic("wrong arguments to function");
}

sub dequeue($$) {
    my($leader, $body) = @_;
    $body =~ s/^\s*\Q$leader\E ?//gm;
    return $body;
}

sub deQ($) {
    my $text = $_[0];
    return dequeue q<|Q|>,  $text;
}

sub deQQ($) {
    my $text = $_[0];
    return dequeue qq<|QQ|>, $text;
}


sub start_pager {
    ARGCOUNT() unless @_ == 0;

    return if am_running_perldb();

    return if $Opt{nopager};

    return unless -t STDOUT;

    my $his_pager  =  locate_program($ENV{PAGER})
                   || locate_program("less")
                   || locate_program("more")
                   || locate_program("type")
                  ;

    return unless $his_pager;

    my $am_less = ($his_pager =~ /\bless\b/i);

    local $ENV{LESSCHARSET} = "utf-8" if $am_less;

    my @pager_args = ();
    push (@pager_args, "-r") if $am_less;
    open(STDOUT, "|- :utf8", $his_pager, @pager_args);
}

sub fork_pager {
    if (-t STDOUT) {

    } 
} 


################################################################
################################################################
################################################################

__END__

=encoding utf8

=head1 NAME

unichars - list characters for one or more properties

=head1 SYNOPSIS

B<unichars> [I<options>] I<criterion> ...

Each criterion is either a square-bracketed character class, a regex
starting with a backslash, or an arbitrary Perl expression.  See the
EXAMPLES section below.

OPTIONS:

 Selection Options:

    --bmp           include the Basic Multilingual Plane (plane 0) [DEFAULT]
    --smp           include the Supplementary Multilingual Plane (plane 1)
    --astral    -a  include planes above the BMP (planes 1-15)
    --unnamed   -u  include various unnamed characters (see DESCRIPTION)
    --locale    -l  specify the locale used for UCA functions

 Display Options:

    --category  -c  include the general category (GC=) 
    --script    -s  include the script name (SC=) 
    --block     -b  include the block name (BLK=) 
    --bidi      -B  include the bidi class (BC=) 
    --combining -C  include the canonical combining class (CCC=)
    --numeric   -n  include the numeric value (NV=) 
    --casefold  -f  include the casefold status
    --decimal   -d  include the decimal representation of the code point

 Miscellaneous Options:

    --version   -v  print version information and exit
    --help      -h  this message
    --man       -m  full manpage
    --debug     -d  show debugging of criteria and examined code point span

 Special Functions:

     $_    is the current code point
     ord   is the current code point's ordinal

     NAME is charname::viacode(ord)
     NUM is Unicode::UCD::num(ord), not code point number
     CF is casefold->{status}
     NFD, NFC, NFKD, NFKC, FCD, FCC  (normalization)
     UCA, UCA1, UCA2, UCA3, UCA4 (binary sort keys)

     Singleton, Exclusion, NonStDecomp, Comp_Ex 
     checkNFD, checkNFC, checkNFKD, checkNFKC, checkFCD, checkFCC 
     NFD_NO, NFC_NO, NFC_MAYBE, NFKD_NO, NFKC_NO, NFKC_MAYBE 

=head1 DESCRIPTION

The I<unichars> program reports which characters match all selection criteria
I<and>ed together.

A criterion beginning with a square bracket or a backslash is assumed to be
a regular expression.  Anything else is a Perl expression such as you might
pass to the Perl C<grep> function.  The C<$_> variable is set to each
successive Unicode character, and if all criteria match, that character is
displayed.

The numeric code point is therefore accessible as C<ord>.

The special token C<NAME> is set to the full name of the current code point.
Also, the tokens C<NFD>, C<NFKD>, C<NFC>, and C<NFKC> are set to the
corresponding normalization form.

By default only plane 0, the Basic Multilingual Plane, is examined.
For plane 1, the Supplementary Multilingual Plane, use B<--smp>.
To examine either, specify both B<--bmp> and B<--smp> options, or B<-bs>.
To include I<all> valid code points, use the B<-a> or B<--astral> option.

Unless the B<--unnamed> option is given, characters with any of the
properties Unassigned, PrivateUse, Han, or InHangulSyllables will be
excluded.

=head1 EXAMPLES

Could all non-ASCII digits:

     $ unichars -a '\d' '\P{ASCII}' | wc -l
     401

Find all line terminators:

    $ unichars '\R'
     --       10  0000A  LINE FEED (LF)
     --       11  0000B  LINE TABULATION
     --       12  0000C  FORM FEED (FF)
     --       13  0000D  CARRIAGE RETURN (CR)
     --      133  00085  NEXT LINE (NEL)
     --     8232  02028  LINE SEPARATOR
     --     8233  02029  PARAGRAPH SEPARATOR

Find what is not C<\s> but is C<[\h\v]>:

    $ unichars '\S' '[\h\v]'
     --       11  0000B  LINE TABULATION

Count how many code points in the Basic Multilingual Plane
are I<not> marks but I<are> diacritics:

    $ unichars '\PM' '\p{Diacritic}' | wc -l
         209

Count how many code points in the Basic Multilingual Plane
I<are> marks but are I<not> diacritics:

    $ unichars '\pM' '\P{Diacritic}' | wc -l
         750

Find all code points that are Letters, are in the Greek script,
have differing canonical and compatibility decompositions, and
whose name contains "SYMBOL":

    $ unichars -a '\pL' '\p{Greek}' 'NFD ne NFKD' 'NAME =~ /SYMBOL/'
     ϐ       976  003D0  GREEK BETA SYMBOL
     ϑ       977  003D1  GREEK THETA SYMBOL
     ϒ       978  003D2  GREEK UPSILON WITH HOOK SYMBOL
     ϓ       979  003D3  GREEK UPSILON WITH ACUTE AND HOOK SYMBOL
     ϔ       980  003D4  GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL
     ϕ       981  003D5  GREEK PHI SYMBOL
     ϖ       982  003D6  GREEK PI SYMBOL
     ϰ      1008  003F0  GREEK KAPPA SYMBOL
     ϱ      1009  003F1  GREEK RHO SYMBOL
     ϲ      1010  003F2  GREEK LUNATE SIGMA SYMBOL
     ϴ      1012  003F4  GREEK CAPITAL THETA SYMBOL
     ϵ      1013  003F5  GREEK LUNATE EPSILON SYMBOL
     Ϲ      1017  003F9  GREEK CAPITAL LUNATE SIGMA SYMBOL

Find all numeric nondigits in the Latin script (within the BMP):

    $ unichars '\pN' '\D' '\p{Latin}'
     Ⅰ      8544  02160  ROMAN NUMERAL ONE
     Ⅱ      8545  02161  ROMAN NUMERAL TWO
     Ⅲ      8546  02162  ROMAN NUMERAL THREE
     Ⅳ      8547  02163  ROMAN NUMERAL FOUR
     Ⅴ      8548  02164  ROMAN NUMERAL FIVE
     Ⅵ      8549  02165  ROMAN NUMERAL SIX
     Ⅶ      8550  02166  ROMAN NUMERAL SEVEN
     Ⅷ      8551  02167  ROMAN NUMERAL EIGHT
     (etc)

Find the first three alphanumunderish code points with no assigned name:

    $ unichars -au '\w' '!length NAME' | head -3
     㐀   13312 003400 <unnamed codepoint>
     㐁   13313 003401 <unnamed codepoint>
     㐂   13314 003402 <unnamed codepoint>

Count the combining characters in the Suuplemental Multilingual Plane:

    $ unichars -s '\pM' | wc -l
	  61

=head1 ENVIRONMENT

If your environment smells like it's in a Unicode encoding,
program arguments will be in UTF-8.

=head1 BUGS

The B<--man> option does not correctly process the page for UTF-8, because
it does not pass the necessary B<--utf8> option to L<pod2man>.

=head1 SEE ALSO

L<uniprops>,
L<uninames>,
L<perluniprops>,
L<perlunicode>,
L<perlrecharclass>,
L<perlre>

=head1 AUTHOR

Tom Christiansen <I<tchrist@perl.com>>

=head1 COPYRIGHT AND LICENCE

Copyright 2010 Tom Christiansen.

This program is free software; you may redistribute it and/or modify it
under the same terms as Perl itself.