The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/local/bin/perl
#
# ucsort - sort alphabetic text using the Unicode Collation algorithm
#
# Tom Christiansen <tchrist@perl.com>
#
# Date: Wed Feb 16 17:47:20 MST 2011
# v0.2: undocumented prototype
#
# Date: Tue Apr 19 12:50:37 MDT 2011
# v0.3: unsupported prototype
#
#################################################################

use 5.10.1;
use strict;
use autodie;

use warnings qw[ FATAL all ];

use utf8;
use open qw[ :std IO :utf8 ];

use Carp;
use Getopt::Long qw[ GetOptions ];
use Pod::Usage;
use Scalar::Util qw[ reftype ];
use Unicode::Collate;

#################################################################

sub main();

sub compile($);
sub dequeue($$);
sub deQ($);
sub deQQ($);
sub flip_fields(_);
sub flip_line(_);
sub get_input();
sub load_options();
sub reorder(@);
sub usage(;$);

#################################################################

##
## ** THIS IS AN UNSUPPORTED, PRE-RELEASE VERSION ONLY **
##
our $VERSION = "0.3";

our %Opt;

#################################################################

$SIG{__DIE__} = sub { confess @_ unless $^S };

main();
exit;

#################################################################

sub main() {
    load_options();

    my @input  = get_input();
    my @output = reorder(@input);

    if ($Opt{"reverse-output"}) {
	print for reverse @output;
    } else {
	print for         @output;
    }

}

#################################################################

sub get_input() {
    confess "expected list context" unless wantarray();

    if (!@ARGV && -t STDIN) {
	warn "$0: reading from stdin, type ^D to end, ^C to kill...\n";
    }

    if (my $incoding = $Opt{"input-encoding"}) {
	# need postfix :: to insist is classname not builtin function
	open::->import(IN => $incoding);
    }

    local $/ = "" if $Opt{paragraph};
    return <>;
}

#################################################################

sub load_options() {

    Getopt::Long::Configure qw[ bundling auto_version ];

    my @options = (

	# standard options
	qw[
	    help|?
	    man|m
	    debug|d
	],

	# collator constructor options
	qw[
	    backwards-levels=i
	    collation-level|level|l=i
	    katakana-before-hiragana
	    normalization|n=s
	    preprocess|P=s
	    override-CJK=s
	    override-Hangul=s
	    upper-before-lower|u
	    variable=s
	],

	# program specific options
	qw[
	    case-insensitive|insensitive|i
	    input-encoding|e=s
	    locale|L=s
	    paragraph|p
	    right-to-left|reverse-input
	    reverse-output|r
	    reverse-fields|last
	],

    );

    GetOptions(\%Opt => @options) || pod2usage(2);

    pod2usage(0)                                 if $Opt{help};
    pod2usage(-exitstatus => 0, -verbose => 2)   if $Opt{man};

    if ($Opt{"case-insensitive"}) {
	if ($Opt{"level"}) {
	    usage "you already specified a level of --level=$Opt{level}";
	}
	$Opt{"level"} = 1;
    }

	state $nf_types = {
		map { $_, 1 } qw[NFD NFC NFKD NFKC undef prenormalized]
		};

    if (my $nf = $Opt{"normalization"}) {
	unless (exists $nf_types->{$nf}) {
	    usage "unrecognized normalization of --normalization=$Opt{normalization}";
	}
    }

	state $var_types = {
		map { $_, 1 } qw[ blanked non-ignorable shifted shift-trimmed ]
		};

    if (my $var = $Opt{"variable"}) {
	unless ( exists $var_types->{lc $var}) {
	    usage "bogus value --variable=$var";
	}
    }

    if (my $levels = $Opt{"backwards-levels"} ) {
	unless ($levels =~ /^[1-5]+$/) {
	    usage "bogus value backwards-levels=$levels";
	}
    }

    if (my $code = $Opt{"preprocess"}) {
	$Opt{"preprocess"} = compile($code)
	    || usage("error in preprocess code $code: $@");
    }

    if (my $code = $Opt{"ignore-CJK"}) {
	$Opt{"ignore-CJK"} = compile($code)
	    || usage("error in ignore-CJK code $code: $@");
    }

    if (my $code = $Opt{"ignore-Hangul"}) {
	$Opt{"ignore-Hangul"} = compile($code)
	    || usage("error in ignore-Hangul code $code: $@");
    }

}

#################################################################

sub dequeue($$) {
    my($leader, $body) = @_;
    $body =~ s/^\s*\Q$leader\E ?//gm;
    return $body;
}

sub deQ($) {
    my $text = $_[0];
    return dequeue q<|Q|>,  $text;
}

sub deQQ($) {
    my $text = $_[0];
    return dequeue qq<|QQ|>, $text;
}

#################################################################

sub compile($) {
    my $CODE = shift();
    my $wrap = deQQ<<"END_OF_COMPILATION";
                |QQ|
                |QQ|    use warnings qw[FATAL all];
                |QQ|    no  warnings "utf8";
                |QQ|
                |QQ|    sub {
                |QQ|           my \$_ = shift;
                |QQ|           $CODE;
                |QQ|           return \$_;
                |QQ|    }
                |QQ|
END_OF_COMPILATION

    return eval $wrap;

}

#################################################################

sub usage(;$) {
    my $msg = @_ ? (" ". $_[0]) : "";
    print STDERR "$0: USAGE ERROR!", $msg, "\n";
    pod2usage(1);
    confess "NOT REACHED";
}

#################################################################

# this all done the expensive way with lots of copying
#
sub reorder(@) {
    @_		|| confess "usage error: need arguments";

    my @inputs = @_;

    my $lines_flipped = $Opt{"right-to-left"};
    if ($lines_flipped) {
	@inputs = map { flip_line } @inputs;
    }

    my $fields_flipped = $Opt{"reverse-fields"};
    if ($fields_flipped) {
	@inputs = map { flip_fields } @inputs;
    }

    my $collation_class = Unicode::Collate::;
    my @tailoring = ();

    if ($Opt{locale}) {
	require Unicode::Collate::Locale;
	$collation_class = Unicode::Collate::Locale::;
	push @tailoring, locale => $Opt{locale};
    }

    ## if ($Opt{"case-insensitive"}) { push @tailoring, level => 1; }

    if ($Opt{"upper-before-lower"}) {
	push @tailoring, upper_before_lower => 1;
    }

    if ($Opt{"collation-level"}) {
	push @tailoring, collation_level => 1;
    }

    if ($Opt{"override-CJK"}) {
	push @tailoring, override_CJK => $Opt{"override-CJK"};
    }

    if ($Opt{"override-Hangul"}) {
	push @tailoring, override_Hangul => $Opt{"override-Hangul"};
    }

    if ($Opt{"preprocess"}) {
	push @tailoring, preprocess => $Opt{"preprocess"};
    }

    if ($Opt{"katakana-before-hiragana"}) {
	push @tailoring, katakana_before_hiragana => 1;
    }

    if ($Opt{"backwards-levels"} ) {
	my @backwards_levels = $Opt{"backwards-levels"} =~ /(\d)/g;
	push @tailoring, backwards => \@backwards_levels;
    }

    if ($Opt{"variable"}) {
	push @tailoring, variable => $Opt{"variable"};
    }

    my $collator = $collation_class->new(@tailoring);
    my @output   = $collator->sort(@inputs);

    if ($fields_flipped) {
	@output = map { flip_fields } @output;
    }

    if ($lines_flipped) {
	@output = map { flip_line } @output;
    }

    return @output;

}

sub flip_fields(_) {
    local $_ = $_[0];

    my $crlf       = s/(\R+)\z// ? $1 : q();
    my $leading_ws = s/\A(\h+)// ? $1 : q();

    no warnings "utf8";

    my ($IFS,  $OFS)
	    =
       (" ", "\x{FFFF}");

    ($IFS, $OFS)
	 =
    ($OFS, $IFS)  if /\x{FFFF}/;  # déjà vu

    my @fields = reverse split $IFS;
    return $leading_ws . join ($OFS, @fields) . $crlf;
}

sub flip_line(_) {
    local $_ = $_[0];
    s/(\R+)\z//;
    my $crlf = $1 || q();
    $_ = join( "" => reverse /(\X)/g ) . $crlf;
    return $_;
}


#################################################################
#################################################################


__END__

=head1 NAME

ucsort - sort input records using the UCA

=head1 SYNOPSIS

ucsort [options] [input_files ...]

    # standard options
    --help|?
    --man|m
    --debug|d

    # collator constructor options
    --backwards-levels=i
    --collation-level|level|l=i
    --katakana-before-hiragana
    --normalization|n=s
    --override-CJK=s
    --override-Hangul=s
    --preprocess|P=s
    --upper-before-lower|u
    --variable=s

    # program specific options
    --case-insensitive|insensitive|i
    --input-encoding|e=s
    --locale|L=s
    --paragraph|p
    --reverse-fields|last
    --reverse-output|r
    --right-to-left|reverse-input

=head1 DESCRIPTION

TO BE WRITTEN: DESCRIPTION

=head1 EXAMPLES

TO BE WRITTEN: EXAMPLES

=head1 ERRORS

TO BE WRITTEN: ERRORS

=head1 ENVIRONMENT

TO BE WRITTEN: ENVIRONMENT

=head1 FILES

TO BE WRITTEN: FILES

=head1 PROGRAMS

TO BE WRITTEN: PROGRAMS

=head1 BUGS

TO BE WRITTEN: BUGS

=head1 SEE ALSO

The L<Unicode::Collate> and
L<Unicode::Collate::Locale> Perl modules.

=head1 AUTHOR

Tom Christiansen <tchrist@perl.com>

=head1 COPYRIGHT AND LICENCE

Copyright 2011 Tom Christiansen.

This program is free software; you may redistribute it and/or modify it
under the same terms as Perl itself.