The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!perl
#
# This auxiliary script makes five header files
# used for building XSUB of Unicode::Collate.
#
# Usage:
#    <do 'mkheader'> in perl, or <perl mkheader> in command line
#
# Input file:
#    Collate/allkeys.txt
#
# Output file:
#    ucatbl.h
#
use 5.006;
use strict;
use warnings;
use Carp;
use File::Spec;

BEGIN {
    unless ("A" eq pack('U', 0x41)) {
	die "Unicode::Collate cannot stringify a Unicode code point\n";
    }
    unless (0x41 == unpack('U', 'A')) {
	die "Unicode::Collate cannot get a Unicode code point\n";
    }
}

use constant TRUE  => 1;
use constant FALSE => "";
use constant VCE_TEMPLATE => 'Cn4';

sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }

our $PACKAGE = 'Unicode::Collate, mkheader';
our $prefix  = "UCA_";

our %SimpleEntries;	# $codepoint => $keys
our @Rest;

{
    my($f, $fh);
    foreach my $d (File::Spec->curdir()) {
	$f = File::Spec->catfile($d, "Collate", "allkeys.txt");
	last if open($fh, $f);
	$f = undef;
    }
    croak "$PACKAGE: Collate/allkeys.txt is not found" if !defined $f;

    while (my $line = <$fh>) {
	next if $line =~ /^\s*#/;
	if ($line =~ /^\s*\@/) {
	    push @Rest, $line;
	    next;
	}

	next if $line !~ /^\s*[0-9A-Fa-f]/; # lines without element

	$line =~ s/[#%]\s*(.*)//; # removing comment (not getting the name)

	# gets element
	my($e, $k) = split /;/, $line;

	croak "Wrong Entry: <charList> must be separated by ';' ".
	      "from <collElement>" if ! $k;

	my @uv = _getHexArray($e);
	next if !@uv;

	if (@uv != 1) {
	    push @Rest, $line;
	    next;
	    # Contractions of two or more characters will not be compiled.
	}

	my $is_L3_ignorable = TRUE;

	my @key;
	foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
	    my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
	    my @wt = _getHexArray($arr);
	    push @key, pack(VCE_TEMPLATE, $var, @wt);
	    $is_L3_ignorable = FALSE
		if $wt[0] || $wt[1] || $wt[2];
	    # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
	    # is completely ignorable.
	    # For expansion, an entry $is_L3_ignorable
	    # if and only if "all" CEs are [.0000.0000.0000].
	}
	my $mapping = $is_L3_ignorable ? [] : \@key;
	my $num = @$mapping;
	my $str = chr($num).join('', @$mapping);
	$SimpleEntries{$uv[0]} = stringify($str);
    }
}

sub stringify {
    my $str = shift;
    return sprintf '"%s"', join '',
	   map sprintf("\\x%02x", ord $_), split //, $str;

}

########## writing header files ##########

my $init = '';
{
    my $type = "char* const";
    my $head = $prefix."rest";

    $init .= "static const $type $head [] = {\n";
    for my $line (@Rest) {
	$line =~ s/\s*\z//;
	next if $line eq '';
	$init .= "/*$line*/\n" if $line =~ /^[A-Za-z0-9_.:;@\ \[\]]+\z/;
	$init .= stringify($line).",\n";
    }
    $init .= "NULL\n"; # sentinel
    $init .= "};\n\n";
}

my @tripletable = (
    {
	file => "ucatbl",
	name => "simple",
	type => "char* const",
	hash => \%SimpleEntries,
	null => "NULL",
	init => $init,
    },
);

foreach my $tbl (@tripletable) {
    my $file = "$tbl->{file}.h";
    my $head = "${prefix}$tbl->{name}";
    my $type = $tbl->{type};
    my $hash = $tbl->{hash};
    my $null = $tbl->{null};
    my $init = $tbl->{init};

    open my $fh_h, ">$file" or croak "$PACKAGE: $file can't be made";
    binmode $fh_h; select $fh_h;
    my %val;

    print << 'EOF';
/*
 * This file is auto-generated by mkheader.
 * Any changes here will be lost!
 */
EOF

    print $init if defined $init;

    foreach my $uv (keys %$hash) {
	croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
	    unless $uv <= 0x10FFFF;
	my @c = unpack 'CCCC', pack 'N', $uv;
	$val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
	# $c[0] must be 0.
    }

    foreach my $p (sort { $a <=> $b } keys %val) {
	next if ! $val{ $p };
	for (my $r = 0; $r < 256; $r++) {
	    next if ! $val{ $p }{ $r };
	    printf "static const $type ${head}_%02x_%02x [256] = {\n", $p, $r;
	    for (my $c = 0; $c < 256; $c++) {
		print "\t", defined $val{$p}{$r}{$c}
		    ? $val{$p}{$r}{$c}
		    : $null;
		print ','  if $c != 255;
		print "\n" if $c % 8 == 7;
	    }
	    print "};\n\n";
	}
    }
    foreach my $p (sort { $a <=> $b } keys %val) {
	next if ! $val{ $p };
	printf "static const $type* const ${head}_%02x [256] = {\n", $p;
	for (my $r = 0; $r < 256; $r++) {
	    print $val{ $p }{ $r }
		? sprintf("${head}_%02x_%02x", $p, $r)
		: "NULL";
	    print ','  if $r != 255;
	    print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
	}
	print "};\n\n";
    }
    print "static const $type* const * const $head [] = {\n";
    for (my $p = 0; $p <= 0x10; $p++) {
	print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
	print ','  if $p != 0x10;
	print "\n";
    }
    print "};\n\n";
    close $fh_h;
}

1;
__END__