The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!perl
#
# This auxiliary script makes five header files
# used for building XSUB of Unicode::Normalize.
#
# Usage:
#    <do 'mkheader'> in perl, or <perl mkheader> in command line
#
# Input files:
#    unicore/CombiningClass.pl (or unicode/CombiningClass.pl)
#    unicore/Decomposition.pl (or unicode/Decomposition.pl)
#    unicore/CompositionExclusions.txt (or unicode/CompExcl.txt)
#
# Output files:
#    unfcan.h
#    unfcpt.h
#    unfcmb.h
#    unfcmp.h
#    unfexc.h
#
use 5.006;
use strict;
use warnings;
use Carp;
use File::Spec;

BEGIN {
    unless ("A" eq pack('U', 0x41)) {
	die "Unicode::Normalize cannot stringify a Unicode code point\n";
    }
}

our $PACKAGE = 'Unicode::Normalize, mkheader';

our $prefix = "UNF_";
our $structname = "${prefix}complist";

sub pack_U {
    return pack('U*', @_);
}

# %Canon and %Compat will be ($codepoint => $hexstring) after _U_stringify()
our %Comp1st;	# $codepoint => $listname  : may be composed with a next char.
our %CompList;	# $listname,$2nd  => $codepoint : composite

##### The below part is common to mkheader and PP #####

our %Combin;	# $codepoint => $number    : combination class
our %Canon;	# $codepoint => \@codepoints : canonical decomp.
our %Compat;	# $codepoint => \@codepoints : compat. decomp.
our %Compos;	# $1st,$2nd  => $codepoint : composite
our %Exclus;	# $codepoint => 1          : composition exclusions
our %Single;	# $codepoint => 1          : singletons
our %NonStD;	# $codepoint => 1          : non-starter decompositions
our %Comp2nd;	# $codepoint => 1          : may be composed with a prev char.

# from Unicode database
our $Combin = do "unicore/CombiningClass.pl"
    || do "unicode/CombiningClass.pl"
    || croak "$PACKAGE: CombiningClass.pl not found";
our $Decomp = do "unicore/Decomposition.pl"
    || do "unicode/Decomposition.pl"
    || croak "$PACKAGE: Decomposition.pl not found";

# definition of Hangul constants
use constant SBase  => 0xAC00;
use constant SFinal => 0xD7A3; # SBase -1 + SCount
use constant SCount =>  11172; # LCount * NCount
use constant NCount =>    588; # VCount * TCount
use constant LBase  => 0x1100;
use constant LFinal => 0x1112;
use constant LCount =>     19;
use constant VBase  => 0x1161;
use constant VFinal => 0x1175;
use constant VCount =>     21;
use constant TBase  => 0x11A7;
use constant TFinal => 0x11C2;
use constant TCount =>     28;

sub decomposeHangul {
    my $sindex = $_[0] - SBase;
    my $lindex = int( $sindex / NCount);
    my $vindex = int(($sindex % NCount) / TCount);
    my $tindex =      $sindex % TCount;
    my @ret = (
       LBase + $lindex,
       VBase + $vindex,
      $tindex ? (TBase + $tindex) : (),
    );
    return wantarray ? @ret : pack_U(@ret);
}

########## getting full decomposition ##########
{
    my($f, $fh);
    foreach my $d (@INC) {
	$f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt");
	last if open($fh, $f);
	$f = File::Spec->catfile($d, "unicore", "CompExcl.txt");
	last if open($fh, $f);
	$f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
	last if open($fh, $f);
	$f = undef;
    }
    croak "$PACKAGE: neither unicore/CompositionExclusions.txt "
	. "nor unicode/CompExcl.txt is found in @INC" unless defined $f;

    while (<$fh>) {
	next if /^#/ or /^$/;
	s/#.*//;
	$Exclus{ hex($1) } = 1 if /([0-9A-Fa-f]+)/;
    }
    close $fh;
}

## converts string "hhhh hhhh hhhh" to a numeric list
## (hex digits separated by spaces)
sub _getHexArray { map hex, $_[0] =~ /\G *([0-9A-Fa-f]+)/g }

while ($Combin =~ /(.+)/g) {
    my @tab = split /\t/, $1;
    my $ini = hex $tab[0];
    if ($tab[1] eq '') {
	$Combin{$ini} = $tab[2];
    } else {
	$Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]);
    }
}

while ($Decomp =~ /(.+)/g) {
    my @tab = split /\t/, $1;
    my $compat = $tab[2] =~ s/<[^>]+>//;
    my $dec = [ _getHexArray($tab[2]) ]; # decomposition
    my $ini = hex($tab[0]); # initial decomposable character
    my $end = $tab[1] eq '' ? $ini : hex($tab[1]);
    # ($ini .. $end) is the range of decomposable characters.

    foreach my $u ($ini .. $end) {
	$Compat{$u} = $dec;

	if (! $compat) {
	    $Canon{$u} = $dec;

	    if (@$dec == 2) {
		if ($Combin{ $dec->[0] }) {
		    $NonStD{$u} = 1;
		} else {
		    $Compos{ $dec->[0] }{ $dec->[1] } = $u;
		    $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
		}
	    } elsif (@$dec == 1) {
		$Single{$u} = 1;
	    } else {
		croak("Weird Canonical Decomposition of U+$tab[0]");
	    }
	}
    }
}

# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
    $Comp2nd{$j} = 1;
}

sub getCanonList {
    my @src = @_;
    my @dec = map {
	(SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
	    : $Canon{$_} ? @{ $Canon{$_} } : $_
		} @src;
    return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
    # condition @src == @dec is not ok.
}

sub getCompatList {
    my @src = @_;
    my @dec = map {
	(SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
	    : $Compat{$_} ? @{ $Compat{$_} } : $_
		} @src;
    return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
    # condition @src == @dec is not ok.
}

# exhaustive decomposition
foreach my $key (keys %Canon) {
    $Canon{$key}  = [ getCanonList($key) ];
}

# exhaustive decomposition
foreach my $key (keys %Compat) {
    $Compat{$key} = [ getCompatList($key) ];
}

##### The above part is common to mkheader and PP #####

foreach my $comp1st (keys %Compos) {
    my $listname = sprintf("${structname}_%06x", $comp1st);
		# %04x is bad since it'd place _3046 after _1d157.
    $Comp1st{$comp1st} = $listname;
    my $rh1st = $Compos{$comp1st};

    foreach my $comp2nd (keys %$rh1st) {
	my $uc = $rh1st->{$comp2nd};
	$CompList{$listname}{$comp2nd} = $uc;
    }
}

sub split_into_char {
    use bytes;
    my $uni = shift;
    my $len = length($uni);
    my @ary;
    for(my $i = 0; $i < $len; ++$i) {
	push @ary, ord(substr($uni,$i,1));
    }
    return @ary;
}

sub _U_stringify {
    sprintf '"%s"', join '',
	map sprintf("\\x%02x", $_), split_into_char(pack_U(@_));
}

foreach my $hash (\%Canon, \%Compat) {
    foreach my $key (keys %$hash) {
	$hash->{$key} = _U_stringify( @{ $hash->{$key} } );
    }
}

########## writing header files ##########

my @boolfunc = (
    {
	name => "Exclusion",
	type => "bool",
	hash => \%Exclus,
    },
    {
	name => "Singleton",
	type => "bool",
	hash => \%Single,
    },
    {
	name => "NonStDecomp",
	type => "bool",
	hash => \%NonStD,
    },
    {
	name => "Comp2nd",
	type => "bool",
	hash => \%Comp2nd,
    },
);

my $file = "unfexc.h";
open FH, ">$file" or croak "$PACKAGE: $file can't be made";
binmode FH; select FH;

    print << 'EOF';
/*
 * This file is auto-generated by mkheader.
 * Any changes here will be lost!
 */
EOF

foreach my $tbl (@boolfunc) {
    my @temp = sort {$a <=> $b} keys %{$tbl->{hash}};
    my $type = $tbl->{type};
    my $name = $tbl->{name};
    print "$type is$name (UV uv)\n{\nreturn\n\t";

    while (@temp) {
	my $cur = shift @temp;
	if (@temp && $cur + 1 == $temp[0]) {
	    print "($cur <= uv && uv <= ";
	    while (@temp && $cur + 1 == $temp[0]) {
		$cur = shift @temp;
	    }
	    print "$cur)";
	    print "\n\t|| " if @temp;
	} else {
	    print "uv == $cur";
	    print "\n\t|| " if @temp;
	}
    }
    print "\n\t? TRUE : FALSE;\n}\n\n";
}

close FH;

####################################

my $compinit =
    "typedef struct { UV nextchar; UV composite; } $structname;\n\n";

foreach my $i (sort keys %CompList) {
    $compinit .= "$structname $i [] = {\n";
    $compinit .= join ",\n",
	map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
	    sort {$a <=> $b } keys %{ $CompList{$i} };
    $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
}

my @tripletable = (
    {
	file => "unfcmb",
	name => "combin",
	type => "STDCHAR",
	hash => \%Combin,
	null =>  0,
    },
    {
	file => "unfcan",
	name => "canon",
	type => "char*",
	hash => \%Canon,
	null => "NULL",
    },
    {
	file => "unfcpt",
	name => "compat",
	type => "char*",
	hash => \%Compat,
	null => "NULL",
    },
    {
	file => "unfcmp",
	name => "compos",
	type => "$structname *",
	hash => \%Comp1st,
	null => "NULL",
	init => $compinit,
    },
);

foreach my $tbl (@tripletable) {
    my $file = "$tbl->{file}.h";
    my $head = "${prefix}$tbl->{name}";
    my $type = $tbl->{type};
    my $hash = $tbl->{hash};
    my $null = $tbl->{null};
    my $init = $tbl->{init};

    open FH, ">$file" or croak "$PACKAGE: $file can't be made";
    binmode FH; select FH;
    my %val;

    print FH << 'EOF';
/*
 * This file is auto-generated by mkheader.
 * Any changes here will be lost!
 */
EOF

    print $init if defined $init;

    foreach my $uv (keys %$hash) {
	croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
	    unless $uv <= 0x10FFFF;
	my @c = unpack 'CCCC', pack 'N', $uv;
	$val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
    }

    foreach my $p (sort { $a <=> $b } keys %val) {
	next if ! $val{ $p };
	for (my $r = 0; $r < 256; $r++) {
	    next if ! $val{ $p }{ $r };
	    printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r;
	    for (my $c = 0; $c < 256; $c++) {
		print "\t", defined $val{$p}{$r}{$c}
		    ? "($type)".$val{$p}{$r}{$c}
		    : $null;
		print ','  if $c != 255;
		print "\n" if $c % 8 == 7;
	    }
	    print "};\n\n";
	}
    }
    foreach my $p (sort { $a <=> $b } keys %val) {
	next if ! $val{ $p };
	printf "static $type* ${head}_%02x [256] = {\n", $p;
	for (my $r = 0; $r < 256; $r++) {
	    print $val{ $p }{ $r }
		? sprintf("${head}_%02x_%02x", $p, $r)
		: "NULL";
	    print ','  if $r != 255;
	    print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
	}
	print "};\n\n";
    }
    print "static $type** $head [] = {\n";
    for (my $p = 0; $p <= 0x10; $p++) {
	print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
	print ','  if $p != 0x10;
	print "\n";
    }
    print "};\n\n";
    close FH;
}

1;
__END__