######################################################################
#
# make_CaseFolding.pl - make fc() table to update Char.pm
#
# Copyright (c) 2014, 2015 INABA Hitoshi <ina@cpan.org>
#
######################################################################
use 5.00503;
use strict;
unless (@ARGV) {
die <<END;
usage:
1. download "CaseFolding.txt" from ftp://ftp.unicode.org/Public/UNIDATA/CaseFolding.txt
2. $^X $0 CaseFolding.txt > fc_table.txt
3. update Char.pm using fc_table.txt
END
}
my $casefolding_file = shift @ARGV;
binmode(STDOUT);
my @script_utf8_code = ();
my @script_utf8_mapping = ();
my @script_name = ();
my %done = ();
open(CASEFOLDING,$casefolding_file) || die "Can't open file: $casefolding_file\n";
while (<CASEFOLDING>) {
next if /^#/;
chomp;
if (my($code,$status,$mapping,$name) = /^([^;]+);\s*([^;]+);\s*([^;]+);\s*(#.*)$/) {
# Usage:
# A. To do a simple case folding, use the mappings with status C + S.
# B. To do a full case folding, use the mappings with status C + F.
if ($status =~ /[CF]/) {
if ($done{$code}) {
die <<END;
code $code was done.\n";
$done{$code}
$_
END
}
$done{$code} = $_;
my $utf8_code = utf8($code);
my $utf8_mapping = join('', map {utf8($_)} split(/ /,$mapping));
push @script_utf8_code, qq{"$utf8_code"};
push @script_utf8_mapping, qq{"$utf8_mapping",};
push @script_name, $name;
}
}
}
close(CASEFOLDING);
my($length1) = sort {$b <=> $a} map {length($_)} @script_utf8_code;
my($length2) = sort {$b <=> $a} map {length($_)} @script_utf8_mapping;
while (@script_utf8_code) {
printf(qq{ %-${length1}s => %-${length2}s %s\n},
shift @script_utf8_code,
shift @script_utf8_mapping,
shift @script_name,
);
}
sub utf8 {
my($unicode) = @_;
my $bin = join('',
map {{qw(
0 0000
1 0001
2 0010
3 0011
4 0100
5 0101
6 0110
7 0111
8 1000
9 1001
A 1010
B 1011
C 1100
D 1101
E 1110
F 1111
)}->{$_}} split(//,uc($unicode))
);
my $bin24 = substr(('0' x 24) . $bin, -24, 24);
if (0) {
} # 0123 4567 8901 2345 6789 0123
elsif ($bin24 =~ /^0000 0000 0000 0000 0... ....$/x) { # 0..7F
return join('', map {"\\x\U$_"}
unpack('H2',pack('B*', '0'.substr($bin24,17,7))),
);
} # 0123 4567 8901 2345 6789 0123
elsif ($bin24 =~ /^0000 0000 0000 0... .... ....$/x) { # ..7FF
return join('', map {"\\x\U$_"}
unpack('H2',pack('B*', '110'.substr($bin24,13,5))),
unpack('H2',pack('B*', '10' .substr($bin24,18,6))),
);
} # 0123 4567 8901 2345 6789 0123
elsif ($bin24 =~ /^0000 0000 .... .... .... ....$/x) { # ..FFFF
return join('', map {"\\x\U$_"}
unpack('H2',pack('B*', '1110'.substr($bin24, 8,4))),
unpack('H2',pack('B*', '10' .substr($bin24,12,6))),
unpack('H2',pack('B*', '10' .substr($bin24,18,6))),
);
} # 0123 4567 8901 2345 6789 0123
elsif ($bin24 =~ /^000. .... .... .... .... ....$/x) { # ..1FFFFF
return join('', map {"\\x\U$_"}
unpack('H2',pack('B*', '11110'.substr($bin24, 3,3))),
unpack('H2',pack('B*', '10' .substr($bin24, 6,6))),
unpack('H2',pack('B*', '10' .substr($bin24,12,6))),
unpack('H2',pack('B*', '10' .substr($bin24,18,6))),
);
}
else {
die "Can't encode ($unicode) into UTF-8.\n";
}
}
__END__
=pod
=head1 NAME
make_CaseFolding.pl - make fc() table to update Char.pm
=head1 SYNOPSIS
1. download "CaseFolding.txt" from ftp://ftp.unicode.org/Public/UNIDATA/CaseFolding.txt
2. perl make_CaseFolding.pl CaseFolding.txt > fc_table.txt
3. update Char.pm using fc_table.txt
=head1 DEPENDENCIES
This software requires perl5.00503 or later.
=head1 AUTHOR
INABA Hitoshi E<lt>ina@cpan.orgE<gt>
This project was originated by INABA Hitoshi.
=head1 LICENSE AND COPYRIGHT
This software is free software; you can redistribute it and/or
modify it under the same terms as Perl itself. See L<perlartistic>.
This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
=head1 SEE ALSO
CPAN Directory INABA Hitoshi
http://search.cpan.org/~ina/
BackPAN
http://backpan.perl.org/authors/id/I/IN/INA/
Recent Perl packages by "INABA Hitoshi"
http://code.activestate.com/ppm/author:INABA-Hitoshi/
=cut