#!perl -w
use 5.015;
use strict;
use warnings;
use Unicode::UCD qw(prop_aliases
prop_values
prop_value_aliases
prop_invlist
prop_invmap search_invlist
);
require 'regen/regen_lib.pl';
require 'regen/charset_translations.pl';
# This program outputs charclass_invlists.h, which contains various inversion
# lists in the form of C arrays that are to be used as-is for inversion lists.
# Thus, the lists it contains are essentially pre-compiled, and need only a
# light-weight fast wrapper to make them usable at run-time.
# As such, this code knows about the internal structure of these lists, and
# any change made to that has to be done here as well. A random number stored
# in the headers is used to minimize the possibility of things getting
# out-of-sync, or the wrong data structure being passed. Currently that
# random number is:
# charclass_invlists.h now also has a partial implementation of inversion
# maps; enough to generate tables for the line break properties, such as GCB
my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
# integer or float
my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax;
# Matches valid C language enum names: begins with ASCII alphabetic, then any
# ASCII \w
my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
my $out_fh = open_new('charclass_invlists.h', '>',
{style => '*', by => $0,
from => "Unicode::UCD"});
my $in_file_pound_if = 0;
my $max_hdr_len = 3; # In headings, how wide a name is allowed?
print $out_fh "/* See the generating file for comments */\n\n";
# The symbols generated by this program are all currently defined only in a
# single dot c each. The code knows where most of them go, but this hash
# gives overrides for the exceptions to the typical place
my %exceptions_to_where_to_define =
( NonL1_Perl_Non_Final_Folds => 'PERL_IN_REGCOMP_C',
AboveLatin1 => 'PERL_IN_REGCOMP_C',
Latin1 => 'PERL_IN_REGCOMP_C',
UpperLatin1 => 'PERL_IN_REGCOMP_C',
_Perl_Any_Folds => 'PERL_IN_REGCOMP_C',
_Perl_Folds_To_Multi_Char => 'PERL_IN_REGCOMP_C',
_Perl_IDCont => 'PERL_IN_UTF8_C',
_Perl_IDStart => 'PERL_IN_UTF8_C',
);
# This hash contains the properties with enums that have hard-coded references
# to them in C code. It is neeed to make sure that if perl is compiled
# with an older Unicode data set, that all the enum values the code is
# expecting will still be in the enum typedef. Thus the code doesn't have to
# change. The Unicode version won't have any code points that have the enum
# values not in that version, so the code that handles them will not get
# exercised. This is far better than having to #ifdef things. The names here
# should be the long names of the respective property values. The reason for
# this is because regexec.c uses them as case labels, and the long name is
# generally more understandable than the short.
my %hard_coded_enums =
( gcb => [
'Control',
'CR',
'Extend',
'L',
'LF',
'LV',
'LVT',
'Other',
'Prepend',
'Regional_Indicator',
'SpacingMark',
'T',
'V',
],
lb => [
'Alphabetic',
'Break_After',
'Break_Before',
'Break_Both',
'Break_Symbols',
'Carriage_Return',
'Close_Parenthesis',
'Close_Punctuation',
'Combining_Mark',
'Contingent_Break',
'Exclamation',
'Glue',
'H2',
'H3',
'Hebrew_Letter',
'Hyphen',
'Ideographic',
'Infix_Numeric',
'Inseparable',
'JL',
'JT',
'JV',
'Line_Feed',
'Mandatory_Break',
'Next_Line',
'Nonstarter',
'Numeric',
'Open_Punctuation',
'Postfix_Numeric',
'Prefix_Numeric',
'Quotation',
'Regional_Indicator',
'Space',
'Word_Joiner',
'ZWSpace',
],
sb => [
'ATerm',
'Close',
'CR',
'Extend',
'Format',
'LF',
'Lower',
'Numeric',
'OLetter',
'Other',
'SContinue',
'Sep',
'Sp',
'STerm',
'Upper',
],
wb => [
'ALetter',
'CR',
'Double_Quote',
'Extend',
'ExtendNumLet',
'Format',
'Hebrew_Letter',
'Katakana',
'LF',
'MidLetter',
'MidNum',
'MidNumLet',
'Newline',
'Numeric',
'Other',
'Perl_Tailored_HSpace',
'Regional_Indicator',
'Single_Quote',
],
);
my %gcb_enums;
my @gcb_short_enums;
my %gcb_abbreviations;
my %lb_enums;
my @lb_short_enums;
my %lb_abbreviations;
my %wb_enums;
my @wb_short_enums;
my %wb_abbreviations;
my @a2n;
sub uniques {
# Returns non-duplicated input values. From "Perl Best Practices:
# Encapsulated Cleverness". p. 455 in first edition.
my %seen;
return grep { ! $seen{$_}++ } @_;
}
sub a2n($) {
my $cp = shift;
# Returns the input Unicode code point translated to native.
return $cp if $cp !~ $numeric_re || $cp > 255;
return $a2n[$cp];
}
sub end_file_pound_if {
if ($in_file_pound_if) {
print $out_fh "\n#endif\t/* $in_file_pound_if */\n";
$in_file_pound_if = 0;
}
}
sub switch_pound_if ($$) {
my $name = shift;
my $new_pound_if = shift;
# Switch to new #if given by the 2nd argument. If there is an override
# for this, it instead switches to that. The 1st argument is the
# static's name, used to look up the overrides
if (exists $exceptions_to_where_to_define{$name}) {
$new_pound_if = $exceptions_to_where_to_define{$name};
}
# Exit current #if if the new one is different from the old
if ($in_file_pound_if
&& $in_file_pound_if !~ /$new_pound_if/)
{
end_file_pound_if;
}
# Enter new #if, if not already in it.
if (! $in_file_pound_if) {
$in_file_pound_if = "defined($new_pound_if)";
print $out_fh "\n#if $in_file_pound_if\n";
}
}
sub output_invlist ($$;$) {
my $name = shift;
my $invlist = shift; # Reference to inversion list array
my $charset = shift // ""; # name of character set for comment
die "No inversion list for $name" unless defined $invlist
&& ref $invlist eq 'ARRAY';
# Output the inversion list $invlist using the name $name for it.
# It is output in the exact internal form for inversion lists.
# Is the last element of the header 0, or 1 ?
my $zero_or_one = 0;
if (@$invlist && $invlist->[0] != 0) {
unshift @$invlist, 0;
$zero_or_one = 1;
}
my $count = @$invlist;
switch_pound_if ($name, 'PERL_IN_PERL_C');
print $out_fh "\nstatic const UV ${name}_invlist[] = {";
print $out_fh " /* for $charset */" if $charset;
print $out_fh "\n";
print $out_fh "\t$count,\t/* Number of elements */\n";
print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
print $out_fh "\t", $zero_or_one,
",\t/* 0 if the list starts at 0;",
"\n\t\t 1 if it starts at the element beyond 0 */\n";
# The main body are the UVs passed in to this routine. Do the final
# element separately
for my $i (0 .. @$invlist - 1) {
printf $out_fh "\t0x%X", $invlist->[$i];
print $out_fh "," if $i < @$invlist - 1;
print $out_fh "\n";
}
print $out_fh "};\n";
}
sub output_invmap ($$$$$$$) {
my $name = shift;
my $invmap = shift; # Reference to inversion map array
my $prop_name = shift;
my $input_format = shift; # The inversion map's format
my $default = shift; # The property value for code points who
# otherwise don't have a value specified.
my $extra_enums = shift; # comma-separated list of our additions to the
# property's standard possible values
my $charset = shift // ""; # name of character set for comment
# Output the inversion map $invmap for property $prop_name, but use $name
# as the actual data structure's name.
my $count = @$invmap;
my $output_format;
my $declaration_type;
my %enums;
my $name_prefix;
if ($input_format eq 's') {
my $orig_prop_name = $prop_name;
$prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name
my $short_name = (prop_aliases($prop_name))[0] // $prop_name;
my @enums;
if ($orig_prop_name eq $prop_name) {
@enums = prop_values($prop_name);
}
else {
@enums = uniques(@$invmap);
}
if (! @enums) {
die "Only enum properties are currently handled; '$prop_name' isn't one";
}
else {
my @expected_enums = @{$hard_coded_enums{lc $short_name}};
my @canonical_input_enums;
if (@expected_enums) {
if (@expected_enums < @enums) {
die 'You need to update %hard_coded_enums to reflect new'
. " entries in this Unicode version\n"
. "Expected: " . join(", ", sort @expected_enums) . "\n"
. " Got: " . join(", ", sort @enums);
}
if (! defined prop_aliases($prop_name)) {
# Convert the input enums into canonical form and
# save for use below
@canonical_input_enums = map { lc ($_ =~ s/_//gr) }
@enums;
}
@enums = sort @expected_enums;
}
# The internal enums come last, and in the order specified
my @extras;
if ($extra_enums ne "") {
@extras = split /,/, $extra_enums;
push @enums, @extras;
}
# Assign a value to each element of the enum. The default
# value always gets 0; the others are arbitrarily assigned.
my $enum_val = 0;
my $canonical_default = prop_value_aliases($prop_name, $default);
$default = $canonical_default if defined $canonical_default;
$enums{$default} = $enum_val++;
for my $enum (@enums) {
$enums{$enum} = $enum_val++ unless exists $enums{$enum};
}
# Calculate the enum values for certain properties like
# _Perl_GCB and _Perl_LB, because we output special tables for
# them.
if ($name =~ / ^ _Perl_ (?: GCB | LB | WB ) $ /x) {
# We use string evals to allow the same code to work on
# all tables we're doing.
my $type = lc $prop_name;
# We use lowercase single letter names for any property
# values not in the release of Unicode being compiled now.
my $placeholder = "a";
# Skip if we've already done this code, which populated
# this hash
if (eval "! \%${type}_enums") {
# For each enum ...
foreach my $enum (sort keys %enums) {
my $value = $enums{$enum};
my $short;
my $abbreviated_from;
# Special case this wb property value to make the
# name more clear
if ($enum eq 'Perl_Tailored_HSpace') {
$short = 'hs';
$abbreviated_from = $enum;
}
elsif (grep { $_ eq $enum } @extras) {
# The 'short' name for one of the property
# values added by this file is just the
# lowercase of it
$short = lc $enum;
}
elsif (grep {$_ eq lc ( $enum =~ s/_//gr) }
@canonical_input_enums)
{ # On Unicode versions that predate the
# official property, we have set up this array
# to be the canonical form of each enum in the
# substitute property. If the enum we're
# looking at is canonically the same as one of
# these, use its name instead of generating a
# placeholder one in the next clause (which
# will happen because prop_value_aliases()
# will fail because it only works on official
# properties)
$short = $enum;
}
else {
# Use the official short name for the other
# property values, which should all be
# official ones.
($short) = prop_value_aliases($type, $enum);
# But create a placeholder for ones not in
# this Unicode version.
$short = $placeholder++ unless defined $short;
}
# If our short name is too long, or we already
# know that the name is an abbreviation, truncate
# to make sure it's short enough, and remember
# that we did this so we can later place in a
# comment in the generated file
if ( $abbreviated_from
|| length $short > $max_hdr_len)
{
$short = substr($short, 0, $max_hdr_len);
$abbreviated_from = $enum
unless $abbreviated_from;
# If the name we are to display conflicts, try
# another.
while (eval "exists
\$${type}_abbreviations{$short}")
{
die $@ if $@;
$short++;
}
eval "\$${type}_abbreviations{$short} = '$enum'";
die $@ if $@;
}
# Remember the mapping from the property value
# (enum) name to its value.
eval "\$${type}_enums{$enum} = $value";
die $@ if $@;
# Remember the inverse mapping to the short name
# so that we can properly label the generated
# table's rows and columns
eval "\$${type}_short_enums[$value] = '$short'";
die $@ if $@;
}
}
}
}
# Inversion map stuff is currently used only by regexec
switch_pound_if($name, 'PERL_IN_REGEXEC_C');
{
# The short names tend to be two lower case letters, but it looks
# better for those if they are upper. XXX
$short_name = uc($short_name) if length($short_name) < 3
|| substr($short_name, 0, 1) =~ /[[:lower:]]/;
$name_prefix = "${short_name}_";
my $enum_count = keys %enums;
print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n";
print $out_fh "\ntypedef enum {\n";
my @enum_list;
foreach my $enum (keys %enums) {
$enum_list[$enums{$enum}] = $enum;
}
foreach my $i (0 .. @enum_list - 1) {
my $name = $enum_list[$i];
print $out_fh "\t${name_prefix}$name = $i";
print $out_fh "," if $i < $enum_count - 1;
print $out_fh "\n";
}
$declaration_type = "${name_prefix}enum";
print $out_fh "} $declaration_type;\n";
$output_format = "${name_prefix}%s";
}
}
else {
die "'$input_format' invmap() format for '$prop_name' unimplemented";
}
die "No inversion map for $prop_name" unless defined $invmap
&& ref $invmap eq 'ARRAY'
&& $count;
print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
print $out_fh " /* for $charset */" if $charset;
print $out_fh "\n";
# The main body are the scalars passed in to this routine.
for my $i (0 .. $count - 1) {
my $element = $invmap->[$i];
my $full_element_name = prop_value_aliases($prop_name, $element);
$element = $full_element_name if defined $full_element_name;
$element = $name_prefix . $element;
print $out_fh "\t$element";
print $out_fh "," if $i < $count - 1;
print $out_fh "\n";
}
print $out_fh "};\n";
}
sub mk_invlist_from_sorted_cp_list {
# Returns an inversion list constructed from the sorted input array of
# code points
my $list_ref = shift;
return unless @$list_ref;
# Initialize to just the first element
my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
# For each succeeding element, if it extends the previous range, adjust
# up, otherwise add it.
for my $i (1 .. @$list_ref - 1) {
if ($invlist[-1] == $list_ref->[$i]) {
$invlist[-1]++;
}
else {
push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
}
}
return @invlist;
}
# Read in the Case Folding rules, and construct arrays of code points for the
# properties we need.
my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
die "Could not find inversion map for Case_Folding" unless defined $format;
die "Incorrect format '$format' for Case_Folding inversion map"
unless $format eq 'al'
|| $format eq 'a';
my @has_multi_char_fold;
my @is_non_final_fold;
for my $i (0 .. @$folds_ref - 1) {
next unless ref $folds_ref->[$i]; # Skip single-char folds
push @has_multi_char_fold, $cp_ref->[$i];
# Add to the non-finals list each code point that is in a non-final
# position
for my $j (0 .. @{$folds_ref->[$i]} - 2) {
push @is_non_final_fold, $folds_ref->[$i][$j]
unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
}
}
sub _Perl_Non_Final_Folds {
@is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
}
sub prop_name_for_cmp ($) { # Sort helper
my $name = shift;
# Returns the input lowercased, with non-alphas removed, as well as
# everything starting with a comma
$name =~ s/,.*//;
$name =~ s/[[:^alpha:]]//g;
return lc $name;
}
sub UpperLatin1 {
return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]);
}
sub output_table_common {
# Common subroutine to actually output the generated rules table.
my ($property,
$table_value_defines_ref,
$table_ref,
$names_ref,
$abbreviations_ref) = @_;
my $size = @$table_ref;
# Output the #define list, sorted by numeric value
if ($table_value_defines_ref) {
my $max_name_length = 0;
my @defines;
# Put in order, and at the same time find the longest name
while (my ($enum, $value) = each %$table_value_defines_ref) {
$defines[$value] = $enum;
my $length = length $enum;
$max_name_length = $length if $length > $max_name_length;
}
print $out_fh "\n";
# Output, so that the values are vertically aligned in a column after
# the longest name
foreach my $i (0 .. @defines - 1) {
next unless defined $defines[$i];
printf $out_fh "#define %-*s %2d\n",
$max_name_length,
$defines[$i],
$i;
}
}
my $column_width = 2; # We currently allow 2 digits for the number
# If the maximum value in the table is 1, it can be a bool. (Being above
# a U8 is not currently handled
my $max_element = 0;
for my $i (0 .. $size - 1) {
for my $j (0 .. $size - 1) {
next if $max_element >= $table_ref->[$i][$j];
$max_element = $table_ref->[$i][$j];
}
}
die "Need wider table column width given '$max_element"
if length $max_element > $column_width;
my $table_type = ($max_element == 1)
? 'bool'
: 'U8';
# If a name is longer than the width set aside for a column, its column
# needs to have increased spacing so that the name doesn't get truncated
# nor run into an adjacent column
my @spacers;
# If we are being compiled on a Unicode version earlier than that which
# this file was designed for, it may be that some of the property values
# aren't in the current release, and so would be undefined if we didn't
# define them ourselves. Earlier code has done this, making them
# lowercase characters of length one. We look to see if any exist, so
# that we can add an annotation to the output table
my $has_placeholder = 0;
for my $i (0 .. $size - 1) {
no warnings 'numeric';
$has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax;
$spacers[$i] = " " x (length($names_ref->[$i]) - $column_width);
}
print $out_fh "\nstatic const $table_type ${property}_table[$size][$size] = {\n";
# Calculate the column heading line
my $header_line = "/* "
. (" " x $max_hdr_len) # We let the row heading meld to
# the '*/' for those that are at
# the max
. " " x 3; # Space for '*/ '
# Now each column
for my $i (0 .. $size - 1) {
$header_line .= sprintf "%s%*s",
$spacers[$i],
$column_width + 1, # 1 for the ','
$names_ref->[$i];
}
$header_line .= " */\n";
# If we have annotations, output it now.
if ($has_placeholder || scalar %$abbreviations_ref) {
my $text = "";
foreach my $abbr (sort keys %$abbreviations_ref) {
$text .= "; " if $text;
$text .= "'$abbr' stands for '$abbreviations_ref->{$abbr}'";
}
if ($has_placeholder) {
$text .= "; other " if $text;
$text .= "lowercase names are placeholders for"
. " property values not defined until a later Unicode"
. " release, so are irrelevant in this one, as they are"
. " not assigned to any code points";
}
my $indent = " " x 3;
$text = $indent . "/* $text */";
# Wrap the text so that it is no wider than the table, which the
# header line gives.
my $output_width = length $header_line;
while (length $text > $output_width) {
my $cur_line = substr($text, 0, $output_width);
# Find the first blank back from the right end to wrap at.
for (my $i = $output_width -1; $i > 0; $i--) {
if (substr($text, $i, 1) eq " ") {
print $out_fh substr($text, 0, $i), "\n";
# Set so will look at just the remaining tail (which will
# be indented and have a '*' after the indent
$text = $indent . " * " . substr($text, $i + 1);
last;
}
}
}
# And any remaining
print $out_fh $text, "\n" if $text;
}
# We calculated the header line earlier just to get its width so that we
# could make sure the annotations fit into that.
print $out_fh $header_line;
# Now output the bulk of the table.
for my $i (0 .. $size - 1) {
# First the row heading.
printf $out_fh "/* %-*s*/ ", $max_hdr_len, $names_ref->[$i];
print $out_fh "{"; # Then the brace for this row
# Then each column
for my $j (0 .. $size -1) {
print $out_fh $spacers[$j];
printf $out_fh "%*d", $column_width, $table_ref->[$i][$j];
print $out_fh "," if $j < $size - 1;
}
print $out_fh " }";
print $out_fh "," if $i < $size - 1;
print $out_fh "\n";
}
print $out_fh "};\n";
}
sub output_GCB_table() {
# Create and output the pair table for use in determining Grapheme Cluster
# Breaks, given in http://www.unicode.org/reports/tr29/.
# The table is constructed in reverse order of the rules, to make the
# lower-numbered, higher priority ones override the later ones, as the
# algorithm stops at the earliest matching rule
my @gcb_table;
my $table_size = @gcb_short_enums;
# Otherwise, break everywhere.
# GB10 Any ÷ Any
for my $i (0 .. $table_size - 1) {
for my $j (0 .. $table_size - 1) {
$gcb_table[$i][$j] = 1;
}
}
# Do not break before extending characters.
# Do not break before SpacingMarks, or after Prepend characters.
# GB9 × Extend
# GB9a × SpacingMark
# GB9b Prepend ×
for my $i (0 .. @gcb_table - 1) {
$gcb_table[$i][$gcb_enums{'Extend'}] = 0;
$gcb_table[$i][$gcb_enums{'SpacingMark'}] = 0;
$gcb_table[$gcb_enums{'Prepend'}][$i] = 0;
}
# Do not break between regional indicator symbols.
# GB8a Regional_Indicator × Regional_Indicator
$gcb_table[$gcb_enums{'Regional_Indicator'}]
[$gcb_enums{'Regional_Indicator'}] = 0;
# Do not break Hangul syllable sequences.
# GB8 ( LVT | T) × T
$gcb_table[$gcb_enums{'LVT'}][$gcb_enums{'T'}] = 0;
$gcb_table[$gcb_enums{'T'}][$gcb_enums{'T'}] = 0;
# GB7 ( LV | V ) × ( V | T )
$gcb_table[$gcb_enums{'LV'}][$gcb_enums{'V'}] = 0;
$gcb_table[$gcb_enums{'LV'}][$gcb_enums{'T'}] = 0;
$gcb_table[$gcb_enums{'V'}][$gcb_enums{'V'}] = 0;
$gcb_table[$gcb_enums{'V'}][$gcb_enums{'T'}] = 0;
# GB6 L × ( L | V | LV | LVT )
$gcb_table[$gcb_enums{'L'}][$gcb_enums{'L'}] = 0;
$gcb_table[$gcb_enums{'L'}][$gcb_enums{'V'}] = 0;
$gcb_table[$gcb_enums{'L'}][$gcb_enums{'LV'}] = 0;
$gcb_table[$gcb_enums{'L'}][$gcb_enums{'LVT'}] = 0;
# Do not break between a CR and LF. Otherwise, break before and after
# controls.
# GB5 ÷ ( Control | CR | LF )
# GB4 ( Control | CR | LF ) ÷
for my $i (0 .. @gcb_table - 1) {
$gcb_table[$i][$gcb_enums{'Control'}] = 1;
$gcb_table[$i][$gcb_enums{'CR'}] = 1;
$gcb_table[$i][$gcb_enums{'LF'}] = 1;
$gcb_table[$gcb_enums{'Control'}][$i] = 1;
$gcb_table[$gcb_enums{'CR'}][$i] = 1;
$gcb_table[$gcb_enums{'LF'}][$i] = 1;
}
# GB3 CR × LF
$gcb_table[$gcb_enums{'CR'}][$gcb_enums{'LF'}] = 0;
# Break at the start and end of text.
# GB1 sot ÷
# GB2 ÷ eot
for my $i (0 .. @gcb_table - 1) {
$gcb_table[$i][$gcb_enums{'EDGE'}] = 1;
$gcb_table[$gcb_enums{'EDGE'}][$i] = 1;
}
# But, unspecified by Unicode, we shouldn't break on an empty string.
$gcb_table[$gcb_enums{'EDGE'}][$gcb_enums{'EDGE'}] = 0;
output_table_common('GCB', undef,
\@gcb_table, \@gcb_short_enums, \%gcb_abbreviations);
}
sub output_LB_table() {
# Create and output the enums, #defines, and pair table for use in
# determining Line Breaks. This uses the default line break algorithm,
# given in http://www.unicode.org/reports/tr14/, but tailored by example 7
# in that page, as the Unicode-furnished tests assume that tailoring.
# The result is really just true or false. But we follow along with tr14,
# creating a rule which is false for something like X SP* X. That gets
# encoding 2. The rest of the actions are synthetic ones that indicate
# some context handling is required. These each are added to the
# underlying 0, 1, or 2, instead of replacing them, so that the underlying
# value can be retrieved. Actually only rules from 7 through 18 (which
# are the ones where space matter) are possible to have 2 added to them.
# The others below add just 0 or 1. It might be possible for one
# synthetic rule to be added to another, yielding a larger value. This
# doesn't happen in the Unicode 8.0 rule set, and as you can see from the
# names of the middle grouping below, it is impossible for that to occur
# for them because they all start with mutually exclusive classes. That
# the final rule can't be added to any of the others isn't obvious from
# its name, so it is assigned a power of 2 higher than the others can get
# to so any addition would preserve all data. (And the code will reach an
# assert(0) on debugging builds should this happen.)
my %lb_actions = (
LB_NOBREAK => 0,
LB_BREAKABLE => 1,
LB_NOBREAK_EVEN_WITH_SP_BETWEEN => 2,
LB_CM_foo => 3, # Rule 9
LB_SP_foo => 6, # Rule 18
LB_PR_or_PO_then_OP_or_HY => 9, # Rule 25
LB_SY_or_IS_then_various => 11, # Rule 25
LB_HY_or_BA_then_foo => 13, # Rule 21
LB_various_then_PO_or_PR => (1<<4), # Rule 25
);
# Construct the LB pair table. This is based on the rules in
# http://www.unicode.org/reports/tr14/, but modified as those rules are
# designed for someone taking a string of text and sequentially going
# through it to find the break opportunities, whereas, Perl requires
# determining if a given random spot is a break opportunity, without
# knowing all the entire string before it.
#
# The table is constructed in reverse order of the rules, to make the
# lower-numbered, higher priority ones override the later ones, as the
# algorithm stops at the earliest matching rule
my @lb_table;
my $table_size = @lb_short_enums;
# LB31. Break everywhere else
for my $i (0 .. $table_size - 1) {
for my $j (0 .. $table_size - 1) {
$lb_table[$i][$j] = $lb_actions{'LB_BREAKABLE'};
}
}
# LB30a. Don't break between Regional Indicators
$lb_table[$lb_enums{'Regional_Indicator'}]
[$lb_enums{'Regional_Indicator'}] = $lb_actions{'LB_NOBREAK'};
# LB30 Do not break between letters, numbers, or ordinary symbols and
# opening or closing parentheses.
# (AL | HL | NU) × OP
$lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Open_Punctuation'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Open_Punctuation'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Numeric'}][$lb_enums{'Open_Punctuation'}]
= $lb_actions{'LB_NOBREAK'};
# CP × (AL | HL | NU)
$lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Alphabetic'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Hebrew_Letter'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Numeric'}]
= $lb_actions{'LB_NOBREAK'};
# LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
# IS × (AL | HL)
$lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Alphabetic'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
= $lb_actions{'LB_NOBREAK'};
# LB28 Do not break between alphabetics (“at”).
# (AL | HL) × (AL | HL)
$lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Alphabetic'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Alphabetic'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Hebrew_Letter'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Hebrew_Letter'}]
= $lb_actions{'LB_NOBREAK'};
# LB27 Treat a Korean Syllable Block the same as ID.
# (JL | JV | JT | H2 | H3) × IN
$lb_table[$lb_enums{'JL'}][$lb_enums{'Inseparable'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'JV'}][$lb_enums{'Inseparable'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'JT'}][$lb_enums{'Inseparable'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'H2'}][$lb_enums{'Inseparable'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'H3'}][$lb_enums{'Inseparable'}]
= $lb_actions{'LB_NOBREAK'};
# (JL | JV | JT | H2 | H3) × PO
$lb_table[$lb_enums{'JL'}][$lb_enums{'Postfix_Numeric'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'JV'}][$lb_enums{'Postfix_Numeric'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'JT'}][$lb_enums{'Postfix_Numeric'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'H2'}][$lb_enums{'Postfix_Numeric'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'H3'}][$lb_enums{'Postfix_Numeric'}]
= $lb_actions{'LB_NOBREAK'};
# PR × (JL | JV | JT | H2 | H3)
$lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JL'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JV'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JT'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H2'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H3'}]
= $lb_actions{'LB_NOBREAK'};
# LB26 Do not break a Korean syllable.
# JL × (JL | JV | H2 | H3)
$lb_table[$lb_enums{'JL'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'JL'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'JL'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'JL'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'};
# (JV | H2) × (JV | JT)
$lb_table[$lb_enums{'JV'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'H2'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'JV'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'H2'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
# (JT | H3) × JT
$lb_table[$lb_enums{'JT'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'H3'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
# LB25 Do not break between the following pairs of classes relevant to
# numbers, as tailored by example 7 in
# http://www.unicode.org/reports/tr14/#Examples
# We follow that tailoring because Unicode's test cases expect it
# (PR | PO) × ( OP | HY )? NU
$lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Numeric'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Numeric'}]
= $lb_actions{'LB_NOBREAK'};
# Given that (OP | HY )? is optional, we have to test for it in code.
# We add in the action (instead of overriding) for this, so that in
# the code we can recover the underlying break value.
$lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Open_Punctuation'}]
+= $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
$lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Open_Punctuation'}]
+= $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
$lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hyphen'}]
+= $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
$lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hyphen'}]
+= $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
# ( OP | HY ) × NU
$lb_table[$lb_enums{'Open_Punctuation'}][$lb_enums{'Numeric'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Hyphen'}][$lb_enums{'Numeric'}]
= $lb_actions{'LB_NOBREAK'};
# NU (NU | SY | IS)* × (NU | SY | IS | CL | CP )
# which can be rewritten as:
# NU (SY | IS)* × (NU | SY | IS | CL | CP )
$lb_table[$lb_enums{'Numeric'}][$lb_enums{'Numeric'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Numeric'}][$lb_enums{'Break_Symbols'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Numeric'}][$lb_enums{'Infix_Numeric'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Punctuation'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Parenthesis'}]
= $lb_actions{'LB_NOBREAK'};
# Like earlier where we have to test in code, we add in the action so
# that we can recover the underlying values. This is done in rules
# below, as well. The code assumes that we haven't added 2 actions.
# Shoul a later Unicode release break that assumption, then tests
# should start failing.
$lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Numeric'}]
+= $lb_actions{'LB_SY_or_IS_then_various'};
$lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Break_Symbols'}]
+= $lb_actions{'LB_SY_or_IS_then_various'};
$lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Infix_Numeric'}]
+= $lb_actions{'LB_SY_or_IS_then_various'};
$lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Punctuation'}]
+= $lb_actions{'LB_SY_or_IS_then_various'};
$lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Parenthesis'}]
+= $lb_actions{'LB_SY_or_IS_then_various'};
$lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Numeric'}]
+= $lb_actions{'LB_SY_or_IS_then_various'};
$lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Break_Symbols'}]
+= $lb_actions{'LB_SY_or_IS_then_various'};
$lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Infix_Numeric'}]
+= $lb_actions{'LB_SY_or_IS_then_various'};
$lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Punctuation'}]
+= $lb_actions{'LB_SY_or_IS_then_various'};
$lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Parenthesis'}]
+= $lb_actions{'LB_SY_or_IS_then_various'};
# NU (NU | SY | IS)* (CL | CP)? × (PO | PR)
# which can be rewritten as:
# NU (SY | IS)* (CL | CP)? × (PO | PR)
$lb_table[$lb_enums{'Numeric'}][$lb_enums{'Postfix_Numeric'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Numeric'}][$lb_enums{'Prefix_Numeric'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Postfix_Numeric'}]
+= $lb_actions{'LB_various_then_PO_or_PR'};
$lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Postfix_Numeric'}]
+= $lb_actions{'LB_various_then_PO_or_PR'};
$lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Postfix_Numeric'}]
+= $lb_actions{'LB_various_then_PO_or_PR'};
$lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Postfix_Numeric'}]
+= $lb_actions{'LB_various_then_PO_or_PR'};
$lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Prefix_Numeric'}]
+= $lb_actions{'LB_various_then_PO_or_PR'};
$lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Prefix_Numeric'}]
+= $lb_actions{'LB_various_then_PO_or_PR'};
$lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Prefix_Numeric'}]
+= $lb_actions{'LB_various_then_PO_or_PR'};
$lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Prefix_Numeric'}]
+= $lb_actions{'LB_various_then_PO_or_PR'};
# LB24 Do not break between prefix and letters or ideographs.
# PR × ID
$lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Ideographic'}]
= $lb_actions{'LB_NOBREAK'};
# PR × (AL | HL)
$lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Alphabetic'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
= $lb_actions{'LB_NOBREAK'};
# PO × (AL | HL)
$lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Alphabetic'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
= $lb_actions{'LB_NOBREAK'};
# LB23 Do not break within ‘a9’, ‘3a’, or ‘H%’.
# ID × PO
$lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Postfix_Numeric'}]
= $lb_actions{'LB_NOBREAK'};
# (AL | HL) × NU
$lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Numeric'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Numeric'}]
= $lb_actions{'LB_NOBREAK'};
# NU × (AL | HL)
$lb_table[$lb_enums{'Numeric'}][$lb_enums{'Alphabetic'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Numeric'}][$lb_enums{'Hebrew_Letter'}]
= $lb_actions{'LB_NOBREAK'};
# LB22 Do not break between two ellipses, or between letters, numbers or
# exclamations and ellipsis.
# (AL | HL) × IN
$lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Inseparable'}]
= $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Inseparable'}]
= $lb_actions{'LB_NOBREAK'};
# Exclamation × IN
$lb_table[$lb_enums{'Exclamation'}][$lb_enums{'Inseparable'}]
= $lb_actions{'LB_NOBREAK'};
# ID × IN
$lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Inseparable'}]
= $lb_actions{'LB_NOBREAK'};
# IN × IN
$lb_table[$lb_enums{'Inseparable'}][$lb_enums{'Inseparable'}]
= $lb_actions{'LB_NOBREAK'};
# NU × IN
$lb_table[$lb_enums{'Numeric'}][$lb_enums{'Inseparable'}]
= $lb_actions{'LB_NOBREAK'};
# LB21b Don’t break between Solidus and Hebrew letters.
# SY × HL
$lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Hebrew_Letter'}]
= $lb_actions{'LB_NOBREAK'};
# LB21a Don't break after Hebrew + Hyphen.
# HL (HY | BA) ×
for my $i (0 .. @lb_table - 1) {
$lb_table[$lb_enums{'Hyphen'}][$i]
+= $lb_actions{'LB_HY_or_BA_then_foo'};
$lb_table[$lb_enums{'Break_After'}][$i]
+= $lb_actions{'LB_HY_or_BA_then_foo'};
}
# LB21 Do not break before hyphen-minus, other hyphens, fixed-width
# spaces, small kana, and other non-starters, or after acute accents.
# × BA
# × HY
# × NS
# BB ×
for my $i (0 .. @lb_table - 1) {
$lb_table[$i][$lb_enums{'Break_After'}] = $lb_actions{'LB_NOBREAK'};
$lb_table[$i][$lb_enums{'Hyphen'}] = $lb_actions{'LB_NOBREAK'};
$lb_table[$i][$lb_enums{'Nonstarter'}] = $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Break_Before'}][$i] = $lb_actions{'LB_NOBREAK'};
}
# LB20 Break before and after unresolved CB.
# ÷ CB
# CB ÷
# Conditional breaks should be resolved external to the line breaking
# rules. However, the default action is to treat unresolved CB as breaking
# before and after.
for my $i (0 .. @lb_table - 1) {
$lb_table[$i][$lb_enums{'Contingent_Break'}]
= $lb_actions{'LB_BREAKABLE'};
$lb_table[$lb_enums{'Contingent_Break'}][$i]
= $lb_actions{'LB_BREAKABLE'};
}
# LB19 Do not break before or after quotation marks, such as ‘ ” ’.
# × QU
# QU ×
for my $i (0 .. @lb_table - 1) {
$lb_table[$i][$lb_enums{'Quotation'}] = $lb_actions{'LB_NOBREAK'};
$lb_table[$lb_enums{'Quotation'}][$i] = $lb_actions{'LB_NOBREAK'};
}
# LB18 Break after spaces
# SP ÷
for my $i (0 .. @lb_table - 1) {
$lb_table[$lb_enums{'Space'}][$i] = $lb_actions{'LB_BREAKABLE'};
}
# LB17 Do not break within ‘——’, even with intervening spaces.
# B2 SP* × B2
$lb_table[$lb_enums{'Break_Both'}][$lb_enums{'Break_Both'}]
= $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
# LB16 Do not break between closing punctuation and a nonstarter even with
# intervening spaces.
# (CL | CP) SP* × NS
$lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Nonstarter'}]
= $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
$lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Nonstarter'}]
= $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
# LB15 Do not break within ‘”[’, even with intervening spaces.
# QU SP* × OP
$lb_table[$lb_enums{'Quotation'}][$lb_enums{'Open_Punctuation'}]
= $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
# LB14 Do not break after ‘[’, even after spaces.
# OP SP* ×
for my $i (0 .. @lb_table - 1) {
$lb_table[$lb_enums{'Open_Punctuation'}][$i]
= $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
}
# LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as
# tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples
# [^NU] × CL
# [^NU] × CP
# × EX
# [^NU] × IS
# [^NU] × SY
for my $i (0 .. @lb_table - 1) {
$lb_table[$i][$lb_enums{'Exclamation'}]
= $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
next if $i == $lb_enums{'Numeric'};
$lb_table[$i][$lb_enums{'Close_Punctuation'}]
= $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
$lb_table[$i][$lb_enums{'Close_Parenthesis'}]
= $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
$lb_table[$i][$lb_enums{'Infix_Numeric'}]
= $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
$lb_table[$i][$lb_enums{'Break_Symbols'}]
= $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
}
# LB12a Do not break before NBSP and related characters, except after
# spaces and hyphens.
# [^SP BA HY] × GL
for my $i (0 .. @lb_table - 1) {
next if $i == $lb_enums{'Space'}
|| $i == $lb_enums{'Break_After'}
|| $i == $lb_enums{'Hyphen'};
# We don't break, but if a property above has said don't break even
# with space between, don't override that (also in the next few rules)
next if $lb_table[$i][$lb_enums{'Glue'}]
== $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
$lb_table[$i][$lb_enums{'Glue'}] = $lb_actions{'LB_NOBREAK'};
}
# LB12 Do not break after NBSP and related characters.
# GL ×
for my $i (0 .. @lb_table - 1) {
next if $lb_table[$lb_enums{'Glue'}][$i]
== $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
$lb_table[$lb_enums{'Glue'}][$i] = $lb_actions{'LB_NOBREAK'};
}
# LB11 Do not break before or after Word joiner and related characters.
# × WJ
# WJ ×
for my $i (0 .. @lb_table - 1) {
if ($lb_table[$i][$lb_enums{'Word_Joiner'}]
!= $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
{
$lb_table[$i][$lb_enums{'Word_Joiner'}] = $lb_actions{'LB_NOBREAK'};
}
if ($lb_table[$lb_enums{'Word_Joiner'}][$i]
!= $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
{
$lb_table[$lb_enums{'Word_Joiner'}][$i] = $lb_actions{'LB_NOBREAK'};
}
}
# Special case this here to avoid having to do a special case in the code,
# by making this the same as other things with a SP in front of them that
# don't break, we avoid an extra test
$lb_table[$lb_enums{'Space'}][$lb_enums{'Word_Joiner'}]
= $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
# LB9 and LB10 are done in the same loop
#
# LB9 Do not break a combining character sequence; treat it as if it has
# the line breaking class of the base character in all of the
# higher-numbered rules.
# Treat X CM* as if it were X.
# where X is any line break class except BK, CR, LF, NL, SP, or ZW.
# LB10 Treat any remaining combining mark as AL. This catches the case
# where a CM is the first character on the line or follows SP, BK, CR, LF,
# NL, or ZW.
for my $i (0 .. @lb_table - 1) {
# When the CM is the first in the pair, we don't know without looking
# behind whether the CM is going to inherit from an earlier character,
# or not. So have to figure this out in the code
$lb_table[$lb_enums{'Combining_Mark'}][$i] = $lb_actions{'LB_CM_foo'};
if ( $i == $lb_enums{'Mandatory_Break'}
|| $i == $lb_enums{'EDGE'}
|| $i == $lb_enums{'Carriage_Return'}
|| $i == $lb_enums{'Line_Feed'}
|| $i == $lb_enums{'Next_Line'}
|| $i == $lb_enums{'Space'}
|| $i == $lb_enums{'ZWSpace'})
{
# For these classes, a following CM doesn't combine, and should do
# whatever 'Alphabetic' would do.
$lb_table[$i][$lb_enums{'Combining_Mark'}]
= $lb_table[$i][$lb_enums{'Alphabetic'}];
}
else {
# For these classes, the CM combines, so doesn't break, inheriting
# the type of nobreak from the master character.
if ($lb_table[$i][$lb_enums{'Combining_Mark'}]
!= $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
{
$lb_table[$i][$lb_enums{'Combining_Mark'}]
= $lb_actions{'LB_NOBREAK'};
}
}
}
# LB8 Break before any character following a zero-width space, even if one
# or more spaces intervene.
# ZW SP* ÷
for my $i (0 .. @lb_table - 1) {
$lb_table[$lb_enums{'ZWSpace'}][$i] = $lb_actions{'LB_BREAKABLE'};
}
# Because of LB8-10, we need to look at context for "SP x", and this must
# be done in the code. So override the existing rules for that, by adding
# a constant to get new rules that tell the code it needs to look at
# context. By adding this action instead of replacing the existing one,
# we can get back to the original rule if necessary.
for my $i (0 .. @lb_table - 1) {
$lb_table[$lb_enums{'Space'}][$i] += $lb_actions{'LB_SP_foo'};
}
# LB7 Do not break before spaces or zero width space.
# × SP
# × ZW
for my $i (0 .. @lb_table - 1) {
$lb_table[$i][$lb_enums{'Space'}] = $lb_actions{'LB_NOBREAK'};
$lb_table[$i][$lb_enums{'ZWSpace'}] = $lb_actions{'LB_NOBREAK'};
}
# LB6 Do not break before hard line breaks.
# × ( BK | CR | LF | NL )
for my $i (0 .. @lb_table - 1) {
$lb_table[$i][$lb_enums{'Mandatory_Break'}] = $lb_actions{'LB_NOBREAK'};
$lb_table[$i][$lb_enums{'Carriage_Return'}] = $lb_actions{'LB_NOBREAK'};
$lb_table[$i][$lb_enums{'Line_Feed'}] = $lb_actions{'LB_NOBREAK'};
$lb_table[$i][$lb_enums{'Next_Line'}] = $lb_actions{'LB_NOBREAK'};
}
# LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
# CR × LF
# CR !
# LF !
# NL !
for my $i (0 .. @lb_table - 1) {
$lb_table[$lb_enums{'Carriage_Return'}][$i]
= $lb_actions{'LB_BREAKABLE'};
$lb_table[$lb_enums{'Line_Feed'}][$i] = $lb_actions{'LB_BREAKABLE'};
$lb_table[$lb_enums{'Next_Line'}][$i] = $lb_actions{'LB_BREAKABLE'};
}
$lb_table[$lb_enums{'Carriage_Return'}][$lb_enums{'Line_Feed'}]
= $lb_actions{'LB_NOBREAK'};
# LB4 Always break after hard line breaks.
# BK !
for my $i (0 .. @lb_table - 1) {
$lb_table[$lb_enums{'Mandatory_Break'}][$i]
= $lb_actions{'LB_BREAKABLE'};
}
# LB2 Never break at the start of text.
# sot ×
# LB3 Always break at the end of text.
# ! eot
# but these are reversed in the loop below, so that won't break if there
# is no text
for my $i (0 .. @lb_table - 1) {
$lb_table[$i][$lb_enums{'EDGE'}] = $lb_actions{'LB_BREAKABLE'};
$lb_table[$lb_enums{'EDGE'}][$i] = $lb_actions{'LB_NOBREAK'};
}
# LB1 Assign a line breaking class to each code point of the input.
# Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
# depending on criteria outside the scope of this algorithm.
#
# In the absence of such criteria all characters with a specific
# combination of original class and General_Category property value are
# resolved as follows:
# Original Resolved General_Category
# AI, SG, XX AL Any
# SA CM Only Mn or Mc
# SA AL Any except Mn and Mc
# CJ NS Any
#
# This is done in mktables, so we never see any of the remapped-from
# classes.
output_table_common('LB', \%lb_actions,
\@lb_table, \@lb_short_enums, \%lb_abbreviations);
}
sub output_WB_table() {
# Create and output the enums, #defines, and pair table for use in
# determining Word Breaks, given in http://www.unicode.org/reports/tr29/.
# This uses the same mechanism in the other bounds tables generated by
# this file. The actions that could override a 0 or 1 are added to those
# numbers; the actions that clearly don't depend on the underlying rule
# simply overwrite
my %wb_actions = (
WB_NOBREAK => 0,
WB_BREAKABLE => 1,
WB_hs_then_hs => 2,
WB_Ex_or_FO_then_foo => 3,
WB_DQ_then_HL => 4,
WB_HL_then_DQ => 6,
WB_LE_or_HL_then_MB_or_ML_or_SQ => 8,
WB_MB_or_ML_or_SQ_then_LE_or_HL => 10,
WB_MB_or_MN_or_SQ_then_NU => 12,
WB_NU_then_MB_or_MN_or_SQ => 14,
);
# Construct the WB pair table.
# The table is constructed in reverse order of the rules, to make the
# lower-numbered, higher priority ones override the later ones, as the
# algorithm stops at the earliest matching rule
my @wb_table;
my $table_size = @wb_short_enums - 1; # -1 because we don't use UNKNOWN
# Otherwise, break everywhere (including around ideographs).
# WB14 Any ÷ Any
for my $i (0 .. $table_size - 1) {
for my $j (0 .. $table_size - 1) {
$wb_table[$i][$j] = $wb_actions{'WB_BREAKABLE'};
}
}
# Do not break between regional indicator symbols.
# WB13c Regional_Indicator × Regional_Indicator
$wb_table[$wb_enums{'Regional_Indicator'}]
[$wb_enums{'Regional_Indicator'}] = $wb_actions{'WB_NOBREAK'};
# Do not break from extenders.
# WB13b ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
$wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ALetter'}]
= $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Hebrew_Letter'}]
= $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Numeric'}]
= $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Katakana'}]
= $wb_actions{'WB_NOBREAK'};
# WB13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet)
# × # ExtendNumLet
$wb_table[$wb_enums{'ALetter'}][$wb_enums{'ExtendNumLet'}]
= $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ExtendNumLet'}]
= $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'Numeric'}][$wb_enums{'ExtendNumLet'}]
= $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'Katakana'}][$wb_enums{'ExtendNumLet'}]
= $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ExtendNumLet'}]
= $wb_actions{'WB_NOBREAK'};
# Do not break between Katakana.
# WB13 Katakana × Katakana
$wb_table[$wb_enums{'Katakana'}][$wb_enums{'Katakana'}]
= $wb_actions{'WB_NOBREAK'};
# Do not break within sequences, such as “3.2” or “3,456.789”.
# WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
$wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNumLet'}]
+= $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
$wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNum'}]
+= $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
$wb_table[$wb_enums{'Numeric'}][$wb_enums{'Single_Quote'}]
+= $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
# WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
$wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Numeric'}]
+= $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
$wb_table[$wb_enums{'MidNum'}][$wb_enums{'Numeric'}]
+= $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
$wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Numeric'}]
+= $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
# Do not break within sequences of digits, or digits adjacent to letters
# (“3a”, or “A3”).
# WB10 Numeric × (ALetter | Hebrew_Letter)
$wb_table[$wb_enums{'Numeric'}][$wb_enums{'ALetter'}]
= $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'Numeric'}][$wb_enums{'Hebrew_Letter'}]
= $wb_actions{'WB_NOBREAK'};
# WB9 (ALetter | Hebrew_Letter) × Numeric
$wb_table[$wb_enums{'ALetter'}][$wb_enums{'Numeric'}]
= $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Numeric'}]
= $wb_actions{'WB_NOBREAK'};
# WB8 Numeric × Numeric
$wb_table[$wb_enums{'Numeric'}][$wb_enums{'Numeric'}]
= $wb_actions{'WB_NOBREAK'};
# Do not break letters across certain punctuation.
# WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
$wb_table[$wb_enums{'Double_Quote'}][$wb_enums{'Hebrew_Letter'}]
+= $wb_actions{'WB_DQ_then_HL'};
# WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Double_Quote'}]
+= $wb_actions{'WB_HL_then_DQ'};
# WB7a Hebrew_Letter × Single_Quote
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
= $wb_actions{'WB_NOBREAK'};
# WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)
# × (ALetter | Hebrew_Letter)
$wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'ALetter'}]
+= $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
$wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Hebrew_Letter'}]
+= $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
$wb_table[$wb_enums{'MidLetter'}][$wb_enums{'ALetter'}]
+= $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
$wb_table[$wb_enums{'MidLetter'}][$wb_enums{'Hebrew_Letter'}]
+= $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
$wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'ALetter'}]
+= $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
$wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Hebrew_Letter'}]
+= $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
# WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
# | Single_Quote) (ALetter | Hebrew_Letter)
$wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidNumLet'}]
+= $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidNumLet'}]
+= $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
$wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidLetter'}]
+= $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidLetter'}]
+= $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
$wb_table[$wb_enums{'ALetter'}][$wb_enums{'Single_Quote'}]
+= $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
+= $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
# Do not break between most letters.
# WB5 (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
$wb_table[$wb_enums{'ALetter'}][$wb_enums{'ALetter'}]
= $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'ALetter'}][$wb_enums{'Hebrew_Letter'}]
= $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ALetter'}]
= $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Hebrew_Letter'}]
= $wb_actions{'WB_NOBREAK'};
# Ignore Format and Extend characters, except when they appear at the
# beginning of a region of text.
# WB4 X (Extend | Format)* → X
for my $i (0 .. @wb_table - 1) {
$wb_table[$wb_enums{'Extend'}][$i]
= $wb_actions{'WB_Ex_or_FO_then_foo'};
$wb_table[$wb_enums{'Format'}][$i]
= $wb_actions{'WB_Ex_or_FO_then_foo'};
}
# Implied is that these attach to the character before them, except for
# the characters that mark the end of a region of text. The rules below
# override the ones set up here, for all the characters that need
# overriding.
for my $i (0 .. @wb_table - 1) {
$wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'};
$wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
}
# Break before and after white space
# WB3b ÷ (Newline | CR | LF)
# WB3a (Newline | CR | LF) ÷
# et. al.
for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
for my $j (0 .. @wb_table - 1) {
$wb_table[$j][$wb_enums{$i}] = $wb_actions{'WB_BREAKABLE'};
$wb_table[$wb_enums{$i}][$j] = $wb_actions{'WB_BREAKABLE'};
}
}
# But do not break within white space.
# WB3 CR × LF
# et.al.
for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
for my $j ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
$wb_table[$wb_enums{$i}][$wb_enums{$j}] = $wb_actions{'WB_NOBREAK'};
}
}
# And do not break horizontal space followed by Extend or Format
$wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Extend'}]
= $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Format'}]
= $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'Perl_Tailored_HSpace'}]
[$wb_enums{'Perl_Tailored_HSpace'}]
= $wb_actions{'WB_hs_then_hs'};
# Break at the start and end of text.
# WB2 ÷ eot
# WB1 sot ÷
for my $i (0 .. @wb_table - 1) {
$wb_table[$i][$wb_enums{'EDGE'}] = $wb_actions{'WB_BREAKABLE'};
$wb_table[$wb_enums{'EDGE'}][$i] = $wb_actions{'WB_BREAKABLE'};
}
# But, unspecified by Unicode, we shouldn't break on an empty string.
$wb_table[$wb_enums{'EDGE'}][$wb_enums{'EDGE'}] = 0;
output_table_common('WB', \%wb_actions,
\@wb_table, \@wb_short_enums, \%wb_abbreviations);
}
output_invlist("Latin1", [ 0, 256 ]);
output_invlist("AboveLatin1", [ 256 ]);
end_file_pound_if;
# We construct lists for all the POSIX and backslash sequence character
# classes in two forms:
# 1) ones which match only in the ASCII range
# 2) ones which match either in the Latin1 range, or the entire Unicode range
#
# These get compiled in, and hence affect the memory footprint of every Perl
# program, even those not using Unicode. To minimize the size, currently
# the Latin1 version is generated for the beyond ASCII range except for those
# lists that are quite small for the entire range, such as for \s, which is 22
# UVs long plus 4 UVs (currently) for the header.
#
# To save even more memory, the ASCII versions could be derived from the
# larger ones at runtime, saving some memory (minus the expense of the machine
# instructions to do so), but these are all small anyway, so their total is
# about 100 UVs.
#
# In the list of properties below that get generated, the L1 prefix is a fake
# property that means just the Latin1 range of the full property (whose name
# has an X prefix instead of L1).
#
# An initial & means to use the subroutine from this file instead of an
# official inversion list.
for my $charset (get_supported_code_pages()) {
print $out_fh "\n" . get_conditional_compile_line_start($charset);
@a2n = @{get_a2n($charset)};
no warnings 'qw';
# Ignore non-alpha in sort
for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
ASCII
Cased
VertSpace
XPerlSpace
XPosixAlnum
XPosixAlpha
XPosixBlank
XPosixCntrl
XPosixDigit
XPosixGraph
XPosixLower
XPosixPrint
XPosixPunct
XPosixSpace
XPosixUpper
XPosixWord
XPosixXDigit
_Perl_Any_Folds
&NonL1_Perl_Non_Final_Folds
_Perl_Folds_To_Multi_Char
&UpperLatin1
_Perl_IDStart
_Perl_IDCont
_Perl_GCB,EDGE
_Perl_LB,EDGE
_Perl_SB,EDGE
_Perl_WB,EDGE,UNKNOWN
)
) {
# For the Latin1 properties, we change to use the eXtended version of the
# base property, then go through the result and get rid of everything not
# in Latin1 (above 255). Actually, we retain the element for the range
# that crosses the 255/256 boundary if it is one that matches the
# property. For example, in the Word property, there is a range of code
# points that start at U+00F8 and goes through U+02C1. Instead of
# artificially cutting that off at 256 because 256 is the first code point
# above Latin1, we let the range go to its natural ending. That gives us
# extra information with no added space taken. But if the range that
# crosses the boundary is one that doesn't match the property, we don't
# start a new range above 255, as that could be construed as going to
# infinity. For example, the Upper property doesn't include the character
# at 255, but does include the one at 256. We don't include the 256 one.
my $prop_name = $prop;
my $is_local_sub = $prop_name =~ s/^&//;
my $extra_enums = "";
$extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
my $lookup_prop = $prop_name;
my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
or $lookup_prop =~ s/^L1//);
my $nonl1_only = 0;
$nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
my @invlist;
my @invmap;
my $map_format;
my $map_default;
my $maps_to_code_point;
my $to_adjust;
if ($is_local_sub) {
@invlist = eval $lookup_prop;
die $@ if $@;
}
else {
@invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
if (! @invlist) {
# If couldn't find a non-empty inversion list, see if it is
# instead an inversion map
my ($list_ref, $map_ref, $format, $default)
= prop_invmap($lookup_prop, '_perl_core_internal_ok');
if (! $list_ref) {
# An empty return here could mean an unknown property, or
# merely that the original inversion list is empty. Call
# in scalar context to differentiate
my $count = prop_invlist($lookup_prop,
'_perl_core_internal_ok');
die "Could not find inversion list for '$lookup_prop'"
unless defined $count;
}
else {
@invlist = @$list_ref;
@invmap = @$map_ref;
$map_format = $format;
$map_default = $default;
$maps_to_code_point = $map_format =~ /x/;
$to_adjust = $map_format =~ /a/;
}
}
}
# Short-circuit an empty inversion list.
if (! @invlist) {
output_invlist($prop_name, \@invlist, $charset);
next;
}
# Re-order the Unicode code points to native ones for this platform.
# This is only needed for code points below 256, because native code
# points are only in that range. For inversion maps of properties
# where the mappings are adjusted (format =~ /a/), this reordering
# could mess up the adjustment pattern that was in the input, so that
# has to be dealt with.
#
# And inversion maps that map to code points need to eventually have
# all those code points remapped to native, and it's better to do that
# here, going through the whole list not just those below 256. This
# is because some inversion maps have adjustments (format =~ /a/)
# which may be affected by the reordering. This code needs to be done
# both for when we are translating the inversion lists for < 256, and
# for the inversion maps for everything. By doing both in this loop,
# we can share that code.
#
# So, we go through everything for an inversion map to code points;
# otherwise, we can skip any remapping at all if we are going to
# output only the above-Latin1 values, or if the range spans the whole
# of 0..256, as the remap will also include all of 0..256 (256 not
# 255 because a re-ordering could cause 256 to need to be in the same
# range as 255.)
if ((@invmap && $maps_to_code_point)
|| (! $nonl1_only || ($invlist[0] < 256
&& ! ($invlist[0] == 0 && $invlist[1] > 256))))
{
if (! @invmap) { # Straight inversion list
# Look at all the ranges that start before 257.
my @latin1_list;
while (@invlist) {
last if $invlist[0] > 256;
my $upper = @invlist > 1
? $invlist[1] - 1 # In range
# To infinity. You may want to stop much much
# earlier; going this high may expose perl
# deficiencies with very large numbers.
: $Unicode::UCD::MAX_CP;
for my $j ($invlist[0] .. $upper) {
push @latin1_list, a2n($j);
}
shift @invlist; # Shift off the range that's in the list
shift @invlist; # Shift off the range not in the list
}
# Here @invlist contains all the ranges in the original that start
# at code points above 256, and @latin1_list contains all the
# native code points for ranges that start with a Unicode code
# point below 257. We sort the latter and convert it to inversion
# list format. Then simply prepend it to the list of the higher
# code points.
@latin1_list = sort { $a <=> $b } @latin1_list;
@latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
unshift @invlist, @latin1_list;
}
else { # Is an inversion map
# This is a similar procedure as plain inversion list, but has
# multiple buckets. A plain inversion list just has two
# buckets, 1) 'in' the list; and 2) 'not' in the list, and we
# pretty much can ignore the 2nd bucket, as it is completely
# defined by the 1st. But here, what we do is create buckets
# which contain the code points that map to each, translated
# to native and turned into an inversion list. Thus each
# bucket is an inversion list of native code points that map
# to it or don't map to it. We use these to create an
# inversion map for the whole property.
# As mentioned earlier, we use this procedure to not just
# remap the inversion list to native values, but also the maps
# of code points to native ones. In the latter case we have
# to look at the whole of the inversion map (or at least to
# above Unicode; as the maps of code points above that should
# all be to the default).
my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;
my %mapped_lists; # A hash whose keys are the buckets.
while (@invlist) {
last if $invlist[0] > $upper_limit;
# This shouldn't actually happen, as prop_invmap() returns
# an extra element at the end that is beyond $upper_limit
die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;
my $bucket;
# A hash key can't be a ref (we are only expecting arrays
# of scalars here), so convert any such to a string that
# will be converted back later (using a vertical tab as
# the separator). Even if the mapping is to code points,
# we don't translate to native here because the code
# output_map() calls to output these arrays assumes the
# input is Unicode, not native.
if (ref $invmap[0]) {
$bucket = join "\cK", @{$invmap[0]};
}
elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
# Do convert to native for maps to single code points.
# There are some properties that have a few outlier
# maps that aren't code points, so the above test
# skips those.
$bucket = a2n($invmap[0]);
} else {
$bucket = $invmap[0];
}
# We now have the bucket that all code points in the range
# map to, though possibly they need to be adjusted. Go
# through the range and put each translated code point in
# it into its bucket.
my $base_map = $invmap[0];
for my $j ($invlist[0] .. $invlist[1] - 1) {
if ($to_adjust
# The 1st code point doesn't need adjusting
&& $j > $invlist[0]
# Skip any non-numeric maps: these are outliers
# that aren't code points.
&& $base_map =~ $numeric_re
# 'ne' because the default can be a string
&& $base_map ne $map_default)
{
# We adjust, by incrementing each the bucket and
# the map. For code point maps, translate to
# native
$base_map++;
$bucket = ($maps_to_code_point)
? a2n($base_map)
: $base_map;
}
# Add the native code point to the bucket for the
# current map
push @{$mapped_lists{$bucket}}, a2n($j);
} # End of loop through all code points in the range
# Get ready for the next range
shift @invlist;
shift @invmap;
} # End of loop through all ranges in the map.
# Here, @invlist and @invmap retain all the ranges from the
# originals that start with code points above $upper_limit.
# Each bucket in %mapped_lists contains all the code points
# that map to that bucket. If the bucket is for a map to a
# single code point is a single code point, the bucket has
# been converted to native. If something else (including
# multiple code points), no conversion is done.
#
# Now we recreate the inversion map into %xlated, but this
# time for the native character set.
my %xlated;
foreach my $bucket (keys %mapped_lists) {
# Sort and convert this bucket to an inversion list. The
# result will be that ranges that start with even-numbered
# indexes will be for code points that map to this bucket;
# odd ones map to some other bucket, and are discarded
# below.
@{$mapped_lists{$bucket}}
= sort{ $a <=> $b} @{$mapped_lists{$bucket}};
@{$mapped_lists{$bucket}}
= mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});
# Add each even-numbered range in the bucket to %xlated;
# so that the keys of %xlated become the range start code
# points, and the values are their corresponding maps.
while (@{$mapped_lists{$bucket}}) {
my $range_start = $mapped_lists{$bucket}->[0];
if ($bucket =~ /\cK/) {
@{$xlated{$range_start}} = split /\cK/, $bucket;
}
else {
$xlated{$range_start} = $bucket;
}
shift @{$mapped_lists{$bucket}}; # Discard odd ranges
shift @{$mapped_lists{$bucket}}; # Get ready for next
# iteration
}
} # End of loop through all the buckets.
# Here %xlated's keys are the range starts of all the code
# points in the inversion map. Construct an inversion list
# from them.
my @new_invlist = sort { $a <=> $b } keys %xlated;
# If the list is adjusted, we want to munge this list so that
# we only have one entry for where consecutive code points map
# to consecutive values. We just skip the subsequent entries
# where this is the case.
if ($to_adjust) {
my @temp;
for my $i (0 .. @new_invlist - 1) {
next if $i > 0
&& $new_invlist[$i-1] + 1 == $new_invlist[$i]
&& $xlated{$new_invlist[$i-1]} =~ $numeric_re
&& $xlated{$new_invlist[$i]} =~ $numeric_re
&& $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
push @temp, $new_invlist[$i];
}
@new_invlist = @temp;
}
# The inversion map comes from %xlated's values. We can
# unshift each onto the front of the untouched portion, in
# reverse order of the portion we did process.
foreach my $start (reverse @new_invlist) {
unshift @invmap, $xlated{$start};
}
# Finally prepend the inversion list we have just constructed to the
# one that contains anything we didn't process.
unshift @invlist, @new_invlist;
}
}
# prop_invmap() returns an extra final entry, which we can now
# discard.
if (@invmap) {
pop @invlist;
pop @invmap;
}
if ($l1_only) {
die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
for my $i (0 .. @invlist - 1 - 1) {
if ($invlist[$i] > 255) {
# In an inversion list, even-numbered elements give the code
# points that begin ranges that match the property;
# odd-numbered give ones that begin ranges that don't match.
# If $i is odd, we are at the first code point above 255 that
# doesn't match, which means the range it is ending does
# match, and crosses the 255/256 boundary. We want to include
# this ending point, so increment $i, so the splice below
# includes it. Conversely, if $i is even, it is the first
# code point above 255 that matches, which means there was no
# matching range that crossed the boundary, and we don't want
# to include this code point, so splice before it.
$i++ if $i % 2 != 0;
# Remove everything past this.
splice @invlist, $i;
splice @invmap, $i if @invmap;
last;
}
}
}
elsif ($nonl1_only) {
my $found_nonl1 = 0;
for my $i (0 .. @invlist - 1 - 1) {
next if $invlist[$i] < 256;
# Here, we have the first element in the array that indicates an
# element above Latin1. Get rid of all previous ones.
splice @invlist, 0, $i;
splice @invmap, 0, $i if @invmap;
# If this one's index is not divisible by 2, it means that this
# element is inverting away from being in the list, which means
# all code points from 256 to this one are in this list (or
# map to the default for inversion maps)
if ($i % 2 != 0) {
unshift @invlist, 256;
unshift @invmap, $map_default if @invmap;
}
$found_nonl1 = 1;
last;
}
die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
}
output_invlist($prop_name, \@invlist, $charset);
output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
}
end_file_pound_if;
print $out_fh "\n" . get_conditional_compile_line_end();
}
switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');
output_GCB_table();
output_LB_table();
output_WB_table();
end_file_pound_if;
my $sources_list = "lib/unicore/mktables.lst";
my @sources = ($0, qw(lib/unicore/mktables
lib/Unicode/UCD.pm
regen/charset_translations.pl
));
{
# Depend on mktables’ own sources. It’s a shorter list of files than
# those that Unicode::UCD uses.
if (! open my $mktables_list, $sources_list) {
# This should force a rebuild once $sources_list exists
push @sources, $sources_list;
}
else {
while(<$mktables_list>) {
last if /===/;
chomp;
push @sources, "lib/unicore/$_" if /^[^#]/;
}
}
}
read_only_bottom_close_and_rename($out_fh, \@sources);