# Common subroutines and constants, called by .t files in this directory that
# deal with UTF-8
# The test files can't use byte_utf8a_to_utf8n() from t/charset_tools.pl
# because that uses the same functions we are testing here. So UTF-EBCDIC
# strings are hard-coded as I8 strings in this file instead, and we use the
# translation functions to/from I8 from that file instead.
sub isASCII { ord "A" == 65 }
sub display_bytes_no_quotes {
use bytes;
my $string = shift;
return join("", map { sprintf("\\x%02x", ord $_) } split "", $string)
}
sub display_bytes {
return '"' . display_bytes_no_quotes(shift) . '"';
}
sub output_warnings(@) {
my @list = @_;
if (@list) {
diag "The warnings were:\n" . join "\n", map { chomp; $_ } @list;
}
else {
diag "No warnings were raised";
}
}
sub start_byte_to_cont($) {
# Extract the code point information from the input UTF-8 start byte, and
# return a continuation byte containing the same information. This is
# used in constructing an overlong malformation from valid input.
my $byte = shift;
my $len = test_UTF8_SKIP($byte);
if ($len < 2) {
die "start_byte_to_cont() is expecting a UTF-8 variant";
}
$byte = ord native_to_I8($byte);
# Copied from utf8.h. This gets rid of the leading 1 bits.
$byte &= ((($len) >= 7) ? 0x00 : (0x1F >> (($len)-2)));
$byte |= (isASCII) ? 0x80 : 0xA0;
return I8_to_native(chr $byte);
}
$::is64bit = length sprintf("%x", ~0) > 8;
$::lowest_continuation = (isASCII) ? 0x80 : 0xA0;
$::I8c = (isASCII) ? "\x80" : "\xa0"; # A continuation byte
$::max_bytes = (isASCII) ? 13 : 14; # Max number of bytes in a UTF-8 sequence
# representing a single code point
# Copied from utf8.h
$::UTF8_ALLOW_EMPTY = 0x0001;
$::UTF8_GOT_EMPTY = $UTF8_ALLOW_EMPTY;
$::UTF8_ALLOW_CONTINUATION = 0x0002;
$::UTF8_GOT_CONTINUATION = $UTF8_ALLOW_CONTINUATION;
$::UTF8_ALLOW_NON_CONTINUATION = 0x0004;
$::UTF8_GOT_NON_CONTINUATION = $UTF8_ALLOW_NON_CONTINUATION;
$::UTF8_ALLOW_SHORT = 0x0008;
$::UTF8_GOT_SHORT = $UTF8_ALLOW_SHORT;
$::UTF8_ALLOW_LONG = 0x0010;
$::UTF8_ALLOW_LONG_AND_ITS_VALUE = $UTF8_ALLOW_LONG|0x0020;
$::UTF8_GOT_LONG = $UTF8_ALLOW_LONG;
$::UTF8_ALLOW_OVERFLOW = 0x0080;
$::UTF8_GOT_OVERFLOW = $UTF8_ALLOW_OVERFLOW;
$::UTF8_DISALLOW_SURROGATE = 0x0100;
$::UTF8_GOT_SURROGATE = $UTF8_DISALLOW_SURROGATE;
$::UTF8_WARN_SURROGATE = 0x0200;
$::UTF8_DISALLOW_NONCHAR = 0x0400;
$::UTF8_GOT_NONCHAR = $UTF8_DISALLOW_NONCHAR;
$::UTF8_WARN_NONCHAR = 0x0800;
$::UTF8_DISALLOW_SUPER = 0x1000;
$::UTF8_GOT_SUPER = $UTF8_DISALLOW_SUPER;
$::UTF8_WARN_SUPER = 0x2000;
$::UTF8_DISALLOW_PERL_EXTENDED = 0x4000;
$::UTF8_GOT_PERL_EXTENDED = $UTF8_DISALLOW_PERL_EXTENDED;
$::UTF8_WARN_PERL_EXTENDED = 0x8000;
$::UTF8_CHECK_ONLY = 0x10000;
$::UTF8_NO_CONFIDENCE_IN_CURLEN_ = 0x20000;
$::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
= $UTF8_DISALLOW_SUPER|$UTF8_DISALLOW_SURROGATE;
$::UTF8_DISALLOW_ILLEGAL_INTERCHANGE
= $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE|$UTF8_DISALLOW_NONCHAR;
$::UTF8_WARN_ILLEGAL_C9_INTERCHANGE
= $UTF8_WARN_SUPER|$UTF8_WARN_SURROGATE;
$::UTF8_WARN_ILLEGAL_INTERCHANGE
= $UTF8_WARN_ILLEGAL_C9_INTERCHANGE|$UTF8_WARN_NONCHAR;
# Test uvchr_to_utf8().
$::UNICODE_WARN_SURROGATE = 0x0001;
$::UNICODE_WARN_NONCHAR = 0x0002;
$::UNICODE_WARN_SUPER = 0x0004;
$::UNICODE_WARN_PERL_EXTENDED = 0x0008;
$::UNICODE_DISALLOW_SURROGATE = 0x0010;
$::UNICODE_DISALLOW_NONCHAR = 0x0020;
$::UNICODE_DISALLOW_SUPER = 0x0040;
$::UNICODE_DISALLOW_PERL_EXTENDED = 0x0080;