#!perl -w
# This is a base file to be used by various .t's in its directory
# It tests various malformed UTF-8 sequences and some code points that are
# "problematic", and verifies that the correct warnings/flags etc are
# generated when using them. For the code points, it also takes the UTF-8 and
# perturbs it to be malformed in various ways, and tests that this gets
# appropriately detected.
use strict;
use Test::More;
BEGIN {
use_ok('XS::APItest');
require 'charset_tools.pl';
require './t/utf8_setup.pl';
};
$|=1;
use XS::APItest;
my @warnings_gotten;
use warnings 'utf8';
local $SIG{__WARN__} = sub { my @copy = @_;
push @warnings_gotten, map { chomp; $_ } @copy;
};
my $highest_non_extended_utf8_cp = (isASCII) ? 0x7FFFFFFF : 0x3FFFFFFF;
my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation);
# C5 is chosen as it is valid for both ASCII and EBCDIC platforms
my $known_start_byte = I8_to_native("\xC5");
sub requires_extended_utf8($) {
# Returns a boolean as to whether or not the code point parameter fits
# into 31 bits, subject to the convention that a negative code point
# stands for one that overflows the word size, so won't fit in 31 bits.
return shift > $highest_non_extended_utf8_cp;
}
sub is_extended_utf8($) {
# Returns a boolean as to whether or not the input UTF-8 sequence uses
# Perl extended UTF-8.
my $byte = substr(shift, 0, 1);
return ord $byte >= 0xFE if isASCII;
return $byte == I8_to_native("\xFF");
}
sub overflow_discern_len($) {
# Returns how many bytes are needed to tell if a non-overlong UTF-8
# sequence is for a code point that won't fit in the platform's word size.
# Only the length of the sequence representing a single code point is
# needed.
if (isASCII) {
return ($::is64bit) ? 3 : 1;
# Below is needed for code points above IV_MAX
#return ($::is64bit) ? 3 : ((shift == $::max_bytes)
# ? 1
# : 2);
}
return ($::is64bit) ? 2 : 8;
}
sub overlong_discern_len($) {
# Returns how many bytes are needed to tell if the input UTF-8 sequence
# for a code point is overlong
my $string = shift;
my $length = length $string;
my $byte = ord native_to_I8(substr($string, 0, 1));
if (isASCII) {
return ($byte >= 0xFE)
? ((! $::is64bit)
? 1
: ($byte == 0xFF) ? 7 : 2)
: (($length == 2) ? 1 : 2);
# Below is needed for code points above IV_MAX
#return ($length == $::max_bytes)
# # This is constrained to 1 on 32-bit machines, as it
# # overflows there
# ? (($::is64bit) ? 7 : 1)
# : (($length == 2) ? 1 : 2);
}
return ($length == $::max_bytes) ? 8 : (($length <= 3) ? 1 : 2);
}
my @tests;
{
no warnings qw(portable overflow);
@tests = (
# $testname,
# $bytes, UTF-8 string
# $allowed_uv, code point $bytes evaluates to; -1 if
# overflows
# $needed_to_discern_len optional, how long an initial substring do
# we need to tell that the string must be for
# a code point in the category it falls in,
# like being a surrogate; 0 indicates we need
# the whole string. Some categories have a
# default that is used if this is omitted.
[ "orphan continuation byte malformation",
I8_to_native("$::I8c"),
0xFFFD,
1,
],
[ "overlong malformation, lowest 2-byte",
(isASCII) ? "\xc0\x80" : I8_to_native("\xc0\xa0"),
0, # NUL
],
[ "overlong malformation, highest 2-byte",
(isASCII) ? "\xc1\xbf" : I8_to_native("\xc4\xbf"),
(isASCII) ? 0x7F : utf8::unicode_to_native(0x9F),
],
[ "overlong malformation, lowest 3-byte",
(isASCII) ? "\xe0\x80\x80" : I8_to_native("\xe0\xa0\xa0"),
0, # NUL
],
[ "overlong malformation, highest 3-byte",
(isASCII) ? "\xe0\x9f\xbf" : I8_to_native("\xe0\xbf\xbf"),
(isASCII) ? 0x7FF : 0x3FF,
],
[ "lowest surrogate",
(isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
0xD800,
],
[ "a middle surrogate",
(isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
0xD90D,
],
[ "highest surrogate",
(isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
0xDFFF,
],
[ "first of 32 consecutive non-character code points",
(isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
0xFDD0,
],
[ "a mid non-character code point of the 32 consecutive ones",
(isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
0xFDE0,
],
[ "final of 32 consecutive non-character code points",
(isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
0xFDEF,
],
[ "non-character code point U+FFFE",
(isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
0xFFFE,
],
[ "non-character code point U+FFFF",
(isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
0xFFFF,
],
[ "overlong malformation, lowest 4-byte",
(isASCII) ? "\xf0\x80\x80\x80" : I8_to_native("\xf0\xa0\xa0\xa0"),
0, # NUL
],
[ "overlong malformation, highest 4-byte",
(isASCII) ? "\xf0\x8F\xbf\xbf" : I8_to_native("\xf0\xaf\xbf\xbf"),
(isASCII) ? 0xFFFF : 0x3FFF,
],
[ "non-character code point U+1FFFE",
(isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
0x1FFFE,
],
[ "non-character code point U+1FFFF",
(isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
0x1FFFF,
],
[ "non-character code point U+2FFFE",
(isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
0x2FFFE,
],
[ "non-character code point U+2FFFF",
(isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
0x2FFFF,
],
[ "non-character code point U+3FFFE",
(isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
0x3FFFE,
],
[ "non-character code point U+3FFFF",
(isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
0x3FFFF,
],
[ "non-character code point U+4FFFE",
(isASCII)
? "\xf1\x8f\xbf\xbe"
: I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
0x4FFFE,
],
[ "non-character code point U+4FFFF",
(isASCII)
? "\xf1\x8f\xbf\xbf"
: I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
0x4FFFF,
],
[ "non-character code point U+5FFFE",
(isASCII)
? "\xf1\x9f\xbf\xbe"
: I8_to_native("\xf8\xab\xbf\xbf\xbe"),
0x5FFFE,
],
[ "non-character code point U+5FFFF",
(isASCII)
? "\xf1\x9f\xbf\xbf"
: I8_to_native("\xf8\xab\xbf\xbf\xbf"),
0x5FFFF,
],
[ "non-character code point U+6FFFE",
(isASCII)
? "\xf1\xaf\xbf\xbe"
: I8_to_native("\xf8\xad\xbf\xbf\xbe"),
0x6FFFE,
],
[ "non-character code point U+6FFFF",
(isASCII)
? "\xf1\xaf\xbf\xbf"
: I8_to_native("\xf8\xad\xbf\xbf\xbf"),
0x6FFFF,
],
[ "non-character code point U+7FFFE",
(isASCII)
? "\xf1\xbf\xbf\xbe"
: I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
0x7FFFE,
],
[ "non-character code point U+7FFFF",
(isASCII)
? "\xf1\xbf\xbf\xbf"
: I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
0x7FFFF,
],
[ "non-character code point U+8FFFE",
(isASCII)
? "\xf2\x8f\xbf\xbe"
: I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
0x8FFFE,
],
[ "non-character code point U+8FFFF",
(isASCII)
? "\xf2\x8f\xbf\xbf"
: I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
0x8FFFF,
],
[ "non-character code point U+9FFFE",
(isASCII)
? "\xf2\x9f\xbf\xbe"
: I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
0x9FFFE,
],
[ "non-character code point U+9FFFF",
(isASCII)
? "\xf2\x9f\xbf\xbf"
: I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
0x9FFFF,
],
[ "non-character code point U+AFFFE",
(isASCII)
? "\xf2\xaf\xbf\xbe"
: I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
0xAFFFE,
],
[ "non-character code point U+AFFFF",
(isASCII)
? "\xf2\xaf\xbf\xbf"
: I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
0xAFFFF,
],
[ "non-character code point U+BFFFE",
(isASCII)
? "\xf2\xbf\xbf\xbe"
: I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
0xBFFFE,
],
[ "non-character code point U+BFFFF",
(isASCII)
? "\xf2\xbf\xbf\xbf"
: I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
0xBFFFF,
],
[ "non-character code point U+CFFFE",
(isASCII)
? "\xf3\x8f\xbf\xbe"
: I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
0xCFFFE,
],
[ "non-character code point U+CFFFF",
(isASCII)
? "\xf3\x8f\xbf\xbf"
: I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
0xCFFFF,
],
[ "non-character code point U+DFFFE",
(isASCII)
? "\xf3\x9f\xbf\xbe"
: I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
0xDFFFE,
],
[ "non-character code point U+DFFFF",
(isASCII)
? "\xf3\x9f\xbf\xbf"
: I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
0xDFFFF,
],
[ "non-character code point U+EFFFE",
(isASCII)
? "\xf3\xaf\xbf\xbe"
: I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
0xEFFFE,
],
[ "non-character code point U+EFFFF",
(isASCII)
? "\xf3\xaf\xbf\xbf"
: I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
0xEFFFF,
],
[ "non-character code point U+FFFFE",
(isASCII)
? "\xf3\xbf\xbf\xbe"
: I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
0xFFFFE,
],
[ "non-character code point U+FFFFF",
(isASCII)
? "\xf3\xbf\xbf\xbf"
: I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
0xFFFFF,
],
[ "non-character code point U+10FFFE",
(isASCII)
? "\xf4\x8f\xbf\xbe"
: I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
0x10FFFE,
],
[ "non-character code point U+10FFFF",
(isASCII)
? "\xf4\x8f\xbf\xbf"
: I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
0x10FFFF,
],
[ "first non_unicode",
(isASCII)
? "\xf4\x90\x80\x80"
: I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
0x110000,
2,
],
[ "non_unicode whose first byte tells that",
(isASCII)
? "\xf5\x80\x80\x80"
: I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
(isASCII) ? 0x140000 : 0x200000,
1,
],
[ "overlong malformation, lowest 5-byte",
(isASCII)
? "\xf8\x80\x80\x80\x80"
: I8_to_native("\xf8\xa0\xa0\xa0\xa0"),
0, # NUL
],
[ "overlong malformation, highest 5-byte",
(isASCII)
? "\xf8\x87\xbf\xbf\xbf"
: I8_to_native("\xf8\xa7\xbf\xbf\xbf"),
(isASCII) ? 0x1FFFFF : 0x3FFFF,
],
[ "overlong malformation, lowest 6-byte",
(isASCII)
? "\xfc\x80\x80\x80\x80\x80"
: I8_to_native("\xfc\xa0\xa0\xa0\xa0\xa0"),
0, # NUL
],
[ "overlong malformation, highest 6-byte",
(isASCII)
? "\xfc\x83\xbf\xbf\xbf\xbf"
: I8_to_native("\xfc\xa3\xbf\xbf\xbf\xbf"),
(isASCII) ? 0x3FFFFFF : 0x3FFFFF,
],
[ "overlong malformation, lowest 7-byte",
(isASCII)
? "\xfe\x80\x80\x80\x80\x80\x80"
: I8_to_native("\xfe\xa0\xa0\xa0\xa0\xa0\xa0"),
0, # NUL
],
[ "overlong malformation, highest 7-byte",
(isASCII)
? "\xfe\x81\xbf\xbf\xbf\xbf\xbf"
: I8_to_native("\xfe\xa1\xbf\xbf\xbf\xbf\xbf"),
(isASCII) ? 0x7FFFFFFF : 0x3FFFFFF,
],
[ "highest 31 bit code point",
(isASCII)
? "\xfd\xbf\xbf\xbf\xbf\xbf"
: I8_to_native(
"\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf"),
0x7FFFFFFF,
1,
],
[ "lowest 32 bit code point",
(isASCII)
? "\xfe\x82\x80\x80\x80\x80\x80"
: I8_to_native(
"\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
($::is64bit) ? 0x80000000 : -1, # Overflows on 32-bit systems
1,
],
# Used when UV_MAX is allowed as a code point
#[ "highest 32 bit code point",
# (isASCII)
# ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
# : I8_to_native(
# "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
# 0xFFFFFFFF,
#],
#[ "Lowest 33 bit code point",
# (isASCII)
# ? "\xfe\x84\x80\x80\x80\x80\x80"
# : I8_to_native(
# "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
# ($::is64bit) ? 0x100000000 : 0x0, # Overflows on 32-bit systems
#],
);
if (! $::is64bit) {
if (isASCII) {
push @tests,
[ "overlong malformation, but naively looks like overflow",
"\xff\x80\x80\x80\x80\x80\x80\x81\xbf\xbf\xbf\xbf\xbf",
0x7FFFFFFF,
],
# Used when above IV_MAX are allowed.
#[ "overlong malformation, but naively looks like overflow",
# "\xff\x80\x80\x80\x80\x80\x80\x83\xbf\xbf\xbf\xbf\xbf",
# 0xFFFFFFFF,
#],
[ "overflow that old algorithm failed to detect",
"\xfe\x86\x80\x80\x80\x80\x80",
-1,
];
}
}
push @tests,
[ "overlong malformation, lowest max-byte",
(isASCII)
? "\xff\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
: I8_to_native(
"\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
0, # NUL
],
[ "overlong malformation, highest max-byte",
(isASCII) # 2**36-1 on ASCII; 2**30-1 on EBCDIC
? "\xff\x80\x80\x80\x80\x80\x80\xbf\xbf\xbf\xbf\xbf\xbf"
: I8_to_native(
"\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xbf\xbf\xbf\xbf\xbf\xbf"),
(isASCII) ? (($::is64bit) ? 0xFFFFFFFFF : -1) : 0x3FFFFFFF,
];
if (isASCII) {
push @tests,
[ "Lowest code point requiring 13 bytes to represent", # 2**36
"\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
($::is64bit) ? 0x1000000000 : -1, # overflows on 32bit
],
};
if ($::is64bit) {
push @tests,
[ "highest 63 bit code point",
(isASCII)
? "\xff\x80\x87\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
: I8_to_native(
"\xff\xa7\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"),
0x7FFFFFFFFFFFFFFF,
(isASCII) ? 1 : 2,
],
[ "first 64 bit code point",
(isASCII)
? "\xff\x80\x88\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
: I8_to_native(
"\xff\xa8\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
-1,
];
# Used when UV_MAX is allowed as a code point
#[ "highest 64 bit code point",
# (isASCII)
# ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
# : I8_to_native(
# "\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"),
# 0xFFFFFFFFFFFFFFFF,
# (isASCII) ? 1 : 2,
#],
#[ "first 65 bit code point",
# (isASCII)
# ? "\xff\x80\x9f\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
# : I8_to_native(
# "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
# 0,
#];
if (isASCII) {
push @tests,
[ "overflow that old algorithm failed to detect",
"\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
-1,
];
}
else {
push @tests, # These could falsely show wrongly in a naive
# implementation
[ "requires at least 32 bits",
I8_to_native(
"\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
0x800000000,
],
[ "requires at least 32 bits",
I8_to_native(
"\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
0x10000000000,
],
[ "requires at least 32 bits",
I8_to_native(
"\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
0x200000000000,
],
[ "requires at least 32 bits",
I8_to_native(
"\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
0x4000000000000,
],
[ "requires at least 32 bits",
I8_to_native(
"\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
0x80000000000000,
],
[ "requires at least 32 bits",
I8_to_native(
"\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
0x1000000000000000,
];
}
}
}
sub flags_to_text($$)
{
my ($flags, $flags_to_text_ref) = @_;
# Returns a string containing a mnemonic representation of the bits that
# are set in the $flags. These are assumed to be flag bits. The return
# looks like "FOO|BAR|BAZ". The second parameter is a reference to an
# array that gives the textual representation of all the possible flags.
# Element 0 is the text for the bit 0 flag; element 1 for bit 1; .... If
# no bits at all are set the string "0" is returned;
my @flag_text;
my $shift = 0;
return "0" if $flags == 0;
while ($flags) {
#diag sprintf "%x", $flags;
if ($flags & 1) {
push @flag_text, $flags_to_text_ref->[$shift];
}
$shift++;
$flags >>= 1;
}
return join "|", @flag_text;
}
# Possible flag returns from utf8n_to_uvchr_error(). These should have G_,
# instead of A_, D_, but the prefixes will be used in a a later commit, so
# minimize churn by having them here.
my @utf8n_flags_to_text = ( qw(
A_EMPTY
A_CONTINUATION
A_NON_CONTINUATION
A_SHORT
A_LONG
A_LONG_AND_ITS_VALUE
PLACEHOLDER
A_OVERFLOW
D_SURROGATE
W_SURROGATE
D_NONCHAR
W_NONCHAR
D_SUPER
W_SUPER
D_PERL_EXTENDED
W_PERL_EXTENDED
CHECK_ONLY
NO_CONFIDENCE_IN_CURLEN_
) );
sub utf8n_display_call($)
{
# Converts an eval string that calls test_utf8n_to_uvchr into a more human
# readable form, and returns it. Doesn't work if the byte string contains
# an apostrophe. The return will look something like:
# test_utf8n_to_uvchr_error('$bytes', $length, $flags)
#diag $_[0];
$_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x;
my $text1 = $1; # Everything before the byte string
my $bytes = $2;
my $text2 = $3; # Includes the length
my $flags = $4;
return $text1
. display_bytes($bytes)
. $text2
. flags_to_text($flags, \@utf8n_flags_to_text)
. ')';
}
sub uvchr_display_call($)
{
# Converts an eval string that calls test_uvchr_to_utf8 into a more human
# readable form, and returns it. The return will look something like:
# test_uvchr_to_utf8n_flags($uv, $flags)
#diag $_[0];
my @flags_to_text = ( qw(
W_SURROGATE
W_NONCHAR
W_SUPER
W_PERL_EXTENDED
D_SURROGATE
D_NONCHAR
D_SUPER
D_PERL_EXTENDED
) );
$_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x;
my $text = $1;
my $cp = sprintf "%X", $2;
my $flags = $3;
return "${text}0x$cp, " . flags_to_text($flags, \@flags_to_text) . ')';
}
sub do_warnings_test(@)
{
my @expected_warnings = @_;
# Compares the input expected warnings array with @warnings_gotten,
# generating a pass for each found, removing it from @warnings_gotten.
# Any discrepancies generate test failures. Returns TRUE if no
# discrepcancies; otherwise FALSE.
my $succeeded = 1;
if (@expected_warnings == 0) {
if (! is(@warnings_gotten, 0, " Expected and got no warnings")) {
output_warnings(@warnings_gotten);
$succeeded = 0;
}
return $succeeded;
}
# Check that we got all the expected warnings,
# removing each one found
WARNING:
foreach my $expected (@expected_warnings) {
foreach (my $i = 0; $i < @warnings_gotten; $i++) {
if ($warnings_gotten[$i] =~ $expected) {
pass(" Expected and got warning: "
. " $warnings_gotten[$i]");
splice @warnings_gotten, $i, 1;
next WARNING;
}
}
fail(" Expected a warning that matches "
. $expected . " but didn't get it");
$succeeded = 0;
}
if (! is(@warnings_gotten, 0, " Got no unexpected warnings")) {
output_warnings(@warnings_gotten);
$succeeded = 0;
}
return $succeeded;
}
# This test is split into this number of files.
my $num_test_files = $ENV{TEST_JOBS} || 1;
$num_test_files = 10 if $num_test_files > 10;
my $test_count = -1;
foreach my $test (@tests) {
$test_count++;
next if $test_count % $num_test_files != $::TEST_CHUNK;
my ($testname, $bytes, $allowed_uv, $needed_to_discern_len) = @$test;
my $length = length $bytes;
my $initially_overlong = $testname =~ /overlong/;
my $initially_orphan = $testname =~ /orphan/;
my $will_overflow = $allowed_uv < 0;
my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv);
my $display_bytes = display_bytes($bytes);
my $controlling_warning_category;
my $utf8n_flag_to_warn;
my $utf8n_flag_to_disallow;
my $uvchr_flag_to_warn;
my $uvchr_flag_to_disallow;
# We want to test that the independent flags are actually independent.
# For example, that a surrogate doesn't trigger a non-character warning,
# and conversely, turning off an above-Unicode flag doesn't suppress a
# surrogate warning. Earlier versions of this file used nested loops to
# test all possible combinations. But that creates lots of tests, making
# this run too long. What is now done instead is to use the complement of
# the category we are testing to greatly reduce the combinatorial
# explosion. For example, if we have a surrogate and we aren't expecting
# a warning about it, we set all the flags for non-surrogates to raise
# warnings. If one shows up, it indicates the flags aren't independent.
my $utf8n_flag_to_warn_complement;
my $utf8n_flag_to_disallow_complement;
my $uvchr_flag_to_warn_complement;
my $uvchr_flag_to_disallow_complement;
# Many of the code points being tested are middling in that if code point
# edge cases work, these are very likely to as well. Because this test
# file takes a while to execute, we skip testing the edge effects of code
# points deemed middling, while testing their basics and continuing to
# fully test the non-middling code points.
my $skip_most_tests = 0;
my $cp_message_qr; # Pattern that matches the message raised when
# that message contains the problematic code
# point. The message is the same (currently) both
# when going from/to utf8.
my $non_cp_trailing_text; # The suffix text when the message doesn't
# contain a code point. (This is a result of
# some sort of malformation that means we
# can't get an exact code poin
my $extended_cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
\Q requires a Perl extension, and so is not\E
\Q portable\E/x;
my $extended_non_cp_trailing_text
= "is a Perl extension, and so is not portable";
# What bytes should have been used to specify a code point that has been
# specified as an overlong.
my $correct_bytes_for_overlong;
# Is this test malformed from the beginning? If so, we know to generally
# expect that the tests will show it isn't valid.
my $initially_malformed = 0;
if ($initially_overlong || $initially_orphan) {
$non_cp_trailing_text = "if you see this, there is an error";
$cp_message_qr = qr/\Q$non_cp_trailing_text\E/;
$initially_malformed = 1;
$utf8n_flag_to_warn = 0;
$utf8n_flag_to_disallow = 0;
$utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE;
$utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE;
if (! $will_overflow && $allowed_uv <= 0x10FFFF) {
$utf8n_flag_to_warn_complement |= $::UTF8_WARN_SUPER;
$utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_SUPER;
if (($allowed_uv & 0xFFFF) != 0xFFFF) {
$utf8n_flag_to_warn_complement |= $::UTF8_WARN_NONCHAR;
$utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_NONCHAR;
}
}
if (! is_extended_utf8($bytes)) {
$utf8n_flag_to_warn_complement |= $::UTF8_WARN_PERL_EXTENDED;
$utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_PERL_EXTENDED;
}
$controlling_warning_category = 'utf8';
if ($initially_overlong) {
if (! defined $needed_to_discern_len) {
$needed_to_discern_len = overlong_discern_len($bytes);
}
$correct_bytes_for_overlong = display_bytes_no_quotes(chr $allowed_uv);
}
}
elsif($will_overflow || $allowed_uv > 0x10FFFF) {
# Set the SUPER flags; later, we test for PERL_EXTENDED as well.
$utf8n_flag_to_warn = $::UTF8_WARN_SUPER;
$utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER;
$uvchr_flag_to_warn = $::UNICODE_WARN_SUPER;
$uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;;
# Below, we add the flags for non-perl_extended to the code points
# that don't fit that category. Special tests are done for this
# category in the inner loop.
$utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR
|$::UTF8_WARN_SURROGATE;
$utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
|$::UTF8_DISALLOW_SURROGATE;
$uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR
|$::UNICODE_WARN_SURROGATE;
$uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
|$::UNICODE_DISALLOW_SURROGATE;
$controlling_warning_category = 'non_unicode';
if ($will_overflow) { # This is realy a malformation
$non_cp_trailing_text = "if you see this, there is an error";
$cp_message_qr = qr/\Q$non_cp_trailing_text\E/;
$initially_malformed = 1;
if (! defined $needed_to_discern_len) {
$needed_to_discern_len = overflow_discern_len($length);
}
}
elsif (requires_extended_utf8($allowed_uv)) {
$cp_message_qr = $extended_cp_message_qr;
$non_cp_trailing_text = $extended_non_cp_trailing_text;
$needed_to_discern_len = 1 unless defined $needed_to_discern_len;
}
else {
$cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
\Q may not be portable\E/x;
$non_cp_trailing_text = "is for a non-Unicode code point, may not"
. " be portable";
$utf8n_flag_to_warn_complement |= $::UTF8_WARN_PERL_EXTENDED;
$utf8n_flag_to_disallow_complement
|= $::UTF8_DISALLOW_PERL_EXTENDED;
$uvchr_flag_to_warn_complement |= $::UNICODE_WARN_PERL_EXTENDED;
$uvchr_flag_to_disallow_complement
|= $::UNICODE_DISALLOW_PERL_EXTENDED;
}
}
elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
$cp_message_qr = qr/UTF-16 surrogate U\+$uv_string/;
$non_cp_trailing_text = "is for a surrogate";
$needed_to_discern_len = 2 unless defined $needed_to_discern_len;
$skip_most_tests = 1 if $allowed_uv > 0xD800 && $allowed_uv < 0xDFFF;
$utf8n_flag_to_warn = $::UTF8_WARN_SURROGATE;
$utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE;
$uvchr_flag_to_warn = $::UNICODE_WARN_SURROGATE;
$uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;;
$utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR
|$::UTF8_WARN_SUPER
|$::UTF8_WARN_PERL_EXTENDED;
$utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
|$::UTF8_DISALLOW_SUPER
|$::UTF8_DISALLOW_PERL_EXTENDED;
$uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR
|$::UNICODE_WARN_SUPER
|$::UNICODE_WARN_PERL_EXTENDED;
$uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
|$::UNICODE_DISALLOW_SUPER
|$::UNICODE_DISALLOW_PERL_EXTENDED;
$controlling_warning_category = 'surrogate';
}
elsif ( ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
|| ($allowed_uv & 0xFFFE) == 0xFFFE)
{
$cp_message_qr = qr/\QUnicode non-character U+$uv_string\E
\Q is not recommended for open interchange\E/x;
$non_cp_trailing_text = "if you see this, there is an error";
$needed_to_discern_len = $length unless defined $needed_to_discern_len;
if ( ($allowed_uv > 0xFDD0 && $allowed_uv < 0xFDEF)
|| ($allowed_uv > 0xFFFF && $allowed_uv < 0x10FFFE))
{
$skip_most_tests = 1;
}
$utf8n_flag_to_warn = $::UTF8_WARN_NONCHAR;
$utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR;
$uvchr_flag_to_warn = $::UNICODE_WARN_NONCHAR;
$uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;;
$utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE
|$::UTF8_WARN_SUPER
|$::UTF8_WARN_PERL_EXTENDED;
$utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE
|$::UTF8_DISALLOW_SUPER
|$::UTF8_DISALLOW_PERL_EXTENDED;
$uvchr_flag_to_warn_complement = $::UNICODE_WARN_SURROGATE
|$::UNICODE_WARN_SUPER
|$::UNICODE_WARN_PERL_EXTENDED;
$uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_SURROGATE
|$::UNICODE_DISALLOW_SUPER
|$::UNICODE_DISALLOW_PERL_EXTENDED;
$controlling_warning_category = 'nonchar';
}
else {
die "Can't figure out what type of warning to test for $testname"
}
die 'Didn\'t set $needed_to_discern_len for ' . $testname
unless defined $needed_to_discern_len;
# We try various combinations of malformations that can occur
foreach my $short (0, 1) {
next if $skip_most_tests && $short;
foreach my $unexpected_noncont (0, 1) {
next if $skip_most_tests && $unexpected_noncont;
foreach my $overlong (0, 1) {
next if $overlong && $skip_most_tests;
next if $initially_overlong && ! $overlong;
# If we're creating an overlong, it can't be longer than the
# maximum length, so skip if we're already at that length.
next if (! $initially_overlong && $overlong)
&& $length >= $::max_bytes;
my $this_cp_message_qr = $cp_message_qr;
my $this_non_cp_trailing_text = $non_cp_trailing_text;
foreach my $malformed_allow_type (0..2) {
# 0 don't allow this malformation; ignored if no malformation
# 1 allow, with REPLACEMENT CHARACTER returned
# 2 allow, with intended code point returned. All malformations
# other than overlong can't determine the intended code point,
# so this isn't valid for them.
next if $malformed_allow_type == 2
&& ($will_overflow || $short || $unexpected_noncont);
next if $skip_most_tests && $malformed_allow_type;
# Here we are in the innermost loop for malformations. So we
# know which ones are in effect. Can now change the input to be
# appropriately malformed. We also can set up certain other
# things now, like whether we expect a return flag from this
# malformation, and which flag.
my $this_bytes = $bytes;
my $this_length = $length;
my $this_expected_len = $length;
my $this_needed_to_discern_len = $needed_to_discern_len;
my @malformation_names;
my @expected_malformation_warnings;
my @expected_malformation_return_flags;
# Contains the flags for any allowed malformations. Currently no
# combinations of on/off are tested for. It's either all are
# allowed, or none are.
my $allow_flags = 0;
my $overlong_is_in_perl_extended_utf8 = 0;
my $dont_use_overlong_cp = 0;
if ($initially_orphan) {
next if $overlong || $short || $unexpected_noncont;
}
if ($overlong) {
if (! $initially_overlong) {
my $new_expected_len;
# To force this malformation, we convert the original start
# byte into a continuation byte with the same data bits as
# originally. ...
my $start_byte = substr($this_bytes, 0, 1);
my $converted_to_continuation_byte
= start_byte_to_cont($start_byte);
# ... Then we prepend it with a known overlong sequence.
# This should evaluate to the exact same code point as the
# original. We try to avoid an overlong using Perl
# extended UTF-8. The code points are the highest
# representable as overlongs on the respective platform
# without using extended UTF-8.
if (native_to_I8($start_byte) lt "\xFC") {
$start_byte = I8_to_native("\xFC");
$new_expected_len = 6;
}
elsif (! isASCII && native_to_I8($start_byte) lt "\xFE") {
# FE is not extended UTF-8 on EBCDIC
$start_byte = I8_to_native("\xFE");
$new_expected_len = 7;
}
else { # Must use extended UTF-8. On ASCII platforms, we
# could express some overlongs here starting with
# \xFE, but there's no real reason to do so.
$overlong_is_in_perl_extended_utf8 = 1;
$start_byte = I8_to_native("\xFF");
$new_expected_len = $::max_bytes;
$this_cp_message_qr = $extended_cp_message_qr;
# The warning that gets raised doesn't include the
# code point in the message if the code point can be
# expressed without using extended UTF-8, but the
# particular overlong sequence used is in extended
# UTF-8. To do otherwise would be confusing to the
# user, as it would claim the code point requires
# extended, when it doesn't.
$dont_use_overlong_cp = 1
unless requires_extended_utf8($allowed_uv);
$this_non_cp_trailing_text
= $extended_non_cp_trailing_text;
}
# Splice in the revise continuation byte, preceded by the
# start byte and the proper number of the lowest
# continuation bytes.
$this_bytes = $start_byte
. ($native_lowest_continuation_chr
x ( $new_expected_len
- 1
- length($this_bytes)))
. $converted_to_continuation_byte
. substr($this_bytes, 1);
$this_length = length($this_bytes);
$this_needed_to_discern_len = $new_expected_len
- ( $this_expected_len
- $this_needed_to_discern_len);
$this_expected_len = $new_expected_len;
}
}
if ($short) {
# To force this malformation, just tell the test to not look
# as far as it should into the input.
$this_length--;
$this_expected_len--;
$allow_flags |= $::UTF8_ALLOW_SHORT if $malformed_allow_type;
}
if ($unexpected_noncont) {
# To force this malformation, change the final continuation
# byte into a start byte.
my $pos = ($short) ? -2 : -1;
substr($this_bytes, $pos, 1) = $known_start_byte;
$this_expected_len--;
}
# The whole point of a test that is malformed from the beginning
# is to test for that malformation. If we've modified things so
# much that we don't have enough information to detect that
# malformation, there's no point in testing.
next if $initially_malformed
&& $this_expected_len < $this_needed_to_discern_len;
# Here, we've transformed the input with all of the desired
# non-overflow malformations. We are now in a position to
# construct any potential warnings for those malformations. But
# it's a pain to get the detailed messages exactly right, so for
# now XXX, only do so for those that return an explicit code
# point.
if ($initially_orphan) {
push @malformation_names, "orphan continuation";
push @expected_malformation_return_flags,
$::UTF8_GOT_CONTINUATION;
$allow_flags |= $::UTF8_ALLOW_CONTINUATION
if $malformed_allow_type;
push @expected_malformation_warnings, qr/unexpected continuation/;
}
if ($overlong) {
push @malformation_names, 'overlong';
push @expected_malformation_return_flags, $::UTF8_GOT_LONG;
# If one of the other malformation types is also in effect, we
# don't know what the intended code point was.
if ($short || $unexpected_noncont || $will_overflow) {
push @expected_malformation_warnings, qr/overlong/;
}
else {
my $wrong_bytes = display_bytes_no_quotes(
substr($this_bytes, 0, $this_length));
if (! defined $correct_bytes_for_overlong) {
$correct_bytes_for_overlong
= display_bytes_no_quotes($bytes);
}
my $prefix = ($allowed_uv > 0x10FFFF) ? "0x" : "U+";
push @expected_malformation_warnings,
qr/\QMalformed UTF-8 character: $wrong_bytes\E
\Q (overlong; instead use\E
\Q $correct_bytes_for_overlong to\E
\Q represent $prefix$uv_string)/x;
}
if ($malformed_allow_type == 2) {
$allow_flags |= $::UTF8_ALLOW_LONG_AND_ITS_VALUE;
}
elsif ($malformed_allow_type) {
$allow_flags |= $::UTF8_ALLOW_LONG;
}
}
if ($short) {
push @malformation_names, 'short';
push @expected_malformation_return_flags, $::UTF8_GOT_SHORT;
push @expected_malformation_warnings, qr/too short/;
}
if ($unexpected_noncont) {
push @malformation_names, 'unexpected non-continuation';
push @expected_malformation_return_flags,
$::UTF8_GOT_NON_CONTINUATION;
$allow_flags |= $::UTF8_ALLOW_NON_CONTINUATION
if $malformed_allow_type;
push @expected_malformation_warnings,
qr/unexpected non-continuation byte/;
}
# The overflow malformation is done differently than other
# malformations. It comes from manually typed tests in the test
# array. We now make it be treated like one of the other
# malformations. But some has to be deferred until the inner loop
my $overflow_msg_pattern;
if ($will_overflow) {
push @malformation_names, 'overflow';
$overflow_msg_pattern = display_bytes_no_quotes(
substr($this_bytes, 0, $this_expected_len));
$overflow_msg_pattern = qr/\QMalformed UTF-8 character:\E
\Q $overflow_msg_pattern\E
\Q (overflows)\E/x;
push @expected_malformation_return_flags, $::UTF8_GOT_OVERFLOW;
$allow_flags |= $::UTF8_ALLOW_OVERFLOW if $malformed_allow_type;
}
# And we can create the malformation-related text for the the test
# names we eventually will generate.
my $malformations_name = "";
if (@malformation_names) {
$malformations_name .= "dis" unless $malformed_allow_type;
$malformations_name .= "allowed ";
$malformations_name .= "malformation";
$malformations_name .= "s" if @malformation_names > 1;
$malformations_name .= ": ";
$malformations_name .= join "/", @malformation_names;
$malformations_name = " ($malformations_name)";
}
# Done setting up the malformation related stuff
{ # First test the isFOO calls
use warnings; # XXX no warnings 'deprecated'; # Make sure these don't raise warnings
undef @warnings_gotten;
my $ret = test_isUTF8_CHAR($this_bytes, $this_length);
my $ret_flags
= test_isUTF8_CHAR_flags($this_bytes, $this_length, 0);
if ($malformations_name) {
is($ret, 0, "For $testname$malformations_name: isUTF8_CHAR() returns 0");
is($ret_flags, 0, " And isUTF8_CHAR_flags() returns 0");
}
else {
is($ret, $this_length, "For $testname: isUTF8_CHAR() returns"
. " expected length: $this_length");
is($ret_flags, $this_length,
" And isUTF8_CHAR_flags(...,0) returns expected"
. " length: $this_length");
}
is(scalar @warnings_gotten, 0,
" And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags"
. " generated any warnings")
or output_warnings(@warnings_gotten);
undef @warnings_gotten;
$ret = test_isSTRICT_UTF8_CHAR($this_bytes, $this_length);
if ($malformations_name) {
is($ret, 0, " And isSTRICT_UTF8_CHAR() returns 0");
}
else {
my $expected_ret
= ( $testname =~ /surrogate|non-character/
|| $allowed_uv > 0x10FFFF)
? 0
: $this_length;
is($ret, $expected_ret,
" And isSTRICT_UTF8_CHAR() returns expected"
. " length: $expected_ret");
$ret = test_isUTF8_CHAR_flags($this_bytes, $this_length,
$::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
is($ret, $expected_ret,
" And isUTF8_CHAR_flags('"
. "DISALLOW_ILLEGAL_INTERCHANGE') acts like"
. " isSTRICT_UTF8_CHAR");
}
is(scalar @warnings_gotten, 0,
" And neither isSTRICT_UTF8_CHAR() nor"
. " isUTF8_CHAR_flags generated any warnings")
or output_warnings(@warnings_gotten);
undef @warnings_gotten;
$ret = test_isC9_STRICT_UTF8_CHAR($this_bytes, $this_length);
if ($malformations_name) {
is($ret, 0, " And isC9_STRICT_UTF8_CHAR() returns 0");
}
else {
my $expected_ret = ( $testname =~ /surrogate/
|| $allowed_uv > 0x10FFFF)
? 0
: $this_expected_len;
is($ret, $expected_ret, " And isC9_STRICT_UTF8_CHAR()"
. " returns expected length:"
. " $expected_ret");
$ret = test_isUTF8_CHAR_flags($this_bytes, $this_length,
$::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
is($ret, $expected_ret,
" And isUTF8_CHAR_flags('"
. "DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like"
. " isC9_STRICT_UTF8_CHAR");
}
is(scalar @warnings_gotten, 0,
" And neither isC9_STRICT_UTF8_CHAR() nor"
. " isUTF8_CHAR_flags generated any warnings")
or output_warnings(@warnings_gotten);
foreach my $disallow_type (0..2) {
# 0 is don't disallow this type of code point
# 1 is do disallow
# 2 is do disallow, but only code points requiring
# perl-extended-UTF8
my $disallow_flags;
my $expected_ret;
if ($malformations_name) {
# Malformations are by default disallowed, so testing
# with $disallow_type equal to 0 is sufficicient.
next if $disallow_type;
$disallow_flags = 0;
$expected_ret = 0;
}
elsif ($disallow_type == 1) {
$disallow_flags = $utf8n_flag_to_disallow;
$expected_ret = 0;
}
elsif ($disallow_type == 2) {
next if ! requires_extended_utf8($allowed_uv);
$disallow_flags = $::UTF8_DISALLOW_PERL_EXTENDED;
$expected_ret = 0;
}
else { # type is 0
$disallow_flags = $utf8n_flag_to_disallow_complement;
$expected_ret = $this_length;
}
$ret = test_isUTF8_CHAR_flags($this_bytes, $this_length,
$disallow_flags);
is($ret, $expected_ret,
" And isUTF8_CHAR_flags($display_bytes,"
. " $disallow_flags) returns $expected_ret")
or diag "The flags mean "
. flags_to_text($disallow_flags,
\@utf8n_flags_to_text);
is(scalar @warnings_gotten, 0,
" And isUTF8_CHAR_flags(...) generated"
. " no warnings")
or output_warnings(@warnings_gotten);
# Test partial character handling, for each byte not a
# full character
my $did_test_partial = 0;
for (my $j = 1; $j < $this_length - 1; $j++) {
$did_test_partial = 1;
my $partial = substr($this_bytes, 0, $j);
my $ret_should_be;
my $comment;
if ($disallow_type || $malformations_name) {
$ret_should_be = 0;
$comment = "disallowed";
# The number of bytes required to tell if a
# sequence has something wrong is the smallest of
# all the things wrong with it. We start with the
# number for this type of code point, if that is
# disallowed; or the whole length if not. The
# latter is what a couple of the malformations
# require.
my $needed_to_tell = ($disallow_type)
? $this_needed_to_discern_len
: $this_expected_len;
# Then we see if the malformations that are
# detectable early in the string are present.
if ($overlong) {
my $dl = overlong_discern_len($this_bytes);
$needed_to_tell = $dl if $dl < $needed_to_tell;
}
if ($will_overflow) {
my $dl = overflow_discern_len($length);
$needed_to_tell = $dl if $dl < $needed_to_tell;
}
if ($j < $needed_to_tell) {
$ret_should_be = 1;
$comment .= ", but need $needed_to_tell"
. " bytes to discern:";
}
}
else {
$ret_should_be = 1;
$comment = "allowed";
}
undef @warnings_gotten;
$ret = test_is_utf8_valid_partial_char_flags($partial,
$j, $disallow_flags);
is($ret, $ret_should_be,
" And is_utf8_valid_partial_char_flags("
. display_bytes($partial)
. ", $disallow_flags), $comment: returns"
. " $ret_should_be")
or diag "The flags mean "
. flags_to_text($disallow_flags, \@utf8n_flags_to_text);
}
if ($did_test_partial) {
is(scalar @warnings_gotten, 0,
" And is_utf8_valid_partial_char_flags()"
. " generated no warnings for any of the lengths")
or output_warnings(@warnings_gotten);
}
}
}
# Now test the to/from UTF-8 calls. There are several orthogonal
# variables involved. We test most possible combinations
foreach my $do_disallow (0, 1) {
if ($do_disallow) {
next if $initially_overlong || $initially_orphan;
}
else {
next if $skip_most_tests;
}
# We classify the warnings into certain "interesting" types,
# described later
foreach my $warning_type (0..4) {
next if $skip_most_tests && $warning_type != 1;
foreach my $use_warn_flag (0, 1) {
if ($use_warn_flag) {
next if $initially_overlong || $initially_orphan;
}
else {
next if $skip_most_tests;
}
# Finally, here is the inner loop
my $this_utf8n_flag_to_warn = $utf8n_flag_to_warn;
my $this_utf8n_flag_to_disallow = $utf8n_flag_to_disallow;
my $this_uvchr_flag_to_warn = $uvchr_flag_to_warn;
my $this_uvchr_flag_to_disallow = $uvchr_flag_to_disallow;
my $eval_warn;
my $expect_regular_warnings;
my $expect_warnings_for_malformed;
my $expect_warnings_for_overflow;
if ($warning_type == 0) {
$eval_warn = "use warnings";
$expect_regular_warnings = $use_warn_flag;
# We ordinarily expect overflow warnings here. But it
# is somewhat more complicated, and the final
# determination is deferred to one place in the filw
# where we handle overflow.
$expect_warnings_for_overflow = 1;
# We would ordinarily expect malformed warnings in
# this case, but not if malformations are allowed.
$expect_warnings_for_malformed
= $malformed_allow_type == 0;
}
elsif ($warning_type == 1) {
$eval_warn = "no warnings";
$expect_regular_warnings = 0;
$expect_warnings_for_overflow = 0;
$expect_warnings_for_malformed = 0;
}
elsif ($warning_type == 2) {
$eval_warn = "no warnings; use warnings 'utf8'";
$expect_regular_warnings = $use_warn_flag;
$expect_warnings_for_overflow = 1;
$expect_warnings_for_malformed
= $malformed_allow_type == 0;
}
elsif ($warning_type == 3) {
$eval_warn = "no warnings; use warnings"
. " '$controlling_warning_category'";
$expect_regular_warnings = $use_warn_flag;
$expect_warnings_for_overflow
= $controlling_warning_category eq 'non_unicode';
$expect_warnings_for_malformed = 0;
}
elsif ($warning_type == 4) { # Like type 3, but uses the
# PERL_EXTENDED flags
# The complement flags were set up so that the
# PERL_EXTENDED flags have been tested that they don't
# trigger wrongly for too small code points. And the
# flags have been set up so that those small code
# points are tested for being above Unicode. What's
# left to test is that the large code points do
# trigger the PERL_EXTENDED flags.
next if ! requires_extended_utf8($allowed_uv);
next if $controlling_warning_category ne 'non_unicode';
$eval_warn = "no warnings; use warnings 'non_unicode'";
$expect_regular_warnings = 1;
$expect_warnings_for_overflow = 1;
$expect_warnings_for_malformed = 0;
$this_utf8n_flag_to_warn = $::UTF8_WARN_PERL_EXTENDED;
$this_utf8n_flag_to_disallow
= $::UTF8_DISALLOW_PERL_EXTENDED;
$this_uvchr_flag_to_warn
= $::UNICODE_WARN_PERL_EXTENDED;
$this_uvchr_flag_to_disallow
= $::UNICODE_DISALLOW_PERL_EXTENDED;
}
else {
die "Unexpected warning type '$warning_type'";
}
# We only need to test the case where all warnings are
# enabled (type 0) to see if turning off the warning flag
# causes things to not be output. If those pass, then
# turning on some sub-category of warnings, or turning off
# warnings altogether are extremely likely to not output
# warnings either, given how the warnings subsystem is
# supposed to work, and this file assumes it does work.
next if $warning_type != 0 && ! $use_warn_flag;
# The convention is that the 'got' flag is the same value
# as the disallow one. If this were violated, the tests
# here should start failing.
my $return_flag = $this_utf8n_flag_to_disallow;
# If we aren't expecting warnings/disallow for this, turn
# on all the other flags. That makes sure that they all
# are independent of this flag, and so we don't need to
# test them individually.
my $this_warning_flags
= ($use_warn_flag)
? $this_utf8n_flag_to_warn
: ($overlong_is_in_perl_extended_utf8
? ($utf8n_flag_to_warn_complement
& ~$::UTF8_WARN_PERL_EXTENDED)
: $utf8n_flag_to_warn_complement);
my $this_disallow_flags
= ($do_disallow)
? $this_utf8n_flag_to_disallow
: ($overlong_is_in_perl_extended_utf8
? ($utf8n_flag_to_disallow_complement
& ~$::UTF8_DISALLOW_PERL_EXTENDED)
: $utf8n_flag_to_disallow_complement);
my $expected_uv = $allowed_uv;
my $this_uv_string = $uv_string;
my @expected_return_flags
= @expected_malformation_return_flags;
my @expected_warnings;
push @expected_warnings, @expected_malformation_warnings
if $expect_warnings_for_malformed;
# The overflow malformation is done differently than other
# malformations. It comes from manually typed tests in
# the test array, but it also is above Unicode and uses
# Perl extended UTF-8, so affects some of the flags being
# tested. We now make it be treated like one of the other
# generated malformations.
if ($will_overflow) {
# An overflow is (way) above Unicode, and overrides
# everything else.
$expect_regular_warnings = 0;
# Earlier, we tentatively calculated whether this
# should emit a message or not. It's tentative
# because, even if we ordinarily would output it, we
# don't if malformations are allowed -- except an
# overflow is also a SUPER and PERL_EXTENDED, and if
# warnings for those are enabled, the overflow
# warning does get raised.
if ( $expect_warnings_for_overflow
&& ( $malformed_allow_type == 0
|| ( $this_warning_flags
& ($::UTF8_WARN_SUPER
|$::UTF8_WARN_PERL_EXTENDED))))
{
push @expected_warnings, $overflow_msg_pattern;
}
}
# It may be that the malformations have shortened the
# amount of input we look at so much that we can't tell
# what the category the code point was in. Otherwise, set
# up the expected return flags based on the warnings and
# disallowments.
if ($this_expected_len < $this_needed_to_discern_len) {
$expect_regular_warnings = 0;
}
elsif ( ($this_warning_flags & $this_utf8n_flag_to_warn)
|| ( $this_disallow_flags
& $this_utf8n_flag_to_disallow))
{
push @expected_return_flags, $return_flag;
}
# Finish setting up the expected warning.
if ($expect_regular_warnings) {
# So far the array contains warnings generated by
# malformations. Add the expected regular one.
unshift @expected_warnings, $this_cp_message_qr;
# But it may need to be modified, because either of
# these malformations means we can't determine the
# expected code point.
if ( $short || $unexpected_noncont
|| $dont_use_overlong_cp)
{
my $first_byte = substr($this_bytes, 0, 1);
$expected_warnings[0] = display_bytes(
substr($this_bytes, 0, $this_expected_len));
$expected_warnings[0]
= qr/[Aa]\Qny UTF-8 sequence that starts with\E
\Q $expected_warnings[0]\E
\Q $this_non_cp_trailing_text\E/x;
}
}
# Is effectively disallowed if we've set up a malformation
# (unless malformations are allowed), even if the flag
# indicates it is allowed. Fix up test name to indicate
# this as well
my $disallowed = 0;
if ( $this_disallow_flags & $this_utf8n_flag_to_disallow
&& $this_expected_len >= $this_needed_to_discern_len)
{
$disallowed = 1;
}
if ($malformations_name) {
if ($malformed_allow_type == 0) {
$disallowed = 1;
}
elsif ($malformed_allow_type == 1) {
# Even if allowed, the malformation returns the
# REPLACEMENT CHARACTER.
$expected_uv = 0xFFFD;
$this_uv_string = "0xFFFD"
}
}
my $this_name = "utf8n_to_uvchr_error() $testname: ";
if (! $initially_malformed) {
$this_name .= ($disallowed)
? 'disallowed, '
: 'allowed, ';
}
$this_name .= "$eval_warn";
$this_name .= ", " . (( $this_warning_flags
& $this_utf8n_flag_to_warn)
? 'with flag for raising warnings'
: 'no flag for raising warnings');
$this_name .= $malformations_name;
# Do the actual test using an eval
undef @warnings_gotten;
my $ret_ref;
my $this_flags
= $allow_flags|$this_warning_flags|$this_disallow_flags;
my $eval_text = "$eval_warn; \$ret_ref"
. " = test_utf8n_to_uvchr_error("
. "'$this_bytes', $this_length, $this_flags)";
eval "$eval_text";
if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
{
diag "\$@='$@'; call was: "
. utf8n_display_call($eval_text);
next;
}
if ($disallowed) {
is($ret_ref->[0], 0, " And returns 0")
or diag "Call was: " . utf8n_display_call($eval_text);
}
else {
is($ret_ref->[0], $expected_uv,
" And returns expected uv: "
. $this_uv_string)
or diag "Call was: " . utf8n_display_call($eval_text);
}
is($ret_ref->[1], $this_expected_len,
" And returns expected length:"
. " $this_expected_len")
or diag "Call was: " . utf8n_display_call($eval_text);
my $returned_flags = $ret_ref->[2];
for (my $i = @expected_return_flags - 1; $i >= 0; $i--) {
if ($expected_return_flags[$i] & $returned_flags) {
if ($expected_return_flags[$i]
== $::UTF8_GOT_PERL_EXTENDED)
{
pass(" Expected and got return flag for"
. " PERL_EXTENDED");
}
# The first entries in this are
# malformations
elsif ($i > @malformation_names - 1) {
pass(" Expected and got return flag"
. " for " . $controlling_warning_category);
}
else {
pass(" Expected and got return flag for "
. $malformation_names[$i]
. " malformation");
}
$returned_flags &= ~$expected_return_flags[$i];
splice @expected_return_flags, $i, 1;
}
}
is($returned_flags, 0,
" Got no unexpected return flags")
or diag "The unexpected flags gotten were: "
. (flags_to_text($returned_flags,
\@utf8n_flags_to_text)
# We strip off any prefixes from the flag
# names
=~ s/ \b [A-Z] _ //xgr);
is (scalar @expected_return_flags, 0,
" Got all expected return flags")
or diag "The expected flags not gotten were: "
. (flags_to_text(eval join("|",
@expected_return_flags),
\@utf8n_flags_to_text)
# We strip off any prefixes from the flag
# names
=~ s/ \b [A-Z] _ //xgr);
do_warnings_test(@expected_warnings)
or diag "Call was: " . utf8n_display_call($eval_text);
undef @warnings_gotten;
# Check CHECK_ONLY results when the input is
# disallowed. Do this when actually disallowed,
# not just when the $this_disallow_flags is set
if ($disallowed) {
my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY;
my $eval_text = "use warnings; \$ret_ref ="
. " test_utf8n_to_uvchr_error('"
. "$this_bytes', $this_length,"
. " $this_flags)";
eval $eval_text;
if (! ok ("$@ eq ''",
" And eval succeeded with CHECK_ONLY"))
{
diag "\$@='$@'; Call was: "
. utf8n_display_call($eval_text);
next;
}
is($ret_ref->[0], 0, " CHECK_ONLY: Returns 0")
or diag "Call was: " . utf8n_display_call($eval_text);
is($ret_ref->[1], -1,
" CHECK_ONLY: returns -1 for length")
or diag "Call was: " . utf8n_display_call($eval_text);
if (! is(scalar @warnings_gotten, 0,
" CHECK_ONLY: no warnings generated"))
{
diag "Call was: " . utf8n_display_call($eval_text);
output_warnings(@warnings_gotten);
}
}
# Now repeat some of the above, but for
# uvchr_to_utf8_flags(). Since this comes from an
# existing code point, it hasn't overflowed, and isn't
# malformed.
next if @malformation_names;
$this_warning_flags = ($use_warn_flag)
? $this_uvchr_flag_to_warn
: 0;
$this_disallow_flags = ($do_disallow)
? $this_uvchr_flag_to_disallow
: 0;
$disallowed = $this_disallow_flags
& $this_uvchr_flag_to_disallow;
$this_name .= ", " . (( $this_warning_flags
& $this_utf8n_flag_to_warn)
? 'with flag for raising warnings'
: 'no flag for raising warnings');
$this_name = "uvchr_to_utf8_flags() $testname: "
. (($disallowed)
? 'disallowed'
: 'allowed');
$this_name .= ", $eval_warn";
$this_name .= ", " . (( $this_warning_flags
& $this_uvchr_flag_to_warn)
? 'with warning flag'
: 'no warning flag');
undef @warnings_gotten;
my $ret;
$this_flags = $this_warning_flags|$this_disallow_flags;
$eval_text = "$eval_warn; \$ret ="
. " test_uvchr_to_utf8_flags("
. "$allowed_uv, $this_flags)";
eval "$eval_text";
if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
{
diag "\$@='$@'; call was: "
. uvchr_display_call($eval_text);
next;
}
if ($disallowed) {
is($ret, undef, " And returns undef")
or diag "Call was: " . uvchr_display_call($eval_text);
}
else {
is($ret, $this_bytes, " And returns expected string")
or diag "Call was: " . uvchr_display_call($eval_text);
}
do_warnings_test(@expected_warnings)
or diag "Call was: " . uvchr_display_call($eval_text);
}
}
}
}
}
}
}
}
done_testing;