#!/usr/bin/env perl
use Test::More;
use Inline C => <<END_C;
/*
 * isUTF8 - report whether a byte buffer is well-formed UTF-8.
 *
 *   s    bytes to examine; read strictly by length, so embedded zero
 *        bytes are allowed and no terminator is required
 *   len  number of bytes available at s
 *
 * Returns 1 when every byte belongs to a well-formed sequence and 0
 * otherwise.  Overlong encodings, UTF-16 surrogates (U+D800..U+DFFF),
 * code points above U+10FFFF, stray continuation bytes and truncated
 * sequences are all rejected.  A buffer with len <= 0 contains nothing
 * malformed and is reported as valid.
 *
 * Accepted byte patterns:
 *
 *   lead      second byte   further continuation bytes
 *   00..7F    -             0
 *   C2..DF    80..BF        0
 *   E0        A0..BF        1
 *   E1..EC    80..BF        1
 *   ED        80..9F        1
 *   EE..EF    80..BF        1
 *   F0        90..BF        2
 *   F1..F3    80..BF        2
 *   F4        80..8F        2
 */
int isUTF8(const char *s, int len) {
    /* Work on unsigned bytes so values 0x80..0xFF compare as expected
       whether or not plain char is signed on this platform. */
    const unsigned char *b = (const unsigned char *)s;
    int i = 0;

    while (i < len) {
        unsigned char lead = b[i];
        unsigned char lo = 0x80;   /* smallest legal second byte */
        unsigned char hi = 0xbf;   /* largest legal second byte */
        int extra;                 /* bytes that must follow the lead */
        int k;

        if (lead < 0x80) {
            /* Plain ASCII. */
            i++;
            continue;
        }

        if (lead >= 0xc2 && lead <= 0xdf) {
            /* C0 and C1 could only start overlong forms, so they are
               excluded from the two-byte range. */
            extra = 1;
        } else if (lead >= 0xe0 && lead <= 0xef) {
            extra = 2;
            if (lead == 0xe0) {
                lo = 0xa0;         /* below A0 would be overlong */
            } else if (lead == 0xed) {
                hi = 0x9f;         /* above 9F would be a surrogate */
            }
        } else if (lead >= 0xf0 && lead <= 0xf4) {
            extra = 3;
            if (lead == 0xf0) {
                lo = 0x90;         /* below 90 would be overlong */
            } else if (lead == 0xf4) {
                hi = 0x8f;         /* above 8F would exceed U+10FFFF */
            }
        } else {
            /* Stray continuation byte (80..BF) or a byte that can never
               start a sequence (C0, C1, F5..FF). */
            return 0;
        }

        /* i < len holds here, so len - i - 1 cannot be negative. */
        if (extra > len - i - 1) {
            return 0;              /* sequence runs past the buffer */
        }

        /* Only the second byte has a lead-dependent range. */
        if (b[i + 1] < lo || b[i + 1] > hi) {
            return 0;
        }

        /* Any remaining bytes must be ordinary continuation bytes. */
        for (k = 2; k <= extra; k++) {
            if ((b[i + k] & 0xc0) != 0x80) {
                return 0;
            }
        }

        i += extra + 1;
    }

    return 1;
}
END_C
# Strictures are lexically scoped, so enabling them here covers the whole
# test driver below (ideally they would sit at the very top of the file).
use strict;
use warnings;

# Byte strings mapped to the verdict isUTF8() must return for them:
# 1 = well-formed UTF-8, 0 = malformed.
my %valid = (
    "abc123"           => 1,    # plain ASCII
    "\x80"             => 0,    # lone continuation byte
    "\xc0\x81"         => 0,    # C0 lead: overlong 2-byte form
    "\xc1\xa0"         => 0,    # C1 lead: overlong 2-byte form
    "\xc2\x81"         => 1,    # lowest 2-byte lead
    "\xdf\x80"         => 1,    # highest 2-byte lead
    "\xdf\xc0"         => 0,    # second byte is not a continuation byte
    "\xe0\x80"         => 0,    # truncated 3-byte sequence
    "\xe0\x81\x80"     => 0,    # overlong 3-byte form
    "\xe0\xa0\x80"     => 1,    # U+0800, smallest 3-byte code point
    "\xed\x9f\xbf"     => 1,    # U+D7FF, last code point before surrogates
    "\xed\xa0\x80"     => 0,    # U+D800, UTF-16 surrogate
    "\xee\x81\x81"     => 1,
    "\xef\xbf\xbf"     => 1,    # U+FFFF, largest 3-byte code point
    "\xe9a"            => 0,    # 3-byte lead followed by ASCII
    "\xf0\x8f\xbf\xbf" => 0,    # overlong 4-byte form
    "\xf0\x90\x80"     => 0,    # truncated 4-byte sequence
    "\xf0\x90\xbe\xbf" => 1,
    "\xf2\x79\x80\x80" => 0,    # second byte is ASCII
    "\xf2\x80\x80\x80" => 1,
    "\xf4\x8f\xbf\x80" => 1,
    "\xf4\x90\x80\x80" => 0,    # above U+10FFFF
    "\xf5\x80\x80\x80" => 0,    # F5 can never start a sequence
);

# Run every table entry through the validator; sorting the keys keeps the
# order of the emitted tests stable from run to run.
for my $k (sort keys %valid) {
    # Rebuild the key from its individual byte values before handing it to
    # C -- presumably to guarantee a plain byte string; confirm if changed.
    my @octets  = unpack 'C*', $k;
    my $bytes   = pack 'C*', @octets;
    my $got     = isUTF8($bytes, length $bytes);

    # Human-readable pieces of the test description.
    my $hex     = join ' ', map { sprintf '%x', $_ } unpack 'C*', $bytes;
    my $verdict = $valid{$k} ? 'valid' : 'invalid';

    is($got, $valid{$k}, "utf8-test: $hex ($verdict)");
}

done_testing();