#! /usr/bin/perl
use strict;
use warnings;
use Test::More;
BEGIN {
if ($] >= 5.008) {
binmode Test::More->builder->output, ':utf8';
binmode Test::More->builder->failure_output, ':utf8';
}
}
my @cities = read_utf8_lines('t/data/cities.txt');
my @yapc = read_utf8_lines('t/data/yapc.txt');
plan tests => 1 + 2 * @cities;
use_ok('Text::Match::FastAlternatives');
my $tmfa = Text::Match::FastAlternatives->new(@yapc);
my $tmfa_i = Text::Match::FastAlternatives->new(map { lc } @yapc);
my $rx = build_regex(0, @yapc);
my $rx_i = build_regex(1, @yapc);
for my $line (@cities) {
my $match_tmfa = $tmfa->match($line);
my $match_rx = $line =~ $rx;
ok($match_tmfa && $match_rx || !$match_tmfa && !$match_rx,
"same case-sensitive result for '$line'");
my $match_tmfa_i = $tmfa_i->match(lc $line);
my $match_rx_i = $line =~ $rx_i;
ok($match_tmfa_i && $match_rx_i || !$match_tmfa_i && !$match_rx_i,
"same case-insensitive result for '$line'");
}
sub build_regex {
my ($caseless, @items) = @_;
my $rx = join '|', map { quotemeta } @items;
return $caseless ? qr/$rx/i : qr/$rx/;
}
sub read_utf8_lines {
my ($filename) = @_;
return read_raw_lines($filename, '<:utf8')
if $] >= 5.008;
my @lines = read_raw_lines($filename, '<');
$_ = decode_utf8($_) for @lines;
return @lines;
}
sub read_raw_lines {
my ($filename, $mode) = @_;
open my $fh, $mode, $filename
or die "can't open $filename for reading: $!\n";
my @lines = <$fh>;
chomp @lines;
return @lines;
}
sub top_set_bits {
my ($i) = @_;
my @masks = (0, 0b1000_0000, 0b1100_0000, 0b1110_0000, 0b1111_0000,
0b1111_1000, 0b1111_1100, 0b1111_1110, 0b1111_1111);
for my $n (0 .. 7) {
return $n if ($i & $masks[$n + 1]) == $masks[$n];
}
return 8;
}
sub utf8_char {
my ($start, @bytes) = @_;
for (@bytes) {
$start <<= 6;
$start |= $_ & 0b11_1111;
}
return $start;
}
sub decode_utf8 {
my ($encoded) = @_;
my @chars;
my @bytes = unpack 'C*', $encoded;
while (@bytes) {
my $byte = shift @bytes;
my $top = top_set_bits($byte);
if ($top == 0) {
push @chars, $byte;
}
elsif ($top == 2) {
push @chars, utf8_char($byte & 0b1_1111, shift @bytes);
}
elsif ($top == 3) {
push @chars, utf8_char($byte & 0b1111, splice @bytes, 0, 2);
}
elsif ($top == 4) {
push @chars, utf8_char($byte & 0b0111, splice @bytes, 0, 3);
}
elsif ($top == 5) {
push @chars, utf8_char($byte & 0b0011, splice @bytes, 0, 4);
}
elsif ($top == 6) {
push @chars, utf8_char($byte & 0b0001, splice @bytes, 0, 5);
}
else {
die "Malformed UTF-8; byte=$byte\n";
}
}
return pack 'U*', @chars;
}