#!perl
package RTF::Tokenizer::TestCorpus;
use strict;
use warnings;
use RTF::Tokenizer;
my $group_separator = "\x{1d}";
my $record_separator = "\x{1e}";
my $unit_separator = "\x{1f}";
sub test_corpus {
my $filename = shift;
my ( $test_name, $rtf, $expected_tokens ) = load_from_corpus( $filename );
my @actual_tokens = RTF::Tokenizer->new( string => $rtf )->get_all_tokens;
Test::More::is_deeply( \@actual_tokens, $expected_tokens, $test_name );
}
sub load_from_corpus {
my $filename = shift;
my $data = read_file( $filename );
my ( $test_name, $rtf, $token_string ) = split( $group_separator, $data );
my @tokens_raw = split( $record_separator, $token_string);
my @tokens = map {
my @fields = split( $unit_separator, $_ );
push( @fields, '' ) unless @fields > 2;
\@fields;
} @tokens_raw;
return ( $test_name, $rtf, \@tokens );
}
sub write_corpus {
my $filename = shift;
my $data = create_from_t( $filename );
my ($output) = $filename =~ m/9\d+(.+)\./;
$output = 't/corpus/' . $output . '.corpus';
open( FH, ">$output" ) || die $!;
print FH $data;
close FH;
print $output . "\n";
}
sub create_from_t {
my $filename = shift;
my $data = read_file( $filename );
my $test_name = $filename;
$test_name =~ s/.+9\d*//;
$test_name =~ s/\.t$//;
$test_name =~ s/_/ /g;
$test_name = ucfirst( $test_name );
my ( $perl, $rtf ) = split(/\n__DATA__\n/, $data);
return create( $test_name, $rtf );
}
sub create {
my ( $test_name, $rtf ) = @_;
my @tokens = RTF::Tokenizer->new( string => $rtf )->get_all_tokens;
my $token_string = join $record_separator, map { join( $unit_separator, @$_ ) } @tokens;
return join $group_separator, $test_name, $rtf, $token_string;
}
sub read_file {
my $filename = shift;
open( FH, "< $filename" ) || die $!;
my $data = join '', (<FH>);
close FH;
return $data;
}