package Hailo::Tokenizer::Words;
BEGIN {
$Hailo::Tokenizer::Words::AUTHORITY = 'cpan:AVAR';
}
{
$Hailo::Tokenizer::Words::VERSION = '0.72';
}
use 5.010;
use utf8;
use Any::Moose;
use Any::Moose 'X::StrictConstructor';
use Regexp::Common qw/ URI /;
use namespace::clean -except => 'meta';
with qw(Hailo::Role::Arguments
Hailo::Role::Tokenizer);
# [[:alpha:]] doesn't match combining characters on Perl >=5.12
my $ALPHABET = qr/(?![_\d])\w/;
# tokenization
my $SPACE = qr/\s/;
my $NONSPACE = qr/\S/;
my $DASH = qr/[–-]/;
my $POINT = qr/[.,]/;
my $APOSTROPHE = qr/['’´]/;
my $ELLIPSIS = qr/\.{2,}|…/;
my $NON_WORD = qr/[^\w\s]+/;
my $BARE_WORD = qr/\w+/;
my $CURRENCY = qr/[¤¥¢£\$]/;
my $NUMBER = qr/$CURRENCY?$POINT\d+(?:$POINT\d+)*(?:$CURRENCY|$ALPHABET+)?|$CURRENCY?\d+(?:$POINT\d+)*(?:$CURRENCY|$ALPHABET+)?(?!\d|$ALPHABET)/;
my $APOST_WORD = qr/$ALPHABET+(?:$APOSTROPHE$ALPHABET+)+/;
my $ABBREV = qr/$ALPHABET(?:\.$ALPHABET)+\./;
my $DOTTED = qr/$BARE_WORD?\.$BARE_WORD(?:\.$BARE_WORD)*/;
my $WORD_TYPES = qr/$NUMBER|$ABBREV|$DOTTED|$APOST_WORD|$BARE_WORD/;
my $WORD_APOST = qr/$WORD_TYPES(?:$DASH$WORD_TYPES)*$APOSTROPHE(?!$ALPHABET|$NUMBER)/;
my $WORD = qr/$WORD_TYPES(?:(?:$DASH$WORD_TYPES)+|$DASH(?!$DASH))?/;
my $MIXED_CASE = qr/ \p{Lower}+ \p{Upper} | \p{Upper}{2,} \p{Lower} /x;
my $UPPER_NONW = qr/^ (?:\p{Upper}+ \W+)(?<!I') (?: \p{Upper}* \p{Lower} ) /x;
# special tokens
my $TWAT_NAME = qr/ \@ [A-Za-z0-9_]+ /x;
my $EMAIL = qr/ [A-Z0-9._%+-]+ @ [A-Z0-9.-]+ (?: \. [A-Z]{2,4} )* /xi;
my $PERL_CLASS = qr/ (?: :: \w+ (?: :: \w+ )* | \w+ (?: :: \w+ )+ ) (?: :: )? | \w+ :: /x;
my $EXTRA_URI = qr{ (?: \w+ \+ ) ssh:// $NONSPACE+ }x;
my $ESC_SPACE = qr/(?:\\ )+/;
my $NAME = qr/(?:$BARE_WORD|$ESC_SPACE)+/;
my $FILENAME = qr/ $NAME? \. $NAME (?: \. $NAME )* | $NAME/x;
my $UNIX_PATH = qr{ / $FILENAME (?: / $FILENAME )* /? }x;
my $WIN_PATH = qr{ $ALPHABET : \\ $FILENAME (?: \\ $FILENAME )* \\?}x;
my $PATH = qr/$UNIX_PATH|$WIN_PATH/;
my $DATE = qr/[0-9]{4}-W?[0-9]{1,2}-[0-9]{1,2}/i;
my $TIME = qr/[0-9]{1,2}:[0-9]{2}(?::[0-9]{2})?(?:Z| ?[AP]M|[-+±][0-9]{2}(?::?[0-9]{2})?)?/i;
my $DATETIME = qr/${DATE}T$TIME/;
my $IRC_NICK = qr/<(?: |[&~]?[@%+~&])?[A-Za-z_`\-^\|\\\{}\[\]][A-Za-z_0-9`\-^\|\\\{}\[\]]+>/;
my $IRC_CHAN = qr/[#&+][^ \a\0\012\015,:]{1,199}/;
my $NUMERO = qr/#[0-9]+/;
my $CLOSE_TAG = qr{</[-\w]+>};
my $CASED_WORD = qr/$CLOSE_TAG|$IRC_NICK|$IRC_CHAN|$DATETIME|$DATE|$TIME|$PERL_CLASS|$EXTRA_URI|$EMAIL|$TWAT_NAME|$PATH|$NUMERO/;
# capitalization
# The rest of the regexes are pretty hairy. The goal here is to catch the
# most common cases where a word should be capitalized. We try hard to
# guard against capitalizing things which don't look like proper words.
# Examples include URLs and code snippets.
my $OPEN_QUOTE = qr/['"‘“„«»「『‹‚]/;
my $CLOSE_QUOTE = qr/['"’“”«»」』›‘]/;
my $TERMINATOR = qr/(?:[?!‽]+|(?<!\.)\.)/;
my $ADDRESS = qr/:/;
my $PUNCTUATION = qr/[?!‽,;.:]/;
my $BOUNDARY = qr/$CLOSE_QUOTE?(?:\s*$TERMINATOR|$ADDRESS)\s+$OPEN_QUOTE?\s*/;
my $LOOSE_WORD = qr/$IRC_CHAN|$DATETIME|$DATE|$TIME|$PATH|$NUMBER|$ABBREV|$APOST_WORD|$NUMERO|$BARE_WORD(?:$DASH(?:$WORD_TYPES|$BARE_WORD)|$APOSTROPHE(?!$ALPHABET|$NUMBER|$APOSTROPHE)|$DASH(?!$DASH{2}))*/;
my $SPLIT_WORD = qr{$LOOSE_WORD(?:/$LOOSE_WORD)?(?=$PUNCTUATION(?:\s+|$)|$CLOSE_QUOTE|$TERMINATOR|\s+|$)};
# we want to capitalize words that come after "On example.com?"
# or "You mean 3.2?", but not "Yes, e.g."
my $DOTTED_STRICT = qr/$LOOSE_WORD(?:$POINT(?:\d+|\w{2,}))?/;
my $WORD_STRICT = qr/$DOTTED_STRICT(?:$APOSTROPHE$DOTTED_STRICT)*/;
# This string is added to (and later removed from) the output string when
# capitalizing it in multiple passes. We use backspace, because that is
# unlikely to be in the input. This dirty approach can probably be replaced
# with regex grammars, but I haven't bothered to learn to use those.
my $SEPARATOR = "\x08";
# input -> tokens
sub make_tokens {
my ($self, $input) = @_;
my @tokens;
$input =~ s/$DASH\K\s*\n+\s*//;
$input =~ s/\s*\n+\s*/ /gm;
while (length $input) {
# remove the next chunk of whitespace
$input =~ s/^$SPACE+//;
my $got_word;
while (length $input && $input =~ /^$NONSPACE/) {
# We convert it to ASCII and then look for a URI because $RE{URI}
# from Regexp::Common doesn't support non-ASCII domain names
my ($ascii) = $input =~ /^($NONSPACE+)/;
$ascii =~ s/[^[:ascii:]]/a/g;
# URIs
if (!$got_word && $ascii =~ / ^ $RE{URI} /xo) {
my $uri_end = $+[0];
my $uri = substr $input, 0, $uri_end;
$input =~ s/^\Q$uri//;
push @tokens, [$self->{_spacing_normal}, $uri];
$got_word = 1;
}
# special words for which we preserve case
elsif (!$got_word && $input =~ s/ ^ (?<word> $CASED_WORD )//xo) {
push @tokens, [$self->{_spacing_normal}, $+{word}];
$got_word = 1;
}
# normal words
elsif ($input =~ / ^ $WORD /xo) {
my $word;
# special case to allow matching q{ridin'} as one word, even when
# it appears as q{"ridin'"}, but not as q{'ridin'}
my $last_char = @tokens ? substr $tokens[-1][1], -1, 1 : '';
if (!@tokens && $input =~ s/ ^ (?<word>$WORD_APOST) //xo
|| $last_char =~ / ^ $APOSTROPHE $ /xo
&& $input =~ s/ ^ (?<word>$WORD_APOST) (?<! $last_char ) //xo) {
$word = $+{word};
}
else {
$input =~ s/^($WORD)//o and $word = $1;
}
# Maybe preserve the casing of this word
$word = lc $word
if $word ne uc $word
# Mixed-case words like "WoW" or "ATMs"
and $word !~ $MIXED_CASE
# Words that are upper case followed by a non-word character.
and $word !~ $UPPER_NONW;
push @tokens, [$self->{_spacing_normal}, $word];
$got_word = 1;
}
# everything else
elsif ($input =~ s/ ^ (?<non_word> $NON_WORD ) //xo) {
my $non_word = $+{non_word};
my $spacing = $self->{_spacing_normal};
# was the previous token a word?
if ($got_word) {
$spacing = $input =~ /^$NONSPACE/
? $self->{_spacing_infix}
: $self->{_spacing_postfix};
}
# do we still have more tokens?
elsif ($input =~ /^$NONSPACE/) {
$spacing = $self->{_spacing_prefix};
}
push @tokens, [$spacing, $non_word];
}
}
}
return \@tokens;
}
# tokens -> output
sub make_output {
my ($self, $tokens) = @_;
my $reply = '';
for my $pos (0 .. $#{ $tokens }) {
my ($spacing, $text) = @{ $tokens->[$pos] };
$reply .= $text;
# append whitespace if this is not a prefix token or infix token,
# and this is not the last token, and the next token is not
# a postfix/infix token
if ($pos != $#{ $tokens }
&& $spacing != $self->{_spacing_prefix}
&& $spacing != $self->{_spacing_infix}
&& !($pos < $#{ $tokens }
&& ($tokens->[$pos+1][0] == $self->{_spacing_postfix}
|| $tokens->[$pos+1][0] == $self->{_spacing_infix})
)
) {
$reply .= ' ';
}
}
# capitalize the first word
$reply =~ s/^\s*$OPEN_QUOTE?\s*\K($SPLIT_WORD)(?=$ELLIPSIS|(?:(?:$CLOSE_QUOTE|$TERMINATOR|$ADDRESS|$PUNCTUATION+)?(?:\s|$)))/\u$1/o;
# capitalize the second word
$reply =~ s/^\s*$OPEN_QUOTE?\s*$SPLIT_WORD(?:(?:\s*$TERMINATOR|$ADDRESS)\s+)\K($SPLIT_WORD)/\u$1/o;
# capitalize all other words after word boundaries
# we do it in two passes because we need to match two words at a time
$reply =~ s/(?:$ELLIPSIS|\s+)$OPEN_QUOTE?\s*$WORD_STRICT$BOUNDARY\K($SPLIT_WORD)/$SEPARATOR\u$1$SEPARATOR/go;
$reply =~ s/$SEPARATOR$WORD_STRICT$SEPARATOR$BOUNDARY\K($SPLIT_WORD)/\u$1/go;
$reply =~ s/$SEPARATOR//go;
# end paragraphs with a period when it makes sense
$reply =~ s/(?:$ELLIPSIS|\s+|^)$OPEN_QUOTE?(?:$SPLIT_WORD(?:\.$SPLIT_WORD)*)\K($CLOSE_QUOTE?)$/.$1/o;
# capitalize I'm, I've...
$reply =~ s{(?:(?:$ELLIPSIS|\s+)|$OPEN_QUOTE)\Ki(?=$APOSTROPHE$ALPHABET)}{I}go;
return $reply;
}
__PACKAGE__->meta->make_immutable;
=encoding utf8
=head1 NAME
Hailo::Tokenizer::Words - A tokenizer for L<Hailo|Hailo> which splits
on whitespace and word boundaries, mostly.
=head1 DESCRIPTION
This tokenizer does its best to handle various languages. It knows about most
apostrophes, quotes, and sentence terminators.
=head1 AUTHOR
Hinrik E<Ouml>rn SigurE<eth>sson, hinrik.sig@gmail.com
=head1 LICENSE AND COPYRIGHT
Copyright 2010 Hinrik E<Ouml>rn SigurE<eth>sson
This program is free software, you can redistribute it and/or modify
it under the same terms as Perl itself.
=cut