The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.
#!/usr/bin/env perl
use strict;
use warnings;
use Test::More tests => 3;
use Treex::Core::Document;
use Treex::Block::W2A::Tokenize;

# Test 1: the tokenizer block can be instantiated without any arguments.
# The same instance is reused by both scenarios below.
my $block = new_ok('Treex::Block::W2A::Tokenize');

# Test 2: tokenize a raw string directly with tokenize_sentence().
# The expected output keeps the URL, "forty-two" and "C++" as single tokens,
# separates "$" from "10.5" and the final period from "programmers", and
# turns the doubled space into a single one.
{
    my $sentence = 'http://example.com costs $10.5  and forty-two C++ programmers.';
    my $expected = 'http://example.com costs $ 10.5 and forty-two C++ programmers .';

    my $got = $block->tokenize_sentence($sentence);
    is($got, $expected, "Tokenizing '$sentence'");
}

# Test 3: run the block over a whole document and inspect the resulting
# a-tree. Each node is rendered as "ord=N|form=F|no_space_after=B" so that
# ordering, word forms and spacing flags are all compared in one string.
{
    my $sentence = '. . . tricky one.';
    my $expected = 'ord=1|form=...|no_space_after=0 ord=2|form=tricky|no_space_after=0 ord=3|form=one|no_space_after=1 ord=4|form=.|no_space_after=0';

    # Minimal document: one bundle holding one English zone with the sentence.
    my $doc  = Treex::Core::Document->new();
    my $zone = $doc->create_bundle()->create_zone('en');
    $zone->set_sentence($sentence);

    $block->process_document($doc);

    # Describe the nodes in the order returned by the ordered traversal.
    my @node_descriptions;
    for my $node ( $zone->get_atree()->get_descendants( { ordered => 1 } ) ) {
        my $ord            = $node->ord;
        my $form           = $node->form;
        my $no_space_after = $node->no_space_after;
        push @node_descriptions,
            "ord=$ord|form=$form|no_space_after=$no_space_after";
    }
    my $got = join ' ', @node_descriptions;

    is($got, $expected, "Tokenizing via process_document '$sentence'");
}