package Treex::Block::Read::CoNLLX;
BEGIN {
$Treex::Block::Read::CoNLLX::VERSION = '0.08170';
}
use Moose;
use Treex::Core::Common;
use File::Slurp;
extends 'Treex::Block::Read::BaseTextReader';
# Returns the raw text of the next document, or nothing once all input has
# been consumed.
#
# When is_one_doc_per_file is true, the whole current file is slurped and
# returned. Otherwise lines are read until lines_per_doc sentences have been
# seen. A sentence is terminated by a blank line that follows at least one
# non-blank line, so leading or repeated blank lines neither count as
# sentences nor yield a document without any text. When a file runs out, its
# handle is dropped so that reading continues with the next file instead of
# stopping for good.
sub next_document_text {
    my ($self) = @_;

    FILE:
    while (1) {
        my $FH = $self->_current_fh;
        if ( !$FH ) {
            $FH = $self->next_filehandle() or return;
            $self->_set_current_fh($FH);
        }

        if ( $self->is_one_doc_per_file ) {
            $self->_set_current_fh(undef);
            return read_file($FH);
        }

        my $text;
        my $finished_sentences = 0;
        my $inside_sentence    = 0;

        LINE:
        while ( my $line = <$FH> ) {
            if ( $line =~ m/^\s*$/ ) {

                # Blank lines before any token carry no information; skipping
                # them guarantees that a returned document is never undefined
                # while the file still has data.
                next LINE if !defined $text;

                if ($inside_sentence) {
                    $inside_sentence = 0;
                    $finished_sentences++;

                    # The terminating blank line itself is not part of the
                    # returned text.
                    return $text if $finished_sentences == $self->lines_per_doc;
                }
            }
            else {
                $inside_sentence = 1;
            }
            $text .= $line;
        }

        # End of file: forget the exhausted handle so the next file gets opened.
        $self->_set_current_fh(undef);
        return $text if defined $text;

        # Nothing but blank lines was left in this file; try the next one.
    }
}
# Builds the next document from CoNLL-X text.
#
# The text obtained from next_document_text() is split into sentences on
# blank lines. Every sentence gets its own bundle, a zone for this reader's
# language and selector, and an a-tree with one node per token line. Token
# columns are: id, form, lemma, cpos, pos, feat, head, deprel (tab separated).
# The space-joined forms are stored as the zone's sentence.
#
# Returns the new document, or nothing when there is no more input.
sub next_document {
    my ($self) = @_;
    my $text = $self->next_document_text();
    return if !defined $text;

    my $document = $self->new_document();

    SENTENCE:
    foreach my $tree ( split /\n\s*\n/, $text ) {

        # Drop blank lines before testing for emptiness, so that a chunk made
        # only of whitespace (typically caused by superfluous empty lines at
        # the beginning or end) does not produce an empty bundle.
        my @tokens = grep { !/^\s*$/ } split /\n/, $tree;
        next SENTENCE if !@tokens;

        my $bundle = $document->create_bundle();
        my $zone   = $bundle->create_zone( $self->language, $self->selector );
        my $aroot  = $zone->create_atree();

        # Index 0 stands for the technical root, index N for the N-th token.
        my @nodes = ($aroot);
        my @heads = (0);
        my @forms;

        foreach my $token (@tokens) {

            # The first column (token id) is not needed: tokens are addressed
            # by their position within the sentence.
            my ( undef, $form, $lemma, $cpos, $pos, $feat, $head, $deprel ) = split /\t/, $token;

            my $newnode = $aroot->create_child();
            $newnode->shift_after_subtree($aroot);
            $newnode->set_form($form);
            $newnode->set_lemma($lemma);
            $newnode->set_tag($pos);
            $newnode->set_conll_cpos($cpos);
            $newnode->set_conll_pos($pos);
            $newnode->set_conll_feat($feat);
            $newnode->set_conll_deprel($deprel);

            push @forms, $form;
            push @nodes, $newnode;
            push @heads, $head;
        }

        # Attach nodes only after all of them exist, because a head may refer
        # to a token that comes later in the sentence.
        NODE:
        foreach my $i ( 1 .. $#nodes ) {
            my $head = $heads[$i];

            # A missing head or the "_" placeholder means the head is not
            # given; the node simply stays under the root where it was created.
            next NODE if !defined $head || $head eq '_';

            if ( $head !~ /\A\d+\z/ || $head > $#nodes ) {
                warn "CoNLL-X: token $i has invalid head '$head', leaving it attached to the root\n";
                next NODE;
            }
            $nodes[$i]->set_parent( $nodes[$head] );
        }

        my $sentence = join ' ', @forms;
        $sentence =~ s/\s+$//;
        $zone->set_sentence($sentence);
    }
    return $document;
}
1;
__END__
=head1 NAME
Treex::Block::Read::CoNLLX
=head1 VERSION
version 0.08170
=head1 DESCRIPTION
Document reader for the CoNLL-X format.
Each token is on a separate line in the following format:
ord<tab>form<tab>lemma<tab>cpos<tab>pos<tab>features<tab>head<tab>deprel
Sentences are separated by a blank line.
The sentences are stored into L<bundles|Treex::Core::Bundle> in the
L<document|Treex::Core::Document>.
=head1 ATTRIBUTES
=over
=item from
space or comma separated list of filenames
=item lines_per_doc
number of sentences (not lines, despite the name) per document
=back
=head1 METHODS
=over
=item next_document
Loads a document.
=back
=head1 SEE
L<Treex::Block::Read::BaseTextReader>
L<Treex::Core::Document>
L<Treex::Core::Bundle>
=head1 AUTHOR
David Mareček
=head1 COPYRIGHT AND LICENSE
Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles University in Prague
This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.