The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Treex::Block::Read::CoNLLX;
BEGIN {
  $Treex::Block::Read::CoNLLX::VERSION = '0.08170';
}
use Moose;
use Treex::Core::Common;
use File::Slurp;
extends 'Treex::Block::Read::BaseTextReader';

sub next_document_text {
    my ($self) = @_;
    my $FH = $self->_current_fh;
    if ( !$FH ) {
        $FH = $self->next_filehandle() or return;
        $self->_set_current_fh($FH);
    }

    if ( $self->is_one_doc_per_file ) {
        $self->_set_current_fh(undef);
        return read_file($FH);
    }

    my $text;
    my $empty_lines;

    LINE:

    while (<$FH>) {
        if ( $_ =~ m/^\s*$/ ) {
            $empty_lines++;
            return $text if $empty_lines == $self->lines_per_doc;
        }
        $text .= $_;
    }

    return $text;
}

sub next_document {
    my ($self) = @_;
    my $text = $self->next_document_text();
    return if !defined $text;

    my $document = $self->new_document();
    foreach my $tree ( split /\n\s*\n/, $text ) {
        my @tokens  = split( /\n/, $tree );
        # Skip empty sentences (if any sentence is empty at all,
        # typically it is the first or the last one because of superfluous empty lines).
        next unless(@tokens);
        my $bundle  = $document->create_bundle();
        my $zone    = $bundle->create_zone( $self->language, $self->selector );
        my $aroot   = $zone->create_atree();
        my @parents = (0);
        my @nodes   = ($aroot);
        my $sentence;
        foreach my $token (@tokens) {
            next if $token =~ /^\s*$/;
            my ( $id, $form, $lemma, $cpos, $pos, $feat, $head, $deprel ) = split( /\t/, $token );
            my $newnode = $aroot->create_child();
            $newnode->shift_after_subtree($aroot);
            $newnode->set_form($form);
            $newnode->set_lemma($lemma);
            $newnode->set_tag($pos);
            $newnode->set_conll_cpos($cpos);
            $newnode->set_conll_pos($pos);
            $newnode->set_conll_feat($feat);
            $newnode->set_conll_deprel($deprel);
            $sentence .= "$form ";
            push @nodes,   $newnode;
            push @parents, $head;
        }
        foreach my $i ( 1 .. $#nodes ) {
            $nodes[$i]->set_parent( $nodes[ $parents[$i] ] );
        }
        $sentence =~ s/\s+$//;
        $zone->set_sentence($sentence);
    }

    return $document;
}

1;

__END__

=head1 NAME

Treex::Block::Read::CoNLLX

=head1 VERSION

version 0.08170

=head1 DESCRIPTION

Document reader for CoNLL format.
Each token is on separated line in the following format:
ord<tab>form<tab>lemma<tab>cpos<tab>pos<tab>features<tab>head<tab>deprel
Sentences are separated with blank line.
The sentences are stored into L<bundles|Treex::Core::Bundle> in the 
L<document|Treex::Core::Document>.

=head1 ATTRIBUTES

=over

=item from

space or comma separated list of filenames

=item lines_per_doc

number of sentences (!) per document

=back

=head1 METHODS

=over

=item next_document

Loads a document.

=back

=head1 SEE

L<Treex::Block::Read::BaseTextReader>
L<Treex::Core::Document>
L<Treex::Core::Bundle>

=head1 AUTHOR

David Mareček

=head1 COPYRIGHT AND LICENSE

Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles University in Prague

This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.