The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Treex::Block::Write::CoNLLX;
$Treex::Block::Write::CoNLLX::VERSION = '2.20151102';
use strict;
use warnings;
use Moose;
use Treex::Core::Common;
extends 'Treex::Block::Write::BaseTextWriter';

my %FALLBACK_FOR = ( 'pos' => 'tag', 'deprel' => 'afun', );

has '+language'                        => ( required => 1 );
has 'deprel_attribute'                 => ( is       => 'rw', isa => 'Str', default => 'autodetect' );
has 'pos_attribute'                    => ( is       => 'rw', isa => 'Str', default => 'autodetect' );
has 'cpos_attribute'                   => ( is       => 'rw', isa => 'Str', default => 'conll/cpos' );
has 'feat_attribute'                   => ( is       => 'rw', isa => 'Str', default => 'conll/feat' );
has 'is_member_within_afun'            => ( is       => 'rw', isa => 'Bool', default => 0 );
has 'is_shared_modifier_within_afun'   => ( is       => 'rw', isa => 'Bool', default => 0 );
has 'is_coord_conjunction_within_afun' => ( is       => 'rw', isa => 'Bool', default => 0 );
has 'randomly_select_sentences_ratio'  => ( is       => 'rw', isa => 'Num',  default => 1 );

has _was => ( is => 'rw', default => sub{{}} );

has '+extension' => ( default => '.conll' );

sub process_atree {
    my ( $self, $atree ) = @_;

    # if only random sentences are printed
    return if rand() > $self->randomly_select_sentences_ratio;
    foreach my $anode ( $atree->get_descendants( { ordered => 1 } ) ) {
        my ( $lemma, $pos, $cpos, $deprel ) =
            map { $self->get_attribute( $anode, $_ ) }
            (qw(lemma pos cpos deprel)); # "conll/" will be prefixed if needed; see get_attribute()

        #my $ctag  = $self->get_coarse_grained_tag($tag);

        # append suffices to afuns
        my $suffix = '';
        $suffix .= 'M' if $self->is_member_within_afun            && $anode->is_member;
        $suffix .= 'S' if $self->is_shared_modifier_within_afun   && $anode->is_shared_modifier;
        $suffix .= 'C' if $self->is_coord_conjunction_within_afun && $anode->wild->{is_coord_conjunction};
        $deprel .= "_$suffix" if $suffix;

        my $feat;
        if ( $self->feat_attribute eq 'conll/feat' && defined $anode->conll_feat() ) {
            $feat = $anode->conll_feat();
        }
        elsif ( $self->feat_attribute eq 'iset' ) {
            $feat = $anode->iset()->as_string_conllx();
        }
        else {
            $feat = '_';
        }
        my $p_ord = $anode->get_parent->ord;
        # Make sure that values are not empty and that they do not contain spaces.
        my @values = ($anode->ord, $anode->form, $lemma, $cpos, $pos, $feat, $p_ord, $deprel);
        @values = map
        {
            my $x = $_ // '_';
            $x =~ s/^\s+//;
            $x =~ s/\s+$//;
            $x =~ s/\s+/_/g;
            $x = '_' if($x eq '');
            $x
        }
        (@values);
        print { $self->_file_handle } join( "\t", @values ) . "\n";
    }
    print { $self->_file_handle } "\n" if $atree->get_descendants;
    return;
}

sub get_attribute {
    my ( $self, $anode, $name ) = @_;
    my $from = $self->{ $name . '_attribute' } || $name;    # TODO don't expect blessed hashref
    my $value;
    if ( $from eq 'autodetect' ) {
        my $before = $self->_was->{$name};
        if ( !defined $before ) {
            $value = $anode->get_attr("conll/$name");
            if ( defined $value ) {
                $self->_was->{$name} = "conll/$name";
            }
            else {
                my $fallback = $FALLBACK_FOR{$name}
                    or log_fatal "No fallback for attribute $name";
                $value = $anode->get_attr($fallback);
                $self->_was->{$name} = $fallback;
            }
        }
        else {
            $value = $anode->get_attr($before);
            if ( !defined $value && $before =~ /^conll/ ) {
                my $id = $anode->get_address();
                log_warn "Attribute $before not defined in $id"
                    . " but it was filled in some previous nodes."
                    . " Consider Write::CoNLLX with parameter ${name}_attribute != autodetect.";
            }
        }
    }
    else {
        $value = $anode->get_attr($from);
    }

    return defined $value ? $value : '_';
}

sub get_coarse_grained_tag {
    my ( $self, $tag ) = @_;
    return substr $tag, 0, 2;
}

1;

__END__

=head1 NAME

Treex::Block::Write::CoNLLX

=head1 VERSION

version 2.20151102

=head1 DESCRIPTION

Document writer for CoNLLX format, one token per line.

=head1 PARAMETERS

=over

=item encoding

Output encoding. C<utf8> by default.

=item to

The name of the output file, STDOUT by default.

=item deprel_attribute

The name of attribute which will be printed into the 8th column (dependency relation).
Default is C<autodetect> which tries first C<conll/deprel>
and if it is not defined then C<afun>.

=item pos_attribute

The name of attribute which will be printed into the 5th column (part-of-speech tag).
Default is C<autodetect> which tries first C<conll/pos>
and if it is not defined then C<tag>.

=item cpos_attribute

The name of attribute which will be printed into the 4th column
(coarse-grain part-of-speech tag).
Default is C<conll/cpos>.

=item feat_attribute

The name of attribute which will be printed into the 6th column (features).
Default is C<_> which means that an underscore will be printed instead of the features.
Possible values are C<conll/feat> and C<iset>.


=back

=head1 METHODS

=over

=item process_document

Saves the document.

=back

=head1 AUTHOR

David Mareček, Daniel Zeman, Martin Popel

=head1 COPYRIGHT AND LICENSE

Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles University in Prague

This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.