# The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# Base class for document readers whose input is organised by zones
# (a language plus an optional selector, e.g. "en" or "en_src").
# Subclasses are expected to override fill_bundle_zone, whose implementation
# in this class only reports a fatal error.
package Treex::Core::DocumentReader::Base;
BEGIN {
  # Assigned inside BEGIN so the version is set at compile time.
  $Treex::Core::DocumentReader::Base::VERSION = '0.08399';
}
use Moose;
# NOTE(review): log_warn and log_fatal used in this file are presumably
# exported by Treex::Core::Common - confirm.
use Treex::Core::Common;
# Compose the reader role into this class.
with 'Treex::Core::DocumentReader';

# Encoding name handed to every zone reader created by add_zone_filenames.
has encoding => (
    is      => 'ro',
    isa     => 'Str',
    default => 'utf8',
);

# Selector (type Treex::Type::Selector); empty by default.
has selector => (
    is      => 'ro',
    isa     => 'Treex::Type::Selector',
    default => '',
);

# Explicit document name. When it is true, new_document uses it instead of
# deriving a name from the names of the input files.
has file_stem => (
    is            => 'ro',
    isa           => 'Str',
    documentation => 'how to name the loaded documents',
);

# Both values are passed unchanged to every zone reader created by
# add_zone_filenames.
has lines_per_doc => (
    is      => 'ro',
    isa     => 'Int',
    default => 0,
);
has merge_files => (
    is      => 'ro',
    isa     => 'Bool',
    default => 0,
);

# Consulted by add_zone_filenames when a zone is registered.
has check_same_number_of_files_per_zone => (
    is            => 'ro',
    isa           => 'Bool',
    documentation => 'exit with fatal error if zones have different number of input files',
    default       => 1,
);

# Consulted by next_document: when true, the raw text of each zone is stored
# in the corresponding document zone.
has save_doc_text => (
    is            => 'ro',
    isa           => 'Bool',
    documentation => 'save raw document text into (each) doc-zone "text" attribute',
    default       => 0,
);

# Number of input files per zone; written by add_zone_filenames and returned
# by number_of_documents. Stays 0 until it is set.
has _files_per_zone => (
    default => 0,
    is      => 'rw',
);

# Zone readers in the order in which they were registered.
# Each object gets its own fresh array (the default is built by a sub).
has zones => (
    is      => 'ro',
    isa     => 'ArrayRef[Treex::Core::DocumentReader::ZoneReader]',
    default => sub { return []; },
);

# When false, number_of_documents reports no value.
has is_one_doc_per_file => (
    isa     => 'Bool',
    is      => 'ro',
    default => 1,
);

# Moose construction hook: registers a zone for every constructor argument
# whose name is a zone label.
#
# Parameters:
#   $args - hash reference with all constructor arguments
#
# Each argument name is handled as follows:
#   * names of declared attributes (of this class or a subclass) and names
#     matching /selector|language|scenario/ are ordinary parameters and are
#     skipped silently;
#   * "<lang>" or "<lang>_<selector>", where <lang> is accepted by
#     Treex::Core::Types::is_lang_code, registers a zone through
#     add_zone_filenames; the argument value is the string of file names;
#   * anything else is reported with log_warn.
#
# Returns nothing.
sub BUILD {
    my ( $self, $args ) = @_;

    # Constructor names of all declared attributes. Without this check an
    # attribute such as is_one_doc_per_file would be parsed as a zone label
    # (language "is", selector "one") and others would trigger the warning.
    my %is_attribute =
        map  { $_ => 1 }
        grep { defined $_ }
        map  { scalar $_->init_arg } $self->meta->get_all_attributes;

    # Sorted, so that the order of zones (and therefore the default document
    # name built by new_document) does not depend on hash ordering.
    ARG:
    foreach my $arg ( sort keys %{$args} ) {
        next ARG if $is_attribute{$arg};

        my ( $lang, $sele ) = ( $arg, '' );
        if ( $arg =~ /_/ ) {
            ( $lang, $sele ) = split /_/, $arg;
        }
        if ( Treex::Core::Types::is_lang_code($lang) ) {
            $self->add_zone_filenames( $lang, $sele, $args->{$arg} );
        }
        elsif ( $arg =~ /selector|language|scenario/ ) { }
        else {
            log_warn("$arg is not a zone label (e.g. en_src)");
        }
    }
    return;
}

# Registers one zone together with the input files it is read from.
#
# Parameters:
#   $language     - language of the zone
#   $selector     - selector of the zone (may be empty)
#   $files_string - file names separated by spaces and/or commas;
#                   undef is treated as an empty string
#
# The number of files of the first zone that has any is remembered in
# _files_per_zone. A later zone with a different number of files is a fatal
# error when check_same_number_of_files_per_zone is true.
#
# A new zone reader is appended to $self->zones. Returns nothing.
sub add_zone_filenames {
    my ( $self, $language, $selector, $files_string ) = @_;
    $files_string = '' if !defined $files_string;
    $files_string =~ s/^\s+|\s+$//g;
    my @files = split( /[ ,]+/, $files_string );

    # The count is recorded even when the check is switched off, because
    # number_of_documents reports this value.
    # NOTE(review): plain Moose does not generate _set_files_per_zone for an
    # 'rw' attribute; presumably Treex::Core::Common installs "set_"-style
    # writers - confirm.
    if ( !$self->_files_per_zone ) {
        $self->_set_files_per_zone( scalar @files );
    }
    elsif ( $self->check_same_number_of_files_per_zone
        && @files != $self->_files_per_zone )
    {
        log_fatal("All zones must have the same number of files");
    }

    # Same class as the one named in the type of the "zones" attribute.
    push @{ $self->zones }, Treex::Core::DocumentReader::ZoneReader->new(
        language      => $language,
        selector      => $selector,
        filenames     => \@files,
        encoding      => $self->encoding,
        lines_per_doc => $self->lines_per_doc,
        merge_files   => $self->merge_files,
    );
    return;
}

# Creates an empty Treex::Core::Document for the files the zone readers are
# currently positioned at, and increases doc_number by one.
#
# Parameters:
#   $load_from - optional; when defined it is passed to the document
#                constructor as "filename"
#
# Document name (file_stem):
#   * the file_stem attribute, when it is true (file_number is then omitted);
#   * otherwise a heuristic: for every zone, take the current file name up
#     to its first dot, remove the zone label, language and selector
#     (together with an adjacent "_" or "-"), and join the distinct
#     remainders with "_". If nothing is left, the name is "noname".
#
# "path" is taken from the file of the last zone.
#
# Returns the new document object.
sub new_document {
    my ( $self, $load_from ) = @_;
    my ( $stem, $file_number ) = ( '', '' );
    my ( $volume, $dirs, $file );
    if ( $self->file_stem ) {
        ( $stem, $file_number ) = ( $self->file_stem, undef );
    }
    else {
        # File::Spec is not loaded at the top of this file.
        require File::Spec;

        foreach my $zone ( @{ $self->zones } ) {
            my $filename = $zone->current_filename;
            $filename = '' if !defined $filename;
            ( $volume, $dirs, $file ) = File::Spec->splitpath($filename);

            my ($name) = $file =~ /([^.]+)/;
            $name = '' if !defined $name;

            my $zonelabel = $zone->zone_label;
            my $lang      = $zone->language;
            my $sele      = $zone->selector;

            # Only non-empty markers take part in the pattern: an empty
            # selector would add an empty alternative, and the substitution
            # would then delete every "_" and "-" in the name. The longest
            # marker goes first so that a whole label is removed in one go,
            # and markers are quoted so they are matched literally.
            # NOTE(review): markers are still matched anywhere in the name,
            # e.g. "en" inside "document" - kept as in the original heuristic.
            my %seen;
            my @markers =
                sort { length($b) <=> length($a) || $a cmp $b }
                grep { defined($_) && length($_) && !$seen{$_}++ }
                ( $zonelabel, $lang, $sele );
            if (@markers) {
                my $markers_re = join '|', map { quotemeta $_ } @markers;
                $name =~ s/[_-]?(?:$markers_re)[_-]?//gi;
            }

            if ( !$name && !$stem ) {
                $name        = 'noname';
                $file_number = undef;
            }

            # Literal substring test. A regex match would misread names with
            # metacharacters, and an empty pattern silently reuses the last
            # successfully matched pattern.
            if ( $name ne '' && index( $stem, $name ) < 0 ) {
                if ( $stem ne '' ) {
                    $stem .= '_';
                }
                $stem .= $name;
            }
        }
    }

    $self->_set_doc_number( $self->doc_number + 1 );
    return Treex::Core::Document->new(
        {
            file_stem   => $stem,
            loaded_from => join( ',', map { $_->current_filename } @{ $self->zones } ),
            defined $file_number ? ( file_number => $file_number )    : (),
            defined $dirs        ? ( path        => $volume . $dirs ) : (),
            defined $load_from   ? ( filename    => $load_from )      : (),
        }
    );
}

# Reports how many documents this reader provides.
#
# Returns the value of _files_per_zone when is_one_doc_per_file is true;
# otherwise returns nothing (undef in scalar context, an empty list in list
# context).
sub number_of_documents {
    my ($self) = @_;
    if ( $self->is_one_doc_per_file ) {
        return $self->_files_per_zone;
    }
    return;
}

# Runs after the reader's own restart: resets every zone reader as well.
# "zones" is an array reference, so it is dereferenced as an array;
# dereferencing it as a hash dies with "Not a HASH reference".
after 'restart' => sub {
    my $self = shift;
    foreach my $zone_reader ( @{ $self->zones } ) {
        $zone_reader->reset();
    }
    return;
};

# Returns the labels (zone_label) of all zone readers, in the order in which
# the readers are stored in $self->zones.
sub zonelabels {
    my $self = shift;
    my @labels;
    foreach my $zone_reader ( @{ $self->zones } ) {
        push @labels, $zone_reader->zone_label;
    }
    return @labels;
}

# Reads the next document: one text per zone, split into sentences, with one
# bundle per sentence and one bundle zone per zone label.
#
# Steps:
#   1. fetch the next text from every zone reader;
#   2. create the document with new_document;
#   3. if save_doc_text is true, store each raw text in a document zone;
#   4. split each text with get_sentences_from_doc_text; a different number
#      of sentences across zones is a fatal error;
#   5. create a bundle per sentence and let fill_bundle_zone fill each zone.
#
# Returns the document, or nothing when no zone reader delivered a text.
sub next_document {
    my ($self) = @_;
    my @zone_readers = @{ $self->zones };

    # Texts are keyed by zone_label, the same accessor that zonelabels()
    # uses, so the keys read further below always match.
    my ( %texts, $got_text );
    foreach my $zone_reader (@zone_readers) {
        my $text = $zone_reader->next_document_text();
        $got_text = 1 if defined $text;
        $texts{ $zone_reader->zone_label } = $text;
    }

    # NOTE(review): assumes next_document_text returns undef once a zone
    # reader has no more input - confirm against the zone reader class.
    # Without this check an empty document would be returned forever.
    return if !$got_text;

    my $doc = $self->new_document();

    if ( $self->save_doc_text ) {
        foreach my $zone_reader (@zone_readers) {
            my $language = $zone_reader->language;
            my $selector = $zone_reader->selector;
            my $doczone  = $doc->create_zone( $language, $selector );
            $doczone->set_text( $texts{ $zone_reader->zone_label } );
        }
    }

    my @zonelabels = $self->zonelabels;
    my ( %sents, $n_sentences );
    my $same_n_sentences = 1;
    foreach my $zonelabel (@zonelabels) {
        my @sentences = $self->get_sentences_from_doc_text( $texts{$zonelabel}, $zonelabel );
        $sents{$zonelabel} = \@sentences;
        if ( !defined $n_sentences ) {
            $n_sentences = @sentences;
        }
        elsif ( $n_sentences != @sentences ) {
            $same_n_sentences = 0;
        }
    }

    if ( !$same_n_sentences ) {
        log_fatal( 'Different number of sentences for each zone: '
                . join( ', ', map { "$_=" . scalar( @{ $sents{$_} } ) } @zonelabels ) );
    }

    # Stays undefined only when there are no zone labels at all.
    $n_sentences = 0 if !defined $n_sentences;

    foreach my $i ( 0 .. $n_sentences - 1 ) {
        my $bundle = $doc->create_bundle();

        foreach my $zonelabel (@zonelabels) {

            # A label is either "<language>" or "<language>_<selector>".
            my ( $language, $selector ) = ( $zonelabel, '' );
            if ( $zonelabel =~ /_/ ) {
                ( $language, $selector ) = split /_/, $zonelabel;
            }
            my $zone = $bundle->create_zone( $language, $selector );
            $self->fill_bundle_zone( $zone, $sents{$zonelabel}[$i] );
        }
    }

    return $doc;
}

# Splits the raw text of one zone into sentences, one sentence per line.
#
# Parameters:
#   $doc_text   - raw text; undef is treated as an empty text
#   $zone_label - label of the zone the text belongs to (not used by this
#                 implementation)
#
# Returns the lines of $doc_text without their newline characters; trailing
# empty lines are dropped by split.
sub get_sentences_from_doc_text {
    my ( $self, $doc_text, $zone_label ) = @_;

    # Splitting undef yields the same empty result but emits an
    # "uninitialized value" warning.
    $doc_text = '' if !defined $doc_text;
    return split /\n/, $doc_text;
}

# Placeholder for filling one bundle zone from one raw sentence.
#
# Parameters:
#   $zone         - bundle zone created by next_document
#   $raw_sentence - the sentence that belongs to that zone
#
# This implementation only reports a fatal error via log_fatal.
sub fill_bundle_zone {
    my $self = shift;
    my ( $zone, $raw_sentence ) = @_;
    return log_fatal('fill_bundle_zone must be overriden');
}

1;