The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# Copyright 2008-2010 Tim Rayner
# 
# This file is part of Bio::MAGETAB.
# 
# Bio::MAGETAB is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
# 
# Bio::MAGETAB is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with Bio::MAGETAB.  If not, see <http://www.gnu.org/licenses/>.
#
# $Id: TagValueFile.pm 368 2012-05-28 15:49:02Z tfrayner $

package Bio::MAGETAB::Util::Reader::TagValueFile;

use Moose;
use MooseX::FollowPBP;

use Carp;
use List::Util qw(first);

use MooseX::Types::Moose qw( HashRef );

BEGIN { extends 'Bio::MAGETAB::Util::Reader::Tabfile' };

has 'text_store'          => ( is         => 'rw',
                               isa        => HashRef,
                               default    => sub { {} },
                               required   => 1 );

has 'dispatch_table'      => ( is         => 'rw',
                               isa        => HashRef,
                               default    => sub { {} },
                               required   => 1 );

# Define some standard regexps:
my $BLANK = qr/\A [ ]* \z/xms;
my $COMMENT_TAG = qr/\A \s* Comment \s* \[ ([^\]]+) \] \s* \z/ixms;

###################
# Private methods #
###################

sub _create_termsources {

    my ( $self ) = @_;

    my @termsources;
    TS:
    foreach my $ts_data ( @{ $self->get_text_store()->{'termsource'} } ) {

        # Term Sources without Names are never going to work, so we skip them.
        next TS unless ( defined $ts_data->{'name'}
                              && $ts_data->{'name'} !~ $BLANK );

        my $termsource = $self->get_builder()->find_or_create_term_source( $ts_data );
	push @termsources, $termsource;
    }

    return \@termsources;
}

sub _create_controlled_terms {

    my ( $self, $type, $category ) = @_;

    my @terms;
    TERM:
    foreach my $term_data ( @{ $self->get_text_store()->{ $type } } ) {

        # Value is required for all ControlledTerm objects.
        next TERM unless ( defined $term_data->{'value'}
                                && $term_data->{'value'} !~ $BLANK );

        my $termsource;
        if ( my $ts = $term_data->{'termSource'} ) {
            $termsource = $self->get_builder()->get_term_source({
                'name' => $ts,
            });
        }

        my $args = {
            'category'   => $category,
            'value'      => $term_data->{'value'},
        };
        $args->{'accession'}  = $term_data->{'accession'}
            if defined $term_data->{'accession'};
        $args->{'termSource'} = $termsource if $termsource;

        my $term = $self->get_builder()->find_or_create_controlled_term( $args );

	push @terms, $term;
    }

    return \@terms;
}

sub _create_comments {

    my ( $self ) = @_;

    my @comments;
    COMM:
    while ( my ( $name, $values ) = each %{ $self->get_text_store()->{'comment'} } ) {

        # Value is required for all Comment objects.
        foreach my $value ( @$values ) {
            next COMM unless ( defined $value
                                   && $value !~ $BLANK );
            
            my $comment = $self->get_builder()->find_or_create_comment({
                'name'    => $name,
                'value'   => $value,
            });

            push @comments, $comment;
        }
    }

    return \@comments;
}

sub _read_as_arrayref {

    # Method to parse the TagValueFile object file into an array of
    # arrayrefs. This method uses Text::CSV_XS to parse tab-delimited
    # text.

    my ( $self ) = @_;

    # This is required for Text::CSV_XS.
    local $/ = $self->get_eol_char();

    my (@rows, $larry);

    FILE_LINE:
    while ( $larry = $self->getline() ) {

        # Skip empty lines, comments.
        next FILE_LINE if $self->can_ignore( $larry );

	# Strip surrounding whitespace from each element.
        $larry = $self->strip_whitespace( $larry );

	# Strip off empty trailing values.
	my $end_value;
	until ( defined($end_value) && $end_value !~ /\A \s* \z/xms ) {
	    $end_value = pop(@$larry);
	}
	push @$larry, $end_value;

	# Reset empty strings to undefs.
	foreach my $value ( @$larry ) {
	    undef($value) if ( defined($value) && $value eq q{} );
	}

	push @rows, $larry;
    }

    # Check we've parsed to the end of the file.
    $self->confirm_full_parse();

    return \@rows;
}

sub _validate_arrayref_tags {

    # Method to check the return value from _read_as_arrayref to check
    # for (a) duplicate tags, and (b) unrecognised tags. Returns a
    # hash with keys corresponding to known tags and value arrayrefs
    # containing the file annotation.

    my ($self, $array_of_rows) = @_;

    # Array with row tags linked to rest-of-row arrayref values.
    my @file_data;
    foreach my $row ( @{ $array_of_rows } ) {
        push @file_data, [ $row->[0], [@{ $row }[1..$#$row] ] ];
    }

    # A list of acceptable tags, expressed as qr//
    my @acceptable = keys %{ $self->get_dispatch_table() };
    foreach my $datum ( @file_data ) {
        my ( $tag, $values ) = @$datum;

	# N.B. acceptable tag REs may contain whitespace; no x option
	# here.
	next if $tag =~ /\A\s*$COMMENT_TAG\s*\z/ms;

	# Check for recognised tags here.
	unless ( first { $tag =~ /\A\s*$_\s*\z/ms } @acceptable ) {
	    croak(qq{Error: unrecognized tag(s): "$tag"});
	}

	# Empty Name tags are invalid and will cause fatal crashes
	# later; we check for them here.
	if ( $tag =~ m/name \s* \z/ixms ) {
	    foreach my $value ( @$values ) {
		warn(
		    qq{Warning: Name attribute "$tag" is empty.\n}
		) unless $value;
	    }
	}
    }

    return \@file_data;
}

sub _add_grouped_data {

    # Create an ordered set of data groups indexed by $i.
    my ( $self, $group, $tag, @args ) = @_;

    for ( my $i = 0; $i <= $#args; $i++ ) {

        # It's conceivable that this should actually be valid, but
        # until someone comes up with a way to unambiguously group
        # more than one set of lines specifying a given list of
        # objects (e.g. protocols) *within the current specification*,
        # it's not practical to try and support it.
        if ( defined $self->get_text_store()->{ $group }[$i]{ $tag } ) {
            croak("Error: Duplicate $group $tag field encountered")
        }
        $self->get_text_store()->{ $group }[$i]{ $tag } = $args[$i];
    }

    return;
}

sub _add_singleton_data {

    # Record a 1:n object:args relationship.
    my ( $self, $group, $tag, @args ) = @_;

    # Make a copy of @args, just in case.
    my $data = $self->get_text_store()->{ $group }{ $tag } || [];
    push @{ $data }, @args;
    my %uniq = map { $_ => 1 } @{ $data };
    $self->get_text_store()->{ $group }{ $tag } = [ keys %uniq ];

    return;
}

sub _add_singleton_datum {

    # Record a 1:1 group:arg relationship.
    my ( $self, $group, $tag, $arg ) = @_;

    # These really aren't allowed to be duplicated so we throw an
    # error here rather than try and cope.
    if ( defined $self->get_text_store()->{ $group }{ $tag } ) {
        croak("Error: Duplicate $group $tag field encountered");
    }

    $self->get_text_store()->{ $group }{ $tag } = $arg;

    return;
}

sub _add_comment {

    # Comments are currently processed at the level of the top-level
    # enclosing object (Investigation or ArrayDesign) only.
    my ( $self, $name, @values ) = @_;

    my $comments = $self->get_text_store()->{ 'comment' }{ $name } || [];

    VALUE:
    foreach my $value ( @values ) {
        next VALUE if ( ! defined $value || $value =~ $BLANK );
        push @{ $comments }, $value;
    }

    my %uniq = map { $_ => 1 } @{ $comments };
    $self->get_text_store()->{ 'comment' }{ $name } = [ keys %uniq ];

    return;
}
    
sub _retrieve_sub {

    my ( $self, $tag ) = @_;

    my $rc;

    while ( my ( $key, $sub ) = each %{ $self->get_dispatch_table() } ) {

	# $key may contain whitespace, no x option here.
	if ( $tag =~ /\A\s*$key\s*\z/ms ) {
	    $rc = $sub;
	}
	
	# Have to loop through the rest of the list to reset while()
	# on the hash.
    }

    return $rc;
}

sub _dispatch {

    my ( $self, $tag, @args ) = @_;

    unless (defined $tag) {
	confess("Error: dispatch needs a defined tag name.");
    }

    my $sub = $self->_retrieve_sub( $tag );
    unless (defined $sub && ref $sub eq 'CODE') {
	if ( my ( $commentname ) = ( $tag =~ /\A\s*$COMMENT_TAG\s*\z/ms ) ) {
	    $self->_add_comment($commentname, @args);
	}
	else {

	    # This should have been caught in _validate_arrayref_tags
	    croak(qq{Error: Cannot parse the tag: "$tag".});
	}
    }

    return $sub ? $sub->(@args) : undef;
}

# Make the classes immutable. In theory this speeds up object
# instantiation for a small compilation time cost.
__PACKAGE__->meta->make_immutable();

no Moose;

=head1 NAME

Bio::MAGETAB::Util::Reader::TagValueFile - An abstract class providing methods for
handling tab-delimited files.

=head1 SYNOPSIS

 use base qw(Bio::MAGETAB::Util::Reader::TagValueFile);

=head1 DESCRIPTION

This is an abstract class providing methods for parsing tag-value
files such as the MAGE-TAB IDF component, and the header of ADF
files. It is not designed to be used directly.

=head1 ATTRIBUTES

See the L<Tabfile|Bio::MAGETAB::Util::Reader::Tabfile> class for superclass attributes.

=over 2

=item text_store

=item dispatch_table

These are internal caches not designed to be addressed directly.

=back

=head1 METHODS

No public methods.

=head1 SEE ALSO

L<Bio::MAGETAB::Util::Reader::Tabfile>
L<Bio::MAGETAB::Util::Reader>
L<Bio::MAGETAB::Util::Reader::ADF>
L<Bio::MAGETAB::Util::Reader::IDF>

=head1 AUTHOR

Tim F. Rayner <tfrayner@gmail.com>

=head1 LICENSE

This library is released under version 2 of the GNU General Public
License (GPL).

=cut

1;