The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# Copyright 2008-2010 Tim Rayner
# 
# This file is part of Bio::MAGETAB.
# 
# Bio::MAGETAB is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
# 
# Bio::MAGETAB is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with Bio::MAGETAB.  If not, see <http://www.gnu.org/licenses/>.
#
# $Id: IDF.pm 372 2012-08-01 14:01:42Z tfrayner $

package Bio::MAGETAB::Util::Reader::IDF;

use Moose;
use MooseX::FollowPBP;

use Carp;
use List::Util qw(first);

BEGIN { extends 'Bio::MAGETAB::Util::Reader::TagValueFile' };

has 'magetab_object'     => ( is         => 'rw',
                              isa        => 'Bio::MAGETAB::Investigation',
                              required   => 0 );

has 'document_version'   => ( is         => 'rw',
                              isa        => 'Str',
                              required   => 1,
                              default    => '1.0' );

# Define some standard regexps:
my $BLANK = qr/\A [ ]* \z/xms;

sub BUILD {

    my ( $self, $params ) = @_;

    # Dispatch table to direct each field to the appropriate place in
    # the text_store hashref. First argument is the internal tag used
    # to group the fields into concepts, the second is the
    # Bio::MAGETAB attribute name for the object.
    my $dispatch = {
        qr/Investigation *Title/i
            => sub{ $self->_add_singleton_datum('investigation', 'title',          @_) },
        qr/Date *Of *Experiment/i
            => sub{ $self->_add_singleton_datum('investigation', 'date',           @_) },
        qr/Public *Release *Date/i
            => sub{ $self->_add_singleton_datum('investigation', 'publicReleaseDate', @_) },
        qr/Experiment *Description/i
            => sub{ $self->_add_singleton_datum('investigation', 'description',    @_) },
        qr/SDRF *Files?/i
            => sub{ $self->_add_singleton_data('sdrf', 'uris',   @_) },

        qr/Experimental *Factor *Names?/i
            => sub{ $self->_add_grouped_data('factor', 'name',       @_) },
        qr/Experimental *Factor *Types?/i
            => sub{ $self->_add_grouped_data('factor', 'factorType', @_) },
        qr/Experimental *Factor *(?:Types?)? *Term *Source *REF/i
            => sub{ $self->_add_grouped_data('factor', 'termSource', @_) },
        qr/Experimental *Factor *(?:Types?)? *Term *Accession *Numbers?/i
            => sub{ $self->_add_grouped_data('factor', 'accession',  @_) },

        qr/Person *Last *Names?/i
            => sub{ $self->_add_grouped_data('person', 'lastName',    @_) },
        qr/Person *First *Names?/i
            => sub{ $self->_add_grouped_data('person', 'firstName',   @_) },
        qr/Person *Mid *Initials?/i
            => sub{ $self->_add_grouped_data('person', 'midInitials', @_) },
        qr/Person *Emails?/i
            => sub{ $self->_add_grouped_data('person', 'email',       @_) },
        qr/Person *Phones?/i
            => sub{ $self->_add_grouped_data('person', 'phone',       @_) },
        qr/Person *Fax(es)?/i
            => sub{ $self->_add_grouped_data('person', 'fax',         @_) },
        qr/Person *Address(es)?/i
            => sub{ $self->_add_grouped_data('person', 'address',     @_) },
        qr/Person *Affiliations?/i
            => sub{ $self->_add_grouped_data('person', 'organization', @_) },
        qr/Person *Roles?/i
            => sub{ $self->_add_grouped_data('person', 'roles',       @_) },
        qr/Person *Roles? *Term *Source *REF/i
            => sub{ $self->_add_grouped_data('person', 'termSource',  @_) },
        qr/Person *Roles? *Term *Accession *Numbers?/i
            => sub{ $self->_add_grouped_data('person', 'accession',  @_) },

        qr/Experimental *Designs?/i
            => sub{ $self->_add_grouped_data('design', 'value',     @_) },
        qr/Experimental *Designs? *Term *Source *REF/i
            => sub{ $self->_add_grouped_data('design', 'termSource', @_) },
        qr/Experimental *Designs? *Term *Accession *Numbers?/i
            => sub{ $self->_add_grouped_data('design', 'accession', @_) },
        qr/Quality *Control *Types?/i
            => sub{ $self->_add_grouped_data('qualitycontrol', 'value',       @_) },
        qr/Quality *Control *(?:Types?)? *Term *Source *REF/i
            => sub{ $self->_add_grouped_data('qualitycontrol', 'termSource', @_) },
        qr/Quality *Control *(?:Types?)? *Term *Accession *Numbers?/i
            => sub{ $self->_add_grouped_data('qualitycontrol', 'accession', @_) },
        qr/Replicate *Types?/i
            => sub{ $self->_add_grouped_data('replicate',      'value',       @_) },
        qr/Replicate *(?:Types?)? *Term *Source *REF/i
            => sub{ $self->_add_grouped_data('replicate',      'termSource', @_) },
        qr/Replicate *(?:Types?)? *Term *Accession *Numbers?/i
            => sub{ $self->_add_grouped_data('replicate',      'accession', @_) },
        qr/Normali[sz]ation *Types?/i
            => sub{ $self->_add_grouped_data('normalization',  'value',       @_) },
        qr/Normali[sz]ation *(?:Types?)? *Term *Source *REF/i
            => sub{ $self->_add_grouped_data('normalization',  'termSource', @_) },
        qr/Normali[sz]ation *(?:Types?)? *Term *Accession *Numbers?/i
            => sub{ $self->_add_grouped_data('normalization',  'accession', @_) },
 
        qr/PubMed *IDs?/i
            => sub{ $self->_add_grouped_data('publication', 'pubMedID',   @_) },
        qr/Publication *DOIs?/i
            => sub{ $self->_add_grouped_data('publication', 'DOI',        @_) },
        qr/Publication *Authors? *Lists?/i
            => sub{ $self->_add_grouped_data('publication', 'authorList', @_) },
        qr/Publication *Titles?/i
            => sub{ $self->_add_grouped_data('publication', 'title',      @_) },
        qr/Publication *Status/i
            => sub{ $self->_add_grouped_data('publication', 'status',     @_) },
        qr/Publication *Status *Term *Source *REF/i
            => sub{ $self->_add_grouped_data('publication', 'termSource', @_) },
        qr/Publication *Status *Term *Accession *Numbers?/i
            => sub{ $self->_add_grouped_data('publication', 'accession', @_) },

        qr/Protocol *Names?/i
            => sub{ $self->_add_grouped_data('protocol', 'name',        @_) },
        qr/Protocol *Descriptions?/i
            => sub{ $self->_add_grouped_data('protocol', 'text', @_) },
        qr/Protocol *Parameters?/i
            => sub{ $self->_add_grouped_data('protocol', 'parameters',  @_) },
        qr/Protocol *Hardwares?/i
            => sub{ $self->_add_grouped_data('protocol', 'hardware',    @_) },
        qr/Protocol *Softwares?/i
        => sub{ $self->_add_grouped_data('protocol', 'software',    @_) },
        qr/Protocol *Contacts?/i
            => sub{ $self->_add_grouped_data('protocol', 'contact',     @_) },
        qr/Protocol *Types?/i
            => sub{ $self->_add_grouped_data('protocol', 'protocolType', @_) },
        qr/Protocol *(?:Types?)? *Term *Source *REF/i
            => sub{ $self->_add_grouped_data('protocol', 'termSource',  @_) },
        qr/Protocol *(?:Types?)? *Term *Accession *Numbers?/i
            => sub{ $self->_add_grouped_data('protocol', 'accession',  @_) },

        qr/Term *Source *Names?/i
            => sub{ $self->_add_grouped_data('termsource', 'name',     @_) },
        qr/Term *Source *Files?/i
            => sub{ $self->_add_grouped_data('termsource', 'uri',      @_) },
        qr/Term *Source *Versions?/i
            => sub{ $self->_add_grouped_data('termsource', 'version',  @_) },

        qr/MAGE-?TAB *Version/i    # New in 1.1; Strictly speaking 1.0 should never appear.
            => sub{ $self->set_document_version($_[0]);
                    croak("Unsupported MAGE-TAB version.") unless( first { $_[0] eq $_ } qw(1.1 1.0) ) },
    };

    $self->set_dispatch_table( $dispatch );

    return;
}

##################
# Public methods #
##################

sub parse {

    my ( $self ) = @_;

    # Parse the IDF file into memory here.
    my $array_of_rows = $self->_read_as_arrayref();

    # Check tags for duplicates, make sure that tags are recognized.
    my $idf_data = $self->_validate_arrayref_tags( $array_of_rows );

    # Populate the IDF object's internal data text_store attribute.
    foreach my $datum ( @$idf_data ) {
        my ( $tag, $values ) = @$datum;
	$self->_dispatch( $tag, @$values );
    }

    # Actually generate the Bio::MAGETAB objects.
    my ( $investigation, $magetab ) = $self->_generate_magetab();

    return wantarray ? ( $investigation, $magetab ) : $investigation;
}

###################
# Private methods #
###################

sub _generate_magetab {

    my ( $self ) = @_;

    my $magetab       = $self->get_builder()->get_magetab();
    my $investigation = $self->_create_investigation();

    return ( $investigation, $magetab );
}

sub _create_sdrfs {

    my ( $self ) = @_;

    my @sdrfs;

    SDRF:
    foreach my $uri ( @{ $self->get_text_store()->{ 'sdrf' }{ 'uris' } } ) {

        # URI is required for all SDRF objects.
        next SDRF unless ( defined $uri
                                && $uri !~ $BLANK );

        my $sdrf = $self->get_builder()->find_or_create_sdrf({
            uri => $uri,
        });
        push @sdrfs, $sdrf;
    }

    return \@sdrfs;
}

sub _create_factors {

    my ( $self ) = @_;

    my @factors;
    FACTOR:
    foreach my $f_data ( @{ $self->get_text_store()->{ 'factor' } } ) {

        # Name is required for all Factor objects.
        next FACTOR unless ( defined $f_data->{'name'}
                                  && $f_data->{'name'} !~ $BLANK );

        my %args = ('name' => $f_data->{'name'} );

        if ( $f_data->{'factorType'} ) {

            my $termsource;
            if ( my $ts = $f_data->{'termSource'} ) {
                $termsource = $self->get_builder()->get_term_source({
                    'name' => $ts,
                });
            }
            
            my $type = $self->get_builder()->find_or_create_controlled_term({
                'category'   => 'ExperimentalFactorCategory',
                'value'      => $f_data->{'factorType'},
                'termSource' => $termsource,
            });

            if ( defined $f_data->{'accession'} && ! defined $type->get_accession() ) {
                $type->set_accession( $f_data->{'accession'} );
                $self->get_builder()->update( $type );
            }

            $args{'factorType'} = $type,
        }

        my $factor = $self->get_builder()->find_or_create_factor( \%args );

	push @factors, $factor;
    }

    return \@factors;
}

sub _create_people {

    my ( $self ) = @_;

    my @people;
    PERSON:
    foreach my $p_data ( @{ $self->get_text_store()->{ 'person' } } ) {

        # Something is required for all Contact objects. MidInitials
        # just doesn't cut it.
        my $id_found;
        foreach my $key ( qw( lastName firstName ) ) {
            $id_found++ if ( defined $p_data->{$key}
                                  && $p_data->{$key} !~ $BLANK );
        }
        next PERSON unless $id_found;

        my $termsource;
        if ( my $ts = $p_data->{'termSource'} ) {
            $termsource = $self->get_builder()->get_term_source({
                'name' => $ts,
            });
        }

        my @roles = map {
            my $role = $self->get_builder()->find_or_create_controlled_term({
                'category'   => 'Roles',
                'value'      => $_,
                'termSource' => $termsource,
            });
            if ( defined $p_data->{'accession'} && ! defined $role->get_accession() ) {
                $role->set_accession( $p_data->{'accession'} );
                $self->get_builder()->update( $role );
            }                
            $role;
        } split /\s*;\s*/, ( $p_data->{'roles'} || q{} );

        my @wanted = grep { $_ !~ /^roles|termSource|accession$/ } keys %{ $p_data };
        my %args   = map { $_ => $p_data->{$_} } @wanted;
        $args{'roles'} = \@roles;

        my $person = $self->get_builder()->find_or_create_contact( \%args );

	push @people, $person;
    }

    return \@people;
}

sub _create_protocols {

    my ( $self ) = @_;

    my @protocols;
    PROTOCOL:
    foreach my $p_data ( @{ $self->get_text_store()->{ 'protocol' } } ) {

        # Name is required for all Protocol objects.
        next PROTOCOL unless ( defined $p_data->{'name'}
                                    && $p_data->{'name'} !~ $BLANK );

        my @wanted = grep { $_ !~ /^parameters|protocolType|termSource|accession$/ } keys %{ $p_data };
        my %args   = map { $_ => $p_data->{$_} } @wanted;

        if ( defined $p_data->{'protocolType'} ) {

            my $termsource;
            if ( my $ts = $p_data->{'termSource'} ) {
                $termsource = $self->get_builder()->get_term_source({
                    'name' => $ts,
                });
            }

            my $type = $self->get_builder()->find_or_create_controlled_term({
                'category'   => 'ProtocolType',
                'value'      => $p_data->{'protocolType'},
                'termSource' => $termsource,
            });

            if ( defined $p_data->{'accession'} && ! defined $type->get_accession() ) {
                $type->set_accession( $p_data->{'accession'} );
                $self->get_builder()->update( $type );
            }

            $args{'protocolType'} = $type;
        }

        my $protocol = $self->get_builder()->find_or_create_protocol( \%args );

        if ( my $parameters = $p_data->{'parameters'} ) {
            foreach my $paramname ( split /\s*;\s*/, $parameters ) {
                $self->get_builder()->find_or_create_protocol_parameter({
                    'name'       => $paramname,
                    'protocol'   => $protocol,
                });
            }
        }

	push @protocols, $protocol;
    }

    return \@protocols;
}

sub _create_publications {

    my ( $self ) = @_;

    my @publications;
    PUBL:
    foreach my $p_data ( @{ $self->get_text_store()->{ 'publication' } } ) {

        # Title is required for all Publication objects.
        next PUBL unless ( defined $p_data->{'title'}
                                && $p_data->{'title'} !~ $BLANK );

        my @wanted = grep { $_ !~ /^status|termSource|accession$/ } keys %{ $p_data };
        my %args   = map { $_ => $p_data->{$_} } @wanted;

        if ( defined $p_data->{'status'} ) {

            my $termsource;
            if ( my $ts = $p_data->{'termSource'} ) {
                $termsource = $self->get_builder()->get_term_source({
                    'name' => $ts,
                });
            }

            my $status = $self->get_builder()->find_or_create_controlled_term({
                'category'   => 'PublicationStatus',
                'value'      => $p_data->{'status'},
                'termSource' => $termsource,
            });

            if ( defined $p_data->{'accession'} && ! defined $status->get_accession() ) {
                $status->set_accession( $p_data->{'accession'} );
                $self->get_builder()->update( $status );
            }

            $args{'status'} = $status;
        }

        my $publication = $self->get_builder()->find_or_create_publication( \%args );

	push @publications, $publication;
    }

    return \@publications;
}

sub _create_investigation {

    my ( $self ) = @_;

    # Term Sources. These must be created first.
    my $term_sources = $self->_create_termsources();

    my $factors      = $self->_create_factors();
    my $people       = $self->_create_people();
    my $protocols    = $self->_create_protocols();
    my $publications = $self->_create_publications();

    my $design_types        = $self->_create_controlled_terms(
        'design',         'ExperimentDesignType',
    );
    my $normalization_types = $self->_create_controlled_terms(
        'normalization',  'NormalizationDescriptionType',
    );
    my $replicate_types     = $self->_create_controlled_terms(
        'replicate',      'ReplicateDescriptionType',
    );
    my $qc_types            = $self->_create_controlled_terms(
        'qualitycontrol', 'QualityControlDescriptionType',
    );

    my $sdrfs = $self->_create_sdrfs();

    my $data = $self->get_text_store()->{'investigation'};
    
    my $investigation;
    if ( $investigation = $self->get_magetab_object() ) {
        while ( my ( $key, $value ) = each %{ $data } ) {
            my $setter = "set_$key";
            $investigation->$setter( $value ) if defined $value;
        }
    }
    else {
        $investigation = $self->get_builder()->find_or_create_investigation({
            %{ $data },
        });
        $self->set_magetab_object( $investigation );
    }

    $investigation->set_contacts            ( $people              ) if @$people;
    $investigation->set_protocols           ( $protocols           ) if @$protocols;
    $investigation->set_publications        ( $publications        ) if @$publications;
    $investigation->set_factors             ( $factors             ) if @$factors;
    $investigation->set_termSources         ( $term_sources        ) if @$term_sources;
    $investigation->set_designTypes         ( $design_types        ) if @$design_types;
    $investigation->set_normalizationTypes  ( $normalization_types ) if @$normalization_types;
    $investigation->set_replicateTypes      ( $replicate_types     ) if @$replicate_types;
    $investigation->set_qualityControlTypes ( $qc_types            ) if @$qc_types;
    $investigation->set_sdrfs               ( $sdrfs               ) if @$sdrfs;

    my $comments = $self->_create_comments();
    $investigation->set_comments( $comments );
    $self->get_builder()->update( $investigation );

    return $investigation;
}

# Make the classes immutable. In theory this speeds up object
# instantiation for a small compilation time cost.
__PACKAGE__->meta->make_immutable();

no Moose;

=head1 NAME

Bio::MAGETAB::Util::Reader::IDF - IDF parser class.

=head1 SYNOPSIS

 use Bio::MAGETAB::Util::Reader::IDF;
 my $parser = Bio::MAGETAB::Util::Reader::IDF->new({
     uri => $idf_filename,
 });
 my $investigation = $parser->parse();

=head1 DESCRIPTION

This class is used to parse IDF files. It can be used on its own, but
more often you will want to use the main Bio::MAGETAB::Util::Reader
class which handles extended parsing options more transparently.

=head1 ATTRIBUTES

See the L<TagValueFile|Bio::MAGETAB::Util::Reader::TagValueFile> class for superclass attributes.

=over 2

=item magetab_object

A Bio::MAGETAB::Investigation object. This can either be set upon
instantiation, or a new object will be created for you. It can be
retrieved at any time using C<get_magetab_object>.

=item document_version

A string representing the MAGE-TAB version used in the parsed
document. This is populated by the parse() method.

=back

=head1 METHODS

=over 2

=item parse

Parse the IDF pointed to by C<$self-E<gt>get_uri()>. Returns the
Bio::MAGETAB::Investigation object updated with the IDF contents.

=back

=head1 SEE ALSO

L<Bio::MAGETAB::Util::Reader::TagValueFile>
L<Bio::MAGETAB::Util::Reader::Tabfile>
L<Bio::MAGETAB::Util::Reader>
L<Bio::MAGETAB::Investigation>

=head1 AUTHOR

Tim F. Rayner <tfrayner@gmail.com>

=head1 LICENSE

This library is released under version 2 of the GNU General Public
License (GPL).

=cut

1;