The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# $Id: mgi_assoc_parser.pm,v 1.3 2006/08/05 20:26:12 cmungall Exp $
#
#
# see also - http://www.geneontology.org
#          - http://www.godatabase.org/dev
#
# You may distribute this module under the same terms as perl itself

package GO::Parsers::mgi_assoc_parser;

=head1 NAME

  GO::Parsers::mgi_assoc_parser - parses MGI gene assoc stanza files

=head1 SYNOPSIS

  do not use this class directly; use GO::Parser

=head1 DESCRIPTION

This is an EXPERIMENTAL module for converting MGIs in-house stanza format

this is richer than go_assoc files as it allows combinatorial assocs

these are represented with the go-assoc xml <in> element

=head1 AUTHOR

=cut

use Exporter;
use base qw(GO::Parsers::base_parser Exporter);
use GO::Parsers::ParserEventNames;

use Carp;
use FileHandle;
use strict;

sub dtd {
    'go_assoc-parser-events.dtd';
}

sub ev_filter {
    my $self = shift;
    $self->{_ev_filter} = shift if @_;
    return $self->{_ev_filter};
}



sub skip_uncurated {
    my $self = shift;
    $self->{_skip_uncurated} = shift if @_;
    return $self->{_skip_uncurated};
}

sub _parse_ids {
    my $txt = shift;
    $txt =~ s/^:+//;
    $txt =~ s/^\s+//;
    my @parts = split(/\|/,$txt);
    return grep {$_} map {if (/(\w+:\S+)/){$1}else {()}} @parts;
}

sub parse_fh {
    my ($self, $fh) = @_;
    my $file = $self->file;


    $self->start_event(ASSOCS);
    $self->start_event(DBSET);
    $self->event(PRODDB, 'mgi');

    my $curr_gene = '';
    while (<$fh>) {
        # UNICODE causes problems for XML and DB
        # delete 8th bit
        tr [\200-\377]
          [\000-\177];   # see 'man perlop', section on tr/
        # weird ascii characters should be excluded
        tr/\0-\10//d;   # remove weird characters; ascii 0-8
                        # preserve \11 (9 - tab) and \12 (10-linefeed)
        tr/\13\14//d;   # remove weird characters; 11,12
                        # preserve \15 (13 - carriage return)
        tr/\16-\37//d;  # remove 14-31 (all rest before space)
        tr/\177//d;     # remove DEL character

	chomp;
	if (/^\!/) {
	    next;
	}
	if (!$_) {
	    next;
	}

        if (/^MGI gene:\s(\S+);\s+(\S+)/) {
            if ($1 ne $curr_gene) {
                $self->pop_stack_to_depth(2);
                $self->start_event(PROD);
                $self->event(PRODACC, $1);
                $self->event(PRODSYMBOL, $2);
                $self->event(PRODTYPE, 'gene');
                $curr_gene = $1;
            }
        }
        if (/^GO term:\s(.*);\s+(\S+)/) {
	    $self->start_event(ASSOC);
	    $self->event(TERMACC, $2);
            while (<>) {
                chomp;
                last unless $_;
                if (/^GO evidence:\s*(.*)/) {
                    my $evtxt = $1;
                    foreach (split(/\s+\|\s+/,$evtxt)) {
                        $self->start_event(EVIDENCE);
                        $self->event(EVCODE, $_);
                        $self->end_event(EVIDENCE);
                    }
                }
                elsif (/^anatomy:(.*)/) {
                    my @ids = _parse_ids($1);
                    $self->event(property_value=>[[type=>'located_in'],
                                                  [to=>$_]]) foreach @ids;
                }
                elsif (/^cell type:(.*)/) {
                    my @ids = _parse_ids($1);
                    $self->event(property_value=>[[type=>'located_in'],
                                                  [to=>$_]]) foreach @ids;
                }
                else {
                }
            }
	    $self->end_event(ASSOC);
        }
    }
    $fh->close;

    $self->pop_stack_to_depth(0);
}


1;