# $Id: go_assoc_parser.pm,v 1.22 2009/08/17 00:46:16 cmungall Exp $
#
#
# see also - http://www.geneontology.org
# - http://www.godatabase.org/dev
#
# You may distribute this module under the same terms as perl itself
package GO::Parsers::go_assoc_parser;
=head1 NAME
GO::Parsers::go_assoc_parser - syntax parsing of GO gene-association flat files
=head1 SYNOPSIS
=head1 DESCRIPTION
do not use this class directly; use L<GO::Parser>
This generates Stag/XML event streams from GO association files.
Examples of these files can be found at http://www.geneontology.org,
an example of lines from an association file:
SGD S0004660 AAC1 GO:0005743 SGD:12031|PMID:2167309 TAS C ADP/ATP translocator YMR056C gene taxon:4932 20010118
SGD S0004660 AAC1 GO:0006854 SGD:12031|PMID:2167309 IDA P ADP/ATP translocator YMR056C gene taxon:4932 20010118
See L<http://www.geneontology.org/GO.annotation.shtml#file>
See
L<http://www.godatabase.org/dev/xml/dtd/go_assoc-parser-events.dtd>
For the DTD of the event stream that is generated
The following stag-schema describes the events that are generated in
parsing an assoc file:
(assocs
(dbset+
(proddb "s")
(prod+
(prodacc "s")
(prodsymbol "s")
(prodtype "s")
(prodtaxa "i")
(assoc+
(assocdate "i")
(source_db "s")
(termacc "s")
(is_not "i")
(aspect "s")
(evidence+
(evcode "s")
(ref "s"))))))
=cut
use Exporter;
use base qw(GO::Parsers::base_parser Exporter);
#use Text::Balanced qw(extract_bracketed);
use GO::Parsers::ParserEventNames;
use GO::Parser;
use Carp;
use FileHandle;
use strict;
sub dtd {
'go_assoc-parser-events.dtd';
}
sub ev_filter {
my $self = shift;
$self->{_ev_filter} = shift if @_;
return $self->{_ev_filter};
}
sub skip_uncurated {
my $self = shift;
$self->{_skip_uncurated} = shift if @_;
return $self->{_skip_uncurated};
}
sub parse_fh {
my ($self, $fh) = @_;
my $file = $self->file;
my $product;
my $term;
my $assoc;
my $line_no = 0;
my $obo_parser; # an OBO parser may be required for parsing the PROPERTIES column
my @COLS = (0..16);
my ($PRODDB,
$PRODACC,
$PRODSYMBOL,
$QUALIFIER,
$TERMACC,
$REF,
$EVCODE,
$WITH,
$ASPECT,
$PRODNAME,
$PRODSYN,
$PRODTYPE,
$PRODTAXA,
$ASSOCDATE,
$SOURCE_DB,
$PROPERTIES, # GAF2.0
$ISOFORM, # GAF2.0
) = @COLS;
my @mandatory_cols = ($PRODDB, $PRODACC, $TERMACC, $EVCODE);
# <assocs>
# <dbset>
# <db>fb</db>
# <prod>
# <prodacc>FBgn0027087</>
# <prodsym>Aats-his</>
# <prodtype>gene</>
# <prodtaxa>7227</>
# <prodsynonym>...</>
# <assoc>
# <termacc>GO:0004821</termacc>
# <evidence>
# <code>NAS</code>
# <ref>FB:FBrf0105495</ref>
# <with>...</with>
# </evidence>
# </assoc>
# </prod>
# </dbset>
# <assocs>
$self->start_event(ASSOCS);
$self->fire_source_event($file);
my @last = map {''} @COLS;
my $skip_uncurated = $self->skip_uncurated;
my $ev = $self->ev_filter;
my %evyes = ();
my %evno = ();
if ($ev) {
if ($ev =~ /\!(.*)/) {
$evno{$1} = 1;
}
else {
$evyes{$ev} = 1;
}
}
my $taxa_warning;
my $line;
my @vals;
my @stack = ();
while (<$fh>) {
# UNICODE causes problems for XML and DB
# delete 8th bit
tr [\200-\377]
[\000-\177]; # see 'man perlop', section on tr/
# weird ascii characters should be excluded
tr/\0-\10//d; # remove weird characters; ascii 0-8
# preserve \11 (9 - tab) and \12 (10-linefeed)
tr/\13\14//d; # remove weird characters; 11,12
# preserve \15 (13 - carriage return)
tr/\16-\37//d; # remove 14-31 (all rest before space)
tr/\177//d; # remove DEL character
$line_no++;
chomp;
if (/^\!/) {
next;
}
if (!$_) {
next;
}
# some files use string NULL - we just use empty string as null
s/\\NULL//g;
$line = $_;
$self->line($line);
$self->line_no($line_no);
@vals = split(/\t/, $line);
# normalise columns, and set $h
for (my $i=0; $i<@COLS;$i++) {
if (defined($vals[$i])) {
# remove trailing and
# leading blanks
$vals[$i] =~ s/^\s*//;
$vals[$i] =~ s/\s*$//;
# sometimes - is used for null
$vals[$i] =~ s/^\-$//;
# TAIR seem to be
# doing a mysql dump...
$vals[$i] =~ s/\\NULL//;
}
if (!defined($vals[$i]) ||
length ($vals[$i]) == 0) {
if ( grep {$i == $_} @mandatory_cols) {
$self->parse_err("no value defined for col ".($i+1)." in line_no $line_no line\n$line\n");
next;
}
$vals[$i] = '';
}
}
my ($proddb,
$prodacc,
$prodsymbol,
$qualifier,
$termacc,
$ref,
$evcode,
$with,
$aspect,
$prodname,
$prodsyn,
$prodtype,
$prodtaxa,
$assocdate,
$source_db,
$properties, # GAF2.0
$isoform) = @vals; # GAF2.0
# backward compatibility GAF2.0 -> GAF1.0
$properties = '' unless defined $properties;
$isoform = '' unless defined $isoform;
$assocdate = '' unless defined $assocdate;
$source_db = '' unless defined $source_db;
# if (!grep {$aspect eq $_} qw(P C F)) {
# $self->parse_err("Aspect column says: \"$aspect\" - aspect must be P/C/F");
# next;
# }
if ($self->acc_not_found($termacc)) {
$self->parse_err("No such ID: $termacc");
next;
}
if (!($ref =~ /:/)) {
# ref does not have a prefix - we assume it is medline
$ref = "medline:$ref";
}
if ($skip_uncurated && $evcode eq "IEA") {
next;
}
if (%evyes && !$evyes{$evcode}) {
next;
}
if (%evno && $evno{$evcode}) {
next;
}
my @prodtaxa_ids = split(/\|/,$prodtaxa);
@prodtaxa_ids =
map {
s/taxonid://gi;
s/taxon://gi;
if ($_ !~ /\d+/) {
if (!$taxa_warning) {
$taxa_warning = 1;
$self->parse_err("No NCBI TAXON wrong fmt: $_");
$_ = "";
}
}
$_;
} @prodtaxa_ids;
@prodtaxa_ids = grep {$_} @prodtaxa_ids;
my $main_taxon_id = shift @prodtaxa_ids;
if (!$main_taxon_id) {
if (!$taxa_warning) {
$taxa_warning = 1;
$self->parse_err("No NCBI TAXON specified; ignoring");
}
}
# check for new element; shift a level
my $new_dbset = $proddb ne $last[$PRODDB];
my $new_prodacc =
$prodacc ne $last[$PRODACC] || $new_dbset;
my $new_assoc =
($termacc ne $last[$TERMACC]) ||
$new_prodacc ||
($qualifier ne $last[$QUALIFIER]) ||
($source_db ne $last[$SOURCE_DB]) ||
($assocdate ne $last[$ASSOCDATE]) ||
($isoform ne $last[$ISOFORM]);
#if (!$new_prodacc && ($prodtaxa ne $last[$PRODTAXA])) {
## Before we declare an error, let's make sure that we're not
## talking about secondary taxons...
my $chopped_taxa = $prodtaxa;
my $chopped_prev_taxa = $last[$PRODTAXA];
$chopped_taxa =~ s/\|.+//;
$chopped_prev_taxa =~ s/\|.+//;
if (!$new_prodacc && ($chopped_taxa ne $chopped_prev_taxa)) {
# two identical gene products with the same taxon
# IGNORE!
$self->parse_err("different taxa ($prodtaxa, $last[$PRODTAXA]) for same product $prodacc");
next;
}
# close finished events
if ($new_assoc) {
$self->pop_stack_to_depth(3) if $last[$TERMACC];
# $self->end_event("assoc") if $last[$TERMACC];
}
if ($new_prodacc) {
$self->pop_stack_to_depth(2) if $last[$PRODACC];
# $self->end_event("prod") if $last[$PRODACC];
}
if ($new_dbset) {
$self->pop_stack_to_depth(1) if $last[$PRODDB];
# $self->end_event("dbset") if $last[$PRODDB];
}
# open new events
if ($new_dbset) {
$self->start_event(DBSET);
$self->event(PRODDB, $proddb);
}
if ($new_prodacc) {
$self->start_event(PROD);
$self->event(PRODACC, $prodacc);
$self->event(PRODSYMBOL, $prodsymbol);
$self->event(PRODNAME, $prodname) if $prodname;
$self->event(PRODTYPE, $prodtype) if $prodtype;
if ($main_taxon_id) {
$self->event(PRODTAXA, $main_taxon_id);
}
my $syn = $prodsyn;
if ($syn) {
my @syns = split(/\|/, $syn);
my %ucheck = ();
@syns = grep {
if ($ucheck{lc($_)}) {
0;
}
else {
$ucheck{lc($_)} = 1;
1;
}
} @syns;
map {
$self->event(PRODSYN, $_);
} @syns;
}
}
if ($new_assoc) {
my $assocdate = $assocdate;
$self->start_event(ASSOC);
if ($assocdate) {
if ($assocdate && length($assocdate) == 8) {
$self->event(ASSOCDATE, $assocdate);
}
else {
$self->parse_err("ASSOCDATE wrong format (must be YYYYMMDD): $assocdate");
}
}
$self->event(SOURCE_DB, $source_db)
if $source_db;
$self->event(TERMACC, $termacc);
my @quals = map lc,split(/[\|]\s*/,$qualifier || '');
my $is_not = grep {/^not$/i} @quals;
$self->event(IS_NOT, $is_not || '0');
$self->event(QUALIFIER, $_) foreach @quals;
$self->event(SPECIES_QUALIFIER, $_) foreach @prodtaxa_ids; # all REMAINING (after "|') tax ids are qualifiers
$self->event(ASPECT, $aspect);
if ($isoform) {
$self->event(ISOFORM, $isoform);
}
if ($properties) {
my @properties_list = split(/\|/,$properties);
if (!$obo_parser) {
$obo_parser = GO::Parser->new({format=>'obo_text'});
}
foreach my $p (@properties_list) {
my $diffs = $obo_parser->parse_differentia($p);
$self->event(PROPERTIES, $diffs);
}
}
}
$self->start_event(EVIDENCE);
$self->event(EVCODE, $evcode);
if ($with) {
# TODO: discriminate between pipes and commas
# (semicolon is there for legacy reasons - check if this can be removed)
my @with_accs = split(/\s*[\|\;\,]\s*/, $with);
$self->event(WITH, $_)
foreach (grep (/:/, @with_accs));
# we have found errors where the : was left out, this just skips
# no longer checks for cardinality errors
}
my @refs = split(/\|/, $ref);
map {
$self->event(REF, $_)
} @refs;
$self->end_event(EVIDENCE);
#@last = @vals;
@last =
(
$proddb,
$prodacc,
$prodsymbol,
$qualifier,
$termacc,
$ref,
$evcode,
$with,
$aspect,
$prodname,
$prodsyn,
$prodtype,
$prodtaxa,
$assocdate,
$source_db,
$properties,
$isoform,
);
}
$fh->close;
$self->pop_stack_to_depth(0);
}
1;
# 2.864 orig/handler
# 2.849 opt/handler
# 1.986 orig/xml
# 1.310 opt/xml