# $Id: GoaParser.pm 2159 2013-02-20 Erick Antezana $
#
# Module : GoaParser.pm
# Purpose : Parse GOA files
# License : Copyright (c) 2006-2014 by Vladimir Mironov. All rights reserved.
# This program is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
# Contact : erick.antezana@gmail.com
#
package OBO::Parser::GoaParser;
use strict;
use warnings;
use Carp;
use open qw(:std :utf8); # Make All I/O Default to UTF-8
use OBO::Core::Term;
use OBO::APO::GoaAssociation;
use OBO::APO::GoaAssociationSet;
# use Data::Dumper;
$Carp::Verbose = 0;
my $verbose = 0;
sub new {
my $class = shift;
my $self = {};
bless ( $self, $class );
return $self;
}
=head2 parse
Usage - $GoaParser->parse ( $FH, $map )
Returns - OBO::APO::GoaAssociationSet object
Args -
1. indirect filehandle to a GOA associations file
2. ref to a hash { UniProtAC => UniProtID } or { GO ID => GO term name } to filter by, optional
Function - converts a GOA associations file into a OBO::APO::GoaAssociationSet object
=cut
sub parse {
# TODO remove the option of filtering by GO map
# TODO accomodate annotations including multiple taxa like: 'taxon:10090|taxon:10360', taxon:9606|taxon:44130
my (
$self,
$in_file_path,
$map # hash ref, either UP map or GO map, optional
) = @_;
open my $FH, '<', $in_file_path or croak "Can't open file '$in_file_path': $! ";
my $goaAssocSet = OBO::APO::GoaAssociationSet->new ( );
# assumption: the map is not empty and no blank lines
my $map_source;
if ( $map ) {
# identify the type of the map
my @ids = keys %{ $map };
if ( $ids[0] =~ /\AGO:\d{7}\z/xms ) {
$map_source = 'GO';
}
elsif ( $ids[0] =~ /\A\w{6}\z/xms ){
$map_source = 'UniProtKB';
}
else {
carp "An illegal ID in the map! ";
}
print "Filtering by $map_source map\n" if $verbose;
}
# Populate the OBO::APO::GoaAssociationSet class with objects
my $count_added =0;
my $count_rejected =0;
while ( <$FH> ){
chomp;
my @fields = split ( /\t/ );
next if ( @fields != 15 );
my $goaAssoc = load_data ( \@fields, $map, $map_source );
$goaAssoc ? $count_added++ : $count_rejected++;
$goaAssocSet->add_unique ( $goaAssoc ) if $goaAssoc;
}# end of while $FH
close $FH;
print "accepted associations: $count_added, rejected associations: $count_rejected\n" if $verbose;
! $goaAssocSet->is_empty ( ) ? return $goaAssocSet : carp "The set is empty ! $! ";
}
=head2 work
Usage - $GoaParser->work ( $ontology, $data, $up_map, $parent_protein_name ) # the last arg is optional
Returns - a data structure with added proteins ( { NCBI_ID => {UP_AC => OBO::Core::Term object}} )
Args -
1. OBO::Core::Ontology object,
2. OBO::APO::GoaAssociationSet object,
3. parent term name for proteins ( string ), # to link new proteins to, e.g. 'gene regulation protein'
Function - adds GO associations ( and optionally protein terms ) to ontology
=cut
sub work {
my (
$self,
$onto,
$data,
$parent_protein_name,
) = @_ ;
croak "Not enough arguments! " if ! ( $onto and $data );
my $parent_protein;
my $taxon;
if ( $parent_protein_name ) {
$parent_protein = $onto->get_term_by_name ( $parent_protein_name ) || croak "No term for $parent_protein_name in ontology: $! ";
}
my $is_a = 'is_a';
my $participates_in = 'participates_in';
my $located_in = 'located_in';
my $has_function = 'has_function';
my $has_source = 'has_source';
my @rel_types = ( $is_a, $participates_in, $located_in, $has_function, $has_source );
foreach ( @rel_types ){
croak "'$_' is not in ontology" unless ( $onto->{RELATIONSHIP_TYPES}->{$_} );
}
my %proteins; # { NCBI_ID => {UP_AC => OBO::Core::Term object}}
my %go_terms; # { GO_id => OBO::Core::Term object }
my %taxa; # { NCBI_id => OBO::Core::Term object }
foreach my $goaAssoc ( @{$data->{SET}} ){
# get GO term
my $go_id = $goaAssoc->go_id ( );
next if ( $go_id eq 'GO:0008150' ); # remove associations with 'biological_process'
my $go_term = $go_terms{$go_id};
if ( ! $go_term ) { # $go_term is not yet in the hash
$go_term = $onto->get_term_by_id ( $go_id );
next if ( ! $go_term );
$go_terms{$go_id} = $go_term;
}
# get taxon
# for multiple taxa in a single association: taxon:9606|taxon:44130
$goaAssoc->taxon ( ) =~ /\Ataxon:(\d+)/xms;
my $ncbi_id = $1 or carp "No NCBI id: $!";
my $taxon_id = "NCBI:$ncbi_id";
my $taxon = $taxa{$ncbi_id};
if ( ! $taxon ) {
$taxon = $onto->get_term_by_id ( $taxon_id ) || carp "No taxon term for $taxon_id in ontology: $! ";
next if ! $taxon;
$taxa{$ncbi_id} = $taxon;
}
# get protein term
my $obj_id = $goaAssoc->obj_id ( );
my $db = $goaAssoc->obj_src ( );
my $prot_id = "$db:$obj_id";
my $protein = $proteins{$ncbi_id}{$obj_id};
if ( ! $protein ) { # $protein is not yet in the hash
$protein = $onto->get_term_by_id ( $prot_id );
if ( ! $protein ) { # $protein is not yet in the ontolgy
if ( $parent_protein_name ) { # testing whether new protein terms should be created
$protein = OBO::Core::Term->new ( );
$protein->id ( $prot_id );
$onto->add_term ( $protein );
$onto->create_rel ( $protein, $has_source, $taxon );
$onto->create_rel ( $protein, $is_a, $parent_protein );
}
else {
# the protein not in the ontology and no new terms should be created
# normally should not happen
carp "No protein term in the ontology for $prot_id";
next;
}
}
$proteins{$ncbi_id}{$obj_id} = $protein;
}
# create relations
my $aspect = $goaAssoc->aspect ( );
if ( $aspect eq 'F' ) {
$onto->create_rel ( $protein, $has_function, $go_term );
}
elsif ( $aspect eq 'C' ) {
$onto->create_rel ( $protein, $located_in, $go_term );
# $onto->create_rel ( $go_term, $location_of, $protein ); # inverse of 'located_in'
}
elsif ( $aspect eq 'P' ) {
$onto->create_rel ( $protein, $participates_in, $go_term );
# $onto->create_rel ( $go_term, $has_participant, $protein ); # inverse of 'participates_in'
}
else {carp "An illegal GO aspect '$aspect'\n"}
}
return \%proteins;
}
sub load_data {
my ( $fields, $map, $map_source ) = @_;
my @fields = @{$fields};
foreach ( @fields ) {
$_ =~ s/^\s+//;
$_ =~ s/\s+$//;
}
if ( $map ) {
if ( $map_source eq 'GO' ) {
return 0 unless $map->{$fields[4]};
}
if ( $map_source eq 'UniProtKB' ) {
return 0 unless $map->{$fields[1]};
}
}
my $goaAssoc = OBO::APO::GoaAssociation->new ( );
$goaAssoc->assc_id ( $. );
$goaAssoc->obj_src ( $fields[0] );
$goaAssoc->obj_id ( $fields[1] );
$goaAssoc->obj_symb ( $fields[2] );
$goaAssoc->qualifier ( $fields[3] );
$goaAssoc->go_id ( $fields[4] );
$goaAssoc->refer ( $fields[5] );
$goaAssoc->evid_code ( $fields[6] );
$goaAssoc->sup_ref ( $fields[7] );
$goaAssoc->aspect ( $fields[8] );
$goaAssoc->description ( $fields[9] );
$goaAssoc->synonym ( $fields[10] );
$goaAssoc->type ( $fields[11] );
$goaAssoc->taxon ( $fields[12] );
$goaAssoc->date ( $fields[13] );
$goaAssoc->annot_src ( $fields[14] );
return $goaAssoc;
}
1;
__END__
=head1 NAME
OBO::Parser::GoaParser - A GOA associations to OBO translator.
=head1 DESCRIPTION
Includes methods for adding information from GOA association files to ontologies
GOA associations files can be obtained from http://www.ebi.ac.uk/GOA/proteomes.html
The method 'parse' parses the GOA association file and optioanlly filters data by a map
The method 'work' incorporates OBJ_SRC, OBJ_ID, OBJ_SYMB, SYNONYM, DESCRIPTION into the input ontology, writes the ontology into an OBO file, writes map files.
This method assumes:
- the ontology contains all and only the necessary GO terms.
- the ontology contains the relationship types 'is_a', 'participates_in', 'has_participant'
=head1 AUTHOR
Vladimir Mironov E<lt>vladimir.n.mironov@gmail.comE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (c) 2006-2014 by Vladimir Mironov
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.7 or,
at your option, any later version of Perl 5 you may have available.
=cut