# $Id: NewIntActParser.pm 2109 2010-09-29 Erick Antezana $
#
# Module : IntActParser.pm
# Purpose : Parse IntAct files and transfer the data to an ontology
# License : Copyright ( c ) 2006, 2007, 2008 Cell Cycle Ontology. All rights reserved.
# This program is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
package OBO::Parser::IntActParser;
use strict;
use warnings;
use Carp;
use OBO::Core::Term;
use XML::XPath;
$Carp::Verbose = 0;
my $verbose = 0;
sub new {
my $class = shift;
my $self = {};
bless ( $self, $class );
return $self;
}
=head2 parse
Usage - $intact_parser->parse ( $intact_files, $map )
Returns - data structure ( hash reference )
Args -
1. [ IntAct data file paths ( fully qualified )] ( array reference )
2. { UniProtKB accession => UniProtKB id ( strings )} ( hash reference )
3. { UniProtKB accession => UniProtKB id ( strings )} ( hash reference ), optional
Function - parses IntAct data file and optionally filters it by the map
=cut
sub parse {
my (
$self,
$intact_files, # ref to a list of fully qualified paths
$long_map, # complete UniProt map, to get rid of secondary UniPriot ACs
$short_map # { UP AC => UP ID } ( optional ), map to filter by
) = @_;
croak "No IntAct files to parse!\n" if ( ! $intact_files );
my %data;
my $count_accepted = 0;
my $count_rejected =0;
foreach my $file_path ( @{$intact_files} ) {
# xpath is very sensitive to extra spaces in double quoted strings in find() !!!!
my $xpath = XML::XPath->new ( filename => $file_path ) or croak "Failed to parse file $file_path";
my $int_set = $xpath->find ( "/entrySet/entry/interactionList/interaction" );
foreach my $interaction ( $int_set->get_nodelist ( ) ) {
# Note: internal IntAct IDs for participants and interactors are different !!!
my $interaction_id = $interaction->find ( "\@id", $interaction )->string_value ( ); # IntAct internal id
my $participants = $xpath->find ( "/entrySet/entry/interactionList/interaction[\@id = $interaction_id]/participantList/participant" );
my @participants = $participants->get_nodelist ( );
# tagging interections for including in $data
my $map_hit = 0;
# if a map is provided
if ( $short_map ) {
foreach my $participant ( @participants ) {
my $interactor_id = $participant->find ( "interactorRef/text()", $participant )->string_value ( ); # IntAct internal ID
# UniProt AC
my $protein_id = $xpath->find ( "/entrySet/entry/interactorList/interactor[\@id = $interactor_id]/xref/primaryRef/\@id" )->string_value ( );
# searching for the first occurence of a core protein
if ( $short_map->{$protein_id} ) {
$map_hit++;
last;
}
}
}
# if there is no map take all interactions
else {
$map_hit = 1;
}
$map_hit ? $count_accepted++ : $count_rejected++;
# filling up $data
if ( $map_hit ) {
my $interaction_xref = $interaction->find ( "xref/primaryRef/\@id", $interaction )->string_value ( ); # EBI ID
$interaction_xref =~s/-/:/; # format change
$data{$interaction_xref}{'interactionName'} = $interaction->find ( "names/shortLabel/text()", $interaction )->string_value ( ); # interaction name
$data{$interaction_xref}{'interactionFullName'} = $interaction->find ( "names/fullName/text()", $interaction )->string_value ( ); # interaction full name
$data{$interaction_xref}{'interactionTypeId'} = $interaction->find ( "interactionType/xref/primaryRef/\@id", $interaction )->string_value ( ); # MI id
foreach my $participant ( @participants ) {
# internal IntAct interactor ID used for getting taxon ID and protein ID:
my $interactor_id = $participant->find ( "interactorRef/text()", $participant )->string_value ( );
my $ncbi_id = $xpath->find ( "/entrySet/entry/interactorList/interactor[\@id = $interactor_id]/organism/\@ncbiTaxId" )->string_value ( );
# protein xref, tyically UniProt accession, any others??? TODO
# for now ingnores any other sources, as well as secondary UniProt ACs
my $protein_id = $xpath->find ( "/entrySet/entry/interactorList/interactor[\@id = $interactor_id]/xref/primaryRef/\@id" )->string_value ( );
my $role = $participant->find ( "experimentalRoleList/experimentalRole/names/shortLabel/text()", $participant )->string_value ( ); # experimental role
$long_map->{$protein_id} ?
$data{$interaction_xref}{'participants'}{$ncbi_id}{$protein_id} = $role :
next;
} # end of foreach participant
}
} # end of foreach interaction
} # end of foreach file
print "Accepted interactions: $count_accepted, rejected interactions: $count_rejected\n" if $verbose;
%data ? return \%data : croak "No data produced!";
}
=head2 work
Usage - $intact_parser->work ( $ontology, $data, $parent_protein_name )
Returns - { NCBI ID => { UP AC => OBO::Core::Term object }} ( data structure with all the proteins in the interactions )
Args -
1. OBO::Core::Ontology object,
2. data structure from parse ( )
3. parent term name for proteins ( string ) # to link new proteins to, e.g. 'cell cycle protein'
Function - adds to the input ontology OBO::Core::Term objects along with appropriate relations for interactions and proteins from IntAct data
=cut
sub work {
my (
$self,
$onto,
$data,
$parent_protein_name
) = @_;
my @rel_types = ( 'is_a', 'participates_in', 'has_participant', 'has_source' );
foreach my $rel_type ( @rel_types ) {
croak "Not a valid relationship type '$rel_type' !" unless ( $onto->{RELATIONSHIP_TYPES}->{$rel_type} );
}
# hashes to collect terms
my %taxa; # { NCBI ID => OBO::Core::Term object }
my %interaction_types; # { MI ID => OBO::Core::Term object }
my %proteins; # { NCBI ID => { UP AC => OBO::Core::Term object }}
my %role_rel_types; # not used yet
my $parent_protein = $onto->get_term_by_name ( $parent_protein_name ) || croak "No term for $parent_protein_name in ontology: $!";
foreach my $interaction_id ( keys %{$data} ) { # EBI id
my $interaction_type_id = $data->{$interaction_id}{'interactionTypeId'}; # MI id
my $interaction_type_term = $interaction_types{$interaction_type_id};
if ( ! $interaction_type_term ) {
$interaction_type_term = $onto->get_term_by_id ( $interaction_type_id ) or next;
}
my $interaction_name = $data->{$interaction_id}{'interactionName'};
my $interaction_type_name = $interaction_type_term->name ( );
my $interaction_full_name = $data->{$interaction_id}{'interactionFullName'};
# interaction terms
my $new_interaction = OBO::Core::Term->new ( );
$new_interaction->id ( "$interaction_id" );
$new_interaction->name ( "$interaction_name $interaction_type_name" );
$new_interaction->def_as_string ( "$interaction_full_name", "[$interaction_id]" );
$onto->add_term ( $new_interaction );
$onto->create_rel ( $new_interaction, 'is_a', $interaction_type_term );
# protein terms
foreach my $ncbi_id ( keys %{$data->{$interaction_id}{'participants'}} ) {
my $taxon = $taxa{$ncbi_id};
if ( ! $taxon ) {
my $taxon_id = "NCBI:$ncbi_id";
# external proteins are not accepted
$taxon = $onto->get_term_by_id ( $taxon_id ) or next;
}
foreach my $up_ac ( keys %{$data->{$interaction_id}{'participants'}{$ncbi_id}} ) {
my $protein = $proteins{$ncbi_id}{$up_ac};
if ( ! $protein ) {
my $protein_id = "UniProtKB:$up_ac";
$protein = $onto->get_term_by_id ( $protein_id );
if ( ! $protein ) {
$protein = OBO::Core::Term->new ( );
$protein->id ( $protein_id );
$onto->add_term ( $protein );
$onto->create_rel ( $protein, 'is_a', $parent_protein );
$onto->create_rel ( $protein, 'has_source', $taxon ) if $taxon;
}
$proteins{$ncbi_id}{$up_ac} = $protein; # the protein was not yet in the hash
}
# TODO incorporate the roles
# my $role = $data->{$interaction_id}{'participants'}{$ncbi_id}{$up_ac};
# $role =~s/\s/_/g; # removing spaces
# my $role_rel_name = "participates_in_as_$role";
# $onto->create_rel ( $protein, 'participates_in', $new_interaction );
$onto->create_rel ( $new_interaction, 'has_participant', $protein );
} # end of foreach protein
} # end of foreach taxon
} # end of foreach interaction
return \%proteins;
}
1;
__END__
=head1 NAME
OBO::Parser::IntActParser - An IntAct to OBO parser/filter.
=head1 DESCRIPTION
A parser for IntAct-to-OBO conversion. The conversion is filtered
according to the proteins already existing in the input ontology.
=head1 AUTHOR
Vladimir Mironov E<lt>vladimir.n.mironov@gmail.comE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright ( C ) 2006 by Vladimir Mironov
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.7 or,
at your option, any later version of Perl 5 you may have available.
=cut