The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#
# BioPerl module for Bio::Network::IO::psi10
#
# You may distribute this module under the same terms as perl itself
# POD documentation - main docs before the code

=head1 NAME

Bio::Network::IO::psi10

=head1 SYNOPSIS

Do not use this module directly, use Bio::Network::IO:

  my $io = Bio::Network::IO->new(-format => 'psi10',
                                 -file   => 'data.xml');

  my $network = $io->next_network;

=head1 DESCRIPTION

PSI MI (Protein Standards Initiative Molecular Interaction) XML is a format 
to describe protein-protein interactions and interaction networks. 
This module parses version 1.0 of PSI MI.

=head2 Databases

The following databases provide their data as PSI MI XML:

=over 3

=item *

DIP L<http://dip.doe-mbi.ucla.edu/>

=item *

HPRD L<http://www.hprd.org>

=item *

IntAct L<http://www.ebi.ac.uk/intact>

=item *

MINT L<http://cbm.bio.uniroma2.it/mint/>

=back

Each of these databases will call PSI format by some different name.
for example, PSI MI from DIP comes in files with the suffix "mif"
whereas PSI MI from IntAct or MINT has the "xml" suffix.

Documentation for PSI XML can be found at L<http://www.psidev.info>.

=head2 Version

This module supports a subset of the fields described in PSI MI version 1.0
(L<http://www.psidev.info/index.php?q=node/88>). The NODE DATA section below
describes which fields are currently parsed into ProteinNet networks.

=head2 Notes

See the Bio::Network::IO::psi_xml page in the Bioperl Wiki 
(L<http://bioperl.open-bio.org/wiki/Bio::Network::IO::psi_xml>)
for notes on PSI XML from various databases.

When using this parser recall that some PSI MI fields, or classes,
are populated by values taken from an ontology created for the PSI MI
format. This ontology is an OBO ontology and can be browsed at
L<http://www.ebi.ac.uk/ontology-lookup/browse.do?ontName=MI>.

=head1 METHODS

The naming system is analagous to the SeqIO system, although usually
next_network() will be called only once per file.

=head1 DATA IN THE NODE

The Node (protein or protein complex) is roughly equivalent to the PSI MI 
B<interactor> (entrySet/entry/interactorList/interactor). The following are 
subclasses of B<interactor> whose values are accessible through the Node
object.

=over 3

=item *

interactor/names/shortLabel

L<Bio::Annotation::SimpleValue|Bio::Annotation::SimpleValue>

=item *

interactor/names/fullName

L<Bio::Annotation::SimpleValue|Bio::Annotation::SimpleValue>

=item *

interactor/xref/primaryRef

L<Bio::Annotation::DBLink|Bio::Annotation::DBLink>

=item *

interactor/xref/secondaryRef

L<Bio::Annotation::DBLink|Bio::Annotation::DBLink>

L<Bio::Species|Bio::Species> object

=item *

interactor/organism/names/alias

L<Bio::Species|Bio::Species> object

=item *

interactor/organism/names/fullName

L<Bio::Species|Bio::Species> object

=item *

interactor/organism/names/shortLabel

L<Bio::Species|Bio::Species> object

=back

=head1 DATA NOT YET AVAILABLE

The following are subclasses of B<interactor> whose values are currently not
accessible through the Node object.

=over 3

=item *

interactor/names/alias

L<Bio::Annotation::SimpleValue|Bio::Annotation::SimpleValue>

=item *

interactor/sequence

=item *

interactor/interactorType/names

Controlled vocabulary maintained by PSI MI
L<http://www.ebi.ac.uk/ontology-lookup/browse.do?ontName=MI>.
Example: "protein".

L<Bio::Annotation::OntologyTerm|Bio::Annotation::OntologyTerm>

=item *

interactor/interactorType/xref

L<Bio::Annotation::DBLink|Bio::Annotation::DBLink>

=item *

interactor/organism/cellType

L<Bio::Annotation::OntologyTerm|Bio::Annotation::OntologyTerm>

=item *

interactor/organism/compartment

L<Bio::Annotation::OntologyTerm|Bio::Annotation::OntologyTerm>

=item *

interactor/organism/tissue

L<Bio::Annotation::OntologyTerm|Bio::Annotation::OntologyTerm>

=back

=head1 INTERACTION DATA

The Interaction object is roughly equivalent to the PSI MI B<interaction>
(entrySet/entry/interactionList/interaction) and B<experimentDescription>
(entrySet/entry/experimentList/experimentDescription). The following are
subclasses of B<interaction> and B<experimentDescription> whose values are 
NOT yet accessible through the Interaction object.

=over 3

=item *

interaction/xref/primaryRef

L<Bio::Annotation::DBLink|Bio::Annotation::DBLink>

=item *

interaction/xref/secondaryRef

L<Bio::Annotation::DBLink|Bio::Annotation::DBLink>

=item *

interaction/organism/names/shortLabel

L<Bio::Species|Bio::Species> object

=item *

interaction/organism/names/alias

L<Bio::Species|Bio::Species> object

=item *

interaction/organism/names/fullName

L<Bio::Species|Bio::Species> object

=item *

interaction/modelled

L<Bio::Annotation::SimpleValue|Bio::Annotation::SimpleValue>

=item *

interaction/intraMolecular

L<Bio::Annotation::SimpleValue|Bio::Annotation::SimpleValue>

=item *

interaction/negative

L<Bio::Annotation::SimpleValue|Bio::Annotation::SimpleValue>

=item *

interaction/interactionType

Controlled vocabulary maintained by PSI MI
L<http://www.ebi.ac.uk/ontology-lookup/browse.do?ontName=MI>.
Example: "phosphorylation reaction".

L<Bio::Annotation::OntologyTerm|Bio::Annotation::OntologyTerm>

=item *

interaction/confidenceList

L<Bio::Annotation::SimpleValue|Bio::Annotation::SimpleValue>

=item *

experimentDescription/confidenceList

L<Bio::Annotation::SimpleValue|Bio::Annotation::SimpleValue>

=item *

experimentDescription/interactionDetectionMethod

Controlled vocabulary maintained by PSI MI
L<http://www.ebi.ac.uk/ontology-lookup/browse.do?ontName=MI>.
Example: "two hybrid array".

L<Bio::Annotation::OntologyTerm|Bio::Annotation::OntologyTerm>

=item *

featureElementType/featureType

Controlled vocabulary maintained by PSI MI
L<http://www.ebi.ac.uk/ontology-lookup/browse.do?ontName=MI>. 
The featureType includes data on post-translational modification.
Example: "phospho-histidine".

L<Bio::Annotation::OntologyTerm|Bio::Annotation::OntologyTerm>

=back

=head1 FEEDBACK

=head2 Mailing Lists

User feedback is an integral part of the evolution of this and other
Bioperl modules. Send your comments and suggestions preferably to one
of the Bioperl mailing lists. Your participation is much appreciated.

  bioperl-l@bioperl.org                  - General discussion
  http://bioperl.org/wiki/Mailing_lists  - About the mailing lists

=head2 Support 

Please direct usage questions or support issues to the mailing list:

I<bioperl-l@bioperl.org>

rather than to the module maintainer directly. Many experienced and 
reponsive experts will be able look at the problem and quickly 
address it. Please include a thorough description of the problem 
with code and data examples if at all possible.

=head2 Reporting Bugs

Report bugs to the Bioperl bug tracking system to help us keep track
the bugs and their resolution.  Bug reports can be submitted via the
web:

  http://bugzilla.open-bio.org/

=head1 AUTHORS

Brian Osborne bosborne at alum.mit.edu
Richard Adams richard.adams@ed.ac.uk

=cut

package Bio::Network::IO::psi10;
use strict;
use XML::Twig;
use Bio::Root::Root;
use Bio::Seq::SeqFactory;
use Bio::Network::ProteinNet;
use Bio::Network::Interaction;
use Bio::Network::IO;
use Bio::Network::Node;
use Bio::Species;
use Bio::Annotation::DBLink;
use Bio::Annotation::Collection;
# use Bio::Annotation::OntologyTerm;
# use Bio::Annotation::Comment;
# use Bio::Annotation::Reference;
# use Bio::Annotation::SimpleValue;
# use Bio::Network::IO::psi::intact;

use vars qw( @ISA %species $net $fac );
@ISA = qw(Bio::Network::IO Bio::Root::Root );

BEGIN {
	$fac = Bio::Seq::SeqFactory->new(-type => 'Bio::Seq::RichSeq');
}

=head2 next_network

 Name       : next_network
 Purpose    : Constructs a protein interaction graph from PSI XML data
 Usage      : my $net = $io->next_network()
 Arguments  :
 Returns    : A Bio::Network::ProteinNet object

=cut

sub next_network {
	my $self = shift;
	$net = Bio::Network::ProteinNet->new(refvertexed => 1);

	my $t = XML::Twig->new(TwigHandlers => {
								  proteinInteractor => \&_proteinInteractor,
								  interaction       => \&_addInteraction
														});
	$t->parsefile($self->file);
	$net;
}

=head2 _proteinInteractor

 Name      : _proteinInteractor
 Purpose   : Parses protein information into Bio::Seq::RichSeq objects
 Returns   :
 Usage     : Internally called by next_network()
 Arguments : None

=cut

sub _proteinInteractor {
	my ($twig, $pi) = @_;

	my ($acc, $sp, $desc, $prim_id);

	my $org = $pi->first_child('organism');
	my $taxid = $org->att('ncbiTaxId');

	# Make new species object if doesn't already exist
	if ( !exists($species{$taxid}) ) {
		my $common = $org->first_child('names')->first_child('shortLabel')->text;
		my $full;
		# some PSI MI files have entries with species lacking "fullName"
		eval {
			$full = $org->first_child('names')->first_child('fullName')->text;
		};
		$full = $common if $@;

		my $sp_obj = Bio::Species->new(-ncbi_taxid  => $taxid,
												 -name        => $full,
												 -common_name => $common
												);
		$species{$taxid} = $sp_obj;
	}

	# Extract sequence and ontology identifiers
	my @ids          = $pi->first_child('xref')->children();
	my %ids          = map {$_->att('db'), $_->att('id')} @ids;
	$ids{'psixml'}   = $pi->att('id');

	$prim_id = defined ($ids{'GI'}) ?  $ids{'GI'} : '';
	# needs to be done by reference to an actual ontology:
	$acc = $ids{'RefSeq'} || 
	       $ids{'SWP'} ||               # DIP's name for Swissprot
			 $ids{'Swiss-Prot'} ||        # db name from HPRD
			 $ids{'Ref-Seq'} ||           # db name from HPRD
          $ids{'uniprotkb'} ||         # db name from MINT
			 $ids{'GI'} || 
			 $ids{'PIR'} ||
			 $ids{'intact'} ||            # db name from IntAct
			 $ids{'psi-mi'} ||            # db name from IntAct
			 $ids{'DIP'} ||               # DIP node name
          $ids{'ensembl'} ||           # db name from MINT
          $ids{'flybase'} ||           # db name from MINT
          $ids{'wormbase'} ||          # db name from MINT
          $ids{'sgd'} ||               # db name from MINT
          $ids{'ddbj/embl/genbank'} || # db name from MINT
          $ids{'mint'};                # db name from MINT

	# Get description line - certain files, like PSI XML from HPRD, have
	# "shortLabel" but no "fullName"
	eval {
		$desc = $pi->first_child('names')->first_child('fullName')->text; 
	};
	if ($@) {
		warn("No fullName, use shortLabel for description instead");
		$desc = $pi->first_child('names')->first_child('shortLabel')->text;
	}
	
	# Use ids other than accession_no or primary_id for DBLink annotations
	my $ac = Bio::Annotation::Collection->new();	
	for my $db (keys %ids) {
		next if $ids{$db} eq $acc;
		next if $ids{$db} eq $prim_id;
		my $an = Bio::Annotation::DBLink->new( -database   => $db,
															-primary_id => $ids{$db},
											);
		$ac->add_Annotation('dblink',$an);
	}

	# Make sequence object
	my $prot = $fac->create(
						-accession_number => $acc,
						-desc             => $desc,
						-display_id       => $acc,
						-primary_id       => $prim_id,
						-species          => $species{$taxid},
						-annotation       => $ac);

	# Add node to network
	my $node = Bio::Network::Node->new(-protein => [($prot)]);
	$net->add_node($node);

	# Add primary identifier and accession to internal id <-> node mapping hash
	$net->add_id_to_node($ids{'psixml'},$node);
	$net->add_id_to_node($prot->primary_id,$node);
	$net->add_id_to_node($prot->accession_number,$node);

	# Add secondary identifiers to internal id <-> node mapping hash
	$ac = $prot->annotation();
	for my $an ($ac->get_Annotations('dblink')) {
		$net->add_id_to_node($an->primary_id,$node);
	}

	$twig->purge();
}

=head2 _addInteraction

 Name     : _addInteraction
 Purpose  : Adds a new Interaction to a graph
 Usage    : Do not call, called internally by next_network()
 Returns  :
 Notes    : The PSI MI 2.5 standard calls for a field titled interactorRef

=cut

sub _addInteraction {
	my ($twig, $i) = @_;
	my @ints = $i->first_child('participantList')->children;
	my @nodeids = map {$_->first_child('proteinInteractorRef')->att('ref')} @ints;

	my $interx_id = $i->first_child('xref')->first_child('primaryRef')->att('id');
	
	my $node1 = $net->get_nodes_by_id($nodeids[0]);
	my $node2 = $net->get_nodes_by_id($nodeids[1]);

	my $interx = Bio::Network::Interaction->new(-id => $interx_id);
	$net->add_interaction(-nodes => [($node1,$node2)],
							    -interaction => $interx );
	$net->add_id_to_interaction($interx_id,$interx);

	$twig->purge();
}

1;

__END__