The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# $Id: UniProtParser.pm 2015-02-12 Erick Antezana $
#
# Module : UniProtParser.pm
# Purpose : Parse UniProt files and add data to an ontology
# License : Copyright (c) 2006-2015 by ONTO-perl. All rights reserved.
# This program is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#

package OBO::Parser::UniProtParser;

use strict;
use warnings;
use Carp;

use open qw(:std :utf8); # Make All I/O Default to UTF-8

use OBO::Core::Term;
use OBO::Core::Dbxref;
use SWISS::Entry;

$Carp::Verbose = 0;
my $verbose = 0;

sub new {
 my $class = $_[0];
 my $self = {};
 bless ( $self, $class );
 return $self;
}

=head2 parse

 Usage - $UniProtParser->parse ( $uniprot_file_path, $uniprot_map ) 
 Returns - ref to a hash { UniProtAC => SWISS::Entry object }
 Args - 
 	1. UniProt data file path, 
 	2. ref to a hash { UniProtAC => UniProtID } to filter by, optional
 Function - parses UniProt data file 
 
=cut

sub parse {
	my ( 
	$self,
	$uniprot_file_path,
	$up_map # optional
	 ) = @_;
	my %entries;
	my $count_accepted = 0;
	my $count_rejected =0;
	open my $FH, '<', $uniprot_file_path or croak "Cannot open file '$uniprot_file_path': $!";
	local $/ = "\n//\n";
	while ( <$FH> ) { 
		my $entry = SWISS::Entry->fromText ( $_ );
		my ( $accession ) = @{ $entry->ACs->{list} };
		if ( $up_map ) {
			if ( $up_map->{$accession} ) {
				$entries{$accession} = $entry;
				$count_accepted++;				
			}
			else {
				$count_rejected++;
			}
		}
		else {		
			$entries{$accession} = $entry;
			$count_accepted++;
		}
	}
	close $FH;
	print "Accepted proteins: $count_accepted, rejected proteins: $count_rejected\n" if $verbose;
	%entries ? return \%entries : croak "No data produced!";
}

=head2 work

 Usage - $UniProtParser->work ( $onto, $data, $parent_protein_name, $parent_gene_name )
 Returns - a hash with terms for added genes { NCBI GeneID => OBO::Core::Term object } 
 Args - 
 	1. OBO::Core::Ontology object, 
 	2. ref to a hash { UniProtAC => SWISS::Entry object }
	3. parent term name for proteins ( string ), # to link added modified proteins to, e.g. 'modified gene regulation protein'
	4. parent term name for genes ( string ), # to link added genes to, e.g. 'gene regulation gene'
 Function - adds gene and modified protein terms to the input ontology along with appropriate relations
 
=cut

sub work {
	# TODO Note: the relations to 'protein_modificaton' is now hard coded, an additional arg?
	my ( $self, 
		$onto, 
		$data, 
		$parent_protein_name, 
		$parent_gene_name 
	) = @_;
	
	my $no_gene_id =0;
	my $multiple_genes = 0;
	my $multiple_proteins = 0;
	my $no_GN = 0;
	my $multiple_GNs = 0;
	

	
	my @rel_types = ( 'is_a', 'has_source', 'encoded_by', 'codes_for', 'transformation_of', 'transforms_into', 'bearer_of' );
	foreach ( @rel_types ) {
		croak "Not a valid relationship type WITHIN the ontology: '$_'" unless ( $onto->{RELATIONSHIP_TYPES}{$_} );
	}
	my $parent_protein = $onto->get_term_by_name ( $parent_protein_name ) || croak "No term for '$parent_protein_name' in the ontology: $!";
	my $parent_gene = $onto->get_term_by_name ( $parent_gene_name ) || croak "No term for $parent_gene_name in the ontology: $!";
	my $protein_modification_id = 'MOD:00000';
	my $protein_modification = $onto->get_term_by_id ( $protein_modification_id )  || croak "No term for $protein_modification_id in the ontology: $!";
	
	my %genes; # gene_id=>gene_term
	my%new_genes; # gene_id=>gene_name
	
	foreach my $accession ( keys %{$data} ) {
		my $entry =  $data->{$accession};
		#######################################################################
		#
		# protein
		#
		#######################################################################
		my $protein_name = $entry->ID;
		my $oxs = $entry->OXs; # all NCBI ids
		my $taxon_id = $oxs->{'NCBI_TaxID'}{'list'}[0]{'text'}; # the primary ID
		my $taxon = $onto->get_term_by_id ( "NCBI:$taxon_id" );
		if ( ! $taxon ) {
			print "No taxon term for $taxon_id in ontology ($protein_name)\n" if $verbose;
			next;
		}
		my @acs  = @{ $entry->ACs->{list} };
		shift @acs; # shifting the primary AC from the array, the same as $accession
		# TODO synonyms for protein names
		my $prot_id = "UniProtKB:$accession";
		my $protein = $onto->get_term_by_id ( $prot_id );
		if ( ! $protein ) { # the proteins in the UP file and in the ontology are expected to be identical
			carp "No protein term in the ontology for '$protein_name'\n";
			next;
		}
		$protein->name ( $protein_name );
		$protein->alt_id ( map { "UniProtKB:$_" } @acs );
		
#		only the primary DE should be used 
		my $def_term = ${ $entry->DEs->{list} }[0]; # an object
		my $definition = $def_term->{text};
		$protein->def_as_string ( $definition.'.', "UniProtKB:$accession" );

		# add DB cross references to the protein, currently only EMBL
		# TODO consider other xrefs
		my $dbxrefs = $entry->DRs; # an object containing all DB cross-references
		my @pids = $dbxrefs->pids; # an array containing EMBL ids: ( gene_id, @protein_ids )
		# Note: a small fraction of EMBL entries has no protein ids !!!
		# TODO check how many protein ids could be present in @pids ( hopefully only relevant ones are there )
		foreach ( @pids ) {
			$protein->xref_set_as_string ( "[EMBL:$_]" );
		}
		
		####################################################################
		#
		# post-translationally modified derivatives of the protein
		#
		####################################################################
		if ( my @fts = @{$entry->FTs->{list}} ){ # an array of references to arrays corresponding to individual FT lines 
			foreach my $ft ( @fts ){
				# select only lines for modified residues
				$ft->[0] eq 'MOD_RES' ? 
				my ( $feature_key, $from_position, $to_position, $description, $qualifier, $FTId, $evidence_tag ) = @{$ft} :
				next; # go to the next FT line
				my ( $mod_residue, $tail, $mod_prot_id, $mod_prot_name, $mod_prot_comment, $mod_prot_def );

				if ( $description =~ /(\S+);\s(.*)/xms ) {
					# description contains the name of the modified residue separated by a semicolon from the rest
					 ( $mod_residue, $tail ) = ( $1, $2 );
					$mod_prot_comment = "$tail; $qualifier";
				}
				else {
					# $description contains only the name of the modified residue
					$mod_residue = $description;
					$mod_prot_comment = $qualifier;
 		}
 		$mod_prot_name = $protein_name.'-'.$mod_residue.'-'.$from_position;
 		$mod_prot_def = "Protein $protein_name with the residue $from_position substituted with $mod_residue.";
# 		$mod_prot_id = "$prot_id-$mod_residue-$from_position";
 		$mod_residue =~ s/\W/_/xmsg;
 		$mod_residue =~ tr/_//s; # replacing multiple '_' with a single one
 		$mod_prot_id = $prot_id . '_' . $mod_residue . '_' . $from_position;
# 		$mod_prot_id =~ s/\s+/_/xmsg;
# 		$mod_prot_id =~ s/-/_/xmsg;
# 		$mod_prot_id =~ s/\s|-|,|\(|\)/_/xmsg;
# 		$mod_prot_id =~ s/\W/_/xmsg;

				# create protein terms for modified proteins and add to ontology
				my $mod_prot = OBO::Core::Term->new ( );
				$mod_prot->name ( $mod_prot_name );
				$mod_prot->id ( $mod_prot_id );
				$mod_prot->def_as_string ( $mod_prot_def, "[UniProtKB:$accession]" );
				$mod_prot->xref_set_as_string ( "[UniProtKB:$accession]" );
				$mod_prot->comment ( $mod_prot_comment ); 
				$onto->add_term ( $mod_prot );
				$onto->create_rel ( $mod_prot, 'is_a', $parent_protein );
				$onto->create_rel ( $mod_prot, 'has_source', $taxon );
#				$onto->create_rel ( $mod_prot, 'transformation_of', $protein );
				$onto->create_rel ( $protein, 'transforms_into', $mod_prot );
				$onto->create_rel ( $mod_prot, 'bearer_of', $protein_modification );
			}
		}
 
 ######################################################################
 #
# genes
#
######################################################################
		# only GeneID and relations are added, the rest is taken from NCBI
		# gene ids
		my @gene_ids; # strings 'GeneID:\d+' - NCBI GeneID database
		# the number of proteins with multiple GeneIDs is ~0.2%
		# building a list of GeneIDs:
		foreach my $xref ( @{ $dbxrefs->{list} } ) {
#			${$xref}[0] eq 'GeneID' ? push @gene_ids, "${$xref}[0]:${$xref}[1]" : next;
			${$xref}[0] eq 'GeneID' ? push @gene_ids, "${$xref}[1]" : next;
		}
		my $geneid_count = @gene_ids;
		# if no GeneID no gene term created
		if ( $geneid_count == 0 ) {
			print "$protein_name has no GeneIDs\n" if $verbose;
			$no_gene_id++;
			next;
		}
		elsif ( $geneid_count > 1 ) {
			print "$protein_name has $geneid_count GeneIDs\n" if $verbose;
			$multiple_genes++;
		}
		
		foreach my  $gene_id ( @gene_ids ) { # $gene_id does not include a namespace
			my $gene = $genes{$taxon_id}{$gene_id};
#			$gene = $onto->get_term_by_id ( $gene_id ) if ! $gene; # is it necessary ?
			if ( $gene ) { # $gene has already been created for another protein
				print "Gene 'GeneID:$gene_id' codes for multiple proteins\n" if $verbose;
				$multiple_proteins++;
			}
			else {
				#creating new gene term
				$gene = OBO::Core::Term->new ( );
				$gene->id ( "GeneID:$gene_id" );
				$onto->add_term ( $gene );	
				$onto->create_rel ( $gene, 'is_a', $parent_gene );			
				$onto->create_rel ( $gene, 'has_source', $taxon );				

			} # end of new gene
			$onto->create_rel ( $protein, 'encoded_by', $gene );
			$onto->create_rel ( $gene, 'codes_for', $protein ); # inverse of 'encoded_by'			
			# gene xref to UniProt
			$gene->xref_set_as_string ( "UniProtKB:$accession" );
			$genes{$taxon_id}{$gene_id} = $gene

		} # end of foreach gene_id
		
		
		# gene names
		# the number of gene groups often is higher the the number of GeneIDs
		# currently is used only for counting, gene names are taken from NCBI
		my @gene_groups = @{ $entry->GNs->{list} };
		my $gene_group_count = @gene_groups;
		if ( $gene_group_count != 1 ) {
			print "$protein_name has  $gene_group_count GNs\n" if $verbose;
			$gene_group_count == 0 ? $no_GN++ : $multiple_GNs++;
		}
		

	} # foreach $entry
	print "$no_gene_id proteins without GeneIDs, $multiple_genes proteins with multiple GeneIDs;\n$no_GN proteins without GN, $multiple_GNs proteins with multiple GNs;\n $multiple_proteins genes with multiple proteins\n" if $verbose;
	return \%genes;
}

1;

__END__


=head1 NAME

OBO::Parser::UniProtParser - A UniProt to OBO translator.

=head1 DESCRIPTION

Includes methods for adding information from UniProt files to ontologies

UniProt files can be obtained from:
	ftp://ftp.expasy.org/databases/uniprot/knowledgebase/

The method 'parse' parses imput UniProt file and filters the data by map (optioanally) 
The method 'work' transfers selected data from the ouput parse() into the input ontology. 
This method assumes: 
- the input ontology contains the NCBI taxonomy. 
- the input ontology contains the relationship types 'is_a', 'encoded_by', 
 'codes_for', 'has_source', 'tranformation_of', 'source_of'

=head1 AUTHOR

Vladimir Mironov E<lt>vladimir.n.mironov@gmail.comE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright ( C ) 2006 by Vladimir Mironov

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.7 or,
at your option, any later version of Perl 5 you may have available.

=cut