The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# $Id: NCBIParser.pm 2015-02-12 Erick Antezana $
#
# Module  : NCBIParser.pm
# Purpose : Parse NCBI files: names and nodes
# License : Copyright (c) 2006-2015 by ONTO-perl. All rights reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
###############################################################################
package OBO::Parser::NCBIParser;

use OBO::Core::Term;
use OBO::Parser::OBOParser;
use OBO::Core::Ontology;

use strict;
use warnings;
use Carp;

$Carp::Verbose = 0;

sub new {
	my $class = shift;
	my $self = {};	
	bless ( $self, $class );
	return $self;
}

=head2 parse

 Usage    : $ncbi_parser->parse ( $ncbi_nodes_path )
 Returns  : ref to a hash { child_id => parent_id }
 Args     : nodes.dmp path ( string )
 
 Usage    : $ncbi_parser->parse ( $ncbi_names_path, $name_type )
 Returns  : ref to a hash { ncbi_id => ncbi_name }
 Args     : 
            1. names.dmp path or nodes.dmp path, string 
            2. ncbi name type ( string, e.g. 'scientific name' ) if the first arg is names.dmp otherwise none				
 Function : parses the complete NCBI taxonomy
 
=cut

sub parse {
	my ( 
		$self, 
		$input_path, 
		$name_type # optional
	 ) = @_;
	 
	croak "Not enough arguments! " if ( @_ < 1 );
	
	open my $IN, '<', $input_path || croak "Can't open file '$input_path': $! ";
	my @in_lines = <$IN>;
	
	my %map;
	if ( $name_type ) { # parsing names.dmp
		# %map: ncbi_id => scientific_name
		foreach my $line ( @in_lines ) {
			my @fields = split /\t\|/, $line;
			if ( $fields[3] eq "\t$name_type" ) {
				my $key    = $fields[0];
				my $value  = substr $fields[1], 1;
				$map{$key} = $value;
			}
		}		
	} else { # parsing nodes.dmp
		# %map: child_id => parent_id
		foreach my $line ( @in_lines ) {
			my @fields = split (/\t\|/, $line);
			my $key    = $fields[0];
			my $value  = substr $fields[1], 1;
			$map{$key} = $value;
		}
	}
	close ( $IN );
	return \%map;
}

=head2 work

 Usage   : $NCBIParser->work ( $onto, $nodes, $names, $ncbi_ids )
 Returns : map of added terms { NCBI ID => OBO::Core::Term object }
 Args    :
           1. input ontology, OBO::Core::Ontology object
           2. ref to a hash { child_id => parent_id }
           3. ref to a hash { ncbi_id => scientific_name }
           4. parental ontology term for the root of the taxonomy, OBO::Core::Term object
           5. ref to a list of NCBI taxon ids ( \d+ )
 					
 Function : adds NCBI taxonomy to the input ontology for the specified taxa
 
=cut

sub work {
	my ( 
		$self,
		$ontology,
		$nodes,
		$names,
		$parent,
		$ncbi_ids,
	 ) = @_;
	
	my %selected_nodes = ( ); # taxon id => parent id
	my %selected_names = ( ); # taxon id => taxon name

	# the hashes %selected_nodes and %selected_names are being populated:
	foreach my $ncbi_id ( @{$ncbi_ids} ) {
		getParentsRecursively ( $ncbi_id, $nodes, $names, \%selected_nodes, \%selected_names );		
	}	
	my %map; # NCBI_ID=>taxon_term
	# the terms are created and added to the ontology and %map
	foreach my $ncbi_id ( keys %selected_nodes ){
		my $selected_name = $selected_names{$ncbi_id};
		my $taxon = OBO::Core::Term->new ( );
		$taxon->id ( "NCBI:$ncbi_id" );
		$taxon->name ( $selected_name );			
		$ontology->add_term ( $taxon );
		$map{$ncbi_id} = $taxon;
	} # the end of foreach	
	# Connect children to parents by 'is_a' relationships but not if the child is root ( cyclic is_a )
	foreach my $ncbi_id ( keys %selected_nodes ) {
		my $child = $map{$ncbi_id} or croak "No term for '$ncbi_id' in the map: $! ";
		my $parent = $map{$selected_nodes{$ncbi_id}} or croak "No term for '$selected_nodes{$ncbi_id}' in the map: $! ";
		$ontology->create_rel ( $child, 'is_a', $parent ) if ( $ncbi_id != 1 );		
	}
	my $root = $ontology->get_term_by_id ( 'NCBI:1' ) or croak "No term in the ontology for 'root': $!";
	$ontology->create_rel ( $root, 'is_a', $parent );
	return \%map;
}

########################################################################
#
# Subroutines
#
########################################################################

sub getParentsRecursively {
	
	caller eq __PACKAGE__ or croak;
	
	my ( $ncbi_id, $nodes, $names, $selected_nodes, $selected_names ) = @_;
	my $child_id    = $ncbi_id;
	my $parent_id   = ${$nodes}{$ncbi_id};
	my $child_name  = ${$names}{$ncbi_id};
	my $parent_name = ${$names}{$ncbi_id};
	
	$selected_nodes->{$child_id} = $parent_id;
	$selected_names->{$child_id} = $child_name;
	
	getParentsRecursively ( $parent_id, $nodes, $names, $selected_nodes, $selected_names ) if ( $child_id != 1 );
}

1;

__END__

=head1 NAME

OBO::Parser::NCBIParser - A NCBI taxonomy to OBO translator.

=head1 DESCRIPTION

This parser converts chosen parts of the NCBI taxonomy-tree into an OBO file. 
A taxon ID is given to the parser and the whole tree up to the root is 
reconstructed in the given OBO ontology, using scientific names.

The dump files ( nodes.dmp and names.dmp ) should be obtained from: 

	ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz

TODO: include ranks and disjoints only in correlating ranks.

=head1 AUTHOR

Mikel Egana Aranguren and Vladimir Mironov

http://www.mikeleganaranguren.com, vladimir.n.mironov@gmail.com

=head1 COPYRIGHT AND LICENSE

Copyright (c) 2006 by Mikel Egana Aranguren

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.7 or,
at your option, any later version of Perl 5 you may have available.

=cut