The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Bio::Phylo::Parsers::Adjacency;
use strict;
use warnings;
use base 'Bio::Phylo::Parsers::Abstract';
use Bio::Phylo::Util::Exceptions 'throw';
use Bio::Phylo::Util::CONSTANT qw'/looks_like/ :namespaces :objecttypes';

=head1 NAME

Bio::Phylo::Parsers::Adjacency - Parser used by Bio::Phylo::IO, no serviceable parts inside

=head1 DESCRIPTION

This module parses a tree structure from tabular data organized as an "adjacency
list", i.e. child -> parent relationships. The table should at least have the
following columns: 'child' and 'parent'. 'length' is interpreted as branch
length. Columns starting with 'node:' are assigned as semantic annotations
to the focal node, columns starting with 'branch:' are assigned to the focal
branch. Records need to be listed in pre-order, so that references to parent
nodes can be resolved immediately. Consequently, the root is the first record,
without a parent. Example:

 ((A:1,B:2)n1:3,C:4)n2:0;

Becomes (with an extra example annotation):

 child  parent  length	node:dcterms:identifier
 n2             0       35462
 n1     n2      3       34987
 A      n1      1       73843
 B      n1      2       98743
 C      n2      4       39847

=cut

sub _parse {
    my $self = shift;
    my $fh   = $self->_handle;
    my $fac  = $self->_factory;
	my $log  = $self->_logger;
    my $tree = $fac->create_tree;
	my $ns   = $self->_args->{'-namespaces'};
	if ( $ns ) {
		$tree->set_namespaces( %{ $ns } );
	}
	my ( @header, %node_cols );
	my %node_for_id;
    LINE: while (<$fh>) {
    	unless ( scalar(keys(%node_for_id)) % 1000 ) {
    		$log->debug("processed node " . scalar(keys(%node_for_id)));
    	}
        chomp;
		
		# the first line is the header row
		if ( not @header ) {
			@header = split /\t/, $_;
			for my $col ( @header ) {
				if ( $col =~ /^node:(.+)$/ ) {
					my $predicate = $1;
					$node_cols{$col} = $predicate;
				}
			}
			next LINE;
		}
		
		# this is a record
        my @fields = split /\t/, $_;
		my %record = map { $header[$_] => $fields[$_] } 0 .. $#header;
		
		# create node
		my $name   = $record{'child'};
		my $pname  = $record{'parent'};
		my $node   = $fac->create_node( '-name' => $name );
		$tree->insert($node);
		$node_for_id{$name} = $node;
		
		# build the tree structure
		if ( my $parent = $node_for_id{$pname} ) {
			$node->set_parent($parent);
		}
		
		# assign branch length, if defined
		if ( defined $record{'length'} ) {
			$node->set_branch_length($record{'length'});
		}
		
		# now see if there are any node columns
		for my $col ( keys %node_cols ) {
			my $value = $record{$col};
			if ( $value ) {
				my $predicate = $node_cols{$col};
				if ( $predicate =~ /^(.+)?:.+$/ ) {
					my $prefix = $1;
					if ( my $ns = $Bio::Phylo::Util::CONSTANT::NS->{$prefix} ) {
						$node->add_meta(
							$fac->create_meta(
								'-namespaces' => { $prefix    => $ns },
								'-triple'     => { $predicate => $value }
							)
						);
					}
					else {
						$log->warn("No namespace for prefix $prefix");
					}
				}
			}
		}
    }
	my $forest = $fac->create_forest;
	$forest->insert($tree);	
    return $forest;
}

=head1 SEE ALSO

There is a mailing list at L<https://groups.google.com/forum/#!forum/bio-phylo> 
for any user or developer questions and discussions.

=over

=item L<Bio::Phylo::IO>

The adjacency parser is called by the L<Bio::Phylo::IO|Bio::Phylo::IO> object.
Look there to learn how to parse trees in general

=item L<Bio::Phylo::Manual>

Also see the manual: L<Bio::Phylo::Manual> and L<http://rutgervos.blogspot.com>

=back

=head1 CITATION

If you use Bio::Phylo in published research, please cite it:

B<Rutger A Vos>, B<Jason Caravas>, B<Klaas Hartmann>, B<Mark A Jensen>
and B<Chase Miller>, 2011. Bio::Phylo - phyloinformatic analysis using Perl.
I<BMC Bioinformatics> B<12>:63.
L<http://dx.doi.org/10.1186/1471-2105-12-63>

=cut

1;