The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Bio::PhyloTastic::TaxTractor;
use strict;
use warnings;
use base 'Bio::PhyloTastic';

=head1 NAME

Bio::PhyloTastic::TaxTractor - Extracts taxa from phylogenetic file formats

=head1 SYNOPSIS

 phylotastic TaxTractor -i <infile> -d nexml -o <outfile> -search '_' -r ' '

=head1 DESCRIPTION

This module extracts taxon labels from common phylogenetic file formats.

=head1 OPTIONS AND ARGUMENTS

=over

=item -i infile

An input file. Default is a text file with one name per line. Required.

=item -o outfile

An output file name. If '-', prints output to STDOUT. Required.

=item -d informat

An input format, such as NEXUS, Newick, NeXML, PhyloXML, TaxList. Required.

=item -s outformat

An output format, such as NeXML, TaxList. Optional. Default is TaxList (i.e.
a simple text file).

=item -search pattern

A character (or pattern) to search for, which is replaced with the -r argument.
This is especially useful for replacing underscores (which occur often) with
spaces.

=item -r character

What the value of -search gets replaced with.

=back

=cut

# these are used as "perl compatible regular expressions" to process
# the labels, so you could strip out accession numbers and such
my $search;
my $replace;
my $serializer = 'taxlist';

sub _get_args {	
	return (
		'search=s'     => \$search,
		'replace=s'    => \$replace,
		'serializer=s' => \$serializer,
	);
}

sub _run {		
	my ( $class, $project ) = @_;
	
	# fetch factory object
	my $fac = $class->_fac;
	
	# create merged taxa block
	my $merged_taxa = $fac->create_taxa;
	
	# compile hash set of distinct names, that are
	# processed using s/$search/$replace/
	my %names;
	for my $block ( @{ $project->get_entities } ) {
		my $taxa;
		if ( $block->can('make_taxa') ) {
			$taxa = $block->make_taxa;
		}
		else {
			$taxa = $block;
		}
		$taxa->visit(sub{
			my $name = shift->get_name;
			if ( defined $search and defined $replace ) {
				$name =~ s/$search/$replace/g;
			}
			$names{$name} = 1
		});
	}
	
	# create distinct taxa
	$merged_taxa->insert( $fac->create_taxon( '-name' => $_ ) ) for sort { $a cmp $b } keys %names;
	
	# return result
	return $merged_taxa;
}

1;