The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Bio::Phylo::Parsers::Newick;
use warnings;
use strict;
use base 'Bio::Phylo::Parsers::Abstract';
no warnings 'recursion';

=head1 NAME

Bio::Phylo::Parsers::Newick - Parser used by Bio::Phylo::IO, no serviceable parts inside

=head1 DESCRIPTION

This module parses tree descriptions in parenthetical format. It is called by the 
L<Bio::Phylo::IO> facade, don't call it directly. Several additional flags can be
passed to the Bio::Phylo::IO parse and parse_tree functions to influence how to deal
with complex newick strings:

 -keep => [ ...list of taxa names... ]

The C<-keep> flag allows you to only retain certain taxa of interest, ignoring others
while building the tree object.

 -ignore_comments => 1,

This will treat comments in square brackets as if they are a normal taxon name character,
this so that names such as C<Choristoneura diversana|BC ZSM Lep 23401[05/*> are parsed 
"successfully". (Note: square brackets should NOT be used in this way as it will break
many parsers).

 -keep_whitespace => 1,

This will treat unescaped whitespace as if it is a normal taxon name character. Normally,
whitespace is only retained inside quoted strings (e.g. C<'Homo sapiens'>), otherwise it
is the convention to use underscores (C<Homo_sapiens>). This is because some programs 
introduce whitespace to prettify a newick string, e.g. to indicate indentation/depth, 
in which case you almost certainly want to ignore it. This is the default behaviour. The 
option to keep it is provided for dealing with incorrectly formatted data.

=cut

sub _return_is_scalar { 1 }


sub _simplify {
    # Simplify a Newick tree string by removing unneeded nodes. The leaves to
    # keep are given as $ids, an arrayref of terminal node IDs. Note that only
    # cherries are simplified to keep the function fast. Ternary or higher order
    # branches are left alone. Quoted strings should be handled properly.
    my ($string, $ids) = @_;   
    my %id_hash = map { $_ => undef } @$ids;

    # Setup some regular expressions:
    # 1/ ID is anything but these characters (except when quoted): , ; : ( ) " '
    my $id_re_simple = qr/[^)(,:"';]+/;
    my $id_re_squote = qr/[^']+/;
    my $id_re_dquote = qr/[^']+/;
    my $id_re = qr/ (?: $id_re_simple | '$id_re_squote' | "$id_re_dquote" ) /x;
    # 2/ Distance is a real number (regexp taken from Regexp::Common $RE{num}{real})
    my $dist_re   = qr/(?:(?i)(?:[+-]?)(?:(?=[.]?[0123456789])(?:[0123456789]*)(?:(?:[.])(?:[0123456789]{0,}))?)(?:(?:[E])(?:(?:[+-]?)(?:[0123456789]+))|))/;
    # 3/ A pair of ID and distance (both optional)
    my $pair_re   = qr/ ($id_re)? (?: \: ($dist_re) )? /x;
    # 4/ Cherry
    my $cherry_re = qr/ ( \( $pair_re , $pair_re \) $pair_re ) /x;
    # 5/ Whitespaces 
    my $ws_re     = qr/ \s+ /msx;

    # Remove spaces and newlines (no spaces allowed in node names)
    $string =~ s/$ws_re//g;

    # Prune cherries
    my $prev_string = '';
    while (not $string eq $prev_string) {
        $prev_string = $string;
        $string =~ s/ $cherry_re / _prune_cherry($1, $2, $3, $4, $5, $6, $7, \%id_hash) /gex;
    }
	__PACKAGE__->_logger->debug("simplified string by removing unneeded nodes");
    return $string;
}


sub _prune_cherry {
    my ($match, $id1, $dist1, $id2, $dist2, $idp, $distp, $id_hash) = @_;
    my $repl;
    my $id1_exists = defined $id1 && exists $id_hash->{$id1};
    my $id2_exists = defined $id2 && exists $id_hash->{$id2};
    if ( $id1_exists && $id2_exists ) {
        # Keep both leaves
        $repl = $match;
    } else {
        # There are from zero to one leaves to keep. Delete one of them.
        my ($id, $dist) = $id1_exists ? ($id1, $dist1) : ($id2, $dist2);
        if ( defined($dist) || defined($distp) ) {
            $dist = ':'.(($dist||0) + ($distp||0));
        }
        $id   ||= '' if not defined $id;
        $dist ||= '' if not defined $dist;
        $repl = $id.$dist;
    }
    return $repl;
}


sub _parse {
    my $self   = shift;
    my $fh     = $self->_handle;
    my $forest = $self->_factory->create_forest;

    my $string;
    while (<$fh>) {
        chomp;
        $string .= $_;
    }

    my $ids        = $self->_args->{'-keep'};
    my $ignore     = $self->_args->{'-ignore_comments'};
    my $whitespace = $self->_args->{'-keep_whitespace'};
    my $quotes     = $self->_args->{'-ignore_quotes'};

    # remove comments, split on tree descriptions
    my $counter = 1;
    for my $newick ( $self->_split($string,$ignore,$whitespace,$quotes) ) {
		$self->_logger->debug("going to process newick string " . $counter++);
		
        # simplify tree
        if ($ids) {
            $newick = _simplify($string, $ids);
        }

        # parse trees
        my $tree = $self->_parse_string($newick);

        # adding labels to untagged nodes
        if ( $self->_args->{'-label'} ) {
            my $i = 1;
            $tree->visit(
                sub {
                    my $n = shift;
                    $n->set_name( 'n' . $i++ ) unless $n->get_name;
                }
            );
        }
        $forest->insert($tree);
    }
    return $forest;
}

=begin comment

 Type    : Parser
 Title   : _split($string)
 Usage   : my @strings = $newick->_split($string);
 Function: Creates an array of (decommented) tree descriptions
 Returns : A Bio::Phylo::Forest::Tree object.
 Args    : $string = concatenated tree descriptions

=end comment

=cut

sub _split {
    my ( $self, $string, $ignore, $whitespace, $quotes ) = @_;
    my $log = $self->_logger;
    my ( $QUOTED, $COMMENTED ) = ( 0, 0 );
    my $decommented = '';
    my @trees;
  TOKEN: for my $i ( 0 .. length($string) ) {
        my $token = substr( $string, $i, 1 );
        if ( !$QUOTED && !$COMMENTED && $token eq "'" && ! $quotes ) {
            $QUOTED++;
        }
        elsif ( !$QUOTED && !$COMMENTED && $token eq "[" && ! $ignore ) {
            $COMMENTED++;
            $log->debug("quote level changed to $COMMENTED");
            next TOKEN;
        }
        elsif ( !$QUOTED && $COMMENTED && $token eq "]" && ! $ignore ) {
            $COMMENTED--;
            next TOKEN;
        }
        elsif ($QUOTED
            && !$COMMENTED
            && $token eq "'"
            && substr( $string, $i, 2 ) ne "''" && ! $quotes )
        {
            $QUOTED--;
        }
        if ( !$QUOTED && $token eq ' ' && ! $whitespace ) {
            next TOKEN;
        }
        $decommented .= $token unless $COMMENTED;
        if ( !$QUOTED && !$COMMENTED && substr( $string, $i, 1 ) eq ';' ) {
            push @trees, $decommented;
            $decommented = '';
        }
    }
    $log->debug("removed comments, split on tree descriptions");
    $log->debug("found ".scalar(@trees)." tree descriptions");
    return @trees;
}

=begin comment

 Type    : Parser
 Title   : _parse_string($string)
 Usage   : my $tree = $newick->_parse_string($string);
 Function: Creates a populated Bio::Phylo::Forest::Tree object from a newick
           string.
 Returns : A Bio::Phylo::Forest::Tree object.
 Args    : $string = a newick tree description

=end comment

=cut

sub _parse_string {
    my ( $self, $string ) = @_;
    my $fac = $self->_factory;
    $self->_logger->debug("going to parse tree string '$string'");
    my $tree      = $fac->create_tree;
    my $remainder = $string;
    my $token;
    my @tokens;
    while ( ( $token, $remainder ) = $self->_next_token($remainder) ) {
        last if ( !defined $token || !defined $remainder );
        $self->_logger->debug("fetched token '$token'");
        push @tokens, $token;
    }
    my $i;
    for ( $i = $#tokens ; $i >= 0 ; $i-- ) {
        last if $tokens[$i] eq ';';
    }
    my $root = $fac->create_node;
    $tree->insert($root);
    $self->_parse_node_data( $root, @tokens[ 0 .. ( $i - 1 ) ] );
    $self->_parse_clade( $tree, $root, @tokens[ 0 .. ( $i - 1 ) ] );
    return $tree;
}

sub _parse_clade {
    my ( $self, $tree, $root, @tokens ) = @_;
    my $fac = $self->_factory;
    $self->_logger->debug("recursively parsing clade '@tokens'");
    my ( @clade, $depth, @remainder );
  TOKEN: for my $i ( 0 .. $#tokens ) {
        if ( $tokens[$i] eq '(' ) {
            if ( not defined $depth ) {
                $depth = 1;
                next TOKEN;
            }
            else {
                $depth++;
            }
        }
        elsif ( $tokens[$i] eq ',' && $depth == 1 ) {
            my $node = $fac->create_node;
            $root->set_child($node);
            $tree->insert($node);
            $self->_parse_node_data( $node, @clade );
            $self->_parse_clade( $tree, $node, @clade );
            @clade = ();
            next TOKEN;
        }
        elsif ( $tokens[$i] eq ')' ) {
            $depth--;
            if ( $depth == 0 ) {
                @remainder = @tokens[ ( $i + 1 ) .. $#tokens ];
                my $node = $fac->create_node;
                $root->set_child($node);
                $tree->insert($node);
                $self->_parse_node_data( $node, @clade );
                $self->_parse_clade( $tree, $node, @clade );
                last TOKEN;
            }
        }
        push @clade, $tokens[$i];
    }
}

sub _parse_node_data {
    my ( $self, $node, @clade ) = @_;
    $self->_logger->debug("parsing name and branch length for node");
    my @tail;
  PARSE_TAIL: for ( my $i = $#clade ; $i >= 0 ; $i-- ) {
        if ( $clade[$i] eq ')' ) {
            @tail = @clade[ ( $i + 1 ) .. $#clade ];
            last PARSE_TAIL;
        }
        elsif ( $i == 0 ) {
            @tail = @clade;
        }
    }

    # name only
    if ( scalar @tail == 1 ) {
        $node->set_name( $tail[0] );
    }
    elsif ( scalar @tail == 2 ) {
        $node->set_branch_length( $tail[-1] );
    }
    elsif ( scalar @tail == 3 ) {
        $node->set_name( $tail[0] );
        $node->set_branch_length( $tail[-1] );
    }
}

sub _next_token {
    my ( $self, $string ) = @_;
    $self->_logger->debug("tokenizing string '$string'");
    my $QUOTED          = 0;
    my $token           = '';
    my $TOKEN_DELIMITER = qr/[():,;]/;
  TOKEN: for my $i ( 0 .. length($string) ) {
        $token .= substr( $string, $i, 1 );
        $self->_logger->debug("growing token: '$token'");
        if ( !$QUOTED && $token =~ $TOKEN_DELIMITER ) {
            my $length = length($token);
            if ( $length == 1 ) {
                $self->_logger->debug("single char token: '$token'");
                return $token, substr( $string, ( $i + 1 ) );
            }
            else {
                $self->_logger->debug(
                    sprintf( "range token: %s",
                        substr( $token, 0, $length - 1 ) )
                );
                return substr( $token, 0, $length - 1 ),
                  substr( $token, $length - 1, 1 )
                  . substr( $string, ( $i + 1 ) );
            }
        }
        if ( !$QUOTED && substr( $string, $i, 1 ) eq "'" ) {
            $QUOTED++;
        }
        elsif ($QUOTED
            && substr( $string, $i, 1 ) eq "'"
            && substr( $string, $i, 2 ) ne "''" )
        {
            $QUOTED--;
        }
    }
}

# podinherit_insert_token

=head1 SEE ALSO

There is a mailing list at L<https://groups.google.com/forum/#!forum/bio-phylo> 
for any user or developer questions and discussions.

=over

=item L<Bio::Phylo::IO>

The newick parser is called by the L<Bio::Phylo::IO> object.
Look there to learn how to parse newick strings.

=item L<Bio::Phylo::Manual>

Also see the manual: L<Bio::Phylo::Manual> and L<http://rutgervos.blogspot.com>.

=back

=head1 CITATION

If you use Bio::Phylo in published research, please cite it:

B<Rutger A Vos>, B<Jason Caravas>, B<Klaas Hartmann>, B<Mark A Jensen>
and B<Chase Miller>, 2011. Bio::Phylo - phyloinformatic analysis using Perl.
I<BMC Bioinformatics> B<12>:63.
L<http://dx.doi.org/10.1186/1471-2105-12-63>

=cut

1;