The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#######################################################################
# UnalignedBlock.pm
#######################################################################
# 
# thanks to Tom Hladish for the original version 
#
# $Id: UnalignedBlock.pm,v 1.25 2012/02/10 13:28:28 astoltzfus Exp $

#################### START POD DOCUMENTATION ##########################

=head1 NAME

Bio::NEXUS::UnalignedBlock - Represents an UNALIGNED block of a NEXUS file

=head1 SYNOPSIS

 if ( $type =~ /unaligned/i ) {
     $block_object = new Bio::NEXUS::UnalignedBlock($type, $block, $verbose);
 }

=head1 DESCRIPTION

This is a class representing an unaligned block in NEXUS file

=head1 FEEDBACK

All feedback (bugs, feature enhancements, etc.) is greatly appreciated. 

=head1 AUTHORS

 Thomas Hladish (tjhladish at yahoo)

=head1 VERSION

$Id: UnalignedBlock.pm,v 1.25 2012/02/10 13:28:28 astoltzfus Exp $

=head1 METHODS

=cut

package Bio::NEXUS::UnalignedBlock;

use strict;
#use Data::Dumper; # XXX this is not used, might as well not import it!
#use Carp;# XXX this is not used, might as well not import it!
use Bio::NEXUS::Functions;
use Bio::NEXUS::TaxUnitSet;
use Bio::NEXUS::Matrix;
use Bio::NEXUS::Util::Exceptions;
use vars qw(@ISA $VERSION $AUTOLOAD);
use Bio::NEXUS; $VERSION = $Bio::NEXUS::VERSION;
@ISA = qw(Bio::NEXUS::Matrix);
my $logger = Bio::NEXUS::Util::Logger->new();

=head2 new

 Title   : new
 Usage   : block_object = new Bio::NEXUS::UnalignedBlock($block_type, $commands, $verbose, $taxlabels);
 Function: Creates a new Bio::NEXUS::UnalignedBlock object
 Returns : Bio::NEXUS::UnalignedBlock object
 Args    : type (string), the commands/comments to parse (array ref), and a verbose flag (0 or 1)

=cut

sub new {
    my ( $class, $type, $commands, $verbose, $taxa ) = @_;
    unless ($type) { ( $type = lc $class ) =~ s/Bio::NEXUS::(.+)Block/$1/i; }
    my $self = { type => $type };
    bless $self, $class;
    $self->set_taxlabels($taxa);
    $self->{'otuset'} = new Bio::NEXUS::TaxUnitSet();
    $self->_parse_block( $commands, $verbose )
        if ( ( defined $commands ) and @$commands );
    return $self;
}

=begin comment

 Title   : _parse_format
 Usage   : $format = $self->_parse_format($buffer); (private)
 Function: Extracts format values from line and stores values in a hash
 Returns : hash of formats
 Args    : buffer (string)
 Methods : Separates formats by whitespace and creates hash containing
           key = format name and value = format value.

=end comment 

=cut

sub _parse_format {
    my ( $self, $string ) = @_;

    my %format = ();

    while ( $string =~ s/(\S+\s*=\s*[\"|\'][^\"\']+[\"|\'])// ) {
        my ( $name, $symbol ) = split /\s*=\s*/, $1;
        $format{ lc $name } = $symbol;
    }
    while ( $string =~ s/(\S+\s*=\s*\S+)// ) {
        my ( $name, $symbol ) = split /\s*=\s*/, $1;
        $format{ lc $name } = lc $symbol;
    }
    for my $other ( split /\s+/, $string ) {
        if ($other) { $format{ lc $other } = 1; }
    }
    return \%format;
}

=begin comment

 Title   : _parse_matrix
 Usage   : $self->_parse_matrix($buffer); (private)
 Function: Processes buffer containing matrix data
 Returns : none
 Args    : buffer (string)
 Method  : parse according to if name is quoted string or single word, 
           if each state is single character or multi-character (use token keyword)

=end comment 

=cut

sub _parse_matrix {
    my ( $self, $matrix, $verbose ) = @_;

    my @taxa;
    my ( $name, $seq ) = ();

    # Build an array of hashrefs, where each hash has "name" and "seq" values
    # corresponding to the name and sequence found in each row of the matrix
    for my $row ( split /\n|\r/, $matrix ) {
        if ( $row =~ /^\s*$/ ) { next; }
		
        #for quoted taxon name
        if ( $row =~ /^\s*[\"|\']([^\"\']+)[\"|\']\s*([^\[]*)(\[.*\]\s*)*/ ) {
            ( $name, $seq ) = ( $1, $2 );
            $name =~ s/\s+/_/g;
            if ( !$self->find_taxon($name) ) { 
            	Bio::NEXUS::Util::Exceptions::BadArgs->throw(
            		'error' => "Undefined Taxon: $name"
            	); 
            }
        }
        else {

            # for one-word non-quoted taxon name
            $row =~ /^\s*(\S+)(\s*)([^\[]*)(\[.*\]\s*)*/;
            if ( $self->find_taxon($1) ) {
                $name = $1;
                $seq  = $3;
                #print Dumper $seq;
            }
            else {
                print "taxon name $1 not found\n" if $verbose;
                $seq = $1 . $2 . $3;
            }
        }
        #print "> row: $row\n";
        #print "> name: $name\n";
        #print "> seq: $seq\n";
        
        my $newtaxon = 1;
        for my $taxon (@taxa) {
            if ( $taxon->{'name'} eq $name ) {
                $taxon->{'seq'} .= ' ' . $seq;
                $newtaxon = 0;
            }
        }
        if ($newtaxon) {
            push @taxa, { name => $name, seq => $seq };
        }
    }
	#print '> @taxa: ';
    # split each character
    my @otus;
    #print Dumper \@taxa;
    for my $taxon (@taxa) {
        $seq = $taxon->{'seq'};
        $seq =~ s/^\s*(.*\S)\s*$/$1/;

        my @seq;
        while ( $seq =~ s/([^\(]+)|\(([^\(]+)\)// ) {    # for +-(+ -)+-
            if ($1) {                                    # for +-
###             The following 4 commented lines of code are implemented in CharactersBlock.pm; they allow data tokens to be space-delimited.
###             Unaligned blocks do not include the tokens or continuous formats according the Maddison et al.  We
###             may decide that we don't want to restrict unaligned data to DNA/RNA/AA the way Maddison et al have.
#               if ($self->get_format->{'tokens'}  || lc $self->get_format->{'datatype'}  eq 'continuous') {  #LINE 1
#                   push @seq, split /\s+/, $1;                                                               #LINE 2
#               } else {                                                                                      #LINE 3
                push @seq, split /\s*/, $1;

#               }                                                                                             #LINE4
            }
            elsif ($2) {
                push @seq, [ split /,\s*|\s+/, $2 ];     # for (+ -)
            }
        }

        push @otus, Bio::NEXUS::TaxUnit->new( $taxon->{'name'}, \@seq );
    }
    
    my $otuset = $self->get_otuset();
    $otuset->set_otus( \@otus );
    $self->set_taxlabels( $otuset->get_otu_names() );
    return \@otus;
}

=head2 find_taxon

 Title   : find_taxon
 Usage   : my $is_taxon_present = $self->find_taxon($taxon_name);
 Function: Finds whether the input taxon name is present in the taxon label.
 Returns : 0 (not present)  or 1 (if present).
 Args    : taxon label (as string)

=cut

sub find_taxon {
    my ( $self, $name ) = @_;
    if ( @{ $self->get_taxlabels || [] } == 0 ) { return 1; }
    for my $taxon ( @{ $self->get_taxlabels() } ) {
        if ( lc $taxon eq lc $name ) { return 1; }
    }
    return 0;
}

=head2 set_format

 Title   : set_format
 Usage   : $block->set_format(\%format);
 Function: set the format of the characters
 Returns : none
 Args    : hash of format values

=cut

sub set_format {
    my ( $self, $format ) = @_;
    $self->{'format'} = $format;
}

=head2 get_format

 Title   : get_format
 Usage   : $block->get_format();
 Function: Returns the format of the characters
 Returns : hash of format values
 Args    : none

=cut

sub get_format { shift->{'format'} || {} }

=head2 set_otuset

 Title   : set_otuset
 Usage   : $block->set_otuset($otuset);
 Function: Set the otus
 Returns : none
 Args    : TaxUnitSet object

=cut

sub set_otuset {
    my ( $self, $otuset ) = @_;
    $self->{'otuset'} = $otuset;
    $self->set_taxlabels( $otuset->get_otu_names() );
}

=head2 set_charstatelabels

 Title   : set_charstatelabels
 Usage   : $block->set_charstatelabels($labels);
 Function: Set the character names and states
 Returns : none
 Args    : array of character states

=cut

sub set_charstatelabels {
    my ( $self, $charstatelabels ) = @_;
    $self->get_otuset->set_charstatelabels($charstatelabels);
}

=head2 get_charstatelabels

 Title   : get_charstatelabels
 Usage   : $set->get_charstatelabels();
 Function: Returns an array of character states
 Returns : character states
 Args    : none

=cut

sub get_charstatelabels {
    my ($self) = @_;
    return $self->get_otuset->get_charstatelabels();
}

=head2 get_ntax

 Title   : get_ntax
 Usage   : $block->get_ntax();
 Function: Returns the number of taxa of the block
 Returns : # taxa
 Args    : none

=cut

sub get_ntax {
    my $self = shift;
    return $self->get_otuset()->get_ntax();
}

=head2 rename_otus

 Title   : rename_otus
 Usage   : $block->rename_otus(\%translation);
 Function: Renames all the OTUs to something else
 Returns : none
 Args    : hash containing translation

=cut

sub rename_otus {
    my ( $self, $translation ) = @_;
    $self->get_otuset()->rename_otus($translation);
}

=head2 add_otu_clone

 Title   : add_otu_clone
 Usage   : ...
 Function: ...
 Returns : ...
 Args    : ...

=cut

sub add_otu_clone {
	my ( $self, $original_otu_name, $copy_otu_name ) = @_;
	# print "Warning: Bio::NEXUS::UnalignedBlock::add_otu_clone() method not fully implemented\n";
	
	if ($self->find_taxon($copy_otu_name)) {
		print "Error: an OTU with that name [$copy_otu_name] already exists.\n";
	}
	else {
		$self->add_taxlabel($copy_otu_name);
	}
	
	my @otu_set = ();
	if (defined $self->{'otuset'}->{'otus'}) {
	    @otu_set = @{ $self->{'otuset'}->{'otus'} };
	}
	foreach my $otu (@otu_set) {
		if (defined $otu) {
			if ($otu->get_name() eq $original_otu_name) {
				my $otu_clone = $otu->clone();
				$otu_clone->set_name($copy_otu_name);
				$self->{'otuset'}->add_otu($otu_clone);
			}
		}
	}
	
}

=head2 equals

 Name    : equals
 Usage   : $block->equals($another);
 Function: compare if two Bio::NEXUS::UnalignedBlock objects are equal
 Returns : boolean 
 Args    : a Bio::NEXUS::CharactersBlock object

=cut

sub equals {
    my ( $self, $block ) = @_;
    if ( !Bio::NEXUS::Block::equals( $self, $block ) ) { return 0; }
    return $self->get_otuset()->equals( $block->get_otuset() );
}

=begin comment

 Name    : _write
 Usage   : $block->_write();
 Function: Writes NEXUS block containing unaligned data
 Returns : none
 Args    : file name (string)

=end comment 

=cut

sub _write {
    my ( $self, $fh, $verbose ) = @_;
    $fh ||= \*STDOUT;

    Bio::NEXUS::Block::_write( $self, $fh );
    $self->_write_matrix_info( $fh, $verbose );
    $self->_write_matrix( $fh, $verbose );
    print $fh "END;\n";
    return;
}

=begin comment

 Name    : _write_matrix_info
 Usage   : $self->_write_matrix_info($file_handle,$verbose);
 Function: Writes UnalignedBlock info (all the block content except the matrix data) into the filehandle
 Returns : none
 Args    : $file_handle and $verbose 

=end comment 

=cut

sub _write_matrix_info {
    my ( $self, $fh, $verbose ) = @_;
    $fh ||= \*STDOUT;

    my $ntax = $self->get_ntax();
    print $fh "\tDIMENSIONS ntax=$ntax;\n";

    my %formats = %{ $self->get_format() };
    if ( scalar keys %formats ) {
        print $fh "\tFORMAT ";
        if ( defined $formats{'datatype'} ) {
            print $fh " datatype=$formats{'datatype'}";
        }

        for my $format ( keys %formats ) {
            if ( !$formats{$format} || $format =~ /datatype/i ) { next; }
            elsif ( $formats{$format} eq '1' ) {
                print $fh " $format";
            }
            else {
                print $fh " $format=$formats{$format}";
            }
        }
        print $fh ";\n";
    }
    return;
}

=begin comment

 Name    : _write_matrix
 Usage   : $self->_write_matrix($file_handle,$verbose);
 Function: Writes UnalignedBlock matrix( The data stored in the matrix command)  into the filehandle 
 Returns : none
 Args    : $file_handle and $verbose 

=end comment 

=cut

sub _write_matrix {
    my ( $self, $fh, $verbose ) = @_;
    $fh ||= \*STDOUT;

    my @otus = @{ $self->get_otuset()->get_otus() };
    print $fh "\tMATRIX\n";
    for my $otu (@otus) {
        my $seq = $otu->get_seq_string();
        print $fh "\t", $otu->get_name(), "\t", $seq, "\n";
    }
    print $fh "\t;\n";
    return;
}

sub AUTOLOAD {
    return if $AUTOLOAD =~ /DESTROY$/;
    my $package_name = __PACKAGE__ . '::';

    # The following methods are deprecated and are temporarily supported
    # via a warning and a redirection
    my %synonym_for = (
        "${package_name}set_charstates" => "${package_name}set_charstatelabels",
        "${package_name}get_charstates" => "${package_name}get_charstatelabels",
    );

    if ( defined $synonym_for{$AUTOLOAD} ) {
        $logger->warn( "$AUTOLOAD() is deprecated; use $synonym_for{$AUTOLOAD}() instead" );
        goto &{ $synonym_for{$AUTOLOAD} };
    }
    else {
        Bio::NEXUS::Util::Exceptions::UnknownMethod->throw(
        	'error' => "ERROR: Unknown method $AUTOLOAD called"
        );
    }
}

1;