The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl -w
package Bio::NEXUS::Tools::NexModifier;
#######################################################################
# Bio::NEXUS::Tools::NexModifier.pm
#######################################################################
# Author: Chengzhi Liang, Weigang Qiu, Eugene Melamud, Peter Yang
# $Id: NexModifier.pm,v 1.2 2008/06/16 19:53:41 astoltzfus Exp $

# This script provides a set of functions for manipulating NEXUS files
# eg, select/exclude/rename OTUs, select character columns in OTUs
# select/exclude subtree, select/reroot tree

use strict;
use Pod::Usage;
use Carp;
use Data::Dumper;
use Bio::NEXUS; our $VERSION = $Bio::NEXUS::VERSION;

$Data::Dumper::Maxdepth=3;

=head2 new

 Title   : new
 Usage   : NA
 Function: create a new modifier object
 Returns : NA
 Args    : NA

=cut
sub new {
   my $self = shift;
   my $inp_data = $_[0];
   $inp_data = [@_] if (ref($inp_data) ne 'ARRAY') ;

   #print Dumper $inp_data;
   my $object_data = {
      'parameters' => [
	 'version' 			=> 1.0,	
	 'help' 			=> undef,	
	 'input_file'			=> 'test.nex',	
	 'output_file'			=> 'temp.nex',	
	 'rename_otus'			=> undef,	
	 'rename_otus_file'  	=> undef,	
	 'restrict_blocks'		=> undef,	
	 'restrict_otus'		=> undef,	
	 'restrict_subtree'		=> undef,	
	 'restrict_tree' 		=> undef,	
	 'restrict_column' 		=> undef,	
	 'restrict_set' 		=> undef,	
	 'exclude_blocks'		=> undef,	
	 'exclude_otus'			=> undef,	
	 'exclude_subtree' 		=> undef,	
	 'exclude_column' 		=> undef,	
	 'exclude_set' 			=> undef,	
	 'root'	 			=> undef,	
	 'addtree' 			=> undef,	
	 'make_sets_by_file'		=> undef,	
	 'make_sets_by_inode'		=> undef,	
	 'make_sets_by_otu'		=> undef,	
	 'make_sets_by_clade'		=> undef,	
	 'make_sets_by_union'		=> undef,	
	 'make_sets_by_difference'	=> undef,	
	 'make_sets_by_charstate'	=> undef,	
	 'make_sets_by_cladeconsensus'	=> undef,	
	 'make_sets_by_difference'	=> undef,	
	 'remove_sets' 			=> undef,	
     'list_sets' 			=> undef,	
     'rewrite' 			=> undef,	
     'return_nexus_obj'		=> 1,
     ],
     'data'=> {

     },
 };

 for (my $i=0; $i < scalar @$inp_data; $i = $i+2) {
     $inp_data->[$i] =~s/^-//;
 }
 
$object_data->{'parameters'} = $inp_data;
 
 bless($object_data,$self);
 return $object_data;

}

=head2 modify

 Title   : modify
 Usage   : NA
 Function: alter object
 Returns : NA
 Args    : NA

=cut

sub modify {
    my $self            = shift;
    my $options 	       = $self->{parameters};
    my %options_as_hash = @{$options};

   my $infile = $options_as_hash{input_file};

   unless ($infile) {die "Usage: nextool.pl <infile> [outfile] [command [options]]\n";}

   if ($infile eq "-v") {
      die "nextool.pl \$Revision: 1.2 $\n"; 
   }
   if ($infile eq "-h" || $infile eq '--help') {
      pod2usage(-exitval => 0, -verbose => 1);
   }

   my $outfile = $options_as_hash{output_file};

###if ($outfile eq "-") {$outfile = \*STDOUT;}
   if ($outfile =~ /(^-$|^STDOUT$)/) {$outfile = \*STDOUT;}
   my $command;
   my $value;

   my @test = @{$options};

   my $nexus = new Bio::NEXUS($infile);

   if (! $outfile || $options_as_hash{"rewrite"}) { 
       $nexus = new Bio::NEXUS($infile, 1);
       $nexus->write($outfile||"temp.nex", 1);
       return $nexus;
   }
   #print Dumper $nexus;

   for(my $i=0; $i < scalar @test ; $i = $i+2 ) { 
       my $runtime_options = { $test[$i] => $test[$i+1] };
       #print "$test[$i], i=$i\n";

       ### Rename options
       $nexus = rename_otus_file($nexus,$runtime_options->{rename_otus_file} ) if $runtime_options->{rename_otus_file};
       $nexus = $nexus->rename_otus($runtime_options->{rename_otus}) if $runtime_options->{rename_otus};

       ### Restrict options
       $nexus = selectbyblock($nexus, $runtime_options->{restrict_blocks}) if $runtime_options->{restrict_blocks};
       $nexus = $nexus->select_otus($runtime_options->{restrict_otus}) if $runtime_options->{restrict_otus};
       $nexus = selectbyinode($nexus, $runtime_options->{restrict_subtree}) if $runtime_options->{restrict_subtree};
       $nexus = selectbytree($nexus, $runtime_options->{restrict_tree}) if $runtime_options->{restrict_tree};
       $nexus = selectbycolumn($nexus,1, $runtime_options->{restrict_column}) if $runtime_options->{restrict_column};
       $nexus = selectbysets($nexus,1, $runtime_options->{restrict_set}) if $runtime_options->{restrict_set};

      ### Exclude options
      $nexus = exclude_blocks($nexus, $runtime_options->{exclude_blocks}) if $runtime_options->{exclude_blocks};
      $nexus = $nexus->exclude_otus($runtime_options->{exclude_otus}) if $runtime_options->{exclude_otus};
      $nexus = exclude_subtree($nexus, $runtime_options->{exclude_subtree}) if $runtime_options->{exclude_subtree};
      $nexus = selectbycolumn($nexus,0, $runtime_options->{exclude_column}) if $runtime_options->{exclude_column};
      $nexus = selectbysets($nexus,0, $runtime_options->{exclude_set}) if $runtime_options->{exclude_set};

      ### Reroot options
      $nexus = reroottree($nexus, $runtime_options->{reroottree}) if $runtime_options->{reroottree};

      ### Addtree options
      &addtree($nexus,$runtime_options->{add_tree}) if $runtime_options->{addtree};

      ### Sets options
      $nexus = setsbyfile($nexus, $runtime_options->{make_sets_by_file}) if $runtime_options->{make_sets_by_file}; 
      $nexus = setsbyinode($nexus,$runtime_options->{make_sets_by_inode} ) if $runtime_options->{make_sets_by_inode}; 
      $nexus = setsbyotus($nexus, $runtime_options->{make_sets_by_otu}) if $runtime_options->{make_sets_by_otu}; 
      $nexus = setsbyclade($nexus, $runtime_options->{make_sets_by_clade}) if $runtime_options->{make_sets_by_clade}; 
      $nexus = setsbyunion($nexus, $runtime_options->{make_sets_by_union}) if $runtime_options->{make_sets_by_union}; 
      $nexus = setsbydifference($nexus, $runtime_options->{make_sets_by_difference}) if $runtime_options->{make_sets_by_difference};
      $nexus = setsbycharstate($nexus,$runtime_options->{make_sets_by_charstate}) if $runtime_options->{make_sets_by_charstate}; 
      $nexus = setsbycladeconsensus($nexus,$runtime_options->{make_sets_by_cladeconsensus} ) if $runtime_options->{make_sets_by_cladeconsensus}; 
      #die "Expecting command that looks like 'makesets <bymode> <arguments>', where bymode is one of: file, inode, otu, clade, union, difference, charstate, cladeconsensus\n";
      $nexus = removesets($nexus,$runtime_options->{remove_sets}) if $runtime_options->{remove_sets};
      $nexus = renamesets($nexus,$runtime_options->{rename_sets}) if $runtime_options->{rename_sets};
      if ($runtime_options->{listsets}) {
          file_overwrite_warning($infile, $outfile, 'listsets', 0);
          listsets($nexus, $outfile, $runtime_options->{listsets});
      }
      if ($runtime_options->{listsetnames}) {
          file_overwrite_warning($infile, $outfile,'listsetnames' , 0);
          listsetnames($nexus, $outfile);
      }
  }

  $nexus->write($outfile);
  return $nexus if $options_as_hash{return_nexus_obj};
}


######### SUBROUTINES #########################


=head2 rename_otus

 Title   : rename_otus
 Usage   : NA
 Function: assigns new labels to OTUs
 Returns : NA
 Args    : NA

=cut

sub rename_otus {
   my ($nexus, $transfile) = @_;
   open(FILE, "<$transfile") or die "ERROR: Can\'t open $transfile\n";
   my @lines = <FILE>;
   my $lines = "@lines";
   my %translation = (split(/\s+/, $lines));
   return $nexus->rename_otus(\%translation);
}

=head2 selectbyblock

 Title   : selectbyblock
 Usage   : NA
 Function: select a subset of blocks
 Returns : NA
 Args    : NA

=cut

sub selectbyblock {
   my ($nexus, @blocks) = @_;
   return $nexus->select_blocks(\@blocks);
}

=head2 selectbyotu

 Title   : selectbyotu
 Usage   : NA
 Function: select subset of OTUs by labels
 Returns : NA
 Args    : NA

=cut
#  
sub selectbyotu {
   my ($nexus, $mode, @args) = @_;
   die "need otu names" unless (@args);
   my @otus;
   if ($args[0] eq '-f') { # input from file
      my $file = $args[1];
      open(FILE, $file) or carp "File $file not found\n";
      my @lines = <FILE>;
      @otus = split /\s+/, "@lines";
      close(FILE);
   }
   else {   # input from command line separated by space
      my $list = join( " ", @args ); 
      $list = unquote( $list ); 
      $list =~ s/^\s+|\s+$//g; 
      @otus = split( /[,\s]\s*/, $list ); 
   }
   if ( $mode == 1 ) { # select mode 
      return $nexus->select_otus(\@otus); 
   }
   else { # exclude mode 
      return $nexus->exclude_otus(\@otus); 
   }
}

=head2 selectbytree

 Title   : selectbytree
 Usage   : NA
 Function: select a tree
 Returns : NA
 Args    : NA

=cut

sub selectbytree {
   my ($nexus, $treename) = @_;
   ($treename) or die "ERROR: Need to specify a tree to be selected\n";
   return $nexus->select_tree($treename);
}

=head2 selectbyinode

 Title   : selectbyinode
 Usage   : NA
 Function: select a subtree by specifying its root internal node
 Returns : NA
 Args    : NA

=cut
# 
sub selectbyinode {
   my ($nexus, $nodename, $treename) = @_;
   $nodename or die "ERROR: Need to specify an internal node for subtree\n";

   return $nexus->select_subtree($nodename, $treename);
}

=head2 selectbycolumn

 Title   : selectbycolumn
 Usage   : NA
 Function: select ($command=1) or exclude ($command=0) specified column list or file with list
 Returns : NA
 Args    : NA

=cut
# arguments (column numbers) can be of form: 
# 1) 1-3 4 5 6-10 or 1-3, 4  5, 6-10
# 2) -f <file name> contains numbers in format as examplified in 1)  
sub selectbycolumn {
   my ($nexus, $command, @args) = @_;
   my $args = "@args";
   $args =~ s/title\s*=\s*(\w+)//i;    
   my $title = $1;
#    my $block = $nexus->get_block("characters", $title);

   die "need column numbers" unless $args;

   my $columns;
   if ($args =~ /-f (\S+)/) { # input from file
      my $file = $1;
      $columns = do{ local(@ARGV, $/) = $file; <>};
      $columns =~ s/\n/ /g;
   }else {   # input from command line separated by comma or space
      $columns = $args;
   }

   my @columns = @{ &parse_number($columns) };
   if ($command) {
      return $nexus->select_chars(\@columns, $title);
   }else {
      return $nexus->exclude_chars(\@columns, $title);
   }
}

=head2 selectbysets

 Title   : selectbysets
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub selectbysets {
   my ($nexus, $mode, @setnames) = @_;    
   my @otus;
   die "Provide set names as arguments\n" unless (@setnames);
   for my $setname (@setnames) {
      push(@otus, @{ $nexus->get_block('sets')->get_taxset($setname) } );
   }

   if ( $mode == 1 ) { # "select" mode 
      return $nexus->select_otus(\@otus); 
   } else { # "exclude" mode 
      return $nexus->exclude_otus(\@otus); 
   }
}

=head2 parse_number

 Title   : parse_number
 Usage   : NA
 Function: parse numbers in format "1-3, 4 6 8-10"
 Returns : NA
 Args    : NA

=cut
# 
sub parse_number {
   my $s = unquote( shift );

   if (! $s =~ /^\s*(\d+(-\d+)?)([,\s]\s*\d+(-\d+)?)*\s*$/ ) { 
      die "Invalid number format.  Use 1 or 1-3 or 1, 3, 5-8 or 1 3 5 6-10.\n"; 
   } 
   $s =~ s/^\s+|\s+$//g;
   $s =~ s/,?\s+/,/g;  # use ',' as separator
   my @cols = split(/,/, $s);

   my @arr;
   foreach my $item (@cols) {
      if ($item =~ /-/) { # eg 1-3
	 $item =~ /([0-9]+)\s*-\s*([0-9]+)/;
	 for (my $i = $1; $i <= $2; $i++) { push ( @arr, $i-1 ); }
      } elsif ($item =~ /^\d+$/) { # eg 4
	 push ( @arr, $item-1 );
      } elsif ($item) {
	 die "non-number was used for column number\n";
      }
   }

   @arr = sort {$a<=>$b} @arr;
   return \@arr;
}

=head2 unquote

 Title   : unquote
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub unquote { 
   my $string = shift; 
   $string =~ s/^ *'(.*)' *$/$1/;
   $string =~ s/^ *"(.*)" *$/$1/;
   return( $string ); 
}

=head2 exclude_blocks

 Title   : exclude_blocks
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub exclude_blocks {
   my ($nexus, @blocks) = @_;
   return $nexus->exclude_blocks(\@blocks);
}

=head2 exclude_subtree

 Title   : exclude_subtree
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub exclude_subtree {
   my ($nexus, $subtree) = @_;
   return $nexus->exclude_subtree($subtree);
}

=head2 reroottree

 Title   : reroottree
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub reroottree {
   my ($nexus, $outgroup, $root_position, $treename) = @_;
   if ( $nexus->get_block('trees')->get_tree('$root_position') ) { #in case no root position is supplied, but a tree name is
      ($root_position, $treename) = (undef, $root_position);
   }
   return $nexus->reroot($outgroup, $root_position, $treename);
}

=head2 addtree

 Title   : addtree
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub addtree {
   my ($nexus,$treeFile,$treeName) = @_;

   if ( !$treeName ) { ($treeName = (split("/",$treeFile))[-1]) =~ s/(.*)\..*/$1/; }

   if ( -f $treeFile ) {
      open(TF,"<$treeFile") || warn("ERROR: could not open $treeFile. No tree data will appear in NEXUS file.");

      my ($treeString);
      while(<TF>) {
	 s/\s*//go;
	 $treeString .= $_;
      }

      close(TF);

      $treeString =~ s/(.*);$/$1/; #remove semicolon from end of $treeString, if there is one.

      #$treeString must be formatted like this in order to work w/ Bio::NEXUS::TreesBlock::new -
      #      tree  $treeName = $treeString
      #the following manipulations ensure we end up w/ such a string to pass to create_block
      $treeString =~ s/^tree(.*)$/$1/;
      unless ( $treeString =~ m/^$treeName=/ ) {
	 $treeString = $treeName . " = " . $treeString;
      }
      $treeString = "tree " . $treeString;

      print STDERR "tree string:\n$treeString\n";
      $nexus->add_block( $nexus->create_block('trees',$treeString, 1) );
   } else {
      die("Tree file: \"$treeFile\" not found or could not be read");
   }
}

=head2 setsbyfile

 Title   : setsbyfile
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub setsbyfile {
   my ($nexus, @setfiles) = @_;
   my $sets;

   for my $setfile (@setfiles) {
      open (SF, "<$setfile") || die ("Could not open set file $setfile.\n\n");
      while (<SF>) {
	 chomp;
	 push (@{$$sets{$setfile}}, $_);
      }
      close (SF);
   }
   &add_or_create_set ($nexus, $sets);
   return $nexus;
}

=head2 setsbyinode

 Title   : setsbyinode
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub setsbyinode {
   my ($nexus, @inodenames) = @_;
   my $sets;

   for my $inodename (@inodenames) {
      unless ($nexus->get_block('trees')->get_tree()->find($inodename)) {
	 die "User-provided inode <$inodename> is not the name of an inode in this NEXUS file\nCommand aborted\n\n";
      }
      my $subtree = $nexus->select_subtree($inodename);
      my $otus = $subtree->get_otus();
      $$sets{$inodename} = $otus;
   }
   &add_or_create_set ($nexus, $sets);
   return $nexus;
}

=head2 setsbyclade

 Title   : setsbyclade
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub setsbyclade {
   my ($nexus, @clade_elements) = @_;
   my ($sets, $treename);# $treename is currently a dummy variable, but we should implement it

   unless (validate_otus($nexus, [@clade_elements])) {die "User-provided OTUs failed validation against TAXA Block\n\n"}

   while (scalar(@clade_elements) > 0) {
      my ($otu1, $otu2) = splice(@clade_elements, 0, 2);
      unless (defined $otu2) {die "You have supplied an odd number of OTU's.  This mode requires pairs.\n\n"}
      $otu1 = $nexus->get_block('trees')->get_tree($treename)->find($otu1);
      $otu2 = $nexus->get_block('trees')->get_tree($treename)->find($otu2);
      my $inode_name = $otu1 -> mrca($otu2, $treename) -> name();
      my $otus = $nexus->select_subtree($inode_name)->get_otus();
      $$sets{$inode_name} = $otus;
   }

   &add_or_create_set ($nexus, $sets);
   return $nexus;
}

=head2 setsbyotus

 Title   : setsbyotus
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub setsbyotus {
   my ($nexus, @otuargs) = @_;
   my $otuargs = join(" ", @otuargs);
   my $sets;

   my @setsandelements = $otuargs =~ /\s*([^=]+)\s*=\s*\[\s*([^\]]+)\s*\]/g;

   while (@setsandelements) {
      $$sets{shift(@setsandelements)} = [split(/\s/, splice(@setsandelements, 1, 1))];
   }

   my $validation = 1;
   for my $users_otus (values %$sets) {
      $validation = validate_otus($nexus, $users_otus);
   }

   unless ($validation == 1) {die "User-defined sets failed validation against TAXA Block\n\n"}

   &add_or_create_set ($nexus, $sets);
   return $nexus;
}

=head2 setsbyunion

 Title   : setsbyunion
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub setsbyunion {
   my ($nexus, @setargs) = @_;
   my $setargs = join(" ", @setargs);
   my $sets = $nexus -> get_block('sets') -> get_taxsets();

   my @setsandelements = $setargs =~ /\s*(\b[^=]+\b)\s*=\s*\[\s*([^\]]+)\s*\]/g;
   while (@setsandelements) {
      my ($newsetname, @subsets) = (shift(@setsandelements), split(/(?:\s+|\s*\+\s*)/, shift(@setsandelements)));
      my @otus;
      for my $subset (@subsets) {
	 push(@otus, @{$nexus -> get_block('sets') -> get_taxset($subset)});
      }
      $$sets{$newsetname} = [@otus];
   }

   &add_or_create_set ($nexus, $sets);
   return $nexus;
}

=head2 setsbydifference

 Title   : setsbydifference
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub setsbydifference {
   my ($nexus, @setargs) = @_;
   my $setargs = join(" ", @setargs);
   my $sets = $nexus -> get_block('sets') -> get_taxsets();
#    print Dumper ($sets);

   @setargs = $setargs =~ /\s*([^=\s]+)\s*=\s*\[\s*([^-\s\]]+)\s*-\s*([^-\s\]]+)\s*\]/g;
   my (@difference_set_names, @minuend_set_names, @subtrahend_set_names);
   while (@setargs) {
      push(@difference_set_names, shift (@setargs));
      push(@minuend_set_names, shift (@setargs));
      push(@subtrahend_set_names, shift (@setargs));
   }

   for (my $i = 0; $i < @difference_set_names; $i++) {
      $$sets{$difference_set_names[$i]} = [@{$$sets{$minuend_set_names[$i]}}];
      for (my $j = 0; $j < @{$$sets{ $difference_set_names[$i] }}; $j++ ) {
	 for my $subtrahendOTU ( @{$$sets{ $subtrahend_set_names[$i] }} ) {
	    if ( ${$$sets{ $difference_set_names[$i] }}[$j] eq $subtrahendOTU) {
	       splice ( @{$$sets{ $difference_set_names[$i] }}, $j, 1);
	    }
	 }
      }
   }

   &add_or_create_set ($nexus, $sets);
   return $nexus;
}

=head2 setsbycharstate

 Title   : setsbycharstate
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub setsbycharstate {
   my ($nexus, @char_args) = @_;
   my ($proposed_sets, $title) = &parse_charstate_args(@char_args);
   my ($sets, @column_labels);
   my $block = $nexus->get_block("characters", $title);

   my $character_labels = $block->get_charlabels;
   my $seq_length = $block->get_nchar;
   my %char_matrix = %{$block->get_otuset->get_otu_sequences};
   if ($character_labels && @$character_labels) { 
      @column_labels = @$character_labels; 
   } elsif ($seq_length) {
      for (1 .. $seq_length) {push @column_labels, $_;}
   }
   my %column_indices;
   for (my $i = 0; $i < @column_labels; $i++) {
      $column_indices{$column_labels[$i]} = $i;
   }
   foreach my $proposed_setname (keys %{$proposed_sets}){
      while (@{$$proposed_sets{$proposed_setname}} > 0 ) {
	 my ($column_label, $query_value) = splice(@{$$proposed_sets{$proposed_setname}}, 0, 2);
	 foreach my $otu (keys %char_matrix) {
	    my $locus_value = substr($char_matrix{$otu},$column_indices{$column_label},1);
	    if ($query_value =~ m/$locus_value/) {
	       push (@{$$sets{$proposed_setname}}, $otu);
	    }
	 }
      }
   }

   &add_or_create_set ($nexus, $sets);
   return $nexus;
}

=head2 parse_charstate_args

 Title   : parse_charstate_args
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub parse_charstate_args {
   my $char_args = join (" ", @_);
   $char_args =~ m/title\s*=\s*(\w+)/i;    
   my $title = $1;                                                # store the captured title name, if one was provided
   my $num_title_matches = $char_args =~ s/title\s*=\s*\w+//g;    # make sure the user only referred to one title (to prevent ambiguity)
   if ($num_title_matches > 1 ) {die ("Too many 'title' arguments--please reference only one Characters Block\n\n")}
   # the following line matches each set as defined by the user and stores it in the @proposed_sets array
   # basically, it looks for bracket pairs that look like foo[bar], 
   # which may or may not be preceded by a set name declaration ('set_name = ')
   my @proposed_sets = $char_args =~ m/(?:[-\w]+\s*=\s*)?\s*[-\w]+\s*\[[^\]]+\]\s*/g;
   my %proposed_sets;
   for my $proposed_set (@proposed_sets) {                     # for each of the matches made above
      my $set_name;
      if ($proposed_set =~ m/^\s*([^\s\[\]]+)\s*=/) {            # if a name declaration was matched
	 $set_name = $1;                                        # use that as the set name
	 $proposed_set =~ s/^\s*[^\s\[\]]+\s*=\s*//;            # then, get rid of that portion of the array element
      } else {                                                # otherwise, make up a unique set name by joining the column label and locus value(s)
	 $set_name = join("", ($proposed_set =~ m/[^"\s\[\]]/g));
      }
      my ($column_label, $locus_value);
      while ($proposed_set =~ m/[^"\s]/) {                    # as long as $proposed_set contains more than spaces and quotes
	 ($column_label) = $proposed_set =~ m/[^"\s\[]+/;    # match the column label
	 $proposed_set =~ m/\[([^\]]+)\]/;                    # and match the locus value(s)
	 $locus_value = $1;
	 $locus_value =~ s/[\s,]//g;                            # get rid of any spaces or commas the user may have separated the locus values with
	 push (@{$proposed_sets{$set_name}}, $column_label, $locus_value); # push the column label and locus value(s) onto this sets array in the set hash
	 $proposed_set =~ s/^[^\]]*\]//;                        # destroy the evidence
      }
   }
   return (\%proposed_sets, $title);
}

=head2 setsbycladeconsensus

 Title   : setsbycladeconsensus
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub setsbycladeconsensus {
   my ($nexus, @args) = @_;
   my $consensus_sets = {};
   my $treename;
   my $args = join(' ', @args);
   $args =~ m/title\s*=\s*(\w+)/i;
   my $title = $1;
   if ($title) {$args =~ s/\s*title\s*=\s*\w+\s*//i;}
   if ($args =~ /(\w+)/) {$treename = $1;}
   my $char_block = $nexus->get_block("characters", $title);
   my %otunames_sequences = %{$char_block->get_otuset->get_otu_sequences};
   my $tree = $nexus->get_block('trees')->get_tree($treename)->clone();
   my $rootnode = $tree->get_rootnode();
   &set_consensus_seqs($rootnode, \%otunames_sequences);
   ($rootnode, $consensus_sets) = &simplify_tree($rootnode, $consensus_sets);
   $tree->set_name("nonredundant" . $tree->name());
   $nexus->get_block('trees')->add_tree($tree);
   &add_or_create_set ($nexus, $consensus_sets);
   return $nexus;
}

=head2 set_consensus_seqs

 Title   : set_consensus_seqs
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub set_consensus_seqs {
   my ($node, $otus_seqs) = @_;
   my @children = @{ $node->children() };
   for my $child (@children) {
      if ( $child->is_otu() ) {
	 $child->set_seq(${$otus_seqs}{$child->name()});
	 if ( ! $node->get_seq() || ( $node->get_seq() eq $child->get_seq() ) ) {
	    $node->set_seq($child->get_seq());
	 } else {
	    $node->set_seq('NO_CONSENSUS');
	 }
      } else {
	 &set_consensus_seqs($child, $otus_seqs);
	 if ( ! $node->get_seq() || ( $node->get_seq() eq $child->get_seq() ) ) {
	    $node->set_seq($child->get_seq());
	 } else {
	    $node->set_seq('NO_CONSENSUS');
	 }
      }
   }
}

=head2 simplify_tree

 Title   : simplify_tree
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub simplify_tree {
   my ($node, $consensus_sets) = @_;
   my $setname;
   if ($node->get_seq() ne 'NO_CONSENSUS' && ! $node->is_otu() ) {
      my @otu_descendents = @{ $node->get_otus() };
      $setname = $otu_descendents[0]->name() . "_GROUP";
      ${ $consensus_sets }{$setname} = [];
      for my $descendent (@otu_descendents) {
	 push(@{${$consensus_sets }{$setname}}, $descendent->name());
      }
      my $newlength = $node->distance($otu_descendents[0]) + $node->length();
      my $parent = $node->get_parent();
      my $siblings = $node->get_siblings();
      $parent->set_children($siblings);
      $parent->adopt($otu_descendents[0],0);
      $otu_descendents[0]->set_length($newlength);
   } else {
      for my $child (@{ $node->children() }) {
	 simplify_tree($child, $consensus_sets);
      }
   }
   return ($node, $consensus_sets);
}

=head2 validate_otus

 Title   : validate_otus
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub validate_otus { #Verifies that user-specified OTU names are taxlabels in the TAXA block
   my ($nexus, $users_otus) = @_;
   my $validation = 1;
   for my $users_otu (@$users_otus) {
      unless (defined ($nexus->get_block('taxa')->is_taxon($users_otu, 1))) {
	 $validation = 0;
      }
   }
   return $validation;
}

=head2 add_or_create_set

 Title   : add_or_create_set
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub add_or_create_set {
   my ($nexus, $sets) = @_;
   my $setsblock;
   if ($nexus -> get_block('Sets')) {
      $nexus -> get_block('Sets') -> add_taxsets($sets);
   } else {
      $setsblock = Bio::NEXUS::SetsBlock -> new('Sets');
      $setsblock -> set_taxsets($sets);
      $nexus -> add_block($setsblock);
   }
   return $nexus;
}

=head2 removesets

 Title   : removesets
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub removesets {
   my ($nexus, @usersets) = @_;
   my $failure = 0;
   my $sets_to_remove;
   my $setsblock = get_setsblock_if_setsblock($nexus);
   my $taxsets = $setsblock->get_taxsets();
   unless (scalar(keys %$taxsets) > 0) {die "The SETS Block is already empty\n\n";}

   if ($usersets[0] =~ /^-p$/i) {
      shift @usersets;
      for my $userset (@usersets) {
	 my $match = 0;
	 TAXSET:        for my $taxset (keys %$taxsets) {
	    if ($taxset =~ /^$userset$/) {
	       push(@$sets_to_remove, $taxset);
	       $match = 1;
	       next TAXSET;
	    }
	 }
	 if ($match == 0) {
	    warn "<$userset> does not match any sets in this NEXUS file\n";
	    $failure = 1;
	 }
      }
   } else {
      for (my $i=0; $i < scalar(@usersets); $i++) {
	 if ($$taxsets{$usersets[$i]}) {
	    push (@$sets_to_remove, $usersets[$i]);
	 } else { 
	    warn "<$usersets[$i]> is not the name of a set in this NEXUS file.\n";
	    $failure = 1;
	 }
      }
   }
   die "Command aborted\n\n" if $failure == 1;
   $nexus->get_block('sets')->delete_taxsets(@$sets_to_remove);
   return $nexus;
}

=head2 swap_children

 Title   : swap_children
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub renamesets {
   my ($nexus, @old_and_new) = @_;
   my $setsblock = get_setsblock_if_setsblock($nexus);
   $setsblock->rename_taxsets(@old_and_new);
   return $nexus;
}

=head2 listsets

 Title   : listsets
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub listsets {
   my ($nexus, $outfile, @sets_to_list) = @_;
   my $setsblock = get_setsblock_if_setsblock($nexus);
   if (@sets_to_list > 0) {
      my $fh = open_filehandle($outfile);
      for (@sets_to_list) {
	 my @otus = @{ $setsblock->get_taxset($_) };
	 if (@otus == 0) {
	    warn("$_ = []    Possible error: Set is empty or does not exist\n\n");
	 } elsif (@otus > 0) {
	    print $fh "$_ = [@otus]\n\n";
	 }
      }
   } else {
      $setsblock->print_all_taxsets($outfile);
   }
}

=head2 listsetnames

 Title   : listsetnames
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub listsetnames {
   my ($nexus, $outfile) = @_;
   my $setsblock = get_setsblock_if_setsblock($nexus);
   my @setnames = @{ $setsblock->get_taxset_names() };
   my $fh = open_filehandle($outfile);
   print $fh "@setnames\n";
}

=head2 open_filehandle

 Title   : open_filehandle
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub open_filehandle {
   my ($fh) = @_;
   if ($fh eq "-" || $fh eq \*STDOUT) {return \*STDOUT}
   else {open(FH, ">$fh") || die "Could not open $fh for writing\n\n"}
   return \*FH;
}

=head2 get_setsblock_if_setsblock

 Title   : get_setsblock_if_setsblock
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub get_setsblock_if_setsblock {
   my ($nexus) = @_;
   my $setsblock;
   unless ($setsblock = $nexus->get_block('sets')){die "This NEXUS file does not contain a SETS Block\n\n"}
   return $setsblock;
}

=head2 file_overwrite_warning

 Title   : file_overwrite_warning
 Usage   : NA
 Function: NA
 Returns : NA
 Args    : NA

=cut
sub file_overwrite_warning {
   my ($infile, $outfile, $command, $silentmode) = @_;
   if ($silentmode == 1) {return 1}
   if ($infile eq $outfile) {
      warn("Do you really want to overwrite NEXUS file $infile with the $command output?\n");
      if (<STDIN> =~ /y/) {
	 return 1;
      } else { die("Command aborted\n\n")}
   } else { return 1;}
}

################# POD Documentation ##################

__END__

#################### START POD DOCUMENTATION ##########################

=head1 NAME

Bio::NEXUS::Tools::Modifier - NEXUS file content modifier ( exclude/select/rename options on OTUs).

=head1 SYNOPSIS

nextool.pl <input_file> [output_file] [COMMAND [arguments]]

if outfile is not specified, just read input file and output to temp.nex

Commands:

    rewrite
    rename_otus <translation_file>
    reroot <outgroup_node_name> [tree_name]
    select blocks <block1 block2 ...>
    select OTU <OTU_1 OTU_2 ...> or <-f filename>
    select tree <tree_name>
    select subtree <inode_number> [tree_name] 
    select column <columns> or <-f filename>
    exclude blocks <block1 block2 ...>
    exclude OTU <otu1 otu2 ...>
    exclude subtree <inode> [treename]
    exclude column <columns> or <-f filename>
    makesets byfile <file1> [file2 file3 ...]
    makesets byinode <inode1> [inode2 inode3 ...]
    makesets byclade <OTU1> <OTU2> [OTU3 OTU4 ...]
	Square brackets are required syntax in this command:
    makesets byotus <set1>=[<OTU1 OTU2 ...>] <set2>=[<OTU3 OTU4 ...>] ...
	Square brackets are required in this command:
    makesets byunion <set1>=[<setA + setB ...>] <set2>=[<setA + setC ...>] ...
	Square brackets are required in this command:
    makesets bydifference <set1>=[<setA - setB>] <set2>=[<setA - setC>] ...
	Square brackets around 'state' argument are required in this command:
    makesets bycharstate [title=char_block_title] [set1=]"<sequence_or_intron_position>[<state>]" [set2=...]
    makesets bycladeconsensus [title=char_block_title]
    removesets <set1 set2 ...>
    listsets [set1 set2 ...]
    listsetnames
    renamesets oldname1 newname1 [oldname2 newname2 ...]


=head1 COMMANDS

=head2 rewrite

Writes the contents of <input_file> to <output_file>.  This is used to standardize the 
format of the file.  this is the default program action if <outfile> is not specified, the output file will be temp.nex

=head2 rename_otus <translation_file>

Renames OTUs in taxa, characters and trees blocks according to translation_file. Each line of translation_file contains the old name, separated by whitespace, then the new name.

=head2 reroot <outgroup_node_name> [tree_name]

reroot the tree of tree_name (optional-- the first tree in trees block if not specified) with outgroup_node_name as the new outgroup

=head2 select blocks <block1 block2 ...>

Select block given in list

=head2 select OTU <OTU_1 OTU_2 ...> or <-f filename>

Selects OTUs given in list. Changes taxa block, characters block and trees block.

=head2 select tree <tree_name>

Selects one tree given the tree name. Changes taxa/characters blocks to match.

=head2 select subtree <inode_number> [tree_name] 

Selects subtree given the tree name and the internal node number. Changes taxa/characters blocks to match.

=head2 select column <columns> or <-f filename>

Selects a number of columns from character lists to get a new set of character lists (for a new Bio::NEXUS file). eg, "1-3, 5 8, 10-15" (comma or space can be used to separate numbers)

=head2 exclude blocks <block1 block2 ...>

Remove blocks from file

=head2 exclude OTU <otu1 otu2 ...>

Remove OTUs from file

=head2 exclude subtree <inode> [treename]

Remove subtree rooted with 'inode' in 'treename' or the first tree if 'treename' is not specified

=head2 exclude column <columns> or <-f filename>

Remove a number of columns from character lists to get a new set of character lists (for a new Bio::NEXUS file). eg, "1-3, 5 8, 10-15" (comma or space can be used to separate numbers)

=head2     makesets byfile <file1> [file2 file3 ...]

Add sets based on OTU's listed in simple text files.  Sets take names of files; OTU's should be newline-delimited in the text files.

=head2 makesets byinode <inode1> [inode2 inode3 ...]

Add sets based on ancestral internal nodes and their children

=head2 makesets byclade <OTU1> <OTU2> [OTU3 OTU4 ...]

Add sets by finding the children of the most recent common ancestor (mrca) of OTU1 and OTU2 pairs

=head2 makesets byotus <set1>=[<OTU1 OTU2 ...>] <set2>=[<OTU3 OTU4 ...>] ...

Add sets by specifying setnames and OTU's each set is to contain.  Syntax is setname=[<OTU LIST>]

=head2 makesets byunion <set1>=[<setA + setB ...>] <set2>=[<setA + setC ...>] ...

Add sets by specifying setnames and which existing sets contain the OTUs the new set will comprise.  Syntax is setname=[<SET LIST>]

=head2 makesets bydifference <set1>=[<setA - setB>] <set2>=[<setA - setC>] ...

Add sets by specifying setnames and which existing sets contain the OTUs that will be used in defining the new sets.

=head2 makesets bycharstate [title=char_block_title] [set1=]"<sequence_or_intron_position>[<state>]" [set2=...]

Add sets by specifying setnames and which existing sets contain the OTUs that will be used in defining the new sets.

=head2 makesets bycladeconsensus [title=char_block_title]

Add sets by finding clades that have a consensus sequence at all loci, and creating one set for each group of "otu synonyms"

=head2 removesets [-p] set1 set2 ...

Delete sets from NEXUS file.  Will empty SETS Block but not remove it from file. '-p' switch allows you to specify regular expression patterns ('set\d*' will delete all sets of the form set<number>.)  QUOTATION MARKS AROUND PATTERNS IS STRONGLY RECOMMENDED.

=head2 listsets

Print sets (all sets by default, otherwise those provided by user) to output file, or standard out of STDOUT or - is used as output filename

=head2 listsetnames

Print setnames to output file, or standard out if STDOUT or - is used as output filename

=head2 renamesets oldname1 newname1 [oldname2 newname2 ...]

Rename sets by space-delimited <oldname newname> pairs

=head1 DESCRIPTION

B<This program> provides several services in the manipulation of NEXUS files (selecting specific OTUs or tree nodes, combining files, renaming OTUs, etc.

=head1 VERSION

$Revision: 1.2 $

=head1 REQUIRES

Bio::NEXUS

=head1 AUTHOR

Chengzhi Liang <liangc@umbi.umd.edu>
Peter Yang <pyang@rice.edu>
Tom Hladish <hladish@umbi.umd.edu>

=cut

##################### End ##########################