The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl -w

################################################################################
# Title: #######################################################################
#                                  nexfix.pl                                   #
################################################################################
# Author: ######################################################################
#                                 Tom Hladish                                  #
################################################################################
# Description: #################################################################
#                                                                              #
# This program takes a list of NEXUS files and checks for specific violations  #
# of the NEXUS standard without using the NEXPL API.  Any files that fail this #
# compliancy check are modified accordingly.  All files are then validated by  #
# reading them in as NEXPL objects and writing them back out again.            #
#                                                                              #
# Currently, nexfix.pl checks for the following compliancy failures:           #
#                                                                              #
# > Character labels (charlabels) for introns that are written in              #
#   "codon-phase" notation must be single-quoted, as they contain a NEXUS      #
#   punctuation mark (the hyphen).  Files produced by our lab before 08/30/05  #
#   that contain intron information most likely have this problem in a         #
#   Characters Block and in the History Block, if one exists.                  #
#                                                                              #
# > OTU names that contain NEXUS punctuation (i.e., (){}/\,;:=*'"`+-<>) need   #
#   to be single-quoted.  Some taxa in our files are hyphenated.  nexfix       #
#   currently checks the following commands for OTU names: TAXLABELS, MATRIX,  #
#   TAXSET.  It does not check the ADD command in the Span Block.              #
#                                                                              #
################################################################################
# Usage: #######################################################################
#                                                                              #
# nexfix.pl [-ckP] filename [filenames]                                        #
#                                                                              #
#   -c   : clobber the old files with the new ones (don't rename the old ones) #
#   -k   : keep the old files intact, renaming the new ones (default is to     #
#          rename the old ones, and write the new files using the old name)    #
#   -P   : do not move non-compliant files to a /problematic/ directory--leave #
#          them where they are                                                 #
#   -v   : send error messages to screen instead of error.log file             #
#                                                                              #
################################################################################

my $RCSId = '$Id: nexfix.pl,v 1.11 2007/02/01 04:52:09 vivek Exp $';
my $shortname = join (" v", $RCSId =~ m/(\w+.?\w+,)v (\d+\.\d+)/);

use strict;
use Data::Dumper;
use Bio::NEXUS;
use File::Copy;
use File::Path;
use Getopt::Std;

# read in the command-line options, if any; see Usage above for details
my %flags;
getopts('ckPv', \%flags) or die "ERROR: Unknown options\n";

# get list of NEXUS files to process
my @paths = @ARGV;

my $punctuation_pattern = '[\(\)\{\}\/\\,;:=\*\'"`\+\-<>]';


# loop through them
for my $path (@paths) {

    # verify that $path is a file
    unless (-e $path) {warn "'$path' is not a valid filename: skipping\n"; next };
    
    print "Processing: <$path>\n";
    
    # split the path into directory and filename components
    my ($directory, $filename) = $path =~ /(.*?)([^\/]+)$/;

    unless ($flags{v}) { 
        open(STDERR, ">> ./$directory/error.log") || die "Couldn't open error log in $directory\n";
    }
    
    # get the time stamp, so it can be reported later
    my $time = localtime time;
    
#    print STDERR "$shortname run on $time\n";
    
    # slurp in the entire NEXUS file
    my $nexus_text = do {local(@ARGV, $/) = $path; <>};
    
    # move the old file to filename.old unless the user has specified 
    # overwriting (-c) or keeping the original file intact (-k)
    move($path, "$path.old") unless ($flags{c} || $flags{k});
    
    # use an alternate filename for the new file if the user specifies keeping the original file
    if ($flags{k}) {$path = "$path.new";}
    
    # split the file into its commands (and their arguments)
    my @commands = split(/;/, $nexus_text);
    # loop through them
    my $within_history_block = 0;
    for my $command (@commands) {
        # remove previous comments that this program inserted
        $command =~ s/\[This file was checked by nexfix\.pl, v\d+\.\d+ on \w+ \w+\s+\d{1,2}\s+\d{1,2}:\d{2}:\d{2} \d{4}\]\n//;
        # if it's the beginning of the file, insert a comment documenting this processing
        $command =~ s/(#NEXUS)/$1\n\[This file was checked by $shortname on $time\]/;

    $within_history_block = 0 if ($within_history_block and $command =~/(end|endblock)/i);

        # match command, capture the arguments
        if ($command =~ /^\s*charlabels\s+(.+?)\s*$/si) {
            $command = &charlabels($1);
        } elsif ($command =~ /^\s*taxlabels\s+(.+?)\s*$/si) {
            $command = &taxlabels($1);
        } elsif ($command =~ /^\s*matrix\s+(.+?)\s*$/si) {
        $command = &matrix($1); 
        $command =  &history_matrix($command) if ($within_history_block); 
        $command = "\n\tMATRIX\n" . $command; 
        } elsif ($command =~ /^\s*taxset\s+(.+?)\s+=(.+?)\s*$/si) {
            $command = "TAXSET $1 = " . &taxset($2) . "\n";
    } elsif ($command =~ /^\s*tree\s+(.+?)\s+=(.+?)\s*$/si) {
        $command = "\nTREE $1 = " . &tree($2) . "\n";
    } elsif ($command =~ /\s*Begin\s+History/si) {
        $within_history_block = 1;
    } elsif ($command =~ /\s*format /si and $within_history_block) {
        $command .= " statesformat=frequency" if $command !~ /statesformat/;
        } #!!!!!! TO ADD MORE ERROR-CHECKING FUNCTUNALITY, PUT AN ELSIF HERE !!!!!!

    }
    # join the commands back together
    $nexus_text = join(";", @commands);
    # open a FH, write out the altered text
    open( my $fh, ">$path" ) || die "Can't create $path $!" ;
    print $fh $nexus_text;
    close $fh;
    
    # to make sure that the new file is a well-formed NEXUS file, read it in and
    # write it back out with NEXPL.  This is done using a system call so that
    # die commands within NEXPL do not kill this process.

    if ((my $retval = system("perl -MBio::NEXUS -e 'new Bio::NEXUS(\"$path\")->write(\"$path\")'")) == 0) {
        # system command finished properly
        warn "File: <$path> has been validated and written.\n";
    } elsif ( $retval == 2 ) {
        # if the system call returned a value of 2, it's because the process was 
        # interrupted by a SIGINT (such as Ctrl-C)
        warn "Processing of File: <$path> interrupted; file written but not validated\n";
        print "\n$shortname interrupted by SIGINT\n" unless $flags{v};
        die "$shortname interrupted by SIGINT\n";
    } else {
        # print "retval = $retval\n";
        # system command failed, presumably because NEXPL could not read the 
        # file, and the offending files will be moved to ./problematic/
        if ($flags{P}) {
            print "File: <$path> is not NEXPL-compatible\n" unless $flags{v};
            warn "File: <$path> is not NEXPL-compatible\n";
        } else {
            my $problem_dir = "$directory./problematic/";
            mkpath ($problem_dir);
            move ($path, "$problem_dir/$filename.new") if $path =~ /\.new$/;
            move ("$path.old", "$problem_dir/$filename.old") if -e "$path.old";
            move ("./$directory/$filename", "$problem_dir/$filename");
            print "File: <$path> is not NEXPL-compatible and has been moved with original file to ./problematic/\n" unless $flags{v};
            warn "File: <$path> is not NEXPL-compatible and has been moved with original file to ./problematic/\n";
        }
    }
    
    close STDERR unless $flags{v};
}

sub charlabels {
    my ($charlabels) = @_;
    # split the arguments on spaces
    my @charlabels = @{ &parse_tokens($charlabels) };
    # loop through them
CHARLABEL:  for my $charlabel (@charlabels) {
        if ($charlabel =~ /^'.*?'$/) {next CHARLABEL;}
        # put single quotes around them, if they contain hyphens.  In particular,
        # we are expecting charlabels in the condonHYPHENphase form
        $charlabel = "'$charlabel'" if ($charlabel =~ /-/);
    }
    # join them all back together, delimited by spaces
    return "\n\tCHARLABELS\n\t@charlabels";
}

sub taxlabels {
    my ($taxa) = @_;
    my @taxa = @{ &parse_tokens($taxa) };
TAXON:  for my $taxon (@taxa) {
        if ($taxon =~ /^'.*?'$/) {next TAXON;}
        # put single quotes around them, if they contain NEXUS punctuation.
        $taxon = "'$taxon'" if ($taxon =~ /$punctuation_pattern/);
    }
    return "\n\tTAXLABELS @taxa";
}

sub matrix {
    my ($matrix) = @_;
    my @rows = split /\n\t?/, $matrix;
ROW:    for my $row (@rows) {
        my ($taxon, $seq) = $row =~ /\s*('.+'|\S+)\s*(.*?)\s*$/;
        if ($taxon =~ /^'.*?'$/) { $row .="\n"; next ROW;}
        $taxon = "'$taxon'" if ($taxon =~ /$punctuation_pattern/);
        $row = "$taxon  $seq\n";
    }
    return "@rows";
}

sub history_matrix{
    my ($matrix) = @_;
    my @rows = split /\n\t?/, $matrix;
    my ($state0, $state1);
ROW:    for my $row (@rows) {
    my ($taxon, $seq) = $row =~ /\s*('.+'|\S+)\s*(.*?)\s*$/;
    my @seq;

    if ($seq !~ /\(/) {
        @seq = split //,$seq;
        foreach my $char(@seq) {
            $state0 = ($char eq '1') ? 0 : 1;
            $state1 = ($char eq '1') ? 1 : 0;
            $char = "(1:$state1 0:$state0)"; 
        }
    } else {
        @seq = split /[()]/,$seq;
        foreach my $char(@seq) {
            next if $char =~ /^\s*$/; 
            next if $char =~ /:/; 
            my @states = split/\s+/, $char;
            $state0 = $states[0]; 
            $state1 = $states[1];
            $char = "(1:$state1 0:$state0)"; 
        }
    }
    $seq = join('',@seq);
    $row = "$taxon  $seq\n";
}
return "@rows";
}

sub tree {
    my ($tree) = @_;
    # print "$tree\n"; # for debugging
    my @rows = split /[(,:)]/, $tree;
    ROW:    for my $row (@rows) {
        next if $row eq '';
        #       my ($taxon, $seq) = $row =~ /\s*('.+'|\S+)\s*(.*?)\s*$/;
        if ($row=~ /^'.*?'$/) {next ROW;}

        ## Floating point/scientific notation - Reference : http://www.regular-expressions.info/floatingpoint.html
        if ($row =~ /^[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?/) { 
            #print "xx $row\n"; # Debugging
            next ROW;
        }; 

        $tree =~s/$row/'$row'/g if ($row =~ /$punctuation_pattern/);
        #print "$row\n"; # for debugging 
    }
    return "$tree";
}

sub taxset {
    my ($elements) = @_;
    my @elements = @{ &parse_tokens($elements) };
ELEMENT:    for my $element (@elements) {
        if ($element =~ /^'.*?'$/) {next ELEMENT;}
        # put single quotes around them, if they contain NEXUS punctuation.
        $element = "'$element'" if ($element =~ /$punctuation_pattern/);
    }
    return "@elements";
}

sub parse_tokens {
    my ($string) = @_;
    my @tokens;
    my $token = '';
    # split the string on whitespace
    foreach my $chunk ( split (/\s+/, $string) ) {
        # true if chunk is single-quoted
        if ($chunk =~ /^'.*?'$/) {
            # push it onto the array and move on to the next chunk
            push @tokens, $chunk;
        # true if a single quote is found at the beginning of the chunk
        } elsif ($chunk =~ /^'/) {
            # set the token equal to this first chunk of the token
            $token = $chunk;
        # true if the last chunk of a quoted token has been found
        } elsif ($token && $chunk =~ /'$/) {
            # concatenate it with a space
            $token .= " $chunk";
            push @tokens, $token;
            $token = '';
        # if there aren't any quotes at the beginning or end
        }else {
            # either it is an unquoted string without whitespace
            if ($token eq '') {
                push @tokens, $chunk;
            # or it is a chunk in the middle of a quoted token
            }else {
                $token .= " $chunk";
            }
        }
    }
    # send them back whence they came
    return \@tokens;
}