#!/usr/bin/perl -w
################################################################################
# Title: #######################################################################
# nexfix.pl #
################################################################################
# Author: ######################################################################
# Tom Hladish #
################################################################################
# Description: #################################################################
# #
# This program takes a list of NEXUS files and checks for specific violations #
# of the NEXUS standard without using the NEXPL API. Any files that fail this #
# compliancy check are modified accordingly. All files are then validated by #
# reading them in as NEXPL objects and writing them back out again. #
# #
# Currently, nexfix.pl checks for the following compliancy failures: #
# #
# > Character labels (charlabels) for introns that are written in #
# "codon-phase" notation must be single-quoted, as they contain a NEXUS #
# punctuation mark (the hyphen). Files produced by our lab before 08/30/05 #
# that contain intron information most likely have this problem in a #
# Characters Block and in the History Block, if one exists. #
# #
# > OTU names that contain NEXUS punctuation (i.e., (){}/\,;:=*'"`+-<>) need #
# to be single-quoted. Some taxa in our files are hyphenated. nexfix #
# currently checks the following commands for OTU names: TAXLABELS, MATRIX, #
# TAXSET. It does not check the ADD command in the Span Block. #
# #
################################################################################
# Usage: #######################################################################
# #
# nexfix.pl [-ckP] filename [filenames] #
# #
# -c : clobber the old files with the new ones (don't rename the old ones) #
# -k : keep the old files intact, renaming the new ones (default is to #
# rename the old ones, and write the new files using the old name) #
# -P : do not move non-compliant files to a /problematic/ directory--leave #
# them where they are #
# -v : send error messages to screen instead of error.log file #
# #
################################################################################
my $RCSId = '$Id: nexfix.pl,v 1.11 2007/02/01 04:52:09 vivek Exp $';
my $shortname = join (" v", $RCSId =~ m/(\w+.?\w+,)v (\d+\.\d+)/);
use strict;
use Data::Dumper;
use Bio::NEXUS;
use File::Copy;
use File::Path;
use Getopt::Std;
# read in the command-line options, if any; see Usage above for details
my %flags;
getopts('ckPv', \%flags) or die "ERROR: Unknown options\n";
# get list of NEXUS files to process
my @paths = @ARGV;
my $punctuation_pattern = '[\(\)\{\}\/\\,;:=\*\'"`\+\-<>]';
# loop through them
for my $path (@paths) {
# verify that $path is a file
unless (-e $path) {warn "'$path' is not a valid filename: skipping\n"; next };
print "Processing: <$path>\n";
# split the path into directory and filename components
my ($directory, $filename) = $path =~ /(.*?)([^\/]+)$/;
unless ($flags{v}) {
open(STDERR, ">> ./$directory/error.log") || die "Couldn't open error log in $directory\n";
}
# get the time stamp, so it can be reported later
my $time = localtime time;
# print STDERR "$shortname run on $time\n";
# slurp in the entire NEXUS file
my $nexus_text = do {local(@ARGV, $/) = $path; <>};
# move the old file to filename.old unless the user has specified
# overwriting (-c) or keeping the original file intact (-k)
move($path, "$path.old") unless ($flags{c} || $flags{k});
# use an alternate filename for the new file if the user specifies keeping the original file
if ($flags{k}) {$path = "$path.new";}
# split the file into its commands (and their arguments)
my @commands = split(/;/, $nexus_text);
# loop through them
my $within_history_block = 0;
for my $command (@commands) {
# remove previous comments that this program inserted
$command =~ s/\[This file was checked by nexfix\.pl, v\d+\.\d+ on \w+ \w+\s+\d{1,2}\s+\d{1,2}:\d{2}:\d{2} \d{4}\]\n//;
# if it's the beginning of the file, insert a comment documenting this processing
$command =~ s/(#NEXUS)/$1\n\[This file was checked by $shortname on $time\]/;
$within_history_block = 0 if ($within_history_block and $command =~/(end|endblock)/i);
# match command, capture the arguments
if ($command =~ /^\s*charlabels\s+(.+?)\s*$/si) {
$command = &charlabels($1);
} elsif ($command =~ /^\s*taxlabels\s+(.+?)\s*$/si) {
$command = &taxlabels($1);
} elsif ($command =~ /^\s*matrix\s+(.+?)\s*$/si) {
$command = &matrix($1);
$command = &history_matrix($command) if ($within_history_block);
$command = "\n\tMATRIX\n" . $command;
} elsif ($command =~ /^\s*taxset\s+(.+?)\s+=(.+?)\s*$/si) {
$command = "TAXSET $1 = " . &taxset($2) . "\n";
} elsif ($command =~ /^\s*tree\s+(.+?)\s+=(.+?)\s*$/si) {
$command = "\nTREE $1 = " . &tree($2) . "\n";
} elsif ($command =~ /\s*Begin\s+History/si) {
$within_history_block = 1;
} elsif ($command =~ /\s*format /si and $within_history_block) {
$command .= " statesformat=frequency" if $command !~ /statesformat/;
} #!!!!!! TO ADD MORE ERROR-CHECKING FUNCTUNALITY, PUT AN ELSIF HERE !!!!!!
}
# join the commands back together
$nexus_text = join(";", @commands);
# open a FH, write out the altered text
open( my $fh, ">$path" ) || die "Can't create $path $!" ;
print $fh $nexus_text;
close $fh;
# to make sure that the new file is a well-formed NEXUS file, read it in and
# write it back out with NEXPL. This is done using a system call so that
# die commands within NEXPL do not kill this process.
if ((my $retval = system("perl -MBio::NEXUS -e 'new Bio::NEXUS(\"$path\")->write(\"$path\")'")) == 0) {
# system command finished properly
warn "File: <$path> has been validated and written.\n";
} elsif ( $retval == 2 ) {
# if the system call returned a value of 2, it's because the process was
# interrupted by a SIGINT (such as Ctrl-C)
warn "Processing of File: <$path> interrupted; file written but not validated\n";
print "\n$shortname interrupted by SIGINT\n" unless $flags{v};
die "$shortname interrupted by SIGINT\n";
} else {
# print "retval = $retval\n";
# system command failed, presumably because NEXPL could not read the
# file, and the offending files will be moved to ./problematic/
if ($flags{P}) {
print "File: <$path> is not NEXPL-compatible\n" unless $flags{v};
warn "File: <$path> is not NEXPL-compatible\n";
} else {
my $problem_dir = "$directory./problematic/";
mkpath ($problem_dir);
move ($path, "$problem_dir/$filename.new") if $path =~ /\.new$/;
move ("$path.old", "$problem_dir/$filename.old") if -e "$path.old";
move ("./$directory/$filename", "$problem_dir/$filename");
print "File: <$path> is not NEXPL-compatible and has been moved with original file to ./problematic/\n" unless $flags{v};
warn "File: <$path> is not NEXPL-compatible and has been moved with original file to ./problematic/\n";
}
}
close STDERR unless $flags{v};
}
sub charlabels {
my ($charlabels) = @_;
# split the arguments on spaces
my @charlabels = @{ &parse_tokens($charlabels) };
# loop through them
CHARLABEL: for my $charlabel (@charlabels) {
if ($charlabel =~ /^'.*?'$/) {next CHARLABEL;}
# put single quotes around them, if they contain hyphens. In particular,
# we are expecting charlabels in the condonHYPHENphase form
$charlabel = "'$charlabel'" if ($charlabel =~ /-/);
}
# join them all back together, delimited by spaces
return "\n\tCHARLABELS\n\t@charlabels";
}
sub taxlabels {
my ($taxa) = @_;
my @taxa = @{ &parse_tokens($taxa) };
TAXON: for my $taxon (@taxa) {
if ($taxon =~ /^'.*?'$/) {next TAXON;}
# put single quotes around them, if they contain NEXUS punctuation.
$taxon = "'$taxon'" if ($taxon =~ /$punctuation_pattern/);
}
return "\n\tTAXLABELS @taxa";
}
sub matrix {
my ($matrix) = @_;
my @rows = split /\n\t?/, $matrix;
ROW: for my $row (@rows) {
my ($taxon, $seq) = $row =~ /\s*('.+'|\S+)\s*(.*?)\s*$/;
if ($taxon =~ /^'.*?'$/) { $row .="\n"; next ROW;}
$taxon = "'$taxon'" if ($taxon =~ /$punctuation_pattern/);
$row = "$taxon $seq\n";
}
return "@rows";
}
sub history_matrix{
my ($matrix) = @_;
my @rows = split /\n\t?/, $matrix;
my ($state0, $state1);
ROW: for my $row (@rows) {
my ($taxon, $seq) = $row =~ /\s*('.+'|\S+)\s*(.*?)\s*$/;
my @seq;
if ($seq !~ /\(/) {
@seq = split //,$seq;
foreach my $char(@seq) {
$state0 = ($char eq '1') ? 0 : 1;
$state1 = ($char eq '1') ? 1 : 0;
$char = "(1:$state1 0:$state0)";
}
} else {
@seq = split /[()]/,$seq;
foreach my $char(@seq) {
next if $char =~ /^\s*$/;
next if $char =~ /:/;
my @states = split/\s+/, $char;
$state0 = $states[0];
$state1 = $states[1];
$char = "(1:$state1 0:$state0)";
}
}
$seq = join('',@seq);
$row = "$taxon $seq\n";
}
return "@rows";
}
sub tree {
my ($tree) = @_;
# print "$tree\n"; # for debugging
my @rows = split /[(,:)]/, $tree;
ROW: for my $row (@rows) {
next if $row eq '';
# my ($taxon, $seq) = $row =~ /\s*('.+'|\S+)\s*(.*?)\s*$/;
if ($row=~ /^'.*?'$/) {next ROW;}
## Floating point/scientific notation - Reference : http://www.regular-expressions.info/floatingpoint.html
if ($row =~ /^[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?/) {
#print "xx $row\n"; # Debugging
next ROW;
};
$tree =~s/$row/'$row'/g if ($row =~ /$punctuation_pattern/);
#print "$row\n"; # for debugging
}
return "$tree";
}
sub taxset {
my ($elements) = @_;
my @elements = @{ &parse_tokens($elements) };
ELEMENT: for my $element (@elements) {
if ($element =~ /^'.*?'$/) {next ELEMENT;}
# put single quotes around them, if they contain NEXUS punctuation.
$element = "'$element'" if ($element =~ /$punctuation_pattern/);
}
return "@elements";
}
sub parse_tokens {
my ($string) = @_;
my @tokens;
my $token = '';
# split the string on whitespace
foreach my $chunk ( split (/\s+/, $string) ) {
# true if chunk is single-quoted
if ($chunk =~ /^'.*?'$/) {
# push it onto the array and move on to the next chunk
push @tokens, $chunk;
# true if a single quote is found at the beginning of the chunk
} elsif ($chunk =~ /^'/) {
# set the token equal to this first chunk of the token
$token = $chunk;
# true if the last chunk of a quoted token has been found
} elsif ($token && $chunk =~ /'$/) {
# concatenate it with a space
$token .= " $chunk";
push @tokens, $token;
$token = '';
# if there aren't any quotes at the beginning or end
}else {
# either it is an unquoted string without whitespace
if ($token eq '') {
push @tokens, $chunk;
# or it is a chunk in the middle of a quoted token
}else {
$token .= " $chunk";
}
}
}
# send them back whence they came
return \@tokens;
}