The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl

use strict;
use vars qw($USAGE %VALIDALIGN $CODONSIZE);
use Bio::SeqIO;
use Bio::AlignIO;
use Bio::LocatableSeq;
use Bio::SimpleAlign;
use Getopt::Long;
use Bio::Tools::CodonTable;
use Carp;

BEGIN {
    $CODONSIZE = 3; # parametrize everything like a good little programmer
    if( ! defined $ENV{'CLUSTALDIR'} ) { 
	$ENV{'CLUSTALDIR'} = '/usr/local/bin';
    } 
    if( ! defined $ENV{'TCOFFEEDIR'} ) { 
	$ENV{'TCOFFEEDIR'} = '/usr/local/bin';
    }
    $USAGE = 
qq{align_on_codons.pl < file.fa
-h/--help                 See this information
-f/--frame            Translation Frame (0,1,2) are valid (defaults to '0')
-ct/--codontable      Codon table to use (defaults to '1')
                        see perldoc Bio::PrimarySeq for more information
-i/--input            Input Filename (defaults to STDIN)
-o/--output           Output Filename (defaults to STDOUT)
-sf/--seqformat       Input format (defaults to FASTA/Pearson)
-af/--alignformat     Alignment output format (clustal,fasta,nexus,phylip,
		      msf,pfam,mase,meme,prodom,selex,stockholm)
-ap/--alignprog       ClustalW, TCoffee (currently only support 
					 local execution) 
-v/--verbose          Run in verbose mode
};

    %VALIDALIGN = ('clustalw' => 'Bio::Tools::Run::Alignment::Clustalw',
		   'tcoffee' => 'Bio::Tools::Run::Alignment::TCoffee');
}

my ($help, $input, $output);

my ($alignprog, $sformat, $aformat, $frame, $codontable, $verbose) 
  = ('clustalw', 'fasta', 'clustalw', 0, 1, 0);

GetOptions( 'h|help'            => \$help,
				'i|input:s'         => \$input,
				'o|output:s'        => \$output,
				'sf|seqformat:s'    => \$sformat,
				'af|alignformat:s'  => \$aformat,
				'ap|alignprog:s'    => \$alignprog,
				# for translate
				'f|frame:s'         => \$frame,
				'ct|codontable:s'   => \$codontable,
				'v|verbose'         => \$verbose,
			 );

if( $help ) { 
    die($USAGE);
}
if( ! $alignprog || !defined $VALIDALIGN{$alignprog} ) {
    die("Cannot use $alignprog as 'alignprog' parameter");  
} else {
    my $modname = $VALIDALIGN{$alignprog} .".pm";
    $modname =~ s/::/\//g;
    require $modname;
}

my $alignout;
if( $output ) {
    $alignout = new Bio::AlignIO('-format' => $aformat,
				 '-file'   => ">$output");
} else { 
    $alignout = new Bio::AlignIO('-format' => $aformat);
}

my (@nucseqs,@protseqs);
my $seqio;

if( $input ) {
    $seqio = new Bio::SeqIO('-format' => $sformat,
			    '-file'   => $input);
} else { 
    $seqio = new Bio::SeqIO('-format' => $sformat,
			    '-fh'     => \*STDIN);
}

my $table = new Bio::Tools::CodonTable();
while( my $seq = $seqio->next_seq ) {
    
	#    if( $frame == 0 && $alignprog eq 'tcoffee' ) {
	#	print "last codon is ",$seq->subseq($seq->length() -2,
	#					    $seq->length()), "\n";
	#	if( $table->is_ter_codon($seq->subseq($seq->length() -2,
	#					      $seq->length())) ) {
	#	    $seq->seq($seq->subseq(1,$seq->length() - 3));
	#	}
	#    }

	push @nucseqs, $seq;    
	push @protseqs, $seq->translate(-frame => $frame,
											   -codontable_id => $codontable );
}

if( @nucseqs <= 1 ) {
	die("Must specify > 1 sequence for alignment on codons");
}

# allow these to be tweaked by cmdline parameters at some point
my @params = ('ktuple' => 2, 'matrix' => 'BLOSUM'); 

my $alignengine = $VALIDALIGN{$alignprog}->new('-verbose' => $verbose,
					       @params);

my $aln = $alignengine->align(\@protseqs);

my $dnaalign = new Bio::SimpleAlign;
my $seqorder = 0;
my $alnlen = $aln->length;
foreach my $seq ( $aln->each_seq ) {    
	my $newseq;
    
	foreach my $pos ( 1..$alnlen ) {
		my $loc = $seq->location_from_column($pos);
		my $dna = ''; 
		if( !defined $loc || $loc->location_type ne 'EXACT' ) {
			$dna = '---';
		} else {
			# to readjust to codon boundaries
			# end needs to be +1 so we can just multiply by CODONSIZE 
			# to get this
			my ($start,$end) = ((($loc->start - 1)*$CODONSIZE) +1,
									  ($loc->end)*$CODONSIZE);
			if( $start <=0 || $end > $nucseqs[$seqorder]->length() ) {
				print "start is ", $loc->start, " end is ", $loc->end, "\n";
				warn("codons don't seem to be matching up for $start,$end");
				$dna = '---';
			} else {
				$dna = $nucseqs[$seqorder]->subseq($start,$end);
			}
		}
		$newseq .= $dna;
	}
	$seqorder++;
	# funky looking math is to readjust to codon boundaries and deal
	# with fact that sequence start with 1
	my $newdna = new Bio::LocatableSeq(-display_id  => $seq->id(),
												  -start => (($seq->start - 1) * 
																 $CODONSIZE) + 1, 
												  -end   => ($seq->end * $CODONSIZE),
												  -strand => $seq->strand,
												  -seq   => $newseq);    
	$dnaalign->add_seq($newdna);
}

$alignout->write_aln($dnaalign);