#!/usr/bin/perl
use strict;
use vars qw($USAGE %VALIDALIGN $CODONSIZE);
use Bio::SeqIO;
use Bio::AlignIO;
use Bio::LocatableSeq;
use Bio::SimpleAlign;
use Getopt::Long;
use Bio::Tools::CodonTable;
use Carp;
BEGIN {
$CODONSIZE = 3; # parametrize everything like a good little programmer
if( ! defined $ENV{'CLUSTALDIR'} ) {
$ENV{'CLUSTALDIR'} = '/usr/local/bin';
}
if( ! defined $ENV{'TCOFFEEDIR'} ) {
$ENV{'TCOFFEEDIR'} = '/usr/local/bin';
}
$USAGE =
qq{align_on_codons.pl < file.fa
-h/--help See this information
-f/--frame Translation Frame (0,1,2) are valid (defaults to '0')
-ct/--codontable Codon table to use (defaults to '1')
see perldoc Bio::PrimarySeq for more information
-i/--input Input Filename (defaults to STDIN)
-o/--output Output Filename (defaults to STDOUT)
-sf/--seqformat Input format (defaults to FASTA/Pearson)
-af/--alignformat Alignment output format (clustal,fasta,nexus,phylip,
msf,pfam,mase,meme,prodom,selex,stockholm)
-ap/--alignprog ClustalW, TCoffee (currently only support
local execution)
-v/--verbose Run in verbose mode
};
%VALIDALIGN = ('clustalw' => 'Bio::Tools::Run::Alignment::Clustalw',
'tcoffee' => 'Bio::Tools::Run::Alignment::TCoffee');
}
my ($help, $input, $output);
my ($alignprog, $sformat, $aformat, $frame, $codontable, $verbose)
= ('clustalw', 'fasta', 'clustalw', 0, 1, 0);
GetOptions( 'h|help' => \$help,
'i|input:s' => \$input,
'o|output:s' => \$output,
'sf|seqformat:s' => \$sformat,
'af|alignformat:s' => \$aformat,
'ap|alignprog:s' => \$alignprog,
# for translate
'f|frame:s' => \$frame,
'ct|codontable:s' => \$codontable,
'v|verbose' => \$verbose,
);
if( $help ) {
die($USAGE);
}
if( ! $alignprog || !defined $VALIDALIGN{$alignprog} ) {
die("Cannot use $alignprog as 'alignprog' parameter");
} else {
my $modname = $VALIDALIGN{$alignprog} .".pm";
$modname =~ s/::/\//g;
require $modname;
}
my $alignout;
if( $output ) {
$alignout = new Bio::AlignIO('-format' => $aformat,
'-file' => ">$output");
} else {
$alignout = new Bio::AlignIO('-format' => $aformat);
}
my (@nucseqs,@protseqs);
my $seqio;
if( $input ) {
$seqio = new Bio::SeqIO('-format' => $sformat,
'-file' => $input);
} else {
$seqio = new Bio::SeqIO('-format' => $sformat,
'-fh' => \*STDIN);
}
my $table = new Bio::Tools::CodonTable();
while( my $seq = $seqio->next_seq ) {
# if( $frame == 0 && $alignprog eq 'tcoffee' ) {
# print "last codon is ",$seq->subseq($seq->length() -2,
# $seq->length()), "\n";
# if( $table->is_ter_codon($seq->subseq($seq->length() -2,
# $seq->length())) ) {
# $seq->seq($seq->subseq(1,$seq->length() - 3));
# }
# }
push @nucseqs, $seq;
push @protseqs, $seq->translate(-frame => $frame,
-codontable_id => $codontable );
}
if( @nucseqs <= 1 ) {
die("Must specify > 1 sequence for alignment on codons");
}
# allow these to be tweaked by cmdline parameters at some point
my @params = ('ktuple' => 2, 'matrix' => 'BLOSUM');
my $alignengine = $VALIDALIGN{$alignprog}->new('-verbose' => $verbose,
@params);
my $aln = $alignengine->align(\@protseqs);
my $dnaalign = new Bio::SimpleAlign;
my $seqorder = 0;
my $alnlen = $aln->length;
foreach my $seq ( $aln->each_seq ) {
my $newseq;
foreach my $pos ( 1..$alnlen ) {
my $loc = $seq->location_from_column($pos);
my $dna = '';
if( !defined $loc || $loc->location_type ne 'EXACT' ) {
$dna = '---';
} else {
# to readjust to codon boundaries
# end needs to be +1 so we can just multiply by CODONSIZE
# to get this
my ($start,$end) = ((($loc->start - 1)*$CODONSIZE) +1,
($loc->end)*$CODONSIZE);
if( $start <=0 || $end > $nucseqs[$seqorder]->length() ) {
print "start is ", $loc->start, " end is ", $loc->end, "\n";
warn("codons don't seem to be matching up for $start,$end");
$dna = '---';
} else {
$dna = $nucseqs[$seqorder]->subseq($start,$end);
}
}
$newseq .= $dna;
}
$seqorder++;
# funky looking math is to readjust to codon boundaries and deal
# with fact that sequence start with 1
my $newdna = new Bio::LocatableSeq(-display_id => $seq->id(),
-start => (($seq->start - 1) *
$CODONSIZE) + 1,
-end => ($seq->end * $CODONSIZE),
-strand => $seq->strand,
-seq => $newseq);
$dnaalign->add_seq($newdna);
}
$alignout->write_aln($dnaalign);