# $Id: Align.pm,v 1.15 2003/11/18 19:45:45 rkh Exp $
# @@banner@@
=head1 NAME
Bio::Prospect::Align -- Package for overlaying multiple Prospect alignments
S<$Id: Align.pm,v 1.15 2003/11/18 19:45:45 rkh Exp $>
=head1 SYNOPSIS
use Bio::Prospect::Options;
use Bio::Prospect::LocalClient;
use Bio::Prospect::Align;
use Bio::SeqIO;
my $in = new Bio::SeqIO( -format=> 'Fasta', '-file' => $ARGV[0] );
my $po = new Bio::Prospect::Options( seq=>1, svm=>1, global_local=>1,
templates=>[qw(1bgc 1alu 1rcb 1eera)] );
my $pf = new Bio::Prospect::LocalClient( {options=>$po} );
while ( my $s = $in->next_seq() ) {
my @threads = $pf->thread( $s );
my $pa = new Bio::Prospect::Align( -debug=>0,-threads => \@threads );
print $pa->getAlignment(-format=>'html');
}
=head1 DESCRIPTION
B<Bio::Prospect::Align> represents an alignment of one or more
Prospect structural alignments.
=cut
package Bio::Prospect::Align;
use vars qw( $VERSION );
$VERSION = sprintf( "%d.%02d", q$Revision: 1.15 $ =~ /(\d+)\.(\d+)/ );
use strict;
use fields qw( debug threads alignment );
use Carp qw(cluck);
use IO::Scalar;
use Bio::AlignIO;
use Bio::Prospect::Init;
=head1 METHODS
=cut
#-------------------------------------------------------------------------------
# new()
#-------------------------------------------------------------------------------
=head2 new()
Name: new()
Purpose: return Bio::Prospect::Align object
Arguments:
-threads => [ Bio::Prospect::Thread objects ],
Returns: Bio::Prospect::Align object
=cut
sub new {
my $self = fields::new(shift);
# parse the arguments into a parameter hash
my %param = @_;
@param{ map { lc $_ } keys %param } = values %param; # lowercase keys
# -threads is a required argument, return undef
# if not supplied
if ( $param{'-threads'} ) {
$self->{'threads'} = $param{'-threads'};
} else {
return undef;
}
if (not defined $ENV{MVIEW_APP}) {
$ENV{MVIEW_APP} = $Bio::Prospect::Init::MVIEW_APP;
if (not -x $ENV{MVIEW_APP}) {
throw Bio::Prospect::Exception
( "MVIEW_APP not set",
"MVIEW_APP isn't set and $ENV{MVIEW_APP} doesn't exist",
"MVIEW_APP can be set in the Bio::Prospect::Init class or as an environment variable" );
}
}
return $self;
}
#-------------------------------------------------------------------------------
# get_alignment()
#-------------------------------------------------------------------------------
=head2 get_alignment()
Name: get_alignment()
Purpose: return the sequence alignment of the Bio::Prospect::Thread objects
Arguments:
-format => one of: 'clustalw', 'bl2seq','clustalw',
'emboss','fasta','mase','mega','meme','msf','nexus',
'pfam','phylip','prodom','psi','selex','stockholm'
(default: clustalw)
-show_ss => output secondard structure (default: off)
-show_seq => output target sequence (default: on)
Returns: scalar containing the alignment
=cut
sub get_alignment {
my $self = shift;
my $retval = '';
# parse the arguments into a parameter hash
my %param = @_;
@param{ map { lc $_ } keys %param } = values %param; # lowercase keys
# the show_ss and show_seq flags are only valid for html and clustalw output
# (Bio::AlignIO.pm has some issues when I try them)
if ( $param{'-show_ss'} && ( $param{'-format'} !~ m/clustalw|html/i ) ) {
print STDERR "Bio::Prospect::Align.pm WARNING: get_alignment() show_ss flag is only valid for " .
"-format => clustalw | html - ignoring\n";
$param{ '-show_ss' } = 0;
}
if ( !$param{'-show_seq'} && ( $param{'-format'} !~ m/clustalw|html/i ) ) {
print STDERR "Bio::Prospect::Align.pm WARNING: get_alignment() show_seq flag is only valid for " .
"-format => clustalw | html - ignoring\n";
$param{ '-show_seq' } = 1;
}
#
# get the clustalw alignment if we have not already done so.
#
if ( ! defined $self->{'alignment'} ) {
$self->_align( %param );
}
#
# default is clustalw because the alignment is internally stored
# in clustalw format. utilize Bio::SimpleAln object for other
# format tyoes
if ( ! defined $param{'-format'} || $param{'-format'} eq 'clustalw' ) {
$retval = $self->{'alignment'};
} elsif ( $param{'-format'} =~ m/html/i ) {
my @args;
#push(@args,'-css on'); # sigh... won't work to embed in another html
push(@args,'-html head');
push(@args,'-ruler on -width 60');
push(@args,'-coloring consensus -threshold 80 -consensus on -con_coloring any');
$retval = `echo \"$self->{'alignment'}\" | $Bio::Prospect::Init::MVIEW_APP -in clustalw -alncolor '#FFFFFF' @args`;
} elsif ( $param{'-format'} =~
m/bl2seq|clustalw|emboss|fasta|mase|mega|meme|msf|nexus|pfam|phylip|prodom|psi|selex|stockholm/i ) {
my $in_fh = new IO::Scalar;
my $out_fh = new IO::Scalar;
$in_fh->open(\$self->{'alignment'});
$out_fh->open(\$retval);
my $in = Bio::AlignIO->new(-fh => $in_fh , '-format' => 'clustalw');
my $out = Bio::AlignIO->new(-fh => $out_fh, '-format' => $param{'-format'} );
while ( my $aln = $in->next_aln() ) {
$out->write_aln($aln);
}
$in_fh->close();
$out_fh->close();
} else {
die( "Bio::Prospect::Align.pm ERROR: get_alignment() format ($param{'-format'}) " .
"not recognized" );
}
# [rkh] strip the surrounding table tags
$retval =~ s%</PRE>\n<TABLE BORDER=0.+<TR><TD>\n<PRE>%\n%;
$retval =~ s%\n</TD></TR></TABLE>\n%%;
$retval =~ s%^%<!-- BEGIN Bio::Prospect::Align output -->\n%;
$retval =~ s%$%\n<!-- END Bio::Prospect::Align output -->\n%;
$retval =~ s/^ consensus\/[198].+\n//gm;
return( $retval );
}
#-------------------------------------------------------------------------------
# get_threads()
#-------------------------------------------------------------------------------
=head2 get_threads()
Name: get_threads()
Purpose: return the Bio::Prospect::Thread object list associated with this object
Arguments: none
Returns: list of Bio::Prospect::Thread objects
=cut
sub get_threads {
my $self = shift;
return( @{$self->{'threads'}});
}
#-------------------------------------------------------------------------------
# DEPRECATED METHODS - generally replaced by other methods, will be removed in
# subsequent releases.
#-------------------------------------------------------------------------------
sub getAlignment {
my $self = shift;
cluck("getAlignment is deprecated on Oct-22-2003: use get_alignment instead\n");
return( $self->get_alignment(@_));
}
sub getThreads {
my $self = shift;
cluck("getThreads is deprecated on Oct-22-2003: use get_threads instead\n");
return( $self->get_threads );
}
#-------------------------------------------------------------------------------
# INTERNAL METHODS: not intended for use outside this module
#-------------------------------------------------------------------------------
=pod
=head1 INTERNAL METHODS & ROUTINES
The following functions are documented for developers' benefit. THESE
SHOULD NOT BE CALLED OUTSIDE OF THIS MODULE. YOU'VE BEEN WARNED.
=cut
#-------------------------------------------------------------------------------
# _align()
#-------------------------------------------------------------------------------
=head2 _align()
Name: _align()
Purpose: private method that does the alignment work - called by new().
Builds a clustalw alignment internally. use get_alignment() to
retrieve the alignment in other formats.
Arguments:
-show_ss => 0 | 1 output secondard structure (default: off)
-show_seq => 0 | 1 output target sequence (default: on)
Returns: nothing
=cut
sub _align {
my $self = shift;
#
# parse the arguments into a parameter hash
#
my %param = @_;
@param{ map { lc $_ } keys %param } = values %param; # lowercase keys
my @threads = @{$self->{'threads'}};
#
# define defaults
#
$param{ '-show_ss' } = 0 if ! defined $param{ '-show_ss' };
$param{ '-show_seq' } = 1 if ! defined $param{ '-show_seq' };
# alignment algorithm:
# 1. the ungapped query sequence is the universal coordinate system
# 2. gaps inserted into the query sequence as a result of
# an alignment to a template, must be reflected in the
# alignments of the templates
my %query_gap; # store the number of gaps inserted by a given alignment and residue number
my %max_gap; # store the largest gap in all the alignments prior to each residue
my @template;
my @ss;
# store an ungapped alignment
my @ungapped_query = grep ! /-/, split '', $threads[0]->qseq_align();
# iterate through each alignment and count the gaps inserted prior
# to each residue and which template alignment inserted that gap.
my $res_num = 0;
for (my $i=0;$i<=$#threads;$i++) {
# store target sequence
$template[$i] = [ split '', $threads[$i]->tseq_align() ];
# store the secondary structure information (this is based on the
# gapped template sequence)
$ss[$i] = [ split '', $threads[$i]->tss() ];
$res_num = 0;
my @query = split //,$threads[$i]->qseq_align();
for( my $j=0; $j<=$#query; $j++ ) {
if ( $query[$j] eq '-' ) { # found gap
$query_gap{$res_num}{$i}++;
} else { # if no gap, then increment the residue number
$res_num++;
}
}
}
# sanity check on query_res_count using the ungapped query aligment
# build consensus query sequence by applying the maximium number of gaps
# prior to each residue
my $consensus_str = '';
for (my $res_num=0; $res_num<=$#ungapped_query; $res_num++) {
if ( defined $query_gap{$res_num} ) {
foreach my $t ( sort {
$query_gap{$res_num}{$b} <=> $query_gap{$res_num}{$a} }
keys %{$query_gap{$res_num}} ) {
$max_gap{$res_num} = $query_gap{$res_num}{$t};
$consensus_str .= ( '-'x$max_gap{$res_num} );
last;
}
}
$consensus_str .= $ungapped_query[$res_num];
}
# Iterate through each template alignment. Fix the template alignments by
# inserting the difference between the maximium number of gap inserts for the
# corresponding query residue and the gap inserts for this template alignment.
for (my $i=0;$i<=$#threads;$i++) {
my $res_num = 0;
my $gaps_inserted=0;
my @query = split //,$threads[$i]->qseq_align();
for( my $j=0; $j<=$#query; $j++ ) {
if ( $query[$j] ne '-' ) {
my $gap_length = ( defined $query_gap{$res_num}{$i} ) ? $max_gap{$res_num} - $query_gap{$res_num}{$i} : $max_gap{$res_num};
if ( defined $gap_length && $gap_length > 0 ) {
# account for the fact that we have already added some gaps into template and ss
my $ins_pos = $j+$gaps_inserted;
print STDERR "insert $gap_length at the $res_num th residue into the $i sequence at the $ins_pos th position\n" if $ENV{'DEBUG'};
print STDERR "template: " . join('',@{$template[$i]}),"\n" if $ENV{'DEBUG'};
for ( my $k=0; $k<$gap_length; $k++ ) {
splice(@{$ss[$i]}, $ins_pos, 0, '-' );
splice(@{$template[$i]}, $ins_pos, 0, '-' );
$gaps_inserted++;
}
print STDERR "template: " . join('',@{$template[$i]}),"\n" if $ENV{'DEBUG'};
}
$res_num++;
}
}
}
my @consensus = split //,$consensus_str;
print STDERR "consensus: " . join('',@consensus) . "\n" if $ENV{'DEBUG'};
# sanity check
for(my $i=0;$i<=$#template;$i++) {
if ( scalar(@{$template[$i]} != $#consensus+1 )) {
warn("Bio::Prospect::Align.pm ERROR: template length(" .
scalar(@{$template[$i]}) .") != query length (" . ($#consensus+1) . ")\n");
}
}
# build clustalw alignment
my $offset = 60;
my $align = "CLUSTAL W(1.81) multiple sequence alignment\n\n\n";
for (my $start=0; $start<=$#consensus; $start+=($offset)) {
my $end = ( $start + $offset - 1 ) < $#consensus ? $start + $offset - 1: $#consensus;
$align .= sprintf("%-22s %s\n","QUERY",join('',@consensus[$start..$end]));
for (my $i=0; $i<=$#template; $i++) {
$align .= sprintf("%-22s %s\n",$threads[$i]->tname(),join('',@{$template[$i]}[$start..$end]))
if $param{'-show_seq'};
$align .= sprintf("%-22s %s\n",'#ss-' . $threads[$i]->tname(),join('',@{$ss[$i]}[$start..$end]))
if $param{'-show_ss'};
}
$align .= "\n";
}
$self->{'alignment'} = $align;
return;
}
1;
=head1 SEE ALSO
@@banner@@
=cut