#!/usr/bin/perl
# PROGRAM : standaloneblast.pl
# PURPOSE : Demonstrate possible uses of Bio::Tools::StandAloneBlast.pm
# AUTHOR : Peter Schattner schattner@alum.mit.edu
# CREATED : Nov 01 2000
#
# INSTALLATION
#
# You will need to enable Blast to find the Blast program. This can be done
# in (at least) two ways:
# 1. define an environmental variable blastDIR:
# export BLASTDIR=/home/peter/blast or
# 2. include a definition of an environmental variable BLASTDIR in every script that will
# use StandAloneBlast.pm.
# BEGIN {$ENV{BLASTDIR} = '/home/peter/blast/'; }
#
# We also need to select the database to be used
my $amino_database = 'swissprot';
#
# We are going to demonstrate 3 possible applications of StandAloneBlast.pm:
# 1. Test effect of varying choice of substitution matrices
# 2. Test effect of varying choice of gap penalty
# 3. Comparison of results of psiblast depending on whether psiblast itself is used
# to identify an alignment to use for blasting or whether an external alignment is given to
# psiblast
#
use strict;
use Getopt::Long;
use Bio::SimpleAlign;
use Bio::Tools::Run::StandAloneBlast;
use Bio::SearchIO;
use Bio::AlignIO;
use Bio::SeqIO;
use Bio::Root::IO;
# set some default values
my $queryseq = Bio::Root::IO->catfile(qw(t data cysprot1.fa) );
my $executable = 'blastpgp';
my $queryaln = Bio::Root::IO->catfile(qw(t data cysprot.msf) );
my @params = ('database' => $amino_database);
# string listing examples to be executed. Default is to execute
# all tests (ie 1,2 and 3)
my $do_only = '';
my $example1param = 'MATRIX'; # parameter to be varied in example 1
my $example2param = 'GAP'; # parameter to be varied in example 1
my $example1values = [ 'BLOSUM62', 'BLOSUM80', 'PAM70']; # MATRIX values to try
my $example2values = [ 7, 9, 25]; # GAP values to be tried
my $queryalnformat = 'msf';
my $jiter = 2;
# only use pos. specific scoring matrix if > 50% of residues have
# consensus letter (and compare with 25% or 75% cut off)
my $maskvalues = [50, 25, 75] ; my $helpflag = 0; # Flag to show usage info.
# get user options
my @argv = @ARGV; # copy ARGV before GetOptions() massacres it.
my $paramvalstring;
my $maskvalstring;
&GetOptions("h!" => \$helpflag,
"help!" => \$helpflag,
"in=s" => \$queryseq,
"inaln=s" => \$queryaln,
"alnfmt=s" => \$queryalnformat,
"param=s" => \$example1param,
"exec=s" => \$executable,
"paramvals=s" => \$paramvalstring,
"do=i" => \$do_only,
"maskvals=s" => \$maskvalstring,
"iter=i" => \$jiter,
) ;
if ($paramvalstring) { @$example1values = split (":", $paramvalstring); }
if ($maskvalstring) { @$maskvalues = split (":", $maskvalstring); }
if ($helpflag) { &example_usage(); exit 0;}
# create factory & set user-specified global blast parameters
foreach my $argv (@argv) {
next unless ($argv =~ /^(.*)=>(.*)$/);
push (@params, $1 => $2);
}
my $factory = Bio::Tools::Run::StandAloneBlast->new(@params);
# If "do" variable not set, do all four examples
if ( ! $do_only) {
&vary_params($queryseq, $example1param, $example1values); # ex. 1
# To compare gap penalties of 7, 9 and 25 we need to set the
# scoring matrix to BLOSUM62 and extension penalty to 2 (these are
# limitations of BLAST)
$factory->MATRIX('BLOSUM62');
$factory->EXTENSION(2);
&vary_params($queryseq, $example2param, $example2values); # ex. 2
# For the psiblast tests we want to restore gap opening and
# extension values to their defaults
$factory->GAP(11);
$factory->EXTENSION(1);
# now do the mask comparison example and ..
&vary_masks($queryseq, $maskvalues); # ex. 3
# do the jumpstart-align vs multiple iteration examples with the
# mask value set to 50%
&aligned_blast($queryseq, $queryaln, $queryalnformat,
$jiter, $maskvalues->[0]); # ex. 4
} elsif ($do_only == 1) {
&vary_params($queryseq,$example1param, $example1values);
} elsif ($do_only == 3) {
&vary_masks($queryseq, $maskvalues);
} elsif ($do_only == 4 ) {
&aligned_blast($queryseq, $queryaln, $queryalnformat, $jiter, $maskvalues->[0]);
} else {
&example_usage();
}
exit 0;
##########
## End of "main"
#################################################
# compare_runs(): Prints out display of which hits were found by different methods
# Various methods are labeled by "tags" found in array @runtags
#
# args:
# $typetag - label describing type of "tags"
# $runtags - reference to array @runtags
# $hashhits - reference to hash of all the hits found by all runs (%hashhits)
# value for each hit is string which is the concatenation of all the "tags" of
# runs that found that hit
# returns: nothing
sub compare_runs {
my $typetag = shift;
my $runtags = shift;
my $hashhits = shift;
my ($tag, @taghits);
print "Comparing BLAST results... \n";
# Get total number of hits found by any method
my $numhits = keys %$hashhits ; # scalar context to get total number of hits by all methods
print "Total number of hits found: $numhits \n";
# Get total number of hits found by every method
my $alltags = join ( "" , @$runtags );
my @alltaghits = grep $$hashhits{$_} =~ /$alltags/ , keys %$hashhits;
print " Number of hits found by every method / parameter-value: " ,
scalar(@alltaghits), "\n";
# If one desires to see the hits found by all methods, uncomment next 2 lines
#print " Hits were found all methods / parameters: \n";
#print join ( "\n", @alltaghits ) , "\n";
# For each method/parameter-value (labeled by type) display hits found
# exclusively by that method
foreach $tag (@$runtags) {
@taghits = grep $$hashhits{$_} =~ /^$tag$/ , keys %$hashhits;
print " Hits found only when $typetag was $tag: \n";
print join ( "\n", @taghits ) , "\n";
}
return 1;
}
#################################################
# vary_params(): Example demonstrating varying of parameter
#
# args:
# $queryseq - query sequence (can be filename (fasta), or Bio:Seq object)
# $param - name of parameter to be varied
# $values - reference to array of values to be used for the parameter
# returns: nothing
sub vary_params {
my $queryseq = shift;
my $param = shift;
my $values = shift;
print "Beginning $param parameter-varying example... \n";
# Now we'll perform several blasts, 1 for each value of the
# selected parameter. In the first default case, we vary the
# MATRIX substitution parameter, creating 3 BLAST reports, using
# MATRIX values of BLOSUM62, BLOSUM80 or PAM70.
# In the second default case, we vary the GAP penalty parameter,
# creating 3 BLAST reports, using GAP penalties of 7, 9 and 25. In
# either case we then automatically parse the resulting report to
# identify which hits are found with any of the parameter values
# and which with only one of them.
# To test the BLAST results to some other parameter it is only
# necessary to change the parameters passed to the script on the
# commandline. The only tricky part is that the BLAST program
# itself only supports a limited range of parameters. See the
# BLAST documentation.
my ($report, $sbjct, $paramvalue);
my $hashhits = { }; # key is hit id, value is string of param values for which hit was found
foreach $paramvalue (@$values) {
$factory->$param($paramvalue); # set parameter value
print "Performing BLAST with $param = $paramvalue \n";
$report = $factory->$executable($queryseq);
my $r = $report->next_result;
while( my $hit = $r->next_hit ) {
$hashhits->{$hit->name} .= "$paramvalue";
}
}
&compare_runs( $param , $values , $hashhits);
return 1;
}
#################################################
# vary_masks(): Example demonstrating varying of parameter
#
# args:
# $queryseq - query sequence (can be filename (fasta), or Bio:Seq object)
# $maskvalues - reference to array of values to be used for the mask threshold
# returns: nothing
# Now we'll perform several blasts, 1 for each value of the mask threshold.
# In the default case, we use thresholds of 25%, 50% and 75%. (Recall the threshold is
# % of resudues which must match the consensus residue before deciding to use the
# position specific scoring matrix rather than the default - BLOSUM or PAM - matrix)
# We then automatically parse the resulting reports to identify which hits
# are found with any of the mask threshold values and which with only one of them.
#
sub vary_masks {
my $queryseq = shift;
my $values = shift;
print "Beginning mask-varying example... \n";
my ($report, $sbjct, $maskvalue);
my $hashhits = { }; # key is hit id, value is string of param values for which hit was found
# Get the alignment file
my $str = Bio::AlignIO->new(-file=> "$queryaln", '-format' => "$queryalnformat", );
my $aln = $str->next_aln();
foreach $maskvalue (@$values) {
print "Performing BLAST with mask threshold = $maskvalue % \n";
# Create the proper mask for 'jumpstarting'
my $mask = &create_mask($aln, $maskvalue);
my $report2 = $factory->blastpgp($queryseq, $aln, $mask);
my $r = $report2->next_result;
while($sbjct = $r->next_hit) {
$hashhits->{$sbjct->name} .= "$maskvalue";
}
}
&compare_runs( 'mask threshold' , $values , $hashhits);
return 1;
}
#################################################
# aligned_blast ():
#
#
# args:
# $queryseq - query sequence (can be filename (fasta), or Bio:Seq object)
# $queryaln - file containing alignment to be used to "jumpstart" psiblast in "-B mode"
# $queryaln *must contain $queryseq with the same name and length
# (psiblast is very picky)
# $queryalnformat - format of alignment (can = "fasta", "msf", etc)
# $jiter - number of iterations in psiblast run
# $maskvalue - threshold indicating how similar residues must be at a sequence location
# before position-specific-scoring matrix is used
# : "0" => use position specific matrix at all residues, or
# "100" => use default (eg BLOSUM) at all residues
# returns: nothing
# For this example, we'll compare the results of psiblast depending on whether psiblast itself is
# used to identify an alignment to use for blasting or whether an external alignment is given to
# psiblast
sub aligned_blast {
my $queryseq = shift;
my $queryaln = shift;
my $queryalnformat = shift;
my $jiter = shift;
my $maskvalue = shift;
my $hashhits = { };
my ($sbjct, $id);
print "\nBeginning aligned blast example... \n";
# First we do a single-iteration psiblast search but with a specified alignment to
# "jump start" psiblast
print "\nBeginning jump-start psiblast ... \n";
my $tag1 = 'jumpstart';
# $factory->j('1'); # perform single iteration
# Get the alignment file
my $str = Bio::AlignIO->new(-file=> "$queryaln", '-format' => "$queryalnformat", );
my $aln = $str->next_aln();
# Create the proper mask for 'jumpstarting'
my $mask = &create_mask($aln, $maskvalue);
my $report2 = $factory->blastpgp($queryseq, $aln, $mask);
while($sbjct = $report2->next_result) {
$hashhits->{$sbjct->name} .= "$tag1";
}
# Then we do a "plain" psiblast multiple-iteration search
print "\nBeginning multiple-iteration psiblast ... \n";
my $undefineB ;
$factory->B($undefineB);
my $tag2 = 'iterated';
$factory->j($jiter); # 'j' is blast parameter for # of iterations
my $report1 = $factory->blastpgp($queryseq);
my $total_iterations = $report1->number_of_iterations;
my $last_iteration = $report1->round($total_iterations);
while($sbjct = $last_iteration->next_result) {
$hashhits->{$sbjct->name} .= "$tag2";
}
# Now we compare the results of the searches
my $tagtype = 'iterated_or_jumpstart';
my $values = [ $tag1, $tag2];
&compare_runs( $tagtype , $values , $hashhits);
return 1;
}
#################################################
# create_mask(): creates a mask for the psiblast jumpstart alignment
# that determines at what residues position-specific
# scoring matrices (PSSMs) are used and at what
# residues default scoring matrices (eg BLOSUM) are
# used. See psiblast documentation for more details,
# args:
# $aln - SimpleAlign object with alignment
# $maskvalue - label describing type of "tags"
# returns: actual mask, ie a string of 0's and 1's which is the
# same length as each sequence in the alignment and has
# a "1" at locations where (PSSMs) are to be used
# and a "0" at all other locations.
sub create_mask {
my $aln = shift;
my $maskvalue = shift;
my $mask = "";
die "psiblast jumpstart requires all sequences to be same length \n"
unless $aln->is_flush();
my $len = $aln->length();
if ($maskvalue =~ /^(\d){1,3}$/ ) {
$mask = $aln->consensus_string($maskvalue) ;
$mask =~ s/[^\?]/1/g ;
$mask =~ s/\?/0/g ;
}
else { die "maskvalue must be an integer between 0 and 100 \n"; }
return $mask ;
}
#----------------
sub example_usage {
#----------------
#-----------------------
# Prints usage information for general parameters.
print STDERR <<"QQ_PARAMS_QQ";
Command-line accessible script variables and commands:
-------------------------------
-h : Display this usage info and exit.
-in <str> : File containing input sequences in fasta format (default = $queryseq) .
-inaln <str> : File containing input alignment for example 3 (default = $queryaln) .
-alnfmt <str> : Format of input alignment for example 3, eg "msf", "fasta", "pfam".
(default = $queryalnformat) .
-do <int> : Number of test to be executed ("1" => vary parameters,
"3" => compare iterated & jumpstart psiblast.) If omitted,
three default tests performed.
-exec <str> : Blast executable to be used in example 1. Can be "blastall" or
"blastpgp" (default is "blastpgp").
-param <str> : Parameter to be varied in example 1. Any blast parameter
can be varied (default = 'MATRIX')
-paramvals <str>: String containing parameter values in example 1, separated
by ":"'s. (default = 'BLOSUM62:BLOSUM80:PAM70')
-iter <int> : Maximum number of iterations in psiblast in example 3 (default = 2)
-maskvals <str>: String containing mask threshold values (in per-cents) for example 3,
separated by ":"'s. (default = '50:75:25')
In addition, any valid Blast parameter can be set using the syntax "parameter=>value" as in "database=>swissprot"
So some typical command lines might be:
>standaloneblast.pl -do 1 -param expectation -paramvals '1e-10:1e-5'
or
>standaloneblast.pl -do 1 -exec blastall -param q -paramvals '-1:-7' -in='t/dna1.fa' "pr=>blastn" "d=>ecoli.nt"
or
>standaloneblast.pl -do 4 -maskvals 0 -iter 3
or
>standaloneblast.pl -do 3 -maskvals '10:50:90' -in 't/data/cysprot1.fa' -alnfmt msf -inaln 't/cysprot.msf'
QQ_PARAMS_QQ
}