The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl

# PROGRAM  : standaloneblast.pl
# PURPOSE  : Demonstrate possible uses of Bio::Tools::StandAloneBlast.pm
# AUTHOR   : Peter Schattner schattner@alum.mit.edu
# CREATED  : Nov 01 2000
#
# INSTALLATION
#

# You will need to enable Blast to find the Blast program. This can be done
# in (at least) two ways:
#  1. define an environmental variable blastDIR:
#	export BLASTDIR=/home/peter/blast   or
#  2. include a definition of an environmental variable BLASTDIR in every script that will
#     use StandAloneBlast.pm.
#	BEGIN {$ENV{BLASTDIR} = '/home/peter/blast/'; }
#
#  We also need to select the database to be used
my $amino_database = 'swissprot';


# 
#  We are going to demonstrate 3 possible applications of StandAloneBlast.pm:
#	1. Test effect of varying choice of substitution matrices	
#	2. Test effect of varying choice of gap penalty 
#	3. Comparison of results of psiblast depending on whether psiblast itself is used
#	to identify an alignment to use for blasting or whether an external alignment is given to 
#	psiblast
#
use strict;
use Getopt::Long;
use Bio::SimpleAlign;
use Bio::Tools::Run::StandAloneBlast;
use Bio::SearchIO;
use Bio::AlignIO;
use Bio::SeqIO;
use Bio::Root::IO;

# set some default values
my $queryseq = Bio::Root::IO->catfile(qw(t data cysprot1.fa) );
my $executable = 'blastpgp';
my $queryaln = Bio::Root::IO->catfile(qw(t data cysprot.msf) );
my @params = ('database' => $amino_database);
# string listing examples to be executed. Default is to execute
# all tests (ie 1,2 and 3)
my $do_only = ''; 	
my $example1param = 'MATRIX';  # parameter to be varied in example 1
my $example2param = 'GAP';  # parameter to be varied in example 1
my $example1values = [ 'BLOSUM62', 'BLOSUM80', 'PAM70']; # MATRIX values to try
my $example2values = [ 7, 9, 25]; # GAP values to be tried
my $queryalnformat = 'msf';
my $jiter = 2;
# only use pos. specific scoring matrix if > 50% of residues have
# consensus letter (and compare with 25% or 75% cut off)
my  $maskvalues = [50, 25, 75] ; my $helpflag = 0;   # Flag to show usage info.

# get user options
my @argv       = @ARGV;  # copy ARGV before GetOptions() massacres it.
my $paramvalstring;
my $maskvalstring;

&GetOptions("h!"          => \$helpflag, 
				"help!"       => \$helpflag,
				"in=s"        => \$queryseq,
				"inaln=s"     => \$queryaln,
				"alnfmt=s"    => \$queryalnformat,
				"param=s"     => \$example1param,
				"exec=s"      => \$executable,
				"paramvals=s" => \$paramvalstring,
				"do=i"        =>  \$do_only,
				"maskvals=s"  => \$maskvalstring,
				"iter=i"      =>  \$jiter,
	) ;

if ($paramvalstring) { @$example1values = split (":", $paramvalstring); }
if ($maskvalstring)  { @$maskvalues     = split (":", $maskvalstring);  }

if ($helpflag) { &example_usage(); exit 0;}

# create factory & set user-specified global blast parameters
foreach my $argv (@argv) {
	next unless ($argv =~ /^(.*)=>(.*)$/);
	push (@params, $1 => $2);
}
my  $factory = Bio::Tools::Run::StandAloneBlast->new(@params);
	
# If "do" variable not set, do all four examples
if ( ! $do_only)  {
    &vary_params($queryseq, $example1param, $example1values); # ex. 1

    # To compare gap penalties of 7, 9 and 25 we need to set the
    # scoring matrix to BLOSUM62 and extension penalty to 2 (these are
    # limitations of BLAST)

    $factory->MATRIX('BLOSUM62');  

    $factory->EXTENSION(2);  
    &vary_params($queryseq, $example2param, $example2values); # ex. 2
 
    # For the psiblast tests we want to restore gap opening and
    # extension values to their defaults

    $factory->GAP(11);
    $factory->EXTENSION(1);
    # now do the mask comparison example and ..
    &vary_masks($queryseq, $maskvalues); # ex. 3
    # do the jumpstart-align vs multiple iteration examples with the 
    # mask value set to 50%
    &aligned_blast($queryseq, $queryaln, $queryalnformat, 
		   $jiter, $maskvalues->[0]); # ex. 4
} elsif ($do_only  == 1) {
    &vary_params($queryseq,$example1param, $example1values);
} elsif ($do_only  == 3) {
    &vary_masks($queryseq, $maskvalues);
} elsif ($do_only  == 4 ) {
    &aligned_blast($queryseq, $queryaln, $queryalnformat, $jiter, $maskvalues->[0]);
}  else {
    &example_usage();
}

exit 0;

##########
## End of "main"


#################################################
#   compare_runs(): Prints out display of which hits were found by different methods
#	Various methods are labeled by "tags" found in array @runtags
#
#  args: 
#	$typetag  -  label describing type of "tags"
#	$runtags  -  reference to array @runtags
#	$hashhits  - reference to hash of all the hits found by all runs (%hashhits) 
#		value for each hit is string which is the concatenation of all the "tags" of
#		runs that found that hit
#  returns: nothing  

sub compare_runs {
    my $typetag = shift;
    my $runtags = shift;

    my $hashhits = shift;   

    my ($tag, @taghits);

    print "Comparing BLAST results... \n";

# Get total number of hits found by any method
    my $numhits = keys %$hashhits ; # scalar context to get total number of hits by all methods
    print  "Total number of hits found: $numhits \n";

# Get total number of hits found by every method
    my $alltags =  join ( "" ,  @$runtags );
    my  @alltaghits = grep  $$hashhits{$_} =~ /$alltags/  ,  keys %$hashhits;
    print  " Number of hits found by every method / parameter-value: " ,   
    scalar(@alltaghits), "\n";

# If one desires to see the hits found by all methods, uncomment next 2 lines
#print  " Hits were found all methods / parameters: \n";
#print   join ( "\n", @alltaghits ) ,  "\n";

# For each method/parameter-value (labeled by type) display  hits found 
# exclusively by that method
    foreach $tag (@$runtags)  {
		 @taghits = grep  $$hashhits{$_} =~ /^$tag$/  ,  keys %$hashhits;
		 print  " Hits found only when $typetag was $tag: \n";
		 print   join ( "\n", @taghits ) ,  "\n";
    }
    return 1;
}


#################################################
#   vary_params(): Example demonstrating varying of parameter
#
#  args: 
#	$queryseq  - query sequence (can be filename (fasta),  or Bio:Seq object) 
#	$param  - name of parameter to be varied 
#	$values  - reference to array of values to be used for the parameter 
#  returns: nothing  



sub vary_params {

    my $queryseq = shift;   
    my $param = shift;   
    my $values = shift;  


    print "Beginning $param parameter-varying example... \n";

    # Now we'll perform several blasts, 1 for each value of the
    # selected parameter.  In the first default case, we vary the
    # MATRIX substitution parameter, creating 3 BLAST reports, using
    # MATRIX values of BLOSUM62, BLOSUM80 or PAM70.

    # In the second default case, we vary the GAP penalty parameter,
    # creating 3 BLAST reports, using GAP penalties of 7, 9 and 25. In
    # either case we then automatically parse the resulting report to
    # identify which hits are found with any of the parameter values
    # and which with only one of them.

 
    # To test the BLAST results to some other parameter it is only
    # necessary to change the parameters passed to the script on the
    # commandline.  The only tricky part is that the BLAST program
    # itself only supports a limited range of parameters.  See the
    # BLAST documentation.

    my ($report, $sbjct, $paramvalue);

    my $hashhits = { };	# key is hit id, value is string of param values for which hit was found
    
    foreach $paramvalue (@$values)  {
	
	$factory->$param($paramvalue); # set parameter value

	print "Performing BLAST with $param = $paramvalue \n";

	$report = $factory->$executable($queryseq);
	my $r = $report->next_result;
	while( my $hit = $r->next_hit ) {
	    $hashhits->{$hit->name} .= "$paramvalue";
	}
    }

    &compare_runs( $param , $values , $hashhits);  

    return 1;

}

#################################################

#   vary_masks(): Example demonstrating varying of parameter
#
#  args:
#	$queryseq  - query sequence (can be filename (fasta),  or Bio:Seq object)
#	$maskvalues  - reference to array of values to be used for the mask threshold
#  returns: nothing

# Now we'll perform several blasts, 1 for each value of the mask threshold.
# In the default case, we use thresholds of 25%, 50% and 75%. (Recall the threshold is
# % of resudues which must match the consensus residue before deciding to use the
# position specific scoring matrix rather than the default - BLOSUM or PAM - matrix)
# We then automatically parse the resulting reports to identify which hits
# are found with any of the mask threshold values and which with only one of them.
#

sub vary_masks {

my $queryseq = shift;
my $values = shift;


print "Beginning mask-varying example... \n";

my ($report, $sbjct, $maskvalue);

my $hashhits = { };     # key is hit id, value is string of param values for which hit was found

# Get the alignment file
my $str = Bio::AlignIO->new(-file=> "$queryaln", '-format' => "$queryalnformat", );
my $aln = $str->next_aln();

foreach $maskvalue (@$values)  {

    print "Performing BLAST with mask threshold = $maskvalue % \n";

    # Create the proper mask for 'jumpstarting'
    my $mask = &create_mask($aln, $maskvalue);
    my $report2 = $factory->blastpgp($queryseq, $aln, $mask);
    my $r = $report2->next_result;
    while($sbjct = $r->next_hit) {
	$hashhits->{$sbjct->name} .= "$maskvalue";			
    }
}

&compare_runs( 'mask threshold' , $values , $hashhits);

return 1;

}

#################################################
#  aligned_blast ():
#
#
#  args: 
#	$queryseq  - query sequence (can be filename (fasta),  or Bio:Seq object) 
#	$queryaln  - file containing alignment to be used to "jumpstart" psiblast in "-B mode"
#			$queryaln *must contain $queryseq with the same name and length
#				(psiblast is very picky)
#	$queryalnformat  - format of alignment (can = "fasta", "msf", etc)
#	$jiter  - number of iterations in psiblast run
#	$maskvalue  - threshold indicating how similar residues must be at a sequence location
#		before position-specific-scoring matrix is used
#		: "0" => use position specific matrix at all residues,  or
#			"100" => use default (eg BLOSUM) at all residues
#  returns: nothing  


# For this example, we'll compare the results of psiblast depending on whether psiblast itself is 

#  used to identify an alignment to use for blasting or whether an external alignment is given to 
#  psiblast

sub aligned_blast {


my     $queryseq  =  shift; 
my	$queryaln  =  shift; 
my	$queryalnformat  =  shift;
my	$jiter  =  shift;
my	$maskvalue  =  shift;

my $hashhits = { };
my ($sbjct, $id);

print "\nBeginning aligned blast example... \n";


# First we do a  single-iteration psiblast search but with a specified alignment to
#  "jump start" psiblast


print "\nBeginning jump-start psiblast ... \n";


my $tag1 = 'jumpstart';

# $factory->j('1');    # perform single iteration

# Get the alignment file
my $str = Bio::AlignIO->new(-file=> "$queryaln", '-format' => "$queryalnformat", );
my $aln = $str->next_aln();


# Create the proper mask for 'jumpstarting'
my $mask = &create_mask($aln, $maskvalue);


my $report2 = $factory->blastpgp($queryseq, $aln, $mask);
while($sbjct = $report2->next_result) {
		$hashhits->{$sbjct->name} .= "$tag1";			
}

# Then we do a "plain" psiblast multiple-iteration search

print "\nBeginning multiple-iteration psiblast ... \n";

my $undefineB ;
  $factory->B($undefineB);

my $tag2 = 'iterated';
$factory->j($jiter);    # 'j' is blast parameter for # of iterations
my $report1 = $factory->blastpgp($queryseq);
my $total_iterations = $report1->number_of_iterations;
my $last_iteration = $report1->round($total_iterations);


 while($sbjct = $last_iteration->next_result) {
		$hashhits->{$sbjct->name} .= "$tag2";			
	}

# Now we compare the results of the searches

my $tagtype = 'iterated_or_jumpstart'; 
my $values = [ $tag1, $tag2];

&compare_runs( $tagtype , $values , $hashhits);  
 
return 1;

}


#################################################


# create_mask(): creates a mask for the psiblast jumpstart alignment
#                that determines at what residues position-specific
#                scoring matrices (PSSMs) are used and at what
#                residues default scoring matrices (eg BLOSUM) are
#                used. See psiblast documentation for more details,

#  args: 
#	$aln  -  SimpleAlign object with alignment
#	$maskvalue  -  label describing type of "tags"
#  returns: actual mask, ie a string of 0's and 1's which is the 
#           same length as each sequence in the alignment and has 
#           a "1" at locations where (PSSMs) are to be used
#           and a "0" at all other locations.


sub create_mask {
	my $aln = shift;
	my $maskvalue = shift;
	my $mask = "";

	die "psiblast jumpstart requires all sequences to be same length \n"
	  unless $aln->is_flush();
	my $len = $aln->length();

	if ($maskvalue =~ /^(\d){1,3}$/  ) {
		$mask = $aln->consensus_string($maskvalue) ;
		$mask =~ s/[^\?]/1/g ;
		$mask =~ s/\?/0/g ;
	}
	else { die "maskvalue must be an integer between 0 and 100 \n"; }
	return $mask ;
}

#----------------
sub example_usage {
#----------------

#-----------------------
# Prints usage information for general parameters.

    print STDERR <<"QQ_PARAMS_QQ";

 Command-line accessible script variables and commands:
 -------------------------------
 -h 		:  Display this usage info and exit.
 -in <str>	:  File containing input sequences in fasta format (default = $queryseq) .
 -inaln <str>	:  File containing input alignment for example 3 (default = $queryaln) .
 -alnfmt <str>	:  Format of input alignment for example 3, eg "msf", "fasta", "pfam".
		   (default = $queryalnformat) .
 -do <int>	:  Number of test to be executed ("1" => vary parameters,
		   "3" => compare iterated & jumpstart psiblast.) If omitted,
		   three default tests performed.
 -exec <str>  	:  Blast executable to be used in example 1.  Can be "blastall" or
		   "blastpgp" (default is "blastpgp").
 -param <str>  	:  Parameter to be varied in example 1. Any blast parameter
		   can be varied (default = 'MATRIX')
 -paramvals <str>:  String containing parameter values in example 1, separated
		   by ":"'s. (default = 'BLOSUM62:BLOSUM80:PAM70')
 -iter <int>    :  Maximum number of iterations in psiblast in example 3 (default = 2)
 -maskvals <str>:  String containing mask threshold values (in per-cents) for example 3,
		   separated by ":"'s. (default = '50:75:25')

In addition, any valid Blast parameter can be set using the syntax "parameter=>value" as in "database=>swissprot"

So some typical command lines might be:
 >standaloneblast.pl -do 1 -param expectation -paramvals '1e-10:1e-5'
or
 >standaloneblast.pl -do 1 -exec blastall -param q -paramvals '-1:-7' -in='t/dna1.fa' "pr=>blastn" "d=>ecoli.nt"
or
 >standaloneblast.pl -do 4 -maskvals 0 -iter 3
or
 >standaloneblast.pl -do 3 -maskvals '10:50:90'  -in 't/data/cysprot1.fa' -alnfmt msf -inaln 't/cysprot.msf'



QQ_PARAMS_QQ
}