The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/local/bin/perl

# do a quick BLAST search a la fasta
use Getopt::Long;
use IO::File;
use File::Basename 'basename','dirname';
use File::Path 'mkpath','rmtree';
use Boulder::Stream;
use Boulder::Blast;
use strict 'vars';
use sigtrap qw(die normal-signals);

use constant BLAST_DEFAULT => 'blastn';
use constant ARGS => '-gapall -hspmax 10';
use constant WUBLAST => '/usr/local/wublast/bin';

use vars qw/$DB $PROGRAM $PARAMS $DIR $STREAM
  $TMPDIR $DELETE_TMPDIR $BOULDER $TABULAR $CUTOFF $MINLEN 
  $target $blast $hit $hsp/;

GetOptions(
	   'tabular!'    => \$TABULAR,
	   'boulder!'    => \$BOULDER,
	   'db=s'        => \$DB,
           'tmp=s'      => \$TMPDIR,
	   'dir=s'      => \$DIR,
	   'program=s'  => \$PROGRAM,
	   'params=s'   => \$PARAMS,
	   'cutoff=f'   => \$CUTOFF,
	   'minlen=f'   => \$MINLEN,
	  ) || die <<USAGE;
Usage: $0 -db <database file> [options] <search file>

Run BLAST on one or more sequences and summarize the results.  The
source database is an ordinary fasta file.  The program runs pressdb
to create a temporary blast database in /usr/tmp (or other location of
TMPDIR).

Options:
       -source  <database>  source fasta database (no default)
       -dir     <path>      Where to save intermediate results in directory (don\'t save)
       -tmp     <temporary directory>
       -program <path>      Variant of BLAST to run (blastn)
       -params  <string>    Parameters to pass to program
       -minlen  <float>     Minimum HSP length, as fraction of total search length (0.0)
       -cutoff  <float>     Minimum significance cutoff
       -tabular             Produce output in tabular format
       -boulder             Produce output in boulder format (default)
USAGE
;

my $WUBLAST = -d WUBLAST ? WUBLAST : dirname(`which blastn`);
die "Can't find blast" unless -x "$WUBLAST/blastn";
my $PRESSDB = "$WUBLAST/pressdb";
my $SETDB   = "$WUBLAST/setdb";
my $DO_UNLINK = 0;

# parameter consistency checking
$TMPDIR  ||= $ENV{TMPDIR} || '/usr/tmp';
$DB      || die "Specify database to search.  Try -h for help.\n";
$PROGRAM ||= BLAST_DEFAULT;
$BOULDER = !$TABULAR if defined $TABULAR;
$BOULDER++ if !defined($BOULDER) && !defined($TABULAR);

$PARAMS = ARGS unless defined $PARAMS;
if ($DIR) {
  die "Specify a valid directory for output files. Try -h for help.\n" unless -d $DIR || mkpath($DIR);
} else {
  $DIR = make_tmpdir();
  $DELETE_TMPDIR++;
}
$CUTOFF   = 0.01 unless defined($CUTOFF);
$MINLEN ||= 0.0;
die "minimum length must be between 0.0 and 1.0"
  unless $MINLEN >= 0.0 && $MINLEN <= 1.0;

$target = pressdb($DB,$PROGRAM);
$STREAM = new Boulder::Stream if $BOULDER;

$|=1;
{ # localize input record
  local($/) = ">";
  while (<>) {
    chomp;
    next unless my($description,@dna) = split("\n");
    my ($identifier) = $description=~/^(\S+)/;
    my $output_file = "$DIR/$identifier.blast";
    my $blast = IO::File->new("| $PROGRAM $target - $PARAMS > $output_file  2>/dev/null ") ||
      die "Couldn't open BLAST program: $!";
    print $blast ">$description\n";
    foreach (@dna) { print $blast $_,"\n"; }
    $blast->close;
    die "Error during execution of blast program, status code $? ($!)\n" if $?;
    summarize_results($output_file);
  }
}

sub summarize_results {
  my $file = shift;
  $blast = Boulder::Blast->parse($file);
  return unless $blast->Blast_hits;

  # this code is called to write out the boulder stream, if requested
  if ($BOULDER) {
    $STREAM->put($blast);
    return;
  }
  
  # if we get here, we're producing the tabular summary

  # find the longest hit
  foreach $hit ($blast->Blast_hits) {
    next unless $hit->Signif < $CUTOFF;
    foreach $hsp ($hit->Hsps) {
      next unless $hsp->Signif < $CUTOFF;
      next unless abs($hsp->Length/$hit->Length) >= $MINLEN;
      write;
    }
  }
}

sub pressdb {
  my $db = shift;
  my $program = shift;

  # see if there's already a suitable database around
  if ($ENV{BLASTDB}) {
    my @paths = split(':',$ENV{BLASTDB});
    foreach (@paths) {
      next unless  -r "$_/$db.csq";
      undef $DO_UNLINK;
      return "$_/$db";
    }
  }

  # find a suitable temporary name
  $target = "$TMPDIR/quickblast${$}aaaa";
  my $tries = 100;
  while (--$tries) {
    last if IO::File->new($target,O_WRONLY|O_EXCL|O_CREAT,0600);
    $target++;
  }
  return unless $tries;
  my $base = basename($db);

  # Convert source file into temporary file for processing
  if ($program =~ /^(blastp|blastx)$/) {
    # setdb is a pain in the a**.  We have to play stupid little
    # tricks to get it to work the way we want it to work
    unless ($db =~ m!^/!) { # resolve relative paths
      require Cwd;
      my $cwd = Cwd::cwd();
      $db = "$cwd/$db";
    }
    unlink $target;
    symlink($db,$target) || die "Couldn't symlink $db=>$target";
    system($SETDB,'-t',$base,$target);
  } else {
    system($PRESSDB,'-o',$target,'-t',$base,$db);
  }
  die "Couldn't make temporary BLAST database: $!" if $?;
  $DO_UNLINK++;
  return $target;
}

sub make_tmpdir {
  my $dir = "$TMPDIR/quickblastout${$}aaaa";
  my $tries = 100;
  while (--$tries) {
    my ($success) = mkpath ($dir++,0,0700);
    return $success if $success;
  }
}

format STDOUT_TOP=
Search                      Target                            Score          P   Len   % Idnty  Search Range      Target Range
.
format STDOUT=
@<<<<<<<<<<<<<<<<<<<<<<<<<<<@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @##### @>>>>>>>> @#####   @>>>>   @>>>>>> @>>>>>>  @>>>>>>> @>>>>>>
{
  $blast->Blast_query      ." (".$blast->Blast_query_length.")",
  $hit->Name ." (".$hit->Length.")",
  $hsp->Score,$hsp->Signif,$hsp->Length,
  $hsp->Identity,
  $hsp->Query_start,$hsp->Query_end,
  ($hsp->Orientation eq 'plus' ? ( $hsp->Subject_start,$hsp->Subject_end ) : ( $hsp->Subject_end,$hsp->Subject_start ) )
}
.

# tidy up by removing temporary database
END {  
  if ($target && $DO_UNLINK) {
    unlink $target;
    unlink <$target.*>;
  }
  rmtree ($DIR) if $DIR && $DELETE_TMPDIR;
}