#!/usr/bin/perl
# This script will report the names of the tissues which were seen
# in a BLAST/FASTA report against an EST (or cDNA possibly) library.
# this script assumes you have a directory which you have downloaded
# gbestXX.seq.gz from ncbi genbank release. This will run faster if
# they are uncompressed already, but if will uncompress the files
# on demand. Be sure that there is sufficient space and the uid
# has write permission on the files and in that directory if you
# plan to run this script on compressed files.
# Alternatively you can use this with the -r option and it will
# use the remote sequence databases either genbank or embl to
# retrieve the specific EST (so one can attempt to guess the tissue type)
##
# cmd line options are
# -i/--index=indexname
# -d/--dir=dir where gbest data files are located
# -b/--blast=filename blast filename which compared against an EST db
# -f/--format=(blast|blastxml|fasta) - type of search output either
# from BLAST or FASTA suites
# -c/--cache=filename cache for accession number to tissue
# -p/pvalue=pvalue pvalue to limit search to
# -r/--remote=[GenBank|EMBL] use remote db for searching
use strict;
use DB_File;
use Bio::SeqIO;
use Bio::SearchIO;
use Bio::DB::EMBL;
use Bio::DB::GenBank;
use Bio::Index::GenBank;
use Getopt::Long;
my $GZIP = '/usr/bin/gzip';
my $GUNZIP = '/usr/bin/gunzip';
my $dir = '/home/data/libs/gbest'; # local dir for gbest files
my $index = 'dbest_tissue.idx'; # local index filename
my $cache; # filename to create cache of accession->tissue
my $VERBOSE = 0;# verbosity option
my $blastfile; # blastfile to parse
my $pvalue; # Max P-Value allowed when parsing blastfile
my $remote; # flag for remote database
my $db; # generic database handle
my %accessions; # cache results
my $format = 'blast';
&GetOptions( 'd|dir:s' => \$dir,
'i|index:s' => \$index,
'v|verbose' => \$VERBOSE,
'b|blast:s' => \$blastfile,
'f|format:s' => \$format,
'c|cache:s' => \$cache,
'p|pvalue:s' => \$pvalue,
'r|remote:s'=> \$remote);
if( $cache && -w $cache ) {
print "creating cache file\n";
tie %accessions, "DB_File", $cache, O_RDWR|O_CREAT,0660, $DB_HASH;
}
if( ! $remote ) {
opendir(GBEST, $dir) or die("cannot open $dir");
my $indexfile = new Bio::Index::GenBank(-filename => $index,
-write_flag => 'WRITE');
foreach my $file ( readdir(GBEST) ) {
# print "file is $file\n";
next unless ( $file =~ /(gbest\d+\.seq)(.gz)?$/ );
if( $2 ) {
`$GUNZIP $dir/$file`;
}
$indexfile->make_index("$dir/$1");
}
$indexfile = undef;
$db = new Bio::Index::GenBank(-filename => $index);
} else {
if( $remote =~ /(ncbi)|(genbank)/i ) {
$db = new Bio::DB::GenBank;
} elsif( $remote =~ /embl/i ) {
$db = new Bio::DB::EMBL;
} else {
die("remote must be either 'NCBI' or 'EMBL'");
}
# would need to add code to set proxy info for those who need it
}
if(! $blastfile || ! -r $blastfile ) {
die("Must specify a valid blastfile");
}
my $parser = new Bio::SearchIO(-format => $format,
-file => $blastfile);
my %tissues_seen = ();
my ($result,$hit,$hsp);
while( my $result = $parser->next_result ) {
HIT: while( my $hit = $result->next_hit ) {
if( defined $pvalue ) {
while( my $hsp = $hit->next_hsp ) {
if( $hsp->evalue > $pvalue ) {
print "skipping ", $hit->name, " because of low evalue \n";
# skip this Subject if it contains a pvalue of > $pvalue
next HIT;
}
}
}
my ($id) = split(/\s+/, $hit->name);
# get the last value
my @ids = split(/\|/, $id);
$id = pop @ids;
my ($tissuetype) = get_tissue($id);
if( defined $tissuetype ) {
push @{$tissues_seen{$tissuetype}}, $hit->name;
} else {
print STDERR "could not find tissue for $id\n" if( $VERBOSE);
}
}
print "tissues seen for: ", $result->query_name, "\n";
foreach my $tissue ( sort keys %tissues_seen ) {
print "* $tissue\n-----------\n\t",
join("\n\t",@{$tissues_seen{$tissue}}), "\n\n";
}
}
# cleanup -- avoid segfault here
$db = undef;
# subroutines
sub get_tissue {
my ($id) = @_;
my $tissue;
if( $tissue = $accessions{$id} ) {
return $tissue;
}
my $seq = $db->get_Seq_by_acc($id);
return unless( $seq );
foreach my $feature ( $seq->all_SeqFeatures ) {
if( $feature->primary_tag eq 'source' ) {
foreach my $tag ( sort { $b cmp $a }
$feature->all_tags ) {
if( $tag =~ /tissue/i ||
( ! $tissue &&
$tag =~ /clone_lib/i ) ){
($tissue) = $feature->each_tag_value($tag);
$accessions{$seq->display_id} = $tissue;
return $tissue;
}
}
}
}
return;
}