# $Id: HIV.pm 232 2008-12-11 14:51:51Z maj $
#
# BioPerl module for Bio::DB::HIV
#
# Please direct questions and support issues to <bioperl-l@bioperl.org>
#
# Cared for by Mark A. Jensen <maj@fortinbras.us>
#
# Copyright Mark A. Jensen
#
# You may distribute this module under the same terms as perl itself
# POD documentation - main docs before the code
=head1 NAME
Bio::DB::HIV - Database object interface to the Los Alamos HIV Sequence Database
=head1 SYNOPSIS
$db = new Bio::DB::HIV;
$seq = $db->get_Seq_by_id('94284'); # LANL sequence id
$seq = $db->get_Seq_by_acc('EF432710'); # GenBank accession
$q = new Bio::DB::Query::HIVQuery( " (C D)[subtype] SI[phenotype] (symptomatic AIDS)[patient_health] " );
$seqio = $db->get_Stream_by_query($q);
$seq = $seqio->next_seq();
($seq->annotation->get_Annotations('Virus'))[0]->{subtype} # returns 'D'
($seq->annotation->get_Annotations('Patient'))[0]->{patient_health} # returns 'AIDS'
($seq->annotation->get_Annotations('accession'))[0]->{value} # returns 'K03454'
=head1 DESCRIPTION
Bio::DB::HIV, along with L<Bio::DB::Query::HIVQuery>, provides an
interface for obtaining annotated HIV and SIV sequences from the Los
Alamos National Laboratory (LANL) HIV Sequence Database (
L<http://www.hiv.lanl.gov/content/sequence/HIV/mainpage.html>
). Unannotated sequences can be retrieved directly from the database
object, using either LANL ids or GenBank accessions. Annotations are
obtained via a query object, and are attached to the correct C<Bio::Seq>
objects when the query is handled by C<Bio::DB::HIV::get_Seq_by_query>
or C<Bio::DB::HIV::get_Stream_by_query>.
=head1 FEEDBACK
=head2 Mailing Lists
User feedback is an integral part of the evolution of this and other
Bioperl modules. Send your comments and suggestions preferably to
the Bioperl mailing list. Your participation is much appreciated.
bioperl-l@bioperl.org - General discussion
http://bioperl.org/wiki/Mailing_lists - About the mailing lists
=head2 Support
Please direct usage questions or support issues to the mailing list:
I<bioperl-l@bioperl.org>
rather than to the module maintainer directly. Many experienced and
responsive experts will be able to look at the problem and quickly
address it. Please include a thorough description of the problem
with code and data examples if at all possible.
=head2 Reporting Bugs
Report bugs to the Bioperl bug tracking system to help us keep track
of the bugs and their resolution. Bug reports can be submitted via
the web:
https://redmine.open-bio.org/projects/bioperl/
=head1 AUTHOR - Mark A. Jensen
Email maj@fortinbras.us
=head1 CONTRIBUTORS
Mark A. Jensen
=head1 APPENDIX
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _
=cut
# Let the code begin...
package Bio::DB::HIV;
use strict;
use warnings;
use vars qw( $LANL_BASE $LANL_MAP_DB $LANL_MAKE_SEARCH_IF $LANL_SEARCH );
# Object preamble - inherits from Bio::DB::WebDBSeqI
use Bio::Root::Root;
use HTTP::Request::Common;
use Bio::DB::HIV::HIVAnnotProcessor;
use base qw(Bio::DB::WebDBSeqI);
BEGIN {
# Compile-time defaults for the LANL web interface. new() falls back to
# these package globals when no override is passed to the constructor.
# base change of 01/14/09
$LANL_BASE = "http://www.hiv.lanl.gov/components/sequence/HIV/asearch";
# Page names appended to the base URL by the _*_uri helpers.
$LANL_MAP_DB = "map_db.comp";
$LANL_MAKE_SEARCH_IF = "make_search_if.comp";
$LANL_SEARCH = "search.comp";
# Exception classes thrown by this module (ResponseProblem in
# postprocess_data, HIVSorry in _sorry, WebError in get_request); each is
# declared here as a subclass of Bio::Root::Exception.
@Bio::ResponseProblem::Exception::ISA = qw( Bio::Root::Exception );
@Bio::HIVSorry::Exception::ISA = qw ( Bio::Root::Exception );
@Bio::WebError::Exception::ISA = qw( Bio::Root::Exception );
}
=head1 Constructor
=head2 new
Title : new
Usage : my $obj = new Bio::DB::HIV();
Function: Builds a new Bio::DB::HIV object
Returns : an instance of Bio::DB::HIV
Args :
=cut
sub new {
    my ($class, @args) = @_;
    my $self = $class->SUPER::new(@args);

    # Optional per-object overrides of the LANL endpoint pieces.
    my @override_names = qw(
        LANL_BASE
        LANL_MAP_DB
        LANL_MAKE_SEARCH_IF
        LANL_SEARCH
    );
    my ($base, $map_page, $interface_page, $search_page) =
        $self->_rearrange(\@override_names, @args);

    $self->lanl_base($base)                if $base;
    $self->map_db($map_page)               if $map_page;
    $self->make_search_if($interface_page) if $interface_page;
    $self->search_($search_page)           if $search_page;

    # Anything still unset falls back to the package-level defaults.
    $self->lanl_base($LANL_BASE)                     unless $self->lanl_base;
    $self->map_db($LANL_MAP_DB)                      unless $self->map_db;
    $self->make_search_if($LANL_MAKE_SEARCH_IF)      unless $self->make_search_if;
    $self->search_($LANL_SEARCH)                     unless $self->search_;
    $self->url_base_address($self->lanl_base)        unless $self->url_base_address;

    # postprocess_data() always rewrites the download as FASTA.
    $self->request_format("fasta");

    return $self;
}
=head1 WebDBSeqI compliance
=head2 get_request
Title : get_request
Usage : my $url = $self->get_request
Function: returns a HTTP::Request object
Returns :
Args : %qualifiers = a hash of qualifiers with keys in
(-ids, -format, -mode, -query)
Note : Several layers of requests are performed to get to the sequence;
see Bio::DB::Query::HIVQuery.
=cut
sub get_request {
    # Negotiates a LANL session, posts the "interface" and "pre-search"
    # forms appropriate to the requested mode, checks the reply for known
    # failure pages, and returns the HTTP::Request (a POST to the search
    # page) that will download the sequences.
    #
    # Qualifiers (matched by substring of the key):
    #   *mode*  : 'single', 'acc' or 'query' ('gi'/'version' throw HIVSorry)
    #   *uids*  : arrayref of ids/accessions
    #   *query* : a Bio::DB::Query::HIVQuery run at level 2
    # Throws Bio::Root::BadParameter on bad qualifiers and
    # Bio::WebError::Exception on any failure of the web exchange.
    my $self  = shift;
    my %quals = @_;
    my ($resp);
    my (@ids, $mode, @interface, @query_parms, $query);

    # html parsing regexps
    my $tags_re = qr{(?:\s*<[^>]+>\s*)};
    my $session_id_re = qr{<input.*name="id".*value="([0-9a-f]+)"}m;
    my $search_form_re = qr{<form[^>]*action=".*/search.comp"};
    my $seqs_found_re = qr{Displaying$tags_re*(?:\s*[0-9-]*\s*)*$tags_re*of$tags_re*\s*([0-9]+)$tags_re*sequences found};
    my $no_seqs_found_re = qr{Sorry.*no sequences found};
    my $too_many_re = qr{too many records: $tags_re*([0-9]+)};
    # find something like:
    #  <strong>tables without join:</strong><br>SequenceAccessions<br>
    my $tbl_no_join_re = qr{tables without join}i;

    # handle "qualifiers"; the first matching pattern wins for each key
    foreach my $qual (keys %quals) {
        my $value = $quals{$qual};
        if ($qual =~ m/mode/) {
            $mode = $value;
        }
        elsif ($qual =~ m/uids/) {
            $self->throw(-class=>"Bio::Root::BadParameter",
                         -text=>"Arrayref required for qualifier \"$qual\"",
                         -value=>$value) unless ref($value) eq 'ARRAY';
            @ids = @{$value};
        }
        elsif ($qual =~ m/query/) {
            # eval so that an undefined or unblessed value is reported as
            # a BadParameter rather than as a raw "Can't call method" error
            my $is_hiv_query = eval { $value->isa("Bio::DB::Query::HIVQuery") };
            $self->throw(-class=>"Bio::Root::BadParameter",
                         -text=>"Bio::DB::Query::HIVQuery required for qualifier \"$qual\"",
                         -value=>$value) unless $is_hiv_query;
            $query = $value;
        }
    }

    # what kind of request?
    # Each test is independent (no early exit), as in the original
    # dispatch; an absent mode matches nothing.
    my $m = defined $mode ? $mode : '';
    if ($m =~ m/single/) {
        @interface = (
            'sequenceentry' => 'se_sequence',
            'sequenceentry' => 'se_id',
            'action' => 'Search Interface'
        );
        @query_parms = map { ('sequenceentry.se_id' => $_ ) } @ids;
        push @query_parms, (
            'sequenceentry.se_sequence'=>'Any',
            'order' => 'sequenceentry.se_id',
            'sort_dir' => 'ASC',
            'action' => 'Search'
        );
    }
    if ($m =~ m/acc/) {
        @interface = (
            'sequenceentry' => 'se_sequence',
            'sequenceentry' => 'se_id',
            'sequenceaccessions' => 'sa_genbankaccession',
            'sequenceaccessions' => 'sa_se_id',
            'action' => 'Search Interface'
        );
        @query_parms = map {('sequenceaccessions.sa_genbankaccession' => $_)} @ids;
        push @query_parms, (
            'sequenceentry.se_sequence' => 'Any',
            'order' => 'sequenceaccessions.sa_genbankaccession',
            'sort_dir' => 'ASC',
            'action' => 'Search'
        );
    }
    if ($m =~ m/gi/) {
        $self->_sorry("-mode=>gi");
    }
    if ($m =~ m/version/) {
        $self->_sorry("-mode=>version");
    }
    if ($m =~ m/query/) {
        my $run_level = $query->{'_RUN_LEVEL'};
        $self->throw(-class=>"Bio::Root::BadParameter",
                     -text=>"Query ".($run_level ? "has been run only at run level ".$run_level : "has not been run").", run at level 2 with _do_query(2)",
                     -value=>$run_level) unless ($run_level || 0) == 2;
        @interface = (
            'sequenceentry' => 'se_sequence',
            'sequenceentry' => 'se_id',
            'action' => 'Search Interface'
        );
        # all ids go up as one tab-joined, single-quoted value
        @query_parms = ("sequenceentry.se_id" =>sprintf("'%s'",join("\t", $query->ids)));
        push @query_parms, (
            'sequenceentry.se_sequence' => 'Any',
            'order' => 'sequenceentry.se_id',
            'sort_dir' => 'ASC',
            'action' => 'Search'
        );
    }

    # web work
    eval { # capture web errors; throw below...
        # negotiate a session with lanl db
        if (!$self->_session_id) {
            $resp = $self->ua->get($self->_map_db_uri);
            $resp->is_success || die "Connect failed";
            # get the session id
            ($self->{'_session_id'}) = ($resp->content =~ /$session_id_re/);
            $self->_session_id || die "Session not established";
        }

        # establish correct "interface" for this session id
        $resp = $self->ua->post($self->_make_search_if_uri, [@interface, id=>$self->_session_id]);
        $resp->is_success || die "Interface request failed (1)";
        $self->_response($resp);
        $resp->content =~ /$search_form_re/ || die "Interface request failed (2)";

        # interface successful, do the "pre-search"
        $resp = $self->ua()->post($self->_search_uri, [(@query_parms, 'id' => $self->_session_id)] );
        unless ($resp->is_success) {
            die "Search post failed";
        }
        $self->_response($resp);

        # check for error conditions; tests are ordered, first match decides
        my $page = $resp->content;
        if ($page =~ /$no_seqs_found_re/) {
            die "No sequences found";
        }
        elsif ($page =~ /$too_many_re/) {
            die "Too many records ($1): must be <10000";
        }
        elsif ($page =~ /$tbl_no_join_re/) {
            die "Some required tables went unjoined to query";
        }
        elsif ($page =~ /$seqs_found_re/) {
            1; # success page recognised
        }
        else {
            die "Unparsed failure";
        }
    };
    if (my $err = $@) {
        # $resp is still undefined if the very first request died before
        # returning; do not let that hide the real error.
        $self->throw(-class=>'Bio::WebError::Exception',
                     -text=>$err,
                     -value=>($resp ? $resp->content : ''));
    }

    # "pre-search" successful, return request
    ### check this post update
    return POST $self->_search_uri,
        ['action Download.x' => 1,
         'action Download.y'=>1,
         'id'=>$self->_session_id
        ];
}
=head2 postprocess_data
Title : postprocess_data
Usage : $self->postprocess_data ( 'type' => 'string',
'location' => \$datastr);
Function: process downloaded data before loading into a Bio::SeqIO
Returns : void
Args : hash with two keys - 'type' can be 'string' or 'file'
- 'location' either file location or string
reference containing data
=cut
sub postprocess_data {
    # Parse the tab-separated content returned by the LANL db and rewrite
    # it, in place, as FASTA records (">id\nsequence\n").
    #
    # Args  : type     => 'string' or 'file'
    #         location => scalar ref holding the data ('string'), or the
    #                     name of the file holding it ('file')
    # Throws: Bio::Root::BadParameter if an argument is missing or no data
    #         is present; Bio::Root::FileOpenException if the file cannot
    #         be opened; Bio::ResponseProblem::Exception if no id column
    #         can be found in the header line.
    my ($self, %args) = @_;
    my ($type, $loc) = ($args{type}, $args{location});
    my (@data, @cols, %rec, $idkey, @flines);

    $self->throw(-class=>'Bio::Root::BadParameter',
                 -text=>"Argument hash requires values for keys \"type\" and \"location\"",
                 -value=>\%args) unless ($type && $loc);

    # load the raw lines
    if ($type =~ m/string/) {
        @data = split(/\n|\r/, ${$loc});
    }
    elsif ($type =~ m/file/) {
        local $/;    # slurp the whole file in one read
        open(my $in_fh, "<", $loc) or
            $self->throw(
                -class=>'Bio::Root::FileOpenException',
                -text=>"Error opening tempfile \"$loc\" for reading",
                -value=>$loc
            );
        @data = split(/\n|\r/, <$in_fh>);
        close($in_fh);
    }

    # pass the argument hash by reference so it stays a single -value
    $self->throw(-class=>'Bio::Root::BadParameter',
                 -text=>'No data found in repsonse',
                 -value=>\%args) unless (@data);

    # discard everything up to and including the number-returned line
    my $line;
    do {
        $line = shift @data;
    } while ( defined $line && $line !~ /Number/ );

    # the next line holds the column headers (may be absent)
    my $header = shift @data;
    @cols = defined $header ? split(/\t/, $header) : ();

    # if Accession column is present, get_Stream_by_acc was called
    # otherwise, return lanl ids
    ($idkey) = grep { /Accession/i } @cols;
    ($idkey) = grep { /SE.id/i } @cols unless defined $idkey;
    $self->throw(-class=>"Bio::ResponseProblem::Exception",
                 -text=>"Trouble with column headers in LANL response",
                 -value=>join(' ',@cols)) unless $idkey;

    foreach my $record (@data) {
        # NOTE(review): chop drops the final character of every record
        # unconditionally; presumably each LANL data line ends in a
        # trailing delimiter - confirm against a live response.
        chop $record;
        @rec{@cols} = split /\t/, $record;
        push @flines, ">$rec{$idkey}\n".$rec{'Sequence'}."\n";
    }

    # write the FASTA back to where the data came from
    if ($type =~ m/string/) {
        ${$loc} = join("", @flines);
    }
    elsif ($type =~ m/file/) {
        open(my $out_fh, ">", $loc) or
            $self->throw(
                -class=>'Bio::Root::FileOpenException',
                -text=>"Error opening tempfile \"$loc\" for writing",
                -value=>$loc
            );
        print {$out_fh} join("", @flines);
        # buffered write errors only surface at close
        close($out_fh) or
            $self->throw("Error writing tempfile \"$loc\": $!");
    }
    return;
}
=head1 WebDBSeqI overrides
=head2 get_seq_stream
Title : get_seq_stream
Usage : my $seqio = $self->get_seq_stream(%qualifiers)
Function: builds a url and queries a web db
Returns : a Bio::SeqIO stream capable of producing sequence
Args : %qualifiers = a hash qualifiers that the implementing class
will process to make a url suitable for web querying
Note : Some tightening up of the baseclass version
=cut
sub get_seq_stream {
    # Builds the download request via get_request(), fetches it according
    # to the object's retrieval_type (pipeline / tempfile / io_string) and
    # returns a Bio::SeqIO stream over the result. Throws if the download
    # fails or the retrieval type is not one of those three.
    my ($self, %qualifiers) = @_;
    my ($rformat, $ioformat) = $self->request_format();

    my ($key) = grep /format$/, keys %qualifiers;
    $qualifiers{'-format'} = ($key ? $qualifiers{$key} : $rformat);
    # NOTE(review): this reads the un-dashed key 'format', not the
    # '-format' entry assigned on the previous line. Left unchanged because
    # what request_format() does with an undefined argument is not visible
    # here - confirm intent before altering.
    ($rformat, $ioformat) = $self->request_format($qualifiers{'format'});

    # web work is here/maj
    my $request = $self->get_request(%qualifiers);

    # authorization is here/maj
    $request->proxy_authorization_basic($self->authentication)
        if ( $self->authentication);
    $self->debug("request is ". $request->as_string(). "\n");

    # workaround for MSWin systems (no forking available/maj)
    $self->retrieval_type('io_string') if $self->retrieval_type =~ /pipeline/ && $^O =~ /^MSWin/;

    if ($self->retrieval_type =~ /pipeline/) {
        # Try to create a stream using POSIX fork-and-pipe facility.
        # this is a *big* win when fetching thousands of sequences from
        # a web database because we can return the first entry while
        # transmission is still in progress.
        # Also, no need to keep sequence in memory or in a temporary file.
        # If this fails (Windows, MacOS 9), we fall back to non-pipelined access.

        # fork and pipe: _stream_request()=><STREAM>
        my ($result,$stream) = $self->_open_pipe();

        if (defined $result) {
            $DB::fork_TTY = File::Spec->devnull; # prevents complaints from debugger
            if (!$result) { # in child process
                $self->_stream_request($request,$stream);
                POSIX::_exit(0); #prevent END blocks from executing in this forked child
            }
            else {
                return Bio::SeqIO->new('-verbose' => $self->verbose,
                                       '-format'  => $ioformat,
                                       '-fh'      => $stream);
            }
        }
        else {
            # could not fork; fall through to the in-memory branch below
            $self->retrieval_type('io_string');
        }
    }

    if ($self->retrieval_type =~ /temp/i) {
        my $dir = $self->io->tempdir( CLEANUP => 1);
        my ( $fh, $tmpfile) = $self->io()->tempfile( DIR => $dir );
        close $fh;
        my $resp = $self->_request($request, $tmpfile);
        if( ! -e $tmpfile || -z $tmpfile || ! $resp->is_success() ) {
            $self->throw("WebDBSeqI Error - check query sequences!\n");
        }
        $self->postprocess_data('type' => 'file','location' => $tmpfile);
        # this may get reset when requesting batch mode
        ($rformat,$ioformat) = $self->request_format();
        if( $self->verbose > 0 ) {
            # Echo the processed file to the debug channel. Use a checked
            # open and a lexical line variable so the caller's $_ is not
            # overwritten; a failure here is reported, not fatal.
            if ( open(my $dbg_fh, "<", $tmpfile) ) {
                while (my $dbg_line = <$dbg_fh>) {
                    $self->debug($dbg_line);
                }
                close($dbg_fh);
            }
            else {
                $self->debug("could not reopen tempfile \"$tmpfile\" for debug output: $!\n");
            }
        }
        return Bio::SeqIO->new('-verbose' => $self->verbose,
                               '-format'  => $ioformat,
                               '-file'    => $tmpfile);
    }

    if ($self->retrieval_type =~ /io_string/i ) {
        my $resp = $self->_request($request);
        my $content = $resp->content_ref;
        $self->debug( "content is $$content\n");
        if (!$resp->is_success() || length($$content) == 0) {
            $self->throw("WebDBSeqI Error - check query sequences!\n");
        }
        ($rformat,$ioformat) = $self->request_format();
        $self->postprocess_data('type'=> 'string',
                                'location' => $content);
        $self->debug( "str is $$content\n");
        return Bio::SeqIO->new('-verbose' => $self->verbose,
                               '-format'  => $ioformat,
                               '-fh'      => IO::String->new($$content));
    }

    # if we got here, we don't know how to handle the retrieval type
    $self->throw("retrieval type " .
                 $self->retrieval_type .
                 " unsupported\n");
}
=head2 get_Stream_by_acc
Title : get_Stream_by_acc
Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]);
Function: Gets a series of Seq objects by GenBank accession numbers
Returns : a Bio::SeqIO stream object
Args : an arrayref of accession numbers for
the desired sequence entries
Note : For LANL DB, alternative to LANL seqids
=cut
sub get_Stream_by_acc {
    # Fetch a stream of sequences by GenBank accession.
    # $ids may be an arrayref of accessions (the documented form) or a
    # single accession string. Wrapping an arrayref again would hand
    # get_request a nested array and send "ARRAY(0x...)" as the accession.
    my ($self, $ids) = @_;
    my $uids = ( ref($ids) eq 'ARRAY' ) ? $ids : [$ids];
    return $self->get_seq_stream('-uids' => $uids, '-mode' => 'acc');
}
=head2 get_Stream_by_query
Title : get_Stream_by_query
Usage : $stream = $db->get_Stream_by_query($query);
Function: Gets a series of Seq objects by way of a query string or object
Returns : a Bio::SeqIO stream object
Args : $query : Currently, only a Bio::DB::Query::HIVQuery object.
It's a good idea to create the query object first and interrogate
it for the entry count before you fetch a potentially large stream.
=cut
sub get_Stream_by_query {
    my ($self, $query) = @_;

    # raw sequence stream for the ids held by the query object
    my $raw_stream = $self->get_seq_stream(
        '-query' => $query,
        '-mode'  => 'query',
    );

    # wrap it so each sequence is handed to the annotation processor
    # together with the query that produced it
    return Bio::DB::HIV::HIVAnnotProcessor->new(
        -hiv_query     => $query,
        -source_stream => $raw_stream,
    );
}
sub _request {
    # Run $request through the user agent, saving the body to $tmpfile when
    # a non-empty filename is given. Throws on an error response; otherwise
    # returns the response object.
    my ($self, $request, $tmpfile) = @_;

    my $save_to_file = ( defined $tmpfile && $tmpfile ne '' );
    my $resp = $save_to_file
        ? $self->ua->request($request, $tmpfile)
        : $self->ua->request($request);

    $self->throw("WebDBSeqI Request Error:\n".$resp->as_string)
        if $resp->is_error;

    return $resp;
}
=head1 Internals
=head2 lanl_base
Title : lanl_base
Usage : $obj->lanl_base($newval)
Function: get/set the base url of the LANL HIV database
Example :
Returns : value of lanl_base (a scalar)
Args : on set, new value (a scalar or undef, optional)
=cut
sub lanl_base {
    my ($self, @newval) = @_;
    # any supplied argument (even undef) is stored
    if (@newval) {
        $self->{'lanl_base'} = $newval[0];
    }
    return $self->{'lanl_base'};
}
=head2 map_db
Title : map_db
Usage : $obj->map_db($newval)
Function: get/set the cgi filename for map_db ("Database Map")
Example :
Returns : value of map_db (a scalar)
Args : on set, new value (a scalar or undef, optional)
=cut
sub map_db {
    my ($self, @newval) = @_;
    # any supplied argument (even undef) is stored
    if (@newval) {
        $self->{'map_db'} = $newval[0];
    }
    return $self->{'map_db'};
}
=head2 make_search_if
Title : make_search_if
Usage : $obj->make_search_if($newval)
Function: get/set the cgi filename for make_search_if ("Make Search Interface")
Example :
Returns : value of make_search_if (a scalar)
Args : on set, new value (a scalar or undef, optional)
=cut
sub make_search_if {
    my ($self, @newval) = @_;
    # any supplied argument (even undef) is stored
    if (@newval) {
        $self->{'make_search_if'} = $newval[0];
    }
    return $self->{'make_search_if'};
}
=head2 search_
Title : search_
Usage : $obj->search_($newval)
Function: get/set the cgi filename for the search query page
("Search Database")
Example :
Returns : value of search_ (a scalar)
Args : on set, new value (a scalar or undef, optional)
=cut
sub search_ {
    my ($self, @newval) = @_;
    # any supplied argument (even undef) is stored
    if (@newval) {
        $self->{'search_'} = $newval[0];
    }
    return $self->{'search_'};
}
=head2 _map_db_uri
Title : _map_db_uri
Usage :
Function: return the full map_db uri ("Database Map")
Example :
Returns : scalar string
Args : none
=cut
sub _map_db_uri {
    my ($self) = @_;
    # base address first, then the "Database Map" page name
    my $base = $self->url_base_address;
    my $page = $self->map_db;
    return $base . "/" . $page;
}
=head2 _make_search_if_uri
Title : _make_search_if_uri
Usage :
Function: return the full make_search_if uri ("Make Search Interface")
Example :
Returns : scalar string
Args : none
=cut
sub _make_search_if_uri {
    my ($self) = @_;
    # base address first, then the "Make Search Interface" page name
    my $base = $self->url_base_address;
    my $page = $self->make_search_if;
    return $base . "/" . $page;
}
=head2 _search_uri
Title : _search_uri
Usage :
Function: return the full search cgi uri ("Search Database")
Example :
Returns : scalar string
Args : none
=cut
sub _search_uri {
    my ($self) = @_;
    # base address first, then the "Search Database" page name
    my $base = $self->url_base_address;
    my $page = $self->search_;
    return $base . "/" . $page;
}
=head2 _session_id
Title : _session_id
Usage : $obj->_session_id($newval)
Function: Contains HIV db session id (initialized in get_request)
Example :
Returns : value of _session_id (a scalar)
Args : on set, new value (a scalar or undef, optional)
=cut
sub _session_id {
    my ($self, @newval) = @_;
    # any supplied argument (even undef) is stored
    if (@newval) {
        $self->{'_session_id'} = $newval[0];
    }
    return $self->{'_session_id'};
}
=head2 _response
Title : _response
Usage : $obj->_response($newval)
Function: hold the response to search post
Example :
Returns : value of _response (a scalar)
Args : on set, new value (a scalar or undef, optional)
=cut
sub _response {
    my ($self, @newval) = @_;
    # any supplied argument (even undef) is stored
    if (@newval) {
        $self->{'_response'} = $newval[0];
    }
    return $self->{'_response'};
}
=head2 Dude, sorry
Title : _sorry
Usage : $hiv->_sorry
Function: Throws an exception for unsupported option or parameter
Example :
Returns :
Args : scalar string
=cut
sub _sorry {
    my ($self, $parm) = @_;
    # build the apology first, then raise it as a HIVSorry exception
    my $apology = "Sorry, option/parameter \"$parm\" not (yet) supported. See manpage to complain.";
    $self->throw(
        -class => "Bio::HIVSorry::Exception",
        -text  => $apology,
        -value => $parm,
    );
    return;
}
1;