=head1 NAME
DAS::GUS::Segment - DAS-style access to a GUS database
=head1 SYNOPSIS
# Get a Bio::Das::SegmentI object from a DAS::GUS database
$segment = $das->segment( -name => 'Landmark',
-start => $start,
-stop => $stop );
=head1 DESCRIPTION
=head1 AUTHOR
Name: Haiming Wang
Email: hwang@uga.edu
=cut
package DAS::GUS::Segment;
use strict;
use Bio::Root::Root;
use Bio::Das::SegmentI;
use DAS::GUS::Segment::Feature;
use constant DEBUG => 1;
use vars '@ISA', '$VERSION';
@ISA = qw(Bio::Root::Root Bio::SeqI Bio::Das::SegmentI);
$VERSION = 0.11;
use overload '""' => 'asString';
our $dlm = ";;"; # separate attributes $tag=$value pairs
=head2 new
Title : new
Usage : $segment = $db->segment(-name => 'AAEL01000015',
-start => $start,
-stop => $stop );
Function: Create a segment object
Returns : a new DAS::GUS::Segment object
Args : see below
This method creates a new DAS::GUS::Segment object
accoring to a segment name, such as contig 'AAEL0100015'. Generally
this is called automatically by the DAS::GUS module.
There are five positional arguments:
$factory a DAS::GUS adaptor to use for database access
$start start of the desired segment relative to source sequence
$stop stop of the desired segment relative to source sequence
$srcfeature_id ID of the source sequence
$class type of the sequence, i.e. chromosome, contig
$name name of the segment
$atts attributes of the segment
=cut
sub new {
my $self = shift;
my ( $name, $factory, $start, $stop, $atts ) = @_;
my $query = $factory->parser->getSQL("Segment.pm", "new:Segment");
die "Couldn't find Segment.pm sql for new:Segment\n" unless $query;
$query =~ s/(\$\w+)/eval $1/eg;
my $sth = $factory->dbh->prepare($query);
$sth->execute();
my $hashref = $sth->fetchrow_hashref;
warn "END or STARTM of $name could not be determined by sql: $query\n"
unless exists $$hashref{'END'} && exists $$hashref{'STARTM'};
my $length = $$hashref{'END'} - $$hashref{'STARTM'} + 1;
$stop = ($stop && ($stop < $length)) ? int($stop) : $length;
$start = ($start && ($start > 0)) ? int($start) : 1;
return bless { factory => $factory,
start => $start,
end => $stop,
srcfeature_id => $$hashref{'SRCFEATURE_ID'},
length => $length,
class => $$hashref{'TYPE'},
name => $$hashref{'NAME'},
atts => $$hashref{'ATTS'},
}, ref $self || $self;
}
=head2 name
Title : name
Usage : $segname = $seg->name();
Function : Return the name of the segment
Returns : see avove
Args : none
Status : public
=cut
sub name {
my $self = shift;
return $self->{'name'} = shift if @_;
return $self->{'name'};
}
=head2 class
Title : class
Usage : $obj->class($newval)
Function: Return the segment class
Returns : value of class (a scalar)
Args : on set, new value (a scalar or undef, optional)
=cut
sub class {
my $self = shift;
return $self->{'class'} = shift if @_;
return $self->{'class'};
}
*type = \&class;
=head2 attributes
Title : attributes
Usage : $obj->attributes($newval)
Function: Return the segment attributes
Returns : attributes string for gff3 dump
Args :
=cut
sub attributes {
my $self = shift;
return $self->{'atts'} = shift if @_;
return $self->{'atts'};
}
=head2 seq_id
Title : seq_id
Usage : $ref = $s->seq_id
Function: return the ID of the landmark, aliased to name() for
backward compatibility
Return : a string
Args : none
Status : public
=cut
*seq_id = \&name;
=head2 start
Title : start
Usage : $s->start
Function: start of segment
Returns : integer
Args : none
Status : Public
=cut
sub start {
my $self = shift;
return $self->{'start'} = shift if @_;
return $self->{'start'};
}
=head2 low
Title : low
Usage : $s->low
Function: start of segment;
Alias of start for backward compatibility
Returns : integer
Args : none
Status : Public
=cut
*low = \&start;
=head2 end
Title : end
Usage : $s->end
Function: end of segment;
Returns : integer
Args : none
Status : Public
=cut
sub end {
my $self = shift;
return $self->{'end'} = shift if @_;
return $self->{'end'};
}
=head2 high
Title : high
Usage : $s->high
Function: end of segment;
Alias of end for backward compatibility.
Returns : integer
Args : none
Status : Public
=cut
*high = \&end;
=head2 stop
Title : stop
Usage : $s->stop
Function: end of segment;
Alias of end for backward compatibility.
Returns : integer
Args : none
Status : Public
=cut
*stop = \&end;
=head2 length
Title : length
Usage : $s->length
Function: length of segment;
Returns : integer
Args : none
Status : Public
=cut
sub length {
#shift->{length};
abs($_[0]->{start} - $_[0]->{end}) + 1
}
=head2 features
Title : features
Usage : @features = $s->features(@args)
Function: get features that overlap this segment
Returns : a list of Bio::SeqFeatureI objects
Args : see below
Status : public
This method will find all features that intersect the segment in a variety
of ways and returns a list of Bio::SeqFeatureI objects. The feature locations
will use coordinates relative to the reference sequence in effect at the
time that features() was called.
The returned list can be limited to certain types, attributes or
range intersection modes. Types of range intersection are one of:
"overlaps" the default
"contains" return features completely contained within the segment
"contained_in" retunr features that completely contain the segment
Two types of argument lists are accepts. In the positional argument form,
the arguments are treated as a list of feature types. In the named
parameter form, the arguments are a series of -name=E<gt>value pairs.
Argument Description
-------------------------------------
-types An array reference to type names in the format "method:source"
-attributes A hashref containing a set of attributes to match
-rangetype One of "overlaps", "contains", or "contained_in".
-iterator Return an iterator across the features.
-callback A callback to invoke on each feature
The -attributes argument is a hashref containing one or more attributes to
match against:
-attributes => { Gene => 'abc-1',
Note => 'confirmed' }
Attribute matching is simple string matching, and multiple attributes are ANDed
together. More complex filtering can be performed using the -callback
option (see below)
If -iterator is true, then the method returns an object reference that implements
the next_seq() method. Each call to next_seq() returns a new Bio::SeqFeatureI object.
If -callback is passed a code reference, the code reference will be invoked on
each feture returned. The code will be passed two arguments consisting of the
current feature and the segment object itself, and must return a true value. If
the code returns a false value, feature retrieval will be aborted.
-callback and -iterator are mutually exclusive options. If -iterator is defined,
then -callback is ignored.
=cut
sub features {
my $self = shift;
my ($sql, @features, $base_start, $rend);
my ($type, $types,$attributes,$rangetype,$iterator,$callback,$start,
$stop,$feature_id,$factory) = $self->_rearrange([qw(TYPE
TYPES
ATTRIBUTES
RANGETYPE
ITERATOR
CALLBACK
START
STOP
FEATURE_ID
FACTORY
)
], @_);
$types ||= $type;
my $srcfeature_id = $self->srcfeature_id;
my $segname = $self->name;
$base_start = $self->start;
$rend = $self->end;
$factory ||= $self->factory;
###########################################################
#
# You can write queries from here to retrieve TOP level
# features (such as gene, blast...) from GUS.
#
# NOTE: a query name MUST follow the format in the conf file,
# for example: in conf file, a gene track like,
#
# [Gene]
# feature = gene:Genbank
#
# Your gene query name MUST be gene:Genbank
#
###########################################################
foreach my $typeHash ( _getUniqueTypes($types) ) {
my @types = keys(%$typeHash);
my $type = shift @types;
my $typeString = $typeHash->{$type};
$sql = $factory->parser->getSQL("Segment.pm", $type);
warn "Couldn't find Segment.pm sql for $type\n" unless $sql;
next unless $sql;
$sql =~ s/(\$\w+)/eval $1/eg;
my $sth = $factory->dbh->prepare($sql);
$sth->execute()
or $self->throw("getting feature query failed");
my @tempfeats = ();
while (my $featureRow = $sth->fetchrow_hashref) {
push @tempfeats, $self->_makeFeature($featureRow, $factory);
}
# filter out blastx. it is used by cryptodb
if($typeString =~ /blastx/i) {
warn "filtering blastx results for feature type: $type";
@tempfeats = _blastx_filter(\@tempfeats);
}
push(@features, @tempfeats);
my $bulkSubFeatureSql = $factory->parser->getSQL("Feature.pm", "$type:bulksubfeatures");
if($bulkSubFeatureSql) {
$bulkSubFeatureSql =~ s/(\$\w+)/eval $1/eg;
$self->_addBulkSubFeatures(\@features, $bulkSubFeatureSql, $factory)
}
my $bulkAttributeSql = $factory->parser->getSQL("Feature.pm", "$type:bulkAttribute");
next unless $bulkAttributeSql;
$bulkAttributeSql =~ s/(\$\w+)/eval $1/eg;
$self->_addBulkAttribute(\@features, $bulkAttributeSql, $factory);
}
if($iterator) {
return DAS::GUSIterator->new(\@features);
} elsif ( wantarray ) {
return @features;
} else {
return \@features;
}
}
sub _addBulkAttribute {
my($self, $features, $bulkAttributeSql, $factory) = @_;
my %featuresById;
map { $featuresById{$_->feature_id} = $_ } @$features;
my $sth = $factory->dbh->prepare($bulkAttributeSql);
$sth->execute()
or $self->throw("getting bulk attribute query failed");
my @bulkAtts;
while (my $featureRow = $sth->fetchrow_hashref) {
my $feature = $featuresById{$$featureRow{'FEATURE_ID'}};
if ($feature) {
$feature->bulkAttributes($featureRow);
}
}
}
sub _addBulkSubFeatures {
my ($self, $features, $subFeatureSql, $factory) = @_;
my %featuresById;
map { $featuresById{$_->feature_id} = $_ } @$features;
my $sth = $factory->dbh->prepare($subFeatureSql);
$sth->execute()
or $self->throw("getting bulk subfeature query failed");
while (my $featureRow = $sth->fetchrow_hashref) {
my $feature = $featuresById{$$featureRow{'PARENT_ID'}};
if ($feature) {
$feature->_addSubFeatureFromRow($featureRow);
} else {
$self->warn("sub feature [" . $$featureRow{'FEATURE_ID'} . "]'s parent feature ["
. $$featureRow{'PARENT_ID'} . "] could not be found. bulk subfeature query is:\n"
. $subFeatureSql);
}
}
}
=head2 _getUniqueTypes
Title : _getUniqueTypes
Usage : $segment->_getUniqueTypes()
Function: filter out the duplicate types and return an array of
unique feature types in 'type_source' format
Returns : an array of feature types
Args : array ref
Status : Private
For example, in the config file
[Gene]
feature = gene:Genbank
[BLASTX]
feature = match:WU_BLASTX
_getUniqeTypes will subsitute ':' with '_'
and return [ gene_Genbank, match_WU_BLASTX ], the latter can be used to
find the corresponding element in SQL xml files.
=cut
sub _getUniqueTypes() {
my $types = shift;
my @uniqtypes = ();
my %seen;
for my $type (@{$types || []}) {
push(@uniqtypes, { $type => $type }) unless $seen{$type}++;
}
return @uniqtypes;
}
sub _makeFeature() {
my ($self, $featureRow, $factory) = @_;
my $type = $$featureRow{'TYPE'};
my $source = $$featureRow{'SOURCE'};
my $feature_id = $$featureRow{'FEATURE_ID'};
my $unique_name = "$type.$feature_id";
$type .= ":$source";
my $feat = DAS::GUS::Segment::Feature->new(
$factory,
$self, # parent
$self->seq_id,
$$featureRow{'STARTM'}, # start
$$featureRow{'END'}, # end
$$featureRow{'TYPE'}.':'.$$featureRow{'SOURCE'}, # type
$$featureRow{'SCORE'}, # score
$$featureRow{'STRAND'}, # strand
$$featureRow{'PHASE'}, # phase
$$featureRow{'NAME'}, # group
$$featureRow{'ATTS'}, # attributes
$unique_name,
$$featureRow{'FEATURE_ID'}, # feature_id
);
$feat->source($source);
# if this is mRNA .. then get the exons ....
# depending on the type, build sub_feature here
$feat;
}
# filter out the blastx output. First, sort the blastx data using sql
# by e-value, match length and start.
# Second, filter features with overlap > 5
sub _blastx_filter {
my $feats = shift;
my $counter = 0;
#my $idx = -1;
my $old_end = 2000000;
my @newfeats = ();
foreach my $f(@$feats) {
my $name = $f->name;
#$idx = $idx + 1;
my $start = $f->start;
my $end = $f->end;
if($start <= $old_end) {
$counter = $counter + 1; # find one;
} else {
$old_end = $end;
push(@newfeats, $f);
$counter = 0;
next;
}
if($counter >= 5) {
#splice(@$feats, $idx, 1);
#$idx = $idx - 1; # reset index
} else {
push(@newfeats, $f);
$old_end = $end;
}
}
return @newfeats;
}
=head2 get_all_SeqFeature, get_SeqFeatures, top_SeqFeatures, all_SeqFeatures
Title : get_all_SeqFeature, get_SeqFeatures, top_SeqFeatures,
all_SeqFeatures
Usage : $s->get_all_SeqFeature()
Function: get the sequence string fro this segment
Several aliases of features() for backword compatibility
Returns : a string
Args : none
Status : Public
=cut
*get_all_SeqFeature = *get_SeqFeatures = *top_SeqFeatures = *all_SeqFeatures = \&features;
=head2 seq
Title : seq
Usage : $s->seq
Function: get the sequence string for this segment
Returns : a string
Args : none
Status : Public
=cut
sub seq {
my $self = shift;
return $self->{'seq'} = shift if @_;
return $self->{'seq'} if( $self->{'seq'});
my ($ref, $class, $base_start, $stop, $strand)
= @{$self}{qw(sourceseq class start end strand)};
my $srcfeature_id = $self->{srcfeature_id};
my $has_start = defined $base_start;
my $has_stop = defined $stop;
$strand ||= 0;
my $reversed;
if($has_start && $has_stop && ($base_start > $stop)) {
$reversed++;
($base_start, $stop) = ($stop, $base_start);
} elsif( $strand < 0) {
$reversed++;
}
my $seqQuery = $self->factory->parser->getSQL("Segment.pm", "get_sequence");
warn "Couldn't find Segment.pm sql for get_sequence\n" unless $seqQuery;
return unless $seqQuery;
$seqQuery =~ s/(\$\w+)/eval $1/eg;
my $sth = $self->factory->dbh->prepare($seqQuery);
$sth->execute();
my ($seq) = $sth->fetchrow_array();
if (!$has_start && !$has_stop) {
# do nothing, sequence is already complete
} elsif (!$has_start) {
$seq = substr($seq, 0, $stop - 1);
} elsif(!$has_stop) {
$seq = substr($seq, $base_start - 1);
} else { # has both start and stop
$seq = substr($seq, $base_start - 1, $stop - $base_start + 1);
}
if($reversed) {
$seq = reverse $seq;
$seq =~ tr/gatcGATC/ctagCTAG/;
}
return $seq;
}
*protein = *dna = \&seq;
=head2 secondary_structure_encodings
Title : secondary_structure_encodings
Usage : $s->secondary_structure_encodings
Function: get the secondary structure prediction scores for segment
Returns : hash ref { secondary_structure_type => string of 0-9 one digit per base }
Args : none
Status : Public
=cut
sub secondary_structure_encodings {
my $self = shift;
my $srcfeature_id = $self->{srcfeature_id};
my $strucQuery = $self->factory->parser->getSQL("Segment.pm", "get_2d_struc");
warn "Couldn't find Segment.pm sql for get_2d_struc\n" unless $strucQuery;
return unless $strucQuery;
$strucQuery =~ s/(\$\w+)/eval $1/eg;
my $sth = $self->factory->dbh->prepare($strucQuery);
$sth->execute();
my $encodings = undef;
while (my ($type, $encoding) = $sth->fetchrow_array()) {
$encodings = {} unless defined($encodings);
$type = 'helix' if $type =~ /^h$/i;
$type = 'coil' if $type =~ /^c$/i;
$type = 'strand' if $type =~ /^e$/i;
$encodings->{$type} = $encoding;
}
unless ($encodings && $encodings->{helix}) {
warn "no structure encodings retrieved by sql: $strucQuery\n";
}
return $encodings;
}
=head2 factory
Title : factory
Usage : $factory = $s->factory
Function: return the segment factory
Returns : a Bio::DasI object
Args : see below
Status : Public
This method returns a Bio::DasI object that can be used to fetch more segments.
This is typically the Bio::DasI object from which the segments was originally
generated.
=cut
sub factory { shift->{factory} }
=head2 srcfeature_id
Title : srcfeature_id
Usage : $obj->srcfeature_id($newval)
Function:
Returns : value of srcfeature_id (a scalar)
Args : on set, new value (a scalar or undef, optional)
=cut
sub srcfeature_id {
my $self = shift;
return $self->{'srcfeature_id'} = shift if @_;
return $self->{'srcfeature_id'};
}
=head2 alphabet
Title : alphabet
Usage : $obj->alphabet($newval)
Function:
Returns : scalar 'dna'
Args : on set, new value ( a scalar or undef, optional )
Status : Public
=cut
sub alphabet {
return 'dna';
}
=head2 display_id, display_name, accession_number, desc
Title : display_id, display_name, accession_number, desc
Usage : $s->display_name()
Function: Alias of name()
Several aliases for name; it may be that these could do something
better than just giving back the name.
Returns : string
Args : none
Status : Public
=cut
*display_id = *display_name = *accession_number = *desc = \&name;
=head2 get_feature_stream
Title : get_feature_stream
Usage :
Function:
Returns :
Args :
Status :
=cut
sub get_feature_stream {
my $self = shift;
my @args = @_;
my $features = $self->features(@args);
return DAS::GUSIterator->new($features);
}
sub get_seq_stream {
my @features = shift->features(@_);
return DAS::GUSIterator->new(\@features);
}
=head2 clone
Title : clone
Usage : $copy = $s->clone
Function: make a copy of this segment
Returns : a Bio::DB::GFF::Segment object
Args : none
Status : Public
=cut
sub clone {
my $self = shift;
my %h = %$self;
return bless \%h, ref($self);
}
=head2 sourceseq
Title : sourceseq
Usage : $obj->sourceseq($newval)
Function: get feature name according to a feature_id
Returns : value of sourceseq(a scalar)
Args : on set, new value ( a scalar or undef, optional )
Status : Public
=cut
sub sourceseq {
my $self = shift;
return $self->name;
}
=head2 abs_ref
Title : abs_ref
Usage : $obj->abs_ref()
Function: Alisas of sourceseq
Alias of sourceseq for backward compatibility
Returns : value of sourceseq ( a scalar )
Args : none
Status : Public
=cut
*abs_ref = \&sourceseq;
=head2 abs_start
Title : abs_start
Usage : $obj->abs_start()
Function: Alias of start
Returns : value of start ( a scalar )
Args : none
Status : Public
=cut
*abs_start = \&start;
=head2 abs_end
Title : abs_end
Usage : $obj->abs_end()
Function: Alias of end
Returns : value of end ( a scalar )
Args : none
Status : Public
=cut
*abs_end = \&end;
=head2 asString
Title : asString
Usage : $s->asString
Function: human-readable string for segment
Returns a human-readable string representing this sequence.
Format is: sourceseq:start,stop
Returns : a string
Args : none
Status : Public
=cut
sub asString {
my $self = shift;
my $label = $self->refseq;
my $start = $self->start;
my $stop = $self->stop;
return "$label:$start,$stop";
}
# implement SeqI abstract method.
# required by BatchDumper plugin. don't yet know what, if anything, needs
# to happen here. This is sufficient at the moment to quash execptions.
# -mheiges
sub primary_seq {
my $self = shift;
}
1;