Bio/DB/SeqFeature/Store/berkeleydb3.pm

package Bio::DB::SeqFeature::Store::berkeleydb3;

# $Id: berkeleydb3.pm 15987 2009-08-18 21:08:55Z lstein $
# faster implementation of berkeleydb

=head1 NAME

Bio::DB::SeqFeature::Store::berkeleydb3 -- Storage and retrieval of sequence
annotation data in Berkeleydb files

=head1 SYNOPSIS

  # Create a feature database from scratch
  $db     = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                             -dsn     => '/var/databases/fly4.3',
                                             -create  => 1);

  # get a feature from somewhere
  my $feature = Bio::SeqFeature::Generic->new(...);

  # store it
  $db->store($feature) or die "Couldn't store!";

=head1 DESCRIPTION

This is a faster version of the berkeleydb storage adaptor for
Bio::DB::SeqFeature::Store. It is used automatically when you create a
new database with the original berkeleydb adaptor. When opening a
database created under the original adaptor, the old code is used for
backward compatibility.

Please see L<Bio::DB::SeqFeature::Store::berkeleydb> for full usage
instructions.

=head1 BUGS

This is an early version, so there are certainly some bugs. Please
use the BioPerl bug tracking system to report bugs.

=head1 SEE ALSO

L<bioperl>,
L<Bio::DB::SeqFeature>,
L<Bio::DB::SeqFeature::Store>,
L<Bio::DB::SeqFeature::GFF3Loader>,
L<Bio::DB::SeqFeature::Segment>,
L<Bio::DB::SeqFeature::Store::memory>,
L<Bio::DB::SeqFeature::Store::DBI::mysql>,

=head1 AUTHOR

Lincoln Stein E<lt>lincoln.stein@gmail.comE<gt>.

Copyright (c) 2009 Ontario Institute for Cancer Research

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut

use strict;
use base 'Bio::DB::SeqFeature::Store::berkeleydb';
use DB_File;
use Fcntl qw(O_RDWR O_CREAT :flock);
use Bio::DB::GFF::Util::Rearrange 'rearrange';

# can't have more sequence ids than this
use constant MAX_SEQUENCES => 1_000_000_000;

# used to construct the bin key
use constant C1  => 500_000_000; # limits chromosome length to 500 megabases
use constant C2  => 1000*C1;     # at most 1000 chromosomes

use constant BINSIZE       => 10_000;
use constant MININT        => -999_999_999_999;
use constant MAXINT        => 999_999_999_999;
use constant SUMMARY_BIN_SIZE => 1000;

sub version { return 3.0 }

sub open_index_dbs {
    my $self = shift;
    my ($flags,$create) = @_;

    # Create the main index databases; these are DB_BTREE implementations with duplicates allowed.
    $DB_BTREE->{flags}  = R_DUP;

    my $string_cmp          = DB_File::BTREEINFO->new;
    $string_cmp->{flags}    = R_DUP;
    $string_cmp->{compare}  = sub { lc $_[0] cmp lc $_[1] };

    my $numeric_cmp         = DB_File::BTREEINFO->new;
    $numeric_cmp->{flags}   = R_DUP;
    $numeric_cmp->{compare} = sub { $_[0] <=> $_[1] };

    for my $idx ($self->_index_files) {
        my $path    = $self->_qualify("$idx.idx");
        my %db;
        my $dbtype  = $idx eq 'locations' ? $numeric_cmp
                     :$idx eq 'summary'   ? $numeric_cmp
                     :$idx eq 'types'     ? $numeric_cmp
                     :$idx eq 'seqids'    ? $DB_HASH
                     :$idx eq 'typeids'   ? $DB_HASH
                     :$string_cmp;

        tie(%db,'DB_File',$path,$flags,0666,$dbtype)
            or $self->throw("Couldn't tie $path: $!");
        %db = () if $create;
        $self->index_db($idx=>\%db);
    }

}

sub seqid_db  { shift->index_db('seqids')    }
sub typeid_db { shift->index_db('typeids') }

sub _delete_databases {
    my $self = shift;
    $self->SUPER::_delete_databases;
}

# given a seqid (name), return its denormalized numeric representation
sub seqid_id {
    my $self   = shift;
    my $seqid  = shift;
    my $db     = $self->seqid_db;
    return $db->{lc $seqid};
}

sub add_seqid {
    my $self  = shift;
    my $seqid = shift;

    my $db    = $self->seqid_db;
    my $key   = lc $seqid;
    $db->{$key} = ++$db->{'.nextid'} unless exists $db->{$key};
    die "Maximum number of sequence ids exceeded. This module can handle up to ",
        MAX_SEQUENCES," unique ids" if $db->{$key} > MAX_SEQUENCES;
    return $db->{$key};
}

# given a seqid (name), return its denormalized numeric representation
sub type_id {
    my $self   = shift;
    my $typeid  = shift;
    my $db     = $self->typeid_db;
    return $db->{$typeid};
}

sub add_typeid {
    my $self  = shift;
    my $typeid = shift;

    my $db      = $self->typeid_db;
    my $key     = lc $typeid;
    $db->{$key} = ++$db->{'.nextid'} unless exists $db->{$key};
    return $db->{$key};
}

sub _seq_ids {
    my $self = shift;
    if (my $fa = $self->{fasta_db}) {
        if (my @s = eval {$fa->ids}) {
            return @s;
        }
    } 
    my $l = $self->seqid_db or return;
    return grep {!/^\./} keys %$l;
}

sub _index_files {
    return (shift->SUPER::_index_files,'seqids','typeids','summary');
}

sub _update_indexes {
    my $self = shift;
    my $obj  = shift;
    defined (my $id   = $obj->primary_id) or return;
    $self->SUPER::_update_indexes($obj);
    $self->_update_seqid_index($obj,$id);
}

sub _update_seqid_index {
    my $self = shift;
    my ($obj,$id,$delete) = @_;
    my $seq_name = $obj->seq_id;
    $self->add_seqid(lc $seq_name);
}

sub _update_type_index {
    my $self = shift;
    my ($obj,$id,$delete) = @_;
    my $db = $self->index_db('types')
        or $self->throw("Couldn't find 'types' index file");

    my $key    = $self->_obj_to_type($obj);
    my $typeid = $self->add_typeid($key);
    $self->update_or_delete($delete,$db,$typeid,$id);
}

sub _obj_to_type {
    my $self = shift;
    my $obj  = shift;
    my $tag         = $obj->primary_tag;
    my $source_tag  = $obj->source_tag || '';
    return unless defined $tag;

    $tag           .= ":$source_tag";
    return lc $tag;
}

sub types {
    my $self = shift;
    eval "require Bio::DB::GFF::Typename" 
        unless Bio::DB::GFF::Typename->can('new');
    my $db   = $self->typeid_db;
    return grep {!/^\./} map {Bio::DB::GFF::Typename->new($_)} keys %$db;
}

sub _id2type {
    my $self = shift;
    my $wanted_id   = shift;

    my $db = $self->typeid_db;
    while (my($key,$id) = each %$db) {
        next if $key =~ /^\./;
        return $key if $id == $wanted_id;
    }
    return;
}

# return a hash of typeids that match a human-readable type
sub _matching_types {
    my $self  = shift;
    my $types = shift;
    my @types = ref $types eq 'ARRAY' ?  @$types : $types;
    my $db   = $self->typeid_db;

    my %result;
    my @all_types;

    for my $type (@types) {
        my ($primary_tag,$source_tag);
        if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
            $primary_tag = $type->method;
            $source_tag  = $type->source;
        } else {
            ($primary_tag,$source_tag) = split ':',$type,2;
        }
        if (defined $source_tag) {
            my $id = $db->{lc "$primary_tag:$source_tag"};
            $result{$id}++ if defined $id;
        } else {
            @all_types  = $self->types unless @all_types;
            $result{$db->{$_}}++ foreach grep {/^$primary_tag:/} @all_types;
        }
    }
    return \%result;
}

sub _update_location_index {
    my $self = shift;
    my ($obj,$id,$delete) = @_;

    my $db = $self->index_db('locations')
        or $self->throw("Couldn't find 'locations' index file");

    my $seq_id      = $obj->seq_id || '';
    my $start       = $obj->start  || '';
    my $end         = $obj->end    || '';
    my $strand      = $obj->strand;
    my $bin_min     = int $start/BINSIZE;
    my $bin_max     = int $end/BINSIZE;

    my $typeid      = $self->add_typeid($self->_obj_to_type($obj));
    my $seq_no      = $self->add_seqid($seq_id);

    for (my $bin = $bin_min; $bin <= $bin_max; $bin++ ) {
        my $key = $seq_no * MAX_SEQUENCES + $bin;
        $self->update_or_delete($delete,$db,$key,pack("i5",$id,$start,$end,$strand,$typeid));
    }

}

sub _features {
    my $self = shift;
    my ($seq_id,$start,$end,$strand,
        $name,$class,$allow_aliases,
        $types,
        $attributes,
        $range_type,
        $iterator
       ) = rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],'STRAND',
                      'NAME','CLASS','ALIASES',
                      ['TYPES','TYPE','PRIMARY_TAG'],
                      ['ATTRIBUTES','ATTRIBUTE'],
                      'RANGE_TYPE',
                      'ITERATOR',
                     ],@_);

    my (@from,@where,@args,@group);
    $range_type ||= 'overlaps';

    my @result;
    unless (defined $name or defined $seq_id or defined $types or defined $attributes) {
        my $is_indexed = $self->index_db('is_indexed');
        @result = $is_indexed ? grep {$is_indexed->{$_}} keys %{$self->db}
                              : grep { !/^\./ }keys %{$self->db};
    }

    my %found = ();
    my $result = 1;

    if (defined($name)) {
        # hacky backward compatibility workaround
        undef $class if $class && $class eq 'Sequence';
        $name     = "$class:$name" if defined $class && length $class > 0;
        $result &&= $self->filter_by_name($name,$allow_aliases,\%found);
    }

    if (defined $seq_id) { # location with or without types
        my $typelist = defined $types ? $self->_matching_types($types) : undef;
        $result &&= $self->filter_by_type_and_location(
            $seq_id, $start, $end, $strand, $range_type, $typelist, \%found
        );
    }

    elsif (defined $types) { # types without location
        $result &&= $self->filter_by_type($types,\%found);
    }

    if (defined $attributes) {
      $result &&= $self->filter_by_attribute($attributes,\%found);
    }

    push @result,keys %found if $result;
    return $iterator ? Bio::DB::SeqFeature::Store::berkeleydb::Iterator->new($self,\@result)
                     : map {$self->fetch($_)} @result;
}

sub filter_by_type {
    my $self = shift;
    my ($types,$filter) = @_;
    my @types = ref $types eq 'ARRAY' ?  @$types : $types;

    my $index = $self->index_db('types');
    my $db    = tied(%$index);

    my @results;

    for my $type (@types) {
        my ($primary_tag,$source_tag);
        if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
            $primary_tag = $type->method;
            $source_tag  = $type->source;
        } else {
            ($primary_tag,$source_tag) = split ':',$type,2;
        }
        $source_tag ||= '';
        $primary_tag  = quotemeta($primary_tag);
        $source_tag    = quotemeta($source_tag);
        my $match = length $source_tag ? "^$primary_tag:$source_tag\$" : "^$primary_tag:";
        my $key      = lc "$primary_tag:$source_tag";
        my $value;

        # If filter is already provided, then it is usually faster to
        # fetch each object.
        if (%$filter) {  
            for my $id (keys %$filter) {
                my $obj = $self->_fetch($id) or next;
                push @results,$id if $obj->type =~ /$match/i;
            }

        }

        else {
            my $types   = $self->typeid_db;
            my @typeids = map {$types->{$_}} grep {/$match/} keys %$types;
            for my $t (@typeids) {
                my $k = $t;
                for (my $status = $db->seq($k,$value,R_CURSOR);
                    $status == 0 && $k == $t;
                    $status = $db->seq($k,$value,R_NEXT)) {
                    next if %$filter && !$filter->{$value};  # don't even bother
                    push @results,$value;
                }
            }
        }
    }
    $self->update_filter($filter,\@results);
}

sub filter_by_type_and_location {
    my $self = shift;
    my ($seq_id,$start,$end,$strand,$range_type,$typelist,$filter) = @_;
    $strand ||= 0;

    my $index    = $self->index_db('locations');
    my $db       = tied(%$index);

    my $binstart = defined $start ? int $start/BINSIZE : 0;
    my $binend   = defined $end   ? int $end/BINSIZE   : MAX_SEQUENCES-1;

    my %seenit;
    my @results;

    $start = MININT  if !defined $start;
    $end   = MAXINT  if !defined $end;

    my $seq_no = $self->seqid_id($seq_id);
    return unless defined $seq_no;

    if ($range_type eq 'overlaps' or $range_type eq 'contains') {
        my $keystart = $seq_no * MAX_SEQUENCES + $binstart;
        my $keystop  = $seq_no * MAX_SEQUENCES + $binend;
        my $value;

        for (my $status = $db->seq($keystart,$value,R_CURSOR);
             $status == 0 && $keystart <= $keystop;
             $status = $db->seq($keystart,$value,R_NEXT)) {
            my ($id,$fstart,$fend,$fstrand,$ftype) = unpack("i5",$value);
            next if $seenit{$id}++;
            next if $strand   && $fstrand != $strand;
            next if $typelist && !$typelist->{$ftype};
            if ($range_type eq 'overlaps') {
                next unless $fend >= $start && $fstart <= $end;
            }
            elsif ($range_type eq 'contains') {
                next unless $fstart >= $start && $fend <= $end;
            }
            next if %$filter && !$filter->{$id};  # don't bother
            push @results,$id;
        }
    }

    # for contained in, we look for features originating and terminating outside the specified range
    # this is incredibly inefficient, but fortunately the query is rare (?)
    elsif ($range_type eq 'contained_in') {
        my $keystart = $seq_no * MAX_SEQUENCES;
        my $keystop  = $seq_no * MAX_SEQUENCES + $binstart;
        my $value;

        # do the left part of the range
        for (my $status = $db->seq($keystart,$value,R_CURSOR);
             $status == 0 && $keystart <= $keystop;
             $status = $db->seq($keystart,$value,R_NEXT)) {
            my ($id,$fstart,$fend,$fstrand,$ftype) = unpack("i5",$value);
            next if $seenit{$id}++;
            next if $strand && $fstrand != $strand;
            next if $typelist && !$typelist->{$ftype};
            next unless $fstart <= $start && $fend >= $end;
            next if %$filter && !$filter->{$id};  # don't bother
            push @results,$id;
        }

        # do the right part of the range
        $keystart = $seq_no*MAX_SEQUENCES+$binend;
        for (my $status = $db->seq($keystart,$value,R_CURSOR);
             $status == 0;
             $status = $db->seq($keystart,$value,R_NEXT)) {
            my ($id,$fstart,$fend,$fstrand,$ftype) = unpack("i5",$value);
            next if $seenit{$id}++;
            next if $strand && $fstrand != $strand;
            next unless $fstart <= $start && $fend >= $end;
            next if $typelist && !$typelist->{$ftype};
            next if %$filter && !$filter->{$id};  # don't bother
            push @results,$id;
        }

    }

    $self->update_filter($filter,\@results);
}

sub build_summary_statistics {
    my $self = shift;

    my $insert = $self->index_db('summary');
    %$insert   = ();

    my $current_bin = -1;
    my (%residuals,$last_bin);

    my $le = -t \*STDERR ? "\r" : "\n";

    print STDERR "\n";

    # iterate through all the indexed features
    my $sbs      = SUMMARY_BIN_SIZE;

    # Sadly we have to do this in two steps. In the first step, we sort
    # features by typeid,seqid,start. In the second step, we read through
    # this sorted list. To avoid running out of memory, we use a db_file
    # temporary database
    my $fh   = File::Temp->new() or $self->throw("Could not create temporary file '$name' for sorting: $!");
    my $name = $fh->filename;
    my %sort;
    my $num_cmp_tre         = DB_File::BTREEINFO->new;
    $num_cmp_tree->{compare} = sub { $_[0] <=> $_[1] };
    $num_cmp_tree->{flags}   = R_DUP;
    my $s = tie %sort, 'DB_File', $name, O_CREAT|O_RDWR, 0666, $num_cmp_tree 
        or $self->throw("Could not create Berkeley DB in temporary file '$name': $!");

    my $index    = $self->index_db('locations');
    my $db       = tied(%$index);
    my $keystart = 0;
    my ($value,$count);
    my %seenit;

    for (my $status = $db->seq($keystart,$value,R_CURSOR);
         $status == 0;
         $status  = $db->seq($keystart,$value,R_NEXT)) {
        my ($id,$start,$end,$strand,$typeid) = unpack('i5',$value);
        next if $seenit{$id}++;

        print STDERR $count," features sorted$le" if ++$count % 1000 == 0;
        my $seqid = int($keystart / MAX_SEQUENCES);
        my $key   = $self->_encode_summary_key($typeid,$seqid,$start-1);
        $sort{$key}=$end;
    }
    print STDERR "COUNT = $count\n";
    
    my ($current_type,$current_seqid,$end);
    my $cum_count = 0;

    $keystart = 0;
    $count    = 0;

    # the second step allows us to iterate through this
    for (my $status = $s->seq($keystart,$end,R_CURSOR);
         $status == 0;
         $status  = $s->seq($keystart,$end,R_NEXT)) {

        print STDERR $count," features processed$le" if ++$count % 1000 == 0;
        my ($typeid,$seqid,$start) = $self->_decode_summary_key($keystart);

        my $bin   = int($start/$sbs);
        
        # because the input is sorted by start, no more features will contribute to the 
        # current bin so we can dispose of it
        if ($bin != $current_bin) {
            if ($seqid != $current_seqid or $typeid != $current_type) {
                # load all bins left over
                $self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid);
                %residuals = () ;
                $cum_count = 0;
            } else {
                # load all up to current one
                $self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid,$current_bin); 
            }
        }

        $last_bin = $current_bin;
        ($current_seqid,$current_type,$current_bin) = ($seqid,$typeid,$bin);

        # summarize across entire spanned region
        my $last_bin = int(($end-1)/$sbs);
        for (my $b=$bin;$b<=$last_bin;$b++) {
            $residuals{$b}++;
        }
    }

    # handle tail case
    # load all bins left over
    $self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid);

    undef %sort;
    undef $fh;
}

sub _load_bins {
    my $self = shift;
    my ($insert,$residuals,$cum_count,$typeid,$seqid,$stop_after) = @_;
    for my $b (sort {$a<=>$b} keys %$residuals) {
        last if defined $stop_after and $b > $stop_after;
        $$cum_count += $residuals->{$b};
        my $key         = $self->_encode_summary_key($typeid,$seqid,$b);
        $insert->{$key} = $$cum_count;
        delete $residuals->{$b}; # no longer needed
    }
}

sub coverage_array {
    my $self = shift;
    my ($seq_name,$start,$end,$types,$bins) = 
        rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],
                   ['TYPES','TYPE','PRIMARY_TAG'],'BINS'],@_);

    $bins  ||= 1000;
    $start ||= 1;
    unless ($end) {
        my $segment = $self->segment($seq_name) or $self->throw("unknown seq_id $seq_name");
        $end        = $segment->end;
    }

    my $binsize = ($end-$start+1)/$bins;
    my $seqid   = $self->seqid_id($seq_name) || 0;

    return [] unless $seqid;

    # where each bin starts
    my @his_bin_array = map {$start + $binsize * $_}       (0..$bins);
    my @sum_bin_array = map {int(($_-1)/SUMMARY_BIN_SIZE)} @his_bin_array;

    my $interval_stats_idx = $self->index_db('summary');
    my $db                 = tied(%$interval_stats_idx);
    my $t                  = $self->_matching_types($types);

    my (%bins,$report_tag);
    for my $typeid (sort keys %$t) {
        $report_tag ||= $typeid;

        for (my $i=0;$i<@sum_bin_array;$i++) {
            my $cum_count;
            my $bin = $sum_bin_array[$i];
            my $key = $self->_encode_summary_key($typeid,$seqid,$bin);
            my $status = $db->seq($key,$cum_count,R_CURSOR);
            next unless $status == 0;
            push @{$bins{$typeid}},[$bin,$cum_count];
        }
    }
    
    my @merged_bins;
    my $firstbin = int(($start-1)/$binsize);
    for my $type (keys %bins) {
        my $arry       = $bins{$type};
        my $last_count = $arry->[0][1]-1;
        my $last_bin   = -1;
        my $i          = 0;
        my $delta;
        for my $b (@$arry) {
            my ($bin,$count) = @$b;
            $delta              = $count - $last_count if $bin > $last_bin;
            $merged_bins[$i++]  = $delta;
            $last_count         = $count;
            $last_bin           = $bin;
        }
    }
    my $returned_type = $self->_id2type($report_tag);
    return wantarray ? (\@merged_bins,$returned_type) : \@merged_bins;
}


sub _encode_summary_key {
    my $self                 = shift;
    my ($typeid,$seqid,$bin) = @_;
    $self->throw('Cannot index chromosomes larger than '.C1*SUMMARY_BIN_SIZE/1e6.' megabases')
        if $bin > C1;
    return ($typeid-1)*C2 + ($seqid-1)*C1 + $bin;
}

sub _decode_summary_key {
    my $self   = shift;
    my $key    = shift;
    my $typeid   = int($key/C2);
    my $residual =     $key%C2;
    my $seqid    = int($residual/C1);
    my $bin      =     $residual%C1;
    return ($typeid+1,$seqid+1,$bin);
}


1;
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)