package Bio::DB::SeqFeature::Store::BedLoader;
use strict;
use Carp 'croak';
use Text::ParseWords 'shellwords';
use Bio::DB::SeqFeature::Store::LoadHelper;
use base 'Bio::DB::SeqFeature::Store::GFF3Loader';
sub create_load_data { #overridden
my $self = shift;
$self->SUPER::create_load_data;
$self->{load_data}{IndexSubfeatures} = $self->index_subfeatures();
$self->{load_data}{TemporaryLoadID} ||= "F0000";
$self->{load_data}{track_conf}{name}||= 'Feature0000';
$self->{load_data}{LoadedTypes} = {};
$self->{load_data}{Helper} =
Bio::DB::SeqFeature::Store::LoadHelper->new($self->{tmpdir});
}
sub load_line {
my $self = shift;
my $line = shift;
chomp $line;
# make sure these are always defined
if ($line =~ /^track/) {
$self->handle_track_conf($line);
} elsif ($line) {
$self->handle_feature($line);
}
}
sub handle_track_conf {
my $self = shift;
my $line = shift;
my $load_data = $self->{load_data};
my %attributes = $self->parse_track_line($line);
$load_data->{track_conf} = \%attributes;
}
sub parse_track_line {
my $self = shift;
my $line = shift;
my @tokens = shellwords($line);
shift @tokens;
return map {split '='} @tokens;
}
sub handle_feature {
my $self = shift;
my $line = shift;
my ($chrom,$chromStart,$chromEnd, # mandatory
$name,$score,$Strand,
$thickStart,$thickEnd,$itemRGB,
$blockCount,$blockSizes,$blockStarts) = split /\s+/,$line;
my $ld = $self->{load_data};
my $load_id = $ld->{TemporaryLoadID}++;
# status reporting
if (++$ld->{count} % 1000 == 0) {
my $now = $self->time();
my $nl = -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";
local $^W = 0; # kill uninit variable warning
$self->msg(sprintf("%d features loaded in %5.2fs (%5.2fs/1000 features)...%s$nl",
$ld->{count},$now - $ld->{start_time},
$now - $ld->{millenium_time},
' ' x 80
));
$ld->{millenium_time} = $now;
}
my $isGene = $thickStart || $blockCount;
my $method = $isGene ? 'mRNA' : 'region';
my $source = $ld->{track_conf}{name}; # bogus, but works
$ld->{LoadedTypes}{"$method:$source"}++;
my $strand = $Strand && $Strand eq '-' ? -1 : +1;
my $start = $chromStart+1;
my $end = $chromEnd;
my %attributes;
%attributes = (RGB => $itemRGB) if $itemRGB && $itemRGB;
# create parent feature
my @args = (-display_name => $name,
-seq_id => $chrom,
-start => $start,
-end => $end,
-strand => $strand,
-score => $score,
-primary_tag => $method,
-source => $source,
-tag => \%attributes,
-attributes => \%attributes,
);
my $feature = $self->sfclass->new(@args);
$feature->object_store($self->store) if $feature->can('object_store'); # for lazy table features
$ld->{CurrentFeature} = $feature;
$ld->{CurrentID} = $load_id;
my $helper = $ld->{Helper};
$helper->indexit($load_id=>1); # index toplevel features
$helper->toplevel($load_id=>1) if !$self->{fast};
$self->store_current_feature(); # this will clear out CurrentFeature and CurrentID
if ($isGene) {
my @children;
# parts is an array of [start,end,method]
my ($parts) = $self->split_gene_bits($chromStart,$chromEnd,
$thickStart,$thickEnd,
$blockCount,$blockSizes,$blockStarts);
for my $u (@$parts) {
my $f = $self->sfclass->new(-seq_id => $chrom,
-start => $u->[0],
-end => $u->[1],
-strand => $strand,
-source => $source,
-score => $score,
-primary_tag => $u->[2],
-attributes => \%attributes,
);
my $id = $ld->{TemporaryLoadID}++;
$ld->{CurrentFeature} = $f;
$ld->{CurrentID} = $id;
$self->store_current_feature();
push @children,$id;
}
# remember parentage using the helper
for my $child_id (@children) {
$helper->add_children($load_id=>$child_id);
}
}
}
sub split_gene_bits {
my $self = shift;
my ($chromStart,$chromEnd,
$thickStart,$thickEnd,
$numBlocks,$blockSizes,$blockStarts) = @_;
# no internal structure, so just create UTRs and one CDS in the middle
# remember that BED format uses 0-based indexing, hence the +1s
unless ($blockSizes) {
my @bits = ([$chromStart+1,$thickStart,'UTR'],
[$thickStart+1,$thickEnd,'CDS'],
[$thickEnd+1,$chromEnd,'UTR']);
return \@bits;
}
# harder -- we have internal exons
my @block_sizes = split ',',$blockSizes;
my @block_starts = split ',',$blockStarts;
croak "Invalid BED file: blockSizes != blockStarts"
unless @block_sizes == @block_starts && @block_sizes == $numBlocks;
my @bits;
for (my $i=0;$i<@block_starts;$i++) {
my $start = $chromStart + $block_starts[$i];
my $end = $chromStart + $block_starts[$i] + $block_sizes[$i];
if ($start < $thickStart) {
if ($end < $thickStart) { # UTR wholly contained in an exon
push @bits,[$start+1,$end,'UTR'];
}
elsif ($end >= $thickStart) { # UTR partially contained in an exon
push @bits,[$start+1,$thickStart,'UTR'];
push @bits,[$thickStart+1,$end,'CDS'];
}
}
elsif ($start < $thickEnd) {
if ($end <= $thickEnd) { # CDS wholly contained in an exon
push @bits,[$start+1,$end,'CDS'];
}
elsif ($end > $thickEnd) { # CDS partially contained in an exon
push @bits,[$start+1,$thickEnd,'CDS'];
push @bits,[$thickEnd+1,$end,'UTR'];
}
}
elsif ($start > $thickEnd) {
push @bits,[$start+1,$end,'UTR']; # UTR wholly contained in an exon
}
elsif ($start == $thickEnd) {
push @bits,[$start+1,$end,'UTR']; # non-coding gene ?!
}
else {
croak "Programmer error when calculating UTR bounds";
}
}
return \@bits;
}
sub loaded_types {
my $self = shift;
my $ld = $self->{load_data};
return keys %{$ld->{LoadedTypes}};
}
1;