The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Bio::DB::SeqFeature::Store::BedLoader;

use strict;
use Carp 'croak';
use Text::ParseWords 'shellwords';
use Bio::DB::SeqFeature::Store::LoadHelper;

use base 'Bio::DB::SeqFeature::Store::GFF3Loader';

sub create_load_data { #overridden
  my $self = shift;
  $self->SUPER::create_load_data;
  $self->{load_data}{IndexSubfeatures}    = $self->index_subfeatures();
  $self->{load_data}{TemporaryLoadID} ||= "F0000";
  $self->{load_data}{track_conf}{name}||= 'Feature0000';
  $self->{load_data}{LoadedTypes}       = {};
    

  $self->{load_data}{Helper}           = 
      Bio::DB::SeqFeature::Store::LoadHelper->new($self->{tmpdir});
}

sub load_line {
    my $self = shift;
    my $line = shift;
    chomp $line;

    # make sure these are always defined
    if ($line =~ /^track/) {
	$self->handle_track_conf($line);
    } elsif ($line) {
	$self->handle_feature($line);
    }
}

sub handle_track_conf {
    my $self = shift;
    my $line = shift;
    my $load_data  = $self->{load_data};
    my %attributes = $self->parse_track_line($line);
    $load_data->{track_conf}  = \%attributes;
}

sub parse_track_line {
    my $self = shift;
    my $line = shift;
    my @tokens = shellwords($line);
    shift @tokens;
    return map {split '='} @tokens;
}

sub handle_feature {
    my $self = shift;
    my $line = shift;
    my ($chrom,$chromStart,$chromEnd, # mandatory
	$name,$score,$Strand,
	$thickStart,$thickEnd,$itemRGB,
	$blockCount,$blockSizes,$blockStarts) = split /\s+/,$line;

    my $ld       = $self->{load_data};
    my $load_id  = $ld->{TemporaryLoadID}++;

    # status reporting
    if (++$ld->{count} % 1000 == 0) {
	my $now = $self->time();
	my $nl = -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";
	local $^W = 0; # kill uninit variable warning
	$self->msg(sprintf("%d features loaded in %5.2fs (%5.2fs/1000 features)...%s$nl",
			   $ld->{count},$now - $ld->{start_time},
			   $now - $ld->{millenium_time},
			   ' ' x 80
		   ));
	$ld->{millenium_time} = $now;
    }


    my $isGene = $thickStart || $blockCount;
    my $method = $isGene ? 'mRNA' : 'region';
    my $source = $ld->{track_conf}{name};  # bogus, but works

    $ld->{LoadedTypes}{"$method:$source"}++;

    my $strand = $Strand && $Strand eq '-' ? -1 : +1;
    my $start  = $chromStart+1;
    my $end    = $chromEnd;

    my %attributes;
    %attributes = (RGB => $itemRGB) if $itemRGB && $itemRGB;

    # create parent feature
    my @args = (-display_name => $name,
		-seq_id       => $chrom,
		-start        => $start,
		-end          => $end,
		-strand       => $strand,
		-score        => $score,
		-primary_tag  => $method,
		-source       => $source,
		-tag          => \%attributes,
		-attributes   => \%attributes,
	);

    my $feature = $self->sfclass->new(@args);    
    $feature->object_store($self->store) if $feature->can('object_store');  # for lazy table features
    
    $ld->{CurrentFeature}    = $feature;
    $ld->{CurrentID}         = $load_id;

    my $helper = $ld->{Helper};
    $helper->indexit($load_id=>1);   # index toplevel features
    $helper->toplevel($load_id=>1) if !$self->{fast};

    $self->store_current_feature();  # this will clear out CurrentFeature and CurrentID

    if ($isGene) {
	my @children;

	# parts is an array of [start,end,method]
	my ($parts) = $self->split_gene_bits($chromStart,$chromEnd,
					     $thickStart,$thickEnd,
					     $blockCount,$blockSizes,$blockStarts);
	for my $u (@$parts) {
	    my $f = $self->sfclass->new(-seq_id => $chrom,
					-start  => $u->[0],
					-end    => $u->[1],
					-strand => $strand,
					-source => $source,
					-score  => $score,
					-primary_tag => $u->[2],
					-attributes  => \%attributes,
		);
	    my $id = $ld->{TemporaryLoadID}++;
	    $ld->{CurrentFeature} = $f;
	    $ld->{CurrentID}      = $id;
	    $self->store_current_feature();
	    push @children,$id;
	}

	# remember parentage using the helper
	for my $child_id (@children) {
	    $helper->add_children($load_id=>$child_id);
	}
    }
}

sub split_gene_bits {
    my $self = shift;
    my ($chromStart,$chromEnd,
	$thickStart,$thickEnd,
	$numBlocks,$blockSizes,$blockStarts) = @_;

    # no internal structure, so just create UTRs and one CDS in the middle
    # remember that BED format uses 0-based indexing, hence the +1s
    unless ($blockSizes) {  
	my @bits = ([$chromStart+1,$thickStart,'UTR'],
		    [$thickStart+1,$thickEnd,'CDS'],
		    [$thickEnd+1,$chromEnd,'UTR']);
	return \@bits;
    }

    # harder -- we have internal exons
    my @block_sizes  = split ',',$blockSizes;
    my @block_starts = split ',',$blockStarts;
    croak "Invalid BED file: blockSizes != blockStarts"
	unless @block_sizes == @block_starts && @block_sizes == $numBlocks;

    my @bits;
    for (my $i=0;$i<@block_starts;$i++) {
	my $start = $chromStart + $block_starts[$i];	
	my $end   = $chromStart + $block_starts[$i] + $block_sizes[$i];

	if ($start < $thickStart) {
	    if ($end < $thickStart) {          # UTR wholly contained in an exon
		push @bits,[$start+1,$end,'UTR'];
	    }
	    elsif ($end >= $thickStart) {      # UTR partially contained in an exon
		push @bits,[$start+1,$thickStart,'UTR'];
		push @bits,[$thickStart+1,$end,'CDS'];
	    }
	}

	elsif ($start < $thickEnd) {
	    if ($end <= $thickEnd) {           # CDS wholly contained in an exon
		push @bits,[$start+1,$end,'CDS'];
	    }
	    elsif ($end > $thickEnd) {         # CDS partially contained in an exon
		push @bits,[$start+1,$thickEnd,'CDS'];
		push @bits,[$thickEnd+1,$end,'UTR'];
	    }
	}

	elsif ($start > $thickEnd) {
	    push @bits,[$start+1,$end,'UTR'];  # UTR wholly contained in an exon
	}

	elsif ($start == $thickEnd) {
	    push @bits,[$start+1,$end,'UTR'];  # non-coding gene ?!
	}
	else {
	    croak "Programmer error when calculating UTR bounds";
	}

    }

    return \@bits;
}

sub loaded_types {
    my $self = shift;
    my $ld   = $self->{load_data};
    return keys %{$ld->{LoadedTypes}};
}

1;