The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Bio::DB::Bam::AlignWrapper;

# $Id$

=head1 NAME

Bio::DB::Bam::AlignWrapper -- Add high-level methods to Bio::DB::Bam::Alignment

=head1 SYNOPSIS

See L<Bio::DB::Bam::Alignment>.

=head1 DESCRIPTION

This is a wrapper around Bio::DB::Bam::Alignment that adds the
following high-level methods. These are described in detail in
L<Bio::DB::Bam::Alignment/High-level Bio::DB::Bam::Alignment methods>.

 add_segment()         add a new subfeature to split alignments
 get_SeqFeatures()     fetch subfeatures from split alignments
 split_splices()       process cigar strings to produce split alignments
 expand_flags()        return true if flags should be expanded into tags
 seq_id()              return human-readable reference sequence name
 seq()                 return Bio::PrimarySeq object for reference sequence
 subseq()              return a subsequence across the indicated range
 mate_seq_id()         return human-readable mate reference sequence name
 dna()                 return the DNA of the reference sequence
 tam_line()            return the text representation of the alignment
 attributes()          synonym for get_tag_values()
 get_all_tags()        return all tag names
 get_tag_values()      return the values of the given tag
 has_tag()             return true if the given tag is defined

=head1 SEE ALSO

L<Bio::Perl>, L<Bio::DB::Sam>, L<Bio::DB::Bam::Constants>

=head1 AUTHOR

Lincoln Stein E<lt>lincoln.stein@oicr.on.caE<gt>.
E<lt>lincoln.stein@bmail.comE<gt>

Copyright (c) 2009 Ontario Institute for Cancer Research.

This package and its accompanying libraries is free software; you can
redistribute it and/or modify it under the terms of the GPL (either
version 1, or at your option, any later version) or the Artistic
License 2.0.  Refer to LICENSE for the full license text. In addition,
please see DISCLAIMER.txt for disclaimers of warranty.

=cut


use strict;
use Bio::DB::Sam::Constants;

our $AUTOLOAD;
use Carp 'croak';

sub new {
    my $package = shift;
    my ($align,$sam) = @_;

    my $self = bless {sam   => $sam,
		      align => $align},ref $package || $package;

    $self->add_segment($self->split_splices)
	if $sam->split_splices && $align->cigar_str =~ /N/;

    return $self; 
}

sub AUTOLOAD {
  my($pack,$func_name) = $AUTOLOAD=~/(.+)::([^:]+)$/;
  return if $func_name eq 'DESTROY';

  no strict 'refs';
  $_[0] or die "autoload called for non-object symbol $func_name";
  croak qq(Can't locate object method "$func_name" via package "$pack")
      unless $_[0]->{align}->can($func_name);

  *{"${pack}::${func_name}"} = sub { shift->{align}->$func_name(@_) };

  shift->$func_name(@_);
}

sub score {shift->{align}->qual}

sub can {
    my $self = shift;
    return 1 if $self->SUPER::can(@_);
    return ref $self && $self->{align}->can(@_);
}

sub add_segment {
    my $self     = shift;
    my @subfeat  = @_;
    $self->{segments} ||= [];
    push @{$self->{segments}},@subfeat;
}

sub get_SeqFeatures {
    my $self = shift;
    return unless $self->{segments};
    return @{$self->{segments}};
}

sub split_splices {
    my $self  = shift;
    my $cigar = $self->cigar_array;
    my @results;

    my $start    = 0;
    my $end      = 0;
    my $skip     = 0;
    my $partial_cigar = '';

    # in case sequence is missing?
    my $qseq = $self->qseq;
    $qseq  ||= 'N' x $self->length;

    for my $op (@$cigar,['N',0]) {
	my ($operation,$count) = @$op;

	if ($operation eq 'N') {
	    my $s = $self->start + $start   + $skip;
	    my $e = $self->start + $end - 1 + $skip;
	    my $f = Bio::DB::Bam::SplitAlignmentPart->new(-name   => $self->display_name,
							  -start  => $s,
							  -end    => $e,
							  -seq_id => $self->seq_id,
							  -strand => $self->strand,
							  -seq    => [$self,$start+$skip,$end-$start], # deferred rendering
							  -type   => $self->type);

	    $f->hit(-name   => $self->display_name,
		    -seq_id => $self->display_name,
		    -start  => $start+1,
		    -end    => $end,
		    -strand => $self->strand,
		    -seq    => substr($qseq,$start,$end-$start),
		);
	    $f->cigar_str($partial_cigar);
	    $partial_cigar = '';

	    push @results,$f;
	    $start += $end-$start;
	} else {
	    $partial_cigar .= "$operation$count";
	}
	$end  += $count if $operation =~ /^[MDSHP]/i;
	$skip += $count if $operation eq 'N';
	if ($operation eq 'H' and $start == 0) {
	    $qseq = 'N' x $count . $qseq;
	}
    }
    return @results;
}

sub expand_flags {
    shift->{sam}->expand_flags(@_);
}

sub query {
    my $self = shift;
    return Bio::DB::Bam::Query->new($self);
}

sub target {
    my $self = shift;
    return Bio::DB::Bam::Target->new($self);
}

sub hit { shift->target(@_); }

sub seq_id {
    my $self = shift;
    my $tid  = $self->tid;
    return if $tid < 0 || $self->unmapped;
    $self->{sam}->target_name($tid);
}

sub mate_seq_id {
    my $self = shift;
    my $tid  = $self->mtid;
    return if $tid < 0 || $self->munmapped;
    $self->{sam}->target_name($tid);
}

sub abs_ref    { shift->seq_id }
sub abs_start  { shift->start  }
sub abs_end    { shift->end    }
sub low        { shift->start  }
sub high       { shift->end    }
sub type       { shift->primary_tag }
sub method     { shift->primary_tag }
sub source     { return shift->source_tag; }
sub name       { shift->qname }
sub class      { shift->primary_tag }

sub seq      {
    my $self   = shift;
    my $dna    = $self->dna;
    return Bio::PrimarySeq->new(-seq => $dna,
				-id  => $self->seq_id);
}

sub subseq {
    my $self = shift;
    my ($start,$end) = @_;
    $start = 1 if $start < 1;
    $end   = $self->high if $end > $self->high;
    my $dna = $self->dna;
    return Bio::PrimarySeq->new(-seq=>substr($dna,
					     $start-1,
					     $end-$start+1)
				);
}

sub padded_alignment {
    my $self  = shift;

    my $cigar = $self->cigar_array;

    my $sdna  = $self->dna;
    my $tdna  = $self->query->dna;

    my ($pad_source,$pad_target,$pad_match);
    for my $event (@$cigar) {
	my ($op,$count) = @$event;
	if ($op eq 'I' || $op eq 'S') {
	    $pad_source .= '-' x $count;
	    $pad_target .= substr($tdna,0,$count,'');
	    $pad_match  .= ' ' x $count;
	}
	elsif ($op eq 'D' || $op eq 'N') {
	    $pad_source .= substr($sdna,0,$count,'');
	    $pad_target .= '-' x $count;
	    $pad_match  .= ' ' x $count;
	}
	elsif ($op eq 'P') {
	    $pad_source .= '*' x $count;
	    $pad_target .= '*' x $count;
	    $pad_match  .= ' ' x $count;
	}
	elsif ($op eq 'H') {
	    # nothing needs to be done in this case
	} else {  # everything else is assumed to be a match -- revisit
	    $pad_source .= substr($sdna,0,$count,'');
	    $pad_target .= substr($tdna,0,$count,'');
	    $pad_match  .= '|' x $count;
	}
    }
    return ($pad_source,$pad_match,$pad_target);
}

sub dna {
    my $self = shift;

    my $sam  = $self->{sam};
    if (my $md   = $self->get_tag_values('MD')) {  # try to use MD string
	my $qseq = $self->qseq;

	#preprocess qseq using cigar array
	my $cigar = $self->cigar_array;
	my $seq   = '';
	for my $op (@$cigar) {
	    my ($operation,$count) = @$op;
	    if ($operation eq 'M') {
		$seq .= substr($qseq,0,$count,''); # include these residues
	    } elsif ($operation eq 'S' or $operation eq 'I') {
		substr($qseq,0,$count,'');         # skip soft clipped and inserted residues
	    }
	}

	my $start = 0;
	my $result;
	while ($md =~ /(\d+)|\^([gatcn]+)|([gatcn]+)/ig) {
	    if (defined $1) {
		$result .= substr($seq,$start,$1);
		$start  += $1;
	    } elsif ($2) {
		$result .= $2;
	    } elsif ($3) {
		$result .= $3;
		$start  += length $3;
	    }
	}
	return $result;
    }

    else {
	return $self->{sam}->seq($self->seq_id,$self->start,$self->end);
    }
}

sub tseq {
    shift->dna(@_);
}

sub attributes {
    my $self = shift;
    my $tag  = shift;
    if (defined $tag) {
	return $self->get_tag_values($tag);
    } else {
	return map {$_=>$self->get_tag_values($_)} $self->get_all_tags;
    }
}

sub get_all_tags {
    my $self      = shift;
    return $self->{align}->get_all_tags(@_)
	if $self->expand_flags;
    return ($self->aux_keys,'FLAGS');
}

sub get_tag_values {
    my $self = shift;
    my $tag  = shift;
    defined $tag or return;

    return $self->{align}->get_tag_values($tag) 
	if $self->expand_flags;
    if ($tag eq 'FLAGS') {
	$self->flag_str;
    } else {
	$self->aux_get($tag);
    }
}

sub has_tag {
    my $self = shift;
    my $tag  = shift;
    defined $tag or return;
    $self->{align}->get_tag_values($tag) 
	if $self->expand_flags;
    if ($tag eq 'FLAGS') {
	return 1;
    } else {
	my %keys = map {$_=>1} $self->aux_keys;
	return exists $keys{uc $tag};
    }
}

sub gff_string { shift->gff3_string(@_) }

sub gff3_string {
    my $self = shift;
    my $recurse   = shift;
    my $parent_id = shift;

    my $group      = $self->format_attributes($parent_id);
    my $name       = $self->name;
    my $id         = $self->primary_id;

    my $class = $self->class;
    my $strand = ('-','.','+')[$self->strand+1];
    my $p = join("\t",
		 $self->seq_id||'.',
		 $self->source||'.',
		 $self->method||'.',
		 $self->start||'.',
		 $self->stop||'.',
		 defined($self->score) ? $self->score : '.',
		 $strand||'.',
		 defined($self->phase) ? $self->phase : '.',
		 $group||'');
    my @rsf = $self->get_SeqFeatures;
    return join("\n",
		$p,
		map {$_->gff3_string($id)} @rsf);
}

sub phase { return } 

sub escape {
  my $self     = shift;
  my $toencode = shift;
  $toencode    =~ s/([^a-zA-Z0-9_.:?^*\(\)\[\]@!+-])/uc sprintf("%%%02x",ord($1))/eg;
  $toencode;
}


sub format_attributes {
  my $self        = shift;
  my $parent_id   = shift;

  my @tags = $self->get_all_tags;
  my @result;
  for my $t (@tags) {
    my @values = $self->each_tag_value($t);
    push @result,join '=',$self->escape($t),join(',', map {$self->escape($_)} @values) if @values;
  }
  my $id        = $self->escape($self->primary_id);

  my $name = $self->display_name;
  unshift @result,"ID=".$id                                    if defined $id;
  unshift @result,"Parent=".$parent_id                         if defined $parent_id;
  unshift @result,"Name=".$self->escape($name)                 if defined $name;
  return join ';',@result;
}

sub tam_line {
    my $self = shift;
    return join ("\t",
		 $self->qname,
		 $self->flag,
		 $self->tid >= 0 ? $self->{sam}->target_name($self->tid) : '*',
		 $self->pos+1,
		 $self->qual,
		 $self->cigar_str || '*',
		 $self->mtid >= 0 ? ($self->mtid == $self->tid ? '=' : $self->{sam}->target_name($self->mtid)) : '*',
		 $self->mpos+1,
		 $self->isize,
		 $self->qseq,
		 join('',map{chr($_+33)} $self->qscore),
		 $self->aux || ()
	);
}

package Bio::DB::Bam::SplitAlignmentPart;

use base 'Bio::SeqFeature::Lite';

sub hit {
    my $self = shift;
    my $d    = $self->{hit};
    $self->{hit} = Bio::SeqFeature::Lite->new(@_) if @_;
    return $d;
}

sub seq {
    my $self = shift;
    my $seq  = $self->{seq};
    return $self->SUPER::seq() unless ref $seq;
    return substr($seq->[0]->dna,$seq->[1],$seq->[2]);
}

sub Bio::SeqFeature::Lite::subseq {
    my $self = shift;
    my ($start,$end) = @_;
    $start = 1 if $start < 1;
    $end   = $self->high if $end > $self->high;
    return Bio::PrimarySeq->new(-seq=>substr($self->dna,
					     $start-1,
					     $end-$start+1)
				);
}

sub cigar_str {
    my $self = shift;
    my $d    = $self->{cigar_str};
    $self->{cigar_str} = shift if @_;
    $d;
}

1;