The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Bio::Gonzales::Seq::Util;

use warnings;
use strict;
use Carp;

use Scalar::Util qw/blessed/;
use Data::Dumper;
use Bio::Gonzales::Matrix::IO;
use File::Which qw/which/;
use Bio::Gonzales::Seq::IO;
use Bio::Gonzales::Seq;

use base 'Exporter';
our ( @EXPORT, @EXPORT_OK, %EXPORT_TAGS );
our $VERSION = '0.078'; # VERSION

@EXPORT      = qw();
%EXPORT_TAGS = ();
@EXPORT_OK   = qw(
  pairwise_identity_l
  pairwise_identity_s
  pairwise_identity_gaps_l
  pairwise_identity_gaps_s
  pairwise_identities
  map_seqids
  seqid_mapper
  crc64
  strand_convert
  seq_lengths
  seq_apply
  revcom_seq_string
);

our %STRAND_CHAR_TABLE = (
  '+' => 1,
  '-' => -1,
  '.' => 0,
  -1  => '-',
  1   => '+',
  0   => '.',
);

our $BLASTDB_CMD  = which('blastdbcmd');
our $SAMTOOLS_CMD = which('samtools');

sub strand_convert {
  if ( @_ && @_ > 0 && $_[-1] && exists( $STRAND_CHAR_TABLE{ $_[-1] } ) ) {
    return $STRAND_CHAR_TABLE{ $_[-1] };
  } else {
    return '.';
  }
}

sub seq_lengths {
  my $f = shift;

  my $d;
  if ( -f $f . ".fai" ) {
    $d = mslurp( $f . ".fai", { header => undef } );
  } elsif ( -f $f . ".nhr" ) {
    open my $fh, '-|', $BLASTDB_CMD, '-db', $f, '-entry', 'all', '-outfmt', "%a\t%l"
      or die "Can't open filehandle: $!";
    $d = mslurp( $fh, { header => undef } );
    close $fh;
  } elsif ($SAMTOOLS_CMD) {
    system( 'samtools', 'faidx', $f ) == 0 or die "system failed: $?";
    $d = mslurp( $f . ".fai", { header => undef } ) if ( -f $f . ".fai" );
  }

  # nothing worked so far
  unless ($d) {
    say STDERR "could not use samtools or blast indices, using std method";
    my @lengths;
    my $fit = faiterate($f);
    while ( my $seq = $fit->() ) {
      push @lengths, [ $seq->id, $seq->length ];
    }
    $d = \@lengths;
  }

  my %sl;
  for my $r (@$d) {
    die "double ID" if ( $sl{ $r->[0] } );
    $sl{ $r->[0] } = $r->[1];
  }

  return \%sl;
}

sub revcom_seq_string {
  return Bio::Gonzales::Seq::_revcom_from_string(@_);
}

sub seq_apply {
  my ( $f, $sub, $pattern ) = @_;

  die "file or sub ref not correct" unless ( ref $sub eq 'CODE' && -f $f );
  my $seq_lengths = seq_lengths($f);
  my @res;
  my $num = keys %$seq_lengths;
  for my $sid ( keys %$seq_lengths ) {
    my $spat;
    if ($pattern) {
      $spat = $pattern;
      $spat =~ s/\{id\}/$sid/g;
      $spat =~ s/\{begin\}/1/g;
      $spat =~ s/\{end\}/$seq_lengths->{$sid}/g;
    }
    push @res, $sub->( { id => $sid, begin => 1, end => $seq_lengths->{$sid}, num => $num }, $spat );
  }
  return \@res;
}

sub pairwise_identity_l {
  my ( $seq1, $seq2 ) = @_;
  return _pairwise_identity_generic( $seq1, $seq2, 1, 0 );
}

sub pairwise_identity_s {
  my ( $seq1, $seq2 ) = @_;
  return _pairwise_identity_generic( $seq1, $seq2, 0, 0 );
}

sub pairwise_identity_gaps_l {
  my ( $seq1, $seq2 ) = @_;
  return _pairwise_identity_generic( $seq1, $seq2, 1, 1 );
}

sub _pairwise_identity_generic {
  my ( $seq1, $seq2, $use_longest, $include_gaps ) = @_;

  $seq1 = $seq1->seq if ( blessed $seq1);
  $seq2 = $seq2->seq if ( blessed $seq2);

  my $seq1_gaps = 0;
  my $seq2_gaps = 0;
  if ( !$include_gaps ) {
    $seq1_gaps = $seq1 =~ y/-/./;
    $seq2_gaps = () = $seq2 =~ /-/g;
  }

  my $mask    = $seq1 ^ $seq2;
  my $matches = $mask =~ tr/\x0/\x0/;

  my $longest;
  my $shortest;
  if ( length($seq2) - $seq2_gaps < length($seq1) - $seq1_gaps ) {
    $longest  = length($seq1) - $seq1_gaps;
    $shortest = length($seq2) - $seq2_gaps;
  } else {
    $shortest = length($seq1) - $seq1_gaps;
    $longest  = length($seq2) - $seq2_gaps;
  }

  if ($use_longest) {
    return ( $matches / $longest );
  } else {
    return ( $matches / $shortest );
  }
}

sub pairwise_identity_gaps_s {
  my ( $seq1, $seq2 ) = @_;

  return _pairwise_identity_generic( $seq1, $seq2, 0, 1 );
}

sub pairwise_identities {
  my ( $sub, @seqs ) = @_;

  #creating an upper triangular matrix

  my @dist;
  for ( my $i = 0; $i < @seqs; $i++ ) {
    push @dist, [];
  }

  my $i;
  for ( $i = 0; $i < @seqs - 1; $i++ ) {
    $dist[$i][$i] = 1;
    for ( my $j = $i + 1; $j < @seqs; $j++ ) {
      $dist[$j][$i] = $sub->( $seqs[$j], $seqs[$i] );
    }
  }
  $dist[$i][$i] = 1;

  return \@dist;
}

sub map_seqids {
  my ( $seqs, $pattern ) = @_;

  my %map;

  my $i = seqid_mapper($pattern);

  for my $s (@$seqs) {
    my ( $new, $old ) = $i->($s);
    $map{$new} = $old;
  }

  return \%map;
}

sub seqid_mapper {
  my ( $pattern, @extra_args ) = @_;
  $pattern = 's%08d' unless defined $pattern;

  my $handler;
  if ( ref $pattern eq 'CODE' ) {
    $handler = $pattern;
  } elsif ( ref $pattern eq 'HASH' ) {
    my $i = 1;
    $handler = sub {
      my $id = shift;
      return defined $pattern->{$id} ? $pattern->{$id} : 'unknown_' . $i++;
    };
  } else {
    my $i = 1;
    $handler = sub {
      return sprintf $pattern, $i++;

    };
  }

  return sub {
    my ($seq) = @_;
    return unless ($seq);
    unless ( blessed($seq) ) {
      if ( ref $seq eq 'ARRAY' ) {
        croak "you supplied an array to the mapper function, use single seq Bio::Gonzales::Seq objects";
      } else {
        confess Dumper $seq ;
      }
    }

    my $orig_id = $seq->id;
    my $id = $handler->( $orig_id, @extra_args );
    $seq->id($id);

    return ( $id, $orig_id );
  };
}

{
  my $POLY64REVh = 0xd8000000;
  my @CRCTableh  = 256;
  my @CRCTablel  = 256;
  my $initialized;

  sub crc64 {
    my $sequence = shift;
    my $crcl     = 0;
    my $crch     = 0;
    if ( !$initialized ) {
      $initialized = 1;
      for ( my $i = 0; $i < 256; $i++ ) {
        my $partl = $i;
        my $parth = 0;
        for ( my $j = 0; $j < 8; $j++ ) {
          my $rflag = $partl & 1;
          $partl >>= 1;
          $partl |= ( 1 << 31 ) if $parth & 1;
          $parth >>= 1;
          $parth ^= $POLY64REVh if $rflag;
        }
        $CRCTableh[$i] = $parth;
        $CRCTablel[$i] = $partl;
      }
    }

    foreach ( split '', $sequence ) {
      my $shr        = ( $crch & 0xFF ) << 24;
      my $temp1h     = $crch >> 8;
      my $temp1l     = ( $crcl >> 8 ) | $shr;
      my $tableindex = ( $crcl ^ ( unpack "C", $_ ) ) & 0xFF;
      $crch = $temp1h ^ $CRCTableh[$tableindex];
      $crcl = $temp1l ^ $CRCTablel[$tableindex];
    }
    return wantarray ? ( $crch, $crcl ) : sprintf( "%08X%08X", $crch, $crcl );
  }
}

1;
__END__

=head1 NAME



=head1 SYNOPSIS

    use Bio::Gonzales::Seq::Util qw(
        pairwise_identity_l
        pairwise_identity_s
        pairwise_identity_gaps_l
        pairwise_identity_gaps_s
        pairwise_identities
        map_seqids
        seqid_mapper
        overlaps
        cluster_overlapping_ranges
    );

=head1 DESCRIPTION

=head1 SUBROUTINES

=over 4

=item B<< $map = map_seqids($seqs, I<$pattern>) >>

Maps all sequence ids of C<$seqs> in situ. If C<$pattern> is not given, C<s%9d> is taken as default. 


=item B<< $clustered_ranges = cluster_overlapping_ranges(\@ranges) >>

This function takes some ranges and clusters them by overlap. 

    @ranges = (
        [ $start, $stop, $some, $custom, $elements ],
        [ ... ],
        ...
    );

    $clustered_ranges = [
        # first cluster
        [
            [ $start, $stop, $some, $custom, $elements ],
            ...
        ],
        # next cluster
        [
            [ $start, $stop, $some, $custom, $elements ],
            ...
        ],
        ...
    ];


=item B<< $map = map_seqids(\@seqs!, $pattern) >>

Wrapper around C<seqid_mapper()>, maps the ids of @seqs and returns the map.
This function works directly on the sequences.

=item B<< $mapper = seqid_mapper(\%idmap) >>

Create a mapper that maps given sequences to new ids generated by the argument
given. A hash as argument will be used as mapping base, taking the key as old and the
value as new id. In case the sequence id is non-existent in the hash, an artificial id
following the pattern C<unknown_$i> with C<$i> running from 0
onwards will be generated..

=item B<< $mapper = seqid_mapper(\&handler) >>

Create a mapper that maps given sequences to new ids generated by successive
calls to the handler. The handler will get the existing/original id as
argument and shall return a new id. A simple mapper would be:


    my $i = 1;
    my $mapper = seqid_mapper(sub { sprintf "id%d", $i++ });
    or
    my $mapper = seqid_mapper(sub { my $id = shift; $id =~ s/pep/cds/; return $id});

    my ($old_id, $new_id) = $mapper->($sequence_object);

The sequence object WILL BE ALTERED IN SITU.

=item B<< $mapper = seqid_mapper($pattern) >>

Use pattern as basis for sequence id mapping, "%s" or "%d" must be included
ONLY ONCE and will be substituted by a couter, running from 0 to INF.

=item B<< $mapper = seqid_mapper() >>

Same as C<$mapper = seqid_mapper("s%9d")>

=item B<< overlaps([$a_begin, $a_end], [$b_begin, $b_end]) >>

Returns true if a overlaps with b, false otherwise.

=back

=head1 SEE ALSO

=head1 AUTHOR

jw bargsten, C<< <joachim.bargsten at wur.nl> >>

=cut