The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Bio::Gonzales::Seq::IO;

use warnings;
use strict;
use Carp qw/cluck confess croak carp/;

use Bio::Gonzales::Seq::IO::Fasta;

use Data::Dumper;
use Bio::Gonzales::Util::File qw/open_on_demand/;
use Bio::Gonzales::Util qw/flatten/;
use Bio::Gonzales::Seq;

use base 'Exporter';
our ( @EXPORT, @EXPORT_OK, %EXPORT_TAGS );
our $VERSION = '0.060'; # VERSION

@EXPORT      = qw(faslurp faspew fasubseq faiterate);
%EXPORT_TAGS = ();
@EXPORT_OK   = qw(fahash);

our $WIDTH = $Bio::Gonzales::Seq::WIDTH;

our $SEQ_FORMAT = 'all';

sub faslurp {
  my ($src) = @_;

  my @fa;

  my ( $fh, $fh_was_open ) = open_on_demand( $src, '<' );

  my $fasta = Bio::Gonzales::Seq::IO::Fasta->new($fh);
  while ( my $entry = $fasta->next_seq ) {
    confess unless ($entry);
    push @fa, $entry;
  }

  $fh->close unless ($fh_was_open);

  return wantarray ? @fa : \@fa;
}

sub fasubseq {
  my ( $src, $ids_with_ranges, $c ) = @_;

  my $ids;
  if ( ref $ids_with_ranges eq 'ARRAY' ) {
    return unless (@$ids_with_ranges);

    if ( ref $ids_with_ranges->[0] eq 'ARRAY' ) {
      #array of array with id and range
      $ids = {};

      for my $idrange (@$ids_with_ranges) {
        my ( $id, @range ) = @$idrange;
        $ids->{$id} = [] unless defined $ids->{$id};
        push @{ $ids->{$id} }, \@range;
      }

    } else {
      #just plain ids
      $ids = { map { $_ => [] } @$ids_with_ranges };
    }
  }

  my ( $fh, $fh_was_open ) = open_on_demand( $src, '<' );

  my $fasta = Bio::Gonzales::Seq::IO::Fasta->new($fh);
  my @fa;
  while ( my $entry = $fasta->next_seq ) {
    if ( exists( $ids->{ $entry->id } ) ) {
      my $ranges = $ids->{ $entry->id };
      for my $range (@$ranges) {

        eval { push @fa, $entry->subseq( $range, $c ) };
        if ($@) {
          carp Dumper $entry->clone_empty;
          croak $@;
        }
      }

      #empty ranges array
      push @fa, $entry unless (@$ranges);
    }
  }
  $fh->close unless ($fh_was_open);

  return wantarray ? @fa : \@fa;
}

sub fahash {
  my $faraw = faslurp(@_);
  my %fa;
  for my $s (@$faraw) {
    confess "Dupicate entry: " . $s->id if ( exists( $fa{ $s->id } ) );
    $fa{ $s->id } = $s;
  }
  return wantarray ? %fa : \%fa;
}

sub faiterate {
  my @srcs = flatten(@_);

  confess "no arguments supplied" unless ( @srcs > 0 );
  my ( $fh, $fh_was_open ) = open_on_demand( shift(@srcs), '<' );
  my $fasta = Bio::Gonzales::Seq::IO::Fasta->new($fh);

  return sub {
    my $entry = $fasta->next_seq;
    unless ( defined($entry) ) {
      $fh->close unless ($fh_was_open);

      if ( my $src = shift @srcs ) {
        ( $fh, $fh_was_open ) = open_on_demand( $src, '<' );
        $fasta = Bio::Gonzales::Seq::IO::Fasta->new($fh);
      } else {
        return;
      }
    }
    return $entry;
  };
}

sub faspew {
  my ( $dest, @data ) = @_;

  #open destination, if necessary
  my ( $fh, $fh_was_open ) = open_on_demand( $dest, '>' );

  carp "no sequences supplied" unless ( @data > 0 );
  # take appropriate steps for the sequence objects
  for my $d (@data) {
    if ( ref $d eq 'HASH' ) {
      for my $e ( values %{$d} ) {
        print $fh $e->$SEQ_FORMAT;
      }
    } elsif ( ref $d eq 'ARRAY' ) {
      for my $e ( @{$d} ) {
        print $fh $e->$SEQ_FORMAT;
      }
    } elsif ( ref($d) eq 'Bio::Gonzales::Seq' ) {
      print $fh $d->$SEQ_FORMAT;
    } else {
      unless ($d) {
        cluck "Undefined argument supplied";
        next;
      }
      confess "error";
    }
  }
  $fh->close
    unless ($fh_was_open);
  return;
}


sub format_seq_string {return Bio::Gonzales::Seq::Format_seq_string($_[0], $WIDTH); }

1;
__END__

=head1 NAME

Bio::Gonzales::Seq::IO - fast utility functions for sequence IO

=head1 SYNOPSIS

    use Bio::Gonzales::Seq::IO qw( faslurp faspew fahash fasubseq faiterate )

=head1 DESCRIPTION

=head1 SUBROUTINES

=over 4

=item B<< @seqs = faslurp(@filenames) >>

=item B<< $seqsref = faslurp(@filenames) >>

C<faslurp> reads in all sequences from C<@filenames> and returns an array in
list or an arrayref in scalar context of the read sequences. The sequences are
stored as FAlite2::Entry objects.

=item B<< $iterator = faiterate($filename) >>

Allows you to create an iterator for the fasta file C<$filename>. This
iterator can be used to loop over the sequence file w/o reading in all content
at once. Iterator usage:

    while(my $sequence_object = $iterator->()) {
        #do something with the sequence object
    }


=item B<< $seqs = fasubseq($file, \@ids_with_locations, \%c) >>

=item B<< $seqs = fasubseq($file, \@id_list, \%c) >>

    #ARRAY OF ARRAYS
    @ids_with_locations = (
        [ $id, $begin, $end, $strand ],
        ...
    );

Config options can be:

    %c = (
        keep_id => 1, # keeps the original id of the sequence
        wrap => 1, # see further down
        relaxed_range => 1, # substitute 0 or undef for $begin with '^' and for $end with '$'
    );


There are several possibilities for C<$begin> and C<$end>:

    GGCAAAGGA ATGATGGTGT GCAGGCTTGG CATGGGAGAC
    ^..........^                                (1,11) OR ('^', 11)
       ^.....................................^  (4,'$')
                          ^..............^      (21,35) { with wrap on: OR (-19,35) OR (-19, -5) }
                          ^..................^  (21,35) { with wrap on: OR (-19,'$') }
    
C<wrap>: The default is to limit all negative
values to the sequence boundaries, so a negative begin would be equal to 1 or
'^' and a negative end would be equal to '$'.

=item B<< $sref = fahash(@filenames) >>

=item B<< %seqs = fahash(@filenames) >>

Does the same as L<faslurp>, but returns an hash with the sequence ids as keys
and the sequence objects as values.

=item B<< faspew($file, $seq1, $seq2, ...) >>

"spew" out the given sequences to a file. Every C<$seqN> argument can be an
hash reference with L<FAlite2::Entry> objects as values or an array reference
of L<FAlite2::Entry> objects or just plain L<FAlite2::Entry> objects.
    
=item B<< $iterator = faspew_iterate($filename) >>

=item B<< $iterator = faspew_iterate($fh) >>

Creates an iterator that writes the sequences to the given C<$filename> or C<$fh>.

    for my $sequence_object (@sequences) {
        $iterator->($sequence_object)
    }
    #DO NOT FORGET THIS, THIS CALL WILL CLOSE THE FILEHANDLE
    $iterator->();

    #this is equal to:

    $iterator->(@sequences);
    $iterator->();
    #or
    $iterator->(\@sequences);
    $iterator->();


    #DO NOT DO THIS:

    $iterator->();

The filehandle will not be closed in case one supplies not a C<$filename> but a C<$fh> handle.

=back

=head1 ADVANCED

=over 4

=item B<< change the output format >>

    $Bio::Gonzales::Seq::IO::WIDTH = 60; #sequence width in fasta output

    #but only if set to 'all_pretty' ('all' is default)
    $Bio::Gonzales::Seq::IO::SEQ_FORMAT = 'all_pretty'; 

=back

=head1 SEE ALSO

=head1 AUTHOR

jw bargsten, C<< <joachim.bargsten at wur.nl> >>

=cut