The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl
#Copyright (c) 2010 Joachim Bargsten <code at bargsten dot org>. All rights reserved.

use warnings;
use strict;
use Carp;

use 5.010;
use Getopt::Long;
use Pod::Usage;
use List::MoreUtils qw/any/;
use Bio::Gonzales::Util::Text qw/ccount/;
use Bio::Gonzales::Seq::IO qw/faiterate faspew/;
use Bio::Gonzales::Stat::Util qw/hist_text/;
my (
  $is_revcom_seq, $help,    $length,   @id,          $desc,  $seq,       $only_id,
  $only_head,     $verbose, $only_len, $is_inverted, $count, $only_char, $print_hist, $is_print_full
);

my $shift_id = 0;
GetOptions(
  "only_headers|oh" => \$only_head,
  "filter_length=s" => \$length,
  "filter_id=s"     => \@id,
  "filter_seq=s"    => \$seq,
  "filter_desc=s"   => \$desc,
  "list_ids|i"      => \$only_id,
  "list_length|l"   => \$only_len,
  "histogram|hi"    => \$print_hist,
  "list_char|c"     => \$only_char,
  "help|h|?"        => \$help,
  "shift_id=i"      => \$shift_id,
  "verbose|v"       => \$verbose,
  "revcom"          => \$is_revcom_seq,
  "print" => \$is_print_full,
  "invert"          => \$is_inverted,
  "num_ids|n"       => \$count,
) or pod2usage( -verbose => 2 );

pod2usage( -verbose => 2, -noperldoc => 1 ) if $help;

my @files = @ARGV;

push @files, \*STDIN
  unless (@files);

if ( $seq && $is_revcom_seq ) {
  $seq = Bio::Gonzales::Seq->new( seq => $seq, id => '1' )->revcom->seq;
}

for my $file (@files) {
  my $seqin;
  my @lengths;
  if ( $file && -f $file ) {
    say STDERR "Using file $file";

    #special case to speed things up
    system( qw/grep -c ^>/, $file ) == 0 and next if ($count);
  } elsif ( ref $file ) {
    say STDERR "Using STDIN";
  } else {
    confess "file does not exist: $file";
  }

  $seqin = faiterate($file);

  my $seqout = \*STDOUT;

  @id = map {qr/$_/} @id;

  my $number_of_seqs = 0;
  while ( my $so = $seqin->() ) {
    if (
      ( !$length || eval($so->length . $length) )
      && ((
           ( @id && any { $so->id =~ /$_/ } @id )
        || ( $seq  && $so->seq =~ /$seq/ )
        || ( $desc && $so->desc =~ /$desc/ )

      )
      || (
        !( @id || $seq || $desc )
        && ( $print_hist
          || $count
          || $only_id
          || $only_head
          || $only_len
          || $only_char )
      )
       ||  $is_print_full))
    {
      say STDERR $so->id . " matches."
        if ($verbose);

      if ($shift_id) {
        my $desc;
        my $id;
        for ( my $i = 0; $i < $shift_id && $so->desc =~ /(\S+)\s+/g; $i++ ) {
          $id   = $1;
          $desc = $';
        }
        $so->id($id);
        $so->desc($desc);
      }

      if ($only_id) {
        say $so->id;
      } elsif ($only_head) {
        say $so->id . " " . $so->desc;
      } elsif ($only_len) {
        say $so->length . "\t" . $so->id;
      } elsif ($only_char) {
        my $char_count = ccount( $so->seq );
        say $so->id . ": "
          . join(
          ", ",
          (
            map { $_ . " => " . $char_count->{$_} }
            sort keys %$char_count
          ),
          "sum: " . $so->length
          );
      } elsif ($count) {
        $number_of_seqs++;
      } elsif ($print_hist) {
        push @lengths, $so->length;
      } else {
        faspew( $seqout, $so );
      }
    }
  }

  say $number_of_seqs if ($count);
  print hist_text( \@lengths, { skip_empty => 1 } ) if ($print_hist);
}

__END__

=head1 NAME

fasta-grep - grep fasta files

=head1 SYNOPSIS

    ./fasta-grep [OPTIONS] [--help] <fasta-file>

=head1 DESCRIPTION

find regex in id-header or sequence or specifiy a length condition to select
matching fasta sequences. All options, except length, are joined by "or".

To modify the regex to be case insensitive put C<(?i)> in front of it (see
also C<man perlre>)

=head1 OPTIONS

=over 4

=item B<--only_headers|--oh>

Print only fasta headers.

=item B<< --filter_length <len> >>

With len '>=', '<=', '>', '==', ...

=item B<< --filter_id <id_regex> >>

=item B<< --filter_seq <seq_regex> >>

=item B<< --filter_desc <desc_regex> >>

=item B<< --list_ids | -i >>

Extract the ids of the fasta file

=item B<< --list_length | -l >>

Print length and id of every seq

=item B<< --list_char | -c >>

Show character distributions for every sequence

=item B<--verbose | -v>

Print matching ids to STDERR

=item B<--print>

just print the sequence (use in combination with filter length)

=item B<< --shift_id <N> >>

Removes [ID_1] - [ID_N] in fasta header where the header has the form:

    >[ID_1] [ID_2] ... [ID_N] description

=item B<--help>

Display this msg.

=back

=head1 AUTHOR

jw bargsten, C<< <joachim.bargsten at wur.nl> >>

=cut