The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl
# created on 2015-01-14

use warnings;
use strict;
use 5.010;

use Bio::Gonzales::Seq::IO;
use Data::Dumper;
use List::MoreUtils qw/indexes/;
use Bio::Gonzales::Seq;
use Bio::Gonzales::Seq::Filter qw/clean_dna_seq/;

use Bio::Gonzales::Feat::IO::GFF3;
use Bio::Gonzales::Feat;

use Pod::Usage;
use Getopt::Long;

my %opt = ();
GetOptions( \%opt, 'help', 'clean_seq', 'include_junk', 'seq_f=s' ) or pod2usage(2);

pod2usage( -exitval => 0, -verbose => 2 ) if ( $opt{help} );
pod2usage(2)
  unless ( @ARGV && @ARGV == 2 );

my ( $regex, $file ) = @ARGV;

pod2usage( { -msg => "regular expression or file argument missing", -exit_val => 2 } )
  unless ( $regex && $file );

my $gffo = Bio::Gonzales::Feat::IO::GFF3->new( fh => \*STDOUT, mode => '>', escape_whitespace => 1 );
my $seq_fh;

if ( $opt{seq_f} ) {
  open $seq_fh, '>', $opt{seq_f} or die "Can't open filehandle: $!";
}

my $fait = faiterate($file);

while ( my $so = $fait->() ) {
  if ( $opt{clean_seq} ) {
    clean_dna_seq($so);
  }
  repl( $regex, $so );
}

if ($seq_fh) {
  $seq_fh->close;
}

sub repl {
  my $regex = shift;
  my $so    = shift;
  my $id    = $so->id;

  my $full_seq  = $so->seq;
  my $last_pos  = 0;
  my $match_cnt = 0;
  while ( $full_seq =~ /$regex/g ) {
    $match_cnt++;
    # we found a "restriction site"
    if ( $-[0] != 0 ) {
      # omit strings starting with "restriction site"
      my $gap = $+[0] - $-[0];
      my $match_id = sprintf( "%s|nomatch:%d|%d..%d+%d", $id, $match_cnt, $last_pos + 1, $-[0], $gap );
      $gffo->write_feat(
        Bio::Gonzales::Feat->new(
          seq_id     => $id,
          source     => 'fasplit',
          type       => 'match',
          start      => $last_pos + 1,
          end        => $-[0],
          strand     => 0,
          attributes => { ID => [$match_id], followed_gap_size => [ $gap ] },
        )
      );

      if ($seq_fh) {
        my $seq = substr $full_seq, $last_pos, $-[0] - $last_pos;
        # output everything before it
        faspew(
          $seq_fh,
          Bio::Gonzales::Seq->new(
            id  => $match_id,
            seq => $seq
          )
        );
      }
    }
    if ( $opt{include_junk} ) {
      $match_cnt++;
      my $match_id = sprintf( "%s|match:%d|%d..%d", $id, $match_cnt, $-[0] + 1, $+[0] );
      $gffo->write_feat(
        Bio::Gonzales::Feat->new(
          seq_id     => $id,
          source     => 'fasplit',
          type       => 'gap',
          start      => $-[0] + 1,
          end        => $+[0],
          strand     => 0,
          attributes => { ID => [$match_id] },
        )
      );
      if ($seq_fh) {
        faspew(
          $seq_fh,
          Bio::Gonzales::Seq->new(
            id  => $match_id,
            seq => substr( $full_seq, $-[0], $+[0] - $-[0] ),
          )
        );
      }
    }
    $last_pos = $+[0];
  }
  if ( $last_pos < length($full_seq) ) {

    my $match_id
      = sprintf( "%s|nomatch:%d|%d..%d+%d", $id, $match_cnt + 1, $last_pos + 1, length($full_seq), 0 );
    $gffo->write_feat(
      Bio::Gonzales::Feat->new(
        seq_id     => $id,
        source     => 'fasplit',
        type       => 'match',
        start      => $last_pos + 1,
        end        => length($full_seq),
        strand     => 0,
        attributes => { ID => [$match_id], followed_gap_size => [0] },
      )
    );

    if ($seq_fh) {
      my $seq = substr $full_seq, $last_pos, length($full_seq) - $last_pos;
      faspew(
        $seq_fh,
        Bio::Gonzales::Seq->new(
          id  => $match_id,
          seq => $seq
        )
      );
    }
  }
}

=head1 NAME

bg_fasplit.pl - split sequences in fasta files by given regular expression

=head1 SYNOPSIS

bg_fasplit.pl [options] <regex> <file>

  <regex> the split pattern in form of a perl regular expression
  <file>  input sequence file in fata format

Options:

  --help            brief help message
  --clean_seq       clean DNA sequence (mask unknown characters with N)
  --include_junk    sequence matched by <regex> will be included in the output
  --seq_f <file>    write the split sequences into <file>