The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl
# -*-CPerl-*-
# Last changed Time-stamp: <2017-06-09 11:18:37 michl>
#
# Extract nucleotide and amino acid sequence data from Fasta file,
# provided by a BED file
#
# ***********************************************************************
# *  Copyright notice
# *
# *  Copyright 2017 Michael T. Wolfinger <michael@wolfinger.eu>
# *  All rights reserved
# *
# *  This program is free software: you can redistribute it and/or modify
# *  it under the terms of the GNU General Public License as published by
# *  the Free Software Foundation, either version 3 of the License, or
# *  (at your option) any later version.
# *
# *  This program is distributed in the hope that it will be useful,
# *  but WITHOUT ANY WARRANTY; without even the implied warranty of
# *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# *  GNU General Public License for more details.
# *
# *  You should have received a copy of the GNU General Public License
# *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
# *
# *  This copyright notice MUST APPEAR in all copies of the script!
# ***********************************************************************

use strict;
use warnings;
use Getopt::Long qw( :config posix_default bundling no_ignore_case );
use Pod::Usage;
use Data::Dumper;
use Cwd;
use Carp;
use Path::Class;
use Bio::ViennaNGS::Fasta;
use Bio::ViennaNGS::FeatureIO;
use Bio::SeqUtils;

#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
#^^^^^^^^^^ Variables ^^^^^^^^^^^#
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#

my ($fastaO,$bed,$line,$cwd,$f,$seq,$theid,$intervals,$out);
my $bed_in = undef;
my $fa_in  = undef;
my $stdout = 0;
my $translate = 0;  # translate nucleotide sequence

Getopt::Long::config('no_ignore_case');
pod2usage(-verbose => 1) unless GetOptions("bed=s"      => \$bed_in,
					   "fa=s"       => \$fa_in,
					   "a|aa"       => sub{$translate=1},
					   "s|stdout"   => sub{$stdout=1},
					   "man"        => sub{pod2usage(-verbose => 2)},
					   "help|h"     => sub{pod2usage(1)}
					   );

unless(-f $bed_in){
  warn "Could not find input BED input file provided via ---bed option";
  pod2usage(-verbose => 0);
}
unless(-f $fa_in){
  warn "Could not find input FASTA input file provided via ---fa option";
  pod2usage(-verbose => 0);
}
$cwd = getcwd();
unless ($bed_in =~ /^\// || $bed_in =~ /^\.\//){$bed_in = file($cwd,$bed_in);}
unless ($fa_in  =~ /^\// || $fa_in  =~ /^\.\//){$fa_in  = file($cwd,$fa_in);}

$fastaO    = Bio::ViennaNGS::Fasta->new(fasta=>"$fa_in");
$intervals = Bio::ViennaNGS::FeatureIO->new(
					    file => "$bed_in",
					    filetype => 'Bed6',
					    instanceOf => 'Feature',
					   );
foreach  $f (@{$intervals->data}){
  confess "ERROR: ID ".$f->chromosome." not found in Fasta file $fa_in"
    unless ($fastaO->has_sequid($f->chromosome));
  $seq = $fastaO->stranded_subsequence($f->chromosome,
				       $f->start+1,
				       $f->end,
				       $f->strand);
  #my $fn = $f->chromosome.".".$f->name.".".eval($f->start+1)."-".$f->end.".fa";
  my $fn = "sequence.fa";
  my $newid = $f->chromosome."|".$f->name."|".eval($f->start+1)."-".$f->end."|";
  $stdout == 0 ? open($out, ">", $fn) : open($out, ">&STDOUT");
  print $out ">$newid\n";
  print $out join "\n", (unpack "(a70)*",$seq);
  print $out "\n";
  if($translate == 1){
    my $seqobj = Bio::Seq->new (-seq => $seq,
				-id  => 'foobar',
				-accession_number => 'ABCD');
    my @prots = Bio::SeqUtils->translate_3frames($seqobj);
    my $i=1;
    foreach my $s (@prots){
      print $out ">$newid|AA frame $i|\n"; $i++;
      print $out join "\n", (unpack "(a70)*",$s->seq);
      print $out "\n";
    }
  }
  close $out;
}

__END__


=head1 NAME

bed2nt2aa.pl - Get nucleotde and amino acid sequences from BED intervals

=head1 SYNOPSIS

bed2nt2aa.pl [--bed I<FILE>] [--fa I<FILE>] [--aa] [options]

=head1 DESCRIPTION

Extract sequence intervals from Fasta files, provided as BED6
records. Optionally translate nucleotide into amino acid
sequences. Output is written to a 'sequence.fa' output file. This can
be overriden with the C<-s> switch, which triggers STDOUT output.

=head1 OPTIONS

=over

=item B<--bed>

Input file in BED6 format (mandatory)

=item B<--fa>

Input file in Fasta format (mandatory)

=item B<-a|--aa>

Translate nucleotide into amino acid sequences, providing all three
possible frames

=item B<-s|--stdout>

Write output to STDOUT instead of an output file 'sequence.fa'

=item B<--help -h>

Print short help

=item B<--man>

Prints the manual page and exits

=back

=head1 AUTHOR

Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt>

=cut