The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.
# -*-CPerl-*-
# Last changed Time-stamp: <2017-06-10 18:18:45 michl>

package Bio::ViennaNGS::AnnoC;
use Bio::ViennaNGS;
use Bio::ViennaNGS::Util qw(sortbed);
use Bio::Tools::GFF;
use Path::Class;
use Carp;
use Moose;
use namespace::autoclean;
use version; our $VERSION = version->declare("$Bio::ViennaNGS::VERSION");

#my $VERSION = $Bio::ViennaNGS::VER;

has 'accession' => (
		    is => 'rw',
		    isa => 'Str',
		    predicate => 'has_accession',
		   );

has 'features' => (
		   is => 'ro',
		   isa => 'HashRef',
		   predicate => 'has_features',
		   default => sub { {} },
		  );

has 'nr_features' => (
		     is => 'ro',
		     isa => 'Int',
		     builder => '_get_nr_of_features',
		     lazy => 1,
		     );

has 'featstat' => (
		   is => 'ro',
		   isa => 'HashRef',
		   builder => '_set_featstat',
		   predicate => 'has_featstat',
		   lazy => 1,
		  );

before 'featstat' => sub {
  my $self = shift;
  $self->_get_nr_of_features();
};

sub _set_featstat {
  my $self = shift;
  my $this_function = (caller(0))[3];
  my %fs = ();
  confess "ERROR [$this_function] \$self->features not available"
    unless ($self->has_features);
  $fs{total} = 0;
  $fs{origin} = "$this_function ".$VERSION;
  $fs{count} = $self->nr_features;
  foreach my $uid ( keys %{$self->features} ){
    my $gbkey = ${$self->features}{$uid}->{gbkey};
    $fs{total} += 1;
    unless (exists $fs{$gbkey}){
      $fs{$gbkey} = 0;
    }
    $fs{$gbkey} += 1;
  }
  return \%fs;
}

sub _get_nr_of_features {
  my $self = shift;
  my $this_function = (caller(0))[3];
  confess "ERROR [$this_function] \$self->features not available"
    unless ($self->has_features);
  return (keys %{$self->features});
}

sub parse_gff {
  my ($self,$in_file) = @_;
  my ($i,$gffio,$header,$f,$gbkey);
  my $this_function = (caller(0))[3];

  $gffio = Bio::Tools::GFF->new(-file         => $in_file,
				-gff_version  => 3,
			       );
  $gffio->ignore_sequence(1);
  if ($header = $gffio->next_segment() ){
    $self->accession( $header->display_id() );
  }
  else{ carp "ERROR [$this_function] Cannot parse GFF header\n" }

  while($f = $gffio->next_feature()) {
    my ($uid,$feat_name);
    my @name = my @id = my @gbkeys = ();

    next if ($f->primary_tag() eq "exon");

    # 1) determine gbkey of the current feature
    @gbkeys = $f->get_tag_values("gbkey");
    $gbkey  = $gbkeys[0];

    # 2) get a unique ID for each feature
    if ($f->has_tag('ID')){
      @id = $f->get_tag_values('ID');
      $uid = $id[0]; # ID=id101
    }
    else {
      croak "ERROR [$this_function] Feature '$gbkey' at pos.\
             $f->start does not have \'ID\' attribute\n";
    }

    # 3) assign parent's unique ID in case a parent record exists
    if ($f->has_tag('Parent')){
      @id = $f->get_tag_values('Parent');
      $uid = $id[0]; # ID=id101
    }

    # 4) find a name for the current feature, use 'Name' or 'ID' attribute
    if ($f->has_tag('Name')){
      @name = $f->get_tag_values('Name');
      $feat_name = $name[0];
    }
    elsif ($f->has_tag('ID')){
      @id = $f->get_tag_values('ID');
      $feat_name = $id[0]; # ID=id101, use ID as feature name
    }
    else {
      croak "ERROR [$this_function] Cannot set name for feature \
              $f->gbkey at pos. $f->start\n";
    }

    unless (exists ${$self->features}{$uid}) { # gene / ribosome_entry_site / etc.
      ${$self->features}{$uid}->{start}   = $f->start;
      ${$self->features}{$uid}->{end}     = $f->end;
      ${$self->features}{$uid}->{strand}  = $f->strand;
      ${$self->features}{$uid}->{length}  = $f->length;
      ${$self->features}{$uid}->{seqid}   = $f->seq_id;
      ${$self->features}{$uid}->{score}   = $f->score || 0;
      ${$self->features}{$uid}->{gbkey}   = $gbkey;
      ${$self->features}{$uid}->{name}    = $feat_name;
      ${$self->features}{$uid}->{uid}     = $uid;
    }
    else { # CDS / tRNA / rRNA / etc.
      ${$self->features}{$uid}->{gbkey} = $gbkey;  # gbkey for tRNA/ rRNA/ CDS etc
    }
  }
  $gffio->close();
}

sub features2bed {
 my ($self,$gbkey,$dest,$bn,$log) = @_;
 my ($chrom,$chrom_start,$chrom_end,$name,$score,$strand,$thick_start);
 my ($thick_end,$reserved,$block_count,$block_sizes,$block_starts);
 my @ft = ();
 my $this_function = (caller(0))[3];

 croak "ERROR [$this_function] $self->features not available"
   unless ($self->has_features);
 croak "ERROT [$this_function] $self->featstat not available"
   unless ($self->has_featstat);
 croak "ERROR [$this_function] $dest does not exist"
    unless (-d $dest);
 if (defined $log){open(LOG, ">>", $log) or croak $!;}

 if (defined $gbkey){ # dump features of just one genbank key
   confess "ERROR [$this_function] genbank key \'$gbkey\' N/A in hash "
     unless (exists ${$self->featstat}{$gbkey});
   $ft[0] = $gbkey;
 }
 else{ # dump features for all genbank keys
   foreach my $gbk (keys %{$self->featstat}) {
     next if ($gbk eq 'total' || $gbk eq 'Src' || $gbk eq 'accession' ||
	      $gbk eq 'origin' || $gbk eq 'count');
     push @ft,$gbk;
   }
 }

 foreach my $f (@ft){
   my $bedname   = file($dest,"$bn.$f.bed");
   my $bedname_u = file($dest,"$bn.$f.u.bed");
   open (BEDOUT, "> $bedname_u") or croak $!;

   # dump unsorted gene annotation from DS to BED12
   foreach my $uid (keys %{$self->features}){
      next unless (${$self->features}{$uid}->{gbkey} eq $f);
      my @bedline = ();
      $chrom        = ${$self->features}{$uid}->{seqid};
      $chrom_start  = ${$self->features}{$uid}->{start};
      $chrom_start--; # BED is 0-based
      $chrom_end    = ${$self->features}{$uid}->{end};
      $name         = ${$self->features}{$uid}->{name};
      $score        = ${$self->features}{$uid}->{score};
      $strand       = ${$self->features}{$uid}->{strand} == -1 ? '-' : '+'; #default to +
      $thick_start  = $chrom_start;
      $thick_end    = $chrom_end;
      $reserved     = 0; 
      $block_count  = 1;
      $block_sizes  = ${$self->features}{$uid}->{length}.",";
      $block_starts = "0,";
      @bedline = join ("\t", ($chrom,$chrom_start,$chrom_end,
			      $name,$score,$strand,$thick_start,
			      $thick_end,$reserved,$block_count,
			      $block_sizes, $block_starts));
      print BEDOUT "@bedline\n";
    }
   close (BEDOUT);

   sortbed($bedname_u,".",$bedname,1,undef);  # sort bed file

 } # end foreach
 if (defined $log){close(LOG)};
}

sub feature_summary {
  my ($self, $dest) = @_;
  my ($fn,$fh);
  my $this_function = (caller(0))[3];

  croak "ERROR [$this_function] $dest does not exist\n"
    unless (-d $dest);
  croak "ERROR [$this_function] $self->accession not available\n"
    unless ($self->has_accession);

  $fn = dir($dest,$self->accession.".summary.txt");
  open $fh, ">", $fn or croak $!;

  print $fh "Accession\t ".$self->accession."\n";
  print $fh "Origin   \t ${$self->featstat}{origin}\n";
  foreach my $ft (sort keys %{$self->featstat}){
    next if ($ft =~ /total/ || $ft =~ /accession/ || $ft =~ /origin/);
    print $fh "$ft\t${$self->featstat}{$ft}\n";
  }
  print $fh "Total\t${$self->featstat}{total}\n";
  close $fh;
}

__PACKAGE__->meta->make_immutable;

1;

__END__

=head1 NAME

Bio::ViennaNGS::AnnoC - Object-oriented interface for storing and
converting biological sequence annotation formats

=head1 SYNOPSIS

  use Bio::ViennaNGS::AnnoC;

  my $obj = Bio::ViennaNGS::AnnoC->new();

  # parse GFF3 file to internal data straucture
  $obj->parse_gff($gff3_file);

  # compute summary of parsed annotation
  $obj->featstat;

  # dump feature summary to file
  $obj->feature_summary($dest);

  # dump all tRNAs contained in data structure as BED12
  $obj->features2bed("tRNA",$dest,$bn,$log)

=head1 DESCRIPTION

This module provides an object-oriented interface for storing and
converting biological sequence annotation data. Based on the C<Moose>
object system, it maintains a central data structure which is curently
designed to represent simple, non-spliced (ie single-exon) annotation
data. Future versions of the module will account for more generic
scenarios, including spliced isoforms.

=head1 METHODS

=over

=item parse_gff

Title : parse_gff

Usage : C<$obj-E<gt>parse_gff($gff3_file);>

Function: Parses GFF3 annotation files of non-spliced genomes into
          C<$self-E<gt>features>

Args : The full path to a GFF3 file

Returns :

Notes : The GFF3 specification is available at
        L<http://www.sequenceontology.org/resources/gff3.html>. This
        routine has been tested with NCBI bacteria GFF3 annotation.

=item feature_summary

Title : feature_summary

Usage : C<$obj-E<gt>feature_summary($dest);>

Function : Generate a summary file for all features present in
           C<$self-E<gt>features>

Args : Full output path for summary.txt file

Returns :

=item features2bed

Title : features2bed

Usage : C<$obj-E<gt>features2bed($feature,$workdir,$bn,$log);>

Function : Dumps genomic features from C<$self-E<gt>features> hash to a
           BED12 file.

Args : C<$gbkey> can be either a string corresponding to a genbank
       key in C<$self-E<gt>featstat> or C<undef>. If defined, only
       features of the speficied key will be dumped to a single BED12
       file. If C<$gbkey> is C<undef>, BED12 files will be generated
       for each type present in C<$self-E<gt>featstat>.  C<$dest> is the
       output directory and C<$bn> the basename for all output
       files. C<$log> is either be the full path to a logfile or
       C<undef>.

Returns  :

=back

=head1 DEPENDENCIES

=over

=item L<Bio::Tools::GFF>

=item L<IPC::Cmd>

=item L<Path::Class>

=item L<Carp>

=back

=head1 AUTHORS

Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2014-2017 Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt>

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.10.0 or,
at your option, any later version of Perl 5 you may have available.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.


=cut