The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Bio::Gonzales::Feat::IO::GFF3;

use Mouse;

use warnings;
use strict;

use 5.010;
use Carp;

use List::MoreUtils qw/zip/;
use Bio::Gonzales::Feat;
use Data::Dumper;
use Carp;
use Scalar::Util qw/blessed/;
use Bio::Gonzales::Seq::Util qw/strand_convert/;

extends 'Bio::Gonzales::Feat::IO::Base';

our $VERSION = '0.068'; # VERSION

our $FASTA_RE         = qr/^\>/;
our $SEQ_REGION_RE    = qr/^\#\#sequence-region\s+(\S+)\s+(\S+)\s+(\S+)\s*/;
our $ATTR_UNESCAPE_RE = qr/%([0-9A-Fa-f]{2})/;

our @GFF_COLUMN_NAMES = qw/
  seq_id
  source
  type
  start
  end
  score
  strand
  phase
  attributes
  /;

has segments => ( is => 'rw', default => sub { [] } );
has _wrote_sth_before => ( is => 'rw' );
has pragmas           => ( is => 'rw', default => sub { {} } );
has preprocess        => ( is => 'rw' );
has comments          => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } );
has no_header         => ( is => 'rw' );
has escape_whitespace => ( is => 'rw' );
has quiet             => ( is => 'rw' );

sub BUILD {
  my ($self) = @_;

  $self->_parse_header if ( $self->mode eq '<' );
}

sub _parse_header {
  my ($self) = @_;

  my $fhi = $self->_fhi;

  my $l;
  while ( defined( $l = $fhi->() ) ) {
    next if ( !$l || $l =~ /^\s*$/ );

    if ( $l =~ /^##gff-version\s+(\d+)/ ) {
      confess "I need gff version 3" if ( $1 != 3 );
      # if no tag given, assume 3 by default
    } elsif ( $l =~ /$SEQ_REGION_RE/ ) {
      my ( $seqid, $start, $end ) = ( $1, $2, $3 );
      push @{ $self->segments }, { id => $seqid, start => $start, end => $end };
    } elsif ( $l =~ /^\#\#(feature-ontology)\s+(.*)$/ ) {
      $self->pragmas->{$1} //= [];
      push @{ $self->pragmas->{$1} }, $2;
    } elsif ( $l =~ /^\#\#(attribute-ontology)\s+(.*)$/ ) {
      $self->pragmas->{$1} //= [];
      push @{ $self->pragmas->{$1} }, $2;
    } elsif ( $l =~ /^\#\#(source-ontology)\s+(.*)$/ ) {
      $self->pragmas->{$1} //= [];
      push @{ $self->pragmas->{$1} }, $2;
    } elsif ( $l =~ /^\#\#(species)\s+(.*)$/ ) {
      $self->pragmas->{$1} //= [];
      push @{ $self->pragmas->{$1} }, $2;
    } elsif ( $l =~ /^\#\#(genome-build)\s+(.*)$/ ) {
      $self->pragmas->{$1} //= [];
      push @{ $self->pragmas->{$1} }, $2;
    } elsif ( $l =~ /^(\#\#\#)/ ) {
    } elsif ( $l =~ /^(\#\#FASTA)/ ) {
    } elsif ( $l =~ /$FASTA_RE/ ) {
      $self->_parse_seq($l);
    }

    #looks like the header is over!
    last unless $l =~ /^\#/;
  }

  push @{ $self->_cached_records }, $l;

  return;
}

sub _write_header {
  my ($self) = @_;

  $self->_wrote_sth_before(1) && return if ( $self->no_header );
  my $fh = $self->fh;
  say $fh '##gff-version 3';

  while ( my ( $p, $entries ) = each %{ $self->pragmas } ) {
    for my $e (@$entries) {
      say $fh '##' . $p . " " . $e;
    }
  }

  for my $c ( @{ $self->comments } ) {
    say $fh '#' . $c;
  }

  for my $s ( @{ $self->segments } ) {
    say $fh join ' ', '##sequence-region', @{$s}{qw/id start end/};
  }

  $self->_wrote_sth_before(1);
}

sub _parse_seq {
  my ( $self, $faheader ) = @_;

  my $fhi = $self->_fhi;

  # defined check not necessary
  while ( my $l = $fhi->() ) {
    if ( $l =~ /^\#/ ) {
      last;
    }
    if ( $l =~ /$FASTA_RE/ ) {
      push @{ $self->_cached_records }, $l;
      last;
    }
  }
  return;
}

sub next_feat {
  my ($self) = @_;

  my $fhi = $self->_fhi;

  my $l;
  while ( defined( $l = $fhi->() ) ) {
    if ( $l =~ /$SEQ_REGION_RE/ ) {
      my ( $seqid, $start, $end ) = ( $1, $2, $3 );
      push @{ $self->segments }, { id => $seqid, start => $start, end => $end };
    } elsif ( $l =~ /^\#\#\#/ ) {
      next;
    } elsif ( $l =~ /^\#/ || $l =~ /^\s*$/ || $l =~ m{^//} ) {
      next;
    } elsif ( $l =~ /$FASTA_RE/ ) {
      $self->_parse_seq($l);
      next;
    } else {
      last;
    }
  }
  return unless $l;

  my $feat = $self->_parse_gff_line($l);

  return $feat;
}

sub next_feat_raw {
  my ($self) = @_;

  my $fhi = $self->_fhi;

  my $l;
  while ( defined( $l = $fhi->() ) ) {
    if ( $l =~ /$SEQ_REGION_RE/ ) {
      my ( $seqid, $start, $end ) = ( $1, $2, $3 );
      push @{ $self->segments }, { id => $seqid, start => $start, end => $end };
    } elsif ( $l =~ /^\#\#\#/ ) {
      next;
    } elsif ( $l =~ /^\#/ || $l =~ /^\s*$/ || $l =~ m{^//} ) {
      next;
    } elsif ( $l =~ /$FASTA_RE/ ) {
      $self->_parse_seq($l);
      next;
    } else {
      last;
    }
  }
  return unless $l;

  return $l;
}

sub write_feat {
  my ( $self, @feats ) = @_;
  my $fh = $self->fh;

  $self->_write_header
    unless ( $self->_wrote_sth_before );

  for my $f (@feats) {
    confess "feature is no a Bio::Gonzales::Feat: " . Dumper($f)
      unless ( blessed $f eq 'Bio::Gonzales::Feat' );
    print $fh $f->to_gff3( $self->escape_whitespace );
  }

  return $self;
}

sub write_comment {
  my ( $self, $text ) = @_;

  my $fh = $self->fh;

  $self->_write_header
    unless ( $self->_wrote_sth_before );

  print $fh '#' . $text . "\n";

  return $self;
}

sub _from_gff3_string {
}

sub _split_attributes {
  my ( $self, $attributes ) = @_;

  my %attrs;
  my @groups = split( /\s*;\s*/, $attributes );

  for my $group (@groups) {
    my ( $tag, $value ) = split /=/, $group;

    $tag =~ s/$ATTR_UNESCAPE_RE/chr(hex($1))/eg;
    if ( defined($value) ) {
      my @values = map { s/$ATTR_UNESCAPE_RE/chr(hex($1))/eg; $_ } split /,/, $value;
      $attrs{$tag} //= [];
      push @{ $attrs{$tag} }, @values;
    } else {
      carp "WARNING: Problems to extract attribute, found: $tag" unless ( $self->quiet );
    }
  }
  return \%attrs;
}

sub _parse_gff_line {
  my ( $self, $l ) = @_;

  my @d = split( /\t/, $l );

  confess "error in gff:\n$l\n"
    unless ( @d == 9 );

  if ( ref $self->preprocess eq 'CODE' ) {
    @d = $self->preprocess->(@d);
  }

  $d[8] = $self->_split_attributes( $d[8] );

  if    ( $d[6] eq '-' ) { $d[6] = -1; }
  elsif ( $d[6] eq '+' ) { $d[6] = 1; }
  else                   { $d[6] = 0; }

  my %feat;
  for ( my $i = 0; $i < @GFF_COLUMN_NAMES; $i++ ) {
    $feat{ $GFF_COLUMN_NAMES[$i] } = $d[$i] unless ( $d[$i] eq '.' );
  }

  return Bio::Gonzales::Feat->new( \%feat );
}

sub write_collected_feats {
  my ( $self, $sub ) = @_;
  my $fh = $self->fh;

  $self->_connect_feats;
  my $parents           = $self->_find_parent_feats;
  my $escape_whitespace = $self->escape_whitespace;

  my $gsub = sub {
    my ( $f, $id ) = @_;
    $sub->( $f, $id ) if ($sub);
    $f->sort_subfeats;
    print $fh $f->to_gff3($escape_whitespace);
    return;
  };

  for my $p (@$parents) {
    $gsub->($p);
    $p->recurse_subfeats($gsub);
  }
  return;

}

1;

__END__

=head1 NAME

Bio::Gonzales::Feat::IO::GFF3 - read and write gff files

=head1 SYNOPSIS

  use Bio::Gonzales::Feat::IO::GFF3;

  my $output = Bio::Gonzales::Feat::IO::GFF3->new( file => 'a.gff', mode => '>', escape_whitespace => 1 );
  my $gffin = Bio::Gonzales::Feat::IO::GFF3->new( file => 'a.gff' );

  # gzipped files can be read directly.
  my $gffin = Bio::Gonzales::Feat::IO::GFF3->new( file => 'a.gff.gz' );

  my $gffin = Bio::Gonzales::Feat::IO::GFF3->new('a.gff');

  while ( my $feat = $gffin->next_feat ) {
    # $feat is a Bio::Gonzales::Feat
    next if ( $feat->type ne 'mRNA' );

    say STDERR $feat->id . " - " . $feat->parent_id;
  }

  $gffin->close;


=head1 DESCRIPTION

=head1 OPTIONS

=over 4

=item B<< mode => $mode >>

Bio::Gonzales::Feat::IO::GFF3 supports 3 different modes,

  Bio::Gonzales::Feat::IO::GFF3->new(mode => '>', ...); #output
  Bio::Gonzales::Feat::IO::GFF3->new(mode => '<', ...); #input, DEFAULT
  Bio::Gonzales::Feat::IO::GFF3->new(mode => '>>', ...); #append

all modes also work with gzipped files (ending on '.gz').

=item B<< fh => $fh >>

Bio::Gonzales::Feat::IO::GFF3 uses $fh to read or write data.

  open my $fh, '<', 'file.gff3' or confess "Can't open filehandle: $!";
  my $gff = Bio::Gonzales::Feat::IO::GFF3->new(fh => $fh, ...);

  # ... do something ...

  $gff->close;
  close $fh;

=item B<< file => $file >>

read from or write to the file C<$file>

=item B<< escape_whitespace => 1 >>

Usually, only whitespaces in the C<Target> attribute are escaped. If this
feature is turned on, whitespace in all attribute values will be escaped.

=item B<< record_filter => sub { ... } >>

Before reading in the GFF3 information, filter the raw line content according
to the supplied function. This functionality is handy for big gff3 files where
only a part of the output should be parsed.

Example:

  my $sub = sub {
    my $line = shift;

    return $line =~ /\tmRNA\t/;
  };
  my $gff = Bio::Gonzales::Feat::IO::GFF3->new( file => '...', mode => '<', record_filter => $sub );

  # ... only lines with the type 'mRNA' will be parsed ...

  $gff->close;


=back

=head1 METHODS

=over 4

=item B<< $gffio->write_feat($feat) >>

Write the feature to the output. Do not forget to call C<$gffio->close> at the
end of the processing, otherwise you probably end up writing only half of the
features.

=item B<< my $feat = $gffio->next_feat() >>

Retrieve the next feature, if in reading mode.

=item B<< $gffio->segments >>

=item B<< $gffio->pragmas >>

=item B<< $gffio->preprocess(\&process) >>

Change the gff input before the feature object gets instantiated. Arguments of the C<&process> sub are the nine columns of the gff file split into an array.

Example sub:
    sub process {
        my @cols = @_;

        $cols[1] = "createdbyme";
        return @cols;
    }

=item B<< $gffio->comments >>

=back

=head1 SEE ALSO

=head1 AUTHOR

jw bargsten, C<< <joachim.bargsten at wur.nl> >>

=cut