The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Bio::GenBankParser;

# $Id: GenBankParser.pm 27 2010-06-23 16:45:50Z kyclark $

use warnings;
use strict;
use Carp qw( croak );
use File::Spec::Functions;
use Parse::RecDescent;
use Readonly;

Readonly my $GENBANK_RECORD_SEPARATOR => "//\n";

=pod

=head1 NAME

Bio::GenBankParser - Parse::RecDescent parser for a GenBank record

=head1 VERSION

Version 0.05

=cut

our $VERSION = '0.05';

=head1 SYNOPSIS

This module aims to improve on the BioPerl GenBank parser by using
a grammar-based approach with Parse::RecDescent.

    use Bio::GenBankParser;

    my $parser = Bio::GenBankParser->new();

    local $/ = "//\n";
    while ( my $rec = <$input> ) {
        my $gb_rec = $parser->parse( $rec );
    }

Or:

    my $parser = Bio::GenBankParser->new( file => $file );
    while ( my $seq = $parser->next_seq ) {
        ...
    }

=head1 METHODS

=cut

# ----------------------------------------------------------------
sub new {

=pod

=head2 new

  use Bio::GenBankParser;
  my $parser = Bio::GenBankParser->new;

=cut

    my $class = shift;
    my %args  = ( @_ && ref $_[0] eq 'HASH' ) ? %{ $_[0] } : @_;
    my $self  = bless \%args, $class;

    if ( $args{'file'} ) {
        $self->file( $args{'file'} );
    }

    return $self;
}

# ----------------------------------------------------------------
sub DESTROY {
    my $self = shift;

    if ( my $fh = $self->{'fh'} ) {
        close $fh;
    }
}

# ----------------------------------------------------------------
sub file {

=pod

=head2 file

    $parser->file('/path/to/file');

Informs the parser to read sequentially from a file.

=cut

    my $self = shift;

    if ( my $file = shift ) {
        $file = canonpath( $file );

        if ( -e $file && -s _ && -r _ ) { 
            open my $fh, '<', $file or croak("Can't read file '$file'; $!\n");

            $self->{'file'} = $file;
            $self->{'fh'}   = $fh;
        }
        else {
            croak("Non-existent, empty or unreadable file: '$file'");
        }
    }

    return 1;
}

# ----------------------------------------------------------------
sub current_record {

=pod

=head2 current_record

    my $genbank_record = $parser->current_record;

Returns the current unparsed GenBank record.

=cut

    my $self = shift;

    return $self->{'current_record'};
}

# ----------------------------------------------------------------
sub next_seq {

=pod

=head2 next_seq

    my $seq = $parser->next_seq;

Returns the next sequence from the C<file>.

=cut

    my $self = shift;

    if ( my $fh = $self->{'fh'} ) {
        local $/ = $GENBANK_RECORD_SEPARATOR; 

        my $rec;
        for (;;) {
            $rec = <$fh>;
            last if !defined $rec || $rec =~ /\S+/;
        }

        if ( defined $rec && $rec =~ /\S+/ ) {
            return $self->parse( $rec );
        }
        else {
            return undef;
        }
    }
    else {
        croak("Can't call 'next_seq' without a 'file' argument");
    }
}

# ----------------------------------------------------------------
sub parse {

=pod

=head2 parse

    my $rec = $parser->parse( $text );
    print $rec->{'ACCESSION'};

Parses a (single) GenBank record into a hash reference.

=cut

    my $self   = shift;
    my $text   = shift() or croak('No input to parse');
    my $parser = $self->parser or croak('No parser');

    $self->{'current_record'} = $text;

    return $parser->startrule( $text );
}

# ----------------------------------------------------------------
sub parser {

=pod

=head2 parser

Returns the Parse::RecDescent object.

=cut

    my $self = shift;

    if ( !defined $self->{'parser'} ) {
        my $grammar = $self->grammar or croak('No grammar');
        $self->{'parser'} = Parse::RecDescent->new( $grammar );
    }

    return $self->{'parser'};
}

# ----------------------------------------------------------------
sub grammar {

=pod

=head2 grammar

Returns the Parse::RecDescent grammar for a GenBank record.

=cut

    my $self = shift;
    return <<'END_OF_GRAMMAR';
{
    my $ref_num  = 1;
    my %record   = ();
    my %ATTRIBUTE_PROMOTE = map { $_, 1 } qw[ 
        mol_type 
        cultivar 
        variety 
        strain 
    ];

    $::RD_ERRORS; # report fatal errors
#    $::RD_TRACE  = 0;
#    $::RD_WARN   = 0; # Enable warnings. This will warn on unused rules &c.
#    $::RD_HINT   = 0; # Give out hints to help fix problems.
}

startrule: section(s) eofile 
    { 
        if ( !$record{'ACCESSION'} ) {
            $record{'ACCESSION'} = $record{'LOCUS'}->{'genbank_accession'};
        }

        if ( ref $record{'SEQUENCE'} eq 'ARRAY' ) {
            $record{'SEQUENCE'} = join('', @{ $record{'SEQUENCE'} });
        }

        $return = { %record };
        %record = ();
    }
    | <error>

section: commented_line
    | header
    | locus
    | dbsource
    | definition
    | accession_line
    | project_line
    | version_line
    | keywords
    | source_line
    | organism
    | reference
    | features
    | base_count
    | contig
    | origin
    | comment
    | record_delimiter
    | <error>

header: /.+(?=\nLOCUS)/xms

locus: /LOCUS/xms locus_name sequence_length molecule_type
    genbank_division(?) modification_date
    {
        $record{'LOCUS'} = {
            locus_name        => $item{'locus_name'},
            sequence_length   => $item{'sequence_length'},
            molecule_type     => $item{'molecule_type'},
            genbank_division  => $item{'genbank_division(?)'}[0],
            modification_date => $item{'modification_date'},
        }
    }

locus_name: /\w+/

space: /\s+/

sequence_length: /\d+/ /(aa|bp)/ { $return = "$item[1] $item[2]" }

molecule_type: /\w+/ (/[a-zA-Z]{4,}/)(?)
    { 
        $return = join(' ', map { $_ || () } $item[1], $item[2][0] ) 
    }

genbank_division: 
    /(PRI|CON|ROD|MAM|VRT|INV|PLN|BCT|VRL|PHG|SYN|UNA|EST|PAT|STS|GSS|HTG|HTC|ENV)/

modification_date: /\d+-[A-Z]{3}-\d{4}/

definition: /DEFINITION/ section_continuing_indented
    {
        ( $record{'DEFINITION'} = $item[2] ) =~ s/\n\s+/ /g;
    }

section_continuing_indented: /.*?(?=\n[A-Z]+\s+)/xms

section_continuing_indented: /.*?(?=\n\/\/)/xms

accession_line: /ACCESSION/ section_continuing_indented
    {
        my @accs = split /\s+/, $item[2];
        $record{'ACCESSION'} = shift @accs;
        push @{ $record{'VERSION'} }, @accs;
    }

version_line: /VERSION/ /(.+)(?=\n)/
    {
        push @{ $record{'VERSION'} }, split /\s+/, $item[2];
    }

project_line: /PROJECT/ section_continuing_indented
    {
        $record{'PROJECT'} = $item[2];
    }

keywords: /KEYWORDS/ keyword_value
    { 
        $record{'KEYWORDS'} = $item[2];
    }

keyword_value: section_continuing_indented
    { 
        ( my $str = $item[1] ) =~ s/\.$//;
        $return = [ split(/,\s*/, $str ) ];
    }
    | PERIOD { $return = [] }

dbsource: /DBSOURCE/ /\w+/ /[^\n]+/xms
    {
        push @{ $record{'DBSOURCE'} }, {
            $item[2], $item[3]
        };
    }

source_line: /SOURCE/ source_value 
    { 
        ( my $src = $item[2] ) =~ s/\.$//;
        $src =~ s/\bsp$/sp./;
        $record{'SOURCE'} = $src;
    }

source_value: /(.+?)(?=\n\s{0,2}[A-Z]+)/xms { $return = $1 }

organism: organism_line classification_line
    { 
        $record{'ORGANISM'} = $item[1];
        $record{'CLASSIFICATION'} = $item[2];
    }

organism_line: /ORGANISM/ organism_value { $return = $item[2] }

organism_value: /([^\n]+)(?=\n)/xms { $return = $1 }

classification_line: /([^.]+)[.]/xms { $return = [ split(/;\s*/, $1) ] }

word: /\w+/

reference: /REFERENCE/ NUMBER(?) parenthetical_phrase(?) authors(?) consrtm(?) title journal remark(?) pubmed(?) remark(?)
    {
        my $num    = $item[2][0] || $ref_num++;
        my $remark = join(' ', map { $_ || () } $item[8][0], $item[10][0]);
        $remark    = undef if $remark !~ /\S+/;

        push @{ $record{'REFERENCES'} }, {
            number  => $num,
            authors => $item{'authors(?)'}[0],
            title   => $item{'title'},
            journal => $item{'journal'},
            pubmed  => $item[9][0],
            note    => $item[3][0],
            remark  => $remark,
            consrtm => $item[5][0],
        };

    }

parenthetical_phrase: /\( ([^)]+) \)/xms
    {
        $return = $1;
    }

authors: /AUTHORS/ author_value { $return = $item[2] }

author_value: /(.+?)(?=\n\s{0,2}[A-Z]+)/xms 
    { 
        $return = [ 
            grep  { !/and/ }      
            map   { s/,$//; $_ } 
            split /\s+/, $1
        ];
    }

title: /TITLE/ /.*?(?=\n\s{0,2}[A-Z]+)/xms
    { ( $return = $item[2] ) =~ s/\n\s+/ /; }

journal: /JOURNAL/ journal_value 
    { 
        $return = $item[2] 
    }

journal_value: /(.+)(?=\n\s{3}PUBMED)/xms 
    { 
        $return = $1; 
        $return =~ s/\n\s+/ /g; 
    }
    | /(.+?)(?=\n\s{0,2}[A-Z]+)/xms 
    { 
        $return = $1; 
        $return =~ s/\n\s+/ /g; 
    }

pubmed: /PUBMED/ NUMBER
    { $return = $item[2] }

remark: /REMARK/ section_continuing_indented
    { $return = $item[2] }

consrtm: /CONSRTM/ /[^\n]+/xms { $return = $item[2] }

features: /FEATURES/ section_continuing_indented
    { 
        my ( $location, $cur_feature_name, %cur_features, $cur_key );
        for my $fline ( split(/\n/, $item[2]) ) {
            next if $fline =~ m{^\s*Location/Qualifiers};
            next if $fline !~ /\S+/;

            if ( $fline =~ /^\s{21}\/ (\w+?) = (.+)$/xms ) {
                my ( $key, $value )   = ( $1, $2 );
                $value                =~ s/^"|"$//g;
                $cur_key              = $key;
                $cur_features{ $key } = $value;

                if ( $key eq 'db_xref' && $value =~ /^taxon:(\d+)$/ ) {
                    $record{'NCBI_TAXON_ID'} = $1;
                }

                if ( $ATTRIBUTE_PROMOTE{ $key } ) {
                    $record{ uc $key } = $value;
                }
            }
            elsif ( $fline =~ /^\s{5}(\S+) \s+ (.+)$/xms ) {
                my ( $this_feature_name, $this_location ) = ( $1, $2 );
                $cur_key = '';

                if ( $cur_feature_name ) {
                    push @{ $record{'FEATURES'} }, {
                        name     => $cur_feature_name,
                        location => $location,
                        feature  => { %cur_features },
                    };

                    %cur_features = ();
                }

                ( $cur_feature_name, $location ) = 
                    ( $this_feature_name, $this_location );
            }
            elsif ( $fline =~ /^\s{21}([^"]+)["]?$/ ) {
                if ( $cur_key ) {
                    $cur_features{ $cur_key } .= 
                        $cur_key eq 'translation' 
                            ? $1
                            : ' ' . $1;
                }
            }
        }

        push @{ $record{'FEATURES'} }, {
            name     => $cur_feature_name,
            location => $location,
            feature  => { %cur_features },
        };
    }

base_count: /BASE COUNT/ base_summary(s)
    {
        for my $sum ( @{ $item[2] } ) {
            $record{'BASE_COUNT'}{ $sum->[0] } = $sum->[1];
        }
    }

base_summary: /\d+/ /[a-zA-Z]+/
    {
        $return = [ $item[2], $item[1] ];
    }

origin: /ORIGIN/ origin_value 
    { 
        $record{'ORIGIN'} = $item[2] 
    }

origin_value: /(.*?)(?=\n\/\/)/xms
    {
        my $seq = $1;
        $record{'SEQUENCE'} = [];
        while ( $seq =~ /([actg]+)/gc ) {
            push @{ $record{'SEQUENCE'} }, $1;
        }

        $return = $seq;
    }

comment: /COMMENT/ comment_value

comment_value: /(.+?)(?=\n[A-Z]+)/xms
    { 
        $record{'COMMENT'} = $1;
    }

contig: /CONTIG/ section_continuing_indented 
    {
        $record{'CONTIG'} = $item[2];
    }

commented_line: /#[^\n]+/

NUMBER: /\d+/

PERIOD: /\./

record_delimiter: /\/\/\s*/xms

eofile: /^\Z/

END_OF_GRAMMAR
}

# ----------------------------------------------------------------
=pod

=head1 AUTHOR

Ken Youens-Clark E<lt>kclark at cpan.orgE<gt>.

=head1 BUGS

Please report any bugs or feature requests to C<bug-bio-genbankparser
at rt.cpan.org>, or through the web interface at
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Bio-GenBankParser>.
I will be notified, and then you'll automatically be notified of
progress on your bug as I make changes.

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc Bio::GenBankParser

=over 4

=item * RT: CPAN's request tracker

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Bio-GenBankParser>

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/Bio-GenBankParser>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/Bio-GenBankParser>

=item * Search CPAN

L<http://search.cpan.org/dist/Bio-GenBankParser>

=back

=head1 ACKNOWLEDGEMENTS

Lincoln Stein, Doreen Ware and everyone at Cold Spring Harbor Lab.

=head1 COPYRIGHT & LICENSE

Copyright 2008 Ken Youens-Clark, all rights reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of Bio::GenBankParser