The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
=head1 NAME

    Bio::KEGGI::ko

=head1 DESCRIPTION

    Parse KEGG ko file (ftp://ftp.genome.jp/pub/kegg/genes/ko).
    
=head1 METHODS

=head2 next_rec

    Name:   next_rec
    Desc:   Get next KEGG record
    Usage:  $o_keggi->next_rec()
    Args:   none
    Return: A Bio::KEGG::ko object
    
=head1 VERSION

    v0.1.5
    
=head1 AUTHOR
    
    Haizhou Liu (zeroliu-at-gmail-dot-com)
    
=cut

=begin NOTE

    Retruned data structure:
    
    ----------------------------------------------------------------------------
    
    $rh_rec = {
        'id'      => $id,
        'name'    => $name,
        'definit' => $definition,
        'ec'      => [ $ec, ... ],
        'pathway' => [ $pathway_id, ... ],
        'module'  => [ $module_id, ... ],
        'class'   => [ $class, ... ],
        'dblink'  => [
            {
                'db'   => $db,
                'link' => [ $link_id, ... ],
            },
            ...
        ],
        'gene'    => [
            {
                'org'  => $org,
                'org_gene' => [
                    {
                        entry => $entry,
                        name  => $name,
                    },
                    ...
                ],
                ...
            },
            ...
        ],
        'pmid'    => [ $pmid, ... ],
    }
    
    ----------------------------------------------------------------------------

=cut

package Bio::KEGGI::ko;

use strict;
use warnings;

use Switch;
use Text::Trim;

use Bio::KEGG::ko;

# use Smart::Comments;

our $VERSION = 'v0.1.5';

use base qw(Bio::KEGGI);

=begin next_rec
    Name:   next_rec
    Desc:   Get next KEGG record
    Usage:  $o_keggi->next_rec()
    Args:   none
    Return: A KEGG object
=cut

sub next_rec {
    my $self = shift;
    
    my $ra_rec = _get_next_rec($self->{'_FH'});
    my $rh_rec = _parse_ko_rec($ra_rec);
    
    bless($rh_rec, "Bio::KEGG::ko") if (defined $rh_rec);
    
    return $rh_rec;
}


=begin _get_next_rec
    Name:   _get_next_rec
    Desc:   Read a record from KEGG file
    Usage:  _get_next_rec(FH)
    Args:   A filehandle of KEGG file
    Return: A ref of an array for a KEGG record
=end
=cut

sub _get_next_rec {
    my $ifh = shift;
    
    # Since a KEGG record ended with '///'
    local $/ = "\/\/\/\n";
    
    my $rec;
    
    if ($rec = <$ifh>) {
        my @rec = split(/\n/, $rec);
        
        return \@rec;
    }
    else { # To the end of file
        return;
    }
}

=begin _parse_ko_rec
    Name:   _parse_ko_rec
    Desc:   Parse KEGG ko record
    Usage:  _parse_ko_rec($ra_rec)
    Args:   A reference to an array of Bio::KEGGI::ko record
    Return: A reference to a hash of Bio::KEGG record
=cut

sub _parse_ko_rec {
    my $ra_rec = shift;
    
    # KO record: $ra_rec
    
    my $rh_rec;
    my $cur_section = '';
    my $db_tag = '';
    my $gene_org_tag = '';
    
    for my $row ( @{$ra_rec} ) {
        next if ( $row =~ /^\s*$/);
        next if ( $row =~ /\/\/\// );
        
        if ($row =~ /^ENTRY\s+(.+?)\s+/) {
            $rh_rec->{'id'} = $1;
        }
        elsif ($row =~ /^NAME/ ) {
            # 'NAME        E1.1.1.3'
            # 'NAME        E1.2.1.2B2'
            # 'NAME        E1.1.-.-'
            if ($row =~ /^NAME\s{8}E[\w\.\-]+$/) {
                next;
                # $rh_rec->{'ec'} = $1;
            }
            # 'NAME        E1.1.1.1, adh'
            # 'NAME        E1.2.99.2A, cdhA'
            elsif ($row =~ /^NAME\s{8}E[\w\.\-]+,\s(.+?)$/) {
                # $rh_rec->{'ec'} = $1;
                $rh_rec->{'name'} = $1;
            }
            # 'NAME        BDH, butB'
            elsif ($row =~ /^NAME\s{8}(.+?)$/) {
                $rh_rec->{'name'} = $1;
            }
            else {  # Unrecognized NAME
                # Do nothing
                ### Unrecognized NAME: $row
            }
        }
        elsif ($row =~ /^DEFINITION\s{2}(.+)?/) { # There might be multi rows for DEFINITION
            $cur_section = 'DEFINITION';
            
            my $defin = $1;
            
            $rh_rec->{'definit'} = $defin;
            
            # if it ended with a ']'
            if ($defin =~ /]$/) {
                if (my $ra_ec = _get_definition_ec($defin) ) {
                    $rh_rec->{'ec'} = $ra_ec;
                }
            }
        }
        elsif ($row =~ /^PATHWAY\s{5}(\w+\d{5})\s/) {
            $cur_section = 'PATHWAY';
            
            push @{ $rh_rec->{'pathway'} }, $1;
        }
        elsif ($row =~ /^MODULE\s{6}(M\d{5})/) {
            $cur_section = 'MODULE';
            
            push @{ $rh_rec->{'module'} }, $1;
        }
        elsif ($row =~ /^DISEASE\s{5}(H\d{5})/) {
            $cur_section = 'DISEASE';
            
            push @{ $rh_rec->{'disease'} }, $1;
        }
        elsif ($row =~ /^CLASS\s{7}(.+)$/) {
            $cur_section = 'CLASS';
            
            push @{ $rh_rec->{'class'} }, $1;
        }
        elsif ($row =~ /DBLINKS\s{5}(\S+?):\s(.+)?/) {
            $cur_section = 'DBLINKS';
            $db_tag = $1;
            
            my $dblinks = $2;
            my @dblinks = split(/\s/, $dblinks);
            
            my $rh_dblink = {
                'db' => $db_tag,
                'link' => \@dblinks,
            };
            
            push @{ $rh_rec->{'dblink'} }, $rh_dblink;
        }
        elsif ($row =~ /^GENES\s{7}(\S+):\s(.+?)$/) {
            $cur_section = 'GENES';
            $gene_org_tag = lc($1); # convert upper-cased organism name to lower-case
            
            my $genes = $2;
            # my @genes = split(/\s/, $genes);
            my $ra_org_gene = _parse_gene($genes);
            
            my $rh_gene = {
                'org'      => $gene_org_tag,
                'org_gene' => $ra_org_gene,
            };
            
            push @{ $rh_rec->{'gene'} }, $rh_gene;
        }
        elsif ($row =~ /^REFERENCE/) {
            if ($row =~ /^REFERENCE\s+PMID:(\d+)/) {
                $cur_section = 'REFERENCE';
                push @{ $rh_rec->{'pmid'} }, $1;
            }
            elsif ($row =~ /^REFERENCE/) {
                # Do nothing
            }
            else {
                print '-'x50, "\n", "Unrecognized:\n", $row, "\n", '-'x50, "\n";
            }
        }
        elsif ($row =~ /^\s{2}(?:AUTHORS|TITLE|JOURNAL)/) { # REFERENCE section
            # Do nothing
        }
        elsif ($row =~ /^\s{12}\S/) { # Continuous text for existing section
            switch ($cur_section) {
                case 'DEFINITION' {
                    trim($row);
                    
                    $rh_rec->{'definit'} .= " $row";
                    
                    if ($rh_rec->{'definit'} =~ /\]$/) {    # possible EC definition
                        if (my $ra_ec = _get_definition_ec( $rh_rec->{'definit'} ) ) {
                            $rh_rec->{'ec'} = $ra_ec;
                        }
                    }
                }
                case 'PATHWAY' {
                    # trim($row);
                    
                    if ($row =~ /ko(\d{5})/) {
                        push @{ $rh_rec->{'pathway'} }, $1;
                    }
                    else {
                        ### Unrecognized PATHWAY: $row
                    }
                }
                case 'MODULE' {
                    if ($row =~ /(M\d{5})/) {
                        push @{ $rh_rec->{'module'} }, $1;
                    }
                    else {
                        ### Unrecognized MODULE: $row
                    }
                }
                case 'DISEASE' {
                    if ($row =~ /(\H\d{5})/) {
                        push @{ $rh_rec->{'disease'} }, $1;
                    }
                    else {
                        ### Unrecognized MODULE: $row
                    }
                }
                case 'CLASS' {
                    trim($row);
                    
                    if ($row =~ /^[A-Z]/) {
                        push @{ $rh_rec->{'class'} }, $row;
                    }
                    else {
                        my $class = pop @{ $rh_rec->{'class'} };
                        
                        $class .= " $row";
                        
                        push @{ $rh_rec->{'class'} }, $class;
                    }
                }
                case 'DBLINKS' {
                    if ($row =~ /(\S+):\s(.+?)$/) {
                        $db_tag = $1;
                        my $dblinks = $2;
                        my @dblinks = split(/\s/, $dblinks);
                        
                        my $rh_dblink = {
                            'db'   => $db_tag,
                            'link' => \@dblinks,
                        };
                        
                        push @{ $rh_rec->{'dblink'} }, $rh_dblink;
                    }
                    else {
                        ### Unrecognized DBLINKS: $row
                    }
                }
                # =>'            OSA: 4330090(Os02g0637700) 4348152(Os10g0159800)'
                case 'GENES' {
                    if ($row =~ /([A-Z]+):\s(.+?)$/) {
                        $gene_org_tag = lc($1);
                        
                        my $genes = $2;
=begin
                        my @genes = split(/\s/, $genes);
                        
                        my $rh_gene = {
                            'org'  => $gene_org_tag,
                            'gene' => \@genes,
                        };
=end
=cut

                        my $ra_org_gene = _parse_gene($genes);
                        
                        my $rh_gene = {
                            'org'      => $gene_org_tag,
                            'org_gene' => $ra_org_gene,
                        };
                        
                        push @{ $rh_rec->{'gene'} }, $rh_gene;
                    }
                    else {
                        ### Unrecognized GENES: $row
                    }
                }
                case 'REFERENCE' {
                    # Do nothing
                }
                else {
                    ### Unrecognized row: $row
                }
            }
        }
        elsif ($row =~ /^\s{16}/) {
            switch ($cur_section) {
                case 'PATHWAY' {
                    # Do nothing
                }
                case 'DISEASE' {
                    # Do nothing
                }
                case 'DBLINKS' {
                    trim($row);
                    my @dblinks = split(/\s/, $row);
                    
                    my $rh_dblink = pop @{ $rh_rec->{'dblink'} };
                    
                    push @{ $rh_dblink->{'link'} }, @dblinks;
                    
                    push @{ $rh_rec->{'dblink'} }, $rh_dblink;
                }
                #   '             OSA: 4330090(Os02g0637700) 4348152(Os10g0159800)'
                # =>'                   4350053(Os11g0210300) 4350054(Os11g0210500)'
                case 'GENES' {
                    trim($row);
                    
                    # my @genes = split(/\s/, $row);
                    my $ra_org_gene = _parse_gene($row);
                    
                    my $rh_gene = pop @{ $rh_rec->{'gene'} };
                    
                    for my $rh_org_gene ( @{ $ra_org_gene } ) {
                        push @{ $rh_gene->{'org_gene'} }, $rh_org_gene;
                    }
                    
                    push @{ $rh_rec->{'gene'} }, $rh_gene;
                }
                case 'MODULE' {
                    # Do nothing
                }
                case 'REFERENCE' {
                    # Do nothing
                }
                else {
                    ### Unrecognized row: $row
                }
            }
        }
        else {    # Unparsed row
            # Do nothing
            
            ### Current entry: $rh_rec->{'id'}
            ### Unrecognized row:  $row
        }
    }
    
    return $rh_rec;
}

=begin _get_definition_ec
    Name:   _get_definition_ec
    Desc:   Parse EC from DEFINITION
    Usage:  _get_definition_ec($def)
    Args:   A string of DEFINITION
    Return: A string
=cut

sub _get_definition_ec {
    my $str = shift;
    
    if ($str =~ /\[EC:(.+?)\]/) {
        my $ecs = $1;
        my @ecs = split(/\s/, $ecs);
        
        return \@ecs;
    }
    else {  # without EC information
        return;
    }
}

=begin _parse_gene
    Name:   _parse_gene
    Desc:   Parse gene entries and names for an organism
            This subroutine presumes there is only ONE name for a gene entry.
            Such as: 'AT1G32780' 'AT1G77120(ADH1)'.
            If there was multiple names for a gene entry, it will print
            DEBUG messages.
            
            Return:
            ----------------------------------------------------------
            $ra_org_genes = [
                {
                    'entry' => $entry,
                    'name'  => $name,
                },
                ...
            ]
            ----------------------------------------------------------
    Usage:  _parse_gene($genes)
    Args:   A string of genes
    Return: A reference of hash
=cut

sub _parse_gene {
    my ($genes) = @_;
    
    my @genes = split(/\s/, $genes);
    
    my @org_genes;
    
    for my $gene (@genes) {
        my $rh_gene = {};   # Init hash
        
        if ($gene =~ /(\S+)\((\S+)\)/) {    # 'AT1G77120(ADH1)'
            $rh_gene->{'entry'} = $1;
            $rh_gene->{'name'}  = $2;
            
            # DEBUG
            if ($rh_gene->{'name'} =~ /\s/) {
                ### Multiple names for a gene:
                ### ENTRY: $rh_gene->{'entry'}
                ### NAMES: $rh_gene->{'name'}
            }
        }
        else {  # 'AT1G32780'
            $rh_gene->{'entry'} = $gene;
        }
        
        push @org_genes, $rh_gene;
    }
    
    return \@org_genes;
}

1;