The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# $Id: base_parser.pm,v 1.18 2008/03/13 05:16:40 cmungall Exp $
#
#
# see also - http://www.geneontology.org
#          - http://www.godatabase.org/dev
#
# You may distribute this module under the same terms as perl itself

package GO::Parsers::base_parser;

=head1 NAME

  GO::Parsers::base_parser     - base class for parsers

=head1 SYNOPSIS

  do not use this class directly; use GO::Parser

=cut

=head1 DESCRIPTION

=head1 AUTHOR

=cut

use Carp;
use FileHandle;
use Digest::MD5 qw(md5_hex);
use GO::Parser;
use Data::Stag qw(:all);
use base qw(Data::Stag::BaseGenerator Exporter);
use strict qw(subs vars refs);

# Exceptions

sub throw {
    my $self = shift;
    confess("@_");
}

sub warn {
    my $self = shift;
    warn("@_");
}

sub messages {
    my $self = shift;
    $self->{_messages} = shift if @_;
    return $self->{_messages};
}

*error_list = \&messages;

sub message {
    my $self = shift;
    my $msg = shift;
    CORE::warn 'deprecated';
    $self->parse_err($msg);
}

=head2 show_messages

  Usage   -
  Returns -
  Args    -

=cut

sub show_messages {
    my $self = shift;
    my $fh = shift;
    $fh = \*STDERR unless $fh;
    foreach my $e (@{$self->error_list || []}) {
        printf $fh "\n===\n  Line:%s [%s]\n%s\n  %s\n\n", $e->{line_no} || "", $e->{file} || "", $e->{line} || "", $e->{msg} || "";
    }
}

sub init {
    my $self = shift;

    $self->messages([]);
    $self->acc2name_h({});
    $self;
}

sub parsed_ontology {
    my $self = shift;
    $self->{parsed_ontology} = shift if @_;
    return $self->{parsed_ontology};
}

=head2 normalize_files

  Usage   - @files = $parser->normalize_files(@files)
  Returns -
  Args    -

takes a list of filenames/paths, "glob"s them, uncompresses any compressed files and returns the new file list

=cut

sub normalize_files {
    my $self = shift;
    my $dtype;
    my @files = map {glob $_} @_;
    my @errors = ();
    my @nfiles = ();
    
    # uncompress any compressed files
    foreach my $fn (@files) {
        if ($fn =~ /\.gz$/) {
            my $nfn = $fn;
            $nfn =~ s/\.gz$//;
            my $cmd = "gzip -dc $fn > $nfn";
            #print STDERR "Running $cmd\n";
            my $err = system("$cmd");
            if ($err) {
                push(@errors,
                     "can't uncompress $fn");
                next;
            }
            $fn = $nfn;
        }
        if ($fn =~ /\.Z$/) {
            my $nfn = $fn;
            $nfn =~ s/\.Z$//;
            my $cmd = "zcat $fn > $nfn";
            print STDERR "Running $cmd\n";
            my $err = system("$cmd");
            if ($err) {
                push(@errors,
                     "can't uncompress $fn");
                next;
            }
            $fn = $nfn;
        }
        push(@nfiles, $fn);
    }
    my %done = ();
    @files = grep { my $d = !$done{$_}; $done{$_} = 1; $d } @nfiles;
    return @files;
}

sub fire_source_event {
    my $self = shift;
    my $file = shift || die "need to pass file argument";
    my @fileparts = split(/\//, $file);
    my @stat = stat($file);
    my $mtime = $stat[9];
    my $parsetime = time;
    my $md5 = md5_hex($fileparts[-1]);
    $self->event(source => [
				     [source_id => $file ],
				     [source_type => 'file'],
				     [source_fullpath => $file ],
				     [source_path => $fileparts[-1] ],
				     [source_md5 => $md5],
				     [source_mtime => $mtime ],
				     [source_parsetime => $parsetime ],
				    ]
			 );
    return;
}

sub parse_assocs {
    my $self = shift;
    my $fn = shift;
    $self->dtype('go_assoc');
    my $p = GO::Parser->get_parser_impl('go_assoc');
    %$p = %$self;
    $p->parse($fn);
    return;
}

sub parse_to_graph {
    my $self = shift;
    my $h = GO::Parser->create_handler('obj');
    $self->handler($h);
    $self->parse(@_);
    return $h->graph;
}

sub set_type {
    my ($self, $fmt) = @_;
    $self->dtype($fmt);
    my $p = GO::Parser->get_parser_impl($fmt);
    bless $self, ref($p);
    return;
}
sub dtype {
    my $self = shift;
    $self->{_dtype} = shift if @_;
    return $self->{_dtype};
}

sub parse_file {
    my ($self, $file, $dtype) = @_;

    $self->dtype($dtype);
    $self->parse($file);
}

sub xslt {
    my $self = shift;
    $self->{_xslt} = shift if @_;
    return $self->{_xslt};
}

sub force_namespace {
    my $self = shift;
    $self->{_force_namespace} = shift if @_;
    return $self->{_force_namespace};
}

sub replace_underscore {
    my $self = shift;
    $self->{_replace_underscore} = shift if @_;
    return $self->{_replace_underscore};
}

# EXPERIMENTAL: cache objects
sub use_cache {
    my $self = shift;
    $self->{_use_cache} = shift if @_;
    return $self->{_use_cache};
}

# EXPERIMENTAL: returns subroutine
# sub maps name to cached name
sub file_to_cache_sub {
    my $self = shift;
    my $lite = $self->litemode;
    my $suffix = $lite ? ".lcache" : ".cache";
    $self->{_file_to_cache_sub} = shift if @_;
    return $self->{_file_to_cache_sub} ||
      sub {
          my $f = shift;
          $f =~ s/\.\w+$//;
          $f .= $suffix;
          return $f;
      };
}


sub cached_obj_file {
    my $self = shift;
    return $self->file_to_cache_sub->(@_);
}

sub parse {
    my ($self, @files) = @_;

    my $dtype = $self->dtype;
    foreach my $file (@files) {

        $file = $self->download_file_if_required($file);

        $self->file($file);
        #printf STDERR "parsing: $file %d\n", $self->use_cache;

        if ($self->use_cache) {
            my $cached_obj_file = $self->cached_obj_file($file);
            my $reparse;
            if (-f $cached_obj_file) {
                my @stat1 = lstat($file);
                my @stat2 = lstat($cached_obj_file);
                my $t1 = $stat1[9];
                my $t2 = $stat2[9];
                if ($t1 >= $t2) {
                    $reparse = 1;
                }
                else {
                    $reparse = 0;
                }
            }
            else {
                $reparse = 1;
            }

            if ($reparse) {
                # make/remake cache
                my $hclass = "GO::Handlers::obj_storable";
                $self->load_module($hclass);
                my $cache_handler =
                  $hclass->new;
                $self->use_cache(0);
                my $orig_handler = $self->handler;
                $self->handler($cache_handler);
                $cache_handler->file($cached_obj_file);
                $self->parse($file);
                my $g = $cache_handler->graph;
                $self->use_cache(1);
                my $p2 = GO::Parser->new({
                                          format=>'GO::Parsers::obj_emitter'});
                $p2->handler($orig_handler);
                # this is the only state we need to copy across
                if ($self->can('xslt')) {
                    $p2->xslt($self->xslt);
                }
                $p2->emit_graph($g);
            }
            else {
                # use cache
                my $p2 = GO::Parser->new({format=>'obj_storable'});
                $p2->handler($self->handler);
                # this is the only state we need to copy across
                if ($self->can('xslt')) {
                    $p2->xslt($self->xslt);
                }
                $p2->parse_file($cached_obj_file);
            }
            next;
        }

        # check for XSL transform
        if ($self->can('xslt') && $self->xslt) {
            my $xsl = $self->xslt;
            my $xslt_file = $xsl;

            if (!-f $xslt_file) {
                # if GO_ROOT is set then this specifies the location of the xslt dir
                #  if it is not set, assume we are using an installed version of go-perl,
                #  in which case, the xslts will be located along with the perl modules
                my $GO_ROOT = $ENV{GO_ROOT};
                if ($GO_ROOT) {
                    # env var takes precedence;
                    # developers should use this
                    $xslt_file = "$GO_ROOT/xml/xsl/$xsl.xsl";
                }
                
                # default location is with perl modules
                if (!$xslt_file || !-f $xslt_file) {
                    # user-mode; xsl will be wherever the GO modules are installed
                    require "GO/Parser.pm";
                    my $dir = $INC{'GO/Parser.pm'};
                    $dir =~ s/Parser\.pm/xsl/;
                    $xslt_file = "$dir/$xsl.xsl";
                }
            }
            if (!-f $xslt_file) {
                $self->throw("No such file: $xslt_file OR $xsl");
            }

            # first parse input file to intermediate xml
            my $file1 = _make_temp_filename($file, "-1.xml");
            my $handler = $self->handler;
            my $outhandler1 =
              Data::Stag->getformathandler("xml");
            $outhandler1->file($file1);
            $self->handler($outhandler1);
            $self->SUPER::parse($file);
            $self->handler->finish;

            # transform intermediate xml using XSLT
            my $file2 = _make_temp_filename($file, "-2.xml");
            # $results contains the post-xslt XML doc;
            # we either want to write this directly, or
            # pass it to a handler

            if ($handler->isa("Data::Stag::XMLWriter")) {
                # WRITE DIRECTLY:
                # if the final goal is XML, then write this
                # directly
                if ($handler->file) {
                    # $ss->output_file($results,$handler->file);
                    xsltproc($xslt_file,$file1,$handler->file);
                } else {
                    my $fh = $handler->fh;
                    if (!$fh) {
                        $fh = \*STDOUT;
                        xsltproc($xslt_file,$file1);
                    }
                    else {
                        xsltproc($xslt_file,$file1,$file2);
                        my $infh = FileHandle->new($file2) || die "cannot open $file2";
                        while (<$infh>) {print $fh $_}
                        unlink($file2);
                    }
                    #$ss->output_fh($results,$handler->fh);
                }
            } else {
                # PASS TO HANDLER:
                # we need to do another transform, in perl.
                # 
                # write $results of stylesheet transform
                #$ss->output_file($results,$file2);
                xsltproc($xslt_file,$file1,$file2);
                
                # clear memory
                #$ss=undef;
                #$xslt=undef;
                #$results=undef;

                # we pass the final XML to the handler
                my $load_parser = new GO::Parser ({format=>'obo_xml'});
                $load_parser->handler($handler);
                $load_parser->errhandler($self->errhandler);
                $load_parser->parse($file2);
                unlink($file2);
            }

            # restore to previous state
            $self->handler($handler);
        } else {
            # no XSL transform - perform parse as normal
            # (use Data::Stag superclass)
            $self->SUPER::parse($file);
        }
    }
}

# applies XSLT and removes input file
sub xsltproc {
    my ($xf,$inf,$outf) = @_;
    my $cmd = "xsltproc $xf $inf";
    if ($outf) {
        $cmd .= " > $outf";
    }
    my $err = system($cmd);
    unlink($inf);
    if ($err) {
        confess("problem running: $cmd");
    }
    return;
}

sub _make_temp_filename {
    my ($base, $suffix) = @_;
    $base =~ s/.*\///;
    return "TMP.$$.$base$suffix";
}

sub download_file_if_required {
    my $self = shift;
    my $f = shift;
    if ($f =~ /^http:/) {
        my $tmpf = _make_temp_filename($f,'.obo');
        system("wget -O $tmpf $f");
        return $tmpf;
    }
    else {
        return $f;
    }
}

=head2 litemode

  Usage   - $p->litemode(1)
  Returns -
  Args    - bool

when set, parser will only throw the following events:

id|name|is_a|relationship|namespace

(optimisation for fast parsing)

=cut

sub litemode {
    my $self = shift;
    $self->{_litemode} = shift if @_;
    return $self->{_litemode};
}

=head2 acc2name_h

  Usage   - $n = $p->acc2name_h->{'GO:0003673'}
  Returns - hashref
  Args    - hashref [optional]

gets/sets a hash mapping IDs to names

this will be automatically set by an ontology parser

a non-ontology parser will use this index to verify the parsed data
(see $p->acc_not_found($id), below)

=cut

sub acc2name_h {
    my $self = shift;
    $self->{_acc2name_h} = shift if @_;
    $self->{_acc2name_h} = {} 
      unless $self->{_acc2name_h};
    return $self->{_acc2name_h};
}


=head2 acc_not_found

  Usage   - if ($p->acc_not_found($go_id)) { warn("$go_id not known") }
  Returns - bool
  Args    - acc string

uses acc2name_h - if this hash mapping has been created AND the acc is
not in the hash, THEN it is considered not found

This is useful for non-ontology parsers (xref_parser, go_assoc_parser)
to check whether a referenced ID is actually present in the ontology

note that if acc2name_h has not been created, then accs cannot be
considered not-found, and this will always return 0/false

=cut

sub acc_not_found {
    my $self = shift;
    my $acc = shift;
    my $h = $self->acc2name_h;
    if (scalar(keys %$h) && !$h->{$acc}) {
        return 1;
    }
    return 0;
}

sub dtd {
    undef;
}

1;