# $Id: base_parser.pm,v 1.18 2008/03/13 05:16:40 cmungall Exp $
#
#
# see also - http://www.geneontology.org
# - http://www.godatabase.org/dev
#
# You may distribute this module under the same terms as perl itself
package GO::Parsers::base_parser;
=head1 NAME
GO::Parsers::base_parser - base class for parsers
=head1 SYNOPSIS
do not use this class directly; use GO::Parser
=cut
=head1 DESCRIPTION
=head1 AUTHOR
=cut
use Carp;
use FileHandle;
use Digest::MD5 qw(md5_hex);
use GO::Parser;
use Data::Stag qw(:all);
use base qw(Data::Stag::BaseGenerator Exporter);
use strict qw(subs vars refs);
# Exceptions
sub throw {
my $self = shift;
confess("@_");
}
sub warn {
my $self = shift;
warn("@_");
}
sub messages {
my $self = shift;
$self->{_messages} = shift if @_;
return $self->{_messages};
}
*error_list = \&messages;
sub message {
my $self = shift;
my $msg = shift;
CORE::warn 'deprecated';
$self->parse_err($msg);
}
=head2 show_messages
Usage -
Returns -
Args -
=cut
sub show_messages {
my $self = shift;
my $fh = shift;
$fh = \*STDERR unless $fh;
foreach my $e (@{$self->error_list || []}) {
printf $fh "\n===\n Line:%s [%s]\n%s\n %s\n\n", $e->{line_no} || "", $e->{file} || "", $e->{line} || "", $e->{msg} || "";
}
}
sub init {
my $self = shift;
$self->messages([]);
$self->acc2name_h({});
$self;
}
sub parsed_ontology {
my $self = shift;
$self->{parsed_ontology} = shift if @_;
return $self->{parsed_ontology};
}
=head2 normalize_files
Usage - @files = $parser->normalize_files(@files)
Returns -
Args -
takes a list of filenames/paths, "glob"s them, uncompresses any compressed files and returns the new file list
=cut
sub normalize_files {
my $self = shift;
my $dtype;
my @files = map {glob $_} @_;
my @errors = ();
my @nfiles = ();
# uncompress any compressed files
foreach my $fn (@files) {
if ($fn =~ /\.gz$/) {
my $nfn = $fn;
$nfn =~ s/\.gz$//;
my $cmd = "gzip -dc $fn > $nfn";
#print STDERR "Running $cmd\n";
my $err = system("$cmd");
if ($err) {
push(@errors,
"can't uncompress $fn");
next;
}
$fn = $nfn;
}
if ($fn =~ /\.Z$/) {
my $nfn = $fn;
$nfn =~ s/\.Z$//;
my $cmd = "zcat $fn > $nfn";
print STDERR "Running $cmd\n";
my $err = system("$cmd");
if ($err) {
push(@errors,
"can't uncompress $fn");
next;
}
$fn = $nfn;
}
push(@nfiles, $fn);
}
my %done = ();
@files = grep { my $d = !$done{$_}; $done{$_} = 1; $d } @nfiles;
return @files;
}
sub fire_source_event {
my $self = shift;
my $file = shift || die "need to pass file argument";
my @fileparts = split(/\//, $file);
my @stat = stat($file);
my $mtime = $stat[9];
my $parsetime = time;
my $md5 = md5_hex($fileparts[-1]);
$self->event(source => [
[source_id => $file ],
[source_type => 'file'],
[source_fullpath => $file ],
[source_path => $fileparts[-1] ],
[source_md5 => $md5],
[source_mtime => $mtime ],
[source_parsetime => $parsetime ],
]
);
return;
}
sub parse_assocs {
my $self = shift;
my $fn = shift;
$self->dtype('go_assoc');
my $p = GO::Parser->get_parser_impl('go_assoc');
%$p = %$self;
$p->parse($fn);
return;
}
sub parse_to_graph {
my $self = shift;
my $h = GO::Parser->create_handler('obj');
$self->handler($h);
$self->parse(@_);
return $h->graph;
}
sub set_type {
my ($self, $fmt) = @_;
$self->dtype($fmt);
my $p = GO::Parser->get_parser_impl($fmt);
bless $self, ref($p);
return;
}
sub dtype {
my $self = shift;
$self->{_dtype} = shift if @_;
return $self->{_dtype};
}
sub parse_file {
my ($self, $file, $dtype) = @_;
$self->dtype($dtype);
$self->parse($file);
}
sub xslt {
my $self = shift;
$self->{_xslt} = shift if @_;
return $self->{_xslt};
}
sub force_namespace {
my $self = shift;
$self->{_force_namespace} = shift if @_;
return $self->{_force_namespace};
}
sub replace_underscore {
my $self = shift;
$self->{_replace_underscore} = shift if @_;
return $self->{_replace_underscore};
}
# EXPERIMENTAL: cache objects
sub use_cache {
my $self = shift;
$self->{_use_cache} = shift if @_;
return $self->{_use_cache};
}
# EXPERIMENTAL: returns subroutine
# sub maps name to cached name
sub file_to_cache_sub {
my $self = shift;
my $lite = $self->litemode;
my $suffix = $lite ? ".lcache" : ".cache";
$self->{_file_to_cache_sub} = shift if @_;
return $self->{_file_to_cache_sub} ||
sub {
my $f = shift;
$f =~ s/\.\w+$//;
$f .= $suffix;
return $f;
};
}
sub cached_obj_file {
my $self = shift;
return $self->file_to_cache_sub->(@_);
}
sub parse {
my ($self, @files) = @_;
my $dtype = $self->dtype;
foreach my $file (@files) {
$file = $self->download_file_if_required($file);
$self->file($file);
#printf STDERR "parsing: $file %d\n", $self->use_cache;
if ($self->use_cache) {
my $cached_obj_file = $self->cached_obj_file($file);
my $reparse;
if (-f $cached_obj_file) {
my @stat1 = lstat($file);
my @stat2 = lstat($cached_obj_file);
my $t1 = $stat1[9];
my $t2 = $stat2[9];
if ($t1 >= $t2) {
$reparse = 1;
}
else {
$reparse = 0;
}
}
else {
$reparse = 1;
}
if ($reparse) {
# make/remake cache
my $hclass = "GO::Handlers::obj_storable";
$self->load_module($hclass);
my $cache_handler =
$hclass->new;
$self->use_cache(0);
my $orig_handler = $self->handler;
$self->handler($cache_handler);
$cache_handler->file($cached_obj_file);
$self->parse($file);
my $g = $cache_handler->graph;
$self->use_cache(1);
my $p2 = GO::Parser->new({
format=>'GO::Parsers::obj_emitter'});
$p2->handler($orig_handler);
# this is the only state we need to copy across
if ($self->can('xslt')) {
$p2->xslt($self->xslt);
}
$p2->emit_graph($g);
}
else {
# use cache
my $p2 = GO::Parser->new({format=>'obj_storable'});
$p2->handler($self->handler);
# this is the only state we need to copy across
if ($self->can('xslt')) {
$p2->xslt($self->xslt);
}
$p2->parse_file($cached_obj_file);
}
next;
}
# check for XSL transform
if ($self->can('xslt') && $self->xslt) {
my $xsl = $self->xslt;
my $xslt_file = $xsl;
if (!-f $xslt_file) {
# if GO_ROOT is set then this specifies the location of the xslt dir
# if it is not set, assume we are using an installed version of go-perl,
# in which case, the xslts will be located along with the perl modules
my $GO_ROOT = $ENV{GO_ROOT};
if ($GO_ROOT) {
# env var takes precedence;
# developers should use this
$xslt_file = "$GO_ROOT/xml/xsl/$xsl.xsl";
}
# default location is with perl modules
if (!$xslt_file || !-f $xslt_file) {
# user-mode; xsl will be wherever the GO modules are installed
require "GO/Parser.pm";
my $dir = $INC{'GO/Parser.pm'};
$dir =~ s/Parser\.pm/xsl/;
$xslt_file = "$dir/$xsl.xsl";
}
}
if (!-f $xslt_file) {
$self->throw("No such file: $xslt_file OR $xsl");
}
# first parse input file to intermediate xml
my $file1 = _make_temp_filename($file, "-1.xml");
my $handler = $self->handler;
my $outhandler1 =
Data::Stag->getformathandler("xml");
$outhandler1->file($file1);
$self->handler($outhandler1);
$self->SUPER::parse($file);
$self->handler->finish;
# transform intermediate xml using XSLT
my $file2 = _make_temp_filename($file, "-2.xml");
# $results contains the post-xslt XML doc;
# we either want to write this directly, or
# pass it to a handler
if ($handler->isa("Data::Stag::XMLWriter")) {
# WRITE DIRECTLY:
# if the final goal is XML, then write this
# directly
if ($handler->file) {
# $ss->output_file($results,$handler->file);
xsltproc($xslt_file,$file1,$handler->file);
} else {
my $fh = $handler->fh;
if (!$fh) {
$fh = \*STDOUT;
xsltproc($xslt_file,$file1);
}
else {
xsltproc($xslt_file,$file1,$file2);
my $infh = FileHandle->new($file2) || die "cannot open $file2";
while (<$infh>) {print $fh $_}
unlink($file2);
}
#$ss->output_fh($results,$handler->fh);
}
} else {
# PASS TO HANDLER:
# we need to do another transform, in perl.
#
# write $results of stylesheet transform
#$ss->output_file($results,$file2);
xsltproc($xslt_file,$file1,$file2);
# clear memory
#$ss=undef;
#$xslt=undef;
#$results=undef;
# we pass the final XML to the handler
my $load_parser = new GO::Parser ({format=>'obo_xml'});
$load_parser->handler($handler);
$load_parser->errhandler($self->errhandler);
$load_parser->parse($file2);
unlink($file2);
}
# restore to previous state
$self->handler($handler);
} else {
# no XSL transform - perform parse as normal
# (use Data::Stag superclass)
$self->SUPER::parse($file);
}
}
}
# applies XSLT and removes input file
sub xsltproc {
my ($xf,$inf,$outf) = @_;
my $cmd = "xsltproc $xf $inf";
if ($outf) {
$cmd .= " > $outf";
}
my $err = system($cmd);
unlink($inf);
if ($err) {
confess("problem running: $cmd");
}
return;
}
sub _make_temp_filename {
my ($base, $suffix) = @_;
$base =~ s/.*\///;
return "TMP.$$.$base$suffix";
}
sub download_file_if_required {
my $self = shift;
my $f = shift;
if ($f =~ /^http:/) {
my $tmpf = _make_temp_filename($f,'.obo');
system("wget -O $tmpf $f");
return $tmpf;
}
else {
return $f;
}
}
=head2 litemode
Usage - $p->litemode(1)
Returns -
Args - bool
when set, parser will only throw the following events:
id|name|is_a|relationship|namespace
(optimisation for fast parsing)
=cut
sub litemode {
my $self = shift;
$self->{_litemode} = shift if @_;
return $self->{_litemode};
}
=head2 acc2name_h
Usage - $n = $p->acc2name_h->{'GO:0003673'}
Returns - hashref
Args - hashref [optional]
gets/sets a hash mapping IDs to names
this will be automatically set by an ontology parser
a non-ontology parser will use this index to verify the parsed data
(see $p->acc_not_found($id), below)
=cut
sub acc2name_h {
my $self = shift;
$self->{_acc2name_h} = shift if @_;
$self->{_acc2name_h} = {}
unless $self->{_acc2name_h};
return $self->{_acc2name_h};
}
=head2 acc_not_found
Usage - if ($p->acc_not_found($go_id)) { warn("$go_id not known") }
Returns - bool
Args - acc string
uses acc2name_h - if this hash mapping has been created AND the acc is
not in the hash, THEN it is considered not found
This is useful for non-ontology parsers (xref_parser, go_assoc_parser)
to check whether a referenced ID is actually present in the ontology
note that if acc2name_h has not been created, then accs cannot be
considered not-found, and this will always return 0/false
=cut
sub acc_not_found {
my $self = shift;
my $acc = shift;
my $h = $self->acc2name_h;
if (scalar(keys %$h) && !$h->{$acc}) {
return 1;
}
return 0;
}
sub dtd {
undef;
}
1;