# The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# $Id: Parser.pm,v 1.15 2006/04/20 22:48:23 cmungall Exp $
#
#
# see also - http://www.geneontology.org
#          - http://www.godatabase.org/dev
#
# You may distribute this module under the same terms as perl itself

=head1 NAME

  GO::Parser     - parses all GO file formats and types

=head1 SYNOPSIS

fetch L<GO::Model::Graph> objects using a parser:

  # Scenario 1: Getting objects from a file
  use GO::Parser;
  my $parser = new GO::Parser({handler=>'obj',use_cache=>1});
  $parser->parse("function.ontology");     # ontology
  $parser->parse("GO.defs");               # definitions
  $parser->parse("ec2go");                 # external refs
  $parser->parse("gene-associations.sgd"); # gene assocs
  # get GO::Model::Graph object
  my $graph = $parser->handler->graph;

  # Scenario 2: Getting OBO XML from a file
  use GO::Parser;
  my $parser = new GO::Parser({handler=>'xml'});
  $parser->handler->file("output.xml");
  $parser->parse("gene_ontology.obo");

  # Scenario 3: Using an XSL stylesheet to convert the OBO XML
  use GO::Parser;
  my $parser = new GO::Parser({handler=>'xml'});
  # xslt files are kept in $ENV{GO_ROOT}/xml/xsl
  # (if $GO_ROOT is not set, defaults to install directory)
  $parser->xslt("oboxml_to_owl"); 
  $parser->handler->file("output.owl-xml");
  $parser->parse("gene_ontology.obo");

  # Scenario 4: via scripts
  my $cmd = "go2xml gene_ontology.obo | xsltproc my-transform.xsl -";
  my $fh = FileHandle->new("$cmd |") || die("problem initiating $cmd");
  while(<$fh>) { print $_ }
  $fh->close || die("problem running $cmd");

=cut

=head1 DESCRIPTION

Module for parsing GO flat files; for examples of GO/OBO flatfile
formats see:

L<ftp://ftp.geneontology.org/pub/go/ontology>

L<ftp://ftp.geneontology.org/pub/go/gene-associations>

For a description of the various file formats, see:

L<http://www.geneontology.org/GO.format.html>

L<http://www.geneontology.org/GO.annotation.html#file>

This module will generate XML events from a correctly formatted GO/OBO
file

=head1 SEE ALSO

This module is a part of go-dev, see:

L<http://www.godatabase.org/dev>

for more details

=head1 PUBLIC METHODS

=head2 new

 Title   : new
 Usage   : my $p = GO::Parser->new({format=>'obo_xml',handler=>'obj'});
           $p->parse("go.obo-xml");
           my $g = $p->handler->graph;
 Synonyms: 
 Function: creates a parser object
 Example : 
 Returns : GO::Parser
 Args    : a hashref of arguments:
            format: a format for which a parser exists
            handler: a format for which a perl handler exists
            use_cache: (boolean) see caching below

=head2 parse

 Title   : parse
 Usage   : $p->parse($file);
 Synonyms: 
 Function: parses a file
 Example : 
 Returns : 
 Args    : str filename

=head2 handler

 Title   : handler
 Usage   : my $handler = $p->handler;
 Synonyms: 
 Function: gets/sets a GO::Handler object
 Example : 
 Returns : L<GO::Handlers::base>
 Args    : L<GO::Handlers::base>

=head1 FORMATS

This module is a front end wrapper for a number of different GO/OBO
formats - see the relevant module documentation below for details.

The full list of parsers can be found in the go-perl/GO/Parsers/
directory

=over

=item obo_text

Files with suffix ".obo"

This is a new file format replacement for the existing GO flat file
formats. It handles ontologies, definitions and xrefs (but not
associations)

=item go_ont

Files with suffix ".ontology"

These store the ontology DAGs

=item go_def

Files with suffix ".defs"

=item go_xref

External database references for GO terms

Files with suffix "2go" (eg ec2go, metacyc2go)

=item go_assoc

Annotations of genes or gene products using GO

Files with prefix "gene-association."

=item obo_xml

Files with suffix ".obo.xml" or ".obo-xml"

This is the XML version of the OBO flat file format above

See L<http://www.godatabase.org/dev/xml/doc/xml-doc.html>

=item obj_yaml

A YAML dump of the perl L<GO::Model::Graph> object. You need L<YAML>
from CPAN for this to work

=item obj_storable

A dump of the perl L<GO::Model::Graph> object. You need L<Storable>
from CPAN for this to work. This is intended to cache objects on the
filesystem, for fast access. The obj_storable representation may not
be portable

=back

=head2 PARSING ARCHITECTURE

Each parser fires XML B<events>. The XML events are known as
B<Obo-XML>.  

These XML events can be B<caught> by a handler written in perl, or
they can be caught by an XML parser written in some other language, or
by using XSL stylesheets.

go-dev comes with a number of stylesheets in the 
  go-dev/xml/xsl
directory

Anything that catches these XML events is known as a B<handler>

go-perl comes with some standard perl XML handlers, in addition to
some standard XSL stylesheets. These can be found in the
B<go-dev/go-perl/GO/Handlers> directory

If you are interested in getting perl B<objects> from files then you
will want the B<obj> handler, which gives back L<GO::Model::Graph>
objects

The parsing architecture gives you the option of using the go-perl
object model, or just parsing the XML events directly

If you are using the go-db-perl library, the load-go-into-db.pl script
will perform the following processes when loading files into the
database

=over

=item Obo-XML events fired using GO::Parser::* classes

=item Obo-XML transformed into godb xml using oboxml_to_godb_prestore.xsl

=item godb_prestore.xml stored in database using generic loader

=back

=head2 Obo-XML

The Obo-XML format DTD is stored in the go-dev/xml/dtd directory

=head2 HOW IT WORKS

Currently the various parsers and perl event handlers use the B<stag>
module for this - see L<Data::Stag> for more details, or
http://stag.sourceforge.net

=head2 NESTED EVENTS

nested events can be thought of as xml, without attributes; nested
events can easily be turned into xml

events have a start, a body and an end

event handlers can *catch* these events and do something with them.

an object handler can turn the events into objects, centred around the
GO::Model::Graph object; see GO::Handlers::obj

other handlers can catch the events and convert them into other
formats, eg OWL or OBO

Or you can bypass the handler and get output as an XML stream - to do
this, just run the go2xml script

a database loading event handler can catch the events and turn them
into SQL statements, loading a MySQL or postgres database (see the
go-db-perl library)

the advantage of an event based parsing architecture is that it is
easy to build lightweight parsers, and heavy weight object models can
be bypassed if preferred.

=head2 EXAMPLES

To see examples of the events generated by the GO::Parser class, run
the script go2xml; for example

  go2xml function.ontology

on any GO-formatted flatfile

This also works on OBO-formatted files:

  go2xml gene_ontology.obo

You can also use the script "stag-parse.pl" which comes with the
L<Data::Stag> distribution; for example

  stag-parse.pl -p GO::Parsers::go_assoc_parser gene-association.fb

=head2 XSLT HANDLERS

The full list can be found in the go-dev/xml/xsl directory

=head2 PERL HANDLERS

see GO::Handlers::* for all the different handlers possible;
more can be added dynamically.

you can either create the handler object yourself, and pass it as an argument,
e.g.

  my $apph    = new GO::AppHandle(-db=>"go");
  my $handler = new GO::Handlers::godb({apph=>$apph});
  my $parser  = new GO::Parser({handler=>$handler});
  $parser->parse(@files);

or you can use one of the registered handlers:

  my $parser = new GO::Parser({handler=>'db',
                               handler_args=>{apph=>$apph}});

or you can just do things from the command line

  go2fmt.pl -w oboxml function.ontology


the registered perl handlers are as follows:

=over

=item obo_xml

writes out OBO-XML (which is basically a straightforward conversion of
the event stream into XML)

=item obo_text

=item go_ont

legacy GO-ontology file format

=item go_xref

GO xref file, for linking GO terms to terms and dbxrefs in other ontologies

=item go_defs

legacy GO-definitions file format

=item go_assoc

GO association file format

=item rdf

GO XML-RDF file format

=item owl

OWL format (default: OWL-DL)

OWL is a W3C standard format for ontologies

You will need the XSL files from the full go-dev distribution to run
this; see the XML section in L<http://www.godatabase.org/dev>

=item prolog

prolog facts - you will need a prolog compiler/interpreter to use
these. You can reason over these facts using Obol or the forthcoming
Bio-LP project

=item sxpr

lisp style S-Expressions, conforming to the OBO-XML schema; you will
need lisp to make full use of these. you can also do some nice stuff
just within emacs (use lisp-mode and load an sxpr file into your
buffer)

=item godb

this is actually part of the go-db-perl library, not the go-perl library

catches events and loads them into a database conforming to the GO
database schema; see the directory go-dev/sql, as part of the whole
go-dev distribution; or www.godatabase.org/dev/database

=item obj_yaml

A YAML dump of the perl L<GO::Model::Graph> object. You need L<YAML>
from CPAN for this to work

=item obj_storable

A dump of the perl L<GO::Model::Graph> object. You need L<Storable>
from CPAN for this to work. This is intended to cache objects on the
filesystem, for fast access. The obj_storable representation may not
be portable

=back

=head1 EXAMPLES OF DATATYPE TEXT FORMATS

=head2 go_ont format

eg format: go_ont for storing graphs and metadata; for example:

  !version: $Revision: 1.15 $
  !date: $Date: 2006/04/20 22:48:23 $
  !editors: Michael Ashburner (FlyBase), Midori Harris (SGD), Judy Blake (MGD)
  $Gene_Ontology ; GO:0003673
   $cellular_component ; GO:0005575
    %extracellular ; GO:0005576
     <fibrinogen ; GO:0005577
      <fibrinogen alpha chain ; GO:0005972
      <fibrinogen beta chain ; GO:0005973

See GO::Parsers::go_ont_parser for more details

this is the above file parsed, with events turned directly into OBO-XML:
  
  <?xml version="1.0" encoding="UTF-8"?>
  <obo>
    <source>
      <source_type>file</source_type>
      <source_path>z.ontology</source_path>
      <source_mtime>1075164285</source_mtime>
    </source>
    <term>
      <id>GO:0003673</id>
      <name>Gene_Ontology</name>
      <ontology>root</ontology>
    </term>
    <term>
      <id>GO:0005575</id>
      <name>cellular_component</name>
      <ontology>root</ontology>
      <is_a>GO:0003673</is_a>
    </term>
    <term>
      <id>GO:0005576</id>
      <name>extracellular</name>
      <ontology>root</ontology>
      <is_a>GO:0005575</is_a>
    </term>
    <term>
      <id>GO:0005577</id>
      <name>fibrinogen</name>
      <ontology>root</ontology>
      <relationship>
        <type>part_of</type>
        <to>GO:0005576</to>
      </relationship>
    </term>
    <term>
      <id>GO:0005972</id>
      <name>fibrinogen alpha chain</name>
      <ontology>root</ontology>
      <relationship>
        <type>part_of</type>
        <to>GO:0005577</to>
      </relationship>
    </term>
    <term>
      <id>GO:0005973</id>
      <name>fibrinogen beta chain</name>
      <ontology>root</ontology>
      <relationship>
        <type>part_of</type>
        <to>GO:0005577</to>
      </relationship>
    </term>
  </obo>

=head2 go_def format

eg format: go_defs for storing definitions:

  !Gene Ontology definitions
  !
  term: 'de novo' protein folding
  goid: GO:0006458
  definition: Processes that assist the folding of a nascent peptide chain into its correct tertiary structure.
  definition_reference: Sanger:mb

See GO::Parsers::go_def_parser for more details

=head2 go_xref format

eg format: go_xrefs for storing links between GO IDs and IDs for terms
in other DBs:

  EC:1.-.-.- > GO:oxidoreductase ; GO:0016491
  EC:1.1.-.- > GO:1-phenylethanol dehydrogenase ; GO:0018449

See GO::Parsers::go_xref_parser for more details

=head2 go_assoc format

eg format: go-assocs for storing gene-associations:

  SGD     S0004660        AAC1            GO:0005743      SGD:12031|PMID:2167309 TAS             C       ADP/ATP translocator    YMR056C gene    taxon:4932 20010118
  SGD     S0004660        AAC1            GO:0006854      SGD:12031|PMID:2167309 IDA             P       ADP/ATP translocator    YMR056C gene    taxon:4932 20010118

See GO::Parsers::go_assoc_parser for more details

=head2 obo_text format

L<http://www.geneontology.org/GO.format.html>

=cut

package GO::Parser;

use Exporter;

use Carp;
use GO::Model::Term;
use FileHandle;
use strict qw(subs vars refs);
use base qw(GO::Model::Root);

# Exceptions


# Constructor


=head2 new

  Usage   - my $parser = GO::Parser->new()
  Returns - GO::Parser

creates a new parser

=cut

# Constructor. Although it is invoked on GO::Parser, the object returned
# is the format-specific parser produced by get_parser_impl(), not an
# instance of the invoking class.
#
# Args    - a hashref, or a flat key => value list, with optional keys:
#             format / fmt : parser format name, or a full class name
#                            (anything containing '::')
#             handler      : a handler object, a GO::AppHandle object, a
#                            short handler name (resolved under
#                            GO::Handlers::), a full class name, or a
#                            class name with a leading '::' (used as-is
#                            with the '::' removed)
#             handler_args : passed to the handler class constructor
#             use_cache    : passed to the parser's use_cache()
# Returns - the parser implementation object
# Throws  - via $self->throw() when the handler class cannot be loaded
sub new {
    my $proto = shift;
    my $class = ref($proto) || $proto;
    my $self  = bless {}, $class;

    # Accept either a single hashref or a flat key => value list.
    my $init_h = $_[0] || {};
    $init_h = {@_} unless ref($init_h);

    my $use_cache = $init_h->{use_cache};

    # Format names are lower-cased unless they look like a class name.
    my $fmt = $init_h->{format} || $init_h->{fmt} || '';
    $fmt = lc($fmt) unless $fmt =~ /::/;

    # With no format given, use the parser that guesses/defers on what
    # type it is parsing.
    $fmt ||= "unknown_format";

    my $p = $self->get_parser_impl($fmt);

    # Copy every constructor argument straight onto the parser object.
    for my $key (keys %$init_h) {
        $p->{$key} = $init_h->{$key};
    }

    my $handler = $init_h->{handler} || "base";

    # A GO::AppHandle is wrapped in a database-storing handler.
    if (UNIVERSAL::isa($handler, "GO::AppHandle")) {
        require GO::Handlers::DbStoreHandler;
        $handler = GO::Handlers::DbStoreHandler->new({apph=>$handler});
    }

    # Anything that is not already an object is treated as a handler name.
    unless (ref($handler)) {
        my $hclass = $handler;
        if ($handler =~ /^::/) {
            # Leading '::' marks a class name that must not be placed
            # under GO::Handlers::. This test has to come before the
            # "contains no '::'" test, otherwise it can never match.
            $hclass =~ s/^:://;
        }
        elsif ($handler !~ /::/) {
            # Short registered name, e.g. 'obj' => GO::Handlers::obj
            $hclass = "GO::Handlers::$handler";
        }

        eval {
            $class->load_module($hclass);
        };
        if ($@) {
            print STDERR $@, "\n\n\n";

            $self->throw("No such handler: $handler");
        }
        $handler = $hclass->new($init_h->{handler_args});
    }

    $p->handler($handler);
    $p->use_cache($use_cache);

    # NOTE(review): when a hashref was passed in, these deletes modify the
    # caller's own hash (the values were already copied onto $p above).
    # Kept as-is for backward compatibility.
    delete $init_h->{parser};
    delete $init_h->{handler};

    return $p;
}


# Load and instantiate the parser class for a format.
#
# Args    - format [str]: used unchanged as the class name when it contains
#           '::', otherwise expanded to GO::Parsers::<format>_parser
# Returns - a new instance of that class, constructed with no arguments
sub get_parser_impl {
    my ($self, $fmt) = @_;

    my $impl_class = ($fmt =~ /::/)
        ? $fmt
        : "GO::Parsers::${fmt}_parser";

    $self->load_module($impl_class);

    my $parser = $impl_class->new();
    return $parser;
}


=head2 create_handler

  Usage   - my $handler = GO::Parser->create_handler('obj');
  Returns - L<GO::Handlers::base>
  Args    - handler type [str]

=cut

# Build a parser for the requested handler type and hand back the handler
# that parser carries. A false or missing type falls back to 'obj'.
sub create_handler {
    my ($self, $type) = @_;
    $type ||= 'obj';

    return $self->new({ handler => $type })->handler;
}

# Load the .pm file for a class at runtime.
#
# Args    - class name [str], e.g. GO::Handlers::obj
# Throws  - via $self->throw() when the require fails; the message
#           includes the original error text
sub load_module {
    my ($self, $classname) = @_;

    # GO::Handlers::obj => GO/Handlers/obj
    (my $path = $classname) =~ s{::}{/}g;

    # Skip the require when main's symbol table already holds a "_<file"
    # entry for this relative path.
    return if $main::{"_<$path.pm"};

    eval {
        require "$path.pm";
    };
    $self->throw("No such module: $classname;;\n$@") if $@;
}

1;