The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# $Id: OBOParser.pm 2014-11-14 erick.antezana $
#
# Module  : OBOParser.pm
# Purpose : Parse OBO-formatted files.
# License : Copyright (c) 2006-2014 by Erick Antezana. All rights reserved.
#           This program is free software; you can redistribute it and/or
#           modify it under the same terms as Perl itself.
# Contact : Erick Antezana <erick.antezana -@- gmail.com>
#
package OBO::Parser::OBOParser;

use OBO::Core::Dbxref;
use OBO::Core::Instance;
use OBO::Core::Ontology;
use OBO::Core::Relationship;
use OBO::Core::RelationshipType;
use OBO::Core::SubsetDef;
use OBO::Core::SynonymTypeDef;
use OBO::Core::Term;
use OBO::Util::IDspaceSet;
use OBO::Util::Set;
use OBO::XO::OBO_ID;

use Carp;
use Date::Manip qw(ParseDate UnixDate);
use strict;
use warnings;

use open qw(:std :utf8); # Make All I/O Default to UTF-8

$Carp::Verbose = 1;

sub new {
	my $class         = shift;
	my $self          = {};
	
	$self->{OBO_FILE} = undef; # required, (1)
        
	bless ($self, $class);
	return $self;
}

=head2 work

  Usage    - $OBOParser->work($obo_file_path)
  Returns  - the parsed OBO ontology
  Args     - the OBO file to be parsed
  Function - parses an OBO file
  
=cut

sub work {
	my $self = shift;
	if (defined $_[0]) {
		$self->{OBO_FILE} = shift;
	} else {
		croak 'You have to provide an OBO file as input';
	}
	
	open (OBO_FILE, $self->{OBO_FILE}) || croak 'The OBO file (', $self->{OBO_FILE}, ') cannot be opened: ', $!;
	
	$/ = ""; # one paragraph at the time
	chomp(my @chunks = <OBO_FILE>);
	chomp(@chunks);
	close OBO_FILE;

	#
	# Treat OBO file header tags
	#
	my $file_line_number = 0;
	if (defined $chunks[0] && $chunks[0] =~ /^format-version:\s*(.*)/) {

		my @header        = split (/\n/, $chunks[0]);
		$file_line_number = $#header + 2; # amount of lines in the header
		$chunks[0]        = join("\n", @header);

		#
		# format-version
		#
		my $format_version;
		if ($chunks[0] =~ /(^format-version:\s*(.*)\n?)/m) { # required tag
			$format_version = $2;
			$chunks[0]      =~ s/$1//;
		}

		#
		# data_version
		#
		my $data_version;
		if ($chunks[0] =~ /(^data-version:\s*(.*)\n?)/m) {
			$data_version = $2;
			$chunks[0]    =~ s/$1//;
		}
		
		#
		# ontology
		#
		my $ontology_id_space;
		if ($chunks[0] =~ /(^ontology:\s*(.*)\n?)/m) { # as of OBO spec 1.4
			$ontology_id_space = $2;
			$chunks[0]         =~ s/$1//;
		}
		
		#
		# date
		#
		my $date;
		if ($chunks[0] =~ /(^date:\s*(.*)\n?)/m) {
			$date      = $2;
			$chunks[0] =~ s/$1//;
		}
		
		#
		# saved_by
		#
		my $saved_by;
		if ($chunks[0] =~ /(^saved-by:\s*(.*)\n?)/m) {
			$saved_by  = $2;
			$chunks[0] =~ s/$1//;
		}

		#
		# auto_generated_by
		#
		my $auto_generated_by;
		if ($chunks[0] =~ /(^auto-generated-by:\s*(.*)\n?)/m) {
			$auto_generated_by = $2;
			$chunks[0]         =~ s/$1//;
		}

		#
		# imports
		#
		my $imports = OBO::Util::Set->new();
		while ($chunks[0] =~ /(^import:\s*(.*)\n?)/m) {
			$imports->add($2);
			$chunks[0] =~ s/$1//;
		}

		#
		# subsetdef
		#
		my $subset_def_map = OBO::Util::SubsetDefMap->new();
		while ($chunks[0] =~ /(^subsetdef:\s*(\S+)\s+\"(.*)\"\n?)/m) {
			my $line = quotemeta($1);
			my $ssd  = OBO::Core::SubsetDef->new();
			$ssd->name($2);
			$ssd->description($3);
			$subset_def_map->put($2, $ssd);
			$chunks[0] =~ s/${line}//;
		}
		
		#
		# synonymtypedef
		#
		my $synonym_type_def_set = OBO::Util::SynonymTypeDefSet->new();
		while ($chunks[0] =~ /(^synonymtypedef:\s*(\S+)\s+\"(.*)\"(.*)?\n?)/m) {
			my $line = quotemeta($1);
			my $std  = OBO::Core::SynonymTypeDef->new();
			$std->name($2);
			$std->description($3);
			my $sc = $4;
			$std->scope($sc) if (defined $sc && $sc =~s/\s//);
			$synonym_type_def_set->add($std);
			$chunks[0] =~ s/${line}//;
		}
		
		#
		# idspace
		#
		my $idspaces = OBO::Util::IDspaceSet->new();
		while ($chunks[0] =~ /(^idspace:\s*(\S+)\s*(\S+)\s+(\"(.*)\")?\n?)/m) {
			my $line        = quotemeta($1);
			my $new_idspace = OBO::Core::IDspace->new();
			$new_idspace->local_idspace($2);
			$new_idspace->uri($3);
			my $dc = $5;
			$new_idspace->description($dc) if (defined $dc);
			$idspaces->add($new_idspace);
			$chunks[0] =~ s/${line}//;
		}
		
		#
		# default-relationship-id-prefix
		# e.g. default-relationship-id-prefix: OBO_REL
		#
		my $default_relationship_id_prefix;
		if ($chunks[0] =~ /(^default_relationship_id_prefix:\s*(.*)\n?)/m) {
			$default_relationship_id_prefix = $2;
			$chunks[0] =~ s/$1//;
		}
		
		#
		# default-namespace
		#
		my $default_namespace;
		if ($chunks[0] =~ /(^default-namespace:\s*(.*)\n?)/m) {
			$default_namespace = $2;
			$chunks[0] =~ s/$1//;
		}		
		
		#
		# remark
		#
		my $remarks = OBO::Util::Set->new();
		while ($chunks[0] =~ /(^remark:\s*(.*)\n?)/m) {
			my $line = quotemeta($1);
			$remarks->add($2);
			$chunks[0] =~ s/${line}//;
		}

		if (!defined $format_version) {
			croak "The OBO file '", $self->{OBO_FILE},"' does not have a correct header, please verify it.";
		}
		
		#
		# treat-xrefs-as-equivalent
		#
		my $treat_xrefs_as_equivalent = OBO::Util::Set->new();
		while ($chunks[0] =~ /(^treat-xrefs-as-equivalent:\s*(.*)\n?)/m) {
			$treat_xrefs_as_equivalent->add($2);
			$chunks[0] =~ s/$1//;
		}
		
		#
		# treat-xrefs-as-is_a
		#
		my $treat_xrefs_as_is_a = OBO::Util::Set->new();
		while ($chunks[0] =~ /(^treat-xrefs-as-is_a:\s*(.*)\n?)/m) {
			$treat_xrefs_as_is_a->add($2);
			$chunks[0] =~ s/$1//;
		}
		
		#
		# store the values in header tags
		#
		my $result = OBO::Core::Ontology->new();
		
		$result->data_version($data_version) if ($data_version);
		$result->id($ontology_id_space) if ($ontology_id_space);
		$result->date($date) if ($date);
		$result->saved_by($saved_by) if ($saved_by);
		#$result->auto_generated_by($auto_generated_by) if ($auto_generated_by);
		$result->subset_def_map($subset_def_map);
		$result->imports($imports->get_set());
		$result->synonym_type_def_set($synonym_type_def_set->get_set());
		$result->idspaces($idspaces->get_set());
		$result->default_relationship_id_prefix($default_relationship_id_prefix) if ($default_relationship_id_prefix);
		$result->default_namespace($default_namespace) if ($default_namespace);
		$result->remarks($remarks->get_set());
		$result->treat_xrefs_as_equivalent($treat_xrefs_as_equivalent->get_set());
		$result->treat_xrefs_as_is_a($treat_xrefs_as_is_a->get_set());

		if ($chunks[0]) {
			print STDERR "The following line(s) has been ignored from the header:\n", $chunks[0], "\n";
		}
		
		#
		# Keep log's
		#
		my %used_subset; # of the used subsets to pin point nonused subsets defined in the header (subsetdef's)
		
		my %used_synonym_type_def; # of the used synonymtypedef to pin point nonused synonymtypedef's defined in the header (synonymtypedef's)
		
		#
		# Regexps
		#
		#my $r_db_acc     = qr/([ \*\.\w-]*):([ '\#~\w:\\\+\?\{\}\$\/\(\)\[\]\.=&!%_,-]*)/o;
		my $r_db_acc     = qr/\s+(\w+:\w+)/o;
		my $r_dbxref     = qr/\s+(\[.*\])/o;
		my $syn_scope    = qr/(\s+(EXACT|BROAD|NARROW|RELATED))?/o;
		my $r_true_false = qr/\s*(true|false)/o; 
		my $r_comments   = qr/\s*(\!\s*(.*))?/o;
		
		my $intersection_of_counter = 0;
		my $union_of_counter        = 0;
		
		my %allowed_data_types = (	'xsd:simpleType' => 1,			# Indicates any primitive type (abstract)
									'xsd:string' => 1,				# A string
									'xsd:integer' => 1,				# Any integer
									'xsd:decimal' => 1,				# Any real number
									'xsd:negativeInteger' => 1,		# Any negative integer
									'xsd:positiveInteger' => 1,		# Any integer > 0
									'xsd:nonNegativeInteger' => 1,	# Any integer >= 0
									'xsd:nonPositiveInteger' => 1,	# Any integer < 0
									'xsd:boolean' => 1,				# True or false
									'xsd:date' => 1					# An XML-Schema date
		);
		
		foreach my $chunk (@chunks) {
			my @entry  = split (/\n/, $chunk);
			my $stanza = shift @entry;
					
			if ($stanza && $stanza =~ /\[Term\]/) { # treat [Term]'s
				
				my $term;
				#
				# to check we have zero or at least two intersection_of's and zero or at least two union_of's
				#
				$intersection_of_counter = 0;
				$union_of_counter        = 0;
				
				$file_line_number++;
				
				my $only_one_id_tag_per_entry   = 0;
				my $only_one_name_tag_per_entry = 0;
				
				foreach my $line (@entry) {
					$file_line_number++;
					if ($line =~ /^id:\s*(\S+)/) { # get the term id
						if ($line =~ /^id:$r_db_acc/) { # Does it follow the "convention"?
							croak "The term with id '", $1, "' has a duplicated 'id' tag in the file '", $self->{OBO_FILE} if ($only_one_id_tag_per_entry);
							$term = $result->get_term_by_id($1); # does this term is already in the ontology?
							if (!defined $term){
								$term = OBO::Core::Term->new();  # if not, create a new term
								$term->id($1);
								$result->add_term($term);        # add it to the ontology
								$only_one_id_tag_per_entry = 1;
							} elsif (defined $term->def()->text() && $term->def()->text() ne '') {
								# The term is already in the ontology since it has a definition! (maybe empty?)
								croak "The term with id '", $1, "' is duplicated in the OBO file.";
							}
						} else {
							carp "The term with id '", $1, "' does NOT follow the ID convention: 'IDSPACE:UNIQUE_IDENTIFIER', e.g. GO:1234567";
						}						
					} elsif ($line =~ /^is_anonymous:$r_true_false/) {
						$term->is_anonymous(($1 eq 'true')?1:0);
					} elsif ($line =~ /^name:\s*(.*)/) {
						carp "The term with id '", $1, "' has a duplicated 'name' tag in the file '", $self->{OBO_FILE} if ($only_one_name_tag_per_entry);
						if (!defined $1) {
							warn "The term with id '", $term->id(), "' has no name in file '", $self->{OBO_FILE}, "'";
						} else {
							$term->name($1);
							$only_one_name_tag_per_entry = 1;
						}
					} elsif ($line =~ /^namespace:\s*(.*)/) {
						$term->namespace($1); # it is a Set
					} elsif ($line =~ /^alt_id:$r_db_acc/) {
						$term->alt_id($1);
					} elsif ($line =~ /^def:\s*\"(.*)\"$r_dbxref/) { # fill the definition
						my $def = OBO::Core::Def->new();
						$def->text($1);
						$def->dbxref_set_as_string($2);
						$term->def($def);
					} elsif ($line =~ /^comment:\s*(.*)/) {
						$term->comment($1);
					} elsif ($line =~ /^subset:\s*(\S+)/) {
						my $ss = $1;
						if ($result->subset_def_map()->contains_key($ss)) {
							$term->subset($ss); # it is a Set (i.e. added to a Set)
							
							$used_subset{$ss}++; # check subsets usage
						} else {
							croak "The subset '", $ss, "' is not defined in the header! Check your OBO file line '", $file_line_number, "'";
						}
					} elsif ($line =~ /^(exact|narrow|broad|related)_synonym:\s*\"(.*)\"$r_dbxref/) { # OBO spec 1.1
						$term->synonym_as_string($2, $3, uc($1));
					} elsif ($line =~ /^synonym:\s*\"(.*)\"$syn_scope(\s+([-\w]+))?$r_dbxref/) {
						my $scope = (defined $3)?$3:'RELATED';
						# As of OBO flat file spec v1.2, we use:
						# synonym: "endomitosis" EXACT []
						if (defined $5) { # if a 'synonym type name' is given
							my $found = 0; # check that the 'synonym type name' was defined in the header!
							foreach my $st ($result->synonym_type_def_set()->get_set()) {
								if ($st->name() eq $5) {
									if (!defined $3) { # if no scope is given, use the one defined in the header!
										my $default_scope = $st->scope();
										$scope = $default_scope if (defined $default_scope);
									}
									$found = 1;
									last;
								}
							}
							croak 'The synonym type name (', $5,') used in line ',  $file_line_number, " in the file '", $self->{OBO_FILE}, "' was not defined" if (!$found);
							$used_synonym_type_def{$5}++; # check synonymtypedef usage
						}
						$term->synonym_as_string($1, $6, $scope, $5);
						
					} elsif ($line =~ /^xref:\s*(.*)/ || $line =~ /^xref_analog:\s*(.*)/ || $line =~ /^xref_unknown:\s*(.*)/) {
						$term->xref_set_as_string($1);
					} elsif ($line =~ /^is_a:$r_db_acc$r_comments/) { # The comment is ignored here but generated later internally
						my $t_id = $term->id();
						if ($t_id eq $1) {
							warn "The term '", $t_id, "' has a reflexive is_a relationship, which was ignored!";
							next;
						}
						my $rel = OBO::Core::Relationship->new();
						$rel->id($t_id.'_is_a_'.$1);
						$rel->type('is_a');
						my $target = $result->get_term_by_id($1); # Is this term already in the ontology?
						if (!defined $target) {
							$target = OBO::Core::Term->new(); # if not, create a new term
							$target->id($1);
							$result->add_term($target);
						}
						$rel->link($term, $target);
						$result->add_relationship($rel);
					} elsif ($line =~ /^intersection_of:\s*([\w\/]+)?$r_db_acc$r_comments/) {
						# TODO Improve the 'intersection_of' treatment
						my $rel = OBO::Core::Relationship->new();
						my $r   = $1 || 'nil';
						my $id  = $term->id().'_'.$r.'_'.$2; 
						$id     =~ s/\s+/_/g;
						$rel->id($id);
						$rel->type($r);
						my $target = $result->get_term_by_id($2); # Is this term already in the ontology?
						if (!defined $target) {
							$target = OBO::Core::Term->new(); # if not, create a new term
							$target->id($2);
							$result->add_term($target);
						}
						$rel->head($target);
						$term->intersection_of($rel);
						$intersection_of_counter++;
					} elsif ($line =~ /^union_of:$r_db_acc$r_comments/) {
						# TODO wait until the OBO spec 1.4 be released
						my $target = $result->get_term_by_id($1); # Is this term already in the ontology?
						if (!defined $target) {
							$target = OBO::Core::Term->new(); # if not, create a new term
							$target->id($1);
							$result->add_term($target);
						}
						$term->union_of($1);
						$union_of_counter++;
					} elsif ($line =~ /^disjoint_from:$r_db_acc$r_comments/) {
						$term->disjoint_from($1); # We are assuming that the other term exists or will exist; otherwise , we have to create it like in the is_a section.
					} elsif ($line =~ /^relationship:\s*([\w\/]+)$r_db_acc$r_comments/ || $line =~ /^relationship:\s*$r_db_acc$r_db_acc$r_comments/) {
						my $rel = OBO::Core::Relationship->new();
						my $id  = $term->id().'_'.$1.'_'.$2; # TODO: I have to standarise the id's: term_id1_db:acc_term_id2
						$id     =~ s/\s+/_/g;
						$rel->id($id);
						$rel->type($1);
						#warn "TYPE : '", $id, "'";
						my $target = $result->get_term_by_id($2); # Is this term already in the ontology?
						if (!defined $target) {
							$target = OBO::Core::Term->new(); # if not, create a new term
							$target->id($2);
							$result->add_term($target);
						}
						$rel->link($term, $target);
						$result->add_relationship($rel);
					} elsif ($line =~ /^created_by:\s*(.*)/) {
						$term->created_by($1);
					} elsif ($line =~ /^creation_date:\s*(.*)/) {
						my $d = $1;
						my $pd = ParseDate($d); # Check that the date follows the ISO 8601 format
						if (!$pd) {
							warn "Bad date string: '", $d, "' in line ", $file_line_number, "\n";
#						} else {
#							my ($year, $month, $day) = UnixDate($pd, "%Y", "%m", "%d");
#							warn "Date was $month/$day/$year\n";
						}
						$term->creation_date($d);
					} elsif ($line =~ /^modified_by:\s*(.*)/) {
						$term->modified_by($1);
					} elsif ($line =~ /^modification_date:\s*(.*)/) {
						my $d = $1;
						my $pd = ParseDate($d); # Check that the date follows the ISO 8601 format
						if (!$pd) {
							warn "Bad date string: '", $d, "' in line ", $file_line_number, "\n";
						}
						$term->modification_date($d);
					} elsif ($line =~ /^is_obsolete:$r_true_false/) {
						$term->is_obsolete(($1 eq 'true')?1:0);
					} elsif ($line =~ /^replaced_by:\s*(.*)/) {
						$term->replaced_by($1);
					} elsif ($line =~ /^consider:\s*(.*)/) {
						$term->consider($1);
					} elsif ($line =~ /^builtin:$r_true_false/) {
						$term->builtin(($1 eq 'true')?1:0);
					} elsif ($line =~ /^property_value:\s*(\w+)$r_db_acc/ || $line =~ /^property_value:\s*(\w+)\s+"([ '\#~\w:\\\+\?\{\}\$\/\(\)\[\]\.=&!%_,-]+)"$r_db_acc/) { # TODO some parts in this block might change later on...

						my $r2_type = $result->get_relationship_type_by_id($1); # Is this relationship type already in the ontology?
						if (!defined $r2_type){
							$r2_type = OBO::Core::RelationshipType->new();      # if not, create a new relationship type
							$r2_type->id($1);
							$result->add_relationship_type($r2_type);           # add it to the ontology
						}

						#
						# create the triplet
						#						
						my $rel = OBO::Core::Relationship->new();
						my $id  = $term->id().'_'.$1.'_'.$2;                    # term --> rel --> [term|instance|datatype]
						$id     =~ s/\s+/_/g;
						$rel->id($id);
						$rel->type($r2_type->id());
						
						if (!defined $3) {
							#
							# property_value: lastest_modification_by erick
							#
						    my $target = $result->get_term_by_id($2);           # suggest to OBOF to define TERMs before they are used so that parsers could know they are dealing with terms!
						    
						    if (defined $target) {                              # term --> rel --> term
						    	
						    } else {
								$target = $result->get_instance_by_id($2);
								
								if (!defined $target) {                         # term --> rel --> instance
									$target = OBO::Core::Instance->new();
									$target->id($2);
									$result->add_instance($target);
								}
						    }
						    
							$rel->link($term, $target);                         # triplet: term --> rel --> target
							$term->property_value($rel);
							
							#$result->add_relationship($rel);                   # TODO Do we need this? or better add $ontology->{PROPERTY_VALUES}?
						} elsif (defined $3) {                                  # term --> rel --> datatype
							#
							# property_value: lastest_modification_by "erick" xsd:string or e.g. shoe_size "12" xsd:positiveInteger
							#
							my $target = $result->get_instance_by_id($2);
							
							if (!defined $target) {                             # term --> rel --> datatype
								$target = OBO::Core::Instance->new();
								$target->id($2);
								
								# data type check
								warn "Unrecommended XML-schema pritive (data type) found: '", $3, "'" unless (exists $allowed_data_types{$3});
								
								my $data_type = OBO::Core::Term->new();
								$data_type->id($3);
								#$result->add_term($data_type);                 # TODO Think about it...
								$target->instance_of($data_type);
								#$result->add_instance($target);                # TODO Think about it...
							}

							$rel->link($term, $target);
							$term->property_value($rel);
							
							#$result->add_relationship($rel);                   # TODO Do we need this? or better add $ontology->{PROPERTY_VALUES}?
						}
					} elsif ($line =~ /^!/) {
						# skip line
					} else {					
						warn 'Unknown syntax found (and ignored) in line: ', $file_line_number, " (in file '", $self->{OBO_FILE}, "'):\n\t", $line, "\n";
					}
				}
				# Check for required fields: id
				if (defined $term && !defined $term->id()) {
					croak "No ID found in term:\n", $chunk;
				}
				if ($intersection_of_counter == 1) { # IDEM TEST: ($intersection_of_counter != 0 && $intersection_of_counter < 2) 
					carp "Missing 'intersection_of' tag in term:\n\n", $chunk, "\n";
				}
				if ($union_of_counter == 1) { # IDEM TEST: ($union_of_counter != 0 && $union_of_counter < 2) 
					carp "Missing 'union_of' tag in term:\n\n", $chunk, "\n";
				}
				$file_line_number++;
			} elsif ($stanza && $stanza =~ /\[Typedef\]/) { # treat [Typedef]
				my $type;
				my $only_one_name_tag_per_entry = 0;
				
				#
				# to check we have zero or at least two intersection_of's and zero or at least two union_of's
				#
				$intersection_of_counter = 0;
				$union_of_counter        = 0;
				
				$file_line_number++;
				foreach my $line (@entry) {
					$file_line_number++;
					if ($line =~ /^id:\s*(.*)/) { # get the type id
						$type = $result->get_relationship_type_by_id($1); # Is this relationship type already in the ontology?
						if (!defined $type){
							$type = OBO::Core::RelationshipType->new();  # if not, create a new type
							$type->id($1);
							$result->add_relationship_type($type);        # add it to the ontology
						} elsif (defined $type->def()->text() && $type->def()->text() ne '') {
							# the type is already in the ontology since it has a definition! (not empty)
							croak "The relationship type with id '", $1, "' is duplicated in the OBO file. Check line: '", $file_line_number, "'";
						} else {
							# the type already in the ontology but with an empty def, which most probably will
							# be defined later. This case is the result of adding a relationship while parsing
							# the Term stanzas.
							#warn "Line: '", $line, "', Def: '", $type->def_as_string(), "'\n";
						}
					} elsif ($line =~ /^is_anonymous:$r_true_false/) {
						$type->is_anonymous(($1 eq 'true')?1:0);
					} elsif ($line =~ /^name:\s*(.*)/) {
						croak "The typedef with id '", $1, "' has a duplicated 'name' tag in the file '", $self->{OBO_FILE}, "'. Check line: '", $file_line_number, "'" if ($only_one_name_tag_per_entry);
						$type->name($1);
						$only_one_name_tag_per_entry = 1;
					} elsif ($line =~ /^namespace:\s*(.*)/) {
						$type->namespace($1); # it is a Set
					} elsif ($line =~ /^alt_id:\s*([:\w]+)/) {
						$type->alt_id($1);
					} elsif ($line =~ /^def:\s*\"(.*)\"$r_dbxref/) { # fill in the definition
						my $def = OBO::Core::Def->new();
						$def->text($1);
						$def->dbxref_set_as_string($2);
						$type->def($def);
					} elsif ($line =~ /^comment:\s*(.*)/) {
						$type->comment($1);
					} elsif ($line =~ /^subset:\s*(\S+)/) {
						my $ss = $1;
						if ($result->subset_def_map()->contains_key($ss)) {
							$type->subset($ss); # it is a Set (i.e. added to a Set)
							
							$used_subset{$ss}++; # check subsets usage
						} else {
							croak "The subset '", $ss, "' is not defined in the header! Check your OBO file relationship type in line: '", $file_line_number, "'";
						}
					} elsif ($line =~ /^domain:\s*(.*)/) {
						$type->domain($1);
					} elsif ($line =~ /^range:\s*(.*)/) {
						$type->range($1);
					} elsif ($line =~ /^is_anti_symmetric:$r_true_false/) {
						$type->is_anti_symmetric(($1 eq 'true')?1:0);
					} elsif ($line =~ /^is_cyclic:$r_true_false/) {
						$type->is_cyclic(($1 eq 'true')?1:0);
					} elsif ($line =~ /^is_reflexive:$r_true_false/) {
						$type->is_reflexive(($1 eq 'true')?1:0);
					} elsif ($line =~ /^is_symmetric:$r_true_false/) {
						$type->is_symmetric(($1 eq 'true')?1:0);
					} elsif ($line =~ /^is_transitive:$r_true_false/) {
						$type->is_transitive(($1 eq 'true')?1:0);
					} elsif ($line =~ /^is_a:\s*([:\w]+)$r_comments/) { # intrinsic or not??? # The comment is ignored here but generated (and sometimes fixed) later internally
						my $r    = $1;
						my $r_id = $type->id();
						if ($r_id eq $r) {
							warn "The term '", $r_id, "' has a reflexive is_a relationship, which was ignored!";
							next;
						}
						my $rel = OBO::Core::Relationship->new();
						$rel->id($r_id.'_is_a_'.$r);
						$rel->type('is_a');
						my $target = $result->get_relationship_type_by_id($r); # Is this relationship type already in the ontology?
						if (!defined $target) {
							$target = OBO::Core::RelationshipType->new(); # if not, create a new relationship type
							$target->id($r);
							$result->add_relationship_type($target);
						}
						$rel->link($type, $target); # add a relationship between two relationship types
						$result->add_relationship($rel);
					} elsif ($line =~ /^is_metadata_tag:$r_true_false/) {
						$type->is_metadata_tag(($1 eq 'true')?1:0);
					} elsif ($line =~ /^is_class_level:$r_true_false/) {
						$type->is_class_level(($1 eq 'true')?1:0);
					} elsif ($line =~ /^(exact|narrow|broad|related)_synonym:\s*\"(.*)\"$r_dbxref/) {
						$type->synonym_as_string($2, $3, uc($1));
					} elsif ($line =~ /^synonym:\s*\"(.*)\"$syn_scope(\s+(\w+))?$r_dbxref/) {
						my $scope = (defined $3)?$3:'RELATED';
						# From OBO flat file spec v1.2, we use:
						# synonym: "endomitosis" EXACT []
						if (defined $5) {
							my $found = 0; # check that the 'synonym type name' was defined in the header!
							foreach my $st ($result->synonym_type_def_set()->get_set()) {
								# Adapt the scope if necessary to the one defined in the header!
								if ($st->name() eq $5) {
									$found = 1;
									my $default_scope = $st->scope();
									$scope = $default_scope if (defined $default_scope);
									last;
								}
							}
							croak 'The synonym type name (', $5,') used in line ',  $file_line_number, " in the file '", $self->{OBO_FILE}, "' was not defined" if (!$found);
							$used_synonym_type_def{$5}++; # check synonymtypedef usage
						}
						$type->synonym_as_string($1, $6, $scope, $5);
					} elsif ($line =~ /^xref:\s*(.*)/ || $line =~ /^xref_analog:\s*(.*)/ || $line =~ /^xref_unk:\s*(.*)/) {
						$type->xref_set_as_string($1);
					} elsif ($line =~ /^intersection_of:\s*([\w\/]+)?$r_db_acc$r_comments/) {
						# TODO Improve the 'intersection_of' treatment
						my $rel = OBO::Core::Relationship->new();
						my $r   = $1 || 'nil';
						my $id  = $type->id().'_'.$r.'_'.$2; 
						$id     =~ s/\s+/_/g;
						$rel->id($id);
						$rel->type($r);
						my $target = $result->get_term_by_id($2); # Is this term already in the ontology?
						if (!defined $target) {
							$target = OBO::Core::Term->new(); # if not, create a new term
							$target->id($2);
							$result->add_term($target);
						}
						$rel->head($target);
						$type->intersection_of($rel);
						$intersection_of_counter++;
					} elsif ($line =~ /^union_of:\s*(.*)/) {
						# TODO wait until the OBO spec 1.4 be released
						my $target = $result->get_relationship_type_by_id($1); # Is this relationship type already in the ontology?
						if (!defined $target) {
							$target = OBO::Core::RelationshipType->new(); # if not, create a new relationship type
							$target->id($1);
							$result->add_relationship_type($target);
						}
						$type->union_of($1);
						$union_of_counter++;
					} elsif ($line =~ /^disjoint_from:\s*([:\w]+)$r_comments/) {
						$type->disjoint_from($1); # We are assuming that the other relation type exists or will exist; otherwise , we have to create it like in the is_a section.
					} elsif ($line =~ /^inverse_of:\s*([:\w]+)$r_comments/) { # e.g. inverse_of: has_participant ! has participant
						my $inv_id   = $1;
						my $inv_type = $result->get_relationship_type_by_id($inv_id); # Is this INVERSE relationship type already in the ontology?
						if (!defined $inv_type){
							$inv_type = OBO::Core::RelationshipType->new();  # if not, create a new type
							$inv_type->id($inv_id);
							#$inv_type->name($3) if ($3); # not necessary, this name could be wrong...
							$result->add_relationship_type($inv_type);       # add it to the ontology
						}
						$type->inverse_of($inv_type);
					} elsif ($line =~ /^transitive_over:\s*(.*)/) {
						$type->transitive_over($1);
					} elsif ($line =~ /^holds_over_chain:\s*([:\w]+)\s*([:\w]+)$r_comments/) { # R <- R1.R2 
						my $r1_id   = $1;
						my $r2_id   = $2;
						my $r1_type = $result->get_relationship_type_by_id($r1_id); # Is this relationship type already in the ontology?
						if (!defined $r1_type){
							$r1_type = OBO::Core::RelationshipType->new();  # if not, create a new type
							$r1_type->id($r1_id);
							$result->add_relationship_type($r1_type);       # add it to the ontology
						}
						my $r2_type = $result->get_relationship_type_by_id($r2_id); # Is this relationship type already in the ontology?
						if (!defined $r2_type){
							$r2_type = OBO::Core::RelationshipType->new();  # if not, create a new type
							$r2_type->id($r2_id);
							$result->add_relationship_type($r2_type);       # add it to the ontology
						}
						$type->holds_over_chain($r1_type->id(), $r2_type->id());
					} elsif ($line =~ /^equivalent_to_chain:\s*(.*)/) {
						# TODO
					} elsif ($line =~ /^disjoint_over:\s*(.*)/) {
						# TODO
					} elsif ($line =~ /^is_functional:$r_true_false/) {
						$type->is_functional(($1 eq 'true')?1:0);
					} elsif ($line =~ /^is_inverse_functional:$r_true_false/) {
						$type->is_inverse_functional(($1 eq 'true')?1:0);
					} elsif ($line =~ /^created_by:\s*(.*)/) {
						$type->created_by($1);
					} elsif ($line =~ /^creation_date:\s*(.*)/) {
						my $d = $1;
						my $pd = ParseDate($d); # Check that the date follows the ISO 8601 format
						if (!$pd) {
							warn "Bad date string: '", $d, "' in line ", $file_line_number, "\n";
						}
						$type->creation_date($d);
					} elsif ($line =~ /^modified_by:\s*(.*)/) {
						$type->modified_by($1);
					} elsif ($line =~ /^modification_date:\s*(.*)/) {
						my $d = $1;
						my $pd = ParseDate($d); # Check that the date follows the ISO 8601 format
						if (!$pd) {
							warn "Bad date string: '", $d, "' in line ", $file_line_number, "\n";
						}
						$type->modification_date($d);
					} elsif ($line =~ /^is_obsolete:\s*(.*)/) {
						$type->is_obsolete(($1 eq 'true')?1:0);
					} elsif ($line =~ /^replaced_by:\s*(.*)/) {
						$type->replaced_by($1);
					} elsif ($line =~ /^consider:\s*(.*)/) {
						$type->consider($1);
					} elsif ($line =~ /^builtin:$r_true_false/) {
						$type->builtin(($1 eq 'true')?1:0);
					} elsif ($line =~ /^!/) {
						# skip line
					} else {
						warn 'Unknown syntax found (and ignored) in line: ', $file_line_number, " (in file '", $self->{OBO_FILE}, "'):\n\t", $line, "\n";
					}	
				}
				# Check for required fields: id
				if (!defined $type->id()) {
					croak "No ID found in type:\n\n", $chunk, "\n\nfrom file '", $self->{OBO_FILE}, "'";
				}
				if ($intersection_of_counter == 1) { # IDEM TEST: ($intersection_of_counter != 0 && $intersection_of_counter < 2) 
					carp "Missing 'intersection_of' tag in relationship type:\n\n", $chunk, "\n";
				}
				if ($union_of_counter == 1) { # IDEM TEST: ($union_of_counter != 0 && $union_of_counter < 2) 
					carp "Missing 'union_of' tag in relationship type:\n\n", $chunk, "\n";
				}
				$file_line_number++;
			} elsif ($stanza && $stanza =~ /\[Instance\]/) { # treat [Instance]
				my $instance;
				
				#
				# to check we have zero or at least two intersection_of's and zero or at least two union_of's
				#
				# TODO do INSTANCES have these tags?
				$intersection_of_counter = 0;
				$union_of_counter        = 0;
				
				$file_line_number++;
				
				my $only_one_id_tag_per_entry   = 0;
				my $only_one_name_tag_per_entry = 0;
				
				foreach my $line (@entry) {
					$file_line_number++;
					if ($line =~ /^id:\s*(\S+)/) { # get the instance id
						if ($line =~ /^id:$r_db_acc/) { # Does it follow the "convention"?
							croak "The instance with id '", $1, "' has a duplicated 'id' tag in the file '", $self->{OBO_FILE} if ($only_one_id_tag_per_entry);
							$instance = $result->get_instance_by_id($1); # does this instance is already in the ontology?
							if (!defined $instance){
								$instance = OBO::Core::Instance->new();  # if not, create a new instance
								$instance->id($1);
								$result->add_instance($instance);        # add it to the ontology
								$only_one_id_tag_per_entry = 1;
							#} elsif (defined $instance->def()->text() && $instance->def()->text() ne '') {
								# TODO Do instances have a definition?
							#	# The instance is already in the ontology since it has a definition! (maybe empty?)
							#	croak "The instance with id '", $1, "' is duplicated in the OBO file.";
							}
						} else {
							croak "The instance with id '", $1, "' does NOT follow the ID convention: 'IDSPACE:UNIQUE_IDENTIFIER', e.g. GO:1234567";
						}						
					} elsif ($line =~ /^is_anonymous:$r_true_false/) {
						$instance->is_anonymous(($1 eq 'true')?1:0);
					} elsif ($line =~ /^name:\s*(.*)/) {
						croak "The instance with id '", $1, "' has a duplicated 'name' tag in the file '", $self->{OBO_FILE} if ($only_one_name_tag_per_entry);
						if (!defined $1) {
							warn "The instance with id '", $instance->id(), "' has no name in file '", $self->{OBO_FILE}, "'";
						} else {
							$instance->name($1);
							$only_one_name_tag_per_entry = 1;
						}
					} elsif ($line =~ /^namespace:\s*(.*)/) {
						$instance->namespace($1); # it is a Set
					} elsif ($line =~ /^alt_id:$r_db_acc/) {
						# TODO do INSTANCES have this tag?
						$instance->alt_id($1);
					} elsif ($line =~ /^def:\s*\"(.*)\"$r_dbxref/) { # fill in the definition
						my $def = OBO::Core::Def->new();
						$def->text($1);
						$def->dbxref_set_as_string($2);
						$instance->def($def);
					} elsif ($line =~ /^comment:\s*(.*)/) {
						$instance->comment($1);
					} elsif ($line =~ /^subset:\s*(\S+)/) {
						my $ss = $1;
						if ($result->subset_def_map()->contains_key($ss)) {
							$instance->subset($ss); # it is a Set (i.e. added to a Set)
							
							$used_subset{$ss}++; # check subsets usage
						} else {
							croak "The subset '", $ss, "' is not defined in the header! Check your OBO file line '", $file_line_number, "'";
						}
					} elsif ($line =~ /^(exact|narrow|broad|related)_synonym:\s*\"(.*)\"$r_dbxref/) { # OBO spec 1.1
						$instance->synonym_as_string($2, $3, uc($1));
					} elsif ($line =~ /^synonym:\s*\"(.*)\"$syn_scope(\s+([-\w]+))?$r_dbxref/) {
						my $scope = (defined $3)?$3:'RELATED';
						# As of OBO flat file spec v1.2, we use:
						# synonym: "endomitosis" EXACT []
						if (defined $5) {
							my $found = 0; # check that the 'synonym type name' was defined in the header!
							foreach my $st ($result->synonym_type_def_set()->get_set()) {
								# Adapt the scope if necessary to the one defined in the header!
								if ($st->name() eq $5) {
									$found = 1;
									my $default_scope = $st->scope();
									$scope = $default_scope if (defined $default_scope);
									last;
								}
							}
							croak 'The synonym type name (', $5,') used in line ',  $file_line_number, " in the file '", $self->{OBO_FILE}, "' was not defined" if (!$found);
							$used_synonym_type_def{$5}++; # check synonymtypedef usage
						}
						$instance->synonym_as_string($1, $6, $scope, $5);
					} elsif ($line =~ /^xref:\s*(.*)/ || $line =~ /^xref_analog:\s*(.*)/ || $line =~ /^xref_unknown:\s*(.*)/) {
						$instance->xref_set_as_string($1);
					} elsif ($line =~ /^instance_of:$r_db_acc$r_comments/) { # The comment is ignored here but retrieved later internally
						my $t = $result->get_term_by_id($1); # Is this instance already in the ontology?
						if (!defined $t) {
							$t = OBO::Core::Term->new(); # if not, create a new Term
							$t->id($1);
							$result->add_term($t);
						}
						$instance->instance_of($t);
					} elsif ($line =~ /^intersection_of:\s*([\w\/]+)?$r_db_acc$r_comments/) {
						# TODO Improve the 'intersection_of' treatment
						# TODO do INSTANCES have this tag?
						my $rel = OBO::Core::Relationship->new();
						my $r   = $1 || 'nil';
						my $id  = $instance->id().'_'.$r.'_'.$2; 
						$id     =~ s/\s+/_/g;
						$rel->id($id);
						$rel->type($r);
						my $target = $result->get_instance_by_id($2); # Is this instance already in the ontology?
						if (!defined $target) {
							$target = OBO::Core::Instance->new(); # if not, create a new instance
							$target->id($2);
							$result->add_instance($target);
						}
						$rel->head($target);
						$instance->intersection_of($rel);
						$intersection_of_counter++;
					} elsif ($line =~ /^union_of:\s*(.*)/) {
						# TODO wait until the OBO spec 1.4 be released
						# TODO do INSTANCES have this tag?
						my $target = $result->get_instance_by_id($1); # Is this instance already in the ontology?
						if (!defined $target) {
							$target = OBO::Core::Instance->new(); # if not, create a new instance
							$target->id($1);
							$result->add_instance($target);
						}
						$instance->union_of($1);
						$union_of_counter++;
					} elsif ($line =~ /^disjoint_from:$r_db_acc$r_comments/) {
						# TODO do INSTANCES have this tag?
						$instance->disjoint_from($1); # We are assuming that the other instance exists or will exist; otherwise , we have to create it like in the is_a section.
					} elsif ($line =~ /^relationship:\s*([\w\/]+)$r_db_acc$r_comments/ || $line =~ /^relationship:\s*$r_db_acc$r_db_acc$r_comments/) {
						# TODO do INSTANCES have this tag?
						my $rel = OBO::Core::Relationship->new();
						my $id  = $instance->id().'_'.$1.'_'.$2; # TODO see the line (TODO) of the 'term' section
						$id     =~ s/\s+/_/g;
						$rel->id($id);
						$rel->type($1);
						my $target = $result->get_instance_by_id($2); # Is this instance already in the ontology?
						if (!defined $target) {
							$target = OBO::Core::Instance->new(); # if not, create a new instance
							$target->id($2);
							$result->add_instance($target);
						}
						$rel->link($instance, $target);
						$result->add_relationship($rel);
					} elsif ($line =~ /^created_by:\s*(.*)/) {
						$instance->created_by($1);
					} elsif ($line =~ /^creation_date:\s*(.*)/) {
						my $d = $1;
						my $pd = ParseDate($d); # Check that the date follows the ISO 8601 format
						if (!$pd) {
							warn "Bad date string: '", $d, "' in line ", $file_line_number, "\n";
						}
						$instance->creation_date($d);
					} elsif ($line =~ /^modified_by:\s*(.*)/) {
						$instance->modified_by($1);
					} elsif ($line =~ /^modification_date:\s*(.*)/) {
						my $d = $1;
						my $pd = ParseDate($d); # Check that the date follows the ISO 8601 format
						if (!$pd) {
							warn "Bad date string: '", $d, "' in line ", $file_line_number, "\n";
						}
						$instance->modification_date($d);
					} elsif ($line =~ /^is_obsolete:$r_true_false/) {
						$instance->is_obsolete(($1 eq 'true')?1:0);
					} elsif ($line =~ /^replaced_by:\s*(.*)/) {
						$instance->replaced_by($1);
					} elsif ($line =~ /^consider:\s*(.*)/) {
						$instance->consider($1);
					} elsif ($line =~ /^builtin:$r_true_false/) {
						# TODO do INSTANCES have this tag?
						$instance->builtin(($1 eq 'true')?1:0);
					} elsif ($line =~ /^property_value:\s*(\w+)$r_db_acc/ || $line =~ /^property_value:\s*(\w+)\s+"(\w+)"$r_db_acc/) { # TODO re-implement this once the OBO spec is more mature...
					
						my $r2_type = $result->get_relationship_type_by_id($1); # Is this relationship type already in the ontology?
						if (!defined $r2_type){
							$r2_type = OBO::Core::RelationshipType->new();      # if not, create a new relationship type
							$r2_type->id($1);
							$result->add_relationship_type($r2_type);           # add it to the ontology
						}

						#
						# create the triplet
						#						
						my $rel = OBO::Core::Relationship->new();
						my $id  = $instance->id().'_'.$1.'_'.$2;                # instance --> rel --> [term|instance|datatype]
						$id     =~ s/\s+/_/g;
						$rel->id($id);
						$rel->type($r2_type->id());
						
						if (!defined $3) {
							#
							# property_value: lastest_modification_by erick
							#
						    my $target = $result->get_term_by_id($2);           # suggest to OBOF to define TERMs before they are used so that parsers could know they are dealing with terms!
						    
						    if (defined $target) {                              # instance --> rel --> term
						    	
						    } else {
								$target = $result->get_instance_by_id($2);
								
								if (!defined $target) {                         # instance --> rel --> instance
									$target = OBO::Core::Instance->new();
									$target->id($2);
									$result->add_instance($target);
								}
						    }
						    
							$rel->link($instance, $target);                     # triplet: instance --> rel --> target
							$instance->property_value($rel);
							
							#$result->add_relationship($rel);                   # TODO Do we need this? or better add $ontology->{PROPERTY_VALUES}?
						} elsif (defined $3) {                                  # instance --> rel --> datatype
							#
							# property_value: lastest_modification_by "erick" xsd:string or e.g. shoe_size "12" xsd:positiveInteger
							#
							my $target = $result->get_instance_by_id($2);
							
							if (!defined $target) {                             # instance --> rel --> datatype
								$target = OBO::Core::Instance->new();
								$target->id($2);
								
								# data type check
								warn "Unrecommended XML-schema pritive (data type) found: '", $3, "'" unless (exists $allowed_data_types{$3});
								
								my $data_type = OBO::Core::Term->new();
								$data_type->id($3);
								#$result->add_term($data_type);                 # TODO Think about it...
								$target->instance_of($data_type);
								#$result->add_instance($target);                # TODO Think about it...
							}

							$rel->link($instance, $target);
							$instance->property_value($rel);
							
							#$result->add_relationship($rel);                   # TODO Do we need this? or better add $ontology->{PROPERTY_VALUES}?
						}						
					} elsif ($line =~ /^!/) {
						# skip line
					} else {					
						warn 'Unknown syntax found (and ignored) in line: ', $file_line_number, " (in file '", $self->{OBO_FILE}, "'):\n\t", $line, "\n";
					}
				}
				# Check for required fields: id
				if (defined $instance && !defined $instance->id()) {
					croak "No ID found in instance:\n", $chunk;
				}
				if ($intersection_of_counter == 1) { # IDEM TEST: ($intersection_of_counter != 0 && $intersection_of_counter < 2)
					 # TODO do INSTANCES have this tag?
					croak "Missing 'intersection_of' tag in instance:\n\n", $chunk, "\n";
				}
				if ($union_of_counter == 1) { # IDEM TEST: ($union_of_counter != 0 && $union_of_counter < 2) 
					carp "Missing 'union_of' tag in instance:\n\n", $chunk, "\n";
				}
				$file_line_number++;
			} elsif ($stanza && $stanza =~ /\[Annotation\]/) { # treat [Annotation]
				# TODO "Annotations are ignored by ONTO-PERL (they might be supported in the future).";
			}
		}
		
		#
		# Warn (and delete) on non used subsets which were defined in the header (subsetdef's)
		#
		my @set_of_all_ss = $result->subset_def_map()->key_set()->get_set();
		foreach my $pss (sort @set_of_all_ss) {
			if (!$used_subset{$pss}) {
				$result->subset_def_map()->remove($pss);
				warn "Unused subset found (and removed): '", $pss, "' (in file '", $self->{OBO_FILE}, "')";
			}
		}
		
		#
		# Warn (and delete) on non used synonym type def's which were defined in the header (synonymtypedef's)
		#
		my @set_of_all_synonymtypedef = $result->synonym_type_def_set()->get_set();
		foreach my $st (@set_of_all_synonymtypedef) {
			if (!$used_synonym_type_def{$st->name()}) {
				$result->synonym_type_def_set()->remove($st);
				warn "Unused synonym type def found (and removed): '", $st->name(), "' (in file '", $self->{OBO_FILE}, "')";
			}
		}
		
		#
		# Work-around for some ontologies like GO: Explicitly add the implicit 'is_a' if missing
		#
		if (!$result->has_relationship_type_id('is_a')){
			my $type = OBO::Core::RelationshipType->new();  # if not, create a new type
			$type->id('is_a');
			$type->name('is_a');
			$result->add_relationship_type($type);
		}
		
		$/ = "\n";

		return $result;
	} else { # if no header (chunk[0])
		carp "The OBO file '", $self->{OBO_FILE},"' does not have a correct header, please verify it.";
	}
}

1;

__END__


=head1 NAME

OBO::Parser::OBOParser  - An OBO (Open Biomedical Ontologies) file parser.
    
=head1 SYNOPSIS

use OBO::Parser::OBOParser;

use strict;

my $my_parser = OBO::Parser::OBOParser->new;

my $ontology = $my_parser->work("apo.obo");

$ontology->has_term($ontology->get_term_by_id("APO:B9999993"));

$ontology->has_term($ontology->get_term_by_name("small molecule"));

$ontology->get_relationship_by_id("APO:B9999998_is_a_APO:B0000000")->type() eq 'is_a';

$ontology->get_relationship_by_id("APO:B9999996_part_of_APO:B9999992")->type() eq 'part_of'; 


my $ontology2 = $my_parser->work("apo.obo");

$ontology2->has_term($ontology2->get_term_by_id("APO:B9999993"));

$ontology2->has_term($ontology2->get_term_by_name("cell cycle"));

$ontology2->get_relationship_by_id("APO:P0000274_is_a_APO:P0000262")->type() eq 'is_a';

$ontology2->get_relationship_by_id("APO:P0000274_part_of_APO:P0000271")->type() eq 'part_of'; 

=head1 DESCRIPTION

An OBOParser object parses an OBO-formatted file:

	http://www.geneontology.org/GO.format.obo-1_4.shtml
	
	http://berkeleybop.org/~cjm/obo2owl/obo-syntax.html

=head1 AUTHOR

Erick Antezana, E<lt>erick.antezana -@- gmail.comE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (c) 2006-2014 by Erick Antezana

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.7 or,
at your option, any later version of Perl 5 you may have available.

=cut