The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl

use strict;
use warnings;
use diagnostics;

use Test;
BEGIN { plan tests => 7991 };

# File       : GO-TermFinder.t
# Author     : Gavin Sherlock
# Date Begun : September 1st 2003

# $Id: GO-TermFinder-Obo.t,v 1.2 2007/11/15 18:34:39 sherlock Exp $

# This file forms a set of tests for the GO::TermFinder class

use GO::TermFinder;
use GO::AnnotationProvider::AnnotationParser;
use GO::OntologyProvider::OboParser;

$|=1;

# turn off warnings from the TermFinder

$GO::TermFinder::WARNINGS = 0;

my $ontologyFile   = "t/gene_ontology_edit.obo";
my $annotationFile = "t/gene_association.sgd"; 
my $aspect         = "P";

# we'll check that all the public methods still exist in the interface

my @methods = qw (findTerms);

my $ontology = GO::OntologyProvider::OboParser->new(ontologyFile => $ontologyFile,
						    aspect       => $aspect);

my $annotation = GO::AnnotationProvider::AnnotationParser->new(annotationFile=>$annotationFile);

my $termFinder = GO::TermFinder->new(annotationProvider=> $annotation,
				     ontologyProvider  => $ontology,
				     aspect            => $aspect);

ok($termFinder->isa("GO::TermFinder"));

# check the object returns a code reference when asked it it can do a
# method that should exist

foreach my $method (@methods){

    ok(ref($termFinder->can($method)), "CODE", "TermFinder can $method");

}

# now check that the findTerms method actually returns the correct
# answers for a selected list of genes (actually the methionine
# cluster from Spellman et al, 1998).

my @pvalues = $termFinder->findTerms(genes=>[qw(YPL250C
						MET11
						MXR1
						MET17
						SAM3
						MET28
						STR3
						MMP1
						MET1
						YIL074C
						MHT1
						MET14
						MET16
						MET3
						MET10
						ECM17
						MET2
						MUP1
						MET6)]);

&testHypotheses(@pvalues);

# now let's run exactly the same test again, but with a different
# casing of the genes

my @newpvalues = $termFinder->findTerms(genes=>[qw(ypl250c
						   Met11
						   mxr1
						   Met17
						   SAM3
						   met28
						   Str3
						   MMp1
						   mET1
						   YIl074c
						   Mht1
						   mEt14
						   Met16
						   Met3
						   mET10
						   ecm17
						   Met2
						   MuP1
						   MeT6)]);

# and compare that the stuff returned looks exactly the same

&compareHypotheses(\@pvalues, \@newpvalues, 1);

# now let's test the functionality of using a defined population
# to create the background distribution.  If we simply say that
# the defined population is every gene from the annotation parser
# then we should get the same result
    
my $newTermFinder = GO::TermFinder->new(annotationProvider=> $annotation,
					ontologyProvider  => $ontology,
					population        => [$annotation->allDatabaseIds],
					aspect            => $aspect);

my @poppvalues = $newTermFinder->findTerms(genes=>[qw(ypl250c
						      Met11
						      mxr1
						      Met17
						      SAM3
						      met28
						      Str3
						      MMp1
						      mET1
						      YIl074c
						      Mht1
						      mEt14
						      Met16
						      Met3
						      mET10
						      ecm17
						      Met2
						      MuP1
						      MeT6)]);

# again, check that the stuff returned looks exactly the same

&compareHypotheses(\@pvalues, \@poppvalues, 1);

# now try using a TermFinder with a limited population of just a few genes.
# All of the returned nodes should have a probability of 1

my $nextTermFinder = GO::TermFinder->new(annotationProvider=> $annotation,
					 ontologyProvider  => $ontology,
					 population        => [qw(ypl250c
								  Met11
								  mxr1
								  Met17
								  SAM3
								  met28
								  Str3
								  MMp1
								  mET1
								  YIl074c
								  Mht1
								  mEt14
								  Met16
								  Met3
								  mET10
								  ecm17
								  Met2
								  MuP1
								  MeT6)],
					 aspect            => $aspect);


@pvalues = $nextTermFinder->findTerms(genes=>[qw(ypl250c
						 Met11
						 mxr1
						 Met17
						 SAM3
						 met28
						 Str3
						 MMp1
						 mET1
						 YIl074c
						 Mht1
						 mEt14
						 Met16
						 Met3
						 mET10
						 ecm17
						 Met2
						 MuP1
						 MeT6)]);

foreach my $pvalue (@pvalues){

    # round the pvalue to 2 decimal places, to avoid precision problems

    my $val = sprintf("%.2f", $pvalue->{PVALUE});

    ok($val, "1.00");

}

# Now we need a test to see what happens when we create a term finder,
# and find terms for three 'nonsense' genes, to check that it doesn't
# cause a problem - we'll defined the population as being of a size
# of 3 more than exist in the annotation file, to accommodate these
# 3 extra nonsense genes

my @genes = qw (foo bar baz);

my $numGenes = scalar(@genes);

my $populationSize = $annotation->numAnnotatedGenes + $numGenes;

my $nonsenseTester = GO::TermFinder->new(annotationProvider=> $annotation,
					 ontologyProvider  => $ontology,
					 aspect            => $aspect,
					 totalNumGenes     => $populationSize);

@pvalues = $nonsenseTester->findTerms(genes=>\@genes);

# grab best node, which should be the 'unannotated' node

my $hypothesis = shift(@pvalues);

# check attributes

ok($hypothesis->{NODE}->term, "unannotated");
ok($hypothesis->{NODE}->goid, "GO:XXXXXXX");

# all our tested genes should be annotated to the node

ok($hypothesis->{NUM_ANNOTATIONS}, $numGenes);

# the total number of annotations to this node out of all genes should
# be the total number of genes minus those which have an annotation to
# this aspect.

ok($hypothesis->{TOTAL_NUM_ANNOTATIONS}, 

   ($populationSize - $annotation->numAnnotatedGenes($aspect)));

# all of the above tests have been using the default setting for
# multiple hypothesis correction, which should default to bonferroni.
# We now need to test that using bonferroni as the supplied argument
# gives the same answers as no argument, and also check that it gives
# the expected correction factor, and run the termfinder with the
# 'simulation' argument, and with the 'none' argument.

# first try with an explicit bonferroni argument

my @bonferroni = $termFinder->findTerms(genes      => [qw(ypl250c
							  Met11
							  mxr1
							  Met17
							  SAM3
							  met28
							  Str3
							  MMp1
							  mET1
							  YIl074c
							  Mht1
							  mEt14
							  Met16
							  Met3
							  mET10
							  ecm17
							  Met2
							  MuP1
							  MeT6)],
					correction => 'bonferroni');

# and compare the results to previously generated pvalues

&compareHypotheses(\@newpvalues, \@bonferroni, 1);

# we can also check that the correction value was correct - it should
# be the same as the number of hypotheses we got back; also, in this
# case, we know that should be 37.  Can only test if the corrected
# p_value is less than 1, as we have a ceiling placed on it at 1.

ok(scalar(@newpvalues), 37);

foreach my $hypothesis (@newpvalues){    

    if ($hypothesis->{CORRECTED_PVALUE} < 1){

	ok ($hypothesis->{CORRECTED_PVALUE}/$hypothesis->{PVALUE}, scalar(@newpvalues));

    }

}

# now let's test the termFinder when we ask for no correction - we
# should get identical results as we got above, except there are no
# corrected p-values.

my @noCorrection = $termFinder->findTerms(genes      => [qw(ypl250c
							    Met11
							    mxr1
							    Met17
							    SAM3
							    met28
							    Str3
							    MMp1
							    mET1
							    YIl074c
							    Mht1
							    mEt14
							    Met16
							    Met3
							    mET10
							    ecm17
							    Met2
							    MuP1
							    MeT6)],
					  correction => 'none');

&compareHypotheses(\@newpvalues, \@noCorrection, 0);

# as our final test of multiple hypothesis correction, we want to
# see if the simulation method works correctly

my @simulation = $termFinder->findTerms(genes      => [qw(ypl250c
							  Met11
							  mxr1
							  Met17
							  SAM3
							  met28
							  Str3
							  MMp1
							  mET1
							  YIl074c
							  Mht1
							  mEt14
							  Met16
							  Met3
							  mET10
							  ecm17
							  Met2
							  MuP1
							  MeT6)],
					correction => 'simulation');

# and compare the results to previously generated pvalues, but ignore
# the corrected pvalues

&compareHypotheses(\@newpvalues, \@simulation, 0);

# not sure what tests we'll do for the FDR calculations, but we should
# at least make sure that they don't throw an error when generated,
# and that the pvalues are the same:

my @fdr = $termFinder->findTerms(genes      => [qw(ypl250c
						   Met11
						   mxr1
						   Met17
						   SAM3
						   met28
						   Str3
						   MMp1
						   mET1
						   YIl074c
						   Mht1
						   mEt14
						   Met16
						   Met3
						   mET10
						   ecm17
						   Met2
						   MuP1
						   MeT6)],
				 calculateFDR => 1);

&compareHypotheses(\@fdr, \@bonferroni, 1);

# now let's test that if we say that we're looking for significant
# terms when we simply have a list of all genes, that we get none -
# indeed the uncorrected p-values should all be equal to 1.

my @nonsignificant = $termFinder->findTerms(genes=>[$annotation->allDatabaseIds]);

foreach my $hypothesis (@nonsignificant){

    ok($hypothesis->{PVALUE}, 1);

}

# now we want to test what happens when we use a TermFinder with a
# defined population, and ask if to findTerms for a list of genes,
# some of which are not in the population.

# above, we generated @poppvalues, which were the pvalues generated
# with a list of genes, with all databaseIds as the background.  Now
# we will generate some new pvalues with that same TermFinder object,
# but add in a few bogus genes at the end.  The bogus genes should be
# ignored, and we should get exactly the same result.

my @poppvalues2 = $newTermFinder->findTerms(genes=>[qw(ypl250c
						       Met11
						       mxr1
						       Met17
						       SAM3
						       met28
						       Str3
						       MMp1
						       mET1
						       YIl074c
						       Mht1
						       mEt14
						       Met16
						       Met3
						       mET10
						       ecm17
						       Met2
						       MuP1
						       MeT6

						       BLAH
						       BLAH2
						       XXXZZZ
						       CDCDCDC)]);

my @discardedGenes = $newTermFinder->discardedGenes;

# 4 genes should have been discarded

ok(scalar(@discardedGenes), 4);

# now check that the nodes and pvalues returned look exactly the same
# as we saw before, when there were no genes to be discarded

&compareHypotheses(\@poppvalues, \@poppvalues2, 1);

# also need to test that the genes are correctly discarded when we 
# are doing a correction or calculating the FDR

my @poppvalues3 = $newTermFinder->findTerms(genes=>[qw(ypl250c
						       Met11
						       mxr1
						       Met17
						       SAM3
						       met28
						       Str3
						       MMp1
						       mET1
						       YIl074c
						       Mht1
						       mEt14
						       Met16
						       Met3
						       mET10
						       ecm17
						       Met2
						       MuP1
						       MeT6

						       BLAH
						       BLAH2
						       XXXZZZ
						       CDCDCDC)],

					    calculateFDR => 1);

my @discardedGenes2 = $newTermFinder->discardedGenes;

# 4 genes should have been discarded

ok(scalar(@discardedGenes), 4);
 
my @poppvalues4 = $newTermFinder->findTerms(genes=>[qw(ypl250c
						       Met11
						       mxr1
						       Met17
						       SAM3
						       met28
						       Str3
						       MMp1
						       mET1
						       YIl074c
						       Mht1
						       mEt14
						       Met16
						       Met3
						       mET10
						       ecm17
						       Met2
						       MuP1
						       MeT6

						       BLAH
						       BLAH2
						       XXXZZZ
						       CDCDCDC)],

					    correction => 'simulation');

my @discardedGenes3 = $newTermFinder->discardedGenes;

# 4 genes should have been discarded

ok(scalar(@discardedGenes), 4);

# now let's test that if we have a background population defined, and
# that if none of the genes provided to find terms for are in the
# background, that a fatal error is thrown

eval {

    $newTermFinder->findTerms(genes=>[qw(BLAH
					 BLAH2
					 XXXZZZ
					 CDCDCDC)]);

};

ok($@, qr/None of the genes provided for analysis are found in the background population/, "should die if genes not in background");

# now some tests that check that we have GO::TermFinder working
# correctly with respect to the aspect node - in this case, test the
# biological_process node, using a bunch of genes that are all
# annotated directly to this node.  Note, this is to accommodate the
# changed behaviour, required by the change Ontologies, where they
# eliminated the unannotated nodes

my @unannotatedGenes = qw(YPR108W-A
			  YPR109W
			  YPR114W
			  YPR115W
			  YPR116W
			  YPR117W
			  YPR127W
			  YPR145C-A
			  YPR147C
			  YPR148C
			  YPR153W
			  YPR157W
			  YPR158W
			  YPR159C-A
			  YPR172W
			  YPR174C
			  YPR196W
			  YPR202W
			  YPR203W
			  YPR204W);

my @unannotatedListPValues = $newTermFinder->findTerms(genes=>\@unannotatedGenes);

ok(scalar(@unannotatedListPValues), 1, "unannotated genes return a single term.");

my $topHypothesis = $unannotatedListPValues[0];

ok($topHypothesis->{NODE}->goid, "GO:0008150");
ok($topHypothesis->{NODE}->term, "biological_process");
ok($topHypothesis->{NUM_ANNOTATIONS}, scalar(@unannotatedGenes), "all in list should be in this node");
ok($topHypothesis->{TOTAL_NUM_ANNOTATIONS}, 1505, "total num directly annotated to biological_process, hand checked");

# now add a single annotated gene, and make sure that we still get the
# same number of total annotatations

@unannotatedListPValues = $newTermFinder->findTerms(genes=>[(@unannotatedGenes, "CLB2", "CDC28")]);

ok(scalar(@unannotatedListPValues), 15, "many terms are now tested.");

$topHypothesis = $unannotatedListPValues[0];

ok($topHypothesis->{NODE}->goid, "GO:0008150");
ok($topHypothesis->{NODE}->term, "biological_process");
ok($topHypothesis->{NUM_ANNOTATIONS}, scalar(@unannotatedGenes), "should still be the same size as the list");
ok($topHypothesis->{TOTAL_NUM_ANNOTATIONS}, 1505, "total num directly annotated to biological_process, hand checked");

######################################################################################
sub testHypotheses{
######################################################################################
# the following are what should be the 11 most significant goids for
# the test set of genes using this frozen dataset.  Note if two nodes
# result in the same p-value, they will be sorted by GOID, using a
# text sort.

    my @pvalues = @_;

    my @topGoids = ("GO:0006790",
		    "GO:0000096",
		    "GO:0006555",
		    "GO:0000097",
		    "GO:0006520",
		    "GO:0006519",
		    "GO:0009066",
		    "GO:0009308",
		    "GO:0006807",
		    "GO:0044272",
		    "GO:0000103");

    # now check that these are returned by the TermFinder

    for (my $i = 0; $i< @topGoids; $i++){

	ok($pvalues[$i]->{NODE}->goid, $topGoids[$i], "$topGoids[$i] is ${i}th in the list of top hypotheses");        

    }

    return;

}

######################################################################################
sub compareHypotheses{
######################################################################################
# This subroutine expects to receive two arrays (by reference) of
# hypotheses generated by GO::TermFinder.  It will check whether they
# are identical.  An third arguments indicates if corrected p-values
# should be compared.

    my ($ref1, $ref2, $shouldCompareCorrectedPValues) = @_;

    for (my $i = 0; $i < @{$ref1}; $i++){

	ok($ref1->[$i]->{PVALUE},                $ref2->[$i]->{PVALUE}, $i."th p-value");

	# Sometimes we don't want to compare the corrected p-values,
	# as a different method of multiple hypothesis correction may
	# have been used between two different runs

	if ($shouldCompareCorrectedPValues){

	    ok($ref1->[$i]->{CORRECTED_PVALUE},      $ref2->[$i]->{CORRECTED_PVALUE}, $i."th corrected p-value");  

	}

	ok($ref1->[$i]->{NUM_ANNOTATIONS},       $ref2->[$i]->{NUM_ANNOTATIONS},       $i."th NUM_ANNOTATIONS");  
	ok($ref1->[$i]->{TOTAL_NUM_ANNOTATIONS}, $ref2->[$i]->{TOTAL_NUM_ANNOTATIONS}, $i."th TOTAL_NUM_ANNOTATIONS"); 
	ok($ref1->[$i]->{NODE}->goid,            $ref2->[$i]->{NODE}->goid,            $i."th GOID");  
	ok($ref1->[$i]->{NODE}->term,            $ref2->[$i]->{NODE}->term,            $i."th TERM");  

	# now check the genes

	# same number

	ok(scalar keys (%{$ref1->[$i]->{ANNOTATED_GENES}}), scalar keys (%{$ref2->[$i]->{ANNOTATED_GENES}}), $i."th number of annotated genes");

	foreach my $gene (keys (%{$ref1->[$i]->{ANNOTATED_GENES}})){

	    # each one exists

	    ok (exists $ref2->[$i]->{ANNOTATED_GENES}{$gene}, 1,  $i."th ANNOTATED_GENE exists");

	    # and has the same name - note has to be done case-insensitively
	    
	    ok(uc($ref1->[$i]->{ANNOTATED_GENES}{$gene}), uc($ref2->[$i]->{ANNOTATED_GENES}{$gene}), $i."th ANNOTATED_GENE name");

	}

    }

    return;

}


=head1 Modifications

 List them here.

 CVS information:

 # $Author: sherlock $
 # $Date: 2007/11/15 18:34:39 $
 # $Log: GO-TermFinder-Obo.t,v $
 # Revision 1.2  2007/11/15 18:34:39  sherlock
 # Tweaked test to be regex instead of equality, as different Perls might
 # add different things to exceptions.  This fixes a failure I was seeing
 # on Solaris.
 #
 # Revision 1.1  2007/03/18 01:33:14  sherlock
 # Adding new test files
 #
 # Revision 1.10  2006/07/28 00:02:46  sherlock
 # added new tests to make sure discarded genes are not lost when
 # calculating FDR or running simulations.
 #
 # Revision 1.9  2006/07/23 21:27:19  sherlock
 # forgot to turn warnings back off
 #
 # Revision 1.8  2006/07/23 00:41:40  sherlock
 # uncommented tests that were previously commented out for performance
 # reasons, as they seem to run fine.  Also, added test for discarded
 # genes when using a population.
 #
 # Revision 1.7  2004/10/14 22:32:04  ihab
 # Removed tests of the internal P value computation; these are now in
 # GO-TermFinder-Native.t and are testing the C++ version.
 #
 # Revision 1.6  2004/05/06 01:58:27  sherlock
 # Added in tests to check that simulation, bonferroni and FDR options
 # work correctly.
 #
 # Revision 1.5  2003/12/11 19:47:35  sherlock
 # added in some tests to check that TermFinder behaves correctly in the
 # case that unrecognized gene names are passed in, which was not working
 # properly previously.
 #
 # Revision 1.4  2003/12/03 02:30:25  sherlock
 # added in a bunch of tests to be more precise in the testing of the
 # term finder, and to test the functionality of providing a population
 # of genes from which to calculate the background distribution
 #
 # Revision 1.3  2003/11/22 00:09:12  sherlock
 # added some new tests to see that differently cased versions of the
 # gene names still give the same result.
 #

=cut