The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Bio::DOOP::Util::Run::GeneMerge;

use strict;
use warnings;
use POSIX;

=head1 NAME

Bio::DOOP::Util::Run::GeneMerge - GeneMerge based GO analyzer

=head1 VERSION

Version 0.02

=cut

our $VERSION = '0.02';

=head1 SYNOPSIS

  #!/usr/bin/perl -w

  use Bio::DOOP::DOOP;

  $test = Bio::DOOP::Util::Run::GeneMerge->new();

  if ($test->getDescFile("GO/use/GO.BP.use") < 0){
     print"Desc error\n"
  }

  if ($test->getAssocFile("GO/assoc/A_thaliana.converted.BP") < 0){
     print"Assoc error\n"
  }

  if ($test->getPopFile("GO/pop.500") < 0){
     print"Pop error\n"
  }

  if ($test->getStudyFile("GO/study.500/combined1314.list") < 0){
     print"Study error\n"
  }

  $results = $test->getResults();

  foreach $res (@{$results}) {
     print $$res{'GOterm'}," ",$$res{'RawEs'},"\n";
  }

=head1 DESCRIPTION

This is a module based on GeneMerge v1.2.

Original program described in:

Cristian I. Castillo-Davis and Daniel L. Hartl
GeneMerge - post-genomic analysis, data mining, and hypothesis testing
Bioinformatics Vol. 19 no. 7 2003, Pages 891-892

The original program is not really good for large scale analysis, 
because the design uses a lot of I/O processes. This version fetches
everything into memory at start.

=head1 AUTHORS

Tibor Nagy, Godollo, Endre Sebestyen, Martonvasar,

=head1 METHODS

=head2 new

Create new GeneMerge object.

   $genemerge = Bio::DOOP::Util::Run::GeneMerge->new;

=cut

sub new {
  my $self                       = {};
  my $dummy                      = shift;

  $self->{HoAssoc}               = ();
  $self->{HoPopAssocCount}       = ();
  $self->{HoPopAssocFreq}        = ();
  $self->{PopGeneNo}             = 0;
  $self->{HoDesc}                = ();
  $self->{StudyGeneNo}           = 0;
  $self->{StudyGeneNoAssoc}      = 0;
  $self->{HoStudyGeneAssocCount} = ();
  $self->{HoAssocStudyGene}      = ();
  $self->{StudyGeneUniqAssoc}    = 0;
  $self->{BonferroniCorr}        = 0;
  $self->{HoStudyGeneAssocPVal}  = ();

  bless $self;
  return ($self);
}

=head2 getAssocFile

The method loads the GO association file and stores it in memory.
The file format is the following. Each line starts with a cluster id, and after some whitespace
the associated GO ids are enumerated, separated by semicolons.

81001020 GO:0016020;GO:0003674;GO:0008150                                                                                                                    
81001110 GO:0005739;GO:0003674

   $genemerge->getAssocFile('/tmp/assoc.txt');

=cut

sub getAssocFile {
   my $self                = shift;
   my $filename            = shift;

   open ASSOC, $filename or return(-1);

   while(<ASSOC>){
      chomp;
      my @assoc_line = split;
      my $assoc_gene = $assoc_line[0];
      my @assoc_go   = ();

      if ($assoc_line[1]) {
         @assoc_go = split /;/, $assoc_line[1];
         @{$self->{HoAssoc}{$assoc_gene}} = @assoc_go;
      }

   }
   close ASSOC;

   return(0);
}

=head2 getPopFile

The method loads the population file and stores it in memory.
The file format is the following. Each line contains one and only one
cluster id.

81001020
81001110

   $genemerge->getPopFile('/tmp/pop.txt');

=cut

sub getPopFile {
   my $self                = shift;
   my $filename            = shift;

   open POP, $filename or return(-1);
   while (<POP>) {
      chomp;
      my $PopGene = $_;
      $self->{PopGeneNo}++;

      if (exists $self->{HoAssoc}{$PopGene}) {
         foreach my $AssocGO (@{$self->{HoAssoc}{$PopGene}})  {
            $self->{HoPopAssocCount}{$AssocGO}++;
         }
      }
   }
   close POP;

   $self->popFreq();

   return(0);
}

=head2 popFreq

The method calculates the population frequency. Do not use it directly.

=cut

sub popFreq {
   my $self                = shift;

   foreach my $PopAssocCountKey (keys %{$self->{HoPopAssocCount}}) {
      my $freq = $self->{HoPopAssocCount}{$PopAssocCountKey} / $self->{PopGeneNo};
      $self->{HoPopAssocFreq}{$PopAssocCountKey} = $freq;
   }
}

=head2 getDescFile

The method loads the GO description file.
The file format is the following. Each line starts with the GO id, and separated by a tab,
the description of the GO id.

GO:0000007      low-affinity zinc ion transporter activity                                                                                                   
GO:0000008      thioredoxin

   $genemerge->getDescFile('/tmp/desc.txt');

=cut

sub getDescFile {
   my $self                = shift;
   my $filename            = shift;

   open DESC, $filename or return(-1);
   while (<DESC>) {
      chomp;
      my @desc_line = split /\s/, $_, 2;
      $self->{HoDesc}{$desc_line[0]} = $desc_line[1];
   }
   close DESC;

   return(0);
}

=head2 getStudyFile

The method loads the study data set, counts GO frequencies, calculates P values
based on the hypergeometric distribution, and corrects P values, based on the
Bonferroni method.

The file format of the study file is the following. Each line contains one and only one
cluster id.

81001020
81001110

   $genemerge->getStudyFile('/tmp/study.txt');

=cut

sub getStudyFile {
   # TODO we should split this in 2 or 3.
   my $self                = shift;
   my $filename            = shift;

   open STUDY, $filename or return(-1);
   while(<STUDY>) {
      chomp;
      $self->{StudyGeneNo}++;
      my $StudyGene = $_;
      if (exists $self->{HoAssoc}{$StudyGene}) {
         foreach my $StudyGeneGO (@{$self->{HoAssoc}{$StudyGene}}) {
            $self->{HoStudyGeneAssocCount}{$StudyGeneGO}++;
            push @{$self->{HoAssocStudyGene}{$StudyGeneGO}}, $StudyGene;
         }
      } else {
         $self->{StudyGeneNoAssoc}++;
      }

   }
   close STUDY;

   #Bonferroni correction
   foreach my $StudyGeneAssocCountKey (keys %{$self->{HoStudyGeneAssocCount}}){
      $self->{StudyGeneUniqAssoc}++;
      if($self->{HoPopAssocFreq}{$StudyGeneAssocCountKey} > (1 / $self->{PopGeneNo})) {
         $self->{BonferroniCorr}++;
      }
   }

   #Calculate P-values based on hypergeometric distribution
   my $PVal = 0;
   my $PValC = 0;
   my $N = $self->{PopGeneNo};
   my $K = $self->{StudyGeneNo};

   foreach my $StudyGeneAssocCountKey (keys %{$self->{HoStudyGeneAssocCount}}){
      my $P = $self->{HoPopAssocFreq}{$StudyGeneAssocCountKey};
      my $R = $self->{HoStudyGeneAssocCount}{$StudyGeneAssocCountKey};
      if ($R != 1) {
         $PVal  = $self->hypergeometric($N,$P,$K,$R);
         $PValC = ($PVal * $self->{BonferroniCorr} >= 1) ? 1 : $PVal * $self->{BonferroniCorr};
      } else {
         $PVal  = 'NA';
         $PValC = 'NA';
      }
      ${$self->{HoStudyGeneAssocPVal}{$StudyGeneAssocCountKey}}[0] = $PVal;
      ${$self->{HoStudyGeneAssocPVal}{$StudyGeneAssocCountKey}}[1] = $PValC;
   }

   return(0);
}

=head2 getResults

The method gives back all the results as an arrayref of hashes.

  $results = $genemerge->getResults();
  foreach $result (@{$results}) {
    $goterm       = $$result{'GOterm'};
    $popfreq      = $$result{'PopFreq'};
    $popfrac      = $$result{'PopFrac'};
    $studyfrac    = $$result{'StudyFrac'};
    $studyfracall = $$result{'StudyFracAll'};
    $raw_escore   = $$result{'RawEs'};
    $escore       = $$result{'EScore'};
    $desc         = $$result{'Desc'};
    @contrib      = @{$$result{'Contrib'}};
  }

=cut

sub getResults {
   my $self                = shift;
   my @results;

   foreach my $goterm (sort keys %{$self->{HoStudyGeneAssocCount}}) {
      my %result;
      $result{'GOterm'}       = $goterm;
      $result{'PopFreq'}      = $self->{HoPopAssocFreq}{$goterm};
      $result{'PopFrac'}      = $self->{HoPopAssocCount}{$goterm};
      $result{'PopFracAll'}   = $self->{PopGeneNo};
      $result{'StudyFrac'}    = $self->{HoStudyGeneAssocCount}{$goterm};
      $result{'StudyFracAll'} = $self->{StudyGeneNo};
      $result{'RawEs'}        = ${$self->{HoStudyGeneAssocPVal}{$goterm}}[0];
      $result{'EScore'}       = ${$self->{HoStudyGeneAssocPVal}{$goterm}}[1];
      $result{'Desc'}         = $self->{HoDesc}{$goterm};
      $result{'Contrib'}      = \@{$self->{HoAssocStudyGene}{$goterm}};
      push @results, \%result;
   }

   return(\@results);
}

=head2 hypergeometric

This is an internal function to calculate the hypergeometric distribution. Do not use it directly.

=cut

sub hypergeometric {
   my $self             = shift;
   my $n                = shift;
   my $p                = shift;
   my $k                = shift;
   my $r                = shift;

   my $i    = '0';
   my $q    = '0';
   my $np   = '0';
   my $nq   = '0';
   my $top  = '0';
   my $sum  = '0';
   my $lfoo = '0';

   my $logNchooseK = '0';

   $q = 1 - $p;

   $np = floor( $n * $p + 0.5 );
   $nq = floor( $n * $q + 0.5 );

   $logNchooseK = &logNchooseK( $n, $k );

   $top = ($np < $k) ? $np : $k;

   $lfoo = &logNchooseK($np, $top) + &logNchooseK($n * (1 - $p), $k - $top);

   for ($i = $top ; $i >= $r ; $i--) {
      $sum += exp($lfoo - $logNchooseK);
      if ($i > $r) { $lfoo = $lfoo + log($i / ($np - $i + 1)) + log(($nq - $k + $i) / ($k - $i + 1)) }
   }
   return $sum;
}

=head2 logNchooseK

Another internal function for the correct statistical results. Do not use it directly.

=cut

sub logNchooseK {
	my $n = shift;
	my $k = shift;

	my $i = '0';
	my $result = '0';

	$k = ($k > ($n - $k)) ? $n - $k : $k;

	for ($i = $n ; $i > ($n - $k) ; $i--) { $result += log($i) }

	$result -= &lFactorial($k);

	return $result;
}

=head2 lFactorial

Factorial calculating function. Do not use it directly.

=cut

sub lFactorial {
	my $number = shift;
	my $result = 0;
	my $i;

	for ($i = 2 ; $i <= $number ; $i++) { $result += log($i) }

	return $result;
}

1;