The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.
#!/usr/bin/perl
=head1 NAME

spearmans.pl - This program calculates Spearman's Rank Correlation 
between two datasets.

=head1 SYNOPSIS

This utility takes two files in the format of umls-similarity.pl output 
and calculates the Spearman's Rank Correlation. 

=head1 USAGE

Usage: spearmans.pl [OPTIONS] FILE1 FILE2

=head1 INPUT 

FILE1 FILE2

Two files in the following format:

score<>CUI1<>CUI2
score<>CUI3<>CUI4
ect ...

The output format of umls-similarity.pl will work here because it 
extracts the CUIs from the paranthesis, so no worries. If you are 
not using the CUI format or a umls-similarity.pl output file though, 
use the --word option described below. 

=head1 Optional Arguments:

Displays the quick summary of program options.

=head2 --word

The format of the input files contains words rather than CUIs and/or 
is not a umls-similarity.pl output file.

=head2 --N 

displays N - the number of term pairs the correlation 
is being calculated over. This would be any term pair 
that has a score greater than or equal to zero.

=head2 --precision NUMBER

Displays values up to NUMBER places of decimal. Default is 4. 

=head2 --version

Displays the version information.

=head1 OUTPUT

The Spearman's Rank Correlation between the two files.

=head1 SYSTEM REQUIREMENTS

=over

=item * Perl (version 5.8.5 or better) - http://www.perl.org

=item * UMLS::Interface - http://search.cpan.org/dist/UMLS-Interface

=item * UMLS::Similarity - http://search.cpan.org/dist/UMLS-Similarity

=back

=head1 CONTACT US
   
  If you have any trouble installing and using UMLS-Similarity, 
  please contact us via the users mailing list :
    
      umls-similarity@yahoogroups.com
     
  You can join this group by going to:
    
      http://tech.groups.yahoo.com/group/umls-similarity/
     
  You may also contact us directly if you prefer :
    
      Bridget T. McInnes: bthomson at cs.umn.edu 

      Ted Pedersen : tpederse at d.umn.edu

=head1 AUTHOR

 Bridget T. McInnes, University of Minnesota

=head1 COPYRIGHT

Copyright (c) 2007-2011,

 Bridget T. McInnes, University of Minnesota
 bthomson at cs.umn.edu
    
 Ted Pedersen, University of Minnesota Duluth
 tpederse at d.umn.edu


 Siddharth Patwardhan, University of Utah, Salt Lake City
 sidd@cs.utah.edu
 
 Serguei Pakhomov, University of Minnesota Twin Cities
 pakh0002@umn.edu

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to:

 The Free Software Foundation, Inc.,
 59 Temple Place - Suite 330,
 Boston, MA  02111-1307, USA.

=cut

###############################################################################

#                               THE CODE STARTS HERE
###############################################################################

#                           ================================
#                            COMMAND LINE OPTIONS AND USAGE
#                           ================================

use Getopt::Long;

eval(GetOptions( "version", "help", "word", "N", "precision=s", "t")) or die ("Please check the above mentioned option(s).\n");

#  if help is defined, print out help
if( defined $opt_help ) {
    $opt_help = 1;
    &showHelp();
    exit;
}

#  if version is requested, show version
if( defined $opt_version ) {
    $opt_version = 1;
    &showVersion();
    exit;
}

#  check for precision
my $precision = 4;
if(defined $opt_precision) {
    if ($opt_precision !~ /^\d+$/) {
	print STDERR "Value for switch --precision should be integer >= 0\n";
	&minimalUsageNotes();
	exit;
    }
    $precision = $opt_precision;
}
my $floatformat = join '', '%', '.', $precision, 'f';

# At least 2 terms should be given on the command line.
if( scalar(@ARGV) < 2 ) {
    print STDERR "At least 2 terms or CUIs should be given on the \n";
    print STDERR "command line.\n\n";
    &minimalUsageNotes();
    exit;
}

#  initialize variables
my $xfile = shift;
my $yfile = shift;

open(X, $xfile) || die "Could not open file: $xfile\n";
open(Y, $yfile) || die "Could not open file: $yfile\n";


my $ymean = 0;
my $xmean = 0;

my $ycount = 0;
my $xcount = 0;

my %xhash = ();
my %yhash = ();

my %xlist = ();
my %ylist = ();

my $xnegative = 0;
my $ynegative = 0;

my $xtotal = 0;
my $ytotal = 0;

while(<X>) {
    chomp;
    my ($score, $t1, $t2) = split/<>/;
    
    my $c1 = ""; my $c2 = "";
    if(defined $opt_word) { 
	$c1 = $t1; $c2 = $t2;
    }
    else {
	$t1=~/(C[0-9]+)/;
        $c1 = $1;
    
	$t2=~/(C[0-9]+)/;
	$c2 = $1;
    }

    $xtotal++;
    if($score < 0) { $xnegative++; next; }
    
    $term = "$c1 $c2";
    push @{$xhash{$score}}, $term;
    $xlist{$term}++;
}

while(<Y>) {
    chomp;
    my ($score, $t1, $t2) = split/<>/;

    my $c1 = ""; my $c2 = "";
    if(defined $opt_word) { 
	$c1 = $t1; $c2 = $t2;
    }
    else {
	$t1=~/(C[0-9]+)/;
	$c1 = $1;
    
	$t2=~/(C[0-9]+)/;
	$c2 = $1;
    }

    $ytotal++;
    if($score < 0) { $ynegative++; next; }
    
    $term = "$c1 $c2";
    push @{$yhash{$score}}, $term;
    $ylist{$term}++;
}

my %xrank = ();
my %yrank = ();

my $rank = 1; 
foreach my $score (sort {$b<=>$a} keys %xhash) {

    my $count = 0;
    my $computed_rank = 0; my $crank = $rank + 1;
    foreach my $term (@{$xhash{$score}}) {
	if(exists $ylist{$term}) {
	    $computed_rank += $crank;
	    $count++; $crank++;
	}
    } 
    
    if($count == 0) { next; }
    
    $computed_rank = $computed_rank / $count;
    
    foreach my $term (@{$xhash{$score}}) {
	if(! (exists $ylist{$term})) { next; }
	$xrank{$term} = $computed_rank;
	$xmean += $computed_rank;
	$xcount++;
	$rank++;
    } 
}

$rank = 0; 
foreach my $score (sort {$b<=>$a} keys %yhash) {
    
    my $count = 0;
    my $computed_rank = 0; my $crank = $rank + 1;
    foreach my $term (@{$yhash{$score}}) {
	if(exists $xlist{$term}) {
	    $computed_rank += $crank;
	    $count++; $crank++;
	}
    } 
    
    if($count == 0) { next; }
    
    $computed_rank = $computed_rank / $count;
    
    foreach my $term (@{$yhash{$score}}) {
	if(! (exists $xlist{$term})) { next; }
	$yrank{$term} = $computed_rank;
	$ymean += $computed_rank;
	$ycount++;
	$rank++;
    } 
}


$xmean = $xmean/$xcount;
$ymean = $ymean/$ycount;


my $numerator = 0;
my $xdenom = 0;
my $ydenom = 0;
foreach my $term (sort keys %xrank) {
    my $xi = $xrank{$term};
    my $yi = $yrank{$term};
            
    $numerator += ( ($xi-$xmean) * ($yi-$ymean) );
    
    $xdenom += ( ($xi - $xmean)**2 );
    $ydenom += ( ($yi - $ymean)**2 );
}

my $denominator = sqrt($xdenom * $ydenom);


if($denominator <= 0) { 
    print STDERR "Correlation can not be calculated.\n";
    print STDERR "Files do not contain similar ngrams.\n";
    exit;
}
my $pearsons = $numerator / $denominator;

my $score = sprintf $floatformat, $pearsons;

#  calculate N
my $yN= $ytotal - $ynegative;
my $xN= $xtotal - $xnegative; 
my $N = $yN;
if($xN < $yN){ $N = $xN; }

#  calculate tscore 
my $t = $score/(sqrt( (1-$score**2)/($N-2) ));

#  calculate z-score
my $Fr = .5 * log((1+$score)/(1-$score));
my $z  = sqrt(($N-3)/1.06) * $Fr;

print "Spearman's Rank Correlation: $score "; 
if(defined $opt_N)     { print "(N: $N) "; }
if(defined $opt_t)     { print "(t: $t) "; }
print "\n";

##############################################################################
#  function to output minimal usage notes
##############################################################################
sub minimalUsageNotes {
    
    print "Usage: spearmans.pl [OPTIONS] FILE1 FILE2\n";
    &askHelp();
    exit;
}

##############################################################################
#  function to output help messages for this program
##############################################################################
sub showHelp() {
        
    print "This is a utility that takes as input two files and returns\n";
    print "the spearman's rank correlation between the two datsets.\n\n";
  
    print "Usage: spearmans.pl [OPTIONS] FILE1 FILE2\n\n";

    print "Options:\n\n";

    print "--precision NUMBER       Displays values upto NUMBER places of \n";
    print "                         decimal.\n\n";

    print "--N                      Prints the total number of term\n";
    print "                         pairs the correlation metric is\n";
    print "                         using.\n\n";

    print "--word                   The format of the input files contains words \n";
    print "                         rather than CUIs or is not a umls-similarity.pl \n";
    print "                         output file. \n\n";

    print "--version                Prints the version number\n\n";
    
    print "--help                   Prints this help message.\n\n";
}

##############################################################################
#  function to output the version number
##############################################################################
sub showVersion {
    print '$Id: spearmans.pl,v 1.12 2013/02/15 22:51:39 btmcinnes Exp $';
    print "\nCopyright (c) 2009-2011, Ted Pedersen & Bridget McInnes\n";
}

##############################################################################
#  function to output "ask for help" message when user's goofed
##############################################################################
sub askHelp {
    print STDERR "Type spearmans.pl --help for help.\n";
}