The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl 

=pod

=head1 NAME

umls-senserelate-evaluation.pl - This program evaluates the 
umls-senserelate algorithm using the semeval scorer program 

=head1 SYNOPSIS

This program evaluates the senses assigned by the UMLS-SenseRelate
algorithm using the semeval scorer program 

=head1 USAGE

Usage: umls-senserelate-evaluation.pl [OPTIONS] LOG_DIRECTORY

=head2 OUTPUT

=head2 Required Options

=head3 LOG_DIRECTORY

This is the directory outputted by the umls-targetword-senserelate.pl program 
which contains the key and answer files for each of the target 
words. 

=head2 General Options:

=head3 --info FILE

Prints out accuracy information to FILE

=head3 --senses DIR|File

Sometimes the senses in the key file do not map directly to 
the senses in the answer file. This will happen when using the 
NLM-WSD dataset which uses tags to indicate the sense of the 
ambiguous word and then those tags are mapped to concepts in 
the UMLS. The key file would contain the CUIs while the answer 
file contains the tags. Therefore, we need the sense (sometimes 
called choice) files to evaluate the mappings. 

So this option takes the directory that contains the sense file 
for each target word you are going to disambiguate or just the file 
itself. 

The file for a target word contains the possible senses of that target 
word. 

This may be temporary but right now this is how I have it because often 
times the possible senses change depending on the version of the UMLS that 
you are using. I felt this allowed the most flexibility with it. 

The naming convention for this is a file called: <target word>.choices

The format for this file is:

    <tag>|<target word name>|semantic type|CUI

This format is based on the choice files in the NLM-WSD dataset which 
we use for our experiments. If you are using the NLM-WSD dataset you
can download these choice files from NLM's site. There are the 1999 
tagset and the 2007 tagset available. You can find them here:

  http://wsd.nlm.nih.gov/collaboration.shtml

=head3 --verbose

Prints out the per target word accuracy information to STDERR. 

=head3 --version

Displays the version information.

=head3 --help

Displays the help information

=head1 SYSTEM REQUIREMENTS

=over

=item * Perl (version 5.8.5 or better) - http://www.perl.org

=back

=head1 CONTACT US
   
  If you have any trouble installing and using UMLS-Similarity, 
  please contact us via the users mailing list :
    
      umls-similarity@yahoogroups.com
     
  You can join this group by going to:
    
      http://tech.groups.yahoo.com/group/umls-similarity/
     
  You may also contact us directly if you prefer :
    
      Bridget T. McInnes: bthomson at umn.edu 

      Ted Pedersen : tpederse at d.umn.edu

=head1 AUTHOR

 Bridget T. McInnes, University of Minnesota

=head1 COPYRIGHT

Copyright (c) 2010-2012,

 Bridget T. McInnes, University of Minnesota Twin Cities
 bthomson at umn.edu
    
 Ted Pedersen, University of Minnesota Duluth
 tpederse at d.umn.edu
 
 Serguei Pakhomov, University of Minnesota Twin Cities
 pakh0002 at umn.edu

 Ying Liu, University of Minnesota Twin Cities
 liux0395 at umn.edu

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to:

 The Free Software Foundation, Inc.,
 59 Temple Place - Suite 330,
 Boston, MA  02111-1307, USA.

=cut

###############################################################################

#                               THE CODE STARTS HERE
###############################################################################

#                           ================================
#                            COMMAND LINE OPTIONS AND USAGE
#                           ================================


use UMLS::Interface;
use UMLS::SenseRelate::TargetWord;
use Getopt::Long;

#  Parse the command line options.  No destination variables are passed to
#  GetOptions, so Getopt::Long stores every option it sees in the global
#  $opt_<name> (e.g. --info FILE sets $opt_info).  The subroutines below
#  read those globals directly.
GetOptions( "version", "verbose", "info=s", "help", "senses=s", "debug" )
    or die ("Please check the above mentioned option(s).\n");

#  debugging output is off unless --debug was given
my $debug = 0;

#  if help is requested, print out help and stop
if( defined $opt_help ) {
    &showHelp();
    exit;
}

#  if version is requested, show version and stop
if( defined $opt_version ) {
    &showVersion();
    exit;
}

#  turn on debug if defined
if( defined $opt_debug ) {
    $debug = 1;
}

#  The log directory must be given on the command line.
if(scalar(@ARGV) < 1) {
    print STDERR "The umls-targetword-senserelate log directory must be given on the command line.\n";
    &minimalUsageNotes();
    exit;
}

#  INFO is the (bareword) handle evaluateUsingScorer prints to when --info
#  is set.  Three-argument open keeps a leading '>' or '|' in the file name
#  from being interpreted as an open mode.
local(*INFO);
if(defined $opt_info) {
    open(INFO, '>', $opt_info) || die "Could not open --info FILE ($opt_info): $!\n";
}

#  the directory holding the .answers and .key files
my $log = shift;

#  target word => answer file name / key file name (relative to $log)
my %answerhash = ();
my %keyhash = ();

&getInputFiles($log);
&setSenses();
&evaluateUsingScorer();

#  buffered write errors only surface when the handle is closed
if(defined $opt_info) {
    close(INFO) || die "Could not write --info FILE ($opt_info): $!\n";
}

#  Evaluates the files in the log directory using the scorer program.
#
#  For every target word in %answerhash this runs the external score.pl
#  program on the word's answer and key files, stores its output in
#  <log>/<target word>.evaluation, and pulls the precision out of the line
#  of the form "precision: <number> (".  The per word precision is printed
#  to STDERR when --verbose is set and to the INFO handle when --info is
#  set.  The unweighted mean over all evaluated words is reported on STDERR.
#
#  Reads the file level variables $log, $debug, %answerhash and %keyhash
#  and the option globals $opt_verbose and $opt_info.  Takes no arguments.
#  Exits (through minimalUsageNotes) if a target word has no key file.
sub evaluateUsingScorer {

    if($debug) { print STDERR "In evaluateUsingScorer\n"; }

    my $total_precision = 0;
    my $total_tw        = 0;

    foreach my $tw (sort keys %answerhash) {

        #  NOTE(review): "sd" and "or" are always skipped; the reason is
        #  not recorded in this file -- confirm before removing.
        if($tw eq "sd") { next; }
        if($tw eq "or") { next; }

        my $answerfile = $answerhash{$tw};
        my $keyfile    = $keyhash{$tw};

        #  check that the key file exists
        if(! (defined $keyfile)) {
            print STDERR "ERROR: The key does not exist for $tw.\n";
            &minimalUsageNotes();
            exit;
        }

        my $evaluationfile = "$log/$tw.evaluation";

        #  the shell is needed for the output redirection
        my $status = system "score.pl $log/$answerfile $log/$keyfile > $evaluationfile";
        if($status != 0) {
            print STDERR "WARNING: score.pl did not finish successfully for $tw.\n";
        }

        open(my $eval, '<', $evaluationfile) || die "Could not open $evaluationfile\n";
        my $precision = "";
        while(my $line = <$eval>) {
            if($line =~ /precision\:\s+([0-9\.]+) \(/) {
                $precision = $1;
            }
        }
        close $eval;

        if(defined $opt_verbose) {
            print STDERR "$tw\t$precision\n";
        }

        if(defined $opt_info) {
            print INFO "$tw\t$precision\n";
        }

        #  a word whose scorer output has no precision line still counts
        #  towards the average, contributing zero
        if($precision ne "") { $total_precision += $precision; }
        $total_tw++;
    }

    #  nothing was evaluated: report it instead of dividing by zero
    if($total_tw == 0) {
        print STDERR "No target words were evaluated in the input ($log) directory\n";
        return;
    }

    my $overall = $total_precision / $total_tw;
    print STDERR "Evaluation files are located in the input ($log) directory\n";
    print STDERR "The Overall Precision for this dataset is: $overall\n";
}
	    
#  Gets the answer and key files from the log directory.
#
#  Argument: the path of the log directory.
#  Fills the file level hashes %answerhash and %keyhash, mapping each
#  target word to the name (without the directory) of its
#  <target word>.answers and <target word>.key file respectively.
#  Dies if the directory can not be opened.
sub getInputFiles {

    if($debug) { print STDERR "In getInputFiles\n"; }

    my $dir = shift;

    #  read the log directory
    opendir(my $dh, $dir) || die "Could not open the log ($dir) directory\n";
    my @files = grep { $_ ne '.' and $_ ne '..' } readdir $dh;
    closedir $dh;

    #  get each of the answer and key files in the directory
    foreach my $file (@files) {

        #  The suffix has to be the literal end of the name, otherwise
        #  unrelated files (monkey.txt) and the <word>.key.sense files
        #  written by setSenses would be picked up as key files.
        if($file =~ /^(.+)\.answers\z/) {
            $answerhash{$1} = $file;
        }
        elsif($file =~ /^(.+)\.key\z/) {
            $keyhash{$1} = $file;
        }
    }
}


#  Gets the sense information from the choice files if the --senses option
#  is defined, and rewrites the key files accordingly.
#
#  $opt_senses is either a directory of <target word>.choices files or a
#  single such file.  Each line of a choice file is split on '|'; the first
#  field is the tag and the fourth field is the CUI.  For every target word
#  that has both a choice file and an entry in %keyhash, a copy of the key
#  file is written to <log>/<key file>.sense in which the sense tag ending
#  each line (M followed by digits, after a '%') is replaced by its CUI,
#  and %keyhash is pointed at that copy.
#
#  Reads $opt_senses, $log and $debug; reads and updates %keyhash.
#  Takes no arguments.  Dies if a file can not be opened or if a single
#  --senses file is not named <target word>.choices.
sub setSenses {

    if(! (defined $opt_senses)) { return; }

    if($debug) { print STDERR "In setSenses\n"; }

    #  target word => path of its choice file
    my %files = ();
    if(-d $opt_senses) {
        opendir(my $dh, $opt_senses) || die "Could not open $opt_senses directory\n";
        my @entries = readdir $dh;
        closedir $dh;

        foreach my $file (@entries) {
            #  Anything that is not a choice file ('.', '..', CVS,
            #  index.shtml, ...) is ignored; a failed match must not
            #  reuse the capture of an earlier file.
            if($file =~ /^(.+?)\.choices/) {
                $files{$1} = "$opt_senses/$file";
            }
        }
    }
    else {
        my @parts = split/\//, $opt_senses;
        if(defined $parts[$#parts] && $parts[$#parts] =~ /^(.+?)\.choices/) {
            $files{$1} = $opt_senses;
        }
        else {
            die "The --senses file ($opt_senses) must be named <target word>.choices\n";
        }
    }

    foreach my $tw (sort keys %files) {

        #  a choice file without a matching key file has nothing to rewrite
        my $keyfile = $keyhash{$tw};
        if(! (defined $keyfile)) {
            if($debug) { print STDERR "No key file for $tw; its choice file is ignored\n"; }
            next;
        }

        #  tag => CUI for this target word only
        open(my $choices, '<', $files{$tw}) || die "Could not open --senses $files{$tw}\n";
        my %sensehash = ();
        while(my $line = <$choices>) {
            chomp $line;
            my($tag, $cui) = (split/\|/, $line)[0, 3];
            if(defined $tag && defined $cui) { $sensehash{$tag} = $cui; }
        }
        close $choices;

        my $nkeyfile = $keyfile . ".sense";
        open(my $key,  '<', "$log/$keyfile")  || die "Could not open key $log/$keyfile\n";
        open(my $nkey, '>', "$log/$nkeyfile") || die "Could not open key $log/$nkeyfile\n";
        while(my $line = <$key>) {
            chomp $line;
            #  lines without a tag, or with a tag the choice file does
            #  not know, are copied unchanged
            if($line =~ /(M[0-9]+)$/) {
                my $tag = $1;
                my $cui = $sensehash{$tag};
                if(defined $cui) {
                    #  the lookahead keeps %M1 from matching inside %M10
                    $line =~ s/\%\Q$tag\E(?![0-9])/\%$cui/g;
                }
            }
            print $nkey "$line\n";
        }
        close $key;
        close($nkey) || die "Could not write key $log/$nkeyfile\n";

        $keyhash{$tw} = $nkeyfile;
    }
}

##############################################################################
#  prints the one line usage summary to STDOUT, tells the user how to get
#  the full help (via askHelp) and then ends the program
##############################################################################
sub minimalUsageNotes {

    my $usage = "Usage: umls-senserelate-evaluation.pl [OPTIONS] LOG_DIRECTORY\n";

    print $usage;
    &askHelp();

    #  this function never returns to its caller
    exit;
}

##############################################################################
#  prints the help message (usage line plus one entry per option) to STDOUT
##############################################################################
sub showHelp() {

    #  the pieces of the message, in output order
    my @message = (
        "This is a utility \n",
        "Usage: umls-senserelate-evaluation.pl [OPTIONS] LOG_DIRECTORY\n\n",
        "\n\nGeneral Options:\n\n",
        "--senses FILE|DIR        File or directory containing the sense files\n\n",
        "--verbose                Prints accuracy information to STDOUT\n\n",
        "--info FILE              Prints accuracy information to FILE\n\n",
        "--version                Prints the version number\n\n",
        "--help                   Prints this help message.\n\n",
    );

    print join("", @message);
}

##############################################################################
#  prints the revision id of this script followed by the copyright line
#  to STDOUT
##############################################################################
sub showVersion {

    #  single quotes: the dollar signs of the id keyword are literal
    my $revision  = '$Id: umls-senserelate-evaluation.pl,v 1.6 2013/05/23 17:48:13 btmcinnes Exp $';
    my $copyright = "Copyright (c) 2010-2012, Ted Pedersen & Bridget McInnes";

    print "$revision\n$copyright\n";
}

##############################################################################
#  prints a hint on STDERR telling the user how to display the help message
##############################################################################
sub askHelp {

    my $hint = "Type umls-senserelate-evaluation.pl --help for help.\n";

    print STDERR $hint;
}