The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl -w

=head1 NAME

mm-xml2aw-xml.pl - This program converts MetaMap xml (mm-xml) formatted  
text into the all words xml (aw-xml) format. 

=head1 SYNOPSIS

This program converts MetaMap xml (mm-xml) formatted text into the 
all words xml (aw-xml) format. 

=head1 USAGE

perl mm-xml2aw-xml.pl SOURCE DESTINATION

=head2 SOURCE
 
=head2 DESTINATION

=head2 Optional Arguments:

=head3 --log DIRECTORY

Directory to contain temporary and log files. DEFAULT: log

=head3 --help

Displays the quick summary of program options.

=head3 --version

Displays the version information.

=head1 OUTPUT

All words xml format similar to the SemEval all words disambiguation 
task. In this format, each term assigned one or more concepts in the 
metamap xml file are outputed as follows:

 <?xml version="1.0"?>
 <!DOCTYPE corpus SYSTEM  "all-words.dtd">
 <corpus lang="en">
 <text id="001">
 <head id="d001.s001.t001" candidates="C1280500,C2348382">effect</head>
 of
 the
 <head id="d001.s001.t004" candidates="C0449238">duration</head>
 </text>
 </corpus>

There exists an addition to the regular SemEval format. The candidate 
tags contain each possible sense of the term assigned by metamap. These 
will be used as the possible senses in the umls-allwords-senserelate.pl 
program when using the --candidate option. Otherwise, the senses come 
from doing a dictionary lookup in the MRCONSO table of the UMLS. 

=head1 PROGRAM REQUIREMENTS

=over

=item * Perl (version 5.8.5 or better) - http://www.perl.org

=back

=head1 AUTHOR

 Bridget T. McInnes, University of Minnesota, Twin Cities

=head1 COPYRIGHT

 Copyright (c) 2007-2008,
 Bridget T. McInnes, University of Minnesota, Twin Cities
 bthomson at cs.umn.edu

 Ted Pedersen, University of Minnesota Duluth
 tpederse at d.umn.edu

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to 

 The Free Software Foundation, Inc.,
 59 Temple Place - Suite 330,
 Boston, MA  02111-1307, USA.

=cut

	
###############################################################################

#                               THE CODE STARTS HERE
###############################################################################

#                           ================================
#                            COMMAND LINE OPTIONS AND USAGE
#                           ================================

use Getopt::Long; 
use XML::Twig;
use File::Spec;

eval(GetOptions( "version", "help" , "log=s"))or die ("Please check the above mentioned option(s).\n");


#  if help is defined, print out help
if( defined $opt_help ) {
    $opt_help = 1;
    &showHelp();
    exit;
}

#  if version is requested, show version
if( defined $opt_version ) {
    $opt_version = 1;
    &showVersion();
    exit;
}

my $default = "";
my $set     = "";


#  set the time stamp
my $timestamp = &time_stamp();


#  set the log file
my $log = "log.$timestamp";
if(defined $opt_log) { 
    $log = $opt_log; 
    $set .= "  --log $log\n";
}
else { $default .= "  --log $log\n"; }

if($set ne "") { 
    print STDERR "User Options: \n";
    print STDERR "$set\n";
}
if($default ne "") { 
    print STDERR "Default Options:\n";
    print STDERR "$default\n";
}

# At least 2 terms should be given on the command line.
if(scalar(@ARGV) < 2) {
    print STDERR "The input and output files must be given on the command line.\n";
    &minimalUsageNotes();
    exit;
}

my $outfile = shift;
my $infile  = shift;

#  check that output file has been supplied
if( !($outfile) ) {
    print STDERR "No output file (DESTINATION) was supplied.\n";
    &askHelp();
    exit;
}

#  check if the output file  already exists
if( -e $outfile ) {
    print "DESTINATION ($outfile) already exists! Overwrite (Y/N)?";
    my $reply = <STDIN>;  chomp $reply; $reply = uc($reply);
    exit 0 if ($reply ne "Y"); 
} 

#  open the input and output files
open(INFILE, $infile) || die "Could not open $infile\n";
open(OUTFILE, ">$outfile") || die "Could not open $outfile\n";


my @abstracts = (); my $abstract = "";
while(<INFILE>) { 
    if($_=~/\<?xml version/) { 
	if($abstract ne "") { push @abstracts, $abstract; }
	$abstract = "";
    }
    $abstract .= $_;
}

#  print header information 
print OUTFILE "<?xml version=\"1.0\"?>\n";
print OUTFILE "<!DOCTYPE corpus SYSTEM  \"all-words.dtd\">\n";
print OUTFILE "<corpus lang=\"en\">\n";

my $abstractid = 0;
foreach my $abstract (@abstracts) { 
    
    if($abstract=~/^\s*$/) { next; }
    
    # increment id
    $abstractid++;
    
    #  print document id
    my $aid = sprintf("%03d", $abstractid);
    print OUTFILE "<text id=\"$aid\">\n";

    #  set the xml file for this abstract
    if(-e "$infile.processing") { system "rm $infile.processing"; }
    open(FILE, ">$infile.processing") || die "Could not open $infile.processing\n";
    print FILE "$abstract";
    close FILE;

    #  load the metamap xml output
    my $t= XML::Twig->new();
    $t->parsefile("$infile.processing");
    my $root = $t->root;


    #  initialize variables
    my @cuis    = (); 
    my @matches = ();
    my @tokens  = ();


    #  loop through tokens
    my $method= $root; my $sentenceid = 0; my $tokenid = 0;
    while( $method=$method->next_elt( $root )) { 
		
	if($method->local_name eq "UttText") { 
	    $sentenceid++; 
	    $tokenid = 0;
	}

	if($method->local_name eq "InputMatch") { 
	    my $token = $method->text; push @tokens, $token;
	}
	
	#  check if in mapping
	if($method->local_name eq "Mapping") { $flag = 1; }

	#  if in mapping, get the cui
	if( ($method->local_name eq "CandidateCUI") && ($flag == 1) ) { 
	    my $cui = $method->text; push @cuis, $cui;
	}

	#  if in mapping, get the cui
	if( ($method->local_name eq "CandidateMatched") && ($flag == 1) ) { 
	    my $match = $method->text; 
	    $match=~s/[\*\?\+\(\)\[\]\/ ]//g; 
	    push @matches, lc($match);
	}
	
	if($method->local_name eq "Phrase") { 

	    my %mappings = ();
	    foreach my $i (0..$#matches) { 
		my $term = $matches[$i]; my $mflag = 0;
		while($mflag == 0) {
		    foreach my $token (@tokens) {
			$token=lc($token);
			if($token=~/$term/) {
			    $mappings{$token}{"$cuis[$i]/$matches[$i]"}++;
			    $mflag = 1; 
			}
		    }
		    chop $term;
		    if($term=~/^\s*$/) { $mflag = 1; }
		}
	    }
	    foreach my $token (@tokens) { 
		my $tok = lc($token);
		$tokenid++;
		
		my $a = sprintf("%03d", $abstractid);
		my $s = sprintf("%03d", $sentenceid);
		my $t = sprintf("%03d", $tokenid);
		my $id = "d$a.s$s.t$t";
		
		my $senses = "";
		foreach my $m (sort keys %{$mappings{$tok}}) { 
		    $m=~/(C[0-9]+)\//;
		    $senses .= "$1,";
		} chop $senses;

		if($senses=~/^\s*$/) { print OUTFILE "$token\n"; }
		else                 { print OUTFILE "<head id=\"$id\" candidates=\"$senses\">$token<\/head>\n"; }
	    }
	    @tokens  = ();;
	    @cuis    = ();
	    @matches = ();
	}
    }
    print OUTFILE "<\/text>\n";
    
    #  remove the processing file
    system "rm $infile.processing"; 
}

print OUTFILE "<\/corpus>\n";


##############################################################################
#  SUB FUNCTIONS
##############################################################################
#  function to create a timestamp
sub time_stamp {
    my ($stamp);
    my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);

    $year += 1900;
    $mon++;
    $d = sprintf("%4d%2.2d%2.2d",$year,$mon,$mday);
    $t = sprintf("%2.2d%2.2d%2.2d",$hour,$min,$sec);
    
    $stamp = $d . $t;

    return($stamp);
}

#  function to output minimal usage notes
sub minimalUsageNotes {
    
    print STDERR "Usage: mm-xml2aw-xml.pl [OPTIONS] DESTINATION SOURCE\n";
    askHelp();
}

#  function to output help messages for this program
sub showHelp() {

    print "Usage: mm-xml2aw-xml.pl DESTINATION SOURCE\n\n";
    
    print "Takes as input a machine code MetaMap file and converts it\n";
    print "to all-words xml format for umls-allwords-senserelate.pl.\n\n";

    print "OPTIONS:\n\n";

    print "--log                    Directory to contain temporary and log\n";
    print "                         files. DEFAULT: log.<timestamp>\n\n";

    print "--version                Prints the version number\n\n";

    print "--help                   Prints this help message.\n\n";
}

#  function to output the version number
sub showVersion {
        print '$Id: mm-xml2aw-xml.pl,v 1.7 2011/05/16 14:12:26 btmcinnes Exp $';
        print "\nCopyright (c) 2007, Ted Pedersen & Bridget McInnes\n";
}

#  function to output "ask for help" message when user's goofed
sub askHelp {
    print STDERR "Type mm-xml2aw-xml.pl --help for help.\n";
}