The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/local/bin/perl -w

=head1 NAME

label.pl - Assign labels to clusters in a confusion matrix to maximize agreement 

=head1 SYNOPSIS

 label.pl [OPTIONS] PRELABEL

Type C<label.pl --help> for a quick summary of options

=head1 DESCRIPTION 

Labels the discovered clusters with sense tags such that the maximum number of 
contexts are correctly assigned.

=head1 INPUT

=head2 Required Arguments:

PRELABEL

Should be the output of cluto2label.pl.

Sample CLUTO2LABEL format 

 2
 //	cord  phone   text   div
 C0:	 4       3       0       0
 C1:	 2       2       2       2
 C2:	 1       3       3       2

 where the 1st line shows the number of unclustered instances = 2 

 2nd line shows a space separated list of sense classes starting with // mark.

Each line thereafter shows the sense distribution of the instances belonging
to each discovered cluster in the form of a cluster by sense distribution
matrix. A cell value at (i,j) in the matrix shows the number of instances
belonging to cluster Ci that have the sense tag Sj.

Note that each row begins with the cluster id that precedes a colon (:).
Also, the number of sense classes on the 2nd line should be the same as the
number of columns in the cluster by sense distribution table.

=head2 Optional Arguments:

=head3 --help

Displays this message.

=head3 --version

Displays the version information.

=head1 OUTPUT

Output shows the sense labels attached to each of the discovered 
clusters along with the score. Score tells the percentage of the total 
number of instances correctly clustered if the clusters are tagged with 
the sense labels as suggested.

Example :

Prelabel file =>

 0
 //      cord    divi    form    phon    prod    text
 C0:     35      26      44      18      23      43
 C1:     64      34      50      43      57      52
 C2:     0       3       1       2       0       3
 C3:     0       0       2       31      0       0
 C4:     1       28      0       4       6       0
 C5:     0       9       3       2       14      2

Label Output =>

 ClusterID -> SenseID
 C0 -> form
 C1 -> cord
 C2 -> text
 C3 -> phon
 C4 -> divi
 C5 -> prod
 Score = 30.67

shows that 

 cluster C0 represents the 'form' sense
 cluster C1 represents the 'cord' sense
 cluster C2 represents the 'text' sense
 cluster C3 represents the 'phon' sense
 cluster C4 represents the 'divi' sense
 and cluster C5 represents the 'prod' sense

Also, 30.67 % of the total instances are in their right sense classes
if the clusters are tagged with this labeling scheme.

=cut

#			===============================
#			COMMAND LINE OPTIONS AND USAGE 
#			=============================== 	

use Algorithm::Munkres;

use Getopt::Long;
# Parse the recognised switches (--help, --version) into a hash. An option
# that was given on the command line shows up as a defined hash entry.
my %cli_opts = ();
GetOptions (\%cli_opts, "help", "version");

# The helper subs are defined further down with an empty prototype, so they
# are called with the &name() form; a plain name() call placed before the
# definition would make perl -w complain about the unchecked prototype.

# --help : print the full help text and stop
if(defined $cli_opts{"help"})
{
        &showhelp();
        exit;
}

# --version : print the version banner and stop
if(defined $cli_opts{"version"})
{
        &showversion();
        exit;
}

# nothing left on the command line, i.e. no PRELABEL file : print the
# short usage reminder and stop
unless(@ARGV)
{
        &minimal();
        exit;
}

# $0 holds the complete path to the program. Keep just the program name,
# which is what the error messages below print.
$0=~s/.*\/(.+)/$1/;

# Open the PRELABEL file named by the first command line argument on the
# file handle IN, which the rest of the script reads from.
#  - no argument        : report on STDERR and exit
#  - file doesn't exist : report on STDERR and exit
#  - open fails         : die with the system error
if(!defined $ARGV[0])
{
	print STDERR "ERROR($0): Please specify the PRELABEL file name...\n";
	exit;
}

$infile=$ARGV[0];

if(!-e $infile)
{
	print STDERR "ERROR($0): PRELABEL file <$infile> doesn't exist...\n";
	exit;
}

# Three-argument open: the file name is used literally. With the
# two-argument form, characters such as '>', '|' or surrounding blanks in
# the name would be interpreted as an open mode / stripped.
open(IN,'<',$infile) || die "Error($0): Error(code=$!) in opening file <$infile>.\n";

# The PRELABEL header is two lines long:
#   line 1 - the number of thrown (unclustered) instances, a bare integer
#   line 2 - the sense classes, whitespace separated, after a // marker
# The count on line 1 is only validated in the code below.

# message shared by both ways in which the 1st line can be wrong
my $bad_count_msg = "ERROR($0):\n        1st line in the PRELABEL file <$infile> should show \n\tthe number of instances unclustered or 0.\n";

$thrown=<IN>;

# the file has no first line at all
unless(defined $thrown)
{
	print STDERR $bad_count_msg;
	exit;
}

chomp $thrown;

# the first line is something other than a single non-negative integer
if($thrown!~/^\s*\d+\s*$/)
{
	print STDERR $bad_count_msg;
	exit;
}

# 2nd line : must contain the // marker; whatever follows the first //
# is the list of sense classes
$sense_string=<IN>;
unless(defined $sense_string && $sense_string=~m{//(.*)}s)
{
	print STDERR "ERROR($0):\n\t2nd line in the input file <$infile> should list \n\tthe sense labels starting with //.\n";
	exit;
}

# stores all sense classes listed on this line; splitting on ' ' drops
# leading/trailing blanks and treats any run of whitespace as one separator
@all_senses=split(' ',$1);

# keep the normalised, single-space separated form of the sense list
$sense_string=join(' ',@all_senses);

# Read the cluster by sense matrix from IN, one cluster (row) per line.
#
# Every non-blank line must look like
#     <cluster id>: <count> <count> ...
# with exactly one count per sense class listed in @all_senses.
#
# Fills:
#   @cluster_ids - cluster ids in the order in which they appear
#   @inp_mat     - the matrix, with every count stored NEGATED (the score
#                  computation later multiplies by -100 to undo this;
#                  presumably so that the solver's minimum is the maximum
#                  agreement - confirm against Algorithm::Munkres)
#   $total       - total number of instances (sum of all counts)
#
# Any malformed line is reported on STDERR, quoting its real line number in
# the PRELABEL file, and the program exits.

# total is the total #instances
$total=0;
# index of the matrix row being filled; blank lines do not advance it
$line_num=0;
while(<IN>)
{
	# normalise whitespace and skip blank lines
	chomp;
	s/\s+$//;
	s/^\s+//;
	s/\s+/ /g;
	next if($_ eq '');

	# $. is the number of the line just read from IN, counted from the
	# top of the file (header lines and blank lines included)
	my $file_line=$.;

	# we use cluster ids only while printing
	# the final output
	# during processing, we use the serial
	# numbers 0,1,... for clusters in the order
	# as they appear in the prelabel file
	my ($cid,$row)=split(/\s*:\s*/,$_,2);

	# no colon on the line => there is no row part at all
	unless(defined $row)
	{
		print STDERR "ERROR($0): Line <$file_line> in PRELABEL file <$infile> should contain a cluster id followed by a colon (:).\n";
		exit;
	}
	push @cluster_ids,$cid;

	#extract the matrix cells
	my @row_elements=split(/\s+/,$row);

	if(scalar(@row_elements)!=scalar(@all_senses))
	{
		print STDERR "ERROR($0):\n\t        Number of columns (" . scalar(@row_elements) . ") at line <$file_line> \n\t        in PRELABEL file <$infile> doesn't match the number of senses (" . scalar(@all_senses) . ") specified on Line 2 of the same file.\n";
		exit;
	}

	foreach my $col (0..$#row_elements)
	{
		my $count=$row_elements[$col];

		# only plain non-negative integers are accepted
		if($count!~/^[0-9]+$/)
		{
			# the line number is interpolated; building it with
			# "..." . $n+1 . "..." would add 1 to the whole
			# left-hand string because . and + share precedence
			print STDERR "ERROR($0): Line <$file_line> in PRELABEL file <$infile> contains a non-integer matrix value.\n";
			exit;
		}

		$inp_mat[$line_num][$col] = -1 * $count;
		$total += $count;
	}

	$line_num++;
}

# Ask assign() for the cluster -> sense assignment of the (negated) matrix.
# Judging by how it is used below, $soln_mat[$r] is the column index picked
# for row $r.
my @soln_mat = ();
assign(\@inp_mat, \@soln_mat);

# sum of the matrix cells selected by the assignment; the cells are stored
# negated, so this sum is <= 0
$clus_total = 0;

print "ClusterID -> SenseID\n";
foreach $i (0 .. $#inp_mat)
{
    my $sense_idx = $soln_mat[$i];

    # count the cell only when the chosen column really exists in this row
    my $cell = $inp_mat[$i][$sense_idx];
    $clus_total += $cell if(defined $cell);

    # a cluster whose chosen column has no sense name gets no output line
    my $sense = $all_senses[$sense_idx];
    print $cluster_ids[$i] . " -> " . $sense . "\n" if(defined $sense);
}

# Score = percentage of all instances that sit in the cell chosen for their
# cluster. The factor -100 turns the negated sum back into a positive
# percentage. With no instances at all there is nothing to divide by.
if($total == 0)
{
    print "Score = NA\n";
}
else
{
    printf("Score = %.2f\n", $clus_total/$total * -100);
}


# minimal() - print the two-line usage reminder on STDOUT.
# Takes no arguments.
sub minimal()
{
        print "Usage: label.pl [OPTIONS] PRELABEL\n",
              "TYPE label.pl --help for help\n";
}

# showhelp() - print the full help text on STDOUT: usage line, a short
# description, the PRELABEL argument and the supported options.
# Takes no arguments.
sub showhelp()
{
	# the help text, piece by piece, in output order
	my @help_text = (
		"Usage: label.pl [OPTIONS] PRELABEL\n",
		"Labels the discovered clusters with sense tags such\n",
		"that maximum number of contexts are correctly assigned.\n",

		"\nPRELABEL\n",
		"Should be an output created by cluto2label.pl\n",
		"showing a cluster by sense distribution matrix.\n\n",

		"OPTIONS:\n",

		"--help\n\tDisplays this message.\n",

		"--version\n\tDisplays the version information.\n",
	);

	print @help_text;
}

# showversion() - print the revision id string followed by a one-line
# description of the program on STDOUT.
# Takes no arguments.
sub showversion()
{
	# Banner lines that are intentionally disabled:
	#   "\nCopyright (C) 2002-2006, Ted Pedersen, Amruta Purandare, & Anagha Kulkarni\n"
	#   "label.pl      -       Version 0.11\n"
	#   "Date of Last Update:	11/30/2004\n"
	print '$Id: label.pl,v 1.15 2008/03/30 05:06:07 tpederse Exp $',
	      "\nLabel discovered clusters with sense tags to maximize agreement\n";
}

=head1 AUTHORS

 Ted Pedersen, University of Minnesota, Duluth
 tpederse at d.umn.edu

 Amruta Purandare, University of Pittsburgh

 Anagha Kulkarni, Carnegie-Mellon University

=head1 COPYRIGHT

Copyright (c) 2002-2008, Ted Pedersen, Amruta Purandare, Anagha Kulkarni

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to

 The Free Software Foundation, Inc.,
 59 Temple Place - Suite 330,
 Boston, MA  02111-1307, USA.