#!/usr/local/bin/perl -w
=head1 NAME
vector-input.pl - This program builds the term index file and co-occrrence matrix for umls-similarity.pl to calculate the vector relatedness.
=head1 SYNOPSIS
vector-input.pl takes the bigrams frequency input and build the index and the
co-occurrence matrix.
=head1 DESCRIPTION
We build the index and co-occurrence matrix for the vector method of UMLS-Similarity.
The index file helps to locate each term's vector by recording the start position
and the length of its vector. The matrix file records every term's vector.
See perldoc vector-input.pl
=head1 USAGE
vector-input.pl INDEX MATRIX BIGRAMFILE
example: vector-input.pl Index.txt Matrix.txt BigramsList.txt
=head1 INPUT
=head2 Required Arguments:
=head3 INDEX
output file of the vector-input.pl. It records the index of each term
and the vector start position and length f the co-occurrence matrix.
=head3 MATRIX
output file of the vector-input.pl. Each line is a vector for the
term and its co-occurrence term and their frequency.
=head3 BIGRAMFILE
Input to vector-input.pl should be a single flat file generated by huge-count.pl
of Text-NSP package. If the bigrams list is generated by count.pl, pleasue use
count2huge.pl to convert the results to huge-count.pl. It sorts the bigrams in
the alphabet order. When vector-input.pl generates the index and co-occurrence
matrix file, it requires the bigrams which starts the same term t1 grouped together
and lists next to each other. Because at this step, bigrams are not stored in
memory. If the first term of the bigrams changes, it prints the output and index
position of the vector for the term t1. Especially, if the bigrams are sorted in
the alphabet order, it is faster for vector method of UMLS-Similarity to build the
vector. Because for each concept, it searches the co-occurrence matrix to build
the second order vector. If every term of the vector are sorted, the vector
method can search the co-occurrence matrix from the beginning to the end by the
index position and length. If the co-occurrence matrix is a huge file, it could
save lots of execute time.
=head3 Other Options:
=head4 --stat
The bigram file is from statistics.pl rather than count.pl
=head4 --cutoff SCORE
Only use those ngrams that are greater than SCORE
=head4 --help
Displays the help information.
=head4 --version
Displays the version information.
=head1 AUTHOR
Ying Liu, liux0395 at umn.edu
=head1 SEE ALSO
home page: www.tc.umn.edu/~liux0395
=head1 COPYRIGHT
Copyright (C) 2010, Ying Liu
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
=cut
###############################################################################
#-----------------------------------------------------------------------------
# Start of program
#-----------------------------------------------------------------------------
# we have to use commandline options, so use the necessary package!
use Getopt::Long;
# first check if no commandline options have been provided... in which case
# print out the usage notes!
if ( $#ARGV == -1 )
{
&minimalUsageNotes();
exit;
}
# now get the options!
GetOptions( "version", "help", "stat", "cutoff=s" );
# if help has been requested, print out help!
if ( defined $opt_help )
{
$opt_help = 1;
&showHelp();
exit;
}
# if version has been requested, show version!
if ( defined $opt_version )
{
$opt_version = 1;
&showVersion();
exit;
}
my $cutoff = 0;
if(defined $opt_cutoff) {
$cutoff = $opt_cutoff;
}
my $start_bigram = time();
my $index_file = $ARGV[0];
# check to see if a destination has been supplied at all...
if ( !($index_file ) )
{
print STDERR "No output file (INDEX) supplied.\n";
askHelp();
exit;
}
if (-e $index_file)
{
print "Output file $index_file already exists! Overwrite (Y/N)? ";
my $reply = <STDIN>;
chomp $reply;
$reply = uc $reply;
exit 0 if ($reply ne "Y");
}
open(INDX, ">$index_file")
or die("Error: cannot open file '$index_file' for output index.\n");
my $matrix_file = $ARGV[1];
# check to see if a destination has been supplied at all...
if ( !($matrix_file ) )
{
print STDERR "No output file (MATRIX) supplied.\n";
askHelp();
exit;
}
if (-e $matrix_file)
{
print "Output file $matrix_file already exists! Overwrite (Y/N)? ";
my $reply = <STDIN>;
chomp $reply;
$reply = uc $reply;
exit 0 if ($reply ne "Y");
}
open(MATX, ">$matrix_file")
or die("Error: cannot open file '$matrix_file' for output index.\n");
$bigrams_file = $ARGV[2];
# check to see if a source has been supplied at all...
if ( !($bigrams_file ) )
{
print STDERR "No output file (BIGRAMFILE) supplied.\n";
askHelp();
exit;
}
open(BIGM, "<$bigrams_file")
or die("Error: cannot open file '$bigrams_file' for output index.\n");
# read in the bigrams file
my %index;
my $index_num1 = 1;
my $total = <BIGM>;
while (my $line = <BIGM>)
{
chomp($line);
my @terms = split('<>', $line);
my @nums = split/\s+/, $terms[2];
my $score = shift @nums;
if(defined $opt_stat) {
$score = shift @nums;
}
if($score < $cutoff) { next; }
# index every term of the bigram list
if(!defined $index{$terms[0]})
{
$index{$terms[0]} = $index_num1;
$index_num1++;
}
if(!defined $index{$terms[1]})
{
$index{$terms[1]} = $index_num1;
$index_num1++;
}
}
# sort the index terms of %index and
# initilize the position length array
my $index_num2 = 1;
my @position_length;
$position_length[0] = 0; #index starts from 1
foreach my $t (sort (keys %index))
{
$index{$t} = $index_num2;
$position_length[$index_num2] = 0;
$index_num2++;
}
# go the beginning of the bigrams file
seek BIGM, 0, 0 or die $!;
my $word = "";
my $position = 0;
my $bigrams = "";
$total = <BIGM>;
while (my $line = <BIGM>)
{
chomp($line);
my @terms = split('<>', $line);
my @freqs = split (' ', $terms[2]);
my $score = $freqs[0];
if(defined $opt_stat) {
$score = $freqs[1];
}
if($score < $cutoff) { next; }
# if it is still the same term.
# print out the vector to the matrix file
if( $word eq $terms[0] )
{
#print "word: $word\n";
$bigrams .= "$index{$terms[1]} $freqs[0] ";
printf MATX "$index{$terms[1]} $freqs[0] ";
}
else
{
# the first term of the bigrams changes, record
# the vector position and length of the term
if ($word ne "")
{
$bigrams .= "\n";
my $length = length($bigrams);
$position_length[$index{$word}] = "$position" . " $length";
$position += $length;
$bigrams = "";
printf MATX "\n";
}
# for a new term, print the term and its first bigrams frequency
$word = $terms[0];
if(defined $opt_stat) {
$bigrams .= "$index{$word}: $index{$terms[1]} $freqs[1] ";
printf MATX "$index{$word}: $index{$terms[1]} $freqs[1] ";
}
else {
$bigrams .= "$index{$word}: $index{$terms[1]} $freqs[0] ";
printf MATX "$index{$word}: $index{$terms[1]} $freqs[0] ";
}
}
# reach the end of the bigrams file, record the
# vector position and length of the last term.
if (eof(BIGM))
{
$bigrams .= "\n";
my $length = length($bigrams);
$position_length[$index{$word}] = "$position" . " $length";
printf MATX "\n";
}
}
close MATX;
close BIGM;
# out put the index file
foreach my $t (sort (keys %index))
{
printf INDX "$t $index{$t} $position_length[$index{$t}]\n"
}
close INDX;
#-----------------------------------------------------------------------------
# User Defined Function Definitions
#-----------------------------------------------------------------------------
# function to output a minimal usage note when the user has not provided any
# commandline options
sub minimalUsageNotes
{
print STDERR "Usage: vector-input.pl INDEX MATRIX BIGRAMFILE\n";
askHelp();
}
# function to output "ask for help" message when the user's goofed up!
sub askHelp
{
print STDERR "Type vector-input.pl --help for help.\n";
}
# function to output help messages for this program
sub showHelp
{
print "\n";
print "Usage: vector-input.pl INDEX MATRIX BIGRAMFILE\n\n";
print "build the index file for each term of the bigrams file and\n";
print "create the co-occurence matrix.INDEX is the output index file.\n";
print "MATRIX is the output matrix file. BIGRAMFILE is the output of\n";
print "huge-count.pl of Text-NSP. \n\n";
print "OPTIONS:\n\n";
print " --stat Bigram file is from statistics.pl\n\n";
print " --cutoff SCORE Only include ngrams greater than SCORE\n\n";
print " --version Prints the version number.\n\n";
print " --help Prints this help message.\n\n";
}
# function to output the version number
sub showVersion
{
print STDERR "vector-input.pl - version 0.02\n";
print STDERR "Copyright (C) 2009, Ying Liu\n";
print STDERR "Date of Last Update 03/23/10\n";
}