#!/usr/local/bin/perl -w
=head1 NAME
statistic.pl - Measure the association of Ngrams in text
=head1 SYNOPSIS
Program to take a ngram-frequency file (as output by count.pl) and to
then calculate the given statistic. The statistic to be calculated has
to be supplied as a library which will be dynamically loaded. This library
file has to follow a standard as defined in the readme! Thereafter, to
output the result into a destination file.
=head1 DESCRIPTION
See perldoc README.pod
=head1 AUTHOR
Ted Pedersen, University of Minnesota Duluth
E<lt>tpederse@d.umn.eduE<gt>
Satanjeev Banerjee, Carnegie Mellon University
E<lt>satanjeev@cmu.eduE<gt>
Amruta Purandare, University of Pittsburgh
E<lt>amruta@cs.pitt.eduE<gt>
Bridget Thomson-McInnes, University of Minnesota Twin Cities
E<lt>bthompson@d.umn.eduE<gt>
Saiyam Kohli, University of Minnesota Duluth
E<lt>kohli003@d.umn.eduE<gt>
=head1 HISTORY
Last updated: $Id: statistic.pl,v 1.27 2011/12/23 22:25:04 btmcinnes Exp $
=head1 BUGS
=head1 SEE ALSO
http://groups.yahoo.com/group/ngram/
http://www.d.umn.edu/~tpederse/nsp.html
=head1 COPYRIGHT
Copyright (C) 2000-2010, Ted Pedersen, Satanjeev Banerjee, Amruta
Purandare, Bridget Thomson-McInnes and Saiyam Kohli
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to
The Free Software Foundation, Inc.,
59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.
Note: a copy of the GNU General Public License is available on the web
at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
distribution as GPL.txt.
=cut
#
###############################################################################
#
# ------- CHANGELOG ---------
#
#version date programmer List of changes change-id
#
# 0.72 08/02/2005 Ted Made use of Config and
# File::Spec modules to
# detect system dependent
# PATH variable separator
# character - : or ; and
# system dependent file
# separator character - / or \.
# Similar changes made to
# all the .pm files in
# Measures sub-directory
#
# 0.69 06/14/2004 Amruta Changed the internal ADP.71
# N-gram separator #
# to <||>
#
# 0.67 02/19/2004 Amruta Used stat scores ADP.67.1
# as keys of the hash
# instead of the N-grams
# This reduces the memory
# consumption when large
# Ngrams have same scores
#
# 0.57 07/01/2003 Ted (1) if destination file TDP.57.3
# found, check for
# source before proceeding
#
###############################################################################
#-----------------------------------------------------------------------------
# Start of Program
#-----------------------------------------------------------------------------
# we have to use commandline options, so use the necessary package!
use Getopt::Long;
use Config;
use File::Spec;
# first check if no commandline options have been provided... in which case
# print out the usage notes!
if ( $#ARGV == -1 )
{
&minimalUsageNotes();
exit;
}
# now get the options!
GetOptions("version", "help", "format", "frequency=i", "rank=i", "precision=i",
"score=f", "extended", "ngram=i", "get_freq_combo=s", "set_freq_combo=s", "pmi_exp=f" );
# if help has been requested, print out help!
if ( defined $opt_help )
{
$opt_help = 1;
showHelp();
exit;
}
# if version has been requested, show version!
if ( defined $opt_version )
{
$opt_version = 1;
showVersion();
exit;
}
# set the variables according to what has been provided!
if ( defined $opt_ngram ) { $ngram = $opt_ngram; }
else { $ngram = 2; }
if ( defined $opt_rank ) { $show = $opt_rank; }
else { $show = -1; } # -1 stands for show all!
if ( defined $opt_precision ) { $precision = $opt_precision; }
else { $precision = 4; } # default precision
if ( defined $opt_score ) { $scoreCutOff = $opt_score; }
if ($precision !~ /^\d+$/)
{
print STDERR "Value for switch --precision should be integer >= 0. Using 4.\n";
$precision = 4;
}
# create the floating point conversion format as required by sprintf!
$floatFormat = join '', '%', '.', $precision, 'f';
# now get hold of the frequency combinations that we need to know
# about so as to understand the input data.
if (defined $opt_set_freq_combo)
{
readFreqCombo($opt_set_freq_combo);
}
# or, by default, everything possible
else
{
getDefaultFreqCombos();
}
if($ngram==2)
{
my $string;
for (my $i = 0; $i < $combIndex; $i++)
{
$string = join (" ", @{$freqComb[$i]}[1..$freqComb[$i][0]]);
if ($string eq "0 1") { $n11FreqIndex = $i; }
elsif ($string eq "0") { $np1FreqIndex = $i; }
elsif ($string eq "1") { $n1pFreqIndex = $i; }
}
# if these frequency values are not being reported then flag an error.
if (!(defined $n11FreqIndex))
{
$errorMessage = "Frequency combination \"0 1\" (frequency of bigram) missing!\n";
die($errorMessage);
}
if (!(defined $np1FreqIndex))
{
$errorMessage = "Frequency combination \"0\" (frequency of bigram) missing!\n";
die($errorMessage);
}
if (!(defined $n1pFreqIndex))
{
$errorMessage = "Frequency combination \"1\" (frequency of bigram) missing!\n";
die($errorMessage);
}
}
if ($ngram eq 3)
{
for ($i = 0; $i < $combIndex; $i++)
{
$str="";
foreach(@{$freqComb[$i]}[1..$freqComb[$i][0]]) { $str.=$_."#"; }
if($str eq "0#1#2#") { $n111Index=$i; }
if($str eq "0#") { $n1ppIndex=$i; }
if($str eq "1#") { $np1pIndex=$i; }
if($str eq "2#") { $npp1Index=$i; }
if($str eq "0#1#") { $n11pIndex=$i; }
if($str eq "1#2#") { $np11Index=$i; }
if($str eq "0#2#") { $n1p1Index=$i; }
}
if (!(defined $n111Index)) { $errorCodeNumber = 100; $errorMessage = "Frequency combination \"0 1 2\" missing!\n";
die($errorMessage);
}
if (!(defined $n1ppIndex)) { $errorCodeNumber = 101; $errorMessage = "Frequency combination \"0\" missing!\n";
die($errorMessage);
}
if (!(defined $np1pIndex)) { $errorCodeNumber = 102; $errorMessage = "Frequency combination \"1\" missing!\n";
die($errorMessage);
}
if (!(defined $npp1Index)) { $errorCodeNumber = 103; $errorMessage = "Frequency combination \"2\" missing!\n";
die($errorMessage);
}
if (!(defined $n11pIndex)) { $errorCodeNumber = 104; $errorMessage = "Frequency combination \"0 1\" missing!\n";
die($errorMessage);
}
if (!(defined $np11Index)) { $errorCodeNumber = 105; $errorMessage = "Frequency combination \"1 2\" missing!\n";
die($errorMessage);
}
if (!(defined $n1p1Index)) { $errorCodeNumber = 106; $errorMessage = "Frequency combination \"0 2\" missing!\n";
die($errorMessage);
}
}
if ($ngram eq 4)
{
for ($i = 0; $i < $combIndex; $i++)
{
$str="";
foreach(@{$freqComb[$i]}[1..$freqComb[$i][0]]) { $str.=$_."#"; }
if($str eq "0#1#2#3#"){ $n1111Index=$i; }
if($str eq "0#") { $n1pppIndex=$i; }
if($str eq "1#") { $np1ppIndex=$i; }
if($str eq "2#") { $npp1pIndex=$i; }
if($str eq "3#") { $nppp1Index=$i; }
if($str eq "0#1#") { $n11ppIndex=$i; }
if($str eq "0#2#") { $n1p1pIndex=$i; }
if($str eq "0#3#") { $n1pp1Index=$i; }
if($str eq "1#2#") { $np11pIndex=$i; }
if($str eq "1#3#") { $np1p1Index=$i; }
if($str eq "2#3#") { $npp11Index=$i; }
if($str eq "0#1#2#") { $n111pIndex=$i; }
if($str eq "0#1#3#") { $n11p1Index=$i; }
if($str eq "0#2#3#") { $n1p11Index=$i; }
if($str eq "1#2#3#") { $np111Index=$i; }
}
if (!(defined $n1111Index)) { $errorCodeNumber = 100; $errorMessage = "Frequency combination \"0 1 2 3\" missing!\n";
die($errorMessage);
}
if (!(defined $n1pppIndex)) { $errorCodeNumber = 101; $errorMessage = "Frequency combination \"0\" missing!\n";
die($errorMessage);
}
if (!(defined $np1ppIndex)) { $errorCodeNumber = 102; $errorMessage = "Frequency combination \"1\" missing!\n";
die($errorMessage);
}
if (!(defined $npp1pIndex)) { $errorCodeNumber = 103; $errorMessage = "Frequency combination \"2\" missing!\n";
die($errorMessage);
}
if (!(defined $nppp1Index)) { $errorCodeNumber = 104; $errorMessage = "Frequency combination \"3\" missing!\n";
die($errorMessage);
}
if (!(defined $n11ppIndex)) { $errorCodeNumber = 105; $errorMessage = "Frequency combination \"0 1\" missing!\n";
die($errorMessage);
}
if (!(defined $n1p1pIndex)) { $errorCodeNumber = 106; $errorMessage = "Frequency combination \"0 2\" missing!\n";
die($errorMessage);
}
if (!(defined $n1pp1Index)) { $errorCodeNumber = 107; $errorMessage = "Frequency combination \"0 3\" missing!\n";
die($errorMessage);
}
if (!(defined $np11pIndex)) { $errorCodeNumber = 108; $errorMessage = "Frequency combination \"1 2\" missing!\n";
die($errorMessage);
}
if (!(defined $np1p1Index)) { $errorCodeNumber = 109; $errorMessage = "Frequency combination \"1 3\" missing!\n";
die($errorMessage);
}
if (!(defined $npp11Index)) { $errorCodeNumber = 110; $errorMessage = "Frequency combination \"2 3\" missing!\n";
die($errorMessage);
}
if (!(defined $n111pIndex)) { $errorCodeNumber = 111; $errorMessage = "Frequency combination \"0 1 2\" missing!\n";
die($errorMessage);
}
if (!(defined $n11p1Index)) { $errorCodeNumber = 112; $errorMessage = "Frequency combination \"0 1 3\" missing!\n";
die($errorMessage);
}
if (!(defined $n1p11Index)) { $errorCodeNumber = 113; $errorMessage = "Frequency combination \"0 2 3\" missing!\n";
die($errorMessage);
}
if (!(defined $np111Index)) { $errorCodeNumber = 114; $errorMessage = "Frequency combination \"1 2 3\" missing!\n";
die($errorMessage);
}
}
if (defined $opt_get_freq_combo)
{
open (FREQ_COMBO_OUT, ">$opt_get_freq_combo") || die ("Couldnt open $opt_get_freq_combo");
for ($i = 0; $i < $combIndex; $i++)
{
for ($j = 1; $j <= $freqComb[$i][0]; $j++)
{
print FREQ_COMBO_OUT "$freqComb[$i][$j] ";
}
print FREQ_COMBO_OUT "\n";
}
close (FREQ_COMBO_OUT);
}
# at the end of those two functions we should have with us the @freqComb
# array!
# check if frequency cut off has been requested. if so find the index
# of the input numbers that contains the frequency. if not found warn
# that frequency cutoff ineffective
if (defined $opt_frequency)
{
# find index
my $requiredString = "";
my $i;
for ($i = 0; $i < $ngram; $i++)
{
$requiredString .= $i;
if ($i < $ngram-1) { $requiredString .= " "; }
}
$ngramFreqIndex = -1;
for ($i = 0; $i < $combIndex; $i++)
{
my $thisString = join (" ", @{$freqComb[$i]}[1..$freqComb[$i][0]]);
if ($requiredString eq $thisString) { $ngramFreqIndex = $i; last; }
}
if ($ngramFreqIndex == -1)
{
print STDERR "Warning: Frequency of n-gram not found, ignoring frequency cut-off!\n";
undef $opt_frequency;
}
}
# having stripped the commandline of all the options et al, we should now be
# left only with the source/destination files
# so, first get hold of the statistic library, and include it!
$statistic = shift;
# check to see if a library has been supplied at all!
if ( !( $statistic ) )
{
print STDERR "No statistic library supplied. ";
askHelp();
exit;
}
# now remove the ".pm" in the end of the statistic filename, if present
@pathComponents = split (/\./, $statistic);
if ( $pathComponents[$#pathComponents] eq "pm" )
{
$#pathComponents --;
$statistic = join ( ".", @pathComponents );
}
use File::Spec;
if($statistic =~ /::/)
{
my @statComponents = split (/::/, $statistic);
$statComponents[$#statComponents] = $statComponents[$#statComponents].".pm";
$includename = File::Spec->catfile(@statComponents);
$usename = $statistic;
}
# else
# {
# foreach $dir (@INC)
# {
#
# }
# }
elsif($statistic eq "ll")
{
if($ngram eq 2 || $ngram eq 3 || $ngram eq 4)
{
$usename = 'Text::NSP::Measures::'.$ngram.'D::MI::'.$statistic;
$includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D','MI',$statistic.'.pm');
}
else
{
print STDERR "Error: This measure is only defined for bigrams, trigrams and 4-grams";
exit;
}
}
elsif($statistic eq "pmi" || $statistic eq "tmi" || $statistic eq "ps")
{
if($ngram eq 2 || $ngram eq 3)
{
$usename = 'Text::NSP::Measures::'.$ngram.'D::MI::'.$statistic;
$includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D','MI',$statistic.'.pm');
}
else
{
print STDERR "Error: This measure is only defined for bigrams & trigrams";
exit;
}
}
elsif($statistic eq "x2"||$statistic eq "phi")
{
if($ngram eq 2)
{
$usename = 'Text::NSP::Measures::'.$ngram.'D::CHI::'.$statistic;
$includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D','CHI',$statistic.'.pm');
}
else
{
print STDERR "Error: This measure is only defined for bigrams";
exit;
}
}
elsif($statistic eq "tscore")
{
if($ngram eq 2 || $ngram eq 3 || $ngram eq 4)
{
$usename = 'Text::NSP::Measures::'.$ngram.'D::CHI::'.$statistic;
$includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D','CHI',$statistic.'.pm');
}
else
{
print STDERR "Error: This measure is only defined for bigrams & trigrams & 4-grams";
exit;
}
}
elsif($statistic eq "leftFisher"||$statistic eq "rightFisher"||$statistic eq "twotailed")
{
if($ngram eq 2)
{
if($statistic eq "leftFisher")
{
$statistic = "left";
}
elsif($statistic eq "rightFisher")
{
$statistic = "right";
}
$usename = 'Text::NSP::Measures::'.$ngram.'D::Fisher::'.$statistic;
$includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D','Fisher',$statistic.'.pm');
}
else
{
print STDERR "Error: This measure is only defined for bigrams";
exit;
}
}
elsif($statistic eq "ll3"||$statistic eq "tmi3")
{
$statistic =~ s/3//;
if($ngram eq 3)
{
$usename = 'Text::NSP::Measures::'.$ngram.'D::MI::'.$statistic;
$includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D','MI',$statistic.'.pm');
}
else
{
print STDERR "Error: This measure is only defined for trigrams";
exit;
}
}
elsif($statistic eq "tscore3")
{
$statistic =~ s/3//;
if($ngram eq 3)
{
$usename = 'Text::NSP::Measures::'.$ngram.'D::CHI::'.$statistic;
$includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D','CHI',$statistic.'.pm');
}
else
{
print STDERR "Error: This measure is only defined for trigrams";
exit;
}
}
elsif($statistic eq "ll4")
{
$statistic =~ s/4//;
if($ngram eq 4)
{
$usename = 'Text::NSP::Measures::'.$ngram.'D::MI::'.$statistic;
$includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D','MI',$statistic.'.pm');
}
else
{
print STDERR "Error: This measure is only defined for 4-grams";
exit;
}
}
elsif($statistic eq "tscore4")
{
$statistic =~ s/4//;
if($ngram eq 4)
{
$usename = 'Text::NSP::Measures::'.$ngram.'D::CHI::'.$statistic;
$includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D','CHI',$statistic.'.pm');
}
else
{
print STDERR "Error: This measure is only defined for trigrams";
exit;
}
}
elsif($statistic eq "dice" || $statistic eq "jaccard")
{
if($ngram eq 2)
{
$usename = 'Text::NSP::Measures::'.$ngram.'D::Dice::'.$statistic;
$includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D','Dice',$statistic.'.pm');
}
else
{
print STDERR "Error: This measure is only defined for bigrams";
exit;
}
}
elsif($statistic eq "odds")
{
if($ngram eq 2)
{
$usename = 'Text::NSP::Measures::'.$ngram.'D::'.$statistic;
$includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D',$statistic.'.pm');
}
else
{
print STDERR "Error: This measure is only defined for bigrams";
exit;
}
}
else
{
if($ngram eq 2)
{
$usename = 'Text::NSP::Measures::'.$ngram.'D::'.$statistic;
$includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D',$statistic.'.pm');
}
elsif($ngram eq 3)
{
$usename = 'Text::NSP::Measures::'.$ngram.'D::'.$statistic;
$includename = File::Spec->catfile('Text','NSP','Measures',$ngram.'D',$statistic.'.pm');
}
else
{
print STDERR "Measure not defined for $ngram-grams\n";
exit;
}
}
require $includename;
import $usename;
# we won't go through the extracting of the short form of the
# measure at this point in the code - instead, we will simply
# look for 'pmi' somewhere in the measure name - this won't
# cause a problem until we have a measure named opmin , etc.
#
# there seems to be some kind of scope issue here - we have
# apparently lost the extracted form of the measure name found
# above in order to get the initailizeStatistic method below
#
# tdp november 2009
## if($statistic eq 'pmi') tdp november 2009
if($statistic =~ /pmi/)
{
if(defined $opt_pmi_exp)
{
initializeStatistic($opt_pmi_exp);
}
}
else
{
initializeStatistic();
if(defined $opt_pmi_exp)
{
print STDERR "The --pmi_exp parameter is not valid for the selected measure.\n";
}
}
# now get hold of the destination filename
$destination = shift;
# check to see if a destination has been supplied at all...
if ( !( $destination ) )
{
print STDERR "No output file (DESTINATION) supplied. ";
askHelp();
exit;
}
## start TDP.57.3 (moved existing code to here)
# now get the name of the source file
$source = shift;
# check to see if a src has been supplied at all...
if ( !( $source ) )
{
print STDERR "No input file (SOURCE) specified. ";
askHelp();
exit;
}
# now see if src exists...
if ( ! ( -e $source ) )
{
print STDERR "Cant find input file (SOURCE) $source.\n";
exit;
}
## finish TDP.57.3 (moved existing code to here)
# check to see if destination exists, and if so, if we should overwrite...
if ( -e $destination )
{
print "Output file $destination already exists! Overwrite (Y/N)? ";
$reply = <STDIN>;
chomp $reply;
$reply = uc $reply;
exit 0 if ($reply ne "Y");
}
# having ascertained that we may open the destination file for output, lets
# do so...
open ( DST, ">$destination" ) || die "Cant open output file $destination";
## old location of TDP.57.3
# now open the source file.
open( SRC, "$source" ) || die "Cant open input file $source, quitting";
# now read in all the @ data and write them out to the destination file
# if -extended has been selected
$flag = 1;
my $lineNo = 0; # will tell us where in the source file we have a problem!
while ($flag)
{
$temp = <SRC>;
$lineNo++;
if ( $temp =~ /^@/ && ! ($temp =~ /^@@/ ) )
{
if ( defined $opt_extended ) { print DST $temp; }
}
else
{
$flag = 0;
}
}
# $temp should now have the total number of ngrams!
$totalNgrams = $temp;
if (defined($totalNgrams))
{
chomp $totalNgrams;
}
# check to see if we really have an ngram-total, or some garbage!
if ( !(defined($totalNgrams)) || (!isInteger($totalNgrams)))
{
print STDERR ("$source does not look like a ngram frequency file at line number $lineNo\n");
exit;
}
my $totalNgramCount=0;
# In 0.65 and earlier versions, there are 2 hashes that use N-grams
# as the hash-keys and store N-gram scores and marginal totals. This
# increases the memory usage as the number of bigrams increase.
# Instead, we create a hash whose keys are the N-gram scores and
# values are the N-gram strings. Our assumption is that, this will
# cut down the memory usage by a lot as large number of N-grams
# usually have same scores.
while(<SRC>)
{
$lineNo++;
# our target record is in $_. now...
chomp $_;
# get the various fields of the record!
if ( defined @tokens ) { undef @tokens; }
if ( defined @numbers ) { undef @numbers; }
# ADP.67.1
# in old versions, ngramString variable was storing only the
# the N-gram tokens and not the scores. Here, we store entire
# line as the value of hash
my $ngramString=$_;
# split on the <>. thus @tokens will have all the separate tokens
# that make up this ngram and its last element will be the string
# of space separated numbers
@tokens = split(/<>/, $_);
# check if we have enough tokens! if not, complain and quit
# interestingly, @tokens will actually have one more element than
# $ngram. so $#tokens should be exactly the same as $ngram!
if ($#tokens != $ngram)
{
print STDERR "Wrong number of tokens in ngram on line $lineNo. Expecting $ngram.\n";
exit;
}
# put the frequency values for this ngram into @numbers
@numbers = split(/ /, $tokens[$#tokens]);
# remove the last element from tokens so that we really have only
# tokens in @tokens
pop @tokens;
# remove bit stuffed '@' symbol from first token if present.
# $tokens[0] =~ s/^@@/@/;
# the number of frequency values should be equal to $combIndex. if
# not, quit! note this is the only check we can do to ascertain if
# there is some problem with the frequency values. as long as we
# have the right number of frequency values, we are happy!
if ($#numbers != $combIndex - 1)
{
print STDERR "Wrong number of frequency values on line $lineNo. Expecting $combIndex.\n";
exit;
}
# if we are doing frequency cutoffs and the frequency of this
# ngram is below the cut off level, then skip this iteration of
# the loop
if (defined $opt_frequency && $numbers[$ngramFreqIndex] < $opt_frequency) { next; }
if ($ngram eq 2)
{
%values = (n11=>$numbers[$n11FreqIndex],
n1p=>$numbers[$n1pFreqIndex],
np1=>$numbers[$np1FreqIndex],
npp=>$totalNgrams);
$totalNgramCount += $numbers[$n11FreqIndex];
}
elsif($ngram eq 3)
{
%values = ( n111=>$numbers[$n111Index],
n1pp=>$numbers[$n1ppIndex],
np1p=>$numbers[$np1pIndex],
npp1=>$numbers[$npp1Index],
n11p=>$numbers[$n11pIndex],
n1p1=>$numbers[$n1p1Index],
np11=>$numbers[$np11Index],
nppp=>$totalNgrams);
$totalNgramCount = $numbers[$n111Index];
}
elsif($ngram eq 4)
{
%values = (
n1111=>$numbers[$n1111Index],
n1ppp=>$numbers[$n1pppIndex],
np1pp=>$numbers[$np1ppIndex],
npp1p=>$numbers[$npp1pIndex],
nppp1=>$numbers[$nppp1Index],
n11pp=>$numbers[$n11ppIndex],
n1p1p=>$numbers[$n1p1pIndex],
n1pp1=>$numbers[$n1pp1Index],
np11p=>$numbers[$np11pIndex],
np1p1=>$numbers[$np1p1Index],
npp11=>$numbers[$npp11Index],
n111p=>$numbers[$n111pIndex],
n11p1=>$numbers[$n11p1Index],
n1p11=>$numbers[$n1p11Index],
np111=>$numbers[$np111Index],
npppp=>$totalNgrams);
$totalNgramCount = $numbers[$n1111Index];
}
# ------------------------------------------------------------------
# ADP.67.1 start
# we don't need to store the Ngram tokens and scores separately in
# two different hashes
# ------------------------------------------------------------------
# having got this far, we are ready to compute! first recreate the ngram string.
# my $ngramString = join("<>", @tokens);
# next create the string with the frequency values in it. we shall
# output this later on, so put it in a hash whose keys are the
# ngram strings
# $NUMBERSTRINGS{$ngramString} = join(" ", @numbers);
# ---------------
# ADP.67.1 end
# ---------------
# calculate the statistic and create the statistic hash.
my $statisticValue = calculateStatistic(%values); # function implemented by stat library
my $errorMessage='';
# check for errors/warnings
if( ($errorCode = getErrorCode()))
{
if ($errorCode =~ /^1/) # error!
{
printf(STDERR "Error from statistic library!\n Error code: %d\n", $errorCode);
$errorMessage = getErrorMessage();
print STDERR " Error message: $errorMessage\n" if( $errorMessage ne "");
exit; # exit on error
}
if ($errorCode =~ /^2/) # warning!
{
printf(STDERR "Warning from statistic library!\n Warning code: %d\n", $errorCode);
$errorMessage = getErrorMessage();
print STDERR " Warning message: $errorMessage\n" if( $errorMessage ne "");
print STDERR "Skipping ngram $ngramString\n";
next; # if warning, dont save the statistic value just computed
}
}
# ah, so no error or warning
# round the statistic value returned according to the precision
# requested by using the float format created earlier.
# ADP.67.1 start
# $STATISTIC{$ngramString} = sprintf $floatFormat, $statisticValue;
$statScore = sprintf $floatFormat, $statisticValue;
# ADP.71.1 start
# changing separator # to <||>
# $STATISTIC{$statScore}.=$ngramString."#";
# ADP.67.1 end
# as noticed by some users, use of # as a separator between the
# N-gram strings causes problems when tokens include #
# in version 0.71, we changed the separator # to <||> which is a
# more rare sequence to appear in the tokens
# also, we issue now an error message when this sequence <||> does
# appear within the ngramString
if($ngramString=~/<\|\|>/)
{
print STDERR "Detected sequence <||> within Ngram - $ngramString.
statistic.pl will not behave as expected.\n";
exit 1;
}
$STATISTIC{$statScore}.=$ngramString."<||>";
# ADP.71.1 end
}
#------------------
# SK.0.91 start
#------------------
# to check that the sum of all Ngram counts is less than or equal
# to the total Ngram count.
if($totalNgramCount > $totalNgrams)
{
print STDERR ("$source does not look like a ngram frequency file. The total ngrams should be greater than the sum of counts of all the ngrams.");
exit;
}
# that completes the calculations. now to write out the data onto the
# destination file, ranking the ngrams according to the statistic just
# calculated. we will do formatted as well as unformatted printing.
# but first print out some @ data if -extended is chosen
$statisticName = getStatisticName();
if(!defined $statisticName)
{
$statisticName = $statistic;
}
if ( defined $opt_extended )
{
# first the name of the statistic...
print DST "\@statistic.StatisticName=$statisticName\n";
# next if output is formatted...
if ( defined $opt_format )
{
print DST "\@statistic.Formatted=1\n";
}
else { print DST "\@statistic.Formatted=0\n"; }
# the frequency cut off, if defined...
if (defined $opt_frequency) { print DST "\@statistic.Frequency=$opt_frequency\n"; }
# the rank...
if ( $show > 0 ) { print DST "\@statistic.Rank=$show\n"; }
# and finally the score cut off
if ( defined $scoreCutOff ) { print DST "\@statistic.Score=$scoreCutOff\n"; }
}
if ( defined $opt_format ) { &formattedPrinting(); }
else { &unformattedPrinting(); }
# close all open files...
close SRC;
close DST;
# ...and thats it!
#-----------------------------------------------------------------------------
# User Defined Functions
#-----------------------------------------------------------------------------
# function to do unformatted printing to the destination file!
sub unformattedPrinting
{
chomp $totalNgrams;
print DST "$totalNgrams\n";
# we will do the ranking ourselves, whereby all tied ngrams will
# receive the same rank. moreover ranks wont have holes in them,
# which means that no matter how many ngrams have rank x, the next
# lower valued ngram will have a rank of x+1!
my $rank = 1;
# following commented statements belong to code before version 0.67
# this part was re-written and simplified by ADP during 0.67
# my $lastValue = 0;
# find the smallest statistic value and assign to $lastValue
#foreach ( sort { $STATISTIC{$b} <=> $STATISTIC{$a} } keys %STATISTIC )
#{
#$lastValue = $STATISTIC{$_};
#last;
#}
#foreach ( sort { $STATISTIC{$b} <=> $STATISTIC{$a} } keys %STATISTIC )
#{
#my @tokens = split ( /<>/, $_ );
#if ( $lastValue != $STATISTIC{$_} )
#{
# $lastValue = $STATISTIC{$_};
# $rank++;
#}
# ---------------
# ADP.67.1 start
# ---------------
foreach $score (sort {$b <=> $a } keys %STATISTIC)
{
# currentScore is the score associated with the
# current rank.
# only when the score drops, the rank is incremented
if(defined $currentScore)
{
if($score < $currentScore) { $rank++; }
elsif($score > $currentScore)
{
print STDERR "Weird Sorting error.\n";
exit;
}
}
$currentScore=$score;
# if less than score cut-off, then quit!
# if ( defined $scoreCutOff && $STATISTIC{$_} < $scoreCutOff ) { last; }
if ( defined $scoreCutOff && $score < $scoreCutOff ) { last; }
# if exceeded the showing limit for the rank, quit!
if ( ( $show > 0 ) && ( $show < $rank ) ) { last; }
# ADP.71.2 start
# changed separator mark # to <||>
# N-grams stored in STATISTIC are separated by <||>
# removing last <||>
# if($STATISTIC{$score}=~/#$/) { chop $STATISTIC{$score}; }
# @ngramStrings=split(/#/,$STATISTIC{$score});
if($STATISTIC{$score}=~/<\|\|>$/) { $STATISTIC{$score}=~s/<\|\|>$//; }
@ngramStrings=split(/<\|\|>/,$STATISTIC{$score});
# ADP.71.2 end
foreach $ngramString (@ngramStrings)
{
@tokens=split(/<>/,$ngramString);
$numberString=pop @tokens;
$ngram=join "<>", @tokens;
# commented by ADP during version 0.67
# do bit-stuffing
# if ( $_ =~ /^@/ ) { print DST "@"; }
# print DST "$_<>$rank $STATISTIC{$_} $NUMBERSTRINGS{$_}\n";
print DST "$ngram<>$rank $score $numberString\n";
}
}
}
# function to do formatted printing to the destination file!
sub formattedPrinting
{
# we shall do the entire ranking first and create a rank hash so
# that before we print we know the exact string size of the
# biggest rank. basically we want to know exactly how big
# everything is before we start printing so that we can space
# things out just perfectly ("perfectly" being used a lil lightly)
my $spaceBetwFields = 2;
# set up the initial values as the minimum we need per field
my $maxNgramStringLength = length("N-gram");
my $maxStatStringLength = 0;
my $maxFreqLength = 0;
my $rank = 1;
# ---------------
# ADP.67.1 start
# ---------------
# the commented code below belongs to versions 0.65 and earlier
# my $lastValue = 0;
# find the smallest statistic value and assign to $lastValue
# foreach ( sort { $STATISTIC{$b} <=> $STATISTIC{$a} } keys %STATISTIC )
# {
# $lastValue = $STATISTIC{$_};
# last;
# }
# foreach ( sort { $STATISTIC{$b} <=> $STATISTIC{$a} } keys %STATISTIC )
# {
# my @tokens = split ( /<>/, $_ );
# if ( $lastValue != $STATISTIC{$_} )
# {
# $lastValue = $STATISTIC{$_};
# $rank++;
# }
# Code added by ADP.67.1
foreach $score (sort {$b <=> $a} keys %STATISTIC)
{
if(defined $currentScore)
{
if($score < $currentScore) { $rank++; }
elsif($score > $currentScore)
{
print STDERR "Weird sorting error.\n";
exit;
}
}
$currentScore=$score;
# In the following code, ADP changed reference to
# $_ by the N-gram strings stored in $STATISTIC{$score} &
# $STATISTIC{$_} by $score
# during 0.67
# if less than score cut-off, then quit!
# if ( defined $scoreCutOff && $STATISTIC{$_} < $scoreCutOff ) { last; }
if ( defined $scoreCutOff && $score < $scoreCutOff ) { last; }
# if exceeded the showing limit for the rank, quit!
if ( ( $show > 0 ) && ( $show < $rank ) ) { last; }
# ADP.71.3 start
# changed separator # to <||>
#if($STATISTIC{$score}=~/#$/) { chop $STATISTIC{$score}; }
if($STATISTIC{$score}=~/<\|\|>$/) { $STATISTIC{$score}=~s/<\|\|>$//; }
#@ngramStrings=split(/#/,$STATISTIC{$score});
@ngramStrings=split(/<\|\|>/,$STATISTIC{$score});
# ADP.71.3 end
foreach $ngramString (@ngramStrings)
{
@tokens=split(/<>/,$ngramString);
$numberString=pop @tokens;
$ngram=join "<>", @tokens;
# if (length($_) > $maxNgramStringLength) { $maxNgramStringLength = length($_); }
if (length($ngram) > $maxNgramStringLength) { $maxNgramStringLength = length($ngram); }
if (length($numberString) > $maxFreqLength) { $maxFreqLength = length($numberString); }
}
# if (length($STATISTIC{$_}) > $maxStatStringLength) { $maxStatStringLength = length($STATISTIC{$_}); }
if (length($score) > $maxStatStringLength) { $maxStatStringLength = length($score); }
# if (length($NUMBERSTRINGS{$_}) > $maxFreqLength) { $maxFreqLength = length($NUMBERSTRINGS{$_}); }
}
# --------------
# ADP.67.1 end
# --------------
# The following code until next ADP.67.1 start has not been updated by
# ADP
my $maxRankLength = length($rank);
# so thats all our max lengths per field.
# now create the heading string
my $heading = "";
my $spacesToAppend = ($maxNgramStringLength + $spaceBetwFields - length("N-gram")) / 2;
for ($i = 0; $i < $spacesToAppend; $i++)
{
$heading .= " ";
}
$heading .= "N-gram";
for ($i = 0; $i < $spacesToAppend; $i++)
{
$heading .= " ";
}
$spacesToAppend = (length("Rank") > $maxRankLength) ? length("Rank") : $maxRankLength;
$spacesToAppend += $spaceBetwFields;
$spacesToAppend = ($spacesToAppend - length("Rank")) / 2;
for ($i = 0; $i < $spacesToAppend; $i++)
{
$heading .= " ";
}
$heading .= "Rank";
for ($i = 0; $i < $spacesToAppend; $i++)
{
$heading .= " ";
}
$spacesToAppend =
(length("$statisticName value") > $maxStatStringLength) ? length("$statisticName value") : $maxStatStringLength;
$spacesToAppend += $spaceBetwFields;
$spacesToAppend = ($spacesToAppend - length("$statisticName value")) / 2;
for ($i = 0; $i < $spacesToAppend; $i++)
{
$heading .= " ";
}
$heading .= "$statisticName value";
for ($i = 0; $i < $spacesToAppend; $i++)
{
$heading .= " ";
}
$spacesToAppend = (length("Frequency Values") > $maxFreqLength) ? length("Frequency Values") : $maxFreqLength;
$spacesToAppend += $spaceBetwFields;
$spacesToAppend = ($spacesToAppend - length("Frequency Values")) / 2;
for ($i = 0; $i < $spacesToAppend; $i++)
{
$heading .= " ";
}
$heading .= "Frequency Values";
for ($i = 0; $i < $spacesToAppend; $i++)
{
$heading .= " ";
}
my $spacesToAppendForRank = (length("Rank") + $spaceBetwFields - $maxRankLength) / 2;
my $spacesToAppendForStat = (length("$statisticName value") + $spaceBetwFields - $maxStatStringLength) / 2;
my $spacesToAppendForFreqValues = (length("Frequency Values") + $spaceBetwFields - $maxFreqLength) / 2;
printf DST "Total sample size = $totalNgrams\n\n";
print DST "$heading\n";
# now to draw the underline
for ($i = 0; $i < length($heading); $i++) { print DST "-"; }
printf DST "\n";
$rank = 1;
#$lastValue = 0;
# ----------------
# ADP.67.1 start
# ----------------
# find the smallest statistic value and assign to $lastValue
# foreach ( sort { $STATISTIC{$b} <=> $STATISTIC{$a} } keys %STATISTIC )
# {
# $lastValue = $STATISTIC{$_};
# last;
# }
# foreach ( sort { $STATISTIC{$b} <=> $STATISTIC{$a} } keys %STATISTIC )
# {
# my @tokens = split ( /<>/, $_ );
# if ( $lastValue != $STATISTIC{$_} )
# {
# $lastValue = $STATISTIC{$_};
# $rank++;
# }
undef $currentScore;
# Code added by ADP.67.1
foreach $score (sort {$b <=> $a} keys %STATISTIC)
{
if(defined $currentScore)
{
if($score < $currentScore) { $rank++; }
elsif($score > $currentScore)
{
print STDERR "Weird sorting error.\n";
exit;
}
}
$currentScore=$score;
# if less than score cut-off, then quit!
# if ( defined $scoreCutOff && $STATISTIC{$_} < $scoreCutOff ) { last; }
if ( defined $scoreCutOff && $score < $scoreCutOff ) { last; }
# if exceeded the showing limit for the rank, quit!
if ( ( $show > 0 ) && ( $show < $rank ) ) { last; }
# ADP.71.4 start
# if($STATISTIC{$score} =~ /#$/) { chop $STATISTIC{$score}; }
if($STATISTIC{$score} =~ /<\|\|>$/) { $STATISTIC{$score}=~s/<\|\|>$//; }
# @ngramStrings=split(/#/, $STATISTIC{$score});
@ngramStrings=split(/<\|\|>/, $STATISTIC{$score});
# ADP.71.4 end
foreach $ngramString (@ngramStrings)
{
@tokens=split(/<>/,$ngramString);
$numberString=pop @tokens;
$ngram=join "<>", @tokens;
# check size of string...
# $spacesToAppend = ($maxNgramStringLength + $spaceBetwFields - length($_));
$spacesToAppend = ($maxNgramStringLength + $spaceBetwFields - length($ngram));
print DST $ngram;
for ($i = 0; $i < $spacesToAppend; $i++) { print DST " "; }
for ($i = 0; $i < $spacesToAppendForRank; $i++) { print DST " "; }
chomp $rank;
printf(DST "%${maxRankLength}d", $rank);
for ($i = 0; $i < $spacesToAppendForRank; $i++) { print DST " "; }
for ($i = 0; $i < $spacesToAppendForStat; $i++) { print DST " "; }
# chomp $STATISTIC{$_};
# printf(DST "%${maxStatStringLength}.${precision}f", $STATISTIC{$_});
printf(DST "%${maxStatStringLength}.${precision}f", $score);
for ($i = 0; $i < $spacesToAppendForStat; $i++) { print DST " "; }
for ($i = 0; $i < $spacesToAppendForFreqValues; $i++) { print DST " "; }
# chomp $NUMBERSTRINGS{$_};
# printf DST "$NUMBERSTRINGS{$_}\n";
printf DST "$numberString\n";
}
}
}
# function to check if parameter is an integer or not!
sub isInteger
{
my $num = shift;
my @array = split(//, $num);
my $i = 0;
my $flag = 1;
while (defined($array[$i]))
{
if ($array[$i] eq '-' && $i == 0) { $i++; next; }
if ( $array[$i] lt '0' || $array[$i] gt '9' )
{
$flag = 0;
last;
}
$i++;
}
return $flag;
}
# function to create the default frequency combinations to be computed
# and output
sub getDefaultFreqCombos
{
my $i;
# first create the first index of the comb, that is the
# combination that includes all the characters in the window
$combIndex = 0;
$freqComb[0][0] = $ngram;
for ($i = 0; $i < $ngram; $i++)
{
$freqComb[0][$i+1] = $i;
}
$combIndex++;
# now create the rest, starting with size 1
for ($i = 1; $i < $ngram; $i++)
{
createCombination(0, $i);
}
}
# function to read in the user requested frequency combinations
sub readFreqCombo
{
my $sourceFile = shift;
# open the source file
open (FREQ_SRC, $sourceFile) || die ("Couldnt open $sourceFile\n");
# read in the freq combo's one by one into the @freqComb array
$combIndex = 0;
while (<FREQ_SRC>)
{
s/^\s*//;
s/\s*$//;
my @tempArray = split(/\s+/);
# first how many words make up this combination
$freqComb[$combIndex][0] = $#tempArray+1;
# next the indices of the words. note that these indices
# shouldnt exceed $ngram-1... we'll check for that here.
my $i;
for ($i = 1; $i <= $freqComb[$combIndex][0]; $i++)
{
$freqComb[$combIndex][$i] = $tempArray[$i-1];
# check!
if ($freqComb[$combIndex][$i] >= $ngram)
{
printf STDERR ("Illegal index value at row %d column %d in file %s\n", $combIndex+1, $i, $sourceFile);
exit;
}
}
$combIndex++;
}
}
sub createCombination
{
my $level = shift;
my $size = shift;
if ($level == $size)
{
$freqComb[$combIndex][0] = $size;
my $i;
for ($i = 1; $i <= $size; $i++)
{
$freqComb[$combIndex][$i] = $tempCombArray[$i-1];
}
$combIndex++;
}
else
{
my $i;
my $loopStart = (!$level)?0:$tempCombArray[$level-1]+1;
for ($i = $loopStart; $i < $ngram; $i++)
{
$tempCombArray[$level] = $i;
createCombination($level+1, $size);
}
}
}
# function to output a minimal usage note when the user has not provided any
# commandline options
sub minimalUsageNotes
{
print "Usage: statistic.pl [OPTIONS] STATISTIC_LIBRARY DESTINATION SOURCE\n";
askHelp();
}
# function to output help messages for this program
sub showHelp
{
print "Usage: statistic.pl [OPTIONS] STATISTIC_LIBRARY DESTINATION SOURCE\n\n";
print "Loads the given STATISTIC_LIBRARY, calculates the statistic on n-grams\n";
print "in SOURCE and outputs results to DESTINATION. SOURCE must be an\n";
print "n-gram-frequency file output by count.pl. N-grams in DESTINATION are\n";
print "ranked on the value of their statistic.\n\n";
print "OPTIONS:\n\n";
print " --ngram N Assumes that n-grams in SOURCE file have N\n";
print " tokens each. N = 2 by default.\n\n";
print " --set_freq_combo FILE \n";
print " Uses the frequency combinations in FILE to\n";
print " decode the \"meaning\" of the frequency\n";
print " values in SOURCE. By default, the default\n";
print " frequency combinations output by count.pl\n";
print " for ngrams of size N are assumed.\n\n";
print " --get_freq_combo FILE \n";
print " Prints out the frequency combinations being\n";
print " used to FILE. If frequency combinations have\n";
print " been provided through --set_freq_combo switch\n";
print " above these are output; otherwise the default\n";
print " combinations being used are output.\n\n";
print " --frequency N Ignores all n-grams with frequency < N.\n\n";
print " --rank N Shows only n-grams with rank <= N.\n\n";
print " --precision N Displays values upto N places of decimal.\n\n";
print " --score N Shows only n-grams which have score >= N.\n\n";
print " --extended Outputs chosen parameters in \"extended\"\n";
print " format, and retains any extended data in\n";
print " SOURCE. By default, suppresses any extended\n";
print " information in SOURCE, and outputs no new\n";
print " parameters.\n\n";
print " --format Creates formatted output.\n\n";
print " --version Prints the version number.\n\n";
print " --help Prints this help message.\n\n";
}
# function to show version number
sub showVersion
{
print "statistic.pl - version 0.69\n";
print "Copyright (C) 2000-2004, Ted Pedersen, Satanjeev Banerjee, Amruta Purandare\n";
print "Date of Last Update: 06/14/2004\n";
}
# function to output "ask for help" message when the user's goofed up!
sub askHelp
{
print "Type statistic.pl --help for help.\n";
}