#!/usr/local/bin/perl -w
=head1 NAME
huge-delete.pl - Delete the bigrams by low/high frequency
=head1 SYNOPSIS
huge-delete.pl takes a list of unique bigrams and removes the bigrams
that fall outside the low or high frequency cutoffs.
=head1 DESCRIPTION
See perldoc huge-delete.pl
=head1 USAGE
huge-delete.pl [OPTIONS] SOURCE DESTINATION
=head1 INPUT
=head2 Required Arguments:
=head3 SOURCE
Input to huge-delete.pl should be a single flat file generated by
count.pl or huge-count.pl. The result file will be the DESTINATION.
=head2 Optional Arguments:
=head4 --remove L
low frequency cutoff --remove option will remove the bigrams which
have less than cutoff frequencies.
=head4 --uremove L
high frequency cutoff --uremove option will remove the bigrams which
have more than cutoff frequencies.
=head4 --frequency F
low frequency cutoff --frequency option will not print out the bigrams
which have less than cutoff frequencies.
=head4 --ufrequency F
high frequency cutoff --ufrequency option will not print out the bigrams
which have more than cutoff frequencies.
=head3 Other Options:
=head4 --help
Displays the help information.
=head4 --version
Displays the version information.
=head1 AUTHOR
Ying Liu, University of Minnesota, Twin Cities.
liux0395@umn.edu
=head1 COPYRIGHT
Copyright (C) 2009-2010 Ying Liu
Ying Liu, University of Minnesota, Twin Cities.
liux0395@umn.edu
Ted Pedersen, University of Minnesota, Duluth.
tpederse@umn.edu
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
=cut
###############################################################################
#-----------------------------------------------------------------------------
# Start of program
#-----------------------------------------------------------------------------
###################################################################################
use Getopt::Long;
# Command-line handling: parse the options, answer --help / --version, and
# reject inconsistent cutoffs before any file is touched.

# GetOptions is called without explicit destinations, so Getopt::Long stores
# each value in the package variable $opt_<name>.  Declaring them with "our"
# documents that and keeps -w from reporting names that are used only once.
our ($opt_remove, $opt_uremove, $opt_frequency, $opt_ufrequency);
our ($opt_help, $opt_version);

# first check if no commandline options have been provided... in which case
# print out the usage notes!
if ( @ARGV == 0 )
{
    minimalUsageNotes();
    exit;
}

# now get the options!  GetOptions reports unknown or malformed options on
# STDERR and returns false; stop here instead of running with whatever subset
# happened to parse.
if ( !GetOptions( "remove=i", "frequency=i", "uremove=i", "ufrequency=i", "version", "help" ) )
{
    askHelp();
    exit 1;
}

# show help information
if (defined $opt_help)
{
    showhelp();
    exit;
}

# show version information
if (defined $opt_version)
{
    showversion();
    exit;
}

# A lower cutoff above its upper cutoff is a usage error: report it entirely
# on STDERR and exit with a non-zero status.  Equal cutoffs are accepted.
if (defined $opt_remove and defined $opt_uremove and $opt_remove > $opt_uremove)
{
    print STDERR "--remove must be smaller than --uremove!\n";
    askHelp();
    exit 1;
}

if (defined $opt_frequency and defined $opt_ufrequency and $opt_frequency > $opt_ufrequency)
{
    print STDERR "--frequency must be smaller than --ufrequency!\n";
    askHelp();
    exit 1;
}

# With no cutoff at all there is nothing to filter.  The script stops here
# without opening SOURCE or writing DESTINATION.
if (    !defined $opt_remove    and !defined $opt_uremove
    and !defined $opt_frequency and !defined $opt_ufrequency)
{
    print STDERR "No (u)remove and (u)frequency cutoffs, the result file is the same with the input file!\n";
    exit;
}
# Main processing, in two passes over the bigram list.
#
# Pass 1 reads SOURCE, applies the --remove / --uremove cutoffs, copies the
#        surviving lines to a temporary file and recomputes the bigram total.
#        Every removed bigram's count is subtracted from the totals recorded
#        for its first and second word.
# Pass 2 re-reads the temporary file and writes DESTINATION, printing only the
#        bigrams that pass the --frequency / --ufrequency display cutoffs,
#        each with the adjusted word totals.
#
# Each data line is expected to look like "w1<>w2<>n11 n1p np1 ".
my ($final_merge, $final_output) = @ARGV;

# Both file names are required; check before doing any work so that a missing
# DESTINATION cannot leave a temporary file behind.
unless (defined $final_merge and defined $final_output)
{
    minimalUsageNotes();
    exit 1;
}

open(my $source_fh, '<', $final_merge)
    or die("Error: cannot open file '$final_merge': $!\n");

my $temp = "$final_merge" . ".temp";
open(my $temp_fh, '+>', $temp)
    or die("Error: cannot open file '$temp': $!\n");

# The first line of SOURCE is read only to skip it; the total written to
# DESTINATION is recomputed from the bigrams that survive.
my $old_total = <$source_fh>;
my $total_bigrams = 0;

# Totals per first word (%w1) and per second word (%w2), seeded from the
# first line on which the word appears and reduced for each removed bigram.
my (%w1, %w2);

# remove bigrams with low or high frequency
while (my $line = <$source_fh>)
{
    # chomp, not chop: a final line without a newline must keep its last
    # character.
    chomp($line);

    my @bigrams = split('<>', $line);
    my @words   = defined $bigrams[2] ? split(' ', $bigrams[2]) : ();

    # Skip lines that do not carry two tokens and three counts.  Blank lines
    # are skipped silently, anything else is reported.
    if (@words < 3)
    {
        print STDERR "Warning: skipping malformed line " . $. . " of '$final_merge'\n"
            if ($line =~ /\S/);
        next;
    }

    my ($first, $second) = @bigrams[0, 1];
    my ($count, $first_total, $second_total) = @words;

    $w1{$first}  = $first_total  if (!defined $w1{$first});
    $w2{$second} = $second_total if (!defined $w2{$second});

    # An undefined cutoff imposes no limit on that side.
    my $keep = (!defined $opt_remove  || $count >= $opt_remove)
            && (!defined $opt_uremove || $count <= $opt_uremove);

    if ($keep)
    {
        # print, not printf: the line is data and may contain '%'.
        print $temp_fh "$line\n";
        $total_bigrams += $count;
    }
    else
    {
        $w1{$first}  -= $count;
        $w2{$second} -= $count;
    }
}
close($source_fh);

# DESTINATION is opened only now, after SOURCE has been read completely.
my $final_fh;
if (!open($final_fh, '>', $final_output))
{
    my $reason = $!;
    close($temp_fh);
    unlink($temp);
    die("Error: cannot open file '$final_output': $reason\n");
}
print $final_fh "$total_bigrams\n";

# Rewind the temporary file and emit the bigrams within the display cutoffs.
seek($temp_fh, 0, 0);
while (my $line = <$temp_fh>)
{
    chomp($line);
    my @bigrams = split('<>', $line);
    my @words   = split(' ', $bigrams[2]);
    my $count   = $words[0];

    next if (defined $opt_frequency  and $count < $opt_frequency);
    next if (defined $opt_ufrequency and $count > $opt_ufrequency);

    print $final_fh "$bigrams[0]<>$bigrams[1]<>$count $w1{$bigrams[0]} $w2{$bigrams[1]} \n";
}

close($temp_fh);
# Buffered write errors surface at close, so check it for the output file.
close($final_fh)
    or die("Error: cannot write file '$final_output': $!\n");

# Remove the temporary file directly instead of going through the shell.
unlink($temp)
    or print STDERR "Warning: cannot remove temporary file '$temp': $!\n";
#-----------------------------------------------------------------------------
# User Defined Function Definitions
#-----------------------------------------------------------------------------
# Print the one-line usage summary on STDERR, followed by the pointer to
# --help.  Called when the script is started without any arguments.
sub minimalUsageNotes
{
    my $usage = "Usage: huge-delete.pl [OPTIONS] SOURCE DESTINATION\n";
    print STDERR $usage;
    askHelp();
}
# Print, on STDERR, the hint telling the user how to get the full help text.
sub askHelp
{
    my $hint = "Type huge-delete.pl --help for help.\n";
    print STDERR $hint;
}
# Print the full help text (usage line, short description and the option
# list) on STDOUT.  Takes no arguments.
sub showhelp
{
    print
        "\n",
        "Usage: huge-delete.pl [OPTIONS] SOURCE DESTINATION\n\n",
        "huge-delete.pl takes the bigrams file generated by\n",
        "huge-count.pl or count.pl as input, and output the\n",
        "results to the destination file.\n\n",
        "OPTIONS:\n\n",
        " --remove L Bigrams with counts < L will be removed from sample.\n",
        " --uremove L Bigrams with counts > L will be removed from sample.\n",
        # The option is spelled with two dashes, as in the lines above.
        " --remove must be smaller than --uremove.\n\n",
        " --frequency F Bigrams with counts < F will not be displayed.\n",
        " --ufrequency F Bigrams with counts > F will not be displayed.\n",
        " --frequency must be smaller than --ufrequency.\n\n",
        " --help Prints this help message.\n",
        " --version Prints this version message.\n";
}
# Print the program version, copyright line and last-update date on STDERR.
sub showversion
{
    print STDERR
        "huge-delete.pl - version 0.2\n",
        "Copyright (C) 2010, Ying Liu\n",
        "Date of Last Update 03/22/2010\n";
}