The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/local/bin/perl -w

=head1 NAME

huge-delete.pl - Delete the bigrams by low/high frequency 

=head1 SYNOPSIS

huge-delete.pl takes a list of unique bigrams and removes the bigrams
whose frequencies fall outside the low or high frequency cutoffs.

=head1 DESCRIPTION

See perldoc huge-delete.pl 

=head1 USAGE

huge-delete.pl [OPTIONS] SOURCE DESTINATION

=head1 INPUT

=head2 Required Arguments:

=head3 SOURCE

Input to huge-delete.pl should be a single flat file generated by 
count.pl or huge-count.pl. The result file will be the DESTINATION.

=head2 Optional Arguments:

=head4 --remove L

Low frequency cutoff. The --remove option removes the bigrams whose
frequency is less than L. The counts of the removed bigrams are also
subtracted from the sample size and from the word totals.

=head4 --uremove L

High frequency cutoff. The --uremove option removes the bigrams whose
frequency is greater than L. The counts of the removed bigrams are also
subtracted from the sample size and from the word totals.

=head4 --frequency F

Low frequency cutoff. The --frequency option does not print the bigrams
whose frequency is less than F. The sample size and the word totals are
left unchanged.

=head4 --ufrequency F

High frequency cutoff. The --ufrequency option does not print the bigrams
whose frequency is greater than F. The sample size and the word totals are
left unchanged.

=head3 Other Options:

=head4 --help

Displays the help information.

=head4 --version

Displays the version information.

=head1 AUTHOR

Ying Liu, University of Minnesota, Twin Cities.
liux0395@umn.edu

=head1 COPYRIGHT

Copyright (C) 2009-2010 Ying Liu 

Ying Liu, University of Minnesota, Twin Cities.
liux0395@umn.edu

Ted Pedersen, University of Minnesota, Duluth.
tpederse@umn.edu


This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

=cut


###############################################################################
#-----------------------------------------------------------------------------
#                              Start of program
#-----------------------------------------------------------------------------
###################################################################################


use Getopt::Long;

# Command-line handling.
#
# Getopt::Long (legacy interface) stores each option in a package variable
# named $opt_<option>.  Declaring them with "our" lists the complete set in
# one place and keeps -w from reporting "used only once" for the flags, so no
# dummy assignments are needed.
our ($opt_remove, $opt_uremove, $opt_frequency, $opt_ufrequency,
     $opt_version, $opt_help);

# Nothing at all on the command line: print the short usage note and stop.
if ( @ARGV == 0 )
{
    minimalUsageNotes();
    exit;
}

# Parse the options.  GetOptions prints its own diagnostic and returns false
# for an unknown option or a non-integer cutoff; do not carry on with a
# half-parsed command line in that case.
unless ( GetOptions( "remove=i", "frequency=i", "uremove=i", "ufrequency=i",
                     "version", "help" ) )
{
    askHelp();
    exit 1;
}

# --help and --version short-circuit everything else.
if ( defined $opt_help )
{
    showhelp();
    exit;
}

if ( defined $opt_version )
{
    showversion();
    exit;
}

# A lower cutoff above its matching upper cutoff is a usage error.  Both
# lines of the report go to STDERR and the exit status is non-zero.
if ( defined($opt_remove) && defined($opt_uremove)
     && $opt_remove > $opt_uremove )
{
    print STDERR "--remove must be smaller than --uremove!\n";
    askHelp();
    exit 1;
}

if ( defined($opt_frequency) && defined($opt_ufrequency)
     && $opt_frequency > $opt_ufrequency )
{
    print STDERR "--frequency must be smaller than --ufrequency!\n";
    askHelp();
    exit 1;
}

# Without any cutoff there is nothing to do; this stays an informational
# exit (status 0) and no DESTINATION file is written.
unless ( defined($opt_remove) || defined($opt_uremove)
         || defined($opt_frequency) || defined($opt_ufrequency) )
{
    print STDERR "No (u)remove and (u)frequency cutoffs, the result file is the same with the input file!\n";
    exit;
}

# After option parsing @ARGV must still hold SOURCE and DESTINATION; catch a
# missing file name here instead of failing halfway through the run.
if ( @ARGV < 2 )
{
    minimalUsageNotes();
    exit 1;
}

# ---------------------------------------------------------------------------
# Pass 1: apply the --remove / --uremove cutoffs.
#
# Reads SOURCE ($ARGV[0]) and copies every bigram whose count lies inside the
# cutoffs to the scratch file "<SOURCE>.temp".  While doing so it maintains:
#   $total_bigrams - sum of the counts of the bigrams that were kept
#   %w1 / %w2      - per-word totals for the first / second word, taken from
#                    the input line where the word is first seen and reduced
#                    by the count of every removed bigram containing it
# TEMP stays open (read/write) so the second pass can rewind and re-read it.
# ---------------------------------------------------------------------------
my $final_merge = $ARGV[0];
defined $final_merge or die("Error: no SOURCE file given\n");
open(FMERGE, '<', $final_merge) or die("Error: cannot open file '$final_merge': $!\n");

my $temp = "$final_merge" . ".temp";
open(TEMP, '+>', $temp) or die("Error: cannot open file '$temp': $!\n");

# The first line of SOURCE is its old sample size; it is read only to skip
# it, because the total is recomputed from the bigrams that survive.
my $total = <FMERGE>;
my $total_bigrams = 0;

while (my $line = <FMERGE>)
{
    # chomp, not chop: a final line without a newline must keep its last digit
    chomp($line);

    # line layout: word1<>word2<>n11 n1p np1
    my ($first, $second, $counts) = split(/<>/, $line);
    my ($n11, $n1p, $np1) = split(' ', (defined $counts ? $counts : ''));

    if (!defined $n11)
    {
        warn("Warning: skipping malformed line $. of '$final_merge'\n");
        next;
    }

    $w1{$first}  = $n1p if (!defined $w1{$first});
    $w2{$second} = $np1 if (!defined $w2{$second});

    # an undefined cutoff imposes no limit on that side
    my $above_low  = !defined($opt_remove)  || $n11 >= $opt_remove;
    my $below_high = !defined($opt_uremove) || $n11 <= $opt_uremove;

    if ($above_low && $below_high)
    {
        # print, not printf: the line is data and may contain '%'
        print TEMP "$line\n";
        $total_bigrams += $n11;
    }
    else
    {
        # removed bigram: take its count out of both word totals
        $w1{$first}  -= $n11;
        $w2{$second} -= $n11;
    }
}
close FMERGE;


# ---------------------------------------------------------------------------
# Pass 2: apply the --frequency / --ufrequency cutoffs and write DESTINATION.
#
# Writes the recomputed sample size ($total_bigrams) as the first line, then
# rewinds the scratch file produced by pass 1 and prints every bigram whose
# count lies inside the display cutoffs, using the adjusted word totals from
# %w1 / %w2.  Bigrams hidden here are only suppressed from the output; the
# totals are not changed.  The scratch file is deleted at the end.
# ---------------------------------------------------------------------------
my $final_output = $ARGV[1];
defined $final_output or die("Error: no DESTINATION file given\n");
open(FINAL, '>', $final_output) or die("Error: cannot open file '$final_output': $!\n");
print FINAL "$total_bigrams\n";

seek(TEMP, 0, 0);
while (my $line = <TEMP>)
{
    chomp($line);

    # line layout: word1<>word2<>n11 n1p np1 ; only n11 is taken from the
    # line, the word totals come from the tables built in pass 1
    my ($first, $second, $counts) = split(/<>/, $line);
    my ($n11) = split(' ', $counts);

    # an undefined cutoff imposes no limit on that side
    next if (defined($opt_frequency)  && $n11 < $opt_frequency);
    next if (defined($opt_ufrequency) && $n11 > $opt_ufrequency);

    # print, not printf: the words are data and may contain '%'
    print FINAL "$first<>$second<>$n11 $w1{$first} $w2{$second} \n";
}

close(TEMP);
# buffered write errors (e.g. disk full) only show up when the handle is closed
close(FINAL) or die("Error: cannot write file '$final_output': $!\n");

# unlink instead of a shell "rm": safe for file names containing blanks or
# shell metacharacters, and it does not need an external program
unlink($temp) or warn("Warning: cannot remove temporary file '$temp': $!\n");

    
#-----------------------------------------------------------------------------
#                       User Defined Function Definitions
#-----------------------------------------------------------------------------

# Prints the one-line usage synopsis to STDERR, followed by the pointer to
# --help.  Used when the script is started without any command-line arguments.
sub minimalUsageNotes
{
    my $synopsis = "Usage: huge-delete.pl [OPTIONS] SOURCE DESTINATION\n";
    print STDERR $synopsis;
    askHelp();
}

# Prints a one-line hint to STDERR telling the user how to get the full help
# text.
sub askHelp
{
    my $hint = "Type huge-delete.pl --help for help.\n";
    print STDERR $hint;
}

# Prints the full help text (usage line, short description and the list of
# options) to the currently selected output handle, normally STDOUT.
# The text is kept in a single here-document so it reads exactly as printed.
sub showhelp
{
    print <<'END_HELP';

Usage: huge-delete.pl [OPTIONS] SOURCE DESTINATION

huge-delete.pl takes the bigrams file generated by
huge-count.pl or count.pl as input, and output the
results to the destination file.

OPTIONS:

  --remove L         Bigrams with counts < L will be removed from sample.
  --uremove L        Bigrams with counts > L will be removed from sample.
                     --remove must be smaller than --uremove.

  --frequency F      Bigrams with counts < F will not be displayed.
  --ufrequency F     Bigrams with counts > F will not be displayed.
                     --frequency must be smaller than --ufrequency.

  --help             Prints this help message.
  --version          Prints this version message.
END_HELP
}

# Prints the version banner (program version, copyright holder and date of
# the last update) to STDERR.
sub showversion
{
    my @banner = (
        "huge-delete.pl      -        version 0.2\n",
        "Copyright (C) 2010, Ying Liu\n",
        "Date of Last Update 03/22/2010\n",
    );
    print STDERR @banner;
}