The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#! /usr/bin/perl -w
#
# wordVectors.pl version 2.05
# (Last updated $Id: wordVectors.pl,v 1.19 2008/05/30 23:12:44 sidz1979 Exp $)
#
# -----------------------------------------------------------------------------

# Use external packages
use strict;
use Getopt::Long;
use WordNet::QueryData;
use WordNet::Tools;
use WordNet::vectorFile;

# Declarations!
my $wn;              # WordNet::QueryData object.
my $wntools;         # WordNet::Tools object.
my $fh;              # Filehandle, to hold data file handles.
my $wnPCPath;        # Path to WordNet data files (on Windows).
my $wnUnixPath;      # Path to WordNet data files (on Unix).
my $documentCount;   # Document Count (Gloss Count).
my $saveDocCount;    # The value to be written to output file.
my $saveDims = {};   # The dimesion hash to be written to file.
my $saveMatrix = {}; # The vector matrtix to be written to file.
my %rows;            # Hash holding the rows of the matrix.
my %stopWords;       # List of stop words.
my %wordMatrix;      # Matrix of co-occurrences.
my %wordIndex;       # List of words word -> index mapping.
my %wordTF;          # Term Frequency (for each word).
my %wordDF;          # Document Frequency (for each word).
my %wFreq;           # Word Frequency.

# Get the options!
our ($opt_version, $opt_help, $opt_wnpath, $opt_noexamples, $opt_stopfile, $opt_cutoff, $opt_rhigh, $opt_rlow, $opt_chigh, $opt_clow);
&GetOptions("version", "help", "wnpath=s", "noexamples", "stopfile=s","cutoff=f", "rhigh=i",  "rlow=i", "chigh=i", "clow=i");

# If the version information has been requested...
if(defined $opt_version)
{
  &printVersion();
  exit;
}

# If detailed help has been requested...
if(defined $opt_help)
{
  &printHelp();
  exit;
}

# If no database filename, error...
if(@ARGV)
{
  if(-e $ARGV[0])
  {
    my $getuser;
    print STDERR "File $ARGV[0] already exists. Overwrite? [y/n] ";
    $getuser = <STDIN>;
    $getuser =~ s/[\r\f\n]//g;
    $getuser =~ s/\s*//g;
    if(!($getuser =~ /^y$/ || $getuser =~ /^Y$/))
    {
      print STDERR "Exiting.\n";
      exit;
    }
    if(!(-w $ARGV[0]))
    {
      print STDERR "Unable to overwrite $ARGV[0]. Exiting.\n";
      exit;
    }
  }
}
else
{
  print "Specify name of database file.\n";
  &printUsage();
  exit;
}

# Load the stop words if specified
if(defined $opt_stopfile)
{
  print STDERR "Loading stoplist... ";
  open(WORDS, "$opt_stopfile") || die("Couldnt open $opt_stopfile.\n");
  while (<WORDS>)
  {
    s/[\r\f\n]//g;
    s/\s+//g;
    $stopWords{$_} = 1;
  }
  close WORDS;
  print STDERR "done.\n";
}

# Check if path to WordNet Data files has been provided ... If so ... save it.
print STDERR "Loading WordNet... ";
if(defined $opt_wnpath)
{
  $wnPCPath = $opt_wnpath;
  $wnUnixPath = $opt_wnpath;
  $wn = WordNet::QueryData->new($opt_wnpath);
}
else
{
  if (defined $ENV{WNSEARCHDIR})
  {
    $wnPCPath = $ENV{WNSEARCHDIR};
    $wnUnixPath = $ENV{WNSEARCHDIR};
  }
  elsif (defined $ENV{WNHOME})
  {
    $wnPCPath = $ENV{WNHOME} . "\\dict";
    $wnUnixPath = $ENV{WNHOME} . "/dict";
  }
  else
  {
    $wnPCPath = "C:\\Program Files\\WordNet\\3.0\\dict";
    $wnUnixPath = "/usr/local/WordNet-3.0/dict";
  }
  $wn = WordNet::QueryData->new;
}
die "Unable to create WordNet::QueryData object.\n" if(!$wn);
$wnPCPath = $wnUnixPath = $wn->dataPath() if($wn->can('dataPath'));
$wntools = WordNet::Tools->new($wn);
die "Unable to create WordNet::Tools object.\n" if(!$wntools);
print STDERR "done.\n";
print STDERR "Creating word vectors...                                       ";
open(NIDX, $wnUnixPath."/data.noun") || open(NIDX, $wnPCPath."\\noun.dat") || die "Unable to open data file.\n";
open(VIDX, $wnUnixPath."/data.verb") || open(VIDX, $wnPCPath."\\verb.dat") || die "Unable to open data file.\n";
open(AIDX, $wnUnixPath."/data.adj") || open(AIDX, $wnPCPath."\\adj.dat") || die "Unable to open data file.\n";
open(RIDX, $wnUnixPath."/data.adv") || open(RIDX, $wnPCPath."\\adv.dat") || die "Unable to open data file.\n";
$documentCount = 0;

foreach $fh (*NIDX, *VIDX, *AIDX, *RIDX)
{
  my $line;
  my $word1;
  my $word2;
  my @parts;
  my @words;
  my @walk;
  my %tHash;
  while($line = <$fh>)
  {
    next if ($line =~ m/^\s/);
    $line =~ s/[\r\f\n]//g;
    $line =~ s/.*\|//;
    $line = lc($line);
    $line =~ s/\".*\"//g if(defined $opt_noexamples);
    @parts = split(/\;/, $line);
    @words = ();
    %tHash = ();

    while(@parts)
    {
      $line = shift(@parts);
      $line =~ s/\'//g;
      $line =~ s/[^a-z0-9]+/ /g;
      $line =~ s/^\s*//;
      $line =~ s/\s*$//;
      $line = $wntools->compoundify($line);
      @walk = split(/\s+/,$line);
      @walk = &_removeStopWords(@walk) if(defined $opt_stopfile);
      @walk = &_stem(@walk);
      @walk = &_removeStopWords(@walk) if(defined $opt_stopfile);
      push @words, @walk;
    }
    foreach $word1 (@words)
    {
      $tHash{$word1} = 1;
      $wordTF{$word1}++;
      foreach $word2 (@words)
      {
        $wordMatrix{$word1}{$word2}++;
      }
    }
    foreach $word1 (keys %tHash)
    {
      $wordDF{$word1}++;
    }
    $documentCount++;
    print STDERR "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
    printf STDERR "%6d (of approximately 118000) done.", $documentCount;
  }
}
print STDERR "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
printf STDERR "%6d (of approximately 118000) done.", $documentCount;
close(NIDX);
close(VIDX);
close(AIDX);
close(RIDX);
print STDERR "\n";
print STDERR "Inconsistent TERMFREQ and DOCFREQ hashes.\n" if(scalar(keys(%wordTF)) != scalar(keys(%wordDF)));

# Pruning columns...
%rows = %wordTF;
if(defined $opt_chigh || defined $opt_clow)
{
  print STDERR "Pruning columns... ";
  my $word;
  my %tmpTF = %wordTF;
  my %tmpDF = %wordDF;
  %wordTF = ();
  %wordDF = ();
  my $c = 0;
  foreach $word (sort {$tmpTF{$b} <=> $tmpTF{$a}} keys %tmpTF)
  {
    next if(defined $opt_chigh && $tmpTF{$word} > $opt_chigh);
    last if(defined $opt_clow && $tmpTF{$word} < $opt_clow);
    $wordTF{$word} = $tmpTF{$word};
    $wordDF{$word} = $tmpDF{$word};
    $c++;
  }
  print STDERR "done.\n";
}

# Assigning indices to words... pruning cut-off words...
print STDERR "Writing dimensions...                       ";
my $word;
my @words = keys %wordTF;
my $final = scalar(@words);
my $c = 0;
foreach $word (@words)
{

  if(defined $opt_cutoff && &_tfidf($word) > $opt_cutoff)
  {
    delete $wordTF{$word};
    delete $wordDF{$word};
    next;
  }
  $wordIndex{$word} = $c;
  $saveDims->{$word} = "$c $wordTF{$word} $wordDF{$word}";
  print STDERR "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
  printf STDERR "%6d of %6d done.", $c, $final;
  $c++;
}
print STDERR "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
printf STDERR "%6d of %6d done.", $c, $final;
print STDERR "\n";

# Write out the document count...
print STDERR "Writing the document count... ";
$saveDocCount = $documentCount;
print STDERR "done.\n";

# Writing out the Word Vectors to the database...
print STDERR "Writing word vectors...                       ";
$final = scalar(keys(%wordMatrix));
$c = 0;
foreach $word (sort {$rows{$b} <=> $rows{$a}} keys %wordMatrix)
{
  my $key;
  my $value;
  next if(defined $opt_rhigh && $opt_rhigh < $rows{$word});
  last if(defined $opt_rlow && $opt_rlow > $rows{$word});
  $value = "";
  foreach $key (keys %{$wordMatrix{$word}})
  {
    if(defined $wordIndex{$key})
    {
      $value .= "$wordIndex{$key} ".($wordMatrix{$word}{$key})." ";
    }
  }
  $value =~ s/\s+$//;
  $saveMatrix->{$word} = $value;
  print STDERR "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
  printf STDERR "%6d of %6d done.", $c, $final;
  $c++;
}
print STDERR "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
printf STDERR "%6d of %6d done.", $c, $final;
print STDERR "\n";

# Now that everything has been computed, write to file...
print STDERR "Writing data to file... ";
my $errCode = WordNet::vectorFile->writeVectors($ARGV[0], $saveDocCount, $saveDims, $saveMatrix);
if(!defined $errCode || $errCode == 0)
{
  print STDERR "Error writing data to file.\n";
  exit;
}
print STDERR "done.\n";

# ----------------- Subroutines Start Here ----------------------
# Subroutine to stem a list of words...
# INPUT PARAMS  : @words        .. list of words.
# RETURN VALUES : @stemmedWords .. Stemmed list.
sub _stem
{
  my $word;
  my @words = @_;
  my @stemmedWords = ();
  foreach $word (@words)
  {
    my @wnForms = $wn->validForms($word);
    my @tmp = @wnForms;
    my $tp;
    if($#wnForms > 0)
    {
      @wnForms = ();
      foreach $tp (@tmp)
      {
        push(@wnForms, $tp) if($tp =~ /\#n$/);
      }
    }
    $word = $wnForms[0] if($#wnForms == 0);
    $word =~ s/\#.*//;
    push @stemmedWords, $word;
  }
  return @stemmedWords;
}

# Subroutine to remove stop words...
# INPUT PARAMS  : @words        .. list of words.
# RETURN VALUES : @cleanWords .. clean list.
sub _removeStopWords
{
  my $word;
  my $prt;
  my @words = @_;
  my @comp;
  my @cleanWords = ();
  foreach $word (@words)
  {
    @comp = split(/_+/, $word);
    foreach $prt (@comp)
    {
      if(!$stopWords{$prt})
      {
        push @cleanWords, $word;
        last;
      }
    }
  }
  return @cleanWords;
}

# Subroutine to filter words with information content below
# a certain cutoff...
sub _tfidf
{
  my $word = shift;
  if(defined $wordTF{$word} && defined $wordDF{$word} && $wordDF{$word} > 0)
  {
    return 0 if(!defined $documentCount || $documentCount < 1);
    return $wordTF{$word}*log($documentCount/$wordDF{$word});
  }
  return 0;
}

# Subroutine to print detailed help
sub printHelp
{
  &printUsage();
  print "\nThis program writes out word vectors computed from WordNet glosses in\n";
  print "a database file specified by filename DBFILE.\n";
  print "Options: \n";
  print "--stopfile       Option specifying a list of stopwords to not be\n";
  print "                 considered while counting.\n";
  print "--wnpath         WNPATH specifies the path of the WordNet data files.\n";
  print "                 Ordinarily, this path is determined from the \$WNHOME\n";
  print "                 environment variable. But this option overides this\n";
  print "                 behavior.\n";
  print "--noexamples     Removes examples from the glosses before processing.\n";
  print "--cutoff         Option used to restrict the dimensions of the word\n";
  print "                 vectors with an tf/idf cutoff. VALUE is the cutoff\n";
  print "                 above which is an acceptable tf/idf value of a word.\n";
  print "--rhigh          RHIGH is the upper frequency cutoff of the words\n";
  print "                 selected to have a word-vector entry in the database.\n";
  print "--rlow           RLOW is the lower frequency cutoff of the words\n";
  print "                 selected to have a word-vector entry in the database.\n";
  print "--chigh          CHIGH is the upper frequency cutoff of words that form\n";
  print "                 the dimensions of the word-vectors.\n";
  print "--clow           CLOW is the lower frequency cutoff of words that form\n";
  print "                 the dimensions of the word-vectors.\n";
  print "--help           Displays this help screen.\n";
  print "--version        Displays version information.\n\n";
}

# Subroutine to print minimal usage notes
sub minimalUsageNotes
{
  &printUsage();
  print "Type wordVectors.pl --help for detailed help.\n";
}

# Subroutine that prints the usage
sub printUsage
{
  print "Usage: wordVectors.pl [{ [--stopfile STOPLIST] [--wnpath WNPATH]";
  print " [--noexamples] [--cutoff VALUE] [--rhigh RHIGH] [--rlow RLOW] [--chigh CHIGH] [--clow CLOW] DBFILE\n";
  print "                      | --help \n";
  print "                      | --version }]\n";
}

# Subroutine to print the version information
sub printVersion
{
  print "wordVectors.pl version 2.05\n";
  print "Copyright (c) 2005-2008, Ted Pedersen and Siddharth Patwardhan.\n";
}
__END__

=head1 NAME

wordVectors.pl - Write word vectors from WordNet glosses to a file for use by vector
and vector_pairs measures

=head1 SYNOPSIS

 wordVectors.pl [[--stopfile STOPLIST]
	  [--wnpath WNPATH] [--noexamples] [--cutoff VALUE] 
 	  [--rhigh RHIGH] [--rlow RLOW] 
	  [--chigh CHIGH] [--clow CLOW] DBFILE 
          | --help | --version]

=head1 DESCRIPTION

This program writes out word vectors computed from WordNet glosses in a
database file specified by filename DBFILE.  The database
file is intended for use by the WordNet::Similarity::vector Perl module,
but if you can think of something else to do with it, then go ahead.

=head1 OPTIONS

B<--stopfile>=I<file>

    Option specifying a list of stopwords to not be
    considered while counting.

B<--wnpath>=I<path>

    Specifies the path to the WordNet data files.
    Ordinarily, this path is determined from the $WNHOME
    environment variable. But this option overides this
    behavior.

B<--noexamples>

    Removes examples from the glosses before processing.

B<--cutoff>=I<number>

    Option used to restrict the dimensions of the word
    vectors with an tf/idf cutoff. VALUE is the cutoff
    above which is an acceptable tf/idf value of a word.

B<--rhigh>=I<number>

    the upper frequency cutoff of the words
    selected to have a word-vector entry in the database.

B<--rlow>=I<number>

    the lower frequency cutoff of the words
    selected to have a word-vector entry in the database.

B<--chigh>=I<number>

    the upper frequency cutoff of words that form
    the dimensions of the word-vectors.

B<--clow>=I<number>

    the lower frequency cutoff of words that form
    the dimensions of the word-vectors.

B<--help>

    Displays a detailed usage message

B<--version>

    Displays version information.


=head1 BUGS

Report to WordNet::Similarity mailing list :
 L<http://groups.yahoo.com/group/wn-similarity>

=head1 SEE ALSO

L<WordNet::Similarity>

WordNet home page : 
 L<http://wordnet.princeton.edu>

WordNet::Similarity home page :
 L<http://wn-similarity.sourceforge.net>

=head1 AUTHORS

 Ted Pedersen, University of Minnesota, Duluth
 tpederse at d.umn.edu

 Siddharth Patwardhan, University of Utah, Salt Lake City
 sidd at cs.utah.edu

=head1 COPYRIGHT

Copyright (c) 2005-2008, Ted Pedersen and Siddharth Patwardhan

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to:

    The Free Software Foundation, Inc., 
    59 Temple Place - Suite 330, 
    Boston, MA  02111-1307, USA.

=cut