The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# stem.pm version 1.03
# (Last updated $Id: stem.pm,v 1.7 2006/02/19 19:11:09 sidz1979 Exp $)
#
# Package used by WordNet::Similarity::lesk module that
# computes semantic relatedness of word senses in WordNet
# using gloss overlaps.
#
# Copyright (c) 2005,
#
# Ted Pedersen, University of Minnesota Duluth
# tpederse at d.umn.edu
#
# Satanjeev Banerjee, Carnegie Mellon University, Pittsburgh
# banerjee+ at cs.cmu.edu
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to 
#
# The Free Software Foundation, Inc., 
# 59 Temple Place - Suite 330, 
# Boston, MA  02111-1307, USA.
#
# ------------------------------------------------------------------

package stem;

=head1 NAME

stem - Module that find the stem of a word or the stems of a string 
of words, using WordNet.

=head1 SYNOPSIS

  use stem;

  my $wn = WordNet::QueryData->new();

  my $stemmer = stem->new($wn)

  my @stems = $stemmer->stemWord($word);

  my $string = $stemmer->stemString($inString, $cache);

=head1 DESCRIPTION

This module uses the internal stemming algorithm of WordNet to
stem words and strings of words. This module is used by the
lesk measure of the WordNet::Similarity package.

=head2 Methods

=over

=cut

use strict;
use Exporter;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

@ISA = qw(Exporter);

%EXPORT_TAGS = ();

@EXPORT_OK = ();

@EXPORT = ();

$VERSION = '1.03';

=item new

Creates a new stemmer object and initilizes it with a 
WordNet::QueryData object.

Parameters: $wn

Returns: $stemmer

=cut

# function to create the stemmer object
sub new
{
    my $className = shift;
    my $wn = shift;
    my $self = {};

    $self->{wn} = $wn;
    $self->{wordStemHash} = ();
    $self->{stringStemHash} = ();
    bless($self, $className);

    return $self;
}

=item stemString

Takes a string of words as input and returns a string of stemmed words.

Parameters: $inString

Returns: $retString

=cut

# Function to take a string, and process it in such a way that all the
# words in it get stemmed. Note that if a single word has two or more
# possible stems, we return the original surface form since there is
# no way to select from the competing stems. The stem of the string
# can be cached if requested. Useful if the calling function knows
# which strings it will have to stem over and over again. Strings that
# will be only stemmed ones need not be cached - thereby saving space.
sub stemString
{
    my $self = shift;
    my $inputString = shift;
    my $cache = shift;
    
    # whether or not this string has been requested for cacheing,
    # check in the cache
    return $self->{'stringStemHash'}->{$inputString} if (defined $self->{'stringStemHash'}->{$inputString});
    
    # Not in cache. Stem.
    
    # for each word in the input get the stem and put in the output string
    my $outputString = "";
    while ($inputString =~ /(\w+)/g)
    {
	my $word = $1;
	my @stems = $self->stemWord($word);
	
	# if multiple or no stems, use surface form.
	$outputString .= ($#stems != 0) ? "$word " : "$stems[0] ";
    }
    
    # if cache required, do so
    $self->{'stringStemHash'}->{$inputString} = $outputString if (defined($cache));
    
    # return the string
    return($outputString);
}

=item stemWord

Takes a word as input and returns its stems. A word may have more than
one stem. All are returned.

Parameters: $word

Returns: @stems

=back

=cut

# stem the word passed to this function and return an array of words
# that contain all the possible stems of this word. All possible stems
# of the word may include the surface form too if its a valid WordNet
# lemma.
sub stemWord
{
    my $self = shift;
    my $word = shift;
    my $wn = $self->{wn};
    my @stems = ();
    
    # if not in the cache, create and put in cache
    if (!defined $self->{wordStemHash}->{$word})
    {
	# So not in the hash. gotta check for all possible parts of speech.
	my %stems = ();
	my $possiblePartsOfSpeech = "nvar";
	
	my $pos;
	while ("nvar" =~ /(.)/g)
	{
	    foreach ($wn->validForms("$word\#$1"))
	    {
		# put underscore for space
		$_ =~ s/ /_/g;
		
		# remove part of speech if any
		$_ =~ s/\#\w$//;
		
		# put in stems hash (the hash allows us to not worry about
		# multiple copies of the same stem!)
		$stems{$_} = 1;
	    }
	}
	
	# put in the cache
	$self->{wordStemHash}->{$word} = join(" ", (keys %stems));
    }
    
    # return the stems
    return (split / /, $self->{wordStemHash}->{$word});
}

1;

__END__

=head1 SEE ALSO

perl(1), WordNet::Similarity(3), WordNet::QueryData(3)

http://www.cs.utah.edu/~sidd

http://wordnet.princeton.edu

http://www.ai.mit.edu/~jrennie/WordNet

http://groups.yahoo.com/group/wn-similarity

=head1 AUTHORS

 Ted Pedersen, University of Minnesota Duluth
 tpederse at d.umn.edu

 Satanjeev Banerjee, Carnegie Mellon University, Pittsburgh
 banerjee+ at cs.cmu.edu

=head1 BUGS

None.

To report bugs, go to http://groups.yahoo.com/group/wn-similarity/ or
e-mail "S<tpederse at d.umn.edu>".

=head1 COPYRIGHT AND LICENSE

Copyright (c) 2005, Ted Pedersen and Satanjeev Banerjee

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to

   The Free Software Foundation, Inc.,
   59 Temple Place - Suite 330,
   Boston, MA  02111-1307, USA.

Note: a copy of the GNU General Public License is available on the web
at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
distribution as GPL.txt.

=cut