The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# WordNet::SenseRelate::Preprocess::Compounds v0.02
# (Last updated $Id: Compounds.pm,v 1.7 2005/06/24 13:55:38 sidz1979 Exp $)

package WordNet::SenseRelate::Preprocess::Compounds;

use strict;
use warnings;
use Exporter;
use WordNet::SenseRelate::Tools;
use WordNet::SenseRelate::Word;

our @ISA     = qw(Exporter);
our $VERSION = '0.02';

# Constructor for this module
sub new
{
    my $class   = shift;
    my $wntools = shift;
    my $self    = {};
    my $trace   = shift;
    $trace = 0 if (!defined $trace);

    # Create the preprocessor object
    $class = ref $class || $class;
    bless($self, $class);

    # Read in the wordnet data
    if (   !defined $wntools
        || !ref $wntools
        || ref($wntools) ne "WordNet::SenseRelate::Tools")
    {
        $wntools = WordNet::SenseRelate::Tools->new($wntools);
        return undef if (!defined $wntools);
    }
    $self->{wntools}     = $wntools;
    $self->{trace}       = $trace;
    $self->{tracestring} = "";

    # No options for this module
    $self->{optionlist} = {};

    return $self;
}

# Preprocess a given input instance:
#  (1) Convert to lowercase
#  (2) Remove unwanted characters
#  (3) Combine all consecutive occurrence of numbers into one
#  (4) Detect compounds
#  (5) Split the words on white spaces
#  (6) Create a context object
sub preprocess
{
    my $self     = shift;
    my $instance = shift;
    my $wntools  = $self->{wntools};
    if (!defined($wntools))
    {
        $wntools = {};
        $wntools->{compounds} = {};
    }

    return undef if (!defined $self || !defined $instance);
    my $trace = $self->{trace};

    # Create empty context object
    # Instance Fields:
    #   (a) text (array)
    #   (b) words (array)
    #   (c) head (scalar:int)
    #   (d) target (scalar:int)
    #   (e) wordobjects (array)
    #   (f) lexelt (scalar:"art.n")
    #   (g) id (scalar:string)
    #   (h) answer (scalar:string, optional)
    #   (i) targetpos (scalar:string, "n/v/a/r")
    my $context = {};
    $context->{words}       = [];
    $context->{text}        = [];
    $context->{head}        = -1;
    $context->{target}      = -1;
    $context->{wordobjects} = [];
    $context->{lexelt} = $instance->{lexelt} if (defined $instance->{lexelt});
    $context->{id}     = $instance->{id} if (defined $instance->{id});
    $context->{answer} = $instance->{answer} if (defined $instance->{answer});
    $context->{targetpos} = $instance->{targetpos}
      if (defined $instance->{targetpos});

    # Check that the text segments exist
    if (defined $instance->{text})
    {
        foreach my $textseg (@{$instance->{text}})
        {
            push(@{$context->{text}}, $textseg);
        }
    }
    $context->{head} = $instance->{head} if (defined $instance->{head});

    # Detect compounds in the words
    my $string;
    my $done;
    my $temp;
    my $firstPointer;
    my $secondPointer;

    # Start compound detection
    $firstPointer = 0;
    $string       = "";

    while ($firstPointer <= $#{$instance->{words}})
    {
        $secondPointer = (
                            ($#{$instance->{words}} > ($firstPointer + 7))
                          ? ($firstPointer + 7)
                          : ($#{$instance->{words}})
        );
        $done = 0;
        while ($secondPointer > $firstPointer && !$done)
        {
            $temp =
              join("_", @{$instance->{words}}[$firstPointer .. $secondPointer]);
            if (defined $wntools->{compounds}->{$temp})
            {
                push(@{$context->{words}}, $temp);
                push(
                     @{$context->{wordobjects}},
                     WordNet::SenseRelate::Word->new($temp)
                );
                $context->{target} = scalar(@{$context->{words}}) - 1
                  if (   defined $instance->{target}
                      && $instance->{target} >= $firstPointer
                      && $instance->{target} <= $secondPointer);
                $done = 1;
            }
            else
            {
                $secondPointer--;
            }
        }
        if (!$done)
        {
            push(@{$context->{words}}, $instance->{words}->[$firstPointer]);
            push(
                 @{$context->{wordobjects}},
                 WordNet::SenseRelate::Word->new(
                                             $instance->{words}->[$firstPointer]
                 )
            );
            $context->{target} = scalar(@{$context->{words}}) - 1
              if (defined $instance->{target}
                  && $instance->{target} == $firstPointer);
        }
        $firstPointer = $secondPointer + 1;
    }

    return $context;
}

# Get the trace string, and reset the trace
sub getTraceString
{
    my $self = shift;
    return ""
      if (   !defined $self
          || !ref($self)
          || ref($self) ne "WordNet::SenseRelate::Preprocess::Compounds");
    my $returnString = "";
    $returnString = $self->{tracestring} if (defined $self->{tracestring});
    $self->{tracestring} = "";
    return $returnString;
}

1;


__END__


=head1 NAME

WordNet::SenseRelate::Preprocess::Compounds - Perl module that detects compounds in a piece of text.

=head1 SYNOPSIS

  use WordNet::SenseRelate::Preprocess::Compounds;

  $preprocess = WordNet::SenseRelate::Preprocess::Compounds->new($wntools);

  $newInstance = $preprocess->preprocess($instance);

=head1 DESCRIPTION

This module is a preprocessor module for the WordNet::SenseRelate::TargetWord algorithm. It 
take as input an instance, and detects WordNet compounds within this instance.

=head2 EXPORT

None by default.

=head1 SEE ALSO

perl(1)

WordNet::SenseRelate::TargetWord

=head1 AUTHOR

Siddharth Patwardhan, sidd at cs.utah.edu

Satanjeev Banerjee, banerjee+ at cs.cmu.edu

Ted Pedersen, tpederse at d.umn.edu

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2005 by Siddharth Patwardhan, Satanjeev Banerjee and Ted Pedersen

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.3 or,
at your option, any later version of Perl 5 you may have available.

=cut