The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Text::OverlapFinder;

# Text::OverlapFinder

# Copyright (C) 2004 by Jason Michelizzi, Ted Pedersen, 
# Siddharth Patwardhan, and Satanjeev Banerjee

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

use strict;
use warnings;

our @ISA = ();
our $VERSION = '0.02';

use constant MARKER => '###';

sub contains(\@@);
sub containsReplace(\@@);

my %stoplist;
my %stemmer;

# new (stoplist => $stoplist, stemmer => 1)
sub new
{
    my $class = shift;
    $class = ref $class || $class;
    my $self = bless [], $class;
    
    my $stoplist;
    my $stemmer;
    while (scalar @_) {
	my $arg = shift;
	if ($arg =~ /stoplist/i) {
	    $stoplist = shift;
	    if (-z $stoplist) {
		die "'$stoplist' is not a stoplist file";
	    }
	}
	elsif ($arg =~ /stemmer/i) {
	    $stemmer = shift; 
	    unless (ref $stemmer) {
		die "'$stemmer' is not a reference to a stemmer object";
	    }
	}
	else {
	    die "Unknown argument '$arg'";
	}
    }

    # stemming
    # stoplist
    if (defined $stoplist) {
	$self->_loadStoplist ($stoplist);
    }

    if (defined $stemmer) {
	warn "Stemmer defined but ignored";
    }

    return $self;
}

sub DESTROY
{
    my $self = shift;
    delete $stoplist{$self};
    delete $stemmer{$self};
}

sub doStop {0}

# adapted from a function in string_compare.pm (distributed with
# WordNet::Similarity)
sub getOverlaps
{
    my $self = shift;
    my $string0 = shift;
    my $string1 = shift;

    my %overlapsHash = ();

    $string0 =~ s/^\s+//;
    $string0 =~ s/\s+$//;
    $string1 =~ s/^\s+//;
    $string1 =~ s/\s+$//;

    $string0 = $self->_removeStopWords ($string0);
    $string1 = $self->_removeStopWords ($string1);

    # if stemming on, stem the two strings
    my $stemmingReqd = 0;
    if ($stemmingReqd)
    {
	my $stemmer = bless [];
        $string0 = $stemmer->stemString($string0, 1); # 1 turns on caching
        $string1 = $stemmer->stemString($string1, 1);
    }

    my @words0 = split /\s+/, $string0;
    my @words1 = split /\s+/, $string1;

    my $wc0 = scalar @words0;
    my $wc1 = scalar @words1;

    # for each word in string0, find out how long an overlap can start from it.
    my @overlapsLengths = ();
    my $matchStartIndex = 0;
    my $currIndex = -1;

    while ($currIndex < $#words0)
    {
        # forward the current index to look at the next word
        $currIndex++;

        # if this works, carry on!
        if (contains (@words1, @words0[$matchStartIndex..$currIndex])) {
	    next
	}
	else {
	    # XXX shouldn't this be $currIndex - $matchStartIndex + 1 ?
	    $overlapsLengths[$matchStartIndex] = $currIndex - $matchStartIndex;
	    $currIndex-- if ($overlapsLengths[$matchStartIndex] > 0);
	    $matchStartIndex++;
	}
    }

    for (my $i = $matchStartIndex; $i <= $currIndex; $i++)
    {
        $overlapsLengths[$i] = $currIndex - $i + 1;
    }

    my ($longestOverlap) = sort {$b <=> $a} @overlapsLengths;

    while (defined($longestOverlap) && ($longestOverlap > 0))
    {
        for (my $i = 0; $i <= $#overlapsLengths; $i++)
        {
            next if ($overlapsLengths[$i] < $longestOverlap);

            # form the string
            my $stringEnd = $i + $longestOverlap - 1;

            # check if still there in $string1. replace in string1 with a mark

            if (1 #!doStop($temp)
		&& containsReplace (@words1, @words0[$i..$stringEnd]))
            {
                # so its still there. we have an overlap!
		my $temp = join (" ", @words0[$i..$stringEnd]);
                $overlapsHash{$temp}++;

                # adjust overlap lengths forward
                for (my $j = $i; $j < $i + $longestOverlap; $j++)
                {
                    $overlapsLengths[$j] = 0;
                }

                # adjust overlap lengths backward
                for (my $j = $i-1; $j >= 0; $j--)
                {
                    last if ($overlapsLengths[$j] <= $i - $j);
                    $overlapsLengths[$j] = $i - $j;
                }
            }
            else
	    {
                # ah its not there any more in string1! see if
                # anything smaller than the full string works
                my $k = $longestOverlap - 1;
                while ($k > 0)
                {
                    # form the string
                    my $stringEnd = $i + $k - 1;
		    last if contains (@words1, @words0[$i..$stringEnd]);
                    $k--;
                }

                $overlapsLengths[$i] = $k;
            }
        }
        ($longestOverlap) = sort {$b <=> $a} @overlapsLengths;
    }

    return (\%overlapsHash, $wc0, $wc1);
}

# returns true of the first array contains the list, otherwise returns false
# See also containsReplace()
# e.g., contains (@Array, LIST);
sub contains (\@@)
{
    my $array2_ref = shift;
    my @array1 = @_;

    return 0 if $#{$array2_ref} < $#array1;

    for my $j (0..($#{$array2_ref} - $#array1)) {
	next if $array2_ref->[$j] eq MARKER;

	if ($array1[0] eq $array2_ref->[$j]) {
	    my $match = 1;
	    for my $i (1..$#array1) {
		if ($array2_ref->[$j + $i] eq MARKER
		    or $array1[$i] ne $array2_ref->[$j + $i]) {
		    $match = 0;
		    last;
		}
	    }
	    if ($match) {
		return 1;
	    }
	}
    }
    
    return 0;
}

# same functionality as contains(), but replaces each word in the match
# with the constant MARKER
sub containsReplace (\@@)
{
    my $array2_ref = shift;
    my @array1 = @_;

    return 0 if $#{$array2_ref} < $#array1;

    for my $j (0..($#{$array2_ref} - $#array1)) {
	next if $array2_ref->[$j] eq MARKER;

	if ($array1[0] eq $array2_ref->[$j]) {
	    my $match = 1;
	    for my $i (1..$#array1) {
		if ($array2_ref->[$j + $i] eq MARKER
		    or $array1[$i] ne $array2_ref->[$j + $i]) {
		    $match = 0;
		    last;
		}
	    }
	    
	    # match found, remove match and return true
	    if ($match) {
		for my $k ($j..($j+$#array1)) {
		    $array2_ref->[$k] = MARKER;
		}
		return 1;
	    }
	}
    }
   
    # no match found
    return 0;
}

sub _removeStopWords
{
    my $self = shift;
    my $str = shift;
    my @words = split /\s+/, $str;
    my @newwords;
    foreach my $word (@words) {
	push (@newwords, $word) unless exists $stoplist{$self}->{$word};
    }
    return join (' ', @newwords);
}

sub _loadStoplist
{
    my $self = shift;
    my $list = shift;
    open FH, '<', $list or die "Cannot open stoplist file '$list': $!";
   
    while (<FH>) {
	chomp;
	$stoplist{$self}->{$_} = 1;
    }

    close FH;
}


1;

__END__

=head1 NAME

Text::OverlapFinder - find overlaps in strings

=head1 SYNOPSIS

    use Text::OverlapFinder;
    my $finder = Text::OverlapFinder->new;
    my $overlaps = $finder->getOverlaps ($string1, $string2);
    foreach my $overlap (keys %$overlaps) {
        print "$overlap occurred $overlaps->{$overlap} times.\n";
    }

=head1 DESCRIPTION

This module is useful for efficiently finding word overlaps in strings.

=head1 AUTHORS

Jason Michelizzi, E<lt>jmichelizzi at sourceforge.netE<gt>

Ted Pedersen, E<lt>tpederse at d.umn.eduE<gt>

Siddharth Patwardhan, E<lt>sidd at cs.utah.eduE<gt>

Satanjeev Banerjee, E<lt>satanjee+ at cs.cmu.eduE<gt>

=head1 BUGS

None.

To submit a bug report, e-mail E<lt>jmichelizzi at sourceforge.netE<gt>.

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2004 by Jason Michelizzi, Ted Pedersen, Siddharth
Patwardhan, and Satanjeev Banerjee

This library is free software; you may redistribute it and/or modify
it under the terms of the GNU General Public License, version 2 or,
at your option, any later version.

=cut