lib/Text/OverlapFinder.pm

package Text::OverlapFinder;

use strict;
use warnings;

our @ISA = ();
our $VERSION = '0.05';

use constant MARKER => '###';

# manually patched with the following :
# https://rt.cpan.org/Public/Ticket/Attachment/999948/520850

##sub contains(\@@);
##sub containsReplace(\@@);

sub contains(\@$@);
sub containsReplace(\@$@);

## stemmer support not available as yet

my $stopregex = "";
my %stemmer;

# new (stoplist => $stoplist, stemmer => 1)
sub new
{
    my $class = shift;
    $class = ref $class || $class;
    my $self = bless [], $class;
    
    my $stoplist;
    my $stemmer;
    while (scalar @_) {
	my $arg = shift;
	if ($arg =~ /stoplist/i) {
	    $stoplist = shift;
	    if (-z $stoplist) {
		die "'$stoplist' is not a stoplist file";
	    }
	}
	elsif ($arg =~ /stemmer/i) {
	    $stemmer = shift; 
	    unless (ref $stemmer) {
		die "'$stemmer' is not a reference to a stemmer object";
	    }
	}
	else {
	    die "Unknown argument '$arg'";
	}
    }

    # stemming
    # stoplist
    if (defined $stoplist) {
	$self->_loadStoplist ($stoplist);
    }

    if (defined $stemmer) {
	warn "Stemmer defined but ignored";
    }

    return $self;
}

sub DESTROY
{
    my $self = shift;
    delete $stemmer{$self};
}

sub doStop {0}

# originally adapted from a function in string_compare.pm 
# (distributed with earlier versions of WordNet::Similarity)
# now WordNet::Similarity uses Text::Similarity and no longer
# includes string_compare.pm
sub getOverlaps
{
    my $self = shift;
    my $string0 = shift;
    my $string1 = shift;

    my %overlapsHash = ();

    $string0 =~ s/^\s+//;
    $string0 =~ s/\s+$//;
    $string1 =~ s/^\s+//;
    $string1 =~ s/\s+$//;


	if ($stopregex ne "")
	{
    	$string0 = $self->_removeStopWords ($string0);
    	$string1 = $self->_removeStopWords ($string1);
	}

    # if stemming on, stem the two strings
    my $stemmingReqd = 0;
    if ($stemmingReqd)
    {
	my $stemmer = bless [];
        $string0 = $stemmer->stemString($string0, 1); # 1 turns on caching
        $string1 = $stemmer->stemString($string1, 1);
    }

    my @words0 = split /\s+/, $string0;
    my @words1 = split /\s+/, $string1;

    my %first;
    foreach my $offset (0 .. $#words1) {
       push @{$first{$words1[$offset]}}, $offset;
    }

    my $wc0 = scalar @words0;
    my $wc1 = scalar @words1;

    # for each word in string0, find out how long an overlap can start from it.
    my @overlapsLengths = ();
    my $matchStartIndex = 0;
    my $currIndex = -1;

    while ($currIndex < $#words0)
    {
        # forward the current index to look at the next word
        $currIndex++;

        # if this works, carry on!
#        if (contains (@words1, @words0[$matchStartIndex..$currIndex])) {
	 if (contains (@words1, $first{$words0[$matchStartIndex]},@words0[$matchStartIndex..$currIndex])) {
	    next
	}
	else {
	    # XXX shouldn't this be $currIndex - $matchStartIndex + 1 ?
	    $overlapsLengths[$matchStartIndex] = $currIndex - $matchStartIndex;
	    $currIndex-- if ($overlapsLengths[$matchStartIndex] > 0);
	    $matchStartIndex++;
	}
    }

    for (my $i = $matchStartIndex; $i <= $currIndex; $i++)
    {
        $overlapsLengths[$i] = $currIndex - $i + 1;
    }

    my ($longestOverlap) = sort {$b <=> $a} @overlapsLengths;

    while (defined($longestOverlap) && ($longestOverlap > 0))
    {
        for (my $i = 0; $i <= $#overlapsLengths; $i++)
        {
            next if ($overlapsLengths[$i] < $longestOverlap);

            # form the string
            my $stringEnd = $i + $longestOverlap - 1;

            # check if still there in $string1. replace in string1 with a mark

            if (1 #!doStop($temp)
##		&& containsReplace (@words1, @words0[$i..$stringEnd]))
	        && exists $first{$words0[$i]}
		&& containsReplace (@words1, $first{$words0[$i]}, @words0[$i..$stringEnd]))
            {
                # so its still there. we have an overlap!
		my $temp = join (" ", @words0[$i..$stringEnd]);
                $overlapsHash{$temp}++;

                # adjust overlap lengths forward
                for (my $j = $i; $j < $i + $longestOverlap; $j++)
                {
                    $overlapsLengths[$j] = 0;
                }

                # adjust overlap lengths backward
                for (my $j = $i-1; $j >= 0; $j--)
                {
                    last if ($overlapsLengths[$j] <= $i - $j);
                    $overlapsLengths[$j] = $i - $j;
                }
            }
            else
	    {
                # ah its not there any more in string1! see if
                # anything smaller than the full string works
                my $k = $longestOverlap - 1;
                while ($k > 0)
                {
                    # form the string
                    my $stringEnd = $i + $k - 1;
##		    last if contains (@words1, @words0[$i..$stringEnd]);
		     last if contains (@words1, $first{$words0[$i]}, @words0[$i..$stringEnd]);

                    $k--;
                }

                $overlapsLengths[$i] = $k;
            }
        }
        ($longestOverlap) = sort {$b <=> $a} @overlapsLengths;
    }

    return (\%overlapsHash, $wc0, $wc1);
}

# returns true if the first array contains the list, otherwise returns false
# See also containsReplace()
# e.g., contains (@Array, LIST);
##sub contains (\@@)
sub contains (\@$@)
{
    my $array2_ref = shift;

    my $positions = shift;
    return 0 if (not defined $positions);

    my @array1 = @_;

    return 0 if $#{$array2_ref} < $#array1;

##    for my $j (0..($#{$array2_ref} - $#array1)) {
    for my $j (@$positions) {
        next if ($j > $#{$array2_ref} - $#array1);

	next if $array2_ref->[$j] eq MARKER;

	if ($array1[0] eq $array2_ref->[$j]) {
	    my $match = 1;
	    for my $i (1..$#array1) {
		if ($array2_ref->[$j + $i] eq MARKER
		    or $array1[$i] ne $array2_ref->[$j + $i]) {
		    $match = 0;
		    last;
		}
	    }
	    if ($match) {
		return 1;
	    }
	}
    }
    
    return 0;
}

# same functionality as contains(), but replaces each word in the match
# with the constant MARKER
##sub containsReplace (\@@)
sub containsReplace (\@$@)

{
    my $array2_ref = shift;

    my $positions = shift;
    return 0 if (not defined $positions);

    my @array1 = @_;

    return 0 if $#{$array2_ref} < $#array1;

  #  for my $j (0..($#{$array2_ref} - $#array1)) {

	for my $j (@$positions) {
        next if ($j > $#{$array2_ref} - $#array1);

	next if $array2_ref->[$j] eq MARKER;

	if ($array1[0] eq $array2_ref->[$j]) {
	    my $match = 1;
	    for my $i (1..$#array1) {
		if ($array2_ref->[$j + $i] eq MARKER
		    or $array1[$i] ne $array2_ref->[$j + $i]) {
		    $match = 0;
		    last;
		}
	    }
	    
	    # match found, remove match and return true
	    if ($match) {
		for my $k ($j..($j+$#array1)) {
		    $array2_ref->[$k] = MARKER;
		}
		return 1;
	    }
	}
    }
   
    # no match found
    return 0;
}

sub _removeStopWords
{
    my $self = shift;
    my $str = shift;
    my @words = split /\s+/, $str;
    my @newwords;
    foreach my $word (@words) {
		if(!($word =~ /$stopregex/))
        {
			push (@newwords, $word); 
        }
    }
    return join (' ', @newwords);
}

sub _loadStoplist
{
    my $self = shift;
    my $list = shift;
    open FH, '<', $list or die "Cannot open stoplist file '$list': $!";
  
	$stopregex = "(";
    while (<FH>) {
		chomp;
		if ($_ ne "")
		{	
			$_=~s/\///g;
			if ($_=~m/\\b/)
			{
				$stopregex .= "$_|";
			}
			else
			{
				my $word = "\\b"."$_"."\\b";
				$stopregex .= "$word|";
			}
		}
    }
	chop $stopregex; $stopregex .= ")";
    close FH;
}


1;

__END__

=head1 NAME

Text::OverlapFinder - Find Overlapping Words in Strings

=head1 SYNOPSIS

    # this will list out the overlaps found in two strings
    # note that the overlaps are found among space separated
    # tokens, there are no partial word matches
    # ('cat' will not match 'at' or 'cats', for example)

    use Text::OverlapFinder;
    my $finder = Text::OverlapFinder->new;
    defined $finder or die "Construction of Text::OverlapFinder failed";

    my $string1 = 'aaa bbb ccc ddd eee';
    my $string2 = 'aa bbb ccc dd ee aaa';

    # overlaps is a hash of references to the overlaps found
    # len1 and len2 are the lengths of the strings in terms of words

    my ($overlaps, $len1, $len2) = $finder->getOverlaps ($string1, $string2); 
    foreach my $overlap (keys %$overlaps) {
        print "$overlap occurred $overlaps->{$overlap} times.\n";
    }
    print "length of string 1 = $len1 length of string 2 = $len2\n";

=head1 DESCRIPTION

This module finds word overlaps in strings. It finds the longest 
possible overlap, and keeps track of how many time each overlap occurs.

There is a mechanism available for a user to provide a stemming module, 
but no stemmer is provided by this package as yet. 

=head1 AUTHORS

 Ted Pedersen, University of Minnesota, Duluth
 tpederse at d.umn.edu

 Siddharth Patwardhan, University of Utah
 sidd at cs.utah.edu

 Satanjeev Banerjee, Carnegie-Mellon University
 banerjee at cs.cmu.edu

 Jason Michelizzi 

 Ying Liu, University of Minnesota, Twin Cities
 liux0395 at umn.edu

Last modified by:
$Id: OverlapFinder.pm,v 1.4 2015/10/08 13:06:27 tpederse Exp $

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2004-2010 by Jason Michelizzi, Ted Pedersen, Siddharth 
Patwardhan, Satanjeev Banerjee and Ying Liu

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)