The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl

# intersections.pl - extract and count the common ngrams from two texts

# Eric Lease Morgan <eric_morgan@infomotions.com>
# September 11, 2010 - first investigations
# September 12, 2010 - made it more general


# configure
use constant TEXTONE => '../etc/walden.txt';
use constant TEXTTWO => '../etc/rivers.txt';
use constant LENGTH  => 10;

# require
use lib '../lib';
use strict;
use Lingua::EN::Ngram;

# get input and sanity check
my $length = $ARGV[ 0 ];
if ( ! $length ) {

	print "Usage: $0 <integer>\n";
	exit;
	
}


# build corpus
my $textone = Lingua::EN::Ngram->new( file => TEXTONE );
my $texttwo = Lingua::EN::Ngram->new( file => TEXTTWO );
my $corpus  = Lingua::EN::Ngram->new;

# calculate intersections
my $intersections = $corpus->intersection( corpus => [ ( $textone, $texttwo ) ], length => $length );

# process each intersection
print 'Top ', LENGTH, " $length-gram phrases common to both ", TEXTONE, ' and ', TEXTTWO, ":\n";
my $index = 0;
foreach ( sort { $$intersections{ $b } <=> $$intersections{ $a }} keys %$intersections ) {

	# skip punctuation
	next if ( $_ =~ /[,.?!:;()\-]/ );
	next if ( $_ =~ /^'/ or $_ =~ /' / );
	
	# increment
	$index++;
	last if ( $index > LENGTH );
	
	# print summary
	print $$intersections{ $_ }, "\t$_\n";
	
}

# done
exit;