bin/intersections.pl - metacpan.org

#!/usr/bin/perl

# intersections.pl - extract and count the common ngrams from two texts

# Eric Lease Morgan <eric_morgan@infomotions.com>
# September 11, 2010 - first investigations
# September 12, 2010 - made it more general


# configure
# TEXTONE / TEXTTWO are the paths of the two texts to compare;
# LENGTH is the maximum number of common phrases listed in the report
use constant {
	TEXTONE => '../etc/walden.txt',
	TEXTTWO => '../etc/rivers.txt',
	LENGTH  => 10,
};

# require
use strict;
use warnings;
use lib '../lib';
use Lingua::EN::Ngram;

# get input and sanity check
# the sole command-line argument is the size of the ngrams to compare; it
# must be a positive whole number, otherwise print usage to STDERR and exit
# with a non-zero status so calling shells can detect the misuse
my $length = $ARGV[ 0 ];
if ( ! defined $length or $length !~ /\A[0-9]+\z/ or $length < 1 ) {

	print STDERR "Usage: $0 <integer>\n";
	exit 1;
	
}

# normalize forms such as "03" to 3 so the value prints cleanly in the report
$length += 0;


# build corpus
# one ngram object per configured text, plus an empty object that is used
# only to run the comparison
my $first_text  = Lingua::EN::Ngram->new( file => TEXTONE );
my $second_text = Lingua::EN::Ngram->new( file => TEXTTWO );
my $comparer    = Lingua::EN::Ngram->new;

# calculate intersections
# the result is used below as a reference to a hash of phrase => count
my $intersections = $comparer->intersection(
	corpus => [ $first_text, $second_text ],
	length => $length
);

# process each intersection
print 'Top ', LENGTH, " $length-gram phrases common to both ", TEXTONE, ' and ', TEXTTWO, ":\n";

# rank phrases by descending count; equal counts fall back to alphabetical
# order so the report is identical from run to run (hash key order is not
# stable between runs, which would otherwise change which phrases make the cut)
my @ranked = sort {
	$$intersections{ $b } <=> $$intersections{ $a }
		or
	$a cmp $b
} keys %$intersections;

my $index = 0;
foreach my $phrase ( @ranked ) {

	# skip punctuation: any phrase containing one of , . ? ! : ; ( ) -
	next if ( $phrase =~ /[,.?!:;()\-]/ );

	# skip phrases that begin with an apostrophe or contain one followed by a space
	next if ( $phrase =~ /^'/ or $phrase =~ /' / );
	
	# increment, and stop once LENGTH phrases have been printed
	$index++;
	last if ( $index > LENGTH );
	
	# print summary: count, a tab, then the phrase
	print $$intersections{ $phrase }, "\t$phrase\n";
	
}

# done
exit;

	Global
`s`	Focus search bar
`?`	Bring up this help dialog

	GitHub
`g` `p`	Go to pull requests
`g` `i`	Go to GitHub issues (only if GitHub is the preferred repository)

	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse

	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)