@@ -1,5 +1,20 @@
Revision history for Perl module Text::Levenshtein
+0.12 2014-10-26 NEILB
+ - Require Unicode::Collate 1.04 or higher. Tests on early Perls were
+ failing, most often down to Unicode::Collate 0.52, on Perl 5.8.9.
+
+0.11 2014-10-26 NEILB
+ - The first implementation of the ignore_diacritics option was
+ very inefficient. It's now at least an order of magnitude faster,
+ if calling it multiple times.
+
+0.10 2014-10-24 NEILB
+
+ - Added { ignore_diacritics => 1 } option, which can be passed as the
+ last argument to the distance() and fastdistance() subs.
+ OALDERS++ for the suggestion in RT#97883.
+
0.09 2014-04-26 NEILB
- Added Text::Levenshtein::TestUtils in t/lib/ to make it easy to
@@ -1,4 +1,4 @@
-# This file was automatically generated by Dist::Zilla::Plugin::Manifest v5.015.
+# This file was automatically generated by Dist::Zilla::Plugin::Manifest v5.020.
Changes
LICENSE
MANIFEST
@@ -15,4 +15,5 @@ t/20-swedish.t
t/21-russian.t
t/22-japanese.t
t/23-greek.t
+t/24-ignore-diacritics.t
t/lib/Text/Levenshtein/TestUtils.pm
@@ -11,9 +11,9 @@ build_requires:
parent: '0'
utf8: '0'
configure_requires:
- ExtUtils::MakeMaker: '6.30'
+ ExtUtils::MakeMaker: '0'
dynamic_config: 0
-generated_by: 'Dist::Zilla version 5.015, CPAN::Meta::Converter version 2.133380'
+generated_by: 'Dist::Zilla version 5.020, CPAN::Meta::Converter version 2.141170'
license: perl
meta-spec:
url: http://module-build.sourceforge.net/META-spec-v1.4.html
@@ -26,10 +26,11 @@ requires:
Carp: '0'
Exporter: '0'
List::Util: '0'
+ Unicode::Collate: '1.04'
perl: '5.006'
strict: '0'
warnings: '0'
resources:
homepage: https://github.com/neilbowers/Text-Levenshtein
repository: https://github.com/neilbowers/Text-Levenshtein.git
-version: '0.09'
+version: '0.12'
@@ -1,20 +1,19 @@
-# This file was automatically generated by Dist::Zilla::Plugin::MakeMaker v5.015.
+# This file was automatically generated by Dist::Zilla::Plugin::MakeMaker v5.020.
use strict;
use warnings;
use 5.006;
-use ExtUtils::MakeMaker 6.30;
+use ExtUtils::MakeMaker ;
my %WriteMakefileArgs = (
"ABSTRACT" => "calculate the Levenshtein edit distance between two strings",
"AUTHOR" => "Dree Mistrut <dree\@friul.it>, Josh Goldberg <jgoldberg\@cpan.org>, Neil Bowers <neilb\@cpan.org>",
- "BUILD_REQUIRES" => {},
"CONFIGURE_REQUIRES" => {
- "ExtUtils::MakeMaker" => "6.30"
+ "ExtUtils::MakeMaker" => 0
},
"DISTNAME" => "Text-Levenshtein",
"EXE_FILES" => [],
@@ -24,6 +23,7 @@ my %WriteMakefileArgs = (
"Carp" => 0,
"Exporter" => 0,
"List::Util" => 0,
+ "Unicode::Collate" => "1.04",
"strict" => 0,
"warnings" => 0
},
@@ -34,7 +34,7 @@ my %WriteMakefileArgs = (
"parent" => 0,
"utf8" => 0
},
- "VERSION" => "0.09",
+ "VERSION" => "0.12",
"test" => {
"TESTS" => "t/*.t"
}
@@ -46,6 +46,7 @@ my %FallbackPrereqs = (
"Exporter" => 0,
"List::Util" => 0,
"Test::More" => "0.88",
+ "Unicode::Collate" => "1.04",
"lib" => 0,
"open" => 0,
"parent" => 0,
@@ -6,13 +6,15 @@ license = Perl_5
copyright_holder = Dree Mistrut
copyright_year = 2002
-version = 0.09
+version = 0.12
[@Filter]
-bundle = @Basic
-remove = Readme
[PkgVersion]
[AutoPrereqs]
+[Prereqs]
+Unicode::Collate = 1.04
[MetaNoIndex]
package = Text::Levenshtein::TestUtils
[GithubMeta]
@@ -1,5 +1,5 @@
package Text::Levenshtein;
-$Text::Levenshtein::VERSION = '0.09';
+$Text::Levenshtein::VERSION = '0.12';
use 5.006;
use strict;
use warnings;
@@ -15,26 +15,53 @@ our %EXPORT_TAGS = ();
sub distance
{
+ my $opt = pop(@_) if @_ > 0 && ref($_[-1]) eq 'HASH';
croak "distance() takes 2 or more arguments" if @_ < 2;
my ($s,@t)=@_;
my @results;
+ $opt = {} if not defined $opt;
+
foreach my $t (@t) {
- push(@results, fastdistance($s, $t));
+ push(@results, fastdistance($s, $t, $opt));
}
return wantarray ? @results : $results[0];
}
+my $eq_with_diacritics = sub {
+ my ($x, $y) = @_;
+ return $x eq $y;
+};
+
+my $eq_without_diacritics;
+
# This is the "Iterative with two matrix rows" version
# from the wikipedia page
# http://en.wikipedia.org/wiki/Levenshtein_distance#Computing_Levenshtein_distance
sub fastdistance
{
- croak "fastdistance() takes exactly 2 arguments" unless @_ == 2;
+ my $opt = pop(@_) if @_ > 0 && ref($_[-1]) eq 'HASH';
+ croak "fastdistance() takes 2 or 3 arguments" unless @_ == 2;
my ($s, $t) = @_;
my (@v0, @v1);
my ($i, $j);
+ my $eq;
+
+ $opt = {} if not defined $opt;
+ if ($opt->{ignore_diacritics}) {
+ if (not defined $eq_without_diacritics) {
+ require Unicode::Collate;
+ my $collator = Unicode::Collate->new(normalization => undef, level => 1);
+ $eq_without_diacritics = sub {
+ return $collator->eq(@_);
+ };
+ }
+ $eq = $eq_without_diacritics;
+ }
+ else {
+ $eq = $eq_with_diacritics;
+ }
return 0 if $s eq $t;
return length($s) if !$t || length($t) == 0;
@@ -51,7 +78,8 @@ sub fastdistance
$v1[0] = $i + 1;
for ($j = 0; $j < $t_length; $j++) {
- my $cost = substr($s, $i, 1) eq substr($t, $j, 1) ? 0 : 1;
+ # my $cost = substr($s, $i, 1) eq substr($t, $j, 1) ? 0 : 1;
+ my $cost = $eq->(substr($s, $i, 1), substr($t, $j, 1)) ? 0 : 1;
$v1[$j + 1] = List::Util::min(
$v1[$j] + 1,
$v0[$j + 1] + 1,
@@ -71,6 +99,8 @@ sub fastdistance
__END__
+=encoding UTF8
+
=head1 NAME
Text::Levenshtein - calculate the Levenshtein edit distance between two strings
@@ -126,6 +156,24 @@ but they now run the same function to calculate the edit distance.
Unlike C<distance()>, C<fastdistance()> only takes two strings,
and returns the edit distance between them.
+=head1 ignore_diacritics
+
+Both the C<distance()> and C<fastdistance()> functions can take
+a hashref with optional arguments, as the final argument.
+At the moment the only option is C<ignore_diacritics>.
+If this is true, then any diacritics are ignored when calculating
+edit distance. For example, "cafe" and "café" normally have an edit
+distance of 1, but when diacritics are ignored, the distance will be 0:
+
+ use Text::Levenshtein 0.11 qw/ distance /;
+ $distance = distance($word1, $word2, {ignore_diacritics => 1});
+
+If you turn on this option, then L<Unicode::Collate> will be loaded,
+and used when comparing characters in the words.
+
+Early version of C<Text::Levenshtein> didn't support this version,
+so you should require version 0.11 or later, as above.
+
=head1 SEE ALSO
There are many different modules on CPAN for calculating the edit
@@ -16,14 +16,14 @@ ok($@ && $@ =~ m!takes 2 or more arguments!,
"passing one argument to distance() should croak");
eval { $distance = fastdistance() };
-ok($@ && $@ =~ m!takes exactly 2 arguments!,
+ok($@ && $@ =~ m!takes 2 or 3 arguments!,
"passing no arguments to fastdistance() should croak");
eval { $distance = fastdistance('pink') };
-ok($@ && $@ =~ m!takes exactly 2 arguments!,
+ok($@ && $@ =~ m!takes 2 or 3 arguments!,
"passing one argument to fastdistance() should croak");
eval { $distance = fastdistance('pink', 'blue', 'brown') };
-ok($@ && $@ =~ m!takes exactly 2 arguments!,
+ok($@ && $@ =~ m!takes 2 or 3 arguments!,
"passing three argument to fastdistance() should croak");
@@ -0,0 +1,27 @@
+#!perl
+
+use strict;
+use warnings;
+use utf8;
+
+use lib 't/lib';
+use Text::Levenshtein::TestUtils qw/ run_data_tests /;
+
+run_data_tests({ ignore_diacritics => 1 });
+
+__DATA__
+låsa
+läsa
+0
+--
+mån
+man
+0
+--
+mått
+mätt
+0
+--
+café
+cafe
+--
@@ -16,9 +16,13 @@ our @EXPORT_OK = qw/ run_data_tests /;
sub run_data_tests
{
+ my $opt = {};
my $package = (caller(0))[0];
my $distance;
my $fh;
+ my @extra;
+
+ @extra = @_;
$fh = do {
no strict 'refs';
@@ -30,19 +34,19 @@ sub run_data_tests
plan tests => 4 * @tests;
foreach my $test (@tests) {
- $distance = distance($test->{word1}, $test->{word2});
+ $distance = distance($test->{word1}, $test->{word2}, @extra);
ok($distance == $test->{distance},
"$test->{title} (distance)");
- $distance = distance($test->{word2}, $test->{word1});
+ $distance = distance($test->{word2}, $test->{word1}, @extra);
ok($distance == $test->{distance},
"$test->{title} (reverse distance)");
- $distance = fastdistance($test->{word1}, $test->{word2});
+ $distance = fastdistance($test->{word1}, $test->{word2}, @extra);
ok($distance == $test->{distance},
"$test->{title} (fastdistance)");
- $distance = fastdistance($test->{word2}, $test->{word1});
+ $distance = fastdistance($test->{word2}, $test->{word1}, @extra);
ok($distance == $test->{distance},
"$test->{title} (reverse fastdistance)");
}