The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
Changes 011
MANIFEST 12
META.yml 34
Makefile.PL 56
dist.ini 11
lib/Text/Levenshtein.pm 452
t/04-invalid-inputs.t 33
t/24-ignore-diacritics.t 027
t/lib/Text/Levenshtein/TestUtils.pm 48
9 files changed (This is a version diff) 21114
@@ -1,5 +1,16 @@
 Revision history for Perl module Text::Levenshtein
 
+0.11 2014-10-26 NEILB
+    - The first implementation of the ignore_diacritics option was
+      very inefficient. It's now at least an order of magnitude faster,
+      if calling it multiple times.
+
+0.10 2014-10-24 NEILB
+
+    - Added { ignore_diacritics => 1 } option, which can be passed as the
+      last argument to the distance() and fastdistance() subs.
+      OALDERS++ for the suggestion in RT#97883.
+
 0.09 2014-04-26 NEILB
 
     - Added Text::Levenshtein::TestUtils in t/lib/ to make it easy to
@@ -1,4 +1,4 @@
-# This file was automatically generated by Dist::Zilla::Plugin::Manifest v5.015.
+# This file was automatically generated by Dist::Zilla::Plugin::Manifest v5.020.
 Changes
 LICENSE
 MANIFEST
@@ -15,4 +15,5 @@ t/20-swedish.t
 t/21-russian.t
 t/22-japanese.t
 t/23-greek.t
+t/24-ignore-diacritics.t
 t/lib/Text/Levenshtein/TestUtils.pm
@@ -11,9 +11,9 @@ build_requires:
   parent: '0'
   utf8: '0'
 configure_requires:
-  ExtUtils::MakeMaker: '6.30'
+  ExtUtils::MakeMaker: '0'
 dynamic_config: 0
-generated_by: 'Dist::Zilla version 5.015, CPAN::Meta::Converter version 2.133380'
+generated_by: 'Dist::Zilla version 5.020, CPAN::Meta::Converter version 2.141170'
 license: perl
 meta-spec:
   url: http://module-build.sourceforge.net/META-spec-v1.4.html
@@ -26,10 +26,11 @@ requires:
   Carp: '0'
   Exporter: '0'
   List::Util: '0'
+  Unicode::Collate: '0'
   perl: '5.006'
   strict: '0'
   warnings: '0'
 resources:
   homepage: https://github.com/neilbowers/Text-Levenshtein
   repository: https://github.com/neilbowers/Text-Levenshtein.git
-version: '0.09'
+version: '0.11'
@@ -1,20 +1,19 @@
 
-# This file was automatically generated by Dist::Zilla::Plugin::MakeMaker v5.015.
+# This file was automatically generated by Dist::Zilla::Plugin::MakeMaker v5.020.
 use strict;
 use warnings;
 
 use 5.006;
 
-use ExtUtils::MakeMaker 6.30;
+use ExtUtils::MakeMaker ;
 
 
 
 my %WriteMakefileArgs = (
   "ABSTRACT" => "calculate the Levenshtein edit distance between two strings",
   "AUTHOR" => "Dree Mistrut <dree\@friul.it>, Josh Goldberg <jgoldberg\@cpan.org>, Neil Bowers <neilb\@cpan.org>",
-  "BUILD_REQUIRES" => {},
   "CONFIGURE_REQUIRES" => {
-    "ExtUtils::MakeMaker" => "6.30"
+    "ExtUtils::MakeMaker" => 0
   },
   "DISTNAME" => "Text-Levenshtein",
   "EXE_FILES" => [],
@@ -24,6 +23,7 @@ my %WriteMakefileArgs = (
     "Carp" => 0,
     "Exporter" => 0,
     "List::Util" => 0,
+    "Unicode::Collate" => 0,
     "strict" => 0,
     "warnings" => 0
   },
@@ -34,7 +34,7 @@ my %WriteMakefileArgs = (
     "parent" => 0,
     "utf8" => 0
   },
-  "VERSION" => "0.09",
+  "VERSION" => "0.11",
   "test" => {
     "TESTS" => "t/*.t"
   }
@@ -46,6 +46,7 @@ my %FallbackPrereqs = (
   "Exporter" => 0,
   "List::Util" => 0,
   "Test::More" => "0.88",
+  "Unicode::Collate" => 0,
   "lib" => 0,
   "open" => 0,
   "parent" => 0,
@@ -6,7 +6,7 @@ license = Perl_5
 copyright_holder = Dree Mistrut
 copyright_year   = 2002
 
-version = 0.09
+version = 0.11
 
 [@Filter]
 -bundle = @Basic
@@ -1,5 +1,5 @@
 package Text::Levenshtein;
-$Text::Levenshtein::VERSION = '0.09';
+$Text::Levenshtein::VERSION = '0.11';
 use 5.006;
 use strict;
 use warnings;
@@ -15,26 +15,53 @@ our %EXPORT_TAGS = ();
 
 sub distance
 {
+    my $opt = pop(@_) if @_ > 0 && ref($_[-1]) eq 'HASH';
     croak "distance() takes 2 or more arguments" if @_ < 2;
 	my ($s,@t)=@_;
     my @results;
 
+    $opt = {} if not defined $opt;
+
 	foreach my $t (@t) {
-		push(@results, fastdistance($s, $t));
+		push(@results, fastdistance($s, $t, $opt));
 	}
 
 	return wantarray ? @results : $results[0];
 }
 
+my $eq_with_diacritics = sub {
+    my ($x, $y) = @_;
+    return $x eq $y;
+};
+
+my $eq_without_diacritics;
+
 # This is the "Iterative with two matrix rows" version
 # from the wikipedia page
 # http://en.wikipedia.org/wiki/Levenshtein_distance#Computing_Levenshtein_distance
 sub fastdistance
 {
-    croak "fastdistance() takes exactly 2 arguments" unless @_ == 2;
+    my $opt = pop(@_) if @_ > 0 && ref($_[-1]) eq 'HASH';
+    croak "fastdistance() takes 2 or 3 arguments" unless @_ == 2;
     my ($s, $t) = @_;
     my (@v0, @v1);
     my ($i, $j);
+    my $eq;
+
+    $opt = {} if not defined $opt;
+    if ($opt->{ignore_diacritics}) {
+        if (not defined $eq_without_diacritics) {
+            require Unicode::Collate;
+            my $collator = Unicode::Collate->new(normalization => undef, level => 1);
+            $eq_without_diacritics = sub {
+                return $collator->eq(@_);
+            };
+        }
+        $eq = $eq_without_diacritics;
+    }
+    else {
+        $eq = $eq_with_diacritics;
+    }
 
     return 0 if $s eq $t;
     return length($s) if !$t || length($t) == 0;
@@ -51,7 +78,8 @@ sub fastdistance
         $v1[0] = $i + 1;
 
         for ($j = 0; $j < $t_length; $j++) {
-            my $cost = substr($s, $i, 1) eq substr($t, $j, 1) ? 0 : 1;
+            # my $cost = substr($s, $i, 1) eq substr($t, $j, 1) ? 0 : 1;
+            my $cost = $eq->(substr($s, $i, 1), substr($t, $j, 1)) ? 0 : 1;
             $v1[$j + 1] = List::Util::min(
                               $v1[$j] + 1,
                               $v0[$j + 1] + 1,
@@ -71,6 +99,8 @@ sub fastdistance
 
 __END__
 
+=encoding UTF8
+
 =head1 NAME
 
 Text::Levenshtein - calculate the Levenshtein edit distance between two strings
@@ -126,6 +156,24 @@ but they now run the same function to calculate the edit distance.
 Unlike C<distance()>, C<fastdistance()> only takes two strings,
 and returns the edit distance between them.
 
+=head1 ignore_diacritics
+
+Both the C<distance()> and C<fastdistance()> functions can take
+a hashref with optional arguments, as the final argument.
+At the moment the only option is C<ignore_diacritics>.
+If this is true, then any diacritics are ignored when calculating
+edit distance. For example, "cafe" and "café" normally have an edit
+distance of 1, but when diacritics are ignored, the distance will be 0:
+
+ use Text::Levenshtein 0.11 qw/ distance /;
+ $distance = distance($word1, $word2, {ignore_diacritics => 1});
+
+If you turn on this option, then L<Unicode::Collate> will be loaded,
+and used when comparing characters in the words.
+
+Early version of C<Text::Levenshtein> didn't support this version,
+so you should require version 0.11 or later, as above.
+
 =head1 SEE ALSO
 
 There are many different modules on CPAN for calculating the edit
@@ -16,14 +16,14 @@ ok($@ && $@ =~ m!takes 2 or more arguments!,
    "passing one argument to distance() should croak");
 
 eval { $distance = fastdistance() };
-ok($@ && $@ =~ m!takes exactly 2 arguments!,
+ok($@ && $@ =~ m!takes 2 or 3 arguments!,
    "passing no arguments to fastdistance() should croak");
 
 eval { $distance = fastdistance('pink') };
-ok($@ && $@ =~ m!takes exactly 2 arguments!,
+ok($@ && $@ =~ m!takes 2 or 3 arguments!,
    "passing one argument to fastdistance() should croak");
 
 eval { $distance = fastdistance('pink', 'blue', 'brown') };
-ok($@ && $@ =~ m!takes exactly 2 arguments!,
+ok($@ && $@ =~ m!takes 2 or 3 arguments!,
    "passing three argument to fastdistance() should croak");
 
@@ -0,0 +1,27 @@
+#!perl
+
+use strict;
+use warnings;
+use utf8;
+
+use lib 't/lib';
+use Text::Levenshtein::TestUtils qw/ run_data_tests /;
+
+run_data_tests({ ignore_diacritics => 1 });
+
+__DATA__
+låsa
+läsa
+0
+--
+mån
+man
+0
+--
+mått
+mätt
+0
+--
+café
+cafe
+--
@@ -16,9 +16,13 @@ our @EXPORT_OK = qw/ run_data_tests /;
 
 sub run_data_tests
 {
+    my $opt = {};
     my $package = (caller(0))[0];
     my $distance;
     my $fh;
+    my @extra;
+
+    @extra = @_;
 
     $fh = do {
         no strict 'refs';
@@ -30,19 +34,19 @@ sub run_data_tests
     plan tests => 4 * @tests;
 
     foreach my $test (@tests) {
-        $distance = distance($test->{word1}, $test->{word2});
+        $distance = distance($test->{word1}, $test->{word2}, @extra);
         ok($distance == $test->{distance},
            "$test->{title} (distance)");
 
-        $distance = distance($test->{word2}, $test->{word1});
+        $distance = distance($test->{word2}, $test->{word1}, @extra);
         ok($distance == $test->{distance},
            "$test->{title} (reverse distance)");
 
-        $distance = fastdistance($test->{word1}, $test->{word2});
+        $distance = fastdistance($test->{word1}, $test->{word2}, @extra);
         ok($distance == $test->{distance},
            "$test->{title} (fastdistance)");
 
-        $distance = fastdistance($test->{word2}, $test->{word1});
+        $distance = fastdistance($test->{word2}, $test->{word1}, @extra);
         ok($distance == $test->{distance},
            "$test->{title} (reverse fastdistance)");
     }