The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Lingua::JA::Gairaigo::Fuzzy;
require Exporter;
@ISA = qw(Exporter);
@EXPORT_OK = qw/same_gairaigo/;
%EXPORT_TAGS = (
    all => \@EXPORT_OK,
);
use warnings;
use strict;
use Carp;
our $VERSION = 0.06;
use utf8;

use Text::Fuzzy 'fuzzy_index';
use Lingua::JA::Moji ':all';

sub same_gairaigo
{
    my ($kana, $n, $debug) = @_;
    if ($kana eq $n) {
	return 1;
    }
    if (chouon ($kana, $n)) {
	my $gotcha = usual_suspect ($kana, $n, $debug);
	if ($gotcha) {
	    return 1;
	}
    }
    return undef;
}

# Check a few likely things

sub usual_suspect
{
    my ($kana, $n, $debug) = @_;

    # The following is an undocumented routine in Text::Fuzzy.

    my ($dist, $edits) = fuzzy_index ($kana, $n, 1);

    # Is this a likely candidate?

    my $gotcha;

    if ($edits =~ /ii|dd|rr/) {

	# A double delete, double insertion, or double replace means
	# this is unlikely to be the same word.

	return;
    }
    my @kana = split //, $kana;
    my @nkana = split //, $n;
    my @edits = split //, $edits;

    if ($debug) {
	printf ("%d %d\n", scalar (@kana), scalar (@nkana));
    }

    # $i is the offset in @kana, and $j is the offset in @nkana. Note
    # that @kana and @nkana may have different lengths and the offsets
    # are adjusted as we look though what edit is necessary to change
    # "$kana" to "$n".

    my $i = 0;
    my $j = 0;

    for my $edit (@edits) {

	if ($debug) {
	    print "i = $i, j = $j, edit = $edit\n";
	}
	if ($edit eq 'r') {

	    # Replaced $k with $q.

	    my $k = $kana[$i];
	    my $q = $nkana[$j];
	    if ($debug) {
		print "Replace $k with $q\n";
	    }
	    if ($k =~ /[ーィイ]/ && $q =~ /[ーィイ]/) {

		# Check whether the previous kana ends in "e", so it
		# is something like "ヘイ" and "ヘー".

		if (ends_in_e (\@kana, $i)) {
		    $gotcha = 1;
		}
		if (($k eq 'ー' && $q eq 'イ') ||
		    ($q eq 'ー' && $k eq 'イ')) {
		    if (ends_in_i (\@kana, $i)) {
			$gotcha = 1;
		    }
		}
	    }
	    if ($k =~ /[ーッ]/ && $q =~ /[ーッ]/) {

		# A chouon has been replaced with a sokuon, or
		# vice-versa.

		$gotcha = 1;
	    }
	    if (($k eq 'ー' && $q eq 'ウ') ||
		($q eq 'ー' && $k eq 'ウ')) {
		if (ends_in_ou (\@kana, $i)) {
		    $gotcha = 1;
		}
	    }

	    # Whatever we had, increment $i and $j equally because a
	    # character was replaced.

	    $i++;
	    $j++;
	}
	elsif ($edit eq 'd') {

	    # Character $k was deleted from $kana to get $n, so we
	    # just increment $i.

	    my $k = $kana[$i];
	    if ($k eq 'ー' || $k eq '・' || $k eq 'ッ') {

		# A chouon, nakaguro, or sokuon was deleted from $kana
		# to get $n.

		$gotcha = 1;
	    }
	    # Check we are not at the end of the string.
	    if ($j < scalar (@kana)) {
		my $q = $kana[$j];
		if (! defined $q) {
		    warn "baba";
		}
		if ($q =~ /[ーィイ]/) {
		    if (ends_in_e (\@kana, $i)) {
			$gotcha = 1;
		    }
		}
	    }
	    $i++;
	}
	elsif ($edit eq 'i') {

	    # Character $k was inserted into $n, so we just increment
	    # $j, not $i.

	    my $k = $nkana[$j];
	    if ($k eq 'ー' || $k eq '・' || $k eq 'ッ') {

		# A chouon, nakaguro, or sokuon was inserted into
		# $kana to get $n.

		$gotcha = 1;
	    }
	    $j++;
	}
	elsif ($edit eq 'k') {

	    # The two strings are the same at this point, so do not do
	    # any checking but just increment the offsets.

	    $i++;
	    $j++;
	}
    }

    # Check we did not make a mistake scanning the two strings.

    if ($i != scalar @kana) {
	warn "Bug: Mismatch $i";
    }
    if ($j != scalar @nkana) {
	warn "Bug: Mismatch $j";
    }
    return $gotcha;
}

# Work out whether the kana before the one at $i ends in "e".

sub ends_in_e
{
    my ($kana_ref, $i) = @_;
    my $prev;
    if ($i >= 1) {
	$prev = $kana_ref->[$i - 1];
	$prev = kana2romaji ($prev);
	if ($prev =~ /e$/) {
	    return 1;
	}
    }
    return undef;
}

# Work out whether the kana before the one at $i ends in "ou".

sub ends_in_ou
{
    my ($kana_ref, $i) = @_;
    my $prev;
    if ($i >= 1) {
	$prev = $kana_ref->[$i - 1];
	$prev = kana2romaji ($prev);
	if ($prev =~ /[ou]$/) {
	    return 1;
	}
    }
    return undef;
}
# Work out whether the kana before the one at $i ends in "e".

sub ends_in_i
{
    my ($kana_ref, $i) = @_;
    my $prev;
    if ($i >= 1) {
	$prev = $kana_ref->[$i - 1];
	$prev = kana2romaji ($prev);
	if ($prev =~ /i$/) {
	    return 1;
	}
    }
    return undef;
}
# Work out whether $x and $y differ in the ways we expect.

# The name "chouon" is a misnomer.

sub chouon
{
    my ($x, $y) = @_;
    my %xa = alph ($x);
    my %ya = alph ($y);
    my $found;
    my $mismatch = check (\%xa, \%ya, \$found);
    if ($mismatch) {
	return undef;
    }
    $mismatch = check (\%ya, \%xa, \$found);
    if ($mismatch) {
	return undef;
    }
    if ($found) {
	return 1;
    }
    return undef;
}

# Given a word $x, make an alphabet of its consituent letters.

sub alph
{
    my ($x) = @_;
    my %xa;
    my @xl = split //, $x;
    @xa{@xl} = @xl;
    return %xa;
}

# Go through the keys of $ya, and check whether the keys which are not
# in $xa are the right kind of keys.

sub check
{
    my ($xa, $ya, $found) = @_;
    my $ok;
    for my $k (keys %$ya) {
	next if $xa->{$k};
	if ($k eq 'ー' ||
	    $k eq 'イ' ||
	    $k eq 'ィ' ||
	    $k eq '・' ||
	    $k eq 'ッ' || 
	    $k eq 'ウ') {
	    $ok = 1;
	    next;
	}
	return $k;
    }
    if ($ok) {
	$$found = $ok;
    }
    return;
}

1;