The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#===============================================================================
#
#         FILE:  Translit.pm
#
#  DESCRIPTION:  Cyrillic transliteration routines
#
#         NOTE:  This module ported from Wono framework "as is"
#       AUTHOR:  Michael Bochkaryov (Rattler), <misha@rattler.kiev.ua>
#      COMPANY:  Net.Style
#      VERSION:  1.044
#      CREATED:  03.08.2008 15:04:22 EEST
#===============================================================================

=head1 NAME

NetSDS::Util::Translit - transliteration routines

=head1 SYNOPSIS

	use NetSDS::Const;
	use NetSDS::Util::Translit;

	# Transliterate cyrillic string
	$trans_string = trans_cyr_lat($cyr_string);

	# Reverse transliteration to russian language
	$rus_string = trans_lat_cyr("Vsem privet", LANG_RU);

=head1 DESCRIPTION

C<NetSDS::Util::Translit> module contains routines for bidirectional
cyrillic text transliteration. Now it supports russian and ukrainian
languages processing.

=cut

package NetSDS::Util::Translit;

use 5.8.0;
use warnings 'all';
use strict;

use base 'Exporter';

use version; our $VERSION = '1.044';

use NetSDS::Util::String;

our @EXPORT = qw(
  trans_cyr_lat
  trans_lat_cyr
);

use constant LANG_BE => 'be';
use constant LANG_EN => 'en';
use constant LANG_RU => 'ru';
use constant LANG_UK => 'uk';

use constant DEFAULT_LANG     => LANG_RU;

my %PREP = (
	LANG_RU() => {
		'а'   => 'a',
		'б'   => 'b',
		'в'   => 'v',
		'г'   => 'g',
		'д'   => 'd',
		'е'   => 'e',
		'ё'   => 'yo',
		'ж'   => 'zh',
		'з'   => 'z',
		'и'   => 'i',
		'й'   => 'j',
		'к'   => 'k',
		'л'   => 'l',
		'м'   => 'm',
		'н'   => 'n',
		'о'   => 'o',
		'п'   => 'p',
		'р'   => 'r',
		'с'   => 's',
		'т'   => 't',
		'у'   => 'u',
		'ф'   => 'f',
		'х'   => 'kh',
		'ц'   => 'tc',
		'ч'   => 'ch',
		'ш'   => 'sh',
		'щ'   => 'sch',
		'ъ'   => '"',
		'ы'   => 'y',
		'ые' => 'yje',
		'ыё' => 'yjo',
		'ыу' => 'yiu',
		'ыю' => 'yju',
		'ыя' => 'yja',
		'ь'   => "'",
		'ье' => 'jie',
		'ьё' => 'jio',
		'ью' => 'jiu',
		'ья' => 'jia',
		'э'   => 'ye',
		'ю'   => 'yu',
		'я'   => 'ya',
	},

	LANG_UK() => {
		"'"  => '"',
		'а' => 'a',
		'б' => 'b',
		'в' => 'v',
		'ґ' => 'g',
		'г' => 'h',
		'д' => 'd',
		'е' => 'e',
		'є' => 'ye',
		'ж' => 'zh',
		'з' => 'z',
		'і' => 'i',
		'и' => 'y',
		'ї' => 'yi',
		'й' => 'j',
		'к' => 'k',
		'л' => 'l',
		'м' => 'm',
		'н' => 'n',
		'о' => 'o',
		'п' => 'p',
		'р' => 'r',
		'с' => 's',
		'т' => 't',
		'у' => 'u',
		'ф' => 'f',
		'х' => 'kh',
		'ц' => 'tc',
		'ч' => 'ch',
		'ш' => 'sh',
		'щ' => 'sch',
		'ь' => "'",
		'ю' => 'yu',
		'я' => 'ya',
	},

	LANG_BE() => {
		"'"    => '"',
		'а'   => 'a',
		'б'   => 'b',
		'в'   => 'v',
		'ґ'   => 'g',
		'г'   => 'h',
		'д'   => 'd',
		'е'   => 'ye',
		'ё'   => 'yo',
		'ж'   => 'zh',
		'з'   => 'z',
		'і'   => 'i',
		'и'   => 'i',
		'ї'   => 'yi',
		'й'   => 'j',
		'к'   => 'k',
		'л'   => 'l',
		'м'   => 'm',
		'н'   => 'n',
		'о'   => 'o',
		'п'   => 'p',
		'р'   => 'r',
		'с'   => 's',
		'т'   => 't',
		'у'   => 'u',
		'ў'   => 'w',
		'ф'   => 'f',
		'х'   => 'kh',
		'ц'   => 'tc',
		'ч'   => 'ch',
		'ш'   => 'sh',
		'щ'   => 'sch',
		'ы'   => 'y',
		'ые' => 'yje',
		'ыё' => 'yjo',
		'ыу' => 'yiu',
		'ыю' => 'yju',
		'ыя' => 'yja',
		'ь'   => "'",
		'ье' => 'jie',
		'ьё' => 'jio',
		'ью' => 'jiu',
		'ья' => 'jia',
		'э'   => 'e',
		'ю'   => 'yu',
		'я'   => 'ya',
	},
);

my %TO_LAT = ();

my %TO_CYR = ();

#*********************************************************************************************
sub _prep_translit {
	my ($lang) = @_;

	return if ( $PREP{prepared}->{$lang} );

	my $rfw = {};
	my $rbw = {};
	while ( my ( $fw, $bw ) = each %{ $PREP{$lang} } ) {
		$fw = str_encode($fw);
		$bw = str_encode($bw);
		my $lf = length($fw);
		my $lb = length($bw);
		if ( ( $lf == 1 ) and ( $lb == 1 ) ) {
			$rfw->{0}->{ uc($fw) }      = uc($bw);
			$rfw->{0}->{ ucfirst($fw) } = ucfirst($bw);
			$rfw->{0}->{$fw}            = $bw;

			$rbw->{0}->{ uc($bw) }      = uc($fw);
			$rbw->{0}->{ ucfirst($bw) } = ucfirst($fw);
			$rbw->{0}->{$bw}            = $fw;
		} else {
			$rfw->{$lf}->{ uc($fw) }      = uc($bw);
			$rfw->{$lf}->{ ucfirst($fw) } = ucfirst($bw);
			$rfw->{$lf}->{$fw}            = $bw;

			$rbw->{$lb}->{ uc($bw) }      = uc($fw);
			$rbw->{$lb}->{ ucfirst($bw) } = ucfirst($fw);
			$rbw->{$lb}->{$bw}            = $fw;
		}
	} ## end while ( my ( $fw, $bw ) =...

	$TO_LAT{$lang} = [];
	foreach my $ord ( reverse sort { $a <=> $b } keys %{$rfw} ) {
		my $tra = $rfw->{$ord};
		my $fnd = join( '|', keys %{$tra} );
		push( @{ $TO_LAT{$lang} }, [ $fnd, $tra ] );
	}

	$TO_CYR{$lang} = [];
	foreach my $ord ( reverse sort { $a <=> $b } keys %{$rbw} ) {
		my $tra = $rbw->{$ord};
		my $fnd = join( '|', keys %{$tra} );
		push( @{ $TO_CYR{$lang} }, [ $fnd, $tra ] );
	}

	$PREP{prepared}->{$lang} = 1;
} ## end sub _prep_translit

#*********************************************************************************************

=head1 EXPORTS

=over

=item B<trans_cyr_lat($text[, $lang])> - transliterate string

Convert text from cyrillic to latin encoding.

Language may be set if not default one.

	$lat = trans_cyr_lat($string);

=cut

#-----------------------------------------------------------------------
sub trans_cyr_lat {
	my ( $text, $lang ) = @_;

	$lang ||= DEFAULT_LANG();

	_prep_translit($lang);

	$text = str_encode($text);

	foreach my $row ( @{ $TO_LAT{$lang} } ) {
		my ( $fnd, $has ) = @{$row};
		$text =~ s/($row->[0])/$row->[1]->{$1}/ge;
	}
	$text =~ s/[^\x{0}-\x{7f}]+/\?/g;

	return str_decode($text);
}

#*********************************************************************************************

=item B<trans_lat_cyr($text[, $lang])> - reverse transliteration

This function transliterate string from latin encoding to cyrillic one.

Target language may be set if not default one.

	$cyr = trans_lat_cyr("Sam baran", "ru");

=cut

#-----------------------------------------------------------------------
sub trans_lat_cyr {
	my ( $text, $lang ) = @_;

	$lang ||= DEFAULT_LANG();

	_prep_translit($lang);

	$text = str_encode($text);

	$text =~ s/[^\x{0}-\x{7f}]+/\?/g;
	foreach my $row ( @{ $TO_CYR{$lang} } ) {
		my ( $fnd, $has ) = @{$row};
		$text =~ s/($row->[0])/$row->[1]->{$1}/sg;
	}

	return str_decode($text);
}

1;
__END__

=back

=head1 EXAMPLES

None yet

=head1 BUGS

Unknown yet

=head1 TODO

Implement examples and tests.

=head1 SEE ALSO

L<Encode>, L<perlunicode>

=head1 AUTHORS

Valentyn Solomko <pere@pere.org.ua>

=cut