# The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Lingua::JA::NormalizeText;

use 5.008_001;
use strict;
use warnings;
use utf8;

use Carp ();
use Exporter           qw/import/;
use Unicode::Normalize ();
use HTML::Entities     ();
use HTML::Scrubber     ();
use Lingua::JA::Regular::Unicode ();
use Lingua::JA::Dakuon ();
use Lingua::JA::Moji   ();

# Module version.
our $VERSION   = '0.45';
# Nothing is exported by default.
our @EXPORT    = qw();
# Normalization functions that callers may import by name.
our @EXPORT_OK = qw(nfkc nfkd nfc nfd decode_entities strip_html
alnum_z2h alnum_h2z space_z2h space_h2z katakana_z2h katakana_h2z
katakana2hiragana hiragana2katakana wave2tilde tilde2wave
wavetilde2long wave2long tilde2long fullminus2long dashes2long
drawing_lines2long unify_long_repeats nl2space unify_long_spaces
unify_whitespaces unify_nl trim ltrim rtrim old2new_kana old2new_kanji
tab2space remove_controls remove_spaces dakuon_normalize
handakuon_normalize all_dakuon_normalize
square2katakana circled2kana circled2kanji
remove_DFC decompose_parenthesized_kanji);

# The ":all" tag imports every function listed in @EXPORT and @EXPORT_OK.
our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );

# Set of option names accepted by new(): "lc" and "uc" (which are not
# exportable) plus every name in @EXPORT_OK.  Only the keys matter; the
# values are left undef and membership is tested with exists().
my %AVAILABLE_OPTS;
@AVAILABLE_OPTS{ (qw/lc uc/, @EXPORT_OK) } = ();

# Lookup table used by decompose_parenthesized_kanji(): maps each
# parenthesized ideograph character to the bare kanji it contains.
# The caller wraps the value in ASCII parentheses itself.
my %parenthesized_kanji_map = (
    '㈠' => '一',  '㈡' => '二',  '㈢' => '三',  '㈣' => '四',  '㈤' => '五',  '㈥' => '六',
    '㈦' => '七',  '㈧' => '八',  '㈨' => '九',  '㈩' => '十',  '㈪' => '月',  '㈫' => '火',
    '㈬' => '水',  '㈭' => '木',  '㈮' => '金',  '㈯' => '土',  '㈰' => '日',  '㈱' => '株',
    '㈲' => '有',  '㈳' => '社',  '㈴' => '名',  '㈵' => '特',  '㈶' => '財',  '㈷' => '祝',
    '㈸' => '労',  '㈹' => '代',  '㈺' => '呼',  '㈻' => '学',  '㈼' => '監',  '㈽' => '企',
    '㈾' => '資',  '㈿' => '協',  '㉀' => '祭',  '㉁' => '休',  '㉂' => '自',  '㉃' => '至',
);

# Shared HTML::Scrubber instance used by strip_html().  It is constructed
# with no arguments and declared with "our", so it is reachable as a
# package variable from outside this file.
our $SCRUBBER = HTML::Scrubber->new;

# This does not work on Perl 5.8.8 !!
# Error message:
# - couldn't find subroutine named lc in package CORE
# - Undefined subroutine &CORE::lc called
#*lc = \&CORE::lc;
#*uc = \&CORE::uc;

# Re-publish functions from other modules under this package's namespace by
# assigning their code references to typeglobs.  This makes them exportable
# via @EXPORT_OK and callable by name from normalize().
*nfkc                 = \&Unicode::Normalize::NFKC;
*nfkd                 = \&Unicode::Normalize::NFKD;
*nfc                  = \&Unicode::Normalize::NFC;
*nfd                  = \&Unicode::Normalize::NFD;
*decode_entities      = \&HTML::Entities::decode_entities;
*alnum_z2h            = \&Lingua::JA::Regular::Unicode::alnum_z2h;
*alnum_h2z            = \&Lingua::JA::Regular::Unicode::alnum_h2z;
*space_z2h            = \&Lingua::JA::Regular::Unicode::space_z2h;
*space_h2z            = \&Lingua::JA::Regular::Unicode::space_h2z;
*katakana_z2h         = \&Lingua::JA::Regular::Unicode::katakana_z2h;
*katakana_h2z         = \&Lingua::JA::Regular::Unicode::katakana_h2z;
*katakana2hiragana    = \&Lingua::JA::Regular::Unicode::katakana2hiragana;
*hiragana2katakana    = \&Lingua::JA::Regular::Unicode::hiragana2katakana;
*dakuon_normalize     = \&Lingua::JA::Dakuon::dakuon_normalize;
*handakuon_normalize  = \&Lingua::JA::Dakuon::handakuon_normalize;
*all_dakuon_normalize = \&Lingua::JA::Dakuon::all_dakuon_normalize;
*square2katakana      = \&Lingua::JA::Moji::square2katakana;
*circled2kana         = \&Lingua::JA::Moji::circled2kana;
*circled2kanji        = \&Lingua::JA::Moji::circled2kanji;

# Switch on Lingua::JA::Dakuon's EnableCombining flag globally.  Its exact
# effect is defined by that module; presumably it affects the *_normalize
# aliases above -- see the Lingua::JA::Dakuon documentation to confirm.
$Lingua::JA::Dakuon::EnableCombining = 1;

# Constructor.
#
# Accepts the normalization options either as a flat list or as a single
# array reference.  Each option is either the name of a built-in converter
# (a key of %AVAILABLE_OPTS) or a code reference supplied by the caller.
# The options are stored, in the order given, in $self->{converters}.
#
# Croaks when no option is given, or when any non-code option is not a
# known converter name (all offending names are reported together).
sub new
{
    my ($class, @args) = @_;

    # A leading array reference is unpacked; otherwise the arguments
    # themselves are the options.
    my @opts = ref $args[0] eq 'ARRAY' ? @{ $args[0] } : @args;

    Carp::croak("at least one option required") if !@opts;

    my @converters;
    my @unknown;

    for my $opt (@opts)
    {
        # Code references (external functions) and known names are kept;
        # everything else is collected for the error report.
        if ( ref $opt eq 'CODE' || exists $AVAILABLE_OPTS{$opt} )
        {
            push @converters, $opt;
        }
        else
        {
            push @unknown, $opt;
        }
    }

    Carp::croak( "unknown option(s): " . join(', ', @unknown) ) if @unknown;

    return bless { converters => \@converters }, $class;
}

# Applies every configured converter to $text, in the order stored in
# $self->{converters}, feeding the output of each one into the next.
#
# A converter is either the name of a sub in this package (a string) or a
# code reference.  Returns the converted text, or undef when $text is
# undefined.
sub normalize
{
    my ($self, $text) = @_;

    return undef unless defined $text;

    # Use a lexical loop variable instead of the global $_.  A "for" loop
    # aliases its variable to each list element, so with $_ an external
    # converter that assigns to $_ would silently overwrite the stored
    # converter and break every later normalize() call.
    for my $converter ( @{ $self->{converters} } )
    {
        # Named converters are invoked through a symbolic reference, which
        # strict refs would forbid; code references are unaffected.
        no strict 'refs';
        $text = $converter->($text);
    }

    return $text;
}

# Lower-cases its argument with the built-in lc; returns undef for
# undefined input.
sub lc
{
    my ($text) = @_;

    return undef unless defined $text;
    return CORE::lc($text);
}
# Upper-cases its argument with the built-in uc; returns undef for
# undefined input.
sub uc
{
    my ($text) = @_;

    return undef unless defined $text;
    return CORE::uc($text);
}

# Passes its argument through the shared $SCRUBBER object's scrub() method
# and returns whatever that method returns.
sub strip_html
{
    my ($html) = @_;

    return $SCRUBBER->scrub($html);
}

# Replaces U+301C and U+3030 with tilde (U+FF5E).
# Returns undef for undefined input.
sub wave2tilde
{
    my ($text) = @_;
    return undef unless defined $text;

    $text =~ tr/\x{301C}\x{3030}/\x{FF5E}/;

    return $text;
}
# Replaces tilde (U+FF5E) with U+301C.
# Returns undef for undefined input.
sub tilde2wave
{
    my ($text) = @_;
    return undef unless defined $text;

    $text =~ tr/\x{FF5E}/\x{301C}/;

    return $text;
}
# Replaces U+301C, U+3030 and U+FF5E with long (U+30FC).
# Returns undef for undefined input.
sub wavetilde2long
{
    my ($text) = @_;
    return undef unless defined $text;

    $text =~ tr/\x{301C}\x{3030}\x{FF5E}/\x{30FC}/;

    return $text;
}
# Replaces U+301C and U+3030 with long (U+30FC); U+FF5E is left alone.
# Returns undef for undefined input.
sub wave2long
{
    my ($text) = @_;
    return undef unless defined $text;

    $text =~ tr/\x{301C}\x{3030}/\x{30FC}/;

    return $text;
}
# Replaces tilde (U+FF5E) with long (U+30FC).
# Returns undef for undefined input.
sub tilde2long
{
    my ($text) = @_;
    return undef unless defined $text;

    $text =~ tr/\x{FF5E}/\x{30FC}/;

    return $text;
}
# Replaces U+FF0D with long (U+30FC).
# Returns undef for undefined input.
sub fullminus2long
{
    my ($text) = @_;
    return undef unless defined $text;

    $text =~ tr/\x{FF0D}/\x{30FC}/;

    return $text;
}
# Replaces U+2012, U+2013, U+2014 and U+2015 with long (U+30FC).
# Returns undef for undefined input.
sub dashes2long
{
    my ($text) = @_;
    return undef unless defined $text;

    $text =~ tr/\x{2012}\x{2013}\x{2014}\x{2015}/\x{30FC}/;

    return $text;
}
# Replaces the horizontal box-drawing characters U+2500, U+2501, U+254C,
# U+254D, U+2574, U+2576, U+2578 and U+257A with long (U+30FC).
# Returns undef for undefined input.
sub drawing_lines2long
{
    my ($text) = @_;
    return undef unless defined $text;

    $text =~ tr/\x{2500}\x{2501}\x{254C}\x{254D}\x{2574}\x{2576}\x{2578}\x{257A}/\x{30FC}/;

    return $text;
}
# Squeezes every run of consecutive long (U+30FC) characters down to one.
# Returns undef for undefined input.
sub unify_long_repeats
{
    my ($text) = @_;
    return undef unless defined $text;

    $text =~ tr/\x{30FC}/\x{30FC}/s;

    return $text;
}
# Collapses runs of spaces.  A run made only of U+0020, or only of U+3000,
# is squeezed to a single character of the same kind; any remaining run
# that mixes the two is replaced by one U+0020.
# Returns undef for undefined input.
sub unify_long_spaces
{
    my ($text) = @_;
    return undef unless defined $text;

    # Squeeze homogeneous runs first ...
    $text =~ tr/\x{0020}/\x{0020}/s;
    $text =~ tr/\x{3000}/\x{3000}/s;

    # ... then whatever is still two or more long must be a mixed run.
    $text =~ s/[\x{0020}\x{3000}]{2,}/\x{0020}/g;

    return $text;
}
# Replaces U+000B, U+000C, U+0085, U+00A0, U+1680, U+2000..U+200A, U+2028,
# U+2029, U+202F and U+205F with U+0020.  Tab, LF, CR and U+3000 are not
# in the list and are left untouched.
# Returns undef for undefined input.
sub unify_whitespaces
{
    my ($text) = @_;
    return undef unless defined $text;

    $text =~ tr/\x{000B}\x{000C}\x{0085}\x{00A0}\x{1680}\x{2000}-\x{200A}\x{2028}\x{2029}\x{202F}\x{205F}/\x{0020}/;

    return $text;
}
# Removes leading and trailing whitespace (\s).
# Returns undef for undefined input.
sub trim
{
    my ($text) = @_;
    return undef unless defined $text;

    $text =~ s/^\s+//;
    $text =~ s/\s+$//;

    return $text;
}
# Removes leading whitespace (\s) only.
# Returns undef for undefined input.
sub ltrim
{
    my ($text) = @_;
    return undef unless defined $text;

    $text =~ s/^\s+//;

    return $text;
}
# Removes trailing whitespace (\s) only.
# Returns undef for undefined input.
sub rtrim
{
    my ($text) = @_;
    return undef unless defined $text;

    $text =~ s/\s+$//;

    return $text;
}
# Converts each line break (CRLF, CR or LF) into a single U+0020.
# Returns undef for undefined input.
sub nl2space
{
    my ($text) = @_;
    return undef unless defined $text;

    # CRLF pairs first, so that one pair yields one space, not two.
    $text =~ s/\x{000D}\x{000A}/\x{0020}/g;
    $text =~ tr/\x{000D}\x{000A}/\x{0020}/;

    return $text;
}
# Converts each line break (CRLF, CR or LF) into a single "\n".
# Returns undef for undefined input.
sub unify_nl
{
    my ($text) = @_;
    return undef unless defined $text;

    # CRLF pairs first, so that one pair yields one newline, not two.
    $text =~ s/\x{000D}\x{000A}/\n/g;
    $text =~ tr/\x{000D}\x{000A}/\n/;

    return $text;
}
# Replaces every tab (U+0009) with U+0020.
# Returns undef for undefined input.
sub tab2space
{
    my ($text) = @_;
    return undef unless defined $text;

    $text =~ tr/\x{0009}/\x{0020}/;

    return $text;
}
# Replaces obsolete kana with their modern equivalents: four characters are
# mapped one-to-one, and the two voiced katakana are rewritten as the base
# katakana followed by U+3099.
# Returns undef for undefined input.
sub old2new_kana
{
    my ($text) = @_;
    return undef unless defined $text;

    $text =~ tr/ゐヰゑヱ/いイえエ/;
    $text =~ s/ヸ/イ\x{3099}/g;
    $text =~ s/ヹ/エ\x{3099}/g;

    return $text;
}
# Deletes the control characters U+0000..U+0008, U+000B, U+000C,
# U+000E..U+001F and U+007F..U+009F.  Tab, LF and CR are not in the list
# and are kept.
# Returns undef for undefined input.
sub remove_controls
{
    my ($text) = @_;
    return undef unless defined $text;

    $text =~ tr/\x{0000}-\x{0008}\x{000B}\x{000C}\x{000E}-\x{001F}\x{007F}-\x{009F}//d;

    return $text;
}
# Deletes every U+0020 and U+3000.
# Returns undef for undefined input.
sub remove_spaces
{
    my ($text) = @_;
    return undef unless defined $text;

    $text =~ tr/\x{0020}\x{3000}//d;

    return $text;
}
# Deletes the directional formatting characters U+061C, U+2066..U+2069,
# U+200E, U+200F and U+202A..U+202E.
# Returns undef for undefined input.
sub remove_DFC
{
    my ($text) = @_;
    return undef unless defined $text;

    $text =~ tr/\x{061C}\x{2066}-\x{2069}\x{200E}\x{200F}\x{202A}-\x{202E}//d;

    return $text;
}

# Replaces each character in the range U+3220..U+3243 with the kanji looked
# up in %parenthesized_kanji_map, wrapped in ASCII parentheses.
# Returns undef for undefined input.
sub decompose_parenthesized_kanji
{
    my ($text) = @_;
    return undef unless defined $text;

    $text =~ s/([\x{3220}-\x{3243}])/"($parenthesized_kanji_map{$1})"/ge;

    return $text;
}

# Transliterates old-style kanji into their new-style forms.
#
# The single tr/// below is a positional table: the Nth character of the
# search list is replaced by the Nth character of the replacement list.
# Returns the converted copy of the argument, or undef for undefined input.
sub old2new_kanji
{
    # Work on a localized copy in $_ so tr/// needs no explicit target.
    local $_ = shift;
    return undef unless defined $_;
    tr/亞惡壓圍爲醫壹逸稻飮隱營榮衞驛謁圓緣艷鹽奧應橫歐毆黃溫穩假價禍畫會壞悔懷海繪慨槪擴殼覺學嶽樂喝渴褐勸卷寬歡漢罐觀關陷顏器既歸氣祈龜僞戲犧舊據擧虛峽挾狹鄕響曉勤謹區驅勳薰徑惠揭溪經繼莖螢輕鷄藝擊缺儉劍圈檢權獻硏縣險顯驗嚴效廣恆鑛號國穀黑濟碎齋劑櫻册殺雜參慘棧蠶贊殘祉絲視齒兒辭濕實舍寫煮社者釋壽收臭從澁獸縱祝肅處暑緖署諸敍奬將涉燒祥稱證乘剩壤孃條淨狀疊讓釀囑觸寢愼眞神盡圖粹醉隨髓數樞瀨聲靜齊攝竊節專戰淺潛纖踐錢禪曾祖僧雙壯層搜插巢爭瘦總莊裝騷增憎臟藏贈卽屬續墮體對帶滯臺瀧擇澤單嘆擔膽團彈斷癡遲晝蟲鑄著廳徵懲聽敕鎭塚遞鐵轉點傳都黨盜燈當鬭德獨讀突屆繩難貳惱腦霸廢拜梅賣麥發髮拔繁晚蠻卑碑祕濱賓頻敏甁侮福拂佛倂塀竝變邊勉辨瓣辯舖步穗寶襃豐墨沒飜每萬滿免麵默餠戾彌藥譯豫餘與譽搖樣謠來賴亂欄覽隆龍虜兩獵綠壘淚類勵禮隸靈齡曆歷戀練鍊爐勞廊朗樓郞錄灣堯巖晉槇渚猪琢瑤祐祿禎穰聰遙/亜悪圧囲為医壱逸稲飲隠営栄衛駅謁円縁艶塩奥応横欧殴黄温穏仮価禍画会壊悔懐海絵慨概拡殻覚学岳楽喝渇褐勧巻寛歓漢缶観関陥顔器既帰気祈亀偽戯犠旧拠挙虚峡挟狭郷響暁勤謹区駆勲薫径恵掲渓経継茎蛍軽鶏芸撃欠倹剣圏検権献研県険顕験厳効広恒鉱号国穀黒済砕斎剤桜冊殺雑参惨桟蚕賛残祉糸視歯児辞湿実舎写煮社者釈寿収臭従渋獣縦祝粛処暑緒署諸叙奨将渉焼祥称証乗剰壌嬢条浄状畳譲醸嘱触寝慎真神尽図粋酔随髄数枢瀬声静斉摂窃節専戦浅潜繊践銭禅曽祖僧双壮層捜挿巣争痩総荘装騒増憎臓蔵贈即属続堕体対帯滞台滝択沢単嘆担胆団弾断痴遅昼虫鋳著庁徴懲聴勅鎮塚逓鉄転点伝都党盗灯当闘徳独読突届縄難弐悩脳覇廃拝梅売麦発髪抜繁晩蛮卑碑秘浜賓頻敏瓶侮福払仏併塀並変辺勉弁弁弁舗歩穂宝褒豊墨没翻毎万満免麺黙餅戻弥薬訳予余与誉揺様謡来頼乱欄覧隆竜虜両猟緑塁涙類励礼隷霊齢暦歴恋練錬炉労廊朗楼郎録湾尭巌晋槙渚猪琢瑶祐禄禎穣聡遥/;
    return $_;
}

1;

__END__

=for stopwords lc nfkc nfkd nfc nfd wavetilde2long fullminus2long nl2space whitespace ltrim rtrim

=encoding utf-8

=head1 NAME

Lingua::JA::NormalizeText - All-in-One Japanese text normalizer

=head1 SYNOPSIS

  use Lingua::JA::NormalizeText;
  use utf8;

  my @options = ( qw/nfkc decode_entities/, \&dearinsu_to_desu );
  my $normalizer = Lingua::JA::NormalizeText->new(@options);

  my $text = $normalizer->normalize('鳥が㌧㌦でありんす&hearts;'); # => '鳥がトンドルです♥'

  sub dearinsu_to_desu
  {
      my $text = shift;
      $text =~ s/でありんす/です/g;

      return $text;
  }

# or

  use Lingua::JA::NormalizeText qw/old2new_kanji/;
  use utf8;

  my $text = old2new_kanji('惡の華'); # => '悪の華'


=head1 DESCRIPTION

This module provides a lot of Japanese text normalization options.
These options facilitate Japanese text pre-processing.

=head1 METHODS

=head2 new(@options)

Creates a new Lingua::JA::NormalizeText instance.

The following options are available:

  OPTION                 SAMPLE INPUT           OUTPUT FOR SAMPLE INPUT
  ---------------------  ---------------------  -----------------------
  lc                     DdD                    ddd
  uc                     DdD                    DDD
  nfkc                   ｶﾞ                     ガ (U+30AC)
  nfkd                   ｶﾞ                     ガ (U+30AB, U+3099)
  nfc                    ド                     ド (U+30C9)
  nfd                    ド                     ド (U+30C8, U+3099)
  decode_entities        &hearts;               ♥
  strip_html             <em>あ</em>            あ
  alnum_z2h              ＡＢＣ１２３           ABC123
  alnum_h2z              ABC123                 ＡＢＣ１２３
  space_z2h              \x{3000}               \x{0020}
  space_h2z              \x{0020}               \x{3000}
  katakana_z2h           ハァハァ               ﾊｧﾊｧ
  katakana_h2z           ｽｰﾊｰｽｰﾊｰ               スーハースーハー
  katakana2hiragana      パンツ                 ぱんつ
  hiragana2katakana      ぱんつ                 パンツ
  wave2tilde             〜, 〰                 ～
  tilde2wave             ～                     〜
  wavetilde2long         〜, 〰, ～             ー
  wave2long              〜, 〰                 ー
  tilde2long             ～                     ー
  fullminus2long         －                     ー
  dashes2long            —                     ー
  drawing_lines2long     ─                     ー
  unify_long_repeats     ヴァーーー             ヴァー
  nl2space               (LF)(CR)(CRLF)         (space)(space)(space)
  unify_nl               (LF)(CR)(CRLF)         \n\n\n
  unify_long_spaces      あ(space)(space)あ     あ(space)あ
  unify_whitespaces      \x{00A0}               (space)
  trim                   (space)あ(space)あ(space)  あ(space)あ
  ltrim                  (space)あ(space)       あ(space)
  rtrim                  ああ(space)(space)     ああ
  old2new_kana           ゐヰゑヱヸヹ           いイえエイ゙エ゙
  old2new_kanji          亞逸鬭                 亜逸闘
  tab2space              (tab)(tab)             (space)(space)
  remove_controls        あ\x{0000}あ           ああ
  remove_DFC             \x{202E}HOGE           HOGE
  remove_spaces          \x{0020}あ\x{3000}あ\x{0020}  ああ
  dakuon_normalize       さ\x{3099}             ざ (U+3056)
  handakuon_normalize    は\x{309A}             ぱ (U+3071)
  all_dakuon_normalize   さ\x{3099}は\x{309A}   ざぱ (U+3056, U+3071)
  square2katakana        ㌢                     センチ
  circled2kana           ㋙㋛㋑㋟㋑             コシイタイ
  circled2kanji          ㊩㊫㊚㊒㊖             医学男有財
  decompose_parenthesized_kanji  ㈱             (株)

The order in which these options are applied is according to the order of
the elements of @options.
(i.e., The first element is applied first, and the last element is applied last.)

External functions can be added.
(See dearinsu_to_desu function of the SYNOPSIS section.)


=head2 normalize($text)

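Normalizes $text by applying the options given to new() in order, and
returns the result. Returns undef if $text is undefined.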


=head1 OPTIONS

=head2 lc, uc

These options are the same as CORE::lc and CORE::uc.

=head2 nfkc, nfkd, nfc, nfd

See L<Unicode::Normalize>.

=head2 decode_entities

See L<HTML::Entities>.

=head2 strip_html

Strips HTML tags.

=head2 alnum_z2h, alnum_h2z

Converts English alphabet, numbers and symbols ZENKAKU <-> HANKAKU.

ZENKAKU:

  ！＂＃＄％＆＇（）＊＋，－．／０１２３４５６７８９：；＜＝＞
  ？＠ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ［＼
  ］＾＿｀ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ
  ｛｜｝～｟｠￠￡￢￣￤￥￦

HANKAKU:

  !"#$%&'()*+,-./0123456789:;<=>
  ?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\
  ]^_`abcdefghijklmnopqrstuvwxyz
  {|}~¢£¥¦¬¯₩⦅⦆


=head2 space_z2h, space_h2z

SPACE (U+0020) <-> IDEOGRAPHIC SPACE (U+3000)

=head2 katakana_z2h, katakana_h2z

Converts katakanas ZENKAKU <-> HANKAKU.

See L<Lingua::JA::Regular::Unicode>.

=head2 hiragana2katakana

INPUT:

  ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそぞ
  ただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺほぼ
  ぽまみむめもゃやゅゆょよらりるれろゎわゐゑをんゔゕゖゝゞ

OUTPUT FOR INPUT:

  ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾ
  タダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボ
  ポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶヽヾ


=head2 katakana2hiragana

INPUT:

  ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾ
  タダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボ
  ポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶヽヾ
  ｦｧｨｩｪｫｬｭｮｯｱｲｳｴｵｶｷｸｹｺｻｼｽｾｿﾀﾁﾂﾃﾄﾅﾆﾇﾈﾉﾊﾋﾌﾍﾎﾏﾐﾑﾒﾓﾔﾕﾖﾗﾘﾙﾚﾛﾜﾝ

OUTPUT FOR INPUT:

  ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそぞ
  ただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺほぼ
  ぽまみむめもゃやゅゆょよらりるれろゎわゐゑをんゔゕゖゝゞ
  をぁぃぅぇぉゃゅょっあいうえおかきくけこさしすせそたちつてと
  なにぬねのはひふへほまみむめもやゆよらりるれろわん


=head2 wave2tilde

Converts WAVE DASH (U+301C) and WAVY DASH (U+3030) into tilde (U+FF5E).

=head2 tilde2wave

Converts tilde (U+FF5E) into wave (U+301C).

=head2 wavetilde2long

Converts WAVE DASH (U+301C), WAVY DASH (U+3030) and tilde (U+FF5E) into long (U+30FC).

=head2 wave2long

Converts WAVE DASH (U+301C) and WAVY DASH (U+3030) into long (U+30FC).

=head2 tilde2long

Converts tilde (U+FF5E) into long (U+30FC).

=head2 fullminus2long

Converts FULLWIDTH HYPHEN-MINUS (U+FF0D) into long (U+30FC).

=head2 dashes2long

Converts the following characters into long (U+30FC).

  U+2012  FIGURE DASH
  U+2013  EN DASH
  U+2014  EM DASH
  U+2015  HORIZONTAL BAR

Note that this option does not convert hyphens into long.


=head2 drawing_lines2long

Converts the following characters into long (U+30FC).

  U+2500  BOX DRAWINGS LIGHT HORIZONTAL
  U+2501  BOX DRAWINGS HEAVY HORIZONTAL
  U+254C  BOX DRAWINGS LIGHT DOUBLE DASH HORIZONTAL
  U+254D  BOX DRAWINGS HEAVY DOUBLE DASH HORIZONTAL
  U+2574  BOX DRAWINGS LIGHT LEFT
  U+2576  BOX DRAWINGS LIGHT RIGHT
  U+2578  BOX DRAWINGS HEAVY LEFT
  U+257A  BOX DRAWINGS HEAVY RIGHT


=head2 unify_long_repeats

Unifies long (U+30FC) repeats.

=head2 nl2space

Converts new lines (LF, CR, CRLF) into SPACE (U+0020).

=head2 unify_nl

Unifies new lines.

=head2 unify_long_spaces

Unifies long spaces (U+0020 and U+3000).

=head2 unify_whitespaces

Converts the following characters into SPACE (U+0020).

  U+000B  LINE TABULATION
  U+000C  FORM FEED
  U+0085  NEXT LINE
  U+00A0  NO-BREAK SPACE
  U+1680  OGHAM SPACE MARK
  U+2000  EN QUAD
  U+2001  EM QUAD
  U+2002  EN SPACE
  U+2003  EM SPACE
  U+2004  THREE-PER-EM SPACE
  U+2005  FOUR-PER-EM SPACE
  U+2006  SIX-PER-EM SPACE
  U+2007  FIGURE SPACE
  U+2008  PUNCTUATION SPACE
  U+2009  THIN SPACE
  U+200A  HAIR SPACE
  U+2028  LINE SEPARATOR
  U+2029  PARAGRAPH SEPARATOR
  U+202F  NARROW NO-BREAK SPACE
  U+205F  MEDIUM MATHEMATICAL SPACE

Note that this option does not convert the following characters:

  U+0009  CHARACTER TABULATION
  U+000A  LINE FEED
  U+000D  CARRIAGE RETURN
  U+3000  IDEOGRAPHIC SPACE


=head2 trim

Removes leading and trailing whitespace.

=head2 ltrim

Removes only leading whitespace.

=head2 rtrim

Removes only trailing whitespace.

=head2 old2new_kana

  INPUT  OUTPUT FOR INPUT
  -----  --------------------
  ゐ     い
  ヰ     イ
  ゑ     え
  ヱ     エ
  ヸ     イ゙ (U+30A4, U+3099)
  ヹ     エ゙ (U+30A8, U+3099)


=head2 old2new_kanji

INPUT:

  亞惡壓圍爲醫壹逸稻飮隱營榮衞驛謁圓緣艷鹽奧應橫歐毆黃溫穩假價
  禍畫會壞悔懷海繪慨槪擴殼覺學嶽樂喝渴褐勸卷寬歡漢罐觀關陷顏器
  既歸氣祈龜僞戲犧舊據擧虛峽挾狹鄕響曉勤謹區驅勳薰徑惠揭溪經繼
  莖螢輕鷄藝擊缺儉劍圈檢權獻硏縣險顯驗嚴效廣恆鑛號國穀黑濟碎齋
  劑櫻册殺雜參慘棧蠶贊殘祉絲視齒兒辭濕實舍寫煮社者釋壽收臭從澁
  獸縱祝肅處暑緖署諸敍奬將涉燒祥稱證乘剩壤孃條淨狀疊讓釀囑觸寢
  愼眞神盡圖粹醉隨髓數樞瀨聲靜齊攝竊節專戰淺潛纖踐錢禪曾祖僧雙
  壯層搜插巢爭瘦總莊裝騷增憎臟藏贈卽屬續墮體對帶滯臺瀧擇澤單嘆
  擔膽團彈斷癡遲晝蟲鑄著廳徵懲聽敕鎭塚遞鐵轉點傳都黨盜燈當鬭德
  獨讀突屆繩難貳惱腦霸廢拜梅賣麥發髮拔繁晚蠻卑碑祕濱賓頻敏甁侮
  福拂佛倂塀竝變邊勉辨瓣辯舖步穗寶襃豐墨沒飜每萬滿免麵默餠戾彌
  藥譯豫餘與譽搖樣謠來賴亂欄覽隆龍虜兩獵綠壘淚類勵禮隸靈齡曆歷
  戀練鍊爐勞廊朗樓郞錄灣堯巖晉槇渚猪琢瑤祐祿禎穰聰遙

OUTPUT FOR INPUT:

  亜悪圧囲為医壱逸稲飲隠営栄衛駅謁円縁艶塩奥応横欧殴黄温穏仮価
  禍画会壊悔懐海絵慨概拡殻覚学岳楽喝渇褐勧巻寛歓漢缶観関陥顔器
  既帰気祈亀偽戯犠旧拠挙虚峡挟狭郷響暁勤謹区駆勲薫径恵掲渓経継
  茎蛍軽鶏芸撃欠倹剣圏検権献研県険顕験厳効広恒鉱号国穀黒済砕斎
  剤桜冊殺雑参惨桟蚕賛残祉糸視歯児辞湿実舎写煮社者釈寿収臭従渋
  獣縦祝粛処暑緒署諸叙奨将渉焼祥称証乗剰壌嬢条浄状畳譲醸嘱触寝
  慎真神尽図粋酔随髄数枢瀬声静斉摂窃節専戦浅潜繊践銭禅曽祖僧双
  壮層捜挿巣争痩総荘装騒増憎臓蔵贈即属続堕体対帯滞台滝択沢単嘆
  担胆団弾断痴遅昼虫鋳著庁徴懲聴勅鎮塚逓鉄転点伝都党盗灯当闘徳
  独読突届縄難弐悩脳覇廃拝梅売麦発髪抜繁晩蛮卑碑秘浜賓頻敏瓶侮
  福払仏併塀並変辺勉弁弁弁舗歩穂宝褒豊墨没翻毎万満免麺黙餅戻弥
  薬訳予余与誉揺様謡来頼乱欄覧隆竜虜両猟緑塁涙類励礼隷霊齢暦歴
  恋練錬炉労廊朗楼郎録湾尭巌晋槙渚猪琢瑶祐禄禎穣聡遥


=head2 tab2space

Converts CHARACTER TABULATION (U+0009) into SPACE (U+0020).

=head2 remove_controls

Removes the following control characters:

  U+0000 .. U+0008
  U+000B
  U+000C
  U+000E .. U+001F
  U+007F .. U+009F

Note that this option does not remove the following characters:

  U+0009  CHARACTER TABULATION
  U+000A  LINE FEED
  U+000D  CARRIAGE RETURN


=head2 remove_DFC

Removes the following Directional Formatting Characters:

  U+061C  ARABIC LETTER MARK
  U+2066  LEFT-TO-RIGHT ISOLATE
  U+2067  RIGHT-TO-LEFT ISOLATE
  U+2068  FIRST STRONG ISOLATE
  U+2069  POP DIRECTIONAL ISOLATE
  U+200E  LEFT-TO-RIGHT MARK
  U+200F  RIGHT-TO-LEFT MARK
  U+202A  LEFT-TO-RIGHT EMBEDDING
  U+202B  RIGHT-TO-LEFT EMBEDDING
  U+202C  POP DIRECTIONAL FORMATTING
  U+202D  LEFT-TO-RIGHT OVERRIDE
  U+202E  RIGHT-TO-LEFT OVERRIDE

See L<http://www.unicode.org/reports/tr9/> for more information about Directional Formatting Characters.


=head2 remove_spaces

Removes SPACE (U+0020) and IDEOGRAPHIC SPACE (U+3000).

=head2 dakuon_normalize, handakuon_normalize, all_dakuon_normalize

See L<Lingua::JA::Dakuon>.

Note that Lingua::JA::NormalizeText enables $Lingua::JA::Dakuon::EnableCombining flag.

=head2 square2katakana, circled2kana, circled2kanji

See L<Lingua::JA::Moji>.

=head2 decompose_parenthesized_kanji

Decomposes the following parenthesized kanji:

  ㈠㈡㈢㈣㈤㈥㈦㈧㈨㈩㈪㈫㈬㈭㈮㈯㈰㈱㈲㈳㈴㈵㈶㈷㈸㈹㈺㈻㈼㈽㈾㈿㉀㉁㉂㉃


=head1 AUTHOR

pawa E<lt>pawapawa@cpan.orgE<gt>

=head1 SEE ALSO

L<新旧字体表|http://www.asahi-net.or.jp/~ax2s-kmtn/ref/old_chara.html>

L<康熙字典|http://ja.wikipedia.org/wiki/%E5%BA%B7%E7%86%99%E5%AD%97%E5%85%B8>

L<Lingua::JA::Regular::Unicode>

L<Lingua::JA::Dakuon>

L<Lingua::JA::Moji>

L<Unicode::Normalize>

L<Unicode::Number>

L<HTML::Entities>

L<HTML::Scrubber>

=head1 LICENSE

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut