package Lingua::Han::Utils;
use warnings;
use strict;
use base 'Exporter';
use vars qw/$VERSION @EXPORT_OK/;
$VERSION = '0.13';
@EXPORT_OK = qw/Unihan_value csplit cdecode csubstr clength/;
use Encode;
use Encode::Detect::CJK qw(detect);
sub cdecode {
my $word = shift;
my $encoding = detect($word);
$encoding = 'cp936' if $encoding eq 'iso-8859-1'; # hard fix
$word = decode($encoding, $word);
return $word;
}
sub Unihan_value {
my $word = shift;
$word = cdecode($word) unless Encode::is_utf8($word);
my @unihan = map { uc sprintf("%x",$_) } unpack ("U*", $word);
return wantarray?@unihan:(join('', @unihan));
}
sub csplit {
my $word = shift;
my $encoding = detect($word);
my @return_words;
my @code = Unihan_value($word);
foreach my $code (@code) {
my $value = pack("U*", hex $code);
$value = encode($encoding, $value);
push @return_words, $value if ($value);
}
return wantarray?@return_words:(join('', @return_words));
}
sub csubstr {
my ($word, $offset, $len) = @_;
my @words = csplit($word);
$len = scalar @words - $offset unless ($len);
@words = splice(@words, $offset, $len);
return wantarray?@words:(join('', @words));
}
sub clength {
my $word = shift;
my @words = csplit($word);
return scalar @words;
}
1;
__END__
=encoding utf8
=head1 NAME
Lingua::Han::Utils - The utility tools of Chinese character(HanZi)
=head1 SYNOPSIS
use Lingua::Han::Utils qw/Unihan_value csplit cdecode csubstr clength/;
# cdecode
# the same as decode('cp936', $word) in ASCII editing mode
# and decode('utf8', $word) in Unicode editing mode
my $word = cdecode($word);
# Unihan_value
# return the first field of Unihan.txt on unicode.org
my $word = "我";
my $unihan = Unihan_value($word); # return '6211'
my $words = "爱你";
my @unihan = Unihan_value($word); # return (7231, 4F60)
my $unihan = Unihan_value($word); # return 72314F60
# csplit
# split the Chinese characters into an array
my $words = "我爱你";
my @words = csplit($words); # return ("我", "爱", "你")
# csubstr
# treat the Chinese characters as one
# so it's the same as splice(csplit($words), $offset, $length)
my $words = "我爱你啊";
my @words = csubstr($words, 1, 2); # return ("爱", "你")
my @words = csubstr($words, 1); # return ("爱", "你", "啊")
my $words = csubstr($words, 1, 2); # 爱你
# clength
# treat the Chinese character as one
my $words = "我爱你";
print clength($words); # 3
=head1 EXPORT
Nothing is exported by default.
=head1 EXPORT_OK
=over 4
=item cdecode
use L<Encode::Guess> to decode the character. It behavers like: decode('cp936', $word) under ASCII editing mode and decode('utf8', $word) under Unicode editing mode.
=item Unihan_value
the first field of Unihan.txt is the Unicode scalar value as U+[x]xxxx, we return the [x]xxxx.
=item csplit
split the Chinese characters into an array, English words can be mixed in.
=item csubstr(WORD, OFFSET, LENGTH)
treat the Chinese character as one word, substr it.
(BE CAFEFUL! it's NOT lvalue, we cann't use csubstr($word, 2, 3) = $REPLACEMENT)
if no LENGTH is specified, substr form OFFSET to END.
=item clength
treat the Chinese character as one word(length 1).
=back
=head1 DOCUMENT
a Chinese version of document can be found @ L<http://www.fayland.org/journal/Lingua-Han-Utils.html>
=head1 AUTHOR
Fayland Lam, C<< <fayland at gmail.com> >>
=head1 BUGS
Please report any bugs or feature requests to
C<bug-lingua-han-utils at rt.cpan.org>, or through the web interface at
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Lingua-Han-Utils>.
I will be notified, and then you'll automatically be notified of progress on
your bug as I make changes.
=head1 SUPPORT
You can find documentation for this module with the perldoc command.
perldoc Lingua::Han::Utils
You can also look for information at:
=over 4
=item * AnnoCPAN: Annotated CPAN documentation
L<http://annocpan.org/dist/Lingua-Han-Utils>
=item * CPAN Ratings
L<http://cpanratings.perl.org/d/Lingua-Han-Utils>
=item * RT: CPAN's request tracker
L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Lingua-Han-Utils>
=item * Search CPAN
L<http://search.cpan.org/dist/Lingua-Han-Utils>
=back
=head1 ACKNOWLEDGEMENTS
the wonderful L<Encode::Guess>
=head1 COPYRIGHT & LICENSE
Copyright 2005 Fayland Lam, all rights reserved.
This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.