# $File: //member/autrijus/Lingua-ZH-Keywords/Keywords.pm $ $Author: autrijus $
# $Revision: #9 $ $Change: 3723 $ $DateTime: 2003/01/20 22:15:45 $
package Lingua::ZH::Keywords;
$Lingua::ZH::Keywords::VERSION = '0.04';
use strict;
use vars qw($VERSION @ISA @EXPORT @StopWords);
use Exporter;
use Lingua::ZH::TaBE ();
=head1 NAME
Lingua::ZH::Keywords - Extract keywords from Chinese text
=head1 SYNOPSIS
# Exports keywords() by default
use Lingua::ZH::Keywords;
print join(",", keywords($text)); # Prints five keywords
print join(",", keywords($text, 10)); # Prints ten keywords
=head1 DESCRIPTION
This is a very simple algorithm which removes stopwords from the
text, and then counts up what it considers to be the most important
B<keywords>. The C<keywords> subroutine returns a list of keywords
in order of relevance.
The stopwords list is accessible as C<@Lingua::ZH::Keywords::StopWords>.
If the input C<$text> is an Unicode string, the returned keywords
will also be Unicode strings; otherwise they are assumed to be
Big5-encoded bytestrings.
=cut
@ISA = qw(Exporter);
@EXPORT = qw(keywords);
@StopWords = qw(
´£¨Ñ ¬ÛÃö §ÚÌ ¥i¥H ¦p¦ó ¦]¬° ¥Ø«e ¦pªG ¨ä¥L §Úªº ¤j®a ¨S¦³ ¥Dn ©Ò¥H
¥H¤W ³oÓ ©Ò¦³ ¦³Ãö ´N¬O ¥LÌ ¦]¦¹ ¦ý¬O ¥H¤Î ¬O§_ ¥Ñ©ó ¹ï©ó ¥ô¦ó ¤°»ò
³o¨Ç ²{¦b µLªk ¦¨¬° ¥i¯à ¤£¹L ¥]¬A ¥²¶· Ãö©ó ³o¬O ³o¼Ë ¥H¤U ¤w¸g §Aªº
ÁöµM ³\¦h ¤]¬O ¤£¬O °£¤F ÁÙ¬O ¬°¤F ¤§«á ¥un ¨ä¤¤ ³£¬O ¦UºØ ÁÙ¦³ «D±`
¦Ó¥B ³oºØ ¨ä¥¦ ¤£n §Ún ¥Lªº ¥u¬O ¦U¦ì ¥u¦³ ªº¸Ü ¤£¯à ³o¸Ì ¬Û·í §Ú¬O
¥þ³¡ «Ü¦h ¥i¬O ©Î¬O ¨ä¹ê ¨º»ò §AÌ ¤U¦C ¦p¦¹ ¥t¥~ µM«á ¦U¶µ ¤~¯à ¤£·|
¬Æ¦Ü Á`·| ¤£±o «ç»ò §Y¥i §@¬° ¦Ü©ó ·íµM ®Ú¾Ú §Ú·Q ¯à°÷ ¤§¶¡ ¬°¦ó ¤£ª¾
¨Ò¦p ´Á¶¡ ®ÉÔ ¤]¦³ ±`¨£ ¨Ã¥B ®e©ö §Ú¦³ ¹ê»Ú ¦³¤H ¦³¨Ç ¤À§O ¨Ã¤£ ¥H«á
¨Ï±o ¸g¥Ñ «·s ¦p¤U ¦b¦¹ ³o»ò ¨º¨Ç ¾ãÓ ³£¦³ ³o¦¸ ¤§«e ¥O¤H ¨Óªº ´N·|
¤Wz ¦ì©ó ¨ºÓ ¦Ó¤w ¨Ï¥Î °²¦p ©ó¬O ÁÙ±o ¬O¦b µLªk ¦óªp ´¿¸g §Ú̪º
);
my $Tabe;
sub keywords {
$Tabe ||= Lingua::ZH::TaBE->new;
eval { require Encode::compat } if $] < 5.007;
my $is_utf8 = eval { require Encode; Encode::is_utf8($_[0]) };
my (%hist, %ref);
$hist{$_}++ for grep {
length > 2 and index($_, '¤@') == -1
} $Tabe->split(
$is_utf8 ? Encode::encode(big5 => $_[0]) : $_[0]
);
delete @hist{@StopWords};
my $count = $_[1] || 5;
# By occurence, then freq, then lexical order
map {
$is_utf8 ? Encode::decode(big5 => $_) : $_
} grep length, (sort {
$hist{$b} <=> $hist{$a}
or
($ref{$b} ||= freq($b)) <=> ($ref{$a} ||= freq($a))
or
$b cmp $a
} keys %hist)[ 0 .. $count-1 ];
}
sub freq {
my $tsi = $Tabe->Tsi($_[0]);
$Tabe->TsiDB->Get($tsi);
return $tsi->refcount;
}
1;
__END__
=head1 SEE ALSO
L<Lingua::ZH::TaBE>, L<Lingua::EN::Keywords>
=head1 ACKNOWLEDGEMENTS
Algorithm adapted from the L<Lingua::EN::Keywords> module by
Simon Cozens, E<lt>simon@simon-cozens.org<gt>.
=head1 AUTHORS
Autrijus Tang E<lt>autrijus@autrijus.orgE<gt>
=head1 COPYRIGHT
Copyright 2003 by Autrijus Tang E<lt>autrijus@autrijus.orgE<gt>.
This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.
See L<http://www.perl.com/perl/misc/Artistic.html>
=cut