The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# $File: //member/autrijus/Lingua-ZH-Keywords/Keywords.pm $ $Author: autrijus $
# $Revision: #9 $ $Change: 3723 $ $DateTime: 2003/01/20 22:15:45 $

package Lingua::ZH::Keywords;
$Lingua::ZH::Keywords::VERSION = '0.04';

use strict;
use vars qw($VERSION @ISA @EXPORT @StopWords);

use Exporter;
use Lingua::ZH::TaBE ();

=head1 NAME

Lingua::ZH::Keywords - Extract keywords from Chinese text

=head1 SYNOPSIS

    # Exports keywords() by default
    use Lingua::ZH::Keywords;

    print join(",", keywords($text));	    # Prints five keywords
    print join(",", keywords($text, 10));   # Prints ten keywords

=head1 DESCRIPTION

This is a very simple algorithm which removes stopwords from the
text, and then counts up what it considers to be the most important
B<keywords>.  The C<keywords> subroutine returns a list of keywords
in order of relevance.

The stopwords list is accessible as C<@Lingua::ZH::Keywords::StopWords>.

If the input C<$text> is an Unicode string, the returned keywords
will also be Unicode strings; otherwise they are assumed to be
Big5-encoded bytestrings.

=cut

@ISA	    = qw(Exporter);
@EXPORT	    = qw(keywords);

@StopWords  = qw(
    ´£¨Ñ ¬ÛÃö §Ú­Ì ¥i¥H ¦p¦ó ¦]¬° ¥Ø«e ¦pªG ¨ä¥L §Úªº ¤j®a ¨S¦³ ¥D­n ©Ò¥H
    ¥H¤W ³o­Ó ©Ò¦³ ¦³Ãö ´N¬O ¥L­Ì ¦]¦¹ ¦ý¬O ¥H¤Î ¬O§_ ¥Ñ©ó ¹ï©ó ¥ô¦ó ¤°»ò
    ³o¨Ç ²{¦b µLªk ¦¨¬° ¥i¯à ¤£¹L ¥]¬A ¥²¶· Ãö©ó ³o¬O ³o¼Ë ¥H¤U ¤w¸g §Aªº
    ÁöµM ³\¦h ¤]¬O ¤£¬O °£¤F ÁÙ¬O ¬°¤F ¤§«á ¥u­n ¨ä¤¤ ³£¬O ¦UºØ ÁÙ¦³ «D±`
    ¦Ó¥B ³oºØ ¨ä¥¦ ¤£­n §Ú­n ¥Lªº ¥u¬O ¦U¦ì ¥u¦³ ªº¸Ü ¤£¯à ³o¸Ì ¬Û·í §Ú¬O
    ¥þ³¡ «Ü¦h ¥i¬O ©Î¬O ¨ä¹ê ¨º»ò §A­Ì ¤U¦C ¦p¦¹ ¥t¥~ µM«á ¦U¶µ ¤~¯à ¤£·|
    ¬Æ¦Ü Á`·| ¤£±o «ç»ò §Y¥i §@¬° ¦Ü©ó ·íµM ®Ú¾Ú §Ú·Q ¯à°÷ ¤§¶¡ ¬°¦ó ¤£ª¾
    ¨Ò¦p ´Á¶¡ ®É­Ô ¤]¦³ ±`¨£ ¨Ã¥B ®e©ö §Ú¦³ ¹ê»Ú ¦³¤H ¦³¨Ç ¤À§O ¨Ã¤£ ¥H«á
    ¨Ï±o ¸g¥Ñ ­«·s ¦p¤U ¦b¦¹ ³o»ò ¨º¨Ç ¾ã­Ó ³£¦³ ³o¦¸ ¤§«e ¥O¤H ¨Óªº ´N·|
    ¤W­z ¦ì©ó ¨º­Ó ¦Ó¤w ¨Ï¥Î °²¦p ©ó¬O ÁÙ±o ¬O¦b µLªk ¦óªp ´¿¸g §Ú­Ìªº 
);

my $Tabe;

sub keywords {
    $Tabe ||= Lingua::ZH::TaBE->new;

    eval { require Encode::compat } if $] < 5.007;
    my $is_utf8 = eval { require Encode; Encode::is_utf8($_[0]) };

    my (%hist, %ref);
    $hist{$_}++ for grep {
	length > 2 and index($_, '¤@') == -1
    } $Tabe->split(
	$is_utf8 ? Encode::encode(big5 => $_[0]) : $_[0]
    );
    delete @hist{@StopWords};

    my $count = $_[1] || 5;

    # By occurence, then freq, then lexical order
    map {
	$is_utf8 ? Encode::decode(big5 => $_) : $_
    } grep length, (sort {
	$hist{$b} <=> $hist{$a}
	    or
	($ref{$b} ||= freq($b)) <=> ($ref{$a} ||= freq($a))
	    or
	$b cmp $a
    } keys %hist)[ 0 .. $count-1 ];
}

sub freq {
    my $tsi = $Tabe->Tsi($_[0]);
    $Tabe->TsiDB->Get($tsi);
    return $tsi->refcount;
}

1;

__END__

=head1 SEE ALSO

L<Lingua::ZH::TaBE>, L<Lingua::EN::Keywords>

=head1 ACKNOWLEDGEMENTS

Algorithm adapted from the L<Lingua::EN::Keywords> module by
Simon Cozens, E<lt>simon@simon-cozens.org<gt>.

=head1 AUTHORS

Autrijus Tang E<lt>autrijus@autrijus.orgE<gt>

=head1 COPYRIGHT

Copyright 2003 by Autrijus Tang E<lt>autrijus@autrijus.orgE<gt>.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

See L<http://www.perl.com/perl/misc/Artistic.html>

=cut