package WWW::BookBot::Chinese;
use 5.008;
use strict;
use warnings;
no warnings qw(uninitialized);
use base qw(WWW::BookBot);
# Module version. Declared with "our" (available on every Perl this file
# accepts via "use 5.008") instead of the obsolete "use vars" pragma.
our $VERSION = '0.12';
#-------------------------------------------------------------
# Default settings
# $class->default_settings => \%settings
#-------------------------------------------------------------
sub default_settings {
    # Takes the settings produced by the parent class and overrides the three
    # language-related entries with the values used for Chinese texts.
    # Invocant: whatever the method is called on; it is forwarded to the
    #           parent implementation unchanged.
    # Returns:  the settings hash reference obtained from the parent class,
    #           with the three keys below set.
    my $invocant = shift;
    my $settings = $invocant->SUPER::default_settings;

    @{$settings}{qw(get_language language_decode language_encode)}
        = ('zh-cn', 'gbk', 'gbk');

    return $settings;
}
#-------------------------------------------------------------
# Redefined functions
# $bot->decode_entity($content_dein_deout) => N/A
# $bot->trandict_init => $bot->{translate_dict}
# $bot->msg_init => $bot->{messages}
#-------------------------------------------------------------
sub decode_entity {
    # Decodes HTML character references in place.
    #   $_[0] - the bot object (not used here)
    #   $_[1] - the text to decode; it is modified through the @_ alias, so
    #           the caller's variable itself is changed
    # The value returned is just the result of the last substitution and
    # carries no documented meaning.
    #
    # All three reference forms are handled in ONE pass so that text produced
    # by decoding is never scanned again: "&#38;lt;" must become "&lt;", not
    # "<" (separate passes decoded such input twice).
    #   &#NNNNN;  decimal reference  (captured in $1)
    #   &#xHHHHH; hexadecimal one    (captured in $2)
    #   &name;    named reference    (whole match in $3, name in $4), looked
    #             up in %WWW::BookBot::entity2char; names without a true
    #             mapping are left exactly as they were
    # The trailing ";" is optional in every form.  Chinese novels sometimes
    # add \x{FF1B} after an unknown numeric reference, so it is swallowed
    # together with the reference.
    $_[1] =~ s{
          &\#(\d{1,5});?\x{FF1B}?
        | &\#[xX]([0-9a-fA-F]{1,5});?\x{FF1B}?
        | (&([0-9a-zA-Z]{1,9});?)
    }{
          defined($1) ? chr($1)
        : defined($2) ? chr(hex($2))
        :               ( $WWW::BookBot::entity2char{$4} || $3 )
    }gex;

    # normalize middle dot: U+2022 is rewritten to U+00B7
    $_[1] =~ s/\x{2022}/\x{00B7}/g;
}
sub trandict_init {
    # Stores the translation dictionary in $bot->{translate_dict} and returns
    # that hash reference.  The keys are the English terms; the values are
    # the replacement strings in the file's legacy encoding.
    my $bot = shift;

    my @dictionary = (
        'log'    => "ÈÕÖ¾",
        'result' => "½á¹û",
        'DB'     => "Êý¾Ý",
        'debug'  => "µ÷ÊÔ",
    );

    return $bot->{translate_dict} = {@dictionary};
}
sub msg_init {
# Stores the table of message templates in $bot->{messages} (keyed by
# message name) and returns that hash reference.
# The templates are single-quoted strings / non-interpolating heredocs, so
# placeholders such as $pargs->{url} or $self->{book_max_levels} are stored
# literally here; presumably they are expanded later by the code that
# prints the messages -- confirm against WWW::BookBot.
# The non-ASCII text is kept in the file's legacy encoding and must not be
# re-encoded.
# Common tail shared by several "skip"/error messages: a newline, the
# indentation placeholder, the url placeholder and a final newline.
my $skip_info="\n".'$pargs->{levelspace} url=$pargs->{url}'."\n";
shift->{messages} = {
TestMsg => '²âÊÔ: $pargs->{TestInfo} $pargs->{TestNum}',
BookStart => '$pargs->{levelspace} [$pargs->{bpos_limit}/$pargs->{book_num}] $pargs->{title_limit} ',
BookBinaryOK => '$pargs->{data_len_KB} $pargs->{write_file}'."\n",
BookChapterErr => ' - ÎÞ·¨·ÖÎö'.$skip_info,
BookChapterMany => '[$pargs->{chapter_num_limit}ÕÂ]',
BookChapterOne => '[µ¥Õ½Ú]',
BookChapterOK => '$pargs->{data_len_KB}'."\n",
BookTOCFinish => '$pargs->{TOC_len_KB}'."\n",
CatalogInfo => 'È¡ÊéÄ¿: ',
CatalogResultErr=> ' 0Ì×Êé'."\n",
CatalogResultOK => ' $pargs->{book_num}Ì×Êé'."\n",
CatalogURL => '$pargs->{url}',
CatalogURLEmpty => '[ʧ°Ü] Ë÷ÒýµÄURLΪ¿Õ'."\n",
DBBookErr => "\t".' \$bot->go_book({$pargs->{allargs}});'."\t#´íÎó\n",
DBBookOK => "\t".'#\$bot->go_book({$pargs->{allargs}});'."\n",
DBCatalogErr => ' \$bot->go_catalog({$pargs->{allargs}});'."\t#´íÎó\n",
DBCatalogOK => '#\$bot->go_catalog({$pargs->{allargs}});'."\n",
DBHead => <<'DATA',
#!$pargs->{perlcmd}
##======================================
## ×Ô¶¯Éú³ÉµÄÊý¾ÝÎļþ£¬ÓÃÓÚ$pargs->{classname}
## Éú³Éʱ¼ä: $pargs->{createtime}
##======================================
use $pargs->{classname};
my \$bot = new $pargs->{classname};
DATA
FailClearDB => 'ÎÞ·¨Çå³ýÊý¾ÝÎļþ$pargs->{filename}: $pargs->{errmsg}',
FailClose => 'ÎÞ·¨¹Ø±Õ$self->{translate_dict}->{$pargs->{filetype}}Îļþ$pargs->{filename}: $pargs->{errmsg}',
FailMkDir => '½¨Ä¿Â¼$pargs->{dir}ʧ°Ü: $pargs->{errmsg}',
FailOpen => 'ÎÞ·¨´ò¿ª$self->{translate_dict}->{$pargs->{filetype}}Îļþ$pargs->{filename}: $pargs->{errmsg}',
FailWrite => 'ÎÞ·¨Ð´Èë$self->{translate_dict}->{$pargs->{filetype}}Îļþ$pargs->{filename}: $pargs->{errmsg}',
GetFail404 => <<'DATA',
[$pargs->{code},ʧ°Ü] ÕÒ²»µ½Îļþ
$pargs->{url_real}
DATA
GetFail404Detail=> <<'DATA',
[$pargs->{code},ʧ°Ü] ÕÒ²»µ½Îļþ
>>>>ÇëÇó
$pargs->{req_content}<<<<ÏìÓ¦
$pargs->{status_line}
DATA
GetFailRetries => <<'DATA',
[$pargs->{code},ʧ°Ü] ÖØÊÔÌ«¶à£¬·ÅÆú
$pargs->{url_real}
DATA
GetFailRetriesDetail => <<'DATA',
[$pargs->{code},ʧ°Ü] ÖØÊÔÌ«¶à£¬·ÅÆú
>>>>ÇëÇó
$pargs->{req_content}<<<<ÏìÓ¦
$pargs->{status_line}
$pargs->{res_content}
DATA
GetURLSuccess => '$pargs->{len_KB} ',
GetURLRetry => '[$pargs->{code},ÖØÊÔ] ',
GetWait => 'µÈ´ý..',
SkipMaxLevel => '[Ìø¹ý]²ãÊý>$self->{book_max_levels}'.$skip_info,
SkipMedia => '[Ìø¹ý]ýÌåÎļþ'.$skip_info,
SkipTitleEmpty => '[Ìø¹ý]±êÌâΪ¿Õ'.$skip_info,
SkipUrlEmpty => '[Ìø¹ý]µØַΪ¿Õ'."\n",
SkipVisited => '[Ìø¹ý]ÒÑ·ÃÎʹý'."\n",
SkipZip => '[Ìø¹ý]ѹËõÎļþ'.$skip_info,
};
}
#-------------------------------------------------------------
# patterns
#-------------------------------------------------------------
sub getpattern_space2_data {
# Returns a one-line bracketed character class, followed by a newline.
# The heredoc is non-interpolating, so the text is returned verbatim.
# NOTE(review): the characters are in the file's legacy encoding and are
# presumably the wide/ideographic space variants -- confirm before editing,
# and keep the bytes unchanged.
<<'DATA';
[¡¡£ ¡@]
DATA
}
sub getpattern_line_head_data {
    # Returns the fixed line-head string (no trailing newline).
    # NOTE(review): the literal is in the file's legacy encoding; presumably
    # it is the indentation found at the start of a paragraph -- confirm.
    my $line_head = '¡¡¡¡';
    return $line_head;
}
sub getpattern_parentheses_data {
# Returns the parent class's parentheses data with the lines below appended.
# Each appended line holds two tokens separated by a space -- presumably an
# opening and a closing bracket/quote in the file's legacy encoding (some
# pairs use the same token on both sides); confirm against the consumer in
# WWW::BookBot.  The heredoc is non-interpolating and must be kept
# byte-for-byte.
shift->SUPER::getpattern_parentheses_data().<<'DATA';
¡¨ ¡¨
¡® ¡¯
¡° ¡±
¡² ¡³
¡´ ¡µ
¡¶ ¡·
¡¸ ¡¹
¡º ¡»
¡¼ ¡½
¡¾ ¡¿
¡ä ¡ä
¡å ¡å
£¢ £¢
£§ £§
£¨ £©
£¼ £¾
£Û £Ý
£à £à
£à £§
£û £ý
¦à ¦á
¦â ¦ã
¦ä ¦å
¦æ ¦ç
¦è ¦é
¦ê ¦ë
¦î ¦ï
¦ð ¦ñ
¨A ¨@
¨F ¨F
¨ ¨
©v ©w
©x ©y
©z ©{
© ©
DATA
}
sub getpattern_mark_dash_data {
# Returns a one-line bracketed character class, followed by a newline.
# It starts with ASCII symbols (written with regex escapes and ranges) and
# continues with characters in the file's legacy encoding -- presumably the
# wide dash/symbol characters used as separator lines; confirm before
# editing.  The heredoc is non-interpolating, so backslashes are returned
# literally.
<<'DATA';
[#-&\*\+\-=@_~¡¥¡ª¡«¡¬¡¡Á¡Â¡Ë¡Ñ¡Ô¡Ö¡×¡Þ¡ç¡è¡é¡ë¡ì£££¥£¦£ª£«££½£À£ß£ü¨C¨D¨E¨O©W©\©`©¤-¡á©¡þ¡ù¦ò-¦õ©h-©n©~©©©©©©]
DATA
}
sub getpattern_mark_wordsplit_data {
    # Returns a one-line bracketed character class, followed by a newline:
    # escaped ASCII punctuation plus characters in the file's legacy encoding
    # (presumably their wide counterparts -- confirm).  The heredoc is
    # non-interpolating, so the backslashes are part of the returned text.
    my $char_class = <<'DATA';
[\.\,\?\!\:\;¡Ã¡¢¡£¡¤£¡£¬£®£º£»£¿©U©o©p©q©r©s©t©u]
DATA
    return $char_class;
}
sub getpattern_word_finish_data {
    # Returns a one-line regex fragment, followed by a newline: an optional
    # two-character prefix and then one character out of a set, all in the
    # file's legacy encoding (presumably the wording that marks the end of a
    # text -- confirm).
    my $finish_pattern = <<'DATA';
(?:È«[ÎÄÊé]|)[ÍêÖÕ]
DATA
    return $finish_pattern;
}
sub getpattern_remove_line_by_end_data {
# Returns a multi-line, non-interpolated text block: the first line is the
# marker "(case)", every following line is a regex fragment (site names,
# credit phrases and host names such as cnread.net, with "." allowed in
# several wide forms).
# NOTE(review): judging by the sub name these describe line endings that
# cause a line to be removed, and "(case)" presumably selects case handling
# in the base class -- confirm in WWW::BookBot.  The non-ASCII text is in
# the file's legacy encoding and must be kept byte-for-byte.
<<'DATA';
(case)
[±¨ÍøÉçѶ]
[Á¬ÖØÅÅÕû³öÌáÍÆɨУ½Ï±àÊéÊÀÊÓÎÄ¿ÆÔÚÌÖС¹¤×ª][ѧ»ÃÂÛ×÷]?(?:[ÔØÌùÅÅ°æÀíÆ·¹©³öÈëУ½ÏÃèÕý¶ÔÕßÎÝ¿â³Ç·½çÔ·ÏßÇø×éÊÒ]|º£Ñó|ÍûÔ¶¾µ|ÌÒ»¨Ô´|-K12)(?:Íê³É|)
Çë(?:ÉêÇëÊÚȨ|±£Áôվ̨ÐÅÏ¢)[¡££®©q\.£¡©u]?
ÖÆ×÷
[Oo£Ï£ï][Cc£Ã£ã][Rr£Ò£ò]
²É±àÖÐÐÄ
Òà·²¹«ÒæͼÊé¹Ý
ÁúµÄÌì¿Õ
ʧÂäµÄÐdz½
ÊéÏãÃŵÚ
¾ÉÓêÂ¥
Ò»½£Ð¡ÌìÏÂ
Öñ¶ºÉ·ç
Ñï½£Ðù¾ÓÊ¿
»ÃÏëʱ´ú
ðÏÕÕßÌìÌÃ
ÐÅÏ¢ÖÐÐÄ
cnread[\.¡££®¡¤©q]net
ezla[\.¡££®¡¤©q]com?[\.¡££®¡¤©q]tw
thebook[\.¡££®¡¤©q]yeah[\.¡££®¡¤©q]net
y(?:esho[\.¡££®¡¤©q]com/wenxue|uzispy[\.¡££®¡¤©q]yeah[\.¡££®¡¤©q]net)
www[\.¡££®¡¤©q](?:v-war|oldrain)[\.¡££®¡¤©q](?:net|com)
DATA
}
sub getpattern_remove_line_by_end_special_data {
    # Returns a single line of characters in the file's legacy encoding,
    # followed by a newline.  The same characters form the first bracketed
    # class in getpattern_remove_line_by_end_data.
    my $special_endings = <<'DATA';
±¨ÍøÉçѶ
DATA
    return $special_endings;
}
1;
__END__
=head1 NAME
WWW::BookBot::Chinese - Virtual class of bots to process Chinese e-texts.
=head1 SYNOPSIS
use WWW::BookBot::Chinese::Novel::DragonSky;
my $bot=WWW::BookBot::Chinese::Novel::DragonSky->new({work_dir=>'/output'});
$bot->go_catalog({});
use WWW::BookBot::Chinese::Novel::ShuKu;
my $bot=WWW::BookBot::Chinese::Novel::ShuKu->new({});
$bot->go_catalog({desc=>'NewNovel', cat1=>0, cat2=>1, pageno=>0});
=head1 ABSTRACT
Virtual class of bots to process Chinese e-texts.
=head1 DESCRIPTION
Virtual class of bots to process Chinese e-texts.
Further documentation is to be added.
=head2 EXPORT
None by default.
=head1 BUGS, REQUESTS, COMMENTS
Please report any requests, suggestions or bugs via
http://rt.cpan.org/NoAuth/ReportBug.html?Queue=WWW-BookBot
=head1 AUTHOR
Qing-Jie Zhou E<lt>qjzhou@hotmail.comE<gt>
=head1 SEE ALSO
L<WWW::BookBot>
=cut