The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# ABSTRACT: 自动化获取网页内容,并解码为unicode
package Novel::Robot::Browser;

use strict;
use warnings;
use utf8;

our $VERSION = 0.14;

use Encode::Detect::CJK qw/detect/;
use Encode;
use WWW::Mechanize;
use Term::ProgressBar;

sub new {
    my ($self, %opt) = @_;
    $opt{retry} ||= 5;
    $opt{max_process_num} ||=5;
    $opt{browser} ||= _init_browser();
    bless { %opt } , __PACKAGE__;
}

sub _init_browser {
    my $http = WWW::Mechanize->new(
        #onerror => sub { print "fail get url\n"; }, 
        onerror => sub { return; }, 
        stack_depth => 2, 
    );

    my %default_headers = (
        'Accept' =>
          'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset'  => 'gb2312,utf-8;q=0.7,*;q=0.7',
        'Accept-Encoding' => "gzip, deflate",
        'Connection'      => 'keep-alive',
        'User-Agent' =>
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; MALC)',
        'Accept-Language' => "zh-cn,zh-tw;q=0.7, en-us,*;q=0.3",
    );

    while ( my ( $k, $v ) = each %default_headers ) {
        $http->add_header( $k, $v );
    }

    return $http;
} ## end sub init_browser

#use threads;
#use threads::shared;
#use Thread::Semaphore;
#sub request_urls {
    #my ($self, $arr, %opt) = @_;
    #$opt{request_sub} ||= sub {
        #my ($x) = @_;
        #my ($url, $post_var) = exists $opt{url_sub} ? $opt{url_sub}->($x) : 
        #ref($x) eq 'HASH' ? ( $x->{url}, $x->{post_var} )  :
        #ref($x) eq 'ARRAY' ? @$x : 
        #($x, undef);
        #return $self->request_url($url, $post_var);
    #};

    #my $sem = Thread::Semaphore->new($self->{max_process_num});
    #my $work_func = sub {
        #my ($i) = @_;
        #my $x = $arr->[$i];
        #my $s = $opt{request_sub}->($x);
        #$s = $opt{deal_sub}->($x, $s) if(exists $opt{deal_sub});
        #$sem->up;
        #return [$i, $s ];
    #};
    #my @threads = map {
        #$sem->down;
        #threads->create($work_func, $_);
    #} 0..$#$arr;

    #my @res;
    #for (@threads){
        #my $r = $_->join;
        #$res[$r->[0]] = $r->[1];
    #}
    #return \@res;
#}

#use Coro;
#sub request_urls {
    #my ($self, $arr, %opt) = @_;
    #$opt{request_sub} ||= sub {
        #my ($x) = @_;
        #my ($url, $post_var) = exists $opt{url_sub} ? $opt{url_sub}->($x) : 
        #ref($x) eq 'HASH' ? ( $x->{url}, $x->{post_var} )  :
        #ref($x) eq 'ARRAY' ? @$x : 
        #($x, undef);
        #return $self->request_url($url, $post_var);
    #};

    #my @tasks;
    #my @res;
    #for my $i (0 .. $#$arr){
        #push @tasks, async {
            #my $x = $arr->[$i];
            #my $s = $opt{request_sub}->($x);
            #$s = $opt{deal_sub}->($x, $s) if(exists $opt{deal_sub});
            #$res[$i] = $s;
        #}
    #}
    #for (@tasks){
        #$_->join;
    #}
    #return \@res;
#}

use Parallel::ForkManager;
sub request_urls {
    my ($self, $arr, %opt) = @_;

    my $progress;
    $progress = Term::ProgressBar->new ({ count => scalar(@$arr) }) 
    if($opt{show_progress_bar});
    my $cnt = 0;

    my @res;
    my $pm = Parallel::ForkManager->new( $self->{max_process_num} );
    $pm->run_on_finish(
        sub {
            my ( $pid, $exit_code, $ident, $exit, $core, $data ) = @_;
            $cnt++;
            $progress->update($cnt) if($opt{show_progress_bar});
            return unless($data);
            my ($i, $r) = @$data;
            $res[$i] = $r;
        }
    );

    $opt{request_sub} ||= sub {
        my ($x) = @_;
        my ($url, $post_var) = exists $opt{url_sub} ? $opt{url_sub}->($x) : 
        ref($x) eq 'HASH' ? ( $x->{url}, $x->{post_var} )  :
        ref($x) eq 'ARRAY' ? @$x : 
        ($x, undef);
        return $self->request_url($url, $post_var);
    };

    for my $i (0 .. $#$arr){
        my $pid = $pm->start and next;
        my $x = $arr->[$i];

        my $s = $opt{request_sub}->($x);
        $s = $opt{deal_sub}->($x, $s) if(exists $opt{deal_sub});

        $pm->finish( 0, [ $i, $s ] );
    }
    $pm->wait_all_children;
    return \@res;
}

sub request_url {
    my ( $self, $url, $post_data ) = @_;

    my $response;
    for my $i ( 1 .. $self->{retry} ) {
        eval { $response = $self->make_request( $url, $post_data ); };
        return $self->decode_response_content($response) if ($response);
        sleep 2;
    } ## end for my $i ( 1 .. $self->...)

    return;
} ## end sub get_url_ref

sub make_request {
    my ( $self, $url, $post_data ) = @_;

    if ($post_data) {
        $self->{browser}->post( $url, $post_data );
    }
    else {
        $self->{browser}->get($url);
    }

    return unless ( $self->{browser}->success() );
    return $self->{browser}->response();
}

sub decode_response_content {
    my ( $self, $response ) = @_;

    my $html = $response->decoded_content( charset => 'none' );

    my $charset = detect($html);
    $html = decode( $charset, $html, Encode::FB_XMLCREF );

    return \$html;
}

1;