The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# ABSTRACT: get/post url, return unicode content, auto detect CJK charset
package Novel::Robot::Browser;

use strict;
use warnings;
use utf8;

our $VERSION = 0.16;

use Encode::Detect::CJK qw/detect/;
use Encode;
use HTTP::Tiny;
use Parallel::ForkManager;
use Term::ProgressBar;
use IO::Uncompress::Gunzip qw(gunzip);
use URI::Escape;

our $DEFAULT_URL_CONTENT = '';
our %DEFAULT_HEADER = (
    'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 
    'Accept-Charset'  => 'gb2312,utf-8;q=0.7,*;q=0.7',
    'Accept-Encoding' => "gzip",
    'Accept-Language' => 'zh,zh-cn;q=0.8,en-us;q=0.5,en;q=0.3', 
    'Connection'      => 'keep-alive',
    'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0', 
    'DNT' => 1, 
);

sub new {
    my ( $self, %opt ) = @_;
    $opt{retry}           ||= 5;
    $opt{max_process_num} ||= 5;
    $opt{browser}         ||= _init_browser( $opt{browser_headers} );
    bless {%opt}, __PACKAGE__;
}

sub _init_browser {
    my ($headers) = @_;

    $headers ||= {};
    my %h = ( %DEFAULT_HEADER, %$headers );

    my $http = HTTP::Tiny->new( default_headers => \%h, );

    return $http;
} ## end sub init_browser

sub request_urls {
    my ( $self, $src_arr, %opt ) = @_;
    # arr : url / { url => .. }
    # process_sub => sub { my ($html_ref) = @_ ; ... }
    my $arr = $opt{select_url_sub}->( $src_arr );
    
    my $iter_sub = sub {
        my ($r) = @_;
        return $opt{data_sub}->(@_) if($opt{no_auto_request_url});

        my ( $url, $post_data ) =
        ref($r) eq 'HASH' ? @{$r}{qw/url post_data/} 
        : ( $r, undef );

        my @procss_data = ($url, $post_data); 
        if($url=~/^https?:/){
            my $h = $self->request_url( $url, $post_data );
            @procss_data = ( \$h );
        }
        my @return_data = exists $opt{data_sub} ?
        $opt{data_sub}->(@procss_data) : @procss_data;
        return @return_data;
    };

    my $progress;
    $progress = Term::ProgressBar->new( { count => scalar(@$arr) } )
      if ( $opt{show_progress_bar} );

    my $cnt = 0;
    my @res_arr;

    my $pm = Parallel::ForkManager->new( $self->{max_process_num} );
    $pm->run_on_finish(sub {
            my ( $pid, $exit_code, $ident, $exit, $core, $data ) = @_;

            $cnt++;
            $progress->update($cnt) if ( $opt{show_progress_bar} );
            return unless ($data);

            my ( $id, @res ) = @$data;
            $res_arr[$id] = $#res==0 ? $res[0] : \@res;
        });

    for my $i ( 0 .. $#$arr ) {
        my $pid = $pm->start and next;
        my @s = $iter_sub->($arr->[$i]);
        $pm->finish( 0, [ $i, @s ] );
    }

    $pm->wait_all_children;

    return \@res_arr;
}

sub request_urls_iter {
    my ( $self, $url, %o ) = @_;

    my $html = $self->request_url($url, $o{post_data});

    my $info      = $o{info_sub}->( \$html )      || {};
    my $data_list = $o{data_list_sub}->( \$html ) || [];

    return ( $info, $data_list ) if ( $o{stop_sub}->( $info, $data_list ) );

    my $url_list = $o{url_list_sub}->( \$html ) || [];
    my $select_url_list = $o{select_url_sub}->( $url_list );
    for my $u (@$select_url_list) {
        return ( $info, $data_list ) if ( $o{stop_sub}->( $info, $data_list ) );

        my ( $u_url, $u_post_data ) =
        ref($u) eq 'HASH' ? @{$u}{qw/url post_data/} 
        : ( $u, undef );

        my $c = $self->request_url($u_url, $u_post_data);

        my $fs = $o{data_list_sub}->( \$c ) || [];
        push @$data_list, @$fs;
    }

    return ( $info, $data_list );
} ## end sub get_tiezi_ref

sub request_url {
    my ( $self, $url, $form ) = @_;
    return $DEFAULT_URL_CONTENT unless($url);

    my $c;
    for my $i ( 1 .. $self->{retry} ) {
        eval { $c = $self->request_url_simple( $url, $form ); };
        last if ($c);
        sleep 2;
    }

    return $c || $DEFAULT_URL_CONTENT;
}

sub format_post_content {
    my ($self, $form) = @_;

    my @params;
    while(my ($k, $v) = each %$form){
        push @params, uri_escape($k)."=".uri_escape($v);
    }

    my $post_str = join("&", @params);
    return $post_str;
}

sub request_url_simple {
    my ( $self, $url, $form ) = @_;

    my $res =
        $form
      ? 
      $self->{browser}->request('POST', $url, {
              content => $self->format_post_content($form),
               headers => { 
                   %DEFAULT_HEADER, 
                   'content-type' => 'application/x-www-form-urlencoded' }, 
          })
      : $self->{browser}->get($url);
    return $DEFAULT_URL_CONTENT unless ( $res->{success} );

    my $html;
    my $content = $res->{content};
    if ( $res->{headers}{'content-encoding'} and
         $res->{headers}{'content-encoding'} eq 'gzip' ) {
        gunzip \$content => \$html, MultiStream => 1, Append => 1;
    }

    my $charset = detect( $html || $content );
    my $r = decode( $charset, $html || $content, Encode::FB_XMLCREF );

    return $r || $DEFAULT_URL_CONTENT;
}

1;