package WWW::Mechanize::Cached;
use strict;
use warnings FATAL => 'all';
use vars qw( $VERSION );
$VERSION = '1.33';
use base qw( WWW::Mechanize );
use Carp qw( carp croak );
use Storable qw( freeze thaw );
use Scalar::Util qw( blessed );
use Compress::Zlib;
require Scrapar::Util;
use File::Find::Rule;
my $cache_key = __PACKAGE__;
sub _recycle_cache {
my $self = shift;
my $namespace = shift;
    # Recycle cache files when /tmp disk usage exceeds 50%
    if (Scrapar::Util::disk_usage('/tmp') > 50) {
        $ENV{SCRAPER_LOGGER}->info("Recycling cache files in $namespace");
        # remove cache files older than 30 days
for my $file (File::Find::Rule->file()
->mtime("<" . (time - 30*86400))
->in('/tmp/FileCache/' . $namespace)) {
unlink $file;
}
$ENV{SCRAPER_LOGGER}->info("Recycling done");
}
}
sub new {
my $class = shift;
my %mech_args = @_;
my $cache = delete $mech_args{cache};
if ( $cache ) {
        # blessed() avoids a fatal "can't call method on unblessed reference"
        # when a plain (non-hash) reference is passed in
        my $ok = blessed($cache) && $cache->can("get") && $cache->can("set");
        if ( !$ok ) {
            carp "The cache parameter must be an initialized cache object";
            $cache = undef;
}
}
my $self = $class->SUPER::new( %mech_args );
if ( !$cache ) {
require Cache::FileCache;
my $cache_namespace = $ENV{SCRAPER_BACKEND} ?
'www-mechanize-cached-' . $ENV{SCRAPER_BACKEND} :
'www-mechanize-cached';
my $cache_parms = {
default_expires_in => ($self->{cache_expires_in} || "1d"),
namespace => $cache_namespace,
};
$cache = Cache::FileCache->new( $cache_parms );
$self->_recycle_cache($cache_namespace);
}
$self->{$cache_key} = $cache;
return $self;
}
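# Example usage (a sketch): any object that responds to get() and set() can be
# passed as the cache; Cache::FileCache is shown here and the namespace is
# illustrative.
#
#   use Cache::FileCache;
#   use WWW::Mechanize::Cached;
#
#   my $cache = Cache::FileCache->new( { namespace => 'my-scraper' } );
#   my $mech  = WWW::Mechanize::Cached->new( cache => $cache );
#   $mech->get('http://example.com/');
#   print "from cache\n" if $mech->is_cached;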
sub is_cached {
my $self = shift;
return $self->{_is_cached};
}
sub _make_request {
my $self = shift;
my $request = shift;
my $req = $request->as_string;
my $cache = $self->{$cache_key};
my $response = $cache->get( $req );
$ENV{SCRAPER_REQUESTS}++;
if ( $response ) {
        # display the cache key for debugging (relies on a private
        # Cache::FileBackend helper)
require Cache::FileBackend;
print("Cache key: " . Cache::FileBackend::_Build_Unique_Key($req)
. "\n");
print "[Cache hit!] " . $request->uri() . "\n";
$response = thaw uncompress $response;
$self->{_is_cached} = 1;
$ENV{SCRAPER_CACHE_HITS}++;
}
else {
$response = $self->SUPER::_make_request( $request, @_ );
        # Work around http://rt.cpan.org/Public/Bug/Display.html?id=42693:
        # decode the body and drop the handlers so the response can be frozen
        $response->decode();
        delete $response->{handlers};
$cache->set( $req, compress freeze($response) );
$self->{_is_cached} = 0;
}
# An odd line to need.
$self->{proxy} = {} unless defined $self->{proxy};
return $response;
}
# Add filtered-output helpers to HTML::Element
package HTML::Element;
use lib qw(lib);
use Scrapar::Util;
use Digest::MD5 qw(md5_hex);
# Unique, per-run tag name for the pseudo-elements that wrap raw text nodes
my $text_tag = '_text_' . md5_hex time;
sub as_filtered_text {
my $self = shift;
my $sub_ref = shift;
my $text = $self->as_text;
return $text if ref($sub_ref) ne 'CODE';
return $sub_ref->($text);
}
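# Example (a sketch): pass any code ref that takes and returns a string; the
# whitespace-squashing filter below is illustrative.
#
#   my $clean = $element->as_filtered_text(sub {
#       my $text = shift;
#       $text =~ s/\s+/ /g;
#       return $text;
#   });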
# text_objectified should really override HTML::Element::objectify_text, but
# somehow overriding it keeps the module from loading; to be investigated.
sub text_objectified {
my (@stack) = ($_[0]);
my ($this);
while (@stack) {
foreach my $c (@{( $this = shift @stack )->{'_content'}}) {
if (ref($c)) {
unshift @stack, $c;
}
else {
# pure text
my $text = $c;
$c = ( $this->{'_element_class'} || __PACKAGE__ )
->new($text_tag, '_parent' => $this);
$c->push_content($text);
}
}
}
return;
}
sub as_deobjectified_HTML {
my $self = shift;
my $html = $self->as_HTML(@_);
$html =~ s[<$text_tag>(.+?)</$text_tag>][$1]sg;
return $html;
}
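# Example round trip (a sketch): wrap the tree's raw text nodes so they can be
# queried as elements, then strip the synthetic tags again on output.
#
#   $tree->text_objectified;
#   my $html = $tree->as_deobjectified_HTML;   # no $text_tag wrappers remain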
sub as_headtrimmed_text {
my $self = shift;
my $regex = shift;
return trim_head($self->as_text, $regex);
}
sub as_tailtrimmed_text {
my $self = shift;
my $regex = shift;
return trim_tail($self->as_text, $regex);
}
sub innerHTML {
my $self = shift;
my $html = $self->as_HTML;
$html =~ s[\A[\n\r\s]*<[^>]+?>(.+)<[^>]+?>[\n\r\s]*\z][$1]s;
return $html;
}
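# Example (a sketch): innerHTML strips the element's own open and close tags,
# keeping only its contents.
#
#   my $div = HTML::Element->new_from_lol([ 'div', [ 'p', 'hi' ] ]);
#   print $div->innerHTML;    # the <p> content without the outer <div> tags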
sub query_first {
my $self = shift;
my $query = shift || return;
my @r = $self->query($query);
if (defined $r[0] && ref $r[0]) {
return $r[0];
}
else {
return HTML::Element->new('span');
}
}
package Scrapar::Mechanize;
use strict;
use warnings;
use DB_File;
use Digest::MD5 qw(md5);
use URI;
use Data::Dumper;
use Scrapar::HTMLQuery 'query';
use HTML::TreeBuilder;
use HTML::Element;
use HTTP::Cookies;
use Scrapar::Util;
our @ISA;
BEGIN {
    # choose the parent class at compile time; no string eval is needed
    if ($ENV{SCRAPER_CACHE}) {
        # WWW::Mechanize::Cached is defined earlier in this file
        push @ISA, 'WWW::Mechanize::Cached';
    }
    else {
        require WWW::Mechanize;
        push @ISA, 'WWW::Mechanize';
    }
}
use Cwd ();
# $ENV{PWD} is shell-maintained and can be unset or stale; ask the OS instead
my $history_filename = Cwd::getcwd() . '/scraper-history';
END {
unlink $history_filename;
}
tie my %history,
    'DB_File', $history_filename, O_CREAT | O_RDWR, 0644, $DB_BTREE
    or die "Cannot tie history file $history_filename: $!";
# Called as a plain function with a URL as its first argument (the key must
# match the one built in get() below); returns true if already fetched.
sub visited {
    my $key = md5(Dumper(URI->new($_[0])));
    return 1 if $history{$key};
    return;
}
sub cache_expires_in {
my $self = shift;
$self->{cache_expires_in} = shift || $self->{cache_expires_in} || '1d';
return $self->{cache_expires_in};
}
sub new {
my $class = shift;
my $self = $class->SUPER::new(@_);
$self->stack_depth(0);
$self->{fetch_count} = 0;
my $cookie_folder = "$ENV{HOME}/.scraper-cookies";
mkdir $cookie_folder;
my $cookie_jar = HTTP::Cookies->new(
file => ($ENV{SCRAPER_BACKEND} ?
"$cookie_folder/$ENV{SCRAPER_BACKEND}.dat" :
"$cookie_folder/hq-cookies.dat"),
autosave => 1
);
$self->cookie_jar($cookie_jar);
$self->agent_alias('Windows IE 6');
$self->proxy([ 'http' ], $ENV{SCRAPER_PROXY}) if $ENV{SCRAPER_PROXY};
$self;
}
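# Example invocation (a sketch; these environment switches are the ones
# consulted in this file -- the values are illustrative):
#
#   SCRAPER_CACHE=1 SCRAPER_PROXY=http://localhost:3128 \
#       SCRAPER_TIME_INTERVAL=1 perl your-scraper.pl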
sub referer {
my $self = shift;
$self->add_header(Referer => shift);
}
sub no_referer {
my $self = shift;
$self->delete_header('Referer');
}
sub get {
my $self = shift;
my $u;
if (ref $_[0]) {
if (ref $_[0] eq 'WWW::Mechanize::Link') {
$u = URI->new($_[0]->url_abs);
}
}
else {
$u = URI->new($_[0]);
}
my $key = md5(Dumper($u));
    # be polite: wait up to 9 seconds between requests when throttling is on
    sleep(int rand 10) if $ENV{SCRAPER_TIME_INTERVAL};
$self->{_html_tree_cache} = undef;
my ($caller_sub) = (caller(1))[3];
if (!$history{$key}) {
$self->{fetch_count}++;
if (exists $ENV{SCRAPER_MAX_LINKS}
&& $self->{fetch_count} >= $ENV{SCRAPER_MAX_LINKS}) {
print "Max link is reached. Exiting ...\n";
exit;
}
print "\n-- $caller_sub --\n\n[$self->{fetch_count}][Mechanize] Getting " . ($u ? $u->as_string : '--- url ---') . "\n";
$ENV{SCRAPER_LOGGER}->info("($self->{fetch_count}) Get " .($u ? $u->as_string : '--- url ---') . " $caller_sub");
# print Scrapar::Util::free_mem_ratio(), $/;
if (Scrapar::Util::free_mem_ratio() < 0.2) {
$ENV{SCRAPER_LOGGER}->warning("Free memory ratio less then 0.2. Aborting");
exit;
}
$self->SUPER::get(@_);
$history{$key}++;
}
}
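# Example (a sketch): repeated gets of the same URL are skipped via the
# on-disk history, so a crawl loop can safely revisit links.
#
#   $mech->get('http://example.com/page');   # fetched
#   $mech->get('http://example.com/page');   # skipped: already in history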
sub html_tree {
my $self = shift;
my $args_ref = shift;
# memoize html tree
if (!$self->{_html_tree_cache}) {
        my $tree = $self->{_html_tree_cache} = HTML::TreeBuilder->new;
        $tree->parse($self->content);
        $tree->eof;    # finish parsing so the tree is complete
if ($args_ref->{objectify_text}) {
$tree->text_objectified;
$tree->{_is_text_objectified} = 1;
}
}
return $self->{_html_tree_cache};
}
sub query {
my $self = shift;
my $query = shift || return;
my $args_ref = shift || {};
my @results = $self->html_tree($args_ref)->query($query);
return wantarray ? @results : \@results;
}
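# Example (a sketch; assumes Scrapar::HTMLQuery understands CSS-style
# selectors -- adjust to whatever syntax it actually supports):
#
#   my @links = $mech->query('div.result a');
#   for my $a (@links) {
#       print $a->attr('href'), "\n";
#   }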
sub query_first {
my $self = shift;
my $query = shift || return;
my $args_ref = shift;
my @r = $self->query($query, $args_ref);
if (defined $r[0] && ref $r[0]) {
return $r[0];
}
else {
return HTML::Element->new('span');
}
}
sub bulk_query_first {
my $self = shift;
my $r = shift;
for my $req (@_) {
my ($field, $query, $render, $render_arg) = @{$req};
$r->{$field} = $self->query_first($query)->$render($render_arg);
}
}
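# Example (a sketch): each request is [ field, query, render method,
# render arg ]; results are written into the hashref in place. The field
# names, selectors, and render arguments below are illustrative.
#
#   my %record;
#   $mech->bulk_query_first(\%record,
#       [ title => 'h1.title', as_headtrimmed_text => qr/^\s+/ ],
#       [ body  => 'div.body', as_filtered_text => sub { lc shift } ],
#   );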
sub find_email {
my $self = shift;
return Scrapar::Util::find_email($self->content);
}
1;