The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# $Id: /mirror/gungho/lib/Gungho/Component/RobotRules.pm 31300 2007-11-29T11:51:03.339151Z lestrrat  $
#
# Copyright (c) 2007 Daisuke Maki <daisuke@endeworks.jp>

package Gungho::Component::RobotRules;
use strict;
use warnings;
use base qw(Gungho::Component);
use Gungho::Component::RobotRules::Rule;
use WWW::RobotRules::Parser;

__PACKAGE__->mk_classdata($_) for qw(pending_robots_txt robot_rules_parser robot_rules_storage pending_count);

sub setup
{
    my $c = shift;
    $c->pending_robots_txt({});
    $c->pending_count(0);
    $c->setup_robot_rules_storage();
    $c->setup_robot_rules_parser();
    $c->next::method(@_);
}

sub send_request
{
    my ($c, $request) = @_;

    my $allowed = 
        $request->uri->path eq '/robots.txt' ||
        $c->allowed($request)
    ;
    if ($allowed == -2) {
        $c->log->debug("[ROBOT RULES] Fetch for /robots.txt already scheduled for " . $request->original_uri->host_port);
    } elsif ($allowed == -1) {
        $c->log->debug("[ROBOT RULES] No robot rules found for " . $request->original_uri->host_port . ", going to fetch one");
    } elsif ($allowed) {
        return $c->next::method($request);
    } else {
        $c->log->debug("[ROBOT RULES] " . $request->uri . " is disallowed by robot rules");
    }
    return 0;
}

sub allowed
{
    my ($c, $request) = @_;

    my $rule = $c->robot_rules_storage->get_rule( $c, $request );
    if (! $rule) {
        if ($c->push_pending_robots_txt($request) == 0) {
            return -2;
        }

        my $uri = $request->original_uri;
        $uri->path('/robots.txt');
        $uri->query(undef);
        $uri->fragment(undef);
        my $req = Gungho::Request->new(GET => $uri);
        $req->notes('auto_robot_rules' => 1);
        $c->pushback_request( $req );
        return -1;
    } else {
        return $rule->allowed( $c, $request->uri );
    }
}

sub handle_response
{
    my $c = shift;
    my ($request, $response) = @_;

    if ($request->uri->path eq '/robots.txt' && $request->notes('auto_robot_rules')) {
        $c->log->debug("Handling robots.txt response for " . $request->uri);
        $c->parse_robot_rules($request, $response);
        $c->dispatch_pending_robots_txt($request);
        return;
    }

    $c->next::method(@_);
}

sub push_pending_robots_txt
{
    my ($c, $request) = @_;
    $c->pending_count( $c->pending_count + 1 );
    $c->log->debug(
        "[ROBOT RULES]: Requests still pending: " . $c->pending_count
    );
    return $c->robot_rules_storage->push_pending_robots_txt( $c, $request );
}

sub dispatch_pending_robots_txt
{
    my ($c, $request) = @_;

    my $pending = $c->robot_rules_storage->get_pending_robots_txt($c, $request);
    if ($pending && ref $pending eq 'HASH') {
        foreach my $request (values %$pending) {
            $c->pending_count( $c->pending_count - 1 );
            $c->pushback_request( $request );
        }
    }
    $c->log->debug(
        "[ROBOT RULES]: Requests still pending: " . $c->pending_count
    );
}

sub setup_robot_rules_storage
{
    my $c = shift;

    my $config = $c->config->{robotrules}{storage} || {};

    my $pkg = $config->{module} || 'DB_File';
    my $pkg_config = $config->{config} || {};
    $pkg = $c->load_gungho_module($pkg, 'Component::RobotRules::Storage');
    my $storage = $pkg->new(%$config);
    $storage->setup($c);
    $c->robot_rules_storage( $storage );
}

sub setup_robot_rules_parser
{
    my $c = shift;

    my $config = $c->config->{robotrules}{parser} || {};

    my $pkg = $config->{module} || '+WWW::RobotRules::Parser';
    my $pkg_config = $config->{config} || {};
    $pkg = $c->load_gungho_module($pkg, 'Component');
    my $parser = $pkg->new($config);
    $c->robot_rules_parser( $parser );
}

sub parse_robot_rules
{
    my ($c, $request, $response) = @_;

    my $h = ($request && $response && $response->is_success && $response->content) ?
        $c->robot_rules_parser->parse($request->original_uri, $response->content) :
        {}
    ;
    $c->log->debug("Parse robot rules " . $request->uri . ": " . keys(%$h) . " rules");
    my $rule = Gungho::Component::RobotRules::Rule->new($h);
    $c->robot_rules_storage->put_rule($c, $request, $rule);
}

1;

=head1 NAME

Gungho::Component::RobotRules - Respect robots.txt

=head1 SYNOPSIS

  ---
  components:
    - RobotRules

=head1 METHODS

=head2 setup

=head2 setup_robot_rules_parser

=head2 setup_robot_rules_storage

=head2 handle_response

=head2 send_request

=head2 allowed($request)

Returns 1 if request is allowed to be fetched, 0 if not. -1 and -2 are returned
when there is a pending request to fetch /robots.txt

=head2 dispatch_pending_robots_txt

Dispatches requests that were pending because of a missing robots.txt entry

=head2 push_pending_robots_txt

Pushes a request in the wait queue for a robots.txt

=head2 parse_robot_rules

Parses the robot rule and stores it

=cut