The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# $Id: /mirror/perl/WWW-RobotRules-Parser/trunk/lib/WWW/RobotRules/Parser.pm 31629 2007-12-01T13:31:37.690053Z daisuke  $
#
# Copyright (c) 2006-2007 Daisuke Maki <daisuke@endeworks.jp>
# A lot of this code is based on WWW::RobotRules.

package WWW::RobotRules::Parser;
use strict;
use warnings;
use vars qw($VERSION);
use URI;

BEGIN
{
    $VERSION = '0.04001';
}

sub new { bless { }, shift }

sub parse_uri
{
    my $self = shift;
    my $uri  = shift;

    require LWP::UserAgent;
    my $ua   = LWP::UserAgent->new;
    my $res  = $ua->get($uri);

    if (! $res->is_success) {
        require Carp;
        Carp::croak("Failed to retrieve $uri: Got HTTP code " . $res->code);
    }
        
    $self->parse($uri, $res->content);
}

sub parse
{
    my $self = shift;
    my $robot_txt_uri = shift;
    my $text = shift;

    $robot_txt_uri = URI->new($robot_txt_uri) if ($robot_txt_uri);

    my %result;
    # blank lines are significant, so turn CRLF into LF to avoid generating
    # false ones
    $text =~ s/\015\012/\012/g;

    my $ua_pattern;
    # split at \012 or \015
    for (split(/[\012\015]/, $text)) {
        # Lines containing only a comment are discarded completely, and
        # therefore do not indicate a record boundary
        next if /^\s*\#/;

        s/\s*\#.*//; # Remove comments at end-of-line

        if (/^\s*User-Agent\s*:\s*(.*)/i) {
            $ua_pattern = $1;
            $ua_pattern =~ s/\s+$//;
        } elsif (/^\s*Disallow\s*:\s*(.*)/i) {
            if (! defined $ua_pattern) {
                $ua_pattern = '*';
            }

            my $disallow = $1;
            $disallow =~ s/\s+$//;
            if (length $disallow) {
                if ($robot_txt_uri) {
                    my $ignore;
                    eval {
                        my $u = URI->new_abs($disallow, $robot_txt_uri);
                        $ignore++ if $u->scheme ne $robot_txt_uri->scheme;
                        $ignore++ if lc($u->host) ne lc($robot_txt_uri->host);
                        $ignore++ if $u->port ne $robot_txt_uri->port;
                        $disallow = $u->path_query;
                        $disallow = "/" unless length $disallow;
                    };
                    next if $@;
                    next if $ignore;
                }
                push @{$result{$ua_pattern}}, $disallow;
            }
        }
    }
    return wantarray ? %result : \%result
}

1;

__END__

=head1 NAME

WWW::RobotRules::Parser - Just Parse robots.txt

=head1 SYNOPSIS

  use WWW::RobotRules::Parser;
  my $p = WWW::RobotRules::Parser->new;
  $p->parse($robots_txt_uri, $text);

  $p->parse_uri($robots_txt_uri);

=head1 DESCRIPTION

WWW::RobotRules::Parser allows you to simply parse robots.txt files as
described in http://www.robotstxt.org/wc/norobots.html. Unlike WWW::RobotRules
(which is very cool), this module does not take into consideration your
user agent name when parsing. It just parses the structure and returns
a hash containing the whole set of rules. You can then use this to do
whatever you like with it.

I mainly wrote this to store away the parsed data structure else where for
later use, without having to specify an user agent.

=head1 METHODS

=head2 new

Creates a new instance of WWW::RobotRules::Parser

=head2 parse($uri, $text)

Given the URI of the robots.txt file and its contents, parses the content and returns a data structure that looks like the following:

  {
     '*' => [ '/private', '/also_private' ],
     'Another UserAgent' => [ '/dont_look' ]
  }

Where the key is the user agent name, and the value is an arrayref of all
paths that are prohibited by that user agent

=head2 parse_uri($uri)

Given the URI of the robots.txt file, retrieves and parses the file.

=head1 SEE ALSO 

L<WWW::RobotRules>

=head1 AUTHOR

Copyright (c) 2006-2007 Daisuke Maki E<lt>daisuke@endeworks.jpE<gt>

=head1 LICENSE

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

See http://www.perl.com/perl/misc/Artistic.html

=cut