The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# $Id: /mirror/perl/HTTP-RobotsTag/trunk/lib/HTTP/RobotsTag.pm 31676 2007-12-10T00:06:35.669605Z daisuke  $
#
# Copyright (c) 2007 Daisuke Maki <daisuke@endewoks.jp>
# All rights reserved.

package HTTP::RobotsTag;
use strict;
use warnings;
use Carp qw(croak);
use DateTime::Format::Strptime;
use HTTP::Headers;
use HTTP::RobotsTag::Rules;
our $VERSION = '0.00001';

sub new
{
    my $class = shift;
    my $self  = bless {}, $class;
    return $self;
}

my $re = qr{
    (
        (?:no)?
        (?:
            index   |
            archive |
            snippet |
            follow  |
            unavailable_after:.+
        )
    )
    (?:
        \s*,\s*
        |
        $
    )
}x;

sub parse_headers
{
    my ($self, $headers) = @_;

    if (! eval { $headers->can('header') }) {
        croak "argument does not implement a header() function";
    }

    my $fmt = DateTime::Format::Strptime->new(
        pattern => '%d %b %Y %H:%M:%S %Z'
    );

    my %directives;
    my @tags = $headers->header( 'x-robots-tag' );
    foreach my $tag (@tags) {
        while ($tag =~ /$re/g) {
            my($key, $val) = split(/:/, $tag, 2);
            $key = lc $key;
            if ($key eq 'unavailable_after') {
                $val =~ s/^\s+//;
                $val =~ s/\s+$//;
                $directives{ $key } = $fmt->parse_datetime($val) or die;
            } else {
                $directives{ $key } = $val || 1;
            }
        }
    }

    return HTTP::RobotsTag::Rules->new(%directives);
}

1;

__END__

=head1 NAME

HTTP::RobotsTag - Parse Robots Tag In HTTP Headers

=head1 SYNOPSIS

  use HTTP::RobotsTag;

  my $response = $lwp->get( $url );
  my $p        = HTTP::RobotsTag->new();
  my $rule     = $p->parse_headers( $response );

  if ($rule->can_index()) {
    ...
  }

  if ($rule->is_available( $dt )) {
    ...
  }

=head1 DESCRIPTION

HTTP::RobotsTag parses HTTP headers for X-Robots-Tag headers and stores the
information in a Rules object.

=head1 METHODS

=head2 new

=head2 parse_headers($headers)

Accepts a HTTP::Headers (or an object that implements the header()) method,
and looks for X-Robots-Tag headers.

=head1 AUTHOR

Copyright (c) 2007 Daisuke Maki E<lt>daisuke@endeworks.jpE<gt>

=head1 SEE ALSO

L<HTML::RobotsTag::Rules|HTML::RobotsTag::Rules> 

=head1 LICENSE

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

See http://www.perl.com/perl/misc/Artistic.html

=cut