The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package SWISH::Prog::Aggregator::Spider::Response;
use strict;
use warnings;
use base qw( SWISH::Prog::Class );
use Carp;
use Data::Dump qw( dump );
use Search::Tools::UTF8;
use HTML::LinkExtor;
use URI;
use HTML::Tagset;
use HTML::HeadParser;

our $VERSION = '0.73';

__PACKAGE__->mk_accessors(
    qw(
        http_response
        link_tags
        )
);

=pod

=head1 NAME

SWISH::Prog::Aggregator::Spider::Response - spider response

=head1 SYNOPSIS

 use SWISH::Prog::Aggregator::Spider::UA;
 my $ua = SWISH::Prog::Aggregator::Spider::UA->new;
 my $response = $ua->get('http://swish-e.org/');
 my $http_response = $response->http_response;
 
 # $ua isa LWP::RobotUA subclass
 # $response isa SWISH::Prog::Aggregator::Spider::Response
 # $http_response isa HTTP::Response

=head1 DESCRIPTION

SWISH::Prog::Aggregator::Spider::Response wraps the
HTTP::Response class and provides some convenience methods.

=head1 METHODS

=cut

=head2 init

Override internal constructor setup method.

=cut

sub init {
    my $self = shift;
    $self->SUPER::init(@_);

    # TODO set by our UA. duplicate?
    #$self->{link_tags} ||= { a => 1, frame => 1, iframe => 1, };
    return $self;
}

=head2 http_response

Returns internal HTTP::Response object.

=cut

=head2 success

Shortcut for $response->http_response->is_success.

=cut

sub success {
    return shift->http_response->is_success;
}

=head2 status

Shortcut for $response->http_response->code.

=cut

sub status {
    return shift->http_response->code;
}

=head2 ct

Shortcut for $response->response->header('content-type').
Any encoding will be stripped from the returned string.

=cut

sub ct {
    my $self = shift;
    my $ct   = $self->http_response->header('content-type');
    $ct =~ s/;.+// if $ct;
    return $ct;
}

=head2 is_html

Returns true if ct() looks like HTML or XHTML.

=cut

sub is_html {
    my $self = shift;
    my $ct   = $self->ct;
    return defined $ct
        && ( $ct eq 'text/html' || $ct eq 'application/xhtml+xml' );
}

=head2 content

Shortcut for $response->http_response->decoded_content.

=cut

sub content {
    return shift->http_response->decoded_content;
}

=head2 links

Returns array of href targets in content(). Parsed
using HTML::LinkExtor.

=cut

sub links {
    my $self          = shift;
    my @links         = ();
    my $http_response = $self->http_response;
    my $debug         = $self->debug;

    if ( $http_response and $self->is_html ) {
        my $le   = HTML::LinkExtor->new();
        my $base = $http_response->base;
        $le->parse( $self->content );

        my %skipped_tags;

        for my $link ( $le->links ) {
            my ( $tag, %attr ) = @$link;

            # which tags to use
            my $attr = join ' ', map {qq[$_="$attr{$_}"]} keys %attr;

            $debug and SWISH::Prog::Utils->write_log(
                uri => $base,
                msg => "extracted tag '<$tag $attr>'"
            );

            if ( !exists $self->link_tags->{$tag} ) {
                $debug
                    and SWISH::Prog::Utils->write_log(
                    uri => $base,
                    msg => "skipping tag '<$tag $attr>', not on whitelist"
                    );
                next;
            }

            # Grab which attribute(s) which might contain links for this tag
            my $links = $HTML::Tagset::linkElements{$tag};
            $links = [$links] unless ref $links;

            my $found = 0;

            # check each attribute to see if a link exists
            for my $attribute (@$links) {
                if ( $attr{$attribute} ) {

                    # strip any anchors as noise
                    $attr{$attribute} =~ s/#.*//;

                    my $u = URI->new_abs( $attr{$attribute}, $base );
                    push @links, $u;
                    $debug
                        and SWISH::Prog::Utils->write_log(
                        uri => $base,
                        msg => "added '$u' to links",
                        );
                    $found++;
                }
            }

            if ( !$found && $debug ) {
                SWISH::Prog::Utils->write_log(
                    uri => $base,
                    msg => "tag <$tag $attr> has no links or is a duplicate",
                );
            }

        }

        $debug
            and SWISH::Prog::Utils->write_log(
            uri => $base,
            msg => sprintf( "found %d links", scalar @links ),
            );

    }
    return @links;
}

=head2 link_tags( I<hashref> )

Set hashref of tags considered valid "links". Used by the links()
method.

=cut

=head2 title

Returns document title, verifying that UTF-8
flag is set correctly on the response content.

=cut

sub title {
    my $self = shift;
    return unless $self->is_html;

    my $p = HTML::HeadParser->new;

    # HTML::HeadParser throws warning if utf-8 flag is not on for utf-8 bytes.
    # So we trust the content-type header and
    # verify that the utf-8 flag is on.
    if ( $self->http_response->header('content-type') =~ m/utf-8/i ) {
        $p->parse( to_utf8( $self->content ) );
    }
    else {
        $p->parse( $self->content );
    }
    return $p->header('Title');
}

# delegate all other method calls the the http_response object.
# cribbed from HTTP::Message
our $AUTOLOAD;

sub AUTOLOAD {
    my $method = substr( $AUTOLOAD, rindex( $AUTOLOAD, '::' ) + 2 );

    # We create the function here so that it will not need to be
    # autoloaded the next time.
    no strict 'refs';
    *$method = sub {
        local $Carp::Internal{ +__PACKAGE__ } = 1;
        shift->http_response->$method(@_);
    };
    goto &$method;
}

sub DESTROY { }    # avoid AUTOLOADing it

1;

__END__

=head1 AUTHOR

Peter Karman, E<lt>perl@peknet.comE<gt>

=head1 BUGS

Please report any bugs or feature requests to C<bug-swish-prog at rt.cpan.org>, or through
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=SWISH-Prog>.  
I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc SWISH::Prog


You can also look for information at:

=over 4

=item * RT: CPAN's request tracker

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=SWISH-Prog>

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/SWISH-Prog>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/SWISH-Prog>

=item * Search CPAN

L<http://search.cpan.org/dist/SWISH-Prog/>

=back

=head1 COPYRIGHT AND LICENSE

Copyright 2008-2009 by Peter Karman

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself. 

=head1 SEE ALSO

L<http://swish-e.org/>