use strict;
package HTML::RSSAutodiscovery;
use base qw (HTML::Parser);
# $Id: RSSAutodiscovery.pm,v 1.5 2004/10/17 04:13:06 asc Exp $
=head1 NAME
HTML::RSSAutodiscovery - methods for retreiving RSS-ish information from an HTML document.
=head1 SYNOPSIS
use HTML::RSSAutodiscovery;
use Data::Dumper;
my $url = "http://www.diveintomark.org/";
my $html = HTML::RSSAutodiscovery->new();
print &Dumper($html->parse($url));
# Mark's gone a bit nuts with this and
# the list is too long to include here...
# see the POD for the 'parse' method for
# details of what it returns.
=head1 DESCRIPTION
Methods for retreiving RSS-ish information from an HTML document.
=cut
use LWP::UserAgent;
use HTTP::Request;
use Carp;
$HTML::RSSAutodiscovery::VERSION = '1.21';
use constant SYNDIC8_PROXY => "http://www.syndic8.com/xmlrpc.php";
use constant SYNDIC8_CLASS => "syndic8";
use constant SYNDIC8_FINDSITES => join(".",SYNDIC8_CLASS,"FindSites");
use constant SYNDIC8_FEEDINFO => join(".",SYNDIC8_CLASS,"GetFeedInfo");
use constant MIMETYPE_RSS => "application/rss+xml";
=head1 PACKAGE METHODS
=head2 __PACKAGE__->new()
Object constructor. Returns an object. Woot!
=cut
sub new {
my $pkg = shift;
my $self = {};
bless $self,$pkg;
if (! $self->init(@_)) {
return undef;
}
return $self;
}
sub init {
my $self = shift;
$self->SUPER::init(start_h=> [\&_start,"self,tagname,attr"]);
return 1;
}
=head1 OBJECT METHODS
=cut
=head2 $obj->parse($arg)
Parse an HTML document and return RSS-ish <link> information.
I<$arg> may be either:
=over 4
=item *
An HTML string, passed as a scalar reference.
=item *
A URI.
=back
Returns an array reference of hash references whose keys are :
=over 4
=item *
I<title>
=item *
I<type>
=item *
I<rel>
=item *
I<href>
=back
=cut
sub parse {
my $self = shift;
my $uri = shift;
my $data = $uri;
if (ref($data) ne "SCALAR") {
$data = $self->_fetch($uri) || return undef;
}
$self->{'__embedded'} ||= [];
$self->{'__links'} ||= [];
$self->SUPER::parse($$data);
return $self->{'__links'};
}
=head2 $obj->locate($uri,\%args)
Like the I<parse> method, but will perform additional lookups, if necessary or specified.
Valid arguments are
=over 4
=item *
B<uri>
String. A live, breathing URI to slurp and parse.
I<Required>
=item *
Hash ref whose keys may be
=over 4
=item *
B<noparse>
Boolean. Don't bother parsing the document, this will also prevent you
from checking for embedded links.
I don't know why you want to do this, but you can.
False, by default.
=item *
B<embedded>
Boolean. Check all embedded links ending in '.xml', '.rss' or '.rdf'
(and then 'xml', 'rss' or 'rdf') for RSS-ness.
False, by default, unless the initial parsing of the URI returns no
RSS links.
=item *
B<embedded_and_remote>
Boolean.
Boolean. Check all embedded links whose root is not the same as I<$uri>
for RSS-ness.
False, by default.
=item *
B<syndic8>
Boolean. Check the syndic8 servers for sites matching I<$uri>
False, by default, unless the initial parsing of the URI and any embedded links
returns no RSS links.
=back
=back
Returns an array reference of hash references whose keys are :
=over 4
=item *
I<title>
=item *
I<type>
=item *
I<rel>
=item *
I<href>
=back
=cut
sub locate {
my $self = shift;
my $uri = shift;
my $args = shift;
$self->{'__embedded'} = [];
$self->{'__links'} = [];
my $parse = 1;
my $embedded = 0;
my $syndic8 = 0;
if (ref($args) eq "HASH") {
$parse = ((defined($args->{noparse})) && ($args->{noparse})) ? 0 : 1;
$embedded = ((defined($args->{embedded})) && ($args->{embedded})) ? 1 : 0;
$syndic8 = ((defined($args->{syndic8})) && ($args->{syndic8})) ? 1 : 0;
}
if ($parse) {
# This is a hack. Do as I say, not as I do
if ($embedded) {
$self->{'__check_embedded'} = ($args->{embedded_and_remote}) ? 2 : 1;
}
$self->parse($uri);
}
if (($parse) && (($embedded) || (scalar(@{$self->{'__links'}}) < 1))) {
$self->_check_embedded($uri);
if (scalar(@{$self->{'__links'}}) < 1) {
$self->_check_embedded($uri,{liberal=>1});
}
}
if (($syndic8) || (scalar(@{$self->{'__links'}}) < 1)) {
$self->_check_syndic8($uri);
}
return $self->{'__links'};
}
sub _fetch {
my $self = shift;
my $uri = shift;
$self->{'__ua'} ||= LWP::UserAgent->new();
my $res = $self->{'__ua'}->request(HTTP::Request->new(GET=>$uri));
if (! $res->is_success()) {
return undef;
}
return \$res->content();
}
sub _check_embedded {
my $self = shift;
my $uri = shift;
my $args = shift;
my $rss = $self->_rss()
|| return 0;
# How anal...I mean, liberal do I need to be about this?
my $pattern = $args->{'liberal'} ? "r([dfs]+)" : "\\.r([dfs]+)";
my @links = grep { $_ =~ /(?:$pattern)$/ } @{$self->{'__embedded'}};
if (! @links) {
return 1;
}
# We just get this out of the way
# now in case $link is a relative
# URL
unless ($uri =~ /\/$/) {
$uri .= "/";
}
foreach my $link (@links) {
if (($link =~ /^http/) && ($self->{'__check_embedded'} < 2)) {
next unless $link =~ /^$uri/;
}
elsif ($link =~ /^http/) {
next if $link =~ m!127.0.0!
}
else {
$link = $uri.$link;
}
next if ($self->_linked($link));
my $data = $self->_fetch($link);
if (! $data) {
carp "Failed to fetch '$uri', skipping.\n";
next;
}
eval { $rss->parse($$data); };
if ($@) {
# carp "Not RSS, $@\n";
next;
}
next unless (defined($rss->{'_internal'}{'version'}));
push @{$self->{'__links'}} ,{
rel => "alternate",
href => $uri,
title => $rss->{"channel"}{"description"},
type => MIMETYPE_RSS,
};
}
return 1;
}
sub _check_syndic8 {
my $self = shift;
my $uri = shift;
my $rpc = $self->_xmlrpc({proxy=>SYNDIC8_PROXY})
|| return 0;
$uri =~ m!^(?:http://)?(?:www)?([^/]+)(?:/.*)?$!;
if (! $1) {
carp "Failed to parse URI '$uri', skipping lookup.\n";
return 0;
}
my $ids = $rpc->call(SYNDIC8_FINDSITES,$1)->result()
|| return 1;
my $info = $rpc->call(SYNDIC8_FEEDINFO,$ids)->result()
|| return 1;
foreach my $site (@$info) {
next unless ($site->{"fetchable"});
next unless ($site->{status} eq "Syndicated");
next if ($self->_linked($site->{"dataurl"}));
push @{$self->{'__links'}} ,{
rel => "alternate",
href => $site->{"dataurl"},
title => $site->{"description"},
type => MIMETYPE_RSS,
};
}
return 1;
}
sub _rss {
my $self = shift;
if (ref($self->{'__rss'}) eq "ARRAY") {
return undef;
}
#
if (! $self->{'__rss'}) {
eval "require XML::RSS";
if ($@) {
carp "Unable to load RSS parser.\n";
$self->{'__xmlrpc'} = [$@];
return undef;
}
$self->{'__rss'} = XML::RSS->new();
}
return $self->{'__rss'};
}
sub _xmlrpc {
my $self = shift;
my $args = shift;
if (ref($self->{'__xmlrpc'}) eq "ARRAY") {
return undef;
}
#
if ((! $self->{'__xmlrpc'}) ||
(($args->{'proxy'}) && ($self->{'__xmlrpc'}->proxy() ne $args->{'proxy'}))) {
eval "require XMLRPC::Lite";
if ($@) {
carp "Unable to load XMLRPC class. Syndic8 lookup disabled.\n";
$self->{'__xmlrpc'} = [$@];
return undef;
}
$self->{'__xmlrpc'} = XMLRPC::Lite->new();
$self->{'__xmlrpc'}->proxy($args->{'proxy'});
# $self->{'__xmlrpc'}->on_debug(sub{print@_});
}
return $self->{'__xmlrpc'};
}
sub _linked {
my $self = shift;
my $uri = shift;
if (defined($self->{'__linked'}{$uri})) {
return $self->{'__linked'}{$uri};
}
foreach (@{$self->{'__links'}}) {
if ($_->{href} eq $uri) {
$self->{'__linked'}{$uri} = 1;
return 1;
}
}
$self->{'__linked'}{$uri} = 0;
return 0;
}
sub _start {
my $self = shift;
my $tag = shift;
my $attrs = shift;
# Anything to check?
# We may not actually need to check anchors
# but in the interests of keeping things
# simple (read-ability) we defer that check
# for later...
unless ($tag =~ /^(link|a)$/) {
return;
}
# Check anchors
# See note re: __check_emebedded in &locate()
if (($self->{'__check_embedded'}) && ($tag eq "a")) {
if ($attrs->{'href'} =~ /(?:\.)?r(?:df|ss)$/i) {
push @{$self->{'__embedded'}} , $attrs->{'href'};
}
return;
}
# Check links
if ((defined($attrs->{'name'})) &&
($attrs->{'name'} =~ /^(XML|RSS)$/)) {
return;
}
if ((defined($attrs->{'name'})) &&
($attrs->{'type'} ne "application/rss+xml") &&
($attrs->{'type'} ne "text/xml")) {
return;
}
delete $attrs->{"/"};
push @{$self->{'__links'}},$attrs;
}
=head1 VERSION
1.21
=head1 DATE
$Date: 2004/10/17 04:13:06 $
=head1 AUTHOR
Aaron Straup Cope
=head1 SEE ALSO
Because you shouldn't need all that white space to do cool stuff ;-)
http://diveintomark.org/archives/2002/05/30.html#rss_autodiscovery
http://diveintomark.org/archives/2002/08/15.html
http://diveintomark.org/projects/misc/rssfinder.py.txt
=head1 REQUIREMENTS
=head2 BASIC
These packages are required to actually parse an HTML document or URI.
=over 4
=item *
B<HTML::Parser>
=item *
B<LWP::UserAgent>
=item *
B<HTTP::Request>
=back
=head2 EMBEDDED
These packages are required to check the embedded links in a URI for RSS files.
They are not loaded until run-time so they are not required for doing basic parsing
=over 4
=item *
B<XML::RSS>
=back
=head2 SYNDIC8
These packages are required to query the syndic8 servers for RSS files associated with a URI.
They are not loaded until run-time so they are not required for doing basic parsing
=over 4
=item *
B<XMLRPC::Lite>
=back
=head1 LICENSE
Copyright (c) 2002-2004, Aaron Straup Cope. All Rights Reserved.
This is free software, you may use it and distribute it under the same terms as Perl itself.
=cut
return 1;