package Plagger::Plugin::Filter::FindEnclosures;
use strict;
use base qw( Plagger::Plugin );
use HTML::TokeParser;
use Plagger::Util qw( decode_content );
use List::Util qw(first);
use URI;
use DirHandle;
use Plagger::Enclosure;
use Plagger::UserAgent;
sub register {
my($self, $context) = @_;
$context->autoload_plugin({ module => 'Filter::ResolveRelativeLink' });
$context->register_hook(
$self,
'update.entry.fixup' => \&filter,
);
}
sub init {
my $self = shift;
$self->SUPER::init(@_);
$self->load_plugins();
$self->{ua} = Plagger::UserAgent->new;
}
sub load_plugins {
my $self = shift;
my $context = Plagger->context;
my $dir = $self->assets_dir;
my $dh = DirHandle->new($dir) or $context->error("$dir: $!");
for my $file (grep -f $_->[0] && $_->[0] =~ /\.(?:pl|yaml)$/,
map [ File::Spec->catfile($dir, $_), $_ ], sort $dh->read) {
$self->load_plugin(@$file);
}
}
sub load_plugin {
my($self, $file, $base) = @_;
Plagger->context->log(debug => "loading $file");
my $load_method = $file =~ /\.pl$/ ? 'load_plugin_perl' : 'load_plugin_yaml';
push @{ $self->{plugins} }, $self->$load_method($file, $base);
}
sub load_plugin_perl {
my($self, $file, $base) = @_;
open my $fh, '<', $file or Plagger->context->error("$file: $!");
(my $pkg = $base) =~ s/\.pl$//;
my $plugin_class = "Plagger::Plugin::Filter::FindEnclosures::Site::$pkg";
my $code = join '', <$fh>;
unless ($code =~ /^\s*package/s) {
$code = join "\n",
( "package $plugin_class;",
"use strict;",
"use base qw( Plagger::Plugin::Filter::FindEnclosures::Site );",
"sub site_name { '$pkg' }",
$code,
"1;" );
}
eval $code;
Plagger->context->error($@) if $@;
return $plugin_class->new;
}
sub load_plugin_yaml { Plagger->context->error("NOT IMPLEMENTED YET") }
sub filter {
my($self, $context, $args) = @_;
# check $entry->link first, if it links directly to media files
$self->add_enclosure($args->{entry}, [ 'a', { href => $args->{entry}->permalink } ], 'href' );
return unless $args->{entry}->body;
my $parser = HTML::TokeParser->new(\$args->{entry}->body->data);
while (my $tag = $parser->get_tag('a', 'embed', 'img', 'object')) {
if ($tag->[0] eq 'a' ) {
$self->add_enclosure($args->{entry}, $tag, 'href');
} elsif ($tag->[0] eq 'embed') {
$self->add_enclosure($args->{entry}, $tag, 'src', { type => $tag->[1]->{type} });
} elsif ($tag->[0] eq 'img') {
$self->add_enclosure($args->{entry}, $tag, 'src', { inline => 1 });
} elsif ($tag->[0] eq 'object') {
$self->add_enclosure_from_object($args->{entry}, $parser);
}
}
}
sub add_enclosure_from_object {
my($self, $entry, $parser) = @_;
# get param tags and find appropriate FLV movies
my @params;
while (my $tag = $parser->get_tag('param', '/object')) {
last if $tag->[0] eq '/object';
push @params, $tag;
}
# find URL inside flashvars parameter
my $url;
if (my $flashvars = first { lc($_->[1]->{name}) eq 'flashvars' } @params) {
my %values = split /[=&]/, $flashvars->[1]->{value} || '';
$url = first { m!^https?://.*\flv! } values %values;
$url ||= first { m!^https?://.*! } values %values;
}
# if URL isn't found in flash vars, then fallback to <param name="movie" />
if (!$url) {
my $movie = first { lc($_->[1]->{name}) eq 'movie' } @params;
$url = $movie->[1]->{value} if $movie;
}
if ($url) {
Plagger->context->log(info => "Found enclosure $url");
my $enclosure = Plagger::Enclosure->new;
$enclosure->url( URI->new($url) );
$enclosure->auto_set_type;
$entry->add_enclosure($enclosure); # XXX inline?
}
}
sub add_enclosure {
my($self, $entry, $tag, $attr, $opt) = @_;
$opt ||= {};
if ($self->is_enclosure($tag, $attr, $opt->{type})) {
Plagger->context->log(info => "Found enclosure $tag->[1]{$attr}");
my $enclosure = Plagger::Enclosure->new;
$enclosure->url($tag->[1]{$attr});
$enclosure->auto_set_type($opt->{type});
$enclosure->is_inline(1) if $opt->{inline};
$entry->add_enclosure($enclosure);
return;
}
my $url = $tag->[1]{$attr};
my $content;
for my $plugin (@{$self->{plugins}}) {
if ( $plugin->handle($url) ) {
Plagger->context->log(debug => "Try $url with " . $plugin->site_name);
$content ||= $self->fetch_content($url) or return;
if (my $enclosure = $plugin->find({ content => $content, url => $url })) {
Plagger->context->log(info => "Found enclosure " . $enclosure->url ." with " . $plugin->site_name);
$entry->add_enclosure($enclosure);
return;
}
}
}
}
sub fetch_content {
my($self, $url) = @_;
my $ua = Plagger::UserAgent->new;
my $res = $ua->fetch($url, $self, { NoNetwork => 3 * 60 * 60 });
return if !$res->status && $res->is_error;
return decode_content($res);
}
sub is_enclosure {
my($self, $tag, $attr, $type) = @_;
return 1 if $tag->[1]{rel} && $tag->[1]{rel} eq 'enclosure';
return 1 if $self->has_enclosure_mime_type($tag->[1]{$attr}, $type);
return;
}
sub has_enclosure_mime_type {
my($self, $url, $type) = @_;
my $mime = $type ? MIME::Type->new(type => $type) : Plagger::Util::mime_type_of( URI->new($url) );
Plagger::Util::mime_is_enclosure($mime);
}
package Plagger::Plugin::Filter::FindEnclosures::Site;
sub new { bless {}, shift }
sub handle { 0 }
sub find { }
1;
__END__
=head1 NAME
Plagger::Plugin::Filter::FindEnclosures - Auto-find enclosures from entry content using B<< <a> >> / B<< <embed> >> tags
=head1 SYNOPSIS
- module: Filter::FindEnclosures
=head1 DESCRIPTION
This plugin finds enclosures from C<< $entry->body >> by finding 1)
B<< <a> >> links with I<rel="enclosure"> attribute, 2) B<< <a> >>
links to any URL which filename extensions match with known
audio/video formats and 3) I<src> attributes in B<< <img> >> and B<< <embed> >> tags.
For example:
Listen to the <a href="http://example.com/foobar.mp3">Podcast</a> now, or <a rel="enclosure"
href="http://example.com/foobar.m4a">download AAC version</a>. <img src="/img/logo.gif" />
Those 3 links (I<foobar.mp3>, I<foobar.m4a> and I<logo.gif>) are
extracted as enclosures, while I<logo.gif> is marked as "inline", so
that they won't appear as enclosures in Publish::Feed.
You might want to also use Filter::HEADEnclosureMetadata plugin to
know the actual length (bytes-length) of enclosures by sending HEAD
requests.
=head1 AUTHOR
Tatsuhiko Miyagawa
Masahiro Nagano
=head1 SEE ALSO
L<Plagger>, L<Plagger::Plugin::Filter::HEADEnclosureMetadata>, L<http://www.msgilligan.com/rss-enclosure-bp.html>, L<http://forums.feedburner.com/viewtopic.php?t=20>
=cut