#
# Ignorant Yahoo 360 blog scraper (blog.360.yahoo.com)
#
# $Id: Yahoo360.pm 168 2009-05-31 11:51:37Z cosimo $
package WWW::Scraper::Yahoo360;
use strict;
use warnings;
use Carp ();
use Date::Parse ();
use File::Slurp ();
use HTTP::Date ();
use JSON::XS ();
use WWW::Mechanize ();
use constant BLOG_URL => q{http://blog.360.yahoo.com/blog/};
use constant LOGIN_FORM => q{login_form};
use constant LOGIN_URL => q{https://login.yahoo.com/config/login_verify2?.intl=us&.done=http%3A%2F%2Fblog.360.yahoo.com%2Fblog%2F%3F.login%3D1&.src=360};
our $DEBUG = 0;
our $VERSION = '0.09';
sub new {
my ($class, $args) = @_;
$class = ref $class || $class || __PACKAGE__;
my $self = $args;
bless $self, $class;
}
# Fetches high-level blog information
sub blog_info {
my ($self, $blog_page) = @_;
if (! $blog_page) {
$self->debug('Fetching blog main page');
$blog_page = $self->blog_main_page();
if (! $blog_page) {
$self->debug('Failed to fetch blog main page');
return;
}
}
# Get sharing level
# <p class="footnote">Your blog can be seen by <strong>Public</strong>
#
# or:
# <p class="footnote">Your blog can be seen by <strong>Just me (private)</strong>
# <p class="footnote">Your blog can be seen by <strong>Friends</strong>
#
my $sharing = q{};
if ($blog_page =~ m{Your blog can be seen by <strong>([\w\(\)\s]+)</strong>}m) {
$sharing = lc $1;
if ($sharing =~ m{just me}) {
$sharing = 'private';
}
elsif ($sharing =~ m{friend}) {
$sharing = 'friends';
}
$self->debug('Blog sharing found to be "', $sharing, '"');
}
else {
$self->debug('Blog sharing string not found');
}
# Get title
my $title = q{};
if ($blog_page =~ m{<h3>([^<]+)<span class="view-toggle">Full Post View}m) {
$title = $1;
$self->debug('Blog title found to be "', $title, '"');
}
# Get number of posts
#
# <span><em>1 - 5</em> of <em class="limit">13</em> ...
my $start =
my $end =
my $count = 0;
if ($blog_page =~ m{<span><em>(\d+) \- (\d+)</em> of <em class="limit">(\d+)</em>}m) {
$start = $1;
$end = $2;
$count = $3;
$self->debug('Blog post counts found. Start:', $start, ' End:', $end, ' Count:', $count);
}
else {
$self->debug('Blog post counts not found');
}
my $link = q{};
if ($blog_page =~ m{<a href="([^"]+)" class="selected">My Blog</a>}) {
$link = $1;
$self->debug('Blog URL found: ', $link);
}
else {
$self->debug('Blog URL not found');
}
$title =~ s{^\s+}{};
$title =~ s{\s+$}{};
return {
sharing => $sharing,
title => $title,
start => $start,
end => $end,
count => $count,
link => $link,
lastBuildDate => HTTP::Date::time2str(),
language => 'en-us',
};
}
# Fetches the user's main blog page
sub blog_main_page {
my ($self) = @_;
my $mech = $self->mech();
$mech->get(BLOG_URL);
if ($mech->success()) {
$self->debug('Blog main page downloaded successfully');
return $mech->content();
}
$self->debug('Blog main page download failed');
Carp::croak("Failed to retrieve blog main page");
}
# Builds the url to fetch a specific blog page
sub blog_page_url {
my ($self, $link, $start, $per_page, $count) = @_;
my $url = $link;
my $last = $start + $per_page - 1;
if ($last > $count) { $last = $count }
$url .= '&l=' . $start;
$url .= '&u=' . $last;
$url .= '&mx=' . $count;
$url .= '&lmt=' . $per_page;
return $url;
}
sub debug {
return unless $DEBUG;
my ($self, @msg) = @_;
print STDERR @msg, "\n";
return;
}
# Logs in to Yahoo
sub login {
my ($self) = @_;
my $user = $self->{username};
my $pass = $self->{password};
my $mech = $self->mech();
$mech->get(LOGIN_URL);
$mech->submit_form(
form_name => LOGIN_FORM,
fields => {
login => $user,
passwd => $pass,
'.persistent' => 'y',
},
button => '.save',
);
# Not sure how to make this more robust
my $next_page = $mech->content();
if ($next_page =~ m{Invalid ID or password}) {
$self->debug('Login to Yahoo service failed for user "', $user, '"');
return;
}
my $ok = $mech->success();
if ($ok) {
$self->debug('Login to Yahoo service succeeded');
}
else {
$self->debug('Login to Yahoo service failed. Unknown reason?');
}
return $ok;
}
# Dumps last accessed page content to STDOUT
sub dump {
my ($self) = @_;
print $self->mech->content();
}
# Retrieves all comments in the user's blog
sub get_blog_comments {
my ($self, $posts) = @_;
if (! $posts) {
return;
}
my @comments;
for my $post (@{$posts}) {
# No comments, don't fetch them
if ($post->{comments} == 0) {
$self->debug('No comments for post ', $post->{title});
next;
}
#print qq{Found $post->{comments} comments for blog post "$post->{title}"\n};
if (my $post_comm = $self->get_blogpost_comments($post)) {
$self->debug('Got ', scalar(@{ $post_comm }), ' comments for post ', $post->{title});
push @comments, @{ $post_comm };
}
}
return \@comments;
}
# Retrieves all comments to a single blog post
sub get_blogpost_comments {
my ($self, $post, $page) = @_;
# If we didn't get a pre-saved html page, get it now
if (! $page) {
$self->mech->get($post->{link});
$page = $self->mech->success
? $self->mech->content()
: q{};
}
if (! $page) {
warn "ERROR fetching blogpost comments for $post->{title}\n";
return;
}
my @comments;
while ($page =~ m{<li class="user-name"><a href="([^"]+)" title="([^"]+)">}mg) {
my $comment = {
'user-profile' => $1,
username => $2,
link => $post->{link},
};
# Comments can span multiple lines
# but are always enclosed between <p class="comment"> and </p>
if ($page =~ m{<p class="comment">(.*?)</p>}sg) {
$comment->{comment} = $1;
$comment->{comment} =~ s{^\s+}{};
$comment->{comment} =~ s{\s+$}{};
}
if ($page =~ m{<p class="datestamp">([^<]+)\s*<}mg) {
$comment->{date} = $1;
$comment->{date} =~ s{^\s+}{};
$comment->{date} =~ s{\s+$}{};
$comment->{date} = $self->parse_date($comment->{date});
}
$self->debug(
'Found comment "', $comment->{comment},
'" by "', $comment->{username}, '"'
);
push @comments, $comment;
}
$self->debug('Found ', scalar(@comments), ' comments to blog post ', $post->{link});
return \@comments;
}
# Gets all blog posts by a user
sub get_blog_posts {
my ($self, $blog_page, %overrides) = @_;
$self->debug("Start parsing of blog posts");
if (! $blog_page) {
$self->debug("Downloading of main blog page");
$blog_page ||= $self->blog_main_page();
$self->debug("Download complete");
}
else {
$self->debug("Blog main page was already supplied. No need to download.");
}
my $blog_info = $self->blog_info($blog_page);
for (keys %overrides) {
$blog_info->{$_} = $overrides{$_};
}
my $link = $blog_info->{link};
my $start = $blog_info->{start};
my $count = $blog_info->{count};
my $end_page = $blog_info->{end};
my $end_blog = $start + $count - 1;
my $per_page = $end_page - $start + 1;
my @posts = ();
$self->debug("Parsing posts ($start .. $end_blog)");
# Prevent endless loops
if ($start > $end_page) {
$start = $end_page;
}
for (my $n = $start; $n <= $end_blog; ) {
$self->debug(
'Reading post n. ', $n,
' end_of_page:', $end_page,
' end_of_blog:', $end_blog,
);
# Fetch next page and continue
if ($n >= $end_page && $end_page < $end_blog) {
my $next_page_url = $self->blog_page_url(
$link, $end_page + 1, $per_page, $count
);
$end_page += $per_page;
$self->mech->get($next_page_url);
$self->debug('Next url is:', $next_page_url);
$blog_page = $self->mech->content();
if (! $blog_page) {
$self->debug('Failed to read url: ', $next_page_url);
last;
}
}
my $found_posts = 0;
while ($blog_page =~ m{<dt class="post-head">([^<]+)</dt>}gm) {
# Blog post title
my $title = $1;
my $post = {
title => $1,
description => ''
};
$self->debug('Found new blog post "', $title, '" (', $n, ')');
$found_posts = 1;
# Main picture of the blog post
if ($blog_page =~ m{<div class="image-wrapper">(.*?)</div>}gsmc) {
my $pic = $1;
$pic =~ s{^\s*}{}mx;
$pic =~ s{\s*$}{}mx;
if ($pic) {
$post->{description} = '<div align="center">' . $pic . '</div>';
$self->debug(' Image: ', substr($pic, 0, 30), '...');
}
}
# Blog post content
# Read until the end of line (there might be multiple <div>s)
if ($blog_page =~ m{<div class="content-wrapper">(.*)</div>}gmc) {
$post->{description} .= $1;
$self->debug(' Content: ', substr($1, 0, 30), '...');
}
# Tags
if ($blog_page =~ m{<form><input type="hidden" name="tagslist" value="([^"]*)"}gm) {
$post->{tags} = $1;
$self->debug(' Tags: ', $1);
}
# Date of post
if ($blog_page =~ m{<span>([^<]+)<a href="[^"]+">Edit</a>}gm) {
$post->{pubDate} = HTTP::Date::time2str($self->parse_date($1));
$self->debug(' Date: ', $1);
}
# Permanent link
if ($blog_page =~ m{<a href="([^"]+)">Permanent Link</a>}gm) {
$post->{link} = $1;
$self->debug(' Permalink: ', $1);
}
# No. of comments
if ($blog_page =~ m{<a href="[^"]+">(\d+) Comments?</a>}gm) {
$post->{comments} = $1;
$self->debug(' Comments: ', $1);
}
push @posts, $post;
$n++;
}
if (not $found_posts) {
last;
}
}
return \@posts;
}
# Mechanize object accessor
sub mech {
my ($self) = @_;
if (! exists $self->{_mech}) {
$self->{_mech} = WWW::Mechanize->new();
}
return $self->{_mech};
}
# Tries to parse a date in the Yahoo 360 format
sub parse_date {
my ($self, $date) = @_;
$date =~ s{^\s+}{};
$date =~ s{\s+$}{};
if ($date =~ m{^ (\w{3})\w+ \s (\w{3})\w* \s (\d+), \s (\d+) \s - \s (\d+):(\d+)([ap]m) \s \((.*)\) \s* $}x) {
my $dow = $1;
my $month = $2;
my $day = $3;
my $year = $4;
my $hours = $5;
my $mins = $6;
my $ampm = uc $7;
my $tz = uc $8;
# Indochina time zone is not recognized by Date::Parse
if ($tz eq 'ICT') {
$tz = 'UTC+07';
}
if ($ampm eq 'AM' && $hours == 12) {
$hours = 0;
}
elsif ($ampm eq 'PM' && $hours != 12) {
$hours += 12;
if ($hours > 23) {
$hours -= 24;
}
}
my $time = "$hours:$mins:00";
# Wed, 16 Jun 94 07:29:35 CST
$date = "$day $month $year $time $tz";
#arn "# Converted to [$date]\n";
}
my $epoch = Date::Parse::str2time($date);
#arn "# str2time($date) returns ($epoch)\n";
return $epoch;
}
1;
__END__
=head1 NAME
WWW::Scraper::Yahoo360 - Yahoo 360 blogs old-fashioned crappy scraper
=head1 SYNOPSIS
use WWW::Scraper::Yahoo360;
my $y360 = WWW::Scraper::Yahoo360->new({
username => 'myusername',
password => 'mypassword',
});
# Debug what's happening?
$WWW::Scraper::Yahoo360::DEBUG = 1;
# First you have to login
$y360->login() or die "Login failed?";
# High level blog information
my $blog_info = $y360->blog_info();
# Gets all the blog posts
my $posts = $y360->get_blog_posts();
# Gets all the blog post comments
my $comments = $y360->get_blog_comments();
=head1 DESCRIPTION
Ignorant web scraper, based on WWW::Mechanize, that connects to your
Yahoo 360 account and tries to fetch the blog posts and comments
you still have on their service.
If it breaks, well... it's a scraper.
This module is used on the My Opera Community, L<http://my.opera.com>,
to import Yahoo 360 existing blogs into My Opera blog service.
=head1 SUBROUTINES
=head2 C<new(\%args)>
Where C<\%args> is a hashref with C<username> and C<password> of your
B<Yahoo 360> account.
This creates a new C<WWW::Scraper::Yahoo360> object, ready to scrape.
=head2 C<blog_info([$blog_page])>
Fetches high-level blog information for your Yahoo 360 blog.
If a C<$blog_page> argument is supplied, the blog information is
looked up inside the contents of that scalar. Otherwise it's fetched
from the network. C<$blog_page> must contain a full HTML page string.
Returns a hashref with the some/all the following information:
=over 4
=item C<link>
Something like: C<<< http://blog.360.yahoo.com/blog-<yourusername> >>>
=item C<sharing>
Most probably C<public>. Could also be C<friends> or C<friends of friends>,
but never tried it.
=item C<count>
Number of blog posts in total.
=item C<start>
First blog post on the frontpage. Should be 1.
=item C<end>
Last blog post on the frontpage, usually 5.
=item C<title>
Title of the blog.
=back
=head2 C<blog_main_page()>
Fetches the user's main blog page.
Returns a string with the HTML page contents.
This can be used in C<blog_info()> or C<get_blog_posts()>.
=head2 C<blog_page_url($link, $start, $per_page, $count)>
Builds the url to fetch a specific blog page.
=head2 C<dump()>
Dumps last accessed page content to STDOUT
=head2 C<login()>
Logs in to Yahoo service.
Returns a scalar that tells you if the login was successful or not.
=head2 C<get_blog_comments(\@posts)>
Retrieves all comments in the user's blog.
Wants the structure returned by C<get_blog_posts()>.
=head2 C<get_blogpost_comments($post)>
Retrieves all comments to a single blog post.
Wants a single C<$post> entry (hashref): one of the elements
returned by C<get_blog_posts()>.
=head2 C<get_blog_posts([$blog_page, [%overrides]])>
Gets all blog posts by a user. If C<$blog_page> is supplied, it looks
for blog posts in that page only.
C<%overrides> can be a set passed to override some of the properties
about the blog to be scraped and parsed. To see the list of properties,
look at C<blog_info()>.
Returns an array of hashrefs, each one representing a blog post.
Each post (hashref) should have the following keys:
Example:
$y360 = WWW::Scraper::Yahoo360->new({
username => '...'
password => '...',
});
$y360->login() or die "Failed login";
# Fetch only the first blog post, no matter what
my $first_page = $y360->blog_main_page();
my $blog_posts = $y360->get_blog_posts($first_page, count=>1);
=over 4
=item C<comments>
Number of comments to this blog post
=item C<description>
Blog post content
=item C<link>
Permanent URL of the blog post
=item C<pubDate>
Date when the blog post was published, in C<HTTP::Date> format,
ex.: C<Sun, Nov 14 06:20:28 CET>.
=item C<tags>
Comma delimited string of tags (ex.: C<travel, holiday>)
=item C<title>
Title of the blog post
=back
=head2 C<mech()>
C<WWW::Mechanize> object accessor.
=head2 C<parse_date($date_string)>
Tries to parse a date from the Yahoo 360 format to a unix timestamp.
=head1 EXPORTS
None by default.
=head1 AUTHOR
Cosimo Streppone, E<lt>cosimo@cpan.orgE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2009 by Cosimo Streppone, L<cosimo@cpan.org>
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.8 or,
at your option, any later version of Perl 5 you may have available.
=cut