The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# $Id: WWW-Scraper-Yahoo360.t 168 2009-05-31 11:51:37Z cosimo $

use Test::More tests => 44;

BEGIN {
    use_ok('WWW::Scraper::Yahoo360')
}

# Enable debug mode
# $WWW::Scraper::Yahoo360::DEBUG = 1;

my $y360 = WWW::Scraper::Yahoo360->new({
    username => 'fake',
    password => 'even-more-fake',
});


# ---------------------------------------------------
# Parsing of blog posts and comments
# ---------------------------------------------------

diag("Parsing a standard blog page");

my $blog_page = File::Slurp::read_file(q{./t/blog.html});
my $blog_info = $y360->blog_info($blog_page);
#iag( JSON::XS->new->pretty->encode($blog_info) );

is (
    $blog_info->{link},
    'http://blog.360.yahoo.com/blog-jfCUH8k5fqpqLD7PHOY4YMCi5eU-?cq=1',
    'Blog permanent link is correctly extracted'
);

is (
    $blog_info->{sharing},
    'public',
    'Blog sharing level is correctly extracted'
);

is (
    $blog_info->{count}, 13,
    'Blog posts count is correctly extracted'
);

ok(
    $blog_info->{start} == 1 && $blog_info->{end} == 5,
    'Blog posts start/end is correctly extracted'
);

is(
    $blog_info->{title},
    'Dieu Anh's Blog',
    'Title of the blog is extracted correctly'
);

#
# get_blog_posts() tests
#
my $posts = $y360->get_blog_posts($blog_page, start=>1, end=>5, count=>5);
is(scalar @{$posts}, 5, 'Parsed 5 blog posts in the blog main page');

my $first = $posts->[0];

ok(
	ref $first eq 'HASH',
	'First blog post is a hashref'
);

ok(
	$first->{title},
	'Title is parsed correctly (' . $first->{title} . ')'
);

is(
	$first->{comments}, 0,
	'Number of comments is correct'
);

like(
	$first->{tags},
	qr{^myopera},
	'Tags parsed correctly (' . $first->{tags} . ')'
);

like(
	$first->{description},
	qr{<img src="http://files\.myopera\.com/myfrenchopera/files/sitelanguage\.jpg"/></div>$},
	'Blog post is not truncated'
);

is(
	$first->{pubDate},
	'Tue, 16 Dec 2008 13:11:00 GMT',
	'Blog post date is parsed correctly'
);

like(
	$first->{link},
	qr{^http://blog\.360\.yahoo\.com/},
	'Blog post link contains blog.360.yahoo.com',
);

#
# get_blogpost_comments() tests
#
my $blogpost_page = File::Slurp::read_file(q{./t/blogpost_with_1_comment.html});
my $comments = $y360->get_blogpost_comments(
    {link=>'http://360.yahoo.com/blah'}, # Pretend we have a link
    $blogpost_page
);

#iag( JSON::XS->new->pretty->encode($comments) );

is (ref $comments, 'ARRAY', 'comments extracted in an array ref');
is (@$comments, 1, 'found one comment');

my $comment = $comments->[0];

like (
    $comment->{link}, qr{http://.*360\.yahoo\.com/.*},
    'Found link to the original blog post (' . $comment->{link} . ')',
);

like (
    $comment->{'user-profile'}, qr{http://.*360\.yahoo\.com/.*},
    'Found link of the profile of the user that posted the comment',
);


like (
    $comment->{comment}, qr{^welcome u visit},
    'Found the comment body'
);

is (
    $comment->{username}, q{palbongro},
    'Found correct username'
);

# ---------------------------------------------------
# Parsing of a blog post with many comments
# ---------------------------------------------------

diag("Parsing a blog page with many comments");

$blogpost_page = File::Slurp::read_file(q{./t/blogpost_with_many_comments.html});
$comments = $y360->get_blogpost_comments({}, $blogpost_page);

#iag( JSON::XS->new->pretty->encode($comments) );

is (ref $comments, 'ARRAY', 'comments extracted in an array ref');
is (@$comments, 5, 'found correct number of comment');

is (
    $comments->[0]->{username}, 'Not gonna get us',
    'Username of first comment is correct. Extraction order is correct.'
);

# ---------------------------------------------------
# Parsing of dates
# ---------------------------------------------------

diag("Parsing of dates");

# Mon, 25 Aug 2008 12:28:00 GMT
my @dates = (
    [ q{Monday August 25, 2008 - 05:28am (PDT)}, 1219667280 ],
    [ q{Tuesday November 11, 2008 - 10:26pm (ICT)}, 1226417160 ],
    [ q{Wednesday February 4, 2009 - 12:00pm (ICT)}, 1233723600 ],
    [ q{Sunday May 24, 2009 - 12:27am (ICT)}, 1243099620 ],
);

for (@dates) {
    my ($date, $expected_result) = @$_;
    is (
        $y360->parse_date($date),
        $expected_result,
        'Date {' . $date . '} is parsed correctly'
    );
}

# -----------------------------------------------------
# A different page - parsing of blog posts and comments
# -----------------------------------------------------

diag("Parsing of alternative blog page");

$blog_page = File::Slurp::read_file(q{./t/blog2.html});
$blog_info = $y360->blog_info($blog_page);
#iag( JSON::XS->new->pretty->encode($blog_info) );

is(
    $blog_info->{title}, 'Test Blog',
    'Title of blog extracted correctly'
);

is(
    $blog_info->{sharing}, 'private',
    'Blog sharing set to private should be parsed correctly',
);


$posts = $y360->get_blog_posts($blog_page, start=>1, end=>4, count=>4);
is(scalar @{$posts}, 4, 'Parsed 4 blog posts in the alternative test page');
my $post = $posts->[0];

is(
    $post->{title}, 'Entry for March 17, 2007',
    'Title of post extracted correctly'
);

is(
    $post->{link}, 'http://blog.360.yahoo.com/blog-cqkAz2HmPNV3F9wncqkA-?cq=1&p=5',
    'Link to blog post extracted correctly'
);

# Check parsing of pictures
unlike(
    $post->{description},
    qr{<img \s src=}mx,
    'Picture is not added when not present',
);

# Blog post content should be just blog post, no empty newlines or <div>s for picture
is(
    $post->{description},
    '<p>Chuyen sang ngoi nha moi</p> <p>http://my.opera.com/testuser2</p>',
	'Blog post contents with no picture are extracted correctly',
);

$post = $posts->[3];
like(
    $post->{description},
    qr{<img \s src=}mx,
    'Picture is parsed correctly'
);

#iag( JSON::XS->new->pretty->encode($posts) );


# -----------------------------------------------------
# Page that used to hang, only 1 blog post
# -----------------------------------------------------

diag("Parsing of page with just 1 blog post");

$blog_page = File::Slurp::read_file(q{./t/blog3.html});
$blog_info = $y360->blog_info($blog_page);
#iag( JSON::XS->new->pretty->encode($blog_info) );

is(
    $blog_info->{title}, 'Hang test',
    'Title of blog extracted correctly when page has only 1 post'
);

is(
    $blog_info->{sharing}, 'public',
    'Blog sharing parsed correctly when page has only 1 post'
);

# Catch infinite loop parsing regressions
eval {
    local $SIG{ALRM} = sub { die "timeout\n" };
    alarm 5;
    $posts = $y360->get_blog_posts($blog_page, start=>1, end=>1, count=>1);
    alarm 0;
};
if ($@) {
    ok(0, "Regression: get_blog_posts() should not hang when there's only 1 blog post");
}
else {
    is(scalar @{$posts}, 1, 'Parsed 1 blog post page correctly');
}

$post = $posts->[0];

is(
    $post->{title}, 'Blog chuyển sang Opera!',
    'Title of post extracted correctly'
);

is(
    $post->{link}, 'http://blog.360.yahoo.com/blog-w7QmVu4cfGV4rfrQdjX5O6--?cq=1&p=1',
    'Link to blog post extracted correctly'
);


# -----------------------------------------------------
# Another page that used to hang
# -----------------------------------------------------

diag("Parsing another crash-me page with no blog entries");

$blog_page = File::Slurp::read_file(q{./t/blog4.html});
$blog_info = $y360->blog_info($blog_page);
#iag( JSON::XS->new->pretty->encode($blog_info) );

like($blog_page, qr(There are no blog entries), 'No blog entries for this page');

is(
    $blog_info->{title}, 'Test4',
    'Title of blog extracted correctly when page has no posts'
);

is(
    $blog_info->{sharing}, 'public',
    'Blog sharing parsed correctly when page has no posts'
);

# Catch infinite loop parsing regressions
eval {
    local $SIG{ALRM} = sub { die "timeout\n" };
    alarm 5;
    $posts = $y360->get_blog_posts($blog_page, start=>1, end=>1, count=>1);
    alarm 0;
};
if ($@) {
    ok(0, "Regression: get_blog_posts() should not hang when there's no blog posts");
}
else {
    is(scalar @{$posts}, 0, 'Found no blog posts');
}