The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!perl

use File::Slurp qw( slurp );
use HTML::FormatText 2.04;
use HTML::TreeBuilder;
use Test::More;
use HTML::ExtractMain 'extract_main_html';

chdir 't';    # may not be needed
chdir 'test_case_data'
    || BAIL_OUT q{Can't find test data directory "test_case_data"};

my @expected = ({ name         => 'indymedia_feature.html',
                  expect_start => qr/2010 est une ann|pour bannir les armes/,
                  expect_end   => qr/fermera la base/,
                },
                { name         => 'google_blogger.html',
                  expect_start => qr/the Google Maps API Blog/,
                  expect_end   => qr/Product Manager/,
                },
                { name         => 'google_short_blog.html',
                  expect_start => qr/The principle behind the advertising/,
                  expect_end   => qr/check them out/,
                  todo => 'Does not work with v1 of Readability algorithm',
                },
                { name         => 'lessig_blog.html',
                  expect_start => qr/So my blog turns seven today/,
                  expect_end   => qr/Thank you to everyone, again./,
                },
);

plan tests => 5 * scalar @expected;

foreach my $test_set (@expected) {

    my $name = $test_set->{name};
    my $data = slurp($name, binmode => ':utf8') || die $name;
    my $main = extract_main_html($data);

    my $length = length $main;
    ok( $length, 'got main content' );

    my $wiggle_room = int( $length * 0.02 );

    # ensure that test data is reasonable
    unlike( html_to_text($data),
            qr/^.{0,$wiggle_room}$test_set->{expect_start}/s,
            "$name: in original, start of main text is not at start of HTML"
    );
    unlike( html_to_text($data),
            qr/$test_set->{expect_end}.{0,$wiggle_room}$/s,
            "$name: in original, end of main text is not at end of HTML"
    );

    if ( $test_set->{todo} ) {
    TODO: {
            local $TODO = $test_set->{todo};
            like( html_to_text($main),
                  qr/^.{0,$wiggle_room}$test_set->{expect_start}/s,
                  "$name: is start of main text near the beginning?"
            );
            like( html_to_text($main),
                  qr/$test_set->{expect_end}.{0,$wiggle_room}$/s,
                  "$name: is end of main text near the end?" );
        }
    } else {
        like( html_to_text($main),
              qr/^.{0,$wiggle_room}$test_set->{expect_start}/s,
              "$name: is start of main text near the beginning?" );
        like( html_to_text($main),
              qr/$test_set->{expect_end}.{0,$wiggle_room}$/s,
              "$name: is end of main text near the end?" );
    }
}

my $html_formatter;

sub html_to_text {
    $html_formatter ||= HTML::FormatText->new( lm => 0, rm => 1_000_000 );

    my $html_content = shift;
    my $parsed_tree
        = eval { HTML::TreeBuilder->new_from_content($html_content) };
    my $plain_text = $html_formatter->format($parsed_tree);
    return $plain_text;
}

# Local Variables:
# mode: perltidy
# End: