The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package App::Zapzi::Transformers::HTML;
# ABSTRACT: process HTML without doing readability transforms


use utf8;
use strict;
use warnings;

our $VERSION = '0.014'; # VERSION

use Carp;
use Encode;
use HTML::Element;
use HTML::Entities ();
use Moo;

with 'App::Zapzi::Roles::Transformer';


sub name
{
    return 'HTML';
}


sub handles
{
    # By default HTMLExtractMain will handle HTML, not this
    return 0;
}


sub transform
{
    my $self = shift;

    # Use the passed in text if explicity set, else get it from the
    # fetched article object. This is used by derived classes that
    # transform text into HTML then call this method.
    my ($input) = @_;
    $input //= $self->input->text;

    my $encoding = 'utf8';
    if ($self->input->content_type =~ m/charset=([\w-]+)/)
    {
        $encoding = $1;
    }
    my $raw_html = Encode::decode($encoding, $input);

    $self->_extract_title($raw_html);

    my $tree = $self->_extract_html($raw_html);
    return unless $tree;

    # Delete some elements we don't need
    for my $element ($tree->find_by_tag_name(
                         qw{img script noscript object iframe}))
    {
        $element->delete;
    }

    # Set up options to extract the HTML from the tree
    my $entities_to_encode = '<>&\'"';
    my $indent = ' ' x 4;
    my $optional_end_tags = {};

    my $text = $tree->as_HTML($entities_to_encode, $indent,
                              $optional_end_tags);
    $text =~ s|<[/]*body>||sg;
    $self->_set_readable_text($text);
    return 1;
}

sub _extract_title
{
    my $self = shift;
    my ($raw_html) = @_;
    my $title;

    # Try finding the <title> tag first
    my $tree = eval { HTML::TreeBuilder->new_from_content($raw_html) };
    if ($tree)
    {
        my $tag = $tree->find_by_tag_name('title');
        my $content;
        $content = ($tag->content_list)[0] if $tag;

        # Strip surrounding whitespace and decode HTML entities
        $content =~ s/^\s+|\s+$//g if $content;
        $title = HTML::Entities::decode($content) if $content;
    }

    # Use the URL/filename if no title could be found or parsed from
    # the HTML
    if (! $title)
    {
        $title = $self->input->source;
    }

    $self->_set_title($title);
}

sub _extract_html
{
    my $self = shift;
    my ($raw_html) = @_;

    my $tree = eval { HTML::TreeBuilder->new_from_content($raw_html)
                          ->find_by_tag_name('body') };

    return $tree;
}

1;

__END__

=pod

=encoding UTF-8

=head1 NAME

App::Zapzi::Transformers::HTML - process HTML without doing readability transforms

=head1 VERSION

version 0.014

=head1 DESCRIPTION

This class takes HTML and returns the body without doing additional
readable transforms - so tags such as script are removed but no text
should be changed. Use this if HTMLExtractMain does not provide the
desired results.

=head1 METHODS

=head2 name

Name of transformer visible to user.

=head2 handles($content_type)

Returns true if this module handles the given content-type

=head2 transform(input)

Converts L<input> to readable text. Returns true if converted OK.

=head1 AUTHOR

Rupert Lane <rupert@rupert-lane.org>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2014 by Rupert Lane.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut