The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.
package HTML::Summary;

#==============================================================================
#
# Start of POD
#
#==============================================================================

=head1 NAME

HTML::Summary - module for generating a summary from a web page.

=head1 SYNOPSIS

    use HTML::Summary;
    use HTML::TreeBuilder;

    my $tree = new HTML::TreeBuilder;
    $tree->parse( $document );

    my $summarizer = new HTML::Summary(
        LENGTH      => 200,
        USE_META    => 1,
    );

    $summary = $summarizer->generate( $tree );
    $summarizer->option( 'USE_META' => 1 );
    $length = $summarizer->option( 'LENGTH' );
    if ( $summarizer->meta_used( ) )
    {
        do something
    }

=head1 DESCRIPTION

The C<HTML::Summary> module produces summaries from the textual content of
web pages. It does so using the location heuristic, which determines the value
of a given sentence based on its position and status within the document; for
example, headings, section titles and opening paragraph sentences may be
favoured over other textual content. A LENGTH option can be used to restrict
the length of the summary produced.

=head1 CONSTRUCTOR

=head2 new( $attr1 => $value1 [, $attr2 => $value2 ] )

Possible attributes are:

=over 4

=item VERBOSE

Generate verbose messages to STDERR.

=item LENGTH

Maximum length of summary (in bytes). Default is 500.

=item USE_META

Flag to tell summarizer whether to use the content of the C<<META>> tag
in the page header, if one is present, instead of generating a summary from the
body text. B<Note that> if the USE_META flag is set, this overrides the LENGTH
flag - in other words, the summary provided by the C<<META>> tag is
returned in full, even if it is greater than LENGTH bytes. Default is 0 (no).

=back

    my $summarizer = new HTML::Summary LENGTH => 200;

=head1 METHODS

=head2 option( )

Get / set HTML::Summary configuration options.

    my $length = $summarizer->option( 'LENGTH' );
    $summarizer->option( 'USE_META' => 1 );

=head2 generate( $tree )

Takes an HTML::Element object, and generates a summary from it.

    my $tree = new HTML::TreeBuilder;
    $tree->parse( $document );
    my $summary = $summarizer->generate( $tree );

=head2 meta_used( )

Returns 1 if the META tag description was used to generate the summary.

    if ( $summarizer->meta_used() )
    {
        # do something ...
    }

=head1 SEE ALSO

    HTML::TreeBuilder
    Text::Sentence
    Lingua::JA::Jcode
    Lingua::JA::Jtruncate

=head1 AUTHORS

    Ave Wrigley <wrigley@cre.canon.co.uk>
    Tony Rose <tgr@cre.canon.co.uk>
    Neil Bowers <neilb@cre.canon.co.uk>

=head1 COPYRIGHT

Copyright (c) 1997 Canon Research Centre Europe (CRE). All rights reserved.
This script and any associated documentation or files cannot be distributed
outside of CRE without express prior permission from CRE.

=cut

#==============================================================================
#
# End of POD
#
#==============================================================================

#==============================================================================
#
# Pragmas
#
#==============================================================================

require 5.004;
use strict;

#==============================================================================
#
# Modules
#
#==============================================================================

use Text::Sentence qw( split_sentences );
use Lingua::JA::Jtruncate qw( jtruncate );

#==============================================================================
#
# Constants
#
#==============================================================================

use constant IGNORE_TEXT => 1;

#==============================================================================
#
# Public globals
#
#==============================================================================

use vars qw( $VERSION );

$VERSION = '0.017';

#==============================================================================
#
# Private globals
#
#==============================================================================

my $DEFAULT_SCORE = 0;

my %ELEMENT_SCORES = (
    'p'         => 100,
    'h1'        => 90,
    'h2'        => 80,
    'h3'        => 70,
);

my %DEFAULTS = (
    'USE_META'  => 0,
    'VERBOSE'   => 0,
    'LENGTH'    => 500,
);

#==============================================================================
#
# Public methods
#
#==============================================================================

#------------------------------------------------------------------------------
#
# new - constructor. Configuration through "hash" type arguments, i.e.
# my $abs = new HTML::Summary( VAR1 => 'foo', VAR2 => 'bar' );
#
#------------------------------------------------------------------------------

sub new
{
    my $class = shift;
    my $self = bless { }, $class;
    return $self->_initialize( @_ );
}

#------------------------------------------------------------------------------
#
# generate - main public interface method to generate a summary
#
#------------------------------------------------------------------------------

sub generate
{
    my $self = shift;
    my $tree = shift;

    my $summary;

    $self->_verbose( 'Generate summary ...' );

    # check to see if there is a summary already defined in a META tag ...

    if ( 
        $self->{ USE_META } and 
        $summary = $self->_get_summary_from_meta( $tree ) 
    )
    {
        $self->_verbose( "use summary from META tag ..." );
        $self->_verbose( $summary );
        return $summary;
    }

    # traverse the HTML tree, building up @summary array

    my @summary = $self->_get_summary( $tree );

    # sort @summary by score, truncate if it is greater than LENGTH
    # characters, and the re-sort by original order. Truncate AFTER the LENGTH
    # has been exceeded, so that last sentence is truncated later by
    # jtruncate

    @summary = sort { $b->{ score } <=> $a->{ score } } @summary;

    my $tot_length = 0;
    my @truncated = ();

    for ( @summary )
    {
        push( @truncated, $_ );
        last if ( $tot_length += $_->{ 'length' } ) > $self->{ LENGTH };
    }
    @truncated = sort { $a->{ order } <=> $b->{ order } } @truncated;

    # these whitespaces will push the length over LENGTH, but jtruncate
    # should take care of this

    $summary = join( ' ', map { $_->{ text } } @truncated );
    $self->_verbose( "truncate the summary to ", $self->{ LENGTH } );
    $summary = jtruncate( $summary, $self->{ LENGTH } );
    return $summary;
}

#------------------------------------------------------------------------------
#
# meta_used - tells whether the description from the META tag was used; returns
# 1 if it was, 0 if the summary was generated automatically
#
#------------------------------------------------------------------------------

sub meta_used
{
    my $self = shift;

    return $self->{ META_USED };
}

#------------------------------------------------------------------------------
#
# option - get / set configuration option
#
#------------------------------------------------------------------------------

sub option
{
    my $self    = shift;
    my $option  = shift;
    my $val     = shift;

    die "No HTML::Summary option name given" unless defined $option;
    die "$option is not an HTML::Summary option" unless 
        grep { $_ eq $option } keys %DEFAULTS
    ;

    if ( defined $val )
    {
        $self->{ $option } = $val;
    }

    return $self->{ $option } = $val;
}

#==============================================================================
#
# Private methods
#
#==============================================================================

#------------------------------------------------------------------------------
#
# _initialize - supports sub-classing
#
#------------------------------------------------------------------------------

sub _initialize
{
    my $self = shift;

    return undef unless @_ % 2 == 0;    # check that config hash has even no.
                                        # of elements

    %{ $self } = ( %DEFAULTS, @_ );     # set options from defaults / config.
                                        # hash passed as arguments

    return $self;
}

#------------------------------------------------------------------------------
#
# _verbose - generate verbose error messages, if the VERBOSE option has been
# selected
#
#------------------------------------------------------------------------------

sub _verbose
{
    my $self = shift;

    return unless $self->{ VERBOSE };
    print STDERR @_, "\n";
}

#------------------------------------------------------------------------------
#
# _get_summary - get sentences from an element to generate the summary from.
# Uses lexically scoped array @sentences to build up result from the traversal
# callback
#
#------------------------------------------------------------------------------

sub _get_summary
{
    my $self = shift;
    my $tree = shift;

    my @summary = ();
    my $add_sentence = sub {
        my $text        = shift;
        my $tag         = shift;
        my $score       = shift || $DEFAULT_SCORE;

        return unless $text =~ /\w/;

        $text =~ s!^\s*!!; # remove leading ...
        $text =~ s!\s*$!!; # ... and trailing whitespace

        $summary[ scalar( @summary ) ] = {
            'text'          => $text,
            'length'        => length( $text ),
            'tag'           => $tag,
            'score'         => $score,
            'order'         => scalar( @summary ),
        };
    };
    $tree->traverse(
        sub {
            my $node = shift;
            my $flag = shift;

            if ( $flag ) # entering node ...
            {
                my $tag = $node->tag;
                return 0 if $tag eq 'head';

                # add sentences which either are scoring, or span no other
                # scoring sentences (and have a score of 0).  In this way, all
                # text is captured, even if it scores 0; the only exception is
                # something like <BODY>some text <P>foobar</P></BODY>, where
                # everything but "foobar" will be lost. However, if you have
                # <BODY><TD>some text</TD><P>foobar</P></BODY> you should get
                # all the text.

                if ( 
                    $ELEMENT_SCORES{ $tag } || 
                    ! _has_scoring_element( $node ) 
                )
                {
                    my $text = _get_text( $node );
                    foreach ( $text ) # alias $_ to $text
                    {
                        # get rid of whitespace (including &nbsp;) from start /
                        # end of $text
                        s/^[\s\160]*//;
                        s/[\s\160]*$//;
                        # get rid of any spurious tags that have slipped
                        # through the HTML::TreeBuilder
                        s!<[^>]+>!!g;
                    }

                    if ( $text =~ /\S/ )
                    {
                        my $score = $ELEMENT_SCORES{ $tag } || $DEFAULT_SCORE;

                        # add all the sentences in the text. Only the first
                        # sentence gets the element score - the rest get the
                        # default score

                        $self->_verbose( "TEXT: $text" );
                        for my $sentence ( 
                            split_sentences( $text, $self->{ 'LOCALE' } )
                        )
                        {
                            $self->_verbose( "SENTENCE: $text" );
                            $add_sentence->( $sentence, $tag, $score );
                            $score = $DEFAULT_SCORE;
                        }
                    }

                    # return 0 to avoid getting the same sentence in a scoring
                    # "daughter" element

                    return 0;
                }
            }

            # continue traversal ...

            return 1;
        },
        IGNORE_TEXT
    );
    return @summary;
}

#------------------------------------------------------------------------------
#
# _get_summary_from_meta - check to see if there is already a summary
# defined in the META tag in the HEAD
#
#------------------------------------------------------------------------------

sub _get_summary_from_meta
{
    my $self = shift;
    my $tree = shift;

    my $summary;

    $tree->traverse(
        sub {
            my $node = shift;
            my $flag = shift;

            if ( $node->tag eq 'meta' )
            {
                if ( lc( $node->attr( 'name' ) ) eq 'description' )
                {
                    $summary = $node->attr( 'content' );
                    $summary = undef if $summary eq 'content';
                }
                return 0;
            }
            return 1;
        },
        IGNORE_TEXT
    );

    $self->{ META_USED } = defined( $summary ) ? 1 : 0;
    return $summary;
}

#==============================================================================
#
# Private functions
#
#==============================================================================

#------------------------------------------------------------------------------
#
# _get_text - get all the text spanned by an element. Uses lexically scoped
# variable $html to build up result from the traversal callback
#
#------------------------------------------------------------------------------
    
sub _get_text
{
    my $node = shift;
    
    my $html = '';
    $node->traverse(
        sub {
            my $node = shift;
            $html .= $node unless ref( $node );
            return 1;
        }
    );
    return $html;
}

#------------------------------------------------------------------------------
#
# _has_scoring_element - check to see if this element spans any scoring
# element.  Uses lexically scoped variable $has_scoring_element to build up
# result from the traversal callback.
#
#------------------------------------------------------------------------------

sub _has_scoring_element
{
    my $node = shift;
    
    my $has_scoring_element = 0;
    $node->traverse(
        sub {
            my $node = shift;
            my $tag = $node->tag;
            $has_scoring_element ||= $ELEMENT_SCORES{ $tag };
            return 1;
        },
        IGNORE_TEXT
    );
    return $has_scoring_element;
}

#==============================================================================
#
# Return TRUE
#
#==============================================================================

1;