The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.
package HTML::FormatText;

# ABSTRACT: Format HTML as plaintext


use 5.006_001;
use strict;
use warnings;

# We now use Smart::Comments in place of the old DEBUG framework.
# this should be commented out in release versions....
##use Smart::Comments;

use base 'HTML::Formatter';

our $VERSION = '2.16'; # VERSION
our $AUTHORITY = 'cpan:NIGELM'; # AUTHORITY

# ------------------------------------------------------------------------
sub default_values {
    (   shift->SUPER::default_values(),
        lm => 3,     # left margin
        rm => 72,    # right margin (actually, maximum text width)
    );
}

# ------------------------------------------------------------------------
sub configure {
    my ( $self, $hash ) = @_;

    my $lm = $self->{lm};
    my $rm = $self->{rm};

    $lm = delete $hash->{lm}          if exists $hash->{lm};
    $lm = delete $hash->{leftmargin}  if exists $hash->{leftmargin};
    $rm = delete $hash->{rm}          if exists $hash->{rm};
    $rm = delete $hash->{rightmargin} if exists $hash->{rightmargin};

    my $width = $rm - $lm;
    if ( $width < 1 ) {
        warn "Bad margins, ignored" if $^W;
        return;
    }
    if ( $width < 20 ) {
        warn "Page probably too narrow" if $^W;
    }

    for ( keys %$hash ) {
        warn "Unknown configure option '$_'" if $^W;
    }

    $self->{lm} = $lm;
    $self->{rm} = $rm;
    $self;
}

# ------------------------------------------------------------------------
sub begin {
    my $self = shift;

    $self->SUPER::begin;
    $self->{curpos} = 0;    # current output position.
    $self->{maxpos} = 0;    # highest value of $pos (used by header underliner)
    $self->{hspace} = 0;    # horizontal space pending flag
}

# ------------------------------------------------------------------------
sub end {
    shift->collect("\n");
}

# ------------------------------------------------------------------------
sub header_start {
    my ( $self, $level ) = @_;

    $self->vspace( 1 + ( 6 - $level ) * 0.4 );
    $self->{maxpos} = 0;
    1;
}

# ------------------------------------------------------------------------
sub header_end {
    my ( $self, $level ) = @_;

    if ( $level <= 2 ) {
        my $line;
        $line = '=' if $level == 1;
        $line = '-' if $level == 2;
        $self->vspace(0);
        $self->out( $line x ( $self->{maxpos} - $self->{lm} ) );
    }
    $self->vspace(1);
    1;
}

# ------------------------------------------------------------------------
sub bullet {
    my $self = shift;

    $self->SUPER::bullet( $_[0] . ' ' );
}

# ------------------------------------------------------------------------
sub hr_start {
    my $self = shift;

    $self->vspace(1);
    $self->out( '-' x ( $self->{rm} - $self->{lm} ) );
    $self->vspace(1);
}

# ------------------------------------------------------------------------
sub pre_out {
    my $self = shift;

    # should really handle bold/italic etc.
    if ( defined $self->{vspace} ) {
        if ( $self->{out} ) {
            $self->nl() while $self->{vspace}-- >= 0;
            $self->{vspace} = undef;
        }
    }
    my $indent = ' ' x $self->{lm};
    my $pre    = shift;
    $pre =~ s/^/$indent/mg;
    $self->collect($pre);
    $self->{out}++;
}

# ------------------------------------------------------------------------
sub out {
    my $self = shift;
    my $text = shift;

    if ( $text =~ /^\s*$/ ) {
        $self->{hspace} = 1;
        return;
    }

    if ( defined $self->{vspace} ) {
        if ( $self->{out} ) {
            $self->nl while $self->{vspace}-- >= 0;
        }
        $self->goto_lm;
        $self->{vspace} = undef;
        $self->{hspace} = 0;
    }

    if ( $self->{hspace} ) {
        if ( $self->{curpos} + length($text) > $self->{rm} ) {

            # word will not fit on line; do a line break
            $self->nl;
            $self->goto_lm;
        }
        else {

            # word fits on line; use a space
            $self->collect(' ');
            ++$self->{curpos};
        }
        $self->{hspace} = 0;
    }

    $self->collect($text);
    my $pos = $self->{curpos} += length $text;
    $self->{maxpos} = $pos if $self->{maxpos} < $pos;
    $self->{'out'}++;
}

# ------------------------------------------------------------------------
sub goto_lm {
    my $self = shift;

    my $pos = $self->{curpos};
    my $lm  = $self->{lm};
    if ( $pos < $lm ) {
        $self->{curpos} = $lm;
        $self->collect( " " x ( $lm - $pos ) );
    }
}

# ------------------------------------------------------------------------
sub nl {
    my $self = shift;

    $self->{'out'}++;
    $self->{curpos} = 0;
    $self->collect("\n");
}

# ------------------------------------------------------------------------
sub adjust_lm {
    my $self = shift;

    $self->{lm} += $_[0];
    $self->goto_lm;
}

# ------------------------------------------------------------------------
sub adjust_rm {
    shift->{rm} += $_[0];
}


1;

__END__

=pod

=encoding UTF-8

=head1 NAME

HTML::FormatText - Format HTML as plaintext

=head1 VERSION

version 2.16

=for test_synopsis 1;
__END__

=for stopwords latin1 leftmargin lm plaintext rightmargin rm CPAN homepage

=head1 SYNOPSIS

    use HTML::TreeBuilder;
    $tree = HTML::TreeBuilder->new->parse_file("test.html");

    use HTML::FormatText;
    $formatter = HTML::FormatText->new(leftmargin => 0, rightmargin => 50);
    print $formatter->format($tree);

or, more simply:

    use HTML::FormatText;
    my $string = HTML::FormatText->format_file(
        'test.html',
        leftmargin => 0, rightmargin => 50
        );

=head1 DESCRIPTION

HTML::FormatText is a formatter that outputs plain text. All character
attributes (bold/italic/underline) are ignored. Formatting of HTML tables and
forms is not implemented.

HTML::FormatText is built on L<HTML::Formatter> and documentation for that
module applies to this - especially L<HTML::Formatter/new>,
L<HTML::Formatter/format_file> and L<HTML::Formatter/format_string>.

You might specify the following parameters when constructing the formatter:

=over 4

=item I<leftmargin> (alias I<lm>)

The column of the left margin. The default is 3.

=item I<rightmargin> (alias I<rm>)

The column of the right margin. The default is 72.

=back

=head1 SEE ALSO

L<HTML::Formatter>

=head1 AUTHORS

=over 4

=item *

Nigel Metheringham <nigelm@cpan.org>

=item *

Sean M Burke <sburke@cpan.org>

=item *

Gisle Aas <gisle@ActiveState.com>

=back

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2016 by Nigel Metheringham, 2002-2005 Sean M Burke, 1999-2002 Gisle Aas.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut