The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package HTML::ListScraper::Interactive;

use warnings;
use strict;

use HTML::Entities;

require Exporter;

use vars qw(@ISA @EXPORT_OK);

@ISA = qw(Exporter);
@EXPORT_OK = qw(format_tags canonicalize_tags);

use Class::Generate qw(class);

class 'HTML::ListScraper::FormTag' => {
    name => { type => '$', required => 1 },
    index => { type => '$', required => 1, readonly => 1 },
    link => { type => '$', required => 1, readonly => 1 },
    text => { type => '$', required => 1, readonly => 1 },
    '&close_name' => q{ $name .= '/'; }
};

sub is_opening {
    my $tag = shift;

    return $tag !~ m~\/~;
}

sub is_closing {
    my $tag = shift;

    return $tag =~ m~^\/~;
}

sub format_tags {
    my ($scraper, $tags, $incl) = @_;

    my $incl_attr;
    my $incl_text;
    my $incl_index;
    if (ref($incl)) {
        $incl_attr = $incl->{attr};
        $incl_text = $incl->{text};
        $incl_index = $incl->{index};
    }

    my @buffer;
    my @stack;
    foreach my $td (@$tags) {
        my $name = $td->name;
	my $tag = $name;
	$tag =~ s~^\/~~;

	my $text = $td->text || '';
	$text =~ s/[\s[:cntrl:]]+/ /g;

	my $link = $td->link || '';
	$link =~ s/[\s[:cntrl:]]+//g;

	if ($name eq $tag) {
	    push @stack, [ $tag, scalar(@buffer) ];
	    push @buffer, HTML::ListScraper::FormTag->new(name => $name,
                index => $td->index, link => $link, text => $text);
	} else {
	    while (scalar(@stack) &&
		    ($stack[scalar(@stack) - 1]->[0] ne $tag)) {
		if ($scraper->is_unclosed_tag(
                        $stack[scalar(@stack) - 1]->[0])) {
		    my $pair = pop @stack;

		    $buffer[$pair->[1]]->close_name();
		} else {
		    last;
		}
	    }

	    if (scalar(@stack)) {
	        pop @stack;
	    }

	    push @buffer, HTML::ListScraper::FormTag->new(name => $name,
                index => $td->index, link => $link, text => $text);
	}
    }

    my @out;
    my $prev;
    my $prev_index;
    my $depth = 0;
    foreach my $ft (@buffer) {
        my $name = $ft->name;
        if (defined($prev)) {
	    my $op = is_opening($prev);
	    my $cl = is_closing($name);
  	    if ($op && !$cl) {
	        ++$depth;
	    } elsif (!$op && $cl) {
	        if ($depth > 0) {
		    --$depth;
		}
	    }
        }

	my $indent = ' ' x (2 * $depth);

	my $attr = '';
	if ($incl_attr && $ft->link) {
	    $attr = ' href="' . encode_entities($ft->link, '"') . '"';
	}

	my $lncol = '';
	if ($incl_index) {
	    $lncol = $ft->index . "\t";
	}

	if (defined($prev_index) && (($ft->index - $prev_index) != 1)) {
	    push @out, "\n";
	}

	push @out, "$lncol$indent<$name$attr>\n";
	
	if ($incl_text && ($ft->text !~ /^[\s\r\n]*$/)) {
	    $lncol = $incl_index ? "\t" : "";
	    push @out, $lncol . $indent . encode_entities($ft->text, "<>&") . "\n";
	}

	$prev = $name;
	$prev_index = $ft->index;
    }

    return wantarray ? @out : \@out;
}

sub canonicalize_tags {
    my @out;
    foreach (@_) {
        my $ln = lc $_;
	$ln =~ s/^\s*<//;
	$ln =~ s/\/?>[\s\r\n]*$//;

	if ($ln) {
	    push @out, $ln;
	}
    }

    return wantarray ? @out : \@out;
}

1;

__END__

=head1 NAME

HTML::ListScraper::Interactive - formatting data from L<HTML::ListScraper>

=head1 FUNCTIONS

=head2 format_tags

Formats a tag sequence to emphasize its tree-like structure. Takes 2
or 3 parameters: a L<HTML::ListScraper> object, array reference
containing L<HTML::ListScraper::Tag> objects and an optional hash with
formatting options. C<format_tags> returns an array (array reference
if called in a scalar context) with formatted tag names and text.

The formatting options are

=over

=item attr

Include the C<href> attribute in the output.

=item text

Include the plain text in the output.

=item index

Include tag positions in the output.

=back

The returned values are basically XHTML lines: opening tags, text with
quoted entities and closing tags. Tags are enclosed in angle
brackets. The returned values don't necessarily form a valid XML
fragment, though, i.e. because the input tags need not form a
tree.

When C<index> is set, tag values start with the tag's index, followed
by a tab. Next, spaces show indentation. An opening tag not identified
as missing a closing tag increases indentation by 2 spaces, a closing
tag decreases it back. An opening tag with missing closing tag is
output with '/' appended to its name. For the rules of associating
opening and closing tags, see C<HTML::ListScraper::shapeless>.

When C<attr> is set, links are formatted without whitespace and
enclosed in double quotes. Double quotes in links are escaped, but no
other characters are (which can also make the result invalid
HTML). When C<text> is set, the output text has normalized whitespace;
nodes containing only whitespace are dropped. Gaps between adjacent
tag positions are displayed as an empty line. All values end with a
newline.

=head2 canonicalize_tags

Undoes the formatting done by C<format_tags>. Takes a list of lines
such as those output by C<format_tags> when called without any
formatting options and converts them to a list of tag names. Note that
C<canonicalize_tags> doesn't handle attributes, text lines nor index
numbers.