The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl
# Copyright 2015 Jeffrey Kegler
# This file is part of Marpa::R2.  Marpa::R2 is free software: you can
# redistribute it and/or modify it under the terms of the GNU Lesser
# General Public License as published by the Free Software Foundation,
# either version 3 of the License, or (at your option) any later version.
#
# Marpa::R2 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser
# General Public License along with Marpa::R2.  If not, see
# http://www.gnu.org/licenses/.

use 5.010;
use strict;
use warnings;
use English qw( -no_match_vars );
use Marpa::R2::HTML;
use Fatal qw(open close);
use Getopt::Long;

sub usage
{
    say {*STDERR} "$PROGRAM_NAME html_fmt [uri|file]" or die "say failed: $ERRNO";
    exit 1;
}

my $help_flag              = 0;
my $allowed_avoid_whitespace_flags = { map { ( $_, 1) } qw(comment yes no) };
my $avoid_whitespace_flag  = 'yes';
local $main::ADDED_TAG_COMMENT_FLAG = 1;
my $ws_ok_before_end_tag_flag = 0;
my $ws_ok_after_start_tag_flag = 1;
my $dump_config_flag = 0;
my $dump_AHFA_flag = 0;
my $trace_terminals_flag = 0;
my $trace_cruft_flag = 0;
my $trace_values_flag = 0;
my $compile_flag;
usage()
    if not Getopt::Long::GetOptions(
    'help'               => \$help_flag,
    'avoid-whitespace=s' => \$avoid_whitespace_flag,
    'ws-ok-before-end-tag!' => \$ws_ok_before_end_tag_flag,
    'ws-ok-after-start-tag!' => \$ws_ok_after_start_tag_flag,
    'added-tag-comment!' => \$main::ADDED_TAG_COMMENT_FLAG,

    # undocumented
    'dump-config' => \$dump_config_flag,
    'dump-AHFA' => \$dump_AHFA_flag,
    'compile=s' => \$compile_flag,
    'trace-terminals' => \$trace_terminals_flag,
    'trace-cruft' => \$trace_cruft_flag,
    'trace-values' => \$trace_values_flag,
    );
usage() if $help_flag or scalar @ARGV > 1;

if ( not $allowed_avoid_whitespace_flags->{$avoid_whitespace_flag} ) {
    die "Bad avoid-whitespace flag\n",
        'avoid-whitespace must be one of the following: ', join q{ },
        keys %{$allowed_avoid_whitespace_flags};
}

my $locator = shift;
my $document;
GET_DOCUMENT: {
    if ( not defined $locator ) {
        local $RS = undef;
        ## no critic(InputOutput::ProhibitExplicitStdin)
        $document = <STDIN>;
        last GET_DOCUMENT;
    }
    if ( $locator =~ /\A [[:alnum:]]+ [:] /xms ) {
        require WWW::Mechanize;
        my $mech = WWW::Mechanize->new( autocheck => 1 );
        $mech->get($locator);
        $document = $mech->content;
        undef $mech;
        last GET_DOCUMENT;
    } ## end if ( $locator =~ /\A [[:alnum:]]+ [:] /xms )
    {
        local $RS = undef;
        open my $fh, q{<}, $locator;
        $document = <$fh>;
        close $fh;
    }
} ## end GET_DOCUMENT:

sub post_process {
    my ($value) = @_;
    my @text_pieces = ();
    DATUM: for my $line_data ( map { @{$_} } @{$value} ) {
        my ( $type, $indent ) = @{$line_data};
        if ( $type eq 'text' ) {
            my $text = $line_data->[2];
            my $has_trailing_ws = $text =~ s/ \s+ \z//xms;
            if ( $text =~ s/ \A \s+ //xms ) {
                push @text_pieces, [ 'whitespace', $indent ];
            }
            my @lines = grep { $_ =~ /\S/xms } split /\n/xms, $text;
            for my $line_no ( 0 .. $#lines ) {
                my $line = $lines[$line_no];
                $line =~ s/\A[ \t]+//xms;
                $line =~ s/[ \t]+\z//xms;
                $line =~ s/[ \t]+/ /xms;
                push @text_pieces, [ 'whitespace', $indent ] if $line_no;
                push @text_pieces, [ 'text', $indent, $line ];
            } ## end for my $line_no ( 0 .. $#lines )
            push @text_pieces, [ 'whitespace', $indent ] if $has_trailing_ws;
            next DATUM;
        } ## end if ( $type eq 'text' )
        if ( $type eq 'msg: missing start tag' ) {
            my $location = $line_data->[2];
            my $tagname  = $line_data->[3];
            GIVEN_LOCATION: {
                if ( $location eq 'preceeding' ) {
                    push @text_pieces,
                        [
                        'html_fmt comment',
                        $indent + 1,
                        'Preceeding start tag is replacement for a missing one'
                        ];
                    last GIVEN_LOCATION;
                } ## end if ( $location eq 'preceeding' )
                if ( $location eq 'following pre' ) {
                    push @text_pieces,
                        [
                        'html_fmt comment',
                        $indent,
                        "Inside following <pre>, a start tag is missing: <$tagname>"
                        ];
                    last GIVEN_LOCATION;
                } ## end if ( $location eq 'following pre' )
                Carp::croak(
                    "Internal error: unprovided-for missing start tag location: $_"
                );
            } ## end GIVEN_LOCATION:
            next DATUM;
        } ## end if ( $type eq 'msg: missing start tag' )
        if ( $type eq 'msg: missing end tag' ) {
            my $location = $line_data->[2];
            my $tagname  = $line_data->[3];
            GIVEN_LOCATION: {
                if ( $location eq 'following' ) {
                    push @text_pieces,
                        [
                        'html_fmt comment',
                        $indent + 1,
                        'Following end tag is replacement for a missing one'
                        ];
                    last GIVEN_LOCATION;
                } ## end if ( $location eq 'following' )
                if ( $location eq 'following pre' ) {
                    push @text_pieces,
                        [
                        'html_fmt comment',
                        $indent,
                        "Inside following <pre>, an end tag is missing: </$tagname>"
                        ];
                    last GIVEN_LOCATION;
                } ## end if ( $location eq 'following pre' )
                Carp::croak(
                    "Internal error: unprovided-for missing end tag location: $_"
                );
            } ## end GIVEN_LOCATION:
            next DATUM;
        } ## end if ( $type eq 'msg: missing end tag' )
        if ( $type eq 'msg: missing pre end tag' ) {
            push @text_pieces,
                [
                'html_fmt comment',
                $indent,
                'An missing end tag was added to close the preceding <pre> element'
                ];
            next DATUM;
        } ## end if ( $type eq 'msg: missing pre end tag' )
        if ( $type eq 'msg: missing pre start tag' ) {
            push @text_pieces,
                [
                'html_fmt comment',
                $indent,
                'A missing start tag was added in front of the following <pre> element'
                ];
            next DATUM;
        } ## end if ( $type eq 'msg: missing pre start tag' )
        if ( $type eq 'msg: cruft' ) {
            my $location   = $line_data->[2];
            my $cruft      = $line_data->[3];
            my $token_type = $line_data->[4];
            my $node_desc =
                  $token_type eq 'S' ? 'start tag'
                : $token_type eq 'E' ? 'end tag'
                : $token_type eq 'T' ? 'text'
                :                      'node';
            GIVEN_LOCATION: {
                if ( $location eq 'following' ) {
                    push @text_pieces,
                        [
                        'html_fmt comment',
                        $indent,
                        "Next $node_desc is cruft"
                        ];
                    last GIVEN_LOCATION;
                } ## end if ( $location eq 'following' )
                if ( $location eq 'following pre' ) {

                    # Make sure the cruft quoted inside
                    # the HTML comment does not
                    # disrupt the comment.
                    ( my $safe_cruft = $cruft ) =~ s/--/- -/xms;
                    push @text_pieces,
                        [
                        'html_fmt comment',
                        $indent,
                        qq{Inside the following <pre>, $node_desc is cruft:\n$safe_cruft}
                        ];
                    last GIVEN_LOCATION;
                } ## end if ( $location eq 'following pre' )
                Carp::croak(
                    "Internal error: unprovided-for cruft location: $_");
            } ## end GIVEN_LOCATION:
            next DATUM;
        } ## end if ( $type eq 'msg: cruft' )
        push @text_pieces, [ $type, $indent, $line_data->[2] ];
    } ## end for my $line_data ( map { @{$_} } @{$value} )
    my @ws_unsafe;
    $ws_unsafe[ scalar @text_pieces ] = 0;    # size the array
    $ws_unsafe[0] = 0;
    my $last_non_comment_ix = scalar @text_pieces;
    my $safe_after_piece    = 0;
    my $first_body_tag;
    my $last_body_tag;
    TEXT_PIECE: for my $text_piece_ix ( 0 .. $#text_pieces ) {
        my ( $type, $indent, $text ) = @{ $text_pieces[$text_piece_ix] };
        my $safe_before_piece = 0;
        WHITESPACE_BY_TYPE: {

            # If a comment, after-piece is whatever current value
            # of $safe_after_piece,
            # leave before as-is
            last WHITESPACE_BY_TYPE if $type eq 'html_fmt comment';
            last WHITESPACE_BY_TYPE if $type eq 'comment';
            if ( $type eq 'whitespace' ) {

                # Safe before and after other whitespace
                $safe_before_piece = $safe_after_piece = 1;
                last WHITESPACE_BY_TYPE;
            } ## end if ( $type eq 'whitespace' )
            if ( $type eq 'end tag' ) {

                if ( $text =~ /\A [^[:alnum:]]+ body [^[:alnum:]]+ \z /xmsi )
                {
                    $last_body_tag = $text_piece_ix;
                }

                # Unsafe after an end tag, but safe before it
                $safe_before_piece = $ws_ok_before_end_tag_flag;
                $safe_after_piece  = 0;
                last WHITESPACE_BY_TYPE;
            } ## end if ( $type eq 'end tag' )
            if ( $type eq 'start tag' ) {

                if ( not defined $first_body_tag
                    and $text
                    =~ /\A [^[:alnum:]]+ body [^[:alnum:]]+ \z /xmsi )
                {
                    $first_body_tag = $text_piece_ix;
                } ## end if ( not defined $first_body_tag and $text =~ ...)

                # Leave safe status as-is before start tag
                # Whitespace safe after a start tag
                $safe_after_piece = $ws_ok_after_start_tag_flag;
                last WHITESPACE_BY_TYPE;
            } ## end if ( $type eq 'start tag' )
            if ( $type eq 'text' ) {

                # Leave safe status as-is before text
                # Whitespace not safe after text
                $safe_after_piece = 0;
                last WHITESPACE_BY_TYPE;
            } ## end if ( $type eq 'text' )
                # Blocks can occur inline, etc.
                # So with everything else, be conservative.
                # Whitespace is unsafe after, and left as-is before.
            $safe_after_piece = 0;
        } ## end WHITESPACE_BY_TYPE:
        $ws_unsafe[ $text_piece_ix + 1 ] = $safe_after_piece ? 0 : 1;
        if ($safe_before_piece) {
            for my $ix ( $last_non_comment_ix + 1 .. $text_piece_ix ) {
                $ws_unsafe[$ix] = 0;
            }
        }
        if ( $type ne 'comment' and $type ne 'html_fmt comment' ) {
            $last_non_comment_ix = $text_piece_ix;
        }
    } ## end TEXT_PIECE: for my $text_piece_ix ( 0 .. $#text_pieces )

    # Whitespace safe before the body
    $first_body_tag //= -1;
    for (
        my $text_piece_ix = 0;
        $text_piece_ix <= $first_body_tag;
        $text_piece_ix++
        )
    {
        $ws_unsafe[$text_piece_ix] = 0;
    } ## end for ( my $text_piece_ix = 0; $text_piece_ix <= $first_body_tag...)

    # Whitespace safe after the body,
    # or at least at the EOF if no end tag
    $last_body_tag //= $#text_pieces;
    for (
        my $text_piece_ix = scalar @text_pieces;
        $text_piece_ix > $last_body_tag;
        $text_piece_ix--
        )
    {
        $ws_unsafe[$text_piece_ix] = 0;
    } ## end for ( my $text_piece_ix = scalar @text_pieces; $text_piece_ix...)

    my @output = ();
    CREATE_OUTPUT: {
        if ( $avoid_whitespace_flag eq 'comment' ) {
            TEXT_PIECE: for my $text_piece_ix ( 0 .. $#text_pieces ) {
                my ( $type, $indent, $text ) =
                    @{ $text_pieces[$text_piece_ix] };
                next TEXT_PIECE if $type eq 'whitespace';
                my $suffix = $ws_unsafe[ $text_piece_ix + 1 ] ? '<!--' : q{};
                my $indentation = q{  } x $indent;
                if ( $type eq 'html_fmt comment' ) {
                    push @output,
                          $indentation
                        . '  html_fmt: '
                        . $text . ' -->'
                        . $suffix;
                    next TEXT_PIECE;
                } ## end if ( $type eq 'html_fmt comment' )
                if ( $ws_unsafe[$text_piece_ix] ) {
                    $indentation =
                          $indentation
                        . '  html_fmt: this comment is to avoid introducing whitespace'
                        . "\n"
                        . $indentation . '-->';
                } ## end if ( $ws_unsafe[$text_piece_ix] )
                if ( $type eq 'text' and $text =~ /\S/xms ) {
                    push @output, $indentation . $text . $suffix;
                    next TEXT_PIECE;
                }
                push @output, $indentation . $text . $suffix;
            } ## end TEXT_PIECE: for my $text_piece_ix ( 0 .. $#text_pieces )
            last CREATE_OUTPUT;
        } ## end if ( $avoid_whitespace_flag eq 'comment' )
        {
            # $avoid_whitespace eq 'yes' or $avoid_whitespace eq 'no'
            my $avoid_whitespace_is_yes = $avoid_whitespace_flag ne 'no';
            TEXT_PIECE: for my $text_piece_ix ( 0 .. $#text_pieces ) {
                my ( $type, $indent, $text ) =
                    @{ $text_pieces[$text_piece_ix] };
                next TEXT_PIECE if $type eq 'whitespace';
                if ( $avoid_whitespace_is_yes and $ws_unsafe[$text_piece_ix] )
                {
                    my $indentation = q{  } x ($indent);
                    if ( $type eq 'html_fmt comment' ) {
                        $text =
                              qq{<!--\n}
                            . $indentation
                            . '    html_fmt: '
                            . $text . "\n"
                            . $indentation . '  -->';
                    } ## end if ( $type eq 'html_fmt comment' )
                    $output[-1] .= $text;
                    next TEXT_PIECE;
                } ## end if ( $avoid_whitespace_is_yes and $ws_unsafe[...])
                my $indentation = q{  } x $indent;
                if ( $type eq 'text' and $text =~ /\S/xms ) {
                    push @output, $indentation . $text;
                    next TEXT_PIECE;
                }
                if ( $type eq 'html_fmt comment' ) {
                    push @output,
                        $indentation . '<!-- html_fmt: ' . $text . ' -->';
                    next TEXT_PIECE;
                }
                push @output, $indentation . $text;
            } ## end TEXT_PIECE: for my $text_piece_ix ( 0 .. $#text_pieces )
            last CREATE_OUTPUT;
        }
    } ## end CREATE_OUTPUT:
    return join "\n", @output, q{};
} ## end sub post_process

sub do_pre {
    my @new_line_data = ();
    my $start_tag     = Marpa::R2::HTML::start_tag();
    my $end_tag       = Marpa::R2::HTML::end_tag();

    CHILD:
    for my $value ( @{ Marpa::R2::HTML::values() } ) {
        for my $line_data ( @{$value} ) {
            if ( $line_data->[0] =~ /\A msg [:] /xms ) {
                push @new_line_data,
                    [
                    $line_data->[0], 0,
                    'following pre', @{$line_data}[ 3 .. $#{$line_data} ]
                    ];
            } ## end if ( $line_data->[0] =~ /\A msg [:] /xms )
        } ## end for my $line_data ( @{$value} )
    } ## end CHILD: for my $value ( @{ Marpa::R2::HTML::values() } )

    if ( not defined $start_tag ) {
        push @new_line_data, [ 'msg: missing pre start tag', 0 ]
           if $main::ADDED_TAG_COMMENT_FLAG;
        push @new_line_data, [ 'start tag', 0, '<pre>' ];
    }

    my $original = Marpa::R2::HTML::original();
    push @new_line_data, [ 'pre', 0, $original ];

    if ( not defined $end_tag ) {
        push @new_line_data, [ 'end tag', 0, '</pre>' ]
           if $main::ADDED_TAG_COMMENT_FLAG;
        push @new_line_data, [ 'msg: missing pre end tag', 0 ];
    }
    return \@new_line_data;
} ## end sub do_pre

sub do_cruft {
    my $literal = Marpa::R2::HTML::literal();
    my @new_line_data = ( [ 'msg: cruft', 0, 'following', $literal, Marpa::R2::HTML::token_type() ] );
    push @new_line_data, [ 'cruft', 0, $literal ];
    return \@new_line_data;
} ## end sub do_cruft

sub do_comment {
    my $literal = Marpa::R2::HTML::literal();
    return [ [ 'comment', 0, $literal ] ];
}

sub do_default {
    my $tagname       = Marpa::R2::HTML::tagname();
    my @new_line_data = ();

    my $start_tag = Marpa::R2::HTML::start_tag();
    my $end_tag   = Marpa::R2::HTML::end_tag();
    my $is_empty_element  = Marpa::R2::HTML::is_empty_element();
    {
        ## no critic(Variables::ProhibitPackageVars)
        my $tag_type =
            $is_empty_element
            ? 'empty element'
            : 'start tag';
        ##use critic
        if ( defined $start_tag ) {
            push @new_line_data, [ $tag_type, 0, $start_tag ];
        }
        else {
            push @new_line_data, [ $tag_type, 0, '<' . $tagname . '>' ];
            push @new_line_data,
                [ 'msg: missing start tag', 0, 'preceeding', $tagname ]
                if $main::ADDED_TAG_COMMENT_FLAG;
        } ## end else [ if ( defined $start_tag ) ]
    }

    my @descendant_data = @{ Marpa::R2::HTML::descendants('value,original') };
    my $first_content_child = defined $start_tag ? 1 : 0;
    my $last_content_child = $#descendant_data - ( defined $end_tag ? 1 : 0 );
    CHILD:
    for my $descendant_data_ix ( $first_content_child .. $last_content_child )
    {
        my ( $value, $original ) = @{ $descendant_data[$descendant_data_ix] };
        if ( defined $value ) {
            for my $line_data ( @{$value} ) {
                my ( $type, $indent, @data ) = @{$line_data};
                push @new_line_data, [ $type, $indent + 1, @data ];
            }
            next CHILD;
        } ## end if ( defined $value )
        push @new_line_data, [ 'text', 1, $original ];
    } ## end CHILD: for my $descendant_data_ix ( $first_content_child .. ...)

    if ( not $is_empty_element ) {
        if ( defined $end_tag ) {
            push @new_line_data, [ 'end tag', 0, $end_tag ];
        }
        else {
            push @new_line_data,
                [ 'msg: missing end tag', 0, 'following', $tagname ]
                if $main::ADDED_TAG_COMMENT_FLAG;
            push @new_line_data, [ 'end tag', 0, '</' . $tagname . '>' ];
        } ## end else [ if ( defined $end_tag ) ]
    } ## end if ( not $is_empty_element )

    return \@new_line_data;
} ## end sub do_default

my %html_args = (
    'script' => sub {
        return [ [ 'script', 0, Marpa::R2::HTML::original() ] ];
    },
    ':CRUFT'   => \&do_cruft,
    ':COMMENT' => \&do_comment,
    'pre'      => \&do_pre,
    q{*}       => \&do_default,
    ':TOP'     => sub { return Marpa::R2::HTML::values(); }
);

my %flags = (
    trace_terminals => $trace_terminals_flag,
    trace_cruft     => $trace_cruft_flag,
    trace_values    => $trace_values_flag,
    dump_config     => $dump_config_flag,
    dump_AHFA     => $dump_AHFA_flag,
);
if (defined $compile_flag) {
  open my $fh, q{<}, $compile_flag;
  my $source = join q{}, <$fh>;
  close $fh;
  $flags{compile} = \$source;
}

my $value_ref = Marpa::R2::HTML::html( \$document, \%html_args, \\%flags );
die 'Internal error: no parse' if not defined $value_ref;
if ($dump_config_flag or $dump_AHFA_flag) {
  print ${$value_ref};
  exit 0;
}

print post_process($value_ref) or die "print failed: $ERRNO";

exit 0;

# vim: set expandtab shiftwidth=4: