The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package HTML::LinkFilter;
use strict;
use warnings;
use HTML::Parser;

our $VERSION = "0.03";

## The html tags which might have URLs
# the master list of tagolas and required attributes (to constitute a link)
our %TAGS = ( # Copied from HTML::LinkExtractor 0.13
              a => [qw( href )],
         applet => [qw( archive code codebase src )],
           area => [qw( href )],
           base => [qw( href )],
        bgsound => [qw( src )],
     blockquote => [qw( cite )],
           body => [qw( background )],
            del => [qw( cite )],
            div => [qw( src )], # IE likes it, but don't know where it's documented
          embed => [qw( pluginspage pluginurl src )],
           form => [qw( action )],
          frame => [qw( src longdesc  )],
         iframe => [qw( src )],
         ilayer => [qw( background src )],
            img => [qw( dynsrc longdesc lowsrc src usemap )],
          input => [qw( dynsrc lowsrc src )],
            ins => [qw( cite )],
        isindex => [qw( action )], # real oddball
          layer => [qw( src )],
           link => [qw( src href )],
         object => [qw( archive classid code codebase data usemap )],
              q => [qw( cite )],
         script => [qw( src  )], # HTML::Tagset has 'for' ~ it's WRONG!
          sound => [qw( src )],
          table => [qw( background )],
             td => [qw( background )],
             th => [qw( background )],
             tr => [qw( background )],
  ## the exotic cases
           meta => undef,
     '!doctype' => [qw( url )], # is really a process instruction
);

### HTML::Parser method, not for __PACKAGE__.
my $default_h_sub = sub {
    my( $self, $tagname, $original ) = @_;

    push @{ $self->{link_filter}{tags} }, $original;

    return;
};

### HTML::Parser method, not for __PACKAGE__.
my $start_h_sub = sub {
    my( $self, $tagname, $attr_ref, $original ) = @_;

    unless ( exists $TAGS{ $tagname } ) {
        push @{ $self->{link_filter}{tags} }, $original
            and return;
    }

    unless ( grep { my $name = $_; grep { $_ eq $name } @{ $TAGS{ $tagname } } } keys %{ $attr_ref } ) {
        push @{ $self->{link_filter}{tags} }, $original
            and return;
    }

    unless ( $self->{link_filter}{cb} ) {
        push @{ $self->{link_filter}{tags} }, $original
            and return;
    }

    foreach my $attr ( keys %{ $attr_ref } ) {
        next
            unless grep { $_ eq $attr } @{ $TAGS{ $tagname } };

        my $new = $self->{link_filter}{cb}->(
            $tagname, $attr, $attr_ref->{ $attr }, $attr_ref,
        );

        $attr_ref->{ $attr } = $new
            if defined $new;
    }

    my $tag = do {
        my $build    = q{};
        my $is_xhtml = grep { $_ eq q{/} } keys %{ $attr_ref };
        my $attr     = join q{ }, map {
            join q{=}, $_, join q{}, q{"}, $attr_ref->{ $_ }, q{"},
        } grep { $_ ne q{/} } sort keys %{ $attr_ref };

        if ( $attr && $is_xhtml ) {
            $build = "<$tagname $attr />";
        }
        elsif ( $attr && ! $is_xhtml ) {
            $build = "<$tagname $attr>";
        }
        elsif ( ! $attr && $is_xhtml ) {
            $build = "<$tagname />";
        }
        else {
            $build = "<$tagname>";
        }

        if ( chomp $original ) {
            $build .= "\n";
        }

        $build;
    };

    push @{ $self->{link_filter}{tags} }, $tag;

    return $self;
};

sub new {
    my $class = shift;
    my %param = @_;

    my $self = bless \%param, $class;

    my $p = HTML::Parser->new(
        api_version => 3,
        start_h   => [
            $start_h_sub,   "self, tagname, attr, text",
        ],
        default_h => [
            $default_h_sub, "self, tagname, text",
        ],
    );

    $p->{link_filter} = $self;
    $self->{p}        = $p;
    $self->_init_tags;

    return $self;
}

sub change {
    my $self = shift;
    my( $html, $callback_sub ) = @_;

    $self->_init_tags;
    $self->{cb} = $callback_sub;
    $self->{p}->parse( $html );
    $self->{p}->eof;

    return $self;
}

sub _init_tags {
    my $self = shift;
    $self->{tags} = [ ];
    return $self;
}

sub tags {
    return shift->{tags};
}

sub html {
    my $self = shift;

    return join q{}, @{ $self->tags };
}

1;
__END__

=head1 NAME

HTML::LinkFilter - Changes all links in HTML

=head1 SYNOPSIS

  use HTML::LinkFilter;
  use Data::Dumper;

  my $html = do { local $/; <DATA> };

  my $filter = HTML::LinkFilter->new;
  $filter->change( \$html, \&callback );

  print Dumper $filter->tags;

  sub callback {
      my( $tagname, $attr, $value ) = @_;

      return; # Uses original.
  }

  __DATA__
  <!doctype html>
  <html>
    <head>
      <meta charset="UTF-8" />
    </head>
    <body>
      <h1><a href="/">example.com</a></h1>
    </body>
  </html>

=head1 DESCRIPTION

HTML::LinkFilter can change all links in passed HTML.

This requires callback sub.  The sub takes tagname, attr, value,
and returns new value, then it will be replaced. Or uses original
when returns undef.

*Note* this breaks attributes order in tag.

=head1 METHODS

=over

=item new

Returns instance.

=item change

Changes html to tags by using callback filter.
Callback filter is an argument which changes link.

Callback filter will take args those are tagname, attr, value,
and return value is pushed to $self->tags as a new value.

Callback filter can tell 'use original' to parser by returns undef.

=item tags

Returns some changed HTML tags.

=item html

Returns HTML code which is parsed.

=back

=head1 AUTHOR

kuniyoshi kouji E<lt>kuniyoshi@cpan.orgE<gt>

=head1 SEE ALSO

=head1 LICENSE

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut