The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package HTML::RelExtor;

use strict;
use vars qw($VERSION);
$VERSION = '0.03';

use HTML::Parser;
use URI;

use base qw(HTML::Parser);

sub new {
    my($class, %args) = @_;
    my $self = $class->SUPER::new(
	start_h     => [ "_start_tag", "self,tagname,attr" ],
	report_tags => [ qw(a link base) ],
    );
    if (my $base = delete $args{base}) {
	$self->{relextor_base} = $base;
    }
    $self;
}

sub _start_tag {
    my($self, $tag, $attr) = @_;

    # If there's <base href="...">, change the base URL
    if ($tag eq 'base' && exists $attr->{href}) {
	$self->{relextor_base} = $attr->{href};
	return;
    }

    # no 'rel' nor 'rev' attribute
    return unless exists $attr->{rel} or exists $attr->{rev};

    my $href = $attr->{href} or return;
    $href = URI->new_abs($href, $self->{relextor_base})->as_string
	if $self->{relextor_base};
    my $link = HTML::RelExtor::Link->new($tag, $href, $attr);
    if ($tag eq 'a') {
	$self->handler(text => sub {
			   my($self, $text) = @_;
			   $link->{text} = $text;
			   $self->handler(text => undef);
		       }, "self,dtext");
    }
    push @{$self->{links}}, $link;
}

sub links {
    my $self = shift;
    my %args = @_;
    my @links = $self->{links} ? @{$self->{links}} : ();

    if ($args{rel} && $args{rev}) {
        Carp::croak("You can't pass both rev and rel to the links()");
    }

    if ($args{rel}) {
        @links = grep $_->has_rel($args{rel}), @links;
    } elsif ($args{rev}) {
        @links = grep $_->has_rev($args{rev}), @links;
    }

    return @links;
}

sub parse_file {
    my $self = shift;
    delete $self->{links};
    $self->SUPER::parse_file(@_);
}

package HTML::RelExtor::Link;

sub new {
    my($class, $tag, $href, $attr) = @_;
    my @rel = grep length, split /\s+/, ($attr->{rel} || '');
    my @rev = grep length, split /\s+/, ($attr->{rev} || '');
    bless {
	tag  => $tag,
	href => $href,
	attr => $attr,
	rel  => \@rel,
	rev  => \@rev,
    }, $class;
}

sub tag {
    my $self = shift;
    $self->{tag};
}

sub href {
    my $self = shift;
    $self->{href};
}

sub attr {
    my $self = shift;
    $self->{attr};
}

sub rel {
    my $self = shift;
    @{$self->{rel}};
}

sub rev {
    my $self = shift;
    @{$self->{rev}};
}

sub has_rel {
    my($self, $tag) = @_;
    scalar grep { $_ eq $tag } $self->rel;
}

sub has_rev {
    my($self, $tag) = @_;
    scalar grep { $_ eq $tag } $self->rev;
}

sub text {
    my $self = shift;
    $self->{text};
}

1;
__END__

=head1 NAME

HTML::RelExtor - Extract "rel" and "rev" information from LINK and A tags.

=head1 SYNOPSIS

  use HTML::RelExtor;

  my $parser = HTML::RelExtor->new();
  $parser->parse($html);

  for my $link ($parser->links) {
      print $link->href, "\n" if $link->has_rel('nofollow');
  }

  my($canonical) = grep $_->has_rev('canonical'), $parser->links;
  if ($canonical) {
      $shorten_url = $canonical->href;
  }

=head1 DESCRIPTION

HTML::RelExtor is a HTML parser module to extract relationship information from C<A> and L<LINK> HTML tags.

=head1 METHODS

=over 4

=item new

  $parser = HTML::RelExtor->new();
  $parser = HTML::RelExtor->new(base => $base_uri);

Creates new HTML::RelExtor object.

=item parse

  $parser->parse($html);

Parses HTML content. See L<HTML::Parser> for other method signatures.

=item links

  my @links = $parser->links();
  my @links = $parser->links(rel => 'alternate');
  my @links = $parser->links(rev => 'canonical');

Returns list of link information with 'rel' or 'rev' attributes as a
HTML::RelExtor::Link object. When given I<rel> or I<rev> parameter,
returns only links that has the I<rel> or I<rev> value.

  # These are equivalent
  @links = $parser->links(rel => 'alternate');
  @links = grep $_->has_rel('alternate'), $parser->links;

=back

=head1 HTML::RelExtor::Link METHODS

=over 4

=item href

  my $href = $link->href;

Returns 'href' attribute of links.

=item tag

  my $tag = $link->tag;

Returns tag name of links in lowercase, either 'a' or 'link';

=item attr

  my $attr = $link->attr;

Returns a hash reference of attributes of the tag.

=item rel

  my @rel = $link->rel;

Returns list of 'rel' attributes. If a link contains C<< <a href="tag nofollow">blahblah</a> >>, C<rel()> method returns a list that contains C<tag> and C<nofollow>.

=item rev

  my @rev = $link->rev;

Returns list of 'rev' attributes.

=item has_rel

  if ($link->has_rel('nofollow')) { }

A handy shortcut method to find out if a link contains specific relationship.

=item has_rev

  if ($link->has_rev('canonical')) { }

A handy shortcut method to find out if a link contains specific reverse relationship.

=item text

  my $text = $link->text;

Returns text inside tags, only avaiable with A tags. It returns undef value when called with LINK tags.

=back

=head1 EXAMPLES

Collect A links tagged with C<< rel="friend" >> used in XFN (XHTML Friend Network).

  my $p = HTML::RelExtor->new();
  $p->parse($html);

  my @links = map { $_->href }
      grep { $_->tag eq 'a' && $_->has_rel('friend') } $p->links;

=head1 TODO

=over 4

=item *

Accept callback parameter when creating a new instance.

=back

=head1 AUTHOR

Tatsuhiko Miyagawa E<lt>miyagawa at bulknews.netE<gt>

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 SEE ALSO

L<HTML::LinkExtor>, L<HTML::Parser>

L<http://www.w3.org/TR/REC-html40/struct/links.html>

L<http://www.google.com/googleblog/2005/01/preventing-comment-spam.html>

L<http://developers.technorati.com/wiki/RelTag>

L<http://gmpg.org/xfn/11>

L<http://shiflett.org/blog/2009/apr/save-the-internet-with-rev-canonical>

=cut