LinkExtractor.pm - metacpan.org

package HTML::LinkExtractor;

use strict;

use HTML::TokeParser 2; # use HTML::TokeParser::Simple 2;
use URI 1;
use Carp qw( croak );

use vars qw( $VERSION );
$VERSION = '0.13';

## The html tags which might have URLs
# the master list of tagolas and required attributes (to constitute a link)
use vars qw( %TAGS );
%TAGS = (
              a => [qw( href )],
         applet => [qw( archive code codebase src )],
           area => [qw( href )],
           base => [qw( href )],
        bgsound => [qw( src )],
     blockquote => [qw( cite )],
           body => [qw( background )],
            del => [qw( cite )],
            div => [qw( src )], # IE likes it, but don't know where it's documented
          embed => [qw( pluginspage pluginurl src )],
           form => [qw( action )],
          frame => [qw( src longdesc  )],
         iframe => [qw( src )],
         ilayer => [qw( background src )],
            img => [qw( dynsrc longdesc lowsrc src usemap )],
          input => [qw( dynsrc lowsrc src )],
            ins => [qw( cite )],
        isindex => [qw( action )], # real oddball
          layer => [qw( src )],
           link => [qw( src href )],
         object => [qw( archive classid code codebase data usemap )],
              q => [qw( cite )],
         script => [qw( src  )], # HTML::Tagset has 'for' ~ it's WRONG!
          sound => [qw( src )],
          table => [qw( background )],
             td => [qw( background )],
             th => [qw( background )],
             tr => [qw( background )],
  ## the exotic cases
           meta => undef,
     '!doctype' => [qw( url )], # is really a process instruction
);

## tags which contain <.*?> STUFF TO GET </\w+>
use vars qw( @TAGS_IN_NEED );
@TAGS_IN_NEED = qw(
    a
    blockquote
    del
    ins
    q
);

use vars qw( @VALID_URL_ATTRIBUTES );
@VALID_URL_ATTRIBUTES = qw(
        action
        archive
        background
        cite
        classid
        code
        codebase
        data
        dynsrc
        href
        longdesc
        lowsrc
        pluginspage
        pluginurl
        src
        usemap
);


sub new {
    my($class, $cb, $base, $strip) = @_;
    my $self = bless {}, $class;


    $self->{_cb} = $cb if defined $cb;
    $self->{_base} = URI->new($base) if defined $base;
    $self->strip( $strip || 0 );

    return $self;
}

sub strip {
    my( $self, $on ) = @_;
    return $self->{_strip} unless defined $on;
    return $self->{_strip} = $on ? 1 : 0;
}

## $p= HTML::TokeParser->new($filename || FILEHANDLE ||\$filecontents); # ## $p= HTML::TokeParser::Simple->new($filename || FILEHANDLE ||\$filecontents);

sub parse {
    my( $this, $hmmm ) = @_;
    my $tp = new HTML::TokeParser( $hmmm ); #     my $tp = new HTML::TokeParser::Simple( $hmmm );

    unless($tp) {
        croak qq[ Couldn't create a HTML::TokeParser object: $!]; #         croak qq[ Couldn't create a HTML::TokeParser::Simple object: $!];
    }

    $this->{_tp} = $tp;

    $this->_parsola();
    return();
}

sub _parsola {
    my $self = shift;

## a stack of links for keeping track of TEXT
## which is all of "<a href>text</a>"
    my @TEXT = ();
    $self->{_LINKS} = [];


#  ["S",  $tag, $attr, $attrseq, $text]
#  ["E",  $tag, $text]
#  ["T",  $text, $is_data]
#  ["C",  $text]
#  ["D",  $text]
#  ["PI", $token0, $text]

    while (my $T = $self->{_tp}->get_token() ) {
        my $NL; #NewLink
        my $Tag = $T->[1]; #         my $Tag = $T->return_tag;
        my $got_TAGS_IN_NEED=0;
## Start tag?
        if($T->[0] eq 'S' ) { #         if($T->is_start_tag) {
            next unless exists $TAGS{$Tag};

## Do we have a tag for which we want to capture text?
            $got_TAGS_IN_NEED = 0;
            $got_TAGS_IN_NEED = grep { /^\Q$Tag\E$/i } @TAGS_IN_NEED;

## then check to see if we got things besides META :)
            if(defined $TAGS{ $Tag }) {

                for my $Btag(@{$TAGS{$Tag}}) {
## and we check if they do have one with a value
                    if(exists $T->[2]->{ $Btag }) { #                     if(exists $T->return_attr()->{ $Btag }) {

                        $NL = $T->[2]; #                         $NL = $T->return_attr();
## TAGS_IN_NEED are tags in deed (start capturing the <a>STUFF</a>)
                        if($got_TAGS_IN_NEED) {
                            push @TEXT, $NL;
                            $NL->{_TEXT} = "";
                        }
                    }
                }
            }elsif($Tag eq 'meta') {
                $NL = $T->[2]; #                 $NL = $T->return_attr();

                if(defined $$NL{content} and length $$NL{content} and (
                    defined $$NL{'http-equiv'} &&  $$NL{'http-equiv'} =~ /refresh/i
                    or
                    defined $$NL{'name'} &&  $$NL{'name'} =~ /refresh/i
                    ) ) {

                    my( $timeout, $url ) = split m{;\s*?URL=}i, $$NL{content},2;
                    my $base = $self->{_base};
                    $$NL{url} = URI->new_abs( $url, $base ) if $base;
                    $$NL{url} = $url unless exists $$NL{url};
                    $$NL{timeout} = $timeout if $timeout;
                }
            }

            ## In case we got nested tags
            if(@TEXT) {
                $TEXT[-1]->{_TEXT} .= $T->[-1] ; #                 $TEXT[-1]->{_TEXT} .= $T->as_is;
            }

## Text?
        }elsif($T->[0] eq 'T' ) { #         }elsif($T->is_text) {
            $TEXT[-1]->{_TEXT} .= $T->[-2]  if @TEXT; #             $TEXT[-1]->{_TEXT} .= $T->as_is if @TEXT;
## Declaration?
        }elsif($T->[0] eq 'D' ) { #         }elsif($T->is_declaration) {
## We look at declarations, to get anly custom .dtd's (tis linky)
            my $text = $T->[-1] ; #             my $text = $T->as_is;
            if( $text =~ m{ SYSTEM \s \" ( [^\"]* ) \" > $ }ix ) {
                $NL = { raw => $text, url => $1, tag => '!doctype' };
            }
## End tag?
        }elsif($T->[0] eq 'E' ){ #         }elsif($T->is_end_tag){
## these be ignored (maybe not in between <a...></a> tags
## unless we're stacking (bug #5723)
            if(@TEXT and exists $TAGS{$Tag}) {
                $TEXT[-1]->{_TEXT} .= $T->[-1] ; #                 $TEXT[-1]->{_TEXT} .= $T->as_is;
                my $pop = pop @TEXT;
                $TEXT[-1]->{_TEXT} .= $pop->{_TEXT} if @TEXT;
                $pop->{_TEXT} = _stripHTML( \$pop->{_TEXT} ) if $self->strip;
                $self->{_cb}->($self, $pop) if exists $self->{_cb};
            }
        }

        if(defined $NL) {
            $$NL{tag} = $Tag;

            my $base = $self->{_base};

            for my $at( @VALID_URL_ATTRIBUTES ) {
                if( exists $$NL{$at} ) {
                    $$NL{$at} = URI->new_abs( $$NL{$at}, $base) if $base;
                }
            }

            if(exists $self->{_cb}) {
                $self->{_cb}->($self, $NL ) if not $got_TAGS_IN_NEED or not @TEXT; #bug#5470
            } else {
                push @{$self->{_LINKS}}, $NL;
            }
        }
    }## endof while (my $token = $p->get_token)

    undef $self->{_tp};
    return();
}

sub links {
    my $self = shift;
    ## just like HTML::LinkExtor's
    return $self->{_LINKS};
}


sub _stripHTML {
    my $HtmlRef = shift;
    my $tp = new HTML::TokeParser( $HtmlRef ); #     my $tp = new HTML::TokeParser::Simple( $HtmlRef );
    my $t = $tp->get_token(); # MUST BE A START TAG (@TAGS_IN_NEED)
                              # otherwise it ain't come from LinkExtractor
    if($t->[0] eq 'S' ) { #     if($t->is_start_tag) {
        return $tp->get_trimmed_text( '/'.$t->[1] ); #         return $tp->get_trimmed_text( '/'.$t->return_tag );
    } else {
        require Data::Dumper;
        local $Data::Dumper::Indent=1;
        die " IMPOSSIBLE!!!! ",
            Data::Dumper::Dumper(
                '$HtmlRef',$HtmlRef,
                '$t', $t,
            );
    }
}

1;

package main;

unless(caller()) {
    require Data::Dumper;
    if(@ARGV) {
        for my $file( @ARGV ) {
            if( -e $file ) {
                my $LX = new HTML::LinkExtractor( );
                $LX->parse( $file );
                print Data::Dumper::Dumper($LX->links);
                undef $LX;
            } else {
                warn "The file `$file' doesn't exist\n";
            }
        }
        
    } else {
    
        my $INPUT = q{
COUNT THEM BOYS AND GIRLS, LINKS OUTGHT TO HAVE 9 ELEMENTS.

1 <!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd">
2 <meta HTTP-EQUIV="Refresh" CONTENT="5; URL=http://www.foo.com/foo.html">
3 <base href="http://perl.org">
4 <a href="http://www.perlmonks.org">Perlmonks.org</a>
<p>

5 <a href="#BUTTER"  href="#SCOTCH">
    hello there
6 <img src="#AND" src="#PEANUTS">
7    <a href="#butter"> now </a>
</a>

8 <q CITE="http://www.shakespeare.com/">To be or not to be.</q>
9 <blockquote CITE="http://www.stonehenge.com/merlyn/">
    Just Another Perl Hacker,
</blockquote>
    };

        my $LX = new HTML::LinkExtractor();
        $LX->parse(\$INPUT);

        print scalar(@{$LX->links()})." we GOT\n";
        print Data::Dumper::Dumper( $LX->links() );
    }
    
}

__END__


=head1 NAME

HTML::LinkExtractor - Extract I<L<links|/"WHAT'S A LINK-type tag">> from an HTML document

=head1 DESCRIPTION

HTML::LinkExtractor is used for extracting links from HTML.
It is very similar to L<HTML::LinkExtor|HTML::LinkExtor>,
except that besides getting the URL, you also get the link-text.

Example ( B<please run the examples> ):

    use HTML::LinkExtractor;
    use Data::Dumper;

    my $input = q{If <a href="http://perl.com/"> I am a LINK!!! </a>};
    my $LX = new HTML::LinkExtractor();

    $LX->parse(\$input);

    print Dumper($LX->links);
    __END__
    # the above example will yield
    $VAR1 = [
              {
                '_TEXT' => '<a href="http://perl.com/"> I am a LINK!!! </a>',
                'href' => bless(do{\(my $o = 'http://perl.com/')}, 'URI::http'),
                'tag' => 'a'
              }
            ];

C<HTML::LinkExtractor> will also correctly extract nested
I<L<link-type|/"WHAT'S A LINK-type tag">> tags.

=head1 SYNOPSIS

    ## the demo
    perl LinkExtractor.pm
    perl LinkExtractor.pm file.html othefile.html

    ## or if the module is installed, but you don't know where

    perl -MHTML::LinkExtractor -e" system $^X, $INC{q{HTML/LinkExtractor.pm}} "
    perl -MHTML::LinkExtractor -e' system $^X, $INC{q{HTML/LinkExtractor.pm}} '

    ## or

    use HTML::LinkExtractor;
    use LWP qw( get ); #     use LWP::Simple qw( get );

    my $base = 'http://search.cpan.org';
    my $html = get($base.'/recent');
    my $LX = new HTML::LinkExtractor();

    $LX->parse(\$html);

    print qq{<base href="$base">\n};

    for my $Link( @{ $LX->links } ) {
    ## new modules are linked  by /author/NAME/Dist
        if( $$Link{href}=~ m{^\/author\/\w+} ) {
            print $$Link{_TEXT}."\n";
        }
    }

    undef $LX;
    __END__

    ## or

    use HTML::LinkExtractor;
    use Data::Dumper;

    my $input = q{If <a href="http://perl.com/"> I am a LINK!!! </a>};
    my $LX = new HTML::LinkExtractor(
        sub {
            print Data::Dumper::Dumper(@_);
        },
        'http://perlFox.org/',
    );

    $LX->parse(\$input);
    $LX->strip(1);
    $LX->parse(\$input);
    __END__

    #### Calculate to total size of a web-page
    #### adds up the sizes of all the images and stylesheets and stuff

    use strict;
    use LWP; #     use LWP::Simple;
    use HTML::LinkExtractor;
                                                        #
    my $url  = shift || 'http://www.google.com';
    my $html = get($url);
    my $Total = length $html;
                                                        #
    print "initial size $Total\n";
                                                        #
    my $LX = new HTML::LinkExtractor(
        sub {
            my( $X, $tag ) = @_;
                                                        #
            unless( grep {$_ eq $tag->{tag} } @HTML::LinkExtractor::TAGS_IN_NEED ) {
                                                        #
    print "$$tag{tag}\n";
                                                        #
                for my $urlAttr ( @{$HTML::LinkExtractor::TAGS{$$tag{tag}}} ) {
                    if( exists $$tag{$urlAttr} ) {
                        my $size = (head( $$tag{$urlAttr} ))[1];
                        $Total += $size if $size;
    print "adding $size\n" if $size;
                    }
                }
            }
        },
        $url,
        0
    );
                                                        #
    $LX->parse(\$html);
                                                        #
    print "The total size of \n$url\n is $Total bytes\n";
    __END__


=head1 METHODS

=head2 C<$LX-E<gt>new([\&callback, [$baseUrl, [1]]])>

Accepts 3 arguments, all of which are optional.
If for example you want to pass a C<$baseUrl>, but don't
want to have a callback invoked, just put C<undef> in place of a subref.

This is the only class method.

=over 4

=item 1

a callback ( a sub reference, as in C<sub{}>, or C<\&sub>)
which is to be called each time a new LINK is encountered
( for C<@HTML::LinkExtractor::TAGS_IN_NEED> this means
 after the closing tag is encountered )

The callback receives an object reference(C<$LX>) and a link hashref.


=item 2

and a base URL ( URI->new, so its up to you to make sure it's valid
which is used to convert all relative URI's to absolute ones.

    $ALinkP{href} = URI->new_abs( $ALink{href}, $base );

=item 3

A "boolean" (just stick with 1).
See the example in L<"DESCRIPTION">.
Normally, you'd get back _TEXT that looks like

    '_TEXT' => '<a href="http://perl.com/"> I am a LINK!!! </a>',

If you turn this option on, you'll get the following instead

    '_TEXT' => ' I am a LINK!!! ',

The private utility function C<_stripHTML> does this
by using L<HTML::TokeParser|HTML::TokeParser>s
method get_trimmed_text.

You can turn this feature on an off by using
C<$LX-E<gt>strip(undef E<verbar>E<verbar> 0 E<verbar>E<verbar> 1)>

=back

=head2 C<$LX-E<gt>parse( $filename E<verbar>E<verbar> *FILEHANDLE E<verbar>E<verbar> \$FileContent )>

Each time you call C<parse>, you should pass it a
C<$filename> a C<*FILEHANDLE> or a C<\$FileContent>

Each time you call C<parse> a new C<HTML::TokeParser> object 
is created and stored in C<$this-E<gt>{_tp}>.

You shouldn't need to mess with the TokeParser object.

=head2 C<$LX-E<gt>links()>

Only after you call C<parse> will this method return anything.
This method returns a reference to an ArrayOfHashes,
which basically looks like (Data::Dumper output)

    $VAR1 = [ { tag => 'img', src => 'image.png' }, ];

Please note that if yo provide a callback this array will be empty.


=head2 C<$LX-E<gt>strip( [ 0 || 1 ])>

If you pass in C<undef> (or nothing), returns the state of the option.
Passing in a true or false value sets the option.

If you wanna know what the option does see
L<C<$LX-E<gt>new([\&callback, [$baseUrl, [1]]])>|/"METHODS">

=head1 WHAT'S A LINK-type tag

Take a look at C<%HTML::LinkExtractor::TAGS> to see
what I consider to be link-type-tag.

Take a look at C<@HTML::LinkExtractor::VALID_URL_ATTRIBUTES> to see
all the possible tag attributes which can contain URI's (the links!!)

Take a look at C<@HTML::LinkExtractor::TAGS_IN_NEED> to see
the tags for which the C<'_TEXT'> attribute is provided,
like C<E<lt>a href="#"E<gt> TEST E<lt>/aE<gt>>


=head2 How can that be?!?!

I took at look at L<C<%HTML::Tagset::linkElements>|HTML::Tagset>
and the following URL's

    http://www.blooberry.com/indexdot/html/tagindex/all.htm

    http://www.blooberry.com/indexdot/html/tagpages/a/a-hyperlink.htm
    http://www.blooberry.com/indexdot/html/tagpages/a/applet.htm
    http://www.blooberry.com/indexdot/html/tagpages/a/area.htm

    http://www.blooberry.com/indexdot/html/tagpages/b/base.htm
    http://www.blooberry.com/indexdot/html/tagpages/b/bgsound.htm

    http://www.blooberry.com/indexdot/html/tagpages/d/del.htm
    http://www.blooberry.com/indexdot/html/tagpages/d/div.htm

    http://www.blooberry.com/indexdot/html/tagpages/e/embed.htm
    http://www.blooberry.com/indexdot/html/tagpages/f/frame.htm

    http://www.blooberry.com/indexdot/html/tagpages/i/ins.htm
    http://www.blooberry.com/indexdot/html/tagpages/i/image.htm
    http://www.blooberry.com/indexdot/html/tagpages/i/iframe.htm
    http://www.blooberry.com/indexdot/html/tagpages/i/ilayer.htm
    http://www.blooberry.com/indexdot/html/tagpages/i/inputimage.htm

    http://www.blooberry.com/indexdot/html/tagpages/l/layer.htm
    http://www.blooberry.com/indexdot/html/tagpages/l/link.htm

    http://www.blooberry.com/indexdot/html/tagpages/o/object.htm

    http://www.blooberry.com/indexdot/html/tagpages/q/q.htm

    http://www.blooberry.com/indexdot/html/tagpages/s/script.htm
    http://www.blooberry.com/indexdot/html/tagpages/s/sound.htm

    And the special cases 

    <!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd">
    http://www.blooberry.com/indexdot/html/tagpages/d/doctype.htm
    '!doctype'  is really a process instruction, but is still listed
    in %TAGS with 'url' as the attribute

    and

    <meta HTTP-EQUIV="Refresh" CONTENT="5; URL=http://www.foo.com/foo.html">
    http://www.blooberry.com/indexdot/html/tagpages/m/meta.htm
    If there is a valid url, 'url' is set as the attribute.
    The meta tag has no 'attributes' listed in %TAGS.


=head1 SEE ALSO

L<HTML::LinkExtor>, L<HTML::TokeParser>, L<HTML::Tagset>.

=head1 AUTHOR

D.H (PodMaster)


Please use http://rt.cpan.org/ to report bugs.

Just go to
http://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-Scrubber
to see a bug list and/or repot new ones.

=head1 LICENSE

Copyright (c) 2003, 2004 by D.H. (PodMaster).
All rights reserved.

This module is free software;
you can redistribute it and/or modify it under
the same terms as Perl itself.
The LICENSE file contains the full text of the license.

=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)