@@ -1,7 +1,14 @@
Revision history for Perl extension HTML::SimpleLinkExtor.
-1.25 - Fri Aug 5 07:56:17 2011
- * I need Test::More 0.96 for subtests and done_testing.
+1.27 - Tue Jan 7 08:01:22 2014
+ * RT #87288 - add the test for this bug to test_manifest
+
+1.26 - Fri Jan 3 14:00:31 2014
+ * Get rid of MYMETA
+
+1.25_01 - Fri Aug 9 14:34:27 2013
+ * Filter links by allowed tags. I should have been doing this
+ all along. RT #87288
1.24 - Thu Aug 4 06:37:30 2011
* Fix clear_links so it actually clears the links
@@ -4,8 +4,8 @@ lib/HTML/SimpleLinkExtor.pm
LICENSE
Makefile.PL
MANIFEST
+MANIFEST.SKIP
META.yml Module meta-data (added by MakeMaker)
-MYMETA.yml
README
scripts/linktractor
t/absolute_links.t
@@ -17,6 +17,8 @@ t/parse.t
t/pod.t
t/pod_coverage.t
t/relative_links.t
+t/rt87288.t
t/schemes.t
t/tags.t
t/test_manifest
+META.json Module JSON meta-data (added by MakeMaker)
@@ -0,0 +1,65 @@
+
+#!start included /usr/local/perls/perl-5.18.1/lib/5.18.1/ExtUtils/MANIFEST.SKIP
+# Avoid version control files.
+\bRCS\b
+\bCVS\b
+\bSCCS\b
+,v$
+\B\.svn\b
+\B\.git\b
+\B\.gitignore\b
+\b_darcs\b
+\B\.cvsignore$
+
+# Avoid VMS specific MakeMaker generated files
+\bDescrip.MMS$
+\bDESCRIP.MMS$
+\bdescrip.mms$
+
+# Avoid Makemaker generated and utility files.
+\bMANIFEST\.bak
+\bMakefile$
+\bblib/
+\bMakeMaker-\d
+\bpm_to_blib\.ts$
+\bpm_to_blib$
+\bblibdirs\.ts$ # 6.18 through 6.25 generated this
+
+# Avoid Module::Build generated and utility files.
+\bBuild$
+\b_build/
+\bBuild.bat$
+\bBuild.COM$
+\bBUILD.COM$
+\bbuild.com$
+
+# Avoid temp and backup files.
+~$
+\.old$
+\#$
+\b\.#
+\.bak$
+\.tmp$
+\.#
+\.rej$
+
+# Avoid OS-specific files/dirs
+# Mac OSX metadata
+\B\.DS_Store
+# Mac OSX SMB mount metadata files
+\B\._
+
+# Avoid Devel::Cover and Devel::CoverX::Covered files.
+\bcover_db\b
+\bcovered\b
+
+# Avoid MYMETA files
+^MYMETA\.
+#!end included /usr/local/perls/perl-5.18.1/lib/5.18.1/ExtUtils/MANIFEST.SKIP
+
+
+\.travis\.yml
+\.releaserc
+\.lwpcookies
+HTML-.*
+
@@ -0,0 +1,56 @@
+{
+ "abstract" : "Extract links from HTML",
+ "author" : [
+ "brian d foy <bdfoy@cpan.org>"
+ ],
+ "dynamic_config" : 1,
+ "generated_by" : "ExtUtils::MakeMaker version 6.86, CPAN::Meta::Converter version 2.120921",
+ "license" : [
+ "perl_5"
+ ],
+ "meta-spec" : {
+ "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
+ "version" : "2"
+ },
+ "name" : "HTML-SimpleLinkExtor",
+ "no_index" : {
+ "directory" : [
+ "t",
+ "inc"
+ ]
+ },
+ "prereqs" : {
+ "build" : {
+ "requires" : {
+ "ExtUtils::MakeMaker" : "0"
+ }
+ },
+ "configure" : {
+ "requires" : {
+ "ExtUtils::MakeMaker" : "6.64"
+ }
+ },
+ "runtime" : {
+ "requires" : {
+ "HTML::LinkExtor" : "1.28",
+ "LWP::UserAgent" : "0",
+ "URI" : "1.09",
+ "URI::file" : "0"
+ }
+ },
+ "test" : {
+ "requires" : {
+ "Test::More" : "0.96",
+ "Test::Output" : "0"
+ }
+ }
+ },
+ "release_status" : "stable",
+ "resources" : {
+ "repository" : {
+ "type" : "git",
+ "web" : "https://github.com/briandfoy/html-simplelinkextor"
+ }
+ },
+ "version" : "1.27"
+}
@@ -1,29 +1,29 @@
---- #YAML:1.0
-name: HTML-SimpleLinkExtor
-version: 1.25
-abstract: Extract links from HTML
+---
+abstract: 'Extract links from HTML'
author:
- - brian d foy <bdfoy@cpan.org>
-license: perl
-distribution_type: module
-configure_requires:
- ExtUtils::MakeMaker: 0
+ - 'brian d foy <bdfoy@cpan.org>'
build_requires:
- ExtUtils::MakeMaker: 0
+ ExtUtils::MakeMaker: 0
+ Test::More: 0.96
+ Test::Output: 0
+configure_requires:
+ ExtUtils::MakeMaker: 6.64
+dynamic_config: 1
+generated_by: 'ExtUtils::MakeMaker version 6.86, CPAN::Meta::Converter version 2.120921'
+license: perl
+meta-spec:
+ url: http://module-build.sourceforge.net/META-spec-v1.4.html
+ version: 1.4
+name: HTML-SimpleLinkExtor
+no_index:
+ directory:
+ - t
+ - inc
requires:
- HTML::LinkExtor: 1.28
- LWP::UserAgent: 0
- Test::More: 0.96
- Test::Output: 0
- URI: 1.09
- URI::file: 0
+ HTML::LinkExtor: 1.28
+ LWP::UserAgent: 0
+ URI: 1.09
+ URI::file: 0
resources:
- repository: https://github.com/briandfoy/html-simplelinkextor
-no_index:
- directory:
- - t
- - inc
-generated_by: ExtUtils::MakeMaker version 6.57_05
-meta-spec:
- url: http://module-build.sourceforge.net/META-spec-v1.4.html
- version: 1.4
+ repository: https://github.com/briandfoy/html-simplelinkextor
+version: 1.27
@@ -1,30 +0,0 @@
----
-abstract: 'Extract links from HTML'
-author:
- - 'brian d foy <bdfoy@cpan.org>'
-build_requires:
- ExtUtils::MakeMaker: 0
-configure_requires:
- ExtUtils::MakeMaker: 0
-distribution_type: module
-dynamic_config: 0
-generated_by: 'ExtUtils::MakeMaker version 6.57_05'
-license: perl
-meta-spec:
- url: http://module-build.sourceforge.net/META-spec-v1.4.html
- version: 1.4
-name: HTML-SimpleLinkExtor
-no_index:
- directory:
- - t
- - inc
-requires:
- HTML::LinkExtor: 1.28
- LWP::UserAgent: 0
- Test::More: 0.96
- Test::Output: 0
- URI: 1.09
- URI::file: 0
-resources:
- repository: https://github.com/briandfoy/html-simplelinkextor
-version: 1.25
@@ -1,38 +1,44 @@
-use ExtUtils::MakeMaker;
+use ExtUtils::MakeMaker 6.64;
require 5.008;
eval "use Test::Manifest 1.21";
-WriteMakefile
- (
- 'NAME' => 'HTML::SimpleLinkExtor',
+WriteMakefile(
+ 'NAME' => 'HTML::SimpleLinkExtor',
'ABSTRACT' => 'Extract links from HTML',
- 'VERSION_FROM' => 'lib/HTML/SimpleLinkExtor.pm',
+ 'VERSION_FROM' => 'lib/HTML/SimpleLinkExtor.pm',
'LICENSE' => 'perl',
'AUTHOR' => 'brian d foy <bdfoy@cpan.org>',
-
- 'PREREQ_PM' => {
- 'HTML::LinkExtor' => '1.28',
- 'LWP::UserAgent' => '0',
+
+ 'CONFIGURE_REQUIRES' => {
+ 'ExtUtils::MakeMaker' => '6.64',
+ },
+
+ 'TEST_REQUIRES' => {
+ 'Test::More' => '0.96', # for subtest, done_testing
'Test::Output' => '0',
+ },
+
+ 'PREREQ_PM' => {
+ 'HTML::LinkExtor' => '1.28',
+ 'LWP::UserAgent' => '0',
'URI' => '1.09',
'URI::file' => '0',
- 'Test::More' => '0.96', # for subtest, done_testing
},
'EXE_FILES' => [ 'scripts/linktractor' ],
-
- 'dist' => {
- 'COMPRESS' => 'gzip -9f',
- 'SUFFIX' => 'gz',
- },
- 'META_MERGE' => {
+ 'META_MERGE' => {
+ 'meta-spec' => { version => 2 },
resources => {
- repository => 'https://github.com/briandfoy/html-simplelinkextor'
+ repository => {
+ type => 'git',
+ url => 'git@github.com:briandfoy/html-simplelinkextor.git',
+ web => 'https://github.com/briandfoy/html-simplelinkextor',
+ },
},
},
- clean => { FILES => q|HTML-SimpleLinkExtor-*| },
+ clean => { FILES => q|HTML-SimpleLinkExtor-*| },
);
@@ -13,7 +13,7 @@ use HTML::LinkExtor;
use LWP::UserAgent;
use URI;
-$VERSION = '1.25';
+$VERSION = '1.27';
@ISA = qw(HTML::LinkExtor);
@@ -36,15 +36,13 @@ $VERSION = '1.25';
sub DESTROY { 1 };
-sub AUTOLOAD
- {
+sub AUTOLOAD {
my $self = shift;
my $method = $AUTOLOAD;
$method =~ s/.*:://;
- unless( exists $AUTO_METHODS{$method} )
- {
+ unless( exists $AUTO_METHODS{$method} ) {
carp __PACKAGE__ . ": method $method unknown";
return;
}
@@ -52,20 +50,17 @@ sub AUTOLOAD
$self->_extract( $method );
}
-sub can
- {
+sub can {
my( $self, @methods ) = @_;
- foreach my $method ( @methods )
- {
+ foreach my $method ( @methods ) {
return 0 unless $self->_can( $method );
}
return 1;
}
-sub _can
- {
+sub _can {
no strict 'refs';
return 1 if exists $AUTO_METHODS{ $_[1] };
@@ -74,23 +69,20 @@ sub _can
return 0;
}
-sub _init_links
- {
+sub _init_links {
my $self = shift;
my $links = shift;
-
do {
delete $self->{'_SimpleLinkExtor_links'};
return
- } unless UNIVERSAL::isa( $links, 'ARRAY' );
+ } unless ref $links eq ref [];
$self->{'_SimpleLinkExtor_links'} = $links;
$self;
}
-sub _link_refs
- {
+sub _link_refs {
my $self = shift;
my @link_refs;
@@ -100,23 +92,21 @@ sub _link_refs
# through this branch. In _init_links I have to use a delete
# which I really don't like. I don't have time to rewrite this
# right now though --brian, 20050816
- if( ref $self->{'_SimpleLinkExtor_links'} )
- {
+ if( ref $self->{'_SimpleLinkExtor_links'} ) {
@link_refs = @{$self->{'_SimpleLinkExtor_links'}};
}
- else
- {
- @link_refs = $self->SUPER::links();
+ else {
+ @link_refs = map {
+ HTML::SimpleLinkExtor::LinkRef->new( $_ )
+ } $self->SUPER::links();
$self->_init_links( \@link_refs );
}
# defined() so that an empty string means "do not resolve"
- unless( defined $self->{'_SimpleLinkExtor_base'} )
- {
+ unless( defined $self->{'_SimpleLinkExtor_base'} ) {
my $count = -1;
my $found = 0;
- foreach my $link ( @link_refs )
- {
+ foreach my $link ( @link_refs ) {
$count++;
next unless $link->[0] eq 'base' and $link->[1] eq 'href';
$found = 1;
@@ -133,32 +123,28 @@ sub _link_refs
return @link_refs;
}
-sub _extract
- {
+sub _extract {
my $self = shift;
- my $method = shift;
+ my $type = shift;
- my $position = $AUTO_METHODS{$method} eq 'tag' ? 0 : 1;
+ my $method = $AUTO_METHODS{$type} eq 'tag' ? 'tag' : 'attribute';
- my @links = map { $$_[2] }
- grep { $_->[$position] eq $method }
+ my @links = map { $_->linkref }
+ grep { $_->$method() eq $type }
$self->_link_refs;
return @links;
}
-sub _add_base
- {
+sub _add_base {
my $self = shift;
my $array_ref = shift;
my $base = $self->{'_SimpleLinkExtor_base'};
return unless $base;
- foreach my $tuple ( @$array_ref )
- {
- foreach my $index ( 1 .. $#$tuple )
- {
+ foreach my $tuple ( @$array_ref ) {
+ foreach my $index ( 1 .. $#$tuple ) {
next unless exists $AUTO_METHODS{ $tuple->[$index] };
my $url = URI->new( $tuple->[$index + 1] );
@@ -213,7 +199,7 @@ not want to deal with the intricacies of C<HTML::Parser> or the
de-referencing needed to get links out of C<HTML::LinkExtor>.
You can extract all the links or some of the links (based on the HTML
-tag name or attribute name). If a E<lt>BASE HREFE<gt> tag is found,
+tag name or attribute name). If a C<< <BASE HREF> >> tag is found,
all of the relative URLs will be resolved according to that reference.
This module is simply a subclass around C<HTML::LinkExtor>, so it can
@@ -233,20 +219,19 @@ reset the link list between files, use the clear_links method.
Create the link extractor object.
=item $extor = HTML::SimpleLinkExtor->new('')
+
=item $extor = HTML::SimpleLinkExtor->new($base)
Create the link extractor object and resolve the relative URLs
accoridng to the supplied base URL. The supplied base URL overrides
any other base URL found in the HTML.
-
Create the link extractor object and do not resolve relative
links.
=cut
-sub new
- {
+sub new {
my $class = shift;
my $base = shift;
@@ -278,8 +263,7 @@ the entire class, including previously created objects.
=cut
-sub add_tags
- {
+sub add_tags {
my $self = shift;
my $tag = lc shift;
@@ -303,8 +287,7 @@ A smarter C<can> that can tell which attributes are also methods.
=cut
-sub add_attributes
- {
+sub add_attributes {
my $self = shift;
my $attr = lc shift;
@@ -319,8 +302,7 @@ created objects.
=cut
-sub remove_tags
- {
+sub remove_tags {
my $self = shift;
my $tag = lc shift;
@@ -335,8 +317,7 @@ class, including previously created objects.
=cut
-sub remove_attributes
- {
+sub remove_attributes {
my $self = shift;
my $attr = lc shift;
@@ -350,8 +331,7 @@ attention to.
=cut
-sub attribute_list
- {
+sub attribute_list {
my $self = shift;
grep { $AUTO_METHODS{ $_ } eq 'attribute' } keys %AUTO_METHODS;
@@ -360,13 +340,13 @@ sub attribute_list
=item HTML::SimpleLinkExtor->tag_list
Returns a list of the tags C<HTML::SimpleLinkExtor> pays attention to.
+These tags have convenience methods.
=back
=cut
-sub tag_list
- {
+sub tag_list {
my $self = shift;
grep { $AUTO_METHODS{ $_ } eq 'tag' } keys %AUTO_METHODS;
@@ -389,8 +369,7 @@ Fetch URL and parse its content for links.
=cut
-sub parse_url
- {
+sub parse_url {
my $data = $_[0]->ua->get( $_[1] )->content;
return unless $data;
@@ -417,9 +396,16 @@ Return a list of the links.
=cut
-sub links
- {
- map { $$_[2] } $_[0]->_link_refs;
+sub links {
+ map { $_->linkref }
+ grep { $_[0]->_is_an_allowed_tag( $_->tag ) }
+ $_[0]->_link_refs
+ }
+
+sub _is_an_allowed_tag {
+ exists $AUTO_METHODS{$_[1]}
+ and
+ $AUTO_METHODS{$_[1]} eq 'tag'
}
=item $extor->img
@@ -492,8 +478,7 @@ the count of the matching links.
=cut
-sub schemes
- {
+sub schemes {
my( $self, @schemes ) = @_;
my %schemes = map { lc, lc } @schemes;
@@ -503,8 +488,8 @@ sub schemes
my $scheme = eval { lc URI->new( $_ )->scheme };
exists $schemes{ $scheme };
}
- map { $_->[2] }
- $self->_link_refs;
+ map { $_->linkref }
+ $self->_link_refs;
wantarray ? @links : scalar @links;
}
@@ -519,8 +504,7 @@ the count of the matching links.
=cut
-sub absolute_links
- {
+sub absolute_links {
my $self = shift;
my @links =
@@ -528,7 +512,7 @@ sub absolute_links
my $scheme = eval { lc URI->new( $_ )->scheme };
length $scheme;
}
- map { $$_[2] }
+ map { $_->linkref }
$self->_link_refs;
wantarray ? @links : scalar @links;
@@ -545,8 +529,7 @@ the count of the matching links.
=cut
-sub relative_links
- {
+sub relative_links {
my $self = shift;
my @links =
@@ -554,7 +537,7 @@ sub relative_links
my $scheme = eval { URI->new( $_ )->scheme };
! defined $scheme;
}
- map { $$_[2] }
+ map { $_->linkref }
$self->_link_refs;
wantarray ? @links : scalar @links;
@@ -585,13 +568,29 @@ brian d foy, C<< <bdfoy@cpan.org> >>
=head1 COPYRIGHT AND LICENSE
-Copyright (c) 2004-2011 brian d foy. All rights reserved.
+Copyright (c) 2004-2014 brian d foy. All rights reserved.
This program is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=cut
+BEGIN {
+package
+ HTML::SimpleLinkExtor::LinkRef;
+use Carp qw(croak);
+
+sub new {
+ my( $class, $arrayref ) = @_;
+ croak "Not an array reference argument!" unless ref $arrayref eq ref [];
+ bless $arrayref, $class;
+ }
+
+sub tag { $_[0]->[0] }
+sub attribute { $_[0]->[1] }
+sub linkref { $_[0]->[2] }
+}
+
1;
__END__
@@ -42,6 +42,15 @@ subtest 'extor' => sub {
$extor->parse_file( $file );
my @links = $extor->links;
+ ok(
+ exists $extor->{'_SimpleLinkExtor_links'},
+ '_SimpleLinkExtor_links exists'
+ );
+
+ is( scalar @{ $extor->{'_SimpleLinkExtor_links'} },
+ $total_links, "Data structure has the links"
+ );
+
is( scalar @links, $total_links, "Found the right number of links" );
};
@@ -0,0 +1,16 @@
+use Test::More 0.95;
+
+my $class = 'HTML::SimpleLinkExtor';
+use_ok( $class );
+
+my $html = '<html><body><form name="test"
+action="/test"><input type="submit" /></form></body></html>';
+
+my $extor = $class->new();
+isa_ok( $extor, $class );
+
+$extor->parse( $html );
+
+print STDERR join "\n", $extor->links;
+
+done_testing();
@@ -7,3 +7,4 @@ schemes.t
relative_links.t
absolute_links.t
autoload.t
+rt87288.t