package Web::Query;
use strict;
use warnings;
use 5.008001;
use parent qw/Exporter/;
our $VERSION = '0.15';
use HTML::TreeBuilder::XPath;
use LWP::UserAgent;
use HTML::Selector::XPath 0.06 qw/selector_to_xpath/;
use Scalar::Util qw/blessed refaddr/;
use HTML::Entities qw/encode_entities/;
our @EXPORT = qw/wq/;
our $RESPONSE;
sub wq { Web::Query->new(@_) }
our $UserAgent = LWP::UserAgent->new();
sub __ua {
$UserAgent ||= LWP::UserAgent->new( agent => __PACKAGE__ . "/" . $VERSION );
$UserAgent;
}
sub new {
my ($class, $stuff, $options) = @_;
my $self = $class->_resolve_new($stuff);
$self->{indent} = $options->{indent} if $options->{indent};
return $self;
}
sub _resolve_new {
my( $class, $stuff ) = @_;
if (blessed $stuff) {
if ($stuff->isa('HTML::Element')) {
return $class->new_from_element([$stuff]);
}
if ($stuff->isa('URI')) {
return $class->new_from_url($stuff->as_string);
}
die "Unknown source type: $stuff";
}
return $class->new_from_element($stuff) if ref $stuff eq 'ARRAY';
return $class->new_from_url($stuff) if $stuff =~ m{^(?:https?|file)://};
return $class->new_from_html($stuff) if $stuff =~ /<.*?>/;
return $class->new_from_file($stuff) if $stuff !~ /\n/ && -f $stuff;
die "Unknown source type: $stuff";
}
sub new_from_url {
my ($class, $url) = @_;
$RESPONSE = __ua()->get($url);
if ($RESPONSE->is_success) {
return $class->new_from_html($RESPONSE->decoded_content);
} else {
return undef;
}
}
sub new_from_file {
my ($class, $fname) = @_;
my $tree = HTML::TreeBuilder::XPath->new_from_file($fname);
$tree->ignore_unknown(0);
$tree->store_comments(1);
my $self = $class->new_from_element([$tree->elementify]);
$self->{need_delete}++;
return $self;
}
sub new_from_html {
my ($class, $html) = @_;
my $tree = HTML::TreeBuilder::XPath->new();
$tree->ignore_unknown(0);
$tree->store_comments(1);
$tree->parse_content($html);
my $self = $class->new_from_element([$tree->guts]);
$self->{need_delete}++;
return $self;
}
sub new_from_element {
my $class = shift;
my $trees = ref $_[0] eq 'ARRAY' ? $_[0] : +[$_[0]];
return bless { trees => $trees, before => $_[1] }, $class;
}
sub end {
my $self = shift;
return $self->{before};
}
sub size {
my $self = shift;
return scalar(@{$self->{trees}});
}
sub parent {
my $self = shift;
my @new;
for my $tree (@{$self->{trees}}) {
push @new, $tree->getParentNode();
}
return (ref $self || $self)->new_from_element(\@new, $self);
}
sub first {
my $self = shift;
return (ref $self || $self)->new_from_element([$self->{trees}[0] || ()], $self);
}
sub last {
my $self = shift;
return (ref $self || $self)->new_from_element([$self->{trees}[-1] || ()], $self);
}
sub find {
my ($self, $selector) = @_;
my $xpath_rootless = selector_to_xpath($selector);
my @new;
for my $tree (@{$self->{trees}}) {
push @new, $tree if defined $tree->parent && $tree->matches($xpath_rootless);
push @new, $tree->findnodes(selector_to_xpath($selector, root => defined $tree->parent ? './' : '/'));
}
return (ref $self || $self)->new_from_element(\@new, $self);
}
sub as_html {
my $self = shift;
my @html = map { $_->as_HTML( q{&<>'"}, $self->{indent}, {} ) }
@{$self->{trees}};
return wantarray ? @html : $html[0];
}
sub html {
my $self = shift;
my $builder = HTML::TreeBuilder->new;
$builder->store_comments(1);
if (@_) {
map {
$_->delete_content;
my $tree = HTML::TreeBuilder->new;
$tree->ignore_unknown(0);
$tree->store_comments(1);
$tree->parse_content($_[0]);
$_->push_content($tree->guts);
} @{$self->{trees}};
return $self;
}
my @html;
for my $t ( @{$self->{trees}} ) {
push @html, join '', map {
ref $_ ? $_->as_HTML( q{&<>'"}, $self->{indent}, {})
: encode_entities($_)
} $t->content_list;
}
return wantarray ? @html : $html[0];
}
sub text {
my $self = shift;
if (@_) {
map { $_->delete_content; $_->push_content($_[0]) } @{$self->{trees}};
return $self;
} else {
my @html = map { $_->as_text } @{$self->{trees}};
return wantarray ? @html : $html[0];
}
}
sub attr {
my $self = shift;
my @retval = map { $_->attr(@_) } @{$self->{trees}};
return wantarray ? @retval : $retval[0];
}
sub each {
my ($self, $code) = @_;
my $i = 0;
for my $tree (@{$self->{trees}}) {
local $_ = (ref $self || $self)->new_from_element([$tree], $self);
$code->($i++, $_);
}
return $self;
}
sub map {
my ($self, $code) = @_;
my $i = 0;
return +[map {
my $tree = $_;
local $_ = (ref $self || $self)->new($tree);
$code->($i++, $_);
} @{$self->{trees}}];
}
sub filter {
my $self = shift;
if (ref($_[0]) eq 'CODE') {
my $code = $_[0];
my $i = 0;
$self->{trees} = +[grep {
my $tree = $_;
local $_ = (ref $self || $self)->new($tree);
$code->($i++, $_);
} @{$self->{trees}}];
return $self;
} else {
return $self->find($_[0])
}
}
sub remove {
my $self = shift;
my $before = $self->end;
while (defined $before) {
@{$before->{trees}} = grep {
my $el = $_;
not grep { refaddr($el) == refaddr($_) } @{$self->{trees}};
} @{$before->{trees}};
$before = $before->end;
}
$_->delete for @{$self->{trees}};
@{$self->{trees}} = ();
$self;
}
sub replace_with {
my ( $self, $replacement ) = @_;
my $i = 0;
for my $node ( @{ $self->{trees} } ) {
my $rep = $replacement;
if ( ref $rep eq 'CODE' ) {
local $_ = (ref $self || $self)->new($node);
$rep = $rep->( $i++ => $_ );
}
$rep = (ref $self || $self)->new_from_html( $rep )
unless ref $rep;
my $r = $rep->{trees}->[0]->clone;
$r->parent( $node->parent ) if $node->parent;
$node->replace_with( $r );
}
$replacement->remove if ref $replacement eq (ref $self || $self);
return $self;
}
sub append {
my ($self, $stuff) = @_;
$stuff = (ref $self || $self)->new($stuff);
foreach my $t (@{$self->{trees}}) {
$t->push_content($_) for ref($t)->clone_list(@{$stuff->{trees}});
}
$self;
}
sub prepend {
my ($self, $stuff) = @_;
$stuff = (ref $self || $self)->new($stuff);
foreach my $t (@{$self->{trees}}) {
$t->unshift_content($_) for ref($t)->clone_list(@{$stuff->{trees}});
}
$self;
}
sub before {
my ($self, $stuff) = @_;
$stuff = (ref $self || $self)->new($stuff);
foreach my $t (@{$self->{trees}}) {
$t->preinsert(ref($t)->clone_list(@{$stuff->{trees}}));
}
$self;
}
sub after {
my ($self, $stuff) = @_;
$stuff = (ref $self || $self)->new($stuff);
foreach my $t (@{$self->{trees}}) {
$t->postinsert(ref($t)->clone_list(@{$stuff->{trees}}));
}
$self;
}
sub insert_before {
my ($self, $target) = @_;
foreach my $t (@{$target->{trees}}) {
$t->preinsert(ref($t)->clone_list(@{$self->{trees}}));
}
$self;
}
sub insert_after {
my ($self, $target) = @_;
foreach my $t (@{$target->{trees}}) {
$t->postinsert(ref($t)->clone_list(@{$self->{trees}}));
}
$self;
}
sub detach {
my ($self) = @_;
$_->detach for @{$self->{trees}};
$self;
}
sub add_class {
my ($self, $class) = @_;
for (my $i = 0; $i < @{$self->{trees}}; $i++) {
my $t = $self->{trees}->[$i];
my $current_class = $t->attr('class');
my $classes = ref $class eq 'CODE' ? $class->($i, $current_class, $t) : $class;
my @classes = split /\s+/, $classes;
foreach (@classes) {
$current_class .= " $_" unless $current_class =~ /(?:^|\s)$_(?:\s|$)/;
}
$current_class =~ s/(?:^\s*|\s*$)//g;
$current_class =~ s/\s\s+/ /g;
$t->attr('class', $current_class);
}
$self;
}
sub remove_class {
my ($self, $class) = @_;
for (my $i = 0; $i < @{$self->{trees}}; $i++) {
my $t = $self->{trees}->[$i];
my $current_class = $t->attr('class');
next unless defined $current_class;
my $classes = ref $class eq 'CODE' ? $class->($i, $current_class, $t) : $class;
my @remove_classes = split /\s+/, $classes;
my @final = grep {
my $existing_class = $_;
not grep { $existing_class eq $_} @remove_classes;
} split /\s+/, $current_class;
$t->attr('class', join ' ', @final);
}
$self;
}
sub has_class {
my ($self, $class) = @_;
foreach my $t (@{$self->{trees}}) {
return 1 if $t->attr('class') =~ /(?:^|\s)$class(?:\s|$)/;
}
return;
}
sub clone {
my ($self) = @_;
my @clones = map { $_->clone } @{$self->{trees}};
return (ref $self || $self)->new_from_element(\@clones);
}
sub DESTROY {
if ($_[0]->{need_delete}) {
$_->delete for @{$_[0]->{trees}}; # avoid memory leaks
}
}
1;
__END__
=encoding utf8
=head1 NAME
Web::Query - Yet another scraping library like jQuery
=head1 SYNOPSIS
use Web::Query;
wq('http://google.com/search?q=foobar')
->find('h2')
->each(sub {
my $i = shift;
printf("%d) %s\n", $i+1, $_->text
});
=head1 DESCRIPTION
Web::Query is a yet another scraping framework, have a jQuery like interface.
Yes, I know ingy's L<pQuery>. But it's just a alpha quality. It doesn't works.
Web::Query built at top of the CPAN modules, L<HTML::TreeBuilder::XPath>, L<LWP::UserAgent>, and L<HTML::Selector::XPath>.
So, this module uses L<HTML::Selector::XPath> and only supports the CSS 3
selector supported by that module.
Web::Query doesn't support jQuery's extended queries(yet?).
B<THIS LIBRARY IS UNDER DEVELOPMENT. ANY API MAY CHANGE WITHOUT NOTICE>.
=head1 FUNCTIONS
=over 4
=item wq($stuff)
This is a shortcut for C<< Web::Query->new($stuff) >>. This function is exported by default.
=back
=head1 METHODS
=over 4
=item my $q = Web::Query->new($stuff, \%options )
Create new instance of Web::Query. You can make the instance from URL(http, https, file scheme), HTML in string, URL in string, L<URI> object, and instance of L<HTML::Element>.
This method throw the exception on unknown $stuff.
This method returns undefined value on non-successful response with URL.
Currently, the only option valid option is I<indent>, which will be used as
the indentation string if the object is printed.
=item my $q = Web::Query->new_from_element($element: HTML::Element)
Create new instance of Web::Query from instance of L<HTML::Element>.
=item my $q = Web::Query->new_from_html($html: Str)
Create new instance of Web::Query from HTML.
=item my $q = Web::Query->new_from_url($url: Str)
Create new instance of Web::Query from URL.
If the response is not success(It means /^20[0-9]$/), this method returns undefined value.
You can get a last result of response, use the C<< $Web::Query::RESPONSE >>.
Here is a best practical code:
my $url = 'http://example.com/';
my $q = Web::Query->new_from_url($url)
or die "Cannot get a resource from $url: " . Web::Query->last_response()->status_line;
=item my $q = Web::Query->new_from_file($file_name: Str)
Create new instance of Web::Query from file name.
=item my @html = $q->html();
=item my $html = $q->html();
=item $q->html('<p>foo</p>');
Get/Set the innerHTML.
=item $q->as_html();
Return the elements associated with the object as strings.
If called in a scalar context, only return the string representation
of the first element.
=item my @text = $q->text();
=item my $text = $q->text();
=item $q->text('text');
Get/Set the inner text.
=item my $attr = $q->attr($name);
=item $q->attr($name, $val);
Get/Set the attribute value in element.
=item $q = $q->find($selector)
This method find nodes by $selector from $q. $selector is a CSS3 selector.
=item $q->each(sub { my ($i, $elem) = @_; ... })
Visit each nodes. C<< $i >> is a counter value, 0 origin. C<< $elem >> is iteration item.
C<< $_ >> is localized by C<< $elem >>.
=item $q->map(sub { my ($i, $elem) = @_; ... })
Creates a new array with the results of calling a provided function on every element.
=item $q->filter(sub { my ($i, $elem) = @_; ... })
Reduce the elements to those that pass the function's test.
=item $q->end()
Back to the before context like jQuery.
=item my $size = $q->size() : Int
Return the number of DOM elements matched by the Web::Query object.
=item my $parent = $q->parent() : Web::Query
Return the parent node from C<< $q >>.
=item my $first = $q->first()
Return the first matching element.
This method constructs a new Web::Query object from the first matching element.
=item my $last = $q->last()
Return the last matching element.
This method constructs a new Web::Query object from the last matching element.
=item $q->remove()
Delete the elements associated with the object from the DOM.
# remove all <blink> tags from the document
$q->find('blink')->remove;
=item $q->replace_with( $replacement );
Replace the elements of the object with the provided replacement.
The replacement can be a string, a C<Web::Query> object or an
anonymous function. The anonymous function is passed the index of the current
node and the node itself (with is also localized as C<$_>).
my $q = wq( '<p><b>Abra</b><i>cada</i><u>bra</u></p>' );
$q->find('b')->replace_with('<a>Ocus</a>);
# <p><a>Ocus</a><i>cada</i><u>bra</u></p>
$q->find('u')->replace_with($q->find('b'));
# <p><i>cada</i><b>Abra</b></p>
$q->find('i')->replace_with(sub{
my $name = $_->text;
return "<$name></$name>";
});
# <p><b>Abra</b><cada></cada><u>bra</u></p>
=back
=head1 HOW DO I CUSTOMIZE USER AGENT?
You can specify your own instance of L<LWP::UserAgent>.
$Web::Query::UserAgent = LWP::UserAgent->new( agent => 'Mozilla/5.0' );
=head1 INCOMPATIBLE CHANGES
=over 4
=item 0.10
new_from_url() is no longer throws exception on bad response from HTTP server.
=back
=head1 AUTHOR
Tokuhiro Matsuno E<lt>tokuhirom AAJKLFJEF@ GMAIL COME<gt>
=head1 SEE ALSO
L<pQuery>
=head1 LICENSE
Copyright (C) Tokuhiro Matsuno
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=cut