The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Web::Query;
use strict;
use warnings;
use 5.008001;
use parent qw/Exporter/;
our $VERSION = '0.27';
use HTML::TreeBuilder::XPath;
use LWP::UserAgent;
use HTML::Selector::XPath 0.06 qw/selector_to_xpath/;
use Scalar::Util qw/blessed refaddr/;
use HTML::Entities qw/encode_entities/;

our @EXPORT = qw/wq/;

our $RESPONSE;

sub wq { Web::Query->new(@_) }

our $UserAgent = LWP::UserAgent->new();

sub __ua {
    $UserAgent ||= LWP::UserAgent->new( agent => __PACKAGE__ . "/" . $VERSION );
    $UserAgent;
}

sub _build_tree {
    my( $self, $options ) = @_;

    my $no_space_compacting = ref $self ? $self->{no_space_compacting} 
    : ref $options eq 'HASH' ? $options->{no_space_compacting} : 0;

    my $tree = HTML::TreeBuilder::XPath->new( 
        no_space_compacting => $no_space_compacting
    );
    $tree->ignore_unknown(0);
    $tree->store_comments(1);
    $tree;
}

sub new {
    my ($class, $stuff, $options) = @_;

    my $self = $class->_resolve_new($stuff,$options)
        or return undef;

    $self->{indent} = $options->{indent} if $options->{indent};

    $self->{no_space_compacting} = $options->{no_space_compacting};

    return $self;
}

sub _resolve_new {
    my( $class, $stuff, $options) = @_;

    if (blessed $stuff) {
        if ($stuff->isa('HTML::Element')) {
            return $class->new_from_element([$stuff],$options);
        } 
        
        if ($stuff->isa('URI')) {
            return $class->new_from_url($stuff->as_string,$options);
        } 
        
        if ($stuff->isa($class)) {
            return $class->new_from_element($stuff->{trees}, $options);
        } 

        die "Unknown source type: $stuff";
    }

    return $class->new_from_element($stuff,$options) if ref $stuff eq 'ARRAY';

    return $class->new_from_url($stuff,$options) if $stuff =~ m{^(?:https?|file)://};

    return $class->new_from_html($stuff,$options) if $stuff =~ /<.*?>/;

    return $class->new_from_file($stuff,$options) if $stuff !~ /\n/ && -f $stuff;

    die "Unknown source type: $stuff";
}

sub new_from_url {
    my ($class, $url,$options) = @_;
    $RESPONSE = __ua()->get($url);
    if ($RESPONSE->is_success) {
        return $class->new_from_html($RESPONSE->decoded_content,$options);
    } else {
        return undef;
    }
}

sub new_from_file {
    my ($class, $fname, $options) = @_;
    my $tree = $class->_build_tree($options);
    $tree->parse_file($fname);
    my $self = $class->new_from_element([$tree->disembowel],$options);
    $self->{need_delete}++;
    return $self;
}

sub new_from_html {
    my ($class, $html,$options) = @_;
    my $tree = $class->_build_tree($options);
    $tree->parse_content($html);
    my $self = $class->new_from_element([$tree->disembowel],$options);
    $self->{need_delete}++;
    return $self;
}

sub new_from_element {
    my $class = shift;
    my $trees = ref $_[0] eq 'ARRAY' ? $_[0] : [$_[0]];
    return bless { trees => [ @$trees ], before => $_[1] }, $class;
}

sub end {
    my $self = shift;
    return $self->{before};
}

sub size {
    my $self = shift;
    return scalar(@{$self->{trees}});
}

sub parent {
    my $self = shift;
    my @new;
    for my $tree (@{$self->{trees}}) {
        push @new, $tree->parent();
    }
    return (ref $self || $self)->new_from_element(\@new, $self);
}

sub first {
    my $self = shift;
    return $self->eq(0);
}

sub last {
    my $self = shift;
    return $self->eq(-1);
}

sub get {
    my ($self, $index) = @_;
    return $self->{trees}[$index];
}

sub eq {
    my ($self, $index) = @_;
    return (ref $self || $self)->new_from_element([$self->{trees}[$index] || ()], $self);
}

sub find {
    my ($self, $selector) = @_;
    
    my $xpath = ref $selector ? $$selector : selector_to_xpath($selector, root => './');
    my @new = map { $_->findnodes($xpath) } @{$self->{trees}};
    
    return (ref $self || $self)->new_from_element(\@new, $self);
}

sub contents {
    my ($self, $selector) = @_;
    
    my @new = map { $_->content_list } @{$self->{trees}};
    
    if ($selector) {
        my $xpath = ref $selector ? $$selector : selector_to_xpath($selector);
        @new = grep { $_->matches($xpath) } @new;        
    }
    
    return (ref $self || $self)->new_from_element(\@new, $self);    
}

sub as_html {
    my $self = shift;

    my @html = map { 
        ref $_ ? $_->as_HTML( q{&<>'"}, $self->{indent}, {} ) 
               : $_ }
        @{$self->{trees}};

    return wantarray ? @html : $html[0];
}

sub html {
    my $self = shift;
    
    if (@_) {
        map { 
            $_->delete_content; 
            my $tree = $self->_build_tree;
            $tree->parse_content($_[0]);
            $_->push_content($tree->disembowel);
        } @{$self->{trees}};
        return $self;
    } 

    my @html;
    for my $t ( @{$self->{trees}} ) {
        push @html, join '', map { 
            ref $_ ? $_->as_HTML( q{&<>'"}, $self->{indent}, {}) 
                   : encode_entities($_)
        } $t->content_list;
    }
    
    return wantarray ? @html : $html[0];
}

sub text {
    my $self = shift;
    if (@_) {
        map { $_->delete_content; $_->push_content($_[0]) } @{$self->{trees}};
        return $self;
    } else {
        my @html = map { $_->as_text } @{$self->{trees}};
        return wantarray ? @html : $html[0];
    }
}

sub attr {
    my $self = shift;
    my @retval = map { $_->attr(@_) } @{$self->{trees}};
    return wantarray ? @retval : $retval[0];
}

sub tagname {
    my $self = shift;
    my @retval = map { $_->tag(@_) } @{$self->{trees}};
    return wantarray ? @retval : $retval[0];
}

sub each {
    my ($self, $code) = @_;
    my $i = 0;
    for my $tree (@{$self->{trees}}) {
        local $_ = (ref $self || $self)->new_from_element([$tree], $self);
        $code->($i++, $_);
    }
    return $self;
}

sub map {
    my ($self, $code) = @_;
    my $i = 0; 
    return +[map {
        my $tree = $_;
        local $_ = (ref $self || $self)->new($tree);
        $code->($i++, $_);
    } @{$self->{trees}}];
}   

sub filter {
    my $self = shift;

    if (ref($_[0]) eq 'CODE') {
        my $code = $_[0];
        my $i = 0; 
        $self->{trees} = +[grep {
            my $tree = $_;
            local $_ = (ref $self || $self)->new($tree);
            $code->($i++, $_);
        } @{$self->{trees}}];
        return $self;

    } else {
        my $xpath = ref $_[0] ? ${$_[0]} : selector_to_xpath($_[0]);
        my @new = grep { $_->matches($xpath) } @{$self->{trees}};        
        return (ref $self || $self)->new_from_element(\@new, $self);
    }
}

sub remove {
    my $self = shift;
    my $before = $self->end;
    
    while (defined $before) {
        @{$before->{trees}} = grep {
            my $el = $_;            
            not grep { refaddr($el) == refaddr($_) } @{$self->{trees}};            
        } @{$before->{trees}};

        $before = $before->end;
    }
    
    $_->delete for @{$self->{trees}};
    @{$self->{trees}} = ();
    
    $self;
}

sub replace_with {
    my ( $self, $replacement ) = @_;

    my $i = 0;
    for my $node ( @{ $self->{trees} } ) {
        my $rep = $replacement;

        if ( ref $rep eq 'CODE' ) {
            local $_ = (ref $self || $self)->new($node);
            $rep = $rep->( $i++ => $_ ); 
        }

        $rep = (ref $self || $self)->new_from_html( $rep )
            unless ref $rep;


            
        my $r = $rep->{trees}->[0];
        $r = $r->clone if ref $r;
        $r->parent( $node->parent ) if ref $r and $node->parent;

        $node->replace_with( $r );
    }

    $replacement->remove if ref $replacement eq (ref $self || $self);

    return $self;
}

sub append {
    my ($self, $stuff) = @_;
    $stuff = (ref $self || $self)->new($stuff);
    
    foreach my $t (@{$self->{trees}}) {
        $t->push_content($_) for ref($t)->clone_list(@{$stuff->{trees}});
    }
    
    $self;    
}

sub prepend {
    my ($self, $stuff) = @_;
    $stuff = (ref $self || $self)->new($stuff);
    
    foreach my $t (@{$self->{trees}}) {
        $t->unshift_content($_) for ref($t)->clone_list(@{$stuff->{trees}});
    }
    
    $self;    
}


sub before {
    my ($self, $stuff) = @_;
    $stuff = (ref $self || $self)->new($stuff);
        
    foreach my $t (@{$self->{trees}}) {
        $t->preinsert(ref($t)->clone_list(@{$stuff->{trees}}));
    }
    
    $self;    
}


sub after {
    my ($self, $stuff) = @_;
    $stuff = (ref $self || $self)->new($stuff);
        
    foreach my $t (@{$self->{trees}}) {
        $t->postinsert(ref($t)->clone_list(@{$stuff->{trees}}));
    }
    
    $self;    
}


sub insert_before {
    my ($self, $target) = @_;
        
    foreach my $t (@{$target->{trees}}) {
        $t->preinsert(ref($t)->clone_list(@{$self->{trees}}));
    }
    
    $self;    
}

sub insert_after {
    my ($self, $target) = @_;
        
    foreach my $t (@{$target->{trees}}) {
        $t->postinsert(ref($t)->clone_list(@{$self->{trees}}));
    }
    
    $self;    
}

sub detach {
    my ($self) = @_;
    $_->detach for @{$self->{trees}};
    $self;    
}

sub add_class {
    my ($self, $class) = @_;    
            
    for (my $i = 0; $i < @{$self->{trees}}; $i++) {
        my $t = $self->{trees}->[$i];        
        my $current_class = $t->attr('class') || '';
        
        my $classes = ref $class eq 'CODE' ? $class->($i, $current_class, $t) : $class;
        my @classes = split /\s+/, $classes;
        
        foreach (@classes) {            
            $current_class .= " $_" unless $current_class =~ /(?:^|\s)$_(?:\s|$)/;     
        }
                        
        $current_class =~ s/(?:^\s*|\s*$)//g;
        $current_class =~ s/\s\s+/ /g;
        
        $t->attr('class', $current_class);
    }
    
    $self;    
}


sub remove_class {
    my ($self, $class) = @_;

    for (my $i = 0; $i < @{$self->{trees}}; $i++) {
        my $t = $self->{trees}->[$i];        
        my $current_class = $t->attr('class');
        next unless defined $current_class;        
        
        my $classes = ref $class eq 'CODE' ? $class->($i, $current_class, $t) : $class;
        my @remove_classes = split /\s+/, $classes;
        my @final = grep {
            my $existing_class = $_;     
            not grep { $existing_class eq $_} @remove_classes;
        } split /\s+/, $current_class;
        
        $t->attr('class', join ' ', @final);
    }
    
    $self; 
    
}


sub has_class {
    my ($self, $class) = @_;
    
    foreach my $t (@{$self->{trees}}) {
        return 1 if $t->attr('class') =~ /(?:^|\s)$class(?:\s|$)/;
    }
    
    return;   
}

sub clone {
    my ($self) = @_;
    my @clones = map { $_->clone } @{$self->{trees}};
    return (ref $self || $self)->new_from_element(\@clones);
}

sub add {
    my ($self, @stuff) = @_;
    my @nodes;
    
    # add(selector, context)
    if (@stuff == 2 && !ref $stuff[0] && $stuff[1]->isa('HTML::Element')) {
        my $xpath = ref $stuff[0] ? ${$stuff[0]} : selector_to_xpath($stuff[0]);
        push @nodes, $stuff[1]->findnodes( $xpath, root => './');
    }
    else {
        # handle any combination of html string, element object and web::query object
        push @nodes, map { 
            $self->{need_delete} = 1 if $_->{need_delete};
            delete $_->{need_delete};
            @{$_->{trees}}; 
        } map { (ref $self || $self)->new($_) } @stuff;                
    }
    
    push @{$self->{trees}}, @nodes;
    $self;
}

sub prev {
    my $self = shift;
    my @new;
    for my $tree (@{$self->{trees}}) {
        push @new, $tree->getPreviousSibling;
    }
    return (ref $self || $self)->new_from_element(\@new, $self);
}

sub next {
    my $self = shift;
    my @new;
    for my $tree (@{$self->{trees}}) {
        push @new, $tree->getNextSibling;
    }
    return (ref $self || $self)->new_from_element(\@new, $self);
}

sub last_response {
    my ($class) = @_;
    return $RESPONSE;
}

sub DESTROY {
    return unless $_[0]->{need_delete};

    # avoid memory leaks
    $_->delete for grep { ref $_ } @{$_[0]->{trees}};
}

1;
__END__

=encoding utf8

=for stopwords prev

=head1 NAME

Web::Query - Yet another scraping library like jQuery

=head1 SYNOPSIS

    use Web::Query;

    wq('http://www.w3.org/TR/html401/')
        ->find('div.head dt')
        ->each(sub {
            my $i = shift;
            printf("%d %s\n", $i+1, $_->text);
        });

=head1 DESCRIPTION

Web::Query is a yet another scraping framework, have a jQuery like interface.

Yes, I know Ingy's L<pQuery>. But it's just a alpha quality. It doesn't works.
Web::Query built at top of the CPAN modules, L<HTML::TreeBuilder::XPath>, L<LWP::UserAgent>, and L<HTML::Selector::XPath>.

So, this module uses L<HTML::Selector::XPath> and only supports the CSS 3
selector supported by that module.
Web::Query doesn't support jQuery's extended queries(yet?). If a selector is 
passed as a scalar ref, it'll be taken as a straight XPath expression.

    $wq( '<div><p>hello</p><p>there</p></div>' )->find( 'p' );       # css selector
    $wq( '<div><p>hello</p><p>there</p></div>' )->find( \'/div/p' ); # xpath selector


B<THIS LIBRARY IS UNDER DEVELOPMENT. ANY API MAY CHANGE WITHOUT NOTICE>.

=head1 FUNCTIONS

=over 4

=item C<< wq($stuff) >>

This is a shortcut for C<< Web::Query->new($stuff) >>. This function is exported by default.

=back

=head1 METHODS

=head2 CONSTRUCTORS

=over 4

=item my $q = Web::Query->new($stuff, \%options )

Create new instance of Web::Query. You can make the instance from URL(http, https, file scheme), HTML in string, URL in string, L<URI> object, and instance of L<HTML::Element>.

This method throw the exception on unknown $stuff.

This method returns undefined value on non-successful response with URL.

Currently, the only two valid options are I<indent>, which will be used as
the indentation string if the object is printed, and I<no_space_compacting>, 
which will prevent the compaction of whitespace characters in text blocks.

=item my $q = Web::Query->new_from_element($element: HTML::Element)

Create new instance of Web::Query from instance of L<HTML::Element>.

=item C<< my $q = Web::Query->new_from_html($html: Str) >>

Create new instance of Web::Query from HTML.

=item my $q = Web::Query->new_from_url($url: Str)

Create new instance of Web::Query from URL.

If the response is not success(It means /^20[0-9]$/), this method returns undefined value.

You can get a last result of response, use the C<< $Web::Query::RESPONSE >>.

Here is a best practical code:

    my $url = 'http://example.com/';
    my $q = Web::Query->new_from_url($url)
        or die "Cannot get a resource from $url: " . Web::Query->last_response()->status_line;

=item my $q = Web::Query->new_from_file($file_name: Str)

Create new instance of Web::Query from file name.

=back

=head2 TRAVERSING

=head3 add

Add elements to the set of matched elements.

=over 4

=item add($html)

An HTML fragment to add to the set of matched elements.

=item add(@elements)

One or more @elements to add to the set of matched elements.

=item add($wq)

An existing Web::Query object to add to the set of matched elements.

=item add($selector, $context)

$selector is a string representing a selector expression to find additional elements to add to the set of matched elements.

$context is the point in the document at which the selector should begin matching

=back

=head3 contents

Get the immediate children of each element in the set of matched elements, including text and comment nodes.

=head3 each

Visit each nodes. C<< $i >> is a counter value, 0 origin. C<< $elem >> is iteration item.
C<< $_ >> is localized by C<< $elem >>.

    $q->each(sub { my ($i, $elem) = @_; ... })

=head3 end

Back to the before context like jQuery.

=head3 filter

Reduce the elements to those that pass the function's test.

    $q->filter(sub { my ($i, $elem) = @_; ... })

=head3 find

Get the descendants of each element in the current set of matched elements, filtered by a selector.

    my $q2 = $q->find($selector); # $selector is a CSS3 selector.
    
B<NOTE> If you want to match the element itself, use L</filter>.

B<INCOMPATIBLE CHANGE> 
From v0.14 to v0.19 (inclusive) find() also matched the element itself, which is not jQuery compatible.
You can achieve that result using C<filter()>, C<add()> and C<find()>:

    my $wq = wq('<div class="foo"><p class="foo">bar</p></div>'); # needed because we don't have a global document like jQuery does
    print $wq->filter('.foo')->add($wq->find('.foo'))->as_html; # <div class="foo"><p class="foo">bar</p></div><p class="foo">bar</p>

=head3 first

Return the first matching element.

This method constructs a new Web::Query object from the first matching element.

=head3 last

Return the last matching element.

This method constructs a new Web::Query object from the last matching element.

=head3 map

Creates a new array with the results of calling a provided function on every element.

    $q->map(sub { my ($i, $elem) = @_; ... })

=head3 parent

Get the parent of each element in the current set of matched elements.

=head3 prev

Get the previous node of each element in the current set of matched elements.

    my $prev = $q->prev;

=head3 next

Get the next node of each element in the current set of matched elements.

   my $next = $q->next;

=head2 MANIPULATION

=head3 add_class

Adds the specified class(es) to each of the set of matched elements.

    # add class 'foo' to <p> elements
    wq('<div><p>foo</p><p>bar</p></div>')->find('p')->add_class('foo'); 

=head3 after

Insert content, specified by the parameter, after each element in the set of matched elements.

    wq('<div><p>foo</p></div>')->find('p')
                               ->after('<b>bar</b>')
                               ->end
                               ->as_html; # <div><p>foo</p><b>bar</b></div>
    
The content can be anything accepted by L</new>.

=head3 append

Insert content, specified by the parameter, to the end of each element in the set of matched elements.

    wq('<div></div>')->append('<p>foo</p>')->as_html; # <div><p>foo</p></div>
    
The content can be anything accepted by L</new>.

=head3 as_html

Return the elements associated with the object as strings. 
If called in a scalar context, only return the string representation
of the first element.

=head3 C< attr >

Get/Set the attribute value in element.

    my $attr = $q->attr($name);

    $q->attr($name, $val);

=head3 tagname

Get/Set the tag name of elements.

    my $name = $q->tagname;

    $q->tagname($new_name);

=head3 before

Insert content, specified by the parameter, before each element in the set of matched elements.

    wq('<div><p>foo</p></div>')->find('p')
                               ->before('<b>bar</b>')
                               ->end
                               ->as_html; # <div><b>bar</b><p>foo</p></div>
    
The content can be anything accepted by L</new>.

=head3 clone

Create a deep copy of the set of matched elements.

=head3 detach

Remove the set of matched elements from the DOM.

=head3 has_class

Determine whether any of the matched elements are assigned the given class.

=head3 C< html >

Get/Set the innerHTML.

    my @html = $q->html();

    my $html = $q->html(); # 1st matching element only

    $q->html('<p>foo</p>');

=head3 insert_before

Insert every element in the set of matched elements before the target.

=head3 insert_after

Insert every element in the set of matched elements after the target.

=head3 C< prepend >

Insert content, specified by the parameter, to the beginning of each element in the set of matched elements. 

=head3 remove

Delete the elements associated with the object from the DOM.

    # remove all <blink> tags from the document
    $q->find('blink')->remove;

=head3 remove_class

Remove a single class, multiple classes, or all classes from each element in the set of matched elements.

=head3 replace_with

Replace the elements of the object with the provided replacement. 
The replacement can be a string, a C<Web::Query> object or an 
anonymous function. The anonymous function is passed the index of the current 
node and the node itself (with is also localized as C<$_>).

    my $q = wq( '<p><b>Abra</b><i>cada</i><u>bra</u></p>' );

    $q->find('b')->replace_with('<a>Ocus</a>);
        # <p><a>Ocus</a><i>cada</i><u>bra</u></p>

    $q->find('u')->replace_with($q->find('b'));
        # <p><i>cada</i><b>Abra</b></p>

    $q->find('i')->replace_with(sub{ 
        my $name = $_->text;
        return "<$name></$name>";
    });
        # <p><b>Abra</b><cada></cada><u>bra</u></p>

=head3 size

Return the number of elements in the Web::Query object.

    wq('<div><p>foo</p><p>bar</p></div>')->find('p')->size; # 2

=head3 text

Get/Set the text.

    my @text = $q->text();

    my $text = $q->text(); # 1st matching element only

    $q->text('text');
    
If called in a scalar context, only return the string representation
of the first element

=head2 OTHERS

=over 4

=item Web::Query->last_response()

Returns last HTTP response status that generated by C<new_from_url()>.

=back

=head1 HOW DO I CUSTOMIZE USER AGENT?

You can specify your own instance of L<LWP::UserAgent>.

    $Web::Query::UserAgent = LWP::UserAgent->new( agent => 'Mozilla/5.0' );

=head1 INCOMPATIBLE CHANGES

=over 4

=item 0.10

new_from_url() is no longer throws exception on bad response from HTTP server.

=back

=head1 AUTHOR

Tokuhiro Matsuno E<lt>tokuhirom AAJKLFJEF@ GMAIL COME<gt>

=head1 SEE ALSO

L<pQuery>

=head1 LICENSE

Copyright (C) Tokuhiro Matsuno

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut