lib/HTML/Grabber.pm - metacpan.org

package HTML::Grabber;
{
  $HTML::Grabber::VERSION = '0.7';
}

# ABSTRACT: jQuery style DOM traversal/manipulation

use strict;
use warnings;

use Moose;
use HTML::Selector::XPath qw(selector_to_xpath);
use XML::LibXML qw(:libxml);

my $parser = XML::LibXML->new;
$parser->recover(1);

has nodes => (
    traits   => ['Array'],
    isa      => 'ArrayRef[XML::LibXML::Node]',
    writer   => '_nodes',
    required => 1,
    default  => sub { [] },
    handles  => {
        nodes       => 'elements',
        length      => 'count',
    },
);

=head1 NAME

HTML::Grabber

=head1 SYNOPSIS

    use HTML::Grabber;
    use LWP::Simple;

    my $dom = HTML::Grabber->new( html => get('http://twitter.com/ned0r') );

    $dom->find('.tweet-content')->each(sub {
        my $body = $_->find('.tweet-text')->text;
        my $when = $_->find('.js-tweet-timestamp')->attr('data-time');
        my $link = $_->find('.js-permalink')->attr('href');
        print "$body $when (link: $link)\n";
    });

=head1 DESCRIPTION

HTML::Grabber provides a jQuery style interface to HTML documents. This makes
parsing and manipulating HTML documents trivially simple for those people
familiar with L<http://jquery.com>.

It uses L<XML::LibXML> for DOM parsing/manipulation and
L<HTML::Selector::XPath> for converting CSS expressions into XPath.

=head1 AUTHOR

Martyn Smith <martyn@dollyfish.net.nz>

=head1 SELECTORS

All selectors are CSS. They are internally converted to XPath using
L<HTML::Selector::XPath>. If some creative selector you're trying isn't working
as expected, it may well be worth checking out the documentation for that
module to see if it's supported.

=head1 METHODS

=head2 BUILD

=cut

sub BUILD {
    my ($self, $args) = @_;

    if ( exists $args->{html} ) {
        my $dom = $parser->parse_html_string($args->{html}, { suppress_warnings => 1, suppress_errors => 1 });
        $self->_nodes([$dom]);
    }
}

=head2 find( $selector )

Get descendants of each element in the current set of matched elements,
filtered by a selector.

=cut
sub find {
    my ($self, $selector) = @_;

    my $xpath = selector_to_xpath($selector, root => './');

    my @nodes;
    foreach my $node ( $self->nodes ) {
        push @nodes, $node->findnodes($xpath);
    }
    return $self->new(
        nodes => [uniq(@nodes)],
    );
}

=head2 prev( [ $selector ] )

Get the immediately preceding sibling of each element in the set of matched
elements, optionally filtered by a selector.

=cut
sub prev {
    my ($self, $selector) = @_;

    my @nodes;
    foreach my $node ( $self->nodes ) {
        my $prev = $node;
        do {
            $prev = $prev->previousSibling;
        } while $prev and $prev->nodeType != XML_ELEMENT_NODE;
        push @nodes, $prev if $prev;
    }
    my $return = $self->new(
        nodes => [uniq(@nodes)],
    );
    return $return->filter($selector) if $selector;
    return $return;
}

=head2 next( [ $selector ] )

Get the immediately preceding sibling of each element in the set of matched
elements, optionally filtered by a selector.

=cut
sub next {
    my ($self, $selector) = @_;

    my @nodes;
    foreach my $node ( $self->nodes ) {
        my $next = $node;
        do {
            $next = $next->nextSibling;
        } while $next and $next->nodeType != XML_ELEMENT_NODE;
        push @nodes, $next if $next;
    }
    my $return = $self->new(
        nodes => [uniq(@nodes)],
    );
    return $return->filter($selector) if $selector;
    return $return;
}

=head2 filter( $selector )

Reduce the set of matched elements to those that match the selector

=cut

sub filter {
    my ($self, $selector) = @_;

    my $xpath = selector_to_xpath($selector, root => '..');

    my @nodes;

    foreach my $node ( $self->nodes ) {
        push @nodes, $node if grep { $node->isSameNode($_) } $node->findnodes($xpath);
    }

    return $self->new(
        nodes => [uniq(@nodes)],
    );
}

=head2 text_filter( $match )

Filter the current set of matched elements to those that contain the text
specified by $match. If you prefer, $match can also be a Regexp

=cut
sub text_filter {
    my ($self, $match) = @_;

    my $regexp = $match;
    $regexp = qr/\Q$regexp\E/ unless UNIVERSAL::isa($regexp, 'Regexp');

    my @nodes;
    foreach my $node ( $self->nodes ) {
        push @nodes, $node if $node->findvalue('.') =~ $match;
    }
    return $self->new(
        nodes => [uniq(@nodes)],
    );
}

=head2 parent()

Get the parent of each element in the current set of matched elements

=cut
sub parent {
    my ($self) = @_;

    my @nodes;
    foreach my $node ( $self->nodes ) {
        push @nodes, $node->parentNode if $node->parentNode;
    }
    return $self->new(
        nodes => [uniq(@nodes)],
    );
}

=head2 text()

Get the combined text contents of each element in the set of matched elements,
including their descendants.

=cut
sub text {
    my ($self) = @_;

    return join('', map { $_->findvalue('.') } shift->nodes);
}

=head2 text_array()

Return text for each element as a list

=cut
sub text_array {
    my ($self) = @_;

    return map { $_->findvalue('.') } shift->nodes;
}

=head2 html()

Return the HTML of the currently matched elements

=cut
sub html {
    my ($self) = @_;

    return join('', map { $_->toString } shift->nodes);
}

=head2 html_array()

Return the HTML each element as a list

=cut
sub html_array {
    my ($self) = @_;

    return map { $_->toString } shift->nodes;
}

=head2 remove()

Removes the matched nodes from the DOM tree returning them

=cut
sub remove {
    my ($self) = @_;

    foreach my $node ( $self->nodes ) {
        next unless $node->parentNode;
        $node->parentNode->removeChild($node);
    }

    return $self;
}

=head2 attr( $attribute )

Get the value of an attribute for the first element in the set of matched
elements.

=cut
sub attr {
    my ($self, $attr) = @_;

    my ($node) = $self->nodes;

    return unless $node;

    return $node->findvalue("./\@$attr");
}

=head2 each

Execute a sub for each matched node

=cut
sub each {
    my ($self, $sub) = @_;

    foreach my $node ( $self->nodes ) {
        local $_ = $self->new(nodes => [$node]);
        $sub->($_);
    }
}

=head2 map

Execute a sub for each matched node returning a list containing the result of
each sub

=cut
sub map {
    my ($self, $sub) = @_;

    my @results;

    foreach my $node ( $self->nodes ) {
        local $_ = $self->new(nodes => [$node]);
        push @results, $sub->($_);
    }

    return @results;
}

=head1 CLASS METHODS

=head2 uniq( @nodes )

Internal method for taking a list of L<XML::LibXML::Element>s and returning a
unique list

=cut

sub uniq {
    my (@nodes) = @_;
    my %seen;

    my @unique;

    foreach my $node ( @nodes ) {
        push @unique, $node unless $seen{$node->nodePath}++;
    }

    return @unique;
}

__PACKAGE__->meta->make_immutable;

1;
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)