lib/Web/Scraper/Config.pm

# $Id: /mirror/perl/Web-Scraper-Config/trunk/lib/Web/Scraper/Config.pm 7145 2007-05-09T16:36:57.901467Z daisuke  $
#
# Copyright (c) 2007 Daisuke Maki <daisuke@endeworks.jp>

package Web::Scraper::Config;
use strict;
use warnings;
use Config::Any;
use Data::Visitor::Callback;
use Web::Scraper;
use URI;
our $VERSION = '0.01';

# Constructor.
#
# Arguments:
#   $config - a hash reference holding the config, or something that
#             _load_config() can turn into one (a filename).
#   $opts   - optional hash reference; only its "callbacks" entry is used.
#
# Returns the new object with the processed config stored in
# $self->{config} and, when supplied, the callbacks in $self->{callbacks}.
sub new
{
    my ($class, $config, $opts) = @_;
    my $self = bless {}, $class;

    # Load first, in scalar context, before any clean-up is attempted.
    my $loaded = $self->_load_config($config);

    # Labelled in the original code as a workaround for broken YAML::Syck
    # output: every plain value has the single space that follows a
    # "<word character>:" sequence removed.
    my $fixer = Data::Visitor::Callback->new(
        plain_value => sub { $_[1] =~ s/(\w:) /$1/g; $_[1] }
    );
    $self->{config} = $fixer->visit($loaded);

    if ($opts && $opts->{callbacks}) {
        $self->{callbacks} = $opts->{callbacks};
    }

    return $self;
}

# Resolve the constructor's config argument into a config structure.
#
# A hash reference is returned untouched.  Anything else is handed to
# Config::Any->load_files as the single entry of its "files" list; the
# first element of the returned list is dereferenced as a hash and its
# first value is returned.  Croaks when the returned list is empty.
sub _load_config
{
    my ($self, $source) = @_;

    # Already an in-memory config: nothing to load.
    return $source if ref $source eq 'HASH';

    # Only one file is ever requested, so only the first entry of the
    # result is looked at (admittedly a little hackish).
    my $loaded = Config::Any->load_files({ files => [ $source ]});
    unless (@$loaded) {
        require Carp;
        Carp::croak("Could not load config file $source: $@");
    }

    return (values %{$loaded->[0]})[0];
}

# Build a scraper from the stored config and run it.
#
# The config is converted by _recurse() into a code reference; calling
# that code reference yields the scraper object, whose scrape() method
# receives every remaining argument unchanged.  Its result is returned.
sub scrape
{
    my $self = shift;

    my $builder = $self->_recurse($self->{config});
    my $scraper = $builder->();

    return $scraper->scrape(@_);
}

# Recursively convert one node of the config structure into the pieces
# needed to drive Web::Scraper.  The config itself is never modified, so
# the same object can be asked to scrape() any number of times.
#
# Arguments:
#   $rules - a plain scalar, an array reference or a hash reference taken
#            from the config.
#
# Returns:
#   * plain scalar: the scalar itself, or the matching entry of
#     $self->{callbacks} when it has the form __callback(NAME)__.
#   * array reference: a code reference that runs every converted element
#     in order when all of them are code references, otherwise an array
#     reference holding the converted elements.
#   * hash reference: only the first key returned by keys() is used.  For
#     scraper / process / process_first / result a code reference is
#     returned; for any other key a hash reference { key => converted }.
#
# Dies when handed any other kind of reference.
sub _recurse
{
    my ($self, $rules) = @_;

    my $ref = ref($rules);
    my $ret;
    if (! $ref) {
        # $rules is a private copy of the argument here, so replacing it
        # with the callback does not touch the stored config.
        if (defined $rules && $rules =~ /^__callback\(([^\)]+)\)__$/) {
            $rules = $self->{callbacks}{$1};
        }
        $ret = $rules;
    } elsif ($ref eq 'ARRAY') {
        my @elements;
        foreach my $rule (@$rules) {
            if (defined $rule && ! ref $rule
                && $rule =~ /^__callback\(([^\)]+)\)__$/) {
                # $rule aliases the element stored inside the config.
                # Assigning the callback to it would overwrite the
                # placeholder with a CODE reference, and the next pass
                # over the config would then die on that CODE reference.
                # Resolve the callback into a fresh lexical instead.
                my $callback = $self->{callbacks}{$1};

                # The outer sub is the thunk that gets invoked when the
                # arguments of process/process_first/result are built; it
                # hands back the inner sub, which forwards its arguments
                # to the user's callback.
                push @elements, sub { sub { $callback->(@_) } };
            } else {
                push @elements, ref $rule ? $self->_recurse($rule) : $rule;
            }
        }

        # Only when every element is a code reference (or there are no
        # elements at all) is the list collapsed into a single runner.
        if (! grep { my $type = ref($_); $type ? $type ne 'CODE' : 1 } @elements) {
            $ret = sub {
                foreach my $code (@elements) {
                    $code->()
                }
            };
        } else {
            $ret = \@elements;
        }
    } elsif ($ref eq 'HASH'){
        my($op) = keys %$rules;
        my $h = $self->_recurse($rules->{$op});
        my $is_func = ($op =~ /^(?:scraper|process(?:_first)?|result)$/);

        if ($is_func) {
            my @args = (ref $h eq 'ARRAY') ? @$h : ($h);
            if ($op eq 'scraper') {
                # Calling the returned sub builds the scraper; its body
                # runs every collected rule in order.
                $ret = sub { 
                    scraper(sub { for (@args) { $_->() } })
                };
            } else {
                $ret = sub {
                    my $code = sub {
                        # CODE arguments are thunks: call them to obtain
                        # the real value, then jump to the function named
                        # by $op (looked up by name at call time) with
                        # those values as its argument list.
                        @_ = map { (ref $_ eq 'CODE') ? $_->() : $_ }@args;
                        goto &$op;
                    };
                    $code->()
                };
            }
        } else {
            $ret = { $op => $h };
        }
    } else {
        require Data::Dumper;
        die "Web::Scraper::Config does not know how to parse: " . Data::Dumper::Dumper($rules);
    }

    return $ret;
}

1;

__END__

=head1 NAME

Web::Scraper::Config - Run Web::Scraper From Config Files

=head1 SYNOPSIS

  ---
  scraper:
    - process:
      - td>ul>li
      - trailers[]
      - scraper:
        - process_first:
          - li>b
          - title
          -  TEXT
        - process_first:
          - ul>li>a[href]
          - url
          - @href
        - process:
          - ul>li>ul>li>a
          - movies[]
          - __callback(process_movie)__


  my $scraper = Web::Scraper::Config->new(
    $config,
    {
      callbacks => {
        process_movie => sub {
          my $elem = shift;
          return {
            text => $elem->as_text,
            href => $elem->attr('href')
          }
        }
     }
   }
  );
  $scraper->scrape($uri);

=head1 DESCRIPTION

Web::Scraper::Config allows you to harness the power of Web::Scraper from
a config file.

The config files can be written in any format that Config::Any understands,
as long as it conforms to this module's rules.

=head1 METHODS

=head2 new

Creates a new Web::Scraper::Config instance.

The first argument is either a hashref that represents a config, or a 
filename to the config. The config file can be in any format that Config::Any 
understands as long as it returns a hash that's conformant to the 
Web::Scraper::Config rules.

The second argument (options) is optional, and is currently only used to
provide callbacks to be called from the scraper. When Web::Scraper::Config
encounters an element in the form of:

  __callback(function_name)__

then that is replaced by the corresponding callback specified in the
options hash.

=head2 scrape

Starts scraping. The semantics are exactly the same as Web::Scraper::scrape.

=head1 AUTHOR

Daisuke Maki E<lt>daisuke@endeworks.jpE<gt>

=head1 LICENSE

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

See http://www.perl.com/perl/misc/Artistic.html

=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)