The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Scrappy::Scraper;

BEGIN {
    $Scrappy::Scraper::VERSION = '0.94112090';
}

# load OO System
use Moose;

# load other libraries
use Data::Dumper;
use File::Util;
use Scrappy::Logger;
use Scrappy::Plugin;
use Scrappy::Queue;
use Scrappy::Scraper::Control;
use Scrappy::Scraper::Parser;
use Scrappy::Scraper::UserAgent;
use Scrappy::Session;
use Try::Tiny;
use URI;
use Web::Scraper;
use WWW::Mechanize;

# html content attribute
has 'content' => (
    is  => 'rw',
    isa => 'Any'
);

# access control object
has 'control' => (
    is      => 'ro',
    isa     => 'Scrappy::Scraper::Control',
    default => sub {
        Scrappy::Scraper::Control->new;
    }
);

# debug attribute
has 'debug' => (
    is      => 'rw',
    isa     => 'Bool',
    default => 1
);

# log object
has 'logger' => (
    is      => 'ro',
    isa     => 'Scrappy::Logger',
    default => sub {
        Scrappy::Logger->new;
    }
);

# parser object
has 'parser' => (
    is      => 'ro',
    isa     => 'Scrappy::Scraper::Parser',
    default => sub {
        Scrappy::Scraper::Parser->new;
    }
);

# plugins object
has 'plugins' => (
    is      => 'ro',
    isa     => 'Any',
    default => sub {
        Scrappy::Plugin->new;
    }
);

# queue object
has 'queue' => (
    is      => 'ro',
    isa     => 'Scrappy::Queue',
    default => sub {
        Scrappy::Queue->new;
    }
);

# session object
has 'session' => (
    is      => 'ro',
    isa     => 'Scrappy::Session',
    default => sub {
        Scrappy::Session->new;
    }
);

# user-agent object
has 'user_agent' => (
    is      => 'ro',
    isa     => 'Scrappy::Scraper::UserAgent',
    default => sub {
        Scrappy::Scraper::UserAgent->new;
    }
);

# www-mechanize object (does most of the heavy lifting, gets passed around alot)
has 'worker' => (
    is      => 'ro',
    isa     => 'WWW::Mechanize',
    default => sub {
        WWW::Mechanize->new;
    }
);

sub back {
    my $self = shift;

    # specify user-agent
    $self->worker->add_header("User-Agent" => $self->user_agent->name)
      if defined $self->user_agent->name;

    # set html response
    $self->content('');
    try {
        $self->worker->back;
        $self->content($self->worker->content);
    }
    catch {
        $self->log("error", "navigating to the previous page failed");
    };

    return unless $self->content;

    $self->log("info", "navigated back to " . $self->url . " successfully");

    $self->stash->{history} = [] unless defined $self->stash->{history};
    push @{$self->stash->{history}}, $self->url;
    $self->worker->{cookie_jar}->scan(
        sub {

            my ($version, $key,     $val,       $path,
                $domain,  $port,    $path_spec, $secure,
                $expires, $discard, $hash
            ) = @_;

            $self->session->stash('cookies' => {})
              unless defined $self->session->stash('cookies');

            $self->session->stash->{'cookies'}->{$domain}->{$key} = {
                version   => $version,
                key       => $key,
                val       => $val,
                path      => $path,
                domain    => $domain,
                port      => $port,
                path_spec => $path_spec,
                secure    => $secure,
                expires   => $expires,
                discard   => $discard,
                hash      => $hash
            };

            $self->session->write;

        }
    );

    return $self->url;
}

sub cookies {
    my $self = shift;
    $self->worker->{cookie_jar} = $_[0] if defined $_[0];
    return $self->worker->{cookie_jar};
}

sub domain {
    return shift->worker->base->host;
}

sub download {
    my $self = shift;
    my ($url, $dir, $file) = @_;

    $url = URI->new(@_);

    # specify user-agent
    $self->worker->add_header("User-Agent" => $self->user_agent->name)
      if defined $self->user_agent->name;

    # set html response
    if ($url && $dir && $file) {
        $dir =~ s/[\\\/]+$//;
        return unless $self->get($url);
        $self->store(join '/', $dir, $file);
        $self->log("info",
                "$url was downloaded to "
              . join('/', $dir, $file)
              . " successfully");
        $self->back;
    }
    elsif ($url && $dir) {
        $dir =~ s/[\\\/]+$//;
        return unless $self->get($url);
        my @chars = ('a' .. 'z', 'A' .. 'Z', 0 .. 9);
        my $filename = $self->worker->response->filename;
        $filename =
            $chars[rand(@chars)]
          . $chars[rand(@chars)]
          . $chars[rand(@chars)]
          . $chars[rand(@chars)]
          . $chars[rand(@chars)]
          . $chars[rand(@chars)]
          . '.downlaod'
          unless $filename;
        $self->store(join '/', $dir, $filename);
        $self->log("info",
                "$url was downloaded to "
              . join('/', $dir, $filename)
              . " successfully");
        $self->back;
    }
    elsif ($url) {
        return unless $self->get($url);
        my @chars = ('a' .. 'z', 'A' .. 'Z', 0 .. 9);
        my $filename = $self->worker->response->filename;
        $filename =
            $chars[rand(@chars)]
          . $chars[rand(@chars)]
          . $chars[rand(@chars)]
          . $chars[rand(@chars)]
          . $chars[rand(@chars)]
          . $chars[rand(@chars)]
          . '.downlaod'
          unless $filename;
        $dir = $url->path;
        $dir =~ s/^\///g;
        $dir =~ s/\/$filename$//;
        File::Util->new->make_dir($dir) unless -d $dir || !$dir;
        $self->store(join '/', $dir, $filename);
        $self->log("info",
                "$url was downloaded to "
              . join('/', $dir, $filename)
              . " successfully");
        $self->back;
    }
    else {
        croak(
            "To download data from a URI you must supply at least a valid URI "
              . "and download directory path");
    }

    $self->stash->{history} = [] unless defined $self->stash->{history};
    push @{$self->stash->{history}}, $url;

    $self->worker->{params} = {};
    $self->worker->{params} =
      {map { ($_ => $url->query_form($_)) } $url->query_form};

    sleep $self->pause;

    return $self;
}

sub dumper {
    shift;
    return Data::Dumper::Dumper(@_);
}

sub form {
    my $self = shift;
    my $url  = URI->new($self->url);

    # TODO: need to figure out how to determine the form action before submit

    # specify user-agent
    $self->worker->add_header("User-Agent" => $self->user_agent->name)
      if defined $self->user_agent->name;

    # set html response
    $self->content('');
    my @args = @_;
    try {
        $self->content($self->worker->submit_form(@args));
    };
    if ($self->content) {

        # access control
        if ($self->control->is_allowed($self->content)) {
            $self->log("warn", "$url was not fetched, the url is prohibited");
            return 0;
        }
        else {
            $self->log("info", "form posted from $url successfully", @_);
        }
    }
    else {
        $self->log("error", "error POSTing form from $url", @_);
    }

    #$self->stash->{history} = [] unless defined $self->stash->{history};
    #push @{$self->stash->{history}}, $url;

    $self->worker->{cookie_jar}->scan(
        sub {

            my ($version, $key,     $val,       $path,
                $domain,  $port,    $path_spec, $secure,
                $expires, $discard, $hash
            ) = @_;

            $self->session->stash('cookies' => {})
              unless defined $self->session->stash('cookies');

            $self->session->stash->{'cookies'}->{$domain}->{$key} = {
                version   => $version,
                key       => $key,
                val       => $val,
                path      => $path,
                domain    => $domain,
                port      => $port,
                path_spec => $path_spec,
                secure    => $secure,
                expires   => $expires,
                discard   => $discard,
                hash      => $hash
            };

            $self->session->write;

        }
    );

    $self->worker->{params} = {};
    $self->worker->{params} =
      {map { ($_ => $url->query_form($_)) } $url->query_form};

    sleep $self->pause;

    return $self;
}

sub get {
    my $self = shift;
    my $url  = URI->new(@_);

    # specify user-agent
    $self->worker->add_header("User-Agent" => $self->user_agent->name)
      if defined $self->user_agent->name;

    # set html response
    $self->content('');
    try {
        $self->content($self->worker->get($url));
    };
    if ($self->content) {

        # access control
        if (!$self->control->is_allowed($self->content)) {
            $self->log("warn", "$url was not fetched, the url is prohibited");
            return 0;
        }
        else {
            $self->log("info", "$url was fetched successfully");
        }
    }
    else {
        $self->log("error", "error GETing $url");
    }

    $self->stash->{history} = [] unless defined $self->stash->{history};
    push @{$self->stash->{history}}, $url;
    $self->worker->{cookie_jar}->scan(
        sub {

            my ($version, $key,     $val,       $path,
                $domain,  $port,    $path_spec, $secure,
                $expires, $discard, $hash
            ) = @_;

            $self->session->stash('cookies' => {})
              unless defined $self->session->stash('cookies');

            $self->session->stash->{'cookies'}->{$domain}->{$key} = {
                version   => $version,
                key       => $key,
                val       => $val,
                path      => $path,
                domain    => $domain,
                port      => $port,
                path_spec => $path_spec,
                secure    => $secure,
                expires   => $expires,
                discard   => $discard,
                hash      => $hash
            };

            $self->session->write;

        }
    ) if $self->session->file;

    $self->worker->{params} = {};
    $self->worker->{params} =
      {map { ($_ => $url->query_form($_)) } $url->query_form};

    sleep $self->pause;

    return $self;
}

sub page_data {
    my $self = shift;
    my ($data, @args);

    if (scalar(@_) % 2) {
        $data = shift;
        @args = @_;
    }
    else {
        if (@_ == 2) {
            @args = @_;
        }
        else {
            $data = shift;
        }
    }

    if ($data) {
        $self->worker->update_html($data);
    }

    return $self->worker->content(@args);
}

sub page_content_type {
    return shift->worker->content_type;
}

sub page_ishtml {
    return shift->worker->is_html;
}

sub page_loaded {
    return shift->worker->success;
}

sub page_match {
    my $self    = shift;
    my $pattern = shift;
    my $url     = shift || $self->url;
    $url = URI->new($url);
    my $options = shift || {};

    croak("route can't be defined without a valid URL pattern")
      unless $pattern;

    my $route = $self->stash->{patterns}->{$pattern};

    # does route definition already exist?
    unless (keys %{$route}) {

        $route->{on_match} = $options->{on_match};

        # define options
        if (my $host = $options->{host}) {
            $route->{host} = $host;
            $route->{host_re} = ref $host ? $host : qr(^\Q$host\E$);
        }

        $route->{pattern} = $pattern;

        # compile pattern
        my @capture;
        $route->{pattern_re} = do {
            if (ref $pattern) {
                $route->{_regexp_capture} = 1;
                $pattern;
            }
            else {
                $pattern =~ s!
                    \{((?:\{[0-9,]+\}|[^{}]+)+)\} | # /blog/{year:\d{4}}
                    :([A-Za-z0-9_]+)              | # /blog/:year
                    (\*)                          | # /blog/*/*
                    ([^{:*]+)                       # normal string
                !
                    if ($1) {
                        my ($name, $pattern) = split /:/, $1, 2;
                        push @capture, $name;
                        $pattern ? "($pattern)" : "([^/]+)";
                    } elsif ($2) {
                        push @capture, $2;
                        "([^/]+)";
                    } elsif ($3) {
                        push @capture, '__splat__';
                        "(.+)";
                    } else {
                        quotemeta($4);
                    }
                !gex;
                qr{^$pattern$};
            }
        };
        $route->{capture} = \@capture;
        $self->stash->{patterns}->{$route->{pattern}} = $route;
    }

    # match
    if ($route->{host_re}) {
        unless ($url->host =~ $route->{host_re}) {
            return 0;
        }
    }

    if (my @captured = ($url->path =~ $route->{pattern_re})) {
        my %args;
        my @splat;
        if ($route->{_regexp_capture}) {
            push @splat, @captured;
        }
        else {
            for my $i (0 .. @{$route->{capture}} - 1) {
                if ($route->{capture}->[$i] eq '__splat__') {
                    push @splat, $captured[$i];
                }
                else {
                    $args{$route->{capture}->[$i]} = $captured[$i];
                }
            }
        }
        my $match = +{
            (label => $route->{label}),
            %args,
            (@splat ? (splat => \@splat) : ())
        };
        if ($route->{on_match}) {
            my $ret = $route->{on_match}->($self, $match);
            return 0 unless $ret;
        }
        $match->{params} = {%args};
        $match->{params}->{splat} = \@splat if @splat;

        return $match;
    }

    return 0;
}

sub page_reload {
    my $self = shift;

    # specify user-agent
    $self->worker->add_header("User-Agent" => $self->user_agent->name)
      if defined $self->user_agent->name;

    # set html response
    $self->content('');
    try {
        $self->content($self->worker->reload);
    };
    $self->content
      ? $self->log("info",  "page reloaded successfully")
      : $self->log("error", "error reloading page");

    my $url = $self->url;

    $self->stash->{history} = [] unless defined $self->stash->{history};
    push @{$self->stash->{history}}, $url;
    $self->worker->{cookie_jar}->scan(
        sub {

            my ($version, $key,     $val,       $path,
                $domain,  $port,    $path_spec, $secure,
                $expires, $discard, $hash
            ) = @_;

            $self->session->stash('cookies' => {})
              unless defined $self->session->stash('cookies');

            $self->session->stash->{'cookies'}->{$domain}->{$key} = {
                version   => $version,
                key       => $key,
                val       => $val,
                path      => $path,
                domain    => $domain,
                port      => $port,
                path_spec => $path_spec,
                secure    => $secure,
                expires   => $expires,
                discard   => $discard,
                hash      => $hash
            };

            $self->session->write;

        }
    );

    return $self;
}

sub page_status {
    return shift->worker->status;
}

sub page_text {
    return shift->page_data(format => 'text');
}

sub page_title {
    return shift->worker->title;
}

sub plugin {
    my ($self, @plugins) = @_;
    foreach (@plugins) {
        with $self->plugins->load_plugin($_);
    }
    return $self;
}

sub post {
    my $self = shift;
    my $url  = URI->new($_[0]);

    # access control
    unless ($self->control->is_allowed($url)) {
        $self->log("warn", "$url was not fetched, the url is prohibited");
        return 0;
    }

    # specify user-agent
    $self->worker->add_header("User-Agent" => $self->user_agent->name)
      if defined $self->user_agent->name;

    # set html response
    $self->content('');
    my @args = @_;
    try {
        $self->content($self->worker->post(@args));
    };
    if ($self->content) {

        # access control
        if ($self->control->is_allowed($self->content)) {
            $self->log("warn", "$url was not fetched, the url is prohibited");
            return 0;
        }
        else {
            $self->log("info", "posted data to $_[0] successfully", @_);
        }
    }
    else {
        $self->log("error", "error POSTing data to $_[0]", @_);
    }

    $self->stash->{history} = [] unless defined $self->stash->{history};
    push @{$self->stash->{history}}, $url;
    $self->worker->{cookie_jar}->scan(
        sub {

            my ($version, $key,     $val,       $path,
                $domain,  $port,    $path_spec, $secure,
                $expires, $discard, $hash
            ) = @_;

            $self->session->stash('cookies' => {})
              unless defined $self->session->stash('cookies');

            $self->session->stash->{'cookies'}->{$domain}->{$key} = {
                version   => $version,
                key       => $key,
                val       => $val,
                path      => $path,
                domain    => $domain,
                port      => $port,
                path_spec => $path_spec,
                secure    => $secure,
                expires   => $expires,
                discard   => $discard,
                hash      => $hash
            };

            $self->session->write;

        }
    );

    $self->worker->{params} = {};
    $self->worker->{params} =
      {map { ($_ => $url->query_form($_)) } $url->query_form};

    sleep $self->pause;

    return $self;
}

sub proxy {
    my $self     = shift;
    my $proxy    = pop @_;
    my @protocol = @_;
    $self->worker->proxy([@protocol], $proxy);
    $self->log("info", "Set proxy $proxy using protocol(s) " . join ' and ',
        @protocol);
    return $self;
}

sub request_denied {
    my $self = shift;
    my ($last) = reverse @{$self->stash->{history}};
    return 1 if ($self->url ne $last);
}

sub select {
    my ($self, $selector, $html) = @_;
    my $parser = Scrappy::Scraper::Parser->new;
    $parser->html($html ? $html : $self->content);
    return $parser->select($selector);
}

sub log {
    my $self = shift;
    my $type = shift;
    my @args = @_;

    if ($self->debug) {
        if ($type eq 'info') {
            $self->logger->info(@args);
        }
        elsif ($type eq 'warn') {
            $self->logger->warn(@args);
        }
        elsif ($type eq 'error') {
            $self->logger->error(@args);
        }
        else {
            warn $type;
            $self->logger->event($type, @args);
        }

        return 1;
    }
    else {
        return 0;
    }
}

sub pause {
    my $self = shift;
    if (defined $_[0]) {
        if ($_[1]) {
            my @range = (($_[0] < $_[1] ? $_[0] : 0) .. $_[1]);
            $self->worker->{pause_range} = [$_[0], $_[1]];
            $self->worker->{pause} = $range[rand(@range)];
        }
        else {
            $self->worker->{pause} = $_[0];
            $self->worker->{pause_range} = [0, 0] unless $_[0];
        }
    }
    else {
        my $interval = $self->worker->{pause} || 0;

        # select the next random pause value from the range
        if (defined $self->worker->{pause_range}) {
            my @range = @{$self->worker->{pause_range}};
            $self->pause(@range) if @range == 2;
        }

        $self->log("info", "processing was halted for $interval seconds")
          if $interval > 0;
        return $interval;
    }
}

sub response {
    return shift->worker->response;
}

sub stash {
    my $self = shift;
    $self->{stash} = {} unless defined $self->{stash};

    if (@_) {
        my $stash = @_ > 1 ? {@_} : $_[0];
        if ($stash) {
            if (ref $stash eq 'HASH') {
                $self->{stash}->{$_} = $stash->{$_} for keys %{$stash};
            }
            else {
                return $self->{stash}->{$stash};
            }
        }
    }

    return $self->{stash};
}

sub store {

    # return shift->worker->save_content(@_);
    # oh no i didnt just rewrite www:mech save_content, oh yes i did
    # ... in hope to avoid content encoding issues

    my $self     = shift;
    my $filename = shift;

    open(my $fh, '>', $filename)
      or $self->worker->die("Unable to create $filename: $!");

    if (   $self->worker->content_type =~ m{^text/}
        || $self->worker->content_type
        =~ m{^application/(atom|css|javascript|json|rss|xml)})
    {

        # text
        $self->worker->response->decode;
        print {$fh} $self->worker->response->content
          or $self->worker->die("Unable to write to $filename: $!");
    }
    else {

        # binary
        binmode $fh;
        print {$fh} $self->worker->response->content
          or $self->worker->die("Unable to write to $filename: $!");
    }

    close $fh
      or $self->worker->die("Unable to close $filename: $!");

    return $self;
}

sub url {
    return $_[0]->worker->uri
      if $_[0]->content;
}

1;