package KeywordsSpider::Core;

use Modern::Perl;
use Moose;
use MooseX::UndefTolerant;
use Data::Dumper;
use LWP::UserAgent;
use HTTP::Request::Common qw(GET);
use HTTP::Cookies;
use HTML::TreeBuilder;

use base qw/ Exporter /;
our @EXPORT = qw/ find_origin /;

has [qw/output_file links/] => (
  is => 'rw',
  required => 1,
);

has 'keywords' => (
  is => 'ro',
  required => 1,
);

has 'allowed_keywords' => (
  is => 'ro',
  default => sub{ {} },
);

has 'debug_enabled' => (
  is => 'ro',
  isa => 'Bool',
  default => 0,
  predicate => 'has_debug_enabled',
);

has 'web_depth' => (
  is => 'ro',
  isa => 'Int',
  default => 3,
  predicate => 'has_web_depth',
);

has 'output' => (
  traits => ['String'],
  is => 'rw',
  isa => 'Str',
  default => '',
  handles => {
    add_text     => 'append',
  },
);

has 'counted' => (
  is => 'rw',
  default => 0,
);

has [qw/website origin origin_domain root alerted/] => (
  is => 'rw',
);

$ENV{PERL_LWP_SSL_VERIFY_HOSTNAME} = 0;

my $ua = LWP::UserAgent->new;
$ua->agent('Mozilla/8.0');
$ua->cookie_jar(
  HTTP::Cookies->new(
    file => 'mycookies.txt',
    autosave => 1
  )
);
$ua->timeout(15);
$ua->max_size(512*1024);

sub find_origin {
  my ($url) = @_;
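  # strip the scheme and any path to get the host (origin);
  # the domain is the origin without a leading 'www.'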

  my $origin = $url;
  $origin =~ s/https?:\/\///;
  $origin =~ s/\/.*$//;

  my $domain = $origin;
  $domain =~ s/www\.//;

  return ($origin, $domain);
}

sub debug {
  my ($self, $string) = @_;

  if($self->debug_enabled) {
    print "$string";
  }

  return;
}

sub is_already_crawled {
  my ($self) = @_;

  foreach (keys %{$self->links}) {
    # max depth in links hash can be 1 at the moment
    if ($self->links->{$_}{depth} == 1 && not defined $self->links->{$_}{fetched}) {
      return 0;
    }
  }

  return 1;
}

sub get_root {
  my ($self, $url) = @_;
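  # fetch $url and keep the parsed HTML tree in 'root', but only when the
  # response looks like HTML or XML and is within the 512 kB size limit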

  my $req = GET $url;

  my $res = $ua->request($req);

  if ($res->is_success) {
    if (
      ($res->header('Content-Type') =~ /^text\/html/ ||
        $res->header('Content-Type') =~ /xml$/ ) &&
      ((not defined $res->header('Content-Length')) ||
        $res->header('Content-Length') < 512*1024)
    ) {
      my $content = $res->content;

      $self->root(HTML::TreeBuilder->new_from_content($content));

    } else{
      $self->add_text('SKIPPING because of content type or length' . "\n");
    }
  } else {
      $self->add_text('ERROR:' . $res->status_line . "\n");
  }
}


sub get_content_with_meta_keywords {
  my ($self) = @_;

  my $meta_keywords =
    (defined $self->root->find_by_attribute('name', 'keywords')) ?
      $self->root->find_by_attribute('name', 'keywords')->attr('content') : '';

  my $content_with_meta_keywords = $self->root->as_text . $meta_keywords;

  return $content_with_meta_keywords;
}

sub _add_new_link {
  my ($self, $link, $depth) = @_;
  if (not exists $self->links->{$link}) {
    $self->links->{$link} = {
      depth => $depth,
    };
  }

  return;
}

sub _handle_non_http_link {
  my ($self, $link, $base, $depth) = @_;
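  # turn a relative href into an absolute URL and record it in the links hash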

  my $href;

  if ($link =~ /^\//) {
    $href = "http://" . $self->origin . $link;
  }
  elsif ($link =~ /^\.\.\//) {
    # walk one directory up in the base for every leading '../'
    my $two_dots_base = $base;

    while ($link =~ /^\.\.\//) {
      $link =~ s/^\.\.\///;
      $two_dots_base =~ s/[\.\-_\w]+\/$//;
    }

    # copy the domain into a plain scalar; a method call does not
    # interpolate inside a regex
    my $origin_domain = $self->origin_domain;

    if ($two_dots_base =~ /$origin_domain/) {
      $href = $two_dots_base . $link;
    } else {
      $href = 'http://' . $self->origin . '/' . $link;
    }
  }
  else {
    # get rid of './'
    $link =~ s/^\.\///;
    $href = $base . $link;
  }

  $self->_add_new_link($href, $depth);
}

sub _remove_link_garbage {
  my $link = shift;
  $link =~ s/\#.*$//;
  $link =~ s/^\s+//;
  $link =~ s/[\r\n]+//g;

  foreach my $unwanted_param ('share', 'link') {
    if ( $link =~ /[\?\&]${unwanted_param}=/ ) {
      $link =~ s/${unwanted_param}=[^&]*&//g;
      $link =~ s/${unwanted_param}=[^&]*$//g;
    }
  }

  $link =~ s/&$//g;

  return $link;
}

sub get_base {
  my ($base) = @_;

  # keep the url up to (and including) the last slash
  $base =~ s#^(https?://.+/).*#$1#;

  if ($base !~ /\/$/) {
    $base = $base . '/';
  }

  return $base;
}

sub _handle_link {
  my ($self, $link, $base, $depth) = @_;

  if ($link !~ /^http/ ) {
    $self->_handle_non_http_link($link, $base, $depth);
  }
  else {
    my $origin_domain = $self->origin_domain;

    if ($link =~ /^https?:\/\/(www\.)?[a-z\.\-]*${origin_domain}/i) {
      $self->_add_new_link($link, $depth);
    }
  }
}

sub add_links_from_root {
  my ($self, $depth, $url) = @_;

  my @anchors = $self->root->find('a');

  # base is for concatenating links like 'journal.html'
  my $base = get_base($url);

  foreach (@anchors) {
    my $link = $_->attr('href');

    if ($link) {
      $link = _remove_link_garbage($link);

      if ($link =~ /^(mailto|javascript):/i
        ||
        # there can be also space in the end
        $link =~ /\.(mp3|mp4|avi|bmp|gif|jpg|jpeg|zip|rar|msi|exe|png|gz|bz2|tar|swf|pdf|wav|asf|tgz|wmv|flv|rm|mpg)\s?$/i
      ) {
        next;
      }

      $self->_handle_link($link, $base, $depth);
    }
  }
}

sub check_website {
  my ($self, $url) = @_;
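  # match the page text (plus meta keywords) against every keyword;
  # a match outside allowed_keywords marks the page as alerted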
  my $content_with_meta_keywords = $self->get_content_with_meta_keywords();

  my $is_alerted = 0;
  my @matched_keywords = ();
  foreach (@{$self->keywords}) {
    if ( $content_with_meta_keywords  =~ /$_/i ) {
      push @matched_keywords, $_;
      if (!exists($self->allowed_keywords->{$_})) {
        $is_alerted = 1;
      }
    }
  }

  $self->add_text("ALERT ") if $is_alerted;

  my $keyword_count = scalar @matched_keywords;

  if ($keyword_count > 0) {
    $self->add_text("possible bad content $url @matched_keywords\n");
    $self->add_text("found keywords: $keyword_count\n\n");
  }

  return $is_alerted;
}

sub fetch_website {
  my ($self, $url, $want_spider, $depth) = @_;

  $self->root(0);
  $self->get_root($url);

  if (!$self->root) {
    return 0;
  }

  if ($want_spider) {
    $self->add_links_from_root($depth, $url);
  }

  return $self->check_website($url);
}

sub spider_website {
  my ($self) = @_;
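  # fetch the current website first, then follow the links it produced level
  # by level: up to web_depth when 'want_spider' is set, otherwise only depth 1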

  my $start = time;
  $self->debug("TIME".$start."\n");

  # fetch initial website
  my $want_spider = 1;
  my $max_depth = 1;
  $self->alerted(
    $self->fetch_website($self->website, $want_spider, $max_depth));

  $self->debug("AFTER:\n");
  $self->debug(Dumper $self->links);

  my @keys = keys %{$self->links};

  # checks if there are other urls than initial
  if (@keys && !$self->is_already_crawled()) {

    # website is spidered to web_depth
    # referrer to depth = 1
    $want_spider = $self->links->{$self->website}{want_spider} // 0;
    $max_depth = ($want_spider) ? $self->web_depth : 1;

    $self->debug("MAAAAAAX ". $self->website ." $max_depth\n");

    for (my $depth = 1; $depth <= $max_depth; $depth++) {
      $self->debug("DEPTH".$depth."\n");

      $self->proceed($start, $depth, $want_spider);
    }
  }

  return;
}

sub proceed {
  my ($self, $start, $depth, $want_spider) = @_;

  my @keys = keys %{$self->links};
  foreach (@keys) {
    if ($self->links->{$_}{depth} == ($depth) && not defined $self->links->{$_}{fetched}) {
      # skip forums
      if ($_ =~ /forums?\/index.php/) {
        $self->add_text("SKIPPING $_\n");
        $self->links->{$_}{fetched} = 1;
        next;
      }

      if ( (time - $start) > 120 ) {
        $self->add_text("TIMEOUT " . $self->website . ", number of links = " . (scalar @keys) . "\n");
        last;
      }
      else {
        $self->debug("fetching $_\n");
        $self->add_text("fetching $_\n");
      }

      my $returned = $self->fetch_website($_, $want_spider, $depth+1);
      $self->links->{$_}{fetched} = 1;
      if ($returned) {
        $self->alerted(1);
      }
    }
  }

  return;
}

sub settle_website {
  my ($self, $website) = @_;

  $self->links->{$website}{fetched} = 1;

  $self->debug("SPIDER ".$website."\n");
  $self->add_text("\nSPIDER ".$website."\n");

  if ( $website !~ m{^https?://} ) {
    $website = 'http://' . $website;
  }

  $self->website($website);

  # origin* may be different for website and for referrer
  my ($origin, $origin_domain) = find_origin($website);
  $self->origin($origin);
  $self->origin_domain($origin_domain);

  $self->debug("ORIGIN". $origin ."\n");
  $self->debug("DOMAIN". $origin_domain ."\n");

  $self->debug("BEFORE:\n");
  $self->debug(Dumper $self->links);

  return;
}

sub spider_links {
  my ($self) = @_;

  $self->add_text("SPIDER LINKS\n");

  my @zero_keys = keys %{$self->links};

  foreach (@zero_keys) {
    $self->settle_website($_);
    $self->spider_website();
    $self->counted(1) if ($self->alerted);
  }

  if ($self->counted) {
    $self->add_text("this IS counted as alerted\n\n");
  } else {
    $self->add_text("this IS NOT counted as alerted\n\n");
  }

  print {$self->output_file()} $self->output;
  print {$self->output_file()} "----------------------------------------------------------------------\n\n";

  return $self->counted;
}

1;

=head1 NAME

KeywordsSpider::Core - core for web spider searching for keywords

=head1 SYNOPSIS

  use KeywordsSpider::Core;
  my $spider = KeywordsSpider::Core->new(
    output_file => $opened_filehandle,
    links => \%links,
    keywords => \@keywords,
    allowed_keywords => \%allowed_keywords,
    debug_enabled => 1,
    web_depth => 5,
  );

=head1 DESCRIPTION

KeywordsSpider::Core is the core of a web spider that crawls links and matches their content against a list of keywords.
A matched keyword triggers an ALERT in the output_file.
Allowed keywords do not trigger an ALERT.

Websites are marked by the 'want_spider' parameter in the links hash.
They are spidered down to 'web_depth' (default 3), and links found in their content are added to the links hash.
Other links are only checked for keywords, without further spidering.
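
A minimal end-to-end sketch (the URL, keyword values, and report file name are only illustrative):

  use KeywordsSpider::Core;

  open my $report_fh, '>', 'report.txt'
    or die "cannot open report.txt: $!";

  my %links = (
    'http://website.sk' => {
      'want_spider' => 1,
      'depth' => 0,
    },
  );

  my $spider = KeywordsSpider::Core->new(
    output_file => $report_fh,
    links => \%links,
    keywords => [ 'word2', 'wuord1' ],
    allowed_keywords => { wuord1 => 1 },
  );

  # spider_links() returns a true value when an ALERT was counted
  my $alerted = $spider->spider_links();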

=head1 ARGUMENTS

=over 4

=item output_file

an opened file handle to which the spider report is printed

=item keywords

an array reference of keywords to search for; each keyword is matched case-insensitively as a regular expression

=item allowed_keywords

a hash reference of keywords that do not trigger an ALERT, for example:

  my %allowed_keywords = (
    wuord1 => 1,
  );

=item links

a hash reference of the websites and referrer URLs you want to spider, for example:

  my %links = (
    'http://website.sk' => {
      'want_spider' => 1,
      'depth' => 0,
    },
    'http://referer.sk' => {
      'depth' => 0,
    },
  );

Note that the links hash is modified while the spider runs.
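
After a run the hash also carries the spider's bookkeeping, for example (URLs are illustrative):

  my %links = (
    'http://website.sk' => {
      'want_spider' => 1,
      'depth' => 0,
      'fetched' => 1,
    },
    'http://website.sk/new.html' => {
      'depth' => 1,
      'fetched' => 1,
    },
  );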

=item debug_enabled

when true, prints debug messages to standard output

=item web_depth

depth to which a website marked with 'want_spider' is scanned. The default is 3.

=back

=head1 METHODS

=over 4

=item spider_links

the main entry point; spiders every link in the links hash and returns a true value if any website was counted as alerted

=item settle_website WEBSITE

prepares the spider for WEBSITE: marks it as fetched and derives its origin and domain

=item spider_website

crawls the current website according to the settings prepared by settle_website

=item check_website

checks whether the URL's content (including meta keywords) matches any of the keywords

=item add_links_from_root

adds links found in the URL's content to the links hash

=item debug

if debugging is enabled, prints the given string to standard output

=back

=head1 SAMPLE OUTPUT

  SPIDER http://domain.sk
  this IS NOT counted as alerted

  ----------------------------------------------------------------------

  SPIDER LINKS

  SPIDER http://trololo.sk
  ERROR:404 Not Found
  this IS NOT counted as alerted

  SPIDER LINKS

  SPIDER http://domain.sk/old.html
  possible bad content http://domain.sk/old.html word2
  found keywords: 1

  fetching http://domain.sk/new.html
  ALERT possible bad content http://domain.sk/new.html  wuord1 word2
  found keywords: 2

  fetching http://domain.sk/lala.txt
  SKIPPING because of content type or length

  SPIDER http://domain.sk
  this IS counted as alerted

=head1 SEE ALSO

L<KeywordsSpider> -- takes files as arguments and prepares attributes for KeywordsSpider::Core

=head1 COPYRIGHT

Copyright 2013 Katarina Durechova

This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.

=cut