package WWW::Spiegel;
use strict;
use warnings;
use HTML::TokeParser;
use LWP::UserAgent;
use HTTP::Request;
use URI::URL;
require Exporter;
our @ISA = qw(Exporter);
# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.
# This allows declaration use WWW::Spiegel ':all';
# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
# will save memory.
our %EXPORT_TAGS = ( 'all' => [ qw(
HtmlLinkExtractor
getNews
Get
) ] );
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
our @EXPORT = qw(
HtmlLinkExtractor
getNews
Get
);
our $VERSION = '1.0';
# Site entry point and the pattern that identifies article links.
my $Url = "http://www.spiegel.de/";
# Article URLs on spiegel.de end in ",00.html"; use qr// so the dot is a
# literal dot (the old double-quoted ",00\.html" degraded to a wildcard ".").
my $Regex = qr/,00\.html/;
######
# LWP::UserAgent configuration used by Get().
my $MaxFileSizeOfWebDocument = (50 * 1024 * 1024); # 50 MB response size cap
my $MaxRedirectRequests = 15;
my $AuthorEmail = 'yourname@cpan.org';
my $Timeout = 25; # seconds
my $CrawlDelay = int(rand(3)); # 0-2 s; only used if the sleep in Get() is enabled
my $Referer = "http://www.google.com/";
my $DEBUG = 1;
######
# Constructor.  Accepts attributes either as a flat hash or as a single
# hash ref, blesses them, and immediately crawls the front page via
# _init().  NOTE: new() therefore performs network I/O.
# Returns the blessed object.
sub new {
    my $class = shift;
    # Allow both Class->new( %args ) and Class->new( \%args ).
    my %args = ref($_[0]) ? %{ $_[0] } : @_;
    my $self = bless { %args }, $class;
    $self->_init();
    return $self;
} # sub new
# Private: fetch the start page ($Url), extract all article links, and
# cache them in $self->{'_CONTENT_ARRAY_REF'} for getNews().
# Performs HTTP I/O.  Returns $self.
sub _init {
    my $self = shift;
    my $HashRef = $self->Get($Url);
    my $ArrRef  = $self->HtmlLinkExtractor($HashRef);
    $self->{'_CONTENT_ARRAY_REF'} = $ArrRef;
    return $self;
} # sub _init
# Deduplicate the link list collected by _init() and return it as a hash
# ref mapping url => link title.  The first title seen for a URL wins.
#
# Fixes two defects of the original: "my %h = {};" assigned a hash
# REFERENCE to a hash, which injected a bogus "HASH(0x...)" => undef
# entry into the returned result (and warns under "use warnings"); and a
# second bookkeeping hash duplicated what the result hash already knows.
sub getNews {
    my $self   = shift;
    my $ArrRef = $self->{'_CONTENT_ARRAY_REF'};
    my %ReturnLinks;
    foreach my $entry ( @{$ArrRef} ) {
        # Entries were stored as "title ### url" by HtmlLinkExtractor().
        my ($linkname, $url) = split(' ### ', $entry);
        # Keep only the first occurrence of each URL.
        $ReturnLinks{$url} = $linkname unless exists $ReturnLinks{$url};
    }
    return \%ReturnLinks;
} # sub getNews
# Preloaded methods go here.
# Extract "<link text> ### <absolute url>" strings for every <a> tag in a
# page whose resolved URL is http(s) and matches the article pattern
# ($Regex).  Expects the hash ref produced by Get():
#   OBJ => HTTP::Response object, CNT => page body.
# Returns an array ref of link strings, or -1 when OBJ is not a response.
sub HtmlLinkExtractor {
    my $self        = shift;
    my $HashRef     = shift;
    my $ResponseObj = $HashRef->{'OBJ'};
    my $PageContent = $HashRef->{'CNT'};
    my @ReturnLinks;
    # isa() instead of a string ref() compare, so subclasses of
    # HTTP::Response are accepted as well.
    return -1 unless ref($ResponseObj) && eval { $ResponseObj->isa('HTTP::Response') };
    my $base = $ResponseObj->base;
    my $TokenParser = HTML::TokeParser->new( \$PageContent );
    while ( my $token = $TokenParser->get_tag("a") ) {
        my $url  = $token->[1]{href};
        my $text = $TokenParser->get_trimmed_text("/a"); # the link's visible title
        $url = url($url, $base)->abs; # resolve relative links against the page base
        chomp($url); chomp($text);
        # No /g flag here: a global match in scalar context keeps pos()
        # state on the target and can silently skip matches.
        push(@ReturnLinks, "$text ### $url")
            if $url =~ /^http/i && $url =~ /$Regex/i;
    }
    return \@ReturnLinks;
} # sub HtmlLinkExtractor
# Fetch $url via HTTP GET and return a hash ref:
#   OBJ => the HTTP::Response object, CNT => the response body.
# On failure the returned hash ref is empty, so callers should check for
# the 'OBJ' key before using the result.
sub Get {
    my $self    = shift;
    my $url     = shift;
    my $referer = shift || $url; # default the referer to the target itself
    my $StatusHashRef = {};
    my $UA = LWP::UserAgent->new( keep_alive => 1 );
    $UA->agent("Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; YPC 3.0.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");
    # $UA->agent("wget");
    $UA->timeout( $Timeout );
    $UA->max_size( $MaxFileSizeOfWebDocument );
    $UA->from( $AuthorEmail );
    $UA->max_redirect( $MaxRedirectRequests );
    $UA->parse_head( 1 );
    $UA->protocols_allowed(   [ 'http', 'https', 'ftp', 'ed2k' ] );
    $UA->protocols_forbidden( [ 'file', 'mailto' ] );
    $UA->requests_redirectable( [ 'HEAD', 'GET', 'POST' ] );
    # $UA->credentials( $netloc, $realm, $uname, $pass );
    # $UA->proxy( ['http', 'ftp'], 'http://proxy.sn.no:8001/' ); # use a proxy for the http and ftp schemes
    # $UA->env_proxy; # pick up http_proxy / no_proxy etc. from the environment
    # sleep $CrawlDelay; # enable to throttle consecutive requests
    my $req = HTTP::Request->new( GET => $url );
    $req->referer($referer);
    my $res = $UA->request($req);
    if ( $res->is_success ) {
        $StatusHashRef->{'OBJ'} = $res;
        $StatusHashRef->{'CNT'} = $res->content;
    }
    return $StatusHashRef;
} # sub Get
1;
__END__
# Below is stub documentation for your module. You'd better edit it!
=head1 NAME
WWW::Spiegel - Perl extension for getting news from http://www.spiegel.de/
=head1 SYNOPSIS
use WWW::Spiegel;
my $obj = WWW::Spiegel->new();
my $ResultHashRef = $obj->getNews();
while ( my ($url,$name)=each(%{$ResultHashRef})){
print "$name => $url\n";
};
=head1 DESCRIPTION
WWW::Spiegel - Perl extension for getting news from http://www.spiegel.de/
=head2 EXPORT
HtmlLinkExtractor - extraction of links from html document
getNews - getting news
Get - http get method
=head2 DEPENDENCIES
use HTML::TokeParser;
use LWP::UserAgent;
use HTTP::Request;
use URI::URL;
use strict;
=head1 SEE ALSO
http://www.zoozle.net
http://www.zoozle.org
http://www.zoozle.biz
NET::IPFilterSimple
NET::IPFilter
WWW::CpanRecent
WWW::Heise
WWW::GameStar
WWW::Popurls
WWW::Golem
WWW::Futurezone
WWW::Teamxbox
WWW::Spiegel
=head1 AUTHOR
Sebastian Enger, bigfish82 |ät! gmail?com
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2008 by Sebastian Enger
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.8 or,
at your option, any later version of Perl 5 you may have available.
=cut