package WWW::Spiegel;
use strict;
use warnings;
use HTML::TokeParser;
use LWP::UserAgent;
use HTTP::Request;
use URI::URL;
require Exporter;
our @ISA = qw(Exporter);
# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.
# This allows declaration use WWW::Spiegel ':all';
# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
# will save memory.
our %EXPORT_TAGS = ( 'all' => [ qw(
HtmlLinkExtractor
getNews
Get
) ] );
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
our @EXPORT = qw(
HtmlLinkExtractor
getNews
Get
);
our $VERSION = '1.0';
# Site entry point and the pattern that identifies article links.
my $Url = "http://www.spiegel.de/";
# Article URLs on spiegel.de end in ",00.html"; use qr// so the dot is a
# literal dot (the old double-quoted ",00\.html" degraded to a wildcard ".").
my $Regex = qr/,00\.html/;
######
# LWP::UserAgent configuration used by Get().
my $MaxFileSizeOfWebDocument = (50 * 1024 * 1024); # 50 MB response size cap
my $MaxRedirectRequests = 15;
my $AuthorEmail = 'yourname@cpan.org';
my $Timeout = 25; # seconds
my $CrawlDelay = int(rand(3)); # 0-2 s; only used if the sleep in Get() is enabled
my $Referer = "http://www.google.com/";
my $DEBUG = 1;
######
# Constructor.  Accepts attributes either as a flat hash or as a single
# hash ref, blesses them, and immediately crawls the front page via
# _init().  NOTE: new() therefore performs network I/O.
# Returns the blessed object.
sub new {
    my $class = shift;
    # Allow both Class->new( %args ) and Class->new( \%args ).
    my %args = ref($_[0]) ? %{ $_[0] } : @_;
    my $self = bless { %args }, $class;
    $self->_init();
    return $self;
} # sub new
# Private: fetch the start page ($Url), extract all article links, and
# cache them in $self->{'_CONTENT_ARRAY_REF'} for getNews().
# Performs HTTP I/O.  Returns $self.
sub _init {
    my $self = shift;
    my $HashRef = $self->Get($Url);
    my $ArrRef  = $self->HtmlLinkExtractor($HashRef);
    $self->{'_CONTENT_ARRAY_REF'} = $ArrRef;
    return $self;
} # sub _init
# Deduplicate the link list collected by _init() and return it as a hash
# ref mapping url => link title.  The first title seen for a URL wins.
#
# Fixes two defects of the original: "my %h = {};" assigned a hash
# REFERENCE to a hash, which injected a bogus "HASH(0x...)" => undef
# entry into the returned result (and warns under "use warnings"); and a
# second bookkeeping hash duplicated what the result hash already knows.
sub getNews {
    my $self   = shift;
    my $ArrRef = $self->{'_CONTENT_ARRAY_REF'};
    my %ReturnLinks;
    foreach my $entry ( @{$ArrRef} ) {
        # Entries were stored as "title ### url" by HtmlLinkExtractor().
        my ($linkname, $url) = split(' ### ', $entry);
        # Keep only the first occurrence of each URL.
        $ReturnLinks{$url} = $linkname unless exists $ReturnLinks{$url};
    }
    return \%ReturnLinks;
} # sub getNews
# Preloaded methods go here.
# Extract "<link text> ### <absolute url>" strings for every <a> tag in a
# page whose resolved URL is http(s) and matches the article pattern
# ($Regex).  Expects the hash ref produced by Get():
#   OBJ => HTTP::Response object, CNT => page body.
# Returns an array ref of link strings, or -1 when OBJ is not a response.
sub HtmlLinkExtractor {
    my $self        = shift;
    my $HashRef     = shift;
    my $ResponseObj = $HashRef->{'OBJ'};
    my $PageContent = $HashRef->{'CNT'};
    my @ReturnLinks;
    # isa() instead of a string ref() compare, so subclasses of
    # HTTP::Response are accepted as well.
    return -1 unless ref($ResponseObj) && eval { $ResponseObj->isa('HTTP::Response') };
    my $base = $ResponseObj->base;
    my $TokenParser = HTML::TokeParser->new( \$PageContent );
    while ( my $token = $TokenParser->get_tag("a") ) {
        my $url  = $token->[1]{href};
        my $text = $TokenParser->get_trimmed_text("/a"); # the link's visible title
        $url = url($url, $base)->abs; # resolve relative links against the page base
        chomp($url); chomp($text);
        # No /g flag here: a global match in scalar context keeps pos()
        # state on the target and can silently skip matches.
        push(@ReturnLinks, "$text ### $url")
            if $url =~ /^http/i && $url =~ /$Regex/i;
    }
    return \@ReturnLinks;
} # sub HtmlLinkExtractor
# Fetch $url via HTTP GET and return a hash ref:
#   OBJ => the HTTP::Response object, CNT => the response body.
# On failure the returned hash ref is empty, so callers should check for
# the 'OBJ' key before using the result.
sub Get {
    my $self    = shift;
    my $url     = shift;
    my $referer = shift || $url; # default the referer to the target itself
    my $StatusHashRef = {};
    my $UA = LWP::UserAgent->new( keep_alive => 1 );
    $UA->agent("Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; YPC 3.0.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");
    # $UA->agent("wget");
    $UA->timeout( $Timeout );
    $UA->max_size( $MaxFileSizeOfWebDocument );
    $UA->from( $AuthorEmail );
    $UA->max_redirect( $MaxRedirectRequests );
    $UA->parse_head( 1 );
    $UA->protocols_allowed(   [ 'http', 'https', 'ftp', 'ed2k' ] );
    $UA->protocols_forbidden( [ 'file', 'mailto' ] );
    $UA->requests_redirectable( [ 'HEAD', 'GET', 'POST' ] );
    # $UA->credentials( $netloc, $realm, $uname, $pass );
    # $UA->proxy( ['http', 'ftp'], 'http://proxy.sn.no:8001/' ); # use a proxy for the http and ftp schemes
    # $UA->env_proxy; # pick up http_proxy / no_proxy etc. from the environment
    # sleep $CrawlDelay; # enable to throttle consecutive requests
    my $req = HTTP::Request->new( GET => $url );
    $req->referer($referer);
    my $res = $UA->request($req);
    if ( $res->is_success ) {
        $StatusHashRef->{'OBJ'} = $res;
        $StatusHashRef->{'CNT'} = $res->content;
    }
    return $StatusHashRef;
} # sub Get
1;
__END__
# Below is stub documentation for your module. You'd better edit it!
=head1 NAME
WWW::Spiegel - Perl extension for getting news from http://www.spiegel.de/
=head1 SYNOPSIS
use WWW::Spiegel;
my $obj = WWW::Spiegel->new();
my $ResultHashRef = $obj->getNews();
while ( my ($url,$name)=each(%{$ResultHashRef})){
print "$name => $url\n";
};
=head1 DESCRIPTION
WWW::Spiegel - Perl extension for getting news from http://www.spiegel.de/
=head2 EXPORT
HtmlLinkExtractor - extraction of links from html document
getNews - getting news
Get - http get method
=head2 DEPENDENCIES
use HTML::TokeParser;
use LWP::UserAgent;
use HTTP::Request;
use URI::URL;
use strict;
=head1 SEE ALSO
http://www.zoozle.net
http://www.zoozle.org
http://www.zoozle.biz
NET::IPFilterSimple
NET::IPFilter
WWW::CpanRecent
WWW::Heise
WWW::GameStar
WWW::Popurls
WWW::Golem
WWW::Futurezone
WWW::Teamxbox
WWW::Spiegel
=head1 AUTHOR
Sebastian Enger, bigfish82 |ät! gmail?com
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2008 by Sebastian Enger
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.8 or,
at your option, any later version of Perl 5 you may have available.
=cut