package LaBrea::Tarpit::Get;
#require 5.005_62;
use strict;
#use diagnostics;
#use warnings;
use vars qw($VERSION @ISA @EXPORT_OK);
$VERSION = do { my @r = (q$Revision: 1.05 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use LaBrea::NetIO qw(open_tcp);
require Exporter;
@ISA = qw(Exporter);
@EXPORT_OK = qw(
parse_http_URL
open_http
parse_http_response
short_response
make_line
not_hour
not_day
auto_update
);
## No Autoload function, all subs are used at least once
=head1 NAME
LaBrea::Tarpit::Get
=head1 SYNOPSIS
use LaBrea::Tarpit::Get;
($rv,$host,$port,$path)=parse_http_URL($url)
($handle,$host,$port,$path)=open_http(*S,$url);
$rv=parse_http_response(\$buffer,\%response);
$rv=short_response($url,\%response,\%content,$timeout);
$line = make_line($url,$err,\%content);
$rv = not_hour($file);
$rv = not_day($file);
$rv=auto_update($url,$file,$cur_ver,$timeout);
=head1 DESCRIPTION - LaBrea::Tarpit::Get
Module connects to a web site running
LaBrea::Tarpit::Report::html_report.plx and retrieves a short_report as
described in LaBrea::Tarpit::Report.
Run C<examples/web_scan.pl> from a cron job hourly or
daily to update the statistics from all know sites running
LaBrea::Tarpit. A report can then be generated showing the activity
worldwide.
# MIN HOUR DAY MONTH DAYOFWEEK COMMAND
30 * * * * ./web_scan.pl ./other_sites.txt ./tmp/site_stats
See: LaBrea::Tarpit::Report::other_sites
=over 2
=item ($handle,$host,$port,$path)= parse_http_URL($url);
Separate an http URL into its components
input: URL of the form
http://www.foo.com[:8080]/file.html
https:// service is not supported
returns: (undef, error message)
or
(file_handle,hostname,port,path)
where port and path may be empty
=cut
sub parse_http_URL {
my ($url) = @_;
return (undef, 'URL must begin with http://')
unless $url =~ m|^http://|;
my $port = '';
my $path = '';
my $remote;
if ( $url =~ m|http://([a-zA-Z0-9\-\.]+)(/[^?]+)|i ) {
$remote = $1;
$path = $2;
} elsif ( $url =~ m|http://([a-zA-Z0-9\-\.]+):(\d+)(/[^?]+)|i ) {
$remote = $1;
$port = $2;
$path = $3;
} elsif ( $url =~ m|http://([a-zA-Z0-9\-\.]+)|i ) {
$remote = $1
} else {
return (undef,'invalid URL');
}
return (1,$remote,$port,$path);
}
=item ($handle,$host,$port,$path)=open_http(*S,$url);
Open connection to http target
input: *S,$url [default port = 80]
returns: (undef, error) on error
(file_handle,
hostname,
port
path ) on success
=cut
sub open_http {
my ($S,$x) = @_;
my ($s,$remote,$port,$path) = parse_http_URL($x);
return (undef,$remote) unless $s; # return error if any
return (undef,'missing filename') unless $path;
$port = 80 unless $port;
$x = open_tcp($S,$remote,$port);
return (undef,$x) if $x;
return ($S,$remote,$port,$path)
}
=item $rv=parse_http_response(\$buffer,\%response);
Parse an http server response into a hash of headers.
i.e. (representative, will vary)
rc => 200
msg => OK
date => Wed, 24 Apr 2002 21:46:30 GMT
server => Apache/1.3.22
protocol => HTTP/1.1
content-type => text/plain
content-length => 92
last-modified => Wed, 24 Apr 2002 21:46:34 GMT
expires => Wed, 24 Apr 2002 21:47:04 GMT
connection => close
content => (complete text buffer)
input: \$text_in, \%response
returns: true on success, %response filled
false on failure
NOTE: %response{rc} (server response code)
%response(msg} (server messages)
are ALWAYS filled with something.
In the case of server failure, the
cause of the failure will be inserted
into %response(msg} and undef returned.
=cut
###################################################
# parse_http_response
#
# input: \$buffer,\%response
# return: true on success, else false
# response is filled
#
sub parse_http_response {
my ($b,$r) = @_;
$$b =~ s/\r//g; # remove dos returns
@_ = split('\n',$$b);
%$r = ();
# get response protocol and response code
unless ( $_[0] =~ /([^\s]+)\s+(\d+)\s*(.*)/ ) {
$r->{rc} = '';
$r->{msg} = 'unknown server response';
return undef;
} else {
$r->{protocol} = $1;
$r->{rc} = $2;
$r->{msg} = $3 || '';
return undef unless $2 == 200; # response OK
}
shift; # zap server response
unless (@_) {
$r->{msg} = 'no headers from server';
return undef;
}
while( $_ = shift @_ ) {
last unless $_;
my ($key,$val) = split(/:\s+/,$_,2);
$r->{lc $key} = $val;
}
$r->{content} = '';
unless (@_) {
$r->{msg} = 'no content, no data found';
return undef;
}
while( @_ ) {
$r->{content} .= (shift @_) . "\n";
}
1;
}
=item $rv=short_response($url,\%response,\%content,$timeout);
Fetch the short report from C<$url> and place the headers in C<%response>,
the content, parsed, in C<%content>. Optional C<$timeout>, default is 60
seocnds.
%response contains http headers
%content contains key => value pairs
LaBrea => version
Tarpit => version
Report => version
Util => version
now => seconds since epoch (local)
tz => time zone (i.e. -0700)
threads => number of threads
total_IPs => total IP's
bw => bandwidth
input: URL, # complete url
i.e. www.foo.com/html_report.plx
\%response,
\%content,
returns: false on success
error message on failure
=cut
sub short_response {
my ($url,$rsp,$cnt,$timr) = @_;
local *S;
my ($s,$r,$port,$path) = open_http(*S,$url);
return $r unless $s;
$timr = 60 unless $timr;
my $max = 1024; # maximum response size
# including headers
my $buffer = '';
eval {
local $SIG{ALRM} = sub {
close $s;
die 'short_response TIMEOUT';
};
alarm $timr;
print $s qq
|GET $path?short HTTP/1.0
Host: $r:$port
User-Agent: LaBrea::Tarpit::Get $VERSION
|;
while ( $_ = readline($s) ) {
$buffer .= $_;
last if length($buffer) > $max;
}
close $s;
alarm 0;
};
return 'timeout, failed to get short response'
if $@ =~ /short_response TIMEOUT/;
return $@ if $@; # show other errors
return 'invalid short response, no data'
unless $buffer;
return $rsp->{rc} . ' ' . $rsp->{msg}
unless parse_http_response(\$buffer,$rsp);
return 'invalid content-type ' . $rsp->{'content-type'}
unless $rsp->{'content-type'} =~ m|text/plain|i;
%$cnt = split(/[=\n]/,$rsp->{content});
return 'invalid data in short response'
unless exists $cnt->{LaBrea} &&
exists $cnt->{Tarpit} &&
exists $cnt->{Report} &&
exists $cnt->{Util} &&
exists $cnt->{now} &&
exists $cnt->{tz} &&
exists $cnt->{threads} &&
exists $cnt->{total_IPs} &&
exists $cnt->{bw};
0;
}
=item $line = make_line($url,$err,\%content);
Make a line of text summarizing the short report where C<$err> is the return
value from C<short_report>
Format:
url threads total_IPs bw time tz version:nn:nn:nn
or
url error message
=cut
sub make_line {
my ($url,$err,$cnt) = @_;
return "$url ", ($err ||
"$cnt->{threads} $cnt->{total_IPs} $cnt->{bw} $cnt->{now} $cnt->{tz} $cnt->{LaBrea}:$cnt->{Tarpit}:$cnt->{Report}:$cnt->{Util}");
}
=item $rv = not_hour($file);
Check if the file has been accessed this hour;
input: path/to/file
returns: true, not current hour
false if accessed this hour
or non-existent or not readable
=cut
sub not_hour {
return undef unless -e $_[0] && -r $_[0];
my @old = localtime((stat($_[0]))[8]);
@_ = localtime(time);
return $old[2] != $_[2] || $old[3] != $_[3];
}
=item $rv = not_day($file);
Check if the file has been accessed this day;
input: path/to/file
returns: true, not accessed this day
false if accessed this day
or non-existent or not readable
=cut
sub not_day {
return 1 unless -e $_[0] && -r $_[0];
return (localtime((stat($_[0]))[8]))[3] != (localtime(time))[3];
}
=item $rv=auto_update($url,$file,$cur_ver,$timeout);
Update the 'other_sites.txt' file from $url on a daily
basis only.
input: url, # complete url to 'other_sites.txt'
# http://scans.bizsystems.net/other_sites.txt
file, # path to your 'other_sites.txt'
cur_ver # optional current version
# the current file will be opened and scanned
# if this is not supplied
timeout # wait for http response
# default 60 seconds
returns: false on success or no update needed
error msg on failure
=back
=cut
sub auto_update {
my ($url,$file,$cur_ver,$timr,$debug) = @_;
$timr = 60 unless $timr;
local *S;
my ($S,$host,$port,$path)=open_http(*S,$url);
return $host unless $S; # return error message
my $buffer = '';
eval {
local $SIG{ALRM} = sub {
close $S;
die 'auto_update TIMEOUT';
};
alarm $timr;
print $S <<EOF;
GET /$path HTTP/1.0
Host: $host:$port
User-Agent: LaBrea::Tarpit::Get $VERSION
EOF
while (<$S>) {
$buffer .= $_;
}
close $S;
alarm 0;
}; # end eval
return 'url timed out' if $@ =~ /TIMEOUT/;
return $@ if $@; # return errors
my %response;
parse_http_response(\$buffer,\%response);
return 'failed to find version number'
unless $response{content} =~ /VERSION\s*=\s*(\d+)/;
my $new_ver = $1;
unless ($cur_ver) { # sigh.... must get old version number
# very inefficient
return "failed to open $file"
unless open(S,$file);
while (<S>) {
next unless $_ =~ /VERSION\s*=\s*(\d+)/;
$cur_ver = $1;
last;
}
close S;
}
return 'failed to find current version number'
unless $cur_ver;
if ( $cur_ver < $new_ver ) {
return "failed to open $file..tmp for update"
unless open(S,'>'.$file.'.tmp');
$_ = select S;
$| = 1;
select $_;
print S $response{content};
close S;
# atomic update
rename $file .'.tmp', $file unless $debug;
}
return undef;
}
1;
__END__
=head1 EXPORT_OK
parse_http_URL
open_http
parse_http_response
short_response
make_line
not_hour
not_day
auto_update
=head1 COPYRIGHT
Copyright 2002 - 2004, Michael Robinton & BizSystems
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
=head1 AUTHOR
Michael Robinton, michael@bizsystems.com
=head1 SEE ALSO
perl(1), LaBrea::Tarpit(3), LaBrea::Codes(3), LaBrea::Tarpit::Report(3),
LaBrea::Tarpit::Util(3), LaBrea::Tarpit::DShield(3)
=cut