# $Id$
package HTTP::Size;
use strict;
use warnings;
no warnings;
=head1 NAME
HTTP::Size - Get the byte size of an internet resource
=head1 SYNOPSIS
use HTTP::Size
my $size = HTTP::Size::get_size( $url );
if( defined $size )
{
print "$url size was $size";
}
elsif( $HTTP::Size::ERROR == $HTTP::Size::INVALID_URL )
{
print "$url is not a valid absolute URL";
}
elsif( $HTTP::Size::ERROR == $HTTP::Size::COULD_NOT_FETCH )
{
print "Could not fetch $url\nHTTP status is $HTTP::Size::HTTP_STATUS";
}
elsif( $HTTP::Size::ERROR == $HTTP::Size::BAD_CONTENT_LENGTH )
{
print "Could not determine content length of $url";
}
=head1 DESCRIPTION
=head1 VARIABLES
The following global variables describes conditions from the last
function call:
$ERROR
$HTTP_STATUS
The C<$ERROR> variable may be set to any of these values:
$INVALID_URL - the URL is not a valid absolute URL
$COULD_NOT_FETCH - the function encountered an HTTP error
$BAD_CONTENT_LENGTH - could not determine a content type
The module does not export these variables, so you need to use
the full package specification outside of the HTTP::Size
package.
=cut
use subs qw( get_size _request );
use vars qw(
$ERROR $HTTP_STATUS $VERSION
$INVALID_URL $COULD_NOT_FETCH $BAD_CONTENT_LENGTH
$CONTENT $CONTENT_TYPE
);
use LWP::UserAgent;
use URI;
use HTTP::Request;
$VERSION = '1.13_01';
my $User_agent = LWP::UserAgent->new();
$INVALID_URL = -1;
$COULD_NOT_FETCH = -2;
$BAD_CONTENT_LENGTH = -3;
=head1 FUNCTIONS
=over 4
=item get_size( URL )
Fetch the specified absolute URL and return its content length.
The URL can be a string or an URI object. The function tries
the HEAD HTTP method first, and on failure, tries the GET method.
In either case it sets $HTTP_STATUS to the HTTP response code.
If the response does not contain a Content-Length header, the
function takes the size of the message body. If the HEAD method
returned a good status, but no Content-Length header, it retries
with the GET method.
On error, the function set $ERROR to one of these values:
$INVALID_URL - the URL is not a valid absolute URL
$COULD_NOT_FETCH - the function encountered an HTTP error
$BAD_CONTENT_LENGTH - could not determine a content type
=cut
sub get_size
{
my $url = shift;
my $method = shift || 0;
_init();
unless( ref $url eq 'URI' )
{
$url = URI->new( $url );
}
unless( $url->scheme )
{
$ERROR = $INVALID_URL;
return;
};
my $response = '';
my $size = 0;
unless( $method )
{
my $request = HTTP::Request->new( HEAD => $url->as_string );
$response = _request( $request );
$HTTP_STATUS = $response->code;
$size = $response->content_length;
}
unless( not $method and $response->is_success and $size )
{
my $request = HTTP::Request->new( GET => $url->as_string );
$response = _request( $request );
$HTTP_STATUS = $response->code;
$CONTENT = $response->content;
unless( $response->is_success )
{
$ERROR = $COULD_NOT_FETCH;
return;
}
elsif( not $response->content_length )
{
$size = length $CONTENT;
}
elsif( $response->content_length )
{
$size = $response->content_length;
}
}
$CONTENT_TYPE = lc $response->content_type;
return $size;
}
=item get_sizes( URL, BASE_URL )
The get_sizes function is like get_size, although for HTML pages
it also fetches all of the images then sums the sizes of the
original page and image sizes. It returns a total download size.
In list context it returns the total download size and a hash
reference whose keys are the URLs that a browser should download
automatically (images):
size
ERROR
HTTP_STATUS
The ERROR and HTTP_STATUS correspond to the values of $ERROR and
$HTTP_STATUS for that URL.
my ( $total, $hash ) = HTTP::Size::get_sizes( $url );
foreach my $key ( keys %$hash )
{
print "$key had an error" unless defined $size;
}
The hash is always returned in list context (a change from
version 0.4).
Relative image links resolve accroding to BASE_URL, or by
a found BASE tag. See L<HTML::SimpleLinkExtor>.
Javascript and style sheet links are unimplemented right now.
=cut
sub get_sizes
{
my $url = shift;
my $base = shift;
my %hash;
my $size = get_size( $url, 'GET' );
@{$hash{$url}}{ qw(size ERROR HTTP_STATUS) }
= ($size, $ERROR, $HTTP_STATUS);
unless( $size and $CONTENT_TYPE eq 'text/html' )
{
return wantarray ? ( $size, \%hash ) : $size;
}
require HTML::SimpleLinkExtor;
my $total = $size;
my $extor = HTML::SimpleLinkExtor->new( $url );
$extor->parse( $CONTENT );
foreach my $img ( $extor->img )
{
my $size = get_size( $img ) || 0;
@{$hash{$img}}{ qw(size ERROR HTTP_STATUS) }
= ( $size, $ERROR, $HTTP_STATUS );
$total += $size;
}
return wantarray ? ( $total, \%hash ) : $total;
}
sub _init
{
$ERROR = $CONTENT_TYPE = $CONTENT = $HTTP_STATUS = '';
}
sub _request
{
my $response = $User_agent->request( shift );
$HTTP_STATUS = $response->code;
return $response;
}
=back
=head1 TO DO
* if i have to use GET, i should use Byte-Ranges to avoid
downloading the whole thing
* add a way to specify Basic Auth credentials
* download javascript and style sheets too.
=head1 SEE ALSO
L<HTML::SimpleLinkExtor>
=head1 SOURCE AVAILABILITY
This source is part of a SourceForge project which always has the
latest sources in CVS, as well as all of the previous releases.
http://sourceforge.net/projects/brian-d-foy/
If, for some reason, I disappear from the world, one of the other
members of the project can shepherd this module appropriately.
=head1 AUTHOR
brian d foy, C<< <bdfoy@cpan.org> >>
=head1 COPYRIGHT AND LICENSE
Copyright (c) 2000-2007 brian d foy. All rights reserved.
This program is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=cut
1;