The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Archive::Heritrix;

use 5.008005;
use strict;
use warnings;
use Compress::Zlib;
use File::Find;
use HTTP::Response;

our $VERSION = 0.02;

sub new {
  my ( $class, %arg ) = @_;
  my $self = bless {}, $class;

  if ( $arg{ 'file' } && $arg{ 'directory' } ) {
    die "file or directory, not both";
  }

  my @files = ();

  if ( $arg{ 'file' } ) {
    if ( ! -f $arg{ 'file' } ) {
      die "no such file";
    }
    @files = ( $arg{ 'file' } );
  }
  if ( $arg{ 'directory' } ) {
    if ( ! -d $arg{ 'directory' } ) {
      die "no such directory";
    }
    find( sub { push @files, $File::Find::name if $File::Find::name =~ /\.arc\.gz$/ },  $arg{ 'directory' } );
  }

  $self->{ 'files' } = \@files;
  $self->next_file();
  return $self;
}

sub next_file {
  my $self = shift;
  my $f = shift @{ $self->{ 'files' } };
  if ( ! $f ) {
    $self->{ '_fh' } = undef;
    return undef;
  }
  my $gz = gzopen( $f, 'rb' );
  return undef unless $gz;
  $self->{ '_fh' } = $gz;
}

sub next_record {
  my $self = shift;
  my $fh = $self->_fh();

  if ( ! $fh ) {
    if ( $self->next_file() ) {
      $fh = $self->_fh();
    }
    else {
      return undef;
    }
  }

  my $head;
  $fh->gzreadline( $head );
  chomp $head;
  my ($url,$ip,$stamp,$type,$length) = $head =~ m/^(.+?) ([\d\.]+) (\d+) (\S+) (\d+)$/;
  if ( ! $url ) {
    $self->next_file();
    $fh = $self->_fh();
    return undef unless $fh;
    $fh->gzreadline( $head );
    chomp $head;
    ($url,$ip,$stamp,$type,$length) = $head =~ m/^(.+?) ([\d\.]+) (\d+) (\S+) (\d+)$/;
  }
  return undef unless $url;

  my $buf = undef;
  my $read = 0;
  my $l0 = undef;
  while ( $read <= $length ) {
    my $bbuf;
    $fh->gzreadline( $bbuf );
    $l0 ||= $bbuf;
    my $got = length( $bbuf );
    $buf .= $bbuf;
    $read += $got;
  }
  my ( $code, $msg ) = $l0 =~ m#(\d{3})\s+(.+?)[\r\n]*$#;
  my $res = HTTP::Response->parse( $buf );
  $res->{'_headers'}->content_length($length);
  $res->{'_headers'}->referer($url);
  $res->code( $code );
  $res->message( $msg );

  return $res;
}

sub _fh {
  return shift->{ '_fh' };
}

1;
__END__
=head1 NAME

Archive::Heritrix - Perl extension for processing Heritrix archive (.arc) files

=head1 SYNOPSIS

  use Archive::Heritrix;
  my $arc;

  #open a single .arc.gz archive
  $arc = Archive::Heritrix->new( file => 'a.arc.gz' );
  while ( my $rec = $arc->next_record() ) {
    #it's a HTTP::Response object
  }

  #open a directory of .arc.gz archives.  matches recursively on file extension
  $arc = Archive::Heritrix->new( directory => 'eg' );
  while ( my $rec = $arc->next_record() ) {
    #it's a HTTP::Response object
  }

=head1 DESCRIPTION

Process Heritrix archive (arc) files as a stream of HTTP::Response objects.

Heritrix is the archival-grade crawler used by the Internet Archive.

=head1 SEE ALSO

  Heritrix project homepage, http://crawler.archive.org

=head1 AUTHOR

Allen Day, E<lt>allenday@ucla.eduE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2008 by Allen Day

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.5 or,
at your option, any later version of Perl 5 you may have available.

=cut