The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Win32::UrlCache;

use strict;
use warnings;
use Carp;

BEGIN {
  if ( $^O eq 'MSWin32' ) {
    require Win32::UrlCache::FileTime;
    Win32::UrlCache::FileTime->import;
  }
  else {
    require Win32::UrlCache::FileTimePP;
    Win32::UrlCache::FileTimePP->import;
  }
}

our $VERSION = '0.06';

use constant BADFOOD => chr(0x0D).chr(0xF0).chr(0xAD).chr(0x0B);

sub new {
  my $class = shift;
  my $self = bless {}, $class;

  $self->_read_file( @_ );
  $self->_version_check;
  $self->_size_check;
  $self->_get_pointer_to_first_hash;

  $self;
}

sub _read_file {
  my ($self, $file) = @_;

  open my $fh, '<', $file or croak $!;
  binmode $fh;
  sysread $fh, ( my $data ), ( my $size = -s $fh );
  close $fh;

  $self->{_data} = $data;
  $self->{_size} = $size;
  $self->{_pos}  = 0;
}

sub _version_check {
  my $self = shift;
  my $header = 'Client UrlCache MMF Ver 5.2';
  my $read   = $self->_read_string;
  unless ( $read eq $header ) {
    croak "unsupported file type: $read";
  }
}

sub _size_check {
  my $self = shift;
  my $read = _to_int( $self->_read );
  unless ( $read == $self->{_size} ) {
    croak "index file seems broken: $read / ".$self->{_size};
  }
}

sub _get_pointer_to_first_hash {
  my $self = shift;

  $self->{_from} = _to_int( $self->_read );
}

sub _read_hashes {
  my ($self, $target, %options) = @_;

  my $pointer = $self->{_from};

  while( $pointer ) {
    unless ( $self->_read_from( $pointer ) eq 'HASH' ) {
      croak "index file seems broken: HASH not found";
    }
    my $hash_length = _to_int( $self->_read );
    my $next_hash   = _to_int( $self->_read );
    my $unknown     = _to_int( $self->_read );
    my $hash_end    = $pointer + ( $hash_length * 0x80 );

    while ( $self->{_pos} < $hash_end ) {
      my ( $hashkey, $offset ) = ( $self->_read, $self->_read );
      next if $offset eq BADFOOD;

      my $int_offset = _to_int( $offset );
      next unless $int_offset;

      # last of the offset should be 0x80/0x00 (not 0x03 etc)
      next unless ( $int_offset & 0xf ) == 0;

      my $tag = $self->_test_from( $int_offset );
      next if $tag eq BADFOOD;

      if ( $tag =~ /^(?:URL|REDR|LEAK)/ ) {
        next if $target && $target ne $tag;

        my $pos = $self->{_pos};
        $self->_read_entry( $int_offset, %options );
        $self->{_pos} = $pos;
      }
    }
    $pointer = $next_hash or last;
  }
}

sub _read_entry {
  my ($self, $offset, %options) = @_;

  my $tag = $self->_read_from( $offset );
     $tag =~ s/ $//;
  my $class = 'Win32::UrlCache::'.$tag;

  my $item;
  if ( $tag eq 'REDR' ) {
    my $block   = $self->_read;
    my $unknown = $self->_read(8);
    my $url     = $self->_read_string;

    $item = { url => $url };
  }
  if ( $tag eq 'URL' or $tag eq 'LEAK' ) {
    my $block              = $self->_read;
    my $last_modified      = filetime( $self->_read(8) );
    my $last_accessed      = filetime( $self->_read(8) );
    my $maybe_expire       = $self->_read(8);
    my $maybe_filesize     = $self->_read(8);
    my $unknown            = $self->_read(20);
    my $offset_to_filename = _to_int( $self->_read );
    my $unknown2           = $self->_read;
    my $offset_to_headers  = _to_int( $self->_read );
    my $unknown3           = $self->_read(32);
    my $url                = $self->_read_string;
    my $filename           = $offset_to_filename
      ? $self->_read_string_from( $offset + $offset_to_filename )
      : '';
    my $headers            = $offset_to_headers
      ? $self->_read_string_from( $offset + $offset_to_headers )
      : '';

    $item = {
      url           => $url,
      filename      => $filename,
      headers       => $headers,
      filesize      => $maybe_filesize,
      last_modified => $last_modified,
      last_accessed => $last_accessed,
    };
  }
  return unless $item;

  my $object = bless $item, $class;
  if ( $options{callback} ) {
    my $ret = $options{callback}->( $object );
    return unless $ret;
  }

  if ( $options{extract_title} && $item->{filename} && $^O eq 'MSWin32' ) {
    require Win32::UrlCache::Title;
    Win32::UrlCache::Title->import;
    $item->title( Win32::UrlCache::Title->extract( $item->filename ) );
  }

  push @{ $self->{$tag} ||= [] }, $object;
}

sub urls  {
  my $self = shift;
  $self->_read_hashes( 'URL ', @_ );
  return @{ $self->{URL} || [] };
}

sub redrs  {
  my $self = shift;
  $self->_read_hashes( 'REDR', @_ );
  return @{ $self->{REDR} || [] };
}

sub leaks  {
  my $self = shift;
  $self->_read_hashes( 'LEAK', @_ );
  return @{ $self->{LEAK} || [] };
}

sub _to_int {
  my $dword = shift;
  my @bytes = split //, $dword;
  return (
    ord( $bytes[3] ) * (256 ** 3) +
    ord( $bytes[2] ) * (256 ** 2) +
    ord( $bytes[1] ) * (256 ** 1) +
    ord( $bytes[0] ) * (256 ** 0)
  );
}

sub _read {
  my ($self, $length) = @_;

  $length ||= 4;
  my $str = substr( $self->{_data}, $self->{_pos}, $length );
  $self->{_pos} += $length;
  return $str;
}

sub _read_from {
  my ($self, $from, $length) = @_;
  $self->{_pos} = $from;
  $self->_read( $length );
}

sub _read_string {
  my $self = shift;
  my $from = $self->{_pos};
  my $to   = index( $self->{_data}, "\000", $from );
  my $str  = substr( $self->{_data}, $from, $to - $from );
  $self->{_pos} = $to + 1;
  return $str;
}

sub _read_string_from {
  my ($self, $from) = @_;
  $self->{_pos} = $from;
  $self->_read_string;
}

sub _test_from {
  my ($self, $from, $length) = @_;
  $length ||= 4;
  $from     = $self->{_pos} unless defined $from;
  return substr( $self->{_data}, $from, $length );
}

package #
  Win32::UrlCache::URL;

use strict;
use warnings;
use base qw( Class::Accessor::Fast );

__PACKAGE__->mk_accessors(qw(
  url filename headers filesize last_modified last_accessed
  title
));

package #
  Win32::UrlCache::LEAK;

use strict;
use warnings;
use base qw( Class::Accessor::Fast );

__PACKAGE__->mk_accessors(qw(
  url filename headers filesize last_modified last_accessed
  title
));

package #
  Win32::UrlCache::REDR;

use strict;
use warnings;
use base qw( Class::Accessor::Fast );

__PACKAGE__->mk_accessors(qw( url ));

1;

__END__

=head1 NAME

Win32::UrlCache - parse Internet Explorer's history/cache/cookies

=head1 SYNOPSIS

    use Win32::UrlCache;
    my $index = Win32::UrlCache->new( 'index.dat' );
    foreach my $url ( $index->urls ) {
      print $url->url, "\n";
    }

    Or, you can use callback function if you care memory usage.

    use Win32::UrlCache;
    my $index = Win32::UrlCache->new( 'index.dat' );
    $index->urls( callback => \&callback )

    sub callback {
      my $entry = shift;
      my $url = $entry->url;
         $url =~ s/^Visited: //;
      $entry->url( $url );

      print $entry->url, "\n";
      return;  # to prevent the entry from being kept in the object
    }

    If you want to know the title of the cached page (for Win32 only):

    use Win32::UrlCache::Cache;
    use Win32::UrlCache::Title;
    use Encode;
    my $cache = Win32::UrlCache::Cache->new;
       $cache->urls( callback => \&callback )

    sub callback {
      my $entry = shift;

      print $entry->url, "\n";
      my $title = Win32::UrlCache::Title->extract( $entry->filename );
      print encode( shiftjis => $title ), "\n\n" if $title;

      return;
    }

=head1 DESCRIPTION

This parses so-called "Client UrlCache MMF Ver 5.2" index.dat files, which are used to store Internet Explorer's history, cache, and cookies. As of writing this, I've only tested on Win2K + IE 6.0, but I hope this also works with some of the other versions of OS/Internet Explorer. However, note that this is not based on the official/public MSDN specification, but on a hack on the web. So, caveat emptor in every sense, especially for the redr entries ;)

Patches and feedbacks are welcome.

=head1 METHODS

=head2 new

receives a path to an 'index.dat', and parses it to create an object.

=head2 urls

returns URL entries in the 'index.dat' file. Each entry has url, filename, headers, filesize, last_modified, last_accessed, and optionally, title accessors (note that some of them would return meaningless values). As of 0.02, it can receive a callback function. See below. As of 0.04, you can also pass ( extract_title => 1 ) to extract title. However, this extraction is processed after a callback. So, if you want both to use a callback and to extract title, you might want to insert extraction code into the callback as shown in the synopsis.

=head2 leaks

almost the same as urls, but returns LEAK entries (if any) in the 'index.dat' file.

=head2 redrs

returns REDR entries (if any) in the 'index.dat' file. Each entry has a url accessor. As of 0.02, it can receive a callback function.

=head1 CALLBACK

Three methods shown above return all the entries found in the index by default, but this may eat lots of memory especially if you use IE as a main browser. As of 0.02, those methods may receive a callback function, which will take an entry for the first (and only, as of writing this) argument. If the callback returns true, the entry will be stored in the ::UrlCache object, and if the callback returns false, the entry will be discarded after the callback is executed.

=head1 SEE ALSO

L<http://www.latenighthacking.com/projects/2003/reIndexDat/>

=head1 AUTHOR

Kenichi Ishigaki, E<lt>ishigaki at cpan.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2007 by Kenichi Ishigaki.

This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=cut