The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Email::Fingerprint::Cache;
use Class::Std;

use warnings;
use strict;

use Carp qw( croak cluck );
use Scalar::Util qw( reftype blessed );

=head1 NAME

Email::Fingerprint::Cache - Cache observed email fingerprints

=head1 VERSION

Version 0.01

=cut

our $VERSION = '0.01';

=head1 SYNOPSIS

    use Email::Fingerprint::Cache;

    my %fingerprints;           # To access cache contents

    # Create a cache
    my $cache     =  new Email::Fingerprint::Cache(
        backend   => "AnyDBM",
        hash      => \%fingerprints,
        file      => $file,             # Created if doesn't exist
        ttl       => 3600 * 24 * 7,     # Purge records after one week
    );

    # Prepare it for use
    $cache->lock or die "Couldn't lock: $!";    # Waits for lock
    $cache->open or die "Couldn't open: $!";

    # Work with fingerprints
    for my (@message_fingerprints) {

        if ($fingerprints{$_}) {
            print "Fingerprint found: $_\n";
            next;
        }

        my $now = time;
        $fingerprints{$_} = $now;

        print "Fingerprint added: $_\n";
    }

    # Get rid of old records
    $cache->purge;

    # Print a listing of all fingerprints
    $cache->dump;

    # Finish up
    $cache->close;
    $cache->unlock;

=head1 ATTRIBUTES

=cut

my %hash    :ATTR( :get<hash> )                  = ();
my %ttl     :ATTR( :name<ttl> :default(604800) ) = ();
my %backend :ATTR( :init_arg<backend> :default('AnyDBM') ) = ();

=head1 METHODS

=head2 new

    my $fingerprint =  new Email::Fingerprint::Cache(
        file        => $file,   # Default: .maildups
        backend     => "AnyDBM",  # Default: "AnyDBM"
        ttl         => $sec,    # Default: 3600*24*7
        hash        => $ref,    # Optional
    );

Returns a new Email::Fingerprint::Cache. The cache must still be opened
before it can be used.

=head2 BUILD

Internal helper method; never called directly by users.

=cut

sub BUILD {
    my ( $self, $ident, $args ) = @_;

    # Default hash is a fresh-n-tasty anonymous hash
    $hash{$ident} = defined $args->{hash} ? $args->{hash} : {};

    # Backend will also need access to the hash
    $args->{hash} = $hash{$ident};

    # Default backend is AnyDBM
    my $backend = defined $args->{backend} ? $args->{backend} : 'AnyDBM';

    # Default cache file
    $args->{file} ||= '.maildups';

    # Try accessing package as a subclass of Email::Fingerprint::Cache
    my $package = __PACKAGE__ . "::" . $backend;
    eval "use $package";                                    ## no critic

    # Try accessing package using the given name exactly. If this fails,
    # we try constructing a backend anyway, in case the module is already
    # imported--or, e.g., defined in the script file itself.
    if ($@) {
        $package = $backend;
        eval "use $package";                                ## no critic
    }

    undef $backend;

    # Perhaps the correct module was loaded by our caller;
    # try instantiating the backend even if the above steps
    # all failed.
    eval {
        $backend =  $package->new({
            file => $args->{file},
            hash => $args->{hash},
        });
    };

    # It's a fatal error if the backend doesn't load
    croak "Can't load backend module" if $@ or not $backend;

    $backend{$ident} = $backend;
}

=head2 set_file

  $file = $cache->set_file( 'foo' ) or die "Failed to set filename";
  # now $file eq 'foo.db' or 'foo.dir', etc., depending on the backend;
  # it is almost certainly NOT 'foo'.

Sets the file to be used for the cache. Returns the actual filename
on success; false on failure.

The actual filename will probably differ from the 'foo', because
the backend will usually add an extension or otherwise munge it.

C<set_file> has I<no> effect while the cache file is locked or open!

=cut

sub set_file {
    my ($self, $file) = @_;

    # Validation
    return if $self->get_backend->is_locked;
    return if $self->get_backend->is_open;

    # OK, there's no harm in changing the file attribute
    $self->get_backend->set_file($file);

    1; 
}

=head2 get_backend

Returns the backend object for this cache.

=cut

sub get_backend :PRIVATE() {
    my $self = shift;
    return $backend{ident $self};
}

=head2 dump

    # Be a good citizen
    $cache->lock;
    $cache->open;

    $cache->dump;

    # Be a good neighbor
    $cache->close;
    $cache->unlock;

Dump a human-readable version of the contents of the cache. Data is
printed in timestamp order.

The cache I<must> first be opened, and I<should> first be locked.

=cut

sub dump {
    my $self = shift;
    my $hash = $self->get_hash;

    for my $key ( sort { $hash->{$a} <=> $hash->{$b} } keys %$hash )
    {
        my $value = $hash->{$key};
        print "$value\t", scalar gmtime $value, "\t$key\n";
    }
}

=head2 open

    $cache->open or die;

Open the cache file, and tie it to a hash. This is delegated to the
backend.

=cut

sub open {
    my $self = shift;

    return $self->_delegate( "open", @_ );
}

=head2 close

  $cache->close;

Close the cache file and untie the hash.

=cut

sub close {
    my $self = shift;

    return $self->_delegate( "close", @_ );
}

=head2 lock

  $cache->lock or die;                  # returns immediately
  $cache->lock( block => 1 ) or die;    # Waits for a lock
  $cache->lock( %opts ) or die;         # Backend-specific options

Lock the DB file to guarantee exclusive access.

=cut

sub lock {
    my $self = shift;

    return $self->_delegate( "lock", @_ );
}

=head2 unlock

  $cache->unlock or warn "Unlock failed";

Unlock the DB file.

=cut 

sub unlock {
    my $self = shift;
    return $self->_delegate( "unlock", @_ );
}

=head2 purge

    $cache->purge;                  # Use default TTL
    $cache->purge( ttl => 3600 );   # Everything older than 1 hour

Purge the cache of old entries. This reduces the risk of false positives
from things like reused message IDs, but increases the risk of false
negatives.

The default is one week. Dedicated spam-fighters might prefer to use a
longer TTL.

=cut

sub purge {
    my $self = shift;
    my %opts = @_;

    my $hash = $self->get_hash;
    my $ttl  = defined $opts{ttl} ? $opts{ttl} : $self->get_ttl;
    my $now  = time;

    for my $key ( keys %$hash )
    {
        my $timestamp = $hash->{$key} || 0; # Also clobbers bad data like undef
        delete $hash->{$key} if ($now - $timestamp) > $ttl;
    }

    1;
}

=head2 DESTROY

Clean up the module. If the hash is still tied, we warn the user and call
C<close()> on C<$self>.

=head2 DEMOLISH

Internal helper method, never called directly by user.

=cut

sub DEMOLISH {
    my $self   = shift;

    my $backend = $self->get_backend;

    # Failing to close() the cache is bad: data won't be
    # committed to disk.
    if ( $backend and $backend->is_open )
    {
        cluck "Cache DESTROY()ed before it was close()ed";
        $self->close;
    }

    # Failure to unlock() is rude, but we don't say anything.
    $self->unlock;
}

=head2 _delegate

Delegate the specified method to the backend. Internal method.

=cut

sub _delegate :PRIVATE() {
    my ($self, $method, @args) = @_;

    my $backend = $self->get_backend;
    return unless $backend;

    return $backend->$method(@args);
}

=head1 AUTHOR

Len Budney, C<< <lbudney at pobox.com> >>

=head1 BUGS

The C<dump()> method assumes that Perl's C<time()> function returns
seconds since the UNIX epoch, 00:00:00 UTC, January 1, 1970. The
module will work on architectures with non-standard epochs, but the
automated tests will fail.

Please report any bugs or feature requests to
C<bug-email-fingerprint at rt.cpan.org>, or through the web interface at
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Email-Fingerprint>.
I will be notified, and then you'll automatically be notified of progress on
your bug as I make changes.

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc Email::Fingerprint::Cache

You can also look for information at:

=over 4

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/Email-Fingerprint>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/Email-Fingerprint>

=item * RT: CPAN's request tracker

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Email-Fingerprint>

=item * Search CPAN

L<http://search.cpan.org/dist/Email-Fingerprint>

=back

=head1 ACKNOWLEDGEMENTS

Email::Fingerprint::Cache is based on caching code in the
C<eliminate_dups> script by Peter Samuel and available at
L<http://www.qmail.org/>.

=head1 COPYRIGHT & LICENSE

Copyright 2006-2011 Len Budney, all rights reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1;