The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

=head1 NAME

Mail::SpamAssassin::BayesStore - Storage Module for default Bayes classifier

=head1 DESCRIPTION

This is the public API for the Bayesian store methods.  Any implementation of
the storage module for the default Bayes classifier must implement these methods.

=cut

package Mail::SpamAssassin::BayesStore;

use strict;
use warnings;
use bytes;
use re 'taint';
use Mail::SpamAssassin::Logger;

# TODO: if we ever get tuits, it'd be good to make these POD
# method docs more perlish... hardly a biggie.

=head1 METHODS

=over 4

=item new

public class (Mail::SpamAssassin::BayesStore) new (Mail::SpamAssassin::Plugin::Bayes $bayes)

Description:
This method creates a new instance of the Mail::SpamAssassin::BayesStore
object.  You must pass in an instance of the Mail::SpamAssassin::Plugin::Bayes
object, which is stashed for use throughout the module.

=cut

sub new {
  my ($class, $bayes) = @_;

  $class = ref($class) || $class;

  my $self = {
	      'bayes'                => $bayes,
	      'supported_db_version' => 0,
	      'db_version'	     => undef,
	     };

  bless ($self, $class);

  $self;
}

=item DB_VERSION

public instance (Integer) DB_VERSION ()

Description:
This method returns the currently supported database version for the
implementation.

=cut

sub DB_VERSION {
  my ($self) = @_;
  return $self->{supported_db_version};
}

=item read_db_configs

public instance () read_db_configs ()

Description:
This method reads any needed config variables from the configuration object
and then calls the Mail::SpamAssassin::Plugin::Bayes read_db_configs method.

=cut

sub read_db_configs {
  my ($self) = @_;

  # TODO: at some stage, this may be useful to read config items which
  # control database bloat, like
  #
  # - use of hapaxes
  # - use of case-sensitivity
  # - more midrange-hapax-avoidance tactics when parsing headers (future)
  # 
  # for now, we just set these settings statically.
  my $conf = $self->{bayes}->{main}->{conf};

  # Minimum desired database size?  Expiry will not shrink the
  # database below this number of entries.  100k entries is roughly
  # equivalent to a 5Mb database file.
  $self->{expiry_max_db_size} = $conf->{bayes_expiry_max_db_size};
  $self->{expiry_pct} = $conf->{bayes_expiry_pct};
  $self->{expiry_period} = $conf->{bayes_expiry_period};
  $self->{expiry_max_exponent} = $conf->{bayes_expiry_max_exponent};

  $self->{bayes}->read_db_configs();
}

=item prefork_init

public instance (Boolean) prefork_init ()

Description:
This optional method is called in the parent process shortly before
forking off child processes.

=cut

# sub prefork_init {
#   my ($self) = @_;
# }

=item spamd_child_init

public instance (Boolean) spamd_child_init ()

Description:
This optional method is called in a child process shortly after being spawned.

=cut

# sub spamd_child_init {
#   my ($self) = @_;
# }

=item tie_db_readonly

public instance (Boolean) tie_db_readonly ()

Description:
This method opens up the database in readonly mode.

=cut

sub tie_db_readonly {
  my ($self) = @_;
  die "bayes: tie_db_readonly: not implemented\n";
}

=item tie_db_writable

public instance (Boolean) tie_db_writable ()

Description:
This method opens up the database in writable mode.

Any callers of this methods should ensure that they call untie_db()
afterwards.

=cut

sub tie_db_writable {
  my ($self) = @_;
  die "bayes: tie_db_writable: not implemented\n";
}

=item untie_db

public instance () untie_db ()

Description:
This method unties the database.

=cut

sub untie_db {
  my $self = shift;
  die "bayes: untie_db: not implemented\n";
}

=item calculate_expire_delta

public instance (%) calculate_expire_delta (Integer $newest_atime,
                                             Integer $start,
                                             Integer $max_expire_mult)

Description:
This method performs a calculation on the data to determine the optimum
atime for token expiration.

=cut

sub calculate_expire_delta {
  my ($self, $newest_atime, $start, $max_expire_mult) = @_;
  die "bayes: calculate_expire_delta: not implemented\n";
}

=item token_expiration

public instance (Integer, Integer,
                 Integer, Integer) token_expiration(\% $opts,
                                                    Integer $newest_atime,
                                                    Integer $newdelta)

Description:
This method performs the database specific expiration of tokens based on
the passed in C<$newest_atime> and C<$newdelta>.

=cut

sub token_expiration {
  my ($self, $opts, $newest_atime, $newdelta) = @_;
  die "bayes: token_expiration: not implemented\n";
}

=item expire_old_tokens

public instance (Boolean) expire_old_tokens (\% hashref)

Description:
This method expires old tokens from the database.

=cut

sub expire_old_tokens {
  my ($self, $opts) = @_;
  my $ret;

  my $eval_stat;
  eval {
    local $SIG{'__DIE__'};	# do not run user die() traps in here
    if ($self->tie_db_writable()) {
      $ret = $self->expire_old_tokens_trapped ($opts);
    }
    1;
  } or do {
    $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;
  };

  if (!$self->{bayes}->{main}->{learn_caller_will_untie}) {
    $self->untie_db();
  }

  if (defined $eval_stat) {	# if we died, untie the dbs.
    warn "bayes: expire_old_tokens: $eval_stat\n";
    return 0;
  }
  $ret;
}

=item expire_old_tokens_trapped

public instance (Boolean) expire_old_tokens_trapped (\% $opts)

Description:
This methods does the actual token expiration.

XXX More docs here about the methodology and what not

=cut

sub expire_old_tokens_trapped {
  my ($self, $opts) = @_;

  # Flag that we're doing work
  $self->set_running_expire_tok();

  # We don't need to do an expire, so why were we called?  Oh well.
  if (!$self->expiry_due()) {
    $self->remove_running_expire_tok();
    return 0;
  }

  my $started = time();
  my @vars = $self->get_storage_variables();

  if ( $vars[10] > time ) {
    dbg("bayes: expiry found newest atime in the future, resetting to current time");
    $vars[10] = time;
  }

  # How many tokens do we want to keep?
  my $goal_reduction = int($self->{expiry_max_db_size} * $self->{expiry_pct});
  dbg("bayes: expiry check keep size, ".$self->{expiry_pct}." * max: $goal_reduction");
  # Make sure we keep at least 100000 tokens in the DB
  if ( $goal_reduction < 100000 ) {
    $goal_reduction = 100000;
    dbg("bayes: expiry keep size too small, resetting to 100,000 tokens");
  }
  # Now turn goal_reduction into how many to expire.
  $goal_reduction = $vars[3] - $goal_reduction;
  dbg("bayes: token count: ".$vars[3].", final goal reduction size: $goal_reduction");

  if ( $goal_reduction < 1000 ) { # too few tokens to expire, abort.
    dbg("bayes: reduction goal of $goal_reduction is under 1,000 tokens, skipping expire");
    $self->set_last_expire(time());
    $self->remove_running_expire_tok(); # this won't be cleaned up, so do it now.
    return 1; # we want to indicate things ran as expected
  }

  # Estimate new atime delta based on the last atime delta
  my $newdelta = 0;
  if ( $vars[9] > 0 ) {
    # newdelta = olddelta * old / goal;
    # this may seem backwards, but since we're talking delta here,
    # not actual atime, we want smaller atimes to expire more tokens,
    # and visa versa.
    #
    $newdelta = int($vars[8] * $vars[9] / $goal_reduction);
  }

  # Calculate size difference between last expiration token removal
  # count and the current goal removal count.
  my $ratio = ($vars[9] == 0 || $vars[9] > $goal_reduction) ? $vars[9]/$goal_reduction : $goal_reduction/$vars[9];

  dbg("bayes: first pass?  current: ".time().", Last: ".$vars[4].", atime: ".$vars[8].", count: ".$vars[9].", newdelta: $newdelta, ratio: $ratio, period: ".$self->{expiry_period});

  ## ESTIMATION PHASE
  #
  # Do this for the first expire or "odd" looking results cause a first pass to determine atime:
  #
  # - last expire was more than 30 days ago
  #   assume mail flow stays roughly the same month to month, recompute if it's > 1 month
  # - last atime delta was under expiry period
  #   if we're expiring often max_db_size should go up, but let's recompute just to check
  # - last reduction count was < 1000 tokens
  #   ditto
  # - new estimated atime delta is under expiry period
  #   ditto
  # - difference of last reduction to current goal reduction is > 50%
  #   if the two values are out of balance, estimating atime is going to be funky, recompute
  #
  if ( (time() - $vars[4] > 86400*30) || ($vars[8] < $self->{expiry_period}) || ($vars[9] < 1000)
       || ($newdelta < $self->{expiry_period}) || ($ratio > 1.5) ) {
    dbg("bayes: can't use estimation method for expiry, unexpected result, calculating optimal atime delta (first pass)");

    my $start = $self->{expiry_period}; # exponential search starting at ...?  1/2 day, 1, 2, 4, 8, 16, ...
    my $max_expire_mult = 2**$self->{expiry_max_exponent}; # $max_expire_mult * $start = max expire time (256 days), power of 2.

    dbg("bayes: expiry max exponent: ".$self->{expiry_max_exponent});

    my %delta = $self->calculate_expire_delta($vars[10], $start, $max_expire_mult);

    return 0 unless (%delta);

    # This will skip the for loop if debugging isn't enabled ...
    if (would_log('dbg', 'bayes')) {
      dbg("bayes: atime\ttoken reduction");
      dbg("bayes: ========\t===============");
      for(my $i = 1; $i<=$max_expire_mult; $i <<= 1) {
	dbg("bayes: ".$start*$i."\t".(exists $delta{$i} ? $delta{$i} : 0));
      }
    }
  
    # Now figure out which max_expire_mult value gives the closest results to goal_reduction, without
    # going over ...  Go from the largest delta backwards so the reduction size increases
    # (tokens that expire at 4 also expire at 3, 2, and 1, so 1 will always be the largest expiry...)
    #
    for( ; $max_expire_mult > 0; $max_expire_mult>>=1 ) {
      next unless exists $delta{$max_expire_mult};
      if ($delta{$max_expire_mult} > $goal_reduction) {
        $max_expire_mult<<=1; # the max expire is actually the next power of 2 out
        last;
      }
    }

    # if max_expire_mult gets to 0, either we can't expire anything, or 1 is <= $goal_reduction
    $max_expire_mult ||= 1;

    # $max_expire_mult is now equal to the value we should use ...
    # Check to see if the atime value we found is really good.
    # It's not good if:
    # - $max_expire_mult would not expire any tokens.  This means that the majority of
    #   tokens are old or new, and more activity is required before an expiry can occur.
    # - reduction count < 1000, not enough tokens to be worth doing an expire.
    #
    if ( !exists $delta{$max_expire_mult} || $delta{$max_expire_mult} < 1000 ) {
      dbg("bayes: couldn't find a good delta atime, need more token difference, skipping expire");
      $self->set_last_expire(time());
      $self->remove_running_expire_tok(); # this won't be cleaned up, so do it now.
      return 1; # we want to indicate things ran as expected
    }

    $newdelta = $start * $max_expire_mult;
    dbg("bayes: first pass decided on $newdelta for atime delta");
  }
  else { # use the estimation method
    dbg("bayes: can do estimation method for expiry, skipping first pass");
  }

  my ($kept, $deleted, $num_hapaxes, $num_lowfreq) = $self->token_expiration($opts, $newdelta, @vars);

  my $done = time();

  my $msg = "expired old bayes database entries in ".($done - $started)." seconds";
  my $msg2 = "$kept entries kept, $deleted deleted";

  if ($opts->{verbose}) {
    my $hapax_pc = ($num_hapaxes * 100) / $kept;
    my $lowfreq_pc = ($num_lowfreq * 100) / $kept;
    print "$msg\n$msg2\n"  or die "Error writing: $!";
    printf "token frequency: 1-occurrence tokens: %3.2f%%\n", $hapax_pc
      or die "Error writing: $!";
    printf "token frequency: less than 8 occurrences: %3.2f%%\n", $lowfreq_pc
      or die "Error writing: $!";
  }
  else {
    dbg("bayes: $msg: $msg2");
  }

  return 1;
}

=item sync_due

public instance (Boolean) sync_due ()

Description:
This methods determines if a sync is due.

=cut

sub sync_due {
  my ($self) = @_;
  die "bayes: sync_due: not implemented\n";
}

=item expiry_due

public instance (Boolean) expiry_due ()

Description:
This methods determines if an expire is due.

=cut

sub expiry_due {
  my ($self) = @_;

  $self->read_db_configs();	# make sure this has happened here

  # If force expire was called, do the expire no matter what.
  return 1 if ($self->{bayes}->{main}->{learn_force_expire});

  # if config says not to auto expire then no need to continue
  return 0 if ($self->{bayes}->{main}->{conf}->{bayes_auto_expire} == 0);

  # is the database too small for expiry?  (Do *not* use "scalar keys",
  # as this will iterate through the entire db counting them!)
  my @vars = $self->get_storage_variables();
  my $ntoks = $vars[3];

  my $last_expire = time() - $vars[4];
  if (!$self->{bayes}->{main}->{ignore_safety_expire_timeout}) {
    # if we're not ignoring the safety timeout, don't run an expire more
    # than once every 12 hours.
    return 0 if ($last_expire < 43200);
  }
  else {
    # if we are ignoring the safety timeout (e.g.: mass-check), still
    # limit the expiry to only one every 5 minutes.
    return 0 if ($last_expire < 300);
  }

  dbg("bayes: DB expiry: tokens in DB: $ntoks, Expiry max size: ".$self->{expiry_max_db_size}.", Oldest atime: ".$vars[5].", Newest atime: ".$vars[10].", Last expire: ".$vars[4].", Current time: ".time());

  my $conf = $self->{bayes}->{main}->{conf};
  if ($ntoks <= 100000 ||			# keep at least 100k tokens
      $self->{expiry_max_db_size} > $ntoks ||	# not enough tokens to cause an expire
      $vars[10]-$vars[5] < 43200 ||		# delta between oldest and newest < 12h
      $self->{db_version} < $self->DB_VERSION # ignore old db formats
      ) {
    return 0;
  }

  return 1;
}

=item seen_get

public instance (Char) seen_get (String $msgid)

Description:
This method retrieves the stored value, if any, for C<$msgid>.  The return
value is the stored string ('s' for spam and 'h' for ham) or undef if
C<$msgid> is not found.

=cut

sub seen_get {
  my ($self, $msgid) = @_;
  die "bayes: seen_get: not implemented\n";
}

=item seen_put

public instance (Boolean) seen_put (String $msgid, Char $flag)

Description:
This method records C<$msgid> as the type given by C<$flag>.  C<$flag> is
one of two values 's' for spam and 'h' for ham.

=cut

sub seen_put {
  my ($self, $msgid, $flag) = @_;
  die "bayes: seen_put: not implemented\n";
}

=item seen_delete

public instance (Boolean) seen_delete (String $msgid)

Description:
This method removes C<$msgid> from storage.

=cut

sub seen_delete {
  my ($self, $msgid) = @_;
  die "bayes: seen_delete: not implemented\n";
}

=item get_storage_variables

public instance (@) get_storage_variables ()

Description:
This method retrieves the various administrative variables used by
the Bayes storage implementation.

The values returned in the array are in the following order:

0: scan count base

1: number of spam

2: number of ham

3: number of tokens in db

4: last expire atime

5: oldest token in db atime

6: db version value

7: last journal sync

8: last atime delta

9: last expire reduction count

10: newest token in db atime

=cut

sub get_storage_variables {
  my ($self) = @_;
  die "bayes: get_storage_variables: not implemented\n";
}

=item dump_db_toks

public instance () dump_db_toks (String $template, String $regex, @ @vars)

Description:
This method loops over all tokens, computing the probability for the token
and then printing it out according to the passed in template.

=cut

sub dump_db_toks {
  my ($self, $template, $regex, @vars) = @_;
  die "bayes: dump_db_toks: not implemented\n";
}

=item set_last_expire

public instance (Boolean) _set_last_expire (Integer $time)

Description:
This method sets the last expire time.

=cut

sub set_last_expire {
  my ($self, $time) = @_;
  die "bayes: set_last_expire: not implemented\n";
}

=item get_running_expire_tok

public instance (Time) get_running_expire_tok ()

Description:
This method determines if an expire is currently running and returns the time
the expire started.

=cut

sub get_running_expire_tok {
  my ($self) = @_;
  die "bayes: get_running_expire_tok: not implemented\n";
}

=item set_running_expire_tok

public instance (Time) set_running_expire_tok ()

Description:
This method sets the running expire time to the current time.

=cut

sub set_running_expire_tok {
  my ($self) = @_;
  die "bayes: set_running_expire_tok: not implemented\n";
}

=item remove_running_expire_tok

public instance (Boolean) remove_running_expire_tok ()

Description:
This method removes a currently set running expire time.

=cut

sub remove_running_expire_tok {
  my ($self) = @_;
  die "bayes: remove_running_expire_tok: not implemented\n";
}

=item tok_get

public instance (Integer, Integer, Time) tok_get (String $token)

Description:
This method retrieves the specified token (C<$token>) from storage and returns
it's spam count, ham acount and last access time.

=cut

sub tok_get {
  my ($self, $token) = @_;
  die "bayes: tok_get: not implemented\n";
}

=item tok_get_all

public instance (\@) tok_get_all (@ @tokens)

Description:
This method retrieves the specified tokens (C<@tokens>) from storage and
returns an array ref of arrays spam count, ham count and last access time.

=cut

sub tok_get_all {
  my ($self, $tokens) = @_;
  die "bayes: tok_get_all: not implemented\n";
}

=item tok_count_change

public instance (Boolean) tok_count_change (Integer $spam_count,
                                            Integer $ham_count,
                                            String $token,
                                            Time $atime)

Description:
This method takes a C<$spam_count> and C<$ham_count> and adds it to
C<$token> along with updating C<$token>s atime with C<$atime>.

=cut

sub tok_count_change {
  my ($self, $spam_count, $ham_count, $token, $atime) = @_;
  die "bayes: tok_count_change: not implemented\n";
}

=item multi_tok_count_change

public instance (Boolean) multi_tok_count_change (Integer $spam_count,
 					          Integer $ham_count,
				 	          \% $tokens,
					          String $atime)

Description:
This method takes a C<$spam_count> and C<$ham_count> and adds it to all
of the tokens in the C<$tokens> hash ref along with updating each tokens
atime with C<$atime>.

=cut

sub multi_tok_count_change {
  my ($self, $spam_count, $ham_count, $tokens, $atime) = @_;
  die "bayes: multi_tok_count_change: not implemented\n";
}

=item nspam_nham_get

public instance (Integer, Integer) nspam_nham_get ()

Description:
This method retrieves the total number of spam and the total number of ham
currently under storage.

=cut

sub nspam_nham_get {
  my ($self) = @_;
  die "bayes: nspam_nham_get: not implemented\n";
}

=item nspam_nham_change

public instance (Boolean) nspam_nham_change (Integer $num_spam,
                                             Integer $num_ham)

Description:
This method updates the number of spam and the number of ham in the database.

=cut

sub nspam_nham_change {
  my ($self, $num_spam, $num_ham) = @_;
  die "bayes: nspam_nham_change: not implemented\n";
}

=item tok_touch

public instance (Boolean) tok_touch (String $token,
                                     Time $atime)

Description:
This method updates the given tokens (C<$token>) access time.

=cut

sub tok_touch {
  my ($self, $token, $atime) = @_;
  die "bayes: tok_touch: not implemented\n";
}

=item tok_touch_all

public instance (Boolean) tok_touch_all (\@ $tokens,
                                         Time $atime)

Description:
This method does a mass update of the given list of tokens C<$tokens>, if the existing token
atime is < C<$atime>.

=cut

sub tok_touch_all {
  my ($self, $tokens, $atime) = @_;
  die "bayes: tok_touch_all: not implemented\n";
}

=item cleanup

public instance (Boolean) cleanup ()

Description:
This method performs any cleanup necessary before moving onto the next
operation.

=cut

sub cleanup {
  my ($self) = @_;
  die "bayes: cleanup: not implemented\n";
}

=item get_magic_re

public instance get_magic_re (String)

Description:
This method returns a regexp which indicates a magic token.

=cut

sub get_magic_re {
  my ($self) = @_;
  die "bayes: get_magic_re: not implemented\n";
}

=item sync

public instance (Boolean) sync (\% $opts)

Description:
This method performs a sync of the database.

=cut

sub sync {
  my ($self, $opts) = @_;
  die "bayes: sync: not implemented\n";
}

=item perform_upgrade

public instance (Boolean) perform_upgrade (\% $opts)

Description:
This method is a utility method that performs any necessary upgrades
between versions.  It should know how to handle previous versions and
what needs to happen to upgrade them.

A true return value indicates success.

=cut

sub perform_upgrade {
  my ($self, $opts) = @_;
  die "bayes: perform_upgrade: not implemented\n";
}

=item clear_database

public instance (Boolean) clear_database ()

Description:
This method deletes all records for a particular user.

Callers should be aware that any errors returned by this method
could causes the database to be inconsistent for the given user.

=cut

sub clear_database {
  my ($self) = @_;
  die "bayes: clear_database: not implemented\n";
}

=item backup_database

public instance (Boolean) backup_database ()

Description:
This method will dump the users database in a machine readable format.

=cut

sub backup_database {
  my ($self) = @_;
  die "bayes: backup_database: not implemented\n";
}

=item restore_database

public instance (Boolean) restore_database (String $filename, Boolean $showdots)

Description:
This method restores a database from the given filename, C<$filename>.

Callers should be aware that any errors returned by this method
could causes the database to be inconsistent for the given user.

=cut

sub restore_database {
  my ($self, $filename, $showdots) = @_;
  die "bayes: restore_database: not implemented\n";
}

=item db_readable

public instance (Boolean) db_readable ()

Description:
This method returns whether or not the Bayes DB is available in a
readable state.

=cut

sub db_readable {
  my ($self) = @_;
  die "bayes: db_readable: not implemented\n";
}

=item db_writable

public instance (Boolean) db_writable ()

Description:
This method returns whether or not the Bayes DB is available in a
writable state.

=cut

sub db_writable {
  my ($self) = @_;
  die "bayes: db_writable: not implemented\n";
}


sub sa_die { Mail::SpamAssassin::sa_die(@_); }

1;

=back

=cut