The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl
#
# Usage:
#
#   strip-spam-markup.pl mbox
#
# This script reads a mailbox and strips header fields that may have
# been added by various SPAM and mail filtering programs.  If the SPAM
# filter program attached the original message as an RFC822 message part,
# the original message will be used, and any other message parts introduced
# by the SPAM filter program will be ignored.
#
# Author: Gary Funck <gary@intrepid.com>, 2010-08-22
# This code can be used and modified without restriction.
#
use strict;
use warnings;
use File::Remove 'remove';
use Mail::Box::Manager;
use Mail::Message::Head::SpamGroup;
use aliased 'Mail::Box::Manager' => 'MBM';
use aliased 'Mail::Message::Head::SpamGroup' => 'MMHS';

my $SA;
# If SpamAssassin is installed, we will use
# its remove_spamassassin_markup function.
if (eval 'require Mail::SpamAssassin;') {
  $SA = new Mail::SpamAssassin;
}

# The following sub's perform rewrites, and are
# called from $m->rebuild, below.
# See: Mail::Message::Construct::Rebuild

sub use_orig_msg_part ($$)
{
  my ($self, $part) = @_;
  for my $p ($part->head->isMultipart
               ? $part->body->parts : ($part)) {
    next unless $p->body->isNested;
    my $content_type = $p->contentType;
    my $content_description = $p->get('Content-Description');
    if (defined($content_type) && defined($content_description)
	&& $content_type eq 'message/rfc822'
	&& $content_description =~ /^original message/i) {
      # Use the nested original message body.
      $part->body($p->body->nested->body);
      last;
    }
  }
  return $part;
}

sub remove_dspam_sig_part ($$)
{
  my ($self, $part) = @_;
  my $container = $part->container;
  my $content_type = $part->contentType;
  if (defined($container) && $container->isMultipart
      && defined($content_type) && $content_type eq 'text/plain') {
    my $x_dspam_sig = $part->head->get('X-DSPAM-Signature');
    # delete this part if it has a DSPAM signature header.
    return undef if defined $x_dspam_sig;
  }
  return $part;
}

sub remove_dspam_sig_text ($$)
{
  my ($self, $part) = @_;
  if ($part->body->isText
      && !($part->isMultipart || $part->body->isNested)) {
    # See: Mail::Message::Body::Construct
    $part->body($part->body->foreachLine(
       sub {my $line = $_;
	    $line  =~ s/!DSPAM:\s*(?:\d+,)?[[:xdigit:]]+!//g;
	    return $line;}));
  }
  return $part;
}

#
# DSPAM Headers:
# X-DSPAM-Confidence: 0.5138
# X-DSPAM-Factors: 15,
# X-DSPAM-Improbability: 1 in 107 chance of being ham
# X-DSPAM-Probability: 1.0000
# X-DSPAM-Processed: Fri Jan 20 14:51:41 2006
# X-DSPAM-Reclassified
# X-DSPAM-Result: Spam
# X-DSPAM-Signature: 43d13f4d154401696382214
# X-DSPAM-User
# DSPAM Signature in body:
# !DSPAM:\s(\d+,)?[[:xdigit:]]+!
#
MMHS->fighter('DSPAM',
  fields   => qr/^X-DSPAM-/i,
  isspam   => sub
	       {
		 my ($sg, $head)  = @_;
		 if (my $result = head->get('X-DSPAM-Result')) {
		   return $result =~ /^(?:SPAM|BL[AO]CKLISTED|VIRUS)$/i;
		 }
		 return 0;
	       },
  version  => sub
	       {
		 my ($sg, $head) = @_;
		 if (my $scan_header = $head->get('X-DSPAM-Result')) {
		   # DSPAM doesn't supply a header with its version number.
		   my ($software, $version) = qw/DSPAM 0.0/;
		   return ($software, $version);
		 }
		 return ();
	       }
  );
# MIMEDefang headers (at our installation):
# (There are no standard MIMEDefang headers per se.)
# X-Spam-Score: 7.872 (*******)
#    DATE_IN_PAST_96_XX,FORGED_MUA_OUTLOOK,MSOE_MID_WRONG_CASE,SPF_SOFTFAIL
# X-Scanned-By: MIMEDefang 2.70 on 198.2.168.1
MMHS->fighter('MIMEDefang',
  fields   => qr/^(?:X-Scanned-By|X-Spam-Score)/i,
  isspam   => sub
	       {
		 my ($sg, $head)  = @_;
		 if (my $score_header = $head->get('X-Spam-Score')) {
		   if (my ($spam_score) = ($score_header =~ /^(\d+(?:\.\d+)?)/)) {
		     return $spam_score >= 5.0;
		   }
		 }
		 return 0;
	       },
  version  => sub
	       {
		 my ($sg, $head) = @_;
		 if (my $scan_header = $head->get('X-Scanned-By')) {
		   if (my ($software, $version) =
		          ($scan_header =~ /^(\S+)\s+(\d+(?:\.\d+)?)/i)) {
		     return ($software, $version);
		   }
		 }
		 return ();
	       }
  );
#
# Get the command line arguments.
#
die "Usage: $0 mailbox\n"
    . "    (where 'mailbox' may be either a maildir or mbox file)\n"
    unless @ARGV==1;
my $filename = shift @ARGV;
#
# Open the folders
#
my $outfilename = "${filename}.strip";
my $recursive = \1;
remove($recursive, $outfilename)
  if (-e $outfilename);
my $mgr = MBM->new;
# Open the original folder; don't parse message body unless needed.
my $folder = $mgr->open($filename , access => 'r', extract => 'LAZY')
  or die "Cannot open $filename: $!\n";
my $outbox = $mgr->open($outfilename, access => 'a', create => 1)
  or die "Cannot open $outfilename to write: $!\n";
my $nr_msgs = $folder->nrMessages;
print "Mail folder '$folder' contains $nr_msgs",
      " message" . ($nr_msgs > 1 ? 's' : ''), ":\n";
for my $msg ($folder->messages) {
  printf "%6d. %s\n", $msg->seqnr+1, $msg->subject;
  my $m = $msg->clone;
  # If the SpamAssassin module is available,
  # and there are SA artifacts in the message,
  # then let SA clean up the message first, because
  # the rules for cleaning up SA markup can be rather complex.
  if (defined($SA)
      && MMHS->from($m, types => ['SpamAssassin'])) {
    # See: Mail::Message::Construct::Text
    my $msg_text = $m->string;
    my $SA_msg = $SA->parse($msg_text);
    $msg_text = $SA->remove_spamassassin_markup($SA_msg);
    # See: Mail::Message::Construct::Read
    $m = Mail::Message->read($msg_text);
  }
  # See: Mail::Message::Construct::Rebuild
  $m->rebuild(keep_message_id => 1,
              extra_rules => [\&use_orig_msg_part,
                              \&remove_dspam_sig_part,
                              \&remove_dspam_sig_text]);
  my $head = $m->head;
  # Remove SPAM mark up that is specific to each "SPAM fighter" tool.
  # See: Mail::Message::Head::SpamGroup
  $head->removeSpamGroups;
  # To be on the safe side, remove all the 'X-' fields.
  $head->removeFields(qr/^X-/i);
  my $subj = $head->get('Subject');
  # Remove various Subject line mark ups that are
  # sometimes used to indicate a possible SPAM message.
  if (defined($subj)
      && ($subj =~ s/\[SPAM(?::\s*\d+\.\d+)?\]\s*//gi
          | $subj =~ s/\bSPAM:\s+//gi
          | $subj =~ s/\[?\*+\s*SPAM\s*\*+\]?//gi
          | $subj =~ s/\{SPAM\??\}\s*//gi)) {
    $head->delete('Subject');
    $head->add("Subject: $subj");
  }
  if ($outbox->messageId($m->messageId)) {
    # Assign a new internal message ID.
    # If we don't do this, the message will be detected
    # as a duplicate, and will not be written to the
    # output mailbox.
    print "\tWARNING: Duplicate message ID\n";
    $m->takeMessageId(undef);
  }
  $m->printStructure(select, "\t");
  $outbox->addMessage($m);
}
$folder->close(write => 'NEVER');
$outbox->close(write => 'ALWAYS');