#!/usr/bin/perl
#
# Usage:
#
# strip-spam-markup.pl mbox
#
# This script reads a mailbox and strips header fields that may have
# been added by various SPAM and mail filtering programs. If the SPAM
# filter program attached the original message as an RFC822 message part,
# the original message will be used, and any other message parts introduced
# by the SPAM filter program will be ignored.
#
# Author: Gary Funck <gary@intrepid.com>, 2010-08-22
# This code can be used and modified without restriction.
#
use strict;
use warnings;
use File::Remove 'remove';
use Mail::Box::Manager;
use Mail::Message::Head::SpamGroup;
use aliased 'Mail::Box::Manager' => 'MBM';
use aliased 'Mail::Message::Head::SpamGroup' => 'MMHS';
my $SA;
# If SpamAssassin is installed, we will use
# its remove_spamassassin_markup function.
if (eval 'require Mail::SpamAssassin;') {
$SA = new Mail::SpamAssassin;
}
# The following sub's perform rewrites, and are
# called from $m->rebuild, below.
# See: Mail::Message::Construct::Rebuild
sub use_orig_msg_part ($$)
{
my ($self, $part) = @_;
for my $p ($part->head->isMultipart
? $part->body->parts : ($part)) {
next unless $p->body->isNested;
my $content_type = $p->contentType;
my $content_description = $p->get('Content-Description');
if (defined($content_type) && defined($content_description)
&& $content_type eq 'message/rfc822'
&& $content_description =~ /^original message/i) {
# Use the nested original message body.
$part->body($p->body->nested->body);
last;
}
}
return $part;
}
sub remove_dspam_sig_part ($$)
{
my ($self, $part) = @_;
my $container = $part->container;
my $content_type = $part->contentType;
if (defined($container) && $container->isMultipart
&& defined($content_type) && $content_type eq 'text/plain') {
my $x_dspam_sig = $part->head->get('X-DSPAM-Signature');
# delete this part if it has a DSPAM signature header.
return undef if defined $x_dspam_sig;
}
return $part;
}
sub remove_dspam_sig_text ($$)
{
my ($self, $part) = @_;
if ($part->body->isText
&& !($part->isMultipart || $part->body->isNested)) {
# See: Mail::Message::Body::Construct
$part->body($part->body->foreachLine(
sub {my $line = $_;
$line =~ s/!DSPAM:\s*(?:\d+,)?[[:xdigit:]]+!//g;
return $line;}));
}
return $part;
}
#
# DSPAM Headers:
# X-DSPAM-Confidence: 0.5138
# X-DSPAM-Factors: 15,
# X-DSPAM-Improbability: 1 in 107 chance of being ham
# X-DSPAM-Probability: 1.0000
# X-DSPAM-Processed: Fri Jan 20 14:51:41 2006
# X-DSPAM-Reclassified
# X-DSPAM-Result: Spam
# X-DSPAM-Signature: 43d13f4d154401696382214
# X-DSPAM-User
# DSPAM Signature in body:
# !DSPAM:\s(\d+,)?[[:xdigit:]]+!
#
MMHS->fighter('DSPAM',
fields => qr/^X-DSPAM-/i,
isspam => sub
{
my ($sg, $head) = @_;
if (my $result = head->get('X-DSPAM-Result')) {
return $result =~ /^(?:SPAM|BL[AO]CKLISTED|VIRUS)$/i;
}
return 0;
},
version => sub
{
my ($sg, $head) = @_;
if (my $scan_header = $head->get('X-DSPAM-Result')) {
# DSPAM doesn't supply a header with its version number.
my ($software, $version) = qw/DSPAM 0.0/;
return ($software, $version);
}
return ();
}
);
# MIMEDefang headers (at our installation):
# (There are no standard MIMEDefang headers per se.)
# X-Spam-Score: 7.872 (*******)
# DATE_IN_PAST_96_XX,FORGED_MUA_OUTLOOK,MSOE_MID_WRONG_CASE,SPF_SOFTFAIL
# X-Scanned-By: MIMEDefang 2.70 on 198.2.168.1
MMHS->fighter('MIMEDefang',
fields => qr/^(?:X-Scanned-By|X-Spam-Score)/i,
isspam => sub
{
my ($sg, $head) = @_;
if (my $score_header = $head->get('X-Spam-Score')) {
if (my ($spam_score) = ($score_header =~ /^(\d+(?:\.\d+)?)/)) {
return $spam_score >= 5.0;
}
}
return 0;
},
version => sub
{
my ($sg, $head) = @_;
if (my $scan_header = $head->get('X-Scanned-By')) {
if (my ($software, $version) =
($scan_header =~ /^(\S+)\s+(\d+(?:\.\d+)?)/i)) {
return ($software, $version);
}
}
return ();
}
);
#
# Get the command line arguments.
#
die "Usage: $0 mailbox\n"
. " (where 'mailbox' may be either a maildir or mbox file)\n"
unless @ARGV==1;
my $filename = shift @ARGV;
#
# Open the folders
#
my $outfilename = "${filename}.strip";
my $recursive = \1;
remove($recursive, $outfilename)
if (-e $outfilename);
my $mgr = MBM->new;
# Open the original folder; don't parse message body unless needed.
my $folder = $mgr->open($filename , access => 'r', extract => 'LAZY')
or die "Cannot open $filename: $!\n";
my $outbox = $mgr->open($outfilename, access => 'a', create => 1)
or die "Cannot open $outfilename to write: $!\n";
my $nr_msgs = $folder->nrMessages;
print "Mail folder '$folder' contains $nr_msgs",
" message" . ($nr_msgs > 1 ? 's' : ''), ":\n";
for my $msg ($folder->messages) {
printf "%6d. %s\n", $msg->seqnr+1, $msg->subject;
my $m = $msg->clone;
# If the SpamAssassin module is available,
# and there are SA artifacts in the message,
# then let SA clean up the message first, because
# the rules for cleaning up SA markup can be rather complex.
if (defined($SA)
&& MMHS->from($m, types => ['SpamAssassin'])) {
# See: Mail::Message::Construct::Text
my $msg_text = $m->string;
my $SA_msg = $SA->parse($msg_text);
$msg_text = $SA->remove_spamassassin_markup($SA_msg);
# See: Mail::Message::Construct::Read
$m = Mail::Message->read($msg_text);
}
# See: Mail::Message::Construct::Rebuild
$m->rebuild(keep_message_id => 1,
extra_rules => [\&use_orig_msg_part,
\&remove_dspam_sig_part,
\&remove_dspam_sig_text]);
my $head = $m->head;
# Remove SPAM mark up that is specific to each "SPAM fighter" tool.
# See: Mail::Message::Head::SpamGroup
$head->removeSpamGroups;
# To be on the safe side, remove all the 'X-' fields.
$head->removeFields(qr/^X-/i);
my $subj = $head->get('Subject');
# Remove various Subject line mark ups that are
# sometimes used to indicate a possible SPAM message.
if (defined($subj)
&& ($subj =~ s/\[SPAM(?::\s*\d+\.\d+)?\]\s*//gi
| $subj =~ s/\bSPAM:\s+//gi
| $subj =~ s/\[?\*+\s*SPAM\s*\*+\]?//gi
| $subj =~ s/\{SPAM\??\}\s*//gi)) {
$head->delete('Subject');
$head->add("Subject: $subj");
}
if ($outbox->messageId($m->messageId)) {
# Assign a new internal message ID.
# If we don't do this, the message will be detected
# as a duplicate, and will not be written to the
# output mailbox.
print "\tWARNING: Duplicate message ID\n";
$m->takeMessageId(undef);
}
$m->printStructure(select, "\t");
$outbox->addMessage($m);
}
$folder->close(write => 'NEVER');
$outbox->close(write => 'ALWAYS');