package Mail::Digest::Tools;
$VERSION = 2.12; # 05/14/2011
use strict;
use warnings;
use Time::Local;
our @ISA = ("Exporter");
our @EXPORT_OK = qw(
process_new_digests
reprocess_ALL_digests
reply_to_digest_message
repair_message_order
consolidate_threads_multiple
consolidate_threads_single
delete_deletables
);
our %EXPORT_TAGS = (
all => \@EXPORT_OK,
);
########################## Package Variables ###################################
our %month30 = map {$_, 1} (4,6,9,11);
our %month31 = map {$_, 1} (1,3,5,7,8,10,12);
our %unix = map {$_, 1}
qw| Unix linux darwin freebsd netbsd openbsd mirbsd cygwin solaris |;
############################### Initializer ###################################
sub _config_check {
my ($config_in_ref, $config_out_ref) = @_;
die "Cannot find ${$config_out_ref}{'dir_digest'}: $!"
unless (-d ${$config_out_ref}{'dir_digest'});
die "Missing threads directory: $!"
unless (-d ${$config_out_ref}{'dir_threads'});
die "Except for '\n' newline, backslashes are not permitted\n in Thread Message Delimiter: $!"
if (${$config_out_ref}{'thread_msg_delimiter'} =~ /\\[^n]|\\$/);
# to do:
# here do error checking on other digest.data info that is
# absolutely necessary for all conceivable uses of Mail::Digest::Tools
}
############################ Public Methods ####################################
sub process_new_digests {
my ($config_in_ref, $config_out_ref) = @_;
_config_check($config_in_ref, $config_out_ref);
my $choice = _start_new_only(${$config_out_ref}{'title'});
_main_processor($config_in_ref, $config_out_ref, $choice);
}
sub reprocess_ALL_digests {
my ($config_in_ref, $config_out_ref) = @_;
_config_check($config_in_ref, $config_out_ref);
my $choice = _start_ALL(${$config_out_ref}{'title'});
_main_processor($config_in_ref, $config_out_ref, $choice);
}
sub reply_to_digest_message {
my ($config_in_ref, $config_out_ref,
$dig_number, $dig_entry, $dir_for_reply) = @_;
_config_check($config_in_ref, $config_out_ref);
my $digests_ref = _get_digest_list(
$config_in_ref,
$config_out_ref,
);
my $digest_verified = _identify_target_digest(
$config_in_ref,
$config_out_ref,
$dig_number,
$dig_entry,
$digests_ref
);
my $replyfile = _strip_down_for_reply(
$config_in_ref,
$config_out_ref,
$digest_verified,
$dig_entry,
$dir_for_reply,
);
return $replyfile;
}
sub repair_message_order {
# But what about todays_topics.txt? It will be out of order as well.
my ($config_in_ref, $config_out_ref, $error_date_ref) = @_;
_config_check($config_in_ref, $config_out_ref);
local $_;
my $date_threshold = _verify_date($error_date_ref);
my $delimiter = ${$config_out_ref}{'thread_msg_delimiter'};
my $dir_threads = ${$config_out_ref}{'dir_threads'};
my (@threadfiles, @resorted_threadfiles);
chdir $dir_threads or die "Unable to change to $dir_threads: $!";
opendir DIR, $dir_threads or die "Unable to open $dir_threads: $!";
@threadfiles = grep {! m/^\./ } readdir DIR;
closedir DIR or die "Unable to close $dir_threads: $!";
foreach my $in (@threadfiles) {
my (@msgids);
my (%messages);
my $mtime = (stat($in))[9];
if ($date_threshold < $mtime) {
my $msgs_ref = _get_array_of_messages($in, $delimiter);
foreach my $msg (@{$msgs_ref}) {
my @lines = split(/\n/, $msg);
my ($ln);
while (defined($ln = shift(@lines))) {
if ($ln =~ /^Message: ([\d_]+)$/) {
push(@msgids, $1);
$messages{$1} = $msg;
last;
}
}
}
my ($need_resort_flag);
for (my $el = 1; $el <= $#msgids; $el++) {
if ($msgids[$el] lt $msgids[$el-1]) {
$need_resort_flag++;
last;
}
}
if ($need_resort_flag) {
my $out = "$in.bak";
open OUT, ">$out" or die "Couldn't open $out for writing: $!";
foreach my $msg (sort keys %messages) {
print OUT $messages{$msg}, $delimiter;
}
close OUT or die "Couldn't close $out after writing: $!";
rename($out, $in) or die "Couldn't rename $out to $in: $!";
push(@resorted_threadfiles, $in);
}
}
}
if (@resorted_threadfiles) {
print "Message order has been re-sorted in\n";
print " $_\n" foreach @resorted_threadfiles;
}
}
sub consolidate_threads_multiple {
my ($config_in_ref, $config_out_ref);
$config_in_ref = shift;
$config_out_ref = shift;
my $first_common_letters = defined $_[0] ? $_[0] : 20;
my $delimiter = ${$config_out_ref}{'thread_msg_delimiter'};
my $dir_threads = ${$config_out_ref}{'dir_threads'};
local $_;
my (@threadfiles, %threadstubs, %stubs_for_consol);
chdir $dir_threads or die "Unable to change to $dir_threads: $!";
opendir DIR, $dir_threads or die "Unable to open $dir_threads: $!";
@threadfiles = map {/(.*)\.thr\.txt$/} readdir DIR;
closedir DIR or die "Unable to close $dir_threads: $!";
foreach (@threadfiles) {
my $stub = substr($_, 0, $first_common_letters);
push @{$threadstubs{$stub}}, "$_.thr.txt";
}
my ($k,$v, $consolcount);
CONSOL: while ( ($k,$v) = each(%threadstubs)) {
if (@{$v} > 1) {
$consolcount++;
print "Candidates for consolidation:\n";
foreach my $thrfile (@{$v}) {
print " $thrfile\n";
}
while () {
my ($selection);
print "\nTo consolidate, type YES: ";
chomp ($selection = <>);
if ($selection eq 'YES') {
print "\n Files will be consolidated\n\n";
$stubs_for_consol{$k} = $v;
} else {
print "\n Files will not be consolidated\n\n";
}
next CONSOL;
}
}
}
unless ($consolcount) {
warn "\nAnalysis of the first $first_common_letters letters of each file in\n $dir_threads\n shows no candidates for consolidation. Please hard-code\n names of files you wish to consolidate as arguments to\n \&consolidate_threads_single:\n $!";
}
foreach my $k (keys %stubs_for_consol) {
consolidate_threads_single(
$config_in_ref,
$config_out_ref,
\@{$stubs_for_consol{$k}}
);
}
}
sub consolidate_threads_single {
my ($config_in_ref, $config_out_ref, $filesref) = @_;
my $delimiter = ${$config_out_ref}{'thread_msg_delimiter'};
my $dir_threads = ${$config_out_ref}{'dir_threads'};
local $_;
my (%messages, @superseded);
foreach my $in (@{$filesref}) {
unless ($in =~ /^$dir_threads/) {
$in = "$dir_threads/$in";
}
my $msgs_ref = _get_array_of_messages($in, $delimiter);
foreach my $msg (@{$msgs_ref}) {
my @lines = split(/\n/, $msg);
my ($ln);
while (defined($ln = shift(@lines))) {
if ($ln =~ /^Message: ([\d_]+)$/) {
die "Message $1 already exists: $!"
if (exists $messages{$1});
$messages{$1} = [ $msg, $in ];
last;
}
}
}
push(@superseded, $in);
}
my @msgids = sort keys %messages;
my $first_in_thread = "$messages{$msgids[0]}[1]";
my $out = $first_in_thread . '.bak';
open OUT, ">$out" or die "Couldn't open $out for writing: $!";
foreach (sort keys %messages) {
print OUT $messages{$_}[0], $delimiter;
}
close OUT or die "Couldn't close $out after writing: $!";
foreach (@superseded) {
rename($_, $_ . '.DELETABLE') or die "Couldn't rename $_: $!";
}
rename($out, $first_in_thread)
|| die "Couldn't rename $out to $first_in_thread: $!";
}
sub delete_deletables {
my $config_out_ref = shift;
my $dir_threads = ${$config_out_ref}{'dir_threads'};
local $_;
my (@deletables);
chdir $dir_threads or die "Unable to change to $dir_threads: $!";
opendir DIR, $dir_threads or die "Unable to open $dir_threads: $!";
@deletables = grep { /\.DELETABLE$/ } readdir DIR;
closedir DIR or die "Unable to close $dir_threads: $!";
foreach (@deletables) {
print "Deleting $_\n";
unlink $_ or die "Couldn't unlink $_: $!";
}
}
############################ Private Methods ###################################
sub _start_new_only {
my $full_title = shift;
print "\nProcessing new $full_title digest files only!\n\n";
return '';
}
sub _start_ALL {
# prints screen prompts which ask user to choose between
# default version (process newly arrived digests only) and
# full version (process or re-process all digests)
my $full_title = shift @_;
my ($choice);
print "\n " . uc($full_title) . "\n";
print <<XQ18;
By default, this program processes only NEWLY ARRIVED
$full_title files found in this directory. Messages in
these new digests are sorted and appended to the appropriate
".thr.txt" files in the "Threads" subdirectory.
However, by choosing method 'reprocess_ALL_digests()' you have
indicated that you wish to process ALL digest files found in this
directory -- regardless of whether or not they have previously been
processed. This is recommended ONLY for initialization and testing
of this program.
Since this will wipe out all threads files ('.thr.txt') as well --
including threads files for which you no longer have their source
digest files -- please confirm that this is your intent by typing
ALL at the prompt.
GOT IT?
XQ18
print qq{Hit 'Enter' -- or, to process ALL digests in this directory,
type 'ALL' and hit 'Enter': };
chomp ($choice = <STDIN>);
if ($choice eq 'ALL') {
print qq{
You have chosen to WIPE OUT all '.thr.txt' files currently
existing in the 'Threads' subdirectory and reprocess all
$full_title digest files from scratch.
Please re-confirm your choice by once again typing 'ALL'
and hitting 'Enter': };
chomp (my $confirm = <STDIN>);
if ($choice eq $confirm) {
print "\n Processing ALL digests in this directory!\n";
} else {
die "\n Choice not confirmed; exiting program. $!\n";
}
} else {
print "\n Processing new digest files only!\n";
$choice = '';
}
print "\n";
return $choice;
}
sub _main_processor {
my ($config_in_ref, $config_out_ref, $choice) = @_;
my $recentref = _archive_or_kill($config_out_ref);
my $digests_ref = _get_digest_list($config_in_ref, $config_out_ref);
my $in_out_ref = _prep_source_file(
$config_in_ref, $config_out_ref, $digests_ref); #v1.94
$in_out_ref = _get_log_data($config_out_ref, $choice, $in_out_ref);
my ($message_count, $thread_count);
($in_out_ref, $message_count, $thread_count) = _strip_down(
$in_out_ref,
$config_in_ref,
$config_out_ref,
$recentref,
);
_update_all_topics($choice, $config_out_ref, $in_out_ref);
_print_results(
scalar(keys %$in_out_ref),
$message_count,
$config_out_ref,
$thread_count,
);
}
sub _archive_or_kill {
my $config_out_ref = shift;
my $dir_threads = ${$config_out_ref}{'dir_threads'};
my $trigger = ${$config_out_ref}{'archive_kill_trigger'};
my $threshold = defined ${$config_out_ref}{'archive_kill_days'}
? ${$config_out_ref}{'archive_kill_days'}
: 14; # v1.95
my ($thr, %recent, %nonrecent, $recentref);
chdir($dir_threads) || die "cannot chdir to $dir_threads $!";
opendir THR, $dir_threads or die "cannot open $dir_threads: $!";
while ($thr = readdir THR) {
next unless ( ($thr =~ /\.thr\.txt$/) and (-f $thr) );
if ($trigger == 0) {
$recent{$thr}++;
} else {
-M $thr <= $threshold # v1.95
? $recent{$thr}++
: $nonrecent{$thr}++;
}
}
closedir THR or die "Cannot close $dir_threads: $!";
return \%recent if ($trigger == 0);
if ($trigger == 1) {
_archive_old_files($config_out_ref, \%nonrecent);
} elsif ($trigger == -1) {
_kill_old_files($config_out_ref, \%nonrecent);
} else {
die "$trigger is invalid value for archive_kill_trigger: $!";
}
return \%recent;
}
sub _archive_old_files {
my ($config_out_ref, $nonrecentref) = @_;
my $dir_threads = ${$config_out_ref}{'dir_threads'};
my $archfile = defined ${$config_out_ref}{'archived_today'}
? ${$config_out_ref}{'archived_today'}
: "${$config_out_ref}{'dir_digest'}/archived_today.txt";
my $dir_archive_top = ${$config_out_ref}{'dir_archive_top'};
die "Missing top archive directory: $!" unless (-d $dir_archive_top);
foreach ('a'..'z') {
die "Missing archive subdirectory $_: $!" unless (-d "$dir_archive_top/$_");
}
die "Missing archive subdirectory 'other': $!" unless (-d "$dir_archive_top/other");
open ARCH, ">$archfile" or die "Couldn't open $archfile for writing: $!";
print ARCH 'Archived today (', scalar(localtime), "):\n";
print ARCH '-' x 41, "\n";
my ($thr, $archstr);
my $toarchive = 0;
foreach $thr (sort keys %{$nonrecentref}) {
my $initial = lc(substr $thr, 0, 1);
print "Archiving: $thr\n";
$archstr .= $thr . "\n";
if ($initial =~ /[a-zA-Z]/) {
rename($thr, "$dir_archive_top/$initial/$thr") or die "Couldn't move $thr: $!";
} else {
rename($thr, "$dir_archive_top/other/$thr") or die "Couldn't move $thr: $!";
}
$toarchive++;
print "$toarchive files archived\n\n" if ($toarchive % 100 == 0);
}
print "$toarchive files archived\n\n";
$toarchive ? print ARCH $archstr : print ARCH "[None.]\n";
close ARCH or die "Couldn't close $archfile after writing: $!";
}
sub _kill_old_files {
my ($config_out_ref, $nonrecentref) = @_;
my $dir_threads = ${$config_out_ref}{'dir_threads'};
my $killfile = defined ${$config_out_ref}{'deleted_today'}
? ${$config_out_ref}{'deleted_today'}
: "${$config_out_ref}{'dir_digest'}/deleted_today.txt"; # v1.95
open KILL, ">$killfile" or die "Couldn't open $killfile for writing: $!";
print KILL 'Deleted today (', scalar(localtime), "):\n";
print KILL '-' x 40, "\n";
my ($thr, $killstr);
my $tokill = 0;
foreach $thr (sort keys %{$nonrecentref}) {
print "Unlinking: $thr\n";
$killstr .= $thr . "\n";
unlink $thr or die "Couldn't unlink $thr: $!";
$tokill++;
print "$tokill files deleted\n" if ($tokill % 100 == 0);
}
print "$tokill files deleted\n";
$tokill ? print KILL $killstr : print KILL "[None.]\n";
close KILL or die "Couldn't close $killfile after writing: $!";
}
sub _get_digest_list {
my ($config_in_ref, $config_out_ref) = @_;
opendir(DIR, ${$config_out_ref}{'dir_digest'}) || die "no ${$config_out_ref}{'dir_digest'}?: $!";
my @digests =
sort { lc($a) cmp lc($b) }
grep { /${$config_in_ref}{'grep_formula'}/ }
readdir(DIR);
closedir(DIR) || die "Could not close ${$config_out_ref}{'dir_digest'}: $!";
return \@digests;
}
sub _prep_source_file {
my ($config_in_ref, $config_out_ref, $digests_ref) = @_; # v1.94
# %in_out: hash of all instances in directory of a given digest,
# value refers to digest's title and its message topics
my (%in_out, $id);
foreach (@{$digests_ref}) {
$_ =~ m/${$config_in_ref}{'pattern_target'}/;
$id = eval(${$config_out_ref}{'id_format'}); # v1.94
$in_out{$id} = [ $_ ];
}
return \%in_out;
}
sub _identify_target_digest {
my ($config_in_ref, $config_out_ref,
$dig_number, $dig_entry, $digests_ref) = @_;
my ($hit);
foreach my $digfile (@{$digests_ref}) {
$digfile =~ m/${$config_in_ref}{'pattern_target'}/;
if (defined $2) {
next unless ($2 == $dig_number);
$hit = $digfile;
last;
} elsif ((defined $1) and (! defined $2)) {
next unless ($1 == $dig_number);
$hit = $digfile;
last;
} else {
die "Could'nt process digest filename to identify target digest: $!";
}
}
if (defined $hit) {
return $hit;
} else {
print STDERR "No ${$config_out_ref}{'title'} digest numbered $dig_number could be found in directory\n";
print STDERR " ${$config_out_ref}{'dir_digest'}\n";
exit 0;
}
}
sub _get_log_data {
my ($config_out_ref, $choice, $in_out_ref) = @_;
my $dir_digest = ${$config_out_ref}{'dir_digest'};
my $dir_threads = ${$config_out_ref}{'dir_threads'};
my $logfile = ${$config_out_ref}{'digests_log'};
my $readfile = defined ${$config_out_ref}{'digests_read'} # new in 1.95
? ${$config_out_ref}{'digests_read'}
: "$dir_digest/digests_read.txt";
# hash which pulls in data from an external log file that
# records which digests have been previously processed
my (%hashlog);
open(LOG, $logfile) || die "cannot open $logfile for reading: $!";
while (<LOG>) {
chomp;
my @entrydata = split(/;/);
$hashlog{$entrydata[0]} = [ @entrydata[1..$#entrydata] ];
}
close(LOG) || die "cannot close $logfile: $!";
foreach ( sort keys %$in_out_ref ) {
# if this is 1st time this digest has been seen for processing ...
if (! exists $hashlog{$_}) {
$hashlog{$_}[1] = $hashlog{$_}[0] = scalar localtime;
# if this digest has been seen for processing already ...
} else {
# either we're going to re-process every digest ...
if ($choice eq 'ALL') {
chdir($dir_threads) || die "cannot chdir to $dir_threads $!";
my ($thrfile);
opendir(THREADS, $dir_threads) || die "no $dir_threads?: $!";
while ($thrfile = readdir(THREADS) ) {
next unless $thrfile =~ /\.thr\.txt$/;
unlink $thrfile || warn "having trouble deleting $thrfile: $!";
}
closedir(THREADS) or die "Couldn't close $dir_threads: $!";
chdir($dir_digest) || die "cannot chdir to $dir_digest $!";
$hashlog{$_}[1] = scalar localtime;
# or we're only going to process new digest files
} else {
delete ${$in_out_ref}{$_};
}
}
}
_update_digests_log(\%hashlog, $logfile);
_update_digests_read(
${$config_out_ref}{'title'},
\%hashlog,
$readfile, # new in v1.95
) if ${$config_out_ref}{'digests_read_flag'};
return ($in_out_ref);
}
sub _update_digests_log { # must be supplied with ref to %hashlog
my ($hashlog_ref, $logfile) = @_;
my ($logstring);
foreach ( sort keys %$hashlog_ref ) {
# $logstring .= $_ . ';' . ${%$hashlog_ref}{$_}[0] . ';' .
# ${%$hashlog_ref}{$_}[1]. "\n";
$logstring .= $_ . ';' . ${$hashlog_ref}{$_}[0] . ';' .
${$hashlog_ref}{$_}[1]. "\n";
}
open(LOG, ">$logfile") || die "cannot open $logfile for writing: $!";
print LOG $logstring;
close(LOG) || die "cannot close $logfile: $!";
}
sub _update_digests_read { # must be supplied with $title and ref to %hashlog
my ($title, $hashlog_ref, $readfile) = @_;
my $readstring = '';
$readstring .= "$title Digest\n";
foreach ( sort keys %$hashlog_ref ) {
$readstring .= "\n$_:\n";
# $readstring .= " first processed at ${%$hashlog_ref}{$_}[0]\n";
# $readstring .= " most recently processed at ${%$hashlog_ref}{$_}[1]\n";
$readstring .= " first processed at ${$hashlog_ref}{$_}[0]\n";
$readstring .= " most recently processed at ${$hashlog_ref}{$_}[1]\n";
}
open(READ, ">$readfile") || die "cannot open $readfile for writing: $!";
print READ $readstring;
close(READ) || die "can't close $readfile:$!";
}
sub _strip_down {
my ($in_out_ref, $config_in_ref, $config_out_ref, $recentref) = @_;
my $MIME_cleanup_flag = ${$config_in_ref}{'MIME_cleanup_flag'};
my $topics_intro = ${$config_in_ref}{'topics_intro'};
my $post_topics_delimiter = ${$config_in_ref}{'post_topics_delimiter'};
my $source_msg_delimiter = ${$config_in_ref}{'source_msg_delimiter'};
my $subject_constant = ${$config_in_ref}{'subject_constant'}
if (defined ${$config_in_ref}{'subject_constant'});
my $archive_kill_trigger = ${$config_out_ref}{'archive_kill_trigger'};
my $dir_digest = ${$config_out_ref}{'dir_digest'};
my $dir_threads = ${$config_out_ref}{'dir_threads'};
my $thread_msg_delimiter = ${$config_out_ref}{'thread_msg_delimiter'};
my $optional_fields_ref = ${$config_out_ref}{'optional_fields'}
if (defined ${$config_out_ref}{'optional_fields'});
my $MIME_cleanup_log_flag = ${$config_out_ref}{'MIME_cleanup_log_flag'}
if (defined ${$config_out_ref}{'MIME_cleanup_log_flag'});
my (%recent, $mimelog, %optional_fields);
%recent = defined $recentref ? %$recentref : ();
if (defined $optional_fields_ref) {
my $i = 0;
foreach my $opt (@{$optional_fields_ref}) {
my $longkey = $opt . '_style_flag';
if (defined ${$config_in_ref}{$longkey}) {
next unless (${$config_in_ref}{$longkey} =~ /\^(.*?):/);
$optional_fields{$i} = [ $opt, $1 ];
$i++;
} else {
warn "WARNING:\n '$opt' is not available as a header field for digest ${$config_out_ref}{'title'}\n";
}
}
}
# Analysis of source message delimiter:
my $delimiter_core =
substr( $source_msg_delimiter, 0, index($source_msg_delimiter, "\n") );
my $message_count = 0;
my %seen = ();
my $seen_ref = \%seen;
my ($output_ref);
if ($MIME_cleanup_flag) {
$mimelog = defined ${$config_out_ref}{'mimelog'} # v1.96
? ${$config_out_ref}{'mimelog'}
: "${$config_out_ref}{'dir_digest'}/mimelog.txt";
if ($MIME_cleanup_log_flag) {
open MIME, ">$mimelog" or die "Couldn't open $mimelog for writing: $!";
print MIME <<MIMELOG;
Processed Problem
MIMELOG
}
}
chdir($dir_digest) || die "cannot chdir to $dir_digest $!";
foreach my $digest_no ( sort keys %$in_out_ref ) {
my (@newfile, %messages_sorted_by_thread);
my $file = ${$in_out_ref}{$digest_no}[0];
my ($bigstr, $digest_head, $digest_bal, @digest_header, @digest_balance);
open(IN, $file) || die "cannot open $file for reading: $!";
{
local $/ = undef;
$bigstr = <IN>;
}
close (IN) || die "can't close $file:$!";
if ($bigstr =~ /(.*?)$post_topics_delimiter(.*)/s) {
$digest_head = $1;
$digest_bal = $2;
} else {
die "Couldn't extract: $!";
}
@digest_header = split(/\n/, $digest_head);
@digest_balance = split(/$source_msg_delimiter/, $digest_bal);
pop @digest_balance;
$message_count += scalar(@digest_balance);
# extract topics listing
$in_out_ref = _prepare_todays_topics(
\@digest_header,
$topics_intro,
$delimiter_core,
$in_out_ref,
$digest_no,
);
# process each message in a digest file
foreach my $el (@digest_balance) {
# analyze message's header
my $header_ref = _analyze_message_header(
$el, $config_in_ref, $config_out_ref
);
# clean up message's title to eliminate characters
# forbidden as filenames on this system
my $thread = _clean_up_thread_title(
${$header_ref}{'subject'}, $subject_constant);
my $full_id = $digest_no . '_' . ${$header_ref}{'message_no'};
my $thread_full_id = lc($thread . $full_id);
# clean up message's text to eliminate MIME multiparts
my $text = _analyze_message_body(
$el, $MIME_cleanup_flag, $full_id, $MIME_cleanup_log_flag);
# add info to hash from which output will be generated
$messages_sorted_by_thread{$thread_full_id} = [
$thread,
$full_id,
$header_ref,
$text,
];
}
# prepare output for this digest file
foreach ( sort keys %messages_sorted_by_thread ) {
($seen_ref, $output_ref) = _prepare_output_string(
\%messages_sorted_by_thread,
$seen_ref,
$dir_threads,
$thread_msg_delimiter,
$output_ref,
\%optional_fields, # new in v1.67
);
}
}
if ($MIME_cleanup_log_flag) {
close MIME, ">$mimelog" or die "Couldn't close $mimelog after writing: $!";
}
# If I am not archiving a particular digest, then I would never be calling a
# thread file for that digest back from the archive.
# Hence, I can simply append.
if ($archive_kill_trigger == 0 or $archive_kill_trigger = -1) {
foreach (keys %{$output_ref}) {
open(NOARCH, ">>$_") || die "cannot open $_ for appending: $!";
print NOARCH ${$output_ref}{$_};
close(NOARCH) || die "can't close $_: $!";
}
} elsif ($archive_kill_trigger == 1) {
my $fromarchive = 0;
my $dearchfile = defined ${$config_out_ref}{'de_archived_today'}
? ${$config_out_ref}{'de_archived_today'}
: "${$config_out_ref}{'dir_digest'}/de_archived_today.txt";
my $dir_archive_top = ${$config_out_ref}{'dir_archive_top'};
my ($dearchstr);
open DEARCH, ">$dearchfile"
or die "Couldn't open $dearchfile for writing: $!";
print DEARCH 'De-archived today (', scalar(localtime), "):\n";
print DEARCH '-' x 44, "\n";
# 1st: See if recent thread exists; if so, open for appending
# 2nd: See if archive thread exists;
# if so, move from archive to current and open for appending
# [of course, if a thread has not been active for 14 days,
# we may wish to treat a message
# with the same name as a temporarily new thread and only append it
# when archiving once it's stale ]
# 3rd: If no recent/archive thread can be found, open new file for writing
foreach (keys %{$output_ref}) {
my ($stub);
if ($_ =~ m|[/\\]([^/\\]*)$|) {
$stub = $1;
} else {
die "Couldn't extract stub from $_: $!";
}
if ($recent{$stub}) {
open(OUT2, ">>$_") || die "cannot open $_ for appending: $!";
} else {
my ($initial, $newstub);
$initial = lc(substr $stub, 0, 1);
$newstub = "$dir_threads/$stub";
if ( ($initial =~ /[a-zA-Z]/) and
(-f "$dir_archive_top/$initial/$stub") ) {
rename("$dir_archive_top/$initial/$stub", $newstub ) or
die "Couldn't de-archive $stub: $!";
print "De-archiving: $stub\n";
$dearchstr .= $stub . "\n";
$fromarchive++;
open(OUT2, ">>$newstub") ||
die "cannot open $newstub for appending: $!";
} elsif (-f "$dir_archive_top/other/$stub") {
rename("$dir_archive_top/other/$stub", $newstub ) or
die "Couldn't de-archive $stub: $!";
print "De-archiving: $stub\n";
$dearchstr .= $stub . "\n";
$fromarchive++;
open(OUT2, ">>$newstub") ||
die "cannot open $newstub for appending: $!";
} else {
open(OUT2, ">$_") || die "cannot open $_ for writing: $!";
}
}
print OUT2 ${$output_ref}{$_};
close(OUT2) || die "can't close $_: $!";
}
print "$fromarchive files de-archived\n";
$fromarchive ? print DEARCH $dearchstr : print DEARCH "[None.]\n";
close DEARCH or die "Couldn't close $dearchfile after writing: $!";
} else {
die "Bad value for archive/kill trigger: $!";
}
return ($in_out_ref, $message_count, scalar(keys %{$seen_ref}));
}
sub _strip_down_for_reply {
my ($config_in_ref, $config_out_ref,
$digest_verified, $dig_entry, $dir_for_reply) = @_;
my $MIME_cleanup_flag = ${$config_in_ref}{'MIME_cleanup_flag'};
my $post_topics_delimiter = ${$config_in_ref}{'post_topics_delimiter'};
my $source_msg_delimiter = ${$config_in_ref}{'source_msg_delimiter'};
my $subject_constant = ${$config_in_ref}{'subject_constant'}
if (defined ${$config_in_ref}{'subject_constant'});
my $dir_digest = ${$config_out_ref}{'dir_digest'};
chdir($dir_digest) || die "cannot chdir to $dir_digest $!";
# slurp the digest file in, splitting on message delimiters
# so that each message is an array element
my ($bigstr, $digest_head, $digest_bal, @digest_header, @digest_balance);
open(IN, $digest_verified) ||
die "cannot open $digest_verified for reading: $!";
{
local $/ = undef;
$bigstr = <IN>;
}
close (IN) || die "can't close $digest_verified:$!";
if ($bigstr =~ /(.*?)$post_topics_delimiter(.*)/s) {
$digest_head = $1;
$digest_bal = $2;
} else {
die "Couldn't extract: $!";
}
@digest_balance = split(/$source_msg_delimiter/, $digest_bal);
pop @digest_balance;
my ($el, $replyfile);
while (defined ($el = shift @digest_balance)) {
# analyze message's header
my $header_ref =
_analyze_message_header($el, $config_in_ref, $config_out_ref); # v1.94
next unless (${$header_ref}{'message_no'} == $dig_entry);
# clean up message's title to eliminate characters
# forbidden as filenames on this system
my $thread = _clean_up_thread_title(
${$header_ref}{'subject'}, $subject_constant);
$replyfile = "$dir_for_reply/${thread}.reply.txt";
# clean up message's text to eliminate MIME multiparts
my $text = _analyze_message_body($el, $MIME_cleanup_flag, undef, 0);
my @lines = split(/\n/, $text);
my ($replytext);
foreach my $l (@lines) {
chomp($l);
$replytext .= '> ' . $l . "\n";
}
# print reply
my $old_fh = select(REPLY);
open REPLY, ">$replyfile" or die "Couldn't open $replyfile: $!";
if (defined ${$header_ref}{'reply_to'}) {
print "Reply-To:\n";
print "${$header_ref}{'reply_to'}\n\n";
} elsif (defined ${$header_ref}{'to'}) {
print "To:\n";
print "${$header_ref}{'to'}\n\n";
}
if (defined ${$header_ref}{'subject'}) {
my ($subject_clean);
if (${$header_ref}{'subject'} =~
/^(?:(Re2?|RE2?|re2?|FWD?|Fwd?|AW):?\s+)*(.*)$/) {
$subject_clean = $2;
} else {
$subject_clean = ${$header_ref}{'subject'};
}
print "Subject:\n";
print "$subject_clean\n\n";
}
print "On ${$header_ref}{'date'}, ${$header_ref}{'from'} wrote:\n\n";
print $replytext;
print "\n";
close REPLY or die "Couldn't close $replyfile: $!";
select $old_fh;
last;
}
return $replyfile;
}
sub _prepare_todays_topics {
my ($digest_header_ref, $topics_intro,
$delimiter_core, $in_out_ref, $digest_no) = @_;
my $counter = 0;
my @todays_topics = (); # empty out @todays_topics
foreach ( @{$digest_header_ref} ) {
if (m/^$topics_intro/) { # digest-specific
$counter = 1;
}
if ($counter == 1) {
if (m/^$topics_intro|^$/) { next; } # digest-specific
elsif ($_ !~ m/$delimiter_core/)
{ push (@todays_topics, $_); }
else { last; }
}
# ${%$in_out_ref}{$digest_no}[1] = [ @todays_topics ];
# # Note: this is 1st point at which ${%$in_out_ref}{$digest_no}[1]
${$in_out_ref}{$digest_no}[1] = [ @todays_topics ];
# Note: this is 1st point at which ${$in_out_ref}{$digest_no}[1]
# gets meaningful content
}
return $in_out_ref;
}
sub _analyze_message_header {
my ($el, $config_in_ref, $config_out_ref) = @_; # v1.94
my @all = split(/\n/, $el);
my ($hl, @lines);
while (defined ($hl = shift(@all)) ) {
last if $hl =~ /^\s*$/;
push(@lines, $hl);
}
my (%header, %init, $last_analyzed);
foreach my $key (keys %{$config_in_ref}) {
next unless ($key =~ /_style_flag$/);
my ($shortkey);
if ($key =~ /(.*)_style_flag$/) {
$shortkey = $1;
} else {
warn "Problem in analyzing message header: $!";
}
$init{$shortkey}++ unless defined ${$config_in_ref}{$key};
}
foreach (@lines) {
chomp;
my ($matched);
unless ($init{'message'}) {
if (/${$config_in_ref}{'message_style_flag'}/) {
$header{'message_no'} =
eval(${$config_out_ref}{'output_id_format'});
$init{'message'}++;
$last_analyzed = 'message';
$matched++;
}
}
unless ($init{'from'}) {
if (/${$config_in_ref}{'from_style_flag'}/) {
$header{'from'} = $1;
$init{'from'}++;
$last_analyzed = 'from';
$matched++;
}
}
unless ($init{'subject'}) {
if (/${$config_in_ref}{'subject_style_flag'}/) {
$header{'subject'} = $1;
$init{'subject'}++;
$last_analyzed = 'subject';
$matched++;
}
}
unless ($init{'to'}) {
if (/${$config_in_ref}{'to_style_flag'}/) {
$header{'to'} = $1;
$init{'to'}++;
$last_analyzed = 'to';
$matched++;
}
}
unless ($init{'reply_to'}) {
if (/${$config_in_ref}{'reply_to_style_flag'}/) {
$header{'reply_to'} = $1;
$init{'reply_to'}++;
$last_analyzed = 'reply_to';
$matched++;
}
}
unless ($init{'cc'}) {
if (/${$config_in_ref}{'cc_style_flag'}/i) {
$header{'cc'} = $1;
$init{'cc'}++;
$last_analyzed = 'cc';
$matched++;
}
}
unless ($init{'date'}) {
if (/${$config_in_ref}{'date_style_flag'}/) {
$header{'date'} = $1;
$init{'date'}++;
$last_analyzed = 'date';
$matched++;
}
}
unless ($init{'org'}) {
if (/${$config_in_ref}{'org_style_flag'}/) {
$header{'org'} = $1;
$init{'org'}++;
$last_analyzed = 'org';
$matched++;
}
}
unless ($matched) {
if ($last_analyzed ne 'subject') {
$_ =~ s/^\s+//;
$header{$last_analyzed} .= "\n" . ' ' x 14 . $_;
}
}
}
return \%header;
}
sub _clean_up_thread_title {
my $subj = shift;
my $subject_constant = shift if defined $_[0];
my ($thread, @thread);
$subj = "No subject" unless $subj; #messages on some lists can be subject-less
$subj =~
/^(?:(Re\d?|RE\d?|re\d?|Re\[\d?\]|RE\[\d?\]|re\[\d?\]|FWD?|Fwd?|AW):?\s+)*(.*)$/;
$thread = $2;
if (defined $subject_constant and $thread =~ /^$subject_constant\s+(.*)/) {
$thread = $1;
}
@thread = split(//, $thread);
if ($^O eq 'MSWin32') {
$thread = join("", (grep m/[^*|\\:"<>?\/]/, @thread) ); #"
}
if ($unix{$^O}) { # v2.08
$thread = join("", (grep m/[^\/]/, @thread) );
}
# squish repeated periods anywhere in file name
$thread =~ tr/././s;
# Win32 allows periods in file names,
# but I don't want any periods or spaces immediately before '.thr.txt'
# or at the beginning of the file name
$thread =~ s/[.\s]+$//;
$thread =~ s/^[.\s]+//;
# squish repeated whitespace anywhere in file name (EPP, Item 20, p. 76)
$thread =~ tr/ \n\r\t\f/ /s;
$thread = '[Illegal subject]' unless $thread;
return $thread;
}
sub _analyze_message_body {
my ($el, $MIME_cleanup_flag, $postid, $MIME_cleanup_log_flag) = @_;
my @chunks = split(/\n{2,}/, $el);
return join("\n\n", @chunks[1 .. ($#chunks)] )
unless $MIME_cleanup_flag;
my (@nextparts);
if ($chunks[1] =~ /Content-Type:\smultipart\/alternative/o) {
# New in v1.84 1/23/04
for (my $i=1; $i<=$#chunks; $i++) {
push(@nextparts, $i) if ($chunks[$i] =~ /Content-Type:/);
}
if (@nextparts == 4) {
print MIME "$postid CASE I\n" if $MIME_cleanup_log_flag;
splice @chunks, $nextparts[2], $nextparts[3] - $nextparts[2] + 1;
splice @chunks, 1, 2;
return join("\n\n", @chunks[1 .. ($#chunks-1)] );
} else {
print MIME ' ' x 30,
"$postid; count: ", sprintf("%3d", scalar(@nextparts)), " CASE I\n"
if $MIME_cleanup_log_flag;
return join("\n\n", @chunks );
}
} elsif ($chunks[1] =~ /--Apple-Mail-/o) { # New in v1.85 1/23/04
for (my $i=1; $i<=$#chunks; $i++) {
push(@nextparts, $i) if ($chunks[$i] =~ /--Apple-Mail-/o);
}
if (@nextparts == 3 or @nextparts == 4) {
print MIME "$postid CASE J\n" if $MIME_cleanup_log_flag;
my ($fragment);
if (@nextparts == 4) {
splice @chunks, $nextparts[-1], 1;
}
if ($chunks[$nextparts[1]] =~ /(.*?)--Apple-Mail-/os) {
$fragment = $1;
}
splice @chunks, $nextparts[1];
push @chunks, $fragment if ($fragment);
splice @chunks, $nextparts[0], 1;
return join("\n\n", @chunks[1 .. $#chunks] );
} else {
print MIME ' ' x 30, "$postid; count: ",
sprintf("%3d", scalar(@nextparts)), " CASE J\n"
if $MIME_cleanup_log_flag;
return join("\n\n", @chunks );
}
} elsif ($chunks[1] !~ /^This.+?message.+?MIME format/o) {
return join("\n\n", @chunks[1 .. ($#chunks)] );
} else {
if ($chunks[1] =~ /--=_alternative/) {
for (my $i=1; $i<=$#chunks; $i++) {
push(@nextparts, $i) if ($chunks[$i] =~ /--=_alternative/);
}
if (@nextparts == 3) {
print MIME "$postid CASE A\n" if $MIME_cleanup_log_flag;
splice @chunks,
$nextparts[1] + 1, $nextparts[2] - $nextparts[1] + 1;
$nextparts[1] =~ /^(.*\n)--=_alternative/s;
my $fragment = $1;
$chunks[$nextparts[1]] = $fragment;
splice @chunks, 1, 2;
return join("\n\n", @chunks[1 .. ($#chunks-1)] );
} else {
print MIME ' ' x 30, "$postid; count: ",
sprintf("%3d", scalar(@nextparts)), " CASE B\n"
if $MIME_cleanup_log_flag;
return join("\n\n", @chunks );
}
} elsif ($chunks[1] =~ /cryptographically\ssigned/) {
print MIME "$postid CASE H\n" if $MIME_cleanup_log_flag;
splice @chunks, -3, 2;
splice @chunks, 1, 2;
return join("\n\n", @chunks[1 .. ($#chunks-1)] );
} else {
for (my $i=2; $i<=$#chunks; $i++) {
push(@nextparts, $i) if (
$chunks[$i] =~ /-{4,6}[_\s]?=_NextPart|
--Boundary_|
--------------InterScan_NT_MIME_Boundary/x
);
}
if (@nextparts == 3) {
print MIME "$postid CASE C\n" if $MIME_cleanup_log_flag;
splice @chunks, $nextparts[1], $nextparts[2] - $nextparts[1] + 1;
splice @chunks, 1, 2;
return join("\n\n", @chunks[1 .. ($#chunks-1)] );
} elsif (@nextparts == 1) {
print MIME "$postid CASE D\n" if $MIME_cleanup_log_flag;
splice @chunks, 1, 1;
return join("\n\n", @chunks[1 .. ($#chunks-1)] );
} elsif (@nextparts == 5 or @nextparts == 6) {
print MIME "$postid CASE E\n" if $MIME_cleanup_log_flag;
splice @chunks, $nextparts[2], $nextparts[-1] - $nextparts[2] + 1;
splice @chunks, 1, 3;
return join("\n\n", @chunks[1 .. ($#chunks-1)] );
} elsif (@nextparts == 7 or @nextparts == 8) {
print MIME "$postid CASE F\n" if $MIME_cleanup_log_flag;
splice @chunks, $nextparts[3], $nextparts[-1] - $nextparts[3] + 1;
splice @chunks, 1, 3;
return join("\n\n", @chunks[1 .. ($#chunks-1)] );
} else {
print MIME ' ' x 30, "$postid; count: ",
sprintf("%3d", scalar(@nextparts)), " CASE G\n"
if $MIME_cleanup_log_flag;
return join("\n\n", @chunks[1 .. ($#chunks-1)] );
}
}
}
}
sub _prepare_output_string {
my ($threads_hash_ref, $seen_ref, $dir_threads, $thread_msg_delimiter,
$output_ref, $optional_fields_ref) = @_;
my %messages = %{$threads_hash_ref};
my %seen = %{$seen_ref};
my (%output, %opt_fields);
%output = %{$output_ref} if defined $output_ref;
%opt_fields = %{$optional_fields_ref};
my ($pathsep, $out, $lc_out, $outstr);
$pathsep = ($^O eq 'MSWin32') ? "\\" : '/';
$out = $dir_threads . $pathsep . $messages{$_}[0] . '.thr.txt';
$lc_out = lc($out);
$seen{$lc_out}++;
$outstr = "Thread: $messages{$_}[0]\n";
$outstr .= "Message: $messages{$_}[1]\n";
$outstr .= "From: $messages{$_}[2]{'from'}\n";
foreach my $i (sort keys %opt_fields) {
next unless (defined $messages{$_}[2]{$opt_fields{$i}[0]});
my $space = 13 - length($opt_fields{$i}[1]);
$outstr .= $opt_fields{$i}[1] . ':' . ' ' x $space .
"$messages{$_}[2]{$opt_fields{$i}[0]}\n";
}
$outstr .= 'Text:' . "\n\n" . $messages{$_}[3] . "\n";
$outstr .= "\n";
$outstr .= "$thread_msg_delimiter"
unless (! defined $thread_msg_delimiter);
$output{$out} .= $outstr;
return \%seen, \%output;
}
sub _update_all_topics {
my ($choice, $config_out_ref, $in_out_ref) = @_;
my $title = ${$config_out_ref}{'title'};
my $topicsfile = defined ${$config_out_ref}{'todays_topics'} # v1.96
? ${$config_out_ref}{'todays_topics'}
: "${$config_out_ref}{'dir_digest'}/todays_topics.txt";
my ($topic, $topicstring);
if ($choice eq 'ALL') {
$topicstring = "$title Digest: Today's Topics\n";
foreach ( sort keys %$in_out_ref ) {
$topicstring .= "\n${$in_out_ref}{$_}[0]\n";
foreach $topic ( @{${$in_out_ref}{$_}[1]} ) {
$topicstring .= "$topic\n";
}
}
open(TOPICS, ">$topicsfile")
|| die "cannot open $topicsfile for writing: $!";
print TOPICS $topicstring;
close(TOPICS) || die "can't close $topicsfile:$!";
} else {
$topicstring = '';
foreach ( sort keys %$in_out_ref ) {
$topicstring .= "\n${$in_out_ref}{$_}[0]\n";
foreach $topic ( @{${$in_out_ref}{$_}[1]} ) {
$topicstring .= "$topic\n";
}
}
open(TOPICS, ">>$topicsfile")
|| die "cannot open $topicsfile for appending: $!";
print TOPICS $topicstring;
close(TOPICS) || die "can't close $topicsfile:$!";
}
}
sub _print_results {
my ($total_digests_processed, $message_count,
$config_out_ref, $thread_count) = @_;
print <<XQ19;
RESULTS
Digests processed:\t\t$total_digests_processed
Messages processed:\t\t$message_count
Threads directory:\t\t${$config_out_ref}{'dir_threads'}
Threads created/modified:\t$thread_count
XQ19
}
sub _verify_date {
my $dateref = shift;
die "Incorrect date specification: $!"
unless (
(exists ${$dateref}{'year'}) &&
(exists ${$dateref}{'month'}) &&
(exists ${$dateref}{'day'})
);
die "${$dateref}{'year'} is incorrect year specification: $!"
unless (1900 <= ${$dateref}{'year'});
die "${$dateref}{'month'} is incorrect month specification: $!"
unless (
(1 <= ${$dateref}{'month'}) &&
(${$dateref}{'month'} <= 12)
);
die "${$dateref}{'day'} is incorrect day of month specification: $!"
unless (
(
${$dateref}{'day'} >= 1 and
${$dateref}{'day'} <= 28
)
||
(
$month31{${$dateref}{'month'}} and
${$dateref}{'day'} >= 29 and
${$dateref}{'day'} <= 31
)
||
(
$month30{${$dateref}{'month'}} and
${$dateref}{'day'} >= 29 and
${$dateref}{'day'} <= 30
)
||
(
${$dateref}{'month'} == 2 and
${$dateref}{'day'} == 29 and
(
${$dateref}{'year'} % 400 == 0 or
(
${$dateref}{'year'} % 100 != 0 and
${$dateref}{'year'} % 4 == 0
)
)
)
);
return timelocal(
0, 0, 0,
${$dateref}{'day'},
${$dateref}{'month'} - 1,
${$dateref}{'year'}
);
}
sub _get_array_of_messages {
my ($in, $delimiter) = @_;
my ($fh, $bigstr);
open $fh, $in or die "Couldn't open $in for reading: $!";
{
local $/ = undef;
$bigstr = <$fh>;
}
close $fh or die "Couldn't close $in after reading: $!";
my @messages = split(/$delimiter/, $bigstr);
return \@messages;
}
1;
############################ DOCUMENTATION #####################################
=head1 NAME
Mail::Digest::Tools - Tools for digest versions of mailing lists
=head1 VERSION
This document refers to version 2.12 of digest.pl, released May 14, 2011.
=head1 SYNOPSIS
use Mail::Digest::Tools qw(
process_new_digests
reprocess_ALL_digests
reply_to_digest_message
repair_message_order
consolidate_threads_multiple
consolidate_threads_single
delete_deletables
);
C<%config_in> and C<%config_out> are two configuration hashes whose setup
is discussed in detail below.
process_new_digests(\%config_in, \%config_out);
reprocess_ALL_digests(\%config_in, \%config_out);
$full_reply_file = reply_to_digest_message(
\%config_in,
\%config_out,
$digest_number,
$digest_entry,
$directory_for_reply,
);
repair_message_order(
\%config_in,
\%config_out,
{
year => 2004,
month => 01,
day => 27,
}
);
consolidate_threads_multiple(
\%config_in,
\%config_out,
$first_common_letters, # optional integer argument; defaults to 20
);
consolidate_threads_single(
\%config_in,
\%config_out,
[
'first_dummy_file_for_consolidation.thr.txt',
'second_dummy_file_for_consolidation.thr.txt',
],
);
delete_deletables(\%config_out);
=head1 DESCRIPTION
Mail::Digest::Tools provides useful tools for processing mail which an
individual receives in a 'daily digest' version from a mailing list.
Digest versions of mailing lists are provided by a variety of mail processing
programs and by a variety of list hosts. Within the Perl community, digest
versions of mailing lists are offered by such sponsors as Active State,
Sourceforge, Yahoo! Groups and London.pm. However, you do not have to be
interested in Perl to make use of Mail::Digest::Tools. Mail from I<any> of
the thousands of Yahoo! Groups, for example, may be processed with this module.
If, when you receive e-mail from the digest version of a mailing list, you
simply read the digest in an e-mail client and then discard it, you may stop
reading here. If, however, you wish to read or store such mail by subject,
read on. As printed in a normal web browser, this document contains 40
pages of documentation. You are urged to print this documentation out and
study it before using this module.
To understand how to use Mail::Digest::Tools, we will first take a look at a
typical mailing list digest. We will then sketch how that digest looks once
processed by Mail::Digest::Tool. We will then discuss Mail::Digest::Tool's
exportable functions. Next, we will study how to prepare the two configuration
hashes which hold the configuration data. Finally, we will provide some tips
for everyday use of Mail::Digest::Tools.
=head1 A TYPICAL MAILING LIST DIGEST
Here is a dummied-up version of a typical mailing list digest as it appears
once saved to a plain-text file. For illustrative purposes, let us suppose
that the file is named: 'Perl-Win32-Users Digest, Vol 1 Issue 9999.txt'
Send Perl-Win32-Users mailing list submissions to
perl-win32-users@listserv.ActiveState.com
When replying, please edit your Subject line so it is more specific
than "Re: Contents of Perl-Win32-Users digest..."
Today's Topics:
1. Introducing Mail::Digest::Tools (James E Keenan)
2. A Different Discussion (steve)
3. Re: Introducing Mail::Digest::Tools (David H Adler)
----------------------------------------------------------------------
Message: 1
From: "James E Keenan" <jkeen@some.web.address.com>
To: <Perl-Win32-Users@listserv.activestate.com>
Subject: Introducing Mail::Digest::Tools
Date: Sat, 31 Jan 2004 14:10:20 -0600
Mail::Digest::Tools is the greatest thing since sliced bread.
Go download it now!
------------------------------
Message: 2
From: "steve" <steve@some.web.address.com>
To: <Perl-Win32-Users@listserv.activestate.com>
Subject: A Different Discussion
Date: Sat, 31 Jan 2004 14:40:20 -0600
This is a new topic. I am not discussing Mail::Digest::Tools in this
submission.
------------------------------
Message: 3
From: "David H Adler" <dha@some.web.address.com>
To: <Perl-Win32-Users@listserv.activestate.com>
Subject: Re: Introducing Mail::Digest::Tools
Date: Sat, 31 Jan 2004 14:50:20 -0600
Jim, what's this nonsense about sliced bread. Weren't you on the Atkins
diet? Unlike beer, sliced bread is Off Topic.
------------------------------
_______________________________________________
Perl-Win32-Users mailing list
Perl-Win32-Users@listserv.ActiveState.com
To unsubscribe: http://listserv.ActiveState.com/mailman/mysubs
End of Perl-Win32-Users Digest
Note that the digest has an I<overall> structure, while each message I<within>
the digest has its own structure.
The digest's overall structure consists of:
=over 4
=item *
I<Digest Header>
The digest header consists of one or more paragraphs providing instructions
on how to subscribe, post messages, unsubscribe and contact the list
administrator.
In processing a digest, Mail::Digest::Tools generally discards the digest
header.
=item *
I<Today's Topics>
Next, each daily digest contains a list of the subjects of the messages found
in that particular digest. This list is introduced by a paragraph such as:
Today's Topics
and is followed by a numbered list of the message subjects and authors. Some
digests break the authors into two lines for names and e-mail addresses.
Others, such as the example above, list only names.
When Mail::Digest::Tools process a digest, it extracts the list of topics as a
single chunk and appends it to a file containing the topics from all previous
digests which the user has similarly processed.
=item *
I<Post-Topics Delimiter>
The list of topics is separated from the first message by a string of
characters which the list sponsor has, we hope, determined is not likely to
occur in the text of any message posted to that list. In the example above,
the source message delimiter is the string:
----------------------------------------------------------------------
followed by two C<\n> newlines (so that the delimiter is a paragraph unto
itself). Other digests may use a two-line delimiter such as:
_______________________________________________________
_______________________________________________________
or
--__--__--
=item *
I<Source Message Delimiter>
Most mailing list digests use the same string to delimit individual messages
within the digest that they use to delimit the list of today's topics from the
very first message in the digest. (The author tracked one digest for more
than three-and-a-half years that used the same string for both functions --
only to see that digest's provider change its format while this module was
being prepared for CPAN!) But the digest may use a different string to
separate individual messages from each other. In the sample digest above,
the source message delimiter is the string:
------------------------------
followed by two C<\n> newlines (so that the delimiter is a paragraph unto
itself).
As we shall see below, correctly identifying the post-topics delimiter and
source message delimiter used in a particular digest is essential to correct
configuration of Mail::Digest::Tools, as the module will repeatedly C<split>
digests on this delimiter.
=item *
I<Individual Messages>
Individual messages have their own structure.
=over 4
=item *
I<Headers>
In addition to normal mail headers, a message in a digest must have a
message number representing its position within that day's digest. So a
message in a digest will typically have some or all of the following headers:
Message:
From:
Organization:
Reply-To:
To:
CC:
Date:
Subject:
=item *
I<Message Body>
One of more paragraphs of text, frequently including citations from earlier
postings to the mailing list.
The main objective of Mail::Digest::Tools is to extract headers and bodies
from particular digest entries and to append them to plain-text files which
hold all postings on a particular subject. See discussion of
C<process_new_digests> below.
Many mailing lists allow subscribers to post in either plain-text or HTML.
Some allow users to post attachments; others do not. Others still
incorporate the attachments into the message body, often using 'multipart
MIME' format. Regrettably, certain mailing list digest programs fail to
eliminate redundant MIME parts before posting a message to a digest. This
leads to severe bloat once Mail::Digest::Tools extracts a message's content
and posts it to a thread file. Mail::Digest::Tools, however, provides its
users with the option of stripping redundant MIME parts from a message
before posting.
=item *
I<Source Message Delimiter>
As discussed above, each message within a digest is delimited by a string
which may or may not be the same string which separates the list of Today's
Topics from the first message in the digest.
=back
=item *
I<Digest Footer>
The digest footer consists of one or more paragraphs containing
additional information on the digest and signaling the end of the digest. It
follows the source message delimiter corresponding to the last message in a
particular digest.
In processing a given digest, Mail::Digest::Tools generally discards the
digest footer.
=back
=head2 The Typical Digest After Processing with Mail::Digest::Tools
Using the dummy messages provided above, typical use of Mail::Digest::Tools
would produce (in a bare-bones configuration) the following results:
=over 4
=item *
Two plain-text 'thread' files holding the ongoing discussion of each topic:
=over 4
=item *
F<Introducing Mail::Digest::Tools.thr.txt>
Thread: Introducing Mail::Digest::Tools
Message: 001_9999_001
From: "James E Keenan" <jkeen@some.web.address.com>
Text:
Mail::Digest::Tools is the greatest thing since sliced bread.
Go download it now!
--__--__--
Thread: Introducing Mail::Digest::Tools
Message: 001_9999_003
From: "David H Adler" <dha@some.web.address.com>
Text:
Jim, what's this nonsense about sliced bread. Weren't you on the Atkins
diet? Unlike beer, sliced bread is Off Topic.
--__--__--
=item *
F<A Different Discussion.thr.txt>
Thread: A Different Discussion
Message: 001_9999_002
From: "steve" <steve@some.web.address.com>
Text:
This is a new topic. I am not discussing Mail::Digest::Tools in this
submission.
--__--__--
=back
=item *
A new entry at the end of file F<todays_topics.txt>:
Today's Topics
...
Perl-Win32-Users digest, Vol 1 #9999 - 3 msgs.txt
1. Introducing Mail::Digest::Tools (James E Keenan)
2. A Different Discussion (steve)
3. Re: Introducing Mail::Digest::Tools (David H Adler)
=item *
A new entry at the end of file F<digests_log.txt>:
001_9999;Fri Feb 6 18:57:41 2004;Fri Feb 6 18:57:41 2004
=back
=head1 FUNCTIONS
Mail::Digest::Tools exports no functions by default. Each of its current
seven functions is imported only on request by your script.
In everyday use, you will probably call just I<one> of Mail::Digest::Tool's
exportable functions in a particular Perl script. Typically, you will import
the function as described in the SYNOPSIS above, populate two configuration
hashes, and finally call the one function you have imported.
As will become evident, the most challenging part of using Mail::Digest::Tools
is I<not> calling the functions. Rather, it is the initial setup and testing
of configuration files from which the two configuration hashes passed as
arguments to the various Mail::Digest::Tools functions are drawn.
More on those configuration hashes later. For now, let's look at the
exportable functions.
=head2 C<process_new_digests>
process_new_digests(\%config_in, \%config_out);
C<process_new_digests()> is the Mail::Digest::Tools function which you will
use most frequently on a daily basis. Based on information supplied in the
two configuration hashes passed to it as arguments, C<process_new_digests()>
does the following:
=over 4
=item *
Validates the configuration data.
=item *
Conducts an analysis of the directory in which thread files for a given
digest are stored to determine are old enough:
=over 4
=item *
I<either> to be moved to a subdirectory for archiving -- if you have told the
configuration file that you wish to archive older threads in a subdirectory
=item *
I<or> to be deleted -- if you have told the configuration file that you do
I<not> wish to archive older threads
=back
=item *
Conducts an analysis of the directory in which digest files (I<i.e.,> the
plain-text versions of mailing list digests you have received) are stored to
determine which digest files are new and need processing and which have
previously been processed.
=item *
Updates a log file to put a timestamp on the processing of the new digest
file or files. Based on options set in the configuration file, this function
may also update a more human-readable version of this log file.
=item *
Opens each of the digest files identified as needing processing and proceeds
to 'strip down' those files. This 'stripping down' includes the following:
=over 4
=item *
The digest file's name is analyzed to extract the digest's number as issued by
the provider's mailing list program. This number is used to form part of the
unique identifier which Mail::Digest::Tools assigns to each message within
each digest.
=item *
The list of today's topics in the digest is extracted and appended to a
permanent log file of such topics.
=item *
The digest's contents are split into individual messages. Each message, in
turn, is split into headers and body.
=item *
If you have requested in the configuration file that superfluous multipart
MIME content be purged from messages before posting to thread files, this
purging is now conducted.
=item *
Each message is appended to an appropriate, plain-text thread file which
holds the ongoing discussion of that topic. The following factors are taken
into consideration:
=over 4
=item *
The name of the thread file is derived from the message's subject, though
characters in the message's subject which would not be valid in file names
on your operating system are skipped over.
=item *
To the greatest extent possible, extraneous words in a message's subject
such as 'Re:' or 'Fwd:' are deleted so that all relevant postings on a given
subject can be included in a single thread file. (Should this not succeed
and a new thread file beginning with 'Re:' or some similar term be created,
you can fix this later by using Mail::Digest::Tool's
C<consolidate_threads_single()> function discussed below.)
=back
=item *
A brief summation of results is printed to standard output.
=back
=back
=head2 C<reprocess_ALL_digests>
reprocess_ALL_digests(\%config_in, \%config_out);
C<reprocess_ALL_digests()> is the Mail::Digest::Tools function which you
should use ONLY when you are setting up and fine-tuning Mail::Digest::Tools
to process a given digest -- and you should NEVER use it thereafter!
Why? Read on!
C<reprocess_ALL_digests()> does almost exactly the same things as does
C<process_new_digests()>, but it does them on ALL digest files found in the
directory in which you store such digests -- not just on those previously
processed. But in the process it does not merely append new messages to
already existing thread files, leaving older thread files untouched. Instead,
C<reprocess_ALL_digests()> WIPES OUT your entire directory of thread files and
rebuilds it from scratch.
That's cool if you have retained all instances of a given digest which you
wish to process into thread files. But if you've thrown out older instances
of a given digest and call C<reprocess_ALL_digests()>, you will not be able
to process the messages contained in those discarded digests. The message
sources are gone. That's cool once you're certain that you've got a given
digest configured just the way you want it -- but not until that moment.
=over 4
=item * Example
Let's make this more concrete. Suppose that you have begun to subscribe to
the digest version of the London Perlmongers mailing list. When you receive
e-mails from this provider, you store them in a directory whose contents look
like this:
london.pm digest, Vol 1 #1856 - 7 msgs.txt
london.pm digest, Vol 1 #1857 - 18 msgs.txt
london.pm digest, Vol 1 #1858 - 15 msgs.txt
london.pm digest, Vol 1 #1859 - 17 msgs.txt
london.pm digest, Vol 1 #1860 - 11 msgs.txt
Initially, you decide that you want to post the messages in these digests
to thread files that are discarded after three days. You set up your
configuration files to do precisely this. (See below for how this is done.)
You then write a script which calls
reprocess_ALL_digests(\%config_in, \%config_out);
Three days go by. One or two new london.pm digests arrive each day. You
want to process only the newly arrived files, so each day you simply call:
process_new_digests(\%config_in, \%config_out);
and on Day 4 Mail::Digest::Tools starts to notify you on standard output
that it is discarding thread files which have not been changed (I<i.e.,>
received new postings) in three days.
But then you decide that London.pm's contributors are the most witty and
erudite Perlmongers anywhere and you wish to archive their contributions
until the end of time (or until the first production release of
Perl 6, whichever comes first). Fortunately, you've still got all your
London.pm digest files going back to the beginning of your subscription.
You make appropriate changes to your configuration setup to say, ''Instead
of killing these thread files after 3 days of inactivity, archive them after
3 days instead.'' (Again, we'll see how to do this below.) You then call:
reprocess_ALL_digests(\%config_in, \%config_out);
one last time. All your previously existing thread files are wiped out, and
all your London.pm digests are reprocessed from scratch. But that's okay,
because you've decided to live with your configuration decisions. So you
can now begin to discard older digest files and process newly arrived files
only with
process_new_digests(\%config_in, \%config_out);
Your London.pm thread archive grows exponentially, and you live happily ever
after.
=back
The ALL CAPS in C<reprocess_ALL_digests()> is a little warning that this
Mail::Digest::Tools function is very powerful, but potentially very dangerous.
You are also alerted to this danger by this screen prompt which appears when
you call this function:
By default, this program processes only NEWLY ARRIVED
[London.pm/other digest] files found in this directory. Messages in
these new digests are sorted and appended to the appropriate
'.thr.txt' files in the 'Threads' subdirectory.
However, by choosing method 'reprocess_ALL_digests()' you have
indicated that you wish to process ALL digest files found in this
directory -- regardless of whether or not they have previously been
processed. This is recommended ONLY for initialization and testing
of this program.
Since this will wipe out all threads files ('.thr.txt') as well --
including threads files for which you no longer have their source
digest files -- please confirm that this is your intent by typing
ALL at the prompt.
GOT IT?
To proceed, you must type C<ALL> in ALL CAPS, hit C<[Enter]>, then respond to
yet another prompt:
You have chosen to WIPE OUT all '.thr.txt' files currently
existing in the 'Threads' subdirectory and reprocess all
[London.pm/other digest] digest files from scratch.
Please re-confirm your choice by once again typing 'ALL'
and hitting [Enter]:
You must again type C<ALL> in ALL CAPS and hit C<[Enter]> to reprocess all
digests. Should you fail to type C<ALL> at both of these prompts, your
script will default to C<process_new_digests()> and only process newly
arrived digest files.
=head2 C<reply_to_digest_message>
$full_reply_file = reply_to_digest_message(
\%config_in,
\%config_out,
$digest_number,
$digest_entry,
$directory_for_reply,
);
Once you have begun to follow discussion threads on a mailing list with the
aid of Mail::Digest::Tools, you may wish to join the discussion and reply to
a message.
If you tried to do this by hitting the 'Reply' button in your e-mail client,
you would probably end up with a 'Subject' line in your e-mail that looked
this:
Re: london.pm digest, Vol 1 #1814 - 2 msgs
Needless to say, this is tacky. So tacky that many mailing list digest
programs insert this message into each digest's headers:
When replying, please edit your Subject line so it is more specific
than "Re: Contents of london.pm digest, Vol 1, #xxxx..."
You don't want to be tacky; you want to be lazy. You want Perl to do the
work of initiating an e-mail with a meaningful subject header for you.
Mail::Digest::Tool's C<reply_to_digest_message> does just this. It creates
a plain-text file for you that has a meaningful subject line and prepends
each line of the body of the message with C<\> >. You then open this
plain-text file, edit it to reply to its contents, copy-and-paste it into
your e-mail client, and send it.
The arguments passed to C<reply_to_digest_message()> are:
=over 4
=item *
a reference to the 'in' configuration hash
=item *
a reference to the 'out' configuration hash
=item *
the number of the digest containing the message to which you are replying
=item *
the number of the message to which you are replying within that digest
=item *
a path to the directory in which you want the plain-text reply file to be
created
=back
=over 4
=item * Example
Suppose that you wished to reply to message #2 in London.pm digest #1814:
Message: 2
From: James E Keenan <jkeen@some.web.address.com>
To: London Perlmongers <london.pm@london.pm.org>
Date: Fri, 2 Jan 2004 23:41:01 -0500
Subject: re: language courses
Reply-To: london.pm@london.pm.org
On Fri, 2 Jan 2004 22:38:40 +0000 (GMT), Ali Young wrote concerning:
language courses
> Depends what you count as useful. Learning Esperanto means that you
> can read the current London.pm website.
BTW, wasn't the Esperanto on the website supposed to expire on 31 Dec?
Jim Keenan
Brooklyn, NY
You would call the function as follows:
$full_reply_file = reply_to_digest_message(
\%config_in,
\%config_out,
1814,
2,
'/home/jimk/mail/digest/london',
);
Mail::Digest::Tools will then create a plain-text file which you can use as
the first draft of your reply. It will print this screen prompt:
To complete reply, edit text in:
/home/jimk/mail/digest/london/language_courses.reply.txt
When you open F<language_courses.reply.txt> in your text editor, it will look
like this:
Reply-To:
london.pm@london.pm.org
Subject:
language courses
On Fri, 2 Jan 2004 23:41:01 -0500, James E Keenan
<jkeen@some.web.address.com> wrote:
> On Fri, 2 Jan 2004 22:38:40 +0000 (GMT), Ali Young wrote concerning:
> language courses
>
> > Depends what you count as useful. Learning Esperanto means that you
> can
> > read the current London.pm website.
>
> BTW, wasn't the Esperanto on the website supposed to expire on 31 Dec?
>
> Jim Keenan
> Brooklyn, NY
>
The 'Reply-To' and 'Subject' paragraphs are provided simply to give you
something to cut-and-paste into a GUI e-mail client. The 'Reply-To'
paragraph will only appear if in C<%config_in> the key
C<reply_to_style_flag> is defined for a particular digest.
You edit this plain-text file, pop it into the body of your e-mail
window and send it. Not elegant, but it at least gives you a first draft.
=back
=head2 C<repair_message_order>
repair_message_order(
\%config_in,
\%config_out,
{
year => 2004,
month => 01,
day => 27,
}
);
From time to time you may receive digest versions of mailing lists out of
chronological/numerical sequence. This is especially true when e-mail
traffic is being disrupted by worms or viruses. You may discover that you
have received and processed
london.pm digest, Vol 1 #1856 - 7 msgs
london.pm digest, Vol 1 #1858 - 15 msgs
before realizing that you were missing
london.pm digest, Vol 1 #1857 - 18 msgs
If you were to now process digest 1857 with C<process_new_digests()>, messages
from that digest would be appended to their respective thread files I<after>
messages from digest 1858. Since the whole point of Mail::Digest::Tools is to
be able to read a discussion thread in chronological order, this would not be
desirable.
Fortunately, you can fix this problem as follows:
=over 4
=item * Apply C<process_new_digests()>
Call C<process_new_digests()> as you normally would. In the above example,
go ahead and call it on digest 1857 even though it creates thread files with
messages out of chronological order.
=item * Determine date where need for repair begins
Examine the timestamps on your digest files for the date of the first digest
you received out of sequence. In the above example, that would be the date
of digest 1858. Since digest files were received out of proper sequence on or
after that date, all thread files generated after that date may have
out-of-sequence messages and need re-ordering.
=item * Apply C<repair_message_order()> with the repair date
Call C<repair_message_order()> with the following arguments:
=over 4
=item *
a reference to the 'in' configuration hash
=item *
a reference to the 'out' configuration hash
=item *
a reference to an anonymous hash whose keys are C<year>, C<month> and C<day>,
the values for which keys are the elements of the repair date.
=back
Mail::Digest::Tools will examine all thread files from midnight local time on
that date. Where messages have been posted to the thread files out of proper
sequence, they will be reposted in the correct order. The thread file with
the correct sequence will overwrite the file with the incorrect sequence.
=back
=head2 C<consolidate_threads_multiple>
consolidate_threads_multiple(
\%config_in,
\%config_out,
);
or
consolidate_threads_multiple(
\%config_in,
\%config_out,
$first_common_letters, # optional integer argument
);
As described above, Mail::Digest::Tool's C<process_new_digests()> function
will, to the greatest extent possible, delete extraneous words such as 'Re:'
or 'Fwd:' from a message's subject so that all relevant postings on a given
subject can be included in a single thread file. What happens when this is
not sufficient? For example, suppose someone posts a message to a list with a
slightly misspelled or altered subject line:
=over 4
=item * Original thread file:
Help telnetting to remote host through CGI.thr.txt
=item * Thread file created due to altered subject line:
Help telnetting to remote host thru CGI.thr.txt
=back
Mail::Digest::Tools offers two functions to address this problem.
C<consolidate_threads_multiple()> is the easier to use and will be discussed
first. This function presumes that people who re-type e-mail subject lines
when replying tend to type the first several words correctly, then make errors
or alterations toward the end of the subject line. If the first I<n> letters
of the subject line of two or more messages are identical, there is a strong
chance that the messages are discussing the same topic and should be posted to
the same discussion thread. Mail::Digest::Tool's default value for I<n> is
20, but you can set a different value for a particular digest by passing an
optional third argument as shown above. C<consolidate_threads_multiple()>
accordingly:
=over 4
=item *
Makes a list of all thread files for a particular digest.
=item *
Identifies groups of thread files whose names share the first 20 letters.
=item *
Displays a prompt on standard output asking you whether you wish to
consolidate the files in each such group:
Candidates for consolidation:
Help telnetting to remote host through CGI.thr.txt
Help telnetting to remote host thru CGI.thr.txt
To consolidate, type YES:
=over 4
=item *
If you type C<YES> in ALL CAPS, the files will be consolidated into a single
thread file whose name will be derived from the Subject line of the very first
posting to the discussion thread. Standard output will display:
Files will be consolidated
=item *
If you type anything other than C<YES> in ALL CAPS -- or simply hit C<[Enter]>,
then the files will not be consolidated and standard output will display:
Files will not be consolidated
=item *
If the files are consolidated, the original thread files will not automatically
be deleted. Rather, they are renamed with the extension C<.DELETABLE>.
Help telnetting to remote host through CGI.thr.txt.DELETABLE
Help telnetting to remote host thru CGI.thr.txt.DELETABLE
This is a safety precaution. The user can then delete the deletable files
by calling the C<delete_deletables()> function discussed below.
=back
=item *
If there are no files in the threads directory which share the first 20 letters
in common (or the first I<n> letters if you have passed the optional third
argument), then you are warned at standard output:
Analysis of the first 20 letters of each file in
[threads directory]
shows no candidates for consolidation. Please hard-code
names of files you wish to consolidate as arguments to
&consolidate_threads_single
=back
=head2 C<consolidate_threads_single>
consolidate_threads_single(
\%config_in,
\%config_out,
[
'first_dummy_file_for_consolidation.thr.txt',
'second_dummy_file_for_consolidation.thr.txt',
],
);
Suppose that the thread files which you wish to consolidate have names whose
spelling diverges before the 21st letter. The algorithm which
C<consolidate_threads_multiple()> applies would not detect the potential
rationale for consolidation. This could happen when someone tries to change
the subject of discussion from:
Best book for extreme Newbie to programming
to:
De incunabula nostra (Was Best book for extreme Newbie to programming)
I<Solution:> Hard-code the files to be consolidated as elements of an
anonymous array. Pass a reference to that anonymous array as the third
argument to C<consolidate_threads_single()> as shown above.
As with C<consolidate_threads_multiple()>, the resulting consolidated file
will bear the name of the source file containing the very first posting to
the discussion thread. The files so consolidated will not automatically be
deleted. Rather, they will be renamed with the extension C<.DELETABLE> as a
safety precaution and left for you to delete with C<delete_deletables()>.
=head2 C<delete_deletables>
delete_deletables(\%config_out);
Mail::Digest::Tools function C<delete_deletables()> tidies up after use of
either C<consolidate_threads_multiple()> or C<consolidate_threads_single()>.
Unlike all other public functions provided by Mail::Digest::Tools,
C<delete_deletables()> needs to be passed a reference to only one of the
two configuration hashes, I<viz.,> the 'out' configuration hash. The
function simply changes to the directory where thread files for a given
digest are stored and deletes all files with the extension C<.DELETABLE>.
=head1 CONFIGURATION SETUP OVERVIEW
To use a Mail::Digest::Tool function, you need to answer two fundamental
questions:
=over 4
=item 1
What internal structure has the mailing list sponsor provided for a given
digest?
=item 2
How do I want to structure the results of applying Mail::Digest::Tools to a
particular digest on my system?
=back
Each of these two questions breaks down into sub-parts. Their answers
supply you with the information with which you will construct the two
configuration hashes passed to most Mail::Digest::Tools functions.
Let us take each in turn.
=head1 C<%config_in>: THE INTERNAL STRUCTURE OF A DIGEST
The best way to learn about the internal structure of a mailing list digest
(other than to study the application which created the digest in the first
place) is to accumulate several instances of the digest on your system in a
directory devoted to that purpose. Examine the way the digest's filename is
formed. Then examine the digest file itself. You will soon pick up a feel
for the structure of the digest, which will guide you in configuring
Mail::Digest::Tools for your system. That configuration will take the form
of a Perl hash which, for illustrative purposes, we shall here call
C<%xxx_config_in> where C<xxx> is a short-hand title for a particular digest.
For heuristic purposes we will examine the characteristics of two mailing
list digests which the author has been following and archiving for several
years: ActiveState's 'Perl-Win32-Users' digest and Yahoo! Groups' Perl
Beginners group digest.
=head2 Analysis of Digest's File Name
We must study a digest's file name in order to be able to write a pattern
with which we will be able to distinguish a digest file from any non-digest
file sitting in the same directory, as well as to be able to extract the
digest number from that file name.
Once saved as plain-text files, Perl-Win32-Users digest files typically look
like this in a directory:
Perl-Win32-Users Digest, Vol 1 Issue 1771.txt
Perl-Win32-Users Digest, Vol 1 Issue 1772.txt
Similarly, the Perl Beginner digest files look like this:
[PBML] Digest Number 1491.txt
[PBML] Digest Number 1492.txt
To correctly identify Perl-Win32-Users digest files from any other files in
the same directory, we compose a string which would form the core of a Perl
regular expression, I<i.e.,> everything in a pattern except the outer
delimiters. Internally, Mail::Digest::Tools passes the file name through a
C<grep { /regexp/ }> pattern, so the first key is called C<grep_formula>.
%pw32u_config_in = (
grep_formula => 'Perl-Win32-Users Digest',
...
);
The equivalent pattern for the Perl Beginners digest would be:
%pbml_config_in = (
grep_formula => '\[PBML\]',
...
);
Note that the C<[> and C<]> characters have to be escaped with a C<\>
backslash because they are normally metacharacters inside Perl regular
expressions.
We next have to extract the digest number from the digest's file name.
Certain mailing list programs give individual digests both a 'Volume' number
as well as an individual digest number. Perl-Win32-Users typifies this. In
the example above we need to capture both the C<1> as volume number and C<1771>
as digest number. The next key in our configuration hash is called
C<pattern_target>:
%pw32u_config_in = (
grep_formula => 'Perl-Win32-Users Digest',
pattern_target => '.*Vol\s(\d+),\sIssue\s(\d+)\.txt',
...
);
Note the two sets of capturing parentheses.
Other digests, such as those at Yahoo! Groups, dispense with a volume number
and simply increment each digest number:
%pbml_config_in = (
grep_formula => '\[PBML\]',
pattern_target => '.*\s(\d+)\.txt$',
...
);
Note that this C<pattern_target> contains only one pair of capturing
parentheses.
=head2 Analysis of Digest's Internal Structure
A digest's internal structure is discussed in detail above (see
'A TYPICAL MAILING LIST DIGEST'). Here we need to identify two
characteristics: the way the digest introduces its list of today's topics
and the string it uses to delimit the list of today's topics from the first
individual message in the digest and all subsequent messages from one another.
Continuing with our two examples from above, we provide values for keys
C<topics_intro> and C<source_msg_delimiter>:
%pw32u_config_in = (
grep_formula => 'Perl-Win32-Users digest',
pattern_target => '.*Vol\s(\d+),\sIssue\s(\d+)\.txt',
topics_intro => 'Today\'s Topics:',
source_msg_delimiter => "--__--__--\n\n",
...
);
Note the escaped C<'> apostrophe character in the value for key
C<topics_intro>.
%pbml_config_in = (
grep_formula => '\[PBML\]',
pattern_target => '.*\s(\d+)\.txt$',
topics_intro => 'Topics in this digest:',
source_msg_delimiter => "________________________________________________________________________\n________________________________________________________________________\n\n",
...
);
Note that the values provided for the respective C<source_msg_delimiter> keys
had to be double-quoted strings. That's because all such delimiters include
two or more C<\n> newline characters so that they form paragraphs unto
themselves. Unless indicated otherwise, the values for all other values in
the configuration hash are single-quoted strings.
Note: In early 2004, while Mail::Digest::Tools was being prepared for its
initial distribution on CPAN, ActiveState changed certain features in the
daily digest versions of its mailing lists. Hence, the code example presented
above should not be 'copied-and-pasted' into a configuration hash with which
you, the user, might follow the current Perl-Win32-Users digest. In
particular, the source message delimiter was changed to a string of 30
hyphens followed by 2 C<\n> newline characters:
"------------------------------\n\n"
However, since it is not unheard of for contributors to a mailing list to use
such a string of hyphens within their postings or signatures, using a string
of hyphens is not a particularly apt choice for a source message delimiter.
In this particular case, the author is getting better (but not fully tested)
results by including an additional newline I<before> the hyphen string in
order to more uniquely identify the source message delimiter:
"\n------------------------------\n\n"
=head2 Analysis of Individual Messages
The internal structure of an individual message within a digest is also
discussed in detail above. Here we need to identify patterns with which we
can extract the content of the message's headers.
Certain mailing list digest programs allow a wide variety of headers to appear
in digested messages. The Perl-Win32-Users digest typifies this. Each
message in a Perl-Win32_Users digest I<must> have a message number and headers
for the message's author, recipients, subject and date.
Message: 1
From: Chris Smithson <ChrisSmithson@some.web.address.com>
To: "'Carter Kraus'" <carter@some.web.address.com>,
"Perl-Win32-Users (E-mail)" <perl-win32-users@activestate.com>
Subject: RE: OO Perl Issue.
Date: Wed, 4 Feb 2004 14:17:24 -0600
But a message in this digest may have additional headers for the author's
organization, reply address and/or carbon-copy recipients.
Message: 5
Date: Wed, 4 Feb 2004 15:15:44 -0800
From: Sam Spade <sspade@some.web.address.com>
Organization: Some Web Address
Reply-To: Sam Spade <sspade@some.web.address.com>
To: "Time" <summers@some.web.address.com>
CC: "Perl List" <perl-win32-users@listserv.activestate.com>
Subject: Re: New IE Update causes script problems
Patterns are easily developed to capture this information and store it in the
configuration hash:
%pw32u_config_in = (
grep_formula => 'Perl-Win32-Users digest',
pattern_target => '.*Vol\s(\d+),\sIssue\s(\d+)\.txt',
topics_intro => 'Today\'s Topics:',
source_msg_delimiter => "--__--__--\n\n",
message_style_flag => '^Message:\s+(\d+)$',
from_style_flag => '^From:\s+(.+)$',
org_style_flag => '^Organization:\s+(.+)$',
to_style_flag => '^To:\s+(.+)$',
cc_style_flag => '^CC:\s+(.+)$',
subject_style_flag => '^Subject:\s+(.+)$',
date_style_flag => '^Date:\s+(.+)$',
reply_to_style_flag => '^Reply-To:\s+(.+)$',
...
);
Other mailing list digest programs allow much fewer headers in digested
messages. The Yahoo! Groups digests such as Perl Beginner typify this.
Message: 4
Date: Sun, 7 Dec 2003 19:24:03 +1100
From: Philip Streets <phil@some.web.address.com.au>
Subject: RH9.0, perl 5.8.2 and qmail-localfilter question
The patterns developed to capture this information and store it in the
configuration hash would be as follows:
%pbml_config_in = (
grep_formula => '\[PBML\]',
pattern_target => '.*\s(\d+)\.txt$',
topics_intro => 'Topics in this digest:',
source_msg_delimiter => "________________________________________________________________________\n________________________________________________________________________\n\n",
message_style_flag => '^Message:\s+(\d+)$',
from_style_flag => '^\s+From:\s+(.+)$',
subject_style_flag => '^Subject:\s+(.+)$',
date_style_flag => '^\s+Date:\s+(.+)$',
...
);
Note that this pattern is written to expect 1 or more whitespaces at the
beginning of the C<from_style_flag> and the C<date_style_flag>.
We could -- but do not need to -- add the following key-value pairs to the
C<%pbml_config_in> hash.
org_style_flag => undef,
to_style_flag => undef,
cc_style_flag => undef,
reply_to_style_flag => undef,
=head2 Inspection of Messages for Multipart MIME Content
Certain mailing lists allow subscribers to post messages in either plain-text
or HTML. Certain lists allow subscribers to post attachments; others do not.
When it comes to preparing digests of these messages, the programs which
different lists take lead to different results. The most annoying situation
occurs when a list allows a subscriber to post in 'multipart MIME format' and
then fails to strip out the redundant HTML part after printing the needed
plain-text part.
I<Example:> An all too typical example from an older version of an ActiveState
list digest. (ActiveState changed the format of its digests in early 2004 to
strip out HTML attachments. Hence, the following code no longer accurately
represents what a subscriber to an ActiveState digest will see. Other mailing
lists still suffer from MIME bloat, however, so treat the following code as
illustrative.) The message begins:
Message: 1
To: Perl-Win32-Users@activestate.com
Subject: Can not tie STDOUT to scolled Tk widget
From: John_Wonderman@some.web.address.ca
Date: Thu, 15 Jan 2004 16:25:17 -0500
This is a multipart message in MIME format.
--=_alternative 00750F0485256E1C_=
Content-Type: text/plain; charset="US-ASCII"
Hi;
I am trying to implement a scrolling text widget to capture output for for
at tk app. Without scrolling:
my $text = $mw->Text(-width => 78,
-height => 32,
-wrap => 'word',
-font => ['Courier New','11']
)->pack(-side => 'bottom',
-expand => 1,
-fill => 'both',
);
...
When the plain-text part of the message is finished, it is then repeated in
HTML:
--=_alternative 00750F0485256E1C_=
Content-Type: text/html; charset="US-ASCII"
<br><font size=2 face="Tahoma">Hi;</font>
<p><font size=2 face="Tahoma">I am trying to implement a scrolling text
widget to capture output for for at tk app. Without scrolling:</font>
<p><font size=2 face="Bitstream Vera Sans Mono">my $text = $mw->Text(-width
=> 78,</font>
<br><font size=2 face="Bitstream Vera Sans Mono">
-height => 32,</font>
<br><font size=2 face="Bitstream Vera Sans Mono">
-wrap => 'word',</font>
<br><font size=2 face="Bitstream Vera Sans Mono">
-font => ['Courier New','11']</font>
<br><font size=2 face="Bitstream Vera Sans Mono">)->pack(-side =>
'bottom',</font>
<br><font size=2 face="Bitstream Vera Sans Mono">
-expand => 1,</font>
<br><font size=2 face="Bitstream Vera Sans Mono">
-fill => 'both',</font>
There is no reason to retain this bloat in your thread file. The digest
providers should have stripped it out, but the program they were using failed
to do so. Other digests, such as those at Yahoo! Groups, eliminate all this
blather.
Now, with Mail::Digest::Tools, you can eliminate much of the bloat yourself.
After examining 6-10 instances of a particular mailing list digest, you should
be able to determine whether the digest needs a dose of digital castor oil or
not, and you set key C<MIME_cleanup_flag> accordingly. If the digest contains
unnecessary multipart MIME content, you set this flag to C<1>; otherwise, to
C<0>.
And with that you have completed your analysis of the internal structure of a
given digest and entered the relevant information into the first configuration
hash:
%pw32u_config_in = (
grep_formula => 'Perl-Win32-Users digest',
pattern_target => '.*Vol\s(\d+),\sIssue\s(\d+)\.txt',
topics_intro => 'Today\'s Topics:',
source_msg_delimiter => "--__--__--\n\n",
message_style_flag => '^Message:\s+(\d+)$',
from_style_flag => '^From:\s+(.+)$',
org_style_flag => '^Organization:\s+(.+)$',
to_style_flag => '^To:\s+(.+)$',
cc_style_flag => '^CC:\s+(.+)$',
subject_style_flag => '^Subject:\s+(.+)$',
date_style_flag => '^Date:\s+(.+)$',
reply_to_style_flag => '^Reply-To:\s+(.+)$',
MIME_cleanup_flag => 1,
);
%pbml_config_in = (
grep_formula => '\[PBML\]',
pattern_target => '.*\s(\d+)\.txt$',
topics_intro => 'Topics in this digest:',
source_msg_delimiter => "________________________________________________________________________\n________________________________________________________________________\n\n",
message_style_flag => '^Message:\s+(\d+)$',
from_style_flag => '^\s+From:\s+(.+)$',
subject_style_flag => '^Subject:\s+(.+)$',
date_style_flag => '^\s+Date:\s+(.+)$',
MIME_cleanup_flag => 0,
);
=head1 C<%config_out>: HOW TO PROCESS A DIGEST ON YOUR SYSTEM
C<%config_in> holds the answers to the question: What internal structure has
the mailing list sponsor provided for a given digest? In contrast,
C<%config_out> will hold the answer to this question: How do I want to
structure the results of applying Mail::Digest::Tools to a particular digest
on my system?
For purpose of illustration, we will continue to assume that we are processing
digest files received from the Perl-Win32-Users and Perl Beginner lists. We
will make slightly different choices as to how we process those digest files
so as to illustrate different options available from Mail::Digest::Tools.
We shall also assume that we going to place the scripts from which we call
Mail::Digest::Tools functions in the directory I<above> the directories in
which we store the digest files once they have been saved as plain-text files.
If we call this directory C<digest> and place the scripts in that directory,
then we will have a directory structure that starts out like this:
digest/
process_new.pl
process_ALL.pl
reply_digest_message.pl
repair_digest_order.pl
consolidate_threads.pl
deletables.pl
pw32u/
Perl-Win32-Users Digest, Vol 1 Issue 1771.txt
Perl-Win32-Users Digest, Vol 1 Issue 1772.txt
pbml/
[PBML] Digest Number 1491.txt
[PBML] Digest Number 1492.txt
=head2 Required C<%config_out> Keys
There are 9 keys which are required in C<%config_out> in order for
Mail::Digest::Tools to function properly. They correspond to 9 decisions
which you must make in setting up a Mail::Digest::Tools configuration on
your system.
=over 4
=item 1 Title
Each digest must be given a title which is used whenever Mail::Digest::Tools
needs to prompt or warn you on standard output. The key which holds this
information in C<%config_out> must be called C<title>; the value for this
element should be sensible.
%pw32u_config_out = (
title => 'Perl-Win32-Users',
...
);
%pbml_config_out = (
title => 'Perl Beginner',
...
);
=item 2 Digest Directory
For each digest a directory must be designated where individual digest files
are stored in plain-text format. The key which holds this information in
C<%config_out> must be called C<dir_digest>. In the examples below
directories are named relative to the 'current' directory (C<..>),
I<i.e.,> the directory where the script invoking a
Mail::Digest::Function is stored.
%pw32u_config_out = (
title => 'Perl-Win32-Users',
dir_digest => "../pw32u",
...
);
%pbml_config_out = (
title => 'Perl Beginner',
dir_digest => "../pbml",
...
);
=item 3 Threads Directory
For each digest a directory must be designated where the thread files created
by use of Mail::Digest::Tools functions are stored. The key which holds this
information in C<%config_out> must be called C<dir_threads>. In the examples
below the threads directory is a subdirectory of the digest directory, but
you may make other choices.
%pw32u_config_out = (
title => 'Perl-Win32-Users',
dir_digest => "../pw32u",
dir_threads => "../pw32u/Threads",
...
);
%pbml_config_out = (
title => 'Perl Beginner',
dir_digest => "../pbml",
dir_threads => "../pbml/Threads",
...
);
=item 4 Digests Log File
For each digest a file must be kept which logs whether a given digest file
has already been processed or not and, if so, when. The key which holds this
information in C<%config_out> must be called C<digests_log>. It has been
found convenient to keep this file in the digests directory, but you may make
other choices.
%pw32u_config_out = (
title => 'Perl-Win32-Users',
dir_digest => "../pw32u",
dir_threads => "../pw32u/Threads",
digests_log => "../pw32u/digests_log.txt",
...
);
%pbml_config_out = (
title => 'Perl Beginner',
dir_digest => "../pbml",
dir_threads => "../pbml/Threads",
digests_log => "../pbml/digests_log.txt",
...
);
=item 5 Today's Topics
For each digest a file must be kept which holds an ongoing record of the
list of topics found in each individual digest file. The key which holds this
information in C<%config_out> must be called <todays_topics>. It has been
found convenient to keep this file in the digests directory, but you may make
other choices.
%pw32u_config_out = (
title => 'Perl-Win32-Users',
dir_digest => "../pw32u",
dir_threads => "../pw32u/Threads",
digests_log => "../pw32u/digests_log.txt",
todays_topics => "../pw32u/todays_topics.txt",
...
);
%pbml_config_out = (
title => 'Perl Beginner',
dir_digest => "../pbml",
dir_threads => "../pbml/Threads",
digests_log => "../pbml/digests_log.txt",
todays_topics => "../pbml/todays_topics.txt",
...
);
=item 6 Format for Identifying Digest Number in Output
For each digest you must choose how to format the number(s) of the individual
digest file being processed when messages from that file are written to a
threads file. What you are doing here is formatting the information captured
by the C<pattern_target> key in a given digest's C<%config_in> (see above).
You express this choice as a single-quoted string which formats the data
captured by Perl regular expression which in C<pattern_target>. This
formatting is done via the Perl C<sprintf> function. The resulting string
is assigned to be the value of C<%config_out> key <id_format>.
We saw above that digests from the Perl-Win32-Users list carried both a volume
number and an individual digest number.
Perl-Win32-Users Digest, Vol 1 Issue 1771.txt
Perl-Win32-Users Digest, Vol 1 Issue 1772.txt
Both numbers were captured by the Perl regular expression in
C<%pw32u_config_in> key <pattern_target>.
'.*Vol\s(\d+),\sIssue\s(\d+)\.txt',
Here we have chosen to format the volume number as a 3-digit, 0-padded number
and the individual digest number as a 4-digit, 0-padded number. We then join
these two data with an underscore.
%pw32u_config_out = (
title => 'Perl-Win32-Users',
dir_digest => "../pw32u",
dir_threads => "../pw32u/Threads",
digests_log => "../pw32u/digests_log.txt",
todays_topics => "../pw32u/todays_topics.txt",
id_format => 'sprintf("%03d",$1) . \'_\' . sprintf("%04d",$2)',
...
);
We saw above that digests from the Perl Beginners list carried only an
digest number -- no volume number.
[PBML] Digest Number 1491.txt
[PBML] Digest Number 1492.txt
This number was captured by the Perl regular expression in C<%pbml_config_in>
key <pattern_target>.
'.*\s(\d+)\.txt$'
Here we have chosen to format the digest number as a 5-digit, 0-padded number.
%pbml_config_out = (
title => 'Perl Beginner',
dir_digest => "../pbml",
dir_threads => "../pbml/Threads",
digests_log => "../pbml/digests_log.txt",
todays_topics => "../pbml/todays_topics.txt",
id_format => 'sprintf("%05d",$1)',
...
);
Note that if you allow for a 4-digit number, the highest numbered digest you
can process off a given mailing list will be C<9999>. If you allow for a
5-digit number, the upper limit will be C<99999>. The latter should be
sufficient for a lifetime even for a mailing list (I<e.g.,> London.pm) which
generates 3 or 4 digest files per day or over 1000 per year.
=item 7 Format for Numbering Individual Messages in Output
For each digest you must choose how to format the number which the digest
assigns to its individual messages. Experience suggests that 2 digits should
be more than sufficient to format this number, as all digests which the author
has observed have fewer than 100 entries. However, below we have arbitrarily
decided to allow for up to 9999 entries in a given digest. As with the digest
number, the formatting is accomplished via the Perl C<sprintf> function.
The result is stored in a C<%config_out> key which must be called
C<output_id_format>.
%pw32u_config_out = (
title => 'Perl-Win32-Users',
dir_digest => "../pw32u",
dir_threads => "../pw32u/Threads",
digests_log => "../pw32u/digests_log.txt",
todays_topics => "../pw32u/todays_topics.txt",
id_format => 'sprintf("%03d",$1) .
\'_\' . sprintf("%04d",$2)',
output_id_format => 'sprintf("%04d",$1)',
...
);
%pbml_config_out = (
title => 'Perl Beginner',
dir_digest => "../pbml",
dir_threads => "../pbml/Threads",
digests_log => "../pbml/digests_log.txt",
todays_topics => "../pbml/todays_topics.txt",
id_format => 'sprintf("%05d",$1)',
output_id_format => 'sprintf("%04d",$1)',
...
);
=item 8 Thread Message Delimiter
For each digest you must compose a string which will separate one message in
a threads file from its successor. This string must be double-quoted and
assigned to C<%config_out> key C<thread_msg_delimiter>. For readability, this
string should terminate in two or more C<\n\n> newline characters so that the
delimiter is always a paragraph unto itself.
This delimiter may -- or may not -- be the same string which the mailing list
provider uses to separate messages in the digest files themselves. In other
words, you may choose to use the same string for C<thread_msg_delimiter> in
C<%config_out> as you reported the list provider used in C<%config_in> key
C<source_msg_delimiter>.
In the example below we make the C<thread_msg_delimiter> for the output from
Perl-Win32-Users to be the same as its C<source_msg_delimiter>.
%pw32u_config_out = (
title => 'Perl-Win32-Users',
dir_digest => "../pw32u",
dir_threads => "../pw32u/Threads",
digests_log => "../pw32u/digests_log.txt",
todays_topics => "../pw32u/todays_topics.txt",
id_format => 'sprintf("%03d",$1) .
\'_\' . sprintf("%04d",$2)',
output_id_format => 'sprintf("%04d",$1)',
thread_msg_delimiter => "--__--__--\n\n",
...
);
Note: In light of the earlier discussion of the changes ActiveState made
to its mailing list digests in early 2004, the reader is cautioned that the
code above should not be directly 'copied-and-pasted' into a configuration
hash with which you might follow an ActiveState mailing list. Treat it as
educational. In particular, the author is now testing the following as a
setting for C<$pw32u_config_out{'thread_msg_delimiter'}>:
"\n--__--__--\n\n",
For threads generated by appling Mail::Digest::Tools to the Perl
Beginners list, we choose an output message delimiter which differs from the
source message delimiter.
%pbml_config_out = (
title => 'Perl Beginner',
dir_digest => "../pbml",
dir_threads => "../pbml/Threads",
digests_log => "../pbml/digests_log.txt",
todays_topics => "../pbml/todays_topics.txt",
id_format => 'sprintf("%05d",$1)',
output_id_format => 'sprintf("%04d",$1)',
thread_msg_delimiter => "_*_*_*_*_*_\n_*_*_*_*_*_\n\n\n",
...
);
Whatever choice you make for the C<thread_msg_delimiter> it should be a string
unlikely to occur within the text of a message and should terminate in two or
more newlines.
=item 9 Archive or Delete Threads?
For each digest you process with Mail::Digest::Tools, you must decide whether
to retain the resulting thread files in an archive them in a separate
directory after a specified period of time, to delete them from disk
after a specified period of time, or to do neither and allow them to
accumulate indefinitely in the threads directory. Your decision is represented
as the value of C<%config_out> key <archive_kill_trigger>. This value must
be expressed as one of three numerical values:
0 Thread files are neither archived nor deleted
1 Thread files are archived in a separate directory (or directories)
after the number of days specified by key 'archive_kill_days'
(see below)
-1 Thread files are deleted after I<n> days as specified by key
'archive_kill_days'
In the examples below we have chosen to archive all threads generated by the
Perl-Win32-Users list but to kill all threads generated by the Perl Beginner
list after a number of days whose specification we shall come to shortly.
%pw32u_config_out = (
title => 'Perl-Win32-Users',
dir_digest => "../pw32u",
dir_threads => "../pw32u/Threads",
digests_log => "../pw32u/digests_log.txt",
todays_topics => "../pw32u/todays_topics.txt",
id_format => 'sprintf("%03d",$1) . \'_\' .
sprintf("%04d",$2)',
output_id_format => 'sprintf("%04d",$1)',
thread_msg_delimiter => "--__--__--\n\n",
archive_kill_trigger => 1,
...
);
%pbml_config_out = (
title => 'Perl Beginner',
dir_digest => "../pbml",
dir_threads => "../pbml/Threads",
digests_log => "../pbml/digests_log.txt",
todays_topics => "../pbml/todays_topics.txt",
id_format => 'sprintf("%05d",$1)',
output_id_format => 'sprintf("%04d",$1)',
thread_msg_delimiter => "_*_*_*_*_*_\n_*_*_*_*_*_\n\n\n",
archive_kill_trigger => -1,
...
);
=back
This completes the 9 required keys for C<%config_out>. We now turn to keys
which are either optional or which are required if you have assigned a value
of C<1> or C<-1> to key C<archive_kill_trigger>.
=head2 Optional C<%config_out> Keys
=over 4
=item * Digests Read File
As an option, Mail::Digest::Tools offers file to log which instances of a
particular digest have previously been processed which is more
human-readable than the file named in C<%config_out> key C<digests_log>.
That file logs a digest as follows:
001_9999;Fri Feb 6 18:57:41 2004;Fri Feb 6 18:57:41 2004
It is probably easier to read this data like this:
09999:
first processed at Fri Feb 6 18:57:41 2004
most recently processed at Fri Feb 6 18:57:41 2004
To choose this option you need to set I<two> keys in C<%config_out>:
=over 4
=item 1 C<digests_read_flag>
This must be assigned a true value such as C<1>. This tells
Mail::Digest::Tools that you indeed want a 'digests read' file.
=item 2 C<digests_read>
This should be assigned the name of the 'digests read' file, but it will
default to a file F<digests_read.txt> placed in the directory named by key
C<dir_digest>.
=back
Adding these keys to our C<%config_out>, we get:
%pw32u_config_out = (
title => 'Perl-Win32-Users',
dir_digest => "../pw32u",
dir_threads => "../pw32u/Threads",
digests_log => "../pw32u/digests_log.txt",
todays_topics => "../pw32u/todays_topics.txt",
id_format => 'sprintf("%03d",$1) . \'_\' .
sprintf("%04d",$2)',
output_id_format => 'sprintf("%04d",$1)',
thread_msg_delimiter => "--__--__--\n\n",
archive_kill_trigger => 1,
digests_read_flag => 1,
digests_read => "../pw32u/digests_read.txt",
...
);
%pbml_config_out = (
title => 'Perl Beginner',
dir_digest => "../pbml",
dir_threads => "../pbml/Threads",
digests_log => "../pbml/digests_log.txt",
todays_topics => "../pbml/todays_topics.txt",
id_format => 'sprintf("%05d",$1)',
output_id_format => 'sprintf("%04d",$1)',
thread_msg_delimiter => "_*_*_*_*_*_\n_*_*_*_*_*_\n\n\n",
archive_kill_trigger => -1,
digests_read_flag => 1,
digests_read => "../pbml/digests_read.txt",
...
);
=item * Keys Needed When Archiving Thread Files
If, as discussed above, you have assigned the value C<1> to the
C<<archive_kill_trigger> key in C<%config_out>, then Mail::Digest::Tools
will archive older thread files, I<i.e.,> it will move thread files from the
directory specified in key C<dir_threads> to an archive directory if the
thread file has not been modified in a specified number of days. If new
messages need to be posted to a thread file which has been archived, that
file will be de-archived and brought back to the C<dir_threads> directory.
Thread files which are either archived or de-archived via a call to
C<process_new_digests()> or C<reprocess_ALL_digests()> will be logged in
appropriately named files.
Hence, the keys you will need to define when archiving thread files are:
=over 4
=item 1 C<archive_kill_days>
This key must be assigned the number of days after which a thread file sitting
in the C<dir_threads> directory is moved to an archive directory. If not
specified, will default to 14 days.
=item 2 C<dir_archive_top>
This key must be assigned the name of the I<top> archive directory, I<i.e.,>
the directory at the top of a tree of archive directories.
When you track a particular mailing list digest for a number of years, the
number of different thread files can grow to enormous proportions. For
example, the author has tracked over 10,000 distinct thread files from the
Perl-Win32-Users list over a three-and-a-half year period. 10,000 files in a
single directory is completely unwieldy and slows directory read-times
tremendously. Mail::Digest::Tools therefore by default provides a tree of
archive directories: a top directory which contains no thread files but
instead holds 27 subdirectories , one for each letter of the English alphabet
and one for thread files which start with any other character (guaranteed to
work with ASCII only; not tested with other character sets).
dir_archive_top
a
b
c
...
z
other
The user gets to choose where to place the top archive directory but the 27
subdirectories are automatically placed beneath that one. The top archive
directory is the value assigned to C<%config_out> key C<dir_archive_top>.
=item 3 C<archived_today>
This key should be assigned the name of a file which will log any and all
files archived by a single call to C<process_new_digests()> or
C<reprocess_ALL_digests()>. (By 'single' call is meant that this is I<not>
an ongoing log; it only shows what happened today.) If not assigned a value,
it will default to a file called F<archived_today.txt> located in the
directory named by key C<dir_digest>.
=item 4 C<de_archived_today>
This key should be assigned the name of a file which will log any and all
files de-archived by a single call to C<process_new_digests()> or
C<reprocess_ALL_digests()>. (By 'single' call is meant that this is I<not>
an ongoing log; it only shows what happened today.) If not assigned a value,
it will default to a file called F<de_archived_today.txt> located in the
directory named by key C<dir_digest>.
=item 5 C<archive_config>
This key is reserved for future use. In the current version of
Mail::Digest::Tools it does not need to be set, but, should you be obsessive
about this, set it to C<0>.
=back
Adding these keys to our sample C<%config_out> hashes, we get:
%pw32u_config_out = (
title => 'Perl-Win32-Users',
dir_digest => "../pw32u",
dir_threads => "../pw32u/Threads",
digests_log => "../pw32u/digests_log.txt",
todays_topics => "../pw32u/todays_topics.txt",
id_format => 'sprintf("%03d",$1) . \'_\' .
sprintf("%04d",$2)',
output_id_format => 'sprintf("%04d",$1)',
thread_msg_delimiter => "--__--__--\n\n",
archive_kill_trigger => 1,
digests_read_flag => 1,
digests_read => "../pw32u/digests_read.txt",
archive_kill_days => 14,
dir_archive_top => "../pw32u/Threads/archive",
archived_today => "../pw32u/archived_today.txt",
de_archived_today => "../pw32u/de_archived_today.txt",
...
);
%pbml_config_out = (
title => 'Perl Beginner',
dir_digest => "../pbml",
dir_threads => "../pbml/Threads",
digests_log => "../pbml/digests_log.txt",
todays_topics => "../pbml/todays_topics.txt",
id_format => 'sprintf("%05d",$1)',
output_id_format => 'sprintf("%04d",$1)',
thread_msg_delimiter => "_*_*_*_*_*_\n_*_*_*_*_*_\n\n\n",
archive_kill_trigger => -1,
digests_read_flag => 1,
digests_read => "../pbml/digests_read.txt",
...
);
Note that since in our example we chose I<not> to archive thread files from
the Perl Beginner list -- as evinced by the assignment of C<-1> to key
C<archive_kill_trigger> -- we do not need to assign any values to
C<dir_archive_top>, C<archived_today> or C<de_archived_today> in
C<%pbml_config_out>.
=item * Keys Needed When Deleting Thread Files
The keys needed for C<%config_out> when you have chosen to delete thread
files after a specified interval parallel those you would have needed if you
had chosen to archive those files instead.
=over 4
=item 1 C<archive_kill_days>
This key must be assigned the number of days after which a thread file sitting
in the C<dir_threads> directory is deleted. If not specified, will default
to 14 days.
=item 2 C<deleted_today>
This key should be assigned the name of a file which will log any and all
files deleted by a single call to C<process_new_digests()> or
C<reprocess_ALL_digests()>. (By 'single' call is meant that this is I<not>
an ongoing log; it only shows what happened today.) If not assigned a value,
it will default to a file called F<deleted_today.txt> located in the
directory named by key C<dir_digest>.
=back
Adding these keys to our sample C<%config_out> hashes, we get:
%pw32u_config_out = (
title => 'Perl-Win32-Users',
dir_digest => "../pw32u",
dir_threads => "../pw32u/Threads",
digests_log => "../pw32u/digests_log.txt",
todays_topics => "../pw32u/todays_topics.txt",
id_format => 'sprintf("%03d",$1) . \'_\' .
sprintf("%04d",$2)',
output_id_format => 'sprintf("%04d",$1)',
thread_msg_delimiter => "--__--__--\n\n",
archive_kill_trigger => 1,
digests_read_flag => 1,
digests_read => "../pw32u/digests_read.txt",
archive_kill_days => 14,
dir_archive_top => "../pw32u/Threads/archive",
archived_today => "../pw32u/archived_today.txt",
de_archived_today => "../pw32u/de_archived_today.txt",
...
);
%pbml_config_out = (
title => 'Perl Beginner',
dir_digest => "../pbml",
dir_threads => "../pbml/Threads",
digests_log => "../pbml/digests_log.txt",
todays_topics => "../pbml/todays_topics.txt",
id_format => 'sprintf("%05d",$1)',
output_id_format => 'sprintf("%04d",$1)',
thread_msg_delimiter => "_*_*_*_*_*_\n_*_*_*_*_*_\n\n\n",
archive_kill_trigger => -1,
digests_read_flag => 1,
digests_read => "../pbml/digests_read.txt",
archive_kill_days => 14,
deleted_today => "../pbml/deleted_today.txt",
...
);
Note that since in our example we chose to archive thread files from
the Perl-Win32-Users list -- as evinced by the assignment of C<1> to key
C<archive_kill_trigger> -- we do not need to assign any values to
C<deleted_today> in C<%pw32u_config_out>.
=item * Keys Needed When Stripping Multipart MIME Content from Thread Files
Recall from above that you had to study a given digest to determine whether or
not it contained multipart MIME content in need of stripping out. If a digest,
such as the ActiveState Perl-Win32-Users digest, contained a lot of such bloat,
you set key C<MIME_cleanup_flag> in C<%config_in> to a value of C<1>. If, on
the other hand, the mailing list provider stripped out the multipart MIME
content before distributing the digest, you set that key to a value of C<0>.
Mail::Digest::Tools will automatically strip out multipart MIME content once
you have set C<MIME_cleanup_flag> to C<1>. All that is left for you to decide
is: Do I want to view a log of which messages processed in a I<single> call of
C<process_new_digests()> or C<reprocess_ALL_digests()> had multipart MIME
content stripped out -- or not? If so, you must set two keys in
C<%config_out>:
=over 4
=item 1 C<MIME_cleanup_log_flag>
This key must be set to a true value such as C<1>.
=item 2 C<mimelog>
This key should be assigned the name of the 'mimelog' file, but if you do not
specify a value it will default to a file F<mimelog.txt> placed in the
directory named by key C<dir_digest>.
=back
The logfile so created looks like this:
Processed Problem
001_1775_0003 CASE C
001_1775_0015 CASE C
001_1775_0018 CASE C
001_1775_0021 CASE E
where items in the 'Processed' column were either (a) successfully stripped of
multipart MIME content by Mail::Digest::Tools as specified by the internal rule
denoted by the 'CASE'; or (b) were recognized by Mail::Digest::Tools as
containing multipart MIME content that could not be stripped out.
This is relatively esoteric and probably of interest mainly to the module's
developer. So if you are not interested in this feature set
C<MIME_cleanup_log_flag> to C<0> and no mimelog will be created -- but
Mail::Digest::Tools will still do its best to strip out extraneous multipart
MIME content.
Our sample C<%config_out> hashes are now complete. They look like this:
%pw32u_config_out = (
title => 'Perl-Win32-Users',
dir_digest => "../pw32u",
dir_threads => "../pw32u/Threads",
digests_log => "../pw32u/digests_log.txt",
todays_topics => "../pw32u/todays_topics.txt",
id_format => 'sprintf("%03d",$1) . \'_\' .
sprintf("%04d",$2)',
output_id_format => 'sprintf("%04d",$1)',
thread_msg_delimiter => "--__--__--\n\n",
archive_kill_trigger => 1,
digests_read_flag => 1,
digests_read => "../pw32u/digests_read.txt",
archive_kill_days => 14,
dir_archive_top => "../pw32u/Threads/archive",
archived_today => "../pw32u/archived_today.txt",
de_archived_today => "../pw32u/de_archived_today.txt",
mimelog => "../pw32u/mimelog.txt",
MIME_cleanup_log_flag => 1,
);
%pbml_config_out = (
title => 'Perl Beginner',
dir_digest => "../pbml",
dir_threads => "../pbml/Threads",
digests_log => "../pbml/digests_log.txt",
todays_topics => "../pbml/todays_topics.txt",
id_format => 'sprintf("%05d",$1)',
output_id_format => 'sprintf("%04d",$1)',
thread_msg_delimiter => "_*_*_*_*_*_\n_*_*_*_*_*_\n\n\n",
archive_kill_trigger => -1,
digests_read_flag => 1,
digests_read => "../pbml/digests_read.txt",
archive_kill_days => 14,
deleted_today => "../pbml/deleted_today.txt",
);
Note that C<%pbml_config_out> does not have C<MIME_cleanup_log_flag> or
C<mimelog> keys. It doesn't need them, because in providing the Perl
Beginners mailing list Yahoo! Groups strips out unnecessary multipart
MIME content before sending the digest to you.
=back
=head1 HELPFUL HINTS
... in which the module author shares what he has learned using
Mail::Digest::Tools and its predecessors since August 2000.
=head2 Initial Configuration and Testing
As mentioned above, if you are considering creating a local archive of threads
originating in daily digest versions of a mailing list, you should first
accumulate 6-10 instances of such digests and both:
=over 4
=item 1
study the internal structure of the digest -- needed to develop a
C<%config_in> for the digest; and
=item 2
carefully consider how you wish to structure the output from the module's
use on your system -- needed to develop C<%config_out> for the digest
=back
Once you have developed the initial configuration, you should call
C<reprocess_ALL_digests()> on the digests, then open the files created to see
if the results are what you want. If they are I<not> what you want, then you
need to think about what you should change in C<%config_in> and/or
C<%config_out>. Make those changes, then call C<reprocess_ALL_digests()>
again. Repeat as needed, making sure not to delete any of the digest files
you are using as sources until you are completely satisfied with your
configuration.
Once, however, you I<are> satisfied with your configuration, you should call
C<process_new_digests()> on new instances of digests and I<never> call
C<reprocess_ALL_digests()> for that digest again (lest you not be able to
regenerate threads containing messages from digests you have deleted over
time).
=head2 Where to Store the Configuration Hashes
As mentioned above, you will probably find it convenient to write separate
Perl scripts to call each one of Mail::Digest::Tool's public functions. You
could code C<%config_in> and C<%config_out> in each of those scripts just
before the respective function calls. But that would violate the principle of
'Repeated Code Is a Mistake' and multiply maintenance problems. It's far
better to code the two configuration hashes in a separate plain-text file and
'require' that file into your script. That way, any changes you make in the
configuration will be automatically picked up by each script that calls a
Mail::Digest::Tools function.
Here is an example of such a file holding the configuration hashes governing
use of the Perl-Win32-Users digest, along with a script making use of that file.
# file: pw32u.digest.data
$topdir = "E:/Digest/pw32u";
%config_in = (
grep_formula => 'Perl-Win32-Users digest',
pattern_target => '.*Vol\s(\d+),\sIssue\s(\d+)\.txt',
# next element's value must be double-quoted
source_msg_delimiter => "--__--__--\n\n",
topics_intro => 'Today\'s Topics:',
message_style_flag => '^Message:\s+(\d+)$',
from_style_flag => '^From:\s+(.+)$',
org_style_flag => '^Organization:\s+(.+)$',
to_style_flag => '^To:\s+(.+)$',
cc_style_flag => '^CC:\s+(.+)$',
subject_style_flag => '^Subject:\s+(.+)$',
date_style_flag => '^Date:\s+(.+)$',
reply_to_style_flag => '^Reply-To:\s+(.+)$',
MIME_cleanup_flag => 1,
);
%config_out = (
title => 'Perl-Win32-Users',
dir_digest => $topdir,
dir_threads => "$topdir/Threads",
dir_archive_top => "$topdir/Threads/archive",
archived_today => "$topdir/archived_today.txt",
de_archived_today => "$topdir/de_archived_today.txt",
deleted_today => "$topdir/deleted_today.txt",
digests_log => "$topdir/digests_log.txt",
digests_read => "$topdir/digests_read.txt",
todays_topics => "$topdir/todays_topics.txt",
mimelog => "$topdir/mimelog.txt",
id_format => 'sprintf("%03d",$1) . \'_\' .
sprintf("%04d",$2)',
output_id_format => 'sprintf("%04d",$1)',
MIME_cleanup_log_flag => 1,
# next element's value must be double-quoted
thread_msg_delimiter => "--__--__--\n\n",
archive_kill_trigger => 1,
archive_kill_days => 14,
digests_read_flag => 1,
archive_config => 0,
);
# script: dig.pl
# USAGE: perl dig.pl
#!/usr/bin/perl
use strict;
use warnings;
use Mail::Digest::Tools qw( process_new_digests );
our (%config_in, %config_out);
my $data_file = 'pw32u.digest.data';
require $data_file;
process_new_digests(\%config_in, \%config_out);
print "\nFinished\n";
=head2 Maintaining Local Archives of More than One Digest
The module author has maintained local archives of more than a half dozen
different mailing list digests over the past several years. He has found it
convenient to maintain the configuration information for I<all> the digests
he is following at a given time in a I<single> configuration file. The
advantage to this approach is that if two digests share a similar internal
structure (perhaps due to being generated by the same mailing list program or
list provider) and if the user chooses to structure the output from the two
digests in similar or identical ways, then getting the configuration hashes
becomes much easier and the potential for error is reduced.
Here is a sample directory and file structure for maintaining archives of
two different digests on a Win32 system:
digest/
digest.data
process_new.pl
process_ALL.pl
reply_digest_message.pl
repair_digest_order.pl
consolidate_threads.pl
deletables.pl
pw32u/
Perl-Win32-Users Digest, Vol 1 Issue 1771.txt
Perl-Win32-Users Digest, Vol 1 Issue 1772.txt
digest_log.txt
digest_read.txt
mimelog.txt
Threads/
pbml/
[PBML] Digest Number 1491.txt
[PBML] Digest Number 1492.txt
digest_log.txt
Threads/
File F<digest.data> would look like this:
# digest.data
$topdir = "E:/Digest";
%digest_structure = (
pbml => {
grep_formula => '\[PBML\]',
pattern_target => '.*\s(\d+)\.txt$',
...
},
pw32u => {
grep_formula => 'Perl-Win32-Users digest',
pattern_target => '.*Vol\s(\d+),\sIssue\s(\d+)\.txt',
...
},
);
%digest_output_format = (
pbml => {
title => 'Perl Beginner',
dir_digest => "$topdir/pbml",
dir_threads => "$topdir/pbml/Threads",
...
},
pw32u => {
title => 'Perl-Win32-Users',
dir_digest => "$topdir/pw32u",
dir_threads => "$topdir/pw32u/Threads",
...
},
);
To accomodate this slightly more complex structure in the configuration file,
the calling script might be modified as follows:
# script: dig.pl
# USAGE: perl dig.pl [short-name for digest]
#!/usr/bin/perl
use Mail::Digest::Tools qw( process_new_digests );
my ($this_key, %config_in, %config_out);
# variables imported from $data_file
our (%digest_structure, %digest_output_format);
my $data_file = 'digest.data';
require $data_file;
$this_key = shift @ARGV;
die "\n The command-line argument you typed: $this_key\n does not call an accessible digest$!"
unless (defined $digest_structure{$this_key}
and defined $digest_output_format{$this_key});
my ($k,$v);
while ( ($k, $v) = each %{$digest_structure{$this_key}} ) {
$config_in{$k} = $v;
}
while ( ($k, $v) = each %{$digest_output_format{$this_key}} ) {
$config_out{$k} = $v;
}
process_new_digests(\%config_in, \%config_out);
print "\nFinished\n";
=head2 Getting Your Mail to the Right Place on Your System
For several years the module author used the scripts which were predecessors
to Mail::Digest::Tools on a Win32 system where mail was read with Microsoft
Outlook Express. He would do a "File/Save as.." on an instance of a digest,
select text format (*.txt) and save it to an appropriate directory. Later,
the author used the shareware e-mail client Poco, in which the same operation
was accomplished by highlighting a file and keying "Ctrl+S".
But as the number of digests the author was tracking grew, this procedure
became more and more tedious. Fortunately, about that time the author was
assigned to write a review of the second edition of the Perl Cookbook, and he
learned how to use the Net::POP3 module to receive his e-mail directly. So
now he uses a Perl script to get all his digests and save them as text files
to appropriate directories -- and then lets a GUI e-mail client take care of
the rest.
Here is a script which more or less accomplishes this:
# script: get_digests.pl
#!/usr/bin/perl
use strict;
use warnings;
use Net::POP3;
use Term::ReadKey;
my ($site, $username, $password);
my ($verref, $pop3, $messagesref, $undeleted, $msgnum, $message);
my ($k,$v);
my ($oldfh, $output);
my %digests = (
'pbml' => "E:/Digest/pbml",
'pw32u' => "E:/Digest/pw32u",
'london' => "E:/Digest/london",
);
$site = 'pop3.someISP.com';
$username = 'myuserid';
$pop3 = Net::POP3->new($site)
or die "Couldn't open connection to $site: $!";
print "Enter password for $username at $site: ";
ReadMode('noecho');
$password = ReadLine(0);
chomp $password;
ReadMode(0);
print "\n";
defined ($pop3->login($username, $password))
or die "Can't authenticate: $!";
$messagesref = $pop3->list
or die "Can't get list of undeleted messages: $!";
while ( ($k,$v) = each %$messagesref ) {
my ($messageref, $line, %headers);
print "$k:\t$v\n";
$messageref = $pop3->top($k);
local $_;
foreach (@$messageref) {
chomp;
last if (/^\s*$/);
next unless (/^\s*(Date:|From:|Subject:|To:)/);
if (/^\s*Date:\s*(.*)/) {
$headers{'Date'} = $1;
}
if (/^\s*From:\s*(.*)/) {
$headers{'From'} = $1;
}
if (/^\s*Subject:\s*(.*)/) {
$headers{'Subject'} = $1;
}
if (/^\s*To:\s*(.*)/) {
$headers{'To'} = $1;
}
}
if ($headers{'Subject'} =~ /^\[PBML\]/) {
get_digest($pop3, $k, 'pbml', $headers{'Subject'});
}
if ($headers{'Subject'} =~ /^Perl-Win32-Users/) {
get_digest($pop3, $k, 'pw32u', $headers{'Subject'});
}
if ($headers{'Subject'} =~ /^london\.pm/) {
get_digest($pop3, $k, 'london', $headers{'Subject'});
}
}
$pop3->quit() or die "Couldn't quit cleanly: $!";
print "Finished!\n";
sub get_digest {
my ($pop3, $msgnum, $digest, $subj) = @_;
print "Retrieving $msgnum: $subj";
my $message =
$pop3->get($msgnum) or die "Couldn't get message $msgnum: $!";
if ($message) {
print "\n";
my $digestfile = "$digests{$digest}/$subj.txt";
_print_message($digestfile, $message);
print "Marking $msgnum for deletion\n";;
$pop3->delete($msgnum) or die "Couldn't delete message $msgnum: $!";
} else {
print "Failed: $!\n";
}
}
sub _print_message {
my ($digestfile, $message) = @_;
my @lines = @{$message};
my $counter = 0;
open(FH, ">$digestfile")
or die "Couldn't open $digestfile for writing: $!";
for (my $i = 0; $i<=$#lines; $i++) {
chomp($lines[$i]);
# Identify the first blank line in the digest,
# i.e., the end of the headers
if ($lines[$i] =~ /^$/) {
$counter = $i;
last;
}
};
# Transfer digest to appropriate directory, skipping over digest header
# so as to start just above Today's Topics
foreach my $line (@lines[$counter+1 .. $#lines]) {
chomp($line);
# For some reason the $pop3->get() puts a single whitespace at the
# start of most (all but the first?) lines
# That has to be cleaned up so digest.pl can correctly process
# header info and identify beginning of Today's Topics
if ($line =~ /^\s(.*)/) {
print FH $1, "\n";
} else {
print FH $line, "\n";
}
}
close FH or die "Couldn't close after writing: $!";
}
No promise is made that this script or any script contained in this
documentation will work correctly on your system. Hack it up to get it to
work the way you want it to.
=head1 ASSUMPTIONS AND QUALIFICATIONS
=over 4
=item 1 No Change in Mailing List Digest Software
The main assumption on which Mail::Digest::Tools depends for its success is
that the provider of a particular digest continues to use the same mailing
list software to produce the digest. If the provider changes his/her software,
you must modify Mail::Digest::Tools' configuration data accordingly.
=item 2 Digest Must Be One E-mail Without Attachments
At its current stage of development Mail::Digest::Tools is only applicable to
mailing list digests which arrive as one continuous file. It is C<not>
applicable to digests (e.g., Cygwin, module-authors@perl.org) which are
supplied in a format consisting of (a) one file with instructions and a table
of contents and (b) all the individual messages provided as e-mail attachments.
=item 3 Perl 5.6+ Only
The program was created with Perl 5.6. Certain features, such as the use of
the C<our> modifier, were not available prior to 5.6. Modifications to
account for pre-5.6 features are left as an exercise for the user.
=item 4 Time::Local
Mail::Digest::Tools internally uses Perl core extension Time::Local. If at
some future point this module is not included as part of a Perl core
distribution, you would have to install it manually from CPAN.
=back
=head1 HISTORY AND FUTURE DEVELOPMENT
=head2 PRE-CPAN HISTORY
ActiveState maintains Perl for Windows-based platforms and also maintains a
variety of mailing lists for users of its Windows-compatible versions of Perl.
Subscribers to these lists can receive messages either as individual e-mails
or as part of a daily digest which contains a listing of the day's topics and
the complete text of each message. The messages are often best followed as
discussion 'threads' which may extend over several days' worth of digests.
In June of 2000, however, ActiveState had to temporarily take its mailing lists
off-line for technical reasons. When these lists were restored to service,
their archive capacities were not immediately restored. I had just begun my
study of Perl and had come to enjoy reading the Perl-Win32-Users digest. As
I set off for the Yet Another Perl Conference in Pittsburgh, I shouted out,
'I want my Perl-Win32-Users digest!' I wrote a Perl script called C<digest.pl>
to fill that gap.
ActiveState has since restored archiving capacity to their lists. For reasons
that would perhaps best be explored in a psychotherapeutic context, however, I
had become attached to my local archive of the 'pw32u' list, so I continued to
maintain this program and fine-tune its coding.
In early 2001 it became apparent that this program could be applied to a wide
variety of mailing list digests -- not just those provided by ActiveState. In
particular, valuable digests provided by Yahoo Groups (formerly E-groups) such
as NT Emacs Users, Perl 5 Porters and Perl Beginners could also be archived if
C<digest.pl> were modified appropriately. I made those modifications and
began to track several other digests. I was able to use the archive I had
developed as a window into one part of the Perl community in a Lightning Talk
I gave at YAPC::North America in Montreal in June 2001, ''An Index of
Incivility in the Perl Community.''
Maintaining C<digest.pl> was, to a considerable extent, the way I taught myself
Perl. Along the way I incorporated my first profiler into the script -- and
then discarded it. Some of the subroutines I had written for early versions of
the program had applicability to other scripts -- and thus was born my first
module -- also since discarded. By July 2003 I was up to version 1.3.
Following a suggestion by Uri Guttman at the YAPC::EU conference held in Paris
in July 2003, wherever possible the use of separate
print statements for each line to be printed was eliminated in favor of
concatenating strings to be printed into much larger strings which could be
printed all at once. This revision reduced the number of times filehandles
had to be opened for writing. A given thread file was now opened only once
per call of this program, rather than once for each message in each digest
processed per call of the program.
Various other improvements, such as the possibility of stripping out
unnecessary multipart MIME content and the introduction of subdirectories
for archiving, were made in late 2003. At that point I
decided to transform the script into a full-fledged Perl module. At first I
tried out an object-oriented structure (with which I was familiar from my first
two CPAN modules, F<List::Compare> and F<Data::Presenter>). That OO structure
necessitated one constructor and one method call per typical script, but since
the constructor did nothing but some cursory validation of the configuration
data, it was mostly superfluous. Hence, I jettisoned the OO structure in favor
of a functional approach. The result: Mail::Digest::Tools.
=head2 CPAN
After these revisions, I was up to version 1.96. Why revert to a lower
version number at this point? That is why Mail::Digest::Tools makes its CPAN
debut in version 2.04.
v1.97 (2/18/2004): Dealing with problem that Win32 and Unix/Linux may create
different thread names for the same set of source messages because they have
different lists of characters forbidden in file names. This became a problem
while writing tests for C<process_new_digests()> because it made predicting
the names of thread files created via that function more difficult to predict.
Tests adjusted appropriately.
v1.98 (2/19/2004): Eliminated suspect uses of C</o> modifier on regexes.
This was causing problems when I called C<process_new_digests()> on two
different types of digests in the same script. Also, eliminated code
referring to DOS (I<e.g.,> code eliminating characters unacceptable in
DOS filenames) as I have no way to test this module on a DOS box.
v1.99 (2/22/2004): ActiveState introduced a new format for its
Perl-Win32-Users digest -- the digest which originally inspired the creation
of this module's predecessor in 2000. One aspect of this new format was a
clear improvement: HTML attachments are now stripped before messages are
posted to the digest, so multipart MIME content has either been reduced
considerably or eliminated altogether. But another aspect of this new
format upset code going back four years: The delimiter immediately
following Today's Topics is now different from the delimiters separating each
message in the digest. Working around this appeared to be surprisingly
difficult, especially since this revision had to be done in the middle of
writing a test suite for CPAN distribution. A new key has been added to the
C<%config_in> hash for each digest:
$config_in{'post_topics_delimiter'}
v2.00 (2/23/2004): Testing conducted after the last revision revealed a bug
going back several versions in the internal subroutine stripping multipart
MIME content. The last paragraph of each message which did I<not> have MIME
content was being stripped off. The offending code was found within
C<_analyze_message_body()>. (The author recently learned of the CPAN
module F<Email::StripMime>. This looks promising as a replacement for
the hand-rolled subroutine used within Mail::Digest::Tools, but a full study
of its possibilities will be deferred to a later version. Also in this
version, POD was rewritten to reflect the introduction of the post-topics
delimiter.
v2.01 (2/24/2004): Backslashes (except as part of C<\n> newline characters)
are prohibited in C<%config_out> key C<thread_msg_delimiter>. This is
because in the test suite that key's value is used as a variable inside a
regular expression which in turn is used as an argument to C<split()>.
Preliminary investigation suggests that to work around the backslash
metacharacter in that situation would be very time-consuming.
v2.02 (2/26/2004): Revised C<reply_to_digest_message()> internal
subroutine C<_strip_down_for_reply> to reflect distinction between post-topics
delimiter and source message delimiter.
v2.03 (3/04/2004): Fixed bug in C<readdir> call in C<repair_message_order()>.
Extensive reworking of test suite.
v2.04 (3/05/2004): No changes in module. Refinement of test suite only.
v2.05 (3/07/2004): Fixed accidental deletion of incrementation of
C<$message_count> in C<_strip_down()>.
v2.06 (3/10/2004): Correction of errors in test suite. Elimination of use of List::Compare in test suite.
v2.07 (3/11/2004): Correction of error in t/03.t
v2.08 (3/11/2004): Correction in _clean_up_thread_title and in tests.
v2.10 (3/15/2004): Corrections to README and documentation only.
v2.11 (10/23/2004): Fixed several errors which resulted in "Bizarre copy of hash in leave" error when running test suite under Devel::Cover.
v2.12 (05/14/2011): Added 'mirbsd' to list of Unixish-OSes.
=head1 AUTHOR
James E. Keenan (F<jkeenan@cpan.org>).
Creation date: August 21, 2000.
Last modification date: May 14, 2011.
Copyright (c) 2000-2011 James E. Keenan. United States. All rights reserved.
This software is distributed with absolutely no warranty, express or implied.
Use it at your own risk. This is free software which you may distribute under
the same terms as Perl itself.
=cut