The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl

# NOTE:
# This code is derived from code written by John Graham-Cumming
# and found in the recent issue of Dr. Dobbs Journal (May 2005)
# in the article "Naive Bayesian Text Classification".
# Some changes have been made but at it's core this code still
# the same, so I want to be sure to give credit where credit is
# due.

use strict;

my %words;

sub load_db {
    return unless -e 'words.db-p5.pl';
    open DB, "<words.db-p5.pl" || die "Cannot open the words.db-p5.pl file: $!";
    while (my $line = <DB>) {
        $line .= chomp;
        my ($key, $value) = split(/\t/, $line);
        $words{$key} = $value;
    }
    close DB;
}

sub save_db {
    open DB, ">words.db-p5.pl" || die "Cannot open the words.db-p5.pl file: $!";
    foreach my $key (keys %words) {
        print DB "$key\t$words{$key}\n";
    }
    close DB;    
}

sub parse_file {
    my ($file) = @_;
    my %word_counts;
    open FILE, "<$file" || die "Cannot open the '$file' file : $!";
    while (my $line = <FILE>) {
        while ($line =~ s/(\w+)[ \t\n\r]//) {
            $word_counts{lc($1)}++;
        }
    }
    close FILE;
    return %word_counts;
}

sub add_words {
    my ($category, %words_in_file) = @_;
    foreach my $word (keys %words_in_file) {
        $words{"$category-$word"} += $words_in_file{$word};
    }    
}

sub classify {
    my (%words_in_file) = @_;
    
    my %count;
    my $total = 0;
    foreach my $entry (keys %words) {
        $entry =~ /^(.+)-(.+)$/;
        $count{$1} += $words{$entry};
        $total += $words{$entry};
    }
    
    my %score;
    foreach my $word (keys %words_in_file) {
        foreach my $category (keys %count) {
            if (defined($words{"$category-$word"})) {
                $score{$category} += log($words{"$category-$word"} / $count{$category});
            }
            else {
                $score{$category} += log(0.01 / $count{$category});
            }
        }
    }
    
    foreach my $category (keys %count) {
        $score{$category} += log($count{$category} / $total)
    }
    
    foreach my $category (sort { $score{$b} <=> $score{$a} } keys %count) {
        print "$category $score{$category}\n";
    }
    
}

load_db();

if ($ARGV[0] eq 'add' && $#ARGV == 2) {
    add_words($ARGV[1], parse_file($ARGV[2]));
}
elsif ($ARGV[0] eq 'classify' && $#ARGV == 1) {
    classify(parse_file($ARGV[1]));
}
else {
    print <<USAGE;
USAGE:
    add <category> <file>
    classify <file>
USAGE
}

save_db();