The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl -w

## classify_test_data_in_a_file_symbolic.pl

#use lib '../blib/lib', '../blib/arch';

use strict;
use Algorithm::DecisionTree;

# Refuse to run unless exactly three command-line arguments were supplied.
# The usage text names what each positional argument is expected to be.
if (@ARGV != 3) {
    my @usage_lines = (
        "This script must be called with exactly three command-line arguments:\n",
        "     1st arg: name of the training datafile\n",
        "     2nd arg: name of the test data file\n",
        "     3rd arg: the name of the output file to which class labels will be written\n",
    );
    # The text ends in a newline, so die() does not append file/line info.
    die join('', @usage_lines);
}

# When set to 1, every classification result line is also echoed to the
# terminal in addition to being written to the output file.
my $debug = 0;

# Positional command-line arguments, in the order the usage message lists them.
my $training_datafile = $ARGV[0];
my $test_datafile     = $ARGV[1];
my $outputfile        = $ARGV[2];

my $dt = Algorithm::DecisionTree->new(
    training_datafile => $training_datafile,
);

# Preparation calls on the classifier object, issued in this fixed order
# before the tree itself is constructed.
$dt->get_training_data();
$dt->calculate_first_order_probabilities();
$dt->calculate_class_priors();

### UNCOMMENT THE NEXT STATEMENT if you would like to see
### the training data that was read from the disk file:
#$dt->show_training_data();

# The root node is handed to classify() for every test sample later on.
my $root_node = $dt->construct_decision_tree_classifier();


### UNCOMMENT THE NEXT STATEMENT if you would like to see
### the decision tree displayed in your terminal window:
#$root_node->display_decision_tree("   ");

# NOW YOU ARE READY TO CLASSIFY TEST DATA IN A FILE:

# Classify every sample found in the test data file and write one result
# line per sample to the output file.
#
# Input format, as parsed below:
#   * lines starting with '#' and blank lines are skipped;
#   * a line of the form "Feature Order For Data: f1 f2 ..." (matched
#     case-insensitively) sets the feature names for the rows that follow;
#   * every other line holds: sample name, a second column that is read but
#     not used for classification, then one value per listed feature.
#
# Dies if either file cannot be opened, if the feature header is empty, if a
# row's value count differs from the number of features, or if the output
# file cannot be closed cleanly.

# Three-argument open with lexical handles: the file names are taken
# literally, so characters such as '>' or '|' in a name cannot change the
# open mode.
open my $test_fh, '<', $test_datafile
    or die "Unable to open the test datafile $test_datafile: $!";
open my $output_fh, '>', $outputfile
    or die "Unable to open the output file $outputfile: $!";
print {$output_fh} "\nThe classification results for each sample are shown in decreasing order of class probabilities:\n\n";

my @features;
while (my $line = <$test_fh>) {
    chomp $line;
    next if $line =~ /^#/;
    next if $line =~ /^\s*$/;
    if ($line =~ /^\s*Feature Order For Data:\s*(.+)/i) {
        @features = split ' ', $1;
        die "feature labels are empty" if @features == 0;
        next;
    }

    # split ' ' ignores leading whitespace, so an indented row does not
    # yield an empty first field that would shift every column by one.
    my @test_sample_entries = split ' ', $line;
    my $sample_name = shift @test_sample_entries;
    shift @test_sample_entries;    # drop the second column; it is not used here

    die "the number of features listed in the header does not match " .
        "the number of values in the test data for sample $sample_name"
                  unless @features == @test_sample_entries;

    # Build the "feature=value" strings that classify() is given.
    my @test_sample =
        map { "$features[$_]=$test_sample_entries[$_]" } 0 .. $#features;

    # Work on a copy of the returned hash so that removing the
    # 'solution_path' entry does not touch the structure classify() returned.
    my %classification = %{ $dt->classify($root_node, \@test_sample) };
    delete $classification{'solution_path'};

    # Highest probability first; ties are broken by class name so that the
    # output does not depend on hash ordering.
    my @which_classes = sort {
        $classification{$b} <=> $classification{$a} or $a cmp $b
    } keys %classification;

    my $result = "$sample_name:  ";
    foreach my $class (@which_classes) {
        # The class name is passed as an argument rather than interpolated
        # into the format, so a '%' in a name cannot corrupt the output.
        $result .= sprintf("%s: %.3f    ", $class, $classification{$class});
    }
    print "$result\n" if $debug == 1;
    print {$output_fh} "$result\n";
}
close $test_fh;
# Buffered write errors only surface at close time, so check it.
close $output_fh
    or die "Unable to close the output file $outputfile: $!";