# lib/Algorithm/BoostedDecisionTree.pm

package Algorithm::BoostedDecisionTree;

#--------------------------------------------------------------------------------------
# Copyright (c) 2015 Avinash Kak. All rights reserved.  This program is free
# software.  You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::BoostedDecisionTree is a Perl module for boosted decision-tree based
# classification of multidimensional data.
# -------------------------------------------------------------------------------------

use lib 'blib/lib', 'blib/arch';

#use 5.10.0;
use strict;
use warnings;
use Carp;
use Algorithm::DecisionTree 3.20;
use List::Util qw(reduce min max);

our $VERSION = '3.20';

# Inherit from Algorithm::DecisionTree.  The @ISA array has to be qualified with
# this module's full package name (Algorithm::BoostedDecisionTree): an @ISA that
# lives in any other package is never consulted when methods are resolved on
# objects blessed into this class.
@Algorithm::BoostedDecisionTree::ISA = ('Algorithm::DecisionTree');


############################################   Constructor  ##############################################

# Constructor:
#
# Takes keyword arguments.  'how_many_stages' and 'stagedebug' are consumed by
# this class; every other keyword is handed to Algorithm::DecisionTree->new().
# Croaks when a keyword is not accepted by check_for_illegal_params().
#
# The returned object is the Algorithm::DecisionTree instance re-blessed into
# $class and extended with per-stage bookkeeping hashes keyed by stage index.
sub new { 
    my ($class, %args) = @_;
    # 'stagedebug' is read only by this class (it ends up in _stagedebug below),
    # so it is kept out of the shared keyword validation.
    my @params = grep {$_ ne 'stagedebug'} keys %args;
    # An empty keyword list cannot contain a misspelled keyword, so the check
    # is only consulted when there is something to check.
    croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n" 
                           if @params && !check_for_illegal_params(@params);
    # Strip the keywords that belong to the boosting layer before delegating.
    my %dtargs = %args;
    delete @dtargs{qw(how_many_stages stagedebug)};
    my $instance = Algorithm::DecisionTree->new(%dtargs);
    # Stage slots run from 0 up to how_many_stages (inclusive, as before); a
    # missing how_many_stages yields the single slot 0 without an undef warning.
    my @stage_slots = 0 .. ($args{how_many_stages} || 0);
    $instance->{_how_many_stages}             =  $args{how_many_stages} || undef;
    $instance->{_stagedebug}                   =  $args{stagedebug} || 0;
    $instance->{_training_samples}             =  {map {$_ => []} @stage_slots};
    $instance->{_all_trees}                    =  {map {$_ => Algorithm::DecisionTree->new(%dtargs)} @stage_slots};
    $instance->{_root_nodes}                   =  {map {$_ => undef} @stage_slots};
    $instance->{_sample_selection_probs}       =  {map {$_ => {}} @stage_slots};
    $instance->{_trust_factors}                =  {map {$_ => undef} @stage_slots};
    $instance->{_misclassified_samples}        =  {map {$_ => []} @stage_slots};
    $instance->{_classifications}              =  undef;
    $instance->{_trust_weighted_decision_classes}  =  undef;
    return bless $instance, $class;
}


# Reads the CSV file named by $self->{_training_datafile} and stores the ingested
# training data in the base tree, $self->{_all_trees}->{0}.
#
# Expected file layout, as enforced by the code below:
#   - lines starting with '#', blank lines and lines made only of whitespace/'='
#     are skipped;
#   - the header row starts with "" and then lists the column headings;
#   - every other row starts with a sample identifier followed by the values.
# $self->{_csv_class_column_index} selects the class-label column and
# $self->{_csv_columns_for_features} lists the feature columns (both 1-based
# with respect to the columns that follow the identifier column).
#
# Side effects: fills the base tree's _class_names, _feature_names,
# _samples_class_label_hash, _training_data_hash, _features_and_values_hash,
# _features_and_unique_values_hash, _numeric_features_valuerange_hash and
# _feature_values_how_many_uniques_hash, and sets $self->{_all_training_data},
# $self->{_all_sample_names} and $self->{_number_of_training_samples}.
#
# Dies when no file was specified, when the file name does not end in .csv,
# when the file cannot be opened, or when no header row is found.
sub get_training_data_for_base_tree {
    my $self = shift;
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    my $filename = $self->{_training_datafile} || die "you did not specify a file for the training data";
    # Reject non-CSV names before touching the filesystem.
    die("Aborted. get_training_data_csv() is only for CSV files") unless $filename =~ /\.csv$/;
    # Three-argument open with a lexical handle: the file name is never
    # interpreted as an open mode, and the handle is closed once it is read.
    open my $filein, '<', $filename or die "Unable to open $filename: $!";
    my @all_data =  <$filein>;
    close $filein;
    my $class_name_in_column = $self->{_csv_class_column_index} - 1;   # subtract 1 because first col has labels
    my %data_hash = ();
    my @csv_headers;
    foreach my $record (@all_data) {
        next if $record =~ /^#/;     
        next if $record =~ /^[ ]*\r?\n?$/;
        next if $record =~ /^[\s=]*$/;
        $record =~ s/\r?\n?$//;
        if ($record =~ /^\"\"/) {
            # Header row: trim and unquote every field; the leading "" becomes
            # the empty string and is removed by the grep.
            @csv_headers =  grep $_, map {$_ =~ s/^\"|\"$//g; $_} map {$_ =~ s/^\s*|\s*$//g; $_} split /,/, $record;
        } else {
            # Data row: first field is the sample identifier, the rest are values.
            my @fields =  map {$_ =~ s/^\"|\"$//g; $_} map {$_ =~ s/^\s*|\s*$//g; $_} split /,/, $record;
            my @fields_after_first = @fields[1..$#fields]; 
            $data_hash{$fields[0]} = \@fields_after_first;
        }
    }
    die 'Aborted. The first row of CSV file must begin with "" and then list the feature names and the header label you want to use for the column that will hold class labels for data sample records'
        unless @csv_headers;
    my @field_names = @csv_headers;
    print "\nfield names: @field_names\n" if $self->{_debug2};
    my $class_column_heading = $field_names[$class_name_in_column];
    my @feature_names = map {$field_names[$_-1]} @{$self->{_csv_columns_for_features}};
    print "\nfeature names: @feature_names\n" if $self->{_debug2};
    $class_column_heading =~ s/^\s*\"|\"\s*$//g;
    my %class_for_sample_hash = ();
    my %feature_values_for_samples_hash = ();
    foreach my $key (keys %data_hash) {
        next if $key =~ /^\"\"$/;
        my $cleanedup = $key;
        $cleanedup =~ s/^\s*\"|\"\s*$//g;
        my $which_class = $data_hash{$key}[$class_name_in_column];
        $which_class  =~ s/^\s*\"|\"\s*$//g;
        $class_for_sample_hash{"sample_$cleanedup"} = "$class_column_heading=$which_class";
        my @features_and_values_list = ();
        foreach my $i (@{$self->{_csv_columns_for_features}}) {
            my $feature_column_header = $field_names[$i-1];
            my $feature_val = $data_hash{$key}->[$i-1];
            $feature_val  =~ s/^\s*\"|\"\s*$//g;
            # Pure-integer values are normalized to one decimal place.
            $feature_val = sprintf("%.1f",$feature_val) if $feature_val =~ /^\d+$/;
            push @features_and_values_list,  "$feature_column_header=$feature_val";
        }
        $feature_values_for_samples_hash{"sample_" . $cleanedup} = \@features_and_values_list;
    }
    my @all_class_names =  sort keys %{ {map {$_ => 1} values %class_for_sample_hash } };
    $self->{_number_of_training_samples} = scalar keys %feature_values_for_samples_hash;
    # For every feature column, collect the values of that column over all rows.
    # (Lexicals are deliberately not called $a/$b, which are sort's variables.)
    my %features_and_values_hash = map {
        my $column = $_;
        ($csv_headers[$column-1] => [ map {my $value = $_; $value =~ /^\d+$/ ? sprintf("%.1f",$value) : $value}
                                      map {$data_hash{$_}->[$column-1]} keys %data_hash ]);
    } @{$self->{_csv_columns_for_features}};
    if ($self->{_debug2}) {
        print "\nDisplaying features and their values for entire training data:\n\n";
        foreach my $fname (keys  %features_and_values_hash) {         
            print "        $fname    =>  @{$features_and_values_hash{$fname}}\n";
        }
    }
    my %features_and_unique_values_hash = ();
    my %feature_values_how_many_uniques_hash  =  ();
    my %numeric_features_valuerange_hash   =   ();
    foreach my $feature (keys %features_and_values_hash) {
        my %seen = ();
        # Keeps the first occurrence of every value other than 'NA'; because the
        # block yields $_ itself, values that are false in Perl are dropped too.
        my @unique_values_for_feature =  grep {$_ if $_ ne 'NA' && !$seen{$_}++} @{$features_and_values_hash{$feature}};
        $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
        # A feature is numeric only when every unique value matches $numregex.
        my $not_all_values_float = grep {$_ !~ /^$numregex$/} @unique_values_for_feature;
        if ($not_all_values_float == 0) {
            my @minmaxvalues = minmax(\@unique_values_for_feature);
            $numeric_features_valuerange_hash{$feature} = \@minmaxvalues; 
        }
        $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
    }
    my $base_tree = $self->{_all_trees}->{0};
    $base_tree->{_class_names} = \@all_class_names;
    $base_tree->{_feature_names} = \@feature_names;
    $base_tree->{_samples_class_label_hash} = \%class_for_sample_hash;
    $base_tree->{_training_data_hash}  =  \%feature_values_for_samples_hash;
    $base_tree->{_features_and_values_hash}    =  \%features_and_values_hash;
    $base_tree->{_features_and_unique_values_hash}    =  \%features_and_unique_values_hash;
    $base_tree->{_numeric_features_valuerange_hash} = \%numeric_features_valuerange_hash;
    $base_tree->{_feature_values_how_many_uniques_hash} = \%feature_values_how_many_uniques_hash;
    $self->{_all_training_data} = \%feature_values_for_samples_hash;    
    # Sample indices are compared numerically here, as everywhere else in this file.
    $self->{_all_sample_names} = [sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash];
    if ($self->{_debug1}) {
        print "\n\n===========================  data ingested for the base tree   ==================================\n\n";
        print "\nAll class names: @{$base_tree->{_class_names}}\n";
        print "\nEach sample data record:\n";
        foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %{$base_tree->{_training_data_hash}}) {
            print "$kee    =>   @{$base_tree->{_training_data_hash}->{$kee}}\n";
        }
        print "\nclass label for each data sample:\n";        
        foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %{$base_tree->{_samples_class_label_hash}}) {
            print "$kee    =>   $base_tree->{_samples_class_label_hash}->{$kee}\n";            
        }
        print "\nfeatures and the values taken by them:\n";
        for my $kee  (sort keys %{$base_tree->{_features_and_values_hash}}) {
            print "$kee    =>   @{$base_tree->{_features_and_values_hash}->{$kee}}\n";                        
        }
        print "\nnumeric features and their ranges:\n";
        for my $kee  (sort keys %{$base_tree->{_numeric_features_valuerange_hash}}) {
            print "$kee    =>   @{$base_tree->{_numeric_features_valuerange_hash}->{$kee}}\n";
        }
        print "\nunique values for the features:\n";
        for my $kee  (sort keys %{$base_tree->{_features_and_unique_values_hash}}) {
            print "$kee    =>   @{$base_tree->{_features_and_unique_values_hash}->{$kee}}\n";  
        }
        print "\nnumber of unique values in each feature:\n";        
        for my $kee  (sort keys %{$base_tree->{_feature_values_how_many_uniques_hash}}) {
            print "$kee    =>   $base_tree->{_feature_values_how_many_uniques_hash}->{$kee}\n";
        }
    }
}

# Delegates to the show_training_data() method of the stage-0 (base) tree and
# passes its return value through.
sub show_training_data_for_base_tree {
    my ($self) = @_;
    my $base_tree = $self->{_all_trees}{0};
    return $base_tree->show_training_data();
}

# Asks the base tree (stage 0) to compute its first-order probabilities and its
# class priors, then gives every sample in $self->{_all_sample_names} the same
# stage-0 selection probability, 1 / (number of samples).
sub calculate_first_order_probabilities_and_class_priors {
    my ($self) = @_;
    my $base_tree = $self->{_all_trees}{0};
    $base_tree->calculate_first_order_probabilities();
    $base_tree->calculate_class_priors();
    my @sample_names = @{$self->{_all_sample_names}};
    my %uniform_probs;
    foreach my $sample_name (@sample_names) {
        $uniform_probs{$sample_name} = 1.0 / @sample_names;
    }
    $self->{_sample_selection_probs}{0} = \%uniform_probs;
}

# Has the base tree (stage 0) build its decision tree and records the value it
# returns as the stage-0 root node.  That value is also returned.
sub construct_base_decision_tree {
    my ($self) = @_;
    my $root_node = $self->{_all_trees}{0}->construct_decision_tree_classifier();
    $self->{_root_nodes}{0} = $root_node;
}

# Calls display_decision_tree() on the stage-0 root node, passing a string of
# five blanks as its argument.
sub display_base_decision_tree {
    my ($self) = @_;
    my $offset = "     ";
    my $base_root = $self->{_root_nodes}{0};
    return $base_root->display_decision_tree($offset);
}

# Builds the cascade of boosted trees for stages 1 .. how_many_stages-1 on top of
# the already constructed base tree (stage 0).
#
# For the base tree and then for every later stage this method
#   1. evaluates the stage's tree on all training samples and records the
#      misclassified ones in $self->{_misclassified_samples}->{stage};
#   2. sums the selection probabilities of the misclassified samples to get the
#      stage's weighted error rate, and derives the trust factor
#      0.5 * log((1 - error) / error) into $self->{_trust_factors}->{stage}.
# For every stage after the base it first re-weights the sample selection
# probabilities (misclassified samples are multiplied by exp(+trust), the
# others by exp(-trust), then normalized), picks the highest-probability samples
# while their cumulative probability stays below 0.5 as that stage's training
# set, and builds a new Algorithm::DecisionTree from those samples.
#
# Diagnostic output is printed when $self->{_stagedebug} is true.
sub construct_cascade_of_trees {
    my $self = shift;
    my $base_tree = $self->{_all_trees}->{0};
    my $how_many_stages = $self->{_how_many_stages} || 0;
    $self->{_training_samples}->{0} = $self->{_all_sample_names};
    $self->{_misclassified_samples}->{0} = $self->evaluate_one_stage_of_cascade($base_tree, $self->{_root_nodes}->{0});
    if ($self->{_stagedebug}) {
        $self->show_class_labels_for_misclassified_samples_in_stage(0);
        print "\n\nSamples misclassified by base classifier: @{$self->{_misclassified_samples}->{0}}\n";
        my $how_many = @{$self->{_misclassified_samples}->{0}};
        print "\nNumber of misclassified samples: $how_many\n";
    }
    # Seeding reduce() with 0 keeps the rate defined when nothing was misclassified.
    my $misclassification_error_rate = reduce {$a+$b} 0, map {$self->{_sample_selection_probs}->{0}->{$_}} @{$self->{_misclassified_samples}->{0}};
    print "\nMisclassification_error_rate for base classifier: $misclassification_error_rate\n" if $self->{_stagedebug};
    $self->{_trust_factors}->{0} = _trust_factor_for_error_rate($misclassification_error_rate);
    print "\nBase class trust factor: $self->{_trust_factors}->{0}\n"  if $self->{_stagedebug};
    foreach my $stage_index (1 .. $how_many_stages - 1) {
        print "\n\n========================== Constructing stage indexed $stage_index =========================\n"
              if $self->{_stagedebug};
        # Lookup table of the samples the previous stage got wrong.
        my %misclassified_samples = map {$_ => 1} @{$self->{_misclassified_samples}->{$stage_index-1}};
        my $previous_probs = $self->{_sample_selection_probs}->{$stage_index-1};
        my $previous_trust = $self->{_trust_factors}->{$stage_index-1};
        # Misclassified samples gain weight (exp(+trust)), the rest lose weight (exp(-trust)).
        my $probs_this_stage = { map {$_ => $previous_probs->{$_} * exp(-1.0 * $previous_trust * ($misclassified_samples{$_} ? -1.0 : 1.0))} @{$self->{_all_sample_names}} };
        $self->{_sample_selection_probs}->{$stage_index} = $probs_this_stage;
        my $normalizer = reduce {$a + $b} values %{$probs_this_stage};
        print "\nThe normalizer is: $normalizer\n"  if $self->{_stagedebug};
        foreach my $sample (keys %{$probs_this_stage}) {
            $probs_this_stage->{$sample} /= $normalizer;
        }
        # Take samples in order of decreasing probability while the running
        # total stays below one half.
        my @training_samples_this_stage = ();
        my $sum_of_probs = 0.0;
        foreach my $sample (sort {$probs_this_stage->{$b} <=> $probs_this_stage->{$a}} keys %{$probs_this_stage}) {
            $sum_of_probs += $probs_this_stage->{$sample};
            push @training_samples_this_stage, $sample if $sum_of_probs < 0.5;
            last if $sum_of_probs > 0.5;
        }
        $self->{_training_samples}->{$stage_index} = [sort {sample_index($a) <=> sample_index($b)} @training_samples_this_stage];
        if ($self->{_stagedebug}) {
            print "\nTraining samples for stage $stage_index: @{$self->{_training_samples}->{$stage_index}}\n\n";
            my $num_of_training_samples = @{$self->{_training_samples}->{$stage_index}};
            print "\nNumber of training samples this stage $num_of_training_samples\n\n";
        }
        # find intersection of two sets:
        my @training_samples_selection_check = grep $misclassified_samples{$_}, @{$self->{_training_samples}->{$stage_index}};
        if ($self->{_stagedebug}) {
            my @training_in_misclassified = sort {sample_index($a) <=> sample_index($b)} @training_samples_selection_check;
            print "\nTraining samples in the misclassified set: @training_in_misclassified\n";
            my $how_many = @training_samples_selection_check;
            print "\nNumber_of_miscalssified_samples_in_training_set: $how_many\n";
        }
        # The stage tree is created with the single argument 'boostingmode' and
        # then populated by hand from the base tree's settings and this stage's samples.
        my $dt_this_stage = Algorithm::DecisionTree->new('boostingmode');
        $dt_this_stage->{_training_data_hash} = { map {$_ => $self->{_all_training_data}->{$_} } @{$self->{_training_samples}->{$stage_index}} };

        $dt_this_stage->{_class_names} = $base_tree->{_class_names};
        $dt_this_stage->{_feature_names} = $base_tree->{_feature_names};
        $dt_this_stage->{_entropy_threshold} = $base_tree->{_entropy_threshold};
        $dt_this_stage->{_max_depth_desired} = $base_tree->{_max_depth_desired};        
        $dt_this_stage->{_symbolic_to_numeric_cardinality_threshold} = $base_tree->{_symbolic_to_numeric_cardinality_threshold};
        $dt_this_stage->{_samples_class_label_hash} = {map {$_ => $base_tree->{_samples_class_label_hash}->{$_}} keys %{$dt_this_stage->{_training_data_hash}}};
        $dt_this_stage->{_features_and_values_hash} = {map {$_ => []} keys %{$base_tree->{_features_and_values_hash}}};
        my $pattern = '(\S+)\s*=\s*(\S+)';        
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %{$dt_this_stage->{_training_data_hash}}) { 
            foreach my $feature_and_value (@{$dt_this_stage->{_training_data_hash}->{$sample}}) {
                # Skip entries that are not of the form feature=value instead of
                # reusing captures left over from an earlier match.
                my ($feature, $value) = $feature_and_value =~ /$pattern/ or next;
                push @{$dt_this_stage->{_features_and_values_hash}->{$feature}}, $value if $value ne 'NA';
            }
        }
        $dt_this_stage->{_features_and_unique_values_hash} = {map {my $feature = $_; $feature => [sort keys %{{map {$_ => 1} @{$dt_this_stage->{_features_and_values_hash}->{$feature}}}}]} keys %{$dt_this_stage->{_features_and_values_hash}}};
        # Value ranges are recomputed from this stage's samples, for the features
        # the base tree treats as numeric.
        $dt_this_stage->{_numeric_features_valuerange_hash} = {map {my $feature = $_; $feature =>  [min(@{$dt_this_stage->{_features_and_unique_values_hash}->{$feature}}), max(@{$dt_this_stage->{_features_and_unique_values_hash}->{$feature}})]} keys %{$base_tree->{_numeric_features_valuerange_hash}}};
        if ($self->{_stagedebug}) {
            print "\n\nPrinting features and their values in the training set:\n\n";
            foreach my $kee (sort keys %{$dt_this_stage->{_features_and_values_hash}}) {
                print "$kee   =>  @{$dt_this_stage->{_features_and_values_hash}->{$kee}}\n";
            }
            print "\n\nPrinting unique values for features:\n\n";
            foreach my $kee (sort keys %{$dt_this_stage->{_features_and_unique_values_hash}}) {
                print "$kee   =>  @{$dt_this_stage->{_features_and_unique_values_hash}->{$kee}}\n";            
            }
            print "\n\nPrinting unique value ranges for features:\n\n";
            foreach my $kee (sort keys %{$dt_this_stage->{_numeric_features_valuerange_hash}}) {
                print "$kee   =>  @{$dt_this_stage->{_numeric_features_valuerange_hash}->{$kee}}\n";            
            }
        }
        $dt_this_stage->{_feature_values_how_many_uniques_hash} = {map {$_ => scalar @{$dt_this_stage->{_features_and_unique_values_hash}->{$_}}} keys %{$base_tree->{_features_and_unique_values_hash}}};
        $dt_this_stage->calculate_first_order_probabilities();
        $dt_this_stage->calculate_class_priors();
        print "\n\n>>>>>>>Done with the initialization of the tree for stage $stage_index<<<<<<<<<<\n" if $self->{_stagedebug};
        my $root_node_this_stage = $dt_this_stage->construct_decision_tree_classifier();
        $root_node_this_stage->display_decision_tree("     ") if $self->{_stagedebug};

        $self->{_all_trees}->{$stage_index} = $dt_this_stage;
        $self->{_root_nodes}->{$stage_index} = $root_node_this_stage;
        $self->{_misclassified_samples}->{$stage_index} = $self->evaluate_one_stage_of_cascade($self->{_all_trees}->{$stage_index}, $self->{_root_nodes}->{$stage_index});
        if ($self->{_stagedebug}) {
            print "\nSamples misclassified by stage $stage_index classifier: @{$self->{_misclassified_samples}->{$stage_index}}\n";
            printf("\nNumber of misclassified samples: %d\n", scalar @{$self->{_misclassified_samples}->{$stage_index}});
            $self->show_class_labels_for_misclassified_samples_in_stage($stage_index);
        }
        my $stage_error_rate = reduce {$a+$b} 0, map {$probs_this_stage->{$_}} @{$self->{_misclassified_samples}->{$stage_index}};
        print "\nStage $stage_index misclassification_error_rate: $stage_error_rate\n" if $self->{_stagedebug};

        $self->{_trust_factors}->{$stage_index} = _trust_factor_for_error_rate($stage_error_rate);
        print "\nStage $stage_index trust factor: $self->{_trust_factors}->{$stage_index}\n"  if $self->{_stagedebug};
    }
}

# Private helper: turns a weighted error rate into the trust factor
# 0.5 * log((1 - error) / error).  The rate is first clamped into
# [epsilon, 1 - epsilon] so that a stage that misclassifies nothing (rate 0) or
# everything (rate 1) gets a large finite trust factor instead of dying with a
# division by zero or a log of zero.  Rates strictly inside that interval give
# exactly the unclamped formula.
sub _trust_factor_for_error_rate {
    my $error_rate = shift;
    my $epsilon = 1e-10;
    $error_rate = $epsilon     if $error_rate < $epsilon;
    $error_rate = 1 - $epsilon if $error_rate > 1 - $epsilon;
    return 0.5 * log((1 - $error_rate) / $error_rate);
}

# Classifies every training sample with one stage of the cascade and reports the
# samples that stage gets wrong.
#
# Arguments:
#   $trainingDT - the tree object of the stage; its classify() method is used
#   $root_node  - the root node handed to classify()
# Returns: a reference to the list of misclassified sample names, sorted by
# numeric sample index.
#
# The sample data and the true class labels are always taken from the base tree
# (stage 0).  Features whose value is NA are removed before classification.
# A sample's predicted class is the class with the highest value in the hash
# returned by classify() (its 'solution_path' entry is ignored).
sub evaluate_one_stage_of_cascade {
    my $self = shift;
    my $trainingDT = shift;
    my $root_node = shift;
    my @misclassified_samples = ();
    foreach my $test_sample_name (@{$self->{_all_sample_names}}) {
        my @test_sample_data = @{$self->{_all_trees}->{0}->{_training_data_hash}->{$test_sample_name}};
        print "original data in $test_sample_name:@test_sample_data\n" if $self->{_stagedebug};
        # grep (not map) so that NA features are really removed rather than
        # replaced by empty entries.
        @test_sample_data = grep {$_ !~ /=NA$/} @test_sample_data;
        print "$test_sample_name: @test_sample_data\n" if $self->{_stagedebug}; 
        my %classification = %{$trainingDT->classify($root_node, \@test_sample_data)};
        my @solution_path = @{$classification{'solution_path'}};                                  
        delete $classification{'solution_path'};                                              
        my @which_classes = sort {$classification{$b} <=> $classification{$a}} keys %classification;
        my $most_likely_class_label = $which_classes[0];
        if ($self->{_stagedebug}) {
            print "\nClassification:\n\n";
            print "     class                         probability\n";
            print "     ----------                    -----------\n";
            foreach my $which_class (@which_classes) {
                my $classstring = sprintf("%-30s", $which_class);
                my $valuestring = sprintf("%-30s", $classification{$which_class});
                print "     $classstring $valuestring\n";
            }
            print "\nSolution path in the decision tree: @solution_path\n";
            print "\nNumber of nodes created: " . $root_node->how_many_nodes() . "\n";
        }
        my $true_class_label_for_test_sample = $self->{_all_trees}->{0}->{_samples_class_label_hash}->{$test_sample_name};
        printf("%s:   true_class: %s    estimated_class: %s\n", $test_sample_name, $true_class_label_for_test_sample, $most_likely_class_label) if $self->{_stagedebug};
        push @misclassified_samples, $test_sample_name if $true_class_label_for_test_sample ne $most_likely_class_label;
    }
    return [sort {sample_index($a) <=> sample_index($b)} @misclassified_samples];
}

# Prints, for the given stage, the misclassified samples, how many there are,
# each sample's true class label (taken from the base tree), and the sorted
# unique set of those labels.
#
# Argument: $stage_index - the stage whose misclassified samples are shown.
# Dies when the stage-0 training-sample list is still empty.
sub show_class_labels_for_misclassified_samples_in_stage {
    my $self = shift;
    my $stage_index = shift;
    # The stage-0 training-sample list is used to detect whether the cascade
    # has been started.  The stage-0 misclassified list cannot serve that
    # purpose: it is legitimately empty when the base classifier gets every
    # training sample right.
    die "\nYou must first call 'construct_cascade_of_trees()' before invoking 'show_class_labels_for_misclassified_samples_in_stage()'" unless @{$self->{_training_samples}->{0} || []} > 0;
    my @misclassified = @{$self->{_misclassified_samples}->{$stage_index} || []};
    my @classes_for_misclassified_samples = ();
    my @just_class_labels = ();

    for my $sample (@misclassified) {    
        my $true_class_label_for_sample = $self->{_all_trees}->{0}->{_samples_class_label_hash}->{$sample};            
        push @classes_for_misclassified_samples, sprintf("%s => %s", $sample, $true_class_label_for_sample);
        push @just_class_labels, $true_class_label_for_sample; 
    }
    print "\nSamples misclassified by the classifier for Stage $stage_index: @misclassified\n";
    my $how_many = @misclassified;
    print "\nNumber of misclassified samples: $how_many\n";
    print "\nShowing class labels for samples misclassified by stage $stage_index: ";
    print "\nClass labels for samples: @classes_for_misclassified_samples\n";
    my @class_names_unique =  sort keys %{{map {$_ => 1} @just_class_labels}};
    print "\nClass names (unique) for misclassified samples: @class_names_unique\n";
    print "\nFinished displaying class labels for samples misclassified by stage $stage_index\n\n";
}

# Prints a banner for each stage 0 .. how_many_stages-1 and calls
# display_decision_tree() on that stage's root node, with a string of five
# blanks as argument.
sub display_decision_trees_for_different_stages {
    my ($self) = @_;
    print "\nDisplaying the decisions trees for all stages:\n\n";
    my $last_stage = $self->{_how_many_stages} - 1;
    for my $stage (0 .. $last_stage) {
        print "\n\n=============================   For stage $stage   ==================================\n\n";
        my $stage_root = $self->{_root_nodes}{$stage};
        $stage_root->display_decision_tree("     ");
    }
    print "\n==================================================================================\n\n\n";
}

# Classifies $test_sample with the tree of every stage 0 .. how_many_stages-1,
# stores the per-stage results (in stage order) in $self->{_classifications},
# and returns that array reference.
sub classify_with_boosting {
    my ($self, $test_sample) = @_;
    my @stage_results;
    for my $stage (0 .. $self->{_how_many_stages} - 1) {
        my $stage_tree = $self->{_all_trees}{$stage};
        my $stage_root = $self->{_root_nodes}{$stage};
        push @stage_results, $stage_tree->classify($stage_root, $test_sample);
    }
    $self->{_classifications} = \@stage_results;
}

# Prints, for every stage 0 .. how_many_stages-1, the stage's trust factor, its
# class probabilities in decreasing order, the solution path through its tree,
# and the value of how_many_nodes() for its root node.
#
# Dies with an explanatory message when $self->{_classifications} is still
# undefined or empty, i.e. before classify_with_boosting() has stored results.
sub display_classification_results_for_each_stage {
    my $self = shift;        
    # Fall back to an empty list so that an undefined _classifications reaches
    # the explanatory die below instead of failing on the dereference.
    my @classifications = @{$self->{_classifications} || []};
    die "You must first call 'classify_with_boosting()' before invoking 'display_classification_results_for_each_stage()'\n"
        unless @classifications; 
    my @solution_paths = map $_->{'solution_path'}, @classifications;
    foreach my $i (0..$self->{_how_many_stages}-1) {
        print "\n\n=============================   For stage $i   ==================================\n\n";
        # Work on a copy so that the stored classification keeps its solution path.
        my %classification = %{$classifications[$i]};
        delete $classification{'solution_path'};
        my @which_classes = sort {$classification{$b} <=> $classification{$a}} keys %classification;
        print "\nClassification:\n\n";
        print "Classifier trust: $self->{_trust_factors}->{$i}\n\n";
        print "     class                         probability\n";
        print "     ----------                    -----------\n";
        foreach my $which_class (@which_classes) {
            my $classstring = sprintf("%-30s", $which_class);
            my $valuestring = sprintf("%-30s", $classification{$which_class});
            print "     $classstring $valuestring\n";
        }

        print "\nSolution path in the decision tree: @{$solution_paths[$i]}\n";
        printf("\nNumber of nodes created: %d\n", $self->{_root_nodes}->{$i}->how_many_nodes());
    }
    print "\n=================================================================================\n\n";
}

# Combines the per-stage classifications into one decision.  Every stage votes
# for its most probable class, the vote being weighted by that stage's trust
# factor.
#
# Returns the class with the largest accumulated weight.  As a side effect,
# $self->{_trust_weighted_decision_classes} is set to a list of
# [class_name, accumulated_weight] pairs in decreasing order of weight.
#
# Dies with an explanatory message when $self->{_classifications} is still
# undefined or empty, i.e. before classify_with_boosting() has stored results.
sub trust_weighted_majority_vote_classifier {
    my $self = shift;     
    # Fall back to an empty list so that an undefined _classifications reaches
    # the explanatory die below instead of failing on the dereference.
    my @classifications = @{$self->{_classifications} || []};
    die "You must first call 'classify_with_boosting()' before invoking 'trust_weighted_majority_vote_classifier()'\n"
        unless @classifications; 
    my %decision_classes = map {$_ => 0} @{$self->{_all_trees}->{0}->{_class_names}};
    foreach my $i (0..$self->{_how_many_stages}-1) {
        # Work on a copy so that the stored classification keeps its solution path.
        my %classification = %{$classifications[$i]};                            
        delete $classification{'solution_path'};
        my @sorted_classes = sort {$classification{$b} <=> $classification{$a}} keys %classification;
        $decision_classes{$sorted_classes[0]} += $self->{_trust_factors}->{$i};        
    }
    # One sort serves both the stored ranking and the returned winner.
    my @sorted_by_weighted_votes_decision_classes = sort {$decision_classes{$b} <=> $decision_classes{$a}} keys %decision_classes;
    $self->{_trust_weighted_decision_classes} = [map { [$_, $decision_classes{$_}] } @sorted_by_weighted_votes_decision_classes];
    return $sorted_by_weighted_votes_decision_classes[0];
}

# Prints the [class, weight] pairs held in
# $self->{_trust_weighted_decision_classes}, one "class   =>    weight" line per
# pair, in the stored order.  Dies when that field holds no value yet.
sub display_trust_weighted_decision_for_test_sample {
    my ($self) = @_;
    my $ranked_pairs = $self->{_trust_weighted_decision_classes};
    if (!$ranked_pairs) {
        die "You must first call 'trust_weighted_majority_vote_classifier() before invoking display_trust_weighted_decision_for_test_sample()'\n";
    }
    print "\nClassifier labels for test sample sorted by trust weights (The greater the trust weight, the greater the confidence we have in the classification label):\n\n";
    for my $pair (@{$ranked_pairs}) {
        my ($class_label, $trust_weight) = @{$pair}[0, 1];
        print "$class_label   =>    $trust_weight\n";
    }
}

# Classifies $test_sample with the base tree (stage 0) only and returns whatever
# that tree's classify() method returns.
sub classify_with_base_decision_tree {
    my ($self, $test_sample) = @_;
    my ($base_tree, $base_root) = ($self->{_all_trees}{0}, $self->{_root_nodes}{0});
    return $base_tree->classify($base_root, $test_sample);
}

# Returns the base tree's _class_names entry (the stored reference itself, not a
# copy).
sub get_all_class_names {
    my ($self) = @_;
    my $base_tree = $self->{_all_trees}{0};
    return $base_tree->{_class_names};
}


################################################### Utility Routines #################################################

# Counts how many elements of the list are string-equal to the first argument.
# The result is 0 (false) when the element is absent, so it can also be used as
# a membership test.
sub contained_in {
    my ($needle, @haystack) = @_;
    my $occurrences = grep { $needle eq $_ } @haystack;
    return $occurrences;
}

# Returns the two-element list (minimum, maximum) of the values in the array
# referenced by the argument, compared numerically.  For an empty array both
# elements are undef.
sub minmax {
    my ($values) = @_;
    my $smallest;
    my $largest;
    for my $candidate (@{$values}) {
        # The first value seen seeds both extremes.
        $smallest = $candidate unless defined $smallest && $candidate >= $smallest;
        $largest  = $candidate unless defined $largest  && $candidate <= $largest;
    }
    return ($smallest, $largest);
}

# Returns the part of a sample name that follows its first underscore, e.g.
# "12" for "sample_12".  Returns undef when the name contains no underscore
# followed by at least one character.
sub sample_index {
    my $arg = shift;
    # Capture through list assignment: on a failed match $index stays undef,
    # whereas reading $1 would hand back the capture of some earlier match.
    my ($index) = $arg =~ /_(.+)$/;
    return $index;
}    

# Validates constructor keyword names.
# Returns 1 when every name in the argument list is a recognized keyword
# (including when the list is empty), and 0 as soon as an unrecognized name is
# found.
sub check_for_illegal_params {
    my @params = @_;
    my %is_legal = map {$_ => 1} qw / how_many_stages
                                      training_datafile
                                      entropy_threshold
                                      max_depth_desired
                                      csv_class_column_index
                                      csv_columns_for_features
                                      symbolic_to_numeric_cardinality_threshold
                                      number_of_histogram_bins
                                      debug1
                                      debug2
                                      debug3
                                    /;
    foreach my $param (@params) {
        return 0 unless $is_legal{$param};
    }
    # Always a defined value, so callers can compare the result with == safely.
    return 1;
}

1;
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	Go to GitHub issues (only if GitHub is the preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)