# The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl

##  randomized_trees_for_classifying_one_test_sample_2.pl

##  This script demonstrates how you can use the RandomizedTreesForBigData
##  class for data classification in the big data context.  Assuming you
##  have access to a very large training database, you can draw multiple
##  random datasets from the database and use each for constructing a
##  different decision tree.  Subsequently, the final classification for a
##  new data sample can be based on majority voting by all the decision
##  trees thus constructed.  In order to use this functionality, you need
##  to set the following two constructor parameters of this class:
##
##           how_many_training_samples_per_tree
##
##           how_many_trees


use strict;
use warnings;
use Algorithm::RandomizedTreesForBigData;

##  CSV file that serves as the training database for every tree.
my $csv_path = "stage3cancer.csv";

##  Constructor arguments, held in an ordered list so that they are handed
##  to new() in exactly the sequence written here.  The last-but-one pair of
##  entries (how_many_trees, how_many_training_samples_per_tree) are the two
##  parameters that must be set to use the multiple-random-datasets mode.
my @forest_options = (
    training_datafile                         => $csv_path,
    csv_class_column_index                    => 2,
    csv_columns_for_features                  => [3,4,5,6,7,8],
    entropy_threshold                         => 0.01,
    max_depth_desired                         => 8,
    symbolic_to_numeric_cardinality_threshold => 10,
    how_many_trees                            => 3,
    how_many_training_samples_per_tree        => 50,
    csv_cleanup_needed                        => 1,
);

my $forest = Algorithm::RandomizedTreesForBigData->new(@forest_options);

##  Step 1:  pull the training data for each of the trees.
print "\nReading the training data ...\n";
$forest->get_training_data_for_N_trees();

##   COMMENT OUT the following statement if you do NOT want to see the
##   training data used for each tree:
$forest->show_training_data_for_all_trees();


##  Step 2:  probability calculations that precede tree construction.
print "\nCalculating first order probabilities...\n";
$forest->calculate_first_order_probabilities();

print "\nCalculating class priors...\n";
$forest->calculate_class_priors();

##  Step 3:  build the trees.
print "\nConstructing all decision trees ....\n";
$forest->construct_all_decision_trees();

##   COMMENT OUT the following statement if you do NOT want to see all
##   decision trees individually:
$forest->display_all_decision_trees();

##  Step 4:  the sample to classify, written as 'feature = value' strings.
print "\nReading the test sample....\n";
my @sample_features = (
    'g2 = 4.2',
    'grade = 2.3',
    'gleason = 4',
    'eet = 1.7',
    'age = 55.0',
    'ploidy = diploid',
);

##  Step 5:  run the sample through every tree.  The classifier is given a
##  reference to the list of feature assignments.
print "\nClassify the test sample with each decision tree....\n";
$forest->classify_with_all_trees( \@sample_features );

##   COMMENT OUT the following statement if you do NOT want to see the
##   classification results produced by each tree separately:
$forest->display_classification_results_for_all_trees();

##  Step 6:  combine the individual results by majority vote and report.
print "\n\nWill now calculate the majority decision from all trees:\n";
my $decision = $forest->get_majority_vote_classification();
print "\nMajority vote decision: $decision\n";