##  The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl -w

use lib '../blib/lib', '../blib/arch';

##  introspection_show_training_samples_at_all_nodes_direct_influence.pl

##  The purpose of this script is to descend down the decision tree and show
##  at each node the training samples that participated directly in the
##  feature tests leading to that node.  We refer to this as the DIRECT
##  propagation of a training sample's influence to that node.

##  However, in light of the comments made elsewhere in this distribution,
##  note that a training sample can affect nodes indirectly through the
##  generalization achieved by the probabilistic modeling of the data.
##  Examples of this are nodes that are descendants of the nodes affected
##  directly by the training sample.

##  The output displayed by this script has the following general form:
##
##      Node 0: the samples are: None
##      Node 1: the samples are: ['sample_46', 'sample_58']
##      Node 2: the samples are: ['sample_1', 'sample_4', 'sample_7', ..... ]
##      Node 3: the samples are: []
##      Node 4: the samples are: []

##  Again, in keeping with the explanation in the main module file, the
##  empty list for the samples at a node indicates merely that the node is
##  a consequence of the generalization achieved by the probabilistic
##  modeling of the training data.

use strict;
use Algorithm::DecisionTree;

#   Name of the CSV file with the training data.  It is passed to the
#   constructor below through the 'training_datafile' option.
my $training_datafile = "stage3cancer.csv";

#   The constructor options are collected in an ordered list so that they
#   reach new() in exactly the order in which they are written here.
my @constructor_options = (
    training_datafile                         => $training_datafile,
    csv_class_column_index                    => 2,
    csv_columns_for_features                  => [ 3 .. 8 ],
    entropy_threshold                         => 0.01,
    max_depth_desired                         => 8,
    symbolic_to_numeric_cardinality_threshold => 10,
);

my $dt = Algorithm::DecisionTree->new(@constructor_options);

#   The following three methods are invoked on the decision-tree instance,
#   one after the other and in this order, before the tree is constructed:
for my $setup_method (qw( get_training_data
                          calculate_first_order_probabilities
                          calculate_class_priors )) {
    $dt->$setup_method();
}

#   UNCOMMENT THE NEXT STATEMENT if you would like to see the
#   training data that was read from the disk file:
#$dt->show_training_data();

#   Build the decision tree and hold on to the root node it returns:
my $root_node = $dt->construct_decision_tree_classifier();

#   The next two statements display the decision tree in your terminal
#   window.  COMMENT THEM OUT if you would rather not see the tree:
print "\n\nThe Decision Tree:\n\n";
$root_node->display_decision_tree("     ");

#   Introspection needs its own object: an instance of the DT
#   introspection class is constructed from the decision-tree instance
#   and then initialized before it is asked for any answers:
my $introspector = DTIntrospection->new($dt);
$introspector->initialize();

#   Ask the introspector to display, for all the nodes, the training
#   samples with a direct influence (see the notes at the top of this
#   script for what "direct" means here):
$introspector->display_training_samples_at_all_nodes_direct_influence_only();