The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl -w

use lib '../blib/lib', '../blib/arch';

##  introspection_show_training_samples_to_nodes_influence_propagation.pl

##  With this script, you can display how the influence of each training
##  data sample propagates through the decision tree.  The influence
##  propagation is displayed in the following manner:
##
##    sample_1:
##       nodes affected directly: [2, 5, 19, 23]
##       nodes affected through probabilistic generalization:
##            2=> [3, 4, 25]
##                25=> [26]
##            5=> [6]
##                6=> [7, 13]
##                    7=> [8, 11]
##                        8=> [9, 10]
##                        11=> [12]
##                    13=> [14, 18]
##                        14=> [15, 16]
##                            16=> [17]
##            19=> [20]
##                20=> [21, 22]
##            23=> [24]
##    
##    sample_4:
##       nodes affected directly: [2, 5, 6, 7, 11]
##       nodes affected through probabilistic generalization:
##            2=> [3, 4, 25]
##                25=> [26]
##            5=> [19]
##                19=> [20, 23]
##                    20=> [21, 22]
##                    23=> [24]
##            6=> [13]
##                13=> [14, 18]
##                    14=> [15, 16]
##                        16=> [17]
##            7=> [8]
##                8=> [9, 10]
##            11=> [12]
##    
##    ...
##    ...
##
##
##  In the display shown above, the influence of the training sample
##  labeled `sample_1' is felt directly at the nodes labeled 2, 5, 19, and
##  23.  What that means is that the feature tests that result in these
##  nodes coming into existence directly involve the data provided by
##  `sample_1'.  The same training sample also contributes to the formation
##  of the nodes that are the child nodes of those nodes that are affected
##  directly by `sample_1'.  That is, since the nodes 3, 4, and 25 are the
##  child nodes of node 2, they are also indirectly affected by the data
##  provided by `sample_1'.

use strict;
use Algorithm::DecisionTree;

#   CSV file that supplies the training samples for the decision tree.
my $training_datafile = "stage3cancer.csv";

#   Keyword arguments for the decision-tree constructor.  They are held in
#   an ordered list so that they are handed to new() in the order written
#   here.
my @constructor_args = (
    training_datafile                         => $training_datafile,
    csv_class_column_index                    => 2,
    csv_columns_for_features                  => [ 3 .. 8 ],
    entropy_threshold                         => 0.01,
    max_depth_desired                         => 8,
    symbolic_to_numeric_cardinality_threshold => 10,
);
my $dt = Algorithm::DecisionTree->new(@constructor_args);

#   Run the preparatory steps on the tree object before the tree itself is
#   constructed.  The methods are invoked by name, one after the other, in
#   exactly the order listed.
for my $setup_step (qw(
    get_training_data
    calculate_first_order_probabilities
    calculate_class_priors
)) {
    $dt->$setup_step();
}

#   UNCOMMENT THE NEXT STATEMENT if you would like to see the
#   training data that was read from the disk file:
#$dt->show_training_data();

#   Build the tree; the call returns the root node of the constructed tree.
my $root_node = $dt->construct_decision_tree_classifier();

#   The next two statements display the decision tree in your terminal
#   window.  COMMENT THEM OUT if you do not want to see the tree:
print "\n\nThe Decision Tree:\n\n";
$root_node->display_decision_tree("     ");

#   An instance of the DT introspection class must be constructed from the
#   tree object and then initialized before it can answer any
#   introspection queries:
my $introspector = DTIntrospection->new($dt);
$introspector->initialize();

#   Show, for each training sample, the nodes it affects directly and the
#   nodes it affects through its descendants (see the header comment for a
#   sample of the output format).
$introspector->display_training_samples_to_nodes_influence_propagation();