The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl

##  get_indexes_associated_with_fields.pl

##  Large database files may have hundreds of fields and it is not always easy to
##  figure out what numerical index is associated with a given field.

##  At the same time, the constructor of the DecisionTree module requires that the
##  field that holds the class label and fields that contain the feature values be
##  specified by their numerical zero-based indexes.

##  If you have a very large database and you are faced with the problem described
##  above, you can run this script to see the zero-based numerical index values
##  associated with the different columns of your CSV file.

##  CALL SYNTAX:    get_indexes_associated_with_fields.pl    my_database_file.csv

use strict;
use warnings;

##  Main script body: read the header (first) line of the CSV file named on the
##  command line, normalize it with cleanup_csv(), and print two listings ---
##  index=>field_name ordered by index, and field_name=>index ordered by name.

die "Call syntax:   get_indexes_associated_with_fields.pl  filename.csv"
         unless @ARGV == 1;

my $training_datafile = shift;

##  Check the extension BEFORE opening anything, so that a name that is not a CSV
##  file is rejected without ever touching the filesystem.
die("Aborted. Only CSV files allowed") unless $training_datafile =~ /\.csv$/;

##  Three-argument open with a lexical handle: the file name comes straight from
##  the command line, and with the two-argument form a name such as ">x.csv" or
##  "some_command |" would be interpreted as a mode/pipe instead of a plain file.
open my $file_in, '<', $training_datafile
    or die "Unable to open $training_datafile: $!";
my $firstline = <$file_in>;
close $file_in;

##  An empty file yields undef here; stop with a clear message rather than letting
##  undef flow into the substitutions below.
die "Unable to read a header line from $training_datafile: the file is empty"
    unless defined $firstline;

$firstline =~ s/\r?\n?$//;               # strip a trailing LF or CRLF
my $record = cleanup_csv($firstline);
my @all_fields = split /,/, $record;

##  Field names are used as hash keys for the inverted index below, so every name
##  must be unique.
my %duplicate_detector;
$duplicate_detector{$_}++ for @all_fields;
for my $field_name (keys %duplicate_detector) {
    die "\n\nYour training file is NOT usable --- it contains duplicate field names"
        if $duplicate_detector{$field_name} > 1;
}

my $num_of_fields = scalar @all_fields;
print "\nNumber of fields: $num_of_fields\n";

##  index => field name, and field name => index
my $all_fields_with_indexes = {map {$_ => $all_fields[$_]} 0 .. $num_of_fields-1};
my $all_fields_with_indexes_inverted = {map {$all_fields[$_] => $_} 0 .. $num_of_fields-1};

print "\nAll field names along with their positional indexes:\n";
foreach my $kee (sort {$a <=> $b} keys %{$all_fields_with_indexes}) {
    print "$kee=>$all_fields_with_indexes->{$kee}  ";
}
print "\n\nInverted index for field names:\n";
foreach my $kee (sort {$a cmp $b} keys %{$all_fields_with_indexes_inverted}) {
    print "$kee=>$all_fields_with_indexes_inverted->{$kee}  ";
}

##  cleanup_csv( $line )
##
##  Normalizes one comma-separated record so that a plain split on ',' yields one
##  clean token per column.  Returns the normalized line.
##
##    1. The characters  / : ? ( ) [ ] { } '  are each replaced by a space.
##    2. From the first comma onward, every double-quoted item is unquoted, its
##       embedded commas are dropped, and its words are joined with '_'.
##    3. Every field that follows a comma has its runs of whitespace turned into
##       '_', with leading/trailing underscores trimmed.  The text before the
##       first comma is left as it is.
##    4. Every empty (or whitespace-only) field after a comma becomes 'NA'.
##
##  Each replacement is made at the position where the field was matched.
##  Looking the text up again with index() would rewrite the FIRST occurrence of
##  that text anywhere in the line, which may be part of a different field.
sub cleanup_csv {
    my $line = shift;

    # A replacement list shorter than the search list repeats its last character,
    # so every listed punctuation character maps to a single space.
    $line =~ tr/\/:?()[]{}'/ /;

    # Quoted items are only handled from the first comma onward.  A line without
    # any comma has nothing to process in this step.
    my $first_comma = index($line, ',');
    if ($first_comma >= 0) {
        my $head = substr($line, 0, $first_comma);
        my $tail = substr($line, $first_comma);
        $tail =~ s/"([^"]+)"/_cleanup_csv_quoted_item("$1")/ge;
        $line = $head . $tail;
    }

    $line =~ s/,(\s*[^,]+)(?=,|$)/',' . _cleanup_csv_spaced_item("$1")/ge;

    $line =~ s/,\s*(?=,|$)/,NA/g;
    return $line;
}

##  Strips surrounding whitespace and embedded commas from the inside of a quoted
##  item, then joins its whitespace-separated words with '_'.
sub _cleanup_csv_quoted_item {
    my ($item) = @_;
    $item =~ s/^\s+|,|\s+$//g;
    return join '_', split /\s+/, $item;
}

##  Turns each run of whitespace in a field into '_' and trims any underscore
##  left at either end.
sub _cleanup_csv_spaced_item {
    my ($item) = @_;
    $item =~ s/\s+/_/g;
    $item =~ s/^\s*_|_\s*$//g;
    return $item;
}