The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl

# This script creates a Categorizer and runs several of its methods on
# a corpus, reporting the results.
#
# Copyright 2002 Ken Williams, under the same license as the
# AI::Categorizer distribution.


use strict;
use AI::Categorizer;
use Benchmark;
my $HAVE_YAML = eval "use YAML; 1";

my ($opt, $do_stage, $outfile) = parse_command_line(@ARGV);
@ARGV = grep !/^-\d$/, @ARGV;

my $c = eval {new AI::Categorizer(%$opt)};
if ($@ and $@ =~ /^The following parameter/) {
  die "$@\nPlease see the AI::Categorizer documentation for a description of parameters accepted.\n";
}
die $@ if $@;

%$do_stage = map {$_, 1} 1..5 unless keys %$do_stage;

my $out_fh;
if ($outfile) {
  open $out_fh, ">> $outfile" or die "Can't create $outfile: $!";
  select((select($out_fh), $|=1)[0]);
  if (keys(%$do_stage) > 1) {
    print $out_fh "~~~~~~~~~~~~~~~~", scalar(localtime), "~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
    if ($HAVE_YAML) {
      print {$out_fh} YAML::Dump($c->dump_parameters);
    } else {
      warn "More detailed parameter dumping is available if you install the YAML module from CPAN.\n";
    }
  }
}
  

run_section('scan_features',     1, $do_stage);
run_section('read_training_set', 2, $do_stage);
run_section('train',             3, $do_stage);
run_section('evaluate_test_set', 4, $do_stage);
if ($do_stage->{5}) {
  my $result = $c->stats_table;
  print $result if $c->verbose;
  print $out_fh $result if $out_fh;
}

sub run_section {
  my ($section, $stage, $do_stage) = @_;
  return unless $do_stage->{$stage};
  if (keys %$do_stage > 1) {
    print " % $0 @ARGV -$stage\n" if $c->verbose;
    die "$0 is not executable, please change its execution permissions"
      unless -x $0;
    system($0, @ARGV, "-$stage") == 0
      or die "$0 returned nonzero status, \$?=$?";
    return;
  }
  my $start = new Benchmark;
  $c->$section();
  my $end = new Benchmark;
  my $summary = timestr(timediff($end, $start));
  my ($rss, $vsz) = memory_usage();
  print "$summary (memory: rss=$rss, vsz=$vsz)\n" if $c->verbose;
  print $out_fh "Stage $stage: $summary (memory: rss=$rss, vsz=$vsz)\n" if $out_fh;
}

sub parse_command_line {
  my (%opt, %do_stage);

  while (@_) {
    if ($_[0] =~ /^-(\d+)$/) {
      shift;
      $do_stage{$1} = 1;
      
    } elsif ( $_[0] eq '--config_file' ) {
      die "--config_file requires the YAML module from CPAN to be installed.\n" unless $HAVE_YAML;
      shift;
      my $file = shift;
      my $href = YAML::LoadFile($file);
      @opt{keys %$href} = values %$href;
      
    } elsif ( $_[0] =~ /^--/ ) {
      my ($k, $v) = (shift, shift);
      $k =~ s/^--//;
      $opt{$k} = $v;
      
    } else {
      die usage();
    }
  }

  while (my ($k, $v) = each %opt) {
    # Allow abbreviations
    if ($k =~ /^(\w+)_class$/) {
      my $name = $1;
      $v =~ s/^::/AI::Categorizer::\u${name}::/;
      $opt{$k} = $v;
    }
  }

  my $outfile;
  unless ($outfile = delete $opt{outfile}) {
    $outfile = $opt{progress_file} ? "$opt{progress_file}-results.txt" : "results.txt";
  }

  return (\%opt, \%do_stage, $outfile);
}

sub usage {
  return <<EOF;
 Usage:

  $0 --parameter_1 <value_1> --parameter_2 <value_2>
      # You may specify a YAML config file as follows:
  $0 --config_file <path> --parameter_3 <value_3>
      # Or, to run only step 3 (of 5)
  $0 --config_file <path> -3

 --parameter_1, --parameter_2, etc. are parameters accepted by
 AI::Categorizer objects' new() methods.

EOF
}

sub memory_usage {
  my ($rss, $vsz);
  if ($^O eq 'darwin' or $^O eq 'linux') {
    ($rss, $vsz) = `ps -eo rss,vsz -p $$` =~ /(\d+)\s+(\d+)/;
  } elsif ($^O eq 'solaris') {
    ($rss, $vsz) = `ps -o rss,vsz -p $$` =~ /(\d+)\s+(\d+)/;
  } else {
    warn "Unknown system, can't get memory usage";
  }
  return ($rss, $vsz);
}