# encapsulate information about a single classification result
package Algorithm::AM::Result;
use strict;
use warnings;
our $VERSION = '3.08';
# ABSTRACT: Store results of an AM classification
use Text::Table;
use Crypt::PRNG qw(rand);

#pod =head1 SYNOPSIS
#pod
#pod   use Algorithm::AM;
#pod
#pod   my $am = Algorithm::AM->new('finnverb', -commas => 'no');
#pod   my ($result) = $am->classify;
#pod   print @{ $result->winners };
#pod   print ${ $result->statistical_summary };
#pod
#pod =head1 DESCRIPTION
#pod
#pod This package encapsulates all of the classification information
#pod generated by L<Algorithm::AM/classify>, including the assigned class,
#pod score to each class, gang effects, analogical sets,
#pod and timing information. It also provides several methods for
#pod generating printable reports with this information.
#pod
#pod Note that the words 'score' and 'point' are used here to represent
#pod whatever count is assigned by analogical modeling during
#pod classification. This can be either pointers or occurrences. For an
#pod explanation of this, see L<Algorithm::AM::algorithm>.
#pod
#pod All of the scores returned by the methods here are scalars with
#pod special PV and NV values. You should exercise caution when doing
#pod calculations with them. See L<Algorithm::AM::BigInt> for more
#pod information.
#pod
#pod =cut

## TODO: variables to consider exporting someday
## @itemcontextchain
## %itemcontextchainhead
## %context_to_class
## %contextsize
use Class::Tiny qw(
    exclude_nulls
    given_excluded
    cardinality
    test_in_train
    test_item
    count_method

    start_time
    end_time

    training_set

    scores
    high_score
    total_points
    winners
    is_tie
    result

    scores_normalized
    random_outcome
), {
    'scores_normalized' => sub {
        my ($self) = @_;
        my $total_points = $self->total_points;
        my $scores = $self->scores;
        my $normalized = {};
        for my $class (keys %$scores){
            $normalized->{$class} = $scores->{$class} / $total_points
        }
        return $normalized;
    },
    'random_outcome' => sub {
        my ($self) = @_;
        my $score_map = $self->scores_normalized;
        my @classes = sort keys %$score_map;
        my @scores = @{$score_map}{@classes};
        # this portion taken from List::Util::WeightedChoice
        # create ranges for each of the classes, and pick
        # a class by choosing a random number in the range.
        my @ranges = ();
        my $left = 0;
        for my $score(@scores){
            my $right = $left+$score;
            push @ranges, $right;
            $left = $right;
        }
        my $score_index = rand $left;
        for my $i (0 .. $#scores){
            return $classes[$i] if $score_index < $ranges[$i];
        }
    }
};
use Carp 'croak';
use Algorithm::AM::BigInt 'bigcmp';

# For printing percentages in reports
my $percentage_format = '%7.3f%%';

#pod =head1 REPORT METHODS
#pod
#pod The methods below return human-readable reports about the
#pod classification. The return value is a reference, so it must be
#pod dereferenced for printing like so:
#pod
#pod  print ${ $result->statistical_summary };
#pod
#pod =head2 C<config_info>
#pod
#pod Returns a scalar (string) ref containing information about the
#pod configuration at the time of classification. Information from the
#pod following accessors is included:
#pod
#pod     exclude_nulls
#pod     given_excluded
#pod     cardinality
#pod     test_in_train
#pod     test_item
#pod     count_method
#pod
#pod =cut
sub config_info {
    my ($self) = @_;
    my @headers = ('Option', 'Setting');
    my @rows = (
        [ "Given context", (join ' ', @{$self->test_item->features}) .
            ', ' . $self->test_item->comment],
        [ "Nulls", ($self->exclude_nulls ? 'exclude' : 'include')],
        [ "Gang",  $self->count_method],
        [ "Test item in training set", ($self->test_in_train ? 'yes' : 'no')],
        [ "Test item excluded", ($self->given_excluded ? 'yes' : 'no')],
        [ "Size of training set", $self->training_set->size ],
        [ "Number of active features", $self->cardinality ],
    );
    my @table = _make_table(\@headers, \@rows);
    my $info = join '', @table;
    return \$info;
}

# Take several variables from AM's internals (sum, pointers,
# itemcontextchainhead and itemcontextchain), calculate the
# prediction statistics, and store the information needed for
# computing analogical sets.
# Sets high_score, scores, winners, total_points and is_tie; if the
# expected class is provided, also sets result to
# tie/correct/incorrect.
sub _process_stats {
    my ($self, $sum, $pointers,
        $itemcontextchainhead, $itemcontextchain, $context_to_class,
        $gang, $active_feats, $contextsize) = @_;
    my $total_points = $pointers->{grandtotal};
    my $max = '';
    my @winners;
    my %scores;

    # Iterate over all possible classes and record the ones that have
    # a non-zero score, tracking the highest-scoring class(es)
    # (the prediction) along the way.
    for my $class_index (1 .. $self->training_set->num_classes) {
        my $class_score;
        # skip classes with no score
        next unless $class_score = $sum->[$class_index];

        my $class = $self->training_set->_class_for_index($class_index);
        $scores{$class} = $class_score;

        # check if the class has the highest score, or ties for it
        do {
            my $cmp = bigcmp($class_score, $max);
            if ($cmp > 0){
                @winners = ($class);
                $max = $class_score;
            }elsif($cmp == 0){
                push @winners, $class;
            }
        };
    }

    # set result to tie/correct/incorrect after comparing
    # expected/actual class labels. Only do this if the expected
    # class label is known.
    if(my $expected = $self->test_item->class){
        if(exists $scores{$expected} &&
                bigcmp($scores{$expected}, $max) == 0){
            if(@winners > 1){
                $self->result('tie');
            }else{
                $self->result('correct');
            }
        }else{
            $self->result('incorrect');
        }
    }
    if(@winners > 1){
        $self->is_tie(1);
    }
    $self->high_score($max);
    $self->scores(\%scores);
    $self->winners(\@winners);
    $self->total_points($total_points);
    $self->{pointers} = $pointers;
    $self->{itemcontextchainhead} = $itemcontextchainhead;
    $self->{itemcontextchain} = $itemcontextchain;
    $self->{context_to_class} = $context_to_class;
    $self->{gang} = $gang;
    $self->{active_feats} = $active_feats;
    $self->{contextsize} = $contextsize;
    return;
}

#pod =head2 C<statistical_summary>
#pod
#pod Returns a scalar reference (string) containing a statistical summary
#pod of the classification results. The summary includes each predicted
#pod class with its score and percentage score, along with the total
#pod score across all classes. Whether the predicted class
#pod is correct/incorrect/a tie of some sort is also included, if the
#pod test item had a known class.
#pod
#pod =cut
sub statistical_summary {
    my ($self) = @_;
    my %scores = %{$self->scores};
    my $total_points = $self->total_points;

    # Make a table with information about predictions for different
    # classes. Each row contains a class name, the score,
    # and the percentage predicted.
    my @rows;
    for my $class (sort keys %scores){
        push @rows, [ $class, $scores{$class},
            sprintf($percentage_format,
                100 * $scores{$class} / $total_points) ];
    }
    # add a Total row
    push @rows, [ 'Total', $total_points ];

    my @table = _make_table(['Class', 'Score', 'Percentage'],
        \@rows);
    # copy the rule from the first row into the second to last row
    # to separate the Total row
    splice(@table, $#table - 1, 0, $table[0]);

    my $info = "Statistical Summary\n";
    $info .= join '', @table;
    # the predicted class (the one with the highest score)
    # and the result (correct/incorrect/tie).
    if ( defined (my $expected = $self->test_item->class) ) {
        $info .= "Expected class: $expected\n";
        my $result = $self->result;
        if ( $result eq 'correct') {
            $info .= "Correct class predicted.\n";
        }elsif($result eq 'tie'){
            $info .= "Prediction is a tie.\n";
        }else {
            $info .= "Incorrect class predicted.\n";
        }
    }else{
        $info .= "Expected class unknown\n";
    }
    return \$info;
}

#TODO: the keys for this set don't seem to make any sense.
sub analogical_set {
    my ($self) = @_;
    if(!exists $self->{_analogical_set}){
        $self->_calculate_analogical_set;
    }
    # make a safe copy
    my %set = %{$self->{_analogical_set}};
    return \%set;
}

#pod =head2 C<analogical_set_summary>
#pod
#pod Returns a scalar reference (string) containing the analogical set,
#pod meaning all items that contributed to the predicted class, along
#pod with the amount contributed by each item (score and
#pod percentage overall). Items are ordered by appearance in the data
#pod set.
#pod
#pod =cut
sub analogical_set_summary {
    my ($self) = @_;
    my $set = $self->analogical_set;
    my $total_points = $self->total_points;

    # Make a table for the analogical set. Each row contains an
    # item with its class, comment, score, and the percentage
    # of total score contributed.
    my @rows;
    foreach my $id (sort keys %$set){
        my $entry = $set->{$id};
        my $score = $entry->{score};
        push @rows, [
            $entry->{item}->class,
            $entry->{item}->comment,
            $score,
            sprintf($percentage_format, 100 * $score / $total_points)
        ];
    }
    my @table = _make_table(
        ['Class', 'Item', 'Score', 'Percentage'], \@rows);
    my $info = "Analogical Set\nTotal Frequency = $total_points\n";
    $info .= join '', @table;
    return \$info;
}

# calculate and store analogical effects in $self->{_analogical_set}
sub _calculate_analogical_set {
    my ($self) = @_;
    my $train = $self->training_set;
    my %set;
    foreach my $context ( keys %{$self->{pointers}} ) {
        next unless
            exists $self->{itemcontextchainhead}->{$context};
        for (
            my $index = $self->{itemcontextchainhead}->{$context};
            defined $index;
            $index = $self->{itemcontextchain}->[$index]
        )
        {
            my $item = $train->get_item($index);
            $set{$item->id} = {
                item => $item,
                score => $self->{pointers}->{$context}
            };
        }
    }
    $self->{_analogical_set} = \%set;
    return;
}

sub gang_effects {
    my ($self) = @_;
    if(!$self->{_gang_effects}){
        $self->_calculate_gangs;
    }
    return $self->{_gang_effects};
}

#pod =head2 C<gang_summary>
#pod
#pod Returns a scalar reference (string) containing the gang effects on the
#pod final class prediction.
#pod
#pod A single boolean parameter can be provided to turn on list printing,
#pod meaning the items in each gang are also printed. This is false (off) by default.
#pod
#pod =cut
sub gang_summary {
    my ($self, $print_list) = @_;
    my $test_item = $self->test_item;

    my $gangs = $self->gang_effects;

    # Make a table for the gangs with these columns:
    #   Percentage
    #   Score
    #   Num
    #   Class
    #   Features
    #   item comment
    my @rows;
    # first row is a header with test item for easy reference
    push @rows, [
        'Context',
        undef,
        undef,
        undef,
        @{$test_item->features},
    ];

    # store the number of rows added for each gang
    # will help with printing later
    my @gang_rows;
    my $current_row = -1;
    # add information for each gang; sort by order of highest to
    # lowest effect
    foreach my $gang (@$gangs){
        $current_row++;
        $gang_rows[$current_row]++;
        my $features = $gang->{features};
        # add the gang supracontext, effect and score
        push @rows, [
            sprintf($percentage_format, 100 * $gang->{effect}),
            $gang->{score},
            undef,
            undef,
            # print empty ('match anything') feature slots as asterisks
            map {length($_) ? $_ : '*'} @$features
        ];
        # add each class in the gang, along with the total number
        # and effect of the gang items supporting it
        for my $class (sort keys %{ $gang->{class} }){
            $gang_rows[$current_row]++;
            push @rows, [
                sprintf($percentage_format,
                    100 * $gang->{class}->{$class}->{effect}),
                $gang->{class}->{$class}->{score},
                scalar @{ $gang->{data}->{$class} },
                $class,
                undef
            ];
            if($print_list){
                # add the list of items in the given context
                for my $item (@{ $gang->{data}->{$class} }){
                    $gang_rows[$current_row]++;
                    push @rows, [
                        undef,
                        undef,
                        undef,
                        undef,
                        @{ $item->features },
                        $item->comment,
                    ];
                }
            }
        }
    }

    # construct the table from the rows
    my @headers = (
        \'| ',
        'Percentage' => \' | ',
        'Score' => \' | ',
        'Num Items' => \' | ',
        'Class' => \' | ',
        ('' => \' ') x @{$test_item->features}
    );
    pop @headers;
    if($print_list){
        push @headers, \' | ', 'Item Comment';
    }
    push @headers, \' |';
    my @rule = qw(- +);
    my $table = Text::Table->new(@headers);
    $table->load(@rows);
    # main header
    $current_row = 0;
    my $return = $table->rule(@rule) .
        $table->title .
        $table->body($current_row) .
        $table->rule(@rule);
    $current_row++;
    # add info with a header for each gang
    for my $num (@gang_rows){
        # a row of '*' separates each gang
        $return .= $table->rule('*','*') .
            $table->body($current_row) .
            $table->rule(@rule);
        $current_row++;
        for(1 .. $num - 1){
            $return .= $table->body($current_row);
            $current_row++;
        }
    }
    $return .= $table->rule(@rule);
    return \$return;
}

sub _calculate_gangs {
    my ($self) = @_;
    my $train = $self->training_set;
    my $total_points = $self->total_points;
    my $raw_gang = $self->{gang};
    my @gangs;

    foreach my $context (keys %{$raw_gang})
    {
        my $gang = {};
        my @features = $self->_unpack_supracontext($context);
        # for now, store gangs by the supracontext printout
        my $key = join ' ', map {length($_) ? $_ : '-'} @features;
        $gang->{score} = $raw_gang->{$context};
        $gang->{effect} = $raw_gang->{$context} / $total_points;
        $gang->{features} = \@features;

        my $p = $self->{pointers}->{$context};
        # if the supracontext is homogeneous
        if ( my $class_index = $self->{context_to_class}->{$context} ) {
            # store the unanimous class prediction under the
            # 'homogenous' key to indicate this
            my $class = $train->_class_for_index($class_index);
            $gang->{homogenous} = $class;
            my @data;
            for (
                my $index = $self->{itemcontextchainhead}->{$context};
                defined $index;
                $index = $self->{itemcontextchain}->[$index]
              )
            {
                push @data, $train->get_item($index);
            }
            $gang->{data}->{$class} = \@data;
            $gang->{size} = scalar @data;
            $gang->{class}->{$class}->{score} = $p;
            $gang->{class}->{$class}->{effect} =
                $gang->{effect};
        }
        # for heterogeneous supracontexts we have to store data for
        # each class
        else {
            $gang->{homogenous} = 0;
            # first loop through the data and sort by class, also
            # finding the total gang size
            my $size = 0;
            my %data;
            for (
                my $index = $self->{itemcontextchainhead}->{$context};
                defined $index;
                $index = $self->{itemcontextchain}->[$index]
              )
            {
                my $item = $train->get_item($index);
                push @{ $data{$item->class} }, $item;
                $size++;
            }
            $gang->{data} = \%data;
            $gang->{size} = $size;

            # then store aggregate statistics for each class
            for my $class (keys %data){
                $gang->{class}->{$class}->{score} = $p;
                $gang->{class}->{$class}->{effect} =
                    # score*num_data/total
                    @{ $data{$class} } * $p / $total_points;
            }
        }
        push @gangs, $gang;
    }

    # sort by score and then alphabetically by class labels
    @gangs = sort{
        bigcmp($b->{score}, $a->{score}) ||
        (join '', sort keys %{ $b->{class} })
        cmp
        (join '', sort keys %{ $a->{class} })} @gangs;
    $self->{_gang_effects} = \@gangs;
    return;
}

# Unpack and return the supracontext features.
# Blank entries mean the variable may be anything, e.g.
# ('a' 'b' '') means a supracontext containing items
# which have ('a' 'b' whatever) as variable values.
sub _unpack_supracontext {
    my ($self, $context) = @_;
    my @context_list = unpack "S!4", $context;
    my @alist = @{$self->{active_feats}};
    my (@features) = @{ $self->test_item->features };
    my $exclude_nulls = $self->exclude_nulls;
    my $j = 1;
    foreach my $a (reverse @alist) {
        my $partial_context = pop @context_list;
        for ( ; $a ; --$a ) {
            if($exclude_nulls){
                ++$j while !defined $features[ -$j ];
            }
            $features[ -$j ] = '' if $partial_context & 1;
            $partial_context >>= 1;
            ++$j;
        }
    }
    return @features;
}

# mostly by Ovid:
# http://use.perl.org/use.perl.org/_Ovid/journal/36762.html
# Return table rows with a nice header and column separators
sub _make_table {
    my ( $headers, $rows ) = @_;

    my @rule      = qw(- +);
    my @headers   = \'| ';
    push @headers => map { $_ => \' | ' } @$headers;
    pop  @headers;
    push @headers => \' |';

    unless ('ARRAY' eq ref $rows
        && 'ARRAY' eq ref $rows->[0]
        && @$headers == @{ $rows->[0] }) {
        croak(
            "make_table() rows must be an AoA with rows being same size as headers"
        );
    }
    my $table = Text::Table->new(@headers);
    $table->rule(@rule);
    $table->body_rule(@rule);
    $table->load(@$rows);

    return $table->rule(@rule),
        $table->title,
        $table->rule(@rule),
        map({ $table->body($_) } 0 .. @$rows),
        $table->rule(@rule);
}

1;

__END__

=pod

=encoding UTF-8

=head1 NAME

Algorithm::AM::Result - Store results of an AM classification

=head1 VERSION

version 3.08

=head1 SYNOPSIS

  use Algorithm::AM;

  my $am = Algorithm::AM->new('finnverb', -commas => 'no');
  my ($result) = $am->classify;
  print @{ $result->winners };
  print ${ $result->statistical_summary };

=head1 DESCRIPTION

This package encapsulates all of the classification information
generated by L<Algorithm::AM/classify>, including the assigned class,
score to each class, gang effects, analogical sets,
and timing information. It also provides several methods for
generating printable reports with this information.

Note that the words 'score' and 'point' are used here to represent
whatever count is assigned by analogical modeling during
classification. This can be either pointers or occurrences. For an
explanation of this, see L<Algorithm::AM::algorithm>.

All of the scores returned by the methods here are scalars with
special PV and NV values. You should exercise caution when doing
calculations with them. See L<Algorithm::AM::BigInt> for more
information.
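
For example, to compare two raw scores safely, one can use C<bigcmp>
from L<Algorithm::AM::BigInt> rather than a plain numeric comparison.
A minimal sketch, assuming C<$result> holds a classification result:

 use Algorithm::AM::BigInt 'bigcmp';

 my $scores = $result->scores;
 # 'e' and 'r' are example class labels
 if ( bigcmp($scores->{e}, $scores->{r}) > 0 ) {
     print "'e' scored higher than 'r'\n";
 }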

=head1 REPORT METHODS

The methods below return human-readable reports about the
classification. The return value is a reference, so it must be
dereferenced for printing like so:

 print ${ $result->statistical_summary };

=head2 C<config_info>

Returns a scalar (string) ref containing information about the
configuration at the time of classification. Information from the
following accessors is included:

    exclude_nulls
    given_excluded
    cardinality
    test_in_train
    test_item
    count_method

=head2 C<statistical_summary>

Returns a scalar reference (string) containing a statistical summary
of the classification results. The summary includes each predicted
class with its score and percentage score, along with the total
score across all classes. Whether the predicted class
is correct/incorrect/a tie of some sort is also included, if the
test item had a known class.

=head2 C<analogical_set_summary>

Returns a scalar reference (string) containing the analogical set,
meaning all items that contributed to the predicted class, along
with the amount contributed by each item (score and
percentage overall). Items are ordered by appearance in the data
set.

=head2 C<gang_summary>

Returns a scalar reference (string) containing the gang effects on the
final class prediction.

A single boolean parameter can be provided to turn on list printing,
meaning the items in each gang are also printed. This is false (off) by default.
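
For example, a minimal sketch printing the gang report with the
individual gang items included:

 print ${ $result->gang_summary(1) };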

=head1 CONFIGURATION INFORMATION

The following methods provide information about the configuration
of AM at the time of classification.

=head2 C<exclude_nulls>

Set to the value given by the same method of
L<Algorithm::AM|Algorithm::AM/exclude_nulls> at the time of
classification.

=head2 C<given_excluded>

True if the test item itself was excluded from the training data at
the time of classification (reported as "Test item excluded" by
L</config_info>).

=head2 C<cardinality>

The number of features used during classification. If there
were null feature values and L</exclude_nulls> was set to true,
then this number will be lower than the cardinality of the utilized
data sets.

=head2 C<test_in_train>

True if the test item was present among the training items.

=head2 C<test_item>

Returns the L<item|Algorithm::AM::DataSet::Item> which was classified.

=head2 C<count_method>

Returns either "linear" or "squared", indicating the setting used
for computing analogical sets. See L<Algorithm::AM/linear>.

=head2 C<training_set>

Returns the L<data set|Algorithm::AM::DataSet> which was the
source of classification data.

=head1 RESULT DETAILS

The following methods provide information about the results of
the classification.

=head2 C<result>

If the class of the test item was known before classification, this
returns "tie", "correct", or "incorrect", depending on the label
assigned by the classification. Otherwise this returns C<undef>.
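
A minimal sketch of checking the outcome:

 if ( defined( my $outcome = $result->result ) ) {
     # $outcome is 'tie', 'correct' or 'incorrect'
     print "prediction was $outcome\n";
 } else {
     print "test item class was unknown\n";
 }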

=head2 C<gang_effects>

Returns a reference to an array of hashes describing the gang effects,
sorted from largest to smallest effect. Gang effects are similar to
analogical sets, but the total effects of entire subcontexts and
supracontexts are also calculated and printed.

TODO: details, details! Maybe make a gang class to hold this structure.
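
Until the structure is documented in detail, here is a minimal sketch
of iterating the gangs (the C<features>, C<score> and C<effect> keys
are those stored by the internal gang calculation):

 for my $gang ( @{ $result->gang_effects } ) {
     # blank feature slots (matching anything) are printed as '-'
     printf "%s: score %s (%.3f%%)\n",
         join(' ', map { length($_) ? $_ : '-' } @{ $gang->{features} }),
         $gang->{score},
         100 * $gang->{effect};
 }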

=head2 C<analogical_set>

The analogical set is the set of items from the training set that
had some effect on the item classification. The analogical effect of
an item in the analogical set is the score it contributed towards
a classification matching its own class label.

This method returns the items in the analogical set along with their
analogical effects, in the following structure:

 { 'item_id' => {'item' => item, 'score' => score} }

C<item> above is the actual item object. The item_id is used so that
the analogical effect of a particular item can be found quickly:

 my $set = $result->analogical_set;
 print "the item's analogical effect was "
     . $set->{$item->id}->{score};

=head2 C<random_outcome>

This returns one of the class labels predicted for the test item.
The choice is done probabilistically, with the probability of each
value given by its L<normalized score|/scores_normalized>.

For a given result object, the return value of this method never
changes; the value is only chosen once.
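
A minimal usage sketch:

 my $predicted = $result->random_outcome;
 # repeated calls on the same result object return the same label
 print "probabilistically chosen class: $predicted\n";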

=head2 C<high_score>

Returns the highest score assigned to any of the class labels.

=head2 C<scores>

Returns a hash mapping all predicted classes to their scores.

=head2 C<scores_normalized>

Returns a hash mapping all predicted classes to their score,
divided by the total score for all classes. For example,
if the L</scores> method returns the following:

 {'e' => 4, 'r' => 9}

then this method would return the following (values below are
rounded):

 {'e' => 0.3076923, 'r' => 0.6923077}
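
The same values can be computed by hand; a sketch equivalent to what
this method does internally:

 my $scores = $result->scores;
 my $total  = $result->total_points;
 my %normalized;
 for my $class ( keys %$scores ) {
     $normalized{$class} = $scores->{$class} / $total;
 }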

=head2 C<winners>

Returns an array ref containing the classes which had the highest
score. There is more than one only if there is a tie for the highest
score.

=head2 C<is_tie>

Returns true if more than one class was assigned the high score.
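
For example, a minimal sketch:

 if ( $result->is_tie ) {
     print "tie between: @{ $result->winners }\n";
 } else {
     print "winner: @{ $result->winners }\n";
 }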

=head2 C<total_points>

The grand total of points assigned during classification; percentage
scores in the reports are calculated relative to this value.

=head2 C<start_time>

Returns the start time of the classification.

=head2 C<end_time>

Returns the end time of the classification.

=head1 AUTHOR

Theron Sanford <shixilun@yahoo.com>, Nathan Glenn <garfieldnate@gmail.com>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2013 by Royal Skousen.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut