The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.

package Stream::Aggregate::Stats;

use strict;
use warnings;
# use Scalar::Util;
require List::Util;
require Exporter;

our @ISA = qw(Exporter);
our @EXPORT = qw(percentile median mean largest smallest dominant dominantcount standard_deviation numeric_only);

our $ps;

sub numeric_only
{
	my ($field) = @_;
	die unless $ps;
	return $ps->{numeric}{$field} if $ps->{numeric}{$field};

	unless ($ps->{keep}->{$field}) {
		$ps->{numeric}{$field} = [];
	}

	no warnings;
	$ps->{numeric}{$field} = [ grep({defined($_) and ($_ <=> 0 or $_+0 eq $_) } @{$ps->{keep}->{$field}}) ];
}

sub percentile
{
	my ($field, $cutoff) = @_;
	die unless $cutoff >= 0 && $cutoff <= 100;

	return undef unless numeric_only($field);

	my @sorted = sort { $a <=> $b || $a cmp $b } @{$ps->{numeric}->{$field}};

	return $sorted[0] unless @sorted > 1;

	# count the fences, not the fence posts
	my $i = (@sorted -1) * $cutoff / 100;

	my $rem = $i - int($i);

	# my $c = @sorted; print "# i = $i, rem = $rem, sorted$c = @sorted\n";

	return $sorted[$i] unless $rem;

	return $sorted[$i] unless exists $sorted[$i+1];

	return $sorted[$i] unless $sorted[$i] > 0;
	return $sorted[$i] unless $sorted[$i+1] > 0;

	return $sorted[$i] * (1-$rem) + $sorted[$i+1] * $rem;
}

sub standard_deviation
{
	my ($field) = @_;
	return undef unless numeric_only($field);
	my $count = scalar(@{$ps->{numeric}->{$field}});
	return 0 unless $count;
	my $mean = List::Util::sum(@{$ps->{numeric}->{$field}}) / $count;
	my $x;
	for my $item (@{$ps->{numeric}->{$field}}) {
		my $diff = $item - $mean;
		$x += $diff * $diff;
	}
	return sqrt($x / $count);
}

sub median
{
	my ($field) = @_;
	return undef unless numeric_only($field);
	return percentile($field, 50);
}

sub mean
{
	my ($field) = @_;
	return undef unless numeric_only($field);
	my $data = $ps->{numeric}{$field};
	return undef unless $data && @$data;


	my $count = @$data;
	my $sum = List::Util::sum(@$data);
	my $side = $ps->{sidestats}{$field} || {};

	if ($side->{count}) {
		return ($sum + $side->{sum}) / ($count + $side->{count});
	} else {
		return $sum / $count;
	}
}

sub largest
{
	my ($field) = @_;
	return undef unless numeric_only($field);
	return List::Util::max(@{$ps->{numeric}->{$field}}, grep(defined $_, $ps->{sidestats}{$field}{max}));
}

sub smallest
{
	my ($field) = @_;
	return undef unless numeric_only($field);
	return List::Util::min(@{$ps->{numeric}->{$field}},grep(defined $_, $ps->{sidestats}{$field}{min}));
}

# stats "mode"
sub dominant
{
	my ($field) = @_;
	die unless $ps;
	return undef unless $ps->{keep}->{$field};
	my %counts;
	for my $d (@{$ps->{keep}->{$field}}) {
		next unless defined $d;
		$counts{$d}++;
	}
	my $maxcount = 0;
	my $max;
	for my $c (keys %counts) {
		next unless $counts{$c} > $maxcount;
		$maxcount = $counts{$c};
		$max = $c;
	}
	$ps->{sidestats}{$field}{dominantcount} = $maxcount;
	return ($max, $maxcount) if wantarray;
	return $max;
}

sub dominantcount
{
	my ($field) = @_;
	die unless $ps;
	my $data = $ps->{keep}{$field};
	return undef unless $data && @$data;
	my ($max, $maxcount);
	if (defined $ps->{sidestats}{$field}{dominantcount}) {
		$maxcount = $ps->{sidestats}{$field}{dominantcount}
	} else {
		($max, $maxcount) = dominant($field);
	}
	my $side = $ps->{sidestats}{$field};
	my $multiplier = (@$data + ($side->{count} || 0)) / @$data;  # for discarded data
	return $maxcount * $multiplier;
}

1;

__END__

=head1 NAME

Stream::Aggregate::Stats - some standard statistics functions

=head1 SYNOPSIS

 use Stream::Aggregate::Stats;

 $data = [ 1, 2, 3, 5, 9 ];
 $mydata = {
	keep => {
		fieldname	=> $data,
	},
	numeric => { },
 };

 local($Stream::Aggregate::Stats::ps) = $mydata;
 
 $mean = mean('fieldname');

=head1 DESCRIPTION

This module implements some standard statistics functions.  It has
an odd API: it expect the data to be inside a structure.  This is to
facilitate uses where the operations are found at runtime from
user input or configuration files.

The exact structure is:

 {
	keep => {
		field1 => [ @data ],
		field2 => [ @data ], 
		field3 => [ @data ],
	},
	numeric => {},
 }

where C<field1>, C<field2>, etc are names for the data.   You then ask
for functions by name: eg: C<mean('field1')>.

The functions available are:
C<percentile>,
C<standard_deviation>,
C<median>,
C<mean>,
C<largest> (numeric max),
C<smallest> (numeric min),
C<dominant> (the mode),
and
C<dominantcount>.
The C<percentile> function takes two arguments: the field name and the 
percentile: C<percentile(field1 =E<gt> 80)> gives you the 80th percentle
for the data in field1.   The 50th percentile is the same as the median.
The C<dominantcount> is the number of values that exactly matched the 
mode.

=head1 LICENSE

This package may be used and redistributed under the terms of either
the Artistic 2.0 or LGPL 2.1 license.