The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Dumbbench::Stats;
use strict;
use warnings;
use List::Util ();
use Statistics::CaseResampling ();

use Class::XSAccessor {
  constructor => 'new',
  accessors => [qw/data name/],
};

# Note: This is entirely unoptimized. There is a lot of unnecessary
#       stuff going on. This is to allow the user to modify the data
#       set in flight. If this comes back to haunt us at some point,
#       we can still optimize, but at this point, convenience still wins.

sub sorted_data {
  my $self = shift;
  my $sorted = [sort { $a <=> $b } @{$self->data}];
  return $sorted;
}

sub first_quartile { Statistics::CaseResampling::first_quartile($_[0]->data) }
sub second_quartile { return $_[0]->median }
sub third_quartile { Statistics::CaseResampling::third_quartile($_[0]->data) }


sub n { scalar(@{$_[0]->data}) }

sub sum {
  my $self = shift;
  return List::Util::sum(@{$self->data});
}

sub min {
  my $self = shift;
  return List::Util::min(@{$self->data});
}

sub max {
  my $self = shift;
  return List::Util::max(@{$self->data});
}

sub mean {
  my $self = shift;
  return $self->sum / $self->n;
}

sub median { Statistics::CaseResampling::median($_[0]->data) } # O(n)!

sub median_confidence_limits {
  my $self = shift;
  my $nsigma = shift;
  my $alpha = Statistics::CaseResampling::nsigma_to_alpha($nsigma);
  # note: The 1000 here is kind of a lower limit for reasonable accuracy.
  #       But if the data set is small, that's more significant. If the data
  #       set is VERY large, then running much more than 1k resamplings
  #       is VERY expensive. So 1k is probably a reasonable default.
  return Statistics::CaseResampling::median_simple_confidence_limits($self->data, 1-$alpha, 1000)
}

sub mad {
  my $self = shift;
  my $median = $self->median;
  my @val = map {abs($_ - $median)} @{$self->data};
  return ref($self)->new(data => \@val)->median;
}

sub mad_dev {
  my $self = shift;
  return $self->mad()*1.4826;
}

sub std_dev {
  my $self = shift;
  my $data = $self->data;
  my $mean = $self->mean;
  my $var = 0;
  $var += ($_-$mean)**2 for @$data;
  $var /= @$data - 1;
  return sqrt($var);
}

sub filter_outliers {
  my $self = shift;
  my %opt = @_;
  my $var_measure = $opt{variability_measure} || 'mad';
  my $n_sigma = $opt{nsigma_outliers} || 2.5;
  my $data = $self->data;

  if ($n_sigma == 0) {
    return([@$data], []); # special case: no filtering
  }

  my $median = $self->median;
  my $variability = $self->$var_measure;
  my @good;
  my @outliers;
  foreach my $x (@$data) {
    if (abs($x-$median) < $variability*$n_sigma) {
      push @good, $x;
    }
    else {
      push @outliers, $x;
    }
  }

  return(\@good, \@outliers);
}


1;