The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#
# LSNoHistory.pm - least-squares regression without data history
#
# $Id: LSNoHistory.pm,v 1.6 2003/02/23 05:11:29 pliam Exp $
#

package Statistics::LSNoHistory;
use strict;

use vars qw($VERSION);
$VERSION = sprintf("%d.%02d", (q$Name: LSNoHist_Release_0_01 $ =~ /\d+/g));

#############################################################################
# top-level pod 
#############################################################################

=pod

=head1 NAME

Statistics::LSNoHistory - Least-Squares linear regression package without 
data history

=head1 SYNOPSIS

  # construct from points
  $reg = Statistics::LSNoHistory->new(points => [
    1.0 => 1.0,
    2.1 => 1.9,
    2.8 => 3.2,
    4.0 => 4.1,
    5.2 => 4.9
  ]);

  # other equivalent constructions
  $reg = Statistics::LSNoHistory->new(
    xvalues => [1.0, 2.1, 2.8, 4.0, 5.2],
    yvalues => [1.0, 1.9, 3.2, 4.1, 4.9]
  );
  # or
  $reg = Statistics::LSNoHistory->new;
  $reg->append_arrays(
    [1.0, 2.1, 2.8, 4.0, 5.2],
    [1.0, 1.9, 3.2, 4.1, 4.9]
  );
  # or
  $reg = Statistics::LSNoHistory->new;
  $reg->append_points(
    1.0 => 1.0, 2.1 => 1.9, 2.8 => 3.2, 4.0 => 4.1, 5.2 => 4.9
  );

  # You may also construct from the preliminary statistics of a 
  # previous regression:
  $reg = Statistics::LSNoHistory->new(
    num => 5,
    sumx => 15.1,
    sumy => 15.1,
    sumxx => 56.29,
    sumyy => 55.67,
    sumxy => 55.83,
    minx => 1.0,
    maxx => 5.2,
    miny => 1.0,
    maxy => 4.9
  );
  # thus a branch may be instantiated as follows
  $branch = Statistics::LSNoHistory->new(%{$reg->dump_stats});
  $reg->append_point(6.1, 5.9);
  $branch->append_point(5.8, 6.0);

  # calculate regression values, print some
  printf("Slope: %.2f\n", $reg->slope);
  printf("Intercept %.2f\n", $reg->intercept);
  printf("Correlation Coefficient: %.2f\n", $reg->pearson_r);
  ...


=head1 DESCRIPTION

This package provides standard least squares linear regression 
functionality without the need for storing the complete data history.  
Like any other, it finds best m,k (in least squares sense) so that 
y = m*x + k fits data points (x_1,y_1),...,(x_n,y_n).

In many applications involving linear regression, it is desirable
to compute a regression based on the intermediate statistics of a 
previous regression along with any I<new> data points.  Thus there
is no need to store a complete data history, but rather only a minimal 
set of intermediate statistics, the number of which, thanks to Gauss, 
is 6.  

The user interface provides a way to instantiate a regression object 
with either raw data or previous intermediate statistics.

=cut

#############################################################################
# construction
#############################################################################

=pod

=head1 CONSTRUCTOR ARGUMENTS

The constructor (or class method I<new>) takes several possible 
arguments.  The initialization scenario depends on the kinds of 
arguments passed and falls into one of the following categories:

=over 2

=item *

I<default:> S<new>() by itself is equivalent to initializing with no
data.  All internal statistics are set to zero.

=item *

I<data points array:> new(I<points> => [x_1 => y_1, x_2 => y_2,..., 
x_n => y_n]) processes the n specified data points.  Note that
points expects an array reference even though we've written it
in "hash notation" for clarity.

=item *

I<data value arrays:> new(I<xvalues> => [x_1, x_2,..., x_n], 
I<yvalues> => [y_1, y_2,..., y_n]) is equivalent to the above.

=item *

I<previous state:> new(I<state arguments>) requires I<all> of the
following intermediate statistics:

=over 6

=item I<num>

S<=E<gt>> Number of points.

=item I<sumx>

S<=E<gt>> Sum of x values.

=item I<sumy> 

S<=E<gt>> Sum of y values.

=item I<sumxx>

S<=E<gt>> Sum of x values squared.

=item I<sumyy>

S<=E<gt>> Sum of y values squared.

=item I<sumxy> 

S<=E<gt>> Sum of x*y products.

=item I<minx> 

S<=E<gt>> Minimum x value.

=item I<maxx> 

S<=E<gt>> Maximum x value.

=item I<miny> 

S<=E<gt>> Minimum y value.

=item I<maxy> 

S<=E<gt>> Maximum y value.

=back 6

=back 2

=cut

## new constructor
sub new {
	my $class = shift;
	my %args = @_;
	my $self;
	my @stats = qw(num sumx sumy sumxx sumyy sumxy);
	push(@stats, qw(minx maxx miny maxy)); # min/max

	# if complete set of statistics, construct from previous state
	# if (@stats == scalar(grep {defined($args{$_})} @stats)) {
	if (@stats == grep {defined($args{$_})} @stats) {
		# reject unsupported arguments and combinations 
		if (grep {defined($args{$_})} qw(points xvalues yvalues)) {
			die "Cannot give new data along with previous state.";
		}
		unless (@stats == keys %args) {
			die "Unknown constructor arguments.";
		}
		# check the number of points for consistency
		unless (abs(int($args{num})) == $args{num}) {
			die "Bad number of points: must be positive integer.";
		}
		$self = \%args;
    	bless $self, $class;
		return $self;
	}
	# in any other case we're starting from scratch
	$self = {};
	bless $self, $class;
	$self->_init;
	# x & y value array refs
	if (defined($args{xvalues}) && defined($args{yvalues})) {
		if (defined $args{points}) {
			die "Must give points or array values, but not both";
		}
		unless (scalar(keys %args) == 2) {
			die "Unknown constructor arguments.";
		}
		$self->append_arrays($args{xvalues}, $args{yvalues});
	}
	# (x,y) point array ref
	elsif (defined($args{points})) {
		if (grep {defined($args{$_})} qw(xvalues yvalues)) {
			die "Must give points or array values, but not both";
		}
		unless (scalar(keys %args) == 1) {
			die "Unknown constructor arguments.";
		}
		$self->append_points(@{$args{points}});
	}
	# default constructor (already initialized above)
	else { 
		if (scalar(keys %args)) {
			die "Unknown constructor arguments.";
		}
	}
    return $self;
}

## _init in this context really means start with state of 0's
sub _init {
	my $self = shift;
	my @stats = qw(num sumx sumy sumxx sumyy sumxy);
	push(@stats, qw(minx maxx miny maxy)); # min/max

	@$self{@stats} = (0) x scalar(@stats);
}


#############################################################################
# other methods
#############################################################################
=pod

=head1 METHODS

=over 2

=cut

#
# adding data
#

## append_point
=pod 

=item *

I<append_point>(x,y) process an additional data point.

=cut 
sub append_point {
	my $self = shift;
	my($x,$y) = @_;

	## will have to recompute regression
	$self->{cached} = 0;

	# min/max
	if ($self->{num}) {
		$self->{minx} = ($x < $self->{minx}) ? $x : $self->{minx};
		$self->{maxx} = ($x > $self->{maxx}) ? $x : $self->{maxx};
		$self->{miny} = ($y < $self->{miny}) ? $y : $self->{miny};
		$self->{maxy} = ($y > $self->{maxy}) ? $y : $self->{maxy};
	}
	else {
		$self->{minx} = $x;
		$self->{maxx} = $x;
		$self->{miny} = $y;
		$self->{maxy} = $y;
	}

	# classic stats
	$self->{num}++;
	$self->{sumx} += $x;
	$self->{sumy} += $y;
	$self->{sumxx} += $x**2;
	$self->{sumyy} += $y**2;
	$self->{sumxy} += $x*$y;
}

## append_points
=pod 

=item *

I<append_points>(x_1 => y_1,..., x_n => y_n) process additional data points, 
which is equivalent to calling append_point() n times.

=cut 
sub append_points {
	my $self = shift;
	my @points = @_;
	my $num = scalar(@points);

	if ($num % 2) { die "Incomplete list of points."; }

	$num /= 2;
	for (1..$num) { $self->append_point(splice(@points, 0, 2)); }
}


## append_arrays
=pod 

=item *

I<append_arrays>([x_1, x_2,..., x_n], [y_1, y_2,..., y_n])
process additional data points given a pair x and y data array
references.  Also equivalent to calling append_point() n times.

=cut 
sub append_arrays {
	my $self = shift;
	my ($xr, $yr) = @_;
	my ($xn, $yn);

	# check arg type
	unless ((ref($xr) eq 'ARRAY') && (ref($yr) eq 'ARRAY'))  { 
		die "Must pass pair of array references."; 
	}

	# check that sizes match
	$xn = scalar(@$xr);
	$yn = scalar(@$yr);
	unless ($xn == $yn) { die "Incomplete list of points."; }

	for (1..$xn) { $self->append_point(shift(@$xr), shift(@$yr)); }
}

#
# computing the regression
#

## _regress method -- done behind the scenes & considered private
sub _regress {
	my $self = shift;
	my($n) = $self->{num};
	my($dx) = $n*$self->{sumxx} - $self->{sumx}**2;
	my($dy) = $n*$self->{sumyy} - $self->{sumy}**2;

	# check that we have 2 points 
	unless ($n >= 2) { die "Must have at least 2 points for regression."; }
	# check data for consistency
	unless (($dx!=0) && ($dy!=0)) { 
		die "Inconsistent data: would divide by zero."; 
	}

	# means and variances
	$self->{avgx} = $self->{sumx}/$n;
	$self->{avgy} = $self->{sumy}/$n;
	$self->{varx} = $dx/$n/($n-1);
	$self->{vary} = $dy/$n/($n-1);

	# slopes and intercepts
	$self->{mx} = ($n*$self->{sumxy} - $self->{sumx}*$self->{sumy})/$dx;
	$self->{kx} = $self->{avgy} - $self->{mx}*$self->{avgx};
	$self->{my} = ($n*$self->{sumxy} - $self->{sumx}*$self->{sumy})/$dy;
	$self->{ky} = $self->{avgx} - $self->{my}*$self->{avgy};
	
	# correlation coefficient (Pearson's r) and chi squared
	$self->{r} = ($n*$self->{sumxy} - $self->{sumx}*$self->{sumy}) 
		/ sqrt($dx*$dy);
	$self->{chi2} = (1-$self->{r}**2)*$dy/$n;

	# flag that regression calculations are up to date
	$self->{cached} = 1;
}

#
# presentation of stats, prediction
#

## average_x 
=pod 

=item *

I<average_x> returns the mean of the x values.

=cut
sub average_x { 
	my $self = shift;
	$self->_regress unless $self->{cached};
	return $self->{avgx}
}

## average_y 
=pod 

=item *

I<average_y> returns the mean of the y values.

=cut
sub average_y { 
	my $self = shift;
	$self->_regress unless $self->{cached};
	return $self->{avgy}
}

## variance_x
=pod 

=item *

I<variance_x> returns the (n-1)-style variance of the x values. 

=cut
sub variance_x { 
	my $self = shift;
	$self->_regress unless $self->{cached};
	return $self->{varx}
}

## variance_y
=pod 

=item *

I<variance_y> returns the (n-1)-style variance of the y values. 

=cut
sub variance_y { 
	my $self = shift;
	$self->_regress unless $self->{cached};
	return $self->{vary}
}

## slope
=pod 

=item *

I<slope> returns the slope m so that y = m*x + k is a least squares fit.
Note that this is the least (y-y_avg)**2, and thus the standard slope.

=cut
sub slope { 
	my $self = shift;
	$self->_regress unless $self->{cached};
	return $self->{mx}
}

## intercept
=pod 

=item *

I<intercept> returns the intercept k so that y = m*x + k is a least squares 
fit.  Note again that this is the least (y-y_avg)**2, and thus the 
standard intercept.

=cut
sub intercept { 
	my $self = shift;
	$self->_regress unless $self->{cached};
	return $self->{kx}
}

## predict - predicte a y value given an x value
=pod 

=item *

I<predict>(x) predicts a y value, given an x value.  Computes m*x + k, where 
m, k are the standard regression slope and intercept (->slope and ->intercept, 
respectively) for the most recent data.

=cut 
sub predict {
	my $self = shift;
	my($x) = @_;

	$self->_regress unless $self->{cached};
	return $self->{mx}*$x + $self->{kx};
}

## slope_y
=pod 

=item *

I<slope_y> returns the slope m' so that y = m'*x + k' is a least squares fit.
Note that this is the least (x-x_avg)**2, and thus I<not> the standard slope.

=cut
sub slope_y { 
	my $self = shift;
	$self->_regress unless $self->{cached};
	return $self->{my}
}

## intercept_y
=pod 

=item *

I<intercept_y> returns the intercept k' so that y = m'*x + k' is a least 
squares fit.  Note that this is the least (x-x_avg)**2, and thus I<not> 
the standard intercept.

=cut
sub intercept_y { 
	my $self = shift;
	$self->_regress unless $self->{cached};
	return $self->{ky}
}

## predict_x - predicte an x value given a y value
=pod 

=item *

I<predict_x>(y) predicts an x value given a y value.  Computes m'*y + k', 
where m', k' are the regression (y-reletive) slope and intercept 
(->slope_y and ->intercept_y, respectively) for the most recent data.

=cut 
sub predict_x {
	my $self = shift;
	my($y) = @_;

	$self->_regress unless $self->{cached};
	return $self->{my}*$y + $self->{ky};
}

## pearson_r
=pod 

=item *

I<pearson_r> returns Pearson's r correlation coefficient.

=cut
sub pearson_r { 
	my $self = shift;
	$self->_regress unless $self->{cached};
	return $self->{r}
}

## chi_squared
=pod 

=item *

I<chi_squared> returns the chi squared statistic.

=cut
sub chi_squared { 
	my $self = shift;
	$self->_regress unless $self->{cached};
	return $self->{chi2}
}

## minimum_x
=pod 

=item *

I<minimum_x> returns the minimum x value

=cut
sub minimum_x { return shift->{minx}; }

## maximum_x
=pod 

=item *

I<maximum_x> returns the maximum x value

=cut
sub maximum_x { return shift->{maxx}; }

## minimum_y
=pod 

=item *

I<minimum_y> returns the minimum y value

=cut
sub minimum_y { return shift->{miny}; }

## maximum_y
=pod 

=item *

I<maximum_y> returns the maximum y value

=cut
sub maximum_y { return shift->{maxy}; }

## dump_stats
=pod 

=item *

I<dump_stats> returns a hash reference of the form

        { num => <val>,
          sumx => <val>,
          sumy => <val>,
          sumxx => <val>,
          sumyy => <val>,
          sumxy => <val>,
          minx => <val>,
          maxx => <val>,
          miny => <val>,
          maxy => <val> }

in other words, containing all the stats required by the final constructor 
above.  This effectively dumps the regression history.

=cut
sub dump_stats { 
	my $self = shift;
	my @stats = qw(num sumx sumy sumxx sumyy sumxy);
	push(@stats, qw(minx maxx miny maxy)); # min/max
	my %dump;

	@dump{@stats} = @$self{@stats};
	return \%dump;
}

1;

__END__
=pod 

=head1 BUGS

This technique is more susceptible to roundoff errors than others which
store the data.  Extra care must be taken to scale the data before 
processing.

=head1 AUTHOR

John Pliam <pliam@cpan.org>

=head1 SEE ALSO

CPAN modules: Statistics::OLS, Statistics::Descriptive, 
Statistics::GaussHelmert, Statistics::Regression.

Any book on statistics, any handbook of mathematics, any comprehensive 
book on numerical algorithms.

Press et al, Numerical Recipes in L [L in {C,Fortran, ...}], Nth edition
[N > 0], Cambridge Univ Press.

=head1 COPYING

See distribution file C<COPYING> for complete information.