The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# -*-Perl-*- Test Harness script for Bioperl
# $Id$

# This will outline many tests for the population genetics
# objects in the Bio::PopGen namespace

use strict;

BEGIN {     
    use lib '.';
    use Bio::Root::Test;
    
    test_begin(-tests => 100);
	
    use_ok('Bio::PopGen::Individual');
    use_ok('Bio::PopGen::Genotype');
    use_ok('Bio::PopGen::Population');
    use_ok('Bio::PopGen::IO');
    use_ok('Bio::PopGen::PopStats');
    use_ok('Bio::AlignIO');
    use_ok('Bio::PopGen::Statistics');
    use_ok('Bio::PopGen::Utilities');
}

# test Fu and Li's D using data from the paper

is(sprintf("%.3f",
	   Bio::PopGen::Statistics->fu_and_li_D_counts(24, 18, 9)),
   -1.529);

is(sprintf("%.3f",
	   Bio::PopGen::Statistics->fu_and_li_D_star_counts(24, 18, 10)),
   -1.558);

is(sprintf("%.3f",
	   Bio::PopGen::Statistics->fu_and_li_F_counts(24, 3.16, 18, 9)),
   -1.735);

is(sprintf("%.2f",
	   Bio::PopGen::Statistics->fu_and_li_F_star_counts(24, 3.16, 18, 10)),
   -1.71);

my $FILE1 = test_output_file();

my @individuals = ( Bio::PopGen::Individual->new(-unique_id => '10a'));
ok($individuals[0]);

my @genotypes = ( Bio::PopGen::Genotype->new(-marker_name    => 'Mkr1',
					    -individual_id  => '10a',
					    -alleles => [ qw(A a)]),
		  Bio::PopGen::Genotype->new(-marker_name    => 'Mkr2',
					    -individual_id  => '10a',
					    -alleles => [ qw(B B)]),
		  Bio::PopGen::Genotype->new(-marker_name    => 'Mkr3',
					    -individual_id  => '10a',
					    -alleles => [ qw(A a)]));
is(($genotypes[1]->get_Alleles)[0], 'B');

$individuals[0]->add_Genotype(@genotypes);
is($individuals[0]->get_Genotypes,3);
is($individuals[0]->get_Genotypes(-marker => 'Mkr3')->get_Alleles(),2);
my @alleles = $individuals[0]->get_Genotypes(-marker => 'Mkr2')->get_Alleles();
is($alleles[0], 'B');

					     
my $population = Bio::PopGen::Population->new(-name        => 'TestPop1',
					     -source      => 'testjasondata',
					     -description => 'throw away example',
					     -individuals => \@individuals);

is(scalar ($population->get_Individuals()), 1);
is($population->name, 'TestPop1');
is($population->source, 'testjasondata');
is($population->description, 'throw away example');

my @genotypes2 = ( Bio::PopGen::Genotype->new(-marker_name   => 'Mkr1',
					     -individual_id => '11',
					     -alleles       => [ qw(A A)]),
		   Bio::PopGen::Genotype->new(-marker_name   => 'Mkr2',
					     -individual_id => '11',
					     -alleles       => [ qw(B B)]),
		   Bio::PopGen::Genotype->new(-marker_name   => 'Mkr3',
					     -individual_id => '11',
					     -alleles       => [ qw(a a)]),
		   Bio::PopGen::Genotype->new(-marker_name   => 'Mkr4',
					     -individual_id => '11',
					     -alleles       => [ qw(C C)])
		   );
push @individuals, Bio::PopGen::Individual->new(-genotypes   => \@genotypes2,
					       -unique_id   => '11');
$population->add_Individual($individuals[1]);

is(scalar ($population->get_Individuals()), 2);
my ($found_ind) = $population->get_Individuals(-unique_id => '10a');
is($found_ind->unique_id, '10a');
is(scalar($population->get_Individuals(-marker => 'Mkr4')) , 1);
is(scalar($population->get_Individuals(-marker => 'Mkr3')) , 2);

my @g = $population->get_Genotypes(-marker => 'Mkr4');

is($g[0]->individual_id, '11');
is(($g[0]->get_Alleles())[0], 'C');

my $marker = $population->get_Marker('Mkr3');
ok($marker);

@alleles = $marker->get_Alleles;
is(@alleles,2);
my %af = $marker->get_Allele_Frequencies();
is($af{'a'}, 0.75);
is($af{'A'}, 0.25);

$population->remove_Individuals('10a');
$marker = $population->get_Marker('Mkr3');
%af = $marker->get_Allele_Frequencies();

is($af{'a'}, 1);
is($af{'A'}, undef);

# Read in data from a file
my $io = Bio::PopGen::IO->new(-format => 'csv',
			     -file   => test_input_file('popgen_saureus.dat'));

my @inds;
while( my $ind = $io->next_individual ) {
    push @inds, $ind;
}

my @mrsainds = grep { $_->unique_id =~ /^MRSA/ } @inds;
my @mssainds = grep { $_->unique_id =~ /^MSSA/ } @inds;
my @envinds = grep { $_->unique_id =~ /^NC/ } @inds;

is(scalar @mrsainds, 9);
is(scalar @mssainds, 10);
is(scalar @envinds, 5);

my $mrsapop = Bio::PopGen::Population->new(-name        => 'MRSA',
					  -description => 'Resistant S.aureus',
					  -individuals => \@mrsainds);

my $mssapop = Bio::PopGen::Population->new(-name        => 'MSSA',
					  -description =>'Suceptible S.aureus',
					  -individuals => \@mssainds);

my $envpop = Bio::PopGen::Population->new(-name        => 'NC',
					 -description => 'WT isolates',
					  -individuals => \@envinds);

my $stats = Bio::PopGen::PopStats->new(-haploid => 1);
my $fst = $stats->Fst([$mrsapop,$mssapop],[qw(AFLP1)]);
# We're going to check the values against other programs first
is(sprintf("%.3f",$fst),0.077,'mrsa,mssa aflp1'); 
  
$fst = $stats->Fst([$envpop,$mssapop,$mrsapop],[qw(AFLP1 )]);
is(sprintf("%.3f",$fst),0.035,'all pops, aflp1'); 

$fst = $stats->Fst([$mrsapop,$envpop],[qw(AFLP1 AFLP2)]);
is(sprintf("%.3f",$fst),0.046,'mrsa,envpop aflp1,aflp2');

# Read in data from a file
$io = Bio::PopGen::IO->new(-format => 'csv',
			  -file   => test_input_file('popgen_saureus.multidat'));

@inds = ();
while( my $ind = $io->next_individual ) {
    push @inds, $ind;
}

@mrsainds = grep { $_->unique_id =~ /^MRSA/ } @inds;
@mssainds = grep { $_->unique_id =~ /^MSSA/ } @inds;
@envinds = grep { $_->unique_id =~ /^NC/ } @inds;

is(scalar @mrsainds, 7);
is(scalar @mssainds, 10);
is(scalar @envinds, 5);

$mrsapop = Bio::PopGen::Population->new(-name        => 'MRSA',
				       -description => 'Resistant S.aureus',
				       -individuals => \@mrsainds);

$mssapop = Bio::PopGen::Population->new(-name        => 'MSSA',
				       -description =>'Suceptible S.aureus',
				       -individuals => \@mssainds);

$envpop = Bio::PopGen::Population->new(-name        => 'NC',
				      -description => 'WT isolates',
				      -individuals => \@envinds);

$stats = Bio::PopGen::PopStats->new(-haploid => 1);
my @all_bands = map { 'B' . $_ } 1..20;
my @mkr1     = map { 'B' . $_ } 1..13;
my @mkr2     = map { 'B' . $_ } 14..20;

# still wrong ? seems to work now? --sendubala
$fst = $stats->Fst([$mrsapop,$mssapop],[@all_bands ]);
#TODO: {
#    local $TODO = 'Possible bug with Fst() output';
is(sprintf("%.3f",$fst),'-0.001','mssa,mrsa all_bands'); # We're going to check the values against other programs first
#}
$fst = $stats->Fst([$envpop,$mssapop],[ @mkr1 ]);
is(sprintf("%.3f",$fst),0.023,'env,mssa mkr1'); # We're going to check the values against other programs first

$fst = $stats->Fst([$envpop,$mssapop,$mrsapop],[ @all_bands ]);
is(sprintf("%.3f",$fst),0.071,'env,mssa,mrsa all bands'); # We're going to check the values against other programs first

$fst = $stats->Fst([$envpop,$mssapop,$mrsapop],[ @mkr2 ]);
is(sprintf("%.3f",$fst),0.076, 'env,mssa,mrsa mkr2'); # We're going to check the values against other programs first

$fst = $stats->Fst([$mrsapop,$envpop],[@all_bands ]);
is(sprintf("%.3f",$fst),0.241,'mrsa,nc all_bands'); # We're going to check the values against other programs first

# test overall allele freq setting for a population

my $poptst1 = Bio::PopGen::Population->new(-name => 'tst1');
my $poptst2 = Bio::PopGen::Population->new(-name => 'tst2');

$poptst1->set_Allele_Frequency(-frequencies => 
			       { 'marker1' => { 'a' => '0.20',
						'A' => '0.80' },
				 'marker2' => { 'A' => '0.10',
						'B' => '0.20',
						'C' => '0.70' }
			     });

my $mk1 = $poptst1->get_Marker('marker1');
my %f1 = $mk1->get_Allele_Frequencies;
is($f1{'a'}, '0.20');
is($f1{'A'}, '0.80');
my $mk2 = $poptst1->get_Marker('marker2');
my %f2 = $mk2->get_Allele_Frequencies;
is($f2{'C'}, '0.70');

$poptst2->set_Allele_Frequency(-name      => 'marker1',
			       -allele    => 'A',
			       -frequency => '0.60');
$poptst2->set_Allele_Frequency(-name      => 'marker1',
			       -allele    => 'a',
			       -frequency => '0.40');

#TODO: {
#    local $TODO = 'Fst not calculated yet for just allele freqs';
#    ok 0;
#    #$fst = $stats->Fst([$poptst1,$poptst2],[qw(marker1 marker2) ]);
#}

$io = Bio::PopGen::IO->new(-format => 'csv',
			  -file   => ">$FILE1");

$io->write_individual(@inds);
$io->close();
ok( -s $FILE1);
$io = Bio::PopGen::IO->new(-format => 'csv',
			  -file   => ">$FILE1");

$io->write_population(($mssapop,$mrsapop));
$io->close();
ok( -s $FILE1);

$io = Bio::PopGen::IO->new(-format => 'prettybase',
			  -file   => ">$FILE1");

$io->write_individual(@inds);
$io->close();
ok( -s $FILE1);

$io = Bio::PopGen::IO->new(-format => 'prettybase',
			  -file   => ">$FILE1");

$io->write_population(($mssapop,$mrsapop));
$io->close();
ok( -s $FILE1);


# Let's do PopGen::Statistics tests here

$io = Bio::PopGen::IO->new(-format          => 'prettybase',
			  -no_header       => 1,
			  -file            => test_input_file('popstats.prettybase'));
my (@ingroup,@outgroup);
my $sitecount;
while( my $ind = $io->next_individual ) {
    if($ind->unique_id =~ /out/) {
	push @outgroup, $ind;
    } else { 
	push @ingroup, $ind;
	$sitecount = scalar $ind->get_marker_names() unless defined $sitecount;
    }
}
$stats = Bio::PopGen::Statistics->new();

# Real data and values courtesy M.Hahn and DNASP

is($stats->pi(\@ingroup),2);
is(Bio::PopGen::Statistics->pi(\@ingroup,$sitecount),0.4);

is(Bio::PopGen::Statistics->theta(\@ingroup),1.92);
is(Bio::PopGen::Statistics->theta(\@ingroup,$sitecount),0.384);

# Test with a population object
my $ingroup  = Bio::PopGen::Population->new(-individuals => \@ingroup);
my $outgroup = Bio::PopGen::Population->new(-individuals => \@outgroup);

is($stats->pi($ingroup),2);
is(Bio::PopGen::Statistics->pi($ingroup,$sitecount),0.4);

is(Bio::PopGen::Statistics->theta($ingroup),1.92);
is(Bio::PopGen::Statistics->theta($ingroup,$sitecount),0.384);

my $haploidpop = $ingroup->haploid_population;
is(sprintf("%.5f",Bio::PopGen::Statistics->tajima_D($haploidpop)), 0.27345);

# to fix
is(sprintf("%.5f",Bio::PopGen::Statistics->tajima_D(\@ingroup)),0.27345);
is(sprintf("%.5f",Bio::PopGen::Statistics->tajima_D($ingroup)),0.27345);

is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D_star(\@ingroup)),
   0.27345);
is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D_star($ingroup)),
   0.27345);

is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_F_star(\@ingroup)),
     0.27834);
is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_F_star($ingroup)),
   0.27834);

is((Bio::PopGen::Statistics->derived_mutations(\@ingroup,\@outgroup))[0], 1);
is((Bio::PopGen::Statistics->derived_mutations($ingroup,\@outgroup))[0], 1);
is((Bio::PopGen::Statistics->derived_mutations(\@ingroup,$outgroup))[0], 1);
is((Bio::PopGen::Statistics->derived_mutations($ingroup,$outgroup))[0], 1);

# expect to have 1 external mutation
@ingroup = $haploidpop->get_Individuals;

is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D(\@ingroup,1)),0.75653);
is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D($ingroup,1)),0.75653);

is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D(\@ingroup,
						       \@outgroup)),0.75653);
is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D($ingroup,
						       \@outgroup)),0.75653);
is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D($ingroup,
						       $outgroup)),0.75653);
is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_D(\@ingroup,
						       $outgroup)),0.75653);

is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_F(\@ingroup,1)),
     0.77499);
is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_F($haploidpop,1)),0.77499);
is(sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_F($ingroup,
						       \@outgroup)),0.77499);
is( sprintf("%.5f",Bio::PopGen::Statistics->fu_and_li_F($ingroup,
							 $outgroup)),0.77499);


# Test composite LD

$io = Bio::PopGen::IO->new(-format => 'prettybase',
			  -file   => test_input_file('compLD_test.prettybase'));

my $pop = $io->next_population;

my %LD = $stats->composite_LD($pop);

is($LD{'01'}->{'02'}->[1], 10);
is($LD{'01'}->{'03'}->[1], 0);
is($LD{'02'}->{'03'}->[1], 0);

# Test composite LD

$io = Bio::PopGen::IO->new(-format => 'prettybase',
			  -file   => test_input_file('compLD_missingtest.prettybase'));

$pop = $io->next_population;

%LD = $stats->composite_LD($pop);

is(sprintf("%.4f",$LD{'ProC9198EA'}->{'ProcR2973EA'}->[0]), -0.0375);
is(sprintf("%.2f",$LD{'ProC9198EA'}->{'ProcR2973EA'}->[1]), 2.56);



# build a population from an alignment

my $alnin = Bio::AlignIO->new(-format => 'clustalw',
			      -file   => test_input_file('T7.aln'));
my $aln = $alnin->next_aln;
$population = Bio::PopGen::Utilities->aln_to_population(-alignment => $aln);
is($population->get_number_individuals,9);
#warn($aln->match_line,"\n");
my $matchline = $aln->match_line;
is( $population->get_marker_names, $matchline =~ tr/ //);
for my $name ( $population->get_marker_names ) {
    my $marker = $population->get_Marker($name); 
#    warn("$name ",join(" ",$marker->get_Alleles()),"\n");
}


# test Rich's phase and hap parsers

$io = Bio::PopGen::IO->new(-format   => 'hapmap',
			  -verbose  => 1,
			  -no_header=> 1,
			  -starting_column => 10,
			  -file     => test_input_file('example.hap'));

# Some IO might support reading in a population at a time

my @population;
while( my $ind = $io->next_individual ) {
    push @population, $ind;
}
is(@population, 90);
is($population[3]->unique_id, 'NA06994');
is($population[3]->get_Genotypes, 34);
$population = Bio::PopGen::Population->new(-individuals => \@population);

is(sprintf("%.3f",$stats->pi($population)),12.266);
# if forced haploid population is called within pi
# need to decide about that...
# is(sprintf("%.3f",$stats->pi($population)),12.266);

is(sprintf("%.3f",$stats->theta($population)),5.548);
#TODO: {
#    local $TODO = 'May be TJd inconsistency, need to recalculate';
    is(sprintf("%.3f",$stats->tajima_D($population)),'2.926');
    is(sprintf("%.3f",$stats->tajima_D($population->haploid_population)),3.468);
#}
$io = Bio::PopGen::IO->new(-format => 'phase',
			  -file   => test_input_file('example.phase'));

# Some IO might support reading in a population at a time

@population = ();
while( my $ind = $io->next_individual ) {
    push @population, $ind;
}
is(@population, 4);


# test diploid data

# bug 2492
{
    my $in = Bio::PopGen::IO->new(-format=>"csv", -fh=>\*DATA);
    my $pop = $in->next_population;
    is(sprintf("%.3f",$stats->pi($pop)),0.833,'Pi on 3-allele data');
    is(sprintf("%.3f",$stats->theta($pop)),0.545,'Theta on 3-allele data');
}
__DATA__
SAMPLE,Site-1
seq_1,G
seq_2,C
seq_3,T
seq_4,G