The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl
# -*-Perl-*- (for my emacs)

use strict;
use warnings;

=head1 NAME 

bp_heterogeneity_test - a test for distinguishing between selection and population expansion.

=head1 SYNOPSIS

heterogenetity_test -mut_1/--mutsyn synonymous_mut_count -mut_2/--mutnon nonsyn_mut_count -s/--smaplesize sample_size [-i/--iterations iterations] [-o/--observed observed_D] [-v/--verbose] [--silent ] [-m/--method tajimaD or fuD] [--precision]

=head2 DESCRIPTION

This is an implementation of the Heterogenetity test as described in
Hahn MW, Rausher MD, and Cunningham CW. 2002. Genetics 161(1):11-20. 

=head2 OPTIONS

 Options in brackets above are optional

 -s or --samplesize samplesize 
 -mut_1 or --mutsyn synonymous mutation count 
 -mut_2 or --mutnon nonsynonmous mutation count 
 -i or --iterations number of iterations 
 -o or --observed   observed D 
 -m or --method     tajimaD or fuD  for Tajima's D or Fu and Li's D
 -v or --verbose    print out extra verbose messages
 --silent           Be extra quiet
 --precision        Level of precision - specify the number of digits 
                   (default 4)

=head2 AUTHOR Matthew Hahn E<lt>matthew.hahn-at-duke.eduE<gt>

For more information contact:

Matthew Hahn, E<lt>matthew.hahn-at-duke.eduE<gt>
Jason Stajich E<lt>jason-at-bioperl-dot-orgE<gt>

=cut

use Getopt::Long;
use Bio::PopGen::Simulation::Coalescent;
use Bio::PopGen::Statistics;
use Bio::PopGen::Individual;
use Bio::PopGen::Genotype;

my $sample_size = 4;
my $mut_count_1 = 10; # synonymous
my $mut_count_2 = 20; # non-synonymous
my $iterations = 1;
my $verbose = 0;
my $observedD = undef;
my $method = 'fuD';
my $help = 0;
my $precision = '4'; # Let's make the random precision between
                     # 0->1 to 1000th digits

GetOptions( 
	    's|samplesize|samp_size:i' => \$sample_size,
	    'mut_1|mutsyn:i'           => \$mut_count_1,
	    'mut_2|mutnon:i'           => \$mut_count_2, 
	    'i|iterations:i'           => \$iterations,
	    'o|obsered|observedD:f'    => \$observedD, 
	    'v|verbose'                => \$verbose,
	    'm|method:s'               => \$method,
	    'h|help'                   => \$help,
	    'silent'                   => sub { $verbose = -1; },
	    'p|precision:i'            => \$precision,
	    );

if( $help ) {
    system("perldoc",$0);
    exit(0);
}

if( $method ne 'fuD' and $method ne 'tajimaD' ) {
    die("available methods are [fu and li's D] (fuD) and Tajima's D (tajimaD)");
}
my @D_distribution;  
printf("sample size is %d iteration count = %d\n", $sample_size, 
       $iterations);

my $sim = new Bio::PopGen::Simulation::Coalescent
    (-sample_size => $sample_size);

for(my $iter = 0; $iter < $iterations; $iter++ ) {
    my $tree = $sim->next_tree($sample_size);
    my $f1 = 0;
    if( $mut_count_1 > 0 ) {
	$sim->add_Mutations($tree,$mut_count_1,$precision);

	my @leaves = $tree->get_leaf_nodes;
	# the outgroup is just an individual with the ancestral state 
	# (no mutations)
	my $outgroup = new Bio::PopGen::Individual();
	foreach my $m ( $leaves[0]->get_marker_names ) {
	    $outgroup->add_Genotype(Bio::PopGen::Genotype->new
				    (-marker_name=> $m,
				     -alleles    => [ 0 ]));
	}
	if( $method eq 'fuD' ) {
	    $f1 = Bio::PopGen::Statistics->fu_and_li_D(\@leaves,[$outgroup]);
	} elsif( $method eq 'tajimaD' ) {
	    $f1 = Bio::PopGen::Statistics->tajima_D(\@leaves);
	}
	print "(mutation count = $mut_count_1) D=$f1\n" 
	    if( $verbose >= 0);
    }
    
    my $f2 = 0;
    if( $mut_count_2 > 0 ) {
	$sim->add_Mutations($tree,$mut_count_2,$precision);
	my @leaves = $tree->get_leaf_nodes;
        # the outgroup is just an individual with the ancestral state 
	# (no mutations)
	my $outgroup = new Bio::PopGen::Individual();
	foreach my $m ( $leaves[0]->get_marker_names ) {
	    $outgroup->add_Genotype(Bio::PopGen::Genotype->new
				    (-marker_name=> $m,
				     -alleles    => [ 0 ]));
	}
	if( $method eq 'fuD' ) {
	    $f2 = Bio::PopGen::Statistics->fu_and_li_D(\@leaves,[$outgroup]);
	} elsif( $method eq 'tajimaD' ) {
	    $f2 = Bio::PopGen::Statistics->tajima_D(\@leaves);
	}
	print "(mutation count = $mut_count_2) D=$f2\n" if( $verbose >= 0);

    }
    my $deltaD = ( $f1 - $f2 );
    push @D_distribution, $deltaD;
    if( $iter % 10 == 0 && $iter > 0 ) { 
	print STDERR "iter = $iter\n"; 
    }
}

if( defined $observedD && $iterations > 1 ) { 
    my @sortedD = sort { $a <=> $b } @D_distribution;
    my $i;
    for($i = 0; $i < scalar @sortedD; $i++ ) {
	if( $sortedD[$i] > $observedD ) { 
	    last;
	}
    }
    
    printf( "index %d value=%.4f out of %d total (obs=%.4f)\n", 
	    $i, $sortedD[$i], scalar @sortedD, $observedD);
}