The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# -*-Perl-*- Test Harness script for Bioperl
# $Id$

use strict;

BEGIN { 
    use lib '.';
    use Bio::Root::Test;
    
    test_begin(-tests => 116);
	
    use_ok('Bio::Location::Simple');
    use_ok('Bio::Coordinate::Pair');
    use_ok('Bio::Coordinate::ExtrapolatingPair');
	use_ok('Bio::Coordinate::GeneMapper');
}

#
# Extrapolating pairs
#
#    No gaps returned, matches extrapolated
#     returns always a match or undef
#     -strict
#


# the  reverse strand pair
my $inr = Bio::Location::Simple->new(-start=>2, -end=>5, -strand=>1);
my $outr = Bio::Location::Simple->new(-start=>10, -end=>13, -strand=>-1);
ok my $pairr = Bio::Coordinate::ExtrapolatingPair->
    new(-in => $inr,
	-out => $outr
       );

my $posr = Bio::Location::Simple->new 
    (-start => 3, -end => 4, -strand=> 1 );
my $resr = $pairr->map($posr);
is $resr->start, 11;
is $resr->end, 12;
is $resr->strand, -1;



# propepide
my $match1 = Bio::Location::Simple->new 
    (-seq_id => 'propeptide', -start => 21, -end => 40, -strand=>1 );
# peptide
my $match2 = Bio::Location::Simple->new
    (-seq_id => 'peptide', -start => 1, -end => 20, -strand=>1 );

ok my $pair = Bio::Coordinate::ExtrapolatingPair->
    new(-in => $match1,
	-out => $match2,
	-strict => 1
       );

ok $pair->test;
is $pair->strand(), 1; #  = in->strand * out->strand
is $pair->in->seq_id(), 'propeptide';
is $pair->strict(), 1;

my ($count, $pos, $pos2, $res, $match, $res2);

# match within
$pos = Bio::Location::Simple->new 
    (-start => 25, -end => 25, -strand=> -1 );
$res = $pair->map($pos);

isa_ok $res, 'Bio::Location::Simple';
is $res->start, 5;
is $res->end, 5;
is $res->strand, -1;
is $res->seq_id, 'peptide';


# match outside = undef
$pos = Bio::Location::Simple->new (-start => 5, -end => 5 );
$res = $pair->map($pos);

is $res, undef;

#
# partial match = match
#
$pos2 = Bio::Location::Simple->new
    (-start => 20, -end => 22, -strand=> -1 );

ok $res = $pair->map($pos2);

is $res->start, 0;
is $res->end, 2;
is $res->seq_id, 'peptide';
is $res->strand, -1;


#
# partial match2 =  match & gap
#
$pos2 = Bio::Location::Simple->new (-start => 40, -end => 41, -strand=> 1 );
ok $res = $pair->map($pos2);
is $res->start, 20;
is $res->end, 20;

#
#enveloping
#
$pos2 = Bio::Location::Simple->new (-start => 19, -end => 41, -strand=> 1 );
ok $res = $pair->map($pos2);
is $res->start, 1;
is $res->end, 20;

#
# testing the changing the strand
#

# chr
$match1 = Bio::Location::Simple->new 
    (-seq_id => 'chr', -start => 21, -end => 40, -strand=>1 );
# gene
$match2 = Bio::Location::Simple->new
    (-seq_id => 'gene', -start => 1, -end => 20, -strand=>-1 );

 $pair = Bio::Coordinate::ExtrapolatingPair->
#my $pair = Bio::Coordinate::Pair->
    new(-in => $match1,
	-out => $match2,
	-strict => 0
       );

$pos = Bio::Location::Simple->new 
    (-start => 38, -end => 40, -strand=> 1 );
$res = $pair->map($pos);
is $res->start, 1;
is $res->end, 3;
is $res->strand, -1;

$pos = Bio::Location::Simple->new 
    (-start => 1, -end => 3, -strand=> 1 );
$res = $pair->map($pos);
is $res->start, 38;
is $res->end, 40;
is $res->strand, -1;


#
#
# Gene Mapper
#
#

ok my $m = Bio::Coordinate::GeneMapper->new(-in => 'propeptide',
					   -out => 'peptide');
#$m->verbose(2);

is $m->peptide_offset(5), 5;


# match within
$pos = Bio::Location::Simple->new 
    (-start => 25, -end => 25, -strand=> 1 );
$res = $m->map($pos);

is $res->start, 20;
is $res->end, 20;
is $res->strand, 1;
is $res->seq_id, 'peptide';


#
# nozero
#

# match within
$pos = Bio::Location::Simple->new 
    (-start => 4, -end => 5, -strand=> 1 );
$res = $m->map($pos);
is $res->start, -1;
is $res->end, 0;

is $m->nozero('in&out'), 'in&out';
$res = $m->map($pos);
is $res->start, -2;
is $res->end, -1;
is $m->nozero(0), 0;



ok $m->swap;
$pos = Bio::Location::Simple->new 
    (-start => 5, -end => 5, -strand=> 1 );
$res = $m->map($pos);
is $res->start, 10;

# cds -> propeptide
is $m->in('cds'), 'cds';
is $m->out('propeptide'), 'propeptide';

$res = $m->map($pos);
is $res->start, 2;
ok $res = $m->_translate($pos);
is $res->start, 2;
ok $res = $m->_reverse_translate($pos);
is $res->start, 13;
is $res->end, 15;

$pos = Bio::Location::Simple->new 
    (-start => 26, -end => 26, -strand=> 1 );
$m->out('peptide');
$res = $m->map($pos);
is $res->start, 4;


#
# frame
#

$pos = Bio::Location::Simple->new 
    (-start => 1, -end => 3, -strand=> 1 );
$res = $m->_frame($pos);
is $res->start, 1;
is $res->end, 3;


# Collection representing exons
#
#  cds    1   5     6   10    11  15
#  exon   1   5     1   5     1   5
#  gene   1   5    11   15   21   25
#         |---|     |---|     |---|
#-----|-----------------------|---|--
# chr 1   5   9    15   19   25   29
#         pair1     pair2     pair3

# gene
my $e1 = Bio::Location::Simple->new 
    (-seq_id => 'gene', -start => 5, -end => 9, -strand=>1 );
my $e2 = Bio::Location::Simple->new 
    (-seq_id => 'gene', -start => 15, -end => 19, -strand=>1 );
my $e3 = Bio::Location::Simple->new 
    (-seq_id => 'gene', -start => 25, -end => 29, -strand=>1 );
my @cexons = ($e1, $e2, $e3);

$m= Bio::Coordinate::GeneMapper->new();

$m->in('chr');
$m->out('gene');
my $off = $m->cds(5);
is $off->start, 5; # start of the coding region
is $m->exons(@cexons), 3;

$m->out('exon');
$pos = Bio::Location::Simple->new
    (-start => 6, -end => 7, -strand=> 1 );
$res = $m->map($pos);

is $res->start, 2;
is $res->end, 3;

$m->out('negative_intron');
$pos = Bio::Location::Simple->new 
    (-start => 12, -end => 14, -strand=> 1 );
$res = $m->map($pos);
is $res->start, -3;
is $res->end, -1;
is $res->seq_id, 'intron1';


# cds
$m->out('cds');
$pos = Bio::Location::Simple->new
    (-start => 5, -end => 9, -strand=> 1 );
$res = $m->map($pos);
is $res->start, 1;
is $res->end, 5;

$pos = Bio::Location::Simple->new
    (-start => 15, -end => 25, -strand=> 1 );
$res = $m->map($pos);
is $res->start, 6;
is $res->end, 11;

$pos = Bio::Location::Simple->new
    (-start => 5, -end => 19, -strand=> 1 );
$res = $m->map($pos);
is $res->start, 1;
is $res->end, 10;


#
# chr to cds ; ranges into one
#
my $exons = Bio::Location::Split->new(-seq_id => 'gene');
$exons->add_sub_Location($e1);
$exons->add_sub_Location($e2);
$exons->add_sub_Location($e3);

$res = $m->map($exons);
isa_ok $res,'Bio::Location::Simple';
is $res->start, 1;
is $res->end, 15;

#
# cds to chr; single range into two
#
$m->in('cds');
$m->out('gene');

$pos = Bio::Location::Simple->new
    (-start => 4, -end => 7, -strand=> 1 );
$res = $m->map($pos);
is $res->start, 4;
is $res->end, 12;



# Collection representing exons
#
#  cds  -11  -7    -6  -2    -1   3  :27
#  cds   -6  -2    -1 1 3     4   8  :17
#  exon   1   5     1   5     1   5
#  gene -21  -17  -11  -7    -1 1 3  :27
#  gene -11  -7    -1 1 3     9   13 :17
#         |---|     |---|     |---|
#-----|-----------------------|---|--
# chr 1   5   9    15   19   25   29
#         pair1     pair2     pair3

$m= Bio::Coordinate::GeneMapper->new();

$m->in('chr');
$m->out('gene');
$off = $m->cds(17);
is $off->start, 17; # start of the coding region
is $m->exons(@cexons), 3;

# testing parameter handling in the constructor
ok $m = Bio::Coordinate::GeneMapper->new(-in => 'gene',
					-out => 'peptide',
					-cds => 3,
					-exons => @cexons,
					-utr => 7,
					-peptide_offset => 5
				       );


#
# Real life data
# Mapping SNPs into  human serum protein MSE55 and
# human galecting LGALS2 from Ensembl:
#

#Ensembl Gene ID	Exon Start (Chr bp)	Exon End (Chr bp)	Exon Coding Start (Chr bp)
#	Exon Coding End (Chr bp)	Strand

my @gene1_dump = split ( /\n/, qq {
ENSG00000128283	34571058	34571126			1
ENSG00000128283	34576610	34577350	34576888	34577350	1
ENSG00000128283	34578646	34579858	34578646	34579355	1
});


my @gene2_dump = split ( /\n/, qq {
ENSG00000100079	34590438	34590464			-1
ENSG00000100079	34582387	34582469	34582387	34582469	-1
ENSG00000100079	34581114	34581273	34581114	34581273	-1
ENSG00000100079	34580784	34580950	34580804	34580950	-1
}); # exon start should be less than end or is this intentional?

#Chromosome Name	Location (bp)	Strand	Reference ID
my @snp_dump = split ( /\n/, qq {
22	34572694	1	2235335
22	34572799	1	2235336
22	34572843	1	2235337
22	34574896	1	2076087
22	34575256	1	2076088
22	34578830	1	2281098
22	34579111	1	2281099
22	34580411	1	2235338
22	34580591	1	2281097
22	34580845	1	2235339
22	34581963	1	2281100
22	34583722	1	140057
22	34585003	1	140058
22	34587726	1	968725
22	34588207	1	2284055
22	34591507	1	1969639
22	34591949	1	140059
});
shift @snp_dump;

my ($cdsr, @exons) = read_gene_data(@gene1_dump);

ok my $g1 = Bio::Coordinate::GeneMapper->new(-in=>'chr', -out=>'gene');
$g1->cds($cdsr);

#$pos = Bio::Location::Simple->new
#    (-start => 34576888, -end => 34579355, -strand=> 1 );
$res = $g1->map($cdsr);
is $res->start, 1;
is $res->end, 2468;

$g1->exons(@exons);
$g1->in('gene');
$g1->out('cds');
$res = $g1->map($res);
is $res->start, 1;
is $res->end, 1173;

#map_snps($g1, @snp_dump);


#gene 2 in reverse strand
($cdsr, @exons) = read_gene_data(@gene2_dump);
ok my $g2 = Bio::Coordinate::GeneMapper->new(-in=>'chr', -out=>'gene');
$g2->cds($cdsr);

$pos = Bio::Location::Simple->new
    (-start => $cdsr->end-2, -end => $cdsr->end, -strand=> 1 );
$res = $g2->map($pos);
is $res->start, 1;
is $res->end, 3;
is $res->strand, -1;


$g2->exons(@exons);

#map_snps($g2, @snp_dump);


$match1 = Bio::Location::Simple->new 
    (-seq_id => 'a', -start => 5, -end => 17, -strand=>1 );
$match2 = Bio::Location::Simple->new
    (-seq_id => 'b', -start => 1, -end => 13, -strand=>-1 );
ok $pair = Bio::Coordinate::Pair->new(-in => $match1,
					 -out => $match2,
					);

#
# split location
#

ok my $split = Bio::Location::Split->new();
ok $split->add_sub_Location(Bio::Location::Simple->new(-start=>6,
                                                      -end=>8,
                                                      -strand=>1));
$split->add_sub_Location(Bio::Location::Simple->new(-start=>15,
                                                   -end=>16,
                                                   -strand=>1));

$res=$pair->map($split);
ok my @sublocs = $res->each_Location(1);
is @sublocs, 2;

#print Dumper \@sublocs;
is $sublocs[0]->start, 2;
is $sublocs[0]->end, 3;
is $sublocs[1]->start, 10;
is $sublocs[1]->end, 12;

# testing  cds -> gene/chr which generates a split location from a simple one
# exons in reverse strand!
#
#  pept   33222     111
#  cds    8   4     3 1-1
#  exon   5   1     5   1
#  gene  13   9     3 1-2
#         |---|     |---|
#-----|-------------------
# chr 1   5   9    15   19
#           e1        e2

# gene
$e1 = Bio::Location::Simple->new
    (-seq_id => 'gene', -start => 5, -end => 9, -strand=>-1 );
$e2 = Bio::Location::Simple->new 
    (-seq_id => 'gene', -start => 15, -end => 19, -strand=>-1 );
@cexons = ($e1, $e2);
my $cds= Bio::Location::Simple->new
    (-seq_id => 'gene', -start => 5, -end => 17, -strand=>-1 );

$m = Bio::Coordinate::GeneMapper->new(-in=>'cds', -out=>'chr');

$m->cds($cds); # this has to be set first!?
is $m->exons(@cexons), 2;


my $cds_f= Bio::Location::Simple->new
    (-start => 2, -end => 7, );
$res = $m->map($cds_f);

ok @sublocs = $res->each_Location(1);
is @sublocs, 2;

is $sublocs[0]->start, 6;
is $sublocs[0]->end, 9;
is $sublocs[1]->start, 15;
is $sublocs[1]->end, 16;


# test inex, exon & negative_intron

$m->in('gene');
$m->out('inex');

$pos = Bio::Location::Simple->new 
    (-seq_id => 'gene', -start => 2, -end => 10, -strand=> 1 );

$res = $m->map($pos);
is $res->each_Location, 3;


$m->out('intron');
$res = $m->map($pos);
is $res->start, 1;
is $res->end, 5;
is $res->strand, 1;

$m->out('negative_intron');
$res = $m->map($pos);
is $res->start, -5;
is $res->end, -1;
is $res->strand, 1;

is $m->_mapper_code2string('1-2'), 'chr-gene';
is $m->_mapper_string2code('chr-gene'), '1-2';


#todo:
#  strict mapping mode
#  extrapolating pair code into Bio::Coordinate::Pair ?






sub read_gene_data {
    my ($self,@gene_dump) = @_;
    my ($cds_start, $cds_end, $strand, @exons);

    #one line per exon
    my ($first, $first_line);
    for my $line ( @gene_dump ) {

	my ($geneid, $exon_start, $exon_end, $exon_cstart,
	    $exon_cend, $exon_strand) = split /\t/, $line;

	$strand = $exon_strand if $exon_strand;
	#print join (' ', $geneid, $exon_start, $exon_strand), "\n";

	# CDS location in chromosome coordinates
	$cds_start = $exon_cstart if !$cds_start and $exon_cstart;
	$cds_end = $exon_cend if $exon_cend;


	if ($exon_start > $exon_end) {
	    ($exon_start, $exon_end) = ($exon_end, $exon_start);
	}

	my $exon = Bio::Location::Simple->new
	    (-seq_id => 'gene', -start => $exon_start,
	     -end => $exon_end, -strand=>$strand, -verbose=>2);
	push @exons, $exon;
    }

    if ($cds_start > $cds_end) {
	($cds_start, $cds_end) = ($cds_end, $cds_start);
    }

    my $cdsr = Bio::Location::Simple->new (-start => $cds_start,
					   -end => $cds_end,
					   -strand=> $strand);

    return ($cdsr, @exons);
}


sub map_snps {
    my ($mapper, @snps) =@_;
    $mapper->in('chr');
    $mapper->out('cds');
    foreach my $line (@snps) {
	$mapper->out('cds');

	my ($chr, $start, $strand, $id) = split /\t/, $line;
	my $loc = Bio::Location::Simple->new
	    ( -start => $start,
	     -end => $start, -strand=>$strand );

	my $res = $mapper->map($loc);
	my $cds_start = 0;
	$cds_start = $res->start if defined $res;#defined $res->start;
	print $id, "\t", $cds_start, "\n";

	# coding
	if ($cds_start) {
	    $mapper->out('propeptide');
	    my $frame_obj = $mapper->_frame($res);
	    my $res = $mapper->map($loc);
	    my $cds_start = 0;
	    $cds_start = $res->start if defined $res;#defined $res->start;
	    print  "\t\t", $cds_start, " (", $frame_obj->start, ")\n";
	}
    }
}