The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# -*-Perl-*- Test Harness script for Bioperl
# $Id: SearchIO_blast_pull.t 11525 2007-06-27 10:16:38Z sendu $

use strict;

BEGIN {
	use lib '.';
    use Bio::Root::Test;
    
    test_begin(-tests => 289);
	
	use_ok('Bio::SearchIO');
}


my $searchio = Bio::SearchIO->new(-format => 'blast_pull', -file => test_input_file('new_blastn.txt'));

my $result = $searchio->next_result;
is $result->database_name, 'All GenBank+EMBL+DDBJ+PDB sequences (but no EST, STS,GSS,environmental samples or phase 0, 1 or 2 HTGS sequences)';
is $result->database_entries, 3742891;
is $result->database_letters, 16670205594;
is $result->algorithm, 'BLASTN';
is $result->algorithm_version, '2.2.13 [Nov-27-2005]';
is $result->query_name, 'pyrR,';
is $result->query_length, 558;
is $result->get_statistic('kappa'), 0.711;
is $result->get_statistic('kappa_gapped'), 0.711;
is $result->get_statistic('lambda'), 1.37;
is $result->get_statistic('lambda_gapped'), 1.37;
is $result->get_statistic('entropy'), 1.31;
is $result->get_statistic('entropy_gapped'), 1.31;
is $result->get_statistic('dbletters'), -509663586;
is $result->get_statistic('dbentries'), 3742891;
is $result->get_statistic('effectivespace'), 8935230198384;
is $result->get_parameter('matrix'), 'blastn matrix:1 -3';
is $result->get_parameter('gapopen'), 5;
is $result->get_parameter('gapext'), 2;
is $result->get_statistic('S2'), 60;
is $result->get_statistic('S2_bits'), 119.4;
float_is $result->get_parameter('expect'), 1e-23;
is $result->get_statistic('num_extensions'), 117843;

my @valid = ( [ 'gi|41400296|gb|AE016958.1|', 4829781, 'AE016958', '6e-059', 236],
	      [ 'gi|54013472|dbj|AP006618.1|', 6021225, 'AP006618', '4e-026', 127],
	      [ 'gi|57546753|dbj|BA000030.2|', 9025608, 'BA000030', '1e-023', 119]);
my $count = 0;
while (my $hit = $result->next_hit ) {
    my $d = shift @valid;

    is $hit->name, shift @$d;
    is $hit->length, shift @$d;
    is $hit->accession, shift @$d;
    float_is($hit->significance, shift @$d);
    is $hit->raw_score, shift @$d;

    if( $count == 0 ) {
        my $hsps_left = 1;
        while( my $hsp = $hit->next_hsp ) {
            is $hsp->query->start, 262;
            is $hsp->query->end, 552;
            is $hsp->hit->start, 1166897;
            is $hsp->hit->end, 1167187;
            is $hsp->length('hsp'), 291;
            is $hsp->start('hit'), $hsp->hit->start;
            is $hsp->end('query'), $hsp->query->end;
            is $hsp->strand('sbjct'), $hsp->subject->strand;# alias for hit
            float_is($hsp->evalue, 6e-59);
            is $hsp->score, 119;
            is $hsp->bits,236;	    	    
            is sprintf("%.1f",$hsp->percent_identity), 85.2;
            is sprintf("%.3f",$hsp->frac_identical('query')), 0.852;
            is sprintf("%.3f",$hsp->frac_identical('hit')), 0.852;
            is $hsp->gaps, 0;
            $hsps_left--;
        }
        is $hsps_left, 0;
    }
    last if( $count++ > @valid );
}
is @valid, 0;

# descriptionless hit
$searchio = Bio::SearchIO->new('-format' => 'blast_pull',
							   '-file' => test_input_file('blast_no_hit_desc.txt'));

$result = $searchio->next_result;
my $hit = $result->next_hit;
is $hit->name, 'chr1';
is $hit->description, '';

# further (NCBI blastn/p) tests taken from SearchIO.t
$searchio = Bio::SearchIO->new('-format' => 'blast_pull',
							   '-file' => test_input_file('ecolitst.bls'));

$result = $searchio->next_result;

is($result->database_name, 'ecoli.aa', 'database_name()');
is($result->database_entries, 4289);
is($result->database_letters, 1358990);

is($result->algorithm, 'BLASTP');
like($result->algorithm_version, qr/^2\.1\.3/);
like($result->query_name, qr/gi|1786183|gb|AAC73113.1| (AE000111) aspartokinase I,\s+homoserine dehydrogenase I [Escherichia coli]/);
is($result->query_length, 820);
is($result->get_statistic('kappa'), '0.135');
is($result->get_statistic('kappa_gapped'), '0.0410');
is($result->get_statistic('lambda'), '0.319');
is($result->get_statistic('lambda_gapped'), '0.267');
is($result->get_statistic('entropy'), '0.383');
is($result->get_statistic('entropy_gapped'), '0.140');

is($result->get_statistic('dbletters'), 1358990);
is($result->get_statistic('dbentries'), 4289);
is($result->get_statistic('effective_hsplength'), 47);
is($result->get_statistic('effectivespace'), 894675611);
is($result->get_parameter('matrix'), 'BLOSUM62');
is($result->get_parameter('gapopen'), 11);
is($result->get_parameter('gapext'), 1);
is($result->get_statistic('S2'), '92');
is($result->get_statistic('S2_bits'), '40.0');
float_is($result->get_parameter('expect'), '1.0e-03');
is($result->get_statistic('num_extensions'), '82424');


@valid = ( [ 'gb|AAC73113.1|', 820, 'AAC73113', '0', 1567],
	      [ 'gb|AAC76922.1|', 810, 'AAC76922', '1e-91', 332],
	      [ 'gb|AAC76994.1|', 449, 'AAC76994', '3e-47', 184]);
$count = 0;
while (my $hit = $result->next_hit) {
    my $d = shift @valid;

    is($hit->name, shift @$d);
    is($hit->length, shift @$d);
    is($hit->accession, shift @$d);
    float_is($hit->significance, shift @$d);
    is($hit->raw_score, shift @$d );

    if( $count == 0 ) {
        my $hsps_left = 1;
        while (my $hsp = $hit->next_hsp) {
            is($hsp->query->start, 1);
            is($hsp->query->end, 820);
            is($hsp->hit->start, 1);
            is($hsp->hit->end, 820);
            is($hsp->length('hsp'), 820);
            is($hsp->start('hit'), $hsp->hit->start);
            is($hsp->end('query'), $hsp->query->end);
            is($hsp->strand('sbjct'), $hsp->subject->strand);# alias for hit
            float_is($hsp->evalue, 0.0);
            is($hsp->score, 4058);
            is($hsp->bits,1567);	    	    
            is(sprintf("%.2f",$hsp->percent_identity), 98.29);
            is(sprintf("%.4f",$hsp->frac_identical('query')), 0.9829);
            is(sprintf("%.4f",$hsp->frac_identical('hit')), 0.9829);
            is($hsp->gaps, 0);
            $hsps_left--;
        }
        is($hsps_left, 0);
    }
    last if( $count++ > @valid );
}
is(@valid, 0);

$searchio = Bio::SearchIO->new('-format' => 'blast_pull',
							  '-file'   => test_input_file('a_thaliana.blastn'));

$result = $searchio->next_result;
is($result->database_name, 'All GenBank+EMBL+DDBJ+PDB sequences (but no EST, STS, GSS,or phase 0, 1 or 2 HTGS sequences)');
is($result->database_letters, 4677375331);
is($result->database_entries, 1083200);
is($result->algorithm, 'BLASTN');
like($result->algorithm_version, qr/^2\.2\.1/);
is($result->query_name, '');
is($result->query_length, 60);
is($result->get_parameter('gapopen'), 5);
is($result->get_parameter('gapext'), 2);
is($result->get_parameter('ktup'), undef);

is($result->get_statistic('lambda'), 1.37);
is($result->get_statistic('kappa'), 0.711);
is($result->get_statistic('entropy'),1.31 );
is($result->get_statistic('T'), 0);
is($result->get_statistic('A'), 30);
is($result->get_statistic('X1'), '6');
is($result->get_statistic('X1_bits'), 11.9);
is($result->get_statistic('X2'), 15);
is($result->get_statistic('X2_bits'), 29.7);
is($result->get_statistic('S1'), 12);
is($result->get_statistic('S1_bits'), 24.3);
is($result->get_statistic('S2'), 17);
is($result->get_statistic('S2_bits'), 34.2);

is($result->get_statistic('dbentries'), 1083200);

@valid = ( [ 'gb|AY052359.1|', 2826, 'AY052359', '3e-18', 96, 1, 60, 
	     '1.0000'],
	   [ 'gb|AC002329.2|AC002329', 76170, 'AC002329', '3e-18', 96, 1, 60, 
	     '1.0000' ],
	   [ 'gb|AF132318.1|AF132318', 5383, 'AF132318', '0.04', 42, 35, 55, 
	     '0.3500']);
$count = 0;

while( my $hit = $result->next_hit ) {
    my $d = shift @valid;
    is($hit->name, shift @$d);
    is($hit->length, shift @$d);
    is($hit->accession, shift @$d);
    float_is($hit->significance, shift @$d);
    is($hit->raw_score, shift @$d );
    is($hit->start, shift @$d);
    is($hit->end,shift @$d);    
    is(sprintf("%.4f",$hit->frac_aligned_query), shift @$d);
    if( $count == 0 ) {
        my $hsps_left = 1;
        while( my $hsp = $hit->next_hsp ) {
            is($hsp->query->start, 1);
            is($hsp->query->end, 60);
            is($hsp->query->strand, 1);
            is($hsp->hit->start, 154);
            is($hsp->hit->end, 212);
            is($hsp->hit->strand, 1);
            is($hsp->length('hsp'), 60);	    
            float_is($hsp->evalue, 3e-18);
            is($hsp->score, 48);
            is($hsp->bits,95.6);
            is(sprintf("%.2f",$hsp->percent_identity), 96.67);
            is(sprintf("%.4f",$hsp->frac_identical('query')), 0.9667);
            is(sprintf("%.4f",$hsp->frac_identical('hit')), 0.9831);
            is($hsp->query->frame(), 0);
            is($hsp->hit->frame(), 0);
            is($hsp->query->seq_id, '');
            is($hsp->hit->seq_id, 'gb|AY052359.1|');
            is($hsp->gaps('query'), 0);
            is($hsp->gaps('hit'), 1);
            is($hsp->gaps, 1);
            is($hsp->query_string, 'aggaatgctgtttaattggaatcgtacaatggagaatttgacggaaatagaatcaacgat');
            is($hsp->hit_string, 'aggaatgctgtttaattggaatca-acaatggagaatttgacggaaatagaatcaacgat');
            is($hsp->homology_string, '|||||||||||||||||||||||  |||||||||||||||||||||||||||||||||||');
            is(sprintf("%.2f",$hsp->get_aln->overall_percentage_identity), 96.67);
            is(sprintf("%.2f",$hsp->get_aln->percentage_identity), 98.31);
            $hsps_left--;
        }
        is($hsps_left, 0);
    }
    last if( $count++ > @valid );
} 
is(@valid, 0);

$searchio = Bio::SearchIO->new('-format' => 'blast_pull',
							  '-file'   => test_input_file('frac_problems.blast'));
my @expected = ("1.000", "0.943");
while (my $result = $searchio->next_result) {
    my $hit = $result->next_hit;
	if (@expected == 2) {
		is($hit->frac_identical, shift @expected);
	}
	else {
		TODO: {
			local $TODO = 'frac_identical failing!';
			is($hit->frac_identical, shift @expected);
		}
	}
}
is(@expected, 0);

# And even more: frac_aligned_query should never be over 1!
$searchio = Bio::SearchIO->new('-format' => 'blast_pull',
							  '-file'   => test_input_file('frac_problems2.blast'));
$result = $searchio->next_result;
$hit = $result->next_hit;
is $hit->frac_aligned_query, 0.97;

# Also, start and end should be sane
$searchio = Bio::SearchIO->new('-format' => 'blast_pull',
							  '-file'   => test_input_file('frac_problems3.blast'));
$result = $searchio->next_result;
$hit = $result->next_hit;
is $hit->start('sbjct'), 207;
is $hit->end('sbjct'), 1051;

# Do a multiblast report test
$searchio = Bio::SearchIO->new('-format' => 'blast_pull',
							   '-file'   => test_input_file('multi_blast.bls'));

@expected = qw(CATH_RAT CATL_HUMAN CATL_RAT PAPA_CARPA);
my $results_left = 4;
while( my $result = $searchio->next_result ) {
    is($result->query_name, shift @expected, "Multiblast query test");
    $results_left--;
}
is($results_left, 0);

# Web blast result parsing
$searchio = Bio::SearchIO->new(-format => 'blast_pull',
							   -file   => test_input_file('catalase-webblast.BLASTP'));
ok($result = $searchio->next_result);
ok($hit = $result->next_hit);
is($hit->name, 'gi|40747822|gb|EAA66978.1|', 'full hit name');
is($hit->accession, 'EAA66978', 'hit accession');
ok(my $hsp = $hit->next_hsp);
is($hsp->query->start, 1, 'query start');
is($hsp->query->end, 528, 'query start');

# tests for new BLAST 2.2.13 output
$searchio = Bio::SearchIO->new(-format => 'blast_pull',
							  -file   => test_input_file('new_blastn.txt'));

$result = $searchio->next_result;
is($result->database_name, 'All GenBank+EMBL+DDBJ+PDB sequences (but no EST, STS,GSS,environmental samples or phase 0, 1 or 2 HTGS sequences)');
is($result->database_entries, 3742891);
is($result->database_letters, 16670205594);
is($result->algorithm, 'BLASTN');
is($result->algorithm_version, '2.2.13 [Nov-27-2005]');
is($result->query_name, 'pyrR,');
is($result->query_length, 558);
is($result->get_statistic('kappa'), '0.711');
is($result->get_statistic('kappa_gapped'), '0.711');
is($result->get_statistic('lambda'), '1.37');
is($result->get_statistic('lambda_gapped'), '1.37');
is($result->get_statistic('entropy'), '1.31');
is($result->get_statistic('entropy_gapped'), '1.31');
is($result->get_statistic('dbletters'), '-509663586');
is($result->get_statistic('dbentries'), 3742891);
is($result->get_statistic('effective_hsplength'), undef);
is($result->get_statistic('effectivespace'), 8935230198384);
is($result->get_parameter('matrix'), 'blastn matrix:1 -3');
is($result->get_parameter('gapopen'), 5);
is($result->get_parameter('gapext'), 2);
is($result->get_statistic('S2'), '60');
is($result->get_statistic('S2_bits'), '119.4');
float_is($result->get_parameter('expect'), '1e-23');
is($result->get_statistic('num_extensions'), '117843');

@valid = ( [ 'gi|41400296|gb|AE016958.1|', 4829781, 'AE016958', '6e-059', 236],
	      [ 'gi|54013472|dbj|AP006618.1|', 6021225, 'AP006618', '4e-026', 127],
	      [ 'gi|57546753|dbj|BA000030.2|', 9025608, 'BA000030', '1e-023', 119]);
$count = 0;
while( $hit = $result->next_hit ) {
    my $d = shift @valid;

    is($hit->name, shift @$d);
    is($hit->length, shift @$d);
    is($hit->accession, shift @$d);
    float_is($hit->significance, shift @$d);
    is($hit->raw_score, shift @$d );

    if( $count == 0 ) {
        my $hsps_left = 1;
        while( my $hsp = $hit->next_hsp ) {
            is($hsp->query->start, 262);
            is($hsp->query->end, 552);
            is($hsp->hit->start, 1166897);
            is($hsp->hit->end, 1167187);
            is($hsp->length('hsp'), 291);
            is($hsp->start('hit'), $hsp->hit->start);
            is($hsp->end('query'), $hsp->query->end);
            is($hsp->strand('sbjct'), $hsp->subject->strand);# alias for hit
            float_is($hsp->evalue, 6e-59);
            is($hsp->score, 119);
            is($hsp->bits,236);	    	    
            is(sprintf("%.2f",$hsp->percent_identity), 85.22);
            is(sprintf("%.4f",$hsp->frac_identical('query')), 0.8522);
            is(sprintf("%.4f",$hsp->frac_identical('hit')), 0.8522);
            is($hsp->gaps, 0);
            $hsps_left--;
        }
        is($hsps_left, 0);
    }
    last if( $count++ > @valid );
}
is(@valid, 0);

# Bug 2189
$searchio = Bio::SearchIO->new(-format => 'blast_pull',
							  -file   => test_input_file('blastp2215.blast'));

$result = $searchio->next_result;
is($result->database_entries, 4460989);
is($result->database_letters, 1533424333);
is($result->algorithm, 'BLASTP');
is($result->algorithm_version, '2.2.15 [Oct-15-2006]');
is($result->query_name, 'gi|15608519|ref|NP_215895.1|');
is($result->query_length, 193);
my @hits = $result->hits;
is(scalar(@hits), 10);
is($hits[1]->accession,'1W30');
float_is($hits[4]->significance,'2e-72');
is($hits[7]->score,'254');
$result = $searchio->next_result;
is($result->database_entries, 4460989);
is($result->database_letters, 1533424333);
is($result->algorithm, 'BLASTP');
is($result->algorithm_version, '2.2.15 [Oct-15-2006]');
is($result->query_name, 'gi|15595598|ref|NP_249092.1|');
is($result->query_length, 423);
@hits = $result->hits;
is(scalar(@hits), 10);
is($hits[1]->accession,'ZP_00972546');
float_is($hits[4]->significance, '0.0');
is($hits[7]->score, 624);

# Bug 2246
$searchio = Bio::SearchIO->new(-format => 'blast_pull',
                               -verbose => -1,
							  -file   => test_input_file('bug2246.blast'));
$result = $searchio->next_result;
$hit = $result->next_hit;
is $hit->name, 'UniRef50_Q9X0H5';
is $hit->length, undef;
is $hit->accession, 'UniRef50_Q9X0H5';
is $hit->description, 'Cluster: Histidyl-tRNA synthetase; n=4; Thermoto...';
is $hit->raw_score, 23;
float_is($hit->significance, 650);

#*** need to /fully/ test a multi-result, multi-hit, multi-hsp report!