The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#-*-perl-*-
#$Id$
#testing SoapEUtilities with network queries
# idea: reproduce the examples at
# http://www.bioperl.org/wiki/HOWTO:EUtilities_Cookbook

use strict;
use warnings;
our $home;
BEGIN {
    use Bio::Root::Test;
    use lib '.';
    $home = '.'; # set to '.' for Build use,
                      # '..' for debugging from .t file
    unshift @INC, $home;
    test_begin(-tests => 119,
               -requires_modules => [qw(Bio::DB::ESoap
                                        Bio::DB::ESoap::WSDL
                                        Bio::DB::SoapEUtilities
                                        Bio::DB::SoapEUtilities::Result
                                        Bio::DB::SoapEUtilities::FetchAdaptor
                                        Bio::DB::SoapEUtilities::LinkAdaptor
                                        Bio::DB::SoapEUtilities::DocSumAdaptor
                                        SOAP::Lite
                                        XML::Twig
                                        )],
           -requires_email      => 1);
}

my $dep_msg = "NCBI SOAP EUtilities API no longer supported as of 2015, modules are deprecated and will be removed in a future release";

diag($dep_msg);

SKIP: {
    skip($dep_msg, 119);

    my $email = test_email();

    diag("Using $email for tests") if test_debug();

    my ($fac, $result, $seqio, $seq, $i);
    my @prot_ids = qw(1621261 89318838 68536103 20807972 730439);
    my @prot_ids_2 = qw(828392 790 470338);
    my @accns = qw(CAB02640 EAS10332 YP_250808 NP_623143 P41007);
    my $lg_contig = 27479347;
    my $sciname_id = 527031;
    my @gene_ids = qw(828392 790 470338);
    my $nbr_test_id = 1621261;
    my @linkout_test_ids = qw(28864546 53828898 14523048 14336674 1817575);

    SKIP : {
        test_skip(-tests => 119,
                  -requires_networking => 1);
    ok $fac = Bio::DB::SoapEUtilities->new(-verbose => test_debug), "SoapEUtilities factory";

    diag("Retrieve raw data records from GenBank, save raw data to file, then parse via Bio::SeqIO");

    ok $result = $fac->efetch(-email =>$email, -db => 'protein', -id => \@prot_ids_2 )->run(-no_parse=>1), "run efetch, no parse methods";

    ok $seqio = Bio::DB::SoapEUtilities::FetchAdaptor->new(-result=>$result), "create adaptor";
    for ($i=0; $seq = $seqio->next_seq; $i++) {
         isa_ok($seq, 'Bio::Seq::RichSeq');
         ok $seq->id, "primary_id present";
    1;}
    is $i, 3, "iterated all seq objs";
    ok $result = $fac->efetch( -rettype => 'fasta' )->run(-no_parse=>1), "run efetch, fasta return, no parse methods";
    ok $seqio = Bio::DB::SoapEUtilities::FetchAdaptor->new(-result=>$result), "create adaptor";
    for ($i=0; $seq = $seqio->next_seq; $i++) {
         isa_ok($seq, 'Bio::Seq');
         ok $seq->id, "primary_id present";
    1;}

    diag("Get accessions (actually accession.versions) for a list of GenBank IDs (GIs)");
    # SOAP server doesn't seem to like 'acc' as a rettype, get in fasta format
    # and use the result accessors
    ok $fac->efetch->set_parameters(-db=>'protein',
                                    -id=>\@prot_ids,
                                    -rettype => 'fasta' ), "set rettype = fasta";
    ok $result = $fac->efetch->run, "run query with methods parsing";
    is scalar @{$result->TSeqSet_TSeq_TSeqAccver}, 5, "retrieved all accns via \$result->TSeqSet_TSeq_TSeqAccver";
    diag("Get GIs for a list of accessions");
    ok $fac->efetch->set_parameters( -id=>\@accns ), "set -id to accn list";
    ok $result = $fac->efetch->run, "run query with methods parsing";
    is scalar @{$result->TSeqSet_TSeq_TSeqGi}, 5, "retrieved all GIs via \$result->TSeqSet_TSeq_TSeqGi";

    diag("Downloading a large contig");
    ok $fac->efetch->reset_parameters( -db => 'nucleotide',
                                       -email =>$email,
                                       -id => $lg_contig), "set parms for lg contig example";
    ok $seqio = $fac->efetch->run( -auto_adapt => 1 ), "run with auto_adapt";

    ok $seq = $seqio->next_seq, "iterate adaptor";
    ok !$seq->seq, "no sequence present (contig only)";
    TODO: {
        local $TODO = "why not work with gbwithparts?";
        ok $fac->efetch->set_parameters( -rettype=>'gbwithparts' ), "rettype = gbwithparts";
        dies_ok { $seqio = $fac->efetch->run( -auto_adapt => 1 ) };
        diag("run with auto_adapt (dies not really ok...)");

        ok $seq = $seqio->next_seq, "iterate adaptor";
        if ($seq) { ok $seq->seq, "sequence now present"; } else {ok 1;}
    }

    diag("Get the scientific name for an organism");
    ok $fac->efetch->reset_parameters( -email =>$email,
                                      -db=>'taxonomy',
                                      -id=>[$sciname_id, $sciname_id+1] ), "set params for sciname test";
    ok my $spio = $fac->efetch->run(-auto_adapt => 1), "run with autoadapt";
    ok my $sp = $spio->next_species;
    like $sp->scientific_name, qr/Bacillus thuringiensis/, "sciname";
    is ($sciname_id, $sp->ncbi_taxid, "taxid retrieved and correct");

    ok $fac->esummary( -email =>$email, -db => 'taxonomy', -id => $sciname_id ), "set esummary parms";
    ok my $docs = $fac->run(-auto_adapt=>1), "run with autoadapt";
    ok my $doc = $docs->next_docsum, "iterate adaptor";
    like $doc->ScientificName, qr/Bacillus thuringiensis/, "sciname";
    is ($sciname_id, $doc->TaxId, "taxid retrieved and correct");

    diag("Simple database query");
    ok $fac->esearch( -email =>$email, -db=>'protein', -term=> 'BRCA and human', -usehistory=>1 );
    ok $result = $fac->run, "run with method parsing";
    is $result->QueryTranslation,
       'BRCA[All Fields] AND ("Homo sapiens"[Organism] OR human[All Fields])',
    "query translation";
    cmp_ok $result->count, ">=", 73, "result count";
    ok $fac->esearch->reset_parameters(-email =>$email,
                                       -WebEnv => $result->webenv,
                                       -QueryKey => $result->query_key,
                                       -RetMax => 100 );
    ok my $wresult = $fac->esearch->run, "run web environment query with retmax set";
    cmp_ok $wresult->count, ">=", 73, "all ids retrieved";

    diag("What databases are available for querying via eUtils?");
    ok $result = $fac->einfo(-email =>$email)->run, "run einfo general query";
    cmp_ok scalar(@{$result->dbs}), ">=", 42, "bunch o' dbs";

    diag("What information is available for database 'x'?");
    ok $result = $fac->einfo(-email =>$email,-db=>'pubmed')->run, "run pubmed info";
    is $result->db, 'pubmed', "dbname";
    cmp_ok $result->record_count, ">=", 19000000, "record count";

    diag("How do I run a global query against all Entrez databases?");

    ok my $queries = $fac->egquery(-email =>$email, -term => 'BRCA and human' )->run(-auto_adapt=>1), "run fac for eGQuery, autoadapt";

    ok my $query = $queries->query_by_db('protein'), "query by db (protein)";
    is $query->term, 'BRCA and human', "term accessor correct";
    ok !$queries->query_by_db('bllog'), "no query by db (bllog)";
    for ( $i=0; my $q = $queries->next_query; $i++ ) {
        ok $q->db, "db (query $i)";
    }
    cmp_ok scalar($queries->found_in_dbs), ">=", 11, "found in enuf dbs";
    cmp_ok scalar($queries->found_in_dbs), "<=", $i, "but not in too many";

    diag("I want the document summaries for a list of IDs from database 'x'.");
    ok  $docs = $fac->esummary(-email =>$email, -db=>'gene', -id=>\@prot_ids_2 )->run(-auto_adapt=>1), "run esummary, autoadapt";
    my @tax = (3702, 9606, 9598);
    my @acc = qw( NC_003075.7 NC_000002.11 NC_006469.3 );
    for ($i=0; my $doc = $docs->next_docsum; $i++) {
        is $doc->id, $prot_ids_2[$i], " id retrieved";
        is $doc->TaxID, $tax[$i], " taxid retrieved correctly";
        is $doc->GenomicInfo->{ChrAccVer}, $acc[$i], "chrom acc/ver retrieved correctly";
        cmp_ok scalar ($doc->item_names), ">", 5, "bunch o' items";
    }
    is $i, 3, "got all docsums";

    diag("I want a list of database 'x' UIDs that are linked from a list of database 'y' UIDs");

    ok my $links =$fac->elink(-email =>$email,
                              -db => 'nucleotide',
                              -dbfrom => 'protein',
                              -id => \@prot_ids )->run( -auto_adapt => 1),
    "run elink protein->nucleotide, auto adapt";
    my %h;
    for ($i=0; my $linkset = $links->next_linkset; $i++) {
        my @ids = $linkset->ids;
        my @submitted_ids = $linkset->submitted_ids;
        $h{$submitted_ids[0]} = @ids == 1 ? $ids[0] : [@ids];
    }
    for (keys %h) {
        is_deeply( $links->id_map($_) || [], $h{$_}, "id_map correct ($_)");
        1;
    }
    is $i, 5, "got all linksets";

    1;

    }
}

# remove later
#sub test_input_file { "data/".shift };