# -*-Perl-*- Test Harness script for Bioperl
# $Id$
use strict;
BEGIN {
use lib '.';
use Bio::Root::Test;
test_begin( -tests => 139 );
use_ok('Bio::SeqIO::fastq');
use_ok('Bio::Seq::Quality');
}
my $DEBUG = test_debug();
# simple parsing, data conversion of fastq example files
my %example_files = (
bug2335 => {
'variant' => 'sanger',
'seq' => 'TTGGAATGTTGCAAATGGGAGGCAGTTTGAAATACTGAATAGGCCTCATC'.
'GAGAATGTGAAGTTTCAGTAAAGACTTGAGGAAGTTGAATGAGCTGATGA'.
'ATGGATATATG',
'qual' => '31 23 32 23 31 22 27 28 32 24 25 23 30 25 2 21 33 '.
'29 9 17 33 27 27 27 25 33 29 9 28 32 27 7 27 21 '.
'26 21 27 27 17 26 23 31 23 32 24 27 27 28 27 28 '.
'28 27 27 31 23 23 28 27 27 32 23 27 35 30 12 28 '.
'27 27 25 33 29 10 27 28 28 33 25 27 27 31 23 34 '.
'27 27 32 24 27 30 22 24 28 24 27 28 27 26 28 27 '.
'28 32 24 28 33 25 23 27 27 28 27 28 26',
'display_id' => 'DS6BPQV01A2G0A',
'desc' => undef,
'count' => 1
},
test1_sanger => {
'variant' => 'sanger',
'seq' => 'TATTGACAATTGTAAGACCACTAAGGATTTTTGGGCGGCAGCGACTTGGA'.
'GCTCTTGTAAAAGCGCACTGCGTTCCTTTTCTTTATTCTTTTGATCTTGA'.
'GAATCTTCTAAAAATGCCGAAAAGAAATGTTGGGAAGAGAGCGTAATCAG'.
'TTTAGAAATGCTCTTGATGGTAGCTTTATGTTGATCCATTCTTCTGCCTC'.
'CTTTACGAATAAAATAGAATAAAACTCAAATGACTAATTACCTGTATTTT'.
'ACCTAATTTTGTGATAAAATTCAAGAAAATATGTTCGCCTTCAATAATTA'.
'TG',
'qual' => '37 37 37 37 37 37 37 37 37 37 37 40 38 40 40 37 '.
'37 37 39 39 40 39 39 39 39 39 37 33 33 33 33 33 '.
'39 39 34 29 28 28 38 39 39 39 39 39 39 37 37 37 '.
'37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '.
'37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 38 '.
'38 29 29 29 34 38 37 37 33 33 33 33 37 37 37 37 '.
'37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '.
'37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '.
'37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '.
'37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '.
'37 37 37 37 37 37 37 37 37 37 37 37 37 37 38 38 '.
'38 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '.
'37 37 37 37 37 37 37 37 37 34 34 34 38 38 37 37 '.
'37 37 37 37 37 37 37 37 40 40 40 40 38 38 38 38 '.
'40 40 40 38 38 38 40 40 40 40 40 40 40 40 40 40 '.
'38 38 38 38 38 32 25 25 25 25 30 30 31 32 32 31 '.
'31 31 31 31 31 31 31 31 19 19 19 19 19 22 22 31 '.
'31 31 31 31 31 31 31 32 32 31 32 31 31 31 31 31 '.
'31 25 25 25 28 28 30 30 30 30 30 31 31 32',
'display_id' => 'SRR005406.250',
'desc' => 'FB9GE3J10F6I2T length=302',
'count' => 250
},
test2_solexa => {
'variant' => 'solexa',
'seq' => 'GTATTATTTAATGGCATACACTCAA',
'qual' => '25 25 25 25 25 25 25 25 25 25 23 25 25 25 25 23 '.
'25 23 23 21 23 23 23 17 17',
'display_id' => 'SLXA-B3_649_FC8437_R1_1_1_183_714',
'desc' => undef,
'count' => 5
},
test3_illumina => {
'variant' => 'illumina',
'seq' => 'CCAAATCTTGAATTGTAGCTCCCCT',
'qual' => '15 19 24 15 17 24 24 24 24 24 19 24 24 21 24 24 '.
'20 24 24 24 24 20 18 13 19',
'display_id' => 'FC12044_91407_8_200_285_136',
'desc' => undef,
'count' => 25
},
example => {
'variant' => 'sanger', # TODO: guessing on the format here...
'seq' => 'GTTGCTTCTGGCGTGGGTGGGGGGG',
'qual' => '26 26 26 26 26 26 26 26 26 26 26 24 26 22 26 26 '.
'13 22 26 18 24 18 18 18 18',
'display_id' => 'EAS54_6_R1_2_1_443_348',
'desc' => undef,
'count' => 3
},
illumina_faked => {
'variant' => 'illumina',
'seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN',
'qual' => '40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 '.
'21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0',
'display_id' => 'Test',
'desc' => 'PHRED qualities from 40 to 0 inclusive',
'count' => 1
},
sanger_93 => {
'variant' => 'sanger',
'seq' => 'ACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGAC'.
'TGACTGACTGACTGACTGACTGACTGACTGACTGAN',
'qual' => '93 92 91 90 89 88 87 86 85 84 83 82 81 80 79 78 77 76 75 '.
'74 73 72 71 70 69 68 67 66 65 64 63 62 61 60 59 58 57 56 '.
'55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 '.
'36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 '.
'17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0',
'display_id' => 'Test',
'desc' => 'PHRED qualities from 93 to 0 inclusive',
'count' => 1
},
sanger_faked => {
'variant' => 'sanger',
'seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN',
'qual' => '40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 '.
'21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0',
'display_id' => 'Test',
'desc' => 'PHRED qualities from 40 to 0 inclusive',
'count' => 1
},
solexa_example => {
'variant' => 'solexa',
'seq' => 'GTATTATTTAATGGCATACACTCAA',
'qual' => '25 25 25 25 25 25 25 25 25 25 23 25 25 25 25 23 '.
'25 23 23 21 23 23 23 17 17',
'display_id' => 'SLXA-B3_649_FC8437_R1_1_1_183_714',
'desc' => undef,
'count' => 5
},
solexa_faked => {
'variant' => 'solexa',
'seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN',
'qual' => '40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 '.
'24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 10 9 '.
'8 7 6 5 5 4 4 3 3 2 2 1 1',
'display_id' => 'slxa_0001_1_0001_01',
'desc' => undef,
'count' => 1
},
tricky => {
'variant' => 'sanger', # TODO: guessing on the format here...
'seq' => 'TGGGAGGTTTTATGTGGAAAGCAGCAATGTACAAGA',
'qual' => '40 40 40 40 40 40 40 13 40 40 40 40 40 40 16 31 '.
'19 19 31 12 22 13 4 27 5 10 14 3 14 4 19 7 10 10 '.
'7 4',
'display_id' => '071113_EAS56_0053:1:3:990:501',
'desc' => undef,
'count' => 4
},
evil_wrapping => {
'variant' => 'sanger', # TODO: guessing on the format here...
'seq' => 'AACCCGTCCCATCAAAGATTTTGGTTGGAACCCGAAAGGGTTTTGAATTC'.
'AAACCCCTTTCGGTTCCAACTATTCAATTGTTTAACTTTTTTTAAATTGA'.
'TGGTCTGTTGGACCATTTGTAATAATCCCCATCGGAATTTCTTT',
'qual' => '32 26 31 26 4 22 20 30 25 2 27 27 24 36 32 16 '.
'26 28 36 32 18 4 33 26 33 26 32 26 33 26 31 26 '.
'4 24 36 32 16 36 32 16 36 32 18 4 27 33 26 32 26 '.
'23 36 32 15 35 31 18 3 36 32 16 28 33 26 32 26 33 '.
'26 33 26 25 28 25 33 26 25 33 25 32 24 25 36 32 '.
'15 32 24 27 37 32 23 16 10 5 1 35 30 12 33 26 19 '.
'27 25 25 14 27 26 28 25 32 24 23 12 20 30 21 28 '.
'34 29 10 23 27 27 18 26 28 19 25 35 32 18 4 27 26 '.
'28 23 12 24 13 32 28 8 25 33 28 9',
'display_id' => 'SRR014849.203935',
'desc' => 'EIXKN4201B4HU6 length=144',
'count' => 3
},
);
for my $example (sort keys %example_files) {
my $file = test_input_file('fastq', "$example.fastq");
my $variant = $example_files{$example}->{variant};
my $in = Bio::SeqIO->new(-format => "fastq-$variant",
-file => $file,
-verbose => 2); #strictest level
my $ct = 0;
my $sample_seq;
eval {
while (my $seq = $in->next_seq) {
$ct++;
$sample_seq = $seq; # always grab the last seq
}
};
ok(!$@, "$example parses");
is($ct, $example_files{$example}->{count}, "correct num. seqs in $example");
ok(defined($sample_seq), 'sample sequence obtained');
if ($sample_seq) {
isa_ok($sample_seq, 'Bio::Seq::Quality');
for my $method (qw(seq desc display_id)) {
is($sample_seq->$method,
$example_files{$example}->{$method},
"$method() matches $example");
}
is(join(' ', map {sprintf("%.0f", $_)} @{$sample_seq->qual}),
$example_files{$example}->{qual},
"qual() matches $example");
my $truncated = $sample_seq->trunc(1,10);
is(scalar(@{$truncated->meta}), $truncated->length);
}
}
# test round-trip and conversions (single file of each type)
my @variants = qw(sanger illumina solexa);
my %conversion = ( # check conversions, particularly solexa
sanger_93 => {
'variant' => 'sanger',
'to_solexa' => {
'-seq' => 'ACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGAN',
'-qual' => [ (map {62} 0..31),(reverse(1..61)),1 ],
'-raw_quality' => '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJHGFECB@>;;',
'-id' => 'Test',
'-desc' => 'PHRED qualities from 93 to 0 inclusive',
'-descriptor' => 'Test PHRED qualities from 93 to 0 inclusive'
},
'to_illumina' => {
'-seq' => 'ACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGAN',
'-qual' => [ (map {62} 0..31),(reverse(0..61)) ],
'-raw_quality' => '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@',
'-id' => 'Test',
'-desc' => 'PHRED qualities from 93 to 0 inclusive',
'-descriptor' => 'Test PHRED qualities from 93 to 0 inclusive'
},
'to_sanger' => {
'-seq' => 'ACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGAN',
'-qual' => [reverse(0..93)],
'-raw_quality' => '~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:9876543210/.-,+*)(\'&%$#"!',
'-id' => 'Test',
'-desc' => 'PHRED qualities from 93 to 0 inclusive',
'-descriptor' => 'Test PHRED qualities from 93 to 0 inclusive'
},
},
solexa_faked => {
'variant' => 'solexa',
'to_solexa' => {'-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN',
'-qual' => [qw(40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 10 9 8 7 6 5 5 4 4 3 3 2 2 1 1)],
'-raw_quality' => 'hgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;',
'-id' => 'slxa_0001_1_0001_01',
'-desc' => '',
'-descriptor' => 'slxa_0001_1_0001_01'
},
'to_illumina' => {
'-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN',
'-qual' => [qw(40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 10 9 8 7 6 5 5 4 4 3 3 2 2 1 1)],
'-namespace' => 'solexa',
'-raw_quality' => 'hgfedcba`_^]\\[ZYXWVUTSRQPONMLKJJIHGFEEDDCCBBAA',
'-id' => 'slxa_0001_1_0001_01',
'-desc' => '',
'-descriptor' => 'slxa_0001_1_0001_01'
},
'to_sanger' => {
'-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN',
'-qual' => [qw(40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 10 9 8 7 6 5 5 4 4 3 3 2 2 1 1)],
'-namespace' => 'solexa',
'-raw_quality' => 'IHGFEDCBA@?>=<;:9876543210/.-,++*)(\'&&%%$$##""',
'-id' => 'slxa_0001_1_0001_01',
'-desc' => '',
'-descriptor' => 'slxa_0001_1_0001_01'
},
},
illumina_faked => {
'variant' => 'illumina',
'to_solexa' => {
'-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN',
'-qual' => [reverse(1..40), 1], # round trip from solexa is lossy
'-namespace' => 'illumina',
'-raw_quality' => 'hgfedcba`_^]\[ZYXWVUTSRQPONMLKJHGFECB@>;;',
'-id' => 'Test',
'-desc' => 'PHRED qualities from 40 to 0 inclusive',
'-descriptor' => 'Test PHRED qualities from 40 to 0 inclusive'
},
'to_illumina' => {
'-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN',
'-qual' => [reverse(0..40)],
'-raw_quality' => 'hgfedcba`_^]\\[ZYXWVUTSRQPONMLKJIHGFEDCBA@',
'-id' => 'Test',
'-desc' => 'PHRED qualities from 40 to 0 inclusive',
'-descriptor' => 'Test PHRED qualities from 40 to 0 inclusive'
},
'to_sanger' => {
'-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN',
'-qual' => [reverse(0..40)],
'-raw_quality' => 'IHGFEDCBA@?>=<;:9876543210/.-,+*)(\'&%$#"!',
'-id' => 'Test',
'-desc' => 'PHRED qualities from 40 to 0 inclusive',
'-descriptor' => 'Test PHRED qualities from 40 to 0 inclusive'
}
},
);
for my $example (sort keys %conversion) {
my $file = test_input_file('fastq', "$example.fastq");
my $variant = $conversion{$example}->{variant};
my $in = Bio::SeqIO->new(-format => "fastq-$variant",
-file => $file,
-verbose => 2); #strictest level
# this both tests the next_dataset method and helps check roundtripping
my $seq = $in->next_seq;
for my $newvar (@variants) {
next unless exists $conversion{$example}->{"to_$newvar"};
my $outfile = test_output_file();
Bio::SeqIO->new(-format => "fastq-$newvar",
-file => ">$outfile",
-verbose => -1)->write_seq($seq);
my $newdata = Bio::SeqIO->new(-format => "fastq-$newvar",
-file => $outfile)->next_dataset;
# round for simple comparison, get around floating pt comparison probs
if ($newvar eq 'solexa') {
$newdata->{-qual} = [map {sprintf("%.0f",$_)} @{$newdata->{-qual}}];
}
#print Dumper($newdata) if $variant eq 'sanger' && $newvar eq 'illumina';
$conversion{$example}->{"to_$newvar"}->{'-namespace'} = $newvar;
is_deeply($newdata, $conversion{$example}->{"to_$newvar"}, "Conversion from $variant to $newvar");
}
}
# test fastq exception handling
my %error = (
# file name
error_diff_ids => {
variant => 'sanger',
exception => qr/doesn't\smatch\sseq\sdescriptor/xms,
},
error_long_qual => {
variant => 'sanger',
exception => qr/doesn't\smatch\slength\sof\ssequence/xms,
},
error_no_qual => {
variant => 'sanger',
exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms,
},
error_qual_del => {
variant => 'sanger',
exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
},
error_qual_escape => {
variant => 'sanger',
exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
},
error_qual_null => {
variant => 'sanger',
exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
},
error_qual_space => {
variant => 'sanger',
exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
},
error_qual_tab => {
variant => 'sanger',
exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
},
error_qual_unit_sep => {
variant => 'sanger',
exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
},
error_qual_vtab => {
variant => 'sanger',
exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
},
error_short_qual => {
variant => 'sanger',
exception => qr/doesn't\smatch\slength\sof\ssequence/,
},
error_spaces => {
variant => 'sanger',
exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
},
error_tabs => {
variant => 'sanger',
exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
},
error_trunc_at_plus => {
variant => 'sanger',
exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms,
},
error_trunc_at_qual => {
variant => 'sanger',
exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms,
},
error_trunc_at_seq => {
variant => 'sanger',
exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms,
},
error_trunc_in_title => {
variant => 'sanger',
exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms,
},
error_trunc_in_seq => {
variant => 'sanger',
exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms,
},
error_trunc_in_plus => {
variant => 'sanger',
exception => qr/doesn't\smatch\sseq\s descriptor/xms,
},
error_trunc_in_qual => {
variant => 'sanger',
exception => qr/doesn't\smatch\slength\sof\ssequence/xms,
},
);
for my $example (sort keys %error) {
my $file = test_input_file('fastq', "$example.fastq");
my $variant = $error{$example}->{variant};
my $in = Bio::SeqIO->new(-format => "fastq-$variant",
-file => $file,
-verbose => 2); #strictest level
my $ct = 0;
throws_ok { while (my $seq = $in->next_seq) {
$ct++;
} } $error{$example}->{exception}, "Exception caught for $example";
}