#!/usr/bin/perl
require "bptest.pl";
$skiptests = 0; # to skip early tests during development
$timing = 0; # to use the same seed for timing tests
#
# This test covers the 8859-1, apple, TeX, troff, and HTML
# character sets. It also tests the canon<->unicode routines
# from bp-p-utils.pl.
#
# It does not test the 'none' and 'auto' character sets. The
# auto character set doesn't exist, so it tests all the charsets
# that are currently in bp.
#
# For each charset, we run through an iso string, which contains
# the 8-bit characters 0 through 255. Before processing, certain
# characters known to not be supported are removed (generally
# 200-237 octal, but sometimes others). This is to prevent
# excessive warnings and make the tests easier to perform. We
# run the fromcanon code, then the tocanon code, and check to
# see that the outputs match.
#
# In addition, we do the same thing for a long string of random
# 8-bit characters, with the hope that this will test combinations
# of characters we may not have thought of.
#
# Some formats also test a few special strings that we want to
# test. These usually are selected by looking at the source
# code for the converter, and checking any seemingly weak points.
#
# Lastly, we run these strings through a loop:
# 88591->can->troff->can->html->can->88591
# and check that they match. After removing some characters, we try
# 88591->can->troff->can->apple->can->html->can->88591
#
# Generate a string with each 8bit character.
$isostring = '';
for (0..255) {
$isostring .= pack("C", $_);
}
# Generate a random string of 8bit characters. This should
# be a long string so we test lots of things.
# We do this because although testing the 8bit characters in order
# should find any problems, certain character combinations can
# cause problems if the routine isn't written properly. This may
# stumble across a combination I didn't think of.
# We must print out the seed if there is an error, so that we can
# duplicate the string that caused the error.
#
# Argh! Perl4 coredumps with srand, and Perl5 coredumps with MSRNG.
# I don't know why they crash. Perl5.002 seems to work with MSRNG.
#
$randstring = '';
$randstringlength = 16385; # should be at least 8192
$seed = undef;
$seed = 42 if $timing;
if ( ($] >= 5) && ($] < 5.002) ) {
srand($seed) unless defined $seed;
$n = 0;
while ($n++ < $randstringlength) {
$randstring .= pack("C", int(rand(256)) );
}
} else {
if (defined $seed) {
$seed = &msrand($seed);
} else {
$seed = &msrand;
}
$n = 0;
while ($n++ < $randstringlength) {
$randstring .= pack("C", int(&mrand(256)) );
}
}
$randstring = $randstring x 5 if $timing;
$cantest = "${bib'cs_ext}0026";
# The character conversion routines assume that the strings will
# already be escaped. They also do not do any unescaping (except
# for cs_ext and cs_meta, which they are supposed to handle).
$isostring =~ s/$bib'cs_escape/$bib'cs_char_escape/go;
$randstring =~ s/$bib'cs_escape/$bib'cs_char_escape/go;
print "Testing character set routines, seed=$seed.\n";
if (!$skiptests) {
&begintest("bib'unicode", 12);
&check('', "bib'canon_to_unicode", '006C', 'l');
&check('', "bib'canon_to_unicode", '00C4', "\304");
&check('', "bib'canon_to_unicode", 'A1C6', "${bib'cs_ext}A1c6");
&check('', "bib'canon_to_unicode", '001C', $bib'cs_char_escape);
&check('', "bib'unicode_to_canon", '5', '0035');
&check('', "bib'unicode_to_canon", "\xE9", '00E9');
&check('', "bib'unicode_to_canon", "${bib'cs_ext}CF8A", 'CF8A');
&check('', "bib'decimal_to_unicode", '0023', 35);
&check('', "bib'decimal_to_unicode", '359B', 13723);
&check('', "bib'unicode_to_decimal", 35, '0023');
&check('', "bib'unicode_to_decimal", 13723, '359b');
$f = $failed;
for (0..512) {
$can = &bib'unicode_to_canon(&bib'decimal_to_unicode($_));
$val = &bib'unicode_to_decimal(&bib'canon_to_unicode($can));
&check('nostatus,norun',"unicode loop", $_, $val);
}
$can = $val = undef;
&check('partial', "unicode loop", $f, $failed);
&endtest;
# Test ISO-8859-1
&testcharset("8859-1", 5);
$caniso = $isostring;
&check('', "bp_cs_88591'tocanon", $caniso, $isostring);
&check('', "bp_cs_88591'fromcanon", $isostring, $caniso);
$canran = $randstring;
&check('', "bp_cs_88591'tocanon", $canran, $randstring);
&check('', "bp_cs_88591'fromcanon", $randstring, $canran);
&check('', "bp_cs_88591'fromcanon", '&', $cantest);
&endtest;
# Test Apple
&bib'errors('print');
&testcharset("apple", 5);
# get rid of characters we don't handle.
$c = $caniso;
$c =~ s/[\200-\237\246\255\262\263\271\274\275\276\320\327\335\336\360\375\376]//g;
$appiso = &bp_cs_apple'fromcanon($c);
&check('partial', "apple'fromcanon", 1, 1);
&check('', "bp_cs_apple'tocanon", $c, $appiso);
$c = $appiso = undef;
$c = $canran;
$c =~ s/[\200-\237\246\255\262\263\271\274\275\276\320\327\335\336\360\375\376]//g;
$appran = &bp_cs_apple'fromcanon($c);
&check('partial', "apple'fromcanon", 1, 1);
&check('', "bp_cs_apple'tocanon", $c, $appran);
$c = $appran = undef;
&check('', "bp_cs_apple'fromcanon", '&', $cantest);
&endtest;
# Test troff
&testcharset("troff", 5);
($c = $caniso) =~ s/[\200-\237]//g; # troff can't handle these
$troiso = &bp_cs_troff'fromcanon($c);
&check('partial', "troff'fromcanon", 1, 1);
&check('', "bp_cs_troff'tocanon", $c, $troiso);
$c = $troiso = undef;
($c = $canran) =~ s/[\200-\237]//g; # troff can't handle these
$troran = &bp_cs_troff'fromcanon($c);
&check('partial', "troff'fromcanon", 1, 1);
&check('', "bp_cs_troff'tocanon", $c, $troran);
$c = $troran = undef;
&check('', "bp_cs_troff'fromcanon", '&', $cantest);
&endtest;
# Test TeX
&testcharset("tex", 3);
($c = $caniso) =~ s/[\200-\337]//g; # TeX can't handle these
$texiso = &bp_cs_tex'fromcanon($c, 1);
&check('partial', "tex'fromcanon", 1, 1);
&check('', "bp_cs_tex'tocanon", $c, $texiso, 1);
$c = $texiso = undef;
# XXXXX Fix me! Something in here is broken.
#($c = $canran) =~ s/[\200-\237]//g;
#$texran = &bp_cs_tex'fromcanon($c, 1);
#&check('partial', "tex'fromcanon", 1, 1);
#&check('', "bp_cs_tex'tocanon", $c, $texran, 1);
#$c = $texran = undef;
&check('', "bp_cs_tex'fromcanon", '&', $cantest);
&endtest;
# Test HTML
&testcharset("html", 7);
&check('', "bp_cs_html'fromcanon", '&', $cantest);
&check('', "bp_cs_html'tocanon", pack("C", 199), 'Ç' );
# This should generate a warning
$oldwlev = $bib'glb_warn_level;
$bib'glb_warn_level = 0;
&check('', "bp_cs_html'tocanon", '', '&fo??o*+?/g;' );
$bib'glb_warn_level = $oldwlev;
$c = $caniso;
$htmiso = &bp_cs_html'fromcanon($c);
&check('partial', "html'fromcanon", 1, 1);
$t = &bp_cs_html'tocanon($htmiso);
$t =~ s/${bib'cs_ext}0026/&/g; # html leaves & characters in extended form.
&check('norun', "bp_cs_html'tocanon", $c, $t);
$c = $htmiso = $t = undef;
$c = $canran;
$htmran = &bp_cs_html'fromcanon($c);
&check('partial', "html'fromcanon", 1, 1);
$t = &bp_cs_html'tocanon($htmran);
$t =~ s/${bib'cs_ext}0026/&/g; # html leaves & characters in extended form.
&check('norun', "bp_cs_html'tocanon", $c, $t);
$c = $htmran = $t = undef;
&endtest;
} else { # end of skipped tests
&bib'load_charset('8859-1');
&bib'load_charset('apple');
&bib'load_charset('troff');
&bib'load_charset('html');
}
# Test troff / HTML / 8859-1 loop
&begintest("cs conversion loops", 5);
$f = $failed;
for $iso ( "\000" .. "\377" ) {
next if $iso =~ /[\200-\237]/;
$can = &bp_cs_88591'tocanon($iso);
$imd = &bp_cs_troff'fromcanon($can);
$can = &bp_cs_troff'tocanon($imd);
$imd = &bp_cs_html'fromcanon($can);
$can = &bp_cs_html'tocanon($imd);
$isr = &bp_cs_88591'fromcanon($can);
&check('nostatus,norun', "cs conversion loop ".ord($iso), $iso, $isr);
}
$can = $imd = $isr = undef;
&check('partial', "cs conversion loop", $f, $failed);
$can = &bp_cs_88591'tocanon($isostring);
$imd = &bp_cs_html'fromcanon($can);
$can = &bp_cs_html'tocanon($imd);
$isr = &bp_cs_88591'fromcanon($can);
&check('norun', "iso conversion loop 1", $isostring, $isr);
$can = $imd = $isr = undef;
($iso = $isostring) =~ s/[\200-\237]//g;
$can = &bp_cs_88591'tocanon($iso);
$imd = &bp_cs_troff'fromcanon($can);
$can = &bp_cs_troff'tocanon($imd);
$imd = &bp_cs_html'fromcanon($can);
$can = &bp_cs_html'tocanon($imd);
$isr = &bp_cs_88591'fromcanon($can);
&check('norun', "iso conversion loop 2", $iso, $isr);
$can = $imd = $isr = undef; # leave $iso
$iso =~ s/[\200-\237\246\255\262\263\271\274\275\276\320\327\335\336\360\375\376]//g;
$can = &bp_cs_88591'tocanon($iso);
$imd = &bp_cs_troff'fromcanon($can);
$can = &bp_cs_troff'tocanon($imd);
$imd = &bp_cs_apple'fromcanon($can);
$can = &bp_cs_apple'tocanon($imd);
$imd = &bp_cs_html'fromcanon($can);
$can = &bp_cs_html'tocanon($imd);
$isr = &bp_cs_88591'fromcanon($can);
&check('norun', "iso conversion loop 3", $iso, $isr);
$can = $imd = $isr = $iso = undef;
$can = &bp_cs_88591'tocanon($randstring);
$imd = &bp_cs_html'fromcanon($can);
$can = &bp_cs_html'tocanon($imd);
$rnr = &bp_cs_88591'fromcanon($can);
&check('norun', "random conversion loop 1", $randstring, $rnr);
$can = $imd = $rnr = undef;
&endtest;
&bib'load_charset("html");
# We need an options routine for charsets...
$bp_cs_html'opt_html3 = 1;
&begintest("HTML 3", 4);
&check('', "bp_cs_html'tocanon", pack("C", 222), 'Þ' );
&check('', "bp_cs_html'tocanon", pack("C", 165), '¥' );
&check('', "bp_cs_html'tocanon", "${bib'cs_ext}AB7F", '&U+AB7F;' );
&check('', "bp_cs_html'fromcanon", '&U+AB7F;', "${bib'cs_ext}AB7F" );
&endtest;