/* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "Lucy/Util/ToolSet.h"
#include "Lucy/Test.h"
#include "Lucy/Test/Util/TestStringHelper.h"
#include "Lucy/Test/TestUtils.h"
#include "Lucy/Util/StringHelper.h"
#include "utf8proc.h"
#include "Lucy/Util/Json.h"
/* This alternative implementation of utf8_valid() is (presumably) slower, but
 * it implements the standard in a more linear, easy-to-grok way.  The tests
 * in this file use it to cross-check StrHelp_utf8_valid().
 */

/* True if `n` is a UTF-8 continuation byte (0x80 through 0xBF). */
#define TRAIL_OK(n) ((n) >= 0x80 && (n) <= 0xBF)

/* Report whether the `size` bytes starting at `maybe_utf8` are well-formed
 * UTF-8.
 *
 * Each sequence is classified by the length that StrHelp_UTF8_COUNT reports
 * for its lead byte, and then checked against the byte ranges allowed for
 * that lead byte.  Only bytes inside [maybe_utf8, maybe_utf8 + size) are ever
 * read, so the buffer does not need to be NUL-terminated.
 *
 * @param maybe_utf8 Bytes to examine.
 * @param size Number of bytes to examine.
 * @return true if every sequence is well-formed and the last one ends
 * exactly at the end of the buffer, false otherwise.
 */
static bool_t
S_utf8_valid_alt(const char *maybe_utf8, size_t size) {
    const uint8_t *string    = (const uint8_t*)maybe_utf8;
    const uint8_t *const end = string + size;

    while (string < end) {
        const int count = StrHelp_UTF8_COUNT[*string];
        bool_t    valid = false;

        // A sequence that would extend past the end of the buffer is
        // truncated, hence invalid.  Reject it before looking at any
        // trailing byte so that nothing beyond `end` is read.
        if (count > end - string) {
            return false;
        }

        if (count == 1) {
            if (string[0] <= 0x7F) {
                valid = true;
            }
        }
        else if (count == 2) {
            // Lead bytes 0xC0 and 0xC1 are excluded: they could only start
            // non-shortest-form encodings.
            if (string[0] >= 0xC2 && string[0] <= 0xDF) {
                if (TRAIL_OK(string[1])) {
                    valid = true;
                }
            }
        }
        else if (count == 3) {
            if (string[0] == 0xE0) {
                // Second byte below 0xA0 would be a non-shortest form.
                if (string[1] >= 0xA0 && string[1] <= 0xBF
                    && TRAIL_OK(string[2])
                   ) {
                    valid = true;
                }
            }
            else if (string[0] >= 0xE1 && string[0] <= 0xEC) {
                if (TRAIL_OK(string[1])
                    && TRAIL_OK(string[2])
                   ) {
                    valid = true;
                }
            }
            else if (string[0] == 0xED) {
                // Second byte above 0x9F would encode a UTF-16 surrogate.
                if (string[1] >= 0x80 && string[1] <= 0x9F
                    && TRAIL_OK(string[2])
                   ) {
                    valid = true;
                }
            }
            else if (string[0] >= 0xEE && string[0] <= 0xEF) {
                if (TRAIL_OK(string[1])
                    && TRAIL_OK(string[2])
                   ) {
                    valid = true;
                }
            }
        }
        else if (count == 4) {
            if (string[0] == 0xF0) {
                // Second byte below 0x90 would be a non-shortest form.
                if (string[1] >= 0x90 && string[1] <= 0xBF
                    && TRAIL_OK(string[2])
                    && TRAIL_OK(string[3])
                   ) {
                    valid = true;
                }
            }
            else if (string[0] >= 0xF1 && string[0] <= 0xF3) {
                if (TRAIL_OK(string[1])
                    && TRAIL_OK(string[2])
                    && TRAIL_OK(string[3])
                   ) {
                    valid = true;
                }
            }
            else if (string[0] == 0xF4) {
                // Second byte above 0x8F would exceed U+10FFFF.
                if (string[1] >= 0x80 && string[1] <= 0x8F
                    && TRAIL_OK(string[2])
                    && TRAIL_OK(string[3])
                   ) {
                    valid = true;
                }
            }
        }

        // Any other count (or any lead byte outside the ranges above) leaves
        // `valid` false.
        if (!valid) {
            return false;
        }
        string += count;
    }

    // Thanks to the truncation check, the loop can only finish with
    // `string` exactly at `end`.
    return true;
}
/* Exercise StrHelp_overlap() with empty inputs, identical inputs, and inputs
 * where one string is a prefix of the other, checking the returned value
 * each time.
 */
static void
test_overlap(TestBatch *batch) {
    const int32_t both_empty = StrHelp_overlap("", "", 0, 0);
    TEST_INT_EQ(batch, both_empty, 0, "two empty strings");

    const int32_t first_empty = StrHelp_overlap("", "foo", 0, 3);
    TEST_INT_EQ(batch, first_empty, 0, "first string is empty");

    const int32_t second_empty = StrHelp_overlap("foo", "", 3, 0);
    TEST_INT_EQ(batch, second_empty, 0, "second string is empty");

    const int32_t identical = StrHelp_overlap("foo", "foo", 3, 3);
    TEST_INT_EQ(batch, identical, 3, "equal strings");

    const int32_t first_longer = StrHelp_overlap("foo bar", "foo", 7, 3);
    TEST_INT_EQ(batch, first_longer, 3, "first string is longer");

    const int32_t second_longer = StrHelp_overlap("foo", "foo bar", 3, 7);
    TEST_INT_EQ(batch, second_longer, 3, "second string is longer");
}
/* Check StrHelp_to_base36() output for the largest 64-bit value and for 1,
 * including that the one-digit result is terminated right after the digit.
 */
static void
test_to_base36(TestBatch *batch) {
    char encoded[StrHelp_MAX_BASE36_BYTES];

    StrHelp_to_base36(U64_MAX, encoded);
    TEST_STR_EQ(batch, "3w5e11264sgsf", encoded, "base36 U64_MAX");

    // The same buffer is reused, so the terminator check below also shows
    // that the shorter result is not followed by leftovers of the longer one.
    StrHelp_to_base36(1, encoded);
    TEST_STR_EQ(batch, "1", encoded, "base36 1");
    TEST_INT_EQ(batch, encoded[1], '\0', "base36 NULL termination");
}
/* Encode every code point from 0 through 0x10FFFF and check, for each one,
 * that the helpers agree with one another.  The walk stops at the first code
 * point for which any check fails; a single PASS or FAIL is recorded for the
 * whole range.
 */
static void
test_utf8_round_trip(TestBatch *batch) {
    uint32_t cp = 0;

    while (cp <= 0x10FFFF) {
        char encoded[4];
        const uint32_t len   = StrHelp_encode_utf8_char(cp, encoded);
        char *const    first = encoded;
        char *const    limit = first + len;

        // The checks are evaluated in order and stop at the first failure.
        const bool_t broken =
            // Length returned by encode_utf8_char() must match the length
            // implied by the lead byte.
            len != StrHelp_UTF8_COUNT[(unsigned char)encoded[0]]
            // utf8_valid() must agree with the alternate implementation.
            || (!!StrHelp_utf8_valid(first, len)
                != !!S_utf8_valid_alt(first, len))
            // back_utf8_char() must step from the end back to the start.
            || StrHelp_back_utf8_char(limit, first) != first
            // Decoding must give back the code point that was encoded.
            || StrHelp_decode_utf8_char(encoded) != cp;

        if (broken) {
            break;
        }
        cp++;
    }

    // The counter only reaches 0x110000 if no iteration bailed out early.
    if (cp != 0x110000) {
        FAIL(batch, "Failed round trip at 0x%.1X", (unsigned)cp);
    }
    else {
        PASS(batch, "Successfully round tripped 0 - 0x10FFFF");
    }
}
/* Run both UTF-8 validators over `content` and record one test result.
 *
 * If StrHelp_utf8_valid() and S_utf8_valid_alt() disagree, a failure is
 * recorded regardless of `expected`.  Otherwise the result is a pass exactly
 * when the shared verdict matches `expected`.
 *
 * @param batch TestBatch that receives the result.
 * @param content Bytes to validate.
 * @param size Number of bytes of `content` to validate.
 * @param expected Whether `content` is supposed to be valid UTF-8.
 * @param description Label for the recorded test result.
 */
static void
S_test_validity(TestBatch *batch, const char *content, size_t size,
                bool_t expected, const char *description) {
    // Collapse every truthy value to 1 before comparing, as the round-trip
    // test does, so that two "true" results of different magnitude are not
    // mistaken for a disagreement.
    const bool_t sane         = !!StrHelp_utf8_valid(content, size);
    const bool_t double_check = !!S_utf8_valid_alt(content, size);

    if (sane != double_check) {
        FAIL(batch, "Disagreement: %s", description);
    }
    else {
        TEST_TRUE(batch, sane == !!expected, "%s", description);
    }
}
/* Feed hand-picked byte sequences through S_test_validity(): one supplementary
 * character, surrogates encoded as UTF-8, non-shortest forms, an over-long
 * lead byte, and sequences with missing or stray continuation bytes.
 */
static void
test_utf8_valid(TestBatch *batch) {
    // Musical symbol G clef:
    // Code point: U+1D11E
    // UTF-16: 0xD834 0xDD1E
    // UTF-8 0xF0 0x9D 0x84 0x9E
    S_test_validity(batch, "\xF0\x9D\x84\x9E", 4, true,
                    "Musical symbol G clef");
    S_test_validity(batch, "\xED\xA0\xB4\xED\xB4\x9E", 6, false,
                    "G clef as UTF-8 encoded UTF-16 surrogates");
    S_test_validity(batch, ".\xED\xA0\xB4.", 5, false,
                    "Isolated high surrogate");
    S_test_validity(batch, ".\xED\xB4\x9E.", 5, false,
                    "Isolated low surrogate");

    // Shortest form.
    S_test_validity(batch, ".\xC1\x9C.", 4, false,
                    "Non-shortest form ASCII backslash");
    S_test_validity(batch, ".\xC0\xAF.", 4, false,
                    "Non-shortest form ASCII slash");
    S_test_validity(batch, ".\xC0\x80.", 4, false,
                    "Non-shortest form ASCII NUL character");

    // Range.
    S_test_validity(batch, "\xF8\x88\x80\x80\x80", 5, false, "5-byte UTF-8");

    // Bad continuations.  The well-formed baseline is two smileys
    // (0xE2 0x98 0xBA each); the cases after it drop bytes from one of them.
    S_test_validity(batch, "\xE2\x98\xBA\xE2\x98\xBA", 6, true,
                    "SmileySmiley");
    S_test_validity(batch, "\xE2\xBA\xE2\x98\xBA", 5, false,
                    "missing first continuation byte");
    S_test_validity(batch, "\xE2\x98\xE2\x98\xBA", 5, false,
                    "missing second continuation byte");
    S_test_validity(batch, "\xE2\xE2\x98\xBA", 4, false,
                    "missing both continuation bytes");

    // Same defects, but in the sequence that ends the buffer.  The fixture
    // must start with an intact smiley so that the damaged trailing sequence
    // is what gets rejected.
    S_test_validity(batch, "\xE2\x98\xBA\xE2\xBA", 5, false,
                    "missing first continuation byte (end)");
    S_test_validity(batch, "\xE2\x98\xBA\xE2\x98", 5, false,
                    "missing second continuation byte (end)");
    S_test_validity(batch, "\xE2\x98\xBA\xE2", 4, false,
                    "missing both continuation bytes (end)");

    // Continuation bytes with no lead byte.
    S_test_validity(batch, "\xBA\xE2\x98\xBA", 4, false,
                    "isolated continuation byte 0xBA");
    S_test_validity(batch, "\x98\xE2\x98\xBA", 4, false,
                    "isolated continuation byte 0x98");
    S_test_validity(batch, "\xE2\x98\xBA\xBA", 4, false,
                    "isolated continuation byte 0xBA (end)");
    S_test_validity(batch, "\xE2\x98\xBA\x98", 4, false,
                    "isolated continuation byte 0x98 (end)");
}
/* Check StrHelp_is_whitespace() against a handful of code points that must
 * be reported as whitespace, followed by a handful that must not.
 */
static void
test_is_whitespace(TestBatch *batch) {
    static const struct {
        uint32_t    code_point;
        const char *label;
    } blanks[] = {
        { ' ',    "space is whitespace" },
        { '\n',   "newline is whitespace" },
        { '\t',   "tab is whitespace" },
        { '\v',   "vertical tab is whitespace" },
        { 0x180E, "Mongolian vowel separator is whitespace" },
    }, non_blanks[] = {
        { 'a',    "'a' isn't whitespace" },
        { 0,      "NULL isn't whitespace" },
        { 0x263A, "Smiley isn't whitespace" },
    };

    for (size_t i = 0; i < sizeof(blanks) / sizeof(blanks[0]); i++) {
        TEST_TRUE(batch, StrHelp_is_whitespace(blanks[i].code_point),
                  "%s", blanks[i].label);
    }
    for (size_t i = 0; i < sizeof(non_blanks) / sizeof(non_blanks[0]); i++) {
        TEST_FALSE(batch, StrHelp_is_whitespace(non_blanks[i].code_point),
                   "%s", non_blanks[i].label);
    }
}
/* Check StrHelp_back_utf8_char() on a single encoded character (U+263A):
 * stepping back from its end reaches its first byte, and NULL is returned
 * when the step would have to go past the supplied start.
 */
static void
test_back_utf8_char(TestBatch *batch) {
    char smiley[4];
    const uint32_t smiley_len = StrHelp_encode_utf8_char(0x263A, smiley);
    char *const    smiley_end = smiley + smiley_len;
    // A start position one byte past the character's first byte.
    char *const    inside     = smiley + 1;

    TEST_TRUE(batch, StrHelp_back_utf8_char(smiley_end, smiley) == smiley,
              "back_utf8_char");
    TEST_TRUE(batch, StrHelp_back_utf8_char(smiley_end, inside) == NULL,
              "back_utf8_char returns NULL rather than back up beyond start");
    TEST_TRUE(batch, StrHelp_back_utf8_char(smiley, smiley) == NULL,
              "back_utf8_char returns NULL when end == start");
}
/* Check that utf8proc normalization is stable: normalizing the output of a
 * first normalization pass must reproduce it byte for byte.
 *
 * NOTE: the test is currently disabled.  A SKIP is recorded and the function
 * returns immediately, so the loop below is unreachable (presumably kept so
 * that it can be re-enabled once the limitation named in the SKIP message no
 * longer applies).
 */
static void
test_utf8proc_normalization(TestBatch *batch) {
    SKIP(batch, "utf8proc can't handle control chars or Unicode non-chars");
    return;

    for (int32_t i = 0; i < 100; i++) {
        CharBuf *source = TestUtils_random_string(rand() % 40);

        // Normalize once.  On success utf8proc_map() returns the length of
        // the buffer it allocates; a negative value is an error code.
        uint8_t *normalized;
        int32_t check = utf8proc_map(CB_Get_Ptr8(source), CB_Get_Size(source),
                                     &normalized,
                                     UTF8PROC_STABLE  |
                                     UTF8PROC_COMPOSE |
                                     UTF8PROC_COMPAT  |
                                     UTF8PROC_CASEFOLD);
        if (check < 0) {
            // Report the offending input as JSON, falling back to a
            // placeholder if it cannot be encoded.
            lucy_Json_set_tolerant(1);
            CharBuf *json = lucy_Json_to_json((Obj*)source);
            if (!json) {
                json = CB_newf("[failed to encode]");
            }
            FAIL(batch, "Failed to normalize: %s", CB_Get_Ptr8(json));
            DECREF(json);
            DECREF(source);
            return;
        }

        // Normalize again.  Use the length reported by the first pass rather
        // than strlen(), which would stop short at an embedded NUL byte.
        uint8_t *dupe;
        int32_t dupe_check = utf8proc_map(normalized, check, &dupe,
                                          UTF8PROC_STABLE  |
                                          UTF8PROC_COMPOSE |
                                          UTF8PROC_COMPAT  |
                                          UTF8PROC_CASEFOLD);
        if (dupe_check < 0) {
            THROW(ERR, "Unexpected normalization error: %i32", dupe_check);
        }

        // Both passes must give the same length and the same bytes.
        int comparison = dupe_check != check
                         || memcmp(normalized, dupe, (size_t)check) != 0;
        free(dupe);
        free(normalized);
        DECREF(source);
        if (comparison != 0) {
            FAIL(batch, "Not fully normalized");
            return;
        }
    }
    PASS(batch, "Normalization successful.");
}
void
TestStrHelp_run_tests() {
TestBatch *batch = TestBatch_new(41);
TestBatch_Plan(batch);
test_overlap(batch);
test_to_base36(batch);
test_utf8_round_trip(batch);
test_utf8_valid(batch);
test_is_whitespace(batch);
test_back_utf8_char(batch);
test_utf8proc_normalization(batch);
DECREF(batch);
}