/* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define C_LUCY_TESTSTANDARDTOKENIZER
#include <stddef.h>

#include "Lucy/Util/ToolSet.h"
#include "Lucy/Test.h"
#include "Lucy/Test/Analysis/TestStandardTokenizer.h"
#include "Lucy/Analysis/StandardTokenizer.h"
#include "Lucy/Store/FSFolder.h"
#include "Lucy/Util/Json.h"
/* Serialization round trip: dump a freshly created StandardTokenizer, load
 * the dump back, and record one test result on `batch` that passes when the
 * loaded object Equals() the original.
 */
static void
test_Dump_Load_and_Equals(TestBatch *batch) {
    StandardTokenizer *original = StandardTokenizer_new();
    Obj               *dumped   = StandardTokenizer_Dump(original);
    StandardTokenizer *restored
        = (StandardTokenizer*)StandardTokenizer_Load(original, dumped);

    TEST_TRUE(batch, StandardTokenizer_Equals(original, (Obj*)restored),
              "Dump => Load round trip");

    /* Drop the references created above. */
    DECREF(original);
    DECREF(dumped);
    DECREF(restored);
}
static void
test_tokenizer(TestBatch *batch) {
StandardTokenizer *tokenizer = StandardTokenizer_new();
ZombieCharBuf *word = ZCB_WRAP_STR(
" ."
"tha\xCC\x82t's"
":"
"1,02\xC2\xADZ4.38"
"\xE0\xB8\x81\xC2\xAD\xC2\xAD"
"\xF0\xA0\x80\x80"
"a"
"/",
35);
VArray *got = StandardTokenizer_Split(tokenizer, (CharBuf*)word);
CharBuf *token = (CharBuf*)VA_Fetch(got, 0);
TEST_TRUE(batch,
token
&& CB_Is_A(token, CHARBUF)
&& CB_Equals_Str(token, "tha\xcc\x82t's", 8),
"Token: %s", CB_Get_Ptr8(token));
token = (CharBuf*)VA_Fetch(got, 1);
TEST_TRUE(batch,
token
&& CB_Is_A(token, CHARBUF)
&& CB_Equals_Str(token, "1,02\xC2\xADZ4.38", 11),
"Token: %s", CB_Get_Ptr8(token));
token = (CharBuf*)VA_Fetch(got, 2);
TEST_TRUE(batch,
token
&& CB_Is_A(token, CHARBUF)
&& CB_Equals_Str(token, "\xE0\xB8\x81\xC2\xAD\xC2\xAD", 7),
"Token: %s", CB_Get_Ptr8(token));
token = (CharBuf*)VA_Fetch(got, 3);
TEST_TRUE(batch,
token
&& CB_Is_A(token, CHARBUF)
&& CB_Equals_Str(token, "\xF0\xA0\x80\x80", 4),
"Token: %s", CB_Get_Ptr8(token));
token = (CharBuf*)VA_Fetch(got, 4);
TEST_TRUE(batch,
token
&& CB_Is_A(token, CHARBUF)
&& CB_Equals_Str(token, "a", 1),
"Token: %s", CB_Get_Ptr8(token));
DECREF(got);
CharBuf *path = CB_newf("modules");
FSFolder *modules_folder = FSFolder_new(path);
if (!FSFolder_Check(modules_folder)) {
DECREF(modules_folder);
CB_setf(path, "../modules");
modules_folder = FSFolder_new(path);
if (!FSFolder_Check(modules_folder)) {
THROW(ERR, "Can't open modules folder");
}
}
CB_setf(path, "unicode/ucd/WordBreakTest.json");
VArray *tests = (VArray*)Json_slurp_json((Folder*)modules_folder, path);
if (!tests) { RETHROW(Err_get_error()); }
for (uint32_t i = 0, max = VA_Get_Size(tests); i < max; i++) {
Hash *test = (Hash*)VA_Fetch(tests, i);
CharBuf *text = (CharBuf*)Hash_Fetch_Str(test, "text", 4);
VArray *wanted = (VArray*)Hash_Fetch_Str(test, "words", 5);
VArray *got = StandardTokenizer_Split(tokenizer, text);
TEST_TRUE(batch, VA_Equals(wanted, (Obj*)got), "UCD test #%d", i + 1);
DECREF(got);
}
DECREF(tests);
DECREF(modules_folder);
DECREF(path);
DECREF(tokenizer);
}
/* Entry point for the StandardTokenizer tests: creates a TestBatch, plans
 * it, runs both test functions against it, and releases the batch.
 *
 * The count handed to TestBatch_new is hard-coded.  Going by the test
 * functions in this file, the results recorded are 1 from
 * test_Dump_Load_and_Equals, 5 sample-token checks from test_tokenizer, and
 * one per entry in WordBreakTest.json -- presumably the count has to be
 * kept in step with that fixture (TODO confirm against TestBatch).
 */
void
TestStandardTokenizer_run_tests() {
    enum { NUM_PLANNED_TESTS = 984 };

    TestBatch *suite = TestBatch_new(NUM_PLANNED_TESTS);
    TestBatch_Plan(suite);

    test_Dump_Load_and_Equals(suite);
    test_tokenizer(suite);

    DECREF(suite);
}