/* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <ctype.h>
#include <stdio.h>
#include "Lucy/Util/ToolSet.h"
#include "Lucy/Util/Json.h"
#include "Lucy/Object/Host.h"
#include "Lucy/Store/Folder.h"
#include "Lucy/Store/InStream.h"
#include "Lucy/Store/OutStream.h"
#include "Lucy/Util/Memory.h"
#include "Lucy/Util/Json/JsonParser.h"
/* Routines generated by Lemon. */
// Create a parser instance using the supplied allocation function. This
// file treats a NULL return as an allocation failure.
void*
LucyParseJsonAlloc(void * (*allocate)(size_t));
// Feed one token (type plus optional value object) to the parser. This file
// passes a token type of 0 to signal end of input, and reads the outcome
// back from `state` (its `result` and `errors` members).
void
LucyParseJson(void *json_parser, int token_type, lucy_Obj *value,
lucy_JsonParserState *state);
// Destroy a parser instance using the supplied deallocation function.
void
LucyParseJsonFree(void *json_parser, void(*freemem)(void*));
// Declared for completeness; not called anywhere in the visible part of
// this file. NOTE(review): presumably enables Lemon's debug tracing --
// confirm against the generated parser.
void
LucyParseJsonTrace(FILE *trace, char *line_prefix);
// Encode JSON for supplied "dump". On failure, sets Err_error and returns
// false. `depth` is the current nesting level, used both for indentation
// and for the recursion limit.
static bool_t
S_to_json(Obj *dump, CharBuf *json, int32_t depth);
// Parse JSON from raw UTF-8 in memory. Allocates a Lemon parser, delegates
// to S_do_parse_json(), then frees the parser.
static Obj*
S_parse_json(char *text, size_t size);
// Tokenize `len` bytes of `json` and feed the tokens to `json_parser`.
// Returns the parsed structure, or NULL with Err_error set on failure.
static Obj*
S_do_parse_json(void *json_parser, char *json, size_t len);
// Parse a JSON number. Advance the text buffer just past the number.
static Float64*
S_parse_number(char **json_ptr, char *const limit);
// Parse a JSON string. Advance the text buffer from pointing at the opening
// double quote to pointing just after the closing double quote.
static CharBuf*
S_parse_string(char **json_ptr, char *const limit);
// Unescape JSON string text. Expects pointers bookending the text data (i.e.
// pointing just after the opening double quote and directly at the closing
// double quote), and assumes that escapes have already been sanity checked
// for length.
static CharBuf*
S_unescape_text(char *const top, char *const end);
// Check that the supplied text begins with the specified keyword, which must
// then end on a word boundary (i.e. match "null" but not the first four
// letters of "nullify").
static INLINE bool_t
SI_check_keyword(char *json, char* end, const char *keyword, size_t len);
// Make it possible to loosen constraints during testing. When true,
// Json_to_json() skips its top-level type check.
static bool_t tolerant = false;
// Indentation unit, appended once per nesting level by S_cat_whitespace().
// NOTE(review): this comment historically read "two spaces per level", but
// the literal as it appears here holds a single space -- confirm which is
// intended.
static const char indentation[] = " ";
static const size_t INDENTATION_LEN = sizeof(indentation) - 1;
// Append indentation spaces x depth.
static void
S_cat_whitespace(CharBuf *json, int32_t depth);
// Set Err_error, appending escaped JSON in the vicinity of the error.
// Takes the message in `mess`, appends location info and a snippet of at
// most 32 bytes starting at `json`, and hands it to a new Err.
static void
S_set_error(CharBuf *mess, char *json, char *limit, int line,
const char *func);
// Convenience wrapper which supplies the call site's line number and
// function name to S_set_error().
#define SET_ERROR(_mess, _json, _end) \
S_set_error(_mess, _json, _end, __LINE__, CFISH_ERR_FUNC_MACRO)
// Decode the JSON text held in `json`. Returns the decoded structure, or
// NULL after adding a frame to the current error when parsing fails.
Obj*
Json_from_json(CharBuf *json) {
    Obj *parsed = S_parse_json((char*)CB_Get_Ptr8(json), CB_Get_Size(json));
    if (parsed == NULL) {
        ERR_ADD_FRAME(Err_get_error());
    }
    return parsed;
}
// Read the file at `path` within `folder` and decode its entire contents as
// JSON. Returns the decoded structure; returns NULL with a frame added to
// the current error if the file cannot be opened or the parse fails.
Obj*
Json_slurp_json(Folder *folder, const CharBuf *path) {
    InStream *instream = Folder_Open_In(folder, path);
    if (instream == NULL) {
        ERR_ADD_FRAME(Err_get_error());
        return NULL;
    }

    // Parse straight out of the stream's buffer, which must therefore stay
    // open until parsing has finished.
    const size_t length = (size_t)InStream_Length(instream);
    char *contents = InStream_Buf(instream, length);
    Obj *parsed = S_parse_json(contents, length);

    InStream_Close(instream);
    DECREF(instream);

    if (parsed == NULL) {
        ERR_ADD_FRAME(Err_get_error());
    }
    return parsed;
}
// Encode `dump` as JSON and write it to `path` within `folder`. Returns true
// on success; returns false with a frame added to the current error if
// encoding fails or the output file cannot be opened.
bool_t
Json_spew_json(Obj *dump, Folder *folder, const CharBuf *path) {
    CharBuf *json = Json_to_json(dump);
    if (json == NULL) {
        ERR_ADD_FRAME(Err_get_error());
        return false;
    }

    bool_t written = false;
    OutStream *outstream = Folder_Open_Out(folder, path);
    if (outstream == NULL) {
        ERR_ADD_FRAME(Err_get_error());
    }
    else {
        size_t size = CB_Get_Size(json);
        OutStream_Write_Bytes(outstream, CB_Get_Ptr8(json), size);
        OutStream_Close(outstream);
        DECREF(outstream);
        written = true;
    }

    // The encoded text is released on both the success and failure paths.
    DECREF(json);
    return written;
}
// Encode `dump` as JSON text terminated by a newline. Unless tolerant mode
// is enabled, the top-level object must be a Hash or a VArray. Returns a new
// CharBuf, or NULL with Err_error set on failure.
CharBuf*
Json_to_json(Obj *dump) {
    // Validate object type, only allowing hashes and arrays per JSON spec.
    const bool_t legal_top_level
        = dump && (Obj_Is_A(dump, HASH) || Obj_Is_A(dump, VARRAY));
    if (!legal_top_level && !tolerant) {
        CharBuf *class_name = dump ? Obj_Get_Class_Name(dump) : NULL;
        CharBuf *mess = MAKE_MESS("Illegal top-level object type: %o",
                                  class_name);
        Err_set_error(Err_new(mess));
        return NULL;
    }

    // Encode, then finish with a trailing newline.
    CharBuf *json = CB_new(31);
    if (S_to_json(dump, json, 0)) {
        CB_Cat_Trusted_Str(json, "\n", 1);
        return json;
    }

    // Encoding failed: discard the partial output.
    DECREF(json);
    ERR_ADD_FRAME(Err_get_error());
    return NULL;
}
// Set the file-wide `tolerant` flag. While the flag is true, Json_to_json()
// no longer rejects top-level objects which are neither a Hash nor a VArray.
void
Json_set_tolerant(bool_t tolerance) {
tolerant = tolerance;
}
// Deepest nesting level S_to_json() will encode before failing with an
// "Exceeded max depth" error.
static const int32_t MAX_DEPTH = 200;
// Append the string `dump` (cast to CharBuf without a type check) to `json`
// as a double-quoted JSON string literal.
//
// Code points above 127 are copied through unescaped, on the assumption
// that the destination channel can handle arbitrary UTF-8. Within ASCII,
// every escape which the JSON spec makes mandatory is applied; forward
// slash, whose escaping is optional, is left alone.
static void
S_append_json_string(Obj *dump, CharBuf *json) {
    // Opening quote.
    CB_Cat_Trusted_Str(json, "\"", 1);

    ZombieCharBuf *remaining = ZCB_WRAP((CharBuf*)dump);
    while (ZCB_Get_Size(remaining)) {
        const uint32_t code_point = ZCB_Nip_One(remaining);

        if (code_point > 127) {
            CB_Cat_Char(json, code_point);
            continue;
        }

        // Characters which have a dedicated two-character escape.
        const char *short_escape = NULL;
        switch (code_point) {
            case '\b':
                short_escape = "\\b";
                break;
            case '\t':
                short_escape = "\\t";
                break;
            case '\n':
                short_escape = "\\n";
                break;
            case '\f':
                short_escape = "\\f";
                break;
            case '\r':
                short_escape = "\\r";
                break;
            case '\\':
                short_escape = "\\\\";
                break;
            case '\"':
                short_escape = "\\\"";
                break;
            default:
                break;
        }

        if (short_escape) {
            CB_Cat_Trusted_Str(json, short_escape, 2);
        }
        else if (code_point < 0x20) {
            // Any other control character becomes a six-character \u00XX
            // escape.
            char hex_escape[7];
            sprintf(hex_escape, "\\u%04x", (unsigned)code_point);
            CB_Cat_Trusted_Str(json, hex_escape, 6);
        }
        else {
            // Ordinary printable ASCII (0x7f is also passed through as-is).
            char literal = (char)code_point;
            CB_Cat_Trusted_Str(json, &literal, 1);
        }
    }

    // Closing quote.
    CB_Cat_Trusted_Str(json, "\"", 1);
}
// Append one copy of the indentation unit to `json` for each level of
// `depth`.
static void
S_cat_whitespace(CharBuf *json, int32_t depth) {
    for (int32_t remaining = depth; remaining != 0; remaining--) {
        CB_Cat_Trusted_Str(json, indentation, INDENTATION_LEN);
    }
}
// Recursively append the JSON encoding of `dump` to `json`.
//
// `depth` is the nesting level of `dump`; it controls indentation and is
// checked against MAX_DEPTH so that self-referencing structures produce an
// error rather than unbounded recursion.
//
// Encodings: NULL => null, CFISH_TRUE/CFISH_FALSE => true/false, CharBuf =>
// escaped string, IntNum/FloatNum => number, VArray => array, Hash => object
// with keys in sorted order. Empty containers and arrays holding a single
// scalar are kept on one line; everything else is spread across lines.
//
// NOTE(review): an object of any other class appends nothing and still
// reports success, which yields malformed output -- confirm whether that is
// intended.
//
// Returns true on success. On failure sets Err_error and returns false, in
// which case `json` may contain partial output.
static bool_t
S_to_json(Obj *dump, CharBuf *json, int32_t depth) {
    // Guard against infinite recursion in self-referencing data structures.
    if (depth > MAX_DEPTH) {
        CharBuf *mess = MAKE_MESS("Exceeded max depth of %i32", MAX_DEPTH);
        Err_set_error(Err_new(mess));
        return false;
    }

    if (!dump) {
        CB_Cat_Trusted_Str(json, "null", 4);
    }
    else if (dump == (Obj*)CFISH_TRUE) {
        CB_Cat_Trusted_Str(json, "true", 4);
    }
    else if (dump == (Obj*)CFISH_FALSE) {
        CB_Cat_Trusted_Str(json, "false", 5);
    }
    else if (Obj_Is_A(dump, CHARBUF)) {
        S_append_json_string(dump, json);
    }
    else if (Obj_Is_A(dump, INTNUM)) {
        CB_catf(json, "%i64", Obj_To_I64(dump));
    }
    else if (Obj_Is_A(dump, FLOATNUM)) {
        CB_catf(json, "%f64", Obj_To_F64(dump));
    }
    else if (Obj_Is_A(dump, VARRAY)) {
        VArray *array = (VArray*)dump;
        size_t size = VA_Get_Size(array);
        if (size == 0) {
            // Put empty array on single line.
            CB_Cat_Trusted_Str(json, "[]", 2);
            return true;
        }
        else if (size == 1) {
            Obj *elem = VA_Fetch(array, 0);
            // A NULL element encodes as the scalar `null`. Test for it
            // before calling Obj_Is_A, matching the NULL checks made ahead
            // of Obj_Is_A everywhere else in this file.
            if (!elem || !(Obj_Is_A(elem, HASH) || Obj_Is_A(elem, VARRAY))) {
                // Put array containing single scalar element on one line.
                CB_Cat_Trusted_Str(json, "[", 1);
                if (!S_to_json(elem, json, depth + 1)) {
                    return false;
                }
                CB_Cat_Trusted_Str(json, "]", 1);
                return true;
            }
        }

        // Fall back to spreading elements across multiple lines.
        CB_Cat_Trusted_Str(json, "[", 1);
        for (size_t i = 0; i < size; i++) {
            CB_Cat_Trusted_Str(json, "\n", 1);
            S_cat_whitespace(json, depth + 1);
            if (!S_to_json(VA_Fetch(array, i), json, depth + 1)) {
                return false;
            }
            if (i + 1 < size) {
                CB_Cat_Trusted_Str(json, ",", 1);
            }
        }
        CB_Cat_Trusted_Str(json, "\n", 1);
        S_cat_whitespace(json, depth);
        CB_Cat_Trusted_Str(json, "]", 1);
    }
    else if (Obj_Is_A(dump, HASH)) {
        Hash *hash = (Hash*)dump;
        size_t size = Hash_Get_Size(hash);

        // Put empty hash on single line.
        if (size == 0) {
            CB_Cat_Trusted_Str(json, "{}", 2);
            return true;
        }

        // Validate that all keys are strings, then sort.
        VArray *keys = Hash_Keys(hash);
        for (size_t i = 0; i < size; i++) {
            Obj *key = VA_Fetch(keys, i);
            if (!key || !Obj_Is_A(key, CHARBUF)) {
                // Build the message while `keys` is still alive, so that
                // `key` is never used after its container was released.
                CharBuf *key_class = key ? Obj_Get_Class_Name(key) : NULL;
                CharBuf *mess = MAKE_MESS("Illegal key type: %o", key_class);
                DECREF(keys);
                Err_set_error(Err_new(mess));
                return false;
            }
        }
        VA_Sort(keys, NULL, NULL);

        // Spread pairs across multiple lines.
        CB_Cat_Trusted_Str(json, "{", 1);
        for (size_t i = 0; i < size; i++) {
            Obj *key = VA_Fetch(keys, i);
            CB_Cat_Trusted_Str(json, "\n", 1);
            S_cat_whitespace(json, depth + 1);
            S_append_json_string(key, json);
            CB_Cat_Trusted_Str(json, ": ", 2);
            if (!S_to_json(Hash_Fetch(hash, key), json, depth + 1)) {
                DECREF(keys);
                return false;
            }
            if (i + 1 < size) {
                CB_Cat_Trusted_Str(json, ",", 1);
            }
        }
        CB_Cat_Trusted_Str(json, "\n", 1);
        S_cat_whitespace(json, depth);
        CB_Cat_Trusted_Str(json, "}", 1);
        DECREF(keys);
    }

    return true;
}
// Parse `size` bytes of raw UTF-8 JSON starting at `text`. A Lemon parser is
// allocated for the duration of the call and freed before returning.
// Returns the decoded structure, or NULL with Err_error set.
static Obj*
S_parse_json(char *text, size_t size) {
    void *parser = LucyParseJsonAlloc(lucy_Memory_wrapped_malloc);
    if (!parser) {
        Err_set_error(Err_new(MAKE_MESS("Failed to allocate JSON parser")));
        return NULL;
    }

    Obj *parsed = S_do_parse_json(parser, text, size);
    LucyParseJsonFree(parser, lucy_Memory_wrapped_free);
    return parsed;
}
static Obj*
S_do_parse_json(void *json_parser, char *json, size_t len) {
lucy_JsonParserState state;
state.result = NULL;
state.errors = false;
char *text = json;
char *const end = text + len;
while (text < end) {
int token_type = -1;
Obj *value = NULL;
char *const save = text;
switch (*text) {
case ' ': case '\n': case '\r': case '\t':
// Skip insignificant whitespace, which the JSON RFC defines
// as only four ASCII characters.
text++;
continue;
case '[':
token_type = LUCY_JSON_TOKENTYPE_LEFT_SQUARE_BRACKET;
text++;
break;
case ']':
token_type = LUCY_JSON_TOKENTYPE_RIGHT_SQUARE_BRACKET;
text++;
break;
case '{':
token_type = LUCY_JSON_TOKENTYPE_LEFT_CURLY_BRACKET;
text++;
break;
case '}':
token_type = LUCY_JSON_TOKENTYPE_RIGHT_CURLY_BRACKET;
text++;
break;
case ':':
token_type = LUCY_JSON_TOKENTYPE_COLON;
text++;
break;
case ',':
token_type = LUCY_JSON_TOKENTYPE_COMMA;
text++;
break;
case '"':
value = (Obj*)S_parse_string(&text, end);
if (value) {
token_type = LUCY_JSON_TOKENTYPE_STRING;
}
else {
// Clear out parser and return.
LucyParseJson(json_parser, 0, NULL, &state);
ERR_ADD_FRAME(Err_get_error());
return NULL;
}
break;
case 'n':
if (SI_check_keyword(text, end, "null", 4)) {
token_type = LUCY_JSON_TOKENTYPE_NULL;
text += 4;
}
break;
case 't':
if (SI_check_keyword(text, end, "true", 4)) {
token_type = LUCY_JSON_TOKENTYPE_TRUE;
value = (Obj*)CFISH_TRUE;
text += 4;
}
break;
case 'f':
if (SI_check_keyword(text, end, "false", 5)) {
token_type = LUCY_JSON_TOKENTYPE_FALSE;
value = (Obj*)CFISH_FALSE;
text += 5;
}
break;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
case '-': { // Note no '+', as JSON spec doesn't allow it.
value = (Obj*)S_parse_number(&text, end);
if (value) {
token_type = LUCY_JSON_TOKENTYPE_NUMBER;
}
else {
// Clear out parser and return.
LucyParseJson(json_parser, 0, NULL, &state);
ERR_ADD_FRAME(Err_get_error());
return NULL;
}
}
break;
}
LucyParseJson(json_parser, token_type, value, &state);
if (state.errors) {
SET_ERROR(CB_newf("JSON syntax error"), save, end);
return NULL;
}
}
// Finish up.
LucyParseJson(json_parser, 0, NULL, &state);
if (state.errors) {
SET_ERROR(CB_newf("JSON syntax error"), json, end);
return NULL;
}
return state.result;
}
// Parse the JSON number starting at `*json_ptr`, never reading at or beyond
// `limit`.
//
// On success, returns a new Float64 and advances `*json_ptr` to just past
// the number. On failure, sets Err_error, leaves `*json_ptr` untouched, and
// returns NULL.
static Float64*
S_parse_number(char **json_ptr, char *const limit) {
    char *const top = *json_ptr;
    bool_t terminated = false;

    // We can't assume NULL termination for the JSON string, so we need to
    // ensure that strtod() cannot overrun and access invalid memory. Stop
    // scanning at the first terminator: carrying on to `limit` would make
    // every number rescan the rest of the document.
    for (char *scan = top; scan < limit && !terminated; scan++) {
        switch (*scan) {
            // Only these characters may legally follow a number in
            // Javascript. If we don't find one before the end of the JSON,
            // it's a parse error.
            case ' ': case '\n': case '\r': case '\t':
            case ']':
            case '}':
            case ':':
            case ',':
                terminated = true;
                break;
            default:
                break;
        }
    }

    Float64 *result = NULL;
    if (terminated) {
        char *terminus;
        double number = strtod(top, &terminus);
        // strtod() leaves `terminus` equal to `top` when nothing could be
        // converted.
        if (terminus != top) {
            *json_ptr = terminus;
            result = Float64_new(number);
        }
    }
    if (!result) {
        SET_ERROR(CB_newf("JSON syntax error"), top, limit);
    }
    return result;
}
// Parse the JSON string whose opening double quote is at `*json_ptr`, never
// reading at or beyond `limit`.
//
// On success, returns a new CharBuf holding the unescaped content and
// advances `*json_ptr` to just after the closing double quote. Returns NULL
// with Err_error set if the string is unterminated, contains an invalid
// escape, or is not valid UTF-8. (`*json_ptr` is left alone only in the
// unterminated case.)
static CharBuf*
S_parse_string(char **json_ptr, char *const limit) {
    // Find terminating double quote, determine whether there are any escapes.
    char *const top = *json_ptr + 1;
    char *end = NULL;
    bool_t saw_backslash = false;
    for (char *text = top; text < limit; text++) {
        if (*text == '"') {
            end = text;
            break;
        }
        if (*text != '\\') {
            continue;
        }
        saw_backslash = true;

        // Step over the escape so that an escaped quote is not mistaken for
        // the closing quote. Together with the loop increment this skips 6
        // bytes for \uXXXX and 2 bytes for any other escape.
        const size_t skip = (text + 1 < limit && text[1] == 'u') ? 5 : 1;
        if ((size_t)(limit - text) <= skip) {
            // The escape runs off the end of the buffer, so the string is
            // unterminated. Bail out here rather than form a pointer beyond
            // `limit`.
            break;
        }
        text += skip;
    }
    if (!end) {
        SET_ERROR(CB_newf("Unterminated string"), *json_ptr, limit);
        return NULL;
    }

    // Advance the text buffer to just beyond the closing quote.
    *json_ptr = end + 1;

    if (saw_backslash) {
        return S_unescape_text(top, end);
    }

    // Optimize common case where there are no escapes.
    size_t len = end - top;
    if (!StrHelp_utf8_valid(top, len)) {
        CharBuf *mess = MAKE_MESS("Bad UTF-8 in JSON");
        Err_set_error(Err_new(mess));
        return NULL;
    }
    return CB_new_from_trusted_utf8(top, len);
}
static CharBuf*
S_unescape_text(char *const top, char *const end) {
// The unescaped string will never be longer than the escaped string
// because only a \u escape can theoretically be too long and
// StrHelp_encode_utf8_char guards against sequences over 4 bytes.
// Therefore we can allocate once and not worry about reallocating.
size_t cap = end - top + 1;
char *target_buf = (char*)MALLOCATE(cap);
size_t target_size = 0;
for (char *text = top; text < end; text++) {
if (*text != '\\') {
target_buf[target_size++] = *text;
}
else {
// Process escape.
text++;
switch (*text) {
case '"':
target_buf[target_size++] = '"';
break;
case '\\':
target_buf[target_size++] = '\\';
break;
case '/':
target_buf[target_size++] = '/';
break;
case 'b':
target_buf[target_size++] = '\b';
break;
case 'f':
target_buf[target_size++] = '\f';
break;
case 'n':
target_buf[target_size++] = '\n';
break;
case 'r':
target_buf[target_size++] = '\r';
break;
case 't':
target_buf[target_size++] = '\t';
break;
case 'u': {
// Copy into a temp buffer because strtol will overrun
// into adjacent text data for e.g. "\uAAAA1".
char temp[5] = { 0, 0, 0, 0, 0 };
memcpy(temp, text + 1, 4);
text += 4;
char *num_end;
long code_point = strtol(temp, &num_end, 16);
char *temp_ptr = temp;
if (num_end != temp_ptr + 4 || code_point < 0) {
FREEMEM(target_buf);
SET_ERROR(CB_newf("Invalid \\u escape"), text - 5, end);
return NULL;
}
if (code_point >= 0xD800 && code_point <= 0xDFFF) {
FREEMEM(target_buf);
SET_ERROR(CB_newf("Surrogate pairs not supported"),
text - 5, end);
return NULL;
}
target_size += StrHelp_encode_utf8_char((uint32_t)code_point,
target_buf + target_size);
}
break;
default:
FREEMEM(target_buf);
SET_ERROR(CB_newf("Illegal escape"), text - 1, end);
return NULL;
}
}
}
// NULL-terminate, sanity check, then return the escaped string.
target_buf[target_size] = '\0';
if (!StrHelp_utf8_valid(target_buf, target_size)) {
FREEMEM(target_buf);
CharBuf *mess = MAKE_MESS("Bad UTF-8 in JSON");
Err_set_error(Err_new(mess));
return NULL;
}
return CB_new_steal_from_trusted_str(target_buf, target_size, cap);
}
// Report whether the text at `json` starts with `keyword` (`len` bytes long)
// and the keyword ends on a word boundary, i.e. the byte which follows is
// neither an underscore nor an ASCII alphanumeric. This matches "null" but
// not the first four letters of "nullify".
//
// At least one byte must follow the keyword before `end`, so a keyword
// which is the very last thing in the buffer does not match.
static INLINE bool_t
SI_check_keyword(char *json, char* end, const char *keyword, size_t len) {
    if ((size_t)(end - json) <= len) {
        return false;
    }
    if (strncmp(json, keyword, len) != 0) {
        return false;
    }

    // The <ctype.h> functions are only defined for values representable as
    // unsigned char (or EOF); a plain `char` holding a UTF-8 byte may be
    // negative, so convert first.
    const unsigned char follower = (unsigned char)json[len];
    if (follower == '_' || isalnum(follower)) {
        return false;
    }
    return true;
}
// Finish the error message `mess` and install it as Err_error.
//
// Appends the location (" at [func] file line N near ") followed by a
// JSON-escaped snippet of the input starting at `json`. The snippet extends
// to `limit`, but is cut back to a UTF-8 character boundary within the
// first 32 bytes when more input than that remains. `mess` is handed over
// to the new Err.
static void
S_set_error(CharBuf *mess, char *json, char *limit, int line,
            const char *func) {
    if (!func) {
        CB_catf(mess, " at %s line %i32 near ", __FILE__, (int32_t)line);
    }
    else {
        CB_catf(mess, " at %s %s line %i32 near ", func, __FILE__,
                (int32_t)line);
    }

    // Append escaped text.
    int64_t snippet_len = limit - json;
    if (snippet_len > 32) {
        const char *cut = StrHelp_back_utf8_char(json + 32, json);
        snippet_len = cut - json;
    }
    ZombieCharBuf *snippet = ZCB_WRAP_STR(json, snippet_len);
    S_append_json_string((Obj*)snippet, mess);

    // Set Err_error.
    Err_set_error(Err_new(mess));
}