The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <ctype.h>
#include <stdio.h>

#include "Lucy/Util/ToolSet.h"

#include "Lucy/Util/Json.h"
#include "Lucy/Object/Host.h"
#include "Lucy/Store/Folder.h"
#include "Lucy/Store/InStream.h"
#include "Lucy/Store/OutStream.h"
#include "Lucy/Util/Memory.h"
#include "Lucy/Util/Json/JsonParser.h"

/* Routines generated by Lemon.  They are only declared here; the definitions
 * live in the generated parser source.
 * NOTE(review): the descriptions below follow the usual Lemon entry-point
 * conventions (alloc / feed one token / free / trace) -- confirm against the
 * generated parser.
 */
// Create a parser instance using the supplied allocation function.
void*
LucyParseJsonAlloc(void * (*allocate)(size_t));
// Feed one token (and its associated value, if any) to the parser.  This file
// passes a token type of 0 to flush the parser at end of input or on error.
void
LucyParseJson(void *json_parser, int token_type, lucy_Obj *value,
              lucy_JsonParserState *state);
// Release a parser instance using the supplied deallocation function.
void
LucyParseJsonFree(void *json_parser, void(*freemem)(void*));
// Presumably enables parser tracing to `trace`; not called in this file.
void
LucyParseJsonTrace(FILE *trace, char *line_prefix);

// Encode JSON for supplied "dump", appending to "json".  "depth" is the
// current nesting level, used for indentation and for the recursion guard.
// On failure, sets Err_error and returns false.
static bool_t
S_to_json(Obj *dump, CharBuf *json, int32_t depth);

// Parse JSON from raw UTF-8 in memory.  S_parse_json manages the lifetime of
// the Lemon parser; S_do_parse_json tokenizes the text and drives the parser.
static Obj*
S_parse_json(char *text, size_t size);
static Obj*
S_do_parse_json(void *json_parser, char *json, size_t len);

// Parse a JSON number.  Advance the text buffer just past the number.
// Returns NULL and sets Err_error on failure.
static Float64*
S_parse_number(char **json_ptr, char *const limit);

// Parse a JSON string.  Advance the text buffer from pointing at the opening
// double quote to pointing just after the closing double quote.  Returns NULL
// and sets Err_error on failure.
static CharBuf*
S_parse_string(char **json_ptr, char *const limit);

// Unescape JSON string text.  Expects pointers bookending the text data (i.e.
// pointing just after the opening double quote and directly at the closing
// double quote), and assumes that escapes have already been sanity checked
// for length.
static CharBuf*
S_unescape_text(char *const top, char *const end);

// Check that the supplied text begins with the specified keyword, which must
// then end on a word boundary (i.e. match "null" but not the first four
// letters of "nullify").
static INLINE bool_t
SI_check_keyword(char *json, char* end, const char *keyword, size_t len);

// Make it possible to loosen constraints during testing.  Toggled through
// Json_set_tolerant(); when true, Json_to_json() accepts top-level values
// other than hashes and arrays.
static bool_t tolerant = false;

// Indentation: two spaces per level.
static const char indentation[]     = "  ";
static const size_t INDENTATION_LEN = sizeof(indentation) - 1;

// Append indentation spaces x depth.
static void
S_cat_whitespace(CharBuf *json, int32_t depth);

// Set Err_error, appending escaped JSON in the vicinity of the error.
// "mess" is extended in place and handed to the new error object.
static void
S_set_error(CharBuf *mess, char *json, char *limit, int line,
            const char *func);
// Convenience wrapper which records the call site's line and function.
#define SET_ERROR(_mess, _json, _end) \
    S_set_error(_mess, _json, _end, __LINE__, CFISH_ERR_FUNC_MACRO)

// Decode the JSON text held in `json` into a data structure.
//
// Returns the decoded object on success.  On failure returns NULL, with the
// error left in Err_error and a stack frame for this function added to it.
Obj*
Json_from_json(CharBuf *json) {
    char   *utf8   = (char*)CB_Get_Ptr8(json);
    size_t  size   = CB_Get_Size(json);
    Obj    *parsed = S_parse_json(utf8, size);

    if (parsed == NULL) {
        ERR_ADD_FRAME(Err_get_error());
    }
    return parsed;
}

// Read the file at `path` within `folder` and decode its content as JSON.
//
// Returns the decoded object on success.  Returns NULL, with Err_error set
// and a frame for this function added, if the file cannot be opened or its
// content cannot be parsed.
Obj*
Json_slurp_json(Folder *folder, const CharBuf *path) {
    InStream *in = Folder_Open_In(folder, path);
    if (in == NULL) {
        ERR_ADD_FRAME(Err_get_error());
        return NULL;
    }

    // Parse straight out of the stream's buffer, then release the stream.
    size_t  byte_count = (size_t)InStream_Length(in);
    char   *raw        = InStream_Buf(in, byte_count);
    Obj    *parsed     = S_parse_json(raw, byte_count);
    InStream_Close(in);
    DECREF(in);

    if (parsed == NULL) {
        ERR_ADD_FRAME(Err_get_error());
    }
    return parsed;
}

// Encode `dump` as JSON and write the result to `path` within `folder`.
//
// Returns true on success.  Returns false, with Err_error set and a frame for
// this function added, if encoding fails or the output file cannot be opened.
bool_t
Json_spew_json(Obj *dump, Folder *folder, const CharBuf *path) {
    // Encode first so that no file is created when the data is unencodable.
    CharBuf *encoded = Json_to_json(dump);
    if (encoded == NULL) {
        ERR_ADD_FRAME(Err_get_error());
        return false;
    }

    OutStream *out = Folder_Open_Out(folder, path);
    if (out == NULL) {
        ERR_ADD_FRAME(Err_get_error());
        DECREF(encoded);
        return false;
    }

    size_t byte_count = CB_Get_Size(encoded);
    OutStream_Write_Bytes(out, CB_Get_Ptr8(encoded), byte_count);
    OutStream_Close(out);

    DECREF(out);
    DECREF(encoded);
    return true;
}

// Encode `dump` as a JSON document terminated by a newline.
//
// Unless tolerant mode is on, the top-level object must be a hash or an
// array.  Returns a new CharBuf on success.  On failure returns NULL with
// Err_error set.
CharBuf*
Json_to_json(Obj *dump) {
    // Per the JSON spec, only hashes and arrays are legal at the top level.
    bool_t is_container = dump != NULL
                          && (Obj_Is_A(dump, HASH) || Obj_Is_A(dump, VARRAY));
    if (!is_container && !tolerant) {
        CharBuf *class_name = dump ? Obj_Get_Class_Name(dump) : NULL;
        CharBuf *mess = MAKE_MESS("Illegal top-level object type: %o",
                                  class_name);
        Err_set_error(Err_new(mess));
        return NULL;
    }

    CharBuf *json = CB_new(31);
    if (S_to_json(dump, json, 0)) {
        // Success: finish the document with a trailing newline.
        CB_Cat_Trusted_Str(json, "\n", 1);
        return json;
    }

    // Encoding failed: discard the partial output and report.
    DECREF(json);
    ERR_ADD_FRAME(Err_get_error());
    return NULL;
}

// Set the file-scope `tolerant` flag.  When the flag is true, Json_to_json()
// skips its check that the top-level object is a hash or an array.
void
Json_set_tolerant(bool_t tolerance) {
    tolerant = tolerance;
}

// Deepest nesting level S_to_json() will encode before giving up; guards
// against runaway recursion on self-referencing data structures.
static const int32_t MAX_DEPTH = 200;

// Append the text of `dump` (which is treated as a CharBuf) to `json` as a
// double-quoted JSON string, applying all escapes the JSON spec requires.
static void
S_append_json_string(Obj *dump, CharBuf *json) {
    CB_Cat_Trusted_Str(json, "\"", 1);

    // Consume the source text one code point at a time.
    ZombieCharBuf *remaining = ZCB_WRAP((CharBuf*)dump);
    while (ZCB_Get_Size(remaining)) {
        uint32_t cp = ZCB_Nip_One(remaining);

        if (cp > 127) {
            // Non-ASCII code points -- including those above the BMP -- are
            // passed through untouched, on the assumption that the
            // destination channel can carry arbitrary UTF-8.
            CB_Cat_Char(json, cp);
            continue;
        }

        // Characters which have a two-character escape.  Escaping the
        // forward slash is optional per the spec; we choose not to.
        char short_escape = '\0';
        switch (cp) {
            case '\b': short_escape = 'b';  break;
            case '\t': short_escape = 't';  break;
            case '\n': short_escape = 'n';  break;
            case '\f': short_escape = 'f';  break;
            case '\r': short_escape = 'r';  break;
            case '\\': short_escape = '\\'; break;
            case '\"': short_escape = '"';  break;
            default:                        break;
        }

        char   chunk[7];
        size_t chunk_len;
        if (short_escape != '\0') {
            chunk[0]  = '\\';
            chunk[1]  = short_escape;
            chunk_len = 2;
        }
        else if (cp < 0x20) {
            // Remaining control characters must use the \uXXXX form.
            sprintf(chunk, "\\u%04x", (unsigned)cp);
            chunk_len = 6;
        }
        else {
            // Ordinary ASCII.
            chunk[0]  = (char)cp;
            chunk_len = 1;
        }
        CB_Cat_Trusted_Str(json, chunk, chunk_len);
    }

    CB_Cat_Trusted_Str(json, "\"", 1);
}

// Append one unit of indentation to `json` for each level of `depth`.
static void
S_cat_whitespace(CharBuf *json, int32_t depth) {
    for (int32_t level = depth; level != 0; level--) {
        CB_Cat_Trusted_Str(json, indentation, INDENTATION_LEN);
    }
}

// Recursively append the JSON encoding of `dump` to `json`.
//
// `depth` is the current nesting level: it controls indentation and is
// checked against MAX_DEPTH to stop runaway recursion.  NULL encodes as
// "null"; CFISH_TRUE / CFISH_FALSE as "true" / "false"; CharBufs as strings;
// IntNums and FloatNums as numbers; VArrays as arrays; Hashes as objects with
// their keys sorted.
//
// Returns true on success.  On failure sets Err_error and returns false, in
// which case `json` may contain partial output.
static bool_t
S_to_json(Obj *dump, CharBuf *json, int32_t depth) {
    // Guard against infinite recursion in self-referencing data structures.
    if (depth > MAX_DEPTH) {
        CharBuf *mess = MAKE_MESS("Exceeded max depth of %i32", MAX_DEPTH);
        Err_set_error(Err_new(mess));
        return false;
    }

    if (!dump) {
        CB_Cat_Trusted_Str(json, "null", 4);
    }
    else if (dump == (Obj*)CFISH_TRUE) {
        CB_Cat_Trusted_Str(json, "true", 4);
    }
    else if (dump == (Obj*)CFISH_FALSE) {
        CB_Cat_Trusted_Str(json, "false", 5);
    }
    else if (Obj_Is_A(dump, CHARBUF)) {
        S_append_json_string(dump, json);
    }
    else if (Obj_Is_A(dump, INTNUM)) {
        CB_catf(json, "%i64", Obj_To_I64(dump));
    }
    else if (Obj_Is_A(dump, FLOATNUM)) {
        CB_catf(json, "%f64", Obj_To_F64(dump));
    }
    else if (Obj_Is_A(dump, VARRAY)) {
        VArray *array = (VArray*)dump;
        size_t size = VA_Get_Size(array);
        if (size == 0) {
            // Put empty array on single line.
            CB_Cat_Trusted_Str(json, "[]", 2);
            return true;
        }
        else if (size == 1) {
            Obj *elem = VA_Fetch(array, 0);
            // A NULL element (JSON null) is a scalar.  Test for it before
            // calling Obj_Is_A, which is never handed a NULL object anywhere
            // else in this file.
            if (!elem || !(Obj_Is_A(elem, HASH) || Obj_Is_A(elem, VARRAY))) {
                // Put array containing single scalar element on one line.
                CB_Cat_Trusted_Str(json, "[", 1);
                if (!S_to_json(elem, json, depth + 1)) {
                    return false;
                }
                CB_Cat_Trusted_Str(json, "]", 1);
                return true;
            }
        }
        // Fall back to spreading elements across multiple lines.
        CB_Cat_Trusted_Str(json, "[", 1);
        for (size_t i = 0; i < size; i++) {
            CB_Cat_Trusted_Str(json, "\n", 1);
            S_cat_whitespace(json, depth + 1);
            if (!S_to_json(VA_Fetch(array, i), json, depth + 1)) {
                return false;
            }
            if (i + 1 < size) {
                CB_Cat_Trusted_Str(json, ",", 1);
            }
        }
        CB_Cat_Trusted_Str(json, "\n", 1);
        S_cat_whitespace(json, depth);
        CB_Cat_Trusted_Str(json, "]", 1);
    }
    else if (Obj_Is_A(dump, HASH)) {
        Hash *hash = (Hash*)dump;
        size_t size = Hash_Get_Size(hash);

        // Put empty hash on single line.
        if (size == 0) {
            CB_Cat_Trusted_Str(json, "{}", 2);
            return true;
        }

        // Validate that all keys are strings, then sort so that output is
        // in a stable order.
        VArray *keys = Hash_Keys(hash);
        for (size_t i = 0; i < size; i++) {
            Obj *key = VA_Fetch(keys, i);
            if (!key || !Obj_Is_A(key, CHARBUF)) {
                DECREF(keys);
                CharBuf *key_class = key ? Obj_Get_Class_Name(key) : NULL;
                CharBuf *mess = MAKE_MESS("Illegal key type: %o", key_class);
                Err_set_error(Err_new(mess));
                return false;
            }
        }
        VA_Sort(keys, NULL, NULL);

        // Spread pairs across multiple lines.
        CB_Cat_Trusted_Str(json, "{", 1);
        for (size_t i = 0; i < size; i++) {
            Obj *key = VA_Fetch(keys, i);
            CB_Cat_Trusted_Str(json, "\n", 1);
            S_cat_whitespace(json, depth + 1);
            S_append_json_string(key, json);
            CB_Cat_Trusted_Str(json, ": ", 2);
            if (!S_to_json(Hash_Fetch(hash, key), json, depth + 1)) {
                DECREF(keys);
                return false;
            }
            if (i + 1 < size) {
                CB_Cat_Trusted_Str(json, ",", 1);
            }
        }
        CB_Cat_Trusted_Str(json, "\n", 1);
        S_cat_whitespace(json, depth);
        CB_Cat_Trusted_Str(json, "}", 1);

        DECREF(keys);
    }
    // NOTE(review): an object of any other class falls through to here, so
    // nothing is appended for it and success is still reported.

    return true;
}

// Parse `size` bytes of UTF-8 JSON starting at `text`.
//
// Creates a Lemon parser for the duration of the call, delegates the real
// work to S_do_parse_json(), and frees the parser again.  Returns the decoded
// object, or NULL with Err_error set on failure.
static Obj*
S_parse_json(char *text, size_t size) {
    void *parser = LucyParseJsonAlloc(lucy_Memory_wrapped_malloc);
    if (!parser) {
        CharBuf *mess = MAKE_MESS("Failed to allocate JSON parser");
        Err_set_error(Err_new(mess));
        return NULL;
    }

    Obj *parsed = S_do_parse_json(parser, text, size);

    LucyParseJsonFree(parser, lucy_Memory_wrapped_free);
    return parsed;
}

// Tokenize `len` bytes of JSON starting at `json` and feed the tokens, one at
// a time, to the Lemon-generated parser `json_parser`.
//
// Returns state.result once all input has been consumed and the parser has
// been flushed.  Returns NULL with Err_error set if a string or number fails
// to parse, or if the parser flags an error.
static Obj*
S_do_parse_json(void *json_parser, char *json, size_t len) {
    // Shared with the parser, which reports back through these fields.
    lucy_JsonParserState state;
    state.result = NULL;
    state.errors = false;

    char *text = json;
    char *const end = text + len;
    while (text < end) {
        // token_type stays -1 when the character does not start any known
        // token (including a keyword check which fails).
        int  token_type = -1;
        Obj *value      = NULL;
        // Remember where this token started, for error reporting.
        char *const save = text;
        switch (*text) {
            case ' ': case '\n': case '\r': case '\t':
                // Skip insignificant whitespace, which the JSON RFC defines
                // as only four ASCII characters.
                text++;
                continue;
            case '[':
                token_type = LUCY_JSON_TOKENTYPE_LEFT_SQUARE_BRACKET;
                text++;
                break;
            case ']':
                token_type = LUCY_JSON_TOKENTYPE_RIGHT_SQUARE_BRACKET;
                text++;
                break;
            case '{':
                token_type = LUCY_JSON_TOKENTYPE_LEFT_CURLY_BRACKET;
                text++;
                break;
            case '}':
                token_type = LUCY_JSON_TOKENTYPE_RIGHT_CURLY_BRACKET;
                text++;
                break;
            case ':':
                token_type = LUCY_JSON_TOKENTYPE_COLON;
                text++;
                break;
            case ',':
                token_type = LUCY_JSON_TOKENTYPE_COMMA;
                text++;
                break;
            case '"':
                // S_parse_string advances `text` past the closing quote.
                value = (Obj*)S_parse_string(&text, end);
                if (value) {
                    token_type = LUCY_JSON_TOKENTYPE_STRING;
                }
                else {
                    // Clear out parser and return.
                    LucyParseJson(json_parser, 0, NULL, &state);
                    ERR_ADD_FRAME(Err_get_error());
                    return NULL;
                }
                break;
            case 'n':
                // The value for null stays NULL.
                if (SI_check_keyword(text, end, "null", 4)) {
                    token_type = LUCY_JSON_TOKENTYPE_NULL;
                    text += 4;
                }
                break;
            case 't':
                if (SI_check_keyword(text, end, "true", 4)) {
                    token_type = LUCY_JSON_TOKENTYPE_TRUE;
                    value = (Obj*)CFISH_TRUE;
                    text += 4;
                }
                break;
            case 'f':
                if (SI_check_keyword(text, end, "false", 5)) {
                    token_type = LUCY_JSON_TOKENTYPE_FALSE;
                    value = (Obj*)CFISH_FALSE;
                    text += 5;
                }
                break;
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9':
            case '-': { // Note no '+', as JSON spec doesn't allow it.
                    // S_parse_number advances `text` past the number.
                    value = (Obj*)S_parse_number(&text, end);
                    if (value) {
                        token_type = LUCY_JSON_TOKENTYPE_NUMBER;
                    }
                    else {
                        // Clear out parser and return.
                        LucyParseJson(json_parser, 0, NULL, &state);
                        ERR_ADD_FRAME(Err_get_error());
                        return NULL;
                    }
                }
                break;
        }
        // For an unrecognized character, token_type is still -1 here and
        // `text` has not moved.  NOTE(review): this relies on the parser
        // setting state.errors for such a token; otherwise the loop would
        // not advance -- confirm against the generated parser.
        LucyParseJson(json_parser, token_type, value, &state);
        if (state.errors) {
            SET_ERROR(CB_newf("JSON syntax error"), save, end);
            return NULL;
        }
    }

    // Finish up.  Passing a token type of 0 presumably signals end of input
    // to the parser.
    LucyParseJson(json_parser, 0, NULL, &state);
    if (state.errors) {
        SET_ERROR(CB_newf("JSON syntax error"), json, end);
        return NULL;
    }
    return state.result;
}

// Parse a JSON number beginning at `*json_ptr`.
//
// The text is not NUL-terminated, so before handing it to strtod() we verify
// that a character which cannot be part of a number appears somewhere before
// `limit`; strtod() is then guaranteed to stop within the buffer.
//
// On success returns a new Float64 and advances `*json_ptr` to just past the
// number.  On failure sets Err_error and returns NULL, leaving `*json_ptr`
// untouched.
static Float64*
S_parse_number(char **json_ptr, char *const limit) {
    char *const top = *json_ptr;
    bool_t terminated = false;

    // Stop at the first terminator.  (A `break` inside the switch only leaves
    // the switch, so the loop condition has to do the stopping; scanning on
    // to `limit` for every number would make parsing quadratic.)
    for (char *scan = top; scan < limit && !terminated; scan++) {
        switch (*scan) {
                // Only these characters may legally follow a number in
                // Javascript.  If we don't find one before the end of the
                // JSON, it's a parse error.
            case ' ': case '\n': case '\r': case '\t':
            case ']':
            case '}':
            case ':':
            case ',':
                terminated = true;
                break;
            default:
                break;
        }
    }

    Float64 *result = NULL;
    if (terminated) {
        char *terminus;
        double number = strtod(top, &terminus);
        // strtod() leaves terminus == top when nothing could be converted.
        if (terminus != top) {
            *json_ptr = terminus;
            result = Float64_new(number);
        }
    }
    if (!result) {
        SET_ERROR(CB_newf("JSON syntax error"), top, limit);
    }
    return result;
}

// Parse a JSON string.  `*json_ptr` must point at the opening double quote;
// `limit` points just past the end of the available text.
//
// On success returns a new CharBuf holding the decoded string and advances
// `*json_ptr` to just after the closing double quote.  On failure sets
// Err_error and returns NULL.
static CharBuf*
S_parse_string(char **json_ptr, char *const limit) {
    // Find terminating double quote, determine whether there are any escapes.
    char *const top = *json_ptr + 1;
    char *end = NULL;
    bool_t saw_backslash = false;
    for (char *text = top; text < limit; text++) {
        if (*text == '"') {
            end = text;
            break;
        }
        if (*text == '\\') {
            saw_backslash = true;
            // Step over the escape so that an escaped quote is not taken for
            // the terminator: "\uXXXX" occupies five characters after the
            // backslash, every other escape one.
            const size_t remaining = (size_t)(limit - text);
            const size_t skip = (remaining > 1 && text[1] == 'u') ? 5 : 1;
            if (skip >= remaining) {
                // The escape runs off the end of the text, so the string
                // cannot be terminated.  Bail out rather than forming a
                // pointer beyond `limit`.
                break;
            }
            text += skip;
        }
    }
    if (!end) {
        SET_ERROR(CB_newf("Unterminated string"), *json_ptr, limit);
        return NULL;
    }

    // Advance the text buffer to just beyond the closing quote.
    *json_ptr = end + 1;

    if (saw_backslash) {
        return S_unescape_text(top, end);
    }
    else {
        // Optimize common case where there are no escapes.
        size_t len = end - top;
        if (!StrHelp_utf8_valid(top, len)) {
            CharBuf *mess = MAKE_MESS("Bad UTF-8 in JSON");
            Err_set_error(Err_new(mess));
            return NULL;
        }
        return CB_new_from_trusted_utf8(top, len);
    }
}

static CharBuf*
S_unescape_text(char *const top, char *const end) {
    // The unescaped string will never be longer than the escaped string
    // because only a \u escape can theoretically be too long and
    // StrHelp_encode_utf8_char guards against sequences over 4 bytes.
    // Therefore we can allocate once and not worry about reallocating.
    size_t cap = end - top + 1;
    char *target_buf = (char*)MALLOCATE(cap);
    size_t target_size = 0;
    for (char *text = top; text < end; text++) {
        if (*text != '\\') {
            target_buf[target_size++] = *text;
        }
        else {
            // Process escape.
            text++;
            switch (*text) {
                case '"':
                    target_buf[target_size++] = '"';
                    break;
                case '\\':
                    target_buf[target_size++] = '\\';
                    break;
                case '/':
                    target_buf[target_size++] = '/';
                    break;
                case 'b':
                    target_buf[target_size++] = '\b';
                    break;
                case 'f':
                    target_buf[target_size++] = '\f';
                    break;
                case 'n':
                    target_buf[target_size++] = '\n';
                    break;
                case 'r':
                    target_buf[target_size++] = '\r';
                    break;
                case 't':
                    target_buf[target_size++] = '\t';
                    break;
                case 'u': {
                        // Copy into a temp buffer because strtol will overrun
                        // into adjacent text data for e.g. "\uAAAA1".
                        char temp[5] = { 0, 0, 0, 0, 0 };
                        memcpy(temp, text + 1, 4);
                        text += 4;
                        char *num_end;
                        long code_point = strtol(temp, &num_end, 16);
                        char *temp_ptr = temp;
                        if (num_end != temp_ptr + 4 || code_point < 0) {
                            FREEMEM(target_buf);
                            SET_ERROR(CB_newf("Invalid \\u escape"), text - 5, end);
                            return NULL;
                        }
                        if (code_point >= 0xD800 && code_point <= 0xDFFF) {
                            FREEMEM(target_buf);
                            SET_ERROR(CB_newf("Surrogate pairs not supported"),
                                      text - 5, end);
                            return NULL;
                        }
                        target_size += StrHelp_encode_utf8_char((uint32_t)code_point,
                                                                target_buf + target_size);
                    }
                    break;
                default:
                    FREEMEM(target_buf);
                    SET_ERROR(CB_newf("Illegal escape"), text - 1, end);
                    return NULL;
            }
        }
    }

    // NULL-terminate, sanity check, then return the escaped string.
    target_buf[target_size] = '\0';
    if (!StrHelp_utf8_valid(target_buf, target_size)) {
        FREEMEM(target_buf);
        CharBuf *mess = MAKE_MESS("Bad UTF-8 in JSON");
        Err_set_error(Err_new(mess));
        return NULL;
    }
    return CB_new_steal_from_trusted_str(target_buf, target_size, cap);
}

// Report whether the text at `json` begins with `keyword` (of length `len`)
// followed by a word boundary.  `end` points just past the available text.
//
// A boundary is either the end of the text or a character which is neither
// alphanumeric nor an underscore, so "null" matches but the first four
// letters of "nullify" do not.
static INLINE bool_t
SI_check_keyword(char *json, char* end, const char *keyword, size_t len) {
    if (end < json) {
        return false;
    }
    const size_t remaining = (size_t)(end - json);
    if (remaining < len || strncmp(json, keyword, len) != 0) {
        return false;
    }
    if (remaining == len) {
        // The keyword runs right up to the end of the text, which counts as
        // a word boundary; there is no following character to inspect.
        return true;
    }
    // <ctype.h> functions require a value representable as unsigned char;
    // passing a negative plain char is undefined behavior.
    const unsigned char next = (unsigned char)json[len];
    if (next == '_' || isalnum(next)) {
        return false;
    }
    return true;
}

// Finish off the error message `mess` and install it as Err_error.
//
// Location information (`func` if available, this source file, `line`) is
// appended, followed by a JSON-escaped excerpt of the text between `json` and
// `limit`.  Excerpts longer than 32 bytes are cut back to a UTF-8 character
// boundary.
static void
S_set_error(CharBuf *mess, char *json, char *limit, int line,
            const char *func) {
    if (func == NULL) {
        CB_catf(mess, " at %s line %i32 near ", __FILE__, (int32_t)line);
    }
    else {
        CB_catf(mess, " at %s %s line %i32 near ", func, __FILE__,
                (int32_t)line);
    }

    // Work out how much of the offending text to quote.
    int64_t excerpt_len = limit - json;
    if (excerpt_len > 32) {
        const char *cutoff = StrHelp_back_utf8_char(json + 32, json);
        excerpt_len = cutoff - json;
    }

    // Append the excerpt in escaped form.
    ZombieCharBuf *excerpt = ZCB_WRAP_STR(json, excerpt_len);
    S_append_json_string((Obj*)excerpt, mess);

    // The new error object is handed `mess`.
    Err_set_error(Err_new(mess));
}