The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <ctype.h>
#include <stdio.h>

#include "Lucy/Util/ToolSet.h"

#include "Lucy/Util/Json.h"

#include "Clownfish/Boolean.h"
#include "Clownfish/CharBuf.h"
#include "Clownfish/Num.h"
#include "Clownfish/Util/Memory.h"
#include "Lucy/Store/Folder.h"
#include "Lucy/Store/InStream.h"
#include "Lucy/Store/OutStream.h"
#include "Lucy/Util/Json/JsonParser.h"
#include "Lucy/Util/StringHelper.h"

/* Routines generated by Lemon. */
void*
LucyParseJsonAlloc(void * (*allocate)(size_t));
void
LucyParseJson(void *json_parser, int token_type, Obj *value,
              lucy_JsonParserState *state);
void
LucyParseJsonFree(void *json_parser, void(*freemem)(void*));
void
LucyParseJsonTrace(FILE *trace, char *line_prefix);

// Encode JSON for supplied "dump".  On failure, sets the global error object
// and returns false.
static bool
S_to_json(Obj *dump, CharBuf *buf, int32_t depth);

// Parse JSON from raw UTF-8 in memory.
static Obj*
S_parse_json(const char *text, size_t size);
static Obj*
S_do_parse_json(void *json_parser, const char *json, size_t len);

// Parse a JSON number.  Advance the text buffer just past the number.
static Float*
S_parse_number(const char **json_ptr, const char *limit);

// Parse a JSON string.  Advance the text buffer from pointing at the opening
// double quote to pointing just after the closing double quote.
static String*
S_parse_string(const char **json_ptr, const char *limit);

// Unescape JSON string text.  Expects pointers bookending the text data (i.e.
// pointing just after the opening double quote and directly at the closing
// double quote), and assumes that escapes have already been sanity checked
// for length.
static String*
S_unescape_text(const char *top, const char *end);

// Check that the supplied text begins with the specified keyword, which must
// then end on a word boundary (i.e. match "null" but not the first four
// letters of "nullify").
static CFISH_INLINE bool
SI_check_keyword(const char *json, const char* end, const char *keyword,
                 size_t len);

// Make it possible to be loosen constraints during testing.
static bool tolerant = false;

// Indentation: two spaces per level.
static const char indentation[]     = "  ";
static const size_t INDENTATION_LEN = sizeof(indentation) - 1;

// Append indentation spaces x depth.
static void
S_cat_whitespace(CharBuf *buf, int32_t depth);

// Set the global error object, appending escaped JSON in the vicinity of the
// error.
static void
S_set_error(const char *mess, const char *json, const char *limit, int line,
            const char *func);
#define SET_ERROR(_mess, _json, _end) \
    S_set_error(_mess, _json, _end, __LINE__, CFISH_ERR_FUNC_MACRO)

Obj*
Json_from_json(String *json) {
    Obj *dump = S_parse_json(Str_Get_Ptr8(json), Str_Get_Size(json));
    if (!dump) {
        ERR_ADD_FRAME(Err_get_error());
    }
    return dump;
}

Obj*
Json_slurp_json(Folder *folder, String *path) {
    InStream *instream = Folder_Open_In(folder, path);
    if (!instream) {
        ERR_ADD_FRAME(Err_get_error());
        return NULL;
    }
    size_t len = (size_t)InStream_Length(instream);
    const char *buf = InStream_Buf(instream, len);
    Obj *dump = S_parse_json(buf, len);
    InStream_Close(instream);
    DECREF(instream);
    if (!dump) {
        ERR_ADD_FRAME(Err_get_error());
    }
    return dump;
}

bool
Json_spew_json(Obj *dump, Folder *folder, String *path) {
    String *json = Json_to_json(dump);
    if (!json) {
        ERR_ADD_FRAME(Err_get_error());
        return false;
    }
    OutStream *outstream = Folder_Open_Out(folder, path);
    if (!outstream) {
        ERR_ADD_FRAME(Err_get_error());
        DECREF(json);
        return false;
    }
    size_t size = Str_Get_Size(json);
    OutStream_Write_Bytes(outstream, Str_Get_Ptr8(json), size);
    OutStream_Close(outstream);
    DECREF(outstream);
    DECREF(json);
    return true;
}

String*
Json_to_json(Obj *dump) {
    // Validate object type, only allowing hashes and arrays per JSON spec.
    if (!dump || !(Obj_is_a(dump, HASH) || Obj_is_a(dump, VECTOR))) {
        if (!tolerant) {
            String *class_name = dump ? Obj_get_class_name(dump) : NULL;
            String *mess = MAKE_MESS("Illegal top-level object type: %o",
                                     class_name);
            Err_set_error(Err_new(mess));
            return NULL;
        }
    }

    // Encode.
    CharBuf *buf = CB_new(31);
    String *json = NULL;
    if (!S_to_json(dump, buf, 0)) {
        ERR_ADD_FRAME(Err_get_error());
    }
    else {
        // Append newline.
        CB_Cat_Trusted_Utf8(buf, "\n", 1);
        json = CB_Yield_String(buf);
    }

    DECREF(buf);
    return json;
}

void
Json_set_tolerant(bool tolerance) {
    tolerant = tolerance;
}

static const int32_t MAX_DEPTH = 200;

static void
S_append_json_string(String *dump, CharBuf *buf) {
    // Append opening quote.
    CB_Cat_Trusted_Utf8(buf, "\"", 1);

    // Process string data.
    StringIterator *iter = Str_Top(dump);
    int32_t code_point;
    while (STR_OOB != (code_point = StrIter_Next(iter))) {
        if (code_point > 127) {
            // There is no need to escape any high characters, including those
            // above the BMP, as we assume that the destination channel can
            // handle arbitrary UTF-8 data.
            CB_Cat_Char(buf, code_point);
        }
        else {
            char buffer[7];
            size_t len;
            switch (code_point & 127) {
                    // Perform all mandatory escapes enumerated in the JSON spec.
                    // Note that the spec makes escaping forward slash optional;
                    // we choose not to.
                case 0x00: case 0x01: case 0x02: case 0x03:
                case 0x04: case 0x05: case 0x06: case 0x07:
                case 0x0b: case 0x0e: case 0x0f:
                case 0x10: case 0x11: case 0x12: case 0x13:
                case 0x14: case 0x15: case 0x16: case 0x17:
                case 0x18: case 0x19: case 0x1a: case 0x1b:
                case 0x1c: case 0x1d: case 0x1e: case 0x1f: {
                        sprintf(buffer, "\\u%04x", (unsigned)code_point);
                        len = 6;
                        break;
                    }
                case '\b':
                    memcpy(buffer, "\\b", 2);
                    len = 2;
                    break;
                case '\t':
                    memcpy(buffer, "\\t", 2);
                    len = 2;
                    break;
                case '\n':
                    memcpy(buffer, "\\n", 2);
                    len = 2;
                    break;
                case '\f':
                    memcpy(buffer, "\\f", 2);
                    len = 2;
                    break;
                case '\r':
                    memcpy(buffer, "\\r", 2);
                    len = 2;
                    break;
                case '\\':
                    memcpy(buffer, "\\\\", 2);
                    len = 2;
                    break;
                case '\"':
                    memcpy(buffer, "\\\"", 2);
                    len = 2;
                    break;

                    // Ordinary printable ASCII.
                default:
                    buffer[0] = (char)code_point;
                    len = 1;
            }
            CB_Cat_Trusted_Utf8(buf, buffer, len);
        }
    }

    // Append closing quote.
    CB_Cat_Trusted_Utf8(buf, "\"", 1);

    DECREF(iter);
}

static void
S_cat_whitespace(CharBuf *buf, int32_t depth) {
    while (depth--) {
        CB_Cat_Trusted_Utf8(buf, indentation, INDENTATION_LEN);
    }
}

static bool
S_to_json(Obj *dump, CharBuf *buf, int32_t depth) {
    // Guard against infinite recursion in self-referencing data structures.
    if (depth > MAX_DEPTH) {
        String *mess = MAKE_MESS("Exceeded max depth of %i32", MAX_DEPTH);
        Err_set_error(Err_new(mess));
        return false;
    }

    if (!dump) {
        CB_Cat_Trusted_Utf8(buf, "null", 4);
    }
    else if (dump == (Obj*)CFISH_TRUE) {
        CB_Cat_Trusted_Utf8(buf, "true", 4);
    }
    else if (dump == (Obj*)CFISH_FALSE) {
        CB_Cat_Trusted_Utf8(buf, "false", 5);
    }
    else if (Obj_is_a(dump, STRING)) {
        S_append_json_string((String*)dump, buf);
    }
    else if (Obj_is_a(dump, INTEGER)) {
        CB_catf(buf, "%i64", Int_Get_Value((Integer*)dump));
    }
    else if (Obj_is_a(dump, FLOAT)) {
        CB_catf(buf, "%f64", Float_Get_Value((Float*)dump));
    }
    else if (Obj_is_a(dump, VECTOR)) {
        Vector *array = (Vector*)dump;
        size_t size = Vec_Get_Size(array);
        if (size == 0) {
            // Put empty array on single line.
            CB_Cat_Trusted_Utf8(buf, "[]", 2);
            return true;
        }
        else if (size == 1) {
            Obj *elem = Vec_Fetch(array, 0);
            if (!(Obj_is_a(elem, HASH) || Obj_is_a(elem, VECTOR))) {
                // Put array containing single scalar element on one line.
                CB_Cat_Trusted_Utf8(buf, "[", 1);
                if (!S_to_json(elem, buf, depth + 1)) {
                    return false;
                }
                CB_Cat_Trusted_Utf8(buf, "]", 1);
                return true;
            }
        }
        // Fall back to spreading elements across multiple lines.
        CB_Cat_Trusted_Utf8(buf, "[", 1);
        for (size_t i = 0; i < size; i++) {
            CB_Cat_Trusted_Utf8(buf, "\n", 1);
            S_cat_whitespace(buf, depth + 1);
            if (!S_to_json(Vec_Fetch(array, i), buf, depth + 1)) {
                return false;
            }
            if (i + 1 < size) {
                CB_Cat_Trusted_Utf8(buf, ",", 1);
            }
        }
        CB_Cat_Trusted_Utf8(buf, "\n", 1);
        S_cat_whitespace(buf, depth);
        CB_Cat_Trusted_Utf8(buf, "]", 1);
    }
    else if (Obj_is_a(dump, HASH)) {
        Hash *hash = (Hash*)dump;
        size_t size = Hash_Get_Size(hash);

        // Put empty hash on single line.
        if (size == 0) {
            CB_Cat_Trusted_Utf8(buf, "{}", 2);
            return true;
        }

        // Validate that all keys are strings, then sort.
        Vector *keys = Hash_Keys(hash);
        for (size_t i = 0; i < size; i++) {
            Obj *key = Vec_Fetch(keys, i);
            if (!key || !Obj_is_a(key, STRING)) {
                DECREF(keys);
                String *key_class = key ? Obj_get_class_name(key) : NULL;
                String *mess = MAKE_MESS("Illegal key type: %o", key_class);
                Err_set_error(Err_new(mess));
                return false;
            }
        }
        Vec_Sort(keys);

        // Spread pairs across multiple lines.
        CB_Cat_Trusted_Utf8(buf, "{", 1);
        for (size_t i = 0; i < size; i++) {
            String *key = (String*)Vec_Fetch(keys, i);
            CB_Cat_Trusted_Utf8(buf, "\n", 1);
            S_cat_whitespace(buf, depth + 1);
            S_append_json_string(key, buf);
            CB_Cat_Trusted_Utf8(buf, ": ", 2);
            if (!S_to_json(Hash_Fetch(hash, key), buf, depth + 1)) {
                DECREF(keys);
                return false;
            }
            if (i + 1 < size) {
                CB_Cat_Trusted_Utf8(buf, ",", 1);
            }
        }
        CB_Cat_Trusted_Utf8(buf, "\n", 1);
        S_cat_whitespace(buf, depth);
        CB_Cat_Trusted_Utf8(buf, "}", 1);

        DECREF(keys);
    }

    return true;
}

static Obj*
S_parse_json(const char *text, size_t size) {
    void *json_parser = LucyParseJsonAlloc(Memory_wrapped_malloc);
    if (json_parser == NULL) {
        String *mess = MAKE_MESS("Failed to allocate JSON parser");
        Err_set_error(Err_new(mess));
        return NULL;
    }
    Obj *dump = S_do_parse_json(json_parser, text, size);
    LucyParseJsonFree(json_parser, Memory_wrapped_free);
    return dump;
}

static Obj*
S_do_parse_json(void *json_parser, const char *json, size_t len) {
    lucy_JsonParserState state;
    state.result = NULL;
    state.errors = false;

    const char *text = json;
    const char *const end = text + len;
    while (text < end) {
        int  token_type = -1;
        Obj *value      = NULL;
        const char *const save = text;
        switch (*text) {
            case ' ': case '\n': case '\r': case '\t':
                // Skip insignificant whitespace, which the JSON RFC defines
                // as only four ASCII characters.
                text++;
                continue;
            case '[':
                token_type = LUCY_JSON_TOKENTYPE_LEFT_SQUARE_BRACKET;
                text++;
                break;
            case ']':
                token_type = LUCY_JSON_TOKENTYPE_RIGHT_SQUARE_BRACKET;
                text++;
                break;
            case '{':
                token_type = LUCY_JSON_TOKENTYPE_LEFT_CURLY_BRACKET;
                text++;
                break;
            case '}':
                token_type = LUCY_JSON_TOKENTYPE_RIGHT_CURLY_BRACKET;
                text++;
                break;
            case ':':
                token_type = LUCY_JSON_TOKENTYPE_COLON;
                text++;
                break;
            case ',':
                token_type = LUCY_JSON_TOKENTYPE_COMMA;
                text++;
                break;
            case '"':
                value = (Obj*)S_parse_string(&text, end);
                if (value) {
                    token_type = LUCY_JSON_TOKENTYPE_STRING;
                }
                else {
                    // Clear out parser and return.
                    LucyParseJson(json_parser, 0, NULL, &state);
                    ERR_ADD_FRAME(Err_get_error());
                    return NULL;
                }
                break;
            case 'n':
                if (SI_check_keyword(text, end, "null", 4)) {
                    token_type = LUCY_JSON_TOKENTYPE_NULL;
                    text += 4;
                }
                break;
            case 't':
                if (SI_check_keyword(text, end, "true", 4)) {
                    token_type = LUCY_JSON_TOKENTYPE_TRUE;
                    value = (Obj*)CFISH_TRUE;
                    text += 4;
                }
                break;
            case 'f':
                if (SI_check_keyword(text, end, "false", 5)) {
                    token_type = LUCY_JSON_TOKENTYPE_FALSE;
                    value = (Obj*)CFISH_FALSE;
                    text += 5;
                }
                break;
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9':
            case '-': { // Note no '+', as JSON spec doesn't allow it.
                    value = (Obj*)S_parse_number(&text, end);
                    if (value) {
                        token_type = LUCY_JSON_TOKENTYPE_NUMBER;
                    }
                    else {
                        // Clear out parser and return.
                        LucyParseJson(json_parser, 0, NULL, &state);
                        ERR_ADD_FRAME(Err_get_error());
                        return NULL;
                    }
                }
                break;
        }
        if (token_type < 0) {
            // Clear out parser and return.
            LucyParseJson(json_parser, 0, NULL, &state);
            SET_ERROR("JSON syntax error", save, end);
            return NULL;
        }
        LucyParseJson(json_parser, token_type, value, &state);
        if (state.errors) {
            SET_ERROR("JSON syntax error", save, end);
            return NULL;
        }
    }

    // Finish up.
    LucyParseJson(json_parser, 0, NULL, &state);
    if (state.errors) {
        SET_ERROR("JSON syntax error", json, end);
        return NULL;
    }
    return state.result;
}

static Float*
S_parse_number(const char **json_ptr, const char *limit) {
    const char *top = *json_ptr;
    const char *end = top;
    bool terminated = false;

    // We can't assume NULL termination for the JSON string, so we need to
    // ensure that strtod() cannot overrun and access invalid memory.
    for (; end < limit; end++) {
        switch (*end) {
                // Only these characters may legally follow a number in
                // Javascript.  If we don't find one before the end of the JSON,
                // it's a parse error.
            case ' ': case '\n': case '\r': case '\t':
            case ']':
            case '}':
            case ':':
            case ',':
                terminated = true;
                break;
        }
    }

    Float *result = NULL;
    if (terminated) {
        char *terminus;
        double number = strtod(top, &terminus);
        if (terminus != top) {
            *json_ptr = terminus;
            result = Float_new(number);
        }
    }
    if (!result) {
        SET_ERROR("JSON syntax error", top, limit);
    }
    return result;
}

static String*
S_parse_string(const char **json_ptr, const char *limit) {
    // Find terminating double quote, determine whether there are any escapes.
    const char *top = *json_ptr + 1;
    const char *end = NULL;
    bool saw_backslash = false;
    for (const char *text = top; text < limit; text++) {
        if (*text == '"') {
            end = text;
            break;
        }
        else if (*text == '\\') {
            saw_backslash = true;
            if (text + 1 < limit && text[1] == 'u') {
                text += 5;
            }
            else {
                text += 1;
            }
        }
    }
    if (!end) {
        SET_ERROR("Unterminated string", *json_ptr, limit);
        return NULL;
    }

    // Advance the text buffer to just beyond the closing quote.
    *json_ptr = end + 1;

    if (saw_backslash) {
        return S_unescape_text(top, end);
    }
    else {
        // Optimize common case where there are no escapes.
        size_t len = (size_t)(end - top);
        if (!Str_utf8_valid(top, len)) {
            String *mess = MAKE_MESS("Bad UTF-8 in JSON");
            Err_set_error(Err_new(mess));
            return NULL;
        }
        return Str_new_from_trusted_utf8(top, len);
    }
}

static String*
S_unescape_text(const char *top, const char *end) {
    size_t cap = (size_t)(end - top) + 1;
    CharBuf *cb = CB_new(cap);
    const char *chunk = top;
    const char *text  = top;

    while (text < end) {
        if (*text != '\\') {
            text++;
        }
        else {
            if (!Str_utf8_valid(chunk, (size_t)(text - chunk))) {
                DECREF(cb);
                String *mess = MAKE_MESS("Bad UTF-8 in JSON");
                Err_set_error(Err_new(mess));
                return NULL;
            }
            CB_Cat_Trusted_Utf8(cb, chunk, (size_t)(text - chunk));

            // Process escape.
            text++;
            switch (*text) {
                case '"':
                case '\\':
                case '/':
                    CB_Cat_Trusted_Utf8(cb, text, 1);
                    break;
                case 'b':
                    CB_Cat_Trusted_Utf8(cb, "\b", 1);
                    break;
                case 'f':
                    CB_Cat_Trusted_Utf8(cb, "\f", 1);
                    break;
                case 'n':
                    CB_Cat_Trusted_Utf8(cb, "\n", 1);
                    break;
                case 'r':
                    CB_Cat_Trusted_Utf8(cb, "\r", 1);
                    break;
                case 't':
                    CB_Cat_Trusted_Utf8(cb, "\t", 1);
                    break;
                case 'u': {
                        int32_t code_point = 0;
                        for (int i = 1; i < 5; i++) {
                            char c = text[i];
                            int32_t digit = 0;
                            if (c >= '0' && c <= '9') {
                                digit = c - '0';
                            }
                            else if (c >= 'a' && c <= 'f') {
                                digit = c - 'a' + 10;
                            }
                            else if (c >= 'A' && c <= 'F') {
                                digit = c - 'A' + 10;
                            }
                            else {
                                DECREF(cb);
                                SET_ERROR("Invalid \\u escape", text - 1, end);
                                return NULL;
                            }
                            code_point = code_point * 16 + digit;
                        }
                        if (code_point >= 0xD800 && code_point <= 0xDFFF) {
                            DECREF(cb);
                            SET_ERROR("Surrogate pairs not supported",
                                      text - 1, end);
                            return NULL;
                        }
                        CB_Cat_Char(cb, code_point);
                        text += 4;
                    }
                    break;
                default:
                    DECREF(cb);
                    SET_ERROR("Illegal escape", text - 1, end);
                    return NULL;
            }

            text++;
            chunk = text;
        }
    }

    if (!Str_utf8_valid(chunk, (size_t)(text - chunk))) {
        DECREF(cb);
        String *mess = MAKE_MESS("Bad UTF-8 in JSON");
        Err_set_error(Err_new(mess));
        return NULL;
    }
    CB_Cat_Trusted_Utf8(cb, chunk, (size_t)(text - chunk));

    String *retval = CB_Yield_String(cb);
    DECREF(cb);
    return retval;
}

static CFISH_INLINE bool
SI_check_keyword(const char *json, const char* end, const char *keyword,
                 size_t len) {
    if ((size_t)(end - json) > len
        && strncmp(json, keyword, len) == 0
        && json[len] != '_'
        && !isalnum(json[len])
       ) {
        return true;
    }
    return false;
}

static void
S_set_error(const char *mess, const char *json, const char *limit, int line,
            const char *func) {
    CharBuf *buf = CB_new(0);
    CB_Cat_Utf8(buf, mess, strlen(mess));

    if (func) {
        CB_catf(buf, " at %s %s line %i32 near ", func, __FILE__,
                 (int32_t)line);
    }
    else {
        CB_catf(buf, " at %s line %i32 near ", __FILE__, (int32_t)line);
    }

    // Append escaped text.
    int64_t len = limit - json;
    if (len > 32) {
        const char *end = StrHelp_back_utf8_char(json + 32, json);
        len = end - json;
    }
    else if (len < 0) {
        len = 0; // sanity check
    }
    String *snippet = SSTR_WRAP_UTF8(json, (size_t)len);
    S_append_json_string(snippet, buf);

    String *full_mess = CB_Yield_String(buf);
    DECREF(buf);

    // Set global error object.
    Err_set_error(Err_new(full_mess));
}

int64_t
Json_obj_to_i64(Obj *obj) {
    int64_t retval = 0;

    if (!obj) {
        THROW(ERR, "Can't extract integer from NULL");
    }
    else if (Obj_is_a(obj, INTEGER)) {
        retval = Int_Get_Value((Integer*)obj);
    }
    else if (Obj_is_a(obj, FLOAT)) {
        retval = Float_To_I64((Float*)obj);
    }
    else if (Obj_is_a(obj, STRING)) {
        retval = Str_To_I64((String*)obj);
    }
    else {
        THROW(ERR, "Can't extract integer from object of type %o",
              Obj_get_class_name(obj));
    }

    return retval;
}

double
Json_obj_to_f64(Obj *obj) {
    double retval = 0;

    if (!obj) {
        THROW(ERR, "Can't extract float from NULL");
    }
    else if (Obj_is_a(obj, FLOAT)) {
        retval = Float_Get_Value((Float*)obj);
    }
    else if (Obj_is_a(obj, INTEGER)) {
        retval = Int_To_F64((Integer*)obj);
    }
    else if (Obj_is_a(obj, STRING)) {
        retval = Str_To_F64((String*)obj);
    }
    else {
        THROW(ERR, "Can't extract float from object of type %o",
              Obj_get_class_name(obj));
    }

    return retval;
}

bool
Json_obj_to_bool(Obj *obj) {
    bool retval = false;

    if (!obj) {
        THROW(ERR, "Can't extract bool from NULL");
    }
    else if (Obj_is_a(obj, BOOLEAN)) {
        retval = Bool_Get_Value((Boolean*)obj);
    }
    else if (Obj_is_a(obj, INTEGER)) {
        retval = Int_Get_Value((Integer*)obj) != 0;
    }
    else if (Obj_is_a(obj, FLOAT)) {
        retval = Float_Get_Value((Float*)obj) != 0.0;
    }
    else if (Obj_is_a(obj, STRING)) {
        retval = (Str_To_I64((String*)obj) != 0);
    }
    else {
        THROW(ERR, "Can't extract bool from object of type %o",
              Obj_get_class_name(obj));
    }

    return retval;
}