The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define C_LUCY_STRINGHELPER
#include <string.h>

#define LUCY_USE_SHORT_NAMES
#define CHY_USE_SHORT_NAMES

#include "Lucy/Util/StringHelper.h"
#include "Lucy/Object/Err.h"
#include "Lucy/Util/Memory.h"

const uint8_t lucy_StrHelp_UTF8_COUNT[] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0,
};

int32_t
StrHelp_overlap(const char *a, const char *b, size_t a_len,  size_t b_len) {
    size_t i;
    const size_t len = a_len <= b_len ? a_len : b_len;

    for (i = 0; i < len; i++) {
        if (*a++ != *b++) { break; }
    }
    return i;
}

static const char base36_chars[] = "0123456789abcdefghijklmnopqrstuvwxyz";

uint32_t
StrHelp_to_base36(uint64_t num, void *buffer) {
    char  my_buf[StrHelp_MAX_BASE36_BYTES];
    char *buf = my_buf + StrHelp_MAX_BASE36_BYTES - 1;
    char *end = buf;

    // Null terminate.
    *buf = '\0';

    // Convert to base 36 characters.
    do {
        *(--buf) = base36_chars[num % 36];
        num /= 36;
    } while (num > 0);

    uint32_t size = end - buf;
    memcpy(buffer, buf, size + 1);
    return size;
}

bool_t
StrHelp_utf8_valid(const char *ptr, size_t size) {
    const uint8_t *string    = (const uint8_t*)ptr;
    const uint8_t *const end = string + size;
    while (string < end) {
        const uint8_t header_byte = *string++;
        int count = StrHelp_UTF8_COUNT[header_byte] & 0x7;
        switch (count & 0x7) {
            case 1:
                // ASCII
                break;
            case 2:
                if (string == end)              { return false; }
                // Disallow non-shortest-form ASCII.
                if (!(header_byte & 0x1E))      { return false; }
                if ((*string++ & 0xC0) != 0x80) { return false; }
                break;
            case 3:
                if (end - string < 2)           { return false; }
                if (header_byte == 0xED) {
                    if (*string < 0x80 || *string > 0x9F) {
                        return false;
                    }
                }
                else if (!(header_byte & 0x0F)) {
                    if (!(*string & 0x20)) {
                        return false;
                    }
                }
                if ((*string++ & 0xC0) != 0x80) { return false; }
                if ((*string++ & 0xC0) != 0x80) { return false; }
                break;
            case 4:
                if (end - string < 3)           { return false; }
                if (!(header_byte & 0x07)) {
                    if (!(*string & 0x30)) {
                        return false;
                    }
                }
                if ((*string++ & 0xC0) != 0x80) { return false; }
                if ((*string++ & 0xC0) != 0x80) { return false; }
                if ((*string++ & 0xC0) != 0x80) { return false; }
                break;
            default:
                return false;
        }
    }

    return true;
}

bool_t
StrHelp_is_whitespace(uint32_t code_point) {
    switch (code_point) {
            // <control-0009>..<control-000D>
        case 0x0009: case 0x000A: case 0x000B: case 0x000C: case 0x000D:
        case 0x0020: // SPACE
        case 0x0085: // <control-0085>
        case 0x00A0: // NO-BREAK SPACE
        case 0x1680: // OGHAM SPACE MARK
        case 0x180E: // MONGOLIAN VOWEL SEPARATOR
            // EN QUAD..HAIR SPACE
        case 0x2000: case 0x2001: case 0x2002: case 0x2003: case 0x2004:
        case 0x2005: case 0x2006: case 0x2007: case 0x2008: case 0x2009:
        case 0x200A:
        case 0x2028: // LINE SEPARATOR
        case 0x2029: // PARAGRAPH SEPARATOR
        case 0x202F: // NARROW NO-BREAK SPACE
        case 0x205F: // MEDIUM MATHEMATICAL SPACE
        case 0x3000: // IDEOGRAPHIC SPACE
            return true;

        default:
            return false;
    }
}

uint32_t
StrHelp_encode_utf8_char(uint32_t code_point, void *buffer) {
    uint8_t *buf = (uint8_t*)buffer;
    if (code_point <= 0x7F) { // ASCII
        buf[0] = (uint8_t)code_point;
        return 1;
    }
    else if (code_point <= 0x07FF) { // 2 byte range
        buf[0] = (uint8_t)(0xC0 | (code_point >> 6));
        buf[1] = (uint8_t)(0x80 | (code_point & 0x3f));
        return 2;
    }
    else if (code_point <= 0xFFFF) { // 3 byte range
        buf[0] = (uint8_t)(0xE0 | (code_point  >> 12));
        buf[1] = (uint8_t)(0x80 | ((code_point >> 6) & 0x3F));
        buf[2] = (uint8_t)(0x80 | (code_point        & 0x3f));
        return 3;
    }
    else if (code_point <= 0x10FFFF) { // 4 byte range
        buf[0] = (uint8_t)(0xF0 | (code_point  >> 18));
        buf[1] = (uint8_t)(0x80 | ((code_point >> 12) & 0x3F));
        buf[2] = (uint8_t)(0x80 | ((code_point >> 6)  & 0x3F));
        buf[3] = (uint8_t)(0x80 | (code_point         & 0x3f));
        return 4;
    }
    else {
        THROW(ERR, "Illegal Unicode code point: %u32", code_point);
        UNREACHABLE_RETURN(uint32_t);
    }
}

uint32_t
StrHelp_decode_utf8_char(const char *ptr) {
    const uint8_t *const string = (const uint8_t*)ptr;
    uint32_t retval = *string;
    int bytes = StrHelp_UTF8_COUNT[retval];

    switch (bytes & 0x7) {
        case 1:
            break;

        case 2:
            retval = ((retval     & 0x1F) << 6)
                     | (string[1] & 0x3F);
            break;

        case 3:
            retval = ((retval      & 0x0F) << 12)
                     | ((string[1] & 0x3F) << 6)
                     | (string[2]  & 0x3F);
            break;

        case 4:
            retval = ((retval      & 0x07) << 18)
                     | ((string[1] & 0x3F) << 12)
                     | ((string[2] & 0x3F) << 6)
                     | (string[3]  & 0x3F);
            break;

        default:
            THROW(ERR, "Invalid UTF-8 header byte: %x32", retval);
    }

    return retval;
}

const char*
StrHelp_back_utf8_char(const char *ptr, char *start) {
    while (--ptr >= start) {
        if ((*ptr & 0xC0) != 0x80) { return ptr; }
    }
    return NULL;
}