The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define C_LUCY_QUERYLEXER
#include <stdlib.h>
#include <ctype.h>
#include "Lucy/Util/ToolSet.h"

#include "Lucy/Search/QueryParser/QueryLexer.h"
#include "Lucy/Search/QueryParser.h"
#include "Lucy/Search/QueryParser/ParserElem.h"

#define TOKEN_OPEN_PAREN  LUCY_QPARSER_TOKEN_OPEN_PAREN
#define TOKEN_CLOSE_PAREN LUCY_QPARSER_TOKEN_CLOSE_PAREN
#define TOKEN_MINUS       LUCY_QPARSER_TOKEN_MINUS
#define TOKEN_PLUS        LUCY_QPARSER_TOKEN_PLUS
#define TOKEN_NOT         LUCY_QPARSER_TOKEN_NOT
#define TOKEN_OR          LUCY_QPARSER_TOKEN_OR
#define TOKEN_AND         LUCY_QPARSER_TOKEN_AND
#define TOKEN_FIELD       LUCY_QPARSER_TOKEN_FIELD
#define TOKEN_STRING      LUCY_QPARSER_TOKEN_STRING
#define TOKEN_QUERY       LUCY_QPARSER_TOKEN_QUERY

static ParserElem*
S_consume_keyword(StringIterator *iter, const char *keyword,
                  size_t keyword_len, uint32_t type);

static ParserElem*
S_consume_field(StringIterator *iter);

static ParserElem*
S_consume_text(StringIterator *iter);

static ParserElem*
S_consume_quoted_string(StringIterator *iter);

QueryLexer*
QueryLexer_new() {
    QueryLexer *self = (QueryLexer*)Class_Make_Obj(QUERYLEXER);
    return QueryLexer_init(self);
}

QueryLexer*
QueryLexer_init(QueryLexer *self) {
    QueryLexerIVARS *const ivars = QueryLexer_IVARS(self);
    ivars->heed_colons = false;
    return self;
}

bool
QueryLexer_Heed_Colons_IMP(QueryLexer *self) {
    return QueryLexer_IVARS(self)->heed_colons;
}

void
QueryLexer_Set_Heed_Colons_IMP(QueryLexer *self, bool heed_colons) {
    QueryLexer_IVARS(self)->heed_colons = heed_colons;
}

Vector*
QueryLexer_Tokenize_IMP(QueryLexer *self, String *query_string) {
    QueryLexerIVARS *const ivars = QueryLexer_IVARS(self);

    Vector *elems = Vec_new(0);
    if (!query_string) { return elems; }

    StringIterator *iter = Str_Top(query_string);

    while (StrIter_Has_Next(iter)) {
        ParserElem *elem = NULL;

        if (StrIter_Skip_Whitespace(iter)) {
            // Fast-forward past whitespace.
            continue;
        }

        if (ivars->heed_colons) {
            ParserElem *elem = S_consume_field(iter);
            if (elem) {
                Vec_Push(elems, (Obj*)elem);
            }
        }

        int32_t code_point = StrIter_Next(iter);
        switch (code_point) {
            case '(':
                elem = ParserElem_new(TOKEN_OPEN_PAREN, NULL);
                break;
            case ')':
                elem = ParserElem_new(TOKEN_CLOSE_PAREN, NULL);
                break;
            case '+':
                if (StrIter_Has_Next(iter)
                    && !StrIter_Skip_Whitespace(iter)
                   ) {
                    elem = ParserElem_new(TOKEN_PLUS, NULL);
                }
                else {
                    elem = ParserElem_new(TOKEN_STRING, (Obj*)Str_newf("+"));
                }
                break;
            case '-':
                if (StrIter_Has_Next(iter)
                    && !StrIter_Skip_Whitespace(iter)
                   ) {
                    elem = ParserElem_new(TOKEN_MINUS, NULL);
                }
                else {
                    elem = ParserElem_new(TOKEN_STRING, (Obj*)Str_newf("-"));
                }
                break;
            case '"':
                StrIter_Recede(iter, 1);
                elem = S_consume_quoted_string(iter);
                break;
            case 'O':
                StrIter_Recede(iter, 1);
                elem = S_consume_keyword(iter, "OR", 2, TOKEN_OR);
                if (!elem) {
                    elem = S_consume_text(iter);
                }
                break;
            case 'A':
                StrIter_Recede(iter, 1);
                elem = S_consume_keyword(iter, "AND", 3, TOKEN_AND);
                if (!elem) {
                    elem = S_consume_text(iter);
                }
                break;
            case 'N':
                StrIter_Recede(iter, 1);
                elem = S_consume_keyword(iter, "NOT", 3, TOKEN_NOT);
                if (!elem) {
                    elem = S_consume_text(iter);
                }
                break;
            default:
                StrIter_Recede(iter, 1);
                elem = S_consume_text(iter);
                break;
        }
        Vec_Push(elems, (Obj*)elem);
    }

    DECREF(iter);
    return elems;
}


static ParserElem*
S_consume_keyword(StringIterator *iter, const char *keyword,
                  size_t keyword_len, uint32_t type) {
    if (!StrIter_Starts_With_Utf8(iter, keyword, keyword_len)) {
        return NULL;
    }
    StringIterator *temp = StrIter_Clone(iter);
    StrIter_Advance(temp, keyword_len);
    int32_t lookahead = StrIter_Next(temp);
    if (lookahead == STR_OOB) {
        DECREF(temp);
        return NULL;
    }
    if (Str_is_whitespace(lookahead)
        || lookahead == '"'
        || lookahead == '('
        || lookahead == ')'
        || lookahead == '+'
        || lookahead == '-'
       ) {
        StrIter_Recede(temp, 1);
        StrIter_Assign(iter, temp);
        DECREF(temp);
        return ParserElem_new(type, NULL);
    }
    DECREF(temp);
    return NULL;
}

static ParserElem*
S_consume_field(StringIterator *iter) {
    StringIterator *temp = StrIter_Clone(iter);

    // Field names constructs must start with a letter or underscore.
    int32_t code_point = StrIter_Next(temp);
    if (code_point == STR_OOB) {
        DECREF(temp);
        return NULL;
    }
    if (!(isalpha(code_point) || code_point == '_')) {
        DECREF(temp);
        return NULL;
    }

    // Only alphanumerics and underscores are allowed in field names.
    while (':' != (code_point = StrIter_Next(temp))) {
        if (code_point == STR_OOB) {
            DECREF(temp);
            return NULL;
        }
        if (!(isalnum(code_point) || code_point == '_')) {
            DECREF(temp);
            return NULL;
        }
    }

    // Field name constructs must be followed by something sensible.
    int32_t lookahead = StrIter_Next(temp);
    if (lookahead == STR_OOB) {
        DECREF(temp);
        return NULL;
    }
    if (!(isalnum(lookahead)
          || lookahead == '_'
          || lookahead > 127
          || lookahead == '"'
          || lookahead == '('
         )
       ) {
        DECREF(temp);
        return NULL;
    }

    // Consume string data.
    StrIter_Recede(temp, 2); // Back up over lookahead and colon.
    String *field = StrIter_crop(iter, temp);
    StrIter_Advance(temp, 1); // Skip colon.
    StrIter_Assign(iter, temp);
    DECREF(temp);
    return ParserElem_new(TOKEN_FIELD, (Obj*)field);
}

static ParserElem*
S_consume_text(StringIterator *iter) {
    StringIterator *temp = StrIter_Clone(iter);

    while (1) {
        int32_t code_point = StrIter_Next(temp);
        if (code_point == '\\') {
            code_point = StrIter_Next(temp);
            if (code_point == STR_OOB) {
                break;
            }
        }
        else if (code_point == STR_OOB) {
            break;
        }
        else if (Str_is_whitespace(code_point)
            || code_point == '"'
            || code_point == '('
            || code_point == ')'
           ) {
            StrIter_Recede(temp, 1);
            break;
        }
    }

    String *text = StrIter_crop(iter, temp);
    StrIter_Assign(iter, temp);
    DECREF(temp);
    return ParserElem_new(TOKEN_STRING, (Obj*)text);
}

static ParserElem*
S_consume_quoted_string(StringIterator *iter) {
    StringIterator *temp = StrIter_Clone(iter);

    if (StrIter_Next(temp) != '"') {
        THROW(ERR, "Internal error: expected a quote");
    }

    while (1) {
        int32_t code_point = StrIter_Next(temp);
        if (code_point == STR_OOB || code_point == '"') {
            break;
        }
        else if (code_point == '\\') {
            StrIter_Next(temp);
        }
    }

    String *text = StrIter_crop(iter, temp);
    StrIter_Assign(iter, temp);
    DECREF(temp);
    return ParserElem_new(TOKEN_STRING, (Obj*)text);
}