The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define C_LUCY_LEXINDEX
#define C_LUCY_TERMINFO
#include "Lucy/Util/ToolSet.h"

#include "Lucy/Index/LexIndex.h"
#include "Lucy/Index/Segment.h"
#include "Lucy/Index/TermInfo.h"
#include "Lucy/Index/TermStepper.h"
#include "Lucy/Plan/Architecture.h"
#include "Lucy/Plan/FieldType.h"
#include "Lucy/Plan/Schema.h"
#include "Lucy/Store/Folder.h"
#include "Lucy/Store/InStream.h"

// Read the data we've arrived at after a seek operation.
static void
S_read_entry(LexIndex *self);

LexIndex*
LexIndex_new(Schema *schema, Folder *folder, Segment *segment,
             const CharBuf *field) {
    LexIndex *self = (LexIndex*)VTable_Make_Obj(LEXINDEX);
    return LexIndex_init(self, schema, folder, segment, field);
}

LexIndex*
LexIndex_init(LexIndex *self, Schema *schema, Folder *folder,
              Segment *segment, const CharBuf *field) {
    int32_t  field_num = Seg_Field_Num(segment, field);
    CharBuf *seg_name  = Seg_Get_Name(segment);
    CharBuf *ixix_file = CB_newf("%o/lexicon-%i32.ixix", seg_name, field_num);
    CharBuf *ix_file   = CB_newf("%o/lexicon-%i32.ix", seg_name, field_num);
    Architecture *arch = Schema_Get_Architecture(schema);

    // Init.
    Lex_init((Lexicon*)self, field);
    self->tinfo        = TInfo_new(0);
    self->tick         = 0;

    // Derive
    self->field_type = Schema_Fetch_Type(schema, field);
    if (!self->field_type) {
        CharBuf *mess = MAKE_MESS("Unknown field: '%o'", field);
        DECREF(ix_file);
        DECREF(ixix_file);
        DECREF(self);
        Err_throw_mess(ERR, mess);
    }
    INCREF(self->field_type);
    self->term_stepper = FType_Make_Term_Stepper(self->field_type);
    self->ixix_in = Folder_Open_In(folder, ixix_file);
    if (!self->ixix_in) {
        Err *error = (Err*)INCREF(Err_get_error());
        DECREF(ix_file);
        DECREF(ixix_file);
        DECREF(self);
        RETHROW(error);
    }
    self->ix_in = Folder_Open_In(folder, ix_file);
    if (!self->ix_in) {
        Err *error = (Err*)INCREF(Err_get_error());
        DECREF(ix_file);
        DECREF(ixix_file);
        DECREF(self);
        RETHROW(error);
    }
    self->index_interval = Arch_Index_Interval(arch);
    self->skip_interval  = Arch_Skip_Interval(arch);
    self->size    = (int32_t)(InStream_Length(self->ixix_in) / sizeof(int64_t));
    self->offsets = (int64_t*)InStream_Buf(self->ixix_in,
                                           (size_t)InStream_Length(self->ixix_in));

    DECREF(ixix_file);
    DECREF(ix_file);

    return self;
}

void
LexIndex_destroy(LexIndex *self) {
    DECREF(self->field_type);
    DECREF(self->ixix_in);
    DECREF(self->ix_in);
    DECREF(self->term_stepper);
    DECREF(self->tinfo);
    SUPER_DESTROY(self, LEXINDEX);
}

int32_t
LexIndex_get_term_num(LexIndex *self) {
    return (self->index_interval * self->tick) - 1;
}

Obj*
LexIndex_get_term(LexIndex *self) {
    return TermStepper_Get_Value(self->term_stepper);
}

TermInfo*
LexIndex_get_term_info(LexIndex *self) {
    return self->tinfo;
}

static void
S_read_entry(LexIndex *self) {
    InStream *ix_in  = self->ix_in;
    TermInfo *tinfo  = self->tinfo;
    int64_t offset = (int64_t)NumUtil_decode_bigend_u64(self->offsets + self->tick);
    InStream_Seek(ix_in, offset);
    TermStepper_Read_Key_Frame(self->term_stepper, ix_in);
    tinfo->doc_freq     = InStream_Read_C32(ix_in);
    tinfo->post_filepos = InStream_Read_C64(ix_in);
    tinfo->skip_filepos = tinfo->doc_freq >= self->skip_interval
                          ? InStream_Read_C64(ix_in)
                          : 0;
    tinfo->lex_filepos  = InStream_Read_C64(ix_in);
}

void
LexIndex_seek(LexIndex *self, Obj *target) {
    TermStepper *term_stepper = self->term_stepper;
    InStream    *ix_in        = self->ix_in;
    FieldType   *type         = self->field_type;
    int32_t      lo           = 0;
    int32_t      hi           = self->size - 1;
    int32_t      result       = -100;

    if (target == NULL || self->size == 0) {
        self->tick = 0;
        return;
    }
    else {
        if (!Obj_Is_A(target, CHARBUF)) {
            THROW(ERR, "Target is a %o, and not comparable to a %o",
                  Obj_Get_Class_Name(target), VTable_Get_Name(CHARBUF));
        }
        /* TODO:
        Obj *first_obj = VA_Fetch(terms, 0);
        if (!Obj_Is_A(target, Obj_Get_VTable(first_obj))) {
            THROW(ERR, "Target is a %o, and not comparable to a %o",
                Obj_Get_Class_Name(target), Obj_Get_Class_Name(first_obj));
        }
        */
    }

    // Divide and conquer.
    while (hi >= lo) {
        const int32_t mid = lo + ((hi - lo) / 2);
        const int64_t offset
            = (int64_t)NumUtil_decode_bigend_u64(self->offsets + mid);
        InStream_Seek(ix_in, offset);
        TermStepper_Read_Key_Frame(term_stepper, ix_in);

        // Compare values.  There is no need for a NULL-check because the term
        // number is alway between 0 and self->size - 1.
        Obj *value = TermStepper_Get_Value(term_stepper);
        int32_t comparison = FType_Compare_Values(type, target, value);

        if (comparison < 0) {
            hi = mid - 1;
        }
        else if (comparison > 0) {
            lo = mid + 1;
        }
        else {
            result = mid;
            break;
        }
    }

    // Record the index of the entry we've seeked to, then read entry.
    self->tick = hi == -1 // indicating that target lt first entry
                 ? 0
                 : result == -100 // if result is still -100, it wasn't set
                 ? hi
                 : result;
    S_read_entry(self);
}