The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define C_LUCY_LEXICONREADER
#define C_LUCY_POLYLEXICONREADER
#define C_LUCY_DEFAULTLEXICONREADER
#include "Lucy/Util/ToolSet.h"

#include "Lucy/Index/LexiconReader.h"
#include "Lucy/Plan/FieldType.h"
#include "Lucy/Plan/Schema.h"
#include "Lucy/Index/PolyLexicon.h"
#include "Lucy/Index/SegLexicon.h"
#include "Lucy/Index/Segment.h"
#include "Lucy/Index/Snapshot.h"
#include "Lucy/Index/TermInfo.h"
#include "Lucy/Store/Folder.h"

// Initialize the DataReader portion of a LexiconReader with the supplied
// schema, folder, snapshot, segments and segment tick, then run the
// abstract-class check against LEXICONREADER.
//
// Returns `self`.
LexiconReader*
LexReader_init(LexiconReader *self, Schema *schema, Folder *folder,
               Snapshot *snapshot, VArray *segments, int32_t seg_tick) {
    DataReader *base = (DataReader*)self;
    DataReader_init(base, schema, folder, snapshot, segments, seg_tick);
    // NOTE(review): presumably rejects direct instantiation of the abstract
    // class -- confirm against the macro definition.
    ABSTRACT_CLASS_CHECK(self, LEXICONREADER);
    return self;
}

// Build a PolyLexiconReader over `readers` and `offsets` and hand it back
// as a LexiconReader.  `self` is not consulted.
LexiconReader*
LexReader_aggregator(LexiconReader *self, VArray *readers, I32Array *offsets) {
    UNUSED_VAR(self);
    PolyLexiconReader *poly_reader = PolyLexReader_new(readers, offsets);
    return (LexiconReader*)poly_reader;
}

// Allocate a new PolyLexiconReader object and initialize it with the given
// sub-readers and offsets.
PolyLexiconReader*
PolyLexReader_new(VArray *readers, I32Array *offsets) {
    return PolyLexReader_init(
               (PolyLexiconReader*)VTable_Make_Obj(POLYLEXICONREADER),
               readers, offsets);
}

// Initialize a PolyLexiconReader.
//
// Every element of `readers` is passed through CERTIFY against
// LEXICONREADER.  The Schema handed to the parent initializer is the first
// non-NULL schema reported by a sub-reader (NULL if there is none).  The
// object takes its own references to `readers` and `offsets`.
//
// Returns `self`.
PolyLexiconReader*
PolyLexReader_init(PolyLexiconReader *self, VArray *readers,
                   I32Array *offsets) {
    Schema *schema = NULL;
    const uint32_t num_readers = VA_Get_Size(readers);
    uint32_t tick = 0;

    while (tick < num_readers) {
        LexiconReader *sub_reader
            = (LexiconReader*)CERTIFY(VA_Fetch(readers, tick), LEXICONREADER);
        if (schema == NULL) {
            schema = LexReader_Get_Schema(sub_reader);
        }
        tick++;
    }

    // A poly reader has no folder, snapshot or segment list of its own.
    LexReader_init((LexiconReader*)self, schema, NULL, NULL, NULL, -1);
    self->readers = (VArray*)INCREF(readers);
    self->offsets = (I32Array*)INCREF(offsets);
    return self;
}

// Close every non-NULL sub-reader, then empty the `readers` array.  Does
// nothing when `self->readers` is NULL.
void
PolyLexReader_close(PolyLexiconReader *self) {
    if (!self->readers) {
        return;
    }

    const uint32_t num_readers = VA_Get_Size(self->readers);
    uint32_t tick = 0;
    while (tick < num_readers) {
        LexiconReader *sub_reader
            = (LexiconReader*)VA_Fetch(self->readers, tick);
        if (sub_reader != NULL) {
            LexReader_Close(sub_reader);
        }
        tick++;
    }
    VA_Clear(self->readers);
}

// Destructor: drop this object's references to `readers` and `offsets`,
// then chain to the parent class destructor via SUPER_DESTROY.
void
PolyLexReader_destroy(PolyLexiconReader *self) {
    DECREF(self->readers);
    DECREF(self->offsets);
    // Must come last: hands `self` to the superclass destructor.
    SUPER_DESTROY(self, POLYLEXICONREADER);
}

// Return a new PolyLexicon for `field` spanning all sub-readers, or NULL
// when `field` is NULL, when the schema has no type for `field`, or when
// the resulting PolyLexicon reports zero segment lexicons.  If `term` is
// non-NULL the lexicon is seeked to it before being returned.
Lexicon*
PolyLexReader_lexicon(PolyLexiconReader *self, const CharBuf *field,
                      Obj *term) {
    if (field == NULL) {
        return NULL;
    }

    Schema    *schema = PolyLexReader_Get_Schema(self);
    FieldType *type   = Schema_Fetch_Type(schema, field);
    if (type == NULL) {
        return NULL;
    }

    PolyLexicon *poly_lex = PolyLex_new(field, self->readers);
    if (PolyLex_Get_Num_Seg_Lexicons(poly_lex) == 0) {
        // Nothing to iterate over; release the freshly created object.
        DECREF(poly_lex);
        return NULL;
    }
    if (term != NULL) {
        PolyLex_Seek(poly_lex, term);
    }

    return (Lexicon*)poly_lex;
}

// Sum the document frequency of `term` in `field` across every non-NULL
// sub-reader.
uint32_t
PolyLexReader_doc_freq(PolyLexiconReader *self, const CharBuf *field,
                       Obj *term) {
    uint32_t       total       = 0;
    const uint32_t num_readers = VA_Get_Size(self->readers);

    for (uint32_t tick = 0; tick < num_readers; tick++) {
        LexiconReader *sub_reader
            = (LexiconReader*)VA_Fetch(self->readers, tick);
        if (sub_reader == NULL) {
            continue;
        }
        total += LexReader_Doc_Freq(sub_reader, field, term);
    }

    return total;
}

// Allocate a new DefaultLexiconReader object and initialize it with the
// given schema, folder, snapshot, segments and segment tick.
DefaultLexiconReader*
DefLexReader_new(Schema *schema, Folder *folder, Snapshot *snapshot,
                 VArray *segments, int32_t seg_tick) {
    return DefLexReader_init(
               (DefaultLexiconReader*)VTable_Make_Obj(DEFAULTLEXICONREADER),
               schema, folder, snapshot, segments, seg_tick);
}

// Report whether a SegLexicon can safely be built for `field` in `segment`.
// Yields false when the schema has no type for the field, when the field's
// type is not indexed, or when the segment's "lexicon-<field_num>.dat" file
// is absent from `folder` (i.e. no terms were written for this field).
static bool_t
S_has_data(Schema *schema, Folder *folder, Segment *segment, CharBuf *field) {
    FieldType *type = Schema_Fetch_Type(schema, field);

    // Unknown or non-indexed fields never have lexicon data.
    if (type == NULL) {
        return false;
    }
    if (!FType_Indexed(type)) {
        return false;
    }

    // Probe for the per-field lexicon file inside the segment directory.
    int32_t  field_num = Seg_Field_Num(segment, field);
    CharBuf *seg_name  = Seg_Get_Name(segment);
    CharBuf *lex_path  = CB_newf("%o/lexicon-%i32.dat", seg_name, field_num);
    bool_t   present   = Folder_Exists(folder, lex_path);
    DECREF(lex_path);
    return present;
}

// Initialize a DefaultLexiconReader.
//
// After parent initialization, populate `self->lexicons` with one
// SegLexicon per field number (1 .. Schema_Num_Fields) for which the
// segment knows a field name and S_has_data() reports data.  Slots for
// other field numbers are left unset.
//
// Returns `self`.
DefaultLexiconReader*
DefLexReader_init(DefaultLexiconReader *self, Schema *schema, Folder *folder,
                  Snapshot *snapshot, VArray *segments, int32_t seg_tick) {
    LexiconReader *base = (LexiconReader*)self;
    LexReader_init(base, schema, folder, snapshot, segments, seg_tick);
    Segment *segment = DefLexReader_Get_Segment(self);

    // Lexicons are stored at the index matching their field number.
    self->lexicons = VA_new(Schema_Num_Fields(schema));
    const uint32_t limit = Schema_Num_Fields(schema) + 1;
    for (uint32_t field_num = 1; field_num < limit; field_num++) {
        CharBuf *field = Seg_Field_Name(segment, field_num);
        if (field == NULL) {
            continue;
        }
        if (!S_has_data(schema, folder, segment, field)) {
            continue;
        }
        SegLexicon *seg_lex = SegLex_new(schema, folder, segment, field);
        VA_Store(self->lexicons, field_num, (Obj*)seg_lex);
    }

    return self;
}

// Release this reader's reference to its `lexicons` array and reset the
// member to NULL so the destructor's later DECREF sees NULL rather than a
// stale pointer.
void
DefLexReader_close(DefaultLexiconReader *self) {
    DECREF(self->lexicons);
    self->lexicons = NULL;
}

// Destructor: drop the reference to `lexicons` (already NULL if
// DefLexReader_close ran), then chain to the parent class destructor via
// SUPER_DESTROY.
void
DefLexReader_destroy(DefaultLexiconReader *self) {
    DECREF(self->lexicons);
    // Must come last: hands `self` to the superclass destructor.
    SUPER_DESTROY(self, DEFAULTLEXICONREADER);
}

// Return a newly created SegLexicon for `field`, seeked to `term`, or NULL
// when no lexicon was stored for that field at init time.  The stored
// lexicon serves only as a "has data" marker here; a fresh one is always
// built for the caller.
Lexicon*
DefLexReader_lexicon(DefaultLexiconReader *self, const CharBuf *field,
                     Obj *term) {
    int32_t     field_num = Seg_Field_Num(self->segment, field);
    SegLexicon *stored    = (SegLexicon*)VA_Fetch(self->lexicons, field_num);

    if (stored == NULL) {
        // No data for this field in this segment.
        return NULL;
    }

    SegLexicon *fresh
        = SegLex_new(self->schema, self->folder, self->segment, field);
    SegLex_Seek(fresh, term);
    return (Lexicon*)fresh;
}

// Look up the TermInfo for `target` within `field`, using the SegLexicon
// stored for that field.  Returns NULL when either argument is NULL, when
// no lexicon is stored for the field, or when the term the lexicon lands on
// after seeking is absent or not equal to `target`.  The returned TermInfo
// is whatever SegLex_Get_Term_Info() yields for the stored lexicon; this
// function makes no copy.
static TermInfo*
S_find_tinfo(DefaultLexiconReader *self, const CharBuf *field, Obj *target) {
    if (field == NULL || target == NULL) {
        return NULL;
    }

    int32_t     field_num = Seg_Field_Num(self->segment, field);
    SegLexicon *seg_lex   = (SegLexicon*)VA_Fetch(self->lexicons, field_num);
    if (seg_lex == NULL) {
        return NULL;
    }

    // Advance the lexicon to the first term >= target.
    SegLex_Seek(seg_lex, target);

    // Only an exact match counts as a hit.
    Obj *landed = SegLex_Get_Term(seg_lex);
    if (landed == NULL || !Obj_Equals(target, landed)) {
        return NULL;
    }
    return SegLex_Get_Term_Info(seg_lex);
}

// Return a clone of the TermInfo for `target` in `field`, or NULL when
// S_find_tinfo() finds no exact match.
TermInfo*
DefLexReader_fetch_term_info(DefaultLexiconReader *self,
                             const CharBuf *field, Obj *target) {
    TermInfo *match = S_find_tinfo(self, field, target);
    if (match == NULL) {
        return NULL;
    }
    return TInfo_Clone(match);
}

// Return the document frequency recorded for `term` in `field`, or 0 when
// S_find_tinfo() finds no exact match.
uint32_t
DefLexReader_doc_freq(DefaultLexiconReader *self, const CharBuf *field,
                      Obj *term) {
    TermInfo *match = S_find_tinfo(self, field, term);
    if (match == NULL) {
        return 0;
    }
    return TInfo_Get_Doc_Freq(match);
}