The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define C_LUCY_LEXICONWRITER
#include "Lucy/Util/ToolSet.h"

#include "Lucy/Index/LexiconWriter.h"
#include "Clownfish/CharBuf.h"
#include "Lucy/Plan/FieldType.h"
#include "Lucy/Plan/Schema.h"
#include "Lucy/Index/PolyReader.h"
#include "Lucy/Index/Posting/MatchPosting.h"
#include "Lucy/Index/Segment.h"
#include "Lucy/Index/SegReader.h"
#include "Lucy/Index/Snapshot.h"
#include "Lucy/Index/TermInfo.h"
#include "Lucy/Index/TermStepper.h"
#include "Lucy/Plan/Architecture.h"
#include "Lucy/Store/Folder.h"
#include "Lucy/Store/OutStream.h"

int32_t LexWriter_current_file_format = 3;

LexiconWriter*
LexWriter_new(Schema *schema, Snapshot *snapshot, Segment *segment,
              PolyReader *polyreader) {
    LexiconWriter *self = (LexiconWriter*)Class_Make_Obj(LEXICONWRITER);
    return LexWriter_init(self, schema, snapshot, segment, polyreader);
}

LexiconWriter*
LexWriter_init(LexiconWriter *self, Schema *schema, Snapshot *snapshot,
               Segment *segment, PolyReader *polyreader) {
    Architecture *arch = Schema_Get_Architecture(schema);

    DataWriter_init((DataWriter*)self, schema, snapshot, segment, polyreader);
    LexiconWriterIVARS *const ivars = LexWriter_IVARS(self);

    // Assign.
    ivars->index_interval = Arch_Index_Interval(arch);
    ivars->skip_interval  = Arch_Skip_Interval(arch);

    // Init.
    ivars->ix_out             = NULL;
    ivars->ixix_out           = NULL;
    ivars->dat_out            = NULL;
    ivars->count              = 0;
    ivars->ix_count           = 0;
    ivars->dat_file           = NULL;
    ivars->ix_file            = NULL;
    ivars->ixix_file          = NULL;
    ivars->counts             = Hash_new(0);
    ivars->ix_counts          = Hash_new(0);
    ivars->temp_mode          = false;
    ivars->term_stepper       = NULL;
    ivars->tinfo_stepper      = (TermStepper*)MatchTInfoStepper_new(schema);

    return self;
}

void
LexWriter_Destroy_IMP(LexiconWriter *self) {
    LexiconWriterIVARS *const ivars = LexWriter_IVARS(self);
    DECREF(ivars->term_stepper);
    DECREF(ivars->tinfo_stepper);
    DECREF(ivars->dat_file);
    DECREF(ivars->ix_file);
    DECREF(ivars->ixix_file);
    DECREF(ivars->dat_out);
    DECREF(ivars->ix_out);
    DECREF(ivars->ixix_out);
    DECREF(ivars->counts);
    DECREF(ivars->ix_counts);
    SUPER_DESTROY(self, LEXICONWRITER);
}

static void
S_add_last_term_to_ix(LexiconWriter *self) {
    LexiconWriterIVARS *const ivars = LexWriter_IVARS(self);

    // Write file pointer to index record.
    OutStream_Write_I64(ivars->ixix_out, OutStream_Tell(ivars->ix_out));

    // Write term and file pointer to main record.  Track count of terms added
    // to ix.
    TermStepper_Write_Key_Frame(ivars->term_stepper,
                                ivars->ix_out, TermStepper_Get_Value(ivars->term_stepper));
    TermStepper_Write_Key_Frame(ivars->tinfo_stepper,
                                ivars->ix_out, TermStepper_Get_Value(ivars->tinfo_stepper));
    OutStream_Write_C64(ivars->ix_out, OutStream_Tell(ivars->dat_out));
    ivars->ix_count++;
}

void
LexWriter_Add_Term_IMP(LexiconWriter* self, Obj* term_text, TermInfo* tinfo) {
    LexiconWriterIVARS *const ivars = LexWriter_IVARS(self);
    OutStream *dat_out = ivars->dat_out;

    if ((ivars->count % ivars->index_interval == 0)
        && !ivars->temp_mode
       ) {
        // Write a subset of entries to lexicon.ix.
        S_add_last_term_to_ix(self);
    }

    TermStepper_Write_Delta(ivars->term_stepper, dat_out, term_text);
    TermStepper_Write_Delta(ivars->tinfo_stepper, dat_out, (Obj*)tinfo);

    // Track number of terms.
    ivars->count++;
}

void
LexWriter_Start_Field_IMP(LexiconWriter *self, int32_t field_num) {
    LexiconWriterIVARS *const ivars = LexWriter_IVARS(self);
    Segment   *const segment  = LexWriter_Get_Segment(self);
    Folder    *const folder   = LexWriter_Get_Folder(self);
    Schema    *const schema   = LexWriter_Get_Schema(self);
    String    *const seg_name = Seg_Get_Name(segment);
    String    *const field    = Seg_Field_Name(segment, field_num);
    FieldType *const type     = Schema_Fetch_Type(schema, field);

    // Open outstreams.
    DECREF(ivars->dat_file);
    DECREF(ivars->ix_file);
    DECREF(ivars->ixix_file);
    ivars->dat_file  = Str_newf("%o/lexicon-%i32.dat",  seg_name, field_num);
    ivars->ix_file   = Str_newf("%o/lexicon-%i32.ix",   seg_name, field_num);
    ivars->ixix_file = Str_newf("%o/lexicon-%i32.ixix", seg_name, field_num);
    ivars->dat_out = Folder_Open_Out(folder, ivars->dat_file);
    if (!ivars->dat_out) { RETHROW(INCREF(Err_get_error())); }
    ivars->ix_out = Folder_Open_Out(folder, ivars->ix_file);
    if (!ivars->ix_out) { RETHROW(INCREF(Err_get_error())); }
    ivars->ixix_out = Folder_Open_Out(folder, ivars->ixix_file);
    if (!ivars->ixix_out) { RETHROW(INCREF(Err_get_error())); }

    // Initialize count and ix_count, term stepper and term info stepper.
    ivars->count    = 0;
    ivars->ix_count = 0;
    ivars->term_stepper = FType_Make_Term_Stepper(type);
    TermStepper_Reset(ivars->tinfo_stepper);
}

void
LexWriter_Finish_Field_IMP(LexiconWriter *self, int32_t field_num) {
    LexiconWriterIVARS *const ivars = LexWriter_IVARS(self);
    String *field = Seg_Field_Name(ivars->segment, field_num);

    // Store count of terms for this field as metadata.
    Hash_Store(ivars->counts, (Obj*)field,
               (Obj*)Str_newf("%i32", ivars->count));
    Hash_Store(ivars->ix_counts, (Obj*)field,
               (Obj*)Str_newf("%i32", ivars->ix_count));

    // Close streams.
    OutStream_Close(ivars->dat_out);
    OutStream_Close(ivars->ix_out);
    OutStream_Close(ivars->ixix_out);
    DECREF(ivars->dat_out);
    DECREF(ivars->ix_out);
    DECREF(ivars->ixix_out);
    ivars->dat_out  = NULL;
    ivars->ix_out   = NULL;
    ivars->ixix_out = NULL;

    // Close term stepper.
    DECREF(ivars->term_stepper);
    ivars->term_stepper = NULL;
}

void
LexWriter_Enter_Temp_Mode_IMP(LexiconWriter *self, String *field,
                              OutStream *temp_outstream) {
    LexiconWriterIVARS *const ivars = LexWriter_IVARS(self);
    Schema    *schema = LexWriter_Get_Schema(self);
    FieldType *type   = Schema_Fetch_Type(schema, field);

    // Assign outstream.
    if (ivars->dat_out != NULL) {
        THROW(ERR, "Can't enter temp mode (filename: %o) ", ivars->dat_file);
    }
    ivars->dat_out = (OutStream*)INCREF(temp_outstream);

    // Initialize count and ix_count, term stepper and term info stepper.
    ivars->count    = 0;
    ivars->ix_count = 0;
    ivars->term_stepper = FType_Make_Term_Stepper(type);
    TermStepper_Reset(ivars->tinfo_stepper);

    // Remember that we're in temp mode.
    ivars->temp_mode = true;
}

void
LexWriter_Leave_Temp_Mode_IMP(LexiconWriter *self) {
    LexiconWriterIVARS *const ivars = LexWriter_IVARS(self);
    DECREF(ivars->term_stepper);
    ivars->term_stepper = NULL;
    DECREF(ivars->dat_out);
    ivars->dat_out   = NULL;
    ivars->temp_mode = false;
}

void
LexWriter_Finish_IMP(LexiconWriter *self) {
    LexiconWriterIVARS *const ivars = LexWriter_IVARS(self);

    // Ensure that streams were closed (by calling Finish_Field or
    // Leave_Temp_Mode).
    if (ivars->dat_out != NULL) {
        THROW(ERR, "File '%o' never closed", ivars->dat_file);
    }
    else if (ivars->ix_out != NULL) {
        THROW(ERR, "File '%o' never closed", ivars->ix_file);
    }
    else if (ivars->ix_out != NULL) {
        THROW(ERR, "File '%o' never closed", ivars->ix_file);
    }

    // Store metadata.
    Seg_Store_Metadata_Utf8(ivars->segment, "lexicon", 7,
                            (Obj*)LexWriter_Metadata(self));
}

Hash*
LexWriter_Metadata_IMP(LexiconWriter *self) {
    LexiconWriterIVARS *const ivars = LexWriter_IVARS(self);
    LexWriter_Metadata_t super_meta
        = (LexWriter_Metadata_t)SUPER_METHOD_PTR(LEXICONWRITER,
                                                 LUCY_LexWriter_Metadata);
    Hash *const metadata  = super_meta(self);
    Hash *const counts    = (Hash*)INCREF(ivars->counts);
    Hash *const ix_counts = (Hash*)INCREF(ivars->ix_counts);

    // Placeholders.
    if (Hash_Get_Size(counts) == 0) {
        Hash_Store_Utf8(counts, "none", 4, (Obj*)Str_newf("%i32", (int32_t)0));
        Hash_Store_Utf8(ix_counts, "none", 4,
                        (Obj*)Str_newf("%i32", (int32_t)0));
    }

    Hash_Store_Utf8(metadata, "counts", 6, (Obj*)counts);
    Hash_Store_Utf8(metadata, "index_counts", 12, (Obj*)ix_counts);

    return metadata;
}

void
LexWriter_Add_Segment_IMP(LexiconWriter *self, SegReader *reader,
                          I32Array *doc_map) {
    // No-op, since the data gets added via PostingListWriter.
    UNUSED_VAR(self);
    UNUSED_VAR(reader);
    UNUSED_VAR(doc_map);
}

int32_t
LexWriter_Format_IMP(LexiconWriter *self) {
    UNUSED_VAR(self);
    return LexWriter_current_file_format;
}