// The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define C_LUCY_SEGPOSTINGLIST
#define C_LUCY_POSTING
#define C_LUCY_SKIPSTEPPER
#include "Lucy/Util/ToolSet.h"

#include "Lucy/Index/SegPostingList.h"
#include "Lucy/Index/Posting.h"
#include "Lucy/Index/Posting/RawPosting.h"
#include "Lucy/Index/PostingListReader.h"
#include "Lucy/Index/Segment.h"
#include "Lucy/Index/SkipStepper.h"
#include "Lucy/Index/TermInfo.h"
#include "Lucy/Index/SegLexicon.h"
#include "Lucy/Index/LexiconReader.h"
#include "Lucy/Index/Similarity.h"
#include "Lucy/Plan/Architecture.h"
#include "Lucy/Plan/FieldType.h"
#include "Lucy/Plan/Schema.h"
#include "Lucy/Search/Compiler.h"
#include "Lucy/Search/Matcher.h"
#include "Lucy/Store/InStream.h"
#include "Lucy/Store/Folder.h"
#include "Lucy/Util/MemoryPool.h"

// Low level seek call, shared by SegPList_Seek() and SegPList_Seek_Lex().
// Resets the iteration count and, when `tinfo` is non-NULL, repositions the
// postings and skip streams using the file positions it carries.  A NULL
// `tinfo` leaves the list with a doc_freq of 0.
static void
S_seek_tinfo(SegPostingList *self, TermInfo *tinfo);

// Constructor: allocate a blank SEGPOSTINGLIST object and hand it to
// SegPList_init() along with the caller's reader and field.
SegPostingList*
SegPList_new(PostingListReader *plist_reader, String *field) {
    SegPostingList *plist = (SegPostingList*)Class_Make_Obj(SEGPOSTINGLIST);
    plist = SegPList_init(plist, plist_reader, field);
    return plist;
}

/* Initialize `self` as the posting list for `field` within the segment
 * served by `plist_reader`.
 *
 * The postings file "<seg>/postings-<field_num>.dat" and the skip file
 * "<seg>/postings.skip" are opened only when the postings file exists in
 * the reader's folder; otherwise both stream members are set to NULL.
 *
 * If the postings file exists but either stream cannot be opened, `self` is
 * released and the error obtained from Err_get_error() is rethrown.
 *
 * Returns `self`.
 */
SegPostingList*
SegPList_init(SegPostingList *self, PostingListReader *plist_reader,
              String *field) {
    SegPostingListIVARS *const ivars = SegPList_IVARS(self);

    // Collaborators reachable through the reader.
    Schema       *const schema   = PListReader_Get_Schema(plist_reader);
    Folder       *const folder   = PListReader_Get_Folder(plist_reader);
    Segment      *const seg      = PListReader_Get_Segment(plist_reader);
    Architecture *const arch     = Schema_Get_Architecture(schema);
    String       *const seg_name = Seg_Get_Name(seg);
    const int32_t       field_no = Seg_Field_Num(seg, field);

    // Paths of the two files backing this posting list.
    String *postings_path = Str_newf("%o/postings-%i32.dat",
                                     seg_name, field_no);
    String *skip_path     = Str_newf("%o/postings.skip", seg_name);

    // Iteration state starts out empty until a seek happens.
    ivars->doc_freq = 0;
    ivars->count    = 0;

    // Skipping state.
    ivars->skip_stepper = SkipStepper_new();
    ivars->skip_count   = 0;
    ivars->num_skips    = 0;

    // Values taken from the arguments.
    ivars->plist_reader  = (PostingListReader*)INCREF(plist_reader);
    ivars->field         = Str_Clone(field);
    ivars->skip_interval = Arch_Skip_Interval(arch);

    // Values derived from the schema.
    ivars->posting   = Sim_Make_Posting(Schema_Fetch_Sim(schema, field));
    ivars->field_num = field_no;

    // Open the postings stream and then the skip stream, but only when the
    // postings file is present.
    bool  open_failed = false;
    Err  *open_error  = NULL;
    if (!Folder_Exists(folder, postings_path)) {
        // Nothing on disk for this field, so there is nothing to open.
        ivars->post_stream = NULL;
        ivars->skip_stream = NULL;
    }
    else {
        ivars->post_stream = Folder_Open_In(folder, postings_path);
        if (ivars->post_stream != NULL) {
            ivars->skip_stream = Folder_Open_In(folder, skip_path);
        }
        if (ivars->post_stream == NULL || ivars->skip_stream == NULL) {
            // Capture the error before any cleanup runs.
            open_failed = true;
            open_error  = (Err*)INCREF(Err_get_error());
        }
    }

    DECREF(postings_path);
    DECREF(skip_path);

    if (open_failed) {
        DECREF(self);
        RETHROW(open_error);
    }

    return self;
}

/* Destructor: release every object owned by this posting list, close and
 * release whichever streams were opened, then chain to the parent
 * destructor.
 *
 * The skip stream is checked separately from the postings stream.
 * SegPList_init() releases `self` when the postings stream opened but the
 * skip stream did not, so this function can run with a non-NULL post_stream
 * and a NULL skip_stream; closing a NULL stream must be avoided.
 */
void
SegPList_Destroy_IMP(SegPostingList *self) {
    SegPostingListIVARS *const ivars = SegPList_IVARS(self);
    DECREF(ivars->plist_reader);
    DECREF(ivars->posting);
    DECREF(ivars->skip_stepper);
    DECREF(ivars->field);

    // skip_stream is only ever assigned after post_stream was opened, so it
    // is inspected only inside the post_stream guard.
    if (ivars->post_stream != NULL) {
        InStream_Close(ivars->post_stream);
        if (ivars->skip_stream != NULL) {
            InStream_Close(ivars->skip_stream);
        }
        DECREF(ivars->post_stream);
        if (ivars->skip_stream != NULL) {
            DECREF(ivars->skip_stream);
        }
    }

    SUPER_DESTROY(self, SEGPOSTINGLIST);
}

// Accessor for the `posting` member.  The reference count is not changed.
Posting*
SegPList_Get_Posting_IMP(SegPostingList *self) {
    SegPostingListIVARS *const ivars = SegPList_IVARS(self);
    return ivars->posting;
}

// Accessor for the `doc_freq` member, which is 0 after init and is set by
// the seek routines.
uint32_t
SegPList_Get_Doc_Freq_IMP(SegPostingList *self) {
    SegPostingListIVARS *const ivars = SegPList_IVARS(self);
    return ivars->doc_freq;
}

// Report the doc id currently stored in this list's posting object.
int32_t
SegPList_Get_Doc_ID_IMP(SegPostingList *self) {
    return Post_IVARS(SegPList_IVARS(self)->posting)->doc_id;
}

// Accessor for the `count` member, the iteration counter that
// SegPList_Next() increments and the seek routines reset to 0.
uint32_t
SegPList_Get_Count_IMP(SegPostingList *self) {
    SegPostingListIVARS *const ivars = SegPList_IVARS(self);
    return ivars->count;
}

// Accessor for the postings stream.  This is NULL when no postings file
// existed for the field at init time.  The reference count is not changed.
InStream*
SegPList_Get_Post_Stream_IMP(SegPostingList *self) {
    SegPostingListIVARS *const ivars = SegPList_IVARS(self);
    return ivars->post_stream;
}

/* Step to the next posting for the current term.
 *
 * While fewer than doc_freq postings have been consumed, the counter is
 * bumped, one record is read from the postings stream into the posting
 * object, and the posting's doc id is returned.  Once the postings are used
 * up, the posting is reset and 0 is returned.
 */
int32_t
SegPList_Next_IMP(SegPostingList *self) {
    SegPostingListIVARS *const ivars = SegPList_IVARS(self);

    if (ivars->count < ivars->doc_freq) {
        // Still have docs left: consume one record.
        ivars->count++;
        Post_Read_Record(ivars->posting, ivars->post_stream);
        return Post_IVARS(ivars->posting)->doc_id;
    }

    // Out of docs.
    Post_Reset(ivars->posting);
    return 0;
}

/* Advance to the first posting whose doc id is greater than or equal to
 * `target`.
 *
 * When the current term's doc_freq is at least skip_interval, records are
 * first read from the skip stream to find a position in the postings stream
 * that can be jumped to.  Whatever distance remains is then covered by a
 * linear scan using SegPList_Next().
 *
 * Returns the doc id landed on, or 0 if the postings for the current term
 * run out first.
 */
int32_t
SegPList_Advance_IMP(SegPostingList *self, int32_t target) {
    SegPostingListIVARS *const ivars = SegPList_IVARS(self);
    PostingIVARS *const posting_ivars = Post_IVARS(ivars->posting);
    const uint32_t skip_interval = ivars->skip_interval;

    // Skip data is only consulted for terms with at least one full interval.
    if (ivars->doc_freq >= skip_interval) {
        InStream *post_stream           = ivars->post_stream;
        InStream *skip_stream           = ivars->skip_stream;
        SkipStepper *const skip_stepper = ivars->skip_stepper;
        SkipStepperIVARS *const skip_stepper_ivars
            = SkipStepper_IVARS(skip_stepper);
        // Candidate jump destination.  It starts at the stepper's current
        // doc id and the current postings position, so unless the loop below
        // finds a later position, no jump happens.
        uint32_t new_doc_id             = skip_stepper_ivars->doc_id;
        int64_t new_filepos             = InStream_Tell(post_stream);

        /* Assuming the default skip_interval of 16...
         *
         * Say we're currently on the 5th doc matching this term, and we get a
         * request to skip to the 18th doc matching it.  We won't have skipped
         * yet, but we'll have already gone past 5 of the 16 skip docs --
         * ergo, the modulus in the following formula.
         */
        // NOTE(review): skip_interval is unsigned while num_skipped is
        // int32_t, so these negations appear to rely on unsigned wraparound
        // followed by conversion to a signed value -- confirm.
        int32_t num_skipped = 0 - (ivars->count % skip_interval);
        if (num_skipped == 0 && ivars->count != 0) {
            num_skipped = 0 - skip_interval;
        }

        // See if there's anything to skip.
        while (target > skip_stepper_ivars->doc_id) {
            // Remember the stepper's current entry as the best destination
            // found so far before reading the next one.
            new_doc_id  = skip_stepper_ivars->doc_id;
            new_filepos = skip_stepper_ivars->filepos;

            // Credit a full interval only for entries at or beyond the
            // posting's current doc id.  An entry with doc id 0 (the value
            // S_seek_tinfo() primes the stepper with) is never credited.
            if (skip_stepper_ivars->doc_id != 0
                && skip_stepper_ivars->doc_id >= posting_ivars->doc_id
               ) {
                num_skipped += skip_interval;
            }

            // Stop once every skip record for this term has been read.
            if (ivars->skip_count >= ivars->num_skips) {
                break;
            }

            SkipStepper_Read_Record(skip_stepper, skip_stream);
            ivars->skip_count++;
        }

        // If we found something to skip, skip it.
        if (new_filepos > InStream_Tell(post_stream)) {

            // Move the postings filepointer up.
            InStream_Seek(post_stream, new_filepos);

            // Jump to the new doc id.
            posting_ivars->doc_id = new_doc_id;

            // Increase count by the number of docs we skipped over.
            ivars->count += num_skipped;
        }
    }

    // Done skipping, so scan.
    while (1) {
        int32_t doc_id = SegPList_Next(self);
        // A doc id of 0 means Next() ran out of postings.
        if (doc_id == 0 || doc_id >= target) {
            return doc_id;
        }
    }
}

/* Position this posting list on the term `target`.
 *
 * Asks the reader's LexiconReader for the term's TermInfo in this list's
 * field and passes the result to S_seek_tinfo(); the TermInfo is released
 * afterwards.
 */
void
SegPList_Seek_IMP(SegPostingList *self, Obj *target) {
    SegPostingListIVARS *const ivars = SegPList_IVARS(self);
    LexiconReader *const lexicon_reader
        = PListReader_Get_Lex_Reader(ivars->plist_reader);
    TermInfo *term_info
        = LexReader_Fetch_Term_Info(lexicon_reader, ivars->field, target);

    S_seek_tinfo(self, term_info);

    DECREF(term_info);
}

/* Position this posting list on the term that `lexicon` currently points
 * at.
 *
 * When `lexicon` is a SegLexicon over the same Segment object as this
 * list's reader, its TermInfo is used directly.  Otherwise the term itself
 * is fetched and run through SegPList_Seek(), which costs an extra
 * LexReader_Fetch_Term_Info() lookup.
 */
void
SegPList_Seek_Lex_IMP(SegPostingList *self, Lexicon *lexicon) {
    SegPostingListIVARS *const ivars = SegPList_IVARS(self);

    // Only treat `lexicon` as a SegLexicon once the class check has passed.
    bool same_segment = false;
    if (Obj_is_a((Obj*)lexicon, SEGLEXICON)) {
        SegLexicon *const seg_lex = (SegLexicon*)lexicon;
        same_segment = (SegLex_Get_Segment(seg_lex)
                        == PListReader_Get_Segment(ivars->plist_reader));
    }

    if (!same_segment) {
        // Punt case: look the term up through the general seek path.
        SegPList_Seek(self, Lex_Get_Term(lexicon));
        return;
    }

    // Optimized case: reuse the TermInfo the lexicon already holds.
    S_seek_tinfo(self, SegLex_Get_Term_Info((SegLexicon*)lexicon));
}

/* Reposition the posting list according to `tinfo`.
 *
 * The iteration count is always reset to 0.  With a NULL `tinfo`, doc_freq
 * becomes 0 and nothing else is touched, so Next() will report exhaustion.
 * Otherwise doc_freq is copied from `tinfo`, the postings stream is moved to
 * the term's postings position, the posting is reset, and the skip state
 * (counter, number of skip records, stepper, skip stream position) is
 * prepared for the new term.
 */
static void
S_seek_tinfo(SegPostingList *self, TermInfo *tinfo) {
    SegPostingListIVARS *const ivars = SegPList_IVARS(self);

    ivars->count = 0;

    if (tinfo == NULL) {
        // No such term: Next() will come up empty.
        ivars->doc_freq = 0;
        return;
    }

    // Copy doc_freq and move the postings stream to the term's records.
    const int64_t postings_start = TInfo_Get_Post_FilePos(tinfo);
    ivars->doc_freq = TInfo_Get_Doc_Freq(tinfo);
    InStream_Seek(ivars->post_stream, postings_start);

    // Start from a clean posting.
    Post_Reset(ivars->posting);

    // Reset skip state: one skip record per full skip_interval of docs.
    ivars->skip_count = 0;
    ivars->num_skips  = ivars->doc_freq / ivars->skip_interval;
    SkipStepper_Set_ID_And_Filepos(ivars->skip_stepper, 0, postings_start);
    InStream_Seek(ivars->skip_stream, TInfo_Get_Skip_FilePos(tinfo));
}

// Build a Matcher by delegating to this list's posting object, passing
// `self` along as the PostingList together with the caller's `sim`,
// `compiler` and `need_score`.
Matcher*
SegPList_Make_Matcher_IMP(SegPostingList *self, Similarity *sim,
                          Compiler *compiler, bool need_score) {
    Posting *const posting = SegPList_IVARS(self)->posting;
    Matcher *matcher = Post_Make_Matcher(posting, sim, (PostingList*)self,
                                         compiler, need_score);
    return matcher;
}

// Read a RawPosting by delegating to Post_Read_Raw() with this list's
// posting and postings stream; `last_doc_id`, `term_text` and `mem_pool`
// are forwarded unchanged.
RawPosting*
SegPList_Read_Raw_IMP(SegPostingList *self, int32_t last_doc_id,
                      String *term_text, MemoryPool *mem_pool) {
    SegPostingListIVARS *const ivars = SegPList_IVARS(self);
    Posting  *const posting     = ivars->posting;
    InStream *const post_stream = ivars->post_stream;
    RawPosting *raw = Post_Read_Raw(posting, post_stream, last_doc_id,
                                    term_text, mem_pool);
    return raw;
}