/* The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event. */
/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define C_LUCY_RICHPOSTING
#define C_LUCY_RICHPOSTINGMATCHER
#define C_LUCY_RAWPOSTING
#define C_LUCY_TOKEN
#include "Lucy/Util/ToolSet.h"

#include "Lucy/Index/Posting/RichPosting.h"
#include "Lucy/Analysis/Token.h"
#include "Lucy/Analysis/Inversion.h"
#include "Lucy/Index/Posting/RawPosting.h"
#include "Lucy/Index/PostingList.h"
#include "Lucy/Index/PostingPool.h"
#include "Lucy/Index/Similarity.h"
#include "Lucy/Plan/FieldType.h"
#include "Lucy/Search/Compiler.h"
#include "Lucy/Store/InStream.h"
#include "Lucy/Util/MemoryPool.h"

/* Bytes budgeted for the encoded freq field (one C32). */
#define FREQ_MAX_LEN     C32_MAX_BYTES

/* Upper bound, in bytes, on the allocation needed for a RawPosting holding
 * `_text_len` bytes of term text and `_freq` position entries.
 *
 * Every macro argument is parenthesized so that expression arguments
 * (e.g. `a + b`) expand correctly, and the per-position terms are computed
 * as size_t so the multiplication is not carried out in 32-bit arithmetic.
 * Each argument is evaluated more than once; do not pass expressions with
 * side effects.
 */
#define MAX_RAW_POSTING_LEN(_text_len, _freq) \
    (              sizeof(RawPosting) \
                   + (_text_len)                        /* term text content */ \
                   + FREQ_MAX_LEN                       /* freq c32 */ \
                   + (C32_MAX_BYTES * (size_t)(_freq))  /* positions deltas */ \
                   + (size_t)(_freq)                    /* per-pos boost byte */ \
    )

/* Allocate a RichPosting via VTable_Make_Obj and hand it to RichPost_init
 * together with `sim`; returns whatever RichPost_init returns.
 */
RichPosting*
RichPost_new(Similarity *sim) {
    RichPosting *fresh = (RichPosting*)VTable_Make_Obj(RICHPOSTING);
    RichPosting *ready = RichPost_init(fresh, sim);
    return ready;
}

/* Initialize `self`: run the ScorePosting initializer with `sim`, then
 * start with no per-position boost buffer.  Returns `self`.
 */
RichPosting*
RichPost_init(RichPosting *self, Similarity *sim) {
    // Parent-class setup first.
    ScorePost_init((ScorePosting*)self, sim);

    // The boost array is not allocated here; RichPost_read_record
    // (re)allocates it when it needs room.
    self->prox_boosts = NULL;

    return self;
}

/* Release the per-position boost buffer owned by `self`, then invoke
 * SUPER_DESTROY for the rest of the object.
 */
void
RichPost_destroy(RichPosting *self) {
    float *boost_buf = self->prox_boosts;
    FREEMEM(boost_buf);
    SUPER_DESTROY(self, RICHPOSTING);
}

/* Decode one posting record from `instream` into `self`.
 *
 * Record layout as consumed here:
 *   - a C32 "doc code": the upper bits (code >> 1) are the delta added to
 *     self->doc_id; if the low bit is set the freq is 1, otherwise the freq
 *     follows as a separate C32;
 *   - `freq` entries, each a C32 position delta followed by one byte that
 *     is looked up in self->norm_decoder to obtain that position's boost.
 *
 * On return self->prox holds the absolute positions, self->prox_boosts the
 * decoded boosts, and self->weight the mean of those boosts.
 */
void
RichPost_read_record(RichPosting *self, InStream *instream) {
    float *const norm_decoder = self->norm_decoder;
    uint32_t  num_prox = 0;
    uint32_t  position = 0;
    float     aggregate_weight = 0.0;

    // Decode delta doc.
    uint32_t doc_code = InStream_Read_C32(instream);
    self->doc_id += doc_code >> 1;

    // If the stored num was odd, the freq is 1.
    if (doc_code & 1) {
        self->freq = 1;
    }
    // Otherwise, freq was stored as a C32.
    else {
        self->freq = InStream_Read_C32(instream);
    }

    // Grow the position and boost buffers together when this record needs
    // more room than is currently recorded in prox_cap.
    num_prox = self->freq;
    if (num_prox > self->prox_cap) {
        self->prox
            = (uint32_t*)REALLOCATE(self->prox, num_prox * sizeof(uint32_t));
        self->prox_boosts
            = (float*)REALLOCATE(self->prox_boosts, num_prox * sizeof(float));
        // Remember the new capacity; without this the guard above compares
        // against a stale value and the buffers are reallocated on every
        // record instead of only when they must grow.
        self->prox_cap = num_prox;
    }
    uint32_t *positions    = self->prox;
    float    *prox_boosts  = self->prox_boosts;

    // Read positions, aggregate per-position boost byte into weight.
    while (num_prox--) {
        // Positions are stored as deltas from the previous position.
        position += InStream_Read_C32(instream);
        *positions++ = position;
        *prox_boosts = norm_decoder[InStream_Read_U8(instream)];
        aggregate_weight += *prox_boosts;
        prox_boosts++;
    }

    // The posting's weight is the average of its per-position boosts.
    self->weight = aggregate_weight / self->freq;
}

/* Convert every token cluster in `inversion` into a RawPosting allocated
 * from the pool's MemoryPool and feed it to `post_pool`.
 *
 * For each cluster (`freq` tokens, as reported by Inversion_Next_Cluster)
 * the posting's blob receives, after the term text, one entry per token:
 * the C32-encoded delta from the previous token's position followed by a
 * single byte produced by Sim_Encode_Norm from
 * (doc_boost * field boost * length_norm * token boost).
 */
void
RichPost_add_inversion_to_pool(RichPosting *self, PostingPool *post_pool,
                               Inversion *inversion, FieldType *type,
                               int32_t doc_id, float doc_boost,
                               float length_norm) {
    MemoryPool *mem_pool    = PostPool_Get_Mem_Pool(post_pool);
    Similarity *sim         = self->sim;
    float       field_boost = doc_boost * FType_Get_Boost(type) * length_norm;
    Token     **cluster;
    uint32_t    freq;

    Inversion_Reset(inversion);
    for (cluster = Inversion_Next_Cluster(inversion, &freq);
         cluster != NULL;
         cluster = Inversion_Next_Cluster(inversion, &freq)
        ) {
        // The first token of the cluster supplies the term text.
        Token      *head        = cluster[0];
        uint32_t    alloc_bytes = MAX_RAW_POSTING_LEN(head->len, freq);
        void       *memory      = MemPool_Grab(mem_pool, alloc_bytes);
        RawPosting *raw_posting
            = RawPost_new(memory, doc_id, freq, head->text, head->len);
        char *const aux_start = raw_posting->blob + head->len;
        char       *cursor    = aux_start;
        uint32_t    prev_pos  = 0;
        uint32_t    tick      = 0;

        // Emit one (position delta, boost byte) pair per token.
        while (tick < freq) {
            Token *const   tok        = cluster[tick];
            const uint32_t delta      = tok->pos - prev_pos;
            const float    tok_weight = field_boost * tok->boost;

            NumUtil_encode_c32(delta, &cursor);
            prev_pos = tok->pos;

            *((uint8_t*)cursor) = Sim_Encode_Norm(sim, tok_weight);
            cursor++;
            tick++;
        }

        // Shrink the allocation down to the bytes actually written.
        raw_posting->aux_len = cursor - aux_start;
        alloc_bytes = cursor - (char*)raw_posting;
        MemPool_Resize(mem_pool, raw_posting, alloc_bytes);
        PostPool_Feed(post_pool, &raw_posting);
    }
}

/* Read one posting from `instream` and return it as a RawPosting allocated
 * from `mem_pool`, without decoding the per-position data.
 *
 * The doc id is `last_doc_id` plus the delta stored in the upper bits of
 * the leading C32; a set low bit means freq 1, otherwise the freq follows
 * as its own C32.  Each of the `freq` position entries is copied through
 * as raw bytes (encoded integer plus one boost byte) after the term text
 * taken from `term_text`.  `self` is not used.
 */
RawPosting*
RichPost_read_raw(RichPosting *self, InStream *instream, int32_t last_doc_id,
                  CharBuf *term_text, MemoryPool *mem_pool) {
    char *const    term_bytes = (char*)CB_Get_Ptr8(term_text);
    const size_t   term_len   = CB_Get_Size(term_text);
    const uint32_t doc_code   = InStream_Read_C32(instream);
    const int32_t  doc_id     = last_doc_id + (doc_code >> 1);
    uint32_t       freq       = 1;
    UNUSED_VAR(self);

    // An even doc code means the freq was written out separately.
    if ((doc_code & 1) == 0) {
        freq = InStream_Read_C32(instream);
    }

    // Allocate for the worst case; trimmed below once the size is known.
    size_t      alloc_bytes = MAX_RAW_POSTING_LEN(term_len, freq);
    void *const memory      = MemPool_Grab(mem_pool, alloc_bytes);
    RawPosting *const raw_posting
        = RawPost_new(memory, doc_id, freq, term_bytes, term_len);
    char *const aux_start = raw_posting->blob + term_len;
    char       *cursor    = aux_start;
    uint32_t    remaining;

    // Copy each position entry: raw encoded integer, then its boost byte.
    for (remaining = freq; remaining > 0; remaining--) {
        cursor += InStream_Read_Raw_C64(instream, cursor);
        *((uint8_t*)cursor) = InStream_Read_U8(instream);
        cursor++;
    }

    // Shrink the allocation down to the bytes actually written.
    raw_posting->aux_len = cursor - aux_start;
    alloc_bytes          = cursor - (char*)raw_posting;
    MemPool_Resize(mem_pool, raw_posting, alloc_bytes);

    return raw_posting;
}

/* Create a RichPostingMatcher for `plist`, initialized with `sim` and
 * `compiler`.  Neither `self` nor `need_score` affects the result.
 */
RichPostingMatcher*
RichPost_make_matcher(RichPosting *self, Similarity *sim,
                      PostingList *plist, Compiler *compiler,
                      bool_t need_score) {
    UNUSED_VAR(self);
    UNUSED_VAR(need_score);
    return RichPostMatcher_init(
               (RichPostingMatcher*)VTable_Make_Obj(RICHPOSTINGMATCHER),
               sim, plist, compiler);
}

/* Initialize `self` by delegating entirely to ScorePostMatcher_init; the
 * value it returns is cast back to RichPostingMatcher* and returned.
 */
RichPostingMatcher*
RichPostMatcher_init(RichPostingMatcher *self, Similarity *sim,
                     PostingList *plist, Compiler *compiler) {
    RichPostingMatcher *initialized
        = (RichPostingMatcher*)ScorePostMatcher_init(
              (ScorePostingMatcher*)self, sim, plist, compiler);
    return initialized;
}