The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define C_LUCY_DOCVECTOR
#include "Lucy/Util/ToolSet.h"

#include "Lucy/Index/DocVector.h"
#include "Lucy/Index/TermVector.h"
#include "Lucy/Store/InStream.h"
#include "Lucy/Store/OutStream.h"

// Extract a document's compressed TermVector data into (term_text =>
// compressed positional data) pairs.
static Hash*
S_extract_tv_cache(ByteBuf *field_buf);

// Pull a TermVector object out from compressed positional data.
static TermVector*
S_extract_tv_from_tv_buf(const CharBuf *field, const CharBuf *term_text,
                         ByteBuf *tv_buf);

DocVector*
DocVec_new() {
    DocVector *self = (DocVector*)VTable_Make_Obj(DOCVECTOR);
    return DocVec_init(self);
}

DocVector*
DocVec_init(DocVector *self) {
    self->field_bufs    = Hash_new(0);
    self->field_vectors = Hash_new(0);
    return self;
}

void
DocVec_serialize(DocVector *self, OutStream *outstream) {
    Hash_Serialize(self->field_bufs, outstream);
    Hash_Serialize(self->field_vectors, outstream);
}

DocVector*
DocVec_deserialize(DocVector *self, InStream *instream) {
    self = self ? self : (DocVector*)VTable_Make_Obj(DOCVECTOR);
    self->field_bufs    = Hash_deserialize(NULL, instream);
    self->field_vectors = Hash_deserialize(NULL, instream);
    return self;
}

void
DocVec_destroy(DocVector *self) {
    DECREF(self->field_bufs);
    DECREF(self->field_vectors);
    SUPER_DESTROY(self, DOCVECTOR);
}

void
DocVec_add_field_buf(DocVector *self, const CharBuf *field,
                     ByteBuf *field_buf) {
    Hash_Store(self->field_bufs, (Obj*)field, INCREF(field_buf));
}

ByteBuf*
DocVec_field_buf(DocVector *self, const CharBuf *field) {
    return (ByteBuf*)Hash_Fetch(self->field_bufs, (Obj*)field);
}

VArray*
DocVec_field_names(DocVector *self) {
    return Hash_Keys(self->field_bufs);
}

TermVector*
DocVec_term_vector(DocVector *self, const CharBuf *field,
                   const CharBuf *term_text) {
    Hash *field_vector = (Hash*)Hash_Fetch(self->field_vectors, (Obj*)field);

    // If no cache hit, try to fill cache.
    if (field_vector == NULL) {
        ByteBuf *field_buf
            = (ByteBuf*)Hash_Fetch(self->field_bufs, (Obj*)field);

        // Bail if there's no content or the field isn't highlightable.
        if (field_buf == NULL) { return NULL; }

        field_vector = S_extract_tv_cache(field_buf);
        Hash_Store(self->field_vectors, (Obj*)field, (Obj*)field_vector);
    }

    // Get a buf for the term text or bail.
    ByteBuf *tv_buf = (ByteBuf*)Hash_Fetch(field_vector, (Obj*)term_text);
    if (tv_buf == NULL) {
        return NULL;
    }

    return S_extract_tv_from_tv_buf(field, term_text, tv_buf);
}

static Hash*
S_extract_tv_cache(ByteBuf *field_buf) {
    Hash    *tv_cache  = Hash_new(0);
    char    *tv_string = BB_Get_Buf(field_buf);
    int32_t  num_terms = NumUtil_decode_c32(&tv_string);
    CharBuf *text      = CB_new(0);

    // Read the number of highlightable terms in the field.
    for (int32_t i = 0; i < num_terms; i++) {
        size_t   overlap = NumUtil_decode_c32(&tv_string);
        size_t   len     = NumUtil_decode_c32(&tv_string);

        // Decompress the term text.
        CB_Set_Size(text, overlap);
        CB_Cat_Trusted_Str(text, tv_string, len);
        tv_string += len;

        // Get positions & offsets string.
        char *bookmark_ptr      = tv_string;
        int32_t num_positions   = NumUtil_decode_c32(&tv_string);
        while (num_positions--) {
            // Leave nums compressed to save a little mem.
            NumUtil_skip_cint(&tv_string);
            NumUtil_skip_cint(&tv_string);
            NumUtil_skip_cint(&tv_string);
        }
        len = tv_string - bookmark_ptr;

        // Store the $text => $posdata pair in the output hash.
        Hash_Store(tv_cache, (Obj*)text,
                   (Obj*)BB_new_bytes(bookmark_ptr, len));
    }
    DECREF(text);

    return tv_cache;
}

static TermVector*
S_extract_tv_from_tv_buf(const CharBuf *field, const CharBuf *term_text,
                         ByteBuf *tv_buf) {
    TermVector *retval      = NULL;
    char       *posdata     = BB_Get_Buf(tv_buf);
    char       *posdata_end = posdata + BB_Get_Size(tv_buf);
    int32_t    *positions   = NULL;
    int32_t    *starts      = NULL;
    int32_t    *ends        = NULL;
    uint32_t    num_pos     = 0;

    if (posdata != posdata_end) {
        num_pos   = NumUtil_decode_c32(&posdata);
        positions = (int32_t*)MALLOCATE(num_pos * sizeof(int32_t));
        starts    = (int32_t*)MALLOCATE(num_pos * sizeof(int32_t));
        ends      = (int32_t*)MALLOCATE(num_pos * sizeof(int32_t));
    }

    // Expand C32s.
    for (uint32_t i = 0; i < num_pos; i++) {
        positions[i] = NumUtil_decode_c32(&posdata);
        starts[i]    = NumUtil_decode_c32(&posdata);
        ends[i]      = NumUtil_decode_c32(&posdata);
    }

    if (posdata != posdata_end) {
        THROW(ERR, "Bad encoding of posdata");
    }
    else {
        I32Array *posits_map = I32Arr_new_steal(positions, num_pos);
        I32Array *starts_map = I32Arr_new_steal(starts, num_pos);
        I32Array *ends_map   = I32Arr_new_steal(ends, num_pos);
        retval = TV_new(field, term_text, posits_map, starts_map, ends_map);
        DECREF(posits_map);
        DECREF(starts_map);
        DECREF(ends_map);
    }

    return retval;
}