/* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define C_LUCY_INVERSION
#define C_LUCY_TOKEN
#include "Lucy/Util/ToolSet.h"
#include "Lucy/Analysis/Inversion.h"
#include "Lucy/Analysis/Token.h"
#include "Lucy/Util/SortUtils.h"
#ifndef SIZE_MAX
#define SIZE_MAX ((size_t)-1)
#endif
// After inversion, record how many like tokens occur in each group.
static void
S_count_clusters(Inversion *self);
Inversion*
Inversion_new(Token *seed_token) {
Inversion *self = (Inversion*)VTable_Make_Obj(INVERSION);
// Init.
self->cap = 16;
self->size = 0;
self->tokens = (Token**)CALLOCATE(self->cap, sizeof(Token*));
self->cur = 0;
self->inverted = false;
self->cluster_counts = NULL;
self->cluster_counts_size = 0;
// Process the seed token.
if (seed_token != NULL) {
Inversion_append(self, (Token*)INCREF(seed_token));
}
return self;
}
void
Inversion_destroy(Inversion *self) {
if (self->tokens) {
Token **tokens = self->tokens;
Token **const limit = tokens + self->size;
for (; tokens < limit; tokens++) {
DECREF(*tokens);
}
FREEMEM(self->tokens);
}
FREEMEM(self->cluster_counts);
SUPER_DESTROY(self, INVERSION);
}
uint32_t
Inversion_get_size(Inversion *self) {
return self->size;
}
Token*
Inversion_next(Inversion *self) {
// Kill the iteration if we're out of tokens.
if (self->cur == self->size) {
return NULL;
}
return self->tokens[self->cur++];
}
void
Inversion_reset(Inversion *self) {
self->cur = 0;
}
static void
S_grow(Inversion *self, size_t size) {
if (size > self->cap) {
uint64_t amount = size * sizeof(Token*);
// Clip rather than wrap.
if (amount > SIZE_MAX || amount < size) { amount = SIZE_MAX; }
self->tokens = (Token**)REALLOCATE(self->tokens, (size_t)amount);
self->cap = size;
memset(self->tokens + self->size, 0,
(size - self->size) * sizeof(Token*));
}
}
void
Inversion_append(Inversion *self, Token *token) {
if (self->inverted) {
THROW(ERR, "Can't append tokens after inversion");
}
if (self->size >= self->cap) {
size_t new_capacity = Memory_oversize(self->size + 1, sizeof(Token*));
S_grow(self, new_capacity);
}
self->tokens[self->size] = token;
self->size++;
}
Token**
Inversion_next_cluster(Inversion *self, uint32_t *count) {
Token **cluster = self->tokens + self->cur;
if (self->cur == self->size) {
*count = 0;
return NULL;
}
// Don't read past the end of the cluster counts array.
if (!self->inverted) {
THROW(ERR, "Inversion not yet inverted");
}
if (self->cur > self->cluster_counts_size) {
THROW(ERR, "Tokens were added after inversion");
}
// Place cluster count in passed-in var, advance bookmark.
*count = self->cluster_counts[self->cur];
self->cur += *count;
return cluster;
}
void
Inversion_invert(Inversion *self) {
Token **tokens = self->tokens;
Token **limit = tokens + self->size;
int32_t token_pos = 0;
// Thwart future attempts to append.
if (self->inverted) {
THROW(ERR, "Inversion has already been inverted");
}
self->inverted = true;
// Assign token positions.
for (; tokens < limit; tokens++) {
Token *const cur_token = *tokens;
cur_token->pos = token_pos;
token_pos = (int32_t)((uint32_t)token_pos + (uint32_t)cur_token->pos_inc);
if (token_pos < cur_token->pos) {
THROW(ERR, "Token positions out of order: %i32 %i32",
cur_token->pos, token_pos);
}
}
// Sort the tokens lexically, and hand off to cluster counting routine.
Sort_quicksort(self->tokens, self->size, sizeof(Token*), Token_compare,
NULL);
S_count_clusters(self);
}
static void
S_count_clusters(Inversion *self) {
Token **tokens = self->tokens;
uint32_t *counts
= (uint32_t*)CALLOCATE(self->size + 1, sizeof(uint32_t));
// Save the cluster counts.
self->cluster_counts_size = self->size;
self->cluster_counts = counts;
for (uint32_t i = 0; i < self->size;) {
Token *const base_token = tokens[i];
char *const base_text = base_token->text;
const size_t base_len = base_token->len;
uint32_t j = i + 1;
// Iterate through tokens until text doesn't match.
while (j < self->size) {
Token *const candidate = tokens[j];
if ((candidate->len == base_len)
&& (memcmp(candidate->text, base_text, base_len) == 0)
) {
j++;
}
else {
break;
}
}
// Record count at the position of the first token in the cluster.
counts[i] = j - i;
// Start the next loop at the next token we haven't seen.
i = j;
}
}