The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#define C_KINO_INVERSION
#define C_KINO_TOKEN
#include "KinoSearch/Util/ToolSet.h"

#include "KinoSearch/Analysis/Inversion.h"
#include "KinoSearch/Analysis/Token.h"
#include "KinoSearch/Util/SortUtils.h"

#ifndef SIZE_MAX
  #define SIZE_MAX ((size_t)-1)
#endif

// After inversion, record how many like tokens occur in each group.
static void
S_count_clusters(Inversion *self);

Inversion*
Inversion_new(Token *seed_token) 
{
    Inversion *self = (Inversion*)VTable_Make_Obj(INVERSION);

    // Init. 
    self->cap                 = 16;
    self->size                = 0;
    self->tokens              = (Token**)CALLOCATE(self->cap, sizeof(Token*));
    self->cur                 = 0;
    self->inverted            = false;
    self->cluster_counts      = NULL;
    self->cluster_counts_size = 0;

    // Process the seed token. 
    if (seed_token != NULL) {
        Inversion_append(self, (Token*)INCREF(seed_token));
    }

    return self;
}

void            
Inversion_destroy(Inversion *self)
{       
    if (self->tokens) {
        Token **tokens       = self->tokens;
        Token **const limit  = tokens + self->size;
        for ( ; tokens < limit; tokens++) {
            DECREF(*tokens);
        }
        FREEMEM(self->tokens);
    }
    FREEMEM(self->cluster_counts);
    SUPER_DESTROY(self, INVERSION);
}

uint32_t
Inversion_get_size(Inversion *self) { return self->size; }

Token*
Inversion_next(Inversion *self) 
{
    // Kill the iteration if we're out of tokens. 
    if (self->cur == self->size)
        return NULL;
    return self->tokens[ self->cur++ ];
}

void
Inversion_reset(Inversion *self) 
{
    self->cur = 0;
}

static void
S_grow(Inversion *self, size_t size) 
{
    if (size > self->cap) {
        uint64_t amount = size * sizeof(Token*);
        // Clip rather than wrap. 
        if (amount > SIZE_MAX || amount < size) { amount = SIZE_MAX; }
        self->tokens = (Token**)REALLOCATE(self->tokens, (size_t)amount);
        self->cap    = size;
        memset(self->tokens + self->size, 0,
            (size - self->size) * sizeof(Token*));
    }
}

void
Inversion_append(Inversion *self, Token *token) 
{
    if (self->inverted) {
        THROW(ERR, "Can't append tokens after inversion");
    }
    if (self->size >= self->cap) {
        size_t new_capacity = Memory_oversize(self->size + 1, sizeof(Token*));
        S_grow(self, new_capacity);
    }
    self->tokens[ self->size ] = token;
    self->size++;
}

Token**
Inversion_next_cluster(Inversion *self, uint32_t *count)
{
    Token **cluster = self->tokens + self->cur;

    if (self->cur == self->size) {
        *count = 0;
        return NULL;
    }

    // Don't read past the end of the cluster counts array. 
    if (!self->inverted)
        THROW(ERR, "Inversion not yet inverted");
    if (self->cur > self->cluster_counts_size)
        THROW(ERR, "Tokens were added after inversion");

    // Place cluster count in passed-in var, advance bookmark. 
    *count = self->cluster_counts[ self->cur ];
    self->cur += *count;

    return cluster;
}

void 
Inversion_invert(Inversion *self)
{
    Token   **tokens = self->tokens;
    Token   **limit  = tokens + self->size;
    int32_t   token_pos = 0;

    // Thwart future attempts to append. 
    if (self->inverted)
        THROW(ERR, "Inversion has already been inverted");
    self->inverted = true;

    // Assign token positions. 
    for ( ;tokens < limit; tokens++) {
        Token *const cur_token = *tokens;
        cur_token->pos = token_pos;
        token_pos += cur_token->pos_inc;
        if (token_pos < cur_token->pos) {
            THROW(ERR, "Token positions out of order: %i32 %i32", 
                cur_token->pos, token_pos);
        }
    }

    // Sort the tokens lexically, and hand off to cluster counting routine. 
    Sort_quicksort(self->tokens, self->size, sizeof(Token*), Token_compare,
        NULL);
    S_count_clusters(self);
}

static void
S_count_clusters(Inversion *self)
{
    Token **tokens = self->tokens;
    uint32_t *counts 
        = (uint32_t*)CALLOCATE(self->size + 1, sizeof(uint32_t)); 

    // Save the cluster counts. 
    self->cluster_counts_size = self->size;
    self->cluster_counts = counts;

    for (uint32_t i = 0; i < self->size; ) {
        Token *const base_token = tokens[i];
        char  *const base_text  = base_token->text;
        const size_t base_len   = base_token->len;
        uint32_t     j          = i + 1;

        // Iterate through tokens until text doesn't match. 
        while (j < self->size) {
            Token *const candidate = tokens[j];

            if (   (candidate->len == base_len)
                && (memcmp(candidate->text, base_text, base_len) == 0)
            ) {
                j++;
            }
            else {
                break;
            }
        }

        // Record count at the position of the first token in the cluster. 
        counts[i] = j - i;

        // Start the next loop at the next token we haven't seen. 
        i = j;
    }
}

/* Copyright 2006-2011 Marvin Humphrey
 *
 * This program is free software; you can redistribute it and/or modify
 * under the same terms as Perl itself.
 */