The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package KinoSearch1::Index::TermInfosWriter;
use strict;
use warnings;
use KinoSearch1::Util::ToolSet;
use base qw( KinoSearch1::Util::Class );

# Declare the constructor parameters and their default values.  new()
# validates its arguments against %instance_vars and layers the caller's
# values on top of these defaults.
# NOTE(review): init_instance_vars() comes from the base class and is
# presumably what populates %instance_vars -- confirm in
# KinoSearch1::Util::Class.
BEGIN {
    __PACKAGE__->init_instance_vars(
        # constructor params
        invindex       => undef,    # used by new() to delete/open the file
        seg_name       => undef,    # base name of the output file
        is_index       => 0,        # true => 'tii' suffix, false => 'tis'
        index_interval => 1024,     # handed to _new() and written to header
        skip_interval  => 16,       # handed to _new() and written to header
    );
}
our %instance_vars;

# Constructor.
#
# Takes labeled parameters (see the defaults declared above): invindex,
# seg_name, is_index, index_interval, skip_interval.  Opens
# "<seg_name>.tis" -- or "<seg_name>.tii" when is_index is true -- through
# the invindex, deleting any existing file of that name first, and wraps the
# resulting outstream in a C-level writer via _new().
#
# A primary (non-index) writer also creates its ".tii" doppelganger and
# cross-links the two with _set_other().  The doppelganger receives the same
# index_interval and skip_interval as the primary so that both file headers
# record identical values.
#
# Dies via confess() if verify_args() rejects the arguments.
# Returns the new writer.
sub new {
    my $class = shift;
    confess kerror() unless verify_args( \%instance_vars, @_ );
    my %args = ( %instance_vars, @_ );
    my $invindex = $args{invindex};

    # open an outstream, replacing any existing file of the same name
    my $suffix = $args{is_index} ? 'tii' : 'tis';
    my $filename = "$args{seg_name}.$suffix";
    $invindex->delete_file($filename) if $invindex->file_exists($filename);
    my $outstream = $invindex->open_outstream($filename);

    my $self = _new( $outstream,
        @args{qw( is_index index_interval skip_interval )} );

    # create the tii doppelganger
    if ( !$args{is_index} ) {
        my $other = __PACKAGE__->new(
            invindex       => $invindex,
            seg_name       => $args{seg_name},
            is_index       => 1,
            index_interval => $args{index_interval},
            skip_interval  => $args{skip_interval},
        );

        # NOTE(review): each writer now holds a reference to the other;
        # nothing visible in this file breaks that cycle.
        $self->_set_other($other);
        $other->_set_other($self);
    }

    return $self;
}

# Finalize the file: patch the term count into the header, let the tii
# doppelganger do the same (primary writer only), then close the outstream.
sub finish {
    my ($self) = @_;
    my $fh = $self->_get_outstream;

    # the count lives right after the 4-byte format int at the file's head
    $fh->seek(4);
    $fh->lu_write( 'Q', $self->_get_size );

    # only the primary writer is responsible for finishing its shadow
    $self->_get_other->finish unless $self->_get_is_index;

    $fh->close;
}

1;

__END__

__XS__

MODULE = KinoSearch1    PACKAGE = KinoSearch1::Index::TermInfosWriter

=for comment

Low-level constructor used by the Perl-space new().  Passes its arguments
straight through to Kino1_TInfosWriter_new, which extracts an OutStream from
outstream_sv (checking it against "KinoSearch1::Store::OutStream") and writes
the file header.

=cut

TermInfosWriter*
_new(outstream_sv, is_index, index_interval, skip_interval)
    SV  *outstream_sv;
    I32  is_index;
    I32  index_interval;
    I32  skip_interval;
CODE:
    RETVAL = Kino1_TInfosWriter_new(outstream_sv, is_index, index_interval, 
        skip_interval);
OUTPUT: RETVAL

=for comment

Add a Term (encoded as a termstring) and its associated TermInfo.

The termstring is expected to start with a packed field number, which
Kino1_TInfosWriter_add decodes.  The SV's string buffer is wrapped in a
stack-allocated ByteBuf and handed to Kino1_TInfosWriter_add.  Nothing is
pushed onto the Perl stack.

=cut

void
add(obj, termstring_sv, tinfo)
    TermInfosWriter *obj;
    SV              *termstring_sv;
    TermInfo        *tinfo;
PREINIT:
    ByteBuf bb;
    STRLEN len;
PPCODE:
    /* point the ByteBuf at the SV's own buffer; no copy is made here */
    bb.ptr  = SvPV(termstring_sv, len);
    bb.size = len;
    Kino1_TInfosWriter_add(obj, &bb, tinfo);

=for comment

Export the FORMAT constant to Perl.  The value is KINO_TINFOS_FORMAT, the
same integer that Kino1_TInfosWriter_new writes as the first field of the
file header.

=cut

IV
FORMAT()
CODE:
    RETVAL = KINO_TINFOS_FORMAT;
OUTPUT: RETVAL


=for comment

Combined accessor.  Each ALIAS below reaches this one XSUB with a different
numeric index, and the switch picks the behavior:

    _set_other      store a new doppelganger, then return a copy of it
    _get_other      return a copy of the stored doppelganger SV
    _get_outstream  return a copy of the stored outstream SV
    _get_is_index   return the is_index flag as an integer
    _get_size       return the number of terms added so far

Every call returns a freshly created SV.

NOTE(review): KINO_START_SET_OR_GET_SWITCH / KINO_END_SET_OR_GET_SWITCH are
defined elsewhere; presumably they open and close a switch on the alias index
and validate the argument count -- confirm against their definition.

=cut

SV*
_set_or_get(obj, ...)
    TermInfosWriter *obj;
ALIAS:
    _set_other     = 1
    _get_other     = 2
    _get_outstream = 4
    _get_is_index  = 6
    _get_size      = 8
CODE:
{
    KINO_START_SET_OR_GET_SWITCH

    /* setter: release the previous SV, keep a private copy of the new one,
     * and cache the C struct pointer extracted from it */
    case 1:  SvREFCNT_dec(obj->other_sv);
             obj->other_sv = newSVsv( ST(1) );
             Kino1_extract_struct(obj->other_sv, obj->other, TermInfosWriter*,
                "KinoSearch1::Index::TermInfosWriter");
             /* fall through */
    case 2:  RETVAL = newSVsv(obj->other_sv);
             break;

    case 4:  RETVAL = newSVsv(obj->fh_sv);
             break;

    case 6:  RETVAL = newSViv(obj->is_index);
             break;

    case 8:  RETVAL = newSViv(obj->size);
             break;

    KINO_END_SET_OR_GET_SWITCH
}
OUTPUT: RETVAL


=for comment

Destructor.  Hands the C struct to Kino1_TInfosWriter_destroy, which releases
the SVs and buffers the writer owns and frees the struct itself.

=cut

void
DESTROY(obj)
    TermInfosWriter *obj;
PPCODE:
    Kino1_TInfosWriter_destroy(obj);

__H__

#ifndef H_KINO_TERM_INFOS_WRITER
#define H_KINO_TERM_INFOS_WRITER 1

#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#include "KinoSearch1IndexTerm.h"
#include "KinoSearch1IndexTermInfo.h"
#include "KinoSearch1StoreOutStream.h"
#include "KinoSearch1UtilByteBuf.h"
#include "KinoSearch1UtilCClass.h"
#include "KinoSearch1UtilMathUtils.h"
#include "KinoSearch1UtilMemManager.h"
#include "KinoSearch1UtilStringHelper.h"

#define KINO_TINFOS_FORMAT -2

/* State for one term dictionary output file.  A primary writer and its index
 * doppelganger each have their own instance and point at one another through
 * `other`.
 */
typedef struct terminfoswriter {
    OutStream *fh;               /* output stream extracted from fh_sv */
    SV        *fh_sv;            /* private copy of the Perl outstream SV */
    I32        is_index;         /* true for the index (doppelganger) writer */
    I32        index_interval;   /* primary: every index_interval-th add also
                                  * writes an entry to `other` */
    I32        skip_interval;    /* skip_offset is written only when
                                  * doc_freq >= skip_interval */
    struct terminfoswriter* other; /* the paired writer, extracted from
                                    * other_sv; NULL until set */
    SV        *other_sv;         /* Perl SV for the paired writer */
    ByteBuf   *last_termstring;  /* previous termstring, for prefix sharing */
    TermInfo  *last_tinfo;       /* previous TermInfo, for delta encoding */
    I32        last_fieldnum;    /* initialized to -1 by the constructor */
    double     last_tis_ptr;     /* index writer: previous file position of
                                  * the paired writer's stream */
    I32        size;             /* number of terms added so far */
} TermInfosWriter;

/* Allocate a writer around an OutStream SV and write the file header.
 * Arguments: outstream_sv, is_index, index_interval, skip_interval. */
TermInfosWriter* Kino1_TInfosWriter_new(SV*, I32, I32, I32);
/* Append one termstring/TermInfo entry. */
void Kino1_TInfosWriter_add(TermInfosWriter*, ByteBuf*, TermInfo*);
/* Release everything the writer owns, then the writer itself. */
void Kino1_TInfosWriter_destroy(TermInfosWriter*);

#endif /* include guard */

__C__

#include "KinoSearch1IndexTermInfosWriter.h"

/* Allocate a TermInfosWriter around the supplied OutStream and write the
 * file header.
 *
 * outstream_sv   - Perl object of class KinoSearch1::Store::OutStream; a
 *                  private copy of the SV is kept in fh_sv.
 * is_index       - true if this writer produces the index file.
 * index_interval - recorded in the header; on a primary writer, every
 *                  index_interval-th term is also written to the paired
 *                  writer.
 * skip_interval  - recorded in the header; governs whether skip data is
 *                  written for a term.
 *
 * Header layout, in order: format (int), term count placeholder (long,
 * filled in later), index_interval (int), skip_interval (int).
 *
 * Returns the new writer; release it with Kino1_TInfosWriter_destroy.
 */
TermInfosWriter*
Kino1_TInfosWriter_new(SV *outstream_sv, I32 is_index, I32 index_interval,
                      I32 skip_interval) {
    TermInfosWriter *obj;

    /* allocate */
    Kino1_New(0, obj, 1, TermInfosWriter);

    /* assign */
    obj->is_index       = is_index;
    obj->index_interval = index_interval;
    obj->skip_interval  = skip_interval;
    obj->fh_sv          = newSVsv(outstream_sv);
    Kino1_extract_struct(obj->fh_sv, obj->fh, OutStream*,
        "KinoSearch1::Store::OutStream");
    /* NOTE: this value forces the first field_num in the .tii file to -1.
     * Do not change it. */
    obj->last_termstring    = Kino1_BB_new_string("\xff\xff", 2);
    obj->last_tinfo         = Kino1_TInfo_new();
    obj->last_fieldnum      = -1;
    obj->last_tis_ptr       = 0;
    obj->size               = 0;
    obj->other              = NULL;
    obj->other_sv           = &PL_sv_undef;

    /* write file header */
    obj->fh->write_int(obj->fh, KINO_TINFOS_FORMAT);
    obj->fh->write_long(obj->fh, 0.0); /* return to fill in later */
    obj->fh->write_int(obj->fh, index_interval);
    obj->fh->write_int(obj->fh, skip_interval);

    return obj;
}


/* Write out a term/terminfo combo.
 *
 * obj           - the writer; it must already be paired with its
 *                 doppelganger (obj->other).
 * termstring_bb - the encoded term: a packed big-endian field number of
 *                 KINO_FIELD_NUM_LEN bytes followed by the term text.
 * tinfo         - the TermInfo for the term.
 *
 * On a primary writer, every index_interval-th call first writes the
 * PREVIOUS term to the index writer.  The entry itself is prefix-compressed
 * against the previous termstring and delta-encoded against the previous
 * TermInfo.
 *
 * Croaks if the termstring is too short to hold a field number or if the
 * writer has no doppelganger.
 */
void
Kino1_TInfosWriter_add(TermInfosWriter* obj, ByteBuf* termstring_bb,
                      TermInfo* tinfo) {
    char      *termstring, *last_tstring;
    STRLEN     termstring_len, last_tstring_len;

    I32        field_num;
    I32        overlap;
    char      *diff_start_str;
    STRLEN     diff_len;
    OutStream* fh;

    /* Validate before anything is written.  A termstring shorter than the
     * field number prefix would make the unsigned length below wrap around
     * and send the comparison past the end of the buffer. */
    if (termstring_bb->size < KINO_FIELD_NUM_LEN)
        croak("TermInfosWriter: termstring too short (%d bytes)",
            (int)termstring_bb->size);

    /* both the primary and the index writer dereference obj->other below */
    if (obj->other == NULL)
        croak("TermInfosWriter: no doppelganger has been set");

    /* make local copy */
    fh = obj->fh;

    /* write a subset of the entries to the .tii index */
    if (    (obj->size % obj->index_interval == 0)
         && (!obj->is_index)
    ) {
        Kino1_TInfosWriter_add(obj->other, obj->last_termstring,
            obj->last_tinfo);
    }

    /* extract string pointers and string lengths */
    termstring       = termstring_bb->ptr;
    last_tstring     = obj->last_termstring->ptr;
    termstring_len   = termstring_bb->size;
    last_tstring_len = obj->last_termstring->size;

    /* to obtain field number, decode packed 'n' at top of termstring */
    field_num = (I16)Kino1_decode_bigend_U16(termstring);

    /* move past field_num */
    termstring       += KINO_FIELD_NUM_LEN;
    last_tstring     += KINO_FIELD_NUM_LEN;
    termstring_len   -= KINO_FIELD_NUM_LEN;
    last_tstring_len -= KINO_FIELD_NUM_LEN;

    /* count how many bytes the strings share at the top */
    overlap = Kino1_StrHelp_string_diff(last_tstring, termstring,
        last_tstring_len, termstring_len);
    diff_start_str = termstring + overlap;
    diff_len       = termstring_len - overlap;

    /* write number of common bytes */
    fh->write_vint(fh, overlap);

    /* write the remaining bytes, i.e. those NOT shared with the last term */
    fh->write_string(fh, diff_start_str, diff_len);

    /* write field number and doc_freq */
    fh->write_vint(fh, field_num);
    fh->write_vint(fh, tinfo->doc_freq);

    /* delta encode filepointers */
    fh->write_vlong(fh, (tinfo->frq_fileptr - obj->last_tinfo->frq_fileptr) );
    fh->write_vlong(fh, (tinfo->prx_fileptr - obj->last_tinfo->prx_fileptr) );

    /* write skipdata */
    if (tinfo->doc_freq >= obj->skip_interval)
        fh->write_vint(fh, tinfo->skip_offset);

    /* the .tii index file gets a pointer to the location of the primary */
    if (obj->is_index) {
        double tis_ptr;

        tis_ptr = obj->other->fh->tell(obj->other->fh);
        obj->fh->write_vlong(obj->fh, (tis_ptr - obj->last_tis_ptr));
        obj->last_tis_ptr = tis_ptr;
    }

    /* track number of terms */
    obj->size++;

    /* remember for delta encoding */
    Kino1_BB_assign_string(obj->last_termstring, termstring_bb->ptr,
        termstring_bb->size);
    StructCopy(tinfo, obj->last_tinfo, TermInfo);
}

/* Release everything the writer owns and then the writer itself: drop the
 * references held on the outstream SV and the paired-writer SV, destroy the
 * remembered termstring and TermInfo, and free the struct.  The stream is
 * not closed here.
 */
void
Kino1_TInfosWriter_destroy(TermInfosWriter *obj) {
    SvREFCNT_dec(obj->fh_sv);
    SvREFCNT_dec(obj->other_sv);
    Kino1_BB_destroy(obj->last_termstring);
    Kino1_TInfo_destroy(obj->last_tinfo);
    Kino1_Safefree(obj);
}


__POD__

==begin devdocs

==head1 NAME

KinoSearch1::Index::TermInfosWriter - write a term dictionary

==head1 DESCRIPTION

The TermInfosWriter writes both parts of the term dictionary.  The primary
instance creates a shadow TermInfosWriter that writes the index.

==head1 COPYRIGHT

Copyright 2005-2010 Marvin Humphrey

==head1 LICENSE, DISCLAIMER, BUGS, etc.

See L<KinoSearch1> version 1.01.

==end devdocs
==cut