The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package KinoSearch1::Document::Field;
use strict;
use warnings;
use KinoSearch1::Util::ToolSet;
use base qw( KinoSearch1::Util::Class );

BEGIN {
    __PACKAGE__->init_instance_vars(
        # constructor args / members
        name       => undef,
        analyzer   => undef,
        boost      => 1,
        stored     => 1,
        indexed    => 1,
        analyzed   => 1,
        vectorized => 1,
        binary     => 0,
        compressed => 0,
        omit_norms => 0,
        field_num  => undef,
        value      => '',
        fnm_bits   => undef,
        fdt_bits   => undef,
        tv_string  => '',
        tv_cache   => undef,
    );
    __PACKAGE__->ready_get_set(
        qw(
            value
            tv_string
            boost
            indexed
            stored
            analyzed
            vectorized
            binary
            compressed
            analyzer
            field_num
            name
            omit_norms
            )
    );
}

use KinoSearch1::Index::FieldsReader;
use KinoSearch1::Index::FieldInfos;
use KinoSearch1::Index::TermVector;

use Storable qw( dclone );

sub init_instance {
    my $self = shift;

    # field name is required
    croak("Missing required parameter 'name'")
        unless length $self->{name};

    # don't index binary fields
    if ( $self->{binary} ) {
        $self->{indexed}  = 0;
        $self->{analyzed} = 0;
    }
}

sub clone {
    my $self = shift;
    return dclone($self);
}

# Given two Field objects, return a child which has all the positive
# attributes of both parents (meaning: values are OR'd).
sub breed_with {
    my ( $self, $other ) = @_;
    my $kid = $self->clone;
    for (qw( indexed vectorized )) {
        $kid->{$_} ||= $other->{$_};
    }
    return $kid;
}

sub set_fnm_bits { $_[0]->{fnm_bits} = $_[1] }

sub get_fnm_bits {
    my $self = shift;
    $self->{fnm_bits} = KinoSearch1::Index::FieldInfos->encode_fnm_bits($self)
        unless defined $self->{fnm_bits};
    return $self->{fnm_bits};
}

sub set_fdt_bits { $_[0]->{fdt_bits} = $_[1] }

sub get_fdt_bits {
    my $self = shift;
    $self->{fdt_bits}
        = KinoSearch1::Index::FieldsReader->encode_fdt_bits($self)
        unless defined $self->{fdt_bits};
    return $self->{fdt_bits};
}

sub get_value_len { bytes::length( $_[0]->{value} ) }

# Return a TermVector object for a given Term, if it's in this field.
sub term_vector {
    my ( $self, $term_text ) = @_;
    return unless bytes::length( $self->{tv_string} );
    if ( !defined $self->{tv_cache} ) {
        $self->{tv_cache} = _extract_tv_cache( $self->{tv_string} );
    }
    if ( exists $self->{tv_cache}{$term_text} ) {
        my ( $positions, $starts, $ends )
            = _unpack_posdata( $self->{tv_cache}{$term_text} );
        my $term_vector = KinoSearch1::Index::TermVector->new(
            text          => $term_text,
            field         => $self->{name},
            positions     => $positions,
            start_offsets => $starts,
            end_offsets   => $ends,
        );
        return $term_vector;
    }

    return;
}

1;

__END__

__XS__

MODULE = KinoSearch1    PACKAGE = KinoSearch1::Document::Field

=for comment

Return ref to a hash where the keys are term texts and the values are encoded
positional data.

=cut

void
_extract_tv_cache(tv_string_sv)
    SV *tv_string_sv;
PREINIT:
    HV *tv_cache_hv;
PPCODE:
    tv_cache_hv = Kino1_Field_extract_tv_cache(tv_string_sv);
    XPUSHs( sv_2mortal( newRV_noinc( (SV*)tv_cache_hv ) ) );
    XSRETURN(1);

=for comment

Decompress positional data.

=cut

void
_unpack_posdata(posdata_sv)
    SV *posdata_sv;
PREINIT:
    AV     *positions_av, *starts_av, *ends_av;
PPCODE:
    positions_av = newAV();
    starts_av    = newAV();
    ends_av      = newAV();
    Kino1_Field_unpack_posdata(posdata_sv, positions_av, starts_av, ends_av);
    XPUSHs(sv_2mortal( newRV_noinc((SV*)positions_av) ));
    XPUSHs(sv_2mortal( newRV_noinc((SV*)starts_av)    ));
    XPUSHs(sv_2mortal( newRV_noinc((SV*)ends_av)      ));
    XSRETURN(3);


__H__

#ifndef H_KINOSEARCH_FIELD
#define H_KINOSEARCH_FIELD 1

#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#include "KinoSearch1StoreInStream.h"
#include "KinoSearch1UtilCarp.h"

HV*  Kino1_Field_extract_tv_cache(SV*);
void Kino1_Field_unpack_posdata(SV*, AV*, AV*, AV*);

#endif /* include guard */

__C__

#include "KinoSearch1DocumentField.h"

HV* 
Kino1_Field_extract_tv_cache(SV *tv_string_sv) {
    HV *tv_cache_hv;
    char    *tv_string, *bookmark_ptr, *key;
    char   **tv_ptr;
    STRLEN   len, tv_len, overlap, key_len;
    SV      *text_sv, *nums_sv;
    I32      i, num_terms, num_positions;

    /* allocate a new hash */
    tv_cache_hv = newHV();
    
    /* extract pointers */
    tv_string = SvPV(tv_string_sv, tv_len);
    tv_ptr    = &tv_string;

    /* create a base text scalar */
    text_sv = newSV(1);
    SvPOK_on(text_sv);
    *(SvEND(text_sv)) = '\0';

    /* read the number of vectorized terms in the field */
    num_terms = Kino1_InStream_decode_vint(tv_ptr);
    for (i = 0; i < num_terms; i++) {

        /* decompress the term text */
        overlap = Kino1_InStream_decode_vint(tv_ptr);
        SvCUR_set(text_sv, overlap);
        len = Kino1_InStream_decode_vint(tv_ptr);
        sv_catpvn(text_sv, *tv_ptr, len);
        *tv_ptr += len;
        key = SvPV(text_sv, key_len);

        /* get positions & offsets string */
        num_positions = Kino1_InStream_decode_vint(tv_ptr);
        bookmark_ptr = *tv_ptr;
        while(num_positions--) {
            /* leave nums compressed to save a little mem */
            (void)Kino1_InStream_decode_vint(tv_ptr);
            (void)Kino1_InStream_decode_vint(tv_ptr);
            (void)Kino1_InStream_decode_vint(tv_ptr);
        }
        len = *tv_ptr - bookmark_ptr;
        nums_sv = newSVpvn(bookmark_ptr, len);

        /* store the $text => $posdata pair in the output hash */
        hv_store(tv_cache_hv, key, key_len, nums_sv, 0);
    }
    SvREFCNT_dec(text_sv);

    return tv_cache_hv;
}

void
Kino1_Field_unpack_posdata(SV *posdata_sv, AV *positions_av, 
                          AV *starts_av,  AV *ends_av) {
    STRLEN  len;
    char   *posdata, *posdata_end;
    char  **posdata_ptr;
    SV     *num_sv;
    posdata      = SvPV(posdata_sv, len);
    posdata_ptr  = &posdata;
    posdata_end  = SvEND(posdata_sv);

    /* translate encoded VInts to Perl scalars */
    while(*posdata_ptr < posdata_end) {
        num_sv = newSViv( Kino1_InStream_decode_vint(posdata_ptr) );
        av_push(positions_av, num_sv);
        num_sv = newSViv( Kino1_InStream_decode_vint(posdata_ptr) );
        av_push(starts_av,    num_sv);
        num_sv = newSViv( Kino1_InStream_decode_vint(posdata_ptr) );
        av_push(ends_av,      num_sv);
    }

    if (*posdata_ptr != posdata_end)
        Kino1_confess("Bad encoding of posdata");
}

__POD__

=head1 NAME

KinoSearch1::Document::Field - a field within a document

=head1 SYNOPSIS

    # no public interface

=head1 DESCRIPTION

Fields can only be defined or manipulated indirectly, via InvIndexer and Doc.

=head1 COPYRIGHT

Copyright 2005-2010 Marvin Humphrey

=head1 LICENSE, DISCLAIMER, BUGS, etc.

See L<KinoSearch1> version 1.01.

=cut