The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.
package KinoSearch1::Store::InStream;
use base qw( KinoSearch1::Util::CClass );
use strict;
use warnings;
use KinoSearch1::Util::ToolSet;

sub close { CORE::close shift->get_fh }

=for comment
Dupe the filehandle and create a new object around the dupe.  Seek the dupe
to the same spot as the original.

=cut

sub clone_stream {
    my $self = shift;
    open( my $duped_fh, '<&=', $self->get_fh )
        or confess("Couldn't dupe filehandle: $!");
    my $evil_twin
        = __PACKAGE__->new( $duped_fh, $self->get_offset, $self->length, );
    $evil_twin->seek( $self->tell );
    return $evil_twin;
}

1;

__END__

__XS__

MODULE = KinoSearch1    PACKAGE = KinoSearch1::Store::InStream

=begin comment

    my $instream = KinoSearch1::Store::Instream->new( 
        $filehandle, $offset, $length 
    );

Constructor.  Takes 1-3 arguments, and unlike most classes in the KinoSearch1
suite, the arguments to the constructor are not labeled parameters.

The second argument, an offset, defaults to 0 if not supplied.  Non-zero
offsets get factored in when calling seek and tell.

The last argument, a length, is the length of the "file" in bytes.  Supplying
an explicit value is only essential for InStreams which are assigned to read a
portion of a compound file -- otherwise, the length gets auto-calculated
correctly.

=end comment
=cut

InStream*
new(class, fh_sv, ...)
    char   *class;
    SV     *fh_sv;
PREINIT:
    double  offset = 0;
    double  len    = -1;
CODE:
    if (items > 2) {
        SV* offset_sv;
        offset_sv = ST(2);
        if (SvOK(offset_sv))
            offset = SvNV(offset_sv);
    }
    if (items > 3) {
        SV *len_sv;
        len_sv = ST(3);
        if (SvOK(len_sv))
            len = SvNV(len_sv);
    }
    RETVAL = Kino1_InStream_new(class, fh_sv, offset, len);
OUTPUT: RETVAL


=for comment
Seek to target plus the object's start offset.

=cut

void
seek(instream, target)
    InStream *instream;
    double    target;
PPCODE:
    instream->seek(instream, target);

=for comment
Return the filehandle's position minus the offset.

=cut

double
tell(instream)
    InStream *instream;
CODE:
    RETVAL = instream->tell(instream);
OUTPUT: RETVAL

=for comment
Return the length of the "file" in bytes, factoring in the offset.

=cut

double
length(instream)
    InStream *instream;
CODE:
    RETVAL = instream->len;
OUTPUT: RETVAL

=begin comment

    @items = $instream->lu_read( TEMPLATE );

Read the items specified by TEMPLATE from the InStream.

=end comment
=cut

SV*
_set_or_get(instream, ...)
    InStream *instream;
ALIAS:
    set_len      = 1
    get_len      = 2
    set_offset   = 3
    get_offset   = 4
    set_fh       = 5
    get_fh       = 6
CODE:
{
    KINO_START_SET_OR_GET_SWITCH

    case 1:  instream->len = SvNV( ST(1) );
             /* fall through */
    case 2:  RETVAL = newSVnv(instream->len);
             break;
    
    case 3:  instream->offset = SvNV( ST(1) );
             /* fall through */
    case 4:  RETVAL = newSVnv(instream->offset);
             break;
    
    case 5:  Kino1_confess("Can't set_fh");
             /* fall through */
    case 6:  RETVAL = newSVsv(instream->fh_sv);
             break;

    KINO_END_SET_OR_GET_SWITCH
}
OUTPUT: RETVAL


void
lu_read (instream, template_sv)
    InStream *instream;
    SV       *template_sv
PREINIT:
    STRLEN    tpt_len;      /* bytelength of template */
    char     *template;     /* ptr to a spot in the template */
    char     *tpt_end;      /* ptr to the end of the template */
    int       repeat_count; /* number of times to repeat sym */
    char      sym;          /* the current symbol in the template */
    char      countsym;     /* used when calculating repeat counts */
    IV        aIV;
    SV       *aSV;
    char      aChar;
    char*     string;
    STRLEN    len;
PPCODE:
{
    /* prepare template string pointers */
    template    = SvPV(template_sv, tpt_len);
    tpt_end     = SvEND(template_sv);

    repeat_count = 0;
    while (1) {
        if (repeat_count == 0) {
            /* fast-forward past space characters */
            while (*template == ' ' && template < tpt_end) {
                template++;
            }

            /* break out of the loop if we've exhausted the template */
            if (template == tpt_end) {
                break;
            }
            
            /* derive the current symbol and a possible digit repeat sym */
            sym      = *template++;
            countsym = *template;

            if (template == tpt_end) { 
                /* sym is last char in template, so process once */
                repeat_count = 1;
            }
            else if (countsym >= '0' && countsym <= '9') {
                /* calculate numerical repeat count */
                repeat_count = countsym - KINO_NUM_CHAR_OFFSET;
                countsym = *(++template);
                while (  template <= tpt_end 
                      && countsym >= '0' 
                      && countsym <= '9'
                ) {
                    repeat_count = (repeat_count * 10) 
                        + (countsym - KINO_NUM_CHAR_OFFSET);
                    countsym = *(++template);
                }
            }
            else { /* no numeric repeat count, so process sym only once */
                repeat_count = 1;
            }
        }

        /* thwart potential infinite loop */
        if (repeat_count < 1)
            Kino1_confess( "invalid repeat_count: %d", repeat_count);
        
        switch(sym) {

        case 'a': /* arbitrary binary data */
            len = repeat_count;
            repeat_count = 1;
            aSV = newSV(len + 1);
            SvCUR_set(aSV, len);
            SvPOK_on(aSV);
            string = SvPVX(aSV);
            instream->read_bytes(instream, string, len);
            break;

        case 'b': /* signed byte */
        case 'B': /* unsigned byte */
            aChar = instream->read_byte(instream);
            if (sym == 'b') 
                aIV = (signed char)aChar;
            else
                aIV = (unsigned char)aChar;
            aSV = newSViv(aIV);
            break;

        case 'i': /* signed 32-bit integer */
            aSV = newSViv( (I32)instream->read_int(instream) );
            break;
            
        case 'I': /* unsigned 32-bit integer */
            aSV = newSVuv( instream->read_int(instream) );
            break;

        case 'Q': /* unsigned "64-bit integer" */
            aSV = newSVnv( instream->read_long(instream) );
            break;

        case 'T': /* string */
            len = instream->read_vint(instream);
            aSV = newSV(len + 1);
            SvCUR_set(aSV, len);
            SvPOK_on(aSV);
            string = SvPVX(aSV);
            instream->read_chars(instream, string, 0, len);
            break;

        case 'V': /* VInt */
            aSV = newSVuv( instream->read_vint(instream) );
            break;

        case 'W': /* VLong */
            aSV = newSVnv( instream->read_vlong(instream) );
            break;

        default: 
            aSV = NULL; /* suppress unused var compiler warning */
            Kino1_confess("Invalid type in template: '%c'", sym);
        }

        /* Put a scalar on the stack, use up one symbol or repeater */
        XPUSHs( sv_2mortal(aSV) );
        repeat_count -= 1;
    }
}

void
DESTROY(instream)
    InStream *instream;
PPCODE:
    Kino1_InStream_destroy(instream);

__H__


#ifndef H_KINOSEARCH_STORE_INSTREAM
#define H_KINOSEARCH_STORE_INSTREAM 1

#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#include "KinoSearch1UtilCarp.h"
#include "KinoSearch1UtilMathUtils.h"

/* Detect whether we're on an ASCII or EBCDIC machine. */
#if '0' == 240
#define KINO_NUM_CHAR_OFFSET 240
#else
#define KINO_NUM_CHAR_OFFSET 48
#endif

#define KINO_IO_STREAM_BUF_SIZE 1024

typedef struct instream {
    PerlIO  *fh;
    SV      *fh_sv;
    double   offset;
    double   len;
    char    *buf;          
    Off_t    buf_start;    /* file position of start of buffer */
    int      buf_len;      /* number of valid bytes in the buffer */
    int      buf_pos;      /* next byte to read */
    void   (*seek)(struct instream*, double);
    double (*tell)(struct instream*);
    char   (*read_byte)(struct instream*);
    void   (*read_bytes)(struct instream*, char*, STRLEN);
    void   (*read_chars)(struct instream*, char*, STRLEN, STRLEN);
    U32    (*read_int)(struct instream*);
    double (*read_long)(struct instream*);
    U32    (*read_vint)(struct instream*);
    double (*read_vlong)(struct instream*);
} InStream;

InStream* Kino1_InStream_new     (char*, SV*, double, double);
void   Kino1_InStream_seek       (InStream*, double);
double Kino1_InStream_tell       (InStream*);
void   Kino1_InStream_refill     (InStream*);
char   Kino1_InStream_read_byte  (InStream*);
void   Kino1_InStream_read_bytes (InStream*, char*, STRLEN);
void   Kino1_InStream_read_chars (InStream*, char*, STRLEN, STRLEN);
U32    Kino1_InStream_read_int   (InStream*);
double Kino1_InStream_read_long  (InStream*);
U32    Kino1_InStream_decode_vint(char**);
U32    Kino1_InStream_read_vint  (InStream*);
double Kino1_InStream_read_vlong (InStream*);
void   Kino1_InStream_destroy    (InStream*);

#endif /* include guard */

__C__

#include "KinoSearch1StoreInStream.h"


InStream*
Kino1_InStream_new(char *class, SV *fh_sv, double offset, double len ) {
    InStream *instream;

    /* allocate */
    Kino1_New(0, instream, 1, InStream);

    /* assign */
    instream->fh_sv       = newSVsv(fh_sv);
    instream->fh          = IoIFP( sv_2io(fh_sv) );
    instream->offset      = offset;

    /* init buffer */
    instream->buf       = NULL;
    instream->buf_start = 0;
    instream->buf_len   = 0;
    instream->buf_pos   = 0;

    /* seek */
    if (offset != 0) {
        PerlIO_seek(instream->fh, offset, 0);
    }

    /* calculate len if an (intentionally) invalid value was supplied */
    if (len < 0.0) {
        double bookmark = PerlIO_tell(instream->fh);
        PerlIO_seek(instream->fh, 0, 2);
        len = PerlIO_tell(instream->fh);
        PerlIO_seek(instream->fh, bookmark, 0);
    }
    instream->len = len;

    /* assign methods */
    instream->seek       = Kino1_InStream_seek;
    instream->tell       = Kino1_InStream_tell;
    instream->read_byte  = Kino1_InStream_read_byte;
    instream->read_bytes = Kino1_InStream_read_bytes;
    instream->read_chars = Kino1_InStream_read_chars;
    instream->read_int   = Kino1_InStream_read_int;
    instream->read_long  = Kino1_InStream_read_long;
    instream->read_vint  = Kino1_InStream_read_vint;
    instream->read_vlong = Kino1_InStream_read_vlong;

    return instream;
}

void
Kino1_InStream_seek(InStream *instream, double target) {
    /* seek within buffer if possible */
    if (   (target >= instream->buf_start)
        && (target <  (instream->buf_start + instream->buf_pos))
    ) {
        instream->buf_pos = target - instream->buf_start;
    }
    /* nope, not possible, so seek within file and prepare to refill */
    else {
        instream->buf_start = target;
        instream->buf_pos   = 0;
        instream->buf_len   = 0;
        PerlIO_seek(instream->fh, target + instream->offset, 0);
    }
}

double
Kino1_InStream_tell(InStream *instream) {
    return instream->buf_start + instream->buf_pos;
}

void
Kino1_InStream_refill(InStream *instream) {
    int check_val;

    /* wait to allocate buffer until it's needed */
    if (instream->buf == NULL)
        Kino1_New(0, instream->buf, KINO_IO_STREAM_BUF_SIZE, char);

    /* add bytes read to file position, reset */
    instream->buf_start += instream->buf_pos;
    instream->buf_pos = 0;

    /* calculate the number of bytes to read */
    if (KINO_IO_STREAM_BUF_SIZE < instream->len - instream->buf_start)
        instream->buf_len = KINO_IO_STREAM_BUF_SIZE;
    else
        instream->buf_len = instream->len - instream->buf_start;

    /* perform the file operations */
    PerlIO_seek(instream->fh, 0, 1);
    check_val = PerlIO_seek(instream->fh, 
        (instream->buf_start + instream->offset), 0);
    if (check_val == -1)
        Kino1_confess("refill: PerlIO_seek failed: %d", errno);
    check_val = PerlIO_read(instream->fh, instream->buf, instream->buf_len);
    if (check_val != instream->buf_len) 
        Kino1_confess("refill: tried to read %d bytes, got %d: %d", 
            instream->buf_len, check_val, errno);
}

char
Kino1_InStream_read_byte(InStream *instream) {
    if (instream->buf_pos >= instream->buf_len)
        Kino1_InStream_refill(instream);
    return instream->buf[ instream->buf_pos++ ];
}

void
Kino1_InStream_read_bytes (InStream *instream, char* buf, STRLEN len) {
    if (instream->buf_pos + len < instream->buf_len) {
        /* request is entirely within buffer, so copy */
        Copy((instream->buf + instream->buf_pos), buf, len, char);
        instream->buf_pos += len;
    }
    else {
        /* get the request from the file and reset buffer */
        int check_val;
        Off_t start;
        start = instream->tell(instream);
        check_val = PerlIO_seek(instream->fh, (start + instream->offset), 0);
        if (check_val == -1)
            Kino1_confess("read_bytes: PerlIO_seek failed: %d", errno );
        check_val = PerlIO_read(instream->fh, buf, len);
        if (check_val < len)
            Kino1_confess("read_bytes: tried to read %"UVuf" bytes, got %d", 
                (UV)len, check_val);
        
        /* reset vars and refill if there's more in the file */
        instream->buf_start = start + len;
        instream->buf_pos   = 0;
        instream->buf_len   = 0;
        if (instream->buf_start < instream->len)
            Kino1_InStream_refill(instream);
    }
}

/* This is just a wrapper for read_bytes, but that may change.  It should
 * be used whenever Lucene character data is being read, typically after
 * read_vint as part of a String read. If and when a change does come, it will
 * be a lot easier to track down all the relevant code fragments if read_chars
 * gets used consistently. 
 */
void
Kino1_InStream_read_chars(InStream *instream, char *buf, STRLEN start, 
                         STRLEN len) {
    buf += start;
    instream->read_bytes(instream, buf, len);
}

U32
Kino1_InStream_read_int (InStream *instream) {
    unsigned char buf[4];
    instream->read_bytes(instream, (char*)buf, 4);
    return Kino1_decode_bigend_U32(buf);
}

double
Kino1_InStream_read_long (InStream *instream) {
    unsigned char buf[8];
    double        aDouble;

    /* get 8 bytes from the stream */
    instream->read_bytes(instream, (char*)buf, 8);
 
    /* get high 4 bytes, multiply by 2**32 */
    aDouble = Kino1_decode_bigend_U32(buf);
    aDouble = aDouble * pow(2.0, 32.0);
    
    /* decode low four bytes as unsigned int and add to total */
    aDouble += Kino1_decode_bigend_U32(&buf[4]);

    return aDouble;
}

/* read in a Variable INTeger, stored in 1-5 bytes */
U32 
Kino1_InStream_read_vint (InStream *instream) {
    unsigned char aUChar;
    int           bitshift;
    U32           aU32;

    /* start by reading one byte; use the lower 7 bits */
    aUChar = (unsigned char)instream->read_byte(instream);
    aU32 = aUChar & 0x7f;

    /* keep reading and shifting as long as the high bit is set */
    for (bitshift = 7; (aUChar & 0x80) != 0; bitshift += 7) {
        aUChar = (unsigned char)instream->read_byte(instream);
        aU32 |= (aUChar & 0x7f) << bitshift;
    }
    return aU32;
}

U32
Kino1_InStream_decode_vint(char **source_ptr) {
    char *source;
    int   bitshift;
    U32   aU32;
    
    source = *source_ptr;
    aU32 = (unsigned char)*source & 0x7f;
    for (bitshift = 7; (*source & 0x80) != 0; bitshift += 7) {
        source++;
         aU32 |= ((unsigned char)*source & 0x7f) << bitshift;
    }
    source++;
    *source_ptr = source;
    return aU32;
}

double
Kino1_InStream_read_vlong (InStream *instream) {
    unsigned char aUChar;
    int           bitshift;
    double        aDouble;

    aUChar = (unsigned char)instream->read_byte(instream);
    aDouble = aUChar & 0x7f;
    for (bitshift = 7; (aUChar & 0x80) != 0; bitshift += 7) {
        aUChar = (unsigned char)instream->read_byte(instream);
        aDouble += (aUChar & 0x7f) * pow(2, bitshift);
    }
    return aDouble;
}


void
Kino1_InStream_destroy(InStream* instream) {
    SvREFCNT_dec(instream->fh_sv);
    Kino1_Safefree(instream->buf);
    Kino1_Safefree(instream);
}

__POD__

==begin devdocs

==head1 NAME

KinoSearch1::Store::InStream - filehandles for reading invindexes

==head1 SYNOPSIS
    
    # isa blessed filehandle
    
    my $instream  = $invindex->open_instream( $filehandle, $offset, $length );
    my @ten_vints = $instream->lu_read('V10');

==head1 DESCRIPTION

The InStream class abstracts out all input operations to KinoSearch1.

InStream is implemented as a inside-out object around a blessed filehandle.
It would almost be possible to use an ordinary filehandle, but the
objectification is necessary because InStreams have to be capable of
pretending that they are acting upon a distinct file when in reality they may
be reading only a portion of a compound file.

For the template used by lu_read, see InStream's companion,
L<OutStream|KinoSearch1::Store::OutStream>.

==head1 COPYRIGHT

Copyright 2005-2010 Marvin Humphrey

==head1 LICENSE, DISCLAIMER, BUGS, etc.

See L<KinoSearch1> version 1.01.

==end devdocs
==cut