The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Freq;

require 5.005_62;
use strict;
use warnings;
no warnings "recursion";
use vars qw( $VERSION );

use FileHandle;
use CDB_File;

# Constants for data about each word.
use constant NDOCS  => 0;
use constant NWORDS => 1;
use constant DX     => 2; # doc index
use constant PX     => 3; # position index
use constant LASTDOC => 4;

$VERSION = '0.22';
use Inline Config =>
            VERSION => '0.22',
            NAME => 'Freq';
use Inline 'C';


sub open_write {
    my $type = shift;
    my $path = shift; # Name of index (directory).

    my $self = {};

    if( -e "$path/conf" ){

        # Read in conf file.
        $self = _configure($path);
        # IMPORTANT conf variables: seg_max_words, nsegments


        $self->{name} = $path;
        $self->{seg_nwords} = 0;
        $self->{seg_ndocs} = 0;
        $self->{isrs} = {}; 
        $self->{mode} = 'WRONLY';
    }
    else {    # entirely new index
        # Set up a default configuration.

        mkdir $path;
        $self = {
            mode => 'WRONLY',
            name => $path,
            nsegments => 0,
            nwords => 0,
            ndocs => 0,
            seg_max_words => 5 * 1024 * 1024, # 5 million words
            isrs => {},
            seg_nwords => 0,
            seg_ndocs => 0,
            ids => [],
            # What else?
        };
        index_document($self, "initial", "abcdefghijklmnopqrstuvwxyz");
    }

    return bless $self, $type;
}

sub open_read {
    my $type = shift;
    my $path = shift;

    my $self = {};

    if( -e "$path/conf" ){
        $self = _configure($path);

        my %cdb;
        tie %cdb, 'CDB_File', "$path/0/CDB" or die $!;
        $self->{cdb} = \%cdb;

        open(IDS, "<$path/0/ids") or die $!;
        chomp( @{ $self->{ids} } = <IDS> );
        close IDS;

        $self->{name} = $path;
        $self->{isrs} = {};
        $self->{mode} = 'RDONLY';
    }
    else {
        warn "Index $path does not exist\n";
        return undef;
    }

    isrcache_init($self->{cdb});
    return bless $self, $type;
}



sub close_index {
    my $self = shift;

    if( $self->{mode} eq 'WRONLY' ){
        $self->_write_segment();
    }
    else {
        $self->DESTROY;
    }

    return 1;
}

sub DESTROY {
    my $self = shift;
    untie %{ $self->{cdb} };
    @{ $self->{ids} } = ();
    $self = undef;
    return 1;
}

sub tokenize_std {
    my $doc = shift;
    $doc =~ s|<[^>]+>||g; # Get rid of all tags.
	$doc =~ s|(\d)| $1 |g; # Count each digit.
	$doc = lc $doc;
	return split(m|[\W_]+|, $doc);
}



# Isrs are cached in a closure initialized with the cdb disk hash
{
    my %isrs = ();
    my %nrequests = ();
    my %timestamp = ();
    my $cdb = {};

    # this is called by the open_read function to 
    # instantiate the cdb object. That way only 
    # the individual word is necessary to call isr().
    sub isrcache_init {
        $cdb = shift;
    }

    sub isr {
        my $word = shift;
    
        # return the empty isr if no occurrence.
        return new_isr() unless exists $cdb->{$word};
        $nrequests{$word}++;
        $timestamp{$word} = time;
        return $isrs{$word} if exists $isrs{$word};

        # remove the top 10 isrs according to least-requested and 
        # least-recently requested.
        if(50000 < scalar keys %isrs){
            my $time = time;
            my @words = sort { $a <=> $b } 
                        map { $nrequests{$_} / ($time - $timestamp{$_}) }
                        keys %isrs;
            delete $isrs{$_} for @words[0..9];
        }

        my $isr = _read_isr($cdb, $word);    
        $isrs{$word} = $isr;
        return $isr;
    }
}

sub _read_isr {
    my $cdb = shift;
    my $word = shift;
    my $isr = new_isr();
    #return $isr unless exists $cdb->{$word};
    my $ISR = $cdb->{$word};

    my($ndocs, $nwords, $lastdoc, $dxlen) = unpack "L4", $ISR;
    substr($ISR, 0, 16) = '';

    #print STDERR "position = $pos, length = $length\n";
    $isr->[NWORDS] = $nwords;
    $isr->[NDOCS] = $ndocs;
    $isr->[LASTDOC] = $lastdoc;
    $isr->[DX] = substr($ISR, 0, $dxlen);
    substr($ISR, 0, $dxlen) = '';
    my $pxrunlen_len = unpack "L", $ISR;
    substr($ISR, 0, 4) = '';
    my @pxrunlens = unpack "w*", substr($ISR, 0, $pxrunlen_len);
    substr($ISR, 0, $pxrunlen_len) = '';
    my @PX;
    while(@pxrunlens){
        my $runlen = shift @pxrunlens;
        my $PX = substr($ISR, 0, $runlen);
        substr($ISR, 0, $runlen) = '';
        push @PX, $PX;
    }
    $isr->[PX] = \@PX;

    return $isr;
}


sub index_document {
    my $self = shift;
    my $doc_name = shift;
    my $document = shift;
    my $isrs = $self->{isrs};
    my $docid = $self->{seg_ndocs}++;
    my $position = 0;
    my %seen = (); # words in this doc -> list of position deltas
    my %last = (); # words in this doc -> last position seen
    
    push @{ $self->{ids} }, $doc_name;

    # Record term position deltas
    for my $word ( split /\W+/, $document ){
        $position++;
        push @{ $seen{$word} }, 
            $position - (exists $last{$word} ? $last{$word} : 0);
        $last{$word} = $position;
    }

    while(my($word, $deltas) = each %seen){
        $isrs->{$word} = new_isr() unless exists $isrs->{$word};
        my $isr = $isrs->{$word};
        my $docdelta = $docid - $isr->[LASTDOC];
        $isr->[LASTDOC] = $docid;
        $isr->[NDOCS]++;
        $isr->[NWORDS] += scalar @$deltas;
        if( @$deltas == 1 ){
            $isr->[DX] .= 
                pack("w*",
                    2 * $docdelta,       # 
                    $deltas->[0]); # the single position delta
        }
        else { # term count > 1
            $deltas = pack "w*", @$deltas;
            $isr->[DX] .= 
                pack("w*", 
                2 * $docdelta + 1, # doc delta*2, +1 for >1 term count in doc
                scalar @{ $isr->[PX] }); # index to position deltas in PX
            push @{ $isr->[PX] }, $deltas;
        }
    }

    $self->{seg_nwords} += $position;
    
    if( $self->{seg_nwords} >= $self->{seg_max_words} ){
        $self->_write_segment();
    }
    return $position;
}

sub new_isr {
    my $isr = [];
    $isr->[NDOCS] = 0;
    $isr->[NWORDS] = 0;
    $isr->[DX] = '';
    $isr->[PX] = [];
    $isr->[LASTDOC] = 0;
    return $isr;
}



sub _write_segment {
    my $self = shift;
    my $path = $self->{name};
    my $nsegments = $self->{nsegments}++;
    my $isrs = $self->{isrs};
    my $count = 0;
   
    mkdir "$path/$nsegments";
    my $new = new CDB_File("$path/$nsegments/NEW", 
                            "$path/$nsegments/CDB.tmp") or 
        die "$0: new CDB_File failed: $!\n";

    my @words = keys %$isrs;
    for my $word ( @words ){

        if( (++$count) % 50 == 0 ){
            print STDERR chr(13), "(", $count, ")\t\t";
        }

        $new->insert($word, _serialize_isr($isrs->{$word}));
        delete $isrs->{$word};
    }

    %{ $isrs } = ();

    $new->finish or die $!;

    rename "$path/$nsegments/NEW", "$path/$nsegments/CDB";

    open IDS, ">$path/$nsegments/ids";
    print IDS "$_\n" for @{$self->{ids}};
    close IDS;
    @{ $self->{ids} } = ();

    # segment conf
    open CONF, ">$path/$nsegments/conf";
    print CONF "seg_nwords:", $self->{seg_nwords}, "\n";
    print CONF "seg_ndocs:", $self->{seg_ndocs}, "\n";
    close CONF;


    # top level conf
    $self->{nwords} += $self->{seg_nwords};
    $self->{ndocs} += $self->{seg_ndocs};

    open CONF, ">$path/conf";
    binmode CONF;
    print CONF 'seg_max_words:', $self->{seg_max_words}, "\n";
    print CONF "# DO NOT EDIT BELOW THIS LINE\n";
    print CONF 'nwords:', $self->{nwords}, "\n";
    print CONF 'nsegments:', $self->{nsegments}, "\n";
    print CONF 'ndocs:', $self->{ndocs}, "\n";
    close CONF;

    $self->{seg_nwords} = 0;
    $self->{seg_ndocs} = 0;

    return "\nwrote segment with $count isrs.\n";
}

sub _serialize_isr {
    my $isr = shift;

    my $newisr = '';
    $newisr .= pack "L", $isr->[NDOCS];   # num of docs
    $newisr .= pack "L", $isr->[NWORDS];  # num of positions.
    $newisr .= pack "L", $isr->[LASTDOC]; # last document with this word

    $newisr .= pack "L", length $isr->[DX]; # runlength
    $newisr .= $isr->[DX]; 

    my $runlengths = pack "w*", map {length $_} @{ $isr->[PX] };
    $newisr .= pack "L", length $runlengths;
    $newisr .= $runlengths;
    $newisr .= join '', @{ $isr->[PX] }; # BER delta lists

    return $newisr;
}


sub _configure {
    my $path = shift;
    my $self = {};    

    # File "conf" contains seg_max_words, nwords.

    open CONF, "<$path/conf" or die $!;
    while(<CONF>){
        next if m|^#|;
        chomp;
        my ($key, $value) = split m|:|;
        $self->{$key} = $value;
    }
    close CONF;

    return $self;
}



sub optimize_index {
    my $path = shift;

    my @dirs = sort { $a <=> $b }
               grep { /^\d+$/ }
               map { s/^.+?\/(\d+)$/$1/; $_ }
               glob("$path/*");

    print STDERR "Compacting segments ", join(" ", @dirs), "\n";

    # gather necessary info for each segment
    my (@segments, %words);
    for my $segment (@dirs){
        my $conf = _configure("$path/$segment");
        my %cdb;
        tie %cdb, 'CDB_File', "$path/$segment/CDB";
        my %localwords = ();
        $localwords{$_} = 1 for grep {length($_) < 26} keys %cdb;
        $words{$_} = 1 for keys %localwords;
        print STDERR "Gathered ", scalar keys %words, " words at segment $segment.           \r";
        push @segments, [ $conf, \%cdb, "$path/$segment", \%localwords ];
    }


    # new consolidated index segment
    mkdir "$path/NEWSEGMENT";

    # create new cdb
    print STDERR "Creating new compacted segment.                        \n";
    my $newidx = new CDB_File("$path/NEWSEGMENT/NEW", 
                            "$path/NEWSEGMENT/CDB.tmp") or 
        die "$0: new CDB_File failed: $!\n";

    my $ntokens = scalar keys %words;
    my $t0 = time;
    while(my($word, undef) = each %words){
        $ntokens--;
        if(time-$t0 > 2){
            print STDERR "Compacting $ntokens th word: $word                       \r";
            $t0 = time;
        }
        my $isr = new_isr();
        my $ndocs = 0;
        for my $segment (@segments){
            my($conf, $cdb, undef, $localwords) = @$segment;
            if(exists $localwords->{$word}){ 
                $isr = _append_isr($isr, $ndocs, _read_isr($cdb, $word));
            }
            $ndocs += $conf->{seg_ndocs};
        }
        $newidx->insert($word, _serialize_isr($isr));
    }

    # write ids file, delete older segments
    open IDS, ">$path/NEWSEGMENT/ids";
    for my $segment (@segments){
        print STDERR "Writing document ids for segment ", $segment->[2], ", deleting segment dir.\r";
        open SEGIDS, $segment->[2] . "/ids"; # path to ids file
        while(<SEGIDS>){
            print IDS $_;
        }
        unlink glob($segment->[2] . "/*");
        rmdir $segment->[2];
    }

    print STDERR "\nWriting disk hash.\n";
    $newidx->finish;
    rename "$path/NEWSEGMENT/NEW", "$path/NEWSEGMENT/CDB";

    my ($nwords, $ndocs);
    $nwords += $_->[0]->{seg_nwords} for @segments;
    $ndocs += $_->[0]->{seg_ndocs} for @segments;

    open CONF, ">$path/NEWSEGMENT/conf";
    print CONF "seg_nwords:", $nwords, "\n";
    print CONF "seg_ndocs:", $ndocs, "\n";
    close CONF;

    my $conf = _configure($path);
    open CONF, ">$path/conf";
    binmode CONF;
    print CONF 'seg_max_words:', $conf->{seg_max_words}, "\n";
    print CONF "# DO NOT EDIT BELOW THIS LINE\n";
    print CONF "nsegments:1\n";
    print CONF "nwords:", $nwords, "\n";
    print CONF "ndocs:", $ndocs, "\n";
    close CONF;


    rename "$path/NEWSEGMENT", "$path/0";
}


sub _append_isr {
    my($isr0, $seg0ndocs, $isr1) = @_;
    return $isr0 unless $isr1->[NDOCS];

    # convert from ber-string to integer array
    my @isr0dx = unpack "w*", $isr0->[DX]; $isr0->[DX] = \@isr0dx;
    my @isr1dx = unpack "w*", $isr1->[DX]; $isr1->[DX] = \@isr1dx;

    # adjust first doc delta to previous segment's doc count
    my $adjust = ($seg0ndocs - $isr0->[LASTDOC]);
    my $ptrflag = $isr1->[DX]->[0] % 2;
    my $delta0 = int($isr1->[DX]->[0] / 2);
    $isr1->[DX]->[0] = ($delta0 + $adjust) * 2 + $ptrflag;

    # adjust px ptrs to previous isr's PX length
    my $pxn = scalar @{ $isr0->[PX] };
    while(@{ $isr1->[DX] }){
        my $delta = shift @{ $isr1->[DX] };
        my $pxptr = shift @{ $isr1->[DX] };
        if($delta % 2){ # odd, means pxptr is a PX array index
            $pxptr += $pxn;
        }
        push @{ $isr0->[DX] }, $delta, $pxptr;
    }
    push @{ $isr0->[PX] }, @{ $isr1->[PX] };
    $isr0->[NDOCS] += $isr1->[NDOCS];
    $isr0->[NWORDS] += $isr1->[NWORDS];
    $isr0->[LASTDOC] = $seg0ndocs + $isr1->[LASTDOC];

    # convert from integer array back to ber-string
    $isr0->[DX] = pack "w*", @{ $isr0->[DX] };
    $isr1->[DX] = pack "w*", @{ $isr1->[DX] };

    return $isr0;
}






# SEARCHING

# finds docs with search terms in logical relationship indicated in 
# query. 
sub fancy_search {
    my $self = shift;
    my $query = shift;
    my ($aligner, $matcher) = compile_query($query);

    my %docmatches = (); # docid => match position list
    my $docid = 1;
    while(my $nextdoc = $aligner->($docid)){
        unless($docid == $nextdoc){
            $docid = $nextdoc;
            next;
        }
        my ($pos0, $posN) = (0, 0);
        while(($pos0, $posN) = $matcher->($pos0)){
            push @{$docmatches{$self->{ids}->[$docid]}}, "$pos0-$posN";
            #$pos0++;
        }
#$docmatches{$self->{ids}->[$docid]} = 1;
        $docid++;
    }
    return \%docmatches;
}

sub compile_query {

    # These regexs transform the [A|B]C[D #w5 E|F]G[H|I J|K] syntax
    # to a more recursable 
    # < [ A B ] C [ < D:#w5 E > F ] G [ H < I J > K ] > notation 
    # <> means sequential entities (~AND)
    # [] means alternative entities (OR)
    my $qstr = shift;
    $qstr =~ s/[\[\]\|\<\>]/ $& /g;
    $qstr =~ s/\s+(#[wWtT]\d+)/$1/g;
#    $qstr =~ s/\s+(([^\s\[\]\|]+\s+){2,})/ < $1 > /g;
#    $qstr =~ s/\|/ /g;
    $qstr = "< $qstr >";
    my @token = split(/\s+/, $qstr);

#    print join(" ", @token), "\n";

    shift @token; # take off first 'and' marker to pass to and_chain
    return and_chain(@token);
}

sub and_chain {
    my @rest = @_;
    return sub { return $_[0] },         # aligner
           sub { return ($_[0], $_[0]) } # matcher
           unless @rest; # empty list

    my ($align, $nalign, $match, $nmatch, $interval);

    if($rest[0] eq '>'){ # this chain is done
        shift @rest;
        return sub { return $_[0] },          # aligner
               sub { return ($_[0], $_[0]) }, # matcher
               @rest;
    }
    elsif($rest[0] eq '['){
        shift @rest;
        ($align, $match, $interval, @rest) = or_chain(@rest);
    }
    else {
        my $word = shift @rest;
        ($interval, $word) = parse_interval($word);
        ($align, $match) = isr_align_match($word);
    }
    ($nalign, $nmatch, @rest) = and_chain(@rest);
    
    return 
        and_aligner($align, $nalign),
        and_matcher($match, $nmatch, $interval),
        @rest;

}

sub or_chain {
    my @rest = @_;
    return sub { return undef }, # aligner
           sub { return () },    # matcher
		   0
           unless @rest; # empty list

    my ($align, $nalign, $match, $nmatch, $interval);

    if($rest[0] =~ /^\]/){ # this chain is done
        ($interval) = parse_interval($rest[0]);
        shift @rest;
        return sub { return undef }, # aligner
               sub { return () },    # matcher
               $interval,
               @rest;
    }
    elsif($rest[0] eq '<'){
        shift @rest;
        ($align, $match, @rest) = and_chain(@rest);
    }
    else {
        my $word = shift @rest;
        ($align, $match) = isr_align_match($word);
    }
    ($nalign, $nmatch, @rest) = or_chain(@rest);

    return 
        or_aligner($align, $nalign),
        or_matcher($match, $nmatch),
        @rest;
}

sub parse_interval {
    my $word = shift;
    my ($newword, $interval) = split(/#[wWtT]/, $word);
    $interval ||= 1;
    $interval *= -1 if ($word =~ /#[tT]/);
    return $interval, $newword;
}

sub min {
    return 
      (defined $_[0]         ?
          (defined $_[1]     ?
              ($_[0] > $_[1] ?
                  $_[1]      :
                  $_[0] )    :
              $_[0] )        :
          $_[1] );
}


sub or_aligner {
    my ($this, $next) = @_;

    return sub {
        my $pos = shift;
        my $thispos = $this->($pos);
        my $nextpos = $next->($pos);
        return min($thispos, $nextpos);
    }; 
}


# returned function does this:
# FIND first instance of ($this0, $thisN) match boundaries
# WHERE $this0 > $top0, $thisN > $topN. 
# RETURN ($this*) or ($next*) according to min($thisN, $nextN)
sub or_matcher {
    my ($this, $next) = @_;
    my ($this0, $thisN, $next0, $nextN) = (0, 0, 0, 0);

    return sub {
        my ($top0, $topN) = @_;

        # return NEXT if there is no THIS.
        return $next->($top0, $topN) unless 
            ($this0, $thisN) = $this->($top0, $topN); # AND-node or ISR

        # find thisN high enough
        while($thisN <= $topN){
            return $next->($top0, $topN) unless
                ($this0, $thisN) = $this->($this0);
        }

        # we have a valid THIS, now get neighboring match.
        return ($this0, $thisN) unless 
            ($next0, $nextN) = $next->($top0, $topN); # OR-node

        return (min($thisN, $nextN) == $thisN) ?
               ($this0, $thisN) :
               ($next0, $nextN);
    };
}


# returns the highest valued document id in the chain
sub and_aligner {
    my ($this, $next) = @_;
    my $thispos;

    return sub {
        my $pos = shift;
        return undef unless $thispos = $this->($pos);
        return $next->($thispos);
    };
}

# returned function does:
# FIND first instance of $this* and $next* 
# WHERE ($next0-$interval) < $thisN < $nextN
# and $pos < $this0.
# RETURN ($this0, $nextN)
sub and_matcher {
    my ($this, $next, $interval) = @_;
    my ($this0, $thisN, $next0, $nextN) = (0, 0, 0, 0);
    my $exactadjust = ($interval < 0) ? -$interval : 1; 
    $interval = abs($interval);

    return sub {
        my $pos = shift;
        return () unless 
            ($this0, $thisN) = $this->($pos, $pos); #OR-node or ISR
        return () unless 
            ($next0, $nextN) = $next->($thisN+$exactadjust-1); # AND-chain
        while($thisN < ($next0-$interval)){
            return () unless 
                ($this0, $thisN) = $this->($pos, $next0-$interval-1);
            if($thisN > ($next0-$exactadjust+1)){
                return () unless 
                    ($next0, $nextN) = $next->($thisN);
            }
        }
        return ($this0, $nextN);
    };
}



sub isr_align_match {
    my ($word) = @_;

    my $isr = isr($word);

    if(!$isr->[DX]){ # empty isr?
        return sub { return undef }, sub { return () };
    }

    my $docid = 0; # doc id requested
    my $dxsum = 0; # doc id for this isr
    my $dxn = 0; # location in DX
    my $px = ''; # current pos list
    my $pxsum = 0; # token location in doc
    my $pxn = 0; # location in PX string

    return 
        sub { # the align
            $docid = shift;
            return undef unless defined $dxsum;
            if($docid > $dxsum){  # new doc
                ($dxsum, $dxn, $px) = 
                    sum_to_doc($isr->[DX], $dxsum, $dxn, $docid);
                $px = defined $px ?
                          ($px % 2) ? 
                              $isr->[PX]->[int $px/2] : # string px
                              pack "w", $px/2 :         # single pos delta
                          undef;
                ($pxsum, $pxn) = (0, 0);
            }
            return $dxsum;
        },
        sub { # the match
            return () unless (defined $dxsum) and 
                             ($docid == $dxsum); # the align is valid
            my (undef, $pos) = @_;
            if($pxsum <= $pos){
                ($pxsum, $pxn) = sum_to_pos($px, $pos, $pxn, $pxsum);
            }
            return ($pxsum > $pos) ? ($pxsum, $pxsum) : ();
        };
}


# Return the new PX list corresponding to doc $pos, starting with doc 
# $sum, DX index $n.
sub sum_to_doc_perl {
    my($isr, $sum, $n, $pos) = @_;

    return () unless defined $isr->[DX]->[$n];
    my $dx = $isr->[DX];
    while($sum < $pos){ # current doc < target doc
        $sum += int($dx->[$n]/2);
        $n += 2;
        last unless defined $dx->[$n];
    }
    return () if ($sum < $pos); # no documents left
    my $px = ($dx->[$n-2] % 2) ? 
              $isr->[PX]->[$dx->[$n-1]] :  # PX list
              pack "w", $dx->[$n-1];       # single position delta (BER)
    return ($sum, $n, $px);
}

1;


__DATA__

=pod

=head1 NAME

Freq - An inverted text index.

=head1 ABSTRACT

THIS IS ALPHA SOFTWARE

Freq is a text indexer and search utility written in Perl and C. It has several special features not to be found in most available similar programs, namely arbitrarily complex sequence and alternation queries, and proximity searches with both exact counts and limits. There is no result ranking (yet). 

The index format draws some ideas from the Lucene search engine, with some simplifications and enhancements. The index segments are stored in a CDB disk hash (from dj bernstein).

=head1 SYNOPSIS

Index documents:

  # cat textcorpus.txt | tokenize | indexstream index_dir
  # cat textcorpus.txt | tokenize | stopstop | indexstream index_dir
  # optimize index_dir

Search:

  # freqsearch index_dir
  # (type search terms)

=head1 PROGRAMMING API

  use Freq;

  # open for indexing
  $index = Freq->open_write( "indexname" );
  $index->index_document( "docname", $string );
  $index->close_index();

  # open for searching
  $index = Freq->open_read( "indexname" );

  # Find all docs containing a phrase
  $result = $index->search( "this phrase and no other phrase" );

  # result is hashref:
  # { doc1 => [ match1, match2 ... matchN ],
  #   doc2 => [ match1, ... ]
  # }
  # ... where 'match' is the token location of each match within that doc.


=head1 SEARCH SYNTAX

Sequences of words are enclosed in angle brackets '<' and '>'. Alternations are enclosed in square brackets '[' and ']'. These may be nested within each other as long as it makes logical sense. "<the quick [brown grey] fox>" is a simple valid phrase. Nested square brackets don't make sense, logically, so they aren't allowed. Also not allowed are adjacent angle bracket sequences. However, alternations may be adjacent, as in "<I [go went] [to from] the store>". As long as these rules are followed, search terms may be arbitrarily complex. For example:

"<At [a the] council [of for] the [gods <deities we [love <believe in>] with all our [hearts strength]>] on olympus [hercules apollo athena <some guys>] [was were] [praised <condemned to eternal suffering in the underworld>]>"

Two operators are available to do proximity searches. '#wN' represents *at least* N intervening skips between words (the number of between words plus 1). Thus "<The #w8 dog>" would match the text "The quick brown fox jumped over the lazy dog". If #w7 or lesser had been used it would not match, but if #w9 or greater had been used it would still match. Also there is the '#tN' operator, which represents *exactly* N intervening skips. Thus for the above example "<The #t8 dog>", and no other value, would match. These operators can be used after words or alternations, but no other place. 


=head1 AUTHOR

Ira Joseph Woodhead, ira at sweetpota dot to

=head1 SEE ALSO

C<Lucene>
C<Plucene>
C<CDB_File>
C<Inline>

=cut


__C__


/* count how many characters make up the compressed integer 
   at the beginning of the string px. */
int next_integer_length(char* px){
    unsigned int length = 0;
    unsigned char mask = (1 << 7);
    //if(!*px) return 0; // empty string
    while(*px & mask){
        px++;
        length++;
    }
	length++; // final char
    return (int) length;
}

/* convert the compressed integer at the beginning of the string
   px to a int. */
int next_integer_val(char* px){
    unsigned int value = 0;
    unsigned char himask = (1 << 7); // 10000000
	unsigned char lomask = 127;      // 01111111
    while(*px & himask){
        value = ((value << 7) | (*px & lomask));
        px++;
    }
    value = ((value << 7) | *px);
    return value;
}


/* px is a string of chars representing BER compressed integers.
   These are position deltas within a document. pxn is the current
   string index, pxsum is the current sum. sum_to_pos() computes
   the first position in a document past pos.
*/
void sum_to_pos(SV* pxSV, int pos, int pxn, int pxsum){
    char* px = SvPV_nolen( pxSV );

    INLINE_STACK_VARS;
/*
    if(strlen(px) <= pxn){
        INLINE_STACK_RESET;
        INLINE_STACK_PUSH(sv_2mortal(newSViv(0)));
        INLINE_STACK_PUSH(sv_2mortal(newSViv(0)));
        INLINE_STACK_DONE;
        return;
    }
*/

    px += pxn; // advance char pointer to current pxn
    while(*px && (pxsum <= pos)){
        unsigned int len = next_integer_length(px);
        pxsum += next_integer_val(px);
		px += len;
        pxn += len;
    }
    
    INLINE_STACK_RESET;
    INLINE_STACK_PUSH(sv_2mortal(newSViv(pxsum)));
    INLINE_STACK_PUSH(sv_2mortal(newSViv(pxn)));
    INLINE_STACK_DONE;
    return;
}


/* dx is a string of chars representing BER compressed integers.
   These are document id deltas. dxn is the current
   string index, dxsum is the current sum. sum_to_doc() computes
   the first position in the corpus at or past pos, and finds the
   integer index into px (if it exists. if it does not exist, it
   finds the single position delta from dx).
*/
void sum_to_doc(SV* dxSV, int dxsum, int dxn, int pos){
    char* dx = SvPV_nolen( dxSV );
    int pxval = 0;
    int last_dx_delta = 0;

    INLINE_STACK_VARS;

    dx += dxn; // advance char pointer to current dxn
    while(*dx && (dxsum < pos)){
        unsigned int len = next_integer_length(dx);
        last_dx_delta = next_integer_val(dx);
        dxsum += floor(last_dx_delta/2);
		dx += len; // advance ptr
        dxn += len;

		len = next_integer_length(dx);
        pxval = next_integer_val(dx);
		dx += len;
		dxn += len;
    }

    if(dxsum < pos){
        INLINE_STACK_RESET;
        INLINE_STACK_DONE;
	    return;
    }
	
    INLINE_STACK_RESET;
    INLINE_STACK_PUSH(sv_2mortal(newSViv(dxsum)));
    INLINE_STACK_PUSH(sv_2mortal(newSViv(dxn)));
    INLINE_STACK_PUSH(sv_2mortal(newSViv(pxval * 2 + (last_dx_delta % 2))));
    INLINE_STACK_DONE;
    return;
}