package DBIx::FullTextSearch::Blob;
use strict;
# Open in the backend just sets the object
sub open {
my ($class, $fts) = @_;
return bless { 'fts' => $fts }, $class;
}
# Create creates the table(s) according to the parameters
sub _create_tables {
my ($class, $fts) = @_;
my $CREATE_DATA = <<EOF;
create table $fts->{'data_table'} (
word varchar($fts->{'word_length'}) binary
default '' not null,
idx longblob default '' not null,
primary key (word)
)
EOF
my $dbh = $fts->{'dbh'};
$dbh->do($CREATE_DATA) or return $dbh->errstr;
push @{$fts->{'created_tables'}}, $fts->{'data_table'};
return;
}
sub add_document {
my ($self, $id, $words) = @_;
my $fts = $self->{'fts'};
my $dbh = $fts->{'dbh'};
my $data_table = $fts->{'data_table'};
my $update_sth = ( defined $self->{'adding_update_sth'}
? $self->{'adding_update_sth'}
: $self->{'adding_update_sth'} = $dbh->prepare(
"update $data_table set idx = concat(idx, ?)
where word = ?") );
my @insert_values;
my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}}
. $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}};
my $num_words = 0;
for my $word ( keys %$words ) {
### print STDERR "$word($id) adding\n";
# here we will want to parametrize the bit size of the
# data
my $value = pack $packstring, $id, $words->{$word};
my $rows = $update_sth->execute($value, $word);
push @insert_values, $word, $value if $rows == 0;
$num_words += $words->{$word};
}
if(@insert_values){
my $sql_str = "insert into $data_table values ". join(',', ('(?, ?)') x (@insert_values/2));
$dbh->do($sql_str,{},@insert_values);
}
return $num_words;
}
sub delete_document {
my $self = shift;
for my $id (@_) { $self->update_document($id, {}); }
}
sub update_document {
my ($self, $id, $words) = @_;
my $fts = $self->{'fts'};
my $dbh = $fts->{'dbh'};
my $data_table = $fts->{'data_table'};
my $insert_sth = ( defined $self->{'insert_sth'}
? $self->{'insert_sth'}
: $self->{'insert_sth'} = $dbh->prepare("
insert into $data_table values (?, ?)") );
my $update_sth = ( defined $self->{'update_update_sth'}
? $self->{'update_update_sth'}
: $self->{'update_update_sth'} =
$dbh->prepare("update $data_table set idx =
concat(substring(idx, 1, ?), ?, substring(idx, ?))
where word = ?") );
my @insert_values;
$dbh->do("lock tables $data_table write");
my $select_sth = $dbh->prepare("select word from $data_table");
$select_sth->execute;
my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}}
. $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}};
my ($packnulls) = pack $packstring, 0, 0;
my $packlength = length $packnulls;
my $num_words = 0;
while (my ($word) = $select_sth->fetchrow_array) {
my $value = (defined $words->{$word} ?
pack($packstring, $id, $words->{$word}) : '');
# the method find_position finds the position of the
# "record" for document $id with word $word; returned is
# the position in bytes and yes/no values specifying if
# the record is already present in the blob; if it is,
# we need to replace it, otherwise just insert.
my ($pos, $shift) = $self->find_position($word, $id);
if (not defined $pos) {
push @insert_values, $word, $value;
}
else {
my $spos = $pos + 1; # I'm not sure why this
$spos += $packlength if $shift;
$update_sth->execute($pos, $value, $spos, $word);
}
delete $words->{$word};
$num_words++ if defined $value;
}
for my $word ( keys %$words ) {
my $value = pack $packstring, $id, $words->{$word};
push @insert_values, $word, $value;
# $insert_sth->execute($word, $value);
$num_words++;
}
if(@insert_values){
my $sql_str = "insert into $data_table values ". join(',', ('(?, ?)') x (@insert_values/2));
$dbh->do($sql_str,{},@insert_values);
}
$dbh->do("unlock tables");
return $num_words;
}
sub find_position {
my ($self, $word, $id) = @_;
# here, with the calculation of where in the blob we have the
# docid and where the count of words and how long they are, we
# should really look at the parameters (num of bits of various
# structures and values) given to create
my $fts = $self->{'fts'};
my $dbh = $fts->{'dbh'};
my $data_table = $fts->{'data_table'};
# Sth to read the length of the blob holding the document/count info
my $get_length_sth = ( defined $self->{'get_length_sth'}
? $self->{'get_length_sth'}
: $self->{'get_length_sth'} = $dbh->prepare("select
length(idx) from $data_table where word = ?"));
my $length = $dbh->selectrow_array($get_length_sth, {}, $word);
my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}}
. $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}};
my ($packnulls) = pack $packstring, 0, 0;
my $packlength = length $packnulls;
if (not defined $length) { return; }
$length = int($length/$packlength);
my ($bot, $top, $med, $val) = (0, $length);
if (not defined $fts->{'max_doc_id'})
{ $med = int(($top - $bot) / 2); }
else
{ $med = int($top * $id / $fts->{'max_doc_id'}); }
my $blob_direct_fetch = $fts->{'blob_direct_fetch'};
# we divide the interval
while ($bot != $top) {
$med = $top - 1 if $med >= $top;
$med = $bot if $med < $bot;
if ($top - $bot <= $blob_direct_fetch) {
my $get_interval_sth = (
defined $self->{'get_interval_sth'}
? $self->{'get_interval_sth'}
: $self->{'get_interval_sth'} = $dbh->prepare("select substring(idx,?,?) from $data_table where word = ?"));
my $alldata = $dbh->selectrow_array($get_interval_sth,
{},
$bot * $packlength + 1,
($top - $bot) * $packlength,
$word);
return unless defined $alldata;
my @docs;
my $i = 0;
while ($i < length $alldata) {
push @docs, unpack $packstring,
substr $alldata, $i, $packlength;
$i += $packlength;
}
for (my $i = 0; $i < @docs; $i += 2) {
if ($docs[$i] == $id) { return (($bot+($i/2))*$packlength, 1); }
if ($docs[$i] > $id) { return (($bot+($i/2))*$packlength, 0); }
}
return ($top * $packlength, 0);
}
($val) = $dbh->selectrow_array(
"select substring(idx, ?, 2) from $data_table
where word = ?", {}, ($med * $packlength) + 1, $word);
($val) = unpack $packstring, $val;
if (not defined $val) { return; }
if ($val == $id) { return ($med * $packlength, 1); }
elsif ($val < $id) { $bot = $med + 1; }
else { $top = $med; }
$med = int($med * $id / $val);
}
return ($bot * $packlength, 0);
}
sub contains_hashref {
my $self = shift;
my $fts = $self->{'fts'};
my $dbh = $fts->{'dbh'};
my $data_table = $fts->{'data_table'};
my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}}
. $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}};
my ($packnulls) = pack $packstring, 0, 0;
my $packlength = length $packnulls;
my $sth = ( defined $self->{'get_idx_sth'} ?
$self->{'get_idx_sth'} :
$self->{'get_idx_sth'} =
$dbh->prepare(
"select idx from $data_table where word like ?"
));
my $out = {};
for my $word (@_) {
$sth->execute($word);
while (my ($blob) = $sth->fetchrow_array) {
next unless defined $blob;
my @data;
my $i = 0;
while ($i < length $blob) {
push @data, unpack $packstring,
substr $blob, $i, $packlength;
$i += $packlength;
}
while (@data) {
my $doc = shift @data;
my $count = shift @data;
unless (defined $out->{$doc}) { $out->{$doc} = 0; }
$out->{$doc} += $count;
}
}
$sth->finish;
}
$out;
}
*parse_and_index_data = \&DBIx::FullTextSearch::parse_and_index_data_count;
1;