The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.

package DBIx::FullTextSearch::Blob;
use strict;

# Open in the backend just sets the object
sub open {
	my ($class, $fts) = @_;
	return bless { 'fts' => $fts }, $class;
}
# Create creates the table(s) according to the parameters
sub _create_tables {
	my ($class, $fts) = @_;
	my $CREATE_DATA = <<EOF;
		create table $fts->{'data_table'} (
			word varchar($fts->{'word_length'}) binary
					default '' not null,
			idx longblob default '' not null,
			primary key (word)
		)
EOF
        my $dbh = $fts->{'dbh'};
	$dbh->do($CREATE_DATA) or return $dbh->errstr;
        push @{$fts->{'created_tables'}}, $fts->{'data_table'};
	return;
}

sub add_document {
	my ($self, $id, $words) = @_;
	my $fts = $self->{'fts'};
	my $dbh = $fts->{'dbh'};
	my $data_table = $fts->{'data_table'};

	my $update_sth = ( defined $self->{'adding_update_sth'}
		? $self->{'adding_update_sth'}
		: $self->{'adding_update_sth'} = $dbh->prepare(
			"update $data_table set idx = concat(idx, ?)
				where word = ?") );

	my @insert_values;

	my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}}
		. $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}};
	my $num_words = 0;
	for my $word ( keys %$words ) {
### print STDERR "$word($id) adding\n";
		# here we will want to parametrize the bit size of the
		# data
		my $value = pack $packstring, $id, $words->{$word};
		my $rows = $update_sth->execute($value, $word);
		push @insert_values, $word, $value if $rows == 0;
		$num_words += $words->{$word};
	}

	if(@insert_values){
		my $sql_str = "insert into $data_table values ". join(',', ('(?, ?)') x (@insert_values/2));
		$dbh->do($sql_str,{},@insert_values);
	}

	return $num_words;
}

sub delete_document {
	my $self = shift;
	for my $id (@_) { $self->update_document($id, {}); }
}

sub update_document {
	my ($self, $id, $words) = @_;
	my $fts = $self->{'fts'};
	my $dbh = $fts->{'dbh'};
	my $data_table = $fts->{'data_table'};

	my $insert_sth = ( defined $self->{'insert_sth'}
		? $self->{'insert_sth'}
		: $self->{'insert_sth'} = $dbh->prepare("
			insert into $data_table values (?, ?)") );

        my $update_sth = ( defined $self->{'update_update_sth'}
		? $self->{'update_update_sth'}
		: $self->{'update_update_sth'} =
			$dbh->prepare("update $data_table set idx =
			concat(substring(idx, 1, ?), ?, substring(idx, ?))
					where word = ?") );


	my @insert_values;

	$dbh->do("lock tables $data_table write");

	my $select_sth = $dbh->prepare("select word from $data_table");
	$select_sth->execute;

	my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}}
		. $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}};
	my ($packnulls) = pack $packstring, 0, 0;
	my $packlength = length $packnulls;
	my $num_words = 0;
	while (my ($word) = $select_sth->fetchrow_array) {
		my $value = (defined $words->{$word} ?
				pack($packstring, $id, $words->{$word}) : '');

		# the method find_position finds the position of the
		# "record" for document $id with word $word; returned is
		# the position in bytes and yes/no values specifying if
		# the record is already present in the blob; if it is,
		# we need to replace it, otherwise just insert.

		my ($pos, $shift) = $self->find_position($word, $id);
		if (not defined $pos) {
			push @insert_values, $word, $value;
		}
		else {
			my $spos = $pos + 1;	# I'm not sure why this
			$spos += $packlength if $shift;
			$update_sth->execute($pos, $value, $spos, $word);	
		}
		delete $words->{$word};	
		$num_words++ if defined $value;
	}

	for my $word ( keys %$words ) {
		my $value = pack $packstring, $id, $words->{$word};
		push @insert_values, $word, $value;
#		$insert_sth->execute($word, $value);
		$num_words++;
	}

	if(@insert_values){
		my $sql_str = "insert into $data_table values ". join(',', ('(?, ?)') x (@insert_values/2));
		$dbh->do($sql_str,{},@insert_values);
	}

	$dbh->do("unlock tables");

	return $num_words;
}

sub find_position {
	my ($self, $word, $id) = @_;
	# here, with the calculation of where in the blob we have the
	# docid and where the count of words and how long they are, we
	# should really look at the parameters (num of bits of various
	# structures and values) given to create

	my $fts = $self->{'fts'};
	my $dbh = $fts->{'dbh'};
	my $data_table = $fts->{'data_table'};

	# Sth to read the length of the blob holding the document/count info
	my $get_length_sth = ( defined $self->{'get_length_sth'}
		? $self->{'get_length_sth'}
		: $self->{'get_length_sth'} = $dbh->prepare("select
			length(idx) from $data_table where word = ?"));
	my $length = $dbh->selectrow_array($get_length_sth, {}, $word);

	my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}}
		. $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}};
	my ($packnulls) = pack $packstring, 0, 0;
	my $packlength = length $packnulls;

	if (not defined $length) { return; }
	$length = int($length/$packlength);
	
	my ($bot, $top, $med, $val) = (0, $length);

	if (not defined $fts->{'max_doc_id'})
		{ $med = int(($top - $bot) / 2); }
	else
		{ $med = int($top * $id / $fts->{'max_doc_id'}); }

	my $blob_direct_fetch = $fts->{'blob_direct_fetch'};
	# we divide the interval
	while ($bot != $top) {
		$med = $top - 1 if $med >= $top;
		$med = $bot if $med < $bot;

		if ($top - $bot <= $blob_direct_fetch) {
			my $get_interval_sth = (
				defined $self->{'get_interval_sth'}
				? $self->{'get_interval_sth'}
				: $self->{'get_interval_sth'} = $dbh->prepare("select substring(idx,?,?) from $data_table where word = ?"));
			my $alldata = $dbh->selectrow_array($get_interval_sth,
				{},
				$bot * $packlength + 1,
				($top - $bot) * $packlength,
				$word);
			return unless defined $alldata;

			my @docs;
			my $i = 0;
			while ($i < length $alldata) {
				push @docs, unpack $packstring,
					substr $alldata, $i, $packlength;
				$i += $packlength;
			}
			for (my $i = 0; $i < @docs; $i += 2) {
                                if ($docs[$i] == $id) { return (($bot+($i/2))*$packlength, 1); }
				if ($docs[$i] > $id) { return (($bot+($i/2))*$packlength, 0); }
			}
			return ($top * $packlength, 0);
		}
		($val) = $dbh->selectrow_array(
			"select substring(idx, ?, 2) from $data_table
			where word = ?", {}, ($med * $packlength) + 1, $word);
		($val) = unpack $packstring, $val;

		if (not defined $val) { return; }
		if ($val == $id) { return ($med * $packlength, 1); }

		elsif ($val < $id) { $bot = $med + 1; }
		else { $top = $med; }

		$med = int($med * $id / $val);
	}
	return ($bot * $packlength, 0);
}

sub contains_hashref {
	my $self = shift;
	my $fts = $self->{'fts'};
	my $dbh = $fts->{'dbh'};
	my $data_table = $fts->{'data_table'};

	my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}}
		. $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}};
	my ($packnulls) = pack $packstring, 0, 0;
	my $packlength = length $packnulls;

	my $sth = ( defined $self->{'get_idx_sth'} ?
		$self->{'get_idx_sth'} :
		$self->{'get_idx_sth'} =
			$dbh->prepare(
				"select idx from $data_table where word like ?"
			));
	
	my $out = {};
	for my $word (@_) {
		$sth->execute($word);
		while (my ($blob) = $sth->fetchrow_array) {
			next unless defined $blob;
			my @data;
			my $i = 0;
			while ($i < length $blob) {
				push @data, unpack $packstring,
					substr $blob, $i, $packlength;
				$i += $packlength;
			}
			while (@data) {
				my $doc = shift @data;
				my $count = shift @data;
				unless (defined $out->{$doc}) { $out->{$doc} = 0; }
				$out->{$doc} += $count;
			}
		}
		$sth->finish;
	}
	$out;
}

*parse_and_index_data = \&DBIx::FullTextSearch::parse_and_index_data_count;

1;