lib/Text/SimSearch.pm - metacpan.org

package Text::SimSearch;
use strict;
use warnings;
use strict;
use warnings;
use Storable qw( nstore retrieve);
use Time::HiRes qw(gettimeofday tv_interval);

our $VERSION = '0.02';

sub new {
    my $class = shift;
    my $self = bless {@_}, $class;
    return $self;
}

sub add_item_from_file {
    my $self             = shift;
    my $file             = shift;
    my $max_posting_size = shift || 1000;

    my $tmp_data;
    my $labels;
    open my $fh, "<", $file
        or die("can not open $file");
    my $i = 0;
    while (<$fh>) {
        chomp $_;
        my @f = split "\t", $_;
        my $label = shift @f;
        $labels->[$i] = $label;

        my %vec = @f;
        my $vec = $self->_unit_length( \%vec );
        while ( my ( $key, $val ) = each %$vec ) {
            next unless $val > 0;
            $tmp_data->{$key}->{$i} = $val;
        }

        $i++;
    }
    close($fh);

    # make "Posting-Lists" from $tmp_data created above.
    # Note: concatenate the label-ID and the weight as string,
    #       and then convert it to interger value.

    my $posting_lists;

    my $key_scale = int( log( int @$labels ) / log(10) ) + 1;
    my $val_scale = 6;

    $i = 0;
    while ( my ( $key, $ref ) = each %$tmp_data ) {

        my @array;
        for ( keys %$ref ) {

            my $label_id = $_;
            my $weight   = $ref->{$label_id};

            # convert the weight to integer value.
            my $condition = '%.' . $val_scale . 'f';
            my $integer = sprintf( $condition, $weight ) * ( 10**$val_scale );

            # convert the label_id to integer value, and connect to $interger.
            $condition = '%0' . $key_scale . 'd';
            $integer .= sprintf( $condition, $label_id );
            push @array, $integer;
        }

        # cut down posting-list to suitable size and compress it.
        my @tmp;
        my $n = 0;
    LABEL:
        for ( sort { $b <=> $a } @array ) {
            my $p = pack( "w*", $_ );
            push @tmp, $p;
            last LABEL if ++$n == $max_posting_size;
        }

        $posting_lists->{$key} = \@tmp;

        $i++;
    }

    $self->{index_data} = {
        posting_lists => $posting_lists,
        labels        => $labels,
        key_scale     => $key_scale,
        val_scale     => $val_scale
    };
}

sub search {
    my $self         = shift;
    my $query_vector = shift;
    my $number       = shift || 10;

    my $t0 = [gettimeofday];

    my $vec       = $self->_unit_length($query_vector);
    my $key_scale = $self->{index_data}->{key_scale};
    my $val_scale = $self->{index_data}->{val_scale};

    my $dot_product;
    while ( my ( $q_key, $q_val ) = each %$vec ) {
        my $compressed_array = $self->{index_data}->{posting_lists}->{$q_key};
        next if !$compressed_array;
    LABEL:
        my $max;

        for (@$compressed_array) {

            # decompress and decode
            my $string   = unpack( "w*", $_ );
            my $count    = length($string) - $key_scale;
            my $val      = substr( $string, 0, $count );
            my $label_id = int substr( $string, $count, $key_scale );
            $val = $val / ( 10**$val_scale );

            # calculate similarities
            $dot_product->{$label_id} += $q_val * $val;
        }
    }

    my @list;
    for (
        sort { $dot_product->{$b} <=> $dot_product->{$a} }
        keys %$dot_product
        )
    {
        my $similarity = sprintf( "%8.6f", $dot_product->{$_} );
        my $label = $self->{index_data}->{labels}->[$_];
        push @list, { label => $label, similarity => $similarity };
        last if int @list == $number;
    }
    my $elapsed = tv_interval($t0);

    return {
        elapsed        => $elapsed,
        retrieved_list => \@list,
        return_num     => int @list,
    };
}

sub save {
    my $self      = shift;
    my $save_file = shift;
    my $index     = $self->{index_data};
    nstore( $index, $save_file );
}

sub load {
    my $self      = shift;
    my $save_file = shift;
    my $index     = retrieve($save_file);
    $self->{index_data} = $index;
}

sub _unit_length {
    my $self = shift;
    my $vec  = shift;
    my $ret;
    my $norm = $self->_calc_norm($vec);
    while ( my ( $key, $value ) = each %$vec ) {
        $ret->{$key} = $value / $norm;
    }
    return $ret;
}

sub _calc_norm {
    my $self = shift;
    my $vec  = shift;

    my $norm;
    for ( values %$vec ) {
        $norm += $_**2;
    }
    sqrt($norm);
}

1;
__END__

=head1 NAME

Text::SimSearch - inverted index for similarity search 

=head1 SYNOPSIS

  use Text::SimSearch;

  my $indexer = Text::SimSearch->new;

  $indexer->add_item_from_file("source_file.tsv");
  $indexer->save("save.bin");

  $indexer->load("save.bin");

  my $query_vector = { BAG_OF_WORDS };
  my $number       = 100; # retrieve num
  my $result = $indexer->search( $query_vector, $number );


=head1 DESCRIPTION

Text::SimSearch is a on-memory inverted index for similarity search.


=head1 AUTHOR

Takeshi Miki E<lt>miki@cpan.orgE<gt>

=head1 SEE ALSO

=head1 LICENSE

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)