t/02-search.t - metacpan.org

#!/usr/bin/env perl
use strict;
use warnings;
use Test::More;
use Data::Dump qw( dump );
use File::Temp qw( tempdir );
my $invindex = tempdir( CLEANUP => 1 );

use Lucy::Plan::Schema;
use Lucy::Plan::FullTextType;
use Lucy::Analysis::PolyAnalyzer;
use Lucy::Index::Indexer;
use Lucy::Search::IndexSearcher;

my $schema     = Lucy::Plan::Schema->new;
my $stopfilter = Lucy::Analysis::SnowballStopFilter->new( language => 'en', );
my $stemmer    = Lucy::Analysis::SnowballStemmer->new( language => 'en' );
my $case_folder = Lucy::Analysis::CaseFolder->new;
my $tokenizer   = Lucy::Analysis::RegexTokenizer->new;
my $analyzer    = Lucy::Analysis::PolyAnalyzer->new(
    analyzers => [
        $case_folder,
        $tokenizer,

        # our existing tests have too many stopwords to refactor
        # but this is helpful when debugging related code in Dialect::Lucy

        #$stopfilter,

        $stemmer,
    ]
);
my $fulltext = Lucy::Plan::FullTextType->new(
    analyzer => $analyzer,
    sortable => 1,
);
$schema->spec_field( name => 'title',  type => $fulltext );
$schema->spec_field( name => 'color',  type => $fulltext );
$schema->spec_field( name => 'date',   type => $fulltext );
$schema->spec_field( name => 'option', type => $fulltext );

my $indexer = Lucy::Index::Indexer->new(
    index    => $invindex,
    schema   => $schema,
    create   => 1,
    truncate => 1,
);

use_ok('Search::Query::Parser');

ok( my $parser = Search::Query::Parser->new(
        fields => {
            title  => { analyzer => $analyzer },
            color  => { analyzer => $analyzer },
            date   => { analyzer => $analyzer },
            option => { analyzer => $analyzer },
        },
        query_class_opts =>
            { default_field => [qw( title color date option )], },
        dialect        => 'Lucy',
        croak_on_error => 1,
    ),
    "new parser"
);

ok( my $tree_able = $parser->parse("foo OR bar"), "parse for tree_able" );
ok( my $tree      = $tree_able->tree(),           "->tree" );
ok( my $native_tree = $tree_able->translate_to('Native'),
    "translate to Native dialect" );

my %docs = (
    'doc1' => {
        title  => 'i am doc1',
        color  => 'red blue orange',
        date   => '20100329',
        option => 'a',
    },
    'doc2' => {
        title  => 'i am doc2',
        color  => 'green yellow purple',
        date   => '20100301',
        option => 'b',
    },
    'doc3' => {
        title  => 'i am doc3',
        color  => 'brown black white',
        date   => '19720329',
        option => '',
    },
    'doc4' => {
        title  => 'i am doc4',
        color  => 'white',
        date   => '20100510',
        option => 'c',
    },
    'doc5' => {
        title  => 'unlike the others',
        color  => 'teal',
        date   => '19000101',
        option => 'd',
    },
);

# set up the index
for my $doc ( keys %docs ) {
    $indexer->add_doc( $docs{$doc} );
}

$indexer->commit;

my $searcher = Lucy::Search::IndexSearcher->new( index => $invindex, );

# search
my %queries = (
    'title:(i am)'                                       => 4,
    'title:("i am")'                                     => 4,
    'color:red'                                          => 1,
    'brown'                                              => 1,
    'date=(20100301..20100331)'                          => 2,
    'date!=(20100301..20100331)'                         => 3,
    '-date:(20100301..20100331)'                         => 3,
    'am AND (-date=(20100301..20100331))'                => 2,
    'am AND (date=(20100301..20100331))'                 => 2,
    'color:re*'                                          => 1,
    'color:re?'                                          => 1,
    'color:br?wn'                                        => 1,
    'color:*n'                                           => 2,
    'color!=red'                                         => 4,
    'not color=red and not title=doc2'                   => 3,
    '"i doc1"~2'                                         => 1,
    'option!=?*'                                         => 1,
    'NOT option:?*'                                      => 1,
    '(title=am) and (date!=20100301 and date!=20100329)' => 2,   # doc3 & doc4
    '(re* OR gree*) AND title=am'                        => 2,
    '(re* OR gree*)'                                     => 2,
    'not green'                                          => 4,
    'not green and title=doc3'                           => 1,
    '*oc*'                                               => 4,
    'green and not title=doc3'                           => 1,
    '((title=doc*) (NOT color=teal)) and (NOT option=c) and (date=(20100301..20100331))'
        => 2,
);

for my $str ( sort keys %queries ) {
    my $query = $parser->parse($str);

    #$query->debug(1);

    my $hits_expected = $queries{$str};
    if ( ref $hits_expected ) {
        $query->debug(1);
        $hits_expected = $hits_expected->[0];
    }

    #diag($query);
    my $lucy_query = $query->as_lucy_query();
    if ( !$lucy_query ) {
        diag("No lucy_query for $str");
        next;
    }
    my $hits = $searcher->hits(
        query      => $lucy_query,
        offset     => 0,
        num_wanted => 5,             # more than we have
    );

    is( $hits->total_hits, $hits_expected, "$str = $hits_expected" );

    if ( $hits->total_hits != $hits_expected or $query->debug ) {

        $query->debug(1);
        diag($str);
        diag($query);
        diag( dump($query) );

        diag( dump( $query->as_lucy_query ) );
        if ( $query->as_lucy_query->isa('Lucy::Search::NOTQuery') ) {
            diag( "negated_query: "
                    . dump( $query->as_lucy_query->get_negated_query ) );
        }
        diag( dump $query->as_lucy_query->dump );

    }
}

# exercise some as_lucy_query options
my $query = $parser->parse(qq/"orange red"~3/);
$query->ignore_order_in_proximity(1);

#$query->debug(1);
my $ks_query = $query->as_lucy_query();
my $hits
    = $searcher->hits( query => $ks_query, offset => 0, num_wanted => 5 );
is( $hits->total_hits, 1, "proximity order ignored" );
$query->ignore_order_in_proximity(0);
$ks_query = $query->as_lucy_query();
$hits = $searcher->hits( query => $ks_query, offset => 0, num_wanted => 5 );
is( $hits->total_hits, 0, "proximity order respected" );

# alternate way of doing wildcard searches: expand initial query
# from lexicon like Xapian does.
$parser->term_expander(
    sub {
        my ($term) = @_;
        return ($term) unless $term =~ m/[\*\?]/;

        # Assume here we have a cached list of terms,
        # either from a Lexicon or a db, etc.
        # In this case, we just return a hardcoded array
        # since we know what $term is

        return qw( doc1 doc2 doc3 doc4 );

    },
);

ok( my $wild_query = $parser->parse(qq/title=doc*/), "parse query" );
$ks_query = $wild_query->as_lucy_query();

#diag($wild_query);
#diag(dump $ks_query->dump);
#diag($ks_query->to_string);
$hits = $searcher->hits( query => $ks_query, offset => 0, num_wanted => 5 );
is( $hits->total_hits, 4, "alternate wildcard works" );

# allow for adding new queries without adjusting test count
done_testing( scalar( keys %queries ) + 9 );
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)