package KinoSearch::QueryParser::QueryParser;
use strict;
use warnings;
use KinoSearch::Util::ToolSet;
use base qw( KinoSearch::Util::Class );
BEGIN {
__PACKAGE__->init_instance_vars(
# constructor args / members
analyzer => undef,
default_boolop => 'OR',
default_field => undef, # back compat
fields => undef,
# members
bool_groups => undef,
phrases => undef,
bool_group_re => undef,
phrase_re => undef,
label_inc => 0,
);
}
use KinoSearch::Analysis::TokenBatch;
use KinoSearch::Analysis::Tokenizer;
use KinoSearch::Search::BooleanQuery;
use KinoSearch::Search::PhraseQuery;
use KinoSearch::Search::TermQuery;
use KinoSearch::Index::Term;
sub init_instance {
my $self = shift;
$self->{bool_groups} = {};
$self->{phrases} = {};
croak("default_boolop must be either 'AND' or 'OR'")
unless $self->{default_boolop} =~ /^(?:AND|OR)$/;
# create a random string that presumably won't appear in a search string
my @chars = ( 'A' .. 'Z' );
my $randstring = '';
$randstring .= $chars[ rand @chars ] for ( 1 .. 16 );
$self->{randstring} = $randstring;
# create labels which won't appear in search strings
$self->{phrase_re} = qr/^(_phrase$randstring\d+)/;
$self->{bool_group_re} = qr/^(_boolgroup$randstring\d+)/;
# verify fields param
my $fields =
defined $self->{fields}
? $self->{fields}
: [ $self->{default_field} ];
croak("Required parameter 'fields' not supplied as arrayref")
unless ( defined $fields
and reftype($fields) eq 'ARRAY' );
$self->{fields} = $fields;
# verify analyzer
croak("Missing required param 'analyzer'")
unless a_isa_b( $self->{analyzer}, 'KinoSearch::Analysis::Analyzer' );
}
# regex matching a quoted string
my $quoted_re = qr/
" # opening quote
( # capture
[^"]*? # anything not a quote
)
(?:"|$) # closed by either a quote or end of string
/xsm;
# regex matching a parenthetical group
my $paren_re = qr/
\( # opening paren
( # capture
[^()]*? # anything not a paren
)
(?:\)|$) # closed by paren or end of string
/xsm;
# regex matching a negating boolean operator
my $neg_re = qr/^(?:
NOT\s+ # NOT followed by space
|-(?=\S) # minus followed by something not-spacey
)/xsm;
# regex matching a requiring boolean operator
my $req_re = qr/^
\+(?=\S) # plus followed by something not-spacey
/xsm;
# regex matching a field indicator
my $field_re = qr/^
( # capture
[^"(:\s]+ # non-spacey string
)
: # followed by :
/xsm;
sub parse {
my ( $self, $qstring_orig, $default_fields ) = @_;
$qstring_orig = '' unless defined $qstring_orig;
$default_fields ||= $self->{fields};
my $default_boolop = $self->{default_boolop};
my @clauses;
# substitute contiguous labels for phrases and boolean groups
my $qstring = $self->_extract_phrases($qstring_orig);
$qstring = $self->_extract_boolgroups($qstring);
local $_ = $qstring;
while ( bytes::length $_ ) {
# fast-forward past whitespace
next if s/^\s+//;
my $occur = $default_boolop eq 'AND' ? 'MUST' : 'SHOULD';
if (s/^AND\s+//) {
if (@clauses) {
# require the previous clause (unless it's negated)
if ( $clauses[-1]{occur} eq 'SHOULD' ) {
$clauses[-1]{occur} = 'MUST';
}
}
# require this clause
$occur = 'MUST';
}
elsif (s/^OR\s+//) {
if (@clauses) {
$clauses[-1]{occur} = 'SHOULD';
}
$occur = 'SHOULD';
}
# detect tokens which cause this clause to be required or negated
if (s/$neg_re//) {
$occur = 'MUST_NOT';
}
elsif (s/$req_re//) {
$occur = 'MUST';
}
# set the field
my $fields = s/^$field_re// ? [$1] : $default_fields;
# if a phrase label is detected...
if (s/$self->{phrase_re}//) {
my $query;
# retreive the text and analyze it
my $orig_phrase_text = delete $self->{phrases}{$1};
my $token_texts = $self->_analyze($orig_phrase_text);
if (@$token_texts) {
my $query = $self->_get_field_query( $fields, $token_texts );
push @clauses, { query => $query, occur => $occur }
if defined $query;
}
}
# if a label indicating a bool group is detected...
elsif (s/$self->{bool_group_re}//) {
# parse boolean subqueries recursively
my $inner_text = delete $self->{bool_groups}{$1};
my $query = $self->parse($inner_text, $fields);
push @clauses, { query => $query, occur => $occur };
}
# what's left is probably a term
elsif (s/([^"(\s]+)//) {
my $token_texts = $self->_analyze($1);
@$token_texts = grep { $_ ne '' } @$token_texts;
if (@$token_texts) {
my $query = $self->_get_field_query( $fields, $token_texts );
push @clauses, { occur => $occur, query => $query };
}
}
}
if ( @clauses == 1 and $clauses[0]{occur} ne 'MUST_NOT' ) {
# if it's just a simple query, return it unwrapped
return $clauses[0]{query};
}
else {
# otherwise, build a boolean query
my $bool_query = KinoSearch::Search::BooleanQuery->new;
for my $clause (@clauses) {
$bool_query->add_clause(
query => $clause->{query},
occur => $clause->{occur},
);
}
return $bool_query;
}
}
# Wrap a TermQuery/PhraseQuery to deal with multiple fields.
sub _get_field_query {
my ( $self, $fields, $token_texts ) = @_;
my @queries = grep { defined $_ }
map { $self->_gen_single_field_query( $_, $token_texts ) } @$fields;
if ( @queries == 0 ) {
return;
}
elsif ( @queries == 1 ) {
return $queries[0];
}
else {
my $wrapper_query = KinoSearch::Search::BooleanQuery->new;
for my $query (@queries) {
$wrapper_query->add_clause(
query => $query,
occur => 'SHOULD',
);
}
return $wrapper_query;
}
}
# Create a TermQuery, a PhraseQuery, or nothing.
sub _gen_single_field_query {
my ( $self, $field, $token_texts ) = @_;
if ( @$token_texts == 1 ) {
my $term = KinoSearch::Index::Term->new( $field, $token_texts->[0] );
return KinoSearch::Search::TermQuery->new( term => $term );
}
elsif ( @$token_texts > 1 ) {
my $phrase_query = KinoSearch::Search::PhraseQuery->new;
for my $token_text (@$token_texts) {
$phrase_query->add_term(
KinoSearch::Index::Term->new( $field, $token_text ),
);
}
return $phrase_query;
}
}
# break a string into tokens
sub _analyze {
my ( $self, $string ) = @_;
my $token_batch = KinoSearch::Analysis::TokenBatch->new;
$token_batch->append( $string, 0, bytes::length($string) );
$token_batch = $self->{analyzer}->analyze($token_batch);
my @token_texts;
while ( $token_batch->next ) {
push @token_texts, $token_batch->get_text;
}
return \@token_texts;
}
# replace all phrases with labels
sub _extract_phrases {
my ( $self, $qstring ) = @_;
while ( $qstring =~ $quoted_re ) {
my $label
= sprintf( "_phrase$self->{randstring}%d", $self->{label_inc}++ );
$qstring =~ s/$quoted_re/$label /; # extra space for safety
# store the phrase text for later retrieval
$self->{phrases}{$label} = $1;
}
return $qstring;
}
# recursively replace boolean groupings with labels, innermost first
sub _extract_boolgroups {
my ( $self, $qstring ) = @_;
while ( $qstring =~ $paren_re ) {
my $label = sprintf( "_boolgroup$self->{randstring}%d",
$self->{label_inc}++ );
$qstring =~ s/$paren_re/$label /; # extra space for safety
# store the text for later retrieval
$self->{bool_groups}{$label} = $1;
}
return $qstring;
}
1;
__END__
=head1 NAME
KinoSearch::QueryParser::QueryParser - transform a string into a Query object
=head1 SYNOPSIS
my $query_parser = KinoSearch::QueryParser::QueryParser->new(
analyzer => $analyzer,
fields => [ 'bodytext' ],
);
my $query = $query_parser->parse( $query_string );
my $hits = $searcher->search( query => $query );
=head1 DESCRIPTION
The QueryParser accepts search strings as input and produces Query objects,
suitable for feeding into L<KinoSearch::Searcher|KinoSearch::Searcher>.
=head2 Syntax
The following constructs are recognized by QueryParser.
=over
=item *
Boolean operators 'AND', 'OR', and 'AND NOT'.
=item *
Prepented +plus and -minus, indicating that the labeled entity should be
either required or forbidden -- be it a single word, a phrase, or a
parenthetical group.
=item *
Logical groups, delimited by parentheses.
=item *
Phrases, delimited by double quotes.
=item *
Field-specific terms, in the form of C<fieldname:termtext>. (The field
specified by fieldname will be used instead of the QueryParser's default
fields).
A field can also be given to a logical group, in which case it is the same as
if the field had been prepended onto every term in the group. For example:
C<foo:(bar baz)> is the same as C<foo:bar foo:baz>.
=back
=head1 METHODS
=head2 new
my $query_parser = KinoSearch::QueryParser::QueryParser->new(
analyzer => $analyzer, # required
fields => [ 'bodytext' ], # required
default_boolop => 'AND', # default: 'OR'
);
Constructor. Takes hash-style parameters:
=over
=item *
B<analyzer> - An object which subclasses
L<KinoSearch::Analysis::Analyzer|KinoSearch::Analysis::Analyzer>. This
B<must> be identical to the Analyzer used at index-time, or the results won't
match up.
=item *
B<fields> - the names of the fields which will be searched against. Must be
supplied as an arrayref.
=item *
B<default_field> - deprecated. Use C<fields> instead.
=item *
B<default_boolop> - two possible values: 'AND' and 'OR'. The default is 'OR',
which means: return documents which match any of the query terms. If you
want only documents which match all of the query terms, set this to 'AND'.
=back
=head2 parse
my $query = $query_parser->parse( $query_string );
Turn a query string into a Query object. Depending on the contents of the
query string, the returned object could be any one of several subclasses of
L<KinoSearch::Search::Query|KinoSearch::Search::Query>.
=head1 COPYRIGHT
Copyright 2005-2009 Marvin Humphrey
=head1 LICENSE, DISCLAIMER, BUGS, etc.
See L<KinoSearch|KinoSearch> version 0.165.
=cut