%right OR
%right AND
%left MINUS
# Generate the perl module with the command:
# yapp -sm UR::BoolExpr::BxParser -o - BxParser.yp | sed s/Parse::Yapp::Driver/UR::BoolExpr::BxParser::Yapp::Driver/ > BxParser.pm
%%
boolexpr: { [] }
| expr { UR::BoolExpr::BxParser->_simplify($_[1]) }
| boolexpr ORDER_BY order_by_list { [@{$_[1]}, '-order', $_[3]] }
| boolexpr GROUP_BY group_by_list { [@{$_[1]}, '-group', $_[3]] }
| boolexpr LIMIT INTEGER { [@{$_[1]}, '-limit', $_[3]] }
| boolexpr OFFSET INTEGER { [@{$_[1]}, '-offset', $_[3]] }
;
expr: condition { $_[1] }
| expr AND expr { UR::BoolExpr::BxParser->_and($_[1], $_[3]) }
| expr OR expr { UR::BoolExpr::BxParser->_or($_[1], $_[3]) }
| LEFT_PAREN expr RIGHT_PAREN { $_[2] }
;
condition: property operator optional_spaces value { [ "$_[1] $_[2]" => $_[4] ] }
| property like_operator like_value { [ "$_[1] $_[2]" => $_[3] ] }
| property in_operator set { [ "$_[1] $_[2]" => $_[3] ] }
| property COLON optional_spaces old_syntax_in_value { [ "$_[1] in" => $_[4] ] }
| property negation COLON optional_spaces old_syntax_in_value { [ "$_[1] $_[2] in" => $_[5] ] }
| property between_operator between_value { [ "$_[1] $_[2]" => $_[3] ] }
| property COLON optional_spaces between_value { [ "$_[1] between" => $_[4] ] }
| property negation COLON optional_spaces between_value { [ "$_[1] $_[2] between" => $_[5] ] }
| property boolean_op_word { [ "$_[1] $_[2]" => 1 ] }
| property null_op_word { [ "$_[1] $_[2]" => undef ] }
;
boolean_op_word: TRUE_WORD { $_[1] }
| FALSE_WORD { $_[1] }
;
null_op_word: IS_NULL { '=' }
| negation IS_NULL { "!=" }
| IS_NOT_NULL { '!=' }
;
spaces: WHITESPACE { $_[1] }
;
optional_spaces: { undef }
| spaces { undef }
;
property: IDENTIFIER { $_[1] }
| keyword_as_value { $_[1] }
;
order_by_property: property { $_[1 ] }
| MINUS property { '-'.$_[2] }
| property DESC_WORD { '-'.$_[1] }
| property ASC_WORD { $_[1] }
;
order_by_list: order_by_property { [ $_[1]] }
| order_by_property AND order_by_list { [$_[1], @{$_[3]}] }
;
group_by_list: property { [ $_[1] ] }
| property AND group_by_list { [$_[1], @{$_[3]}] }
;
operator: an_operator { $_[1] }
| negation an_operator { "$_[1] $_[2]" }
;
negation: NOT_WORD { 'not' }
| NOT_BANG { 'not' }
;
an_operator: OPERATORS { $_[1] }
| EQUAL_SIGN { '=' }
| DOUBLEEQUAL_SIGN { '=' }
;
like_operator: LIKE_WORD { 'like' }
| negation LIKE_WORD { "$_[1] like" }
| TILDE { 'like' }
| negation TILDE { "$_[1] like" }
;
like_value: value { $_[1] =~ m/\%/ ? $_[1] : '%' . $_[1] . '%' }
;
in_operator: IN_WORD { 'in' }
| negation IN_WORD { "$_[1] in" }
;
old_syntax_in_value: single_value IN_DIVIDER old_syntax_in_value { [ $_[1], @{$_[3]} ] }
| single_value IN_DIVIDER single_value { [ $_[1], $_[3] ] }
;
set: LEFT_BRACKET set_body RIGHT_BRACKET { $_[2] }
;
set_body: value SET_SEPARATOR set_body { [ $_[1], @{$_[3]} ] }
| value { [ $_[1] ] }
;
between_operator: BETWEEN_WORD { 'between' }
| negation BETWEEN_WORD { "$_[1] between" }
;
between_value: single_value MINUS single_value { [ $_[1], $_[3] ] }
;
keyword_as_value: IN_WORD { $_[1] }
| LIKE_WORD { $_[1] }
| BETWEEN_WORD { $_[1] }
| NOT_WORD { $_[1] }
| DESC_WORD { $_[1] }
| ASC_WORD { $_[1] }
| TRUE_WORD { $_[1] }
| FALSE_WORD { $_[1] }
;
value: single_value subsequent_values_list { $_[1].$_[2] }
| single_value { $_[1] }
;
subsequent_value_part: IDENTIFIER { $_[1] }
| number { $_[1] }
| WORD { $_[1] }
| DOUBLEQUOTE_STRING { ($_[1] =~ m/^"(.*?)"$/)[0]; }
| SINGLEQUOTE_STRING { ($_[1] =~ m/^'(.*?)'$/)[0]; }
| keyword_as_value { $_[1] }
;
subsequent_values_list: subsequent_value_part { $_[1] }
| subsequent_value_part subsequent_values_list { $_[1].$_[2] }
| spaces subsequent_values_list { $_[1].$_[2] }
| spaces { '' } # to gobble the final space in a value before the next expression part
;
single_value: subsequent_value_part { $_[1] }
| AND { $_[1] }
| OR { $_[1] }
;
number: INTEGER { $_[1] + 0 }
| REAL { $_[1] + 0 }
| MINUS INTEGER { 0 - $_[2] } # to reject --5
| MINUS REAL { 0 - $_[2] }
;
%%
package UR::BoolExpr::BxParser;
use strict;
use warnings;
sub _error {
my @expect = $_[0]->YYExpect;
my $tok = $_[0]->YYData->{INPUT};
my $match = $_[0]->YYData->{MATCH};
my $string = $_[0]->YYData->{STRING};
my $err = qq(Can't parse expression "$string"\n Syntax error near token $tok '$match');
my $rem = $_[0]->YYData->{REMAINING};
$err .= ", remaining text: '$rem'" if $rem;
$err .= "\nExpected one of: " . join(", ", @expect) . "\n";
Carp::croak($err);
}
my %token_states = (
'DEFAULT' => [
WHITESPACE => qr{\s+},
AND => [ qr{and}i, 'DEFAULT'],
OR => [ qr{or}i, 'DEFAULT' ],
BETWEEN_WORD => qr{between},
LIKE_WORD => qr{like},
IN_WORD => qr{in},
NOT_WORD => qr{not},
DESC_WORD => qr{desc},
ASC_WORD => qr{asc},
TRUE_WORD => qr{true},
FALSE_WORD => qr{false},
LIMIT => qr{limit},
OFFSET => qr{offset},
IDENTIFIER => qr{[a-zA-Z_][a-zA-Z0-9_.]*},
MINUS => qr{-},
INTEGER => qr{\d+},
REAL => qr{\d*\.\d+|\d+\.\d*},
WORD => [ qr{[%\+\.\/\w][\+\-\.%\w\/]*}, # also allow / for pathnames, - for hyphenated names, % for like wildcards
'dont_gobble_spaces' ],
DOUBLEQUOTE_STRING => qr{"(?:\\.|[^"])*"},
SINGLEQUOTE_STRING => qr{'(?:\\.|[^'])*'},
LEFT_PAREN => [ qr{\(}, 'DEFAULT' ],
RIGHT_PAREN => [ qr{\)}, 'DEFAULT' ],
LEFT_BRACKET => [ qr{\[}, 'set_contents'],
RIGHT_BRACKET => [qr{\]}, 'DEFAULT' ],
NOT_BANG => qr{!},
EQUAL_SIGN => [ qr{=}, 'dont_gobble_spaces' ],
DOUBLEEQUAL_SIGN => [ qr{=>}, 'dont_gobble_spaces' ],
OPERATORS => [ qr{<=|>=|<|>}, 'dont_gobble_spaces' ],
AND => [ qr{,}, 'DEFAULT' ],
COLON => [ qr{:}, 'after_colon_value' ],
TILDE => qr{~},
ORDER_BY => qr{order by},
GROUP_BY => qr{group by},
IS_NULL => qr{is null|is undef},
IS_NOT_NULL => qr{is not null|is not undef},
],
'set_contents' => [
SET_SEPARATOR => qr{,}, # Depending on state, can be either AND or SET_SEPARATOR
WORD => qr{[%\+\.\w\:][\+\.\:%\w]*}, # also allow / for pathnames, - for hyphenated names, % for like wildcards
RIGHT_BRACKET => [qr{\]}, 'DEFAULT' ],
],
'after_colon_value' => [
INTEGER => qr{\d+},
REAL => qr{\d*\.\d+|\d+\.\d*},
IN_DIVIDER => qr{\/},
#WORD => qr{\w+}, # Override WORD in DEFAULT to disallow /
WORD => qr{[%\+\.\w\:][\+\.\:%\w]*}, # Override WORD in DEFAULT to disallow /
DOUBLEQUOTE_STRING => qr{"(?:\\.|[^"])*"},
SINGLEQUOTE_STRING => qr{'(?:\\.|[^'])*'},
WHITESPACE => [qr{\s+}, 'DEFAULT'],
],
'dont_gobble_spaces' => [
AND => [ qr{and}, 'DEFAULT'],
OR => [ qr{or}, 'DEFAULT' ],
LIMIT => [qr{limit}, 'DEFAULT'],
OFFSET => [qr{offset}, 'DEFAULT'],
INTEGER => qr{\d+},
REAL => qr{\d*\.\d+|\d+\.\d*},
WORD => qr{[%\+\.\/\w][\+\-\.\:%\w\/]*}, # also allow / for pathnames, - for hyphenated names, % for like wildcards
ORDER_BY => [qr{order by}, 'DEFAULT'],
GROUP_BY => [qr{group by}, 'DEFAULT'],
],
);
sub parse {
my $string = shift;
my %params = @_;
my $debug = $params{'tokdebug'};
my $yydebug = $params{'yydebug'} || 0;
print "\nStarting parse for string $string\n" if $debug;
my $parser = UR::BoolExpr::BxParser->new();
$parser->YYData->{STRING} = $string;
my $parser_state = 'DEFAULT';
my $get_next_token = sub {
if (length($string) == 0) {
print "String is empty, we're done!\n" if $debug;
return (undef, '');
}
GET_NEXT_TOKEN:
foreach (1) {
my $longest = 0;
my $longest_token = '';
my $longest_match = '';
for my $token_list ( $parser_state, 'DEFAULT' ) {
print "\nTrying tokens for state $token_list...\n" if $debug;
my $tokens = $token_states{$token_list};
for(my $i = 0; $i < @$tokens; $i += 2) {
my($tok, $re) = @$tokens[$i, $i+1];
print "Trying token $tok... " if $debug;
my($regex,$next_parser_state);
if (ref($re) eq 'ARRAY') {
($regex,$next_parser_state) = @$re;
} else {
$regex = $re;
}
if ($string =~ m/^($regex)/) {
print "Matched >>$1<<" if $debug;
my $match_len = length($1);
if ($match_len > $longest) {
print "\n ** It's now the longest" if $debug;
$longest = $match_len;
$longest_token = $tok;
$longest_match = $1;
if ($next_parser_state) {
$parser_state = $next_parser_state;
}
}
}
print "\n" if $debug;
}
$string = substr($string, $longest);
print "Consuming up to char pos $longest chars, string is now >>$string<<\n" if $debug;
if ($longest_token eq 'WHITESPACE' and $parser_state ne 'dont_gobble_spaces') {
print "Redoing token extraction after whitespace\n" if $debug;
redo GET_NEXT_TOKEN;
}
$parser->YYData->{REMAINING} = $string;
if ($longest) {
print "Returning token $longest_token, match $longest_match\n next state is named $parser_state\n" if $debug;
$parser->YYData->{INPUT} = $longest_token;
$parser->YYData->{MATCH} = $longest_match;
return ($longest_token, $longest_match);
}
last if $token_list eq 'DEFAULT'; # avoid going over it twice if $parser_state is DEFAULT
}
}
print "Didn't match anything, done!\n" if $debug;
return (undef, ''); # Didn't match anything
};
return ( $parser->YYParse(
yylex => $get_next_token,
yyerror => \&_error,
yydebug => $yydebug),
\$string,
);
}
# Used by the top-level expr production to turn an or-type parse tree with
# only a single AND condition into a simple AND-type tree (1-level arrayref).
# Or to add the '-or' to the front of a real OR-type tree so it can be passed
# directly to UR::BoolExpr::resolve()
sub _simplify {
my($class, $expr) = @_;
if (ref($expr->[0])) {
if (@$expr == 1) {
# An or-type parse tree, but with only one AND subrule - use as a simple and-type rule
$expr = $expr->[0];
} else {
$expr = ['-or', $expr]; # an or-type parse tree with multiple subrules
}
}
return $expr;
}
# Handles the case for "expr AND expr" where one or both exprs can be an
# OR-type expr. In that case, it distributes the AND exprs among all the
# OR conditions. For example:
# (a=1 or b=2) and (c=3 or d=4)
# is the same as
# (a=1 and c=3) or (a=1 and d=4) or (b=2 and c=3) or (b=2 and d=4)
# This is necessary because the BoolExpr resolver can only handle 1-level deep
# AND-type rules, or a 1-level deep OR-type rule composed of any number of
# 1-level deep AND-type rules
sub _and {
my($class,$left, $right) = @_;
# force them to be [[ "property operator" => value]] instead of just [ "property operator" => value ]
$left = [ $left ] unless (ref($left->[0]));
$right = [ $right ] unless (ref($right->[0]));
my @and;
foreach my $left_subexpr ( @$left ) {
foreach my $right_subexpr (@$right) {
push @and, [@$left_subexpr, @$right_subexpr];
}
}
\@and;
}
sub _or {
my($class,$left, $right) = @_;
# force them to be [[ "property operator" => value]] instead of just [ "property operator" => value ]
$left = [ $left ] unless (ref($left->[0]));
$right = [ $right ] unless (ref($right->[0]));
[ @$left, @$right ];
}
1;