The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
%right OR
%right AND
%left MINUS

# Generate the perl module with the command:
# yapp -sm UR::BoolExpr::BxParser -o - BxParser.yp | sed s/Parse::Yapp::Driver/UR::BoolExpr::BxParser::Yapp::Driver/ > BxParser.pm

%%

boolexpr:                                 { [] }
        | expr                            { UR::BoolExpr::BxParser->_simplify($_[1]) }
        | boolexpr ORDER_BY order_by_list { [@{$_[1]}, '-order', $_[3]] }
        | boolexpr GROUP_BY group_by_list { [@{$_[1]}, '-group', $_[3]] }
        | boolexpr LIMIT INTEGER          { [@{$_[1]}, '-limit', $_[3]] }
        | boolexpr OFFSET INTEGER         { [@{$_[1]}, '-offset', $_[3]] }
        ;

expr: condition                            { $_[1] }
    | expr AND expr                        { UR::BoolExpr::BxParser->_and($_[1], $_[3]) }
    | expr OR expr                         { UR::BoolExpr::BxParser->_or($_[1], $_[3]) }
    | LEFT_PAREN expr RIGHT_PAREN          { $_[2] }
    ;

condition: property operator optional_spaces value                     { [ "$_[1] $_[2]" => $_[4] ] }
         | property like_operator like_value                           { [ "$_[1] $_[2]" => $_[3] ] }
         | property in_operator set                                    { [ "$_[1] $_[2]" => $_[3] ] }
         | property COLON optional_spaces old_syntax_in_value          { [ "$_[1] in" => $_[4] ] }
         | property negation COLON optional_spaces old_syntax_in_value { [ "$_[1] $_[2] in" => $_[5] ] }
         | property between_operator between_value                     { [ "$_[1] $_[2]" => $_[3] ] }
         | property COLON optional_spaces between_value                { [ "$_[1] between" => $_[4] ] }
         | property negation COLON optional_spaces between_value       { [ "$_[1] $_[2] between" => $_[5] ] }
         | property boolean_op_word                                    { [ "$_[1] $_[2]" => 1 ] }
         | property null_op_word                                       { [ "$_[1] $_[2]" => undef ] }
         ;

boolean_op_word: TRUE_WORD  { $_[1] }
               | FALSE_WORD { $_[1] }
      ;

null_op_word: IS_NULL { '=' }
            | negation IS_NULL  { "!=" }
            | IS_NOT_NULL { '!=' }
            ;

spaces: WHITESPACE { $_[1] }
      ;

optional_spaces:        { undef }
               | spaces { undef }
               ;

property: IDENTIFIER        { $_[1] }
        | keyword_as_value  { $_[1] }
        ;

order_by_property: property           { $_[1 ] }
                 | MINUS property     { '-'.$_[2] }
                 | property DESC_WORD { '-'.$_[1] }
                 | property ASC_WORD  { $_[1] }
                 ;

order_by_list: order_by_property                   { [ $_[1]] }
             | order_by_property AND order_by_list { [$_[1], @{$_[3]}] }
             ;

group_by_list: property                   { [ $_[1] ] }
             | property AND group_by_list { [$_[1], @{$_[3]}] }
             ;

operator: an_operator           { $_[1] }
        | negation an_operator  { "$_[1] $_[2]" }
        ;

negation: NOT_WORD { 'not' }
        | NOT_BANG { 'not' }
        ;

an_operator: OPERATORS        { $_[1] }
           | EQUAL_SIGN       { '=' }
           | DOUBLEEQUAL_SIGN { '=' }
        ;

like_operator: LIKE_WORD          { 'like' }
             | negation LIKE_WORD { "$_[1] like" }
             | TILDE              { 'like' }
             | negation TILDE     { "$_[1] like" }
             ;

like_value: value                 {  $_[1] =~ m/\%/ ? $_[1] : '%' . $_[1] . '%' }
          ;

in_operator: IN_WORD              { 'in' }
           | negation IN_WORD     { "$_[1] in" }
           ;

old_syntax_in_value: single_value IN_DIVIDER old_syntax_in_value { [ $_[1], @{$_[3]} ] }
                   | single_value IN_DIVIDER single_value        { [ $_[1], $_[3] ] }
        ; 

set: LEFT_BRACKET set_body RIGHT_BRACKET { $_[2] }
   ;

set_body: value SET_SEPARATOR set_body { [ $_[1], @{$_[3]} ] }
        | value                        { [ $_[1] ] }
        ;

between_operator: BETWEEN_WORD          { 'between' }
                | negation BETWEEN_WORD { "$_[1] between" }
                ;

between_value: single_value MINUS single_value { [ $_[1], $_[3] ] }
             ;
   
keyword_as_value: IN_WORD      { $_[1] }
                | LIKE_WORD    { $_[1] }
                | BETWEEN_WORD { $_[1] }
                | NOT_WORD     { $_[1] }
                | DESC_WORD    { $_[1] }
                | ASC_WORD     { $_[1] }
                | TRUE_WORD    { $_[1] }
                | FALSE_WORD   { $_[1] }
       ;

value: single_value subsequent_values_list { $_[1].$_[2] }
     | single_value                        { $_[1] }
     ;

subsequent_value_part: IDENTIFIER         { $_[1] }
                     | number             { $_[1] }
                     | WORD               { $_[1] }
                     | DOUBLEQUOTE_STRING { ($_[1] =~ m/^"(.*?)"$/)[0]; }
                     | SINGLEQUOTE_STRING { ($_[1] =~ m/^'(.*?)'$/)[0]; }
                     | keyword_as_value   { $_[1] }
                     ;

subsequent_values_list: subsequent_value_part                 { $_[1] }
               | subsequent_value_part subsequent_values_list { $_[1].$_[2] }
               | spaces subsequent_values_list                { $_[1].$_[2] }
               | spaces  { '' }   # to gobble the final space in a value before the next expression part
               ;

single_value: subsequent_value_part   { $_[1] }
            | AND                     { $_[1] }
            | OR                      { $_[1] }
            ;


number: INTEGER       { $_[1] + 0 }
      | REAL          { $_[1] + 0 }
      | MINUS INTEGER { 0 - $_[2] } # to reject --5
      | MINUS REAL    { 0 - $_[2] }
      ; 

%%

package UR::BoolExpr::BxParser;
use strict;
use warnings;

sub _error {
    my @expect = $_[0]->YYExpect;
    my $tok = $_[0]->YYData->{INPUT};
    my $match = $_[0]->YYData->{MATCH};
    my $string = $_[0]->YYData->{STRING};
    my $err = qq(Can't parse expression "$string"\n  Syntax error near token $tok '$match');
    my $rem = $_[0]->YYData->{REMAINING};
    $err .= ", remaining text: '$rem'" if $rem;
    $err .= "\nExpected one of: " . join(", ", @expect) . "\n";
    Carp::croak($err);
}

my %token_states = (
    'DEFAULT' => [
        WHITESPACE => qr{\s+},
        AND => [ qr{and}i, 'DEFAULT'],
        OR => [ qr{or}i, 'DEFAULT' ],
        BETWEEN_WORD => qr{between},
        LIKE_WORD => qr{like},
        IN_WORD => qr{in},
        NOT_WORD => qr{not},
        DESC_WORD => qr{desc},
        ASC_WORD => qr{asc},
        TRUE_WORD => qr{true},
        FALSE_WORD => qr{false},
        LIMIT => qr{limit},
        OFFSET => qr{offset},
        IDENTIFIER => qr{[a-zA-Z_][a-zA-Z0-9_.]*},
        MINUS => qr{-},
        INTEGER => qr{\d+},
        REAL => qr{\d*\.\d+|\d+\.\d*},
        WORD => qr{[%\+\.\/\w][\+\-\.%\w\/]*},   # also allow / for pathnames, - for hyphenated names, % for like wildcards
        DOUBLEQUOTE_STRING => qr{"(?:\\.|[^"])*"},
        SINGLEQUOTE_STRING => qr{'(?:\\.|[^'])*'},
        LEFT_PAREN => [ qr{\(}, 'DEFAULT' ],
        RIGHT_PAREN => [ qr{\)}, 'DEFAULT' ],
        LEFT_BRACKET => [ qr{\[}, 'set_contents'],
        RIGHT_BRACKET => [qr{\]}, 'DEFAULT' ],
        NOT_BANG => qr{!},
        EQUAL_SIGN => [ qr{=}, 'dont_gobble_spaces' ],
        DOUBLEEQUAL_SIGN => [ qr{=>}, 'dont_gobble_spaces' ],
        OPERATORS => [ qr{<=|>=|<|>}, 'dont_gobble_spaces' ],
        AND => [ qr{,}, 'DEFAULT' ],
        COLON => [ qr{:}, 'after_colon_value' ],
        TILDE => qr{~},
        ORDER_BY => qr{order by},
        GROUP_BY => qr{group by},
        IS_NULL => qr{is null|is undef},
        IS_NOT_NULL => qr{is not null|is not undef},
    ],
    'set_contents' => [
        SET_SEPARATOR => qr{,},  # Depending on state, can be either AND or SET_SEPARATOR
        WORD => qr{[%\+\.\w\:][\+\.\:%\w]*},   # also allow / for pathnames, - for hyphenated names, % for like wildcards
        RIGHT_BRACKET => [qr{\]}, 'DEFAULT' ],
    ],
    'after_colon_value' => [
        INTEGER => qr{\d+},
        REAL => qr{\d*\.\d+|\d+\.\d*},
        IN_DIVIDER => qr{\/},
        #WORD => qr{\w+},    # Override WORD in DEFAULT to disallow /
        WORD => qr{[%\+\.\w\:][\+\.\:%\w]*},   # Override WORD in DEFAULT to disallow /
        DOUBLEQUOTE_STRING => qr{"(?:\\.|[^"])*"},
        SINGLEQUOTE_STRING => qr{'(?:\\.|[^'])*'},
        WHITESPACE => [qr{\s+}, 'DEFAULT'],
    ],
    'dont_gobble_spaces' => [
        AND => [ qr{and}, 'DEFAULT'],
        OR => [ qr{or}, 'DEFAULT' ],
        LIMIT => [qr{limit}, 'DEFAULT'],
        OFFSET => [qr{offset}, 'DEFAULT'],
        INTEGER => qr{\d+},
        REAL => qr{\d*\.\d+|\d+\.\d*},
        WORD => qr{[%\+\.\/\w][\+\-\.\:%\w\/]*},   # also allow / for pathnames, - for hyphenated names, % for like wildcards
        ORDER_BY => [qr{order by}, 'DEFAULT'],
        GROUP_BY => [qr{group by}, 'DEFAULT'],
    ],
);

sub parse {
    my $string = shift;
    my %params = @_;

    my $debug = $params{'tokdebug'};
    my $yydebug = $params{'yydebug'} || 0;

    print "\nStarting parse for string $string\n" if $debug;
    my $parser = UR::BoolExpr::BxParser->new();
    $parser->YYData->{STRING} = $string;

    my $parser_state = 'DEFAULT';

    my $get_next_token = sub {
        if (length($string) == 0) {
            print "String is empty, we're done!\n" if $debug;
            return (undef, '');  
       }

        GET_NEXT_TOKEN:
        foreach (1) {
            my $longest = 0;
            my $longest_token = '';
            my $longest_match = '';

            for my $token_list ( $parser_state, 'DEFAULT' ) {
                print "\nTrying tokens for state $token_list...\n" if $debug;
                my $tokens = $token_states{$token_list};
                for(my $i = 0; $i < @$tokens; $i += 2) {
                    my($tok, $re) = @$tokens[$i, $i+1];
                    print "Trying token $tok... " if $debug;

                    my($regex,$next_parser_state);
                    if (ref($re) eq 'ARRAY') {
                        ($regex,$next_parser_state) = @$re;
                    } else {
                        $regex = $re;
                    }

                    if ($string =~ m/^($regex)/) {
                        print "Matched >>$1<<" if $debug;
                        my $match_len = length($1);
                        if ($match_len > $longest) {
                            print "\n  ** It's now the longest" if $debug;
                            $longest = $match_len;
                            $longest_token = $tok;
                            $longest_match = $1;
                            if ($next_parser_state) {
                                $parser_state = $next_parser_state;
                            }
                        }
                    }
                    print "\n" if $debug;
                }

                $string = substr($string, $longest);
                print "Consuming up to char pos $longest chars, string is now >>$string<<\n" if $debug;

                if ($longest_token eq 'WHITESPACE' and $parser_state ne 'dont_gobble_spaces') {
                    print "Redoing token extraction after whitespace\n" if $debug;
                    redo GET_NEXT_TOKEN;
                }

                $parser->YYData->{REMAINING} = $string;
                if ($longest) {
                    print "Returning token $longest_token, match $longest_match\n  next state is named $parser_state\n" if $debug;
                    $parser->YYData->{INPUT} = $longest_token;
                    $parser->YYData->{MATCH} = $longest_match;
                    return ($longest_token, $longest_match);
                }
                last if $token_list eq 'DEFAULT';  # avoid going over it twice if $parser_state is DEFAULT
            }
        }
        print "Didn't match anything, done!\n" if $debug;
        return (undef, '');  # Didn't match anything
    };

    return ( $parser->YYParse(
                yylex => $get_next_token,
                yyerror => \&_error,
                yydebug => $yydebug),
             \$string,
            );
}

# Used by the top-level expr production to turn an or-type parse tree with
# only a single AND condition into a simple AND-type tree (1-level arrayref).
# Or to add the '-or' to the front of a real OR-type tree so it can be passed
# directly to UR::BoolExpr::resolve()
sub _simplify {
    my($class, $expr) = @_;

    if (ref($expr->[0])) {
        if (@$expr == 1) {
            # An or-type parse tree, but with only one AND subrule - use as a simple and-type rule
            $expr = $expr->[0];
        } else {
            $expr = ['-or', $expr]; # an or-type parse tree with multiple subrules
        }
    }
    return $expr;
}

# Handles the case for "expr AND expr" where one or both exprs can be an
# OR-type expr.  In that case, it distributes the AND exprs among all the
# OR conditions.  For example:
# (a=1 or b=2) and (c=3 or d=4)
# is the same as
# (a=1 and c=3) or (a=1 and d=4) or (b=2 and c=3) or (b=2 and d=4)
# This is necessary because the BoolExpr resolver can only handle 1-level deep
# AND-type rules, or a 1-level deep OR-type rule composed of any number of
# 1-level deep AND-type rules
sub _and {
    my($class,$left, $right) = @_;

    # force them to be [[ "property operator" => value]] instead of just [ "property operator" => value ]
    $left  = [ $left ]  unless (ref($left->[0]));
    $right = [ $right ] unless (ref($right->[0]));

    my @and;
    foreach my $left_subexpr ( @$left ) {
        foreach my $right_subexpr (@$right) {
            push @and, [@$left_subexpr, @$right_subexpr];
        }
    }
    \@and;
}

sub _or {
    my($class,$left, $right) = @_;

    # force them to be [[ "property operator" => value]] instead of just [ "property operator" => value ]
    $left  = [ $left ]  unless (ref($left->[0]));
    $right = [ $right ] unless (ref($right->[0]));

    [ @$left, @$right ];
}

1;