The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# WARNING: This file is no longer used by PCR itself;
# we're using examples/Grammar.grammar to generate Pugs::Grammar::Rule.pmc
# instead.

use v6-alpha;
use utf8;

# Perl6 implementation of the 'Rule' syntax
# author: Flavio S. Glock - fglock@gmail.com

=for compiling

    The util/update-rule-pmc script is used to
    generate lib/Pugs/Grammar/Rule.pmc from
    this file.

    This script does the following:

    Use v6.pm to compile this file and
    post-process it after compiling:
    - remove all references to:
        Data::Bind
    - replace the header with:
        package Pugs::Grammar::Rule;
        use utf8;
        no strict 'refs';
        use Pugs::Runtime::Match;
        use Pugs::Runtime::Regex;
    The post-processing thing is done
    by the Perl 5 script util/patch-rule-pmc.pl

=cut

grammar Pugs::Grammar::Rule;

#use Pugs::Runtime::Match;

our %rule_terms;
our %variables;

token pod_begin {
    |   \n =end \N*
    |   . \N* <.pod_begin>
}

token pod_other {
    |   \n =cut \N*
    |   . \N* <.pod_other>
}

token ws {
    [
    |    \# \N*
    |    \n [ = [
            |  begin <.ws> END \N* .*
            |  begin  <.pod_begin>
            |  kwid   <.pod_other>
            |  pod    <.pod_other>
            |  for    <.pod_other>
            |  head1  <.pod_other>
            ]?
            ]?
    |    \s
    ]+
}

# regex ident can start with a number
token ident {
    [ <.alnum> | _ | '::' ]+
}

token alnum {
    <[0-9a-zA-Z]>
}

token alpha {
    <[a-zA-Z]>
}

token digit {
    <[0-9]>
}

# after '\\'
token special_char {
        | ( c | C ) \[ ( [<alnum>|\s| ';' | '(' | ')' | '-' ]+) \]
          #  \c[LATIN LETTER A]
          { return { special_char => '\\' ~ $0 ~ $1 , } }

        | [ x | X ] <xdigit>+
          #  \x0021    \X0021
          { return { special_char => '\\' ~ $/ , } }
        | ( x | X ) \[ (<xdigit>+) \]
          #  \x[0021]  \X[0021]
          { return { special_char => '\\' ~ $0 ~ $1 , } }

        | [ o | O ] \d+
          #  \o0021    \O0021
          { return { special_char => '\\' ~ $/ , } }
        | ( o | O ) \[ (\d+) \]
          #  \o[0021]  \O[0021]
          { return { special_char => '\\' ~ $0 ~ $1 , } }

        | .
          #  \e  \E
          { return { special_char => '\\' ~ $/ , } }
}

token literal {
    [
    |  \\ <special_char>
    |  <-[ \' ]>
    ]*
}

token double_quoted {
    [
    |  \\ <special_char>
    |  <%Pugs::Grammar::Rule::variables>
    |  <-[ \" ]>
    ]*
}

token metasyntax {
    [
    |  \\ <special_char>
    |  \'  <.literal>     \'
    |  \"  <.double_quoted>   \"
    |  \{  <.string_code>        \}
    |  \<  <.metasyntax>  \>
    |  <-[ \> ]>
    ]+
}

token char_range {
    [
    |  \\ <special_char>
    |  <-[ \] ]>
    ]+
}

token char_class {
    |  <.alpha>+
    |  \[  <.char_range>  \]
}

token string_code {
    # bootstrap "code"
    [
    |  \\ <special_char>
    |  \'  <.literal>     \'
    |  \"  <.double_quoted>   \"
    |  \{  [ <.string_code> | '' ]  \}
    |  \(  [ <.string_code> | '' ]  \)
    |  \<  [ <.string_code> | '' ]  \>
    |  [ <.ws> | \> | \= | \- ] \>
    |  <.ws>
    |  <-[ \} \) \> ]>
    ]+
}

token parsed_code {
    # this subrule is overridden inside the perl6 compiler
    <.string_code>
    { return '{' ~ $/ ~ '}' }
}

token named_capture_body {
    | \(  <rule>        \)  { return { capturing_group => $$<rule> ,} }
    | \[  <rule>        \]  { return $$<rule> }
    | \<  <parse_metasyntax>  { return $$<parse_metasyntax> }
    | \'  <.literal>    \'
        { return { metasyntax => { metasyntax => ~ $$/ ,} } }
    | { die "invalid alias syntax"; }
}

token parse_metasyntax {
        $<modifier> := [ '!' | '?' | '.' | '' ]
    [
        '{'  <parsed_code>  '}>'
        { return { closure => {
            closure  => $$<parsed_code>,
            modifier => $$<modifier>,
        } } }
    |
        <char_class>
        ( <[+-]> <char_class> )+
        \>
        {
            if ( $$<modifier> eq '!' ) {
              return {
                negate => {
                  char_class => [
                    '+' ~ $<char_class>,
                    @($/[0]),   # TODO - stringify
              ] } }
            }
            return {
              char_class => [
                '+' ~ $<char_class>,
                @($/[0]),   # TODO - stringify
            ] }
        }
    |
        <ident>
        [
          <.ws> <rule> \>
          {
            if  ( $$<ident> eq 'before'
               || $$<ident> eq 'after'
                ) {
                return { $$<ident> => { rule => $$<rule>, modifier => $$<modifier> } }
            }
            return { metasyntax => {
                metasyntax => $$<ident>,
                rule       => $$<rule>,
                modifier   => $$<modifier>,
            } }
          }
        |
          ':' <.ws>?
          $<str> := [
            [
            |  \\ <special_char>
            |  <%Pugs::Grammar::Rule::variables>
            |  <-[ \> ]>
            ]*
          ]
          \>
          {
            if  ( $$<ident> eq 'before'
               || $$<ident> eq 'after'
                ) {
                return { $$<ident> => {
                    rule     => { metasyntax => {
                        metasyntax => '\'' ~ $$<str> ~ '\''
                    } },
                    modifier => $$<modifier>,
                } }
            }
            return { metasyntax => {
                metasyntax => $$<ident>,
                string   => $$<str>,
                modifier => $$<modifier>,
            } }
          }
        |
          \(  <parsed_code>  \) \>
          { return { call => {
              method   => $$<ident>,
              params   => $$<parsed_code>,
              modifier => $$<modifier>,
          } } }
        ]
    |
        <metasyntax>  \>
        { return { metasyntax => {
              metasyntax => ~$$<metasyntax>,
              modifier   => $$<modifier>,
        } } }
    ]
}


%variables = (

    '$<' => token {
        <ident> \>
        { return { match_variable => '$' ~ $/<ident> ,} }
    },
    '$' => token {
        <.digit>+
        { return { match_variable => '$' ~ $/ ,} }
    |
        \^?
        [ <.alnum> | _ | \: \: ]+
        { return { variable => '$' ~ $/ ,} }
    },
    '@' => token {
        <.digit>+
        { return { match_variable => '@' ~ $/ ,} }
    |
        \^?
        [ <.alnum> | _ | \: \: ]+
        { return { variable => '@' ~ $/ ,} }
    },
    '%' => token {
        <.digit>+
        { return { match_variable => '%' ~ $/ ,} }
    |
        \^?
        [ <.alnum> | _ | \: \: ]+
        { return { variable => '%' ~ $/ ,} }
    },

); # /%variables


%rule_terms = (

    '{*}' => token {
        # placeholder
        { return { metasyntax => { metasyntax => 'null' ,} } }
    },

    '\'' => token {
        <.literal>     \'
        { return { metasyntax => { metasyntax => '\'' ~ $$/ ,} } }
    },
    '(' => token {
        <rule> \)
        { return { capturing_group => $$<rule> ,} }
    },
    '<(' => token {
        <rule>  ')>'
        { return { capture_as_result => $$<rule> ,} }
    },
    '<+' => token {
        <char_class>
        ( <[+-]> <char_class> )*
        \>
        { return {
            char_class => [
                '+' ~ $<char_class>,
                @($/[0]),   # TODO - stringify
            ] }
        }
    },
    '<-' => token {
        <char_class>
        ( <[+-]> <char_class> )*
        \>
        { return {
            char_class => [
                '-' ~ $<char_class>,
                @($/[0]),   # TODO - stringify
            ] }
        }
    },
    '<[' => token {
        <char_range>  \]
        ( <[+-]> <char_class> )*
        \>
        { return {
            char_class => [
                '+[' ~ $<char_range> ~ ']',
                @($/[0]),   # TODO - stringify
            ] }
        }
    },
    '<' => token {
        <parse_metasyntax>
        { return $$<parse_metasyntax> }
    },
    '{' => token {
        <parsed_code>  \}
        { return { closure => {
            closure => $$<parsed_code>,
            modifier => 'plain',
        } } }
    },
    '\\' => token {
        <special_char>
        { return $$<special_char> }
    },
    '.' => token {
        { return { 'dot' => 1 ,} }
    },
    '[' => token {
        <rule> \]
        { return $$<rule> }
    },
    ':::' => token { { return { colon => ':::' ,} } },
    ':?'  => token { { return { colon => ':?' ,} } },
    ':+'  => token { { return { colon => ':+' ,} } },
    '::'  => token { { return { colon => '::' ,} } },
    ':'   => token { { return { colon => ':'  ,} } },
    '$$'  => token { { return { colon => '$$' ,} } },
    '$'   => token { { return { colon => '$'  ,} } },
    '^^'  => token { { return { colon => '^^' ,} } },
    '^'   => token { { return { colon => '^'  ,} } },

    '>>'  => token { { return { colon => '>>' ,} } },
    '»'   => token { { return { colon => '>>' ,} } },

    '<<'  => token { { return { colon => '<<' ,} } },
    '«'   => token { { return { colon => '<<' ,} } },

    ':i'  => token {
        <.ws> <rule>
        { return { modifier => { modifier => 'ignorecase', :$$<rule>, } } } },
    ':ignorecase'  => token {
        <.ws> <rule>
        { return { modifier => { modifier => 'ignorecase', :$$<rule>, } } } },
    ':s'  => token {
        <.ws> <rule>
        { return { modifier => 'sigspace',   :$$<rule>, } } },
    ':sigspace'    => token {
        <.ws> <rule>
        { return { modifier => 'sigspace',   :$$<rule>, } } },
    ':P5' => token {
        <.ws> <rule>
        { return { modifier => 'Perl5',  :$$<rule>, } } },
    ':Perl5'       => token {
        <.ws> <rule>
        { return { modifier => 'Perl5',  :$$<rule>, } } },
    ':bytes'       => token {
        <.ws> <rule>
        { return { modifier => 'bytes',  :$$<rule>, } } },
    ':codes'       => token {
        <.ws> <rule>
        { return { modifier => 'codes',  :$$<rule>, } } },
    ':graphs'      => token {
        <.ws> <rule>
        { return { modifier => 'graphs', :$$<rule>, } } },
    ':langs'       => token {
        <.ws> <rule>
        { return { modifier => 'langs',  :$$<rule>, } } },

); # /%rule_terms

token term {
    |  <%Pugs::Grammar::Rule::variables>
       [  <.ws>? ':=' <.ws>? <named_capture_body>
          {
            return { named_capture => {
                rule =>  $$<named_capture_body>,
                ident => $$<Pugs::Grammar::Rule::variables>,
            }, };
          }
       |
          {
            return $$<Pugs::Grammar::Rule::variables>
          }
       ]
    |  <%Pugs::Grammar::Rule::rule_terms>
        {
            #print "term: ", Dumper( $_[0]->data );
            return $$<Pugs::Grammar::Rule::rule_terms>
        }
    |  <-[ \] \} \) \> \: \? \+ \* \| \& ]>
        {
            #print "constant: ", Dumper( $_[0]->data );
            return { 'constant' => $$/ ,}
        }
}

token quant {
    |   '**' <.ws>? \{  <parsed_code>  \}
        { return { closure => $$<parsed_code> ,} }
    |   <[  \? \* \+  ]>?
}

token quantifier {
    $<ws1>   := (<.ws>?)
    <!before  <[   \} \] \)   ]> >
    <term>
    $<ws2>   := (<.ws>?)
    <quant>
    $<greedy> := (<[  \? \+  ]>?)
    $<ws3>   := (<.ws>?)
    {
      if
               $$/{'quant'}   eq ''
            && $$/{'greedy'} eq ''
            && $$/{'ws1'}    eq ''
            && $$/{'ws2'}    eq ''
            && $$/{'ws3'}    eq ''
      {
          return $$/{'term'};
      }
      return {
        quant => {
            term    => $$/{'term'},
            quant   => $$/{'quant'},
            greedy  => $$/{'greedy'},
            ws1     => $$/{'ws1'},
            ws2     => $$/{'ws2'},
            ws3     => $$/{'ws3'},
        } }
    }
}

token concat {
    <quantifier>+
    {
      use v5;
        my @a = map {  $_->()  }  @{ $::_V6_MATCH_->{'quantifier'} };
        return { concat => \@a ,}  if scalar @a > 1;
        return $a[0];
      use v6;
    }
}

token conjunctive1 {
    [ <.ws>? \& <!before \& > ]?

    <concat>**{1}
    [
        \& <!before \& >  <concat>
    ]*

    {
      use v5;
        my @a = map {  $$_  }  @{ $::_V6_MATCH_->{'concat'} };
        return { conjunctive1 => \@a ,}  if scalar @a > 1;
        return $a[0];
      use v6;
    }
}

token disjunctive1 {
    [ <.ws>? \| <!before \| > ]?

    <conjunctive1>**{1}
    [
        \| <!before \| > <conjunctive1>
    ]*

    {
      use v5;
        my @a = map {  $$_  }  @{ $::_V6_MATCH_->{'conjunctive1'} };
        return { alt1 => \@a ,}  if scalar @a > 1;
        return $a[0];
      use v6;
    }
}

token conjunctive {
    [ <.ws>? \& \& ]?

    <disjunctive1>**{1}
    [
        \& \& <disjunctive1>
    ]*

    {
      use v5;
        my @a = map {  $$_  }  @{ $::_V6_MATCH_->{'disjunctive1'} };
        return { conjunctive => \@a ,}  if scalar @a > 1;
        return $a[0];
      use v6;
    }
}

token rule {
    [ <.ws>? \| \| ]?

    <conjunctive>**{1}
    [
        \| \| <conjunctive>
    ]*

    {
      use v5;
        my @a = map {  $$_  }  @{ $::_V6_MATCH_->{'conjunctive'} };
        return { alt => \@a ,}  if scalar @a > 1;
        return $a[0];
      use v6;
    }
}

token named_regex {
    ( 'token' | 'regex' | 'rule' )
    <.ws> <ident> <.ws>? '{'
        <.ws>?
        <rule>
    '}' ';'?

    { return {
            type => $$0,
            name => $$<ident>,
            ast => $$<rule>
        };
    }
}

# This is hacky, will do better later
token verbatim {
    '%{' ( [ <!before '%}'> . ]* ) '%}'
    { return {
            type => 'block',
            value => $$0
        };
    }
}

token item {
    | <verbatim>       { return $$<verbatim>; }
    | <named_regex> { return $$<named_regex>; }
}

token grammar {
    <.ws>? 'grammar' <.ws> <ident> <.ws>? ';'
    <.ws>?
    [ <item> <.ws>? ]*
    { return { $$<ident> => $<item> } }
}

token spec {
    <verbatim>?
    <grammar>*
    { return {
            block => $<verbatim>,
            'grammar' => $<grammar> }
    }
}