The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Google::ProtocolBuffers::Compiler;
use strict;
use warnings;
use Parse::RecDescent;
use Data::Dumper;
use Google::ProtocolBuffers::Constants qw/:types :labels/;
use Carp;
use Config qw/%Config/;
use File::Spec;

##
## Grammar is based on work by Alek Storm 
## http://groups.google.com/group/protobuf/browse_thread/thread/1cccfc624cd612da
## http://groups.google.com/group/protobuf/attach/33102cfc0c57d449/proto2.ebnf?part=4
## 

my $grammar = <<'END_OF_GRAMMAR';

proto       :   <skip: qr!  (?: //.*\n | \s+ )*  !x>
                ## list of top level declarations. 
                ## Skip empty declarations and ";".
                (message | extend | enum | import | package | option | service | syntax | ";")(s) /\Z/
                { $return = [ grep {ref $_} @{$item[2]} ]; }
            |   <error>

import      :   "import" <commit> strLit ";"
                { $return = [ import => $item{strLit} ]; }
                ## error? reject pair means: 
                ##  if rule was commited (i.e. "import" was found), then fail the entire parse
                ##  otherwise, just skip this production (and try another one)
            |   <error?> <reject>   

package     :   "package" <commit> qualifiedIdent ";"
                { $return = [ package => $item{qualifiedIdent} ]; }
            |   <error?> <reject>   

option      :   ## so far, options are ignored 
                "option" <commit> optionBody ";"
                { $return = '' }        
            |   <error?> <reject>

optionBody  :   qualifiedIdent "=" constant
                { $return = '' }

message     :   "message" <commit> ident messageBody
                { $return = [ message => $item{ident}, $item{messageBody} ]; }
            |   <error?> <reject>

extend      :   "extend" <commit> userType "{" ( field | group | ";" )(s?) "}"
                { $return = [extend => $item{userType}, [ grep {ref $_} @{$item[5]}] ]}
                
enum        :   "enum" <commit> ident "{" (option | enumField | ";")(s) "}"
                { $return = [ enum => $item{ident}, [grep {ref $_} @{$item[5]}] ] }
            |   <error?> <reject>

enumField   :   ident "=" intLit ";"
                { $return = [ enumField => $item{ident}, $item{intLit} ] }

service     :   ## services are ignored
                "service" <commit> ident "{" ( option | rpc | ";" )(s?) "}"
                { $return = '' }
            |   <error?> <reject>
            
rpc         :   "rpc" <commit> ident "(" userType ")" "returns" "(" userType ")" rpcOptions(?) ";"
                { $return = '' }
            |   <error?> <reject>

rpcOptions  :   "{" option(s?) "}"

messageBody :   "{" <commit> ( field | enum | message | extend | extensions | group | option | oneof | ";" )(s?) "}"
                { $return = [ grep {ref $_} @{$item[3]} ] }
            |   <error?> <reject>

group       :   label "group" <commit> ident "=" intLit messageBody
                { $return = [group => $item{label}, $item{ident}, $item{intLit}, $item{messageBody} ] }
            |   <error?> <reject>

field       :   label type ident "=" intLit fOptList(?) ";"
                { $return = [field => $item{label}, $item{type}, $item{ident}, $item{intLit}, $item[6][0] ] }

oneof       :   "oneof" <commit> ident "{" ( oneofField | ";" )(s?) "}"
                { $return = [ oneof => $item{ident}, [grep {ref $_} @{$item[5]}] ] }
            |   <error?> <reject>

oneofField  :   type ident "=" intLit fOptList(?) ";"
                { $return = [field => "optional", $item{type}, $item{ident}, $item{intLit}, $item[5][0] ] }

fOptList    :   "[" fieldOption(s? /,/) "]"
                { $return = (grep {length($_)} @{$item[2]})[0] || '' }

fieldOption :   "default" <commit> "=" constant
                { $return =  $item{constant} }
            |   optionBody 
                { $return = '' }
            |   <error?> <reject>
            
extensions  :   "extensions" <commit> extension(s /,/) ";"
                { $return = '' }
            |   <error?> <reject>

extension   :   intLit ( "to" ( intLit | "max" ) )(s?)
                { $return = '' }

label       :   "required" | "optional" | "repeated" 


type        :   "double" | "float" | "int32" | "int64" | "uint32" | "uint64"
            |   "sint32" | "sint64" | "fixed32" | "fixed64" | "sfixed32" | "sfixed64"
            |   "bool" | "string" | "bytes" | userType

userType    :   (".")(?) qualifiedIdent
                {  $return = ($item[1] && @{$item[1]}) ? ".$item[2]" : $item[2]  }

constant    :   ident 
                { $return = $item[1]; }
            |   (floatLit | intLit | strLit | boolLit)
                { $return = { value => $item[1] } }
            
ident       :   /[a-z_]\w*/i

qualifiedIdent:     
                <leftop: ident "." ident>
                { $return = join(".", @{ $item[1] })}

intLit      :   hexInt | octInt| decInt 

decInt      :   /[-+]?[1-9]\d*/
                { $return = Google::ProtocolBuffers::Compiler::get_dec_int($item[1]) }

hexInt      :   /[-+]?0[xX]([A-Fa-f0-9])+/
                { $return = Google::ProtocolBuffers::Compiler::get_hex_int($item[1]) }

octInt      :   /[-+]?0[0-7]*/
                { $return = Google::ProtocolBuffers::Compiler::get_oct_int($item[1]) }

floatLit    :   ## Make floatLit do not match integer literals,
                ## so that it doesn't take off '0' from '0xFFF' or '012' (oct).
                /[-+]?\d*\.\d+([Ee][\+-]?\d+)?/
            |   /[-+]?\d+[Ee][\+-]?\d+/
            

boolLit     :   "true"
                { $return = 1 } 
            |   "false"
                { $return = 0 }

strLit      :   /['"]/ <skip:''> ( hexEscape | octEscape | charEscape | regularChar)(s?) /['"]/
                { $return = join('', @{$item[3]}) }

regularChar :   ## all chars exept chr(0) and "\n"
                /[^\0\n'"]/ 

hexEscape   :   /\\[Xx]/ /[A-Fa-f0-9]{1,2}/
                { $return = chr(hex($item[2])) }

octEscape   :   '\\' /^0?[0-7]{1,3}/
                { $return = chr(oct("0$item[2]") & 0xFF); }

charEscape  :   /\\[abfnrtv\\'"]/
                { 
                    my $s = substr($item[1], 1, 1);
                    $return =   ($s eq 'a') ? "\a" :
                                ($s eq 'b') ? "\b" : 
                                ($s eq 'f') ? "\f" :
                                ($s eq 'n') ? "\n" :
                                ($s eq 'r') ? "\r" :
                                ($s eq 't') ? "\t" :
                                ($s eq 'v') ? "\x0b" : $s;
                }

                                 
syntax      :   "syntax" "=" strLit ## syntax = "proto2";
                { 
                    die "Unknown syntax" unless $item{strLit} eq 'proto2';
                    $return = ''; 
                }                 

END_OF_GRAMMAR

my %primitive_types = (
    "double"    => TYPE_DOUBLE,
    "float"     => TYPE_FLOAT,
    "int32"     => TYPE_INT32,
    "int64"     => TYPE_INT64,
    "uint32"    => TYPE_UINT32,
    "uint64"    => TYPE_UINT64,
    "sint32"    => TYPE_SINT32,
    "sint64"    => TYPE_SINT64,
    "fixed32"   => TYPE_FIXED32,
    "fixed64"   => TYPE_FIXED64,
    "sfixed32"  => TYPE_SFIXED32,
    "sfixed64"  => TYPE_SFIXED64,
    "bool"      => TYPE_BOOL,
    "string"    => TYPE_STRING,
    "bytes"     => TYPE_BYTES,
);

my %labels = (
    'required'  => LABEL_REQUIRED,
    'optional'  => LABEL_OPTIONAL,
    'repeated'  => LABEL_REPEATED,
);

my $has_64bit = $Config{ivsize}>=8;

sub _get_int_value {
    my $str = shift;
    my $max_pos_str = shift;
    my $max_neg_str = shift;
    my $str_to_num = shift;
    my $str_to_bigint = shift;

    my $is_negative = ($str =~/^-/);
    $str =~ s/^[+-]//;
    
    if (!$has_64bit) {
        my $l = length($str);
        if (
            !$is_negative &&
                ($l>length($max_pos_str) 
                || ($l==length($max_pos_str) && uc($str) ge uc($max_pos_str)))
            || $is_negative &&
                ( $l>length($max_neg_str) 
                || ($l==length($max_neg_str) && uc($str) ge uc($max_neg_str)))
        )
        {
            my $v = $str_to_bigint->($str); 
            return ($is_negative) ? -$v : $v;
        }
    }
    
    my $v = $str_to_num->($str);
    return ($is_negative) ? -$v : $v;
}

sub get_dec_int {
    my $str = shift;
    
    return _get_int_value(
        $str, "2147483647", "2147483648",
        sub {
            no warnings 'portable';
            return $_[0]+0;
        }, 
        sub {
            return Math::BigInt->new($_[0]);    
        }
    );
}
    
sub get_hex_int {
    my $str = shift;

    return _get_int_value(
        $str, "0x7fffffff", "0x80000000",
        sub {
            no warnings 'portable';
            return hex($_[0]);
        }, 
        sub {
            return Math::BigInt->new($_[0]);    
        }
    );
}

sub get_oct_int {
    my $str = shift;
    return _get_int_value(
        $str, "017777777777", "020000000000",
        sub {
            no warnings 'portable';
            return oct($_[0]);
        }, 
        sub {
            ## oops, Math::BigInt doesn't accept strings of octal digits,
            ## ... but accepts binary digits
            my $v = shift;
            my @oct_2_binary = qw(000 001 010 011 100 101 110 111);
            $v =~ s/(.)/$oct_2_binary[$1]/g;
            return Math::BigInt->new('0b' . $v);
        }
    );
}


sub parse {
    my $class = shift;
    my $source = shift;
    my $opts  = shift;

    my $self  = bless { opts => $opts }; 
    
    $::RD_ERRORS = 1;
    $::RD_WARN = 1;
    my $parser = Parse::RecDescent->new($grammar) or die;

    ## all top level declarations from all files (files be included)
    ## will be here
    my @parse_tree;

    my (@import_files, $text);
    if ($source->{text}) {
        $text = $source->{text};
    } elsif ($source->{file}) {
        @import_files = ('', $source->{file});
    } else {
        die;
    }

    my %already_included_files;

    while ($text || @import_files)  {
        my ($content, $filename);
        
        if ($text) {
            $content = $text;
            undef $text;
        } else {
            ## path may be relative to the path of the file, where
            ## "import" directive. Also, root dir for proto files
            ## may be specified in options
            my ($root, $path) = splice(@import_files, 0, 2);            
            $filename = $self->_find_filename($root, $path);
            next if $already_included_files{$filename}++;
            {
                my $fh;
                open $fh, $filename or die "Can't read from $filename: $!";
                local $/;
                $content = <$fh>;
                close $fh;
            }
        }
        
        my $res = $parser->proto($content);
        die "" unless defined $res;
        
        ## start each file from empty package
        push @parse_tree, [package=>''];
        foreach my $decl (@$res) {
            if ($decl->[0] eq 'import') {
                push @import_files, ($filename, $decl->[1]);
            } else {
                push @parse_tree, $decl;
            }
        }
    }
    
    ##
    ## Pass #1. 
    ## Find names of messages and enums, including nested ones.
    ## 
    my $symbol_table = Google::ProtocolBuffers::Compiler::SymbolTable->new;
    $self->{symbol_table} = $symbol_table;
    $self->collect_names('', \@parse_tree);

    ##
    ## Pass #2.
    ## Create complete descriptions of messages with extensions.
    ## For each field of a user type a fully quilified type name must be found.
    ## For each default value defined by a constant (enum), a f.q.n of enum value must be found
    ## 
    foreach my $kind (qw/message group enum oneof/) {
        foreach my $fqname ($symbol_table->lookup_names_of_kind($kind)) {
            $self->{types}->{$fqname} = { kind => $kind, fields => [], extensions => [], oneofs => [] };
        }
    }
    $self->collect_fields('', \@parse_tree);
    
    return $self->{types};
}

sub _find_filename {
    my $self = shift;
    my $base_filename = shift;
    my $path = shift;

=comment
    my $filename = File::Spec->rel2abs($path, $base_filename);
    return $filename if -e $filename;
    
    if ($self->{opts}->{include_dir}) {
        $filename = File::Spec->rel2abs($path, $self->{opts}->{include_dir});
        return $filename if -e $filename;
    }
=cut
    use Cwd; my $d = getcwd();
    
    my $filename = $path;
    return $filename if -e $filename;

    if (my $inc_dirs = $self->{opts}->{include_dir}) {
        $inc_dirs = [ $inc_dirs ] unless(ref($inc_dirs) eq 'ARRAY');
        foreach my $d (@$inc_dirs){
            $filename = File::Spec->catfile($d, $path);
            return $filename if -e $filename;
        }
    }
    die "Can't find proto file: '$path'";
}


sub collect_names {
    my $self = shift;
    my $context = shift;
    my $nodes = shift;
        
    my $symbol_table = $self->{symbol_table};
    foreach my $decl (@$nodes) {
        my $kind = $decl->[0]; ## 'message', 'extent', 'enum' etc...
        if ($kind eq 'package') {
            ## package directive just set new context, 
            ## not related to previous one
            $context = $symbol_table->set_package($decl->[1]);
        } elsif ($kind eq 'message') {
            ## message may include nested messages/enums/groups/oneofs
            my $child_context = $symbol_table->add('message' => $decl->[1], $context);
            $self->collect_names($child_context, $decl->[2]);
        } elsif ($kind eq 'enum') {
            my $child_context = $symbol_table->add('enum' => $decl->[1], $context);
            $self->collect_names($child_context, $decl->[2]);
        } elsif ($kind eq 'group') {
            ## there may be nested messages/enums/groups/oneofs etc. inside group
            ## [group => $label, $ident, $intLit, $messageBody ]            
            my $child_context = $symbol_table->add('group' => $decl->[2], $context);
            $self->collect_names($child_context, $decl->[4]);
        } elsif ($kind eq 'oneof') {
            ## OneOfs may only contain fields, we add them to both
            ## the current and oneof context
            my $child_context = $symbol_table->add('oneof' => $decl->[1], $context);
            foreach my $oneof (@{$decl->[2]}) {
                $symbol_table->add('field' => $oneof->[3], $context);
                $symbol_table->add('field' => $oneof->[3], $child_context);
            }
        } elsif ($kind eq 'extend') {
            ## extend blocks are tricky: 
            ## 1) they don't create a new scope
            ## 2) there may be a group inside extend block, and there may be everything inside the group 
            $self->collect_names($context, $decl->[2]);
        } elsif ($kind eq 'field') {
            ## we add fields into symbol table just to check their uniqueness 
            ## in several extension blocks or oneofs. Example:
            ##  .proto: 
            ##      extend A { required int32 foo = 100  };
            ##      extend B { required int32 foo = 200  }; 
            ##      // Invalid! foo is already declared!
            ##
            $symbol_table->add('field' => $decl->[3], $context);
        } elsif ($kind eq 'enumField') {
            $symbol_table->add('enum_field' => $decl->[1], $context);
        } else {
            warn $kind;
        }
    }
}

sub collect_fields {
    my $self = shift;
    my $context = shift;
    my $nodes = shift;
    my $destination_type_name = shift;
    my $is_extension = shift;
    
    my $symbol_table = $self->{symbol_table};
    foreach my $decl (@$nodes) {
        my $kind = $decl->[0]; ## 'message', 'extent', 'enum' etc...
        if ($kind eq 'package') {
            $context = $decl->[1];
        } elsif ($kind eq 'message') {
            my $child_context = ($context) ? "$context.$decl->[1]" : $decl->[1];
            $self->collect_fields($child_context, $decl->[2], $child_context);
        } elsif ($kind eq 'enum') {
            my $child_context = ($context) ? "$context.$decl->[1]" : $decl->[1];
            $self->collect_fields($child_context, $decl->[2], $child_context);
        } elsif ($kind eq 'group') {
            ## groups are tricky: they are both definition of a field and type.
            ## [group => $label, $ident, $intLit, $messageBody ]
            ## first, collect fields inside the group           
            my $child_context = ($context) ? "$context.$decl->[2]" : $decl->[2];
            $self->collect_fields($child_context, $decl->[4], $child_context);
            ## second, add the group as one field to parent (destination) type
            confess unless $destination_type_name;
            my $name;
            my $fields_list;
            if ($is_extension) {
                ## for extensions, fully quilified names of fields are used,
                ## because they may be declared anywhere - even in another package
                $fields_list = $self->{types}->{$destination_type_name}->{extensions}; 
                $name = $symbol_table->lookup('group' => $decl->[2], $context);
            } else {
                ## regualar fields are always immediate children of their type
                $fields_list = $self->{types}->{$destination_type_name}->{fields}; 
                $name = $decl->[2];
            }
            my $label = (exists $labels{$decl->[1]}) ? $labels{$decl->[1]} : die;
            my ($type_name, $kind) = $symbol_table->lookup_symbol($decl->[2], $context);
            die unless $kind eq 'group';
            my $field_number = $decl->[3];  
            push @$fields_list, [$label, $type_name, $name, $field_number];
        } elsif ($kind eq 'oneof') {
            my $child_context = ($context) ? "$context.$decl->[1]" : $decl->[1];
            $self->collect_fields($child_context, $decl->[2], $child_context);
            push @{$self->{types}->{$destination_type_name}->{oneofs}}, $child_context;
        } elsif ($kind eq 'extend') {
            ## what is the fqn of the message to be extended?
            my $destination_message = $symbol_table->lookup('message' => $decl->[1], $context);
            $self->collect_fields($context, $decl->[2], $destination_message, 1);
        } elsif ($kind eq 'field') {
            confess unless $destination_type_name;
            # $decl = ['field' => $label, $type, $ident, $item{intLit}, $item{fOptList}] }
            my $name;
            my $fields_list;
            if ($is_extension) {
                ## for extensions, fully quilified names of fields are used,
                ## because they may be declared anywhere - even in another package
                $fields_list = $self->{types}->{$destination_type_name}->{extensions}; 
                $name = $symbol_table->lookup('field' => $decl->[3], $context);
            } else {
                ## regualar fields are always immediate children of their type
                $fields_list = $self->{types}->{$destination_type_name}->{fields}; 
                $name = $decl->[3];
            }

            my $label = (exists $labels{$decl->[1]}) ? $labels{$decl->[1]} : die;

            my ($type_name, $kind);
            if (exists $primitive_types{$decl->[2]}) {
                $type_name = $primitive_types{$decl->[2]};
            } else {
                ($type_name, $kind) = $symbol_table->lookup_symbol($decl->[2], $context);
                die unless $kind eq 'message' || $kind eq 'group' || $kind eq 'enum';
            }

            my $field_number = $decl->[4];  

            my $default_value = $decl->[5];
            if ($default_value && !ref $default_value) {
            	if ($default_value eq 'true') {
            	   	$default_value = { value => 1 };
            	} elsif ($default_value eq 'false') {
            		$default_value = { value => 0 }; 
            	} else {
                    ## this default is enum value
                    ## type name must be fqn of enum type
                    die unless $kind eq 'enum';
                    $default_value = $symbol_table->lookup('enum_field' => $default_value, $type_name);
            	}
            }
            push @$fields_list, [$label, $type_name, $name, $field_number, $default_value];
        } elsif ($kind eq 'enumField') {
            confess unless $destination_type_name;
            my $fields_list = $self->{types}->{$destination_type_name}->{fields};
            push @{$fields_list}, [$decl->[1], $decl->[2]];
        } else {
            warn $kind;
        }
    }
}

package Google::ProtocolBuffers::Compiler::SymbolTable;
##
## %$self - symbol name table, descriptions of fully qualified names like Foo.Bar:
##  $names{'foo'}       = { kind => 'package' }
##  $names{'foo.Bar'}   = { kind => 'message' }
##  $names{'foo.Bar.Baz'}={ kind => 'enum',   }
##
use Data::Dumper;
use Carp;

sub new {
    my $class = shift;
    return bless {}, $class;
}

sub set_package {
    my $self = shift;
    my $package = shift;
    
    return '' unless $package;
    
    my @idents = split qr/\./, $package;
    my $name = shift @idents;
    while (1) {
        if (exists $self->{$name}) {
            die unless $self->{$name}->{kind} eq 'package';
        } else {
            $self->{$name} = {kind => 'package'}
        }
        last unless @idents;
        $name .= '.' . shift(@idents);
    }
    return $name;
}

sub _add {
    my $self = shift;
    my $kind = shift;
    my $name = shift;
    my $context = shift;
    
    ## no fully quilified names are alowed to declare (so far)
    die if $name =~ /\./;
    my $fqn;
    if ($context) {
        die "$name, $context" unless $self->{$context};
        $fqn = "$context.$name";
    } else {
        $fqn = $name;
    }

    if (exists $self->{$fqn}) {
        die "Name '$fqn' is already defined";
    } else {
        $self->{$fqn} = { kind=>$kind };
    }
    
    return $fqn;
}

sub add {
    my $self = shift;
    my $kind = shift;
    my $name = shift;
    my $context = shift;

    ## tricky: enum values are both children and siblings of enums
    if ($kind eq 'enum_field') {
        die unless $self->{$context}->{kind} eq 'enum';
        my $fqn = $self->_add($kind, $name, $context);
        $context =~ s/(^|\.)\w+$//; ## parent context
        $self->_add($kind, $name, $context);
        return $fqn;
    } else {
    	return $self->_add($kind, $name, $context);
    }
}

## input: fully or partially qualified name
## output: (fully qualified name, its kind - 'message', 'enum_field' etc.)
sub lookup_symbol {
    my $self = shift;
    my $n = shift;
    my $c = shift;
    
    my $context = $c;
    my $name = $n;
    if ($name =~ s/^\.//) {
        ## this is an fully quialified name
        if (exists $self->{$name}) {
            return ($name, $self->{$name}->{kind});
        }
    } else {
        ## relative name - look it up in the current context and up
        while (1) {
            my $fqn = ($context) ? "$context.$name" : $name;        
            if (exists $self->{$fqn}) {
                return ($fqn, $self->{$fqn}->{kind});
            }
            ## one level up
            last unless $context;
            $context =~ s/(^|\.)\w+$//; 
        }
    }
    die "Name '$name' ($c, $n) is not defined" . Data::Dumper::Dumper($self); 
}

## input: kind, fully or partially qualified name, context
## ouptut: fully qualified name
## if found kind of the name doesn't match given kind, an exception is raised
sub lookup {
    my $self = shift;
    my $kind = shift;
    my $name = shift;
    my $context = shift;
    
    my ($fqn, $k) = $self->lookup_symbol($name, $context);
    unless ($kind eq $k) {
    	confess "Error: while looking for '$kind' named '$name' in '$context', a '$k' named '$fqn' was found";
    }
    return $fqn;
}

## returns list of all fully qualified name of a given kind
sub lookup_names_of_kind {
    my $self = shift;
    my $kind = shift;
    
    return grep { $self->{$_}->{kind} eq $kind } keys %$self;   
}

1;