lib/Parse/FSM.pm - metacpan.org

# $Id: FSM.pm,v 1.5 2013/01/01 22:04:43 Paulo Exp $

package Parse::FSM;

#------------------------------------------------------------------------------

=head1 NAME

Parse::FSM - Deterministic top-down parser based on a Finite State Machine

=cut

#------------------------------------------------------------------------------

use strict;
use warnings;

use Carp; our @CARP_NOT = ('Parse::FSM');
use Data::Dump 'dump';
use Text::Template 'fill_in_string';
use File::Slurp;

our $VERSION = '1.04';

#------------------------------------------------------------------------------

=head1 SYNOPSIS

  use Parse::FSM;
  $fsm = Parse::FSM->new;
  
  $fsm->prolog($text);
  $fsm->epilog($text);
  $fsm->add_rule($name, @elems, $action);
  $fsm->start_rule($name);
  
  $fsm->parse_grammar($text);

  $fsm->write_module($module);
  $fsm->write_module($module, $file);
  
  $parser = $fsm->parser; # isa Parse::FSM::Driver
  $parser->input(\&lexer);
  $result = $parser->parse;
  
  # script
  perl -MParse::FSM - Grammar.yp Parser::Module
  perl -MParse::FSM - Grammar.yp Parser::Module lib\Parser\Module.pm

=head1 DESCRIPTION

This module compiles the Finite State Machine used by the
L<Parse::FSM::Driver|Parse::FSM::Driver> parser module.

It can be used by by a sequence of C<add_rule> calls, or by parsing a yacc-like
grammar in one go with C<parse_grammar>.

Can be used as a script to generate a module from a grammar file.

The result of compiling the parser can be used immediately by retrieving the 
C<parser> object, or a pre-compiled module can be written to disk by
C<write_module>. This module can then be used by the client code of the parser.

As usual in top-down parsers, left recursion is not supported 
and generates an infinite loop. This parser is deterministic and does not implement backtracking.

=head1 METHODS - SETUP

=head2 new

Creates a new object.

=cut

#------------------------------------------------------------------------------
use Class::XSAccessor {
	constructor => '_init',
	accessors => [
		'_tree',		# parse tree
						# Contains nested HASH tables with the decision tree
						# used during parsing.
						# Each node maps:
						#	token      => next node / string with action code
						#	[subrule]  => next node / string with action code
						#	[subrule]? => next node / string with action code
						#	[subrule]* => next node / string with action code
						#	__else__   => next node / string with action code
						# The first level are the rule names.

		'_state_table',	# ARRAY that maps each state ID to the corresponding
						# HASH table from tree.
						# Copied to the generated parser module.
		
		'_action',		# map func text => [ sub name, sub text ]
		
		'start_rule',	# name start rule
		'prolog',		# code to include near the beginning of the file
		'epilog',		# code to include at the end of the file
		'_names',		# keep all generated names up to now, to be able to 
						# create unique ones
	],
};

#------------------------------------------------------------------------------
sub new {
	my($class) = @_;
	return $class->_init(_tree => {}, _state_table => [], _action => {},
						 _names => {});
}

#------------------------------------------------------------------------------
# create a new unique name (for actions, sub-rules)
sub _unique_name {
	my($self, $name) = @_;
	my $id = 1;
	while (exists $self->_names->{$name.$id}) {
		$id++;
	}
	$self->_names->{$name.$id}++;
	return $name.$id;
}		

#------------------------------------------------------------------------------

=head1 METHODS - BUILD GRAMMAR

=head2 start_rule

Name of the grammar start rule. It defaults to the first rule added by C<add_rule>.

=head2 prolog, epilog

Perl code to include in the generated module near the start of the generated
module and near the end of it.

=head2 add_rule

Adds one rule to the parser. 

  $fsm->add_rule($name, @elems, $action);

C<$name> is the name of the rule, i.e. the syntatic object recognized
by the rule. 

C<@elems> is the list of elements in sequence needed to recognize this rule.
Each element can be one of:

=over 4

=item *

A string that will match with that token type from the lexer. 

The empty string is used to match the end of input and should 
be present in the grammar to force the parser 
to accept all the input;

=item *

An array refernce of a list of all possible tokens to accept at this position.

=item *

A subrule name inside square brackets, optionally followed by a 
repetion character, that asks the parser to recursively descend 
to match that subrule at the current input location.

The accepted forms are:

C<[term]> - recurse to the term rule;

C<[term]?> - term is optional;

C<[term]*> - accept zero or more terms;

C<[term]+> - accept one or more terms;

C<[term]E<lt>+,E<gt>> - accept one or more terms separated by commas, 
any token type can be used instead of the comma;

=back

C<$action> is the Perl text of the action executed when the rule is recognized,
i.e. all elements were found in sequence. 

It has to be enclosed in brackets C<{}>, and can use the following lexical 
variables, that are declared by the generated code:

=over 4

=item *

C<$self> : object pointer;

=item *

C<@item> : values of all the tokens or rules identified in this rule. The subrule
call with repetions return an array reference containing all the found items
in the subrule;

=back

=cut

#------------------------------------------------------------------------------
# add_rule
# Args:
#	rule name
#	list of : 	'[rule]' '[rule]*' '[rule]?' '[rule]+' '[rule]<+SEP>' 	# subrules
#				token													# tokens
#	action :	'{ CODE }'
sub add_rule {
	my($self, $rule_name, @elems) = @_;
	my $action = pop(@elems);

	@elems or croak "missing arguments";
	$rule_name =~ /^\w+$/ or croak "invalid rule name ".dump($rule_name);
	
	# check for array-ref @elem and recurse for all alternatives
	for my $i (0 .. $#elems) {
		if (ref($elems[$i])) {		# isa 'ARRAY', others cause run-time error
			for (@{$elems[$i]}) {
				$self->add_rule($rule_name, 
								@elems[0 .. $i-1], $_, @elems[$i+1 .. $#elems],
								$action);
			}
			return;
		}
	}
	
	$self->_check_start_rule($rule_name);

	# load the tree
	my $tree = $self->_tree;
	$tree = $self->_add_tree_node($tree, $rule_name);	# load rule name
	
	my $comment = "$rule_name :";
	
	while (@elems) {
		my $elem = shift @elems;
		
		# handle subrule calls with quantifiers
		# check if recursing for _add_list_rule
		if ($rule_name !~ /^_lst_/ &&
			$elem =~ /^ \[ .* \] /x) {
			$elem = $self->_add_list_rule($elem);
		}
		
		$tree->{__comment__} = $comment;		# way up to this state
		
		$comment .= " ".($elem =~ /^\[/ ? $elem : dump($elem));
		
		if (@elems) {				# not a leaf node
			croak "leaf and node at ($comment)" 
				if (exists($tree->{$elem}) && ref($tree->{$elem}) ne 'HASH');
			$tree = $self->_add_tree_node($tree, $elem);	# load token
		}
		else {						# leaf node
			croak "leaf not unique at ($comment)"
				if (exists($tree->{$elem}));
			$self->_add_tree_node($tree, $elem);			# create node
			$tree->{$elem} = $self->_add_action($action, $rule_name, $comment);
		}
	}
	
	return;
}

#------------------------------------------------------------------------------
# add a list subrule, get passed a string '[subrule]*'
sub _add_list_rule {
	my($self, $elem) = @_; 
	
	$elem =~ /^ \[ (\w+) \] ( [?*+] | <\+.*> )? $/x
		or croak "invalid subrule call $elem";
	my($subrule, $quant) = ($1, $2);
	
	return "[$subrule]" unless $quant;		# subrule without quatifier
	
	# create a list subrule, so that the result of the repetion is returned
	# as an array reference
	my $list_subrule = $self->_unique_name("_lst_".$subrule);
	
	if ($quant eq '*' || $quant eq '?') {
		$self->add_rule($list_subrule, "[$subrule]$quant", 
						 '{ return \@item }');
	}
	elsif ($quant eq '+') {					# A+ -> A A*
		$self->add_rule($list_subrule, "[$subrule]", "[$subrule]*", 
						 '{ return \@item }');
	}
	elsif ($quant =~ /^< \+ (.*) >$/x) {	# A<+;> -> A Ac* ; Ac : ';' A
		my $separator = $1;
		my $list_subrule_cont = $self->_unique_name("_lst_".$subrule);
		
		# Ac : ';' A
		$self->add_rule($list_subrule_cont, $separator, "[$subrule]",
						 '{ return $item[1] }');
						 
		# A Ac*
		$self->add_rule($list_subrule, "[$subrule]", "[$list_subrule_cont]*",
						 '{ return \@item }');
	}
	else {
		die; # not reached
	}
	
	return "[$list_subrule]";
}

#------------------------------------------------------------------------------
# add a tree node and create a new state
sub _add_tree_node {
	my($self, $tree, $elem) = @_;
	
	$tree->{$elem} ||= {};
	
	# new state?
	if (! exists $tree->{__state__}) {
		my $id = scalar(@{$self->_state_table});
		$tree->{__state__} = $id;
		$self->_state_table->[$id] = $tree;
	}
	
	return $tree->{$elem};
}

#------------------------------------------------------------------------------
# define start rule, except if starting with '_' (internal)
sub _check_start_rule {
	my($self, $rule_name) = @_;
	
	if (! defined $self->start_rule && $rule_name =~ /^[a-z]/i) {
		$self->start_rule($rule_name);	# start rule is first defined rule
	}
	
	return;
}

#------------------------------------------------------------------------------
# _add_action()
#	Create a new action or re-use an existing one. An action has to start by 
#	'{'; a new name is created and a reference to the name is 
#	returned : "\&_action_RULE"
sub _add_action {
	my($self, $action, $rule_name, $comment) = @_;
	
	# remove braces
	$action =~ s/ \A \s* \{ \s* (.*?) \s* \} \s* \z /$1/xs 
		or croak "action must be enclosed in {}";

	# reuse an existing action, if any
	(my $cannon_action = $action) =~ s/\s+//g;
	if (!$self->_action->{$cannon_action}) {
		my $action_name = $self->_unique_name("_act_".$rule_name);

		# reduce indentation
		for ($action) {
			my($lead_space) = /^(\t+)/m;
			$lead_space and s/^$lead_space/\t/gm;
		}

		$action = 
			"# $comment\n".
			"sub $action_name {".
			($action ne '' ? "\n\tmy(\$self, \@item) = \@_;\n\t" : "").
			$action.
			"\n}\n\n";

		$self->_action->{$cannon_action} = [ $action_name, $action ];
	}
	else {
		# append this comment
		$self->_action->{$cannon_action}[1] =~ s/^(sub)/# $comment\n$1/m;
	}
	
	return "\\&".$self->_action->{$cannon_action}[0];
}	

#------------------------------------------------------------------------------
# compute the FSM machine
#
# expand [rule] calls into start_set(rule) => [ rule_id, next_state ]
#	Search for all sub-rule calls, and add each of the first tokens of the subrule
#	to the call. Repeat until no more rules added, to cope with follow sets being
# 	computed after being looked up
# creates FSM loops for the constructs:
#	A -> B?
# 	A -> B*
sub _compute_fsm {
	my($self) = @_;

	# repeat until no more follow tokens added
	# Example : A B[?*] C
	my $changed;
	do {
		$changed = 0;
		
		# check all states in turn
		for my $state (@{$self->_state_table}) {
			while (my($token, $next_state) = each %$state) {
				next unless my($subrule_name, $quant) = 
						$token =~ /^ \[ (.*) \] ( [?*] )? $/x;

				my $next_state_text = ref($next_state) eq 'HASH' ? 
											$next_state->{__state__} : 
											$next_state;
				
				my $subrule = $self->_tree->{$subrule_name} 
					or croak "rule $subrule_name not found";
				ref($subrule) eq 'HASH' or die;
				
				# call subrule on each of the subrule follow set
				# Example : add all 'follow(B) -> call B' to current rule
				for my $subrule_key (keys %$subrule) {
					next if $subrule_key =~ /^(__(comment|state)__|\[.*\][?*]?)$/;
					my $text = "[ ".$subrule->{__state__}.", ".
									(($quant||"") eq '*' ? 
											$state->{__state__} :	# loop on a '*'
											$next_state_text	# else, next state
									)." ]";
					if ($state->{$subrule_key}) {
						die if $state->{$subrule_key} ne $text;
					}
					else {
						$state->{$subrule_key} = $text;								
						$changed++;
					}
				}
				
				# call next rule on the next rule follow set
				# Example : add all 'follow(C) -> end' to end current rule
				if (defined($quant)) {
					if ($state->{__else__}) {
						die if $state->{__else__} ne $next_state_text;
					}
					else {
						$state->{__else__} = $next_state_text;
						$changed++;
					}
				}
			}		
		}
	} while ($changed);
	
	return;
}

#------------------------------------------------------------------------------

=head2 parse_grammar

Parses the given grammar text and adds to the parser. Example grammar follows:

  {
    # prolog
    use MyLibrary;
  }
  
  main   : (number | name)+ <eof> ;
  number : 'NUMBER' { $item[0][1] } ; # comment
  name   : 'NAME'   { $item[0][1] } ; # comment
  
  expr   : <list:    number '+' number > ;
  
  <start: main >
  
  {
    # epilog
    sub util_method {...}
  }

=over 4

=item prolog

If the text contains a code block surronded by braces before the first rule
definition, the text is copied without the external braces to the prolog
of generated module.

=item epilog

If the text contains a code block surronded by braces after the last rule
definition, the text is copied without the external braces to the epilog
of generated module.

=item statements

Statement are either rule definitions of directives and end with a 
semi-colon C<;>. Comments are as in Perl, from a hash C<#> sign to 
the end of the line.

=item rule

A rule defines one sentence to match in the grammar. The first rule defined
is the default start rule, i.e. the rule parsed by default on the input.
A rule name must start with a letter and contain only letters,
digits and the underscore character.

The rule definition follows after a colon and is composed of a sequence 
of tokens (quoted strings) and sub-rules, to match in sequence. The rule matches 
when all the tokens and sub-rules in the definition match in sequence.

The top level rule should end with C<E<lt>eofE<gt>> to make sure all input
is parsed.

The rule can define several alternative definitions separated by '|'.

The rule definition finishes with a semi-colon ';'.

A rule can call an anonymous sub-rule eclosed in parentheses.

=item action

The last item in the rule definition is a text delimited by {} with the code
to execute when the rule is matched. The code can use $self to refer to the 
Parser object, and @item to refer to the values of each of the tokens and 
sub-rules matched. The return value from the code defines the value of the
rule, passed to the upper level rule, or returned as the parse result.

If no action is supplied, a default action returns an array reference with 
the result of all tokens and sub-rules of the matched sentence.

=item quantifiers

Every token or sub-rule can be followed by a repetition specification: 
'?' (zero or one), '*' (zero or more), '+' (one or more), 
or '<+,>' (comma-separated list, comma can be replaced by any token).

=item directives

Directives are written with angle brackets.

=over 4

=item <eof>

Can be used in a rule instead of the empty string to represent the end of input.

=item <list: RULE TOKEN RULE >

Shortcut for creating lists of operators separated by tokens, 
returns the list of rule and token values.

=item <start: START_RULE >

Defines the start rule of the grammar. By default the first
defined rule is the start rule; use C<E<lt>start:E<gt>> to override that.

=back

=back

=cut

#------------------------------------------------------------------------------
sub parse_grammar {
	my($self, $text) = @_;

	# need to postpone load of Parse::FSM::Parser, as Parse::FSM is used by
	# the script that creates Parse::FSM::Parser
	eval 'use Parse::FSM::Parser'; $@ and die; ## no critic
	
	my $parser = Parse::FSM::Parser->new;
	$parser->user->{fsm} = $self;
	eval {
		$parser->from($text);			# setup lexer
		$parser->parse;
	};
	$@ and do { $@ =~ s/\s+\z//; croak $@; };
	
	return;
}

#------------------------------------------------------------------------------

=head1 METHODS - USE PARSER

=head2 parser

Computes the Finite State Machine to execute the parser and returns a 
L<Parse::FSM::Driver|Parse::FSM::Driver> object that implements the parser.

Usefull to build the parser and execute it in the same
program, but with the run-time penalty of the time to setup the state tables.

=cut

#------------------------------------------------------------------------------
sub parser {
	my($self) = @_;
	our $name ||= 'Parser00000'; $name++;		# new module on each call
	
	my $text = $self->_module_text($name, "-");
	eval $text;		## no critic
	$@ and die $@;

	my $parser = $name->new;
	
	return $parser;
}
#------------------------------------------------------------------------------

=head2 write_module

Receives as input the module name and the output file name
and writes the parser module. 

The file name is optional; if not supplied is computed from the 
module name by replacing C<::> by C</> and appending C<.pm>, 
e.g. C<Parse/Module.pm>.

The generated code includes C<parse_XXX> functions for every rule 
C<XXX> found in the grammar, as a short-cut for calling C<parse('XXX')>.

=cut

#------------------------------------------------------------------------------
sub write_module {
	my($self, $name, $file) = @_;
	
	$name or croak "name not defined";

	# build file name from module name
	unless (defined $file) {
		$file = $name;
		$file =~ s/::/\//g;
		$file .= ".pm";
	}
	
	my $text = $self->_module_text($name, $file);
	write_file($file, {atomic => 1}, $text);

	return;
}

#------------------------------------------------------------------------------
# template code for grammmar parser
my $TEMPLATE = <<'END_TEMPLATE';
# $Id: FSM.pm,v 1.5 2013/01/01 22:04:43 Paulo Exp $
# Parser generated by Parse::FSM

package # hide from CPAN indexer
  <% $name %>;

use strict;
use warnings;

use Parse::FSM::Driver; our @ISA = ('Parse::FSM::Driver');

<% $prolog %>

<% $table %>

sub new {
	my($class, %args) = @_;
	return $class->SUPER::new(
				_state_table	=> \@state_table,
				_start_state	=> $start_state,
				%args,
	);
}

<% $epilog %>

1;
END_TEMPLATE

#------------------------------------------------------------------------------
# module text
sub _module_text {
	my($self, $name, $file) = @_;

	$name or croak "name not defined";
	$file or croak "file not defined";

	my $table = $self->_table_dump;
	
	my @template_args = (
		DELIMITERS 	=> [ '<%', '%>' ],
		HASH 		=> {
			prolog	=> $self->prolog || "",
			epilog	=> $self->epilog || "",
			name	=> $name,
			table	=> $table,
		},
	);
	return fill_in_string($TEMPLATE, @template_args);
}

#------------------------------------------------------------------------------
# dump the state table
sub _table_dump {
	my($self) = @_;

	$self->_compute_fsm;

	#print dump($self),"\n" if $ENV{DEBUG};

	my $start_state = 0;
	if (defined($self->start_rule) && exists($self->_tree->{$self->start_rule})) {
		$start_state = $self->_tree->{$self->start_rule}{__state__};
	}
	else {
		croak "start state not found";
	}
	
	my $ret = 'my $start_state = '.$start_state.";\n".
			  'my @state_table = ('."\n";
	my $width;
	for my $i (0 .. $#{$self->_state_table}) {
		$ret .= "\t# [$i] " . 
				($self->_state_table->[$i]{__comment__} || "") . 
				"\n" .
				"\t{ "; 
		$width = 2;
		
		for my $key (sort keys %{$self->_state_table->[$i]}) {
			next if $key =~ /^(__(comment|state)__|\[.*\][?*]?)$/;
			
			my $value = $self->_state_table->[$i]{$key};
			$value = $value->{__state__} if ref($value) eq 'HASH';
			
			my $key_text = ($key =~ /^\w+$/) ? $key : dump($key);
			
			my $item_text = "$key_text => $value, ";
			if (($width += length($item_text)) > 72) {
				$ret .= "\n\t  ";
				$width = 2 + length($item_text);
			}			
			$ret .= $item_text;
		}
		
		$ret .= "},\n\n";
	}
	$ret .= ");\n\n";
	
	# dump action
	for (sort {$a->[0] cmp $b->[0]} values %{$self->_action}) {
		$ret .= $_->[1];
	}
	
	# dump parse_XXX functions
	my $length = 1;
	while (my($name, $rule) = each %{$self->_tree}) {
		next unless $name =~ /^[a-z]/i;
		$length = length($name) if length($name) > $length;
	}
	while (my($name, $rule) = each %{$self->_tree}) {
		next unless $name =~ /^[a-z]/i;
		$ret .= 
			"sub parse_$name". 
			(" " x ($length - length($name))).
			" { return shift->_parse($rule->{__state__}) }\n";
	}
		
	return $ret;
}

#------------------------------------------------------------------------------

=head1 PRE-COMPILING THE GRAMMAR

The setup of the parsing tables and creating the parsing module may take up
considerable time. Therefore it is usefull to separate the parser generation 
phase from the parsing phase.

=head2 precompile

A parser module can be created from a yacc-like grammar file by the 
following command. The generated file (last parameter) is optional; if not
supplied is computed from the module name by replacing C<::> by C</> and
appending C<.pm>, e.g. C<Parse/Module.pm>:

  perl -MParse::FSM - Grammar.yp Parser::Module
  perl -MParse::FSM - Grammar.yp Parser::Module lib\Parser\Module.pm

This is equivalent to the following Perl program:

  #!perl
  use Parse::FSM;
  Parse::FSM->precompile(@ARGV);

The class method C<precompile> receives as argumens the grammar file, the 
generated module name and an optional file name, and creates the parsing module.

=cut

#------------------------------------------------------------------------------
sub precompile {
	my($class, $grammar, $module, $file) = @_;

	my $self = $class->new;
	my $text = read_file($grammar);
	$self->parse_grammar($text);
	$self->write_module($module, $file);
	
	return;
}

#------------------------------------------------------------------------------
# startup code for pre-compiler
# borrowed from Parse::RecDescent
sub import {
    local *_die = sub { warn @_, "\n"; exit 1; };

    my($package, $file, $line) = caller;
    if (substr($file,0,1) eq '-' && $line == 0) {
        _die("Usage: perl -MParse::FSM - GRAMMAR MODULE::NAME [MODULE/NAME.pm]")
            unless @ARGV == 2 || @ARGV == 3;

        my($grammar, $module, $file) = @ARGV;
		eval {
			Parse::FSM->precompile($grammar, $module, $file);
		};
		$@ and _die($@);

		exit 0;
	}
	
	return;
}

#------------------------------------------------------------------------------


=head1 AUTHOR

Paulo Custodio, C<< <pscust at cpan.org> >>

=head1 ACKNOWLEDGEMENTS

Calling pre-compiler on C<import> 
borrowed from L<Parse::RecDescent|Parse::RecDescent>.

=head1 BUGS and FEEDBACK

Please report any bugs or feature requests through the web interface at
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Parse-FSM>.  

=head1 LICENSE and COPYRIGHT

Copyright (C) 2010-2011 Paulo Custodio.

This program is free software; you can redistribute it and/or modify it
under the terms of either: the GNU General Public License as published
by the Free Software Foundation; or the Artistic License.

See http://dev.perl.org/licenses/ for more information.

=cut

1; # End of Parse::FSM
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)