The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# $Id: Lexer.pm,v 1.7 2013/01/06 23:07:09 Paulo Exp $

package Parse::FSM::Lexer;

#------------------------------------------------------------------------------

=head1 NAME

Parse::FSM::Lexer - Companion Lexer for the Parse::FSM parser

=cut

#------------------------------------------------------------------------------

use 5.010;
use strict;
use warnings;

use File::Spec;
use Data::Dump 'dump';

our $VERSION = '1.06';

#------------------------------------------------------------------------------

=head1 SYNOPSIS

  use Parse::FSM::Lexer;
  $lex = Parse::FSM::Lexer->new;
  $lex = Parse::FSM::Lexer->new(@files);
  
  $lex->add_path(@dirs); @dirs = $lex->path;
  $full_path = $lex->path_search($file);

  $lex->from_file($filename);
  $lex->from_list(@input); 
  $lex->from_list(sub {});
  
  $lex->get_token;
  
  $lex->error(@message); 
  $lex->file; 
  $lex->line_nr;
  
  # in a nearby piece of code
  use MyParser; # isa Parse::FSM::Driver;
  my $parser = MyParser->new;
  $parser->input(sub {$lex->get_token});
  eval {$parser->parse}; $@ and $lex->error($@);

=head1 DESCRIPTION

This module implements a generic tokenizer that can be used by
L<Parse::FSM|Parse::FSM> parsers, and can also be used stand alone 
independently of the parser. 

It supports recursive file includes and takes track of current file name
and line number. It keeps the path of search directories to search for
input files.

The C<get_token> method can be called by the C<input> method of the parser
to retrieves the next input token to parse.

The module can be used directly if the supplied tokenizer is enough for the
application, but usually a derived class has to be written implementing a 
custom version of the C<tokenizer> method.

=head1 METHODS - SETUP

=head2 new

Creates a new object. If an argument list is given, calls C<from_file>
for each of the file starting from the last, so that the files are 
read in the given order.

=cut

#------------------------------------------------------------------------------
use constant INPUT 			=> 0;	# input stream, code ref
use constant FILE			=> 1;	# name of the input file, undef for list
use constant LINE_NR		=> 2;	# current input line number
use constant LINE_INC 		=> 3;	# increment to next line number
use constant SAW_NL 		=> 4;	# true if saw a newline before
									# used to increment LINE_INC on next token
use constant TEXT 			=> 5;	# line text being lexed

use constant STACK 			=> 6;	# stack of previous contexts for recursive
									# includes, saves
									# [input, file, line_nr, line_inc, saw_nl, 
									#		 text, pos(text)]
use constant PATH			=> 7;	# path of search directories

# only limited accessors
use Class::XSAccessor::Array {
	accessors => {
		file		=> FILE, 
		line_nr		=> LINE_NR,
		line_inc	=> LINE_INC,
	}
};

#------------------------------------------------------------------------------
sub new {
	my($class, @files) = @_;
	my $self = bless [], $class;
	$self->[STACK] = [];
	$self->[PATH]  = [];
	$self->from_file($_) for reverse @files;
	return $self;
}

#------------------------------------------------------------------------------
# push context for include file
sub _push_context {
	my($self) = @_;
	push @{$self->[STACK]},
		 [ @{$self}[ 0 .. STACK - 1 ], pos($self->[TEXT]) ];
	return;
}

#------------------------------------------------------------------------------
# pop context
sub _pop_context {
	my($self) = @_;
	( @{$self}[ 0 .. STACK - 1 ], pos($self->[TEXT]) )
		= @{ pop(@{$self->[STACK]}) || [] };
	return;
}

#------------------------------------------------------------------------------

=head1 METHODS - SEARCH PATH FOR FILES

=head2 path

Returns the list of directories to search in sequence for source files.

=cut

#------------------------------------------------------------------------------
sub path { @{$_[0][PATH]} } ## no critic
#------------------------------------------------------------------------------

=head2 add_path

Adds the given directories to the path searched for include files.

=cut

#------------------------------------------------------------------------------
sub add_path {
	my($self, @dirs) = @_;
	push @{$self->[PATH]}, @dirs;
}
#------------------------------------------------------------------------------

=head2 path_search

Searches for the given file name in the C<path> created by C<add_path>, returns 
the first full path name where the file can be found.

Returns the given input file name unchanged if:

=over 4

=item *

the file is found in the current directory; or 

=item *

the file is not found in any of the C<path> directories.

=back

=cut

#------------------------------------------------------------------------------
sub path_search {
	my($self, $file) = @_;
	
	return $file if -f $file;	# found
	
	for my $dir (@{$self->[PATH]}) {
		my $full_path = File::Spec->catfile($dir, $file);
		return $full_path if -f $full_path;
	}
	
	return $file;				# not found
}
#------------------------------------------------------------------------------

=head1 METHODS - INPUT STREAM

=head2 from_file

Saves the current input context, searches for the given input file name 
in the C<path>, opens the file and sets-up the object to read
each line in sequence. At the end of the 
file input resumes to the place where it was when C<from_file> was called.

Dies if the input file cannot be read, or if a file is
included recursively, to avoid an infinite include loop.

=cut

#------------------------------------------------------------------------------
sub from_file {
	my($self, $file) = @_;
	
	# search include path
	$file = $self->path_search($file);
	
	# check for include loop
	if (grep {($_->[FILE] // "") eq $file} @{$self->[STACK]}) {
		die "#include loop\n";
	}
	
	# open the file
	open(my $fh, "<", $file) or die "Error opening $file: $!\n";
		
	# create a new iterator to read file lines
	my $input = sub {
		$fh or return;
		my $line = <$fh>;
		return $line if defined $line;
		$fh = undef;		# free handle when file ends
		return;
	};
	$self->from_list($input);
	$self->[FILE] = $file;
	
	return;
}
#------------------------------------------------------------------------------

=head2 from_list

Saves the current input context and sets-up the object to read each element 
of the passed input list. Each element either a text string 
or a code reference of an iterator that returns text strings. 
The iterator returns C<undef> at the end of input.

=cut

#------------------------------------------------------------------------------
# input from text string (if scalar) or iterator (if CODE ref)
sub from_list {
	my($self, @input) = @_;
	
	# save previous context
	$self->_push_context if defined $self->[INPUT];
	
	# iterator
	my $input = sub {
		while (1) {
			@input or return;				# end of input
			for ($input[0]) {
				if (! ref $_) {
					return shift @input;	# scalar -> return it
				}
				else {						# has to be a CODE ref
					my $element = $_->();
					if (defined $element) {	# iterator returned something
						return $element;	
					}
					else {					# end of iterator
						shift @input;		# continue loop
					}
				}
			}
		}
	};
	
	# initialize
	@{$self}[ INPUT,  FILE,  LINE_NR, LINE_INC, SAW_NL, TEXT  ] 
		  = ( $input, undef, 0,       1,        1,      undef );
	
	return;
}
#------------------------------------------------------------------------------

=head1 METHODS - INPUT

=head2 get_token

Retrieves the next token from the input as an array reference containing
token type and token value. 

Returns C<undef> on end of input.

=head2 tokenizer

Method responsible to match the next token from the given input string.

This method can be overriden by a child class in order to implement a different
set ot tokens to be retrieved from the input.

It is implemented with features from the Perl 5.010 regex engine:

=over 4

=item *

one big regex with C</\G.../gc> to match from where the
last match ended; the string to match is passed as a scalar reference, so that
the position of last match C<pos()> is preserved;

=item *

one sequence of C<(?:...|...)> alternations for each token to be matched;

=item *

using C<(?E<gt>...)> for each token to make sure there is no
backtracking;

=item *

using capturing parentheses and embedded code evaluation 
C<(?{ [TYPE =E<gt> $^N] })> to return the token value
from the regex match;

=item *

using C<$^R> as the value of the matched token; 

As the regex engine is not
reentrant, any operation that may call another regex match 
(e.g. recursive file include) cannot be done inside 
the C<(?{ ... })> code block, and is done after the regex match by checking the 
C<$^R> for special tokens.

=item *

using C<undef> as the return of C<$^R> to ignore a token, e.g. white space.

=back

The default tokenizer recognizes and returns the following token types:

=over 4

=item [STR => $value]

Perl-like single or double quoted string, C<$value> contains the string 
without the quotes and with any backslash escapes resolved. 

The string cannot span multiple input lines.

=item [NUM => $value]

Perl-like integer in decimal, hexadecimal, octal or binary notation, 
C<$value> contains decimal value of the integer.

=item [NAME => $name]

Perl-like identifier name, i.e. word starting with a letter or underscore and 
followed by letters, underscores or digits.

=item [$token => $token]

All other characters except white space are returned in the form 
C<[$token=E<gt>$token]>, where C<$token> is a single character or one 
of the following composed tokens: << >> == != >= <=

=item white space

All white space is ignored, i.e. the tokenizer returns C<undef>.

=item [INCLUDE => $file]

Returned when a C<#include> statement is recognized, causes the lexer to
recursively include the file at the current input stream location.

=item [INPUT_POS => $file, $line_nr, $line_inc]

Returned when a C<#line> statement is recognized, causes the lexer to
set the current input location to the given C<$file>, C<$line_nr> and 
C<$line_inc>. 

=item [ERROR => $message]

Causes the lexer to call C<error> with the given error message, can be 
used when the input cannot be tokenized.

=back

=cut

#------------------------------------------------------------------------------
# get the next line from input, set TEXT, return true
sub _readline {
	my($self) = @_;
	
	while (1) {
		my $input = $self->[INPUT] or return;		# no input, return false
		if ( defined( $self->[TEXT] = $input->() ) ) {
			pos($self->[TEXT]) = 0;
			last;
		}
		else {
			$self->_pop_context;					# pop and continue
		}
	}
	return 1;
}

#------------------------------------------------------------------------------
# get next token as [TYPE => VALUE], undef on end of input
sub get_token {
	my($self) = @_;

	LINE:
	while (1) {
		# read line
		if (! defined $self->[TEXT]) {
			$self->_readline or return;			# end of input
		}
		
		# return tokens
		while ( (my $start_pos = pos($self->[TEXT]))
				< length($self->[TEXT])
			  ) {	
			# increment line number if last token included newlines
			# need to retest after each token
			if ($self->[SAW_NL]) {
				$self->[LINE_NR] += $self->[SAW_NL] * $self->[LINE_INC];
				undef $self->[SAW_NL];
			}
		
			# read next token
			my $token = $self->tokenizer(\($self->[TEXT]));
			
			# check for newlines
			my $end_pos = pos($self->[TEXT]);
			$self->[SAW_NL] += 
				substr($self->[TEXT], $start_pos, $end_pos - $start_pos) 
					=~ tr/\n/\n/;
			
			# check for special tokens
			next unless defined $token;
			
			if ($token->[0] eq 'INCLUDE') {
				eval { $self->from_file($token->[1]) };
				$self->error($@) if $@;
				next LINE;
			}
			elsif ($token->[0] eq 'INPUT_POS') {
				@{$self}[ SAW_NL, FILE, LINE_NR, LINE_INC ] =
						( undef,  @{$token}[1 .. $#$token] );
			}
			elsif ($token->[0] eq 'ERROR') {
				$self->error($token->[1]);
			}
			else {
				return $token;
			}
		}
		# end of line
		undef $self->[TEXT];
	}
}

#------------------------------------------------------------------------------
# get next token as [TYPE => VALUE] from the given string reference
# return undef to ignore a token
sub tokenizer {
	my($self, $rtext) = @_;
	our $LINE_NR; local $LINE_NR;
	
	$$rtext =~ m{\G
		(?:
			# #include
			(?> ^ (?&SP)* \# include (?&SP)*
				(?:	\' ( [^\'\n]+ ) \' 	(?{ [INCLUDE => $^N] }) 
				|	\" ( [^\"\n]+ ) \" 	(?{ [INCLUDE => $^N] }) 
				|	 < ( [^>\n]+  )  > 	(?{ [INCLUDE => $^N] }) 
				|	   ( \S+      )		(?{ [INCLUDE => $^N] }) 
				|						(?{ [ERROR => 
											 "#include expects a file name"] })
				)
				.* \n?					# eat newline
			)
		
			# #line
		|	(?> ^ (?&SP)* \# line (?&SP)+ 
					(\d+) (?&SP)+ 		(?{ $LINE_NR = $^N })
					\"? ([^\"\n]+) \"?	(?{ [INPUT_POS => $^N, $LINE_NR, 1] })
				.* \n?					# eat newline
			)
			
			# other #-lines - ignore
		|	(?> ^ (?&SP)* \# .* \n?		(?{ undef }) 
			)
		
			# white space
		|	(?> \s+						(?{ undef }) 
			)
			
			# string
		|	(?>	( \" [^\\\"]* (?: \\. [^\\\"]* )* \" )
										(?{ [STR => eval($^N)] })
			)
		|	(?>	( \' [^\\\']* (?: \\. [^\\\']* )* \' )
										(?{ [STR => eval($^N)] })
			)
			
			# number
		|	(?> 0x ( [0-9a-f]+ ) \b 	(?{ [NUM => hex($^N)] }) 
			)
		|	(?> 0b ( [01]+ ) \b			(?{ [NUM => oct("0b".$^N)] }) 
			)
		|	(?> 0 ( [0-7]+ ) \b			(?{ [NUM => oct("0".$^N)] }) 
			)
		|	(?> ( \d+ ) \b 				(?{ [NUM => 0+$^N] }) 
			)
		
			# name
		|	(?> ( [a-z_]\w* )			(?{ [NAME => $^N] }) 
			)
			
			# symbols
		|	(?> ( << | >> | == | != | >= | <= | . )
										(?{ [$^N, $^N] }) 
			)
		)
		
		(?(DEFINE)
			# horizontal blanks
			(?<SP>	[\t\f\r ] )
		)
	}gcxmi or die 'not reached';
	return $^R;
}

#------------------------------------------------------------------------------
# implemented by XSAccessor above

=head1 METHODS - INPUT LOCATION AND ERRORS

=head2 file

Returns the current input file, C<undef> if reading from a list.

=head2 line_nr

Returns the current input line number, starting at 1.

=head2 line_inc

Increment of line number on each new-line found, usually 1.

=head2 error

Dies with the given error message, indicating the place in the input source file
where the error occured.

=cut

#------------------------------------------------------------------------------
sub error { 
	my($self, @message) = @_;
	die $self->_error_msg("Error", @message);
}
#------------------------------------------------------------------------------

=head2 warning

Warns with the given error message, indicating the place in the input source file
where the warning occured.

=cut

#------------------------------------------------------------------------------
sub warning { 
	my($self, @message) = @_;
	warn $self->_error_msg("Warning", @message);
}

#------------------------------------------------------------------------------
# error message for error() and warning()
sub _error_msg { 
	my($self, $type, @message) = @_;
	chomp(@message);
	my $in_file = defined($self->[FILE]) ? "$self->[FILE]" : undef;
	my $in_line = $self->[LINE_NR] ? "($self->[LINE_NR])" : undef;
	my $at = ($in_file || $in_line) ? 
				join("", grep {defined} $in_file, $in_line) : undef;
	my $near;
	if (defined($self->[TEXT]) && defined(pos($self->[TEXT]))) {
		my $code = substr($self->[TEXT], pos($self->[TEXT]), 20);
		$code =~ s/\n.*//s;
		if ($code ne "") {
			$near = "near ".dump($code);
		}
	}

	return join(" ", grep {defined} $at, $type, @message, $near), "\n";
}
#------------------------------------------------------------------------------

=head1 AUTHOR, BUGS, FEEDBACK, LICENSE, COPYRIGHT

See L<Parse::FSM|Parse::FSM>

=cut

#------------------------------------------------------------------------------

1;