The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# $Id: Lexer.pm,v 1.3 2010/10/15 15:55:26 Paulo Exp $

package Asm::Preproc::Lexer;

#------------------------------------------------------------------------------

=head1 NAME

Asm::Preproc::Lexer - Lexer generator

=cut

#------------------------------------------------------------------------------

use strict;
use warnings;

use Carp;
use Text::Template 'fill_in_string';

use Asm::Preproc::Stream;
use Asm::Preproc::Line;
use Asm::Preproc::Token;

our $VERSION = '0.06';

#------------------------------------------------------------------------------

=head1 SYNOPSIS

  use Asm::Preproc::Lexer;
  my @tokens = (
     BLANKS  => qr/\s+/,       sub {()},
     COMMENT => [qr/\/\*/, qr/\*\//],
                               undef,
     QSTR    => [qr/'/],       sub { my($type, $value) = @_;
                                     [$type, 
                                      substr($value, 1, length($value)-2)] },
     QQSTR   => [qr/"/, qr/"/],
     NUM     => qr/\d+/,
     ID      => qr/[a-z]+/,    sub { my($type, $value) = @_; 
                                     [$type, $value] },
     SYM     => qr/(.)/,       sub { [$1, $1] },
  );
  my $lex = Asm::Preproc::Lexer->new(@tokens);
  my $lex2 = $lex->clone;
  $lex->from(sub {});          # read Asm::Preproc::Line from iterator
  $lex->from(@lines);          # read Asm::Preproc::Line from list
  my $token = $lex->get;       # isa Asm::Preproc::Token

=head1 DESCRIPTION

This module creates a tokenizer based on the specification given to the 
C<new> constructor.

The tokenizer reads L<Asm::Preproc::Line|Asm::Preproc::Line> objects and
splits them in L<Asm::Preproc::Token|Asm::Preproc::Token> objects on each
C<get> call. C<get> returns C<undef> on end of input.

=head1 FUNCTIONS

=head2 new

Creates a new tokenizer object for the given token specification.
Each token is specified by the following elements:

=over 4

=item type

String to identify the token type, unused if the token is discarded (see 
C<BLANKS> and C<COMMENT> above).

=item regexp

One of:

=over 4

=item 1

A single regular expression to match the token at the current input position.

=item 2

A list of one regular expression, to match delimited tokens that use the 
same delimiter for the start and the end. 
The token can span multiple lines.
See see C<QSTR> above for an example for multi-line single-quoted strings.

=item 3

A list of two regular expressions, to match the start 
of the token at the current input position, and the end of the token.
The token can span multiple lines.
See see C<COMMENT> above for an example for multi-line comments.

=back

The regular expression is matched where the previous match finished, 
and each sub-expression cannot span multiple lines.
Parentheses may be used to capture sub-expressions in C<$1>, C<$2>, etc.

It is considered an error, and the tokeninzer dies with an error message
when reading input, if some input cannot be recognized by any of the 
given C<regexp> espressions. Therefore the C<SYM> token above contains the
catch-all expression C<qr/(.)/>.

=item transform (optional)

The optional code reference is a transform subroutine. It receives 
the C<type> and C<value> of the recognized token, and returns one of:

=over 4

=item 1

An array ref with two elements C<[$type, $value]>, 
the new C<type> and C<value> to be 
returned in the L<Asm::Preproc::Token|Asm::Preproc::Token> object.

=item 2

An empty array C<()> to signal that this token shall be dicarded.

=back

As an optimization, the transform subroutine code reference may be
set to C<undef>, to signal that the token will be dicarded 
and there is no use in accumulating it while matching. 
This is usefull to discard comments upfront, instead of
collecting the whole comment, and then pass it to the transform subroutine
just to be discarded afterwards.
See see C<COMMENT> above for an example of usage.

=back

=head2 clone

Creates a copy of this tokenizer object without compiling a new 
lexing subroutine. The copied object has all pending input cleared.

=head2 input

L<Asm::Preproc::Stream|Asm::Preproc::Stream> object from which new lines
to process are read.

=cut

#------------------------------------------------------------------------------
use constant TEXT => 3;			# need to access as $self->[TEXT] to get pos()
use Class::XSAccessor::Array {
	accessors 		=> {
		_lexer		=> 0,
		input		=> 1,
		_line		=> 2,		# current line being processed
		_text		=> TEXT,	# text being parsed
	},
};

sub new { 
	my($class, @tokens) = @_;
	my $self = $class->_new(sub {undef});
	$self->_lexer( $self->_make_lexer(@tokens) );
	$self;
}

sub clone {
	my($self) = @_;
	$self->_new($self->_lexer);
}

sub _new { 
	my($class, $lexer) = @_;
	return bless [
					$lexer, 	 				# _lexer
					Asm::Preproc::Stream->new,	# input
					undef, "",					# _line, _rtext
		], ref($class) || $class;
};

#------------------------------------------------------------------------------
# compile the lexing subroutine
sub _make_lexer {
	my($self, @tokens) = @_;
	@tokens or croak "tokens expected";
	
	# closure for each token attributes, indexed by token sequence nr
	my @type;			# token type
	my @start_re;		# match start of token
	my @end_re;			# match end of token
	my @transform;		# transform subroutine
	my @discard;		# true to discard multi-line token
	my @comment;		# comment to show all options of each token branch
	
	# parse the @tokens list
	for (my $id = 0; @tokens; $id++) {
		# read type
		$type[$id] = shift @tokens;
		
		# read regexp
		my $re = shift @tokens or croak "regexp expected";
		
		if (ref $re eq 'Regexp') {
			$start_re[$id] = $re;
		}
		elsif (ref $re eq 'ARRAY') {
			@$re == 1 and push @$re, $re->[0];
			@$re == 2 or croak "invalid regexp list";
			($start_re[$id], $end_re[$id]) = @$re;
		}
		else {
			croak "invalid regexp";
		}

		# read transform, define discard
		if (@tokens) {
			if (! defined($tokens[0])) {
				$discard[$id] = 1;
				shift @tokens;
			}
			elsif (ref($tokens[0]) eq 'CODE') {
				$transform[$id] = shift @tokens;
			}
		}
	
		# comment
		$comment[$id] = join('  ', map {defined($_) ? $_ : ''}
								$id, 
								$type[$id], 
								$start_re[$id], 
								$end_re[$id],
								$transform[$id], 
								$discard[$id]);
		$comment[$id] =~ s/\n/\\n/g;
		
	}

	# LEXER code
	my $template_data = { 
		end_re			=> \@end_re,
		transform		=> \@transform,
		discard			=> \@discard,
		comment 		=> \@comment,
	};
	my @template_args = (
		DELIMITERS 	=> [ '<%', '%>' ],
		HASH 		=> $template_data,
	);

	my $code = fill_in_string(<<'END_CODE', @template_args);
	
	sub {
		my($self) = @_;

		for ($self->[TEXT]) {
			LINE:
			while (1) {								# read lines
				while ((pos()||0) >= length()) {	# last line consumed
					$self->_read_line or return undef;
				}
		
				TOKEN:
				while (1) {							# read tokens
					my $token_line = $self->_line;	# start of token line
					my $pos0 = pos()||0;			# position before match
					
					# need to read new line
					if (/ \G \z /gcx) {
						next LINE;
					}
END_CODE

	for my $id (0 .. $#type) {
		$template_data->{id} = $id;
		$template_data->{LINE_BLOCK} 
			= fill_in_string(<<'END_CODE', @template_args);
			
						BLOCK:	
						while (1) {			# read  multi-line block
							<% $discard[$id] ? '' : '$pos0 = pos()||0;' %>
							
							# need to read new line
							if (/ \G \z /gcx) {
								$self->_read_line 
									or $token_line->error(
											"unbalanced token at: ".$value);
							}
							# end
							elsif (/ \G (?s: .*?) $end_re[<% $id %>] /gcx) {
								<% $discard[$id] ? '' : 
									'$value .= $self->_capture($pos0);' %>
								last BLOCK;	# collected whole token
							}
							# consume all
							else {
								pos() = length();
								<% $discard[$id] ? '' : 
									'$value .= $self->_capture($pos0);' %>
							}
						}
END_CODE
			
		$template_data->{TRANSFORM} 
			= fill_in_string(<<'END_CODE', @template_args);
			
						# call transform routine
						my $ret = $transform[<% $id %>]->($type, $value);
						next unless $ret;		# discard token
						($type, $value) = @$ret;
END_CODE
		$code .= fill_in_string(<<'END_CODE', @template_args);
		
					# <% $comment[$id] %>
					elsif (/ \G $start_re[<% $id %>] /gcx) {
						my($type, $value) = <%
								'' %> ($type[<% $id %>], $self->_capture($pos0));
						<% $end_re[$id] ? $LINE_BLOCK : '' %>
						<% $transform[$id] ? $TRANSFORM : '' %>
						
						<% $discard[$id] ? 'next;' : '' %>
						
						return Asm::Preproc::Token->new(
											$type, $value, $token_line);
					}
END_CODE
	}
	
	$code .= fill_in_string(<<'END_CODE', @template_args);
					# no token recognized, consume rest of line and die
					else {
						pos() = length();
						$token_line->error("no token recognized at: ".
											substr($_, $pos0));
					}
				}
			}
		}
	};
END_CODE

	#warn $code;
	my $lexer = eval $code;
	$@ and croak "$code\n$@";
	
	return $lexer;
}

#------------------------------------------------------------------------------
# get the next line from input, save in _line, _rtext
sub _read_line {
	my($self) = @_;
	
	# get one line
	my $line = $self->input->get;
	my $text = "";				# default: no text to parse
	
	if (defined $line) {
		# convert to Asm::Preproc::Line if needed
		ref($line) or $line = Asm::Preproc::Line->new($line);
		$text = $line->text;
		$text = "" unless defined $text;		# make sure we have something
	}
	
	$self->_line( $line );		# line to return at each token
	$self->[TEXT] = $text;		# text to parse
	
	return $line;
}
#------------------------------------------------------------------------------
# capture the last match
sub _capture {
	my($self, $pos0) = @_;
	return substr($_, $pos0, pos() - $pos0);
}	
#------------------------------------------------------------------------------

=head2 from

Inserts the given input at the head of the input queue to the tokenizer.
The input is either a list of L<Asm::Preproc::Line|Asm::Preproc::Line>
objects, or an interator function that returns a 
L<Asm::Preproc::Line|Asm::Preproc::Line> object on each call.

The input list and interator can also return plain scalar strings, that 
are converted to L<Asm::Preproc::Line|Asm::Preproc::Line> on the fly, but
the information on input file location for error messages will not be available.

The new inserted input is processed before continuing with whatever was 
already in the queue.

=cut

#------------------------------------------------------------------------------
# my($self, @input) = @_;
sub from { shift->input->unget(@_); }
#------------------------------------------------------------------------------

=head2 get

Retrieves the next token from the input strean as a L<Asm::Preproc::Token|Asm::Preproc::Token> object. 

Returns C<undef> on end of input.

Dies with an error message indicating the location in the input if the 
source does not match any of the tokens.

=cut

#------------------------------------------------------------------------------
sub get { goto $_[0]->_lexer; }
#------------------------------------------------------------------------------

#------------------------------------------------------------------------------

=head2 stream

Returns a L<Asm::Preproc::Stream|Asm::Preproc::Stream> object that will 
return the result of C<get> on each call.

=cut

#------------------------------------------------------------------------------
sub stream {
	my($self) = @_;
	return Asm::Preproc::Stream->new(sub {$self->get});
}
#------------------------------------------------------------------------------

1;