package Language::P::Lexer;
use strict;
use warnings;
use base qw(Class::Accessor::Fast);
__PACKAGE__->mk_ro_accessors( qw(stream buffer tokens symbol_table
file line _start_of_line _heredoc_lexer
) );
__PACKAGE__->mk_accessors( qw(quote) );
use Language::P::ParseTree qw(:all);
use Language::P::Keywords;
our @TOKENS;
BEGIN {
our @TOKENS =
qw(T_ID T_FQ_ID T_SUB_ID T_EOF T_PACKAGE T_FILETEST
T_PATTERN T_STRING T_NUMBER T_QUOTE T_OR T_XOR
T_SEMICOLON T_COLON T_COMMA T_OPPAR T_CLPAR T_OPSQ T_CLSQ
T_OPBRK T_CLBRK T_OPHASH T_OPAN T_CLPAN T_INTERR
T_NOT T_SLESS T_CLAN T_SGREAT T_EQUAL T_LESSEQUAL T_SLESSEQUAL
T_GREATEQUAL T_SGREATEQUAL T_EQUALEQUAL T_SEQUALEQUAL T_NOTEQUAL
T_SNOTEQUAL T_SLASH T_BACKSLASH T_DOT T_DOTDOT T_DOTDOTDOT T_PLUS
T_MINUS T_STAR T_DOLLAR T_PERCENT T_AT T_AMPERSAND T_PLUSPLUS
T_MINUSMINUS T_ANDAND T_OROR T_ARYLEN T_ARROW T_MATCH T_NOTMATCH
T_ANDANDLOW T_ORORLOW T_NOTLOW T_XORLOW T_CMP T_SCMP T_SSTAR T_POWER
T_PLUSEQUAL T_MINUSEQUAL T_STAREQUAL T_SLASHEQUAL T_LABEL T_TILDE
T_CLASS_START T_CLASS_END T_CLASS T_QUANTIFIER T_ASSERTION T_ALTERNATE
T_CLGROUP
);
};
use constant
{ X_NOTHING => 0,
X_STATE => 1,
X_TERM => 2,
X_OPERATOR => 3,
X_BLOCK => 4,
X_REF => 5,
O_POS => 0,
O_TYPE => 1,
O_VALUE => 2,
O_ID_TYPE => 3,
O_FT_OP => 3,
O_QS_INTERPOLATE => 3,
O_QS_BUFFER => 4,
O_RX_REST => 3,
O_RX_SECOND_HALF => 5,
O_RX_FLAGS => 6,
O_RX_INTERPOLATED => 7,
O_NUM_FLAGS => 3,
LEX_NO_PACKAGE => 1,
map { $TOKENS[$_] => $_ + 1 } 0 .. $#TOKENS,
};
use Exporter qw(import);
our @EXPORT_OK =
( qw(X_NOTHING X_STATE X_TERM X_OPERATOR X_BLOCK X_REF
O_POS O_TYPE O_VALUE O_ID_TYPE O_FT_OP O_QS_INTERPOLATE O_QS_BUFFER
O_RX_REST O_RX_SECOND_HALF O_RX_FLAGS O_RX_INTERPOLATED O_NUM_FLAGS
LEX_NO_PACKAGE
), @TOKENS );
our %EXPORT_TAGS =
( all => \@EXPORT_OK,
);
sub new {
my( $class, $args ) = @_;
my $self = $class->SUPER::new( $args );
my $a = delete $self->{string} || "";
$self->{buffer} = ref $a ? $a : \$a;
$self->{tokens} = [];
$self->{brackets} = 0;
$self->{pending_brackets} = [];
$self->{line} = 1;
$self->{_start_of_line} = 1;
$self->{pos} = [ $self->file, $self->line ];
return $self;
}
sub peek {
my( $self, $expect ) = ( @_, X_NOTHING );
my $token = $self->lex( $expect );
$self->unlex( $token );
return $token;
}
sub unlex {
my( $self, $token ) = @_;
push @{$self->tokens}, $token;
}
my %ops =
( ';' => T_SEMICOLON,
':' => T_COLON,
',' => T_COMMA,
'=>' => T_COMMA,
'(' => T_OPPAR,
')' => T_CLPAR,
'[' => T_OPSQ,
']' => T_CLSQ,
'{' => T_OPBRK,
'}' => T_CLBRK,
'?' => T_INTERR,
'!' => T_NOT,
'<' => T_OPAN,
'lt' => T_SLESS,
'>' => T_CLAN,
'gt' => T_SGREAT,
'=' => T_EQUAL,
'<=' => T_LESSEQUAL,
'le' => T_SLESSEQUAL,
'>=' => T_GREATEQUAL,
'ge' => T_SGREATEQUAL,
'==' => T_EQUALEQUAL,
'eq' => T_SEQUALEQUAL,
'!=' => T_NOTEQUAL,
'ne' => T_SNOTEQUAL,
'<=>' => T_CMP,
'cmp' => T_SCMP,
'/' => T_SLASH,
'\\' => T_BACKSLASH,
'.' => T_DOT,
'..' => T_DOTDOT,
'...' => T_DOTDOTDOT,
'~' => T_TILDE,
'+' => T_PLUS,
'-' => T_MINUS,
'*' => T_STAR,
'x' => T_SSTAR,
'$' => T_DOLLAR,
'%' => T_PERCENT,
'**' => T_POWER,
'@' => T_AT,
'&' => T_AMPERSAND,
'|' => T_OR,
'^' => T_XOR,
'++' => T_PLUSPLUS,
'--' => T_MINUSMINUS,
'&&' => T_ANDAND,
'||' => T_OROR,
'$#' => T_ARYLEN,
'->' => T_ARROW,
'=~' => T_MATCH,
'!~' => T_NOTMATCH,
'and' => T_ANDANDLOW,
'or' => T_ORORLOW,
'not' => T_NOTLOW,
'xor' => T_XORLOW,
);
my %filetest =
( r => OP_FT_EREADABLE,
w => OP_FT_EWRITABLE,
x => OP_FT_EEXECUTABLE,
o => OP_FT_EOWNED,
R => OP_FT_RREADABLE,
W => OP_FT_RWRITABLE,
X => OP_FT_REXECUTABLE,
O => OP_FT_ROWNED,
e => OP_FT_EXISTS,
z => OP_FT_EMPTY,
s => OP_FT_NONEMPTY,
f => OP_FT_ISFILE,
d => OP_FT_ISDIR,
l => OP_FT_ISSYMLINK,
p => OP_FT_ISPIPE,
S => OP_FT_ISSOCKET,
b => OP_FT_ISBLOCKSPECIAL,
c => OP_FT_ISCHARSPECIAL,
t => OP_FT_ISTTY,
u => OP_FT_SETUID,
g => OP_FT_SETGID,
k => OP_FT_STICKY,
T => OP_FT_ISASCII,
B => OP_FT_ISBINARY,
M => OP_FT_MTIME,
A => OP_FT_ATIME,
C => OP_FT_CTIME,
);
my %quoted_chars =
( 'n' => "\n",
't' => "\t",
'r' => "\r",
'f' => "\f",
'b' => "\b",
'a' => "\a",
'e' => "\e",
);
my %quoted_pattern =
( w => [ T_CLASS, 'WORDS' ],
W => [ T_CLASS, 'NON_WORDS' ],
s => [ T_CLASS, 'SPACES' ],
S => [ T_CLASS, 'NOT_SPACES' ],
d => [ T_CLASS, 'DIGITS' ],
D => [ T_CLASS, 'NOT_DIGITS' ],
b => [ T_ASSERTION, 'WORD_BOUNDARY' ],
B => [ T_ASSERTION, 'NON_WORD_BOUNDARY' ],
A => [ T_ASSERTION, 'BEGINNING' ],
Z => [ T_ASSERTION, 'END_OR_NEWLINE' ],
z => [ T_ASSERTION, 'END' ],
G => [ T_ASSERTION, 'POS' ],
);
my %pattern_special =
( '^' => [ T_ASSERTION, 'START_SPECIAL' ],
'$' => [ T_ASSERTION, 'END_SPECIAL' ],
'*' => [ T_QUANTIFIER, 0, -1, 1 ],
'+' => [ T_QUANTIFIER, 1, -1, 1 ],
'?' => [ T_QUANTIFIER, 0, 1, 1 ],
'*?' => [ T_QUANTIFIER, 0, -1, 0 ],
'+?' => [ T_QUANTIFIER, 1, -1, 0 ],
'??' => [ T_QUANTIFIER, 0, 1, 0 ],
')' => [ T_CLGROUP ],
'|' => [ T_ALTERNATE ],
'[' => [ T_CLASS_START ],
']' => [ T_CLASS_END ],
);
sub _skip_space {
my( $self ) = @_;
my $buffer = $self->buffer;
my $retval = '';
my $reset_pos = 0;
for(;;) {
$self->_fill_buffer unless length $$buffer;
return unless length $$buffer;
if( $self->{_start_of_line}
&& $$buffer =~ s/^#[ \t]*line[ \t]+([0-9]+)(?:[ \t]+"([^"]+)")?[ \t]*[\r\n]// ) {
$self->{line} = $1;
$self->{file} = $2 if $2;
$reset_pos = 1;
next;
}
$$buffer =~ s/^([ \t]+)// && defined wantarray and $retval .= $1;
if( $$buffer =~ s/^([\r\n])// ) {
$retval .= $1 if defined wantarray;
$self->{_start_of_line} = 1;
++$self->{line};
$reset_pos = 1;
next;
}
if( $$buffer =~ s/^(#.*\n)// ) {
$retval .= $1 if defined wantarray;
$self->{_start_of_line} = 1;
++$self->{line};
$reset_pos = 1;
next;
}
last if length $$buffer;
}
if( $reset_pos ) {
$self->{pos} = [ $self->{file}, $self->{line} ];
}
return $retval;
}
# taken from intuit_more in toke.c
sub _character_class_insanity {
my( $self ) = @_;
my $buffer = $self->buffer;
if( $$buffer =~ /^\]|^\^/ ) {
return 1;
}
my( $t ) = $$buffer =~ /^(.*\])/;
my $w = 2;
my( $un_char, $last_un_char, @seen ) = ( 255 );
return 1 if !defined $t;
if( $t =~ /^\$/ ) {
$w -= 3;
} elsif( $t =~ /^[0-9][0-9]\]/ ) {
$w -= 10
} elsif( $t =~ /^[0-9]\]/ ) {
$w -= 100;
} elsif( $t =~ /^\$\w+/ ) {
# HACK, not in original
$w -= 100;
}
for(;;) {
last;
}
return $w >= 0 ? 1 : 0;
}
# taken from intuit_more in toke.c
sub _quoted_code_lookahead {
my( $self ) = @_;
my $buffer = $self->buffer;
if( $$buffer =~ s/^->([{[])// ) {
++$self->{brackets};
$self->unlex( [ $self->{pos}, $ops{$1}, $1 ] );
$self->unlex( [ $self->{pos}, T_ARROW, '->' ] );
} elsif( $$buffer =~ s/^{// ) {
if( !$self->quote->{interpolated_pattern} ) {
++$self->{brackets};
$self->unlex( [ $self->{pos}, T_OPBRK, '{' ] );
} elsif( $$buffer =~ /^[0-9]+,[0-9]*}/ ) {
die 'Quantifier!';
} else {
++$self->{brackets};
$self->unlex( [ $self->{pos}, T_OPBRK, '{' ] );
}
} elsif( $$buffer =~ s/^\[// ) {
if( !$self->quote->{interpolated_pattern} ) {
++$self->{brackets};
$self->unlex( [ $self->{pos}, T_OPSQ, '[' ] );
} else {
if( _character_class_insanity( $self ) ) {
$$buffer = '[' . $$buffer;
my $token = $self->lex_quote;
$self->unlex( $token );
} else {
++$self->{brackets};
$self->unlex( [ $self->{pos}, T_OPSQ, '[' ] );
}
}
} else {
my $token = $self->lex_quote;
$self->unlex( $token );
}
}
sub lex_pattern_group {
my( $self ) = @_;
my $buffer = $self->buffer;
die unless length $$buffer; # no whitespace allowed after '(?'
$$buffer =~ s/^(\#|:|[imsx]*\-[imsx]*:?|!|=|<=|<!|{|\?{|\?>)//x
or die "Invalid character after (?";
return [ $self->{pos}, T_PATTERN, $1 ];
}
sub lex_charclass {
my( $self ) = @_;
my $buffer = $self->buffer;
my $c = substr $$buffer, 0, 1, '';
if( $c eq '\\' ) {
my $qc = substr $$buffer, 0, 1, '';
if( my $qp = $quoted_pattern{$qc} ) {
return [ $self->{pos}, $qp->[0], $qp->[1] ];
}
return [ $self->{pos}, T_STRING, $qc ];
} elsif( $c eq '-' ) {
return [ $self->{pos}, T_MINUS, '-' ];
} elsif( $c eq ']' ) {
return [ $self->{pos}, T_CLASS_END ];
} else {
return [ $self->{pos}, T_STRING, $c ];
}
}
sub lex_quote {
my( $self ) = @_;
return pop @{$self->tokens} if @{$self->tokens};
my $buffer = $self->buffer;
my $v = '';
for(;;) {
unless( length $$buffer ) {
if( length $v ) {
$self->unlex( [ $self->{pos}, T_EOF, '' ] );
return [ $self->{pos}, T_STRING, $v, 1 ];
} else {
return [ $self->{pos}, T_EOF, '' ];
}
}
my $to_return;
my $pattern = $self->quote->{pattern};
my $interpolated_pattern = $self->quote->{interpolated_pattern};
while( length $$buffer ) {
my $c = substr $$buffer, 0, 1, '';
if( $pattern || $interpolated_pattern ) {
if( $c eq '\\' ) {
my $qc = substr $$buffer, 0, 1;
if( my $qp = $quoted_pattern{$qc} ) {
substr $$buffer, 0, 1, ''; # eat character
if( $pattern ) {
$to_return = [ $self->{pos}, T_PATTERN, $qc, $qp ];
} else {
$v .= $c . $qc;
next;
}
}
} elsif( $c eq '(' && !$interpolated_pattern ) {
my $nc = substr $$buffer, 0, 1;
if( $nc eq '?' ) {
substr $$buffer, 0, 1, ''; # eat character
$to_return = [ $self->{pos}, T_PATTERN, '(?' ];
} else {
$to_return = [ $self->{pos}, T_PATTERN, '(' ];
}
} elsif( !$interpolated_pattern
and my $special = $pattern_special{$c} ) {
# check nongreedy quantifiers
if( $special->[0] == T_QUANTIFIER ) {
my $qc = substr $$buffer, 0, 1;
if( $qc eq '?' ) {
substr $$buffer, 0, 1, '';
$special = $pattern_special{$c . $qc};
}
}
$to_return = [ $self->{pos}, T_PATTERN, $c, $special ];
}
}
if( $to_return ) {
if( length $v ) {
$self->unlex( $to_return );
return [ $self->{pos}, T_STRING, $v, 1 ];
} else {
return $to_return;
}
}
if( $c eq '\\' && $self->quote->{interpolate} ) {
my $qc = substr $$buffer, 0, 1, '';
if( $qc =~ /^[a-zA-Z]$/ ) {
if( $quoted_chars{$qc} ) {
$v .= $quoted_chars{$qc};
} else {
die "Invalid escape '$qc'";
}
} elsif( $qc =~ /^[0-9]$/ ) {
die "Unsupported numeric escape";
} else {
$v .= $qc;
}
} elsif( $c =~ /^[\$\@]$/ && $self->quote->{interpolate} ) {
if( $interpolated_pattern
&& ( !length( $$buffer )
|| index( "()| \r\n\t",
substr( $$buffer, 0, 1 ) ) != -1 ) ) {
$v .= $c;
} elsif( length $v ) {
$self->unlex( [ $self->{pos}, $ops{$c}, $c ] );
return [ $self->{pos}, T_STRING, $v ];
} else {
return [ $self->{pos}, $ops{$c}, $c ];
}
} else {
$v .= $c;
}
}
}
die "Can't get there";
}
sub lex_alphabetic_identifier {
my( $self, $flags ) = @_;
if( @{$self->tokens} ) {
return undef if $self->tokens->[-1]->[O_TYPE] != T_ID;
return pop @{$self->tokens};
}
local $_ = $self->buffer;
if( $flags & LEX_NO_PACKAGE ) {
return undef unless $$_ =~ /^[ \t\r\n]*\w/;
} else {
return undef unless $$_ =~ /^[ \t\r\n]*[':\w]/;
}
return lex_identifier( $self, $flags );
}
sub lex_identifier {
my( $self, $flags ) = @_;
if( @{$self->tokens} ) {
return undef if $self->tokens->[-1]->[O_TYPE] != T_ID;
return pop @{$self->tokens};
}
local $_ = $self->buffer;
_skip_space( $self )
if defined( $$_ ) && $$_ =~ /^[ \t\r\n]/;
return [ $self->{pos}, T_EOF, '' ] unless length $$_;
my $id;
$$_ =~ s/^\^([A-Z\[\\\]^_?])//x and do {
$id = [ $self->{pos}, T_ID, chr( ord( $1 ) - ord( 'A' ) + 1 ), T_FQ_ID ];
};
$id or $$_ =~ s/^::(?=\W)//x and do {
$id = [ $self->{pos}, T_ID, 'main::', T_FQ_ID ];
};
$id or $$_ =~ s/^(\'|::)?(\w+)//x and do {
if( $flags & LEX_NO_PACKAGE ) {
return [ $self->{pos}, T_ID, $2, T_ID ];
}
my $ids = defined $1 ? '::' . $2 : $2;
my $idt = defined $1 ? T_FQ_ID : T_ID;
while( $$_ =~ s/^::(\w*)|^\'(\w+)// ) {
$ids .= '::' . ( defined $1 ? $1 : $2 );
$idt = T_FQ_ID;
}
$id = [ $self->{pos}, T_ID, $ids, $idt ];
};
$id or $$_ =~ s/^{\^([A-Z\[\\\]^_?])(\w*)}//x and do {
$id = [ $self->{pos}, T_ID, chr( ord( $1 ) - ord( 'A' ) + 1 ) . $2, T_FQ_ID ];
};
$id or $$_ =~ s/^{//x and do {
my $spcbef = _skip_space( $self );
my $maybe_id;
if( $$_ =~ s/^(\w+)//x ) {
$maybe_id = $1;
} else {
$$_ = '{' . $spcbef . $$_;
return undef;
}
my $spcaft = _skip_space( $self );
if( $$_ =~ s/^}//x ) {
$id = [ $self->{pos}, T_ID, $maybe_id, T_ID ];
} elsif( $$_ =~ /^\[|^\{/ ) {
++$self->{brackets};
push @{$self->{pending_brackets}}, $self->{brackets};
$id = [ $self->{pos}, T_ID, $maybe_id, T_ID ];
} else {
# not a simple identifier
$$_ = '{' . $spcbef . $maybe_id . $spcaft . $$_;
return undef;
}
};
$id or $$_ =~ /^\$[\${:]/ and do {
return;
};
$id or $$_ =~ s/^(\W)(?=\W)// and do {
$id = [ $self->{pos}, T_ID, $1, T_FQ_ID ];
};
if( $id && $self->quote && $self->{brackets} == 0 ) {
_quoted_code_lookahead( $self );
}
return $id;
}
sub lex_number {
my( $self ) = @_;
local $_ = $self->buffer;
my( $num, $flags ) = ( '', 0 );
$$_ =~ s/^0([xb]?)//x and do {
if( $1 eq 'b' ) {
# binary number
if( $$_ =~ s/^([01]+)// ) {
$flags = NUM_BINARY;
$num .= $1;
return [ $self->{pos}, T_NUMBER, $num, $flags ];
} else {
die "Invalid binary digit";
}
} elsif( $1 eq 'x' ) {
# hexadecimal number
if( $$_ =~ s/^([0-9a-fA-F]+)// ) {
$flags = NUM_HEXADECIMAL;
$num .= $1;
return [ $self->{pos}, T_NUMBER, $num, $flags ];
} else {
die "Invalid hexadecimal digit";
}
} else {
# maybe octal number
if( $$_ =~ s/^([0-7]+)// ) {
$flags = NUM_OCTAL;
$num .= $1;
$$_ =~ /^[89]/ and die "Invalid octal digit";
return [ $self->{pos}, T_NUMBER, $num, $flags ];
} else {
$flags = NUM_INTEGER;
$num = '0'
}
}
};
$$_ =~ s/^(\d+)//x and do {
$flags = NUM_INTEGER;
$num .= $1;
};
# '..' operator (es. 12..15)
$$_ =~ /^\.\./ and return [ $self->{pos}, T_NUMBER, $num, $flags ];
$$_ =~ s/^\.(\d*)//x and do {
$flags = NUM_FLOAT;
$num = '0' unless length $num;
$num .= ".$1" if length $1;
};
$$_ =~ s/^[eE]([+-]?\d+)//x and do {
$flags = NUM_FLOAT;
$num .= "e$1";
};
return [ $self->{pos}, T_NUMBER, $num, $flags ];
}
my %quote_end = qw!( ) { } [ ] < >!;
my @rx_flags =
( FLAG_RX_MULTI_LINE, FLAG_RX_SINGLE_LINE, FLAG_RX_CASE_INSENSITIVE,
FLAG_RX_FREE_FORMAT, FLAG_RX_ONCE, FLAG_RX_GLOBAL, FLAG_RX_KEEP,
FLAG_RX_EVAL );
my @tr_flags = ( FLAG_RX_COMPLEMENT, FLAG_RX_DELETE, FLAG_RX_SQUEEZE );
my %regex_flags =
( m => [ OP_QL_M, 'msixogc', @rx_flags ],
qr => [ OP_QL_QR, 'msixo', @rx_flags ],
s => [ OP_QL_S, 'msixogce', @rx_flags ],
tr => [ OP_QL_TR, 'cds', @tr_flags ],
y => [ OP_QL_TR, 'cds', @tr_flags ],
);
sub _find_end {
my( $self, $op, $quote_start ) = @_;
local $_ = $self->buffer;
if( $op && !$quote_start ) {
if( $$_ =~ /^[ \t\r\n]/ ) {
_skip_space( $self );
}
# if we find a fat comma, we got a string constant, not the
# start of a quoted string!
$$_ =~ /^=>/ and return ( undef, [ $self->{pos}, T_STRING, $op ] );
$$_ =~ s/^([^ \t\r\n])// or die;
$quote_start = $1;
}
my $quote_end = $quote_end{$quote_start} || $quote_start;
my $paired = $quote_start eq $quote_end ? 0 : 1;
my $is_regex = $regex_flags{$op};
my $pos = $self->{pos};
my( $interpolated, $delim_count, $str ) = ( 0, 1, '' );
SCAN_END: for(;;) {
$self->_fill_buffer unless length $$_;
die "EOF while parsing quoted string" unless length $$_;
while( length $$_ ) {
my $c = substr $$_, 0, 1, '';
if( $c eq '\\' ) {
my $qc = substr $$_, 0, 1, '';
if( $qc eq $quote_start || $qc eq $quote_end ) {
$str .= $qc;
} else {
$str .= "\\" . $qc;
}
next;
} elsif( $paired && $c eq $quote_start ) {
++$delim_count;
} elsif( $c eq $quote_end ) {
--$delim_count;
last SCAN_END unless $delim_count;
} elsif( $is_regex
&& ( $c eq '$' || $c eq '@' )
&& $quote_start ne "'" ) {
my $nc = substr $$_, 0, 1;
if( length( $nc )
&& $nc ne $quote_end
&& index( "()| \r\n\t", $nc ) == -1 ) {
$interpolated = 1;
}
}
$str .= $c;
}
}
my $interpolate = $op eq 'qq' ? 1 :
$op eq 'q' ? 0 :
$op eq 'qw' ? 0 :
$quote_start eq "'" ? 0 :
1;
return ( $quote_start,
[ $pos, $is_regex ? T_PATTERN : T_QUOTE,
0, $interpolate, \$str, undef, undef, $interpolated ] );
}
sub _prepare_sublex {
my( $self, $op, $quote_start ) = @_;
my( $quote, $token ) = _find_end( $self, $op, $quote_start );
# oops, found fat comma: not a quote-like operator
return $token if $token->[O_TYPE] == T_STRING;
if( my $op_descr = $regex_flags{$op} ) {
# scan second part of substitution/transliteration
if( $op eq 's' || $op eq 'tr' || $op eq 'y' ) {
my $quote_char = $quote_end{$quote} ? undef : $quote;
my( undef, $rest ) = _find_end( $self, $op, $quote_char );
$token->[O_RX_SECOND_HALF] = $rest;
}
# scan regexp flags
$token->[O_VALUE] = $op_descr->[0];
my $fl_str = $op_descr->[1];
local $_ = $self->buffer;
my $flags = 0;
while( length( $$_ )
and ( my $idx = index( $fl_str, substr( $$_, 0, 1 ) ) ) >= 0 ) {
substr $$_, 0, 1, '';
$flags |= $op_descr->[$idx + 2];
}
$token->[O_RX_FLAGS] = $flags;
} elsif( $op eq 'qx' || $op eq "`" ) {
$token->[O_VALUE] = OP_QL_QX;
} elsif( $op eq 'qw' ) {
$token->[O_VALUE] = OP_QL_QW;
} elsif( $op eq '<' ) {
$token->[O_VALUE] = OP_QL_LT;
}
return $token;
}
sub _prepare_sublex_heredoc {
my( $self ) = @_;
my( $quote, $str, $end ) = ( '"', '' );
local $_ = $self->buffer;
my $pos = $self->{pos};
if( $$_ =~ s/^[ \t]*(['"`])// ) {
# << "EOT", << 'EOT', << `EOT`
$quote = $1;
while( $$_ =~ s/^(.*?)(\\)?($quote)// ) {
$end .= $1;
if( !$2 ) {
last;
} else {
$end .= $quote;
}
}
} else {
# <<\EOT, <<EOT
if( $$_ =~ s/\\// ) {
$quote = "'";
}
$$_ =~ s/^(\w*)//;
warn "Deprecated" unless $1;
$end = $1;
}
$end .= "\n";
my $lex = $self->_heredoc_lexer || $self;
my $finished = 0;
if( !$lex->stream ) {
$_ = $lex->buffer;
if( $$_ =~ s/(.*)^$end//m ) {
$str .= $1;
$finished = 1;
}
} else {
# if the lexer reads from a stream, it buffers at most one line,
# so by not using the buffer we skip the rest of the line
my $stream = $lex->stream;
while( defined( my $line = readline $stream ) ) {
if( $line eq $end ) {
$finished = 1;
last;
}
$str .= $line;
}
}
Carp::confess "EOF while looking for terminator '$end'" unless $finished;
return [ $pos, T_QUOTE, $quote eq "`" ? OP_QL_QX : 0, $quote ne "'", \$str ];
}
sub lex {
my( $self, $expect ) = ( @_, X_NOTHING );
return pop @{$self->tokens} if @{$self->tokens};
# skip blanks and comments
_skip_space( $self );
local $_ = $self->buffer;
return [ $self->{pos}, T_EOF, '' ] unless length $$_;
# numbers
$$_ =~ /^\d|^\.\d/ and return $self->lex_number;
# quote and quote-like operators
$$_ =~ s/^(q|qq|qx|qw|m|qr|s|tr|y)(?=\W)//x and
return _prepare_sublex( $self, $1, undef );
# 'x' operator special case
$$_ =~ /^x[0-9]/ && $expect == X_OPERATOR and do {
$$_ =~ s/^.//;
return [ $self->{pos}, T_SSTAR, 'x' ];
};
# anything that can start with alphabetic character: package name,
# label, identifier, fully qualified identifier, keyword, named
# operator
$$_ =~ s/^(::)?(\w+)//x and do {
my $ids = ( $1 || '' ) . $2;
my $fqual = $1 ? 1 : 0;
my $no_space = $$_ !~ /^[ \t\r\n]/;
my $op = $ops{$ids};
my $kw = $op || $fqual ? undef : $Language::P::Keywords::KEYWORDS{$ids};
my $type = $fqual ? T_FQ_ID :
$op ? -1 :
$kw ? $kw :
T_ID;
if( $no_space && ( $$_ =~ /^::/
|| ( ( $type == T_ID || $type == T_FQ_ID )
&& $$_ =~ /^'\w/ ) ) ) {
while( $$_ =~ s/^::(\w*)|^\'(\w+)// ) {
$ids .= '::' . ( defined $1 ? $1 : $2 );
}
if( $ids =~ s/::$// ) {
# warn for nonexistent package
}
$op = undef;
$type = T_FQ_ID;
}
# force subroutine call
if( $no_space && $type == T_ID && $$_ =~ /^\(/ ) {
$type = T_SUB_ID;
}
# look ahead for fat comma, save the original value for __LINE__
my $line = $self->line;
my $pos = $self->{pos};
_skip_space( $self );
if( $$_ =~ /^=>/ ) {
# fully qualified name (foo::moo) is quoted only if not declared
if( $type == T_FQ_ID
&& $self->symbol_table->get_symbol( $ids, '*' ) ) {
return [ $pos, T_ID, $ids, $type ];
} else {
return [ $pos, T_STRING, $ids ];
}
} elsif( $expect == X_STATE && $type != T_FQ_ID
&& $$_ =~ s/^:(?!:)// ) {
return [ $pos, T_LABEL, $ids ];
}
if( $type == T_ID && $ids =~ /^__/ ) {
if( $ids eq '__FILE__' ) {
return [ $pos, T_STRING, $self->file ];
} elsif( $ids eq '__LINE__' ) {
return [ $pos, T_NUMBER, $line, NUM_INTEGER ];
} elsif( $ids eq '__PACKAGE__' ) {
return [ $pos, T_PACKAGE, '' ];
}
}
if( $op ) {
# 'x' is an operator only when we expect it
if( $op == T_SSTAR && $expect != X_OPERATOR ) {
return [ $pos, T_ID, $ids, T_ID ];
}
return [ $pos, $op, $ids ];
}
return [ $pos, T_ID, $ids, $type ];
};
$$_ =~ s/^(["'`])//x and return _prepare_sublex( $self, $1, $1 );
# < when not operator (<> glob, <> file read, << here doc)
$$_ =~ /^</ and $expect != X_OPERATOR and do {
$$_ =~ s/^(<<|<)//x;
if( $1 eq '<' ) {
return _prepare_sublex( $self, '<', '<' );
} elsif( $1 eq '<<' ) {
return _prepare_sublex_heredoc( $self );
}
};
# multi char operators
$$_ =~ s/^(<=|>=|==|!=|=>|->
|=~|!~
|\.\.|\.\.\.
|\+\+|\-\-
|\+=|\-=|\*=|\/=
|\&\&|\|\|)//x and return [ $self->{pos}, $ops{$1}, $1 ];
$$_ =~ s/^\$//x and do {
if( $$_ =~ /^\#/ ) {
my $id = $self->lex_identifier( 0 );
if( $id ) {
$self->unlex( $id );
} else {
$$_ =~ s/^\#//x;
return [ $self->{pos}, $ops{'$#'}, '$#' ];
}
}
return [ $self->{pos}, $ops{'$'}, '$' ];
};
# brackets (block, subscripting, anonymous ref constructors)
$$_ =~ s/^([{}\[\]])// and do {
my $brack = $1;
if( $brack eq '[' || $brack eq '{' ) {
++$self->{brackets};
} else {
if( $brack eq '}'
&& @{$self->{pending_brackets}}
&& $self->{pending_brackets}[-1] == $self->{brackets} ) {
pop @{$self->{pending_brackets}};
--$self->{brackets};
return $self->lex( $expect );
}
--$self->{brackets};
if( $self->{brackets} == 0 && $self->quote ) {
_quoted_code_lookahead( $self );
}
}
# disambiguate start of block from anonymous hash
if( $brack eq '{' ) {
if( $expect == X_TERM ) {
return [ $self->{pos}, T_OPHASH, '{' ];
} elsif( $expect == X_OPERATOR ) {
# autoquote literal strings in hash subscripts
if( $$_ =~ s/^[ \t]*([[:alpha:]_]+)[ \t]*\}// ) {
$self->unlex( [ $self->{pos}, T_CLBRK, '}' ] );
$self->unlex( [ $self->{pos}, T_STRING, $1 ] );
}
} elsif( $expect != X_BLOCK ) {
# try to guess if it is a block or anonymous hash
$self->_skip_space;
if( $$_ =~ /^}/ ) {
return [ $self->{pos}, T_OPHASH, '{' ];
}
# treat '<bareward> =>', '<string> ,/=>' lookahead
# as indicators of anonymous hash
if( $$_ =~ /^([\w"'`])/ ) {
my $first = $1;
# can only be a string literal, quote like operator
# or identifier
my $next = $self->peek( X_NOTHING );
$self->_skip_space;
if( $$_ =~ /^=>/
|| ( $$_ =~ /^,/ && $next->[O_TYPE] != T_ID ) ) {
return [ $self->{pos}, T_OPHASH, '{' ];
}
}
}
}
return [ $self->{pos}, $ops{$brack}, $brack ];
};
# / (either regex start or division operator)
$$_ =~ s/^\///x and do {
if( $expect == X_TERM || $expect == X_STATE ) {
return _prepare_sublex( $self, 'm', '/' );
} else {
return [ $self->{pos}, T_SLASH, '/' ];
}
};
# filetest operators
$$_ =~ s/^-([rwxoRWXOezsfdlpSugkbctTBMMAC])(?=\W)// and do {
my $op = $1;
if( $$_ =~ /^[ \t]*=>/ ) {
$self->unlex( [ 'STRING', $1 ] );
return [ $self->{pos}, T_MINUS, '-' ];
}
return [ $self->{pos}, T_FILETEST, $op, $filetest{$op} ];
};
# single char operators
$$_ =~ s/^([:;,()\?<>!~=\/\\\+\-\.\|^\*%@&])//x and return [ $self->{pos}, $ops{$1}, $1 ];
die "Lexer error: '$$_'";
}
sub _fill_buffer {
my( $self ) = @_;
my $stream = $self->stream;
return unless $stream;
my $buffer = $self->buffer;
my $l = readline $stream;
if( defined $l ) {
$$buffer .= $l;
}
}
1;