package CQL::Lexer;
use strict;
use warnings;
use Carp qw( croak );
use String::Tokenizer;
use CQL::Token;
=head1 NAME
CQL::Lexer - a lexical analyzer for CQL
=head1 SYNOPSIS
my $lexer = CQL::Lexer->new();
$lexer->tokenize( 'foo and bar' );
my @tokens = $lexer->getTokens();
=head1 DESCRIPTION
CQL::Lexer is lexical analyzer for a string of CQL. Once you've
got a CQL::Lexer object you can tokenize a CQL string into CQL::Token
objects. Ordinarily you'll never want to do this yourself since
CQL::Parser calls CQL::Lexer for you.
CQL::Lexer uses Stevan Little's lovely String::Tokenizer in the background,
and does a bit of analysis afterwards to handle some peculiarities of
CQL: double quoted strings, <, <=, etc.
=head1 METHODS
=head2 new()
The constructor.
=cut
sub new {
my $class = shift;
my $self = {
tokenizer => String::Tokenizer->new(),
tokens => [],
position => 0,
};
return bless $self, ref($class) || $class;
}
=head2 tokenize()
Pass in a string of CQL to tokenize. This initializes the lexer with
data so that you can retrieve tokens.
=cut
sub tokenize {
my ( $self, $string ) = @_;
## extract the String::Tokenizer object we will use
my $tokenizer = $self->{tokenizer};
## reset position parsing a new string of tokens
$self->reset();
## delegate to String::Tokenizer for basic tokenization
debug( "tokenizing: $string" );
$tokenizer->tokenize( $string, '\/<>=()"',
String::Tokenizer->RETAIN_WHITESPACE );
## do a bit of lexical analysis on the results of basic
debug( "lexical analysis on tokens" );
my @tokens = _analyze( $tokenizer );
$self->{tokens} = \@tokens;
}
=head2 getTokens()
Returns a list of all the tokens.
=cut
sub getTokens {
my $self = shift;
return @{ $self->{tokens} };
}
=head2 token()
Returns the current token.
=cut
sub token {
my $self = shift;
return $self->{tokens}[ $self->{position} ];
}
=head2 nextToken()
Returns the next token, or undef if there are more tokens to retrieve
from the lexer.
=cut
sub nextToken {
my $self = shift;
## if we haven't gone over the end of our token list
## return the token at our current position while
## incrementing the position.
if ( $self->{position} < @{ $self->{tokens} } ) {
my $token = $self->{tokens}[ $self->{position}++ ];
return $token;
}
return CQL::Token->new( '' );
}
=head2 prevToken()
Returns the previous token, or undef if there are no tokens prior
to the current token.
=cut
sub prevToken {
my $self = shift;
## if we're not at the start of our list of tokens
## return the one previous to our current position
## while decrementing our position.
if ( $self->{position} > 0 ) {
my $token = $self->{tokens}[ --$self->{position} ];
return $token;
}
return CQL::Token->new( '' );
}
=head2 reset()
Resets the iterator to start reading tokens from the beginning.
=cut
sub reset {
shift->{position} = 0;
}
## Private sub used by _analyze for collecting a backslash escaped string terminated by "
sub _getString {
my $iterator = shift;
my $string = '"';
my $escaping = 0;
# loop through the tokens untill an unescaped " found
while ($iterator->hasNextToken()) {
my $token = $iterator->nextToken();
$string .= $token;
if ($escaping) {
$escaping = 0;
} elsif ($token eq '"') {
return $string;
} elsif ($token eq "\\") {
$escaping = 1;
}
}
croak( 'unterminated string ' . $string);
}
## Private sub used by _analyze to process \ outside double quotes.
## Because we tokenized on \ any \ outside double quotes (inside is handled by _getString)
## might need to be concatenated with a previous and or next CQL_WORD to form one CQL_WORD token
sub _concatBackslash {
my $tokensRef = shift;
my $i = 0;
while ($i < @$tokensRef) {
my $token = $$tokensRef[$i];
if ($token->getString() eq "\\") {
my $s = "\\";
my $replace = 0;
if ($i > 0) {
my $prevToken = $$tokensRef[$i - 1];
if (($prevToken->getType() == CQL_WORD) and !$prevToken->{terminated}) {
# concatenate and delete the previous CQL_WORD token
$s = $prevToken->getString() . $s;
$i--;
splice @$tokensRef, $i, 1;
$replace = 1;
}
}
if (!$token->{terminated} and ($i < $#$tokensRef)) {
my $nextToken = $$tokensRef[$i + 1];
if ($nextToken->getType() == CQL_WORD) {
# concatenate and delete the next CQL_WORD token
$s .= $nextToken->getString();
splice @$tokensRef, $i + 1, 1;
$replace = 1;
}
}
if ($replace) {
$$tokensRef[$i] = CQL::Token->new($s);
}
}
$i++;
}
}
sub _analyze {
my $tokenizer = shift;
my $iterator = $tokenizer->iterator();
my @tokens;
while ( defined (my $token = $iterator->nextToken()) ) {
## <=
if ( $token eq '<' and $iterator->lookAheadToken() eq '=' ) {
push( @tokens, CQL::Token->new( '<=' ) );
$iterator->nextToken();
}
## <>
elsif ( $token eq '<' and $iterator->lookAheadToken() eq '>' ) {
push( @tokens, CQL::Token->new( '<>') );
$iterator->nextToken();
}
## >=
elsif ( $token eq '>' and $iterator->lookAheadToken() eq '=' ) {
push( @tokens, CQL::Token->new( '>=' ) );
$iterator->nextToken();
}
## "quoted strings"
elsif ( $token eq '"' ) {
my $cqlToken = CQL::Token->new( _getString($iterator) );
## Mark this and the previous token as terminated to prevent concatenation with backslash
$cqlToken->{terminated} = 1;
if (@tokens) { $tokens[$#tokens]->{terminated} = 1; }
push( @tokens, $cqlToken );
}
## if it's just whitespace we can zap it
elsif ( $token =~ /\s+/ ) {
## Mark the previous token as terminated to prevent concatenation with backslash
if (@tokens) {
$tokens[$#tokens]->{terminated} = 1;
}
}
## otherwise it's fine the way it is
else {
push( @tokens, CQL::Token->new($token) );
}
} # while
## Concatenate \ outside double quotes with a previous and or next CQL_WORD to form one CQL_WORD token
_concatBackslash(\@tokens);
return @tokens;
}
sub debug {
return unless $CQL::DEBUG;
print STDERR 'CQL::Lexer: ', shift, "\n";
}
1;