The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Devel::Declare::Lexer;

use strict;
use warnings;
use v5;

use feature 'say';

our $VERSION = '0.012';

use Data::Dumper;
use Devel::Declare;
use Devel::Declare::Lexer::Stream;
use Devel::Declare::Lexer::Token;
use Devel::Declare::Lexer::Token::Bareword;
use Devel::Declare::Lexer::Token::Declarator;
use Devel::Declare::Lexer::Token::EndOfStatement;
use Devel::Declare::Lexer::Token::Heredoc;
use Devel::Declare::Lexer::Token::LeftBracket;
use Devel::Declare::Lexer::Token::Newline;
use Devel::Declare::Lexer::Token::Operator;
use Devel::Declare::Lexer::Token::RightBracket;
use Devel::Declare::Lexer::Token::String;
use Devel::Declare::Lexer::Token::Variable;
use Devel::Declare::Lexer::Token::Whitespace;

use vars qw/ @ISA $DEBUG $SHOWTRANSLATE /;
@ISA = ();
$DEBUG = 0;
$SHOWTRANSLATE = 0;

sub import
{
    my $class = shift;
    my $caller = caller;

    import_for($caller, @_);
}

sub import_for
{
    my ($caller, @args) = @_;
    my $class = shift;

    no strict 'refs';

    my %subinject = ();
    if(ref($args[0]) =~ /HASH/) {
        $DEBUG and say STDERR "Using hash for import";
        %subinject = %{$args[0]};
        @args = keys %subinject;
    }

    my @consts;

    my %tags = map { $_ => 1 } @args;
    if($tags{":debug"}) {
        $DEBUG = 1;
    }
    if($tags{":lexer_test"}) {
        $DEBUG and say STDERR "Adding 'lexer_test' to keyword list";

        push @consts, "lexer_test";
    }

    my @names = @args;
    for my $name (@names) {
        next if $name =~ /:/;
        $DEBUG and say STDERR "Adding '$name' to keyword list";

        push @consts, $name;
    }

    for my $word (@consts) {
        $DEBUG and say STDERR "Injecting '$word' into '$caller'";
        Devel::Declare->setup_for(
            $caller,
            {
                $word => { const => \&lexer }
            }
        );
        if($subinject{$word}) {
            $DEBUG and say STDERR "- Using sub provided in import";
            *{$caller.'::'.$word} = $subinject{$word};
        } else {
            $DEBUG and say STDERR "- Using default sub";
            *{$caller.'::'.$word} = sub () { 1; };
        }
    }
}

my %named_lexed_stack = ();
sub lexed
{
    my ($key, $callback) = @_;
    $DEBUG and say STDERR "Registered callback for keyword '$key'";
    $named_lexed_stack{$key} = $callback;
}

sub call_lexed
{
    my ($name, $stream) = @_;

    $DEBUG and say STDERR "Checking for callbacks for keyword '$name'";
    $DEBUG and say STDERR Dumper $stream;

    my $callback = $named_lexed_stack{$name};
    if($callback) {
        $DEBUG and say STDERR "Found callback '$callback' for keyword '$name'";
        $stream = &$callback($stream);
    }

    $DEBUG and say STDERR Dumper $stream;

    return $stream;
}

sub lexer
{
    my ($symbol, $offset) = @_;

    $DEBUG and print "=" x 80, "\n";

    my $linestr = Devel::Declare::get_linestr;
    my $original_linestr = $linestr;
    my $original_offset = $offset;
    $DEBUG and say STDERR "Starting with linestr '$linestr'";

    my @tokens = ();
    tie @tokens, "Devel::Declare::Lexer::Stream";
    my ($len, $tok);
    my $eoleos = 0;
    my $line = 1;

    # Skip the declarator
    $offset += Devel::Declare::toke_move_past_token($offset);
    push @tokens, new Devel::Declare::Lexer::Token::Declarator( value => $symbol );
    $DEBUG and say STDERR "Skipped declarator '$symbol'";

    my %lineoffsets = ( 1 => $offset );

    # We call this from a few places inside the loop
    my $skipspace = sub {
        # Move past any whitespace
        $len = Devel::Declare::toke_skipspace($offset);
        if($len > 0) {
            $tok = substr($linestr, $offset, $len);
            $DEBUG and say STDERR "Skipped whitespace '$tok', length [$len]";
            push @tokens, new Devel::Declare::Lexer::Token::Whitespace( value => $tok );
            $offset += $len;

            if($tok =~ /\n/) {
                # its odd that this works without handling any line numbering
                # I think we end up here when an end of line is found after a bareword (e.g. print\n"something")
                # It probably still needs some work on line numbering, but everything just seems to work! 
                $DEBUG and say STDERR "Got end of line in skipspace, probable bareword preceeding EOL";
                Devel::Declare::clear_lex_stuff;

                # We've got a new line so we need to refresh our linestr
                $linestr = Devel::Declare::get_linestr;
                $original_linestr = $linestr;

                $DEBUG and say STDERR "Refreshed linestr [$linestr]";
            }
        } elsif ($len < 0) {
            # Again, its odd that we don't handle any line numbering here, and a $len of < 0 is a definite EOL
            $DEBUG and say STDERR "Got end of line in skipspace";
        } elsif ($len == 0) {
            $DEBUG and say STDERR "No whitespace skipped";
        }
        return $len;
    };

    # Capture the tokens
    $DEBUG and say STDERR "Linestr length [", length $linestr, "]";
    my $heredoc = undef;
    my $heredoc_end_re = undef;
    my $heredoc_end_re2 = undef;
    my $nest = 0; # nested bracket tracking, just in case we get ; inside a block
    while($offset < length $linestr) {
        $DEBUG and say STDERR Dumper \%lineoffsets;
        if($heredoc && !(substr($linestr, $offset, 2) eq "\n")) {
            my $c = substr($linestr, $offset, 1);
            $DEBUG and say STDERR "Consuming char from heredoc: '$c'";
            $offset += 1;
            if($c =~ /\n/) {
                $DEBUG and say STDERR "Newline found in heredoc (current line $line)";
                #$line++;
                #$lineoffsets{$line} = $offset;
            } else {
                $heredoc->{value} .= $c;
            }
            $DEBUG and say STDERR "New heredoc value: " . $heredoc->{value};
            my $heredoc_name = $heredoc->{name};
            if($heredoc->{value} =~ /$heredoc_end_re/) {
                $heredoc->{value} =~ s/$heredoc_end_re2//;
                $DEBUG and say STDERR "Consumed heredoc, name [$heredoc_name]:\n" . $heredoc->{value};
                push @tokens, $heredoc;
                $heredoc = undef;
                $heredoc_end_re = undef;
                $heredoc_end_re2 = undef;
            }
            next;
        }

        $DEBUG and say STDERR "Offset[$offset], nest [$nest], Remaining[", substr($linestr, $offset), "]";

        if(substr($linestr, $offset, 1) eq ';') {
            $DEBUG and say STDERR "Got end of statement";
            push @tokens, new Devel::Declare::Lexer::Token::EndOfStatement;
            $offset += 1;
            $eoleos = 1;
            last unless $nest;
            next;
        }

        if(substr($linestr, $offset, 2) eq "\n") {
            if($heredoc) {
                $DEBUG and say STDERR "Got end of line in heredoc";
                $heredoc->{value} .= "\n";
            }

            if(!$heredoc) {
                $DEBUG and say STDERR "Got end of line in loop (current line $line)";
                push @tokens, new Devel::Declare::Lexer::Token::Newline;
                $offset += 1;
            }

            # this lets us capture a newline directly after a semicolon
            # and immediately exit the loop - otherwise we might start
            # consuming code that doesn't belong to us
            last if $eoleos && !$nest;
            $eoleos = 0;

            # If we're here, it's just a new line inside the statement that 
            # we do want to consume

            # We don't use skipspace here - it does too much!
            #&$skipspace;
            $len = Devel::Declare::toke_skipspace($offset);
            if($len != 0) {
                # TODO it seems odd that we don't add $len to the
                # offset... this might come back to bite us later!
                #$offset += $len - 6;
                $DEBUG and say STDERR "Skipped $len whitespace following EOL, not added to \$offset";
            }

            Devel::Declare::clear_lex_stuff;

            # Got a new line, so we need to refresh linestr
            $linestr = Devel::Declare::get_linestr;
            # It's not the next line, its everything upto and including the next line
            # so really our original_linestr is wrong!
            $original_linestr = $linestr;

            # Record some offsets for later - we start on line 1 and the first $line++ is 2
            # so we make a special case for recording line 1's offset
            if($line == 1) {
                $lineoffsets{1} = (length $symbol) + 1;
            };
            $line++;
            $lineoffsets{$line} = $heredoc ? $offset + 1 : $offset;

            $DEBUG and say STDERR "Refreshed linestr [$linestr], added lineoffset for line $line, offset $offset";
            next;
        }

        # FIXME Does this ever happen?
        if(&$skipspace < 0) {
            $DEBUG and say STDERR "Got skipspace < 0";
            last;
        }

        # Check if its a opening bracket
        if(substr($linestr, $offset, 1) =~ /(\{|\[|\()/) {
            my $b = substr($linestr, $offset, 1);
            push @tokens, new Devel::Declare::Lexer::Token::LeftBracket( value => $b );
            $nest++;
            $DEBUG and say STDERR "Got left bracket '$b', nest[$nest]";
            $offset += 1;
            next;
        }
        # Check if its a closing bracket
        if(substr($linestr, $offset, 1) =~ /(\}|\]|\))/) {
            my $b = substr($linestr, $offset, 1);
            push @tokens, new Devel::Declare::Lexer::Token::RightBracket( value => $b );
            $nest--;
            $DEBUG and say STDERR "Got right bracket '$b', nest[$nest]";
            $offset += 1;
            next;
        }
        # Check for a reference
        if(substr($linestr, $offset, 1) =~ /\\/) {
            $tok = substr($linestr, $offset, 1);
            $DEBUG and say STDERR "Got reference operator '$tok'";
            push @tokens, new Devel::Declare::Lexer::Token::Operator( value => $tok);
            $offset += 1;
            next;
        }
        # Check for variable
        if(substr($linestr, $offset, 1) =~ /(\$|\%|\@|\*)/) {
            # get the sign
            # TODO the variable name is captured later - it should probably be done here
            $tok = substr($linestr, $offset, 1);
            $DEBUG and say STDERR "Got variable '$tok'";
            push @tokens, new Devel::Declare::Lexer::Token::Variable( value => $tok );
            $offset += 1;
            next;
        }
        # Check for string
        if(substr($linestr, $offset, 1) =~ /^(q|\"|\')/) {
            # FIXME need to determine string type properly
            my $strstype = substr($linestr, $offset, 1);
            my $stretype = $strstype;
            if($strstype =~ /q/) {
                if(substr($linestr, $offset, 2) =~ /qq/) {
                    $strstype = substr($linestr, $offset, 3);
                    $offset += 2;
                } else {
                    $strstype = substr($linestr, $offset, 2);
                    $offset += 1;
                }
                $stretype = substr($linestr, $offset, 1);
                $stretype =~ tr/\(/)/;
                $len = Devel::Declare::toke_scan_str($offset);
            } else {
                $len = Devel::Declare::toke_scan_str($offset);
            }
            $DEBUG and say STDERR "Got string type '$strstype', end type '$stretype'";
            $tok = Devel::Declare::get_lex_stuff;
            Devel::Declare::clear_lex_stuff;
            $DEBUG and say STDERR "Got string '$tok'";
            push @tokens, new Devel::Declare::Lexer::Token::String( start => $strstype, end => $stretype, value => $tok );
            # get a new linestr - we might have captured multiple lines
            $linestr = Devel::Declare::get_linestr;
            $offset += $len;

            # If we do have multiple lines, we'll fix line numbering at the end

            next;
        }
        # Check for heredoc
        if(substr($linestr, $offset)=~ /^(<<\s*([\w\d]+)\s*\n)/) {
            # Heredocs are weird - we'll just remember we're in a heredoc until we get the end token
            $DEBUG and say STDERR "Got a heredoc with name '$2'";
            $heredoc = new Devel::Declare::Lexer::Token::Heredoc( name => $2, value => '' );
            $heredoc_end_re = qr/\n$2\n$/;
            $heredoc_end_re2 = qr/$2\n$/;
            $DEBUG and say STDERR "Created regex $heredoc_end_re and $heredoc_end_re2";

            # get a new linestr - we might have captured multiple lines
            $offset += 2 + (length $1);
    
            $len = Devel::Declare::toke_skipspace($offset);
            $linestr = Devel::Declare::get_linestr;
            $offset += $len;
            $DEBUG and say STDERR "Skipped $len whitespace at start of heredoc, got new linestr[$linestr]";

            $line++;
            $lineoffsets{$line} = $offset;

            # If we do have multiple lines, we'll fix line numbering at the end

            next;
        }
        # Check for operator after strings (so heredocs <<NAME work)
        if(substr($linestr, $offset, 1) =~ /[!\+\-\*\/\.><=,|&\?:]/) {
            $tok = substr($linestr, $offset, 1);
            $DEBUG and say STDERR "Got operator '$tok'";
            push @tokens, new Devel::Declare::Lexer::Token::Operator( value => $tok );
            $offset += 1;
            next;
        }
        # Check for bareword
        $len = Devel::Declare::toke_scan_word($offset, 1);
        if($len) {
            $tok = substr($linestr, $offset, $len);
            $DEBUG and say STDERR "Got bareword '$tok'";
            push @tokens, new Devel::Declare::Lexer::Token::Bareword( value => $tok );
            $offset += $len;
            next;
        }

    }

    # Callback (AT COMPILE TIME) to allow manipulation of the token stream before injection
    $DEBUG and say STDERR Dumper \@tokens;
    @tokens = @{call_lexed($symbol, \@tokens)};

    my $stmt = "";
    for my $token (@tokens) {
        $stmt .= $token->get;
    }

    $DEBUG and print "=" x 80, "\n";

    if($symbol =~ /^lexer_test$/) {
        $DEBUG and say STDERR "Escaping statement for variable assignment";
        $stmt =~ s/\\/\\\\/g;
        $stmt =~ s/\"/\\"/g;
        $stmt =~ s/\$/\\\$/g;
        $stmt =~ s/\n/\\n/g;
        chomp $stmt;
        $stmt = substr($stmt, 0, (length $stmt)); # strip the final \\n
    } else {
        $stmt =~ s/\n//g; # remove multiline on final statement
        chomp $stmt;
    }
    $DEBUG and say STDERR "Final statement: [$stmt]";

    # FIXME line numbering is broken if a \n appears inside a block, e.g. keyword { print "\n"; }
    #my @lcnt = split /[^\\]\\n/, $stmt;
    my @lcnt = split /\\n/, $stmt;
    my $lc = scalar @lcnt;
    $DEBUG and say STDERR "Lines:\n", Dumper \@lcnt;
    my $lineadjust = $lc - $line;
    $DEBUG and say STDERR "Linecount[$lc] lines[$line] - missing $lineadjust lines";

    # we've got a new linestr, we need to re-fix all our offsets
    $DEBUG and say STDERR "\n\nStarted with linestr [$linestr]";
    use Data::Dumper;
    $DEBUG and say STDERR Dumper \%lineoffsets;

    for my $l (sort keys %lineoffsets) {
        my $sol = $lineoffsets{$l};
        last if !defined $lineoffsets{$l+1}; # don't mess with the current line, yet!
        my $eol = $lineoffsets{$l + 1} - 1;
        my $diff = $eol - $sol;
        my $substr = substr($linestr, $sol, $diff);
        $DEBUG and say STDERR "\nLine $l, sol[$sol], eol[$eol], diff[$diff], linestr[$linestr], substr[$substr]";
        substr($linestr, $sol, $diff) = " " x $diff;
    }

    # now clear up the last line
    $DEBUG and say STDERR "Still got linestr[$linestr]";
    my $sol = $line == 1 ? (length $symbol) + 1 + $original_offset : $lineoffsets{$line};
    my $eol = (length $linestr) - 1;
    my $diff = $eol - $sol;
    my $substr = substr($linestr, $sol, $diff);
    $DEBUG and say STDERR "Got substr[$substr] sol[$sol] eol[$eol] diff[$diff]";

    my $newline = "\n" x $lineadjust;
    if($symbol =~ /^lexer_test$/) {
        $newline .= "and \$lexed = \"$stmt\";";
    } else {
        $newline .= " and " . substr($stmt, length $symbol);
    }

    substr($linestr, $sol, (length $linestr) - $sol - 1) = $newline; # put the rest of the statement in

    ($DEBUG || $SHOWTRANSLATE) and say STDERR "Got new linestr[$linestr] from original_linestr[$original_linestr]";

    $DEBUG and print "=" x 80, "\n";
    Devel::Declare::set_linestr($linestr);
}

=head1 NAME

Devel::Declare::Lexer

=head1 SYNOPSIS

    # Add :debug tag to enable debugging
    # Add :lexer_test to enable variable assignment
    # Anything not starting with : becomes a keyword
    use Devel::Declare::Lexer qw/ keyword /;

    BEGIN {
        # Create a callback for the keyword (inside a BEGIN block!)
        Devel::Declare::Lexer::lexed(keyword => sub {
            # Get the stream out (given as an arrayref)
            my ($stream_r) = @_;
            my @stream = @$stream_r;

            my $str = $stream[2]; # in the example below, the string is the 3rd token

            # Create a new stream (we could manipulate the existing one though)
            my @ns = ();
            tie @ns, "Devel::Declare::Lexer::Stream";

            # Add a few tokens to print the string 
            push @ns, (
                # You need this (for now)
                new Devel::Declare::Lexer::Token::Declarator( value => 'keyword' ),
                new Devel::Declare::Lexer::Token::Whitespace( value => ' ' ),

                # Everything else is your own custom code
                new Devel::Declare::Lexer::Token( value => 'print' ),
                new Devel::Declare::Lexer::Token::Whitespace( value => ' ' ),
                $string,
                new Devel::Declare::Lexer::Token::EndOfStatement,
                new Devel::Declare::Lexer::Token::Newline,
            );

            # Stream now contains:
            # keyword and print "This is a string";
            # keyword evaluates to 1, everything after the and gets executed

            # Return an arrayref
            return \@ns;
        });
    }

    # Use the keyword anywhere in this package
    keyword "This is a string";

=head1 DESCRIPTION

L<Devel::Declare::Lexer> makes it easier to parse code using L<Devel::Declare>
by generating a token stream from the statement and providing a callback for
you to manipulate it before its parsed by Perl.

The example in the synopsis creates a keyword named 'keyword', which accepts
a string and prints it.

Although this simple example could be done using print, say or any other simple
subroutine, L<Devel::Declare::Lexer> supports much more flexible syntax.

For example, it could be used to auto-expand subroutine declarations, e.g.
    method MethodName ( $a, @b ) {
        ... 
    }
into
    sub MethodName ($@) {
        my ($self, $a, @b) = @_;
        ...
    }

Unlike L<Devel::Declare>, there's no need to worry about parsing text and
taking care of multiline strings or code blocks - it's all done for you.

=head1 ADVANCED USAGE

L<Devel::Declare::Lexer>'s standard behaviour is to inject a sub into the
calling package which returns a 1. Because your statement typically gets
transformed into something like
    keyword and [your statement here];
the fact keyword evaluates to 1 means everything following the and will always
be executed.

You can extend this by using a different import syntax when loading L<Devel::Declare::Lexer>
    use Devel::Declare::Lexer { keyword => sub { $Some::Package::variable } };
which will cause the provided sub to be injected instead of the default sub.

=head1 SEE ALSO

Some examples can be found in the source download.

For more information about how L<Devel::Declare::Lexer> works, read the 
documentation for L<Devel::Declare>.

=head1 AUTHORS

Ian Kent - E<iankent@cpan.org> - original author

http://www.iankent.co.uk/

=head1 COPYRIGHT AND LICENSE

This library is free software under the same terms as perl itself

Copyright (c) 2013 Ian Kent

Devel::Declare::Lexer is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the license for more details.

=cut

1;