The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.
use 5.010;
use strict;
use warnings;

package MarpaX::Repa::Lexer;
our $VERSION = '0.01';

=head1 NAME

MarpaX::Repa::Lexer - simplify lexing for Marpa parser

=head1 DESCRIPTION

Most details are in L<MarpaX::Repa>.

=head1 METHODS

=head2 new

Returns a new lexer instance. Takes named arguments.

    my $lexer = MyLexer->new(
        tokens => {
            word => qr{\b\w+\b},
        },
        store => 'array',
        recognizer => $recognizer,
        debug => 1,
    );

Possible arguments:

=over 4

=item tokens

Hash with names of terminals as keys and one of the
following as values:

=over 4

=item string

Just a string to match.

    'a token' => "matches this long string",

=item regular expression

A C<qr{}> compiled regexp.

    'a token' => qr{"[^"]+"},

Note that regexp MUST match at least one character. At this moment
look behind to look at chars before the current position is not
supported.

=item hash

With hash you can define token specific options. At this moment
'store' option only (see below). Use C<match> key to set what to
match (string or regular expression).

    'a token' => {
        match => "a string",
        store => 'hash',
    },

=back

=item store

What to store (pass to Marpa's recognizer). The following variants
are supported:

=over 4

=item hash (default)

    { token => 'a token', value => 'a value' }

=item array

    [ 'a token', 'a value' ]

=item scalar

    'a value'

=item undef

undef is stored so later Repa's actions will skip it.

=item a callback

A function will be called with token name and reference to its value.
Should return a reference or undef that will be passed to recognizer.

=back

=item recognizer

L<Marpa::R2::Recognizer> object or its subclass.

=item debug

If true then lexer prints debug log to STDERR.

=item min_buffer

Minimal size of the buffer (4*1024 by default).

=back

=cut

sub new {
    my $proto = shift;
    my $self = bless { @_ }, ref $proto || $proto;
    return $self->init;
}

=head2 init

Setups instance and returns C<$self>. Called from constructor.

=cut

sub init {
    my $self = shift;

    my $tokens = $self->{'tokens'};
    foreach my $token ( keys %$tokens ) {
        my ($match, @rest);
        if ( ref( $tokens->{ $token } ) eq 'HASH' ) {
            $match = $tokens->{ $token }{'match'};
            @rest = ($tokens->{ $token }{'store'});
        } else {
            $match = $tokens->{ $token };
        }
        $rest[0] ||= $self->{'store'} || 'hash';
        my $type =
            ref $match ? 'RE'
            : length $match == 1 ? 'CHAR'
            : 'STRING';
        $tokens->{ $token } = [ $type, $match, @rest ];
    }

    $self->{'min_buffer'} //= 4*1024;
    $self->{'buffer'} //= '';

    return $self;
}

=head2 recognize

Takes a file handle and parses it. Dies on critical errors, not when parser lost its way.
Returns recognizer that was passed to L</new>.

=cut

sub recognize {
    my $self = shift;
    my $fh = shift;

    my $rec = $self->{'recognizer'};

    my $buffer = $self->buffer;
    my $buffer_can_grow = $self->grow_buffer( $fh );

    my $expected = $rec->terminals_expected;
    return $rec unless @$expected;

    while ( length $$buffer ) {
        say STDERR "Expect token(s): ". join(', ', map "'$_'", @$expected)
            if $self->{'debug'};

        say STDERR "Buffer start: ". $self->dump_buffer .'...'
            if $self->{'debug'};

        my $first_char = substr $$buffer, 0, 1;
        foreach my $token ( @$expected ) {
            REDO:

            my ($matched, $match, $length);
            my ($type, $what, $how) = @{ $self->{'tokens'}{ $token } || [] };

            unless ( $type ) {
                say STDERR "Unknown token: '$token'" if $self->{'debug'};
                next;
            }
            elsif ( $type eq 'RE' ) {
                if ( $$buffer =~ /^($what)/ ) {
                    ($matched, $match, $length) = (1, $1, length $1);
                    if ( $length == length $$buffer && $buffer_can_grow ) {
                        $buffer_can_grow = $self->grow_buffer( $fh );
                        goto REDO;
                    }
                }
            }
            elsif ( $type eq 'STRING' ) {
                $length = length $what;
                ($matched, $match) = (1, $what)
                    if $what eq substr $$buffer, 0, $length;
            }
            elsif ( $type eq 'CHAR' ) {
                ($matched, $match, $length) = (1, $first_char, 1)
                    if $what eq $first_char;
            }
            else {
                die "Unknown type $type";
            }

            unless ( $matched ) {
                say STDERR "No '$token' in ". $self->dump_buffer if $self->{'debug'};
                next;
            }

            unless ( $length ) {
                die "Token '$token' matched empty string. This is not supported.";
            }
            say STDERR "Token '$token' matched ". $self->dump_buffer( $length )
                if $self->{'debug'};

            if ( ref $how ) {
                $match = $how->( $token, \"$match" );
            } elsif ( $how eq 'hash' ) {
                $match = \{ token => $token, value => $match };
            } elsif ( $how eq 'array' ) {
                $match = \[$token, $match];
            } elsif ( $how eq 'scalar' ) {
                $match = \"$match";
            } elsif ( $how eq 'undef' ) {
                $match = \undef;
            } else {
                die "Unknown store variant - '$how'";
            }

            $rec->alternative( $token, $match, $length );
        }

        my $skip = 0;
        while (1) {
            # XXX: we are done, no way to advance further, we would love this
            # to be improved in Marpa
            if ( $rec->current_earleme == $rec->thin->furthest_earleme ) {
                return $rec;
            }

            $skip++;
            local $@;
            if ( defined (my $events = eval { $rec->earleme_complete }) ) {
                if ( $events && $rec->exhausted ) {
                    substr $$buffer, 0, $skip, '';
                    return $rec;
                }
                $expected = $rec->terminals_expected;
                last if @$expected;
            } else {
                say STDERR "Failed to parse: $@" if $self->{'debug'};
                return $rec;
            }
        }
        substr $$buffer, 0, $skip, '';
        $buffer_can_grow = $self->grow_buffer( $fh )
            if $buffer_can_grow && $self->{'min_buffer'} > length $$buffer;

        say STDERR '' if $self->{'debug'};
    }
    return $rec;
}

=head2 buffer

Returns reference to the current buffer.

=cut

sub buffer { \$_[0]->{'buffer'} }

=head2 grow_buffer

Called when L</buffer> needs a re-fill with a file handle as argument.
Returns true if there is still data to come from the handle.

=cut

sub grow_buffer {
    my $self = shift;
    local $/ = \($self->{'min_buffer'}*2);
    $self->{'buffer'} .= readline($_[0]) // return 0;
    return 1 && $self->{'min_buffer'};
}

=head2 dump_buffer

Returns first 20 chars of the buffer with everything besides ASCII encoded
with C<\x{####}>. Use argument to control size, zero to mean whole buffer.

=cut

sub dump_buffer {
    my $self = shift;
    my $show = shift // 20;
    my $str = $show? substr( $self->{'buffer'}, 0, $show ) : $self->{'buffer'};
    (my $res = $str) =~ s/([^\x20-\x7E])/'\\x{'. hex( ord $1 ) .'}' /ge;
    return $res;
}

1;