lib/RDF/Trine/Parser/Turtle/Lexer.pm

# RDF::Trine::Parser::Turtle::Lexer
# -----------------------------------------------------------------------------

=head1 NAME

RDF::Trine::Parser::Turtle::Lexer - Tokenizer for parsing Turtle, TriG, and N-Triples

=head1 VERSION

This document describes RDF::Trine::Parser::Turtle::Lexer version 1.002_01

=head1 SYNOPSIS

 use RDF::Trine::Parser::Lexer;
 my $l = RDF::Trine::Parser::Lexer->new( file => $fh );
 while (my $t = $l->get_token) {
   ...
 }

=head1 METHODS

=over 4

=cut

package RDF::Trine::Parser::Turtle::Lexer;

use RDF::Trine::Parser::Turtle::Constants;
use 5.010;
use strict;
use warnings;
use Moose;
use Data::Dumper;
use RDF::Trine::Error;

our $VERSION;
BEGIN {
	$VERSION				= '1.002_01';
}

my $r_nameChar_extra		= qr'[-0-9\x{B7}\x{0300}-\x{036F}\x{203F}-\x{2040}]'o;
my $r_nameStartChar_minus_underscore	= qr'[A-Za-z\x{00C0}-\x{00D6}\x{00D8}-\x{00F6}\x{00F8}-\x{02FF}\x{0370}-\x{037D}\x{037F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{00010000}-\x{000EFFFF}]'o;
my $r_nameStartChar			= qr/[A-Za-z_\x{00C0}-\x{00D6}\x{00D8}-\x{00F6}\x{00F8}-\x{02FF}\x{0370}-\x{037D}\x{037F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}]/;
my $r_nameChar				= qr/${r_nameStartChar}|[-0-9\x{b7}\x{0300}-\x{036f}\x{203F}-\x{2040}]/;
my $r_prefixName			= qr/(?:(?!_)${r_nameStartChar})(?:$r_nameChar)*/;
my $r_nameChar_test			= qr"(?:$r_nameStartChar|$r_nameChar_extra)";
my $r_double				= qr'[+-]?([0-9]+\.[0-9]*[eE][+-]?[0-9]+|\.[0-9]+[eE][+-]?[0-9]+|[0-9]+[eE][+-]?[0-9]+)';
my $r_decimal				= qr'[+-]?([0-9]+\.[0-9]*|\.([0-9])+)';
my $r_integer				= qr'[+-]?[0-9]+';

has file => (
	is => 'ro',
	isa => 'FileHandle',
	required => 1,
);

has linebuffer => (
	is => 'rw',
	isa => 'Str',
	default => '',
);

has line => (
	is => 'rw',
	isa => 'Int',
	default => 1,
);

has column => (
	is => 'rw',
	isa => 'Int',
	default => 1,
);

has buffer => (
	is => 'rw',
	isa => 'Str',
	default => '',
);

has start_column => (
	is => 'rw',
	isa => 'Int',
	default => -1,
);

has start_line => (
	is => 'rw',
	isa => 'Int',
	default => -1,
);

sub BUILDARGS {
	my $class	= shift;
	if (scalar(@_) == 1) {
		return { file => shift };
	} else {
		return $class->SUPER::BUILDARGS(@_);
	}
}

=item C<< new_token ( $type, @values ) >>

Returns a new token with the given type and optional values, capturing the
current line and column of the input data.

=cut

sub new_token {
	my $self		= shift;
	my $type		= shift;
	my $start_line	= $self->start_line;
	my $start_col	= $self->start_column;
	my $line		= $self->line;
	my $col			= $self->column;
	return RDF::Trine::Parser::Turtle::Token->fast_constructor(
			$type,
			$start_line,
			$start_col,
			$line,
			$col,
			\@_,
		);
}

my %CHAR_TOKEN	= (
	'.'	=> DOT,
	';'	=> SEMICOLON,
	'['	=> LBRACKET,
	']'	=> RBRACKET,
	'('	=> LPAREN,
	')'	=> RPAREN,
	'{'	=> LBRACE,
	'}'	=> RBRACE,
	','	=> COMMA,
	'='	=> EQUALS,
);

my %METHOD_TOKEN	= (
# 	q[#]	=> '_get_comment',
	q[@]	=> '_get_keyword',
	q[<]	=> '_get_iriref',
	q[_]	=> '_get_bnode',
	q[']	=> '_get_literal',
	q["]	=> '_get_literal',
	q[:]	=> '_get_pname',
	(map {$_ => '_get_number'} (0 .. 9, '-', '+'))
);

=item C<< get_token >>

Returns the next token present in the input.

=cut

sub get_token {
	my $self	= shift;
	while (1) {
		unless (length($self->{buffer})) {
			$self->fill_buffer;
		}
# 		warn "getting token with buffer: " . Dumper($self->{buffer});
		my $c	= $self->_peek_char();
		return unless (defined($c) and length($c));
		
		$self->start_column( $self->column );
		$self->start_line( $self->line );
		
		if (defined(my $name = $CHAR_TOKEN{$c})) { $self->_get_char; return $self->new_token($name); }
		elsif (defined(my $method = $METHOD_TOKEN{$c})) { return $self->$method() }
		elsif ($c eq '#') {
			# we're ignoring comment tokens, but we could return them here instead of falling through to the 'next':
			$self->_get_comment();
			next;
		}
		elsif ($c =~ /[ \r\n\t]/) {
			while (defined($c) and length($c) and $c =~ /[\t\r\n ]/) {
				$self->_get_char;
				$c		= $self->_peek_char;
			}
			
			# we're ignoring whitespace tokens, but we could return them here instead of falling through to the 'next':
# 			return $self->new_token(WS);
			next;
		}
		elsif ($c =~ /[A-Za-z\x{00C0}-\x{00D6}\x{00D8}-\x{00F6}\x{00F8}-\x{02FF}\x{0370}-\x{037D}\x{037F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}]/) {
			if ($self->{buffer} =~ /^a(?!:)\b/) {
				$self->_get_char;
				return $self->new_token(A);
			} elsif ($self->{buffer} =~ /^(?:true|false)\b/) {
				my $bool	= $self->_read_length($+[0]);
				return $self->new_token(BOOLEAN, $bool);
			} else {
				return $self->_get_pname;
			}
		}
		elsif ($c eq '^') { $self->_read_word('^^'); return $self->new_token(HATHAT); }
		else {
# 			Carp::cluck sprintf("Unexpected byte '$c' (0x%02x)", ord($c));
			return $self->_throw_error(sprintf("Unexpected byte '$c' (0x%02x)", ord($c)));
		}
		warn 'byte: ' . Dumper($c);
	}
}

=begin private

=cut

=item C<< fill_buffer >>

Fills the internal parse buffer with a new line from the input source.

=cut

sub fill_buffer {
	my $self	= shift;
	unless (length($self->buffer)) {
		my $line	= $self->file->getline;
		if (defined($line)) {
			$self->{buffer}	.= $line;
		}
	}
}

=item C<< check_for_bom >>

Checks the input buffer for a Unicode BOM, and consumes it if it is present.

=cut

sub check_for_bom {
	my $self	= shift;
	my $c	= $self->_peek_char();
	if (defined($c) and $c eq "\x{FEFF}") {
		$self->_get_char;
	}
}

sub _get_char_safe {
	my $self	= shift;
	my $char	= shift;
	my $c		= $self->_get_char;
	if ($c ne $char) {
		$self->_throw_error("Expected '$char' but got '$c'");
	}
	return $c;
}

sub _get_char_fill_buffer {
	my $self	= shift;
	if (length($self->{buffer}) == 0) {
		$self->fill_buffer;
		if (length($self->{buffer}) == 0) {
			return;
		}
	}
	my $c		= substr($self->{buffer}, 0, 1, '');
	if ($c eq "\n") {
# 		$self->{linebuffer}	= '';
		$self->{line}	= 1+$self->{line};
		$self->{column}	= 1;
	} else {
# 		$self->{linebuffer}	.= $c;
		$self->{column}	= 1+$self->{column};
	}
	return $c;
}

sub _get_char {
	my $self	= shift;
	my $c		= substr($self->{buffer}, 0, 1, '');
	if ($c eq "\n") {
# 		$self->{linebuffer}	= '';
		$self->{line}	= 1+$self->{line};
		$self->{column}	= 1;
	} else {
# 		$self->{linebuffer}	.= $c;
		$self->{column}	= 1+$self->{column};
	}
	return $c;
}

sub _peek_char {
	my $self	= shift;
	if (length($self->{buffer}) == 0) {
		$self->fill_buffer;
		if (length($self->{buffer}) == 0) {
			return;
		}
	}
	my $c		= substr($self->{buffer}, 0, 1);
	return $c;
}

sub _read_word {
	my $self	= shift;
	my $word	= shift;
	while (length($self->{buffer}) < length($word)) {
		$self->fill_buffer;
	}
	
	if (substr($self->{buffer}, 0, length($word)) ne $word) {
		$self->_throw_error("Expected '$word'");
	}
	
	my $lines	= ($word =~ tr/\n//);
	my $lastnl	= rindex($word, "\n");
	my $cols	= length($word) - $lastnl - 1;
	$self->{lines}	+= $lines;
	if ($lines) {
		$self->{column}	= $cols;
	} else {
		$self->{column}	+= $cols;
	}
	substr($self->{buffer}, 0, length($word), '');
}

sub _read_length {
	my $self	= shift;
	my $len		= shift;
	while (length($self->{buffer}) < $len) {
		$self->fill_buffer;
	}
	
	my $word	= substr($self->{buffer}, 0, $len, '');
	my $lines	= ($word =~ tr/\n//);
	my $lastnl	= rindex($word, "\n");
	my $cols	= length($word) - $lastnl - 1;
	$self->{lines}	+= $lines;
	if ($lines) {
		$self->{column}	= $cols;
	} else {
		$self->{column}	+= $cols;
	}
	return $word;
}

sub _get_pname {
	my $self	= shift;

	my $prefix	= '';
	if ($self->{buffer} =~ /^$r_nameStartChar_minus_underscore/) {
		my @parts;
		unless ($self->{buffer} =~ /^$r_nameStartChar_minus_underscore/o) {
			$self->_throw_error("Expected: name");
		}
		my $nsc = substr($self->{buffer}, 0, $+[0]);
		$self->_read_word($nsc);
		push(@parts, $nsc);
		while ($self->{buffer} =~ /^$r_nameChar_test/) {
			my $nc;
			if ($self->{buffer} =~ /^$r_nameStartChar/) {
				$nc	= $self->_get_char();
			} else {
				unless ($self->{buffer} =~ /^$r_nameChar_extra/o) {
					$self->_error("Expected: nameStartChar");
				}
				$nc	= $self->_get_char();
			}
			push(@parts, $nc);
		}
		$prefix	= join('', @parts);
	}
	$self->_get_char_safe(':');
	if ($self->{buffer} =~ /^$r_nameStartChar/) {
		unless ($self->{buffer} =~ /^${r_nameStartChar}(?:${r_nameStartChar}|${r_nameChar_extra})*/o) {
			$self->_error("Expected: name");
		}
		my $name	= substr($self->{buffer}, 0, $+[0]);
		$self->_read_word($name);
		return $self->new_token(PREFIXNAME, $prefix, $name);
	} else {
		return $self->new_token(PREFIXNAME, $prefix);
	}
}

sub _get_iriref {
	my $self	= shift;
	$self->_get_char_safe(q[<]);
	my $iri	= '';
	while (1) {
		my $c	= $self->_peek_char;
		last unless defined($c);
		if (substr($self->{buffer}, 0, 1) eq '\\') {
			$self->_get_char_safe('\\');
			my $esc	= $self->_get_char;
			given ($esc) {
				when('\\'){ $iri .= "\\" }
				when('"'){ $iri .= '"' }
				when('r'){ $iri .= "\r" }
				when('t'){ $iri .= "\t" }
				when('n'){ $iri .= "\n" }
				when('>'){ $iri .= ">" }
				when('U'){
					my $codepoint	= $self->_read_length(8);
					unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
						$self->throw_error("Bad unicode escape codepoint '$codepoint'");
					}
					$iri .= chr(hex($codepoint));
				}
				when('u'){
					my $codepoint	= $self->_read_length(4);
					unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
						$self->throw_error("Bad unicode escape codepoint '$codepoint'");
					}
					$iri .= chr(hex($codepoint));
				}
				default {
					$self->throw_error("Unrecognized iri escape '$esc'");
				}
			}
		} elsif ($self->{buffer} =~ /^[^>\\]+/) {
			$iri	.= $self->_read_length($+[0]);
		} elsif (substr($self->{buffer}, 0, 1) eq '>') {
			last;
		} else {
			$self->throw_error("Got '$c' while expecting IRI character");
		}
	}
	$self->_get_char_safe(q[>]);
	return $self->new_token(IRI, $iri);
}

sub _get_bnode {
	my $self	= shift;
	$self->_read_word('_:');
	unless ($self->{buffer} =~ /^${r_nameStartChar}(?:${r_nameStartChar}|${r_nameChar_extra})*/o) {
		$self->_error("Expected: name");
	}
	my $name	= substr($self->{buffer}, 0, $+[0]);
	$self->_read_word($name);
	return $self->new_token(BNODE, $name);
}

sub _get_number {
	my $self	= shift;
	if ($self->{buffer} =~ /^${r_double}/) {
		return $self->new_token(DOUBLE, $self->_read_length($+[0]));
	} elsif ($self->{buffer} =~ /^${r_decimal}/) {
		return $self->new_token(DECIMAL, $self->_read_length($+[0]));
	} elsif ($self->{buffer} =~ /^${r_integer}/) {
		return $self->new_token(INTEGER, $self->_read_length($+[0]));
	} else {
		$self->_throw_error("Expected number");
	}
}

sub _get_comment {
	my $self	= shift;
	$self->_get_char_safe('#');
	my $comment	= '';
	my $c		= $self->_peek_char;
	while (length($c) and $c !~ /[\r\n]/) {
		$comment	.= $self->_get_char;
		$c			= $self->_peek_char;
	}
	if (length($c) and $c =~ /[\r\n]/) {
		$self->_get_char;
	}
	return $self->new_token(COMMENT, $comment);
}

sub _get_literal {
	my $self	= shift;
	my $c		= $self->_peek_char();
	$self->_get_char_safe(q["]);
	if (substr($self->{buffer}, 0, 2) eq q[""]) {
		# #x22 #x22 #x22 lcharacter* #x22 #x22 #x22
		$self->_read_word(q[""]);
		
		my $quote_count	= 0;
		my $string	= '';
		while (1) {
			if (length($self->{buffer}) == 0) {
				$self->fill_buffer;
				if (length($self->{buffer}) == 0) {
					$self->_throw_error("Found EOF in string literal");
				}
			}
			if (substr($self->{buffer}, 0, 1) eq '"') {
				my $c	= $self->_get_char;
				$quote_count++;
				if ($quote_count == 3) {
					last;
				}
			} else {
				if ($quote_count) {
					$string	.= '"' foreach (1..$quote_count);
					$quote_count	= 0;
				}
				if (substr($self->{buffer}, 0, 1) eq '\\') {
					my $c	= $self->_get_char;
# 					$self->_get_char_safe('\\');
					my $esc	= $self->_get_char_fill_buffer;
					given ($esc) {
						when('\\'){ $string .= "\\" }
						when('"'){ $string .= '"' }
						when('r'){ $string .= "\r" }
						when('t'){ $string .= "\t" }
						when('n'){ $string .= "\n" }
						when('>'){ $string .= ">" }
						when('U'){
							my $codepoint	= $self->_read_length(8);
							unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
								$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
							}
							$string .= chr(hex($codepoint));
						}
						when('u'){
							my $codepoint	= $self->_read_length(4);
							unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
								$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
							}
							$string .= chr(hex($codepoint));
						}
						default {
							$self->_throw_error("Unrecognized string escape '$esc'");
						}
					}
				} else {
					$self->{buffer}	=~ /^[^"\\]+/;
					$string	.= $self->_read_length($+[0]);
				}
			}
		}
		return $self->new_token(STRING3D, $string);
	} else {
		### #x22 scharacter* #x22
		my $string	= '';
		while (1) {
			if (substr($self->{buffer}, 0, 1) eq '\\') {
				my $c	= $self->_peek_char;
				$self->_get_char_safe('\\');
				my $esc	= $self->_get_char;
				given ($esc) {
					when('\\'){ $string .= "\\" }
					when('"'){ $string .= '"' }
					when('r'){ $string .= "\r" }
					when('t'){ $string .= "\t" }
					when('n'){ $string .= "\n" }
					when('>'){ $string .= ">" }
					when('U'){
						my $codepoint	= $self->_read_length(8);
						unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
							$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
						}
						$string .= chr(hex($codepoint));
					}
					when('u'){
						my $codepoint	= $self->_read_length(4);
						unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
							$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
						}
						$string .= chr(hex($codepoint));
					}
					default {
						$self->_throw_error("Unrecognized string escape '$esc'");
					}
				}
			} elsif ($self->{buffer} =~ /^[^"\\]+/) {
				$string	.= $self->_read_length($+[0]);
			} elsif (substr($self->{buffer}, 0, 1) eq '"') {
				last;
			} else {
				$self->_throw_error("Got '$c' while expecting string character");
			}
		}
		$self->_get_char_safe(q["]);
		return $self->new_token(STRING1D, $string);
	}
}

sub _get_keyword {
	my $self	= shift;
	$self->_get_char_safe('@');
	if ($self->{buffer} =~ /^base/) {
		$self->_read_word('base');
		return $self->new_token(BASE);
	} elsif ($self->{buffer} =~ /^prefix/) {
		$self->_read_word('prefix');
		return $self->new_token(PREFIX);
	} else {
		if ($self->{buffer} =~ /^[a-z]+(-[a-z0-9]+)*\b/) {
			my $lang	= $self->_read_length($+[0]);
			return $self->new_token(LANG, $lang);
		} else {
			$self->_throw_error("Expected keyword or language tag");
		}
	}
}

sub _throw_error {
	my $self	= shift;
	my $error	= shift;
	my $line	= $self->start_line;
	my $col		= $self->start_column;
# 	Carp::cluck "$line:$col: $error: " . Dumper($self->{buffer});
	RDF::Trine::Error::ParserError::Positioned->throw(
		-text => "$error at $line:$col",
		-value => [$line, $col],
	);
}

__PACKAGE__->meta->make_immutable;

1;

__END__

=end private

=back

=head1 BUGS

Please report any bugs or feature requests to through the GitHub web interface
at L<https://github.com/kasei/perlrdf/issues>.

=head1 AUTHOR

Gregory Todd Williams  C<< <gwilliams@cpan.org> >>

=head1 COPYRIGHT

Copyright (c) 2006-2012 Gregory Todd Williams. This
program is free software; you can redistribute it and/or modify it under
the same terms as Perl itself.

=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)