/* -*- Mode: c; c-basic-offset: 2 -*-
*
* n3_lexer.l - Raptor Notation 3 lexer - making tokens for grammar generator
*
* Copyright (C) 2003-2006, David Beckett http://purl.org/net/dajobe/
* Copyright (C) 2003-2005, University of Bristol, UK http://www.bristol.ac.uk/
*
* This package is Free Software and part of Redland http://librdf.org/
*
* It is licensed under the following three licenses as alternatives:
* 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
* 2. GNU General Public License (GPL) V2 or any newer version
* 3. Apache License, V2.0 or any newer version
*
* You may not use this file except in compliance with at least one of
* the above three licenses.
*
* See LICENSE.html or LICENSE.txt at the top of this package for the
* complete terms and further detail along with the license texts for
* the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
*
*
* This is an incomplete Notation 3 parser.
*
* Generating the C files from this source, rather than using the
* shipped n3_lexer.c/.h, requires a patched version of flex 2.5.31 such
* as the one available in Debian GNU/Linux. Details are given below,
* near the %option descriptions.
*
*/
/* recognise 8-bits */
%option 8bit
%option warn nodefault
/* all symbols prefixed by this */
%option prefix="n3_lexer_"
/* This is not needed, flex is invoked -on3_lexer.c */
/* %option outfile="n3_lexer.c" */
/* Emit a C header file for prototypes
* Only available in flex 2.5.13 or newer.
* It was renamed to header-file in flex 2.5.19
*/
%option header-file="n3_lexer.h"
/* Do not emit #include <unistd.h>
* Only available in flex 2.5.7 or newer.
* Broken in flex 2.5.31 without patches.
*/
%option nounistd
/* Never interactive */
/* No isatty() check */
%option never-interactive
/* Batch scanner */
%option batch
/* Never use yyunput */
%option nounput
%option reentrant
/* definitions */
%{
/* NOTE: These headers are NOT included here but are inserted by
* fix-flex since otherwise they appear far too late in the generated C
*/
/*
#ifdef HAVE_CONFIG_H
#include <raptor_config.h>
#endif
#ifdef WIN32
#include <win32_raptor_config.h>
#endif
*/
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdarg.h>
#ifdef HAVE_ERRNO_H
#include <errno.h>
#endif
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#include <raptor.h>
#include <raptor_internal.h>
#include <n3_parser.h>
#include <n3_common.h>
/* Prototypes */
static unsigned char *n3_copy_token(unsigned char *text, size_t len);
static unsigned char *n3_copy_string_token(raptor_parser* rdf_parser, unsigned char *text, size_t len, int delim);
void n3_lexer_syntax_error(void* rdf_parser, const char *msg, ...) RAPTOR_PRINTF_FORMAT(2, 3);
#ifdef RAPTOR_DEBUG
const char * n3_token_print(int token, YYSTYPE *lval);
#endif
int n3_lexer_lex (YYSTYPE *n3_parser_lval, yyscan_t yyscanner);
#define YY_DECL int n3_lexer_lex (YYSTYPE *n3_parser_lval, yyscan_t yyscanner)
#ifdef __cplusplus
#define INPUT_FN yyinput
#else
#define INPUT_FN input
#endif
/* Missing n3_lexer.c/h prototypes */
int n3_lexer_get_column(yyscan_t yyscanner);
void n3_lexer_set_column(int column_no , yyscan_t yyscanner);
%}
/* from SPARQL */
/* A bare name: an ASCII letter followed by letters, digits, '-' or '_' */
LANGUAGETOKEN [A-Za-z][-A-Z_a-z0-9]*
/* First character of a name: ASCII letter, backslash or any byte 0x80-0xff */
NCCHAR1 [A-Za-z\\\x80-\xff]
/* Later name characters: the above plus '-', '_' and digits */
NCCHAR {NCCHAR1}|"-"|"_"|[0-9]
/* Prefix part of a qname: '.' may appear inside it but not as its last character */
NCNAME_PREFIX {NCCHAR1}(({NCCHAR}|".")*{NCCHAR})?
/* Local part of a qname: as NCNAME_PREFIX but may also start with '_' */
NCNAME ("_"|{NCCHAR1})(({NCCHAR}|".")*{NCCHAR})?
/* prefix:local - both the prefix and the local part are optional */
QNAME {NCNAME_PREFIX}?":"{NCNAME}?
/* Blank node label: "_:" followed by a name */
BNAME "_:"{NCNAME}
/* similar to SPARQL but no need for <= check here */
QUOTEDURI \<[^\>]*\>
/* Digits containing a '.', such as 1.5, 1. or .5 */
DECIMAL [0-9]+"."[0-9]*|"."[0-9]+
/* A number carrying an exponent, with or without a '.': 1.5e3, .5e3 or 1e3 */
DOUBLE [0-9]+"."[0-9]*{EXPONENT}|"."([0-9])+{EXPONENT}|([0-9])+{EXPONENT}
/* 'e' or 'E', an optional sign, then digits */
EXPONENT [eE][+-]?[0-9]+
/* Exclusive start conditions: PREF is entered after "@prefix" and LITERAL
* after an opening triple quote; only rules tagged with them apply there.
*/
%x PREF LITERAL
%%
%{
  /* rules */

  /* Locals of the generated scanning function, set up each time it is
   * entered: the raptor parser registered as the scanner's "extra" data
   * and that parser's N3-specific context.  The actions below use both.
   *
   * They are enclosed in %{ %} because flex treats any unindented text
   * in the rules section as a pattern; the enclosure keeps these
   * declarations as C code regardless of how the file is indented.
   */
  raptor_parser *rdf_parser=(raptor_parser*)yyextra;
  raptor_n3_parser* n3_parser=(raptor_n3_parser*)rdf_parser->context;
%}
\r\n|\r|\n { /* a CRLF, CR or LF line ending advances the line number reported with errors */ n3_parser->lineno++; }
[\ \t\v]+ { /* skip spaces, tabs and vertical tabs between tokens */ }
"a" { /* the one-letter keyword 'a' */ return A; }
"." { return DOT; }
"," { return COMMA; }
";" { return SEMICOLON; }
"[" { return LEFT_SQUARE; }
"]" { return RIGHT_SQUARE; }
"@prefix" { /* the prefix name that follows is scanned by the <PREF> rules */ BEGIN(PREF); return PREFIX; }
"@" { /* any '@' that does not start "@prefix" */ return AT; }
"^^" { return HAT; }
"(" { return LEFT_ROUND; }
")" { return RIGHT_ROUND; }
'([^'\\\n\r]|\\[^\n\r])*' { /* Single-quoted string on one line: the quotes are stripped and the
* content is decoded with the quote character actually used (') as
* the delimiter, matching the backslash-quote escape this pattern
* accepts.
*/
n3_parser_lval->string=n3_copy_string_token(rdf_parser, (unsigned char*)yytext+1, yyleng-2, '\''); /* ' */
return STRING_LITERAL; }
\"([^"\\\n\r]|\\[^\n\r])*\" { /* Double-quoted string on one line, handled the same way */
n3_parser_lval->string=n3_copy_string_token(rdf_parser, (unsigned char*)yytext+1, yyleng-2, '"'); /* ' */
return STRING_LITERAL; }
\"\"\" { /* Opening triple quote: the content is collected piece by piece in a
* string buffer by the <LITERAL> rules until the closing triple quote.
*/
BEGIN(LITERAL);
n3_parser->sb=raptor_new_stringbuffer();
}
<LITERAL>\"\"\" {
/* Closing triple quote: hand the collected content to the parser as a
* newly allocated NUL-terminated string.  The string is left NULL if
* the allocation fails.
*/
size_t len;
BEGIN(INITIAL);
len=raptor_stringbuffer_length(n3_parser->sb);
n3_parser_lval->string=(unsigned char *)RAPTOR_MALLOC(cstring, len+1);
if(n3_parser_lval->string) {
raptor_stringbuffer_copy_to_string(n3_parser->sb, (unsigned char*)n3_parser_lval->string, len);
n3_parser_lval->string[len]='\0';
}
raptor_free_stringbuffer(n3_parser->sb);
n3_parser->sb=NULL;
return STRING_LITERAL; }
<LITERAL>\"|(\\[^uU]|\\u....|\\U........|[^\"]|\n)* {
/* A piece of literal content: either a lone quote or a run of other
* characters and escapes.  It is decoded and appended to the buffer;
* a decoding failure abandons the literal and ends the scan.
*/
if(raptor_stringbuffer_append_turtle_string(n3_parser->sb, (unsigned char*)yytext, yyleng, '"', (raptor_simple_message_handler)n3_lexer_syntax_error, rdf_parser)) { /* " */
BEGIN(INITIAL);
raptor_free_stringbuffer(n3_parser->sb);
n3_parser->sb=NULL;
yyterminate();
}
}
<LITERAL><<EOF>> {
/* Input ended before the closing triple quote.  End of input is never
* delivered as a character in yytext, so it has to be caught by an
* explicit end-of-file rule; comparing *yytext with EOF instead would
* also misfire on a 0xFF byte where char is signed.
*/
BEGIN(INITIAL);
n3_syntax_error(rdf_parser, "End of file in middle of literal");
if(n3_parser->sb) {
raptor_free_stringbuffer(n3_parser->sb);
n3_parser->sb=NULL;
}
yyterminate();
}
{BNAME} { /* blank node label: the leading "_:" is dropped from the copy */
n3_parser_lval->string=n3_copy_token((unsigned char*)yytext+2, yyleng-2);
return BLANK_LITERAL; }
{QNAME} { /* prefix:local name, converted to a URI by n3_qname_to_uri() */
n3_parser_lval->uri=n3_qname_to_uri(rdf_parser, (unsigned char*)yytext, yyleng);
return QNAME_LITERAL; }
[-+]?{DECIMAL} { /* decimals are passed on as their source text, sign included */
n3_parser_lval->string=n3_copy_token((unsigned char*)yytext, yyleng);
return DECIMAL_LITERAL;
}
[-+]?{DOUBLE} {
/* numbers with an exponent are converted to a C double here */
double d;
int n;
n=sscanf((const char*)yytext, "%lf", &d);
if(n != 1) {
n3_syntax_error(rdf_parser, "N3 syntax error - Illegal floating point constant %s", yytext);
yyterminate();
}
n3_parser_lval->floating=d;
return FLOATING_LITERAL;
}
[-+]?[0-9]+ { /* NOTE(review): atoi() cannot report a value that is out of range */
n3_parser_lval->integer=atoi(yytext);
return INTEGER_LITERAL; }
<PREF>[\ \t\v]+ { /* eat up leading whitespace */ }
<PREF>{NCNAME_PREFIX}":" { /* prefix name, copied with its trailing ':' */
n3_parser_lval->string=n3_copy_token((unsigned char*)yytext, yyleng);
BEGIN(INITIAL);
return IDENTIFIER; }
<PREF>":" { BEGIN(INITIAL);
/* empty prefix: a length of 0 makes n3_copy_token() copy all of
* yytext, so the string returned is ":"
*/
n3_parser_lval->string=n3_copy_token((unsigned char*)yytext, 0);
return IDENTIFIER; }
<PREF>(.|\n) { BEGIN(INITIAL);
/* Anything else after "@prefix" is an error.  End of input never
* reaches this rule as a character, so there is no EOF test here:
* such a test would make a stray 0xFF byte end the scan silently
* where char is signed.
*/
n3_syntax_error(rdf_parser, "syntax error at '%c' - @prefix name must end in :", *yytext);
yyterminate(); }
{QUOTEDURI} { /* <...> URI reference; "<>" with nothing inside yields a copy of the base URI */ if(yyleng == 2)
n3_parser_lval->uri=raptor_uri_copy(rdf_parser->base_uri);
else {
/* overwrite the closing '>' so that yytext+1 is the NUL-terminated
* text between the brackets, then build the URI from it and the
* base URI
*/
yytext[yyleng-1]='\0';
n3_parser_lval->uri=raptor_new_uri_relative_to_base(rdf_parser->base_uri, (const unsigned char*)yytext+1);
}
return URI_LITERAL; }
{LANGUAGETOKEN} { /* any other bare name is passed on as a copied IDENTIFIER string */ n3_parser_lval->string=n3_copy_token((unsigned char*)yytext, yyleng);
return IDENTIFIER; }
\#[^\r\n]* { /* # comment: everything up to, but not including, the line ending.
* The line ending itself is then matched by the newline rule, which
* counts the line, so a comment on a final line that has no
* trailing newline is accepted as well.
*/
}
. { /* Any character no other rule accepts.  End of input is never seen
* here as a character, so there is no EOF test: such a test would
* make a stray 0xFF byte end the scan silently where char is signed.
*/
n3_syntax_error(rdf_parser, "syntax error at '%c'", *yytext);
yyterminate();
}
%%
/* user code */
/* End-of-input hook called by the scanner: a non-zero result tells it
 * that there is no further input to continue with.
 */
int
yywrap(yyscan_t yyscanner)
{
  return 1;
}
/*
 * n3_copy_token - make a NUL-terminated heap copy of token text
 * @text: start of the text to copy
 * @len: number of bytes to copy, or 0 to copy all of @text up to its
 *   terminating NUL
 *
 * Return value: a newly allocated string (released elsewhere with
 * RAPTOR_FREE), or NULL if the allocation failed.
 */
static unsigned char *
n3_copy_token(unsigned char *text, size_t len)
{
  unsigned char *s;

  if(!len)
    len=strlen((const char*)text);

  s=(unsigned char *)RAPTOR_MALLOC(cstring, len+1);
  if(!s)
    return NULL;

  /* the length is known, so copy exactly that many bytes and terminate */
  memcpy(s, text, len);
  s[len]='\0';

  return s;
}
/*
 * n3_copy_string_token - decode the content of a quoted string token
 * @rdf_parser: parser passed to the error handler
 * @string: text between the quotes
 * @len: length of @string in bytes; 0 yields an empty string
 * @delim: the quote character that delimited the string
 *
 * The text is run through raptor_stringbuffer_append_turtle_string(),
 * which reports problems via n3_lexer_syntax_error(), and the result is
 * copied into a new NUL-terminated string.
 *
 * Return value: a newly allocated string, or NULL if decoding or an
 * allocation failed.
 */
static unsigned char *
n3_copy_string_token(raptor_parser* rdf_parser,
                     unsigned char *string, size_t len, int delim)
{
  raptor_stringbuffer* sb=NULL;
  unsigned char *result;

  if(len) {
    sb=raptor_new_stringbuffer();
    if(!sb)
      return NULL;

    if(raptor_stringbuffer_append_turtle_string(sb, string, len, delim,
                                                (raptor_simple_message_handler)n3_lexer_syntax_error,
                                                rdf_parser)) {
      raptor_free_stringbuffer(sb);
      return NULL;
    }

    /* from here on len is the decoded length */
    len=raptor_stringbuffer_length(sb);
  }

  result=(unsigned char *)RAPTOR_MALLOC(cstring, len+1);
  if(result) {
    if(sb)
      raptor_stringbuffer_copy_to_string(sb, result, len+1);
    result[len]='\0';
  }

  /* the buffer is released whether or not the allocation succeeded */
  if(sb)
    raptor_free_stringbuffer(sb);

  return result;
}
/*
 * n3_lexer_syntax_error - report a lexer error through the parser
 * @ctx: the raptor_parser, passed as void* so the function can be used
 *   as a message handler callback
 * @message: printf-style format string
 *
 * Records the lexer's current line number (and, when
 * RAPTOR_N3_USE_ERROR_COLUMNS is defined, column) in the parser's
 * locator and then passes the message to raptor_parser_error_varargs().
 */
void
n3_lexer_syntax_error(void* ctx, const char *message, ...)
{
  raptor_parser* rdf_parser=(raptor_parser *)ctx;
  raptor_n3_parser* n3_parser=(raptor_n3_parser*)rdf_parser->context;
  va_list arguments;

  rdf_parser->locator.line=n3_parser->lineno;
#ifdef RAPTOR_N3_USE_ERROR_COLUMNS
  /* there is no yyscanner variable in this function; the scanner is
   * reached through the N3 parser context that owns it
   */
  rdf_parser->locator.column=n3_lexer_get_column(n3_parser->scanner);
#endif

  va_start(arguments, message);
  raptor_parser_error_varargs(rdf_parser, message, arguments);
  va_end(arguments);
}
#ifdef RAPTOR_DEBUG
/*
 * n3_token_print - describe a token for debugging output
 * @token: token code returned by the lexer; 0 is shown as end of file
 * @lval: semantic value that accompanied the token
 *
 * Tokens carrying a value are formatted into a static buffer, so the
 * result is only valid until the next call.  Formatting is bounded by
 * the buffer size, so long values are truncated, and missing strings
 * or URIs are shown as empty.
 *
 * Return value: a constant or static string; an unknown token code is
 * reported on stderr and aborts.
 */
const char *
n3_token_print(int token, YYSTYPE *lval)
{
  static char buffer[2048];

  if(!token)
    return "<<EOF>>";

  switch(token) {
    case PREFIX:
      return "PREFIX";

    case A:
      return "A";

    case DOT:
      return "DOT";

    case COMMA:
      return "COMMA";

    case SEMICOLON:
      return "SEMICOLON";

    case LEFT_SQUARE:
      return "LEFT_SQUARE";

    case RIGHT_SQUARE:
      return "RIGHT_SQUARE";

    case LEFT_ROUND:
      return "LEFT_ROUND";

    case RIGHT_ROUND:
      return "RIGHT_ROUND";

    case AT:
      return "AT";

    case HAT:
      return "HAT";

    case STRING_LITERAL:
      snprintf(buffer, sizeof(buffer), "STRING_LITERAL(%s)",
               (lval->string ? (char*)lval->string : ""));
      return buffer;

    case URI_LITERAL:
      snprintf(buffer, sizeof(buffer), "URI_LITERAL(%s)",
               (lval->uri ? (char*)raptor_uri_as_string(lval->uri) : ""));
      return buffer;

    case BLANK_LITERAL:
      snprintf(buffer, sizeof(buffer), "BLANK_LITERAL(%s)",
               (lval->string ? (char*)lval->string : ""));
      return buffer;

    case QNAME_LITERAL:
      snprintf(buffer, sizeof(buffer), "QNAME_LITERAL(%s)",
               (lval->uri ? (char*)raptor_uri_as_string(lval->uri) : ""));
      return buffer;

    case INTEGER_LITERAL:
      snprintf(buffer, sizeof(buffer), "INTEGER_LITERAL(%d)", lval->integer);
      return buffer;

    case FLOATING_LITERAL:
      snprintf(buffer, sizeof(buffer), "FLOATING_LITERAL(%1g)", lval->floating);
      return buffer;

    case IDENTIFIER:
      snprintf(buffer, sizeof(buffer), "IDENTIFIER(%s)",
               (lval->string ? (char*)lval->string : ""));
      return buffer;

    case DECIMAL_LITERAL:
      snprintf(buffer, sizeof(buffer), "DECIMAL_LITERAL(%s)",
               (lval->string ? (char*)lval->string : ""));
      return buffer;

    case ERROR_TOKEN:
      return "ERROR";

    default:
      fprintf(stderr, "n3_token_print: UNKNOWN token %d - add a new case\n", token);
      abort();
  }
}
#endif
#ifdef STANDALONE
/*
 * n3_token_free - release the value attached to a token
 * @token: token code returned by the lexer; 0 has nothing to free
 * @lval: semantic value that accompanied the token
 *
 * Tokens whose value is a heap string (including DECIMAL_LITERAL, which
 * the lexer fills with a copied string) have it freed; tokens whose
 * value is a URI have it released.  Other tokens own nothing.
 */
static void
n3_token_free(int token, YYSTYPE *lval)
{
  if(!token)
    return;

  switch(token) {
    case STRING_LITERAL:
    case BLANK_LITERAL:
    case IDENTIFIER:
    case DECIMAL_LITERAL:
      if(lval->string)
        RAPTOR_FREE(cstring, lval->string);
      break;

    case URI_LITERAL:
    case QNAME_LITERAL:
      if(lval->uri)
        raptor_free_uri(lval->uri);
      break;

    default:
      break;
  }
}
/*
 * Standalone test driver: tokenises the file named by the first
 * argument, or standard input when there is none, printing each token
 * as it is returned.
 *
 * Return value: 1 if the last token was ERROR_TOKEN, otherwise 0; exits
 * with status 1 if the input file cannot be opened.
 */
int
main(int argc, char *argv[])
{
  raptor_parser rdf_parser;
  raptor_n3_parser n3_parser;
  yyscan_t scanner;
  int token=EOF;
  FILE *fh;
  YYSTYPE lval;
  const unsigned char *uri_string;
  char *filename=NULL;

  raptor_init();

  if(argc > 1) {
    filename=argv[1];
    fh=fopen(filename, "r");
    if(!fh) {
      fprintf(stderr, "%s: Cannot open file %s - %s\n", argv[0], filename,
              strerror(errno));
      exit(1);
    }
  } else {
    filename="<stdin>";
    fh=stdin;
  }

  memset(&rdf_parser, 0, sizeof(raptor_parser));
  memset(&n3_parser, 0, sizeof(raptor_n3_parser));

  yylex_init(&n3_parser.scanner);
  scanner=n3_parser.scanner;
  n3_lexer_set_in(fh, scanner);
  n3_lexer_set_extra(&rdf_parser, scanner);

  /* Initialise enough of the parser and locator to get error messages */
  rdf_parser.context=&n3_parser;
  n3_parser.lineno=1;
  rdf_parser.locator.file=filename;
  rdf_parser.locator.column= -1;

  /* the lexer needs a base URI for URI tokens; it is derived from the
   * file name
   */
  uri_string=raptor_uri_filename_to_uri_string(filename);
  rdf_parser.base_uri=raptor_new_uri(uri_string);
  RAPTOR_FREE(cstring, (void*)uri_string);

  while(1) {
    memset(&lval, 0, sizeof(YYSTYPE));
    if(n3_lexer_get_text(scanner) != NULL)
      printf("yyinput '%s'\n", n3_lexer_get_text(scanner));

    token=yylex(&lval, scanner);
#ifdef RAPTOR_DEBUG
    printf("token %s\n", n3_token_print(token, &lval));
#else
    printf("token %d\n", token);
#endif
    n3_token_free(token, &lval);

    if(!token || token == EOF || token == ERROR_TOKEN)
      break;
  }

  yylex_destroy(scanner);
  raptor_free_uri(rdf_parser.base_uri);

  /* close the input if it was opened here; stdin is left alone */
  if(fh != stdin)
    fclose(fh);

  raptor_finish();

  if(token == ERROR_TOKEN)
    return 1;

  return 0;
}
#endif