The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/* -*- Mode: c; c-basic-offset: 2 -*-
 *
 * sparql_lexer.l - Rasqal SPARQL lexer - making tokens for sparql grammar generator
 *
 * $Id: sparql_lexer.l 11543 2006-10-25 05:58:44Z dajobe $
 *
 * Copyright (C) 2004-2006, David Beckett http://purl.org/net/dajobe/
 * Copyright (C) 2004-2005, University of Bristol, UK http://www.bristol.ac.uk/
 * 
 * This package is Free Software and part of Redland http://librdf.org/
 * 
 * It is licensed under the following three licenses as alternatives:
 *   1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
 *   2. GNU General Public License (GPL) V2 or any newer version
 *   3. Apache License, V2.0 or any newer version
 * 
 * You may not use this file except in compliance with at least one of
 * the above three licenses.
 * 
 * See LICENSE.html or LICENSE.txt at the top of this package for the
 * complete terms and further detail along with the license texts for
 * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
 * 
 * To generate the C files from this source, rather than using the
 * shipped sparql_lexer.c/.h, you need a patched version of flex 2.5.31
 * such as the one available in Debian GNU/Linux.  Details are below,
 * near the %option descriptions.
 *
 * SPARQL defined in http://www.w3.org/TR/rdf-sparql-query/
 *   http://www.w3.org/TR/2005/WD-rdf-sparql-query-20050419/
 *
 * Editor's draft of above http://www.w3.org/2001/sw/DataAccess/rq23/
 */


/* recognise 8-bits */
%option 8bit
%option warn nodefault

/* all symbols prefixed by this */
%option prefix="sparql_lexer_"

/* This is not needed, flex is invoked -osparql_lexer.c */
%option outfile="sparql_lexer.c"

/* Emit a C header file for prototypes
 * Only available in flex 2.5.13 or newer.
 * It was renamed to header-file in flex 2.5.19
 */
%option header-file="sparql_lexer.h"

/* Do not emit #include <unistd.h>
 * Only available in flex 2.5.7 or newer.
 * Broken in flex 2.5.31 without patches.
 */
%option nounistd

/* Never interactive */
/*  No isatty() check */
%option never-interactive

/* Batch scanner */
%option batch

/* Never use yyunput */
%option nounput

%option reentrant


%x ID PREF LITERAL LITERAL2

  /* definitions */

%{

/* NOTE: These headers are NOT included here. They are inserted by fix-flex
 * since otherwise it appears far too late in the generated C
 */

/*
#ifdef HAVE_CONFIG_H
#include <rasqal_config.h>
#endif

#ifdef WIN32
#include <win32_rasqal_config.h>
#endif
*/

#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <rasqal.h>
#include <rasqal_internal.h>

#include <sparql_parser.h>

#include <sparql_common.h>



static int sparql_skip_c_comment(rasqal_query *rq);

/*
 * Additional SPARQL restrictions that rasqal_sparql_name_check()
 * applies to a name once it has passed the XML 1.1 name check.
 *
 * Primitive checks (one bit each, may be OR-ed together):
 *   SPARQL_NAME_CHECK_NO_UL_FIRST   reject '_' as the first character
 *   SPARQL_NAME_CHECK_NO_DOT_LAST   reject '.' as the last character
 *   SPARQL_NAME_CHECK_NO_DOT_MINUS  reject '.' and '-' anywhere
 *
 * Combinations used per token:
 *   SPARQL_NAME_CHECK_VARNAME (token VARNAME)
 *     NO_DOT_MINUS
 *   SPARQL_NAME_CHECK_PREFIX  (token NCNAME_PREFIX)
 *     NO_UL_FIRST and NO_DOT_LAST
 *   SPARQL_NAME_CHECK_NCNAME  (token NCNAME)
 *     NO_DOT_LAST
 */
typedef enum {
  SPARQL_NAME_CHECK_NO_UL_FIRST  = 1 << 0,
  SPARQL_NAME_CHECK_NO_DOT_LAST  = 1 << 1,
  SPARQL_NAME_CHECK_NO_DOT_MINUS = 1 << 2,

  SPARQL_NAME_CHECK_VARNAME = SPARQL_NAME_CHECK_NO_DOT_MINUS,
  SPARQL_NAME_CHECK_PREFIX  = (SPARQL_NAME_CHECK_NO_UL_FIRST |
                               SPARQL_NAME_CHECK_NO_DOT_LAST),
  SPARQL_NAME_CHECK_NCNAME  = SPARQL_NAME_CHECK_NO_DOT_LAST
} sparql_name_check_flags;


/* Forward declarations of the helpers defined in the user code section
 * at the end of this file.
 */
static int rasqal_sparql_name_check(const unsigned char *string, size_t length, sparql_name_check_flags check_flags);
static unsigned char *sparql_copy_name(rasqal_query *rq, const unsigned char *text, size_t len, sparql_name_check_flags check_flags);
static raptor_uri* sparql_copy_qname(rasqal_query *rq, const unsigned char *text, size_t len);
static int sparql_copy_string_token(rasqal_query *rq, YYSTYPE* lval, const unsigned char *text, size_t len, int delim);

#ifdef RASQAL_DEBUG
/* Debug-only token pretty printer */
const char * sparql_token_print(int token, YYSTYPE *lval);
#endif

/* Signature of the generated scanner function: besides the reentrant
 * scanner handle it receives the parser's semantic value to fill in.
 * YY_DECL makes flex emit the function with exactly this signature.
 */
int sparql_lexer_lex (YYSTYPE *sparql_parser_lval, yyscan_t yyscanner);
#define YY_DECL int sparql_lexer_lex (YYSTYPE *sparql_parser_lval, yyscan_t yyscanner)

/* Name of the scanner's read-one-character routine, used by
 * sparql_skip_c_comment(); it differs between C++ and C builds.
 */
#ifdef __cplusplus
#define INPUT_FN yyinput
#else
#define INPUT_FN input
#endif

/* Remove the re-fill function since it should never be called
 * (the standalone driver in this file hands the scanner one complete
 * in-memory buffer).
 */
#define YY_INPUT(buf,result,max_size) { return YY_NULL; }


/* Missing sparql_lexer.c/h prototypes */
int sparql_lexer_get_column(yyscan_t yyscanner);
void sparql_lexer_set_column(int  column_no , yyscan_t yyscanner);


%}

/* Language tag that may follow '@' on a string literal: one letter
 * followed by any number of letters, digits, '-' or '_'.
 */
LANGUAGETOKEN [A-Za-z][-A-Z_a-z0-9]*


/*
 * rq23 is http://www.w3.org/2001/sw/DataAccess/rq23/
 * CVS ID 1.420 2005/07/12 15:38:40
 */


/* [85] NCCHAR1p ::= [A-Z] | [a-z] | [#x00C0-#x00D6] | [#x00D8-#x00F6] |
 *   [#x00F8-#x02FF] | [#x0370-#x037D] | [#x037F-#x1FFF] |
 *   [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] |
 *   [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
 *   [#x10000-#xEFFFF]
 *
 * This is an XML 1.1 NameStartChar
 *   http://www.w3.org/TR/2004/REC-xml11-20040204/#NT-NameStartChar
 * except
 *   No '_' allowed.
 *
 * The pattern below is a byte-level approximation: it accepts ASCII
 * letters, any byte in the range 0x80-0xff and a backslash.  The exact
 * rules are enforced afterwards by rasqal_sparql_name_check().
 * NOTE(review): the backslash is presumably admitted so that escape
 * sequences reach rasqal_escaped_name_to_utf8_string() - confirm.
 */
NCCHAR1p [A-Za-z\\\x80-\xff]

/* [86] NCCHAR1 ::= NCCHAR1p | '_'
 * This is an XML 1.1 NameStartChar
 *   http://www.w3.org/TR/2004/REC-xml11-20040204/#NT-NameStartChar
 */
NCCHAR1 [A-Za-z\\\x80-\xff_]

/* [87]  VARNAME ::= ( NCCHAR1 | _ ) 
 *   ( NCCHAR1 | [0-9] | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040] )*
 *
 * This is a Namespaces in XML 1.1 Name except:
 *   No '.' allowed.
 *   No '-' allowed.
 *
 * Note that the pattern below also accepts a digit as the first
 * character, which the production quoted above does not list.
 */
VARNAME ({NCCHAR1}|[0-9])({NCCHAR1}|[0-9])*

/* [88]  NCCHAR ::= 
 *   NCCHAR1 | '-' | [0-9] | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040]
 *
 * This is XML 1.1 NameChar
 *   http://www.w3.org/TR/2004/REC-xml11-20040204/#NT-NameChar
 * except:
 *   No '.' allowed.
 */
NCCHAR {NCCHAR1}|"-"|[0-9]

/* [89]  NCNAME_PREFIX ::= NCCHAR1p ((NCCHAR|".")* NCCHAR)?
 *
 * This is a Namespaces in XML 1.1 Name except:
 *   No '_' allowed as the first character.
 *   No '.' allowed as the last character.
 */
NCNAME_PREFIX {NCCHAR1p}(({NCCHAR}|".")*{NCCHAR})?

/* [90]  NCNAME ::= NCCHAR1 ((NCCHAR|".")* NCCHAR)?
 *
 * This is a Namespaces in XML 1.1 Name except:
 *   No '.' allowed as the last character.
 */
NCNAME {NCCHAR1}(({NCCHAR}|".")*{NCCHAR})?

/* rq23 [67] QNAME_NS ::= NCNAME_PREFIX? ':'
 * Inlined into the rule <PREF>{NCNAME_PREFIX}":" below 
 */

/* rq23 [68] QNAME ::= NCNAME_PREFIX? ':' NCNAME? */
QNAME {NCNAME_PREFIX}?":"{NCNAME}?

/* rq23 [69] BNODE_LABEL (renamed to BNAME) ::= '_:' NCNAME */
BNAME "_:"{NCNAME}


/* A URI in angle brackets, or the empty reference <>.
 * The initial char conditions are to ensure this doesn't grab < or <=
 */
QUOTEDURI \<[^<= ][^>]*\>|\<\>


/* DECIMAL: digits with a decimal point (fraction digits optional), or
 *   a leading point followed by digits.
 * DOUBLE: any of the decimal forms, or plain digits, followed by an
 *   exponent.
 * EXPONENT: 'e' or 'E', an optional sign, and at least one digit.
 */
DECIMAL [0-9]+"."[0-9]*|"."[0-9]+
DOUBLE [0-9]+"."[0-9]*{EXPONENT}|"."([0-9])+{EXPONENT}|([0-9])+{EXPONENT}
EXPONENT [eE][+-]?[0-9]+


%%
  /* rules */

  /* Locals of the generated scanner function.
   *   c   - scratch character used by the integer rule
   *   rq  - the query, taken from the scanner extra data
   *   rqe - the SPARQL engine context from rq->context; its lineno
   *         field is advanced by the newline and comment rules
   */
  int c;
  rasqal_query *rq=(rasqal_query*)yyextra;
  rasqal_sparql_query_engine *rqe=(rasqal_sparql_query_engine*)rq->context;


"//"[^\r\n]*(\r\n|\r|\n)	{ /* C++ comment */
        /* the pattern ends with exactly one line ending */
        rqe->lineno++;
}

"/*"	{ /* C comment: the rest of it is consumed by
           * sparql_skip_c_comment, which returns the number of
           * lines skipped or a negative value on EOF */
          int lines=sparql_skip_c_comment(rq);
          if(lines < 0)
            yyterminate();
          rqe->lineno += lines;
        }

\r\n|\r|\n     { rqe->lineno++; }

[\ \t\v]+   { /* eat up other whitespace */
	;
}

  /* Keywords.  Every letter is written as a two-case character class,
   * so keywords match case-insensitively.  PREFIX additionally enters
   * the PREF start condition so the prefix name that follows is lexed
   * by the PREF rules.  The ISURI rule accepts U or I as its third
   * letter, so ISIRI yields the same token.
   */
[Ss][Ee][Ll][Ee][Cc][Tt] { return SELECT; }
[Ff][Rr][Oo][Mm] { return FROM; }
[Ww][Hh][Ee][Rr][Ee] { return WHERE; }
[Pp][Rr][Ee][Ff][Ii][Xx] { BEGIN(PREF);
		return PREFIX; }
[Dd][Ee][Ss][Cc][Rr][Ii][Bb][Ee] { return DESCRIBE; }
[Cc][Oo][Nn][Ss][Tt][Rr][Uu][Cc][Tt] { return CONSTRUCT; }
[Aa][Ss][Kk] { return ASK; }
[Dd][Ii][Ss][Tt][Ii][Nn][Cc][Tt] { return DISTINCT; }
[Ll][Ii][Mm][Ii][Tt] { return LIMIT; }
[Uu][Nn][Ii][Oo][Nn] { return UNION; }
[Oo][Pp][Tt][Ii][Oo][Nn][Aa][Ll] { return OPTIONAL; }
[Bb][Aa][Ss][Ee] { return BASE; }
[Bb][Oo][Uu][Nn][Dd] { return BOUND; }
[Ss][Tt][Rr] { return STR; }
[Ll][Aa][Nn][Gg] { return LANG; }
[Dd][Aa][Tt][Aa][Tt][Yy][Pp][Ee] { return DATATYPE; }
[Ii][Ss][UuIi][Rr][Ii] { return ISURI; }
[Ii][Ss][Bb][Ll][Aa][Nn][Kk] { return ISBLANK; }
[Ii][Ss][Ll][Ii][Tt][Ee][Rr][Aa][Ll] { return ISLITERAL; }
[Gg][Rr][Aa][Pp][Hh] { return GRAPH; }
[Nn][Aa][Mm][Ee][Dd] { return NAMED; }
[Ff][Ii][Ll][Tt][Ee][Rr] { return FILTER; }
[Oo][Ff][Ff][Ss][Ee][Tt] { return OFFSET; }
[Oo][Rr][Dd][Ee][Rr] { return ORDER; }
[Bb][Yy] { return BY; }
[Rr][Ee][Gg][Ee][Xx] { return REGEX; }
[Aa][Ss][Cc] { return ASC; }
[Dd][Ee][Ss][Cc] { return DESC; } 
[Ll][Aa][Nn][Gg][Mm][Aa][Tt][Cc][Hh][Ee][Ss] { return LANGMATCHES; } 

  /* The keyword a is matched in lower case only */
"a" { return A; }

  /* Punctuation is returned as its own character code.  The two
   * variable markers also enter the ID start condition so that the
   * variable name is lexed by the ID rules.
   */
","      { return ','; } 
"("      { return '('; } 
")"      { return ')'; } 
"["       { return '['; }
"]"       { return ']'; }
"?"  { BEGIN(ID); return '?'; }
"$"  { BEGIN(ID); return '$'; }
"{"      { return '{'; } 
"}"      { return '}'; } 
"."      { return '.'; } 
";"      { return ';'; } 

"||"         { return SC_OR; }
"&&"         { return SC_AND; }

  /* Comparison operators.  The LT rule uses trailing context: a
   * less-than sign followed by a letter, an equals sign or a
   * greater-than sign is never returned as LT.
   */
"="            { return EQ; }
"!="            { return NEQ; }
"<"/[^A-Za-z=>]  { return LT; }
">"             { return GT; }
"<="         { return LE; }
">="         { return GE; }

"+"         { return '+'; }
"-"         { return '-'; }
"*"         { return '*'; }
"/"         { return '/'; }
"!"         { return '!'; }

[0-9]+[lL]?   { /* Decimal integer with an optional l or L suffix.
                 * The suffix class holds only the two letters; a
                 * quote character must not be part of it.  The value
                 * is range checked since it is stored as an int. */
                  long v;

                  c=yytext[yyleng-1];
                  if (c== 'l' || c == 'L')
                    yytext[yyleng-1]='\0';
                  errno=0;
                  v=strtol(yytext, NULL, 10);
                  if(errno == ERANGE || v > INT_MAX) {
                    sparql_syntax_error(rq, "SPARQL syntax error - Integer constant %s out of range", yytext);
                    yyterminate();
                  }
                  sparql_parser_lval->literal=rasqal_new_integer_literal(RASQAL_LITERAL_INTEGER, (int)v);
 		  return INTEGER_LITERAL; 
}

0[xX][0-9a-fA-F]+   { /* Hexadecimal integer.  One conversion handles
                       * both digit cases, so the prefix letter need
                       * not be inspected; the target is unsigned as
                       * the conversion requires. */
                      unsigned int value;
                      int n;

                      n=sscanf(yytext+2, "%x", &value);
                      if(n != 1) {
                        sparql_syntax_error(rq, "SPARQL syntax error - Illegal hex constant %c%c%c",
                                          yytext[0], yytext[1], yytext[2]);
                        yyterminate();
                      }
                      sparql_parser_lval->literal=rasqal_new_integer_literal(RASQAL_LITERAL_INTEGER, (int)value);
                      return INTEGER_LITERAL; 
}

  /* Decimal literal with optional sign.  sscanf is used only to
   * validate the text; the literal itself is built from the original
   * characters.
   */
[-+]?{DECIMAL}  { 
                        double d;
                        int n;
                        
                        n=sscanf((const char*)yytext, "%lf", &d);
                        if(n != 1) {
                          sparql_syntax_error(rq, "SPARQL syntax error - Illegal decimal constant %s", yytext);
                          yyterminate();
                        }
                        sparql_parser_lval->literal=rasqal_new_decimal_literal((const unsigned char*)yytext);
                        return DECIMAL_LITERAL;
}

  /* Double literal with optional sign: the parsed value is stored */
[-+]?{DOUBLE} {
                        double d;
                        int n;
                        
                        n=sscanf((const char*)yytext, "%lf", &d);
                        if(n != 1) {
                          sparql_syntax_error(rq, "SPARQL syntax error - Illegal floating point constant %s", yytext);
                          yyterminate();
                        }
                        sparql_parser_lval->literal=rasqal_new_double_literal(d);
                        return FLOATING_POINT_LITERAL;
}

'([^'\\\n\r]|\\[^\n\r])*'(@{LANGUAGETOKEN})?(^^({QUOTEDURI}|{QNAME}))?    { /*' */
                        sparql_copy_string_token(rq, sparql_parser_lval,
                          (const unsigned char*)yytext+1, yyleng-1, '\'');
                        return STRING_LITERAL; }

\"([^"\\\n\r]|\\[^\n\r])*\"(@{LANGUAGETOKEN})?(^^({QUOTEDURI}|{QNAME}))?   { /* " */
                        sparql_copy_string_token(rq, sparql_parser_lval,
                          (const unsigned char*)yytext+1, yyleng-1, '"');
                        return STRING_LITERAL; }

\"\"\"				{ BEGIN(LITERAL); }

<LITERAL>(.|\n)*\"\"\"	{ sparql_copy_string_token(rq, sparql_parser_lval, 
                            (unsigned char*)yytext, yyleng-3, '"'); /* ' */
                          BEGIN(INITIAL);
                          return STRING_LITERAL; }

<LITERAL>(.|\n)	{ BEGIN(INITIAL);
		  if (!*yytext)
                    return EOF;

                  sparql_syntax_error(rq, "syntax error at %c - \"\"\"string was not terminated", *yytext);
                  yyterminate();  }


\'\'\'				{ BEGIN(LITERAL2); }

<LITERAL2>(.|\n)*\'\'\'	{ sparql_copy_string_token(rq, sparql_parser_lval, 
                            (unsigned char*)yytext, yyleng-3, '\'');
                          BEGIN(INITIAL);
                          return STRING_LITERAL; }

<LITERAL2>(.|\n)	{ BEGIN(INITIAL);
		  if (!*yytext)
                    return EOF;

                  sparql_syntax_error(rq, "syntax error at %c - '''string was not terminated", *yytext);
                  yyterminate();  }


  /* Boolean keywords, case-insensitive: true yields a boolean literal
   * of 1 and false a boolean literal of 0.
   */
[Tt][Rr][Uu][Ee]	{ sparql_parser_lval->literal=rasqal_new_boolean_literal(1);
                  return BOOLEAN_LITERAL; }

[Ff][Aa][Ll][Ss][Ee]	{ sparql_parser_lval->literal=rasqal_new_boolean_literal(0);
                  return BOOLEAN_LITERAL; }

  /* ID start condition, entered after a variable marker.  A variable
   * name is copied and returned as IDENTIFIER, and the scanner goes
   * back to INITIAL; the scan stops if the copy returns nothing.
   * Any other character is a syntax error.
   */
<ID>{VARNAME}	{ sparql_parser_lval->name=sparql_copy_name(rq, (const unsigned char*)yytext, yyleng, SPARQL_NAME_CHECK_VARNAME);
		  if(!sparql_parser_lval->name)
                    yyterminate();
                          BEGIN(INITIAL);
                          return IDENTIFIER; }
<ID>(.|\n)	{	BEGIN(INITIAL);
		sparql_syntax_error(rq, "SPARQL syntax error - missing variable name after ?");
                yyterminate();
}


  /* PREF start condition, entered after the PREFIX keyword.  A prefix
   * name followed by a colon is returned as IDENTIFIER with the colon
   * left out of the copied name; a lone colon is returned as
   * IDENTIFIER with a NULL name.  Every rule that returns goes back to
   * INITIAL first.
   */
<PREF>[\ \t\v]+ { /* eat up leading whitespace */ }
<PREF>{NCNAME_PREFIX}":"	{ BEGIN(INITIAL);
		  	  sparql_parser_lval->name=sparql_copy_name(rq, (const unsigned char*)yytext, yyleng-1, SPARQL_NAME_CHECK_PREFIX);
			  if(!sparql_parser_lval->name)
			    yyterminate();
                          return IDENTIFIER; }
<PREF>":"	{ BEGIN(INITIAL);
		  sparql_parser_lval->name=NULL;
                  return IDENTIFIER; }

<PREF>(.|\n)	{ BEGIN(INITIAL);
		  /* a NUL character is reported as end of input */
		  if (!*yytext)
                    return EOF;

                  sparql_syntax_error(rq, "SPARQL syntax error at '%c'", *yytext);
                  yyterminate();
}

  /* Qualified name, optionally followed directly by an opening
   * parenthesis.  The parenthesis is consumed with the token and
   * reported through the token type; yyleng is reduced so that it is
   * not part of the text passed to sparql_copy_qname.
   */
{QNAME}\(?	{
		int have_brace=(yytext[yyleng-1]=='(');
		if(have_brace)
			yyleng--;
		sparql_parser_lval->uri=sparql_copy_qname(rq, (const unsigned char*)yytext, yyleng);
		if(!sparql_parser_lval->uri)
		  yyterminate();
 		return have_brace ? URI_LITERAL_BRACE : URI_LITERAL;
}

{BNAME}	{	sparql_parser_lval->name=sparql_copy_name(rq, (unsigned char*)yytext+2, yyleng-2, SPARQL_NAME_CHECK_NCNAME);
		return BLANK_LITERAL;
}

{QUOTEDURI}\(?   { 
		int have_brace=(yytext[yyleng-1]=='(');
		if(have_brace)
			yyleng--;
		if(yyleng == 2) 
                  sparql_parser_lval->uri=raptor_uri_copy(rq->base_uri);
                else {
                  unsigned char* uri_string;

                  yytext[yyleng-1]='\0';
                  uri_string=rasqal_escaped_name_to_utf8_string((unsigned char*)yytext+1,
                                                                yyleng-1,
                                                                NULL,
                                                                (raptor_simple_message_handler)sparql_syntax_error, rq);
                  if(!uri_string)
                    yyterminate();

                  sparql_parser_lval->uri=raptor_new_uri_relative_to_base(rq->base_uri, uri_string);
                  RASQAL_FREE(cstring, uri_string);
                 }
                 return have_brace ? URI_LITERAL_BRACE : URI_LITERAL; }

\#[^\r\n]*(\r\n|\r|\n)	{ /* # comment */
        	/* the pattern ends with exactly one line ending */
        	rqe->lineno++;
                }

  /* Catch-all for any other single character: a NUL character is
   * reported as end of input, everything else is a syntax error.
   */
.         	{ if (!*yytext)
                    return EOF;

                  sparql_syntax_error(rq, "SPARQL syntax error at '%c'", *yytext);
                  yyterminate();
		}

%%
  /* user code */

/* End-of-input hook of the generated scanner.  Always answers 1,
 * i.e. there is never another input source to continue with.
 */
int
yywrap (yyscan_t yyscanner)
{
  (void)yyscanner; /* not needed */

  return 1;
}


/*
 * rasqal_sparql_name_check - check a name against XML 1.1 and SPARQL rules
 * @string: name bytes; only @length bytes are examined
 * @length: number of bytes in @string
 * @check_flags: additional SPARQL restrictions to apply
 *
 * An empty name is always accepted.  Otherwise the name must pass
 * raptor_xml_name_check() for XML 1.1 and then every restriction
 * selected in @check_flags.
 *
 * Return value: non-0 if the name is acceptable, 0 if it is not
 */
static int
rasqal_sparql_name_check(const unsigned char *string, size_t length,
                         sparql_name_check_flags check_flags)
{
#if RASQAL_DEBUG > 2
  RASQAL_DEBUG1("Checking name '");
  if(length)
     fwrite(string, sizeof(unsigned char), length, stderr);
  /* length is a size_t: convert explicitly to match the %d format */
  fprintf(stderr, "' (length %d), flags %d\n", (int)length, (int)check_flags);
#endif

  if(!length)
    return 1;

  if(!raptor_xml_name_check(string, length, 11)) /* 11 = XML 1.1 */
    return 0;
  
  if((check_flags & SPARQL_NAME_CHECK_NO_UL_FIRST) && *string == '_')
    return 0;

  if((check_flags & SPARQL_NAME_CHECK_NO_DOT_LAST) && string[length-1] == '.')
    return 0;

  if(check_flags & SPARQL_NAME_CHECK_NO_DOT_MINUS) {
    size_t i;

    for(i=0; i < length; i++) {
      if(string[i] == '.' || string[i] == '-')
        return 0;
    }
  }
  
  return 1;
}


/*
 * sparql_copy_name - make an unescaped UTF-8 copy of a name token
 * @rq: query, used for reporting errors
 * @text: raw token text
 * @len: number of bytes of @text to convert
 * @check_flags: name restrictions to verify on the converted name
 *
 * A name that fails the check is reported as a syntax error but is
 * still returned.
 *
 * Return value: the converted name, or NULL if conversion failed
 */
static unsigned char *
sparql_copy_name(rasqal_query *rq, const unsigned char *text, size_t len,
                 sparql_name_check_flags check_flags) {
  size_t name_len=0;
  unsigned char *name;

  name=rasqal_escaped_name_to_utf8_string((unsigned char*)text, len,
                                          &name_len,
                                          (raptor_simple_message_handler)sparql_syntax_error, rq);
  if(name) {
    int valid=rasqal_sparql_name_check(name, name_len, check_flags);

    if(!valid)
      sparql_syntax_error(rq, "Invalid SPARQL name \"%s\"", name);
  }

  return name;
}


/*
 * sparql_copy_qname - turn a qname token into a URI
 * @rq: query, used for errors, the base URI and the namespaces
 * @text: raw token text of the form prefix:local
 * @len: number of bytes of @text to use
 *
 * The token is unescaped, its prefix and local parts are checked
 * (failures are reported but do not stop processing) and the qname is
 * expanded using the query namespaces.  With STANDALONE defined the
 * text is instead resolved against the base URI without expansion.
 * The temporary unescaped copy is released on every path.
 *
 * Return value: new URI or NULL on failure
 */
static raptor_uri*
sparql_copy_qname(rasqal_query *rq, const unsigned char *text, size_t len) {
  unsigned char *p;
  size_t dest_len=0;
  unsigned char *s;
  raptor_uri* uri=NULL;
  
  s=rasqal_escaped_name_to_utf8_string((unsigned char*)text, len,
                                       &dest_len,
                                       (raptor_simple_message_handler)sparql_syntax_error, rq);
  if(!s)
    return NULL;

  p=(unsigned char*)strchr((const char*)s, ':');
  if(!p) {
    /* the lexer pattern requires a ':' so this is purely defensive */
    sparql_syntax_error(rq, "SPARQL syntax error - missing ':' in qname \"%s\"", s);
    RASQAL_FREE(cstring, s);
    return NULL;
  }

  if(!rasqal_sparql_name_check(s, p-s, SPARQL_NAME_CHECK_PREFIX))
    sparql_syntax_error(rq, "Invalid SPARQL prefix name \"%s\"", s);
  if(!rasqal_sparql_name_check(p+1, dest_len-((p+1)-s), SPARQL_NAME_CHECK_NCNAME))
    sparql_syntax_error(rq, "Invalid SPARQL local name \"%s\"", p+1);

#ifdef STANDALONE
  /* lexer test cannot declare namespaces - so just ignore expansion */
  uri=raptor_new_uri_relative_to_base(rq->base_uri, s);
#else
  if(!rq->namespaces) {
    sparql_syntax_error(rq, "SPARQL syntax error - no namespaces declared");
    RASQAL_FREE(cstring, s); /* do not leak the unescaped copy */
    return NULL;
  }
  
  uri=raptor_qname_string_to_uri(rq->namespaces,
                                 s, dest_len,
                                 (raptor_simple_message_handler)rasqal_query_simple_error, rq);
#endif
  RASQAL_FREE(cstring, s);
  
  return uri;
}


/*
 * sparql_copy_string_token - build a string literal from a string token
 * @rq: query, used for errors and the namespaces
 * @lval: semantic value; on success lval->literal is set
 * @text: token text starting after the opening delimiter
 * @len: number of bytes of @text belonging to the token
 * @delim: the quote character that delimits the string
 *
 * Copies the string content while decoding the escapes \n \r \t \\,
 * the escaped delimiter, \uXXXX and \UXXXXXXXX.  An unescaped
 * delimiter ends the content; it may be followed by @language and/or
 * ^^datatype, where the datatype is either a URI in angle brackets or
 * a qname.  The scan of that suffix reads up to and including
 * text[len].
 *
 * The content, language and datatype are handed to
 * rasqal_new_string_literal() and are not freed here on success
 * (NOTE(review): presumably the literal takes ownership - confirm).
 * On failure everything allocated here is released.
 *
 * Return value: 0 on success, non-0 on failure
 */
static int
sparql_copy_string_token(rasqal_query* rq, YYSTYPE* lval,
                         const unsigned char *text, size_t len, int delim) {
  unsigned int i;
  const unsigned char *s;
  unsigned char *d;
  unsigned char *string;
  char *language=NULL;     /* points into string[] */
  char *new_language=NULL; /* separately allocated copy of language */
  unsigned char *dt=NULL;  /* points into string[] */
  raptor_uri *dt_uri=NULL;
  unsigned char *dt_qname=NULL;

  string=(unsigned char *)RASQAL_MALLOC(cstring, len+1);
  if(!string) {
    sparql_syntax_error(rq, "SPARQL lexer - out of memory copying string");
    return 1;
  }

  for(s=text, d=string, i=0; i<len; s++, i++) {
    unsigned char c=*s;

    if(c == '\\' ) {
      s++; i++;
      c=*s;
      if(c == 'n')
        *d++= '\n';
      else if(c == 'r')
        *d++= '\r';
      else if(c == 't')
        *d++= '\t';
      else if(c == '\\' || c == delim)
        *d++=c;
      else if (c == 'u' || c == 'U') {
        int ulen=(c == 'u') ? 4 : 8;
        unsigned long unichar=0;
        int n=0;
        int j;
        
        s++; i++;
        if(i+ulen > len) {
          sparql_syntax_error(rq, "SPARQL syntax error - \\%c over end of line", c);
          goto failed;
        }

        /* All ulen characters must be hex digits: they are skipped
         * unconditionally below, so a short conversion must not pass.
         */
        for(j=0; j < ulen; j++) {
          if(!isxdigit(s[j]))
            break;
        }
        if(j == ulen)
          n=sscanf((const char*)s, ((ulen == 4) ? "%04lx" : "%08lx"), &unichar);
        if(n != 1) {
          sparql_syntax_error(rq, "SPARQL syntax error - Illegal Unicode escape '%c%s...'", c, s);
          goto failed;
        }

        /* the loop increment accounts for the last digit */
        s+= ulen-1;
        i+= ulen-1;
        
        if(unichar > 0x10ffff) {
          sparql_syntax_error(rq, "SPARQL syntax error - Illegal Unicode character with code point #x%lX.", unichar);
          goto failed;
        }
          
        d+=raptor_unicode_char_to_utf8(unichar, d);
      } else {
        /* Ignore \x where x isn't the one of: \n \r \t \\ (delim) \u \U */
        sparql_syntax_warning(rq, "Unknown SPARQL string escape \\%c in \"%s\"", c, text);
        *d++=c;
      }
    } else if(c== delim) {
      /* end of the content; language and datatype are stored after
       * the terminating NUL in the same buffer */
      *d++='\0';

      /* skip delim */
      s++; i++;

      c=*s++; i++;
      if(c=='@') {
        language=(char*)d;
        while(i<=len) {
          c=*s++; i++;
          /* same character set as the LANGUAGETOKEN lexer pattern, so
           * tags with subtags such as en-GB are kept whole */
          if(!isalnum(c) && c != '-' && c != '_')
            break;
          *d++=c;
        }
        *d++='\0';
      }
      if(c=='^') {
        /* skip second char of ^^ */
        s++; i++;

        dt=d;
        while(i++<=len)
          *d++=*s++;
        /* *d='\0' below */
      } else if (language)
        *d='\0';
      
      break;
   } else
    *d++=c;
  } /* end of for */

  *d='\0';

  if(language) {
    new_language=(char *)RASQAL_MALLOC(cstring, strlen((const char*)language)+1);
    if(!new_language) {
      sparql_syntax_error(rq, "SPARQL lexer - out of memory copying string");
      goto failed;
    }
    strcpy(new_language, language);
  }
  
  if(dt) {
    /* dt can be a URI or qname */
    if(*dt == '<') {
      /* drop the closing bracket and skip the opening one */
      dt[strlen((const char*)dt)-1]='\0';
      dt_uri=raptor_new_uri(dt+1);
      if(!dt_uri)
        goto failed;
    } else {
      unsigned char *dt_p;
      size_t dest_len=0;
      unsigned char *dt_s;
      
      dt_s=rasqal_escaped_name_to_utf8_string(dt,
                                              strlen((const char*)dt),
                                              &dest_len,
                                              (raptor_simple_message_handler)sparql_syntax_error, rq);
      if(!dt_s)
        goto failed;

      dt_p=(unsigned char*)strchr((const char*)dt_s, ':');
      if(!dt_p) {
        /* the lexer pattern requires a ':' so this is purely defensive */
        sparql_syntax_error(rq, "SPARQL syntax error - missing ':' in qname \"%s\"", dt_s);
        RASQAL_FREE(cstring, dt_s);
        goto failed;
      }

      if(!rasqal_sparql_name_check(dt_s, dt_p-dt_s, SPARQL_NAME_CHECK_PREFIX))
        sparql_syntax_error(rq, "Invalid SPARQL prefix name \"%s\"", dt_s);
      if(!rasqal_sparql_name_check(dt_p+1, dest_len-((dt_p+1)-dt_s), 
                                   SPARQL_NAME_CHECK_NCNAME))
        sparql_syntax_error(rq, "Invalid SPARQL local name \"%s\"", dt_p+1);

#ifdef STANDALONE
      /* lexer test cannot declare namespaces - so just ignore expansion */
      dt_qname=dt_s;
#else
      if(!rq->namespaces) {
        sparql_syntax_error(rq, "SPARQL syntax error - no namespaces declared");
        RASQAL_FREE(cstring, dt_s);
        goto failed;
      }
  
      dt_uri=raptor_qname_string_to_uri(rq->namespaces,
                                        dt_s, dest_len,
                                        (raptor_simple_message_handler)rasqal_query_simple_error, rq);
      RASQAL_FREE(cstring, dt_s);

      if(!dt_uri)
        goto failed;
#endif
    }
  }

#if RASQAL_DEBUG >3
  fprintf(stderr, "string='%s', language='%s'\n", 
          string, (new_language ? new_language : ""));
  fprintf(stderr, "datatype uri='%s'\n",
          (dt_uri ? (const char*)raptor_uri_as_string(dt_uri) : ""));
#endif

  lval->literal=rasqal_new_string_literal(string, new_language, dt_uri, dt_qname);
  if(!lval->literal)
    return 1;

  return 0;

  failed:
  if(new_language)
    RASQAL_FREE(cstring, new_language);
  RASQAL_FREE(cstring, string);
  return 1;
}


/*
 * sparql_skip_c_comment - consume the rest of a C comment
 * @rq: query whose scanner is positioned just after the comment opener
 *
 * Reads characters directly from the scanner until the closing
 * star-slash pair has been consumed.  Line endings are counted with
 * CR, LF and CR LF each counting once, including a line ending that
 * directly follows a '*'.
 *
 * Return value: number of line endings skipped, or -1 (after reporting
 * a syntax error) if the input ended inside the comment
 */
static int
sparql_skip_c_comment(rasqal_query *rq) {
  rasqal_sparql_query_engine *rqe=(rasqal_sparql_query_engine*)rq->context;
  yyscan_t yyscanner=rqe->scanner;
  int lines=0;
  int c;
  int lastc= -1;
  int after_star=0; /* non-0 when the previous character was '*' */
  
  while(1) {
    c=INPUT_FN(yyscanner);

    if(c == EOF) {
      sparql_syntax_error(rq, "SPARQL syntax error - EOF in comment");
      return -1;
    }

    if(after_star && c == '/')
      break;

    /* the LF of a CR LF pair was already counted at the CR */
    if(c == '\r' || (c == '\n' && lastc != '\r'))
      lines++;

    after_star=(c == '*');
    lastc= c;
  }

  return lines;
}


#ifdef RASQAL_DEBUG

/*
 * sparql_token_print - describe a token for debugging output
 * @token: token code returned by the lexer (0 means end of input)
 * @lval: semantic value that accompanies the token
 *
 * Keywords and punctuation map to string constants.  Tokens carrying a
 * value are formatted into a static buffer that is overwritten by the
 * next call; output longer than the buffer is truncated.  An unknown
 * token code aborts the program.
 *
 * Return value: a string that must not be freed by the caller
 */
const char *
sparql_token_print(int token, YYSTYPE *lval)
{
  static char buffer[2048];

  if(!token)
    return "<<EOF>>";
  
  switch(token) {
    /* keywords */
    case SELECT:      return "SELECT";
    case FROM:        return "FROM";
    case WHERE:       return "WHERE";
    case PREFIX:      return "PREFIX";
    case DESCRIBE:    return "DESCRIBE";
    case CONSTRUCT:   return "CONSTRUCT";
    case ASK:         return "ASK";
    case DISTINCT:    return "DISTINCT";
    case LIMIT:       return "LIMIT";
    case UNION:       return "UNION";
    case OPTIONAL:    return "OPTIONAL";
    case BASE:        return "BASE";
    case BOUND:       return "BOUND";
    case STR:         return "STR";
    case LANG:        return "LANG";
    case DATATYPE:    return "DATATYPE";
    case ISURI:       return "ISURI";
    case ISBLANK:     return "ISBLANK";
    case ISLITERAL:   return "ISLITERAL";
    case GRAPH:       return "GRAPH";
    case NAMED:       return "NAMED";
    case FILTER:      return "FILTER";
    case OFFSET:      return "OFFSET";
    case A:           return "a";
    case ORDER:       return "ORDER";
    case BY:          return "BY";
    case REGEX:       return "REGEX";
    case ASC:         return "ASC[";
    case DESC:        return "DESC[";
    case LANGMATCHES: return "LANGMATCHES";

    /* punctuation */
    case ',': return ",";
    case '(': return "(";
    case ')': return ")";
    case '[': return "[";
    case ']': return "]";
    case '{': return "{";
    case '}': return "}";
    case '.': return ".";
    case ';': return ";";
    case '?': return "?";
    case '$': return "$";

    /* operators */
    case SC_AND: return "SC_AND";
    case SC_OR:  return "SC_OR";
    case GE:     return "GE";
    case LE:     return "LE";
    case GT:     return "GT";
    case LT:     return "LT";
    case NEQ:    return "NEQ";
    case EQ:     return "EQ";
    case '/':    return "/";
    case '*':    return "*";
    case '-':    return "-";
    case '+':    return "+";
    case '!':    return "!";

    /* tokens with values: always bounded by the buffer size */
    case INTEGER_LITERAL:
      snprintf(buffer, sizeof(buffer), "INTEGER_LITERAL(%d)",
               lval->literal->value.integer);
      return buffer;

    case FLOATING_POINT_LITERAL:
      /* NOTE(review): the lexer stores this token's value in
       * lval->literal (via rasqal_new_double_literal), so reading
       * lval->floating looks wrong - confirm and switch to the
       * literal's double field.
       */
      snprintf(buffer, sizeof(buffer), "FLOATING_POINT_LITERAL(%g)",
               lval->floating);
      return buffer;

    case STRING_LITERAL:
      if(lval->literal->language) {
        if(lval->literal->datatype)
          snprintf(buffer, sizeof(buffer), "STRING_LITERAL(\"%s\"@%s^^%s)",
                   lval->literal->string, lval->literal->language,
                   raptor_uri_as_string(lval->literal->datatype));
        else
          snprintf(buffer, sizeof(buffer), "STRING_LITERAL(\"%s\"@%s)",
                   lval->literal->string, lval->literal->language);
      } else {
        if(lval->literal->datatype)
          snprintf(buffer, sizeof(buffer), "STRING_LITERAL(\"%s\"^^%s)", 
                   lval->literal->string,
                   raptor_uri_as_string(lval->literal->datatype));
        else
          snprintf(buffer, sizeof(buffer), "STRING_LITERAL(\"%s\")",
                   lval->literal->string);
      }
      return buffer;

    case BOOLEAN_LITERAL:
      return (lval->literal->value.integer ? "BOOLEAN_LITERAL(true)" : "BOOLEAN_LITERAL(false)");

    case URI_LITERAL:
      snprintf(buffer, sizeof(buffer), "URI_LITERAL(%s)",
               raptor_uri_as_string(lval->uri));
      return buffer;

    case QNAME_LITERAL:
      snprintf(buffer, sizeof(buffer), "QNAME_LITERAL(%s)", lval->name);
      return buffer;

    case URI_LITERAL_BRACE:
      snprintf(buffer, sizeof(buffer), "URI_LITERAL_BRACE(%s)",
               raptor_uri_as_string(lval->uri));
      return buffer;

    case QNAME_LITERAL_BRACE:
      snprintf(buffer, sizeof(buffer), "QNAME_LITERAL_BRACE(%s)", lval->name);
      return buffer;

    case IDENTIFIER:
      snprintf(buffer, sizeof(buffer), "IDENTIFIER(%s)", lval->name);
      return buffer;

    case BLANK_LITERAL:
      snprintf(buffer, sizeof(buffer), "BLANK_LITERAL(%s)", lval->name);
      return buffer;

    case DECIMAL_LITERAL:
      snprintf(buffer, sizeof(buffer), "DECIMAL_LITERAL(%s)",
               lval->literal->string);
      return buffer;

   default:
     RASQAL_DEBUG2("UNKNOWN token %d - add a new case\n", token);
     abort();
  }
}
#endif



#ifdef STANDALONE
/*
 * sparql_token_free - release the value attached to a lexer token
 * @token: token code returned by the lexer (0 means end of input)
 * @lval: semantic value filled in for that token
 *
 * Covers every token for which the lexer rules allocate a value:
 * literals (string, integer, decimal, double, boolean), URIs with or
 * without a trailing parenthesis, and names.  Values that are NULL are
 * skipped.  Other tokens carry nothing to free.
 */
static void
sparql_token_free(int token, YYSTYPE *lval)
{
  if(!token)
    return;
  
  switch(token) {
    case STRING_LITERAL:
    case INTEGER_LITERAL:
    case DECIMAL_LITERAL:
    case FLOATING_POINT_LITERAL:
    case BOOLEAN_LITERAL:
      if(lval->literal)
        rasqal_free_literal(lval->literal);
      break;
    case URI_LITERAL:
    case URI_LITERAL_BRACE:
      if(lval->uri)
        raptor_free_uri(lval->uri);
      break;
    case IDENTIFIER:
    case BLANK_LITERAL:
    case QNAME_LITERAL:
    case QNAME_LITERAL_BRACE:
      /* an IDENTIFIER for the empty prefix carries a NULL name */
      if(lval->name)
        RASQAL_FREE(cstring, lval->name);
      break;
    default:
      break;
  }
}


#define FILE_READ_BUF_SIZE 2048

/*
 * Standalone lexer test driver.
 *
 * Reads at most FILE_READ_BUF_SIZE-1 bytes of query text from the file
 * named by the first argument, or from standard input when there is no
 * argument, then prints every token the lexer produces.
 *
 * Return value: 0 on success, 1 if the query was marked as failed or
 * on a setup error
 */
int
main(int argc, char *argv[]) 
{
  const char *program=rasqal_basename(argv[0]);
  char *query_string=NULL;
  rasqal_query rq;
  rasqal_sparql_query_engine sparql;
  yyscan_t scanner;
  int token=EOF;
  YYSTYPE lval;
  const unsigned char *uri_string;
  const char *filename="<stdin>";
  char *buf=NULL;
  size_t len;
  FILE *fh=stdin;

  rasqal_init();
  
  if(argc > 1) {
    filename=argv[1];
    fh=fopen(filename, "r");
    if(!fh) {
      fprintf(stderr, "%s: Cannot open file %s - %s\n", program, filename,
              strerror(errno));
      exit(1);
    }
  }

  query_string=(char*)RASQAL_CALLOC(cstring, FILE_READ_BUF_SIZE, 1);
  if(!query_string) {
    fprintf(stderr, "%s: Out of memory\n", program);
    exit(1);
  }

  /* Leave room for the terminating NUL so the strlen() below can never
   * run off the end of the buffer.
   */
  len=fread(query_string, 1, FILE_READ_BUF_SIZE-1, fh);
  if(ferror(fh)) {
    fprintf(stderr, "%s: Error reading %s - %s\n", program, filename,
            strerror(errno));
    exit(1);
  }
  query_string[len]='\0';
  if(fh != stdin)
    fclose(fh);

  memset(&rq, 0, sizeof(rasqal_query));
  memset(&sparql, 0, sizeof(rasqal_sparql_query_engine));

  yylex_init(&sparql.scanner);
  scanner=sparql.scanner;

  /* The scan buffer is the query text, one space, then the two NUL
   * bytes that mark the end of a scanner buffer.
   */
  len= strlen((const char*)query_string);
  buf= (char *)RASQAL_MALLOC(cstring, len+3);
  if(!buf) {
    fprintf(stderr, "%s: Out of memory\n", program);
    exit(1);
  }
  memcpy(buf, query_string, len);
  buf[len]= ' ';
  buf[len+1]= buf[len+2]='\0'; /* YY_END_OF_BUFFER_CHAR; */
  if(!sparql_lexer__scan_buffer(buf, len+3, scanner)) {
    fprintf(stderr, "%s: Failed to set up the scanner buffer\n", program);
    exit(1);
  }

  sparql_lexer_set_extra(&rq, scanner);

  /* Initialise enough of the rasqal_query and locator to get error messages */
  rq.context=&sparql;
  sparql.lineno=1;
  rq.locator.file=filename;
  rq.locator.column= -1;

  uri_string=raptor_uri_filename_to_uri_string(filename);
  rq.base_uri=raptor_new_uri(uri_string);
  raptor_free_memory((void*)uri_string);

  while(1) {
    memset(&lval, 0, sizeof(YYSTYPE));
    if(sparql_lexer_get_text(scanner) != NULL)
      printf("yyinput '%s'\n", sparql_lexer_get_text(scanner));
    token=yylex(&lval, scanner);
#ifdef RASQAL_DEBUG
    printf("token %s\n", sparql_token_print(token, &lval));
#else
    printf("token %d\n", token);
#endif
    sparql_token_free(token, &lval);
    if(!token || token == EOF)
      break;
  }

  /* destroy the scanner before releasing the memory it scans */
  yylex_destroy(scanner);

  if(buf)
    RASQAL_FREE(cstring, buf);

  raptor_free_uri(rq.base_uri);

  RASQAL_FREE(cstring, query_string);

  rasqal_finish();

  if(rq.failed)
    return 1;
 
  return 0;
}
#endif