The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/* 
 * Copyright 1999-2009, Gisle Aas
 * Copyright 1999-2000, Michael A. Chase
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the same terms as Perl itself.
 */

#ifndef EXTERN
#define EXTERN extern
#endif

#include "hctype.h"    /* isH...() macros */
#include "tokenpos.h"  /* dTOKEN; PUSH_TOKEN() */


/*
 * Elements whose start tag switches the parser into "literal mode".
 * parse_start() walks this table after reporting a start tag (when not
 * in xml_mode); on a match it stores `str` in p_state->literal_mode and
 * `is_cdata` in p_state->is_cdata.  The table is terminated by an entry
 * whose len is 0.
 */
static
struct literal_tag {
    int len;       /* length of str; compared against the tag length first */
    char* str;     /* element name, lower case */
    int is_cdata;  /* value copied into p_state->is_cdata on a match */
}
literal_mode_elem[] =
{
    {6, "script", 1},
    {5, "style", 1},
    {3, "xmp", 1},
    {6, "iframe", 1},
    {9, "plaintext", 1},
    {5, "title", 0},
    {8, "textarea", 0},
    {0, 0, 0}
};

/*
 * One-byte codes that make up a compiled argspec string.
 * argspec_compile() emits one code per argument name and report_event()
 * switches on them to build the argument list for a handler.
 * The order must match the argname[] table.
 */
enum argcode {
    ARG_SELF = 1,  /* need to avoid '\0' in argspec string */
    ARG_TOKENS,
    ARG_TOKENPOS,
    ARG_TOKEN0,
    ARG_TAGNAME,
    ARG_TAG,
    ARG_ATTR,
    ARG_ATTRARR,
    ARG_ATTRSEQ,
    ARG_TEXT,
    ARG_DTEXT,
    ARG_IS_CDATA,
    ARG_SKIPPED_TEXT,
    ARG_OFFSET,
    ARG_OFFSET_END,
    ARG_LENGTH,
    ARG_LINE,
    ARG_COLUMN,
    ARG_EVENT,
    ARG_UNDEF,
    ARG_LITERAL, /* Always keep last */

    /* extra flags always encoded first */
    ARG_FLAG_FLAT_ARRAY  /* emitted when the argspec is wrapped in '@{ ... }' */
};

/*
 * Identifier names accepted in an argspec.  argspec_compile() scans this
 * table in step with the enum values starting at ARG_SELF, so entry [i]
 * is the name of argcode (i + 1).  ARG_LITERAL and ARG_FLAG_FLAT_ARRAY
 * have no name and therefore no entry.
 */
char *argname[] = {
    /* Must be in the same order as enum argcode */
    "self",     /* ARG_SELF */
    "tokens",   /* ARG_TOKENS */   
    "tokenpos", /* ARG_TOKENPOS */
    "token0",   /* ARG_TOKEN0 */
    "tagname",  /* ARG_TAGNAME */
    "tag",      /* ARG_TAG */
    "attr",     /* ARG_ATTR */
    "@attr",    /* ARG_ATTRARR */
    "attrseq",  /* ARG_ATTRSEQ */
    "text",     /* ARG_TEXT */
    "dtext",    /* ARG_DTEXT */
    "is_cdata", /* ARG_IS_CDATA */
    "skipped_text", /* ARG_SKIPPED_TEXT */
    "offset",   /* ARG_OFFSET */
    "offset_end", /* ARG_OFFSET_END */
    "length",   /* ARG_LENGTH */
    "line",     /* ARG_LINE */
    "column",   /* ARG_COLUMN */
    "event",    /* ARG_EVENT */
    "undef",    /* ARG_UNDEF */
    /* ARG_LITERAL (not compared) */
    /* ARG_FLAG_FLAT_ARRAY */
};

/*
 * Effective option tests.  Each option is considered enabled when it is
 * set explicitly on the parser state or when xml_mode is on.
 */
#define CASE_SENSITIVE(p_state) \
         ((p_state)->xml_mode || (p_state)->case_sensitive)
#define STRICT_NAMES(p_state) \
         ((p_state)->xml_mode || (p_state)->strict_names)
#define ALLOW_EMPTY_TAG(p_state) \
         ((p_state)->xml_mode || (p_state)->empty_element_tags)

/* defined further down; declared here because report_event() calls it */
static void flush_pending_text(PSTATE* p_state, SV* self);

/*
 * Parser functions.
 *
 *   parse()                       - top level entry point.
 *                                   deals with text and calls one of its
 *                                   subordinate parse_*() routines after
 *                                   looking at the first char after "<"
 *     parse_decl()                - deals with declarations         <!...>
 *       parse_comment()           - deals with <!-- ... -->
 *       parse_marked_section      - deals with <![ ... [ ... ]]>
 *     parse_end()                 - deals with end tags             </...>
 *     parse_start()               - deals with start tags           <A...>
 *     parse_process()             - deals with process instructions <?...>
 *     parse_null()                - deals with anything else        <....>
 *
 *     report_event() - called whenever any of the parse*() routines
 *                      has recognized something.
 */

/*
 * report_event() - account for one recognized chunk of input and hand it
 * to the handler registered for the event.
 *
 *   p_state    - parser state; its offset/line/column are advanced here
 *   event      - what kind of construct [beg, end) was recognized as
 *   beg, end   - the raw text of the construct
 *   utf8       - non-zero if generated strings should be flagged as UTF-8
 *   tokens     - positions of the sub-tokens of the construct (may be 0)
 *   num_tokens - number of entries in tokens
 *   self       - the parser object; used for the "self" argument
 *
 * A handler is either an array, in which case the arguments are
 * accumulated in it, or something callable, in which case the arguments
 * are pushed on the Perl stack and the callback is invoked.  Which
 * arguments are generated is determined by the handler's compiled argspec
 * (see argspec_compile()).
 *
 * Events that are filtered out, or for which no handler is registered,
 * end up at IGNORE_EVENT where their text is appended to
 * p_state->skipped_text when that buffer exists.
 */
static void
report_event(PSTATE* p_state,
	     event_id_t event,
	     char *beg, char *end, U32 utf8,
	     token_pos_t *tokens, int num_tokens,
	     SV* self
	    )
{
    struct p_handler *h;
    dTHX;
    dSP;
    AV *array;
    STRLEN my_na;
    char *argspec;
    char *s;
    STRLEN offset;
    STRLEN line;
    STRLEN column;

    /* distance from b to a; counted in characters when the text is UTF-8 */
#ifdef UNICODE_HTML_PARSER
    #define CHR_DIST(a,b) (utf8 ? utf8_distance((U8*)(a),(U8*)(b)) : (a) - (b))
#else
    #define CHR_DIST(a,b) ((a) - (b))
#endif

    /* some events might still fire after a handler has signaled eof
     * so suppress them here.
     */
    if (p_state->eof)
	return;

    /* capture offsets */
    offset = p_state->offset;
    line = p_state->line;
    column = p_state->column;

#if 0
    {  /* used for debugging at some point */
	char *s = beg;
	int i;

	/* print debug output */
	switch(event) {
	case E_DECLARATION: printf("DECLARATION"); break;
	case E_COMMENT:     printf("COMMENT"); break;
	case E_START:       printf("START"); break;
	case E_END:         printf("END"); break;
	case E_TEXT:        printf("TEXT"); break;
	case E_PROCESS:     printf("PROCESS"); break;
	case E_NONE:        printf("NONE"); break;
	default:            printf("EVENT #%d", event); break;
	}

	printf(" [");
	while (s < end) {
	    if (*s == '\n') {
		putchar('\\'); putchar('n');
	    }
	    else
		putchar(*s);
	    s++;
	}
	printf("] %d\n", end - beg);
	for (i = 0; i < num_tokens; i++) {
	    printf("  token %d: %d %d\n",
		   i,
		   tokens[i].beg - beg,
		   tokens[i].end - tokens[i].beg);
	}
    }
#endif

    /* a pending end tag is reported first, unless the current event is
     * text or a comment; it is reported with zero-length source text.
     */
    if (p_state->pending_end_tag && event != E_TEXT && event != E_COMMENT) {
	token_pos_t t;
	char dummy;
	t.beg = p_state->pending_end_tag;
	t.end = p_state->pending_end_tag + strlen(p_state->pending_end_tag);
	p_state->pending_end_tag = 0;
	report_event(p_state, E_END, &dummy, &dummy, 0, &t, 1, self);
	SPAGAIN;
    }

    /* update offsets */
    p_state->offset += CHR_DIST(end, beg);
    if (line) {
	/* line/column tracking is only active when p_state->line is non-zero */
	char *s = beg;
	char *nl = NULL;
	while (s < end) {
	    if (*s == '\n') {
		p_state->line++;
		nl = s;
	    }
	    s++;
	}
	if (nl)
	    p_state->column = CHR_DIST(end, nl) - 1;  /* chars after the last newline */
	else
	    p_state->column += CHR_DIST(end, beg);
    }

    if (event == E_NONE)
	goto IGNORE_EVENT;
    
#ifdef MARKED_SECTION
    if (p_state->ms == MS_IGNORE)
	goto IGNORE_EVENT;
#endif

    /* tag filters */
    if (p_state->ignore_tags || p_state->report_tags || p_state->ignore_elements) {

	if (event == E_START || event == E_END) {
	    /* p_state->tmp is used as scratch SV for the (lower-cased) tag name */
	    SV* tagname = p_state->tmp;

	    assert(num_tokens >= 1);
	    sv_setpvn(tagname, tokens[0].beg, tokens[0].end - tokens[0].beg);
	    if (utf8)
		SvUTF8_on(tagname);
	    else
		SvUTF8_off(tagname);
	    if (!CASE_SENSITIVE(p_state))
		sv_lower(aTHX_ tagname);

	    if (p_state->ignoring_element) {
		/* inside an ignored element: only track nesting of that same element */
		if (sv_eq(p_state->ignoring_element, tagname)) {
		    if (event == E_START)
			p_state->ignore_depth++;
		    else if (--p_state->ignore_depth == 0) {
			SvREFCNT_dec(p_state->ignoring_element);
			p_state->ignoring_element = 0;
		    }
		}
		goto IGNORE_EVENT;
	    }

	    if (p_state->ignore_elements &&
		hv_fetch_ent(p_state->ignore_elements, tagname, 0, 0))
	    {
		if (event == E_START) {
		    p_state->ignoring_element = newSVsv(tagname);
		    p_state->ignore_depth = 1;
		}
		goto IGNORE_EVENT;
	    }

	    if (p_state->ignore_tags &&
		hv_fetch_ent(p_state->ignore_tags, tagname, 0, 0))
	    {
		goto IGNORE_EVENT;
	    }
	    if (p_state->report_tags &&
		!hv_fetch_ent(p_state->report_tags, tagname, 0, 0))
	    {
		goto IGNORE_EVENT;
	    }
	}
	else if (p_state->ignoring_element) {
	    goto IGNORE_EVENT;
	}
    }

    /* pick the handler for this event, falling back to the default handler */
    h = &p_state->handlers[event];
    if (!h->cb) {
	/* event = E_DEFAULT; */
	h = &p_state->handlers[E_DEFAULT];
	if (!h->cb)
	    goto IGNORE_EVENT;
    }

    if (SvTYPE(h->cb) != SVt_PVAV && !SvTRUE(h->cb)) {
	/* FALSE scalar ('' or 0) means IGNORE this event */
	return;
    }

    if (p_state->unbroken_text && event == E_TEXT) {
	/* should buffer text */
	if (!p_state->pend_text)
	    p_state->pend_text = newSV(256);
	if (SvOK(p_state->pend_text)) {
	    /* text of a different cdata-ness can not be merged; flush first */
	    if (p_state->is_cdata != p_state->pend_text_is_cdata) {
		flush_pending_text(p_state, self);
		SPAGAIN;
		goto INIT_PEND_TEXT;
	    }
	}
	else {
	INIT_PEND_TEXT:
	    /* remember where the buffered text started */
	    p_state->pend_text_offset = offset;
	    p_state->pend_text_line = line;
	    p_state->pend_text_column = column;
	    p_state->pend_text_is_cdata = p_state->is_cdata;
	    sv_setpvn(p_state->pend_text, "", 0);
	    if (!utf8)
		SvUTF8_off(p_state->pend_text);
	}
#ifdef UNICODE_HTML_PARSER
	/* make the buffer and the new chunk agree on UTF-8-ness before appending */
	if (utf8 && !SvUTF8(p_state->pend_text))
	    sv_utf8_upgrade(p_state->pend_text);
	if (utf8 || !SvUTF8(p_state->pend_text)) {
	    sv_catpvn(p_state->pend_text, beg, end - beg);
	}
	else {
	    SV *tmp = newSVpvn(beg, end - beg);
	    sv_utf8_upgrade(tmp);
	    sv_catsv(p_state->pend_text, tmp);
	    SvREFCNT_dec(tmp);
	}
#else
	sv_catpvn(p_state->pend_text, beg, end - beg);
#endif
	return;
    }
    else if (p_state->pend_text && SvOK(p_state->pend_text)) {
	/* buffered text must be reported before this event */
	flush_pending_text(p_state, self);
	SPAGAIN;
    }

    /* At this point we have decided to generate an event callback */

    argspec = h->argspec ? SvPV(h->argspec, my_na) : "";

    if (SvTYPE(h->cb) == SVt_PVAV) {
	
	if (*argspec == ARG_FLAG_FLAT_ARRAY) {
	    /* arguments go directly into the handler array */
	    argspec++;
	    array = (AV*)h->cb;
	}
	else {
	    /* start sub-array for accumulator array */
	    array = newAV();
	}
    }
    else {
	array = 0;
	if (*argspec == ARG_FLAG_FLAT_ARRAY)
	    argspec++;

	/* start argument stack for callback */
	ENTER;
	SAVETMPS;
	PUSHMARK(SP);
    }

    /* generate one argument per code in the compiled argspec; a case that
     * leaves arg at 0 with push_arg set results in undef being passed.
     */
    for (s = argspec; *s; s++) {
	SV* arg = 0;
	int push_arg = 1;
	enum argcode argcode = (enum argcode)*s;

	switch( argcode ) {

	case ARG_SELF:
	    arg = sv_mortalcopy(self);
	    break;

	case ARG_TOKENS:
	    /* reference to an array holding the text of each token */
	    if (num_tokens >= 1) {
		AV* av = newAV();
		SV* prev_token = &PL_sv_undef;
		int i;
		av_extend(av, num_tokens);
		for (i = 0; i < num_tokens; i++) {
		    if (tokens[i].beg) {
			prev_token = newSVpvn(tokens[i].beg, tokens[i].end-tokens[i].beg);
			if (utf8)
			    SvUTF8_on(prev_token);
			av_push(av, prev_token);
		    }
		    else { /* boolean */
			av_push(av, p_state->bool_attr_val
				? newSVsv(p_state->bool_attr_val)
				: newSVsv(prev_token));
		    }
		}
		arg = sv_2mortal(newRV_noinc((SV*)av));
	    }
	    break;

	case ARG_TOKENPOS:
	    /* reference to an array of (offset, length) pairs relative to beg */
	    if (num_tokens >= 1 && tokens[0].beg >= beg) {
		AV* av = newAV();
		int i;
		av_extend(av, num_tokens*2);
		for (i = 0; i < num_tokens; i++) {
		    if (tokens[i].beg) {
			av_push(av, newSViv(CHR_DIST(tokens[i].beg, beg)));
			av_push(av, newSViv(CHR_DIST(tokens[i].end, tokens[i].beg)));
		    }
		    else { /* boolean tag value */
			av_push(av, newSViv(0));
			av_push(av, newSViv(0));
		    }
		}
		arg = sv_2mortal(newRV_noinc((SV*)av));
	    }
	    break;

	case ARG_TOKEN0:
	case ARG_TAGNAME:
	    /* fall through */

	case ARG_TAG:
	    /* all three are based on the first token; token0 keeps the
	     * original case, tag gets an event specific prefix char.
	     */
	    if (num_tokens >= 1) {
		arg = sv_2mortal(newSVpvn(tokens[0].beg,
					  tokens[0].end - tokens[0].beg));
		if (utf8)
		    SvUTF8_on(arg);
		if (!CASE_SENSITIVE(p_state) && argcode != ARG_TOKEN0)
		    sv_lower(aTHX_ arg);
		if (argcode == ARG_TAG && event != E_START) {
		    /* prefix char is looked up by event id */
		    char *e_type = "!##/#?#";
		    sv_insert(arg, 0, 0, &e_type[event], 1);
		}
	    }
	    break;

	case ARG_ATTR:
	case ARG_ATTRARR:
	    /* attr: reference to a hash of name => value;
	     * @attr: the name/value pairs themselves, pushed right here.
	     */
	    if (event == E_START) {
		HV* hv;
		int i;
		if (argcode == ARG_ATTR) {
		    hv = newHV();
		    arg = sv_2mortal(newRV_noinc((SV*)hv));
		}
		else {
#ifdef __GNUC__
		    /* gcc -Wall reports this variable as possibly used uninitialized */
		    hv = 0;
#endif
		    push_arg = 0;  /* deal with argument pushing here */
		}

		/* tokens[0] is the tag name; then (name, value) pairs follow */
		for (i = 1; i < num_tokens; i += 2) {
		    SV* attrname = newSVpvn(tokens[i].beg,
					    tokens[i].end-tokens[i].beg);
		    SV* attrval;

		    if (utf8)
			SvUTF8_on(attrname);
		    if (tokens[i+1].beg) {
			char *beg = tokens[i+1].beg;
			STRLEN len = tokens[i+1].end - beg;
			/* strip the surrounding quotes from the value */
			if (*beg == '"' || *beg == '\'' || (*beg == '`' && p_state->backquote)) {
			    assert(len >= 2 && *beg == beg[len-1]);
			    beg++; len -= 2;
			}
			attrval = newSVpvn(beg, len);
			if (utf8)
			    SvUTF8_on(attrval);
			if (!p_state->attr_encoded) {
#ifdef UNICODE_HTML_PARSER
			    if (p_state->utf8_mode)
				sv_utf8_decode(attrval);
#endif
			    decode_entities(aTHX_ attrval, p_state->entity2char, 0);
			    if (p_state->utf8_mode)
				SvUTF8_off(attrval);
			}
		    }
		    else { /* boolean */
			if (p_state->bool_attr_val)
			    attrval = newSVsv(p_state->bool_attr_val);
			else
			    attrval = newSVsv(attrname);
		    }

		    if (!CASE_SENSITIVE(p_state))
			sv_lower(aTHX_ attrname);

		    if (argcode == ARG_ATTR) {
			/* the first occurrence of an attribute wins */
			if (hv_exists_ent(hv, attrname, 0) ||
			    !hv_store_ent(hv, attrname, attrval, 0)) {
			    SvREFCNT_dec(attrval);
			}
			SvREFCNT_dec(attrname);
		    }
		    else { /* ARG_ATTRARR */
			if (array) {
			    av_push(array, attrname);
			    av_push(array, attrval);
			}
			else {
			    XPUSHs(sv_2mortal(attrname));
			    XPUSHs(sv_2mortal(attrval));
			}
		    }
		}
	    }
	    else if (argcode == ARG_ATTRARR) {
		push_arg = 0;
	    }
	    break;

	case ARG_ATTRSEQ:       /* (v2 compatibility stuff) */
	    /* reference to an array of the attribute names, in source order */
	    if (event == E_START) {
		AV* av = newAV();
		int i;
		for (i = 1; i < num_tokens; i += 2) {
		    SV* attrname = newSVpvn(tokens[i].beg,
					    tokens[i].end-tokens[i].beg);
		    if (utf8)
			SvUTF8_on(attrname);
		    if (!CASE_SENSITIVE(p_state))
			sv_lower(aTHX_ attrname);
		    av_push(av, attrname);
		}
		arg = sv_2mortal(newRV_noinc((SV*)av));
	    }
	    break;
	
	case ARG_TEXT:
	    /* the raw source text of the event */
	    arg = sv_2mortal(newSVpvn(beg, end - beg));
	    if (utf8)
		SvUTF8_on(arg);
	    break;

	case ARG_DTEXT:
	    /* like text, but with entities decoded unless the text is cdata */
	    if (event == E_TEXT) {
		arg = sv_2mortal(newSVpvn(beg, end - beg));
		if (utf8)
		    SvUTF8_on(arg);
		if (!p_state->is_cdata) {
#ifdef UNICODE_HTML_PARSER
		    if (p_state->utf8_mode)
			sv_utf8_decode(arg);
#endif
		    decode_entities(aTHX_ arg, p_state->entity2char, 1);
		    if (p_state->utf8_mode)
			SvUTF8_off(arg);
		}
	    }
	    break;
      
	case ARG_IS_CDATA:
	    if (event == E_TEXT) {
		arg = boolSV(p_state->is_cdata);
	    }
	    break;

        case ARG_SKIPPED_TEXT:
	    /* hand over the accumulated buffer and start a fresh one */
	    arg = sv_2mortal(p_state->skipped_text);
	    p_state->skipped_text = newSVpvn("", 0);
            break;

	case ARG_OFFSET:
	    arg = sv_2mortal(newSViv(offset));
	    break;

	case ARG_OFFSET_END:
	    arg = sv_2mortal(newSViv(offset + CHR_DIST(end, beg)));
	    break;

	case ARG_LENGTH:
	    arg = sv_2mortal(newSViv(CHR_DIST(end, beg)));
	    break;

	case ARG_LINE:
	    arg = sv_2mortal(newSViv(line));
	    break;

	case ARG_COLUMN:
	    arg = sv_2mortal(newSViv(column));
	    break;

	case ARG_EVENT:
	    assert(event >= 0 && event < EVENT_COUNT);
	    arg = sv_2mortal(newSVpv(event_id_str[event], 0));
	    break;

	case ARG_LITERAL:
	{
	    /* encoded as: ARG_LITERAL, length byte, then the literal bytes */
	    int len = (unsigned char)s[1];
	    arg = sv_2mortal(newSVpvn(s+2, len));
	    if (SvUTF8(h->argspec))
		SvUTF8_on(arg);
	    s += len + 1;  /* the loop's s++ steps over the remaining byte */
	}
	break;

	case ARG_UNDEF:
	    arg = sv_mortalcopy(&PL_sv_undef);
	    break;
      
	default:
	    arg = sv_2mortal(newSVpvf("Bad argspec %d", *s));
	    break;
	}

	if (push_arg) {
	    if (!arg)
		arg = sv_mortalcopy(&PL_sv_undef);

	    if (array) {
		/* have to fix mortality here or add mortality to
		 * XPUSHs after removing it from the switch cases.
		 */
		av_push(array, SvREFCNT_inc(arg));
	    }
	    else {
		XPUSHs(arg);
	    }
	}
    }

    if (array) {
	/* a freshly created sub-array is appended to the handler array */
	if (array != (AV*)h->cb)
	    av_push((AV*)h->cb, newRV_noinc((SV*)array));
    }
    else {
	PUTBACK;

	/* a non-reference handler with "self" as first argument is
	 * invoked as a method name; anything else is called directly.
	 */
	if ((enum argcode)*argspec == ARG_SELF && !SvROK(h->cb)) {
	    char *method = SvPV(h->cb, my_na);
	    perl_call_method(method, G_DISCARD | G_EVAL | G_VOID);
	}
	else {
	    perl_call_sv(h->cb, G_DISCARD | G_EVAL | G_VOID);
	}

	if (SvTRUE(ERRSV)) {
	    RETHROW;
	}

	FREETMPS;
	LEAVE;
    }
    /* the event was reported, so nothing counts as skipped any more */
    if (p_state->skipped_text)
	SvCUR_set(p_state->skipped_text, 0);
    return;

IGNORE_EVENT:
    /* collect the text of unreported events when skipped_text is in use */
    if (p_state->skipped_text) {
	if (event != E_TEXT && p_state->pend_text && SvOK(p_state->pend_text))
	    flush_pending_text(p_state, self);
#ifdef UNICODE_HTML_PARSER
	if (utf8 && !SvUTF8(p_state->skipped_text))
	    sv_utf8_upgrade(p_state->skipped_text);
	if (utf8 || !SvUTF8(p_state->skipped_text)) {
#endif
	    sv_catpvn(p_state->skipped_text, beg, end - beg);
#ifdef UNICODE_HTML_PARSER
	}
	else {
	    SV *tmp = newSVpvn(beg, end - beg);
	    sv_utf8_upgrade(tmp);
	    sv_catsv(p_state->skipped_text, tmp);
	    SvREFCNT_dec(tmp);
	}
#endif
    }
#undef CHR_DIST    
    return;
}


/*
 * argspec_compile() - translate a textual argspec into its compiled form.
 *
 *   src     - the argspec as written by the user: a comma separated list
 *             of identifiers from argname[] and quoted literal strings,
 *             optionally wrapped in '@{ ... }'
 *   p_state - parser state; some identifiers enable features on it
 *             (line/column tracking, skipped_text collection, and the
 *             argspec_entity_decode hint)
 *
 * Returns a new SV holding one enum argcode byte per argument.  A literal
 * is encoded as ARG_LITERAL, a length byte and the literal's bytes.  A
 * leading ARG_FLAG_FLAT_ARRAY byte records the '@{ ... }' wrapping.
 *
 * Croaks on unknown identifiers, malformed or over-long literals, missing
 * commas and trailing garbage.
 *
 * The scanning loops that are not bounded by `end` stop at the byte found
 * there (NOTE(review): presumably the '\0' that terminates Perl string
 * buffers - confirm for all callers).
 */
EXTERN SV*
argspec_compile(SV* src, PSTATE* p_state)
{
    dTHX;
    SV* argspec = newSVpvn("", 0);
    STRLEN len;
    char *s = SvPV(src, len);
    char *end = s + len;

    if (SvUTF8(src))
	SvUTF8_on(argspec);

    while (isHSPACE(*s))
	s++;

    if (*s == '@') {
	/* try to deal with '@{ ... }' wrapping */
	char *tmp = s + 1;
	while (isHSPACE(*tmp))
	    tmp++;
	if (*tmp == '{') {
	    char c = ARG_FLAG_FLAT_ARRAY;
	    sv_catpvn(argspec, &c, 1);
	    tmp++;
	    while (isHSPACE(*tmp))
		tmp++;
	    s = tmp;
	}
    }
    while (s < end) {
	if (isHNAME_FIRST(*s) || *s == '@') {
	    char *name = s;
	    int a = ARG_SELF;
	    char **arg_name;

	    s++;
	    while (isHNAME_CHAR(*s))
		s++;

	    /* check identifier */
	    for ( arg_name = argname; a < ARG_LITERAL ; ++a, ++arg_name ) {
		if (strnEQ(*arg_name, name, s - name) &&
		    (*arg_name)[s - name] == '\0')
		    break;
	    }
	    if (a < ARG_LITERAL) {
		char c = (unsigned char) a;
		sv_catpvn(argspec, &c, 1);

		if (a == ARG_LINE || a == ARG_COLUMN) {
		    if (!p_state->line)
			p_state->line = 1; /* enable tracing of line/column */
		}
		if (a == ARG_SKIPPED_TEXT) {
		    if (!p_state->skipped_text) {
			p_state->skipped_text = newSVpvn("", 0);
                    }
                }
		if (a == ARG_ATTR || a == ARG_ATTRARR) {
		    if (p_state->argspec_entity_decode != ARG_DTEXT)
			p_state->argspec_entity_decode = ARG_ATTR;
		}
		else if (a == ARG_DTEXT) {
		    p_state->argspec_entity_decode = ARG_DTEXT;
		}
	    }
	    else {
		/* the precision of "%.*s" is fetched as an int, while
		 * (s - name) is a ptrdiff_t; cast so that the varargs
		 * types match on platforms where those differ in size.
		 */
		croak("Unrecognized identifier %.*s in argspec",
		      (int)(s - name), name);
	    }
	}
	else if (*s == '"' || *s == '\'') {
	    char *string_beg = s;
	    s++;
	    while (s < end && *s != *string_beg && *s != '\\')
		s++;
	    if (*s == *string_beg) {
		/* literal */
		int lit_len = s - string_beg - 1;
		unsigned char buf[2];
		/* the length has to fit in the single length byte */
		if (lit_len > 255)
		    croak("Literal string is longer than 255 chars in argspec");
		buf[0] = ARG_LITERAL;
		buf[1] = lit_len;
		sv_catpvn(argspec, (char*)buf, 2);
		sv_catpvn(argspec, string_beg+1, lit_len);
		s++;
	    }
	    else if (*s == '\\') {
		croak("Backslash reserved for literal string in argspec");
	    }
	    else {
		croak("Unterminated literal string in argspec");
	    }
	}
	else {
	    croak("Bad argspec (%s)", s);
	}

	while (isHSPACE(*s))
	    s++;
	
	if (*s == '}' && SvPVX(argspec)[0] == ARG_FLAG_FLAT_ARRAY) {
	    /* end of '@{ ... }' */
	    s++;
	    while (isHSPACE(*s))
		s++;
	    if (s < end)
		croak("Bad argspec: stuff after @{...} (%s)", s);
	}

	if (s == end)
	    break;
	if (*s != ',') {
	    croak("Missing comma separator in argspec");
	}
	s++;
	while (isHSPACE(*s))
	    s++;
    }
    return argspec;
}


/*
 * flush_pending_text() - report the text buffered in p_state->pend_text
 * as one E_TEXT event.
 *
 * The parser state is temporarily rewound to the position and cdata-ness
 * recorded when buffering started, and text buffering is switched off so
 * that report_event() delivers the text instead of buffering it again.
 * Afterwards the buffer is marked undefined and the state is restored.
 */
static void
flush_pending_text(PSTATE* p_state, SV* self)
{
    dTHX;
    SV*    buffered       = p_state->pend_text;
    STRLEN saved_column   = p_state->column;
    STRLEN saved_line     = p_state->line;
    STRLEN saved_offset   = p_state->offset;
    bool   saved_is_cdata = p_state->is_cdata;
    bool   saved_unbroken = p_state->unbroken_text;

    assert(p_state->pend_text && SvOK(p_state->pend_text));

    /* pretend we are back where the buffered text began */
    p_state->column        = p_state->pend_text_column;
    p_state->line          = p_state->pend_text_line;
    p_state->offset        = p_state->pend_text_offset;
    p_state->is_cdata      = p_state->pend_text_is_cdata;
    p_state->pend_text     = 0;
    p_state->unbroken_text = 0;

    report_event(p_state, E_TEXT,
		 SvPVX(buffered), SvEND(buffered),
		 SvUTF8(buffered), 0, 0, self);

    /* an undefined buffer signals "nothing pending" */
    SvOK_off(buffered);

    /* put everything back the way it was */
    p_state->column        = saved_column;
    p_state->line          = saved_line;
    p_state->offset        = saved_offset;
    p_state->is_cdata      = saved_is_cdata;
    p_state->pend_text     = buffered;
    p_state->unbroken_text = saved_unbroken;
}

/*
 * skip_until_gt() - find the '>' that terminates a tag.
 *
 * Scans [beg, end) and returns a pointer to the first '>' that is not
 * inside a quoted string, or `end` if there is none.  A '"' or '\''
 * only opens a quoted string when it follows a space or '=' (the scan
 * behaves as if a space preceded `beg`); the string is closed by the
 * next occurrence of the same quote character.
 */
static char*
skip_until_gt(char *beg, char *end)
{
    /* tries to emulate quote skipping behaviour observed in MSIE */
    char open_quote = '\0';   /* quote char we are inside of, if any */
    char last = ' ';          /* character preceding the current one */
    char *p;

    for (p = beg; p < end; last = *p++) {
	char c = *p;

	if (c == '>' && open_quote == '\0')
	    return p;

	if (c != '"' && c != '\'')
	    continue;

	if (c == open_quote)
	    open_quote = '\0';          /* closing quote */
	else if (open_quote == '\0' && (last == ' ' || last == '='))
	    open_quote = c;             /* opening quote */
    }
    return end;
}

/*
 * parse_comment() - recognize a comment.
 *
 * `beg` is the position right after the "--" that opened the comment;
 * the event text is reported from beg - 4.  How the end is found
 * depends on the parser options:
 *
 *   strict_comment           - each "--" toggles between inside and
 *                              outside of comment text; a '>' seen while
 *                              outside ends the construct.  Every
 *                              comment text found becomes one token.
 *   no_dash_dash_comment_end - the first '>' ends the comment.
 *   otherwise                - the comment ends at "--", optional
 *                              whitespace and '>'.
 *
 * Returns the position after the comment when it was reported, or `beg`
 * when the end of the buffer was reached before the comment was
 * complete.  The final `return 0` is the fall-through result.
 */
static char*
parse_comment(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
    char *s = beg;

    if (p_state->strict_comment) {
	dTOKENS(4);
	char *start_com = s;  /* also used to signal inside/outside */

	while (1) {
	    /* try to locate "--" */
	FIND_DASH_DASH:
	    /* printf("find_dash_dash: [%s]\n", s); */
	    while (s < end && *s != '-' && *s != '>')
		s++;

	    if (s == end) {
		FREE_TOKENS;
		return beg;
	    }

	    if (*s == '>') {
		s++;
		/* a '>' inside comment text is just part of the comment */
		if (start_com)
		    goto FIND_DASH_DASH;

		/* we are done recognizing all comments, make callbacks */
		report_event(p_state, E_COMMENT,
			     beg - 4, s, utf8,
			     tokens, num_tokens,
			     self);
		FREE_TOKENS;

		return s;
	    }

	    s++;
	    if (s == end) {
		FREE_TOKENS;
		return beg;
	    }

	    if (*s == '-') {
		/* two dashes in a row seen */
		s++;
		/* do something */
		if (start_com) {
		    /* end of a comment text; the token excludes the "--" */
		    PUSH_TOKEN(start_com, s-2);
		    start_com = 0;
		}
		else {
		    start_com = s;
		}
	    }
	}
    }
    else if (p_state->no_dash_dash_comment_end) {
	token_pos_t token;
        token.beg = beg;
        /* a lone '>' signals end-of-comment */
	while (s < end && *s != '>')
	    s++;
	token.end = s;
	if (s < end) {
	    s++;
	    report_event(p_state, E_COMMENT, beg-4, s, utf8, &token, 1, self);
	    return s;
	}
	else {
	    return beg;
	}
    }
    else { /* non-strict comment */
	token_pos_t token;
	token.beg = beg;
	/* try to locate /--\s*>/ which signals end-of-comment */
    LOCATE_END:
	while (s < end && *s != '-')
	    s++;
	token.end = s;  /* candidate end of the comment text */
	if (s < end) {
	    s++;
	    if (*s == '-') {
		s++;
		while (isHSPACE(*s))
		    s++;
		if (*s == '>') {
		    s++;
		    /* yup */
		    report_event(p_state, E_COMMENT, beg-4, s, utf8, &token, 1, self);
		    return s;
		}
	    }
	    if (s < end) {
		/* false alarm; resume right after the '-' we stopped at */
		s = token.end + 1;
		goto LOCATE_END;
	    }
	}
    
	if (s == end)
	    return beg;
    }

    return 0;
}


#ifdef MARKED_SECTION

/*
 * marked_section_update() - recompute p_state->ms from p_state->ms_stack.
 *
 * Every entry of the stack is a reference to an array of keyword
 * strings.  Each keyword maps to a marked_section_t value (unknown
 * keywords map to MS_NONE) and p_state->ms ends up as the largest value
 * found anywhere on the stack, or MS_NONE when there is no stack.
 * p_state->is_cdata is set to reflect whether the result is MS_CDATA.
 */
static void
marked_section_update(PSTATE* p_state)
{
    dTHX;
    AV* stack = p_state->ms_stack;
    int outer;
    int outer_last;

    /* start out with nothing in effect and raise the level as we go */
    p_state->ms = MS_NONE;

    outer_last = stack ? av_len(stack) : -1;
    for (outer = 0; outer <= outer_last; outer++) {
	SV** entry = av_fetch(stack, outer, 0);
	AV* keywords;
	int inner;
	int inner_last;

	if (!entry)
	    continue;

	keywords = (AV*)SvRV(*entry);
	inner_last = av_len(keywords);
	assert(SvTYPE(keywords) == SVt_PVAV);

	for (inner = 0; inner <= inner_last; inner++) {
	    SV** kw = av_fetch(keywords, inner, 0);
	    STRLEN kw_len;
	    char *kw_str;
	    enum marked_section_t kind = MS_NONE;

	    if (!kw)
		continue;

	    kw_str = SvPV(*kw, kw_len);
	    if (strEQ(kw_str, "include"))
		kind = MS_INCLUDE;
	    else if (strEQ(kw_str, "rcdata"))
		kind = MS_RCDATA;
	    else if (strEQ(kw_str, "cdata"))
		kind = MS_CDATA;
	    else if (strEQ(kw_str, "ignore"))
		kind = MS_IGNORE;

	    /* the highest valued keyword wins */
	    if (p_state->ms < kind)
		p_state->ms = kind;
	}
    }

    p_state->is_cdata = (p_state->ms == MS_CDATA);
}


/*
 * parse_marked_section() - recognize the start of a marked section,
 * i.e. "<![" followed by keywords (and optional "-- ... --" comments)
 * and a second "[".
 *
 * `beg` must point at the "<![".  On success the lower-cased keywords
 * are pushed on p_state->ms_stack as one array (just "include" when no
 * keyword was given), p_state->ms is recomputed, the consumed text is
 * reported as an E_NONE event and the position after the second "[" is
 * returned.
 *
 * Returns 0 when marked sections are not enabled or when the construct
 * is malformed, and `beg` when the buffer ended prematurely.
 */
static char*
parse_marked_section(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
    dTHX;
    char *s;
    AV* tokens = 0;  /* keywords seen so far; created on demand */

    if (!p_state->marked_sections)
	return 0;

    assert(beg[0] == '<');
    assert(beg[1] == '!');
    assert(beg[2] == '[');
    s = beg + 3;

FIND_NAMES:
    while (isHSPACE(*s))
	s++;
    while (isHNAME_FIRST(*s)) {
	char *name_start = s;
	char *name_end;
	SV *name;
	s++;
	while (isHNAME_CHAR(*s))
	    s++;
	name_end = s;
	while (isHSPACE(*s))
	    s++;
	if (s == end)
	    goto PREMATURE;

	if (!tokens)
	    tokens = newAV();
	name = newSVpvn(name_start, name_end - name_start);
	if (utf8)
	    SvUTF8_on(name);
	av_push(tokens, sv_lower(aTHX_ name));
    }
    if (*s == '-') {
	s++;
	if (*s == '-') {
	    /* comment */
	    s++;
	    while (1) {
		while (s < end && *s != '-')
		    s++;
		if (s == end)
		    goto PREMATURE;

		s++;  /* skip first '-' */
		if (*s == '-') {
		    s++;
		    /* comment finished */
		    goto FIND_NAMES;
		}
	    }      
	}
	else
	    goto FAIL;
      
    }
    if (*s == '[') {
	s++;
	/* yup */

	/* no keywords given is treated like "include" */
	if (!tokens) {
	    tokens = newAV();
	    av_push(tokens, newSVpvn("include", 7));
	}

	if (!p_state->ms_stack)
	    p_state->ms_stack = newAV();
	/* the stack takes over our reference to tokens */
	av_push(p_state->ms_stack, newRV_noinc((SV*)tokens));
	marked_section_update(p_state);
	report_event(p_state, E_NONE, beg, s, utf8, 0, 0, self);
	return s;
    }

FAIL:
    SvREFCNT_dec(tokens);
    return 0; /* not yet implemented */
  
PREMATURE:
    SvREFCNT_dec(tokens);
    return beg;
}
#endif


/*
 * parse_decl() - deal with constructs that start with "<!".
 *
 * `beg` points at the '<'.  Depending on what follows the "<!" this
 * handles:
 *   "--"    a comment, delegated to parse_comment()
 *   "["     a marked section (only when compiled with MARKED_SECTION),
 *           delegated to parse_marked_section()
 *   ">"     the empty declaration "<!>", reported as an empty comment
 *   letter  a DOCTYPE or ENTITY declaration, tokenized into words,
 *           quoted strings and "-- ... --" comments and reported as
 *           E_DECLARATION
 *
 * Anything else, or a declaration that fails to parse, ends up at
 * DECL_FAIL: with strict_comment set 0 is returned, otherwise everything
 * up to the first '>' is reported as a comment.
 *
 * Returns the position after the construct when an event was reported,
 * `beg` when more input is needed to decide, or 0 when the text is not
 * recognized.
 */
static char*
parse_decl(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
    char *s = beg + 2;

    if (*s == '-') {
	/* comment? */

	char *tmp;
	s++;
	if (s == end)
	    return beg;

	if (*s != '-')
	    goto DECL_FAIL;  /* nope, illegal */

	/* yes, two dashes seen */
	s++;

	tmp = parse_comment(p_state, s, end, utf8, self);
	/* parse_comment() hands back its own start on premature end;
	 * translate that to the start of the whole construct.
	 */
	return (tmp == s) ? beg : tmp;
    }

#ifdef MARKED_SECTION
    if (*s == '[') {
	/* marked section */
	char *tmp;
	tmp = parse_marked_section(p_state, beg, end, utf8, self);
	if (!tmp)
	    goto DECL_FAIL;
	return tmp;
    }
#endif

    if (*s == '>') {
	/* make <!> into empty comment <SGML Handbook 36:32> */
	token_pos_t token;
	token.beg = s;
	token.end = s;
	s++;
	report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self);
	return s;
    }

    if (isALPHA(*s)) {
	dTOKENS(8);
	char *decl_id = s;
	STRLEN decl_id_len;

	s++;
	/* declaration */
	while (s < end && isHNAME_CHAR(*s))
	    s++;
	decl_id_len = s - decl_id;
	if (s == end)
	    goto PREMATURE;

	/* just hardcode a few names as the recognized declarations */
	if (!((decl_id_len == 7 &&
	       strnEQx(decl_id, "DOCTYPE", 7, !CASE_SENSITIVE(p_state))) ||
	      (decl_id_len == 6 &&
	       strnEQx(decl_id, "ENTITY",  6, !CASE_SENSITIVE(p_state)))
	    )
	    )
	{
	    goto FAIL;
	}

	/* first word available */
	PUSH_TOKEN(decl_id, s);

	/* collect the remaining tokens until the closing '>' */
	while (1) {
	    while (s < end && isHSPACE(*s))
		s++;

	    if (s == end)
		goto PREMATURE;

	    if (*s == '"' || *s == '\'' || (*s == '`' && p_state->backquote)) {
		/* quoted string; the token includes the quotes */
		char *str_beg = s;
		s++;
		while (s < end && *s != *str_beg)
		    s++;
		if (s == end)
		    goto PREMATURE;
		s++;
		PUSH_TOKEN(str_beg, s);
	    }
	    else if (*s == '-') {
		/* comment */
		char *com_beg = s;
		s++;
		if (s == end)
		    goto PREMATURE;
		if (*s != '-')
		    goto FAIL;
		s++;

		/* look for the closing "--"; the token includes both pairs */
		while (1) {
		    while (s < end && *s != '-')
			s++;
		    if (s == end)
			goto PREMATURE;
		    s++;
		    if (s == end)
			goto PREMATURE;
		    if (*s == '-') {
			s++;
			PUSH_TOKEN(com_beg, s);
			break;
		    }
		}
	    }
	    else if (*s != '>') {
		/* plain word */
		char *word_beg = s;
		s++;
		while (s < end && isHNOT_SPACE_GT(*s))
		    s++;
		if (s == end)
		    goto PREMATURE;
		PUSH_TOKEN(word_beg, s);
	    }
	    else {
		break;
	    }
	}

	if (s == end)
	    goto PREMATURE;
	if (*s == '>') {
	    s++;
	    report_event(p_state, E_DECLARATION, beg, s, utf8, tokens, num_tokens, self);
	    FREE_TOKENS;
	    return s;
	}

    FAIL:
	FREE_TOKENS;
	goto DECL_FAIL;

    PREMATURE:
	FREE_TOKENS;
	return beg;

    }

DECL_FAIL:
    if (p_state->strict_comment)
	return 0;

    /* consider everything up to the first '>' a comment */
    while (s < end && *s != '>')
	s++;
    if (s < end) {
	token_pos_t token;
	token.beg = beg + 2;
	token.end = s;
	s++;
	report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self);
	return s;
    }
    else {
	return beg;
    }
}


/*
 * parse_start() - recognize a start tag.
 *
 * `beg` points at the '<'; the tag name begins at beg + 1.  The tag is
 * split into tokens: token 0 is the tag name, followed by one
 * (name, value) token pair per attribute.  The value token of an
 * attribute written without "=" has beg == 0; quoted values include
 * their quotes.
 *
 * With STRICT_NAMES() in effect tag and attribute names are restricted
 * to name characters; otherwise nearly anything up to whitespace or '>'
 * (and '=' for attribute names) is accepted.  With ALLOW_EMPTY_TAG() in
 * effect a "/>" ends the tag and an E_END event is reported right after
 * the E_START event.
 *
 * After a non-empty start tag has been reported (and not in xml_mode)
 * the tag name is looked up in literal_mode_elem[] to find out if the
 * parser should enter literal mode.
 *
 * Returns the position after the tag when it was reported, `beg` when
 * more input is needed, or 0 when the text is not a start tag.
 */
static char*
parse_start(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
    char *s = beg;
    int empty_tag = 0;
    dTOKENS(16);

    hctype_t tag_name_first, tag_name_char;
    hctype_t attr_name_first, attr_name_char;

    /* select the character classes that make up names */
    if (STRICT_NAMES(p_state)) {
	tag_name_first = attr_name_first = HCTYPE_NAME_FIRST;
	tag_name_char  = attr_name_char  = HCTYPE_NAME_CHAR;
    }
    else {
	tag_name_first = tag_name_char = HCTYPE_NOT_SPACE_GT;
	attr_name_first = HCTYPE_NOT_SPACE_GT;
	attr_name_char  = HCTYPE_NOT_SPACE_EQ_GT;
    }

    /* skip the '<' and the first character of the tag name */
    s += 2;

    while (s < end && isHCTYPE(*s, tag_name_char)) {
	/* a "/>" is not part of the name when empty tags are allowed */
	if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) {
	    if ((s + 1) == end)
		goto PREMATURE;
	    if (*(s + 1) == '>')
		break;
	}
	s++;
    }
    PUSH_TOKEN(beg+1, s);  /* tagname */

    while (isHSPACE(*s))
	s++;
    if (s == end)
	goto PREMATURE;

    while (isHCTYPE(*s, attr_name_first)) {
	/* attribute */
	char *attr_name_beg = s;
	char *attr_name_end;
	if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) {
	    if ((s + 1) == end)
		goto PREMATURE;
	    if (*(s + 1) == '>')
		break;
	}
	s++;
	while (s < end && isHCTYPE(*s, attr_name_char)) {
	    if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) {
		if ((s + 1) == end)
		    goto PREMATURE;
		if (*(s + 1) == '>')
		    break;
	    }
	    s++;
	}
	if (s == end)
	    goto PREMATURE;

	attr_name_end = s;
	PUSH_TOKEN(attr_name_beg, attr_name_end); /* attr name */

	while (isHSPACE(*s))
	    s++;
	if (s == end)
	    goto PREMATURE;

	if (*s == '=') {
	    /* with a value */
	    s++;
	    while (isHSPACE(*s))
		s++;
	    if (s == end)
		goto PREMATURE;
	    if (*s == '>') {
		/* parse it similar to ="" */
		PUSH_TOKEN(s, s);
		break;
	    }
	    if (*s == '"' || *s == '\'' || (*s == '`' && p_state->backquote)) {
		/* quoted value; the token includes the quotes */
		char *str_beg = s;
		s++;
		while (s < end && *s != *str_beg)
		    s++;
		if (s == end)
		    goto PREMATURE;
		s++;
		PUSH_TOKEN(str_beg, s);
	    }
	    else {
		/* unquoted value; runs until whitespace, '>' or "/>" */
		char *word_start = s;
		while (s < end && isHNOT_SPACE_GT(*s)) {
		    if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) {
			if ((s + 1) == end)
			    goto PREMATURE;
			if (*(s + 1) == '>')
			    break;
		    }
		    s++;
		}
		if (s == end)
		    goto PREMATURE;
		PUSH_TOKEN(word_start, s);
	    }
	    while (isHSPACE(*s))
		s++;
	    if (s == end)
		goto PREMATURE;
	}
	else {
	    PUSH_TOKEN(0, 0); /* boolean attr value */
	}
    }

    if (ALLOW_EMPTY_TAG(p_state) && *s == '/') {
	s++;
	if (s == end)
	    goto PREMATURE;
	empty_tag = 1;
    }

    if (*s == '>') {
	s++;
	/* done */
	report_event(p_state, E_START, beg, s, utf8, tokens, num_tokens, self);
	if (empty_tag) {
	    /* the end event has no source text of its own; only the tag name token is passed */
	    report_event(p_state, E_END, s, s, utf8, tokens, 1, self);
	}
	else if (!p_state->xml_mode) {
	    /* find out if this start tag should put us into literal_mode
	     */
	    int i;
	    int tag_len = tokens[0].end - tokens[0].beg;

	    for (i = 0; literal_mode_elem[i].len; i++) {
		if (tag_len == literal_mode_elem[i].len) {
		    /* try to match it */
		    char *s = beg + 1;
		    char *t = literal_mode_elem[i].str;
		    int len = tag_len;
		    /* case insensitive compare against the lower case table entry */
		    while (len) {
			if (toLOWER(*s) != *t)
			    break;
			s++;
			t++;
			if (!--len) {
			    /* found it */
			    p_state->literal_mode = literal_mode_elem[i].str;
			    p_state->is_cdata = literal_mode_elem[i].is_cdata;
			    /* printf("Found %s\n", p_state->literal_mode); */
			    goto END_OF_LITERAL_SEARCH;
			}
		    }
		}
	    }
	END_OF_LITERAL_SEARCH:
	    ;
	}

	FREE_TOKENS;
	return s;
    }
  
    FREE_TOKENS;
    return 0;

PREMATURE:
    FREE_TOKENS;
    return beg;
}


/*
 * parse_end() - recognize an end tag.
 *
 * `beg` points at the '<' of "</".  If a tag name follows, the rest of
 * the tag is skipped - only whitespace when strict_end is set, otherwise
 * anything up to the closing '>' as located by skip_until_gt() - and an
 * E_END event with the tag name as its single token is reported.
 *
 * If no tag name follows and strict_comment is off, everything up to
 * the closing '>' is reported as a comment instead.
 *
 * Returns the position after the '>' when an event was reported, `beg`
 * when the buffer ended before the '>' was seen, and 0 when the text is
 * not recognized.
 */
static char*
parse_end(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
    char *p = beg + 2;
    hctype_t first_class;
    hctype_t rest_class;
    token_pos_t tok;

    if (STRICT_NAMES(p_state)) {
	first_class = HCTYPE_NAME_FIRST;
	rest_class  = HCTYPE_NAME_CHAR;
    }
    else {
	first_class = HCTYPE_NOT_SPACE_GT;
	rest_class  = HCTYPE_NOT_SPACE_GT;
    }

    if (!isHCTYPE(*p, first_class)) {
	/* no tag name; treat it as a bogus comment unless being strict */
	if (p_state->strict_comment)
	    return 0;

	p = skip_until_gt(p, end);
	if (p >= end)
	    return beg;

	tok.beg = beg + 2;
	tok.end = p;
	p++;
	report_event(p_state, E_COMMENT, beg, p, utf8, &tok, 1, self);
	return p;
    }

    /* collect the tag name */
    tok.beg = p;
    do {
	p++;
    } while (p < end && isHCTYPE(*p, rest_class));
    tok.end = p;

    /* move on to where the '>' is expected */
    if (p_state->strict_end) {
	while (isHSPACE(*p))
	    p++;
    }
    else {
	p = skip_until_gt(p, end);
    }

    if (p >= end)
	return beg;
    if (*p != '>')
	return 0;

    /* a complete end tag has been recognized */
    p++;
    report_event(p_state, E_END, beg, p, utf8, &tok, 1, self);
    return p;
}


/*
 * Try to recognize a processing instruction ("<? ... >") starting at 'beg'.
 *
 * The instruction normally ends at the first '>'.  When either 'xml_mode'
 * or 'xml_pic' is set it must end with "?>" instead, and the '?' that is
 * part of the "<?" opener does not count as the closing one.
 *
 * Returns the position just past the closing '>' after reporting an
 * E_PROCESS event, or 'beg' when no terminator was found in the buffer.
 */
static char*
parse_process(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
    const int xml_style = (p_state->xml_mode || p_state->xml_pic);
    token_pos_t content;
    char *gt;

    content.beg = beg + 2;  /* skip '<?' */

    for (gt = content.beg; gt < end; gt++) {
	if (*gt != '>')
	    continue;

	if (xml_style) {
	    /* XML processing instructions are ended by "?>" */
	    if (gt - beg < 3 || gt[-1] != '?')
		continue;
	    content.end = gt - 1;
	}
	else {
	    content.end = gt;
	}

	/* a complete processing instruction seen */
	report_event(p_state, E_PROCESS, beg, gt + 1, utf8,
		     &content, 1, self);
	return gt + 1;
    }

    return beg;  /* could not find end */
}


#ifdef USE_PFUNC
/*
 * Parse function that never recognizes anything: it ignores all of its
 * arguments and always returns 0.
 * NOTE(review): presumably used as the parsefunc[] entry for characters
 * that cannot start markup -- confirm against pfunc.h.
 */
static char*
parse_null(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
    (void)p_state;
    (void)beg;
    (void)end;
    (void)utf8;
    (void)self;

    return (char *)0;
}



#include "pfunc.h"                   /* declares the parsefunc[] */
#endif /* USE_PFUNC */

/*
 * Parse as much of the buffer [beg, end) as possible and report the
 * events found through report_event().
 *
 * Each iteration of the main loop handles, in order:
 *   1. literal mode: text is scanned until the matching end tag of the
 *      element named by p_state->literal_mode is found,
 *   2. CDATA/RCDATA marked sections (when MARKED_SECTION is defined):
 *      text is scanned until "]]>" is found,
 *   3. plain text up to the next '<',
 *   4. the markup construct starting at that '<', dispatched on the
 *      character that follows it.
 *
 * Returns the position where parsing stopped.  Everything from that
 * position up to 'end' has not been reported yet; a return value equal
 * to 'end' means that the whole buffer was consumed.
 *
 * Several of the scans below read '*s' when 's == end' and therefore rely
 * on the buffer being followed by a '\0' byte.
 */
static char*
parse_buf(pTHX_ PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
    char *s = beg;
    char *t = beg;
    char *new_pos;

    while (!p_state->eof) {
	/*
	 * At the start of this loop we will always be ready for eating text
	 * or a new tag.  We will never be inside some tag.  The 't' points
	 * to where we started and the 's' is advanced as we go.
	 */

	while (p_state->literal_mode) {
	    char *l = p_state->literal_mode;
	    char *end_text;

	    while (s < end && *s != '<') {
		s++;
	    }

	    if (s == end) {
		/* no candidate end tag in the buffer; report nothing yet */
		s = t;
		goto DONE;
	    }

	    end_text = s;
	    s++;
      
	    /* here we rely on '\0' termination of perl svpv buffers */
	    if (*s == '/') {
		s++;
		/* case insensitive match against the literal element name */
		while (*l && toLOWER(*s) == *l) {
		    s++;
		    l++;
		}

		if (!*l && (strNE(p_state->literal_mode, "plaintext") || p_state->closing_plaintext)) {
		    /* matched it all */
		    token_pos_t end_token;
		    end_token.beg = end_text + 2;
		    end_token.end = s;

		    while (isHSPACE(*s))
			s++;
		    if (*s == '>') {
			s++;
			if (t != end_text)
			    report_event(p_state, E_TEXT, t, end_text, utf8,
					 0, 0, self);
			report_event(p_state, E_END,  end_text, s, utf8,
				     &end_token, 1, self);
			p_state->literal_mode = 0;
			p_state->is_cdata = 0;
			t = s;
		    }
		}
	    }
	}

#ifdef MARKED_SECTION
	while (p_state->ms == MS_CDATA || p_state->ms == MS_RCDATA) {
	    while (s < end && *s != ']')
		s++;
	    if (*s == ']') {
		char *end_text = s;
		s++;
		if (*s == ']' && *(s + 1) == '>') {
		    s += 2;
		    /* marked section end */
		    if (t != end_text)
			report_event(p_state, E_TEXT, t, end_text, utf8,
				     0, 0, self);
		    report_event(p_state, E_NONE, end_text, s, utf8, 0, 0, self);
		    t = s;
		    SvREFCNT_dec(av_pop(p_state->ms_stack));
		    marked_section_update(p_state);
		    continue;
		}
	    }
	    if (s == end) {
		s = t;
		goto DONE;
	    }
	}
#endif

	/* first we try to match as much text as possible */
	while (s < end && *s != '<') {
#ifdef MARKED_SECTION
	    /* Look ahead for a complete "]]>" without moving 's'.  Advancing
	     * while probing would make the 's++' below skip the character
	     * that follows a lone ']' unexamined (hiding a '<' or the start
	     * of the real "]]>") and could move 's' beyond 'end' when the
	     * buffer ends in "]]".
	     */
	    if (p_state->ms && *s == ']' &&
		end - s >= 3 && s[1] == ']' && s[2] == '>')
	    {
		char *end_text = s;
		s += 3;
		report_event(p_state, E_TEXT, t, end_text, utf8,
			     0, 0, self);
		report_event(p_state, E_NONE, end_text, s, utf8,
			     0, 0, self);
		t = s;
		SvREFCNT_dec(av_pop(p_state->ms_stack));
		marked_section_update(p_state);
		continue;
	    }
#endif
	    s++;
	}
	if (s != t) {
	    if (*s == '<') {
		report_event(p_state, E_TEXT, t, s, utf8, 0, 0, self);
		t = s;
	    }
	    else {
		/* reached the end of the buffer while inside text; hold
		 * back the tail since more of it might still arrive
		 */
		s--;
		if (isHSPACE(*s)) {
		    /* wait with white space at end */
		    while (s >= t && isHSPACE(*s))
			s--;
		}
		else {
		    /* might be a chopped up entities/words */
		    while (s >= t && !isHSPACE(*s))
			s--;
		    while (s >= t && isHSPACE(*s))
			s--;
		}
		s++;
		if (s != t)
		    report_event(p_state, E_TEXT, t, s, utf8, 0, 0, self);
		break;
	    }
	}

	/* the parse functions below need at least 3 characters to look at */
	if (end - s < 3)
	    break;

	/* next char is known to be '<' and pointed to by 't' as well as 's' */
	s++;

#ifdef USE_PFUNC
	new_pos = parsefunc[(unsigned char)*s](p_state, t, end, utf8, self);
#else
	if (isHNAME_FIRST(*s))
	    new_pos = parse_start(p_state, t, end, utf8, self);
	else if (*s == '/')
	    new_pos = parse_end(p_state, t, end, utf8, self);
	else if (*s == '!')
	    new_pos = parse_decl(p_state, t, end, utf8, self);
	else if (*s == '?')
	    new_pos = parse_process(p_state, t, end, utf8, self);
	else
	    new_pos = 0;
#endif /* USE_PFUNC */

	if (new_pos) {
	    if (new_pos == t) {
		/* no progress, need more data to know what it is */
		s = t;
		break;
	    }
	    t = s = new_pos;
	}

	/* if we get out here then this was not a conforming tag, so
	 * treat it as plain text at the top of the loop again (we
	 * have already skipped past the "<").
	 */
    }

DONE:
    return s;

}

/*
 * Feed one chunk of the document to the parser, or signal end of
 * document.
 *
 *   p_state - the parser state
 *   chunk   - the next piece of the document, or 0 to signal end of
 *             document
 *   self    - passed through unchanged to report_event()
 *
 * An E_START_DOCUMENT event is reported before anything else the first
 * time this is called for a document.
 *
 * With a chunk: the chunk is appended to whatever is left over in
 * p_state->buf from earlier calls, the result is run through parse_buf()
 * and the part that parse_buf() did not consume is kept in p_state->buf
 * for the next call.  For the first chunk of a document (offset 0) some
 * byte order mark checks are done when warnings are enabled.
 *
 * Without a chunk: the left over buffer is flushed (unterminated markup
 * is reported as a comment or as text), pending text is flushed,
 * E_END_DOCUMENT is reported and the position and mode fields of the
 * state are reset so that a new document can be parsed.
 */
EXTERN void
parse(pTHX_
      PSTATE* p_state,
      SV* chunk,
      SV* self)
{
    char *s, *beg, *end;
    U32 utf8 = 0;
    STRLEN len;

    if (!p_state->start_document) {
	/* only the address of 'dummy' is used: the event has no text */
	char dummy[1];
	report_event(p_state, E_START_DOCUMENT, dummy, dummy, 0, 0, 0, self);
	p_state->start_document = 1;
    }

    if (!chunk) {
	/* eof */
	char empty[1];
	if (p_state->buf && SvOK(p_state->buf)) {
	    /* flush it */
	    s = SvPV(p_state->buf, len);
	    end = s + len;
	    utf8 = SvUTF8(p_state->buf);
	    assert(len);

	    while (s < end) {
		if (p_state->literal_mode) {
		    if (strEQ(p_state->literal_mode, "plaintext") ||
			strEQ(p_state->literal_mode, "xmp") ||
			strEQ(p_state->literal_mode, "iframe") ||
			strEQ(p_state->literal_mode, "textarea"))
		    {
			/* rest is considered text */
			break;
		    }
		    if (strEQ(p_state->literal_mode, "script") ||
			strEQ(p_state->literal_mode, "style"))
		    {
			/* effectively make it an empty element */
			token_pos_t t;
			char dummy;
			t.beg = p_state->literal_mode;
			t.end = p_state->literal_mode + strlen(p_state->literal_mode);
			report_event(p_state, E_END, &dummy, &dummy, 0, &t, 1, self);
		    }
		    else {
			p_state->pending_end_tag = p_state->literal_mode;
		    }
		    /* leave literal mode and parse the rest as normal markup */
		    p_state->literal_mode = 0;
		    s = parse_buf(aTHX_ p_state, s, end, utf8, self);
		    continue;
		}

		if (!p_state->strict_comment && !p_state->no_dash_dash_comment_end && *s == '<') {
		    /* retry once with the 'no_dash_dash_comment_end' flag set */
		    p_state->no_dash_dash_comment_end = 1;
		    s = parse_buf(aTHX_ p_state, s, end, utf8, self);
		    continue;
		}

		if (!p_state->strict_comment && *s == '<') {
		    char *s1 = s + 1;
		    if (s1 == end || isHNAME_FIRST(*s1) || *s1 == '/' || *s1 == '!' || *s1 == '?') {
			/* some kind of unterminated markup.  Report rest as a comment */
			token_pos_t token;
			token.beg = s + 1;
			token.end = end;
			report_event(p_state, E_COMMENT, s, end, utf8, &token, 1, self);
			s = end;
		    }
		}

		break;
	    }

	    if (s < end) {
		/* report rest as text */
		report_event(p_state, E_TEXT, s, end, utf8, 0, 0, self);
	    }
	    
	    SvREFCNT_dec(p_state->buf);
	    p_state->buf = 0;
	}
	if (p_state->pend_text && SvOK(p_state->pend_text))
	    flush_pending_text(p_state, self);

	if (p_state->ignoring_element) {
	    /* document not balanced */
	    SvREFCNT_dec(p_state->ignoring_element);
	    p_state->ignoring_element = 0;
	}
	report_event(p_state, E_END_DOCUMENT, empty, empty, 0, 0, 0, self);

	/* reset state */
	p_state->offset = 0;
	if (p_state->line)
	    p_state->line = 1;
	p_state->column = 0;
	p_state->start_document = 0;
	p_state->literal_mode = 0;
	p_state->is_cdata = 0;
	return;
    }

#ifdef UNICODE_HTML_PARSER
    if (p_state->utf8_mode)
	sv_utf8_downgrade(chunk, 0);
#endif

    if (p_state->buf && SvOK(p_state->buf)) {
	/* continue from what was left over by the previous call */
	sv_catsv(p_state->buf, chunk);
	beg = SvPV(p_state->buf, len);
	utf8 = SvUTF8(p_state->buf);
    }
    else {
	beg = SvPV(chunk, len);
	utf8 = SvUTF8(chunk);
	if (p_state->offset == 0 && DOWARN) {
	    /* Print warnings if we find unexpected Unicode BOM forms */
#ifdef UNICODE_HTML_PARSER
	    if (p_state->argspec_entity_decode &&
		!(p_state->attr_encoded && p_state->argspec_entity_decode == ARG_ATTR) &&
		!p_state->utf8_mode && (
                 (!utf8 && len >= 3 && strnEQ(beg, "\xEF\xBB\xBF", 3)) ||
		 (utf8 && len >= 6 && strnEQ(beg, "\xC3\xAF\xC2\xBB\xC2\xBF", 6)) ||
		 (!utf8 && probably_utf8_chunk(aTHX_ beg, len))
		)
	       )
	    {
		warn("Parsing of undecoded UTF-8 will give garbage when decoding entities");
	    }
	    if (utf8 && len >= 2 && strnEQ(beg, "\xFF\xFE", 2)) {
		warn("Parsing string decoded with wrong endianness");
	    }
#endif
	    /* These patterns contain '\0' bytes, so they have to be compared
	     * with memEQ().  A strncmp() based comparison stops at the first
	     * '\0' and would accept any chunk that merely starts with a
	     * '\0' byte as matching the first pattern.
	     */
	    if (!utf8 && len >= 4 &&
		(memEQ(beg, "\x00\x00\xFE\xFF", 4) ||
		 memEQ(beg, "\xFE\xFF\x00\x00", 4))
		)
	    {
		warn("Parsing of undecoded UTF-32");
	    }
	    else if (!utf8 && len >= 2 &&
		     (strnEQ(beg, "\xFE\xFF", 2) || strnEQ(beg, "\xFF\xFE", 2))
		)
	    {
		warn("Parsing of undecoded UTF-16");
	    }
	}
    }

    if (!len)
	return; /* nothing to do */

    end = beg + len;
    s = parse_buf(aTHX_ p_state, beg, end, utf8, self);

    if (s == end || p_state->eof) {
	/* nothing to carry over; mark the buffer as empty but keep the SV */
	if (p_state->buf) {
	    SvOK_off(p_state->buf);
	}
    }
    else {
	/* need to keep rest in buffer */
	if (p_state->buf) {
	    /* chop off some chars at the beginning */
	    if (SvOK(p_state->buf)) {
		sv_chop(p_state->buf, s);
	    }
	    else {
		/* the buffer SV exists but is unused, so 's' points into
		 * the chunk: copy the unparsed tail over
		 */
		sv_setpvn(p_state->buf, s, end - s);
		if (utf8)
		    SvUTF8_on(p_state->buf);
		else
		    SvUTF8_off(p_state->buf);
	    }
	}
	else {
	    p_state->buf = newSVpv(s, end - s);
	    if (utf8)
		SvUTF8_on(p_state->buf);
	}
    }
    return;
}