##
#
# Copyright 2001, AllAfrica Global Media
#
# This file is part of XML::Comma
#
# XML::Comma is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# For more information about XML::Comma, point a web browser at
# http://xymbollab.com/tools/comma/, or read the tutorial included
# with the XML::Comma distribution at docs/guide.html
#
##
##
#
# This parser and the PurePerl parser share very similar top_level
# code (all the stuff in perl, here). The main difference are 1) this
# module needs to do special _c_new stuff to create an object, and 2)
# the resulting objects aren't hashrefs and use accessor methods where
# the PurePerl version sometimes uses private hash slots.
#
# It would be nice to refactor the code so that both of these inherit
# from a common abstract parent. On the other hand, it's probably not
# worth doing all that refactoring unless the architecture needs to
# change substantially, as the code here is stable and well-tested.
#
##
package XML::Comma::Parsing::SimpleC;
use strict;
use XML::Comma;
use XML::Comma::Util qw( dbg trim );
my $OPEN_TAG = 1;
my $CLOSE_TAG = 2;
my $COMMENT = 3;
my $CDATA = 4;
my $PROCESSING_INSTRUCTION = 5;
my $DOCTYPE = 6;
my $TEXT = 7;
my $DONE = 8;
my $EMPTY_ELEMENT = 9;
# _top_level_class
# _from_file
# _string
# _pos
# _wpos
# _el_stack : [] for reporting context
sub new {
my ( $class, %arg ) = @_;
my $string;
my $filename;
if ( $arg{block} ) {
$string = $arg{block};
} elsif ( $arg{filename} ) {
$filename = $arg{filename};
open ( FILE, "${ \( $arg{filename} )}" ) ||
die "can't open file '${ \( $arg{filename} )}': $!\n";
local $/ = undef;
$string = <FILE>;
close ( FILE );
} else {
die "no block or filename to parse";
}
my $self = _c_new ( 'XML::Comma::Parsing::SimpleC',
$string,
$filename || '',
$arg{top_level_class} || 'XML::Comma::Doc',
0 );
return $self->handle_document ( $arg{read_args} );
}
# create a "child" of this parser to process include directives
# args:
#
# name -- name of include for DefManager to find
# handle_args -- bundle of arguments to pass directly to handle_element
#
sub includes_parser {
my ( $parent, %arg ) = @_;
my ( $string, $file ) =
XML::Comma::DefManager->include_string ( $arg{name}, $arg{args_string} );
my $self = _c_new ( 'XML::Comma::Parsing::SimpleC',
$string,
$file,
$parent->top_level_class(),
1 );
#$self->{_in_include} = 1;
eval {
$self->handle_element ( @{$arg{handle_args}} );
}; if ( $@ ) {
# otherwise, we should construct a pretty error string
my $context = join '/', map { $_->tag() } $self->down_tree_branch();
$context = ( $file . ':' . $context) if $file;
XML::Comma::Log->err
( 'PARSE_ERR', $@, undef,
"(in '$context' at " . $self->pos_line_and_column() . ")\n" );
}
}
sub parse {
my ( $class, %arg ) = @_;
my $string = $arg{block} || die "need a block to SimpleC::parse";
my $self = _c_new ( 'XML::Comma::Parsing::SimpleC', $string, '', '', 0 );
eval {
# prolog
my ( $type, $string, $tag ) = $self->skip_prolog();
# root element
$self->handle_element ( $self, $tag, 0, 0 );
# nothing else
$self->eat_whitespace();
( $type, $string, $tag ) = $self->next_token();
while ( $type != $DONE ) {
if ( $string and
$type != $COMMENT and $type != $PROCESSING_INSTRUCTION ) {
die "more content found after root element: '$string'\n";
}
$self->eat_whitespace();
( $type, $string, $tag ) = $self->next_token();
}
}; if ( $@ ) {
my $context = join '/', map { $_->tag() } $self->down_tree_branch();
undef @{$self->_el_stack()};
XML::Comma::Log->err
( 'PARSE_ERR', $@, undef,
"(in '$context' at " . $self->pos_line_and_column() . ")\n" );
}
}
sub raw_append {}
sub cdata_wrap {}
####
# document parsing
####
sub handle_document {
my ( $self, $read_args ) = @_;
my $doc;
my $file = $self->from_file();
eval {
# prolog and outermost envelope
my ( $type, $string, $tag ) = $self->skip_prolog();
# create document
$doc = $self->top_level_class()
->new ( type => $tag,
from_file => $file,
last_mod_time => ($file ? (stat($file))[9] : 0),
read_args => $read_args );
push @{$self->_el_stack()}, $doc;
# recursively handle elements
$self->handle_element ( $doc, $tag, 1, 1 );
# nothing else except comments and whitespace
$self->eat_whitespace();
( $type, $string, $tag ) = $self->next_token();
while ( $type != $DONE ) {
if ( $type != $COMMENT and $type != $PROCESSING_INSTRUCTION ) {
die "more content found after root element: '$string'\n";
}
$self->eat_whitespace();
( $type, $string, $tag ) = $self->next_token();
}
}; if ( $@ ) {
my $context = join '/', map { $_->tag() } $self->down_tree_branch();
$context = ($file . ':' . $context) if $file;
XML::Comma::Log->err
( 'PARSE_ERR', $@, undef,
"(in '$context' at " . $self->pos_line_and_column() . ")\n" );
}
return $doc;
}
sub down_tree_branch {
my $self = shift();
return @{$self->_el_stack()};
}
sub skip_prolog {
my $self = shift();
# let's be overly forgiving and accept docs with leading whitespace
$self->eat_whitespace();
my ( $type, $string, $special ) = $self->next_token();
while ( $type != $OPEN_TAG ) {
if ( $type == $CDATA ) {
die "unexpected CDATA\n";
} elsif ( $type == $TEXT ) {
die "text outside of root element\n";
} elsif ( $type == $DONE ) {
die "no document content\n";
}
$self->eat_whitespace();
( $type, $string, $special ) = $self->next_token();
}
return ( $type, $string, $special );
}
sub handle_element {
my ( $self, $el, $tag, $nested, $comma_level ) = @_;
while ( 1 ) {
my ( $type, $string, $special ) = $self->next_token();
if ( $type == $TEXT ) {
# text -- append (let el do its own checking)
$el->raw_append ( $string );
} elsif ( $type == $OPEN_TAG ) {
# open tag -- recurse
if ( $nested ) {
my $new = $el->add_element ( $special, $string );
push @{$self->_el_stack()}, $new;
$self->handle_element
( $new,
$special,
($new->def() ? $new->def()->is_nested() : 1),
1 );
} else {
$el->raw_append ( $string );
$self->handle_element ( $el, $special, 0, 0 );
}
} elsif ( $type == $EMPTY_ELEMENT ) {
if ( $nested ) {
$el->add_element ( $special );
} else {
$el->raw_append ( $string );
}
} elsif ( $type == $CLOSE_TAG ) {
# close tag -- check for match and return
if ( $special eq $tag ) {
if ( $comma_level ) {
$el->finish_initial_read ( $self );
pop @{$self->_el_stack()};
} else {
$el->raw_append ( $string );
}
return; # ok
} else {
die "mismatched tag: '$tag', '$special'\n";
}
} elsif ( $type == $CDATA ) {
# cdata -- extract and append
if ( $nested ) {
die "cdata content '$string' found for nested element '$tag'\n";
} else {
$el->cdata_wrap();
$el->raw_append ( $special );
}
} elsif ( $type == $DOCTYPE ) {
# doctype -- throw an error
die "doctype after prolog\n";
} elsif ( $type == $DONE ) {
unless ( $self->_in_include() ) {
# finished prematurely
die "reached end of document unexpectedly\n";
}
return; # putatively ok
} elsif ( $type == $PROCESSING_INSTRUCTION ) {
my $content = trim ( substr $string, 2, length($string) - 4 );
my ( $directive, $first_word, $rest ) = split ( /\s+/, $content, 3 );
if ( $directive and $directive eq '#include' ) {
die "#include directive with no include name given\n"
unless $first_word;
$self->includes_parser ( name => $first_word,
args_string => $rest,
handle_args => [ $el, $tag,
$nested, $comma_level ] );
}
## ignore other processing instructions
}
## ignore comments
}
}
sub pos_line_and_column {
my $self = shift();
my $line = 1;
my $pos = 0;
my $col = 0;
my $string = $self->string();
while ( $string =~ /(\r\n)|(\r)|(\n)/g ) {
last if pos($string) > $self->pos();
$line++;
$pos = pos ( $string );
}
$col = $self->pos() - $pos;
return "line $line, column $col";
}
# ------------------------------------------------------------------------------
# C code follows
# ------------------------------------------------------------------------------
my $code;
BEGIN {
$code = <<'END';
#include <string.h>
typedef struct {
char* pos;
char* wpos;
char* string;
char* from_file;
char* doc_class;
AV* el_stack;
int in_include;
} Cobj;
SV* _c_new ( char* class, char* string, char* from_file, char* doc_class, int in_include );
void DESTROY ( SV* self );
char* from_file ( SV* obj );
char* top_level_class ( SV* obj );
int pos ( SV* obj );
int _in_include ( SV* obj );
char* string ( SV* obj );
AV* _el_stack ( SV* obj );
void eat_whitespace ( SV* self );
void next_token ( SV* self );
void done_return ( void );
void b_token ( Cobj* cobj );
void open_tag ( Cobj* cobj );
void close_tag ( Cobj* cobj );
void processing_instruction ( Cobj* cobj );
void bang_instruction ( Cobj* cobj );
void doctype ( Cobj* cobj );
void comment ( Cobj* cobj );
void cdata ( Cobj* cobj );
void text ( Cobj* cobj );
char t_get_c ( Cobj* cobj );
//----------------------------------------
SV* _c_new ( char* class, char* string, char* from_file, char* doc_class, int in_include ) {
Cobj* cobj = malloc ( sizeof(Cobj) );
SV* obj_ref = newSViv(0);
SV* obj = newSVrv ( obj_ref, class );
cobj->string = strdup ( string );
cobj->pos = cobj->string;
cobj->wpos = cobj->string;
cobj->from_file = strdup ( from_file );
cobj->doc_class = strdup ( doc_class );
cobj->el_stack = newAV();
cobj->in_include = in_include;
sv_setiv ( obj, (IV)cobj );
SvREADONLY_on ( obj );
return obj_ref;
}
void DESTROY ( SV* self ) {
Cobj* cobj = (Cobj*)SvIV(SvRV(self));
free ( cobj->string );
free ( cobj->from_file );
free ( cobj->doc_class );
av_undef ( cobj->el_stack );
sv_2mortal ( (SV*)cobj->el_stack );
free ( cobj );
}
//----------------------------------------
char* from_file ( SV* obj ) {
return ((Cobj*)SvIV(SvRV(obj)))->from_file;
}
char* top_level_class ( SV* obj ) {
return ((Cobj*)SvIV(SvRV(obj)))->doc_class;
}
// get position, relateive to start of string
int pos ( SV* obj ) {
return (((Cobj*)SvIV(SvRV(obj)))->pos) - (((Cobj*)SvIV(SvRV(obj)))->string);
}
// get boolean indicating whether we're processing an include file or not
int _in_include ( SV* obj ) {
return (((Cobj*)SvIV(SvRV(obj)))->in_include);
}
char* string ( SV* obj ) {
return ((Cobj*)SvIV(SvRV(obj)))->string;
}
AV* _el_stack ( SV* obj ) {
return ((Cobj*)SvIV(SvRV(obj)))->el_stack;
}
void eat_whitespace ( SV* self ) {
Cobj* cobj = (Cobj*)SvIV(SvRV(self));
cobj->pos += strspn ( cobj->pos, " \t\n\r" );
cobj->wpos = cobj->pos;
}
//----------------------------------------
void next_token ( SV* self ) {
Cobj* cobj = (Cobj*)SvIV(SvRV(self));
char c = t_get_c(cobj);
if ( c == '\0' ) {
return done_return();
} else if ( c == '<' ) {
return b_token(cobj);
} else {
return text(cobj);
}
}
void done_return() {
Inline_Stack_Vars;
Inline_Stack_Reset;
Inline_Stack_Push(sv_2mortal(newSViv(8))); //DONE
Inline_Stack_Done;
Inline_Stack_Return ( 1 );
}
void b_token ( Cobj* cobj ) {
char c = t_get_c(cobj);
if ( c == '/' ) {
return close_tag(cobj);
} else if ( c == '?' ) {
return processing_instruction(cobj);
} else if ( c == '!' ) {
return bang_instruction(cobj);
} else {
return open_tag(cobj);
}
}
void open_tag ( Cobj* cobj ) {
char c;
char* i;
char* tag_name_end;
Inline_Stack_Vars;
cobj->wpos = strchr ( cobj->pos, '>' );
if ( cobj->wpos == NULL ) {
croak ( "reached end of document while inside open tag...\n" );
}
tag_name_end = strpbrk ( cobj->pos, "/ \t\n\r>" );
cobj->wpos++;
// check tag name
for ( i=(cobj->pos)+1; i < tag_name_end; i++ ) {
c = *i;
if ( ! (((c >= 'a') && (c <= 'z')) ||
((c >= 'A') && (c <= 'Z')) ||
((c >= '0') && (c <= '9')) ||
(c == '_')) ) {
croak ( "illegal tag name\n" );
}
}
// check entity legality inside tag
check_entities ( cobj, cobj->pos, cobj->wpos );
Inline_Stack_Reset;
if ( *(cobj->wpos - 2) == '/' ) {
Inline_Stack_Push(sv_2mortal(newSViv(9))); //EMPTY_ELEMENT
} else {
Inline_Stack_Push(sv_2mortal(newSViv(1))); //OPEN_TAG
}
// complete token string
Inline_Stack_Push(sv_2mortal(newSVpvn(cobj->pos, cobj->wpos - cobj->pos)));
// tag name string
Inline_Stack_Push(sv_2mortal(newSVpvn(cobj->pos + 1,
tag_name_end - (cobj->pos + 1))));
cobj->pos = cobj->wpos;
Inline_Stack_Done;
Inline_Stack_Return ( 3 );
}
void close_tag ( Cobj* cobj ) {
char c;
Inline_Stack_Vars;
cobj->wpos = strchr ( cobj->pos, '>' );
if ( cobj->wpos == NULL ) {
croak ( "reached end of document while inside close tag\n" );
}
cobj->wpos++;
Inline_Stack_Reset;
Inline_Stack_Push(sv_2mortal(newSViv(2))); //CLOSE_TAG
// complete token string
Inline_Stack_Push(sv_2mortal(newSVpvn(cobj->pos,cobj->wpos - cobj->pos)));
// tag name string
Inline_Stack_Push(sv_2mortal(newSVpvn(cobj->pos + 2,
cobj->wpos - cobj->pos - 3)));
// printf ( "tag name: %s\n", SvPV(tagname,PL_na) );
Inline_Stack_Done;
cobj->pos = cobj->wpos;;
Inline_Stack_Return ( 3 );
}
void processing_instruction ( Cobj* cobj ) {
char c;
Inline_Stack_Vars;
cobj->wpos = strstr ( cobj->pos, "?>" );
if ( cobj->wpos == NULL ) {
croak ( "reached end of ducument while inside <?...\n" );
}
cobj->wpos += 2;
Inline_Stack_Reset;
Inline_Stack_Push(sv_2mortal(newSViv(5))); //PROCESSING_INSTRUCTION
Inline_Stack_Push(sv_2mortal(newSVpvn(cobj->pos,cobj->wpos - cobj->pos)));
Inline_Stack_Done;
cobj->pos = cobj->wpos;
Inline_Stack_Return ( 2 );
}
void bang_instruction ( Cobj* cobj ) {
char c,d;
c = *(cobj->wpos++); d = *(cobj->wpos++);
if ( (c == '-') && (d == '-') ) {
return comment(cobj);
} else if ( (c == 'D') && (d == 'O') &&
(*(cobj->wpos++) == 'C') && (*(cobj->wpos++) == 'T') &&
(*(cobj->wpos++) == 'Y') && (*(cobj->wpos++) == 'P') &&
(*(cobj->wpos++) == 'E') ) {
return doctype(cobj);
} else if ( (c == '[') && (d == 'C') &&
(*(cobj->wpos++) == 'D') && (*(cobj->wpos++) == 'A') &&
(*(cobj->wpos++) == 'T') && (*(cobj->wpos++) == 'A') &&
(*(cobj->wpos++) == '[') ) {
return cdata(cobj);
} else {
croak ( "bad <! tag\n" );
}
}
void doctype ( Cobj* cobj ) {
char c;
Inline_Stack_Vars;
while ( (c = t_get_c(cobj)) != '\0' ) {
if ( c == '>' ) {
Inline_Stack_Reset;
Inline_Stack_Push(sv_2mortal(newSViv(6)));
Inline_Stack_Push(sv_2mortal(newSVpvn(cobj->pos,cobj->wpos - cobj->pos)));
Inline_Stack_Done;
cobj->pos = cobj->wpos;
Inline_Stack_Return ( 2 );
} else if ( c == '[' ) {
croak ( "parser doesn't handle in-line doctype declarations\n" );
}
}
croak ( "reached end of document while inside <!DOCTYPE...\n" );
}
void comment ( Cobj* cobj ) {
char c;
Inline_Stack_Vars;
cobj->wpos = strstr ( cobj->pos+4, "--" );
if ( cobj->wpos == NULL ) {
croak ( "reached end of document while inside a comment\n" );
}
cobj->wpos += 2;
if ( t_get_c(cobj) == '>' ) {
Inline_Stack_Reset;
Inline_Stack_Push(sv_2mortal(newSViv(3))); //COMMENT
Inline_Stack_Push(sv_2mortal(newSVpvn(cobj->pos,cobj->wpos-cobj->pos)));
Inline_Stack_Done;
cobj->pos = cobj->wpos;
Inline_Stack_Return ( 2 );
} else {
croak ( "string '--' not allowed inside comments\n" );
}
}
void cdata ( Cobj* cobj ) {
char c;
Inline_Stack_Vars;
cobj->wpos = strstr ( cobj->pos, "]]>" );
if ( cobj->wpos == NULL ) {
croak ( "reached end of document while inside <![CDATA...\n" );
}
cobj->wpos += 3;
Inline_Stack_Reset;
Inline_Stack_Push(sv_2mortal(newSViv(4))); //CDATA
Inline_Stack_Push(sv_2mortal(newSVpvn(cobj->pos,cobj->wpos-cobj->pos)));
Inline_Stack_Push(sv_2mortal(newSVpvn(cobj->pos+9,
cobj->wpos - cobj->pos - 12)));
Inline_Stack_Done;
cobj->pos = cobj->wpos;
Inline_Stack_Return ( 3 );
}
void text ( Cobj* cobj ) {
char c;
//char* index;
Inline_Stack_Vars;
cobj->wpos = strchr ( cobj->pos, '<' );
// make sure we haven't overrun our document end. if we have, return
// what we've got so far and let someone higher up the stack worry
// about it
if ( cobj->wpos == NULL ) {
cobj->wpos = index ( cobj->pos, '\0' );
}
// make sure all the entities in this text chunk look legal
check_entities ( cobj, cobj->pos, cobj->wpos );
Inline_Stack_Reset;
Inline_Stack_Push(sv_2mortal(newSViv(7))); //TEXT
Inline_Stack_Push(sv_2mortal(newSVpvn(cobj->pos,cobj->wpos - cobj->pos)));
Inline_Stack_Done;
cobj->pos = cobj->wpos;
Inline_Stack_Return ( 2 );
}
void check_entities ( Cobj* cobj, char* start, char* end ) {
char c;
char* pos = strchr ( start, '&' );
while ( (pos < end) && (pos != NULL) ) {
// each char until ; must be a-zA-Z0-9
while ( (c = *(++pos)) != ';' ) {
if ( (pos >= end) || ! (((c >= 'a') && (c <= 'z')) ||
((c >= 'A') && (c <= 'Z')) ||
((c >= '0') && (c <= '9')) ||
(c == '_') || (c == '#')) ) {
// not ok, set pos so perl error reporting will give the right pos
cobj->pos = pos;
croak ( "& found that isn't part of an entity reference" );
}
}
pos = strchr ( pos, '&' );
}
}
char t_get_c ( Cobj* cobj ) {
// return undef if we've overreached the end of the string
if ( ((cobj->wpos) - (cobj->string)) >= strlen(cobj->string) ) {
return '\0';
}
return *(cobj->wpos)++;
}
END
}
use Inline C => $code,
DIRECTORY => XML::Comma->sys_directory();
1;