The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package XML::DTD::Parser;

use XML::DTD::AttList;
use XML::DTD::Comment;
use XML::DTD::Element;
use XML::DTD::Entity;
use XML::DTD::EntityManager;
use XML::DTD::Ignore;
use XML::DTD::Include;
use XML::DTD::Notation;
use XML::DTD::PERef;
use XML::DTD::PI;
use XML::DTD::Text;
use XML::DTD::Error;
use URI::file;

use 5.008;
use strict;
use warnings;

our @ISA = qw();

our $VERSION = '0.09';


# Constructor
sub new {
  my $arg = shift;
  my $val = shift; # Parser is validating

  my $cls = ref($arg) || $arg;
  my $obj = ref($arg) && $arg;

  my $self;
  if ($obj) {
    # Called as a copy constructor
    $self = { %$obj };
  } else {
    # Called as the main constructor
    $self = { };
    $self->{'ALL'} = [];
    $self->{'ELEMENTS'} = {};
    $self->{'ATTLISTS'} = {};
    $self->{'INCFLAG'} = 0;
    $self->{'VALIDATING'} = $val;
    $self->{'EXPANDINGPE'} = 0;
  }
  bless $self, $cls;
  return $self;
}


# Determine whether object is of this type
sub isa {
  my $cls = shift;
  my $r = shift;

  if (defined($r) && ref($r) eq $cls) {
    return 1;
  } else {
    return 0;
  }
}


# Parse a DTD file
sub parse {
  my $self = shift;
  my $fh   = shift;
  my $rt   = shift;
  my $uri  = shift; # The URI of the entity being parsed, if known

  # If the URI is relative (has no scheme), then interpret it as a file:
  # URI relative to the current working directory.  The test for the
  # presence of a scheme is strictly incorrect, to to avoid interpreting
  # DOS drive numbers as schemes, so that c:\x\y\z.dtd is interpreted as
  # a file name, and translated to the URI file:///c:/x/y/z.dtd, not taken
  # as being a URI with scheme c: and containing the unwise character '\'.

  $uri = URI::file->new_abs($uri)->as_string
    if (defined $uri && $uri !~ /^[a-zA-Z][a-zA-Z0-9+\-.]+:/);

  ##print "DTD::Parser:: parse URI: $uri\n" if (defined $uri);

  my ($lt, $dcl, $dcllt, $dclrt);
  # Get first line of input
  $lt = (defined $fh)?<$fh>:''; # Read from file handle if defined
  $lt = $rt . $lt if (defined $rt);
  while ($lt) {

    if ($self->{'INCFLAG'} == 0) {
      # Scan for start of declaration
      ($lt, $dcllt, $rt) = _scanuntil($fh,$lt, '<\!--|<\!\[|<\!|<\?|\%', 0);
    } else {
      # Scan for start of declaration or end of include section
      ($lt, $dcllt, $rt) = _scanuntil($fh,$lt,
				      '<\!--|<\!\[|<\!|<\?|\%|\]\]>', 0);
    }

    # Deal with text before declaration
    push @{$self->{'ALL'}}, XML::DTD::Text->new($lt)
      if ($lt ne '' and !$self->{'EXPANDINGPE'});
    $lt = '';

    # Terminate loop if no declaration found
    last if ($dcllt eq '');

    # Terminate loop if in include mode and ]]> encountered
    last if ($self->{'INCFLAG'} == 1 and $dcllt eq ']]>');

    # Parse markup declarations
    if ($dcllt eq '<!') { # Declaration
      $rt = $self->_parsedecl($fh, $dcllt.$rt, $uri);
    } elsif ($dcllt eq '<![') { # Conditional section
      $rt = $self->_parsecondsec($fh, $dcllt.$rt);
    } elsif ($dcllt eq '<!--') { # Comment
      ($dcl, $dclrt, $rt) = _scanuntil($fh, $rt, '-->', 0);
      push @{$self->{'ALL'}}, XML::DTD::Comment->new($dcllt.$dcl.$dclrt)
        if (!$self->{'EXPANDINGPE'});
    } elsif ($dcllt eq '<?') { # Processing instruction
      ($dcl, $dclrt, $rt) = _scanuntil($fh, $rt, '\?>', 0);
      push @{$self->{'ALL'}}, XML::DTD::PI->new($dcllt.$dcl.$dclrt)
        if (!$self->{'EXPANDINGPE'});
    } elsif ($dcllt eq '%') { # Parameter entity reference
      ($dcl, $dclrt, $rt) = _scanuntil($fh, $rt, ';', 0);
      push @{$self->{'ALL'}},
           XML::DTD::PERef->new($self->_entitymanager, '%'.$dcl.';')
        if (!$self->{'EXPANDINGPE'});
      if ($self->{'VALIDATING'}) {
	my $expanding = $self->{'EXPANDINGPE'};
	$self->{'EXPANDINGPE'} = 1;
	$self->parse(undef,
		     $self->_entitymanager->peexpand($dcl),
		     $self->_entitymanager->peuri($dcl));
	$self->{'EXPANDINGPE'} = $expanding;
      }
    } else {
      ##print "X: |$lt| |$dcllt| |$rt|\n";
      throw XML::DTD::Error("Parser found unrecognised markup: $dcllt",
			    $self);
      return $rt;
    }
    # Copy text after match into unparsed buffer
    $lt = $rt;
    $rt = '';
    # Get another line of text if unparsed buffer is empty
    $lt .= <$fh> if (!$lt and defined $fh);
  }
  ##print "RT: |$rt|\n";
  return $rt;
}


# Return the entity manager object
sub _entitymanager {
  my $self = shift;

  return $self->{'ENTMAN'};
}


# Scan string lt for regex $re, reading lines from filehandle fh until matched
# Ignores quoted matches of $re if $quo is passed and is non-zero.
sub _scanuntil {
  my $fh  = shift; # File handle from which to obtain input
  my $buf = shift; # Initial text already read from input
  my $re  = shift; # Regular expression to match
  my $quo = shift; # True if re is to be ignored if quoted

  $re = "($re)|['\"]" if ($quo);
  my $quoted = '';
  my ($left, $match, $right) = ('');
  while(!defined $match) {
    if ($buf =~ /$re/s) {
      my ($lt, $mt, $rt) = ($`, $&, $');
      my $isquote = !$quoted && ($mt eq '"' || $mt eq "'")
		    || $mt eq $quoted;
      if ($isquote or $quoted) {
	$quoted = $quoted ? '' : $mt if ($isquote);
	$left .= $lt.$mt;
	$buf = $rt;
      } elsif (!$quoted) {
	$left .= $lt;
	($match, $right) = ($mt, $rt);
      }
    } else {
      my $line;
      if (defined $fh and $line = <$fh>) {
	$buf .= $line;
      } else {
        $left = $buf;
        $buf = $match = $right = '';
      }
    }
  }
  return ($left, $match, $right);
}


# Handle element, attlist, entity, and notation declarations
sub _parsedecl {
  my $self = shift;
  my $fh = shift;
  my $rt = shift;
  my $uri = shift;

  my ($dcl, $dclrt, $type, $elt, $atl, $ent);
  ($dcl, $dclrt, $rt) = _scanuntil($fh, $rt, '>', 1);
  if ($dcl =~ /^\<\!(\w+)\s+/) {
    $type = $1;
    $dcl .= $dclrt;
    if ($type eq "ELEMENT") {
      $elt = XML::DTD::Element->new($self->_entitymanager, $dcl);
      if (!exists $self->{'ELEMENTS'}->{$elt->name()}) {
	push @{$self->{'ALL'}}, $elt
	  if (!$self->{'EXPANDINGPE'});
	$self->{'ELEMENTS'}->{$elt->name()} = $elt;
	##print STDERR "ELT: $self ".$elt->name()."\n";
      } else {
	throw XML::DTD::Error("Element " . $elt->name().
			      " redefined", $self);
      }
    } elsif ($type eq "ATTLIST") {
      my $atl = XML::DTD::AttList->new($self->_entitymanager, $dcl);
      push @{$self->{'ALL'}}, $atl
        if (!$self->{'EXPANDINGPE'});
      if (!exists $self->{'ATTLISTS'}->{$atl->name()}) {
	$self->{'ATTLISTS'}->{$atl->name()} = $atl;
      } else {
	$self->{'ATTLISTS'}->{$atl->name()}->merge($atl);
      }
    } elsif ($type eq "ENTITY") {
      $ent = XML::DTD::Entity->new($dcl, $self->{'VALIDATING'}, $uri);
      push @{$self->{'ALL'}}, $ent
        if (!$self->{'EXPANDINGPE'});
      $self->_entitymanager->insert($ent);
    } elsif ($type eq "NOTATION") {
      push @{$self->{'ALL'}}, XML::DTD::Notation->new($dcl)
        if (!$self->{'EXPANDINGPE'});
    } else {
      throw XML::DTD::Error("Unrecognised declaration type: $type",
			    $self);
    }	
  }
  return $rt;
}


# Handle conditional sections
sub _parsecondsec {
  my $self = shift;
  my $fh = shift;
  my $rt = shift;

  my ($pre, $lt, $m, $r, $cond);
  # Ensure that the INCLUDE/IGNORE has been read from fh
  ($lt, $m, $rt) = _scanuntil($fh, $rt, '<\!\[\s*(%[\w\.:\-_]+;|\w+)\s*\[', 0);
  $rt = $lt . $m . $rt;

  # Extract the INCLUDE/IGNORE word
  $rt =~ /<\!\[\s*(%[\w\.:\-_]+;|\w+)\s*\[/;
  $cond = $1;
  $m = $&;
  $r = $';

  $cond = $self->_entitymanager->peexpand($cond)
    if ($cond =~ /^%([\w\.:\-_]+);$/);

  if ($cond eq 'IGNORE') { # An IGNORE section
    my $lev = 0;
    my $ltdlm = $m;
    $lt = '';
    # Scan until nested <![ and ]]> delimiters are closed
    do {
      ($pre, $m, $rt) = _scanuntil($fh, $rt, '<\!\[|\]\]>', 0);
      $lt .= $pre . $m;
      if ($m eq '<![') {
	$lev++;
      } else {
	$lev--;
      }
    } while ($lev > 0);
    push @{$self->{'ALL'}}, XML::DTD::Ignore->new($lt, $ltdlm)
      if (!$self->{'EXPANDINGPE'});
  } elsif ($cond eq 'INCLUDE') { # An INCLUDE section
    $rt = $r;
    my $inc = XML::DTD::Include->new($self->_entitymanager, $m);
    $rt = $inc->parse($fh, $rt);
    push @{$self->{'ALL'}}, $inc
      if (!$self->{'EXPANDINGPE'});
    # Copy elements and attributes up to parent level
    my $hk;
    foreach $hk (keys %{$inc->{'ELEMENTS'}} ) {
      $self->{'ELEMENTS'}->{$hk} = $inc->{'ELEMENTS'}->{$hk};
    }
    foreach $hk (keys %{$inc->{'ATTLISTS'}} ) {
      $self->{'ATTLISTS'}->{$hk} = $inc->{'ATTLISTS'}->{$hk};
    }
  } else { # A section of unrecognised type
    ($lt, $m, $rt) = _scanuntil($fh, $rt, '\]\]>', 0);
    throw XML::DTD::Error("Unrecognised conditional section type: $cond",
			  $self);
  }
  return $rt;
}


1;
__END__

=head1 NAME

XML::DTD::Parser - Perl module for parsing XML DTDs

=head1 SYNOPSIS

  use XML::DTD::Parser;

  my $dp = new XML::DTD::Parser [ ($val) ];

=head1 DESCRIPTION

  XML::DTD::Parser is a support module for top level parsing of an XML
  DTD. The following methods are provided.

=over 4

=item B<new>

 my $dp = new XML::DTD::Parser [ ($val) ];

Construct a new XML::DTD::Parser object.

The parser will be validating, and hence will make parameter and character
entity substitutions, if the argument C<$val> is present and non-zero.

=item B<isa>

if (XML::DTD::Parser->isa($obj) {
 ...
 }

Test object type

=item B<parse>

 open(FH,'<file.dtd');
 my $rt = '';
 $dp->parse(*FH, $rt);

Parse a DTD file.

 my $dtduri = 'http://nonesuch.com/MyDTD.dtd'
 my $dtd = LWP::Simple::get($dtduri);
 $dp->parse(undef, $dtd, $dtduri);

Parse a DTD from a URL.

If the parser is validating, the URI of the document containing the DTD
should be passed. If it isn't, it is arbitrarily given the relative
URI C<unknown.dtd>.

 my $dp = DML::DTD::Parser->new(1);
 my $file = 'file.dtd'
 open(FH,"<$file");
 my $rt = '';
 $dp->parse(*FH, $rt, $file);

For a correct validating parse of a file.

If the URI isn't absolute, then it is converted into an absolute C<file:>
URI relative to the current working directory. The test for this assumes
that the URI scheme is more than one character long, so that a DOS drive
number isn't used as a scheme.

Since the default URI is relative, any relative
URIs in external entity declarations will be interpreted relative to a
(probably non-existent) file in the parser's current working directory.
In this case it's probably safest not to use relative URIs in the DTD
being parsed.

The order of parsing of C<$rt> and C<$file> is such that the internal subset
can be passed in C<$rt>, and the external subset in C<$file>, however, if
any of the output methods of subclass L<DTD|../DTD.pm> is called, the result
will be the merger of the internal and external subsets.

=back

=head1 SEE ALSO

L<XML::DTD>

=head1 AUTHOR

Brendt Wohlberg E<lt>wohl@cpan.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2004-2010 by Brendt Wohlberg

This library is available under the terms of the GNU General Public
License (GPL), described in the GPL file included in this distribution.

=head1 ACKNOWLEDGMENTS

Peter Lamb E<lt>Peter.Lamb@csiro.auE<gt> added fetching of external
entities, improved entity substitution, and implemented more robust
parsing of some classes of declaration.

=cut