package HTML::Validator;
# Copyright 1997-1999 Sami Itkonen <si@iki.fi>
#
# Distributed under the GNU General Public License
BEGIN {
if (eval 'use LWP::Simple', $@ eq '') {
$USELWP = 1;
}
else {
$USELWP = 0;
}
}
use strict;
use vars qw($VERSION $USELWP);
use FileHandle;
use Carp;
$VERSION = "0.13";
my $tmpfile;
my $tmpdir;
my $tmpdtd;
my $defaultdtd = 'html4';
my $dtdmap = {
'xhtml1' => q{PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"},
'html4' => q{PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"},
'html4strict' => q{PUBLIC "-//W3C//DTD HTML 4.0//EN"},
'html4frame' => q{PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN"},
'html3.2' => q{PUBLIC "-//W3C//DTD HTML 3.2//EN"},
'html3' => q{PUBLIC "-//IETF//DTD HTML 3.0//EN//"},
'html2' => q{PUBLIC "-//IETF//DTD HTML 2.0//EN"},
'nshtml' => q{PUBLIC "-//Netscape Comm. Corp.//DTD HTML//EN"},
'iehtml' => q{PUBLIC "-//Microsoft//DTD Internet Explorer 3.0 HTML//EN"},
};
=head1 NAME
HTML::Validator - HTML validation by using nsgmls
=head1 SYNOPSIS
use HTML::Validator;
$doc = new HTML::Validator($file);
$doc->validate;
print "Document is valid\n" if $doc->is_valid;
=head1 DESCRIPTION
This module can be used to validate HTML (or SGML) documents. For the
validation itself, it uses nsgmls and a set of document type definition
files (aka DTDs).
HTML::Validator uses I<libwww-perl> to validate remote files.
=cut
#---------------------------------------------
# The constructor
#---------------------------------------------
sub new {
my $this = shift;
my $class = ref($this) || $this;
my $self = {};
bless $self, $class;
# $self->initialize();
my $file = shift;
my $user = $ENV{LOGNAME} || $ENV{USER};
my $gentmp = $ENV{TEMP} || ($^O =~ /Win/i) ? "c:/temp/" : '/tmp/';
-d $gentmp or mkdir $gentmp, 0700; # to be sure or die?
$tmpdir = join '', $gentmp, "validator-", $user, "$$";
mkdir $tmpdir, 0700;
$tmpfile = join '', $tmpdir, '/', $user, time unless defined $tmpfile;
$self->{tmpfile} = $tmpfile;
$self->{tmpdir} = $tmpdir;
# print $tmpdir,"\n";
# print $tmpfile,"\n";
$self->deltempfiles;
$self->open($file) if defined $file;
$self->{catalog} = "@SGMLCAT@";
$self->{nsgmls} = "@NSGMLS@";
$self->{maxerr} = 500;
$self->{xmlcat} = "@XHTMLCAT@";
$self->{usexml} = "@XML@";
$self->{xmldir} = substr($self->{xmlcat}, 0,
rindex ($self->{xmlcat}, '/') + 1);
$self->{defaultdtd} = $defaultdtd;
$self->{dtdmap} = $dtdmap;
return $self;
}
=over 4
=item $doc->open($file)
The file or url will be used as the document to validate. This method
will be called implicitely if the constructor is called with an argument.
=cut
#---------------------------------------------
# Open a file for read
#---------------------------------------------
sub open {
my $obj = shift;
my $file = shift;
return undef if defined $obj->{file};
$obj->{fh} = new FileHandle;
$obj->{url} = $file;
if ($file =~ m|^(\w+)://|) {
$obj->{file} = $obj->{tmpfile};
$obj->{useLWP} = 1;
}
else {
return undef unless -r $file;
$obj->{file} = $obj->{tmpfile};
$obj->{url} = $file;
}
return 1;
}
=item $doc->doctype
Returns the document type.
The return value is undefined if no filename has been passed to the
object via the constructor or the I<open> method.
If the file has not been retrieved yet, it will be done.
If you want to replace the document type, you must do so with the first
call to this method. The document types are defined in $doc->{dtdmap}.
=cut
#---------------------------------------------
# read the doctype
#---------------------------------------------
sub doctype {
my $obj = shift;
my ($input) = shift;
if (defined $input) {
$obj->{newdoctype} = $input;
}
else {
return undef unless defined $obj->{file};
$obj->get_file unless defined $obj->{data};
return $obj->{doctype};
}
}
=item $doc->checkdtd
Finds out the actual name of document type definition file that is used.
The return value is the name of the file, or undefined if the document type
is not defined.
=cut
#---------------------------------------------
# check which dtd is used
#---------------------------------------------
sub checkdtd {
my $obj = shift;
return $obj->{dtdfile} if defined $obj->{dtdfile};
return undef unless defined $obj->{"doctype"};
my $dtdfile;
# print $obj->{doctype},"\n";
my ($id,$url) = $obj->{"doctype"} =~ /\"(.*?)\"\s*\"(.*?)\"/;
#"
if (defined $url) {
$obj->{dtdfile} = $url;
$obj->{"doctype"} =~ s/\s*\"$url\"\s*$//;
#"
return $dtdfile;
}
my $fh = new FileHandle;
return -1 unless CORE::open($fh,$obj->{catalog});
my $dtdregexp = $obj->{"doctype"};
$dtdregexp =~ s/\s+/\\s\+/gs;
$dtdregexp =~ s/\[/\\\[/gs;
# print $dtdregexp,"\n";
my $line;
while (defined($line = <$fh>)) {
if (($dtdfile) = $line =~ /$dtdregexp\s+(.*?)$/) {
close($fh);
$obj->{dtdfile} = $dtdfile;
return $dtdfile;
}
}
close($fh);
return undef;
}
=item $doc->validate
Validates the document. The return value is a reference to an array
containing the modified output from nsgmls.
=cut
#---------------------------------------------
# validate the file
#---------------------------------------------
sub validate {
my $obj = shift;
return $obj->{errors} if defined $obj->{errors};
my $file = $obj->{file};
# print "$file\n";
unless (defined $file) {
$obj->message("file not found");
return undef;
}
unless (defined $obj->{data}) {
my $ret = $obj->get_file;
return undef if $ret eq -1;
}
if (defined $obj->{errors}) {
return $obj->{errors};
}
my $fh = $obj->{fh};
my $nsgmls = $obj->{nsgmls};
my $catalog = $obj->{catalog};
my $xmlcat = $obj->{xmlcat};
if ($nsgmls =~ m:/: && ! -e $nsgmls) {
# carp("nsgmls binary '$nsgmls' not found");
$obj->message("Nsgmls not found");
return undef;
}
my $opts = defined $obj->{maxerr} ? "-E ".$obj->{maxerr} . " " : "";
# print "$file $nsgmls $catalog\n";
if (defined $obj->{XML}) {
if ($obj->{usexml} eq "NO") {
$obj->message("No XML support - cannot validate XML file");
return $obj->{errors};
}
$opts .= "-wxml -c$xmlcat";
}
else {
$opts .= "-c $catalog";
}
# $obj->message("$nsgmls -s $opts $file");
if (!CORE::open($fh,"$nsgmls -s $opts $file 2>&1 |")) {
$obj->message("Unable to execute nsgmls");
return undef;
}
# print "$file $nsgmls $catalog\n";
my $oldsep = $/;
undef $/;
$obj->{raw_output} = <$fh>;
# print $obj->{raw_output},"\n";
$/ = $oldsep;
$obj->{valid} = 1;
# $obj->message("valid");
$obj->parse_errors;
# $obj->message("Document is valid") if $obj->{valid};
# print $obj->{valid},"\n";
return $obj->{errors};
}
=item $doc->get_file
Internal method to get the file and process the doctype information.
=cut
#---------------------------------------------
# get the file and process the doctype information
#---------------------------------------------
sub get_file {
my $obj = shift;
return $obj->{data} if defined $obj->{data};
return undef unless defined $obj->{file};
my $fh = new FileHandle;
my $data;
if ($obj->{useLWP}) {
if (!$USELWP) {
$obj->message("HTTP downloads not supported without LWP");
return -1;
}
$data = get($obj->{url});
if ($data =~ /^\s*$/) {
$obj->message("Unable to download file");
return -1;
}
}
else {
my $oldsep = $/;
undef $/;
return undef unless CORE::open($fh,$obj->{url});
$data = <$fh>;
close($fh);
$/ = $oldsep;
}
my $xmlrxp = '\s*\<\s*\?\s*xml\s+.*?\s*\?\s*\>';
my ($xml) = $data =~ /^($xmlrxp*)/s;
if (defined $xml) {
$obj->{XML} = $xml;
}
my $docrxp = '\s*\<\s*\!DOCTYPE\s+(\w+)\s*(\[.*?\]|.*?)\s*\>';
my ($type,$dtd) = $data =~ /$docrxp/s;
# print $docrxp,"\n";
$obj->{data} = $data;
if (!defined $obj->{XML}) {
if (defined $type && $type !~ /HTML/) {
$obj->{XML} = $type;
}
else {
$obj->{"doctype"} = $dtd;
if (my $type = $obj->{newdoctype}) {
if (my $dtdstr = $obj->{dtdmap}->{$type}) {
$obj->replace_dtd($dtdstr);
}
}
elsif ($dtd =~ /^\s*$/) {
$obj->replace_dtd;
}
}
}
if (defined $obj->{XML}) {
$obj->{"doctype"} = $dtd;
}
$obj->checkdtd;
# check if the doctype contains an URL
my ($id,$url) = $dtd =~ /\"(.*?)\"\s*\"(.*?)\"/;
#"
unless (defined $url) {
# print $dtd,"\n";
($id,$url) = $dtd =~ /(.*?)\s*\"(.*?)\"/;
# print $id,"\n",$url,"\n";#"
}
if (defined $url) {
# we might have to download this
if ($url =~ m,http://\S+,) {
if (defined $obj->{subst_URL}) {
$obj->substitute_dtd_url($url);
}
}
else {
# a local document, we'll have to make sure it is found
# print $url,"\n";
if (index($url,"/") lt 0) {
$tmpdtd = $tmpdir."/".$url;
if (-r $url) {
system ("cp", $url,$tmpdir);
$obj->{dtdfile} = $url;
}
elsif (-r $obj->{xmldir}."/".$url) {
system ("cp", $obj->{xmldir}."/".$url,$tmpdir);
$obj->{dtdfile} = $url;
}
}
# print "$tmpdtd\n";
}
}
# print $data,"\n";
# system("cp",$obj->{url},$obj->{file});
return undef unless CORE::open($fh,">$obj->{file}");
print $fh $obj->{data};
close($fh);
}
=item $doc->substitute_dtd_url
If there is an URL in the doctype, it will be replaced to support
nsgmls binaries that do not support URLs
=cut
#---------------------------------------------
# substitutes the url in dtd
#---------------------------------------------
sub substitute_dtd_url {
my $obj = shift;
my ($url) = @_;
unlink $tmpdtd if defined $tmpdtd;
undef $tmpdtd;
$tmpdtd = $obj->{tmpdir} . substr($url, rindex ($url, '/') + 1);
if (getstore($url,$tmpdtd) eq "200") {
$obj->{data} =~ s/$url/$tmpdtd/;
}
}
=item $doc->replace_dtd
Replaces the document type definition on the file. The new dtd is the
first argument, or the default dtd if no argument is supplied.
=cut
#---------------------------------------------
# replace the dtd in the file
#---------------------------------------------
sub replace_dtd {
my $obj = shift;
my ($dtd) = @_;
$dtd = $obj->{dtdmap}->{$obj->{defaultdtd}} unless defined $dtd;
my $dtdstr = "<!DOCTYPE HTML " . $dtd . ">";
# print $dtd,"\n";
my $data = $obj->{data};
if ($obj->{"doctype"} =~ /^\s*$/) {
# no doctype; need a new one
$data =~ s/^/$dtdstr\n/;
$obj->message("No document type defined");
}
else {
# just replace it
$data =~ s/\s*\<\s*\!DOCTYPE HTML\s*(.*?)\>\s*/$dtdstr\n/i;
}
$obj->message("Replacing dtd with '$dtd'");
$obj->{data} = $data;
# print $obj->{newdoctype},"\n";
# print $obj->{defaultdtd},"\n";
# print "$data\n";
$obj->{"doctype"} = $dtd;
}
=item $doc->errors
Returns an error from nsgmls error output queue.
=cut
#---------------------------------------------
# the error output
#---------------------------------------------
sub errors {
my $obj = shift;
my $r = $obj->{errors};
my @array;
@array = defined $r ? @{$r} : ();
my $error = shift @array;
# print $error,"\n";
$obj->{errors} = \@array;
return $error;
}
=item $doc->parse_errors
Internal method to parse the raw nsgmsl output to a more readable form.
If you want to call this method more than once per object, purge the error
output queue with $doc->errors first.
This method will call a parser method to do the actual parsing, which is
$doc->parser() by default. It can overriden by setting $doc->{parser}.
=cut
#---------------------------------------------
# parse the nsgmls errors
#---------------------------------------------
sub parse_errors {
my $obj = shift;
return undef if defined $obj->{"errors"};
my $error_output;
if (defined $obj->{parser}) {
$error_output = &{$obj->{parser}}($obj->{raw_output});
}
else {
$error_output = $obj->parser($obj->{raw_output});
}
if (defined $error_output) {
# the document is invalid
$obj->message("Document is invalid");
$obj->{valid} = 0;
my @array = @{$error_output};
$obj->{"errors"} = \@array;
}
else {
# the document is valid
$obj->message("Document is valid");
$obj->{valid} = 1;
}
# print $obj->{valid},"\n";
}
=item $doc->parser
The default nsgmls output parser. This is called from parse_errors.
If the return value is undef, then to parser is assumed to have found no
errors. Otherwise the parser will return a reference to an array containing
the errors.
=cut
#---------------------------------------------
# the default parser
#---------------------------------------------
sub parser {
my $obj = shift;
my $output = shift;
my @errors;
my $i=0;
my $valid = 1;
# print $output,"\n";
return undef unless defined $output;
while ($output =~ s/^(.*?)\n//) {
my $line = $1;
# print $line,"\n";
$line =~ s/^$obj->{nsgmls}://;
next if $line =~ /\.dtd:(\d+):(\d):E: /;
next if $line =~ /^\s*$/;
# print $line,"\n";
next if $line =~ /is not a character number/;
if ($line =~ s/^.*?://) {
if ($line =~ s/^(\d+):(\d+):E: // || $line =~ s/^(\d+):(\d+): //
|| $line =~ s/^(\d+):(\d+):W: //) {
$errors[$i] = sprintf(" Line %3d.%2d: ",$1,$2);
}
$valid = 0;
$errors[$i] .= "$line";
$i++;
}
# $output =~ s/^.*?\n//;
}
return $valid ? undef : \@errors;
}
#---------------------------------------------
# parse the nsgmls errors
#---------------------------------------------
=item $doc->is_valid
Returns 1 if document is valid, 0 if document is invalid and undef if
document hasn't been validated yet.
=cut
#---------------------------------------------
# is the document valid ?
#---------------------------------------------
sub is_valid {
my $obj = shift;
return $obj->{valid};
}
=item $doc->source
Contains the source of the HTML file as a scalar.
=cut
#---------------------------------------------
# the source of the HTML file
#---------------------------------------------
sub source {
my $obj = shift;
return $obj->{data} if defined $obj->{data};
return undef unless defined $obj->{file};
$obj->get_file unless defined $obj->{data};
return $obj->{data};
}
=item $doc->message
Contains the message queue. If called with an argument, places a new message
to the queue, without an argument a message (if any) is removed.
If the argument is '-1', the last message on the queue is returned.
=cut
#----------------------------------------------
# the message HTML::Validator wants to send out
#----------------------------------------------
sub message {
my $obj = shift;
my ($msg) = @_;
my $r = $obj->{"message"};
# return undef unless defined $r;
my @array;
@array = @{$r} if defined $r;
if (defined $msg) {
# print "adding message '$msg'\n";
if ($msg eq -1) {
$msg = pop (@array);
}
else {
push (@array,$msg);
}
}
else {
$msg = shift (@array);
# print "removing message '$msg'\n";
}
$obj->{message} = \@array;
# print @{$r},"\n" if defined $r;
return $msg;
}
=item $doc->reset
Resets the object to original state so we can reuse it
=cut
#----------------------------------------------
# reset all values so we can reuse the object
#----------------------------------------------
sub reset {
my $obj = shift;
$obj->deltempfiles;
delete $obj->{"doctype"};
delete $obj->{newdoctype};
delete $obj->{dtdfile};
delete $obj->{"errors"};
delete $obj->{"message"};
undef $obj->{valid};
delete $obj->{data};
delete $obj->{file};
delete $obj->{url};
delete $obj->{useLWP};
delete $obj->{XML};
}
#----------------------------------------------
# delete temporary files
#----------------------------------------------
sub deltempfiles {
unlink $tmpfile;
unlink $tmpdtd if defined $tmpdtd;
# rmdir $tmpdir;
}
#----------------------------------------------
# The destructor
#----------------------------------------------
END {
my $obj = shift;
# print $tmpfile,"\n";
deltempfiles;
rmdir $tmpdir;
};
1;
__END__
=back
=head1 VARIABLES
=item $doc->{nsgmls}
The used I<nsgmls> binary
=item $doc->{catalog}
The used catalog file
=item $doc->{message}
Array of messages
=item $doc->{maxerr}
The maximum number of errors. This is passed to nsgmls with the C<-E> option
=item $doc->{doctype}
The document type for the document
=item $doc->{defaultdtd}
The default type for the document. By default this is 'html4'.
=item $doc->{dtdfile}
The dtd used for the document
=item $doc->{dtdmap}
The mapping for document types and the explicit document type definition
strings
=item $doc->{parser}
The custom parser to use. See the information for the parser method.
=head1 REQUIREMENTS
HTML::Validator requires that
=over 3
=item *
I<nsgmls> is installed. It comes with James Clark's I<sp> SGML toolkit,
I<http://www.jclark.com/sp/howtoget.htm>
=item *
you have a reasonable set of document type files. For this purpose, you can use
W3C's sgml library at I<http://validator.w3.org/sgml-lib.tar.gz>
=back
=head1 SEE ALSO
L<LWP>, L<nsgmls>
=head1 ACKNOWLEDGEMENTS
Thanks go to:
- Heikki Kantola <hezu@iki.fi>, for his help in the early testing
phases and his excellent knowledge about HTML standards.
=head1 AVAILABILITY
The latest version of HTML::Validator can be found from
I<http://www.iki.fi/si/HTML-Validator/>.
It is also available from CPAN (I<http://www.perl.com/CPAN/>).
=head1 COPYRIGHT
HTML::Validator is (c) 1997-1999 Sami Itkonen <si@iki.fi>
HTML::Validator is distributed under the GNU General Public License.
=cut