The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.
package HTML::Validator;

# Copyright 1997-1999 Sami Itkonen <si@iki.fi>
#
# Distributed under the GNU General Public License

BEGIN {
  if (eval 'use LWP::Simple', $@ eq '') {
    $USELWP = 1;
  }
  else {
    $USELWP = 0;
  }
}

use strict;
use vars qw($VERSION $USELWP);
use FileHandle;
use Carp;

$VERSION = "0.13";

my $tmpfile;
my $tmpdir;
my $tmpdtd;

my $defaultdtd = 'html4';

my $dtdmap = {
     'xhtml1'      => q{PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"},
     'html4'       => q{PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"},
     'html4strict' => q{PUBLIC "-//W3C//DTD HTML 4.0//EN"},
     'html4frame'  => q{PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN"},
     'html3.2'     => q{PUBLIC "-//W3C//DTD HTML 3.2//EN"},
     'html3'  => q{PUBLIC  "-//IETF//DTD HTML 3.0//EN//"},
     'html2'  => q{PUBLIC "-//IETF//DTD HTML 2.0//EN"},
     'nshtml' => q{PUBLIC "-//Netscape Comm. Corp.//DTD HTML//EN"},
     'iehtml' => q{PUBLIC "-//Microsoft//DTD Internet Explorer 3.0 HTML//EN"},
};

=head1 NAME

HTML::Validator - HTML validation by using nsgmls

=head1 SYNOPSIS

 use HTML::Validator;
 $doc = new HTML::Validator($file);
 $doc->validate;
 print "Document is valid\n" if $doc->is_valid;

=head1 DESCRIPTION

This module can be used to validate HTML (or SGML) documents. For the
validation itself, it uses nsgmls and a set of document type definition
files (aka DTDs).

HTML::Validator uses I<libwww-perl> to validate remote files.

=cut

#---------------------------------------------
# The constructor
#---------------------------------------------

sub new {
  my $this = shift;
  my $class = ref($this) || $this;
  my $self = {};
  bless $self, $class;
#    $self->initialize();
  my $file = shift;
  my $user = $ENV{LOGNAME} || $ENV{USER};                                   

  my $gentmp = $ENV{TEMP} || ($^O =~ /Win/i) ? "c:/temp/" : '/tmp/';
  -d $gentmp or mkdir $gentmp, 0700; # to be sure or die?

  $tmpdir = join '', $gentmp, "validator-", $user, "$$";
  mkdir $tmpdir, 0700;

  $tmpfile = join '', $tmpdir, '/', $user, time unless defined $tmpfile;    
  $self->{tmpfile} = $tmpfile;
  $self->{tmpdir} = $tmpdir;
#  print $tmpdir,"\n";
#  print $tmpfile,"\n";
  $self->deltempfiles;
  $self->open($file) if defined $file; 
  $self->{catalog} = "@SGMLCAT@";
  $self->{nsgmls} = "@NSGMLS@";
  $self->{maxerr} = 500;
  $self->{xmlcat} = "@XHTMLCAT@";
  $self->{usexml} = "@XML@";
  $self->{xmldir} = substr($self->{xmlcat}, 0, 
			   rindex ($self->{xmlcat}, '/') + 1);
  $self->{defaultdtd} = $defaultdtd;
  $self->{dtdmap} = $dtdmap;
  return $self;
}

=over 4

=item $doc->open($file)

The file or url will be used as the document to validate. This method
will be called implicitely if the constructor is called with an argument.

=cut

#---------------------------------------------
# Open a file for read
#---------------------------------------------

sub open {
  my $obj = shift;
  my $file = shift;
  return undef if defined $obj->{file};
  $obj->{fh} = new FileHandle;
  $obj->{url} = $file;
  if ($file =~ m|^(\w+)://|) {
    $obj->{file} = $obj->{tmpfile};
    $obj->{useLWP} = 1;
  }
  else {
    return undef unless -r $file;
    $obj->{file} = $obj->{tmpfile};
    $obj->{url} = $file;
  }
  return 1;
}

=item $doc->doctype

Returns the document type.

The return value is undefined if no filename has been passed to the 
object via the constructor or the I<open> method.

If the file has not been retrieved yet, it will be done.

If you want to replace the document type, you must do so with the first
call to this method. The document types are defined in $doc->{dtdmap}.

=cut

#---------------------------------------------
# read the doctype
#---------------------------------------------

sub doctype {
  my $obj = shift;
  
  my ($input) = shift;

  if (defined $input) {
    $obj->{newdoctype} = $input;
  }
  else {
    return undef unless defined $obj->{file};
    $obj->get_file unless defined $obj->{data};
    return $obj->{doctype};
  }
}

=item $doc->checkdtd

Finds out the actual name of document type definition file that is used.
The return value is the name of the file, or undefined if the document type
is not defined.

=cut

#---------------------------------------------
# check which dtd is used
#---------------------------------------------

sub checkdtd {
  my $obj = shift;
  return $obj->{dtdfile} if defined $obj->{dtdfile};
  return undef unless defined $obj->{"doctype"};
  my $dtdfile;
#  print $obj->{doctype},"\n";
  my ($id,$url) = $obj->{"doctype"} =~ /\"(.*?)\"\s*\"(.*?)\"/;
#"

  if (defined $url) {
    $obj->{dtdfile} = $url;
    $obj->{"doctype"} =~ s/\s*\"$url\"\s*$//;
#"
    return $dtdfile;
  }
  my $fh = new FileHandle;
  return -1 unless CORE::open($fh,$obj->{catalog});
  my $dtdregexp = $obj->{"doctype"};
  $dtdregexp =~ s/\s+/\\s\+/gs;
  $dtdregexp =~ s/\[/\\\[/gs;
#  print $dtdregexp,"\n";
  my $line;
  while (defined($line = <$fh>)) {
    if (($dtdfile) = $line =~ /$dtdregexp\s+(.*?)$/) {
      close($fh);
      $obj->{dtdfile} = $dtdfile;
      return $dtdfile;
    }
  }
  close($fh);
  return undef;
}

=item $doc->validate

Validates the document. The return value is a reference to an array 
containing the modified output from nsgmls.

=cut

#---------------------------------------------
# validate the file
#---------------------------------------------

sub validate {
  my $obj = shift;
  return $obj->{errors} if defined $obj->{errors};
  my $file = $obj->{file};
#  print "$file\n";
  unless (defined $file) {
    $obj->message("file not found");
    return undef;
  }

  unless (defined $obj->{data}) {
    my $ret = $obj->get_file; 
    return undef if $ret eq -1;
  }	
  
  if (defined $obj->{errors}) {
    return $obj->{errors};
  }
  my $fh = $obj->{fh};
  my $nsgmls = $obj->{nsgmls};
  my $catalog = $obj->{catalog};
  my $xmlcat = $obj->{xmlcat};

  if ($nsgmls =~ m:/: && ! -e $nsgmls) {
#    carp("nsgmls binary '$nsgmls' not found");
    $obj->message("Nsgmls not found");
    return undef;
  }

  my $opts = defined $obj->{maxerr} ? "-E ".$obj->{maxerr} . " " : "";

#  print "$file $nsgmls $catalog\n";
  if (defined $obj->{XML}) {
    if ($obj->{usexml} eq "NO") {
      $obj->message("No XML support - cannot validate XML file");
      return $obj->{errors};
    }
    $opts .= "-wxml -c$xmlcat";
  }
  else {
    $opts .= "-c $catalog";
  }

#  $obj->message("$nsgmls -s $opts $file");

  if (!CORE::open($fh,"$nsgmls -s $opts $file 2>&1 |")) {
    $obj->message("Unable to execute nsgmls");
    return undef;
  }

#  print "$file $nsgmls $catalog\n";
  my $oldsep = $/;
  undef $/;
  $obj->{raw_output} = <$fh>;
#  print $obj->{raw_output},"\n";
  $/ = $oldsep;

  $obj->{valid} = 1;
#  $obj->message("valid");
  $obj->parse_errors;
#  $obj->message("Document is valid") if $obj->{valid};
#  print $obj->{valid},"\n";
  return $obj->{errors};
}

=item $doc->get_file

Internal method to get the file and process the doctype information.

=cut

#---------------------------------------------
# get the file and process the doctype information
#---------------------------------------------

sub get_file {
  my $obj = shift;
  return $obj->{data} if defined $obj->{data};
  return undef unless defined $obj->{file};
  my $fh = new FileHandle;
  my $data;
  if ($obj->{useLWP}) {
    if (!$USELWP) {
      $obj->message("HTTP downloads not supported without LWP");
      return -1;
    }
    $data = get($obj->{url});
    if ($data =~ /^\s*$/) {
      $obj->message("Unable to download file");
      return -1;
    }
  }
  else {
    my $oldsep = $/;
    undef $/;
    return undef unless CORE::open($fh,$obj->{url});
    
    $data = <$fh>;
    close($fh);
    $/ = $oldsep;
    
  }
  my $xmlrxp = '\s*\<\s*\?\s*xml\s+.*?\s*\?\s*\>';
  my ($xml) = $data =~ /^($xmlrxp*)/s;

  if (defined $xml) {
    $obj->{XML} = $xml;
  }

  my $docrxp = '\s*\<\s*\!DOCTYPE\s+(\w+)\s*(\[.*?\]|.*?)\s*\>';
  my ($type,$dtd) = $data =~ /$docrxp/s;

#  print $docrxp,"\n";
  $obj->{data} = $data;

  if (!defined $obj->{XML}) {
    if (defined $type && $type !~ /HTML/) {    
      $obj->{XML} = $type;
    }
    else {
      $obj->{"doctype"} = $dtd;
      if (my $type =  $obj->{newdoctype}) {
	if (my $dtdstr = $obj->{dtdmap}->{$type}) {
	  $obj->replace_dtd($dtdstr);
	}
      }
      elsif ($dtd =~ /^\s*$/) {
	$obj->replace_dtd;
      }
    }
  }

  if (defined $obj->{XML}) {
    $obj->{"doctype"} = $dtd;
  }

  $obj->checkdtd;
# check if the doctype contains an URL

  my ($id,$url) = $dtd =~ /\"(.*?)\"\s*\"(.*?)\"/;
#"
  unless (defined $url) {
#    print $dtd,"\n";
    ($id,$url) = $dtd =~ /(.*?)\s*\"(.*?)\"/;
#     print $id,"\n",$url,"\n";#"

  }
  if (defined $url) {
    # we might have to download this
    if ($url =~ m,http://\S+,) {
      if (defined $obj->{subst_URL}) {
	$obj->substitute_dtd_url($url);
      }
    }
    else {
      # a local document, we'll have to make sure it is found
#      print $url,"\n";
      if (index($url,"/") lt 0) {
	$tmpdtd = $tmpdir."/".$url;
	if (-r $url) {
	  system ("cp", $url,$tmpdir);
	  $obj->{dtdfile} = $url;
	}
	elsif (-r $obj->{xmldir}."/".$url) {
	  system ("cp", $obj->{xmldir}."/".$url,$tmpdir);
	  $obj->{dtdfile} = $url;
	}
      }
#      print "$tmpdtd\n";
    }
  }

#  print $data,"\n";
#    system("cp",$obj->{url},$obj->{file});

  return undef unless CORE::open($fh,">$obj->{file}");
  print $fh $obj->{data};
  close($fh);
}

=item $doc->substitute_dtd_url

If there is an URL in the doctype, it will be replaced to support
nsgmls binaries that do not support URLs

=cut

#---------------------------------------------
# substitutes the url in dtd 
#---------------------------------------------

sub substitute_dtd_url {
    my $obj = shift;
    my ($url) = @_;

    unlink $tmpdtd if defined $tmpdtd;
    undef $tmpdtd;

    $tmpdtd = $obj->{tmpdir} . substr($url, rindex ($url, '/') + 1);
 
    if (getstore($url,$tmpdtd) eq "200") {
       $obj->{data} =~ s/$url/$tmpdtd/;
    }
}

=item $doc->replace_dtd

Replaces the document type definition on the file. The new dtd is the
first argument, or the default dtd if no argument is supplied.

=cut

#---------------------------------------------
# replace the dtd in the file
#---------------------------------------------

sub replace_dtd {
  my $obj = shift;
  my ($dtd) = @_;
  $dtd = $obj->{dtdmap}->{$obj->{defaultdtd}} unless defined $dtd;
  my $dtdstr = "<!DOCTYPE HTML " . $dtd . ">";
#  print $dtd,"\n";
  my $data = $obj->{data};
  if ($obj->{"doctype"} =~ /^\s*$/) {
    # no doctype; need a new one
    $data =~ s/^/$dtdstr\n/;
    $obj->message("No document type defined");
  }
  else {
    # just replace it
    $data =~ s/\s*\<\s*\!DOCTYPE HTML\s*(.*?)\>\s*/$dtdstr\n/i;
  }
  $obj->message("Replacing dtd with '$dtd'");
  $obj->{data} = $data;
#  print $obj->{newdoctype},"\n";
#  print $obj->{defaultdtd},"\n";
#  print "$data\n";
  $obj->{"doctype"} = $dtd;
}

=item $doc->errors

Returns an error from nsgmls error output queue.

=cut

#---------------------------------------------
# the error output
#---------------------------------------------

sub errors {
  my $obj = shift;
  my $r = $obj->{errors};
  my @array;
  @array = defined $r ? @{$r} : ();
  my $error = shift @array;
#  print $error,"\n";
  $obj->{errors} = \@array;
  return $error;
}

=item $doc->parse_errors

Internal method to parse the raw nsgmsl output to a more readable form.
If you want to call this method more than once per object, purge the error
output queue with $doc->errors first.

This method will call a parser method to do the actual parsing, which is
$doc->parser() by default. It can overriden by setting $doc->{parser}.

=cut

#---------------------------------------------
# parse the nsgmls errors
#---------------------------------------------

sub parse_errors {
  my $obj = shift;

  return undef if defined $obj->{"errors"};

  my $error_output;

  if (defined $obj->{parser}) {
    $error_output = &{$obj->{parser}}($obj->{raw_output});
  }
  else {
    $error_output = $obj->parser($obj->{raw_output});
  }

  if (defined $error_output) {
    # the document is invalid
    $obj->message("Document is invalid");
    $obj->{valid} = 0;
    my @array = @{$error_output};
    $obj->{"errors"} = \@array;
  }
  else {
    # the document is valid
    $obj->message("Document is valid");
    $obj->{valid} = 1;
  }
#  print $obj->{valid},"\n";
}

=item $doc->parser

The default nsgmls output parser. This is called from parse_errors. 
If the return value is undef, then to parser is assumed to have found no
errors. Otherwise the parser will return a reference to an array containing
the errors.

=cut

#---------------------------------------------
# the default parser
#---------------------------------------------

sub parser {
  my $obj = shift;

  my $output = shift;
  
  my @errors;
  my $i=0;
  my $valid = 1;

#  print $output,"\n";
  return undef unless defined $output;
  while ($output =~ s/^(.*?)\n//) {
    my $line = $1;

#    print $line,"\n";
    $line =~ s/^$obj->{nsgmls}://;
    next if $line =~ /\.dtd:(\d+):(\d):E: /;
    next if $line =~ /^\s*$/;
#    print $line,"\n";
    next if $line =~ /is not a character number/;
    if ($line =~ s/^.*?://) {

      if ($line =~ s/^(\d+):(\d+):E: // || $line =~ s/^(\d+):(\d+): //
	 || $line =~ s/^(\d+):(\d+):W: //) {
	$errors[$i] = sprintf(" Line %3d.%2d: ",$1,$2);
      }
      $valid = 0;
      $errors[$i] .= "$line";
      $i++;
    }
#    $output =~ s/^.*?\n//;
  }

  return $valid ? undef : \@errors;
}

#---------------------------------------------
# parse the nsgmls errors
#---------------------------------------------

=item $doc->is_valid

Returns 1 if document is valid, 0 if document is invalid and undef if
document hasn't been validated yet.

=cut

#---------------------------------------------
# is the document valid ?
#---------------------------------------------

sub is_valid {
  my $obj = shift;
  return $obj->{valid};
}

=item $doc->source

Contains the source of the HTML file as a scalar.

=cut

#---------------------------------------------
# the source of the HTML file
#---------------------------------------------

sub source {
  my $obj = shift;
  return $obj->{data} if defined $obj->{data};
  return undef unless defined $obj->{file};
  $obj->get_file unless defined $obj->{data};
  return $obj->{data};
}

=item $doc->message

Contains the message queue. If called with an argument, places a new message
to the queue, without an argument a message (if any) is removed.

If the argument is '-1', the last message on the queue is returned.

=cut

#----------------------------------------------
# the message HTML::Validator wants to send out
#----------------------------------------------

sub message {
  my $obj = shift;
  my ($msg) = @_;
  my $r = $obj->{"message"};
#  return undef unless defined $r;
  my @array;
  @array = @{$r} if defined $r;
  if (defined $msg) {
#    print "adding message '$msg'\n";
    if ($msg eq -1) {
      $msg = pop (@array);
    }
    else {
      push (@array,$msg);
    }
  }
  else {
    $msg = shift (@array);
#    print "removing message '$msg'\n";
  }
  $obj->{message} = \@array;
#  print @{$r},"\n" if defined $r;
  return $msg;
}

=item $doc->reset

Resets the object to original state so we can reuse it

=cut

#----------------------------------------------
# reset all values so we can reuse the object
#----------------------------------------------

sub reset {
  my $obj = shift;
  $obj->deltempfiles;
  delete $obj->{"doctype"};
  delete $obj->{newdoctype};
  delete $obj->{dtdfile};
  delete $obj->{"errors"};
  delete $obj->{"message"};
  undef $obj->{valid};
  delete $obj->{data};
  delete $obj->{file};
  delete $obj->{url};
  delete $obj->{useLWP};
  delete $obj->{XML};
}

#----------------------------------------------
# delete temporary files
#----------------------------------------------

sub deltempfiles {
  unlink $tmpfile;
  unlink $tmpdtd if defined $tmpdtd;
#  rmdir $tmpdir;
}

#----------------------------------------------
# The destructor
#----------------------------------------------

END { 
  my $obj = shift;
#  print $tmpfile,"\n";
  deltempfiles;
  rmdir $tmpdir;
};

1;

__END__

=back

=head1 VARIABLES

=item $doc->{nsgmls}

The used I<nsgmls> binary

=item $doc->{catalog}

The used catalog file

=item $doc->{message}

Array of messages

=item $doc->{maxerr}

The maximum number of errors. This is passed to nsgmls with the C<-E> option

=item $doc->{doctype}

The document type for the document

=item $doc->{defaultdtd}

The default type for the document. By default this is 'html4'.

=item $doc->{dtdfile}

The dtd used for the document

=item $doc->{dtdmap}

The mapping for document types and the explicit document type definition
strings

=item $doc->{parser}

The custom parser to use. See the information for the parser method.

=head1 REQUIREMENTS

  HTML::Validator requires that

=over 3

=item *

I<nsgmls> is installed. It comes with James Clark's I<sp> SGML toolkit,
I<http://www.jclark.com/sp/howtoget.htm>

=item *

you have a reasonable set of document type files. For this purpose, you can use
W3C's sgml library at I<http://validator.w3.org/sgml-lib.tar.gz>

=back

=head1 SEE ALSO

L<LWP>, L<nsgmls>

=head1 ACKNOWLEDGEMENTS

Thanks go to:

 - Heikki Kantola <hezu@iki.fi>, for his help in the early testing
   phases and his excellent knowledge about HTML standards.

=head1 AVAILABILITY

The latest version of HTML::Validator can be found from 
I<http://www.iki.fi/si/HTML-Validator/>.

It is also available from CPAN (I<http://www.perl.com/CPAN/>).

=head1 COPYRIGHT

HTML::Validator is (c) 1997-1999 Sami Itkonen <si@iki.fi>

HTML::Validator is distributed under the GNU General Public License.

=cut