The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl

use strict;
use warnings;

use Regex::PreSuf qw(presuf);
use HTML::Tagset 3.03 ();

sub make_re {
  return '\b(?:' . presuf(@_) . ')\b';
}

my $tmp = '';
my %tmp = ();

# -----

print <<EOF;
#
# This file was generated by genregexps.PL (Regex::PreSuf $Regex::PreSuf::VERSION), do not edit!
#

EOF

print <<'EOF';
# Matches a ">" that is not preceded by "-" (to protect Perl's "->").
$Tag_End = qr/(?<!-)>/o;

# Matches a non-">" char or a "->", $Tag_End negated.
$Non_Tag_End = qr/(?:[^>]|(?<=-)>)/o;

# This is used in optimizations.  Elements are from <a> to <var>.
# We also allow end tags, hence the "/".
$Tag_Start = qr/<\/?[a-vA-V]/o;

EOF

# -----

foreach my $attr (values(%HTML::Tagset::linkElements)) {
  %tmp = (%tmp, map { $_ => 1 } @$attr) if (ref($attr) eq 'ARRAY');
  $tmp{xmlns} = 1;
  $tmp{profile} = 1;
}
printf <<'EOF', make_re(keys(%tmp));
# Attributes with URI values.
$URI_Attrs = '%s';

EOF

# -----

%tmp = %HTML::Tagset::optionalEndTag;
$tmp{option} = 1;
delete($tmp{p}); # This just causes too much bogus.
printf <<'EOF', make_re(keys(%tmp));
# Elements with optional end tags.
$End_Omit = '%s';

EOF

# -----

# isKnown contains some entries like "~pi" etc, hence the grep
printf <<'EOF', make_re(grep(!/^~/, keys %HTML::Tagset::isKnown));
# All known elements.
$All_Elems = '%s';

EOF

# -----

%tmp = ();
foreach my $attr (values(%HTML::Tagset::boolean_attr)) {
  if (ref($attr) eq 'HASH') {
    foreach my $a (keys %$attr) {
      $tmp{$a} = 1;
    }
  } elsif (! ref($attr)) {
    $tmp{$attr} = 1;
  }
  $tmp{noresize} = 1;
  $tmp{readonly} = 1;
  $tmp{declare}  = 1;
  $tmp{defer}    = 1;
}
printf <<'EOF', make_re(keys(%tmp));
# Minimizable attributes.
$Min_Attrs = '%s';

EOF

# -----

printf <<'EOF', make_re(qw(area base basefont br col embed frame hr img input isindex link meta param));
# Elements with forbidden end tags.
$Min_Elems = '%s';

EOF

# -----

printf <<'EOF', make_re(qw(p script textarea));
# Elements that require an end tag but are commonly seen without it.
$Compat_Elems = '%s';

EOF

# -----

printf <<'EOF', make_re(qw(application audio image message multipart text video));
# MIME types. RFC 2046 says x-foo top-level types are allowed, but discouraged.
$MIME_Type = qr/^%s\/\w+?[-\w\.]+?\w+\b$/i;

EOF

# -----

printf <<'EOF', make_re(qw(a link param style script));
# Elements that have attributes with multiple (comma-separated) MIME types:
#   form, input: accept
# Parens in these regexps:
#   1: element, 2: attribute, 3:first quote,
#   4: value (including possible end quote), 5: value
$tmp =
  '<(%%s)\b\s??' . $Non_Tag_End .
  '*?\s(%%s)=(\\\?["\'])?((.*?)(?:\3|\s|' . $Tag_End . '))';
@MIME_Attrs =
  map { qr/$_/i }
  ( sprintf($tmp, '%s', 'type'),
    sprintf($tmp, 'object', '(?:code)type'),
    sprintf($tmp, 'form', 'enctype'),
  );
undef $tmp;

EOF

# -----

printf <<'EOF', make_re(qw(abbr accept accept-charset accesskey action align alink alt archive axis background bgcolor border cellpadding cellspacing char charoff charset checked cite class classid clear code codebase codetype color cols colspan compact content coords data datetime declare defer dir disabled enctype face for frame frameborder headers height href hreflang hspace http-equiv id ismap label lang language leftmargin link longdesc marginheight marginwidth maxlength media method multiple name nohref noresize noshade nowrap object onblur onchange onclick ondblclick onfocus onkeydown onkeypress onkeyup onload onmousedown onmousemove onmouseout onmouseover onmouseup onreset onselect onsubmit onunload profile prompt readonly rel rev rows rowspan rules scheme scope scrolling selected shape size span src standby start style summary tabindex target text title topmargin type usemap valign value valuetype version vlink vspace width wrap));
# All known attributes.
$All_Attrs = '%s';

EOF

# -----

printf <<'EOF', make_re(qw(area img)), make_re(qw(input map param)), make_re(qw(script style));
$tmp = '<((%%s)\b\s??.*?(?:\s(%%s)=|' . $Tag_End . '))';

# Required attributes by element.
# This has some special cases which are handled in code.  See _attributes()
my %%tmp =
    ( action  => sprintf($tmp, 'form', 'action' ),
      alt     => sprintf($tmp, '%s', 'alt'    ),
      cols    => sprintf($tmp, 'textarea', 'cols'   ),
      content => sprintf($tmp, 'meta', 'content'),
      dir     => sprintf($tmp, 'bdo', 'dir'    ),
      height  => sprintf($tmp, 'applet', 'height' ),
      href    => sprintf($tmp, 'base', 'href'   ),
      id      => sprintf($tmp, 'map', 'id'     ), # *
      label   => sprintf($tmp, 'optgroup', 'label'  ),
      name    => sprintf($tmp, '%s', 'name' ), # *
      rows    => sprintf($tmp, 'textarea', 'rows'   ),
      size    => sprintf($tmp, 'basefont', 'size'   ),
      src     => sprintf($tmp, 'img', 'src'    ),
      type    => sprintf($tmp, '%s', 'type'   ),
      width   => sprintf($tmp, 'applet', 'width'  ),
    );
%%Req_Attrs = map { $_ => qr/$tmp{$_}/i } keys(%%tmp);
undef %%tmp;

EOF

printf <<'EOF', make_re(qw(caption iframe img input object legend table hr div h1 h2 h3 h4 h5 h6 p)), make_re(qw(alink background link text vlink)), make_re(qw(noshade width)), make_re(qw(img object)), make_re(qw(border hspace vspace)), make_re(qw(td th)), make_re(qw(bgcolor height nowrap width)), make_re(qw(ol ul)), make_re(qw(body table tr)), make_re(qw(dl ol ul));
# Deprecated attributes as of HTML 4.01, not including deprecated elements
# and their attributes.
@Depr_Attrs =
  map { qr/$_/i }
  ( sprintf($tmp, '%s', 'align' ),
    sprintf($tmp, 'body', '%s'),
    sprintf($tmp, 'hr', '%s'),
    sprintf($tmp, '%s', '%s'),
    sprintf($tmp, '%s', '%s'),
    sprintf($tmp, '%s', 'type'),
    sprintf($tmp, '%s', 'bgcolor'),
    sprintf($tmp, '%s', 'compact'),
    sprintf($tmp, 'ol', 'start'),
    sprintf($tmp, 'li', 'value'),
    sprintf($tmp, 'html', 'version'),
    sprintf($tmp, 'br', 'clear'),
    sprintf($tmp, 'script', 'language'),
    sprintf($tmp, 'li', 'type'),
    sprintf($tmp, 'pre', 'width'),
  );
undef $tmp;

EOF

# -----

printf <<'EOF', make_re(qw(applet basefont center dir font isindex menu s strike u));
# Deprecated elements as of HTML 4.01.
$Depr_Elems = '%s';

EOF

# -----

$tmp = '<((%%s)\b\s??[^>]*?\s(%%s)=(["\'])([^>]+?)\4)';

printf <<EOF, make_re(qw(cols rows)), make_re(qw(td th)), make_re(qw(colspan rowspan)), make_re(qw(col colgroup)), make_re(qw(a area button input object select textarea)), make_re(qw(applet img)), make_re(qw(hspace vspace)), make_re(qw(frame iframe)), make_re(qw(marginheight marginwidth)), make_re(qw(img table)), make_re(qw(border hspace vspace));

# Attributes whose value is an integer, from HTML 4.01 and XHTML 1.0.
# A couple of special cases here, handled in _attributes().
\@Int_Attrs =
  map { qr/\$_/i }
  (
   # NUMBER/Number
   sprintf(q{$tmp}, 'textarea', '%s'),
   sprintf(q{$tmp}, '%s', '%s'),
   sprintf(q{$tmp}, 'input', 'maxlength'),
   sprintf(q{$tmp}, 'select', 'size'),
   sprintf(q{$tmp}, '%s', 'span'),
   sprintf(q{$tmp}, 'ol', 'start'),
   sprintf(q{$tmp}, '%s', 'tabindex'),
   sprintf(q{$tmp}, 'li', 'value'),
   sprintf(q{$tmp}, 'pre', 'width'),
   # Pixels
   sprintf(q{$tmp}, '%s', '%s'),
   sprintf(q{$tmp}, '%s', '%s'),
   sprintf(q{$tmp}, 'hr', 'size'),
   sprintf(q{$tmp}, '%s', 'border'), # img: HTML 4
   sprintf(q{$tmp}, 'object', '%s'),
  );

EOF

printf <<EOF, make_re(qw(cellpadding cellspacing)), make_re(qw(col colgroup tbody td tfoot th thead tr)), make_re(qw(applet iframe img object td th)), make_re(qw(applet hr iframe img object table td th));
# Attributes whose value is %%Length, from HTML 4.01 and XHTML 1.0.
# Some special cases here, handled in _attributes().
\@Length_Attrs =
  map { qr/\$_/i }
  ( sprintf(q{$tmp}, 'table', '%s'),
    sprintf(q{$tmp}, '%s', 'charoff'),
    sprintf(q{$tmp}, '%s', 'height'),
    sprintf(q{$tmp}, '%s', 'width'),
    sprintf(q{$tmp}, 'img', 'border'), # XHTML 1.0
  );

EOF

$tmp =~ s/%%/%/g;
%tmp =
  ( sprintf($tmp, make_re(qw(a area)), 'shape') =>
    [ qw(rect circle poly default) ],
    sprintf($tmp, make_re(qw(applet iframe img input object)), 'align') =>
    [ qw(top middle bottom left right) ],
    sprintf($tmp, 'area', 'nohref') => [ 'nohref' ],
    sprintf($tmp, 'br', 'clear') => [ qw(left all right none) ],
    sprintf($tmp, 'button', 'type') => [ qw(button submit reset) ],
    sprintf($tmp, make_re(qw(button input option optgroup select textarea)),
            'disabled') => [ 'disabled' ],
    sprintf($tmp, make_re(qw(caption legend)), 'align') =>
    [ qw(top bottom left right) ],
    sprintf($tmp, make_re(qw(col colgroup tbody td tfoot th thead tr)),
            'align') => [ qw(left center right justify char) ],
    sprintf($tmp, make_re(qw(col colgroup tbody td tfoot th thead tr)),
            'valign') => [ qw(top middle bottom baseline) ],
    sprintf($tmp, make_re(qw(dir dl menu ol ul)), 'compact') => [ 'compact'],
    sprintf($tmp, make_re(qw(div h1 h2 h3 h4 h5 h6 p)), 'align') =>
    [ qw(left center right justify) ],
    sprintf($tmp, 'form', 'method') => [ qw(get post) ],
    sprintf($tmp, 'frame', 'noresize') => [ 'noresize' ],
    sprintf($tmp, make_re(qw(frame iframe)), 'frameborder') => [ qw(0 1) ],
    sprintf($tmp, make_re(qw(frame iframe)), 'scrolling') =>
    [ qw(yes no auto) ],
    sprintf($tmp, 'hr', 'align') => [ qw(left center right) ],
    sprintf($tmp, 'hr', 'noshade') => [ 'noshade' ],
    sprintf($tmp, make_re(qw(img input)), 'ismap') => [ 'ismap' ],
    sprintf($tmp, 'input', 'checked') => [ 'checked' ],
    sprintf($tmp, make_re(qw(input textarea)), 'readonly') =>
    [ qw(readonly) ],
    sprintf($tmp, 'li', 'type') => [ qw(disc square circle 1 a A i I) ],
    sprintf($tmp, 'object', 'declare') => [ 'declare' ],
    sprintf($tmp, 'ol', 'type') => [ qw(1 a A i I) ],
    sprintf($tmp, 'param', 'valuetype') => [ qw(data ref object) ],
    sprintf($tmp, 'script', 'defer') => [ 'defer' ],
    sprintf($tmp, 'table', 'align') => [ qw(left center right) ],
    sprintf($tmp, 'table', 'frame') =>
    [ qw(void above below hsides lhs rhs vsides box border) ],
    sprintf($tmp, 'table', 'rules') => [ qw(none groups rows cols all) ],
    sprintf($tmp, make_re(qw(td th)), 'nowrap') => [ qw(nowrap) ],
    sprintf($tmp, make_re(qw(td th)), 'scope') =>
    [ qw(row col rowgroup colgroup) ],
    sprintf($tmp, 'ul', 'type') => [ qw(disc square circle) ],
    sprintf($tmp, 'input', 'type') =>
    [qw(text password checkbox radio submit reset file hidden image button)],
    # --- these are XHTML only ---
    sprintf($tmp, 'html', 'xmlns') => [ 'http://www.w3.org/1999/xhtml' ],
    sprintf($tmp, make_re(qw(pre script style)),'xml:space') => ['preserve'],
  );

print <<'EOF';
# Attributes that have a fixed set of values, from HTML 4.01.
@Fixed_Attrs = ();
EOF
while (my ($re, $vals) = each(%tmp)) {
  my $v = make_re(@$vals);
  printf <<'EOF', $re, $v, join('|', @$vals);
push(@Fixed_Attrs, [ qr/%s/i, '%s', '%s' ]);
EOF
}

# Phew.
print <<'EOF';

1;
EOF