The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl
#
# Format XHTML generated by pydoc (via tidy) for websites
#
# Usage: tidy ... pydoc.html | fix-python-xhtml OUTPUT-FILE
#
# (C) Copyright 2002 Dave Beckett - http://www.dajobe.org/
# University of Bristol
#

use strict;
use warnings;
use File::Basename;

# Filter script: reads tidy'd pydoc XHTML on standard input, rewrites it
# line by line (title, links, anchor IDs, page heading and footer) and
# writes the result to the file named by the first command-line argument.

my $progname = basename $0;

my $main_title = "Redland RDF Application Framework";

my $doc_title = "Python API Reference";

die "USAGE: $progname OUTPUT-FILE\n" if @ARGV < 1;

my ($file) = @ARGV;

# Progress through the #-commented license heading:
#   0 = not yet seen, 1 = currently inside it, 2 = already replaced
my $skip_heading = 0;

# The footer year cannot change between lines, so work it out once
my $year = 1900 + (localtime)[5];

# Three-argument open so that characters in the file name are never
# interpreted as part of the open mode
open(my $out, '>', $file) or die "$progname: Cannot create $file - $!\n";
print {$out} qq{<?xml version="1.0" encoding="iso-8859-1"?>\n};

# The line being processed lives in $_ for the whole loop body
while (<STDIN>) {
  # Replace the generated title with the site-wide one
  s%<title>(.*?)</title>%<title>$main_title - $doc_title</title>%;

  # Drop <link> and <meta> elements.  The alternation is grouped so that
  # only those tags match, not every line that happens to contain "meta".
  next if m%^\s*<(?:link|meta)\b%i;

  # skip the #-commented heading, leaving a marker comment in its place
  if (!$skip_heading && m%^<p><tt>\#%) {
    $skip_heading = 1;
    # deliberately no 'next': the heading may also end on this same line
  }
  if ($skip_heading == 1) {
    if (m%\#</tt></p>%) {
      $skip_heading = 2;
      print {$out} "<!-- LICENSE HEADING -->\n";
    }
    next;
  }

  # Links to other Python modules, exceptions
  s%<a href="(Redland|string|sys|exceptions|__builtin__).html[^"]*">(.+?)</a>%$2%g; #"
  # The horrors of guessing urls in text
  s%<a href="http://example.(org|com|net)[^"]*">([^<]+)</a>%$2%g; # "
  s%<a href="http://(?:localhost/r.rdf|foo|bar)[^"]*">([^<]+)</a>%$2%g; # "
  # Never leave these in web pages
  s%<a href="file:[^"]+">/[^<]+</a>%%g; # "

  # Turn links into RDF.html into same-page fragment links
  s%RDF\.html\#%#%g;

  # Bad IDs, who writes this stuff?
  # First give every named anchor an id, stripping escaped <...> markup
  s%<a name="([^"]+)">%my $id=$1; $id =~ s/\&lt;.*?\&gt;//g; qq{<a id="$id" name="$id">}%ge;
  # Then normalise every id: no escaped markup, no leading '-', no spaces
  s%<a id="([^"]+)" name="[^"]+">%my $id=$1; $id =~ s/\&lt;.*?\&gt;//g; $id =~ s/^-//; $id =~ s/ /-/g; qq{<a id="$id" name="$id">}%ge;

  # All the __new__ links are broken
  s%<a href="\#\w+\-__new__">__new__</a>%__new__%g;
  # This is broken too, dunno why
  # (must run before the "#-" rewrite below, which would otherwise turn
  # the href into "#debug" and stop this pattern from matching)
  s%<a href="\#\-debug">debug</a>%debug%g;

  # Fragment links must lose their leading '-' just as the ids did above
  s%<a href="#-([^"]+)">%<a href="#$1">%g;

  # Replace the opening <body> tag with a plain one plus the page heading
  if (m%^<body.*>%) {
    print {$out} qq{<body>\n\n<h1 style="text-align:center">$main_title - $doc_title</h1>\n\n};
    next;
  }

  # Emit the copyright footer just before the closing </body> line
  print {$out} <<"EOT" if m%^</body>%;
<hr />

<p>(C) Copyright 2000-$year <a href="http://www.dajobe.org/">Dave Beckett</a>, Copyright 2000-2005 <a href="http://www.bristol.ac.uk/">University of Bristol</a></p>

EOT

  print {$out} $_;
}

# Buffered write errors (e.g. disk full) only surface at close time
close($out) or die "$progname: Cannot write $file - $!\n";