The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
## another case of deja-vu
## this time, we want the slashdot style (what Yahoo said to do) only allow
## certain tags... we'll make it an option
## we'll have to tie this in some way to our HTML body displayer...
##
## Ok, there are basically four types of tags:
## 1) safe - ie, <b>, <i>, etc.
## 2) render problems - <table><form><body><frame> - these we either strip, 
##    or we have to ensure they match
## 3) definitely evil independent tags that we always strip
## 4) definitely evil tags which denote a region, we strip the entire region

from PassSGMLParser import PassSGMLParser
from urllib import basejoin
import string, sys
import neo_cgi

try:
  from cStringIO import StringIO
except:
  from StringIO import StringIO

class SafeHtml (PassSGMLParser):
  _safeTags = {"P":1, "LI":1, "DD":1, "DT":1, "EM":1, "BR":1, "CITE":1, 
               "DFN":1, "Q":1, "STRONG":1, "IMG":1, "HR":1,
               "TR":1, "TD":1, "TH":1, "CAPTION":1, "THEAD":1, "TFOOT":1, 
               "TBODY":1}
  _matchTags = {"TABLE":1, "OL":1, "UL":1, "DL":1, "CENTER":1, "DIV":1, "PRE":1,
                "SUB":1, "SUP":1, "BIG":1, "SMALL":1, "CODE":1,
                "B":1, "I":1, "A":1, "TT":1, "BLOCKQUOTE":1, "U":1,
                "H1":1, "H2":1, "H3":1, "H4":1, "H5":1, "H6":1, "FONT":1}
  _skipTags = {"FORM":1, "HTML":1, "BODY":1, "EMBED":1, "AREA":1, "MAP":1,
               "FRAME":1, "FRAMESET":1, "IFRAME":1, "META":1}
  _stripTags = {"HEAD":1, "JAVA":1, "APPLET":1, "OBJECT":1,
                "JAVASCRIPT":1, "LAYER":1, "STYLE":1, "SCRIPT":1}

  def __init__ (self, fp, extra_safe=1, base=None, map_urls=None, new_window=1):
    self._extra_safe = extra_safe
    PassSGMLParser.__init__ (self, fp, extra_safe)
    self._matchDict = {}
    self._stripping = 0
    self._base = base 
    self._map_urls = map_urls
    self._new_window = new_window

  def safe_start_strip (self):
    if self._stripping == 0:
      self.flush()
    self._stripping = self._stripping + 1

  def safe_end_strip (self):
    self.flush()
    self._stripping = self._stripping - 1
    if self._stripping < 0: self._stripping = 0

  def write (self, data):
    # sys.stderr.write("write[%d] %s\n" % (self._stripping, data))
    if self._stripping == 0:
      # sys.stderr.write("write %s\n" % data)
      PassSGMLParser.write(self, data)

  def cleanup_attrs (self, tag, attrs):
    new_attrs = [] 
    tag = string.lower(tag)
    if self._new_window and tag == "a":
        new_attrs.append(('target', '_blank'))
    for name, value in attrs:
      name = string.lower(name)
      if name[:2] == "on": continue   ## skip any javascript events
      if string.lower(value)[:11] == "javascript:": continue
      if self._map_urls and name in ["action", "href", "src", "lowsrc", "background"] and value[:4] == 'cid:':
        try:
          value = self._map_urls[value[4:]]
        except KeyError:
          pass
      else:
          if self._base and name in ["action", "href", "src", "lowsrc", "background"]:
            value = basejoin (self._base, value)
          if name in ["action", "href", "src", "lowsrc", "background"]:
            value = 'http://www.google.com/url?sa=D&q=%s' % (neo_cgi.urlEscape(value))
      if self._new_window and tag == "a" and name == "target": continue
      new_attrs.append ((name, value))
    return new_attrs

  def unknown_starttag(self, tag, attrs):
    tag = string.upper(tag)
    if SafeHtml._stripTags.has_key(tag):
      self.safe_start_strip()
      # sys.stderr.write("Stripping tag %s: %d\n" % (tag, self._stripping))
    elif SafeHtml._skipTags.has_key(tag):
      # sys.stderr.write("Skipping tag %s\n" % tag)
      pass
    elif SafeHtml._matchTags.has_key(tag):
      # sys.stderr.write("Matching tag %s\n" % tag)
      if self._matchDict.has_key(tag):
        self._matchDict[tag] = self._matchDict[tag] + 1
      else:
        self._matchDict[tag] = 1
      self.write_starttag (tag, self.cleanup_attrs(tag, attrs))
    elif SafeHtml._safeTags.has_key(tag):
      # sys.stderr.write("Safe tag %s\n" % tag)
      self.write_starttag (tag, self.cleanup_attrs(tag, attrs))
    elif not self._extra_safe:
      # sys.stderr.write("Other tag %s\n" % tag)
      self.write_starttag (tag, self.cleanup_attrs(tag, attrs))

  def unknown_endtag(self, tag):
    tag = string.upper(tag)
    if SafeHtml._stripTags.has_key(tag):
      self.safe_end_strip()
      # sys.stderr.write("End Stripping tag %s: %d\n" % (tag, self._stripping))
    elif self._stripping == 0:
      if SafeHtml._skipTags.has_key(tag):
        pass
      elif SafeHtml._matchTags.has_key(tag):
        if self._matchDict.has_key(tag):
          self._matchDict[tag] = self._matchDict[tag] - 1
        self.write_endtag (tag)
      elif SafeHtml._safeTags.has_key(tag):
        self.write_endtag (tag)
      elif not self._extra_safe:
        self.write_endtag (tag)

  def close (self):
    self._stripping = 0
    for tag in self._matchDict.keys():
      if self._matchDict[tag] > 0:
        for x in range (self._matchDict[tag]):
          self.write_endtag(tag)
    PassSGMLParser.close(self)

def SafeHtmlString (s, really_safe=1, map_urls=None):
#  fp = open("/tmp/safe_html.in", "w")
#  fp.write(s)
#  fp.close()
  fp = StringIO()
  parser = SafeHtml(fp, really_safe, map_urls=map_urls)
  parser.feed (s)
  parser.close ()
  s = fp.getvalue()
#  fp = open("/tmp/safe_html.out", "w")
#  fp.write(s)
#  fp.close()
  return s