The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env python
#
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
#

# mlpatch.py: Run with no arguments for usage

import sys, os
import sgmllib
try:
  # Python >=3.0
  from html.entities import entitydefs
  from urllib.request import urlopen as urllib_request_urlopen
except ImportError:
  # Python <3.0
  from htmlentitydefs import entitydefs
  from urllib2 import urlopen as urllib_request_urlopen
import fileinput

# Size passed to each read() call when main() streams the fetched archive
# page into the parser.
CHUNKSIZE = 8 * 1024

class MyParser(sgmllib.SGMLParser):
  """Recover an inline patch from a mailing-list archive HTML page.

  Only markup that appears between the comments ``<!-- body="start" -->``
  and ``<!-- body="end" -->`` is processed; every other handler returns
  immediately while ``inbody`` is false.  Recovered text is written through
  out(), which writes to stdout.

  ``<br>`` and ``<p>`` only record that a line has been completed.  The
  decision between a real newline and a single space (presumed archive
  line-wrapping) is deferred to handle_data(), which looks at how the next
  piece of text starts.  Text inside ``<a>`` elements is gathered and
  emitted when the element closes, with ``_at_`` replaced by ``@``.

  NOTE: sgmllib was removed in Python 3, so this class needs Python 2.
  """

  # Text following a completed line starts a genuinely new patch line when
  # it begins with one of these; otherwise it is treated as wrapped text.
  _new_line_prefixes = ('+', '-', ' ', '#', 'Index:', '@@ ', '======')

  def __init__(self):
    """Initialise the base parser and the body/line/gather state."""
    # The base class is reached through an explicit reference, not super().
    self.baseclass = sgmllib.SGMLParser
    self.baseclass.__init__(self)
    # Work on a private copy: assigning the imported table directly and
    # then adding "nbsp" would modify the shared module-level dict for
    # every other user of it in this process.
    self.entitydefs = dict(entitydefs)
    self.entitydefs["nbsp"] = " "
    self.inbody = False
    self.complete_line = False
    self.discard_gathered()

  def discard_gathered(self):
    """Stop gathering text and drop whatever has been gathered."""
    self.gather_data = False
    self.gathered_data = ""

  def noop(self):
    """Do nothing."""
    pass

  def out(self, data):
    """Write data to stdout without adding a newline."""
    sys.stdout.write(data)

  def handle_starttag(self, tag, method, attrs):
    """Forward a start tag to the base class, but only inside the body."""
    if not self.inbody:
      return
    self.baseclass.handle_starttag(self, tag, method, attrs)

  def handle_endtag(self, tag, method):
    """Forward an end tag to the base class, but only inside the body."""
    if not self.inbody:
      return
    self.baseclass.handle_endtag(self, tag, method)

  def handle_data(self, data):
    """Emit (or gather) body text, resolving any pending line break.

    Newlines in the HTML source are dropped.  While gathering (inside an
    <a> element) the text is appended to gathered_data.  Otherwise, if a
    line was marked complete, a newline is written when the text starts
    like a patch line and a single space when it does not; then the text
    itself is written.
    """
    if not self.inbody:
      return
    data = data.replace('\n', '')
    if not data:
      return
    if self.gather_data:
      self.gathered_data += data
      return
    if self.complete_line:
      if data.startswith(self._new_line_prefixes):
        # Real new line
        self.out('\n')
      else:
        # Presume that we are wrapped
        self.out(' ')
    self.complete_line = False
    self.out(data)

  def handle_charref(self, ref):
    """Forward a character reference to the base class inside the body."""
    if not self.inbody:
      return
    self.baseclass.handle_charref(self, ref)

  def handle_entityref(self, ref):
    """Forward an entity reference to the base class inside the body."""
    if not self.inbody:
      return
    self.baseclass.handle_entityref(self, ref)

  def handle_comment(self, comment):
    """Switch body processing on or off at the body start/end comments."""
    if comment == ' body="start" ':
      self.inbody = True
    elif comment == ' body="end" ':
      self.inbody = False

  def handle_decl(self, data):
    """Print a marker line for a declaration found inside the body."""
    if not self.inbody:
      return
    print("DECL: " + data)

  def unknown_starttag(self, tag, attrs):
    """Print a marker line for an unhandled start tag inside the body."""
    if not self.inbody:
      return
    print("UNKTAG: %s %s" % (tag, attrs))

  def unknown_endtag(self, tag):
    """Print a marker line for an unhandled end tag inside the body."""
    if not self.inbody:
      return
    print("UNKTAG: /%s" % (tag))

  def do_br(self, attrs):
    """<br>: mark the current line complete; output is deferred."""
    self.complete_line = True

  def do_p(self, attrs):
    """<p>: flush a pending newline, write one space, mark line complete."""
    if self.complete_line:
      self.out('\n')
    # Presumably this lone space is a blank context line of the patch.
    self.out(' ')
    self.complete_line = True

  def start_a(self, attrs):
    """<a>: start gathering text instead of writing it out directly."""
    self.gather_data = True

  def end_a(self):
    """</a>: write the gathered text with '_at_' turned back into '@'."""
    self.out(self.gathered_data.replace('_at_', '@'))
    self.discard_gathered()

  def close(self):
    """Write the final pending newline, if any, then close the base parser."""
    if self.complete_line:
      self.out('\n')
    self.baseclass.close(self)

def main():
  """Command-line entry point: fetch one archived message, print its patch.

  With no arguments, writes the usage text to stderr and exits with
  status 0.  With any argument count other than four, writes an error to
  stderr and exits with status 1.  Otherwise the four arguments (list
  name, year, month, message number) are combined into the archive URL,
  the URL is printed, and the page is streamed through MyParser, which
  writes the recovered text to stdout.
  """
  if len(sys.argv) == 1:
    sys.stderr.write(
    "usage:   mlpatch.py dev|users year month msgno > foobar.patch\n" +
    "example: mlpatch.py dev 2005 01 0001 > issue-XXXX.patch\n" +
    """
    Very annoyingly, the http://svn.haxx.se/ subversion mailing list archives
    mangle inline patches, and provide no raw message download facility
    (other than for an entire month's email as an mbox).

    So, I wrote this script, to demangle them. It's not perfect, as it has to
    guess about whitespace, but it does an acceptable job.\n""")
    sys.exit(0)
  elif len(sys.argv) != 5:
    sys.stderr.write("error: mlpatch.py: Bad parameters - run with no "
    + "parameters for usage\n")
    sys.exit(1)
  else:
    # 'listname' rather than 'list' so the builtin is not shadowed.
    listname, year, month, msgno = sys.argv[1:]
    url = "http://svn.haxx.se/%s/archive-%s-%s/%s.shtml" % (
        listname, year, month, msgno)
    print("MsgUrl: " + url)
    msgfile = urllib_request_urlopen(url)
    try:
      parser = MyParser()
      # Feed the page in CHUNKSIZE pieces; an empty read means end of data.
      chunk = msgfile.read(CHUNKSIZE)
      while chunk:
        parser.feed(chunk)
        chunk = msgfile.read(CHUNKSIZE)
      parser.close()
    finally:
      # Release the connection even if reading or parsing raised.
      msgfile.close()

# Run main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
  main()