The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/python
# Copyright 2008 The RE2 Authors.  All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

"""Generate C++ tables for Unicode Script and Category groups."""

import sys
import unicode

# Text printed by main() before any generated tables: a "do not edit"
# banner, the include of the groups header, and the opening of namespace re2.
_header = """
// GENERATED BY make_unicode_groups.py; DO NOT EDIT.
// make_unicode_groups.py >unicode_groups.cc

#include "re2/unicode_groups.h"

namespace re2 {

"""

# Text printed by main() after all generated tables: closes namespace re2.
_trailer = """

}  // namespace re2

"""

# Running totals of 16-bit and 32-bit ranges.  PrintGroup() adds to them for
# every group it emits; main() reports them in a comment in the output.
n16 = 0
n32 = 0

def MakeRanges(codes):
  """Turn a list like [1,2,3,7,8,9] into a range list [[1,3], [7,9]].

  Args:
    codes: iterable of integers.  A value extends the current range only
      when it is exactly one greater than the value before it, so
      ascending input without duplicates gives the most compact result.

  Returns:
    A list of [lo, hi] lists with inclusive bounds, in input order.
    Empty input yields an empty list.
  """
  ranges = []
  for c in codes:
    # Compare against the end of the currently open range instead of a
    # magic "previous value" sentinel, so that no first element (the old
    # sentinel made -99 crash) is mistaken for a continuation.
    if ranges and c == ranges[-1][1] + 1:
      ranges[-1][1] = c
    else:
      ranges.append([c, c])
  return ranges

def PrintRanges(type, name, ranges):
  """Print the ranges as a C++ array of element type `type` named `name`.

  Args:
    type: C++ element type name to emit, e.g. "URange16".  The parameter
      shadows the builtin of the same name; it is kept as is so existing
      callers are unaffected.
    name: identifier of the emitted array.
    ranges: iterable of (lo, hi) integer pairs; each becomes one
      "{ lo, hi }," initializer line.

  Writes to standard output and returns nothing.
  """
  # A parenthesized single-argument print produces the same output as a
  # Python 2 print statement and as the Python 3 print function.
  print("static %s %s[] = {" % (type, name))
  for lo, hi in ranges:
    print("\t{ %d, %d }," % (lo, hi))
  print("};")

# def PrintCodes(type, name, codes):
#   """Print the codes as an array of type named name."""
#   print "static %s %s[] = {" % (type, name,)
#   for c in codes:
#     print "\t%d," % (c,)
#   print "};"

def PrintGroup(name, codes):
  """Emit the range tables for one group and describe it as a UGroup.

  Prints (via PrintRanges) a URange16 array for the code points below
  65536 and a URange32 array for the rest; an empty half prints nothing.
  Also adds the number of ranges in each half to the module-level
  counters n16 and n32.

  Returns the UGroup initializer text, of the form
  { "name", +1, name_range16, count16, name_range32, count32 }
  where an empty half is written as "0, 0".
  """
  global n16, n32

  # See unicode_groups.h for a description of the data structure.

  # Partition at the 16-bit boundary before collapsing into ranges.
  narrow = MakeRanges([c for c in codes if c < 65536])
  wide = MakeRanges([c for c in codes if c >= 65536])

  # NOTE: a separate table of singleton code points ("code16") pulled out
  # of the 16-bit ranges is not generated here; singletons stay as
  # [c, c] ranges.

  n16 += len(narrow)
  n32 += len(wide)

  fields = ["\"%s\"" % (name,), "+1"]
  halves = (
      ("URange16", "_range16", narrow),
      ("URange32", "_range32", wide),
  )
  for ctype, suffix, table in halves:
    if table:
      PrintRanges(ctype, name + suffix, table)
      fields.append("%s%s" % (name, suffix))
      fields.append("%d" % (len(table),))
    else:
      # No table was printed for this half: null pointer, zero length.
      fields.extend(["0", "0"])
  return "{ " + ", ".join(fields) + " }"

def main():
  """Write the generated C++ source for all Unicode groups to stdout.

  Output, in order: _header, the range arrays of every category and then
  every script (printed by PrintGroup), a comment with the total range
  counts, the unicode_groups[] array with one UGroup entry per group
  sorted by entry text, num_unicode_groups, and _trailer.
  """
  # Parenthesized single-argument prints behave the same under Python 2
  # and Python 3.
  print(_header)
  ugroups = []
  # Visit names in sorted order rather than dict iteration order so the
  # layout of the generated file does not depend on the interpreter.
  # Assumes Categories() and Scripts() return name -> codes dicts, as the
  # previous iteritems() calls required.
  categories = unicode.Categories()
  for name in sorted(categories):
    ugroups.append(PrintGroup(name, categories[name]))
  scripts = unicode.Scripts()
  for name in sorted(scripts):
    ugroups.append(PrintGroup(name, scripts[name]))
  print("// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32))
  print("UGroup unicode_groups[] = {")
  ugroups.sort()
  for ug in ugroups:
    print("\t%s," % (ug,))
  print("};")
  print("int num_unicode_groups = %d;" % (len(ugroups),))
  print(_trailer)

if __name__ == '__main__':
  main()