summaryrefslogtreecommitdiffstats
path: root/mailnews/extensions/fts3/data/generate_table.py
diff options
context:
space:
mode:
Diffstat (limited to 'mailnews/extensions/fts3/data/generate_table.py')
-rw-r--r--mailnews/extensions/fts3/data/generate_table.py264
1 files changed, 264 insertions, 0 deletions
diff --git a/mailnews/extensions/fts3/data/generate_table.py b/mailnews/extensions/fts3/data/generate_table.py
new file mode 100644
index 000000000..f6b012685
--- /dev/null
+++ b/mailnews/extensions/fts3/data/generate_table.py
@@ -0,0 +1,264 @@
+#!/usr/bin/python
+# ***** BEGIN LICENSE BLOCK *****
+# Version: MPL 1.1/GPL 2.0/LGPL 2.1
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Original Code is Mozilla Thunderbird.
+#
+# The Initial Developer of the Original Code is Mozilla Japan.
+# Portions created by the Initial Developer are Copyright (C) 2010
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Makoto Kato <m_kato@ga2.so-net.ne.jp>
+# Andrew Sutherland <asutherland@asutherland.org>
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 2 or later (the "GPL"), or
+# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+# in which case the provisions of the GPL or the LGPL are applicable instead
+# of those above. If you wish to allow use of your version of this file only
+# under the terms of either the GPL or the LGPL, and not to allow others to
+# use your version of this file under the terms of the MPL, indicate your
+# decision by deleting the provisions above and replace them with the notice
+# and other provisions required by the GPL or the LGPL. If you do not delete
+# the provisions above, a recipient may use your version of this file under
+# the terms of any one of the MPL, the GPL or the LGPL.
+#
+# ***** END LICENSE BLOCK *****
+
+import re
+
+def printTable(f, t):
+ i = f
+ while i <= t:
+ c = array[i]
+ print "0x%04x," % c,
+ i = i + 1
+ if not i % 8:
+ print "\n\t",
+
+print '''/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is mozilla.org code.
+ *
+ * The Initial Developer of the Original Code is Mozilla Japan.
+ * Portions created by the Initial Developer are Copyright (C) 2010
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ * Makoto Kato <m_kato@ga2.so-net.ne.jp>
+ * Andrew Sutherland <asutherland@asutherland.org>
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either of the GNU General Public License Version 2 or later (the "GPL"),
+ * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+/* THIS FILE IS GENERATED BY generate_table.py. DON'T EDIT THIS */
+'''
+
+p = re.compile('([0-9A-F]{4,5})(?:\.\.([0-9A-F]{4,5}))?[=\>]([0-9A-F]{4,5})?')
+G_FROM = 1
+G_TO = 2
+G_FIRSTVAL = 3
+
+# Array whose value at index i is the unicode value unicode character i should
+# map to.
+array = []
+# Contents of gNormalizeTable. We insert zero entries for sub-pages where we
+# have no mappings. We insert references to the tables where we do have
+# such tables.
+globalTable = []
+globalTable.append("0")
+# The (exclusive) upper bound of the conversion table, unicode character-wise.
+# This is 0x10000 because our generated table is only 16-bit. This also limits
+# the values we can map to; we perform an identity mapping for target values
+# that >= maxmapping.
+maxmapping = 0x10000
+sizePerTable = 64
+
+# Map characters that the mapping tells us to obliterate to the NUKE_CHAR
+# (such lines look like "FFF0..FFF8>")
+# We do this because if we didn't do this, we would emit these characters as
+# part of a token, which we definitely don't want.
+NUKE_CHAR = 0x20
+
+# --- load case folding table
+# entries in the file look like:
+# 0041>0061
+# 02D8>0020 0306
+# 2000..200A>0020
+#
+# The 0041 (uppercase A) tells us it lowercases to 0061 (lowercase a).
+# The 02D8 is a "spacing clone[s] of diacritic" breve which gets decomposed into
+# a space character and a breve. This entry/type of entry also shows up in
+# 'nfkc.txt'.
+# The 2000..200A covers a range of space characters and maps them down to the
+# 'normal' space character.
+
+file = open('nfkc_cf.txt')
+
+m = None
+line = "\n"
+i = 0x0
+while i < maxmapping and line:
+ if not m:
+ line = file.readline()
+ m = p.match(line)
+ if not m:
+ continue
+ low = int(m.group(G_FROM), 16)
+ # if G_TO is present, use it, otherwise fallback to low
+ high = m.group(G_TO) and int(m.group(G_TO), 16) or low
+ # if G_FIRSTVAL is present use it, otherwise use NUKE_CHAR
+ val = (m.group(G_FIRSTVAL) and int(m.group(G_FIRSTVAL), 16)
+ or NUKE_CHAR)
+ continue
+
+
+ if i >= low and i <= high:
+ if val >= maxmapping:
+ array.append(i)
+ else:
+ array.append(val)
+ if i == high:
+ m = None
+ else:
+ array.append(i)
+ i = i + 1
+file.close()
+
+# --- load normalization / decomposition table
+# It is important that this file gets processed second because the other table
+# will tell us about mappings from uppercase U with diaeresis to lowercase u
+# with diaeresis. We obviously don't want that clobbering our value. (Although
+# this would work out if we propagated backwards rather than forwards...)
+#
+# - entries in this file that we care about look like:
+# 00A0>0020
+# 0100=0041 0304
+#
+# They are found in the "Canonical and compatibility decomposition mappings"
+# section.
+#
+# The 00A0 is mapping NBSP to the normal space character.
+# The 0100 (a capital A with a bar over top of) is equivalent to 0041 (capital
+# A) plus a 0304 (combining overline). We do not care about the combining
+# marks which is why our regular expression does not capture it.
+#
+#
+# - entries that we do not care about look like:
+# 0300..0314:230
+#
+# These map marks to their canonical combining class which appears to be a way
+# of specifying the precedence / order in which marks should be combined. The
+# key thing is we don't care about them.
+file = open('nfkc.txt')
+line = file.readline()
+m = p.match(line)
+while line:
+ if not m:
+ line = file.readline()
+ m = p.match(line)
+ continue
+
+ low = int(m.group(G_FROM), 16)
+ # if G_TO is present, use it, otherwise fallback to low
+ high = m.group(G_TO) and int(m.group(G_TO), 16) or low
+ # if G_FIRSTVAL is present use it, otherwise fall back to NUKE_CHAR
+ val = m.group(G_FIRSTVAL) and int(m.group(G_FIRSTVAL), 16) or NUKE_CHAR
+ for i in range(low, high+1):
+ if i < maxmapping and val < maxmapping:
+ array[i] = val
+ m = None
+file.close()
+
+# --- generate a normalized table to support case and accent folding
+
+i = 0
+needTerm = False;
+while i < maxmapping:
+ if not i % sizePerTable:
+ # table is empty?
+ j = i
+ while j < i + sizePerTable:
+ if array[j] != j:
+ break
+ j += 1
+
+ if j == i + sizePerTable:
+ if i:
+ globalTable.append("0")
+ i += sizePerTable
+ continue
+
+ if needTerm:
+ print "};\n"
+ globalTable.append("gNormalizeTable%04x" % i)
+ print "static const unsigned short gNormalizeTable%04x[] = {\n\t" % i,
+ print "/* U+%04x */\n\t" % i,
+ needTerm = True
+ # Decomposition does not case-fold, so we want to compensate by
+ # performing a lookup here. Because decomposition chains can be
+ # example: 01d5, a capital U with a diaeresis and a bar. yes, really.
+ # 01d5 -> 00dc -> 0055 (U) -> 0075 (u)
+ c = array[i]
+ while c != array[c]:
+ c = array[c]
+ if c >= 0x41 and c <= 0x5a:
+ raise Exception('got an uppercase character somehow: %x => %x'
+ % (i, c))
+ print "0x%04x," % c,
+ i = i + 1
+ if not i % 8:
+ print "\n\t",
+
+print "};\n\nstatic const unsigned short* gNormalizeTable[] = {",
+i = 0
+while i < (maxmapping / sizePerTable):
+ if not i % 4:
+ print "\n\t",
+ print globalTable[i] + ",",
+ i += 1
+
+print '''
+};
+
+unsigned int normalize_character(const unsigned int c)
+{
+ if (c >= ''' + ('0x%x' % (maxmapping,)) + ''' || !gNormalizeTable[c >> 6])
+ return c;
+ return gNormalizeTable[c >> 6][c & 0x3f];
+}
+'''