diff options
Diffstat (limited to 'mailnews/extensions/fts3/data/generate_table.py')
-rw-r--r-- | mailnews/extensions/fts3/data/generate_table.py | 264 |
1 files changed, 264 insertions, 0 deletions
diff --git a/mailnews/extensions/fts3/data/generate_table.py b/mailnews/extensions/fts3/data/generate_table.py new file mode 100644 index 000000000..f6b012685 --- /dev/null +++ b/mailnews/extensions/fts3/data/generate_table.py @@ -0,0 +1,264 @@ +#!/usr/bin/python +# ***** BEGIN LICENSE BLOCK ***** +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Thunderbird. +# +# The Initial Developer of the Original Code is Mozilla Japan. +# Portions created by the Initial Developer are Copyright (C) 2010 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Makoto Kato <m_kato@ga2.so-net.ne.jp> +# Andrew Sutherland <asutherland@asutherland.org> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. 
#
# ***** END LICENSE BLOCK *****

import re
import sys


def printTable(f, t):
    """Write array[f] through array[t] (inclusive) as C hex literals.

    Each value is written to stdout as "0x%04x, ".  A newline followed by a
    tab is written after every entry whose index + 1 is a multiple of 8,
    i.e. rows break on absolute index boundaries rather than relative to f.

    f -- index of the first entry of the module-level `array` to write.
    t -- index of the last entry to write (inclusive).

    Raises IndexError if the range reaches past the end of `array`.
    """
    for i in range(f, t + 1):
        sys.stdout.write("0x%04x, " % array[i])
        if not (i + 1) % 8:
            sys.stdout.write("\n\t")


# Emit the fixed header of the generated C file.  sys.stdout.write is used
# instead of the print statement so the script parses on Python 2 and 3
# alike; the print statement appended a newline of its own, hence the extra
# "\n" after the literal.
sys.stdout.write('''/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is mozilla.org code.
 *
 * The Initial Developer of the Original Code is Mozilla Japan.
 * Portions created by the Initial Developer are Copyright (C) 2010
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 * Makoto Kato <m_kato@ga2.so-net.ne.jp>
 * Andrew Sutherland <asutherland@asutherland.org>
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either of the GNU General Public License Version 2 or later (the "GPL"),
 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

/* THIS FILE IS GENERATED BY generate_table.py. DON'T EDIT THIS */
''' + "\n")

# Matches one mapping line of the data files:
#   FROM[..TO](= or >)[FIRSTVAL]
# where each field is 4 or 5 uppercase hex digits.  Only the first target
# code point is captured; anything after it on the line is ignored.
# A raw string is used so that "\." reaches the regex engine untouched.
p = re.compile(r'([0-9A-F]{4,5})(?:\.\.([0-9A-F]{4,5}))?[=>]([0-9A-F]{4,5})?')
# Group numbers within `p`.
G_FROM = 1
G_TO = 2
G_FIRSTVAL = 3

# Array whose value at index i is the unicode value unicode character i should
# map to.
array = []
# Contents of gNormalizeTable. We insert zero entries for sub-pages where we
# have no mappings. We insert references to the tables where we do have
# such tables.
# The list is seeded with the zero entry for the sub-page starting at U+0000.
globalTable = ["0"]
# The (exclusive) upper bound of the conversion table, unicode character-wise.
# This is 0x10000 because our generated table is only 16-bit. This also limits
# the values we can map to; we perform an identity mapping for target values
# that are >= maxmapping.
maxmapping = 0x10000
# Number of code points covered by each generated sub-table.
sizePerTable = 64

# Map characters that the mapping tells us to obliterate to the NUKE_CHAR
# (such lines look like "FFF0..FFF8>")
# We do this because if we didn't do this, we would emit these characters as
# part of a token, which we definitely don't want.
NUKE_CHAR = 0x20

# --- load case folding table
# entries in the file look like:
# 0041>0061
# 02D8>0020 0306
# 2000..200A>0020
#
# The 0041 (uppercase A) tells us it lowercases to 0061 (lowercase a).
# The 02D8 is a "spacing clone[s] of diacritic" breve which gets decomposed into
# a space character and a breve. This entry/type of entry also shows up in
# 'nfkc.txt'.
# The 2000..200A covers a range of space characters and maps them down to the
# 'normal' space character.
# Fill `array` from index 0 upwards: identity for code points the file does
# not mention, the mapped value for those it does.  The current position is
# always len(array).  The file is expected to list its entries in ascending
# code point order; an entry that lies entirely below the current position is
# skipped instead of blocking every later entry.
with open('nfkc_cf.txt') as casefold_file:
    for line in casefold_file:
        if len(array) >= maxmapping:
            # Everything representable in the 16-bit table is filled in.
            break
        m = p.match(line)
        if not m:
            continue
        low = int(m.group(G_FROM), 16)
        # if G_TO is present, use it, otherwise fallback to low
        high = int(m.group(G_TO), 16) if m.group(G_TO) else low
        # if G_FIRSTVAL is present use it, otherwise use NUKE_CHAR.  Testing
        # the matched text (not the parsed integer) keeps a target of 0000
        # from being mistaken for "no target".
        if m.group(G_FIRSTVAL):
            val = int(m.group(G_FIRSTVAL), 16)
        else:
            val = NUKE_CHAR

        # Identity-map the gap between the previous entry and this one.
        array.extend(range(len(array), min(low, maxmapping)))
        # Map the entry's own range; targets that do not fit in the table
        # fall back to the identity mapping.
        for i in range(len(array), min(high + 1, maxmapping)):
            if val >= maxmapping:
                array.append(i)
            else:
                array.append(val)

# Identity-map whatever is left so that `array` always has exactly maxmapping
# entries, even when the file ends before reaching the upper bound.
array.extend(range(len(array), maxmapping))

# --- load normalization / decomposition table
# It is important that this file gets processed second because the other table
# will tell us about mappings from uppercase U with diaeresis to lowercase u
# with diaeresis. We obviously don't want that clobbering our value. (Although
# this would work out if we propagated backwards rather than forwards...)
#
# - entries in this file that we care about look like:
# 00A0>0020
# 0100=0041 0304
#
# They are found in the "Canonical and compatibility decomposition mappings"
# section.
#
# The 00A0 is mapping NBSP to the normal space character.
# The 0100 (a capital A with a bar over the top) is equivalent to 0041 (capital
# A) plus a 0304 (combining overline). We do not care about the combining
# marks which is why our regular expression does not capture it.
#
#
# - entries that we do not care about look like:
# 0300..0314:230
#
# These map marks to their canonical combining class which appears to be a way
# of specifying the precedence / order in which marks should be combined. The
# key thing is we don't care about them.
# This section writes through sys.stdout directly (instead of the Python 2
# only print statement), so make sure the module is in scope here.
import sys

with open('nfkc.txt') as nfkc_file:
    for line in nfkc_file:
        m = p.match(line)
        if not m:
            continue

        low = int(m.group(G_FROM), 16)
        # if G_TO is present, use it, otherwise fallback to low
        high = int(m.group(G_TO), 16) if m.group(G_TO) else low
        # if G_FIRSTVAL is present use it, otherwise fall back to NUKE_CHAR
        if m.group(G_FIRSTVAL):
            val = int(m.group(G_FIRSTVAL), 16)
        else:
            val = NUKE_CHAR
        if val >= maxmapping:
            # Targets outside the 16-bit table cannot be stored; keep
            # whatever mapping is already in place.
            continue
        # Only the part of the range that lies inside the table matters.
        for i in range(low, min(high + 1, maxmapping)):
            array[i] = val

# --- generate a normalized table to support case and accent folding
#
# NOTE: the C lookup function emitted at the very end hard-codes a shift of 6
# and a mask of 0x3f, which matches a sizePerTable of 64 only.

write = sys.stdout.write

# Rebuild globalTable from scratch so that every sub-page, including the one
# starting at U+0000, contributes exactly one entry regardless of whether it
# turns out to be empty.
del globalTable[:]
needTerm = False
for page in range(0, maxmapping, sizePerTable):
    page_end = page + sizePerTable

    # A sub-page with nothing but identity mappings gets no table of its
    # own, just a null entry in gNormalizeTable.
    if all(array[j] == j for j in range(page, page_end)):
        globalTable.append("0")
        continue

    if needTerm:
        # Close the previously emitted sub-table.
        write("};\n\n")
    globalTable.append("gNormalizeTable%04x" % page)
    write("static const unsigned short gNormalizeTable%04x[] = {\n\t" % page)
    write("/* U+%04x */\n\t" % page)
    needTerm = True

    for i in range(page, page_end):
        # Decomposition does not case-fold, so we want to compensate by
        # performing a lookup here.  Because decomposition chains can be
        # more than one step long, keep following the mapping until it
        # reaches a character that maps to itself.
        # example: 01d5, a capital U with a diaeresis and a bar. yes, really.
        # 01d5 -> 00dc -> 0055 (U) -> 0075 (u)
        c = array[i]
        steps = 0
        while c != array[c]:
            c = array[c]
            steps += 1
            if steps > maxmapping:
                # A chain longer than the table itself can only be a cycle.
                raise Exception('mapping cycle detected starting at %x' % i)
        if c >= 0x41 and c <= 0x5a:
            raise Exception('got an uppercase character somehow: %x => %x'
                            % (i, c))
        write("0x%04x, " % c)
        if not (i + 1) % 8:
            write("\n\t")

if needTerm:
    # Close the last sub-table.
    write("};\n\n")

# Emit the top-level table: one entry per sub-page, four entries per row.
write("static const unsigned short* gNormalizeTable[] = { ")
for index, entry in enumerate(globalTable):
    if not index % 4:
        write("\n\t")
    write(entry + ", ")

write('''
};

unsigned int normalize_character(const unsigned int c)
{
  if (c >= ''' + ('0x%x' % (maxmapping,)) + ''' || !gNormalizeTable[c >> 6])
    return c;
  return gNormalizeTable[c >> 6][c & 0x3f];
}
''' + "\n")