From 302bf1b523012e11b60425d6eee1221ebc2724eb Mon Sep 17 00:00:00 2001 From: "Matt A. Tobin" Date: Sun, 3 Nov 2019 00:17:46 -0400 Subject: Issue #1258 - Part 1: Import mailnews, ldap, and mork from comm-esr52.9.1 --- mailnews/extensions/fts3/src/Normalize.c | 1929 ++++++++++++++++++++ mailnews/extensions/fts3/src/README.mozilla | 3 + mailnews/extensions/fts3/src/fts3_porter.c | 1150 ++++++++++++ mailnews/extensions/fts3/src/fts3_tokenizer.h | 148 ++ mailnews/extensions/fts3/src/moz.build | 18 + mailnews/extensions/fts3/src/nsFts3Tokenizer.cpp | 72 + mailnews/extensions/fts3/src/nsFts3Tokenizer.h | 26 + mailnews/extensions/fts3/src/nsFts3TokenizerCID.h | 16 + .../extensions/fts3/src/nsGlodaRankerFunction.cpp | 145 ++ .../extensions/fts3/src/nsGlodaRankerFunction.h | 25 + 10 files changed, 3532 insertions(+) create mode 100644 mailnews/extensions/fts3/src/Normalize.c create mode 100644 mailnews/extensions/fts3/src/README.mozilla create mode 100644 mailnews/extensions/fts3/src/fts3_porter.c create mode 100644 mailnews/extensions/fts3/src/fts3_tokenizer.h create mode 100644 mailnews/extensions/fts3/src/moz.build create mode 100644 mailnews/extensions/fts3/src/nsFts3Tokenizer.cpp create mode 100644 mailnews/extensions/fts3/src/nsFts3Tokenizer.h create mode 100644 mailnews/extensions/fts3/src/nsFts3TokenizerCID.h create mode 100644 mailnews/extensions/fts3/src/nsGlodaRankerFunction.cpp create mode 100644 mailnews/extensions/fts3/src/nsGlodaRankerFunction.h (limited to 'mailnews/extensions/fts3/src') diff --git a/mailnews/extensions/fts3/src/Normalize.c b/mailnews/extensions/fts3/src/Normalize.c new file mode 100644 index 000000000..92ac400e2 --- /dev/null +++ b/mailnews/extensions/fts3/src/Normalize.c @@ -0,0 +1,1929 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* THIS FILE IS GENERATED BY generate_table.py. DON'T EDIT THIS */ + +static const unsigned short gNormalizeTable0040[] = { + /* U+0040 */ + 0x0040, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, + 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, + 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, + 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, + }; + +static const unsigned short gNormalizeTable0080[] = { + /* U+0080 */ + 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, + 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, + 0x0020, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, + 0x0020, 0x00a9, 0x0061, 0x00ab, 0x00ac, 0x0020, 0x00ae, 0x0020, + 0x00b0, 0x00b1, 0x0032, 0x0033, 0x0020, 0x03bc, 0x00b6, 0x00b7, + 0x0020, 0x0031, 0x006f, 0x00bb, 0x0031, 0x0031, 0x0033, 0x00bf, + }; + +static const unsigned short gNormalizeTable00c0[] = { + /* U+00c0 */ + 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x00e6, 0x0063, + 0x0065, 0x0065, 0x0065, 0x0065, 0x0069, 0x0069, 0x0069, 0x0069, + 0x00f0, 0x006e, 0x006f, 0x006f, 0x006f, 0x006f, 0x006f, 0x00d7, + 0x00f8, 0x0075, 0x0075, 0x0075, 0x0075, 0x0079, 0x00fe, 0x0073, + 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x00e6, 0x0063, + 0x0065, 0x0065, 0x0065, 0x0065, 0x0069, 0x0069, 0x0069, 0x0069, + 0x00f0, 0x006e, 0x006f, 0x006f, 0x006f, 0x006f, 0x006f, 0x00f7, + 0x00f8, 0x0075, 0x0075, 0x0075, 0x0075, 0x0079, 0x00fe, 0x0079, + }; + +static const unsigned short gNormalizeTable0100[] = { + /* U+0100 */ + 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x0063, 0x0063, + 0x0063, 0x0063, 0x0063, 0x0063, 0x0063, 0x0063, 0x0064, 0x0064, + 0x0111, 0x0111, 0x0065, 0x0065, 0x0065, 0x0065, 0x0065, 0x0065, + 0x0065, 0x0065, 0x0065, 0x0065, 0x0067, 0x0067, 0x0067, 0x0067, + 0x0067, 0x0067, 0x0067, 0x0067, 0x0068, 0x0068, 0x0127, 0x0127, + 0x0069, 0x0069, 0x0069, 0x0069, 0x0069, 0x0069, 0x0069, 0x0069, + 0x0069, 0x0131, 0x0069, 0x0069, 0x006a, 0x006a, 0x006b, 0x006b, + 0x0138, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, + }; + +static const unsigned short gNormalizeTable0140[] = { + /* U+0140 */ + 0x006c, 0x0142, 0x0142, 0x006e, 0x006e, 0x006e, 0x006e, 0x006e, + 0x006e, 0x02bc, 0x014b, 0x014b, 0x006f, 0x006f, 0x006f, 0x006f, + 0x006f, 0x006f, 0x0153, 0x0153, 0x0072, 0x0072, 0x0072, 0x0072, + 0x0072, 0x0072, 0x0073, 0x0073, 0x0073, 0x0073, 0x0073, 0x0073, + 0x0073, 0x0073, 0x0074, 0x0074, 0x0074, 0x0074, 0x0167, 0x0167, + 0x0075, 0x0075, 0x0075, 0x0075, 0x0075, 0x0075, 0x0075, 0x0075, + 0x0075, 0x0075, 0x0075, 0x0075, 0x0077, 0x0077, 0x0079, 0x0079, + 0x0079, 0x007a, 0x007a, 0x007a, 0x007a, 0x007a, 0x007a, 0x0073, + }; + +static const unsigned short gNormalizeTable0180[] = { + /* U+0180 */ + 0x0180, 0x0253, 0x0183, 0x0183, 0x0185, 0x0185, 0x0254, 0x0188, + 0x0188, 0x0256, 0x0257, 0x018c, 0x018c, 0x018d, 0x01dd, 0x0259, + 0x025b, 0x0192, 0x0192, 0x0260, 0x0263, 0x0195, 0x0269, 0x0268, + 0x0199, 0x0199, 0x019a, 0x019b, 0x026f, 0x0272, 0x019e, 0x0275, + 0x006f, 0x006f, 0x01a3, 0x01a3, 0x01a5, 0x01a5, 0x0280, 0x01a8, + 0x01a8, 0x0283, 0x01aa, 0x01ab, 0x01ad, 0x01ad, 0x0288, 0x0075, + 0x0075, 0x028a, 0x028b, 0x01b4, 0x01b4, 0x01b6, 0x01b6, 0x0292, + 0x01b9, 0x01b9, 0x01ba, 0x01bb, 0x01bd, 0x01bd, 0x01be, 0x01bf, + }; + +static const unsigned short gNormalizeTable01c0[] = { + /* U+01c0 */ + 0x01c0, 0x01c1, 0x01c2, 0x01c3, 0x0064, 0x0064, 0x0064, 0x006c, + 0x006c, 0x006c, 0x006e, 0x006e, 0x006e, 0x0061, 0x0061, 0x0069, + 0x0069, 0x006f, 0x006f, 0x0075, 0x0075, 0x0075, 0x0075, 0x0075, + 0x0075, 0x0075, 0x0075, 0x0075, 0x0075, 0x01dd, 0x0061, 0x0061, + 0x0061, 0x0061, 0x00e6, 0x00e6, 0x01e5, 0x01e5, 0x0067, 0x0067, + 0x006b, 0x006b, 0x006f, 0x006f, 0x006f, 0x006f, 0x0292, 0x0292, + 0x006a, 0x0064, 0x0064, 0x0064, 0x0067, 0x0067, 0x0195, 0x01bf, + 0x006e, 0x006e, 0x0061, 0x0061, 0x00e6, 0x00e6, 0x00f8, 0x00f8, + }; + +static const unsigned short gNormalizeTable0200[] = { + /* U+0200 */ + 0x0061, 0x0061, 0x0061, 0x0061, 0x0065, 0x0065, 0x0065, 0x0065, + 0x0069, 0x0069, 0x0069, 0x0069, 0x006f, 0x006f, 0x006f, 0x006f, + 0x0072, 0x0072, 0x0072, 0x0072, 0x0075, 0x0075, 0x0075, 0x0075, + 0x0073, 0x0073, 0x0074, 0x0074, 0x021d, 0x021d, 0x0068, 0x0068, + 0x019e, 0x0221, 0x0223, 0x0223, 0x0225, 0x0225, 0x0061, 0x0061, + 0x0065, 0x0065, 0x006f, 0x006f, 0x006f, 0x006f, 0x006f, 0x006f, + 0x006f, 0x006f, 0x0079, 0x0079, 0x0234, 0x0235, 0x0236, 0x0237, + 0x0238, 0x0239, 0x2c65, 0x023c, 0x023c, 0x019a, 0x2c66, 0x023f, + }; + +static const unsigned short gNormalizeTable0240[] = { + /* U+0240 */ + 0x0240, 0x0242, 0x0242, 0x0180, 0x0289, 0x028c, 0x0247, 0x0247, + 0x0249, 0x0249, 0x024b, 0x024b, 0x024d, 0x024d, 0x024f, 0x024f, + 0x0250, 0x0251, 0x0252, 0x0253, 0x0254, 0x0255, 0x0256, 0x0257, + 0x0258, 0x0259, 0x025a, 0x025b, 0x025c, 0x025d, 0x025e, 0x025f, + 0x0260, 0x0261, 0x0262, 0x0263, 0x0264, 0x0265, 0x0266, 0x0267, + 0x0268, 0x0269, 0x026a, 0x026b, 0x026c, 0x026d, 0x026e, 0x026f, + 0x0270, 0x0271, 0x0272, 0x0273, 0x0274, 0x0275, 0x0276, 0x0277, + 0x0278, 0x0279, 0x027a, 0x027b, 0x027c, 0x027d, 0x027e, 0x027f, + }; + +static const unsigned short gNormalizeTable0280[] = { + /* U+0280 */ + 0x0280, 0x0281, 0x0282, 0x0283, 0x0284, 0x0285, 0x0286, 0x0287, + 0x0288, 0x0289, 0x028a, 0x028b, 0x028c, 0x028d, 0x028e, 0x028f, + 0x0290, 0x0291, 0x0292, 0x0293, 0x0294, 0x0295, 0x0296, 0x0297, + 0x0298, 0x0299, 0x029a, 0x029b, 0x029c, 0x029d, 0x029e, 0x029f, + 0x02a0, 0x02a1, 0x02a2, 0x02a3, 0x02a4, 0x02a5, 0x02a6, 0x02a7, + 0x02a8, 0x02a9, 0x02aa, 0x02ab, 0x02ac, 0x02ad, 0x02ae, 0x02af, + 0x0068, 0x0266, 0x006a, 0x0072, 0x0279, 0x027b, 0x0281, 0x0077, + 0x0079, 0x02b9, 0x02ba, 0x02bb, 0x02bc, 0x02bd, 0x02be, 0x02bf, + }; + +static const unsigned short gNormalizeTable02c0[] = { + /* U+02c0 */ + 0x02c0, 0x02c1, 0x02c2, 0x02c3, 0x02c4, 0x02c5, 0x02c6, 0x02c7, + 0x02c8, 0x02c9, 0x02ca, 0x02cb, 0x02cc, 0x02cd, 0x02ce, 0x02cf, + 0x02d0, 0x02d1, 0x02d2, 0x02d3, 0x02d4, 0x02d5, 0x02d6, 0x02d7, + 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x02de, 0x02df, + 0x0263, 0x006c, 0x0073, 0x0078, 0x0295, 0x02e5, 0x02e6, 0x02e7, + 0x02e8, 0x02e9, 0x02ea, 0x02eb, 0x02ec, 0x02ed, 0x02ee, 0x02ef, + 0x02f0, 0x02f1, 0x02f2, 0x02f3, 0x02f4, 0x02f5, 0x02f6, 0x02f7, + 0x02f8, 0x02f9, 0x02fa, 0x02fb, 0x02fc, 0x02fd, 0x02fe, 0x02ff, + }; + +static const unsigned short gNormalizeTable0340[] = { + /* U+0340 */ + 0x0300, 0x0301, 0x0342, 0x0313, 0x0308, 0x03b9, 0x0346, 0x0347, + 0x0348, 0x0349, 0x034a, 0x034b, 0x034c, 0x034d, 0x034e, 0x0020, + 0x0350, 0x0351, 0x0352, 0x0353, 0x0354, 0x0355, 0x0356, 0x0357, + 0x0358, 0x0359, 0x035a, 0x035b, 0x035c, 0x035d, 0x035e, 0x035f, + 0x0360, 0x0361, 0x0362, 0x0363, 0x0364, 0x0365, 0x0366, 0x0367, + 0x0368, 0x0369, 0x036a, 0x036b, 0x036c, 0x036d, 0x036e, 0x036f, + 0x0371, 0x0371, 0x0373, 0x0373, 0x02b9, 0x0375, 0x0377, 0x0377, + 0x0378, 0x0379, 0x0020, 0x037b, 0x037c, 0x037d, 0x003b, 0x037f, + }; + +static const unsigned short gNormalizeTable0380[] = { + /* U+0380 */ + 0x0380, 0x0381, 0x0382, 0x0383, 0x0020, 0x0020, 0x03b1, 0x00b7, + 0x03b5, 0x03b7, 0x03b9, 0x038b, 0x03bf, 0x038d, 0x03c5, 0x03c9, + 0x03b9, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7, + 0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf, + 0x03c0, 0x03c1, 0x03a2, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7, + 0x03c8, 0x03c9, 0x03b9, 0x03c5, 0x03b1, 0x03b5, 0x03b7, 0x03b9, + 0x03c5, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7, + 0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf, + }; + +static const unsigned short gNormalizeTable03c0[] = { + /* U+03c0 */ + 0x03c0, 0x03c1, 0x03c3, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7, + 0x03c8, 0x03c9, 0x03b9, 0x03c5, 0x03bf, 0x03c5, 0x03c9, 0x03d7, + 0x03b2, 0x03b8, 0x03c5, 0x03c5, 0x03c5, 0x03c6, 0x03c0, 0x03d7, + 0x03d9, 0x03d9, 0x03db, 0x03db, 0x03dd, 0x03dd, 0x03df, 0x03df, + 0x03e1, 0x03e1, 0x03e3, 0x03e3, 0x03e5, 0x03e5, 0x03e7, 0x03e7, + 0x03e9, 0x03e9, 0x03eb, 0x03eb, 0x03ed, 0x03ed, 0x03ef, 0x03ef, + 0x03ba, 0x03c1, 0x03c3, 0x03f3, 0x03b8, 0x03b5, 0x03f6, 0x03f8, + 0x03f8, 0x03c3, 0x03fb, 0x03fb, 0x03fc, 0x037b, 0x037c, 0x037d, + }; + +static const unsigned short gNormalizeTable0400[] = { + /* U+0400 */ + 0x0435, 0x0435, 0x0452, 0x0433, 0x0454, 0x0455, 0x0456, 0x0456, + 0x0458, 0x0459, 0x045a, 0x045b, 0x043a, 0x0438, 0x0443, 0x045f, + 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, + 0x0438, 0x0438, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f, + 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, + 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f, + 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, + 0x0438, 0x0438, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f, + }; + +static const unsigned short gNormalizeTable0440[] = { + /* U+0440 */ + 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, + 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f, + 0x0435, 0x0435, 0x0452, 0x0433, 0x0454, 0x0455, 0x0456, 0x0456, + 0x0458, 0x0459, 0x045a, 0x045b, 0x043a, 0x0438, 0x0443, 0x045f, + 0x0461, 0x0461, 0x0463, 0x0463, 0x0465, 0x0465, 0x0467, 0x0467, + 0x0469, 0x0469, 0x046b, 0x046b, 0x046d, 0x046d, 0x046f, 0x046f, + 0x0471, 0x0471, 0x0473, 0x0473, 0x0475, 0x0475, 0x0475, 0x0475, + 0x0479, 0x0479, 0x047b, 0x047b, 0x047d, 0x047d, 0x047f, 0x047f, + }; + +static const unsigned short gNormalizeTable0480[] = { + /* U+0480 */ + 0x0481, 0x0481, 0x0482, 0x0483, 0x0484, 0x0485, 0x0486, 0x0487, + 0x0488, 0x0489, 0x048b, 0x048b, 0x048d, 0x048d, 0x048f, 0x048f, + 0x0491, 0x0491, 0x0493, 0x0493, 0x0495, 0x0495, 0x0497, 0x0497, + 0x0499, 0x0499, 0x049b, 0x049b, 0x049d, 0x049d, 0x049f, 0x049f, + 0x04a1, 0x04a1, 0x04a3, 0x04a3, 0x04a5, 0x04a5, 0x04a7, 0x04a7, + 0x04a9, 0x04a9, 0x04ab, 0x04ab, 0x04ad, 0x04ad, 0x04af, 0x04af, + 0x04b1, 0x04b1, 0x04b3, 0x04b3, 0x04b5, 0x04b5, 0x04b7, 0x04b7, + 0x04b9, 0x04b9, 0x04bb, 0x04bb, 0x04bd, 0x04bd, 0x04bf, 0x04bf, + }; + +static const unsigned short gNormalizeTable04c0[] = { + /* U+04c0 */ + 0x04cf, 0x0436, 0x0436, 0x04c4, 0x04c4, 0x04c6, 0x04c6, 0x04c8, + 0x04c8, 0x04ca, 0x04ca, 0x04cc, 0x04cc, 0x04ce, 0x04ce, 0x04cf, + 0x0430, 0x0430, 0x0430, 0x0430, 0x04d5, 0x04d5, 0x0435, 0x0435, + 0x04d9, 0x04d9, 0x04d9, 0x04d9, 0x0436, 0x0436, 0x0437, 0x0437, + 0x04e1, 0x04e1, 0x0438, 0x0438, 0x0438, 0x0438, 0x043e, 0x043e, + 0x04e9, 0x04e9, 0x04e9, 0x04e9, 0x044d, 0x044d, 0x0443, 0x0443, + 0x0443, 0x0443, 0x0443, 0x0443, 0x0447, 0x0447, 0x04f7, 0x04f7, + 0x044b, 0x044b, 0x04fb, 0x04fb, 0x04fd, 0x04fd, 0x04ff, 0x04ff, + }; + +static const unsigned short gNormalizeTable0500[] = { + /* U+0500 */ + 0x0501, 0x0501, 0x0503, 0x0503, 0x0505, 0x0505, 0x0507, 0x0507, + 0x0509, 0x0509, 0x050b, 0x050b, 0x050d, 0x050d, 0x050f, 0x050f, + 0x0511, 0x0511, 0x0513, 0x0513, 0x0515, 0x0515, 0x0517, 0x0517, + 0x0519, 0x0519, 0x051b, 0x051b, 0x051d, 0x051d, 0x051f, 0x051f, + 0x0521, 0x0521, 0x0523, 0x0523, 0x0525, 0x0525, 0x0526, 0x0527, + 0x0528, 0x0529, 0x052a, 0x052b, 0x052c, 0x052d, 0x052e, 0x052f, + 0x0530, 0x0561, 0x0562, 0x0563, 0x0564, 0x0565, 0x0566, 0x0567, + 0x0568, 0x0569, 0x056a, 0x056b, 0x056c, 0x056d, 0x056e, 0x056f, + }; + +static const unsigned short gNormalizeTable0540[] = { + /* U+0540 */ + 0x0570, 0x0571, 0x0572, 0x0573, 0x0574, 0x0575, 0x0576, 0x0577, + 0x0578, 0x0579, 0x057a, 0x057b, 0x057c, 0x057d, 0x057e, 0x057f, + 0x0580, 0x0581, 0x0582, 0x0583, 0x0584, 0x0585, 0x0586, 0x0557, + 0x0558, 0x0559, 0x055a, 0x055b, 0x055c, 0x055d, 0x055e, 0x055f, + 0x0560, 0x0561, 0x0562, 0x0563, 0x0564, 0x0565, 0x0566, 0x0567, + 0x0568, 0x0569, 0x056a, 0x056b, 0x056c, 0x056d, 0x056e, 0x056f, + 0x0570, 0x0571, 0x0572, 0x0573, 0x0574, 0x0575, 0x0576, 0x0577, + 0x0578, 0x0579, 0x057a, 0x057b, 0x057c, 0x057d, 0x057e, 0x057f, + }; + +static const unsigned short gNormalizeTable0580[] = { + /* U+0580 */ + 0x0580, 0x0581, 0x0582, 0x0583, 0x0584, 0x0585, 0x0586, 0x0565, + 0x0588, 0x0589, 0x058a, 0x058b, 0x058c, 0x058d, 0x058e, 0x058f, + 0x0590, 0x0591, 0x0592, 0x0593, 0x0594, 0x0595, 0x0596, 0x0597, + 0x0598, 0x0599, 0x059a, 0x059b, 0x059c, 0x059d, 0x059e, 0x059f, + 0x05a0, 0x05a1, 0x05a2, 0x05a3, 0x05a4, 0x05a5, 0x05a6, 0x05a7, + 0x05a8, 0x05a9, 0x05aa, 0x05ab, 0x05ac, 0x05ad, 0x05ae, 0x05af, + 0x05b0, 0x05b1, 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, + 0x05b8, 0x05b9, 0x05ba, 0x05bb, 0x05bc, 0x05bd, 0x05be, 0x05bf, + }; + +static const unsigned short gNormalizeTable0600[] = { + /* U+0600 */ + 0x0600, 0x0601, 0x0602, 0x0603, 0x0604, 0x0605, 0x0606, 0x0607, + 0x0608, 0x0609, 0x060a, 0x060b, 0x060c, 0x060d, 0x060e, 0x060f, + 0x0610, 0x0611, 0x0612, 0x0613, 0x0614, 0x0615, 0x0616, 0x0617, + 0x0618, 0x0619, 0x061a, 0x061b, 0x061c, 0x061d, 0x061e, 0x061f, + 0x0620, 0x0621, 0x0627, 0x0627, 0x0648, 0x0627, 0x064a, 0x0627, + 0x0628, 0x0629, 0x062a, 0x062b, 0x062c, 0x062d, 0x062e, 0x062f, + 0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637, + 0x0638, 0x0639, 0x063a, 0x063b, 0x063c, 0x063d, 0x063e, 0x063f, + }; + +static const unsigned short gNormalizeTable0640[] = { + /* U+0640 */ + 0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647, + 0x0648, 0x0649, 0x064a, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, + 0x0650, 0x0651, 0x0652, 0x0653, 0x0654, 0x0655, 0x0656, 0x0657, + 0x0658, 0x0659, 0x065a, 0x065b, 0x065c, 0x065d, 0x065e, 0x065f, + 0x0660, 0x0661, 0x0662, 0x0663, 0x0664, 0x0665, 0x0666, 0x0667, + 0x0668, 0x0669, 0x066a, 0x066b, 0x066c, 0x066d, 0x066e, 0x066f, + 0x0670, 0x0671, 0x0672, 0x0673, 0x0674, 0x0627, 0x0648, 0x06c7, + 0x064a, 0x0679, 0x067a, 0x067b, 0x067c, 0x067d, 0x067e, 0x067f, + }; + +static const unsigned short gNormalizeTable06c0[] = { + /* U+06c0 */ + 0x06d5, 0x06c1, 0x06c1, 0x06c3, 0x06c4, 0x06c5, 0x06c6, 0x06c7, + 0x06c8, 0x06c9, 0x06ca, 0x06cb, 0x06cc, 0x06cd, 0x06ce, 0x06cf, + 0x06d0, 0x06d1, 0x06d2, 0x06d2, 0x06d4, 0x06d5, 0x06d6, 0x06d7, + 0x06d8, 0x06d9, 0x06da, 0x06db, 0x06dc, 0x06dd, 0x06de, 0x06df, + 0x06e0, 0x06e1, 0x06e2, 0x06e3, 0x06e4, 0x06e5, 0x06e6, 0x06e7, + 0x06e8, 0x06e9, 0x06ea, 0x06eb, 0x06ec, 0x06ed, 0x06ee, 0x06ef, + 0x06f0, 0x06f1, 0x06f2, 0x06f3, 0x06f4, 0x06f5, 0x06f6, 0x06f7, + 0x06f8, 0x06f9, 0x06fa, 0x06fb, 0x06fc, 0x06fd, 0x06fe, 0x06ff, + }; + +static const unsigned short gNormalizeTable0900[] = { + /* U+0900 */ + 0x0900, 0x0901, 0x0902, 0x0903, 0x0904, 0x0905, 0x0906, 0x0907, + 0x0908, 0x0909, 0x090a, 0x090b, 0x090c, 0x090d, 0x090e, 0x090f, + 0x0910, 0x0911, 0x0912, 0x0913, 0x0914, 0x0915, 0x0916, 0x0917, + 0x0918, 0x0919, 0x091a, 0x091b, 0x091c, 0x091d, 0x091e, 0x091f, + 0x0920, 0x0921, 0x0922, 0x0923, 0x0924, 0x0925, 0x0926, 0x0927, + 0x0928, 0x0928, 0x092a, 0x092b, 0x092c, 0x092d, 0x092e, 0x092f, + 0x0930, 0x0930, 0x0932, 0x0933, 0x0933, 0x0935, 0x0936, 0x0937, + 0x0938, 0x0939, 0x093a, 0x093b, 0x093c, 0x093d, 0x093e, 0x093f, + }; + +static const unsigned short gNormalizeTable0940[] = { + /* U+0940 */ + 0x0940, 0x0941, 0x0942, 0x0943, 0x0944, 0x0945, 0x0946, 0x0947, + 0x0948, 0x0949, 0x094a, 0x094b, 0x094c, 0x094d, 0x094e, 0x094f, + 0x0950, 0x0951, 0x0952, 0x0953, 0x0954, 0x0955, 0x0956, 0x0957, + 0x0915, 0x0916, 0x0917, 0x091c, 0x0921, 0x0922, 0x092b, 0x092f, + 0x0960, 0x0961, 0x0962, 0x0963, 0x0964, 0x0965, 0x0966, 0x0967, + 0x0968, 0x0969, 0x096a, 0x096b, 0x096c, 0x096d, 0x096e, 0x096f, + 0x0970, 0x0971, 0x0972, 0x0973, 0x0974, 0x0975, 0x0976, 0x0977, + 0x0978, 0x0979, 0x097a, 0x097b, 0x097c, 0x097d, 0x097e, 0x097f, + }; + +static const unsigned short gNormalizeTable09c0[] = { + /* U+09c0 */ + 0x09c0, 0x09c1, 0x09c2, 0x09c3, 0x09c4, 0x09c5, 0x09c6, 0x09c7, + 0x09c8, 0x09c9, 0x09ca, 0x09c7, 0x09c7, 0x09cd, 0x09ce, 0x09cf, + 0x09d0, 0x09d1, 0x09d2, 0x09d3, 0x09d4, 0x09d5, 0x09d6, 0x09d7, + 0x09d8, 0x09d9, 0x09da, 0x09db, 0x09a1, 0x09a2, 0x09de, 0x09af, + 0x09e0, 0x09e1, 0x09e2, 0x09e3, 0x09e4, 0x09e5, 0x09e6, 0x09e7, + 0x09e8, 0x09e9, 0x09ea, 0x09eb, 0x09ec, 0x09ed, 0x09ee, 0x09ef, + 0x09f0, 0x09f1, 0x09f2, 0x09f3, 0x09f4, 0x09f5, 0x09f6, 0x09f7, + 0x09f8, 0x09f9, 0x09fa, 0x09fb, 0x09fc, 0x09fd, 0x09fe, 0x09ff, + }; + +static const unsigned short gNormalizeTable0a00[] = { + /* U+0a00 */ + 0x0a00, 0x0a01, 0x0a02, 0x0a03, 0x0a04, 0x0a05, 0x0a06, 0x0a07, + 0x0a08, 0x0a09, 0x0a0a, 0x0a0b, 0x0a0c, 0x0a0d, 0x0a0e, 0x0a0f, + 0x0a10, 0x0a11, 0x0a12, 0x0a13, 0x0a14, 0x0a15, 0x0a16, 0x0a17, + 0x0a18, 0x0a19, 0x0a1a, 0x0a1b, 0x0a1c, 0x0a1d, 0x0a1e, 0x0a1f, + 0x0a20, 0x0a21, 0x0a22, 0x0a23, 0x0a24, 0x0a25, 0x0a26, 0x0a27, + 0x0a28, 0x0a29, 0x0a2a, 0x0a2b, 0x0a2c, 0x0a2d, 0x0a2e, 0x0a2f, + 0x0a30, 0x0a31, 0x0a32, 0x0a32, 0x0a34, 0x0a35, 0x0a38, 0x0a37, + 0x0a38, 0x0a39, 0x0a3a, 0x0a3b, 0x0a3c, 0x0a3d, 0x0a3e, 0x0a3f, + }; + +static const unsigned short gNormalizeTable0a40[] = { + /* U+0a40 */ + 0x0a40, 0x0a41, 0x0a42, 0x0a43, 0x0a44, 0x0a45, 0x0a46, 0x0a47, + 0x0a48, 0x0a49, 0x0a4a, 0x0a4b, 0x0a4c, 0x0a4d, 0x0a4e, 0x0a4f, + 0x0a50, 0x0a51, 0x0a52, 0x0a53, 0x0a54, 0x0a55, 0x0a56, 0x0a57, + 0x0a58, 0x0a16, 0x0a17, 0x0a1c, 0x0a5c, 0x0a5d, 0x0a2b, 0x0a5f, + 0x0a60, 0x0a61, 0x0a62, 0x0a63, 0x0a64, 0x0a65, 0x0a66, 0x0a67, + 0x0a68, 0x0a69, 0x0a6a, 0x0a6b, 0x0a6c, 0x0a6d, 0x0a6e, 0x0a6f, + 0x0a70, 0x0a71, 0x0a72, 0x0a73, 0x0a74, 0x0a75, 0x0a76, 0x0a77, + 0x0a78, 0x0a79, 0x0a7a, 0x0a7b, 0x0a7c, 0x0a7d, 0x0a7e, 0x0a7f, + }; + +static const unsigned short gNormalizeTable0b40[] = { + /* U+0b40 */ + 0x0b40, 0x0b41, 0x0b42, 0x0b43, 0x0b44, 0x0b45, 0x0b46, 0x0b47, + 0x0b47, 0x0b49, 0x0b4a, 0x0b47, 0x0b47, 0x0b4d, 0x0b4e, 0x0b4f, + 0x0b50, 0x0b51, 0x0b52, 0x0b53, 0x0b54, 0x0b55, 0x0b56, 0x0b57, + 0x0b58, 0x0b59, 0x0b5a, 0x0b5b, 0x0b21, 0x0b22, 0x0b5e, 0x0b5f, + 0x0b60, 0x0b61, 0x0b62, 0x0b63, 0x0b64, 0x0b65, 0x0b66, 0x0b67, + 0x0b68, 0x0b69, 0x0b6a, 0x0b6b, 0x0b6c, 0x0b6d, 0x0b6e, 0x0b6f, + 0x0b70, 0x0b71, 0x0b72, 0x0b73, 0x0b74, 0x0b75, 0x0b76, 0x0b77, + 0x0b78, 0x0b79, 0x0b7a, 0x0b7b, 0x0b7c, 0x0b7d, 0x0b7e, 0x0b7f, + }; + +static const unsigned short gNormalizeTable0b80[] = { + /* U+0b80 */ + 0x0b80, 0x0b81, 0x0b82, 0x0b83, 0x0b84, 0x0b85, 0x0b86, 0x0b87, + 0x0b88, 0x0b89, 0x0b8a, 0x0b8b, 0x0b8c, 0x0b8d, 0x0b8e, 0x0b8f, + 0x0b90, 0x0b91, 0x0b92, 0x0b93, 0x0b92, 0x0b95, 0x0b96, 0x0b97, + 0x0b98, 0x0b99, 0x0b9a, 0x0b9b, 0x0b9c, 0x0b9d, 0x0b9e, 0x0b9f, + 0x0ba0, 0x0ba1, 0x0ba2, 0x0ba3, 0x0ba4, 0x0ba5, 0x0ba6, 0x0ba7, + 0x0ba8, 0x0ba9, 0x0baa, 0x0bab, 0x0bac, 0x0bad, 0x0bae, 0x0baf, + 0x0bb0, 0x0bb1, 0x0bb2, 0x0bb3, 0x0bb4, 0x0bb5, 0x0bb6, 0x0bb7, + 0x0bb8, 0x0bb9, 0x0bba, 0x0bbb, 0x0bbc, 0x0bbd, 0x0bbe, 0x0bbf, + }; + +static const unsigned short gNormalizeTable0bc0[] = { + /* U+0bc0 */ + 0x0bc0, 0x0bc1, 0x0bc2, 0x0bc3, 0x0bc4, 0x0bc5, 0x0bc6, 0x0bc7, + 0x0bc8, 0x0bc9, 0x0bc6, 0x0bc7, 0x0bc6, 0x0bcd, 0x0bce, 0x0bcf, + 0x0bd0, 0x0bd1, 0x0bd2, 0x0bd3, 0x0bd4, 0x0bd5, 0x0bd6, 0x0bd7, + 0x0bd8, 0x0bd9, 0x0bda, 0x0bdb, 0x0bdc, 0x0bdd, 0x0bde, 0x0bdf, + 0x0be0, 0x0be1, 0x0be2, 0x0be3, 0x0be4, 0x0be5, 0x0be6, 0x0be7, + 0x0be8, 0x0be9, 0x0bea, 0x0beb, 0x0bec, 0x0bed, 0x0bee, 0x0bef, + 0x0bf0, 0x0bf1, 0x0bf2, 0x0bf3, 0x0bf4, 0x0bf5, 0x0bf6, 0x0bf7, + 0x0bf8, 0x0bf9, 0x0bfa, 0x0bfb, 0x0bfc, 0x0bfd, 0x0bfe, 0x0bff, + }; + +static const unsigned short gNormalizeTable0c40[] = { + /* U+0c40 */ + 0x0c40, 0x0c41, 0x0c42, 0x0c43, 0x0c44, 0x0c45, 0x0c46, 0x0c47, + 0x0c46, 0x0c49, 0x0c4a, 0x0c4b, 0x0c4c, 0x0c4d, 0x0c4e, 0x0c4f, + 0x0c50, 0x0c51, 0x0c52, 0x0c53, 0x0c54, 0x0c55, 0x0c56, 0x0c57, + 0x0c58, 0x0c59, 0x0c5a, 0x0c5b, 0x0c5c, 0x0c5d, 0x0c5e, 0x0c5f, + 0x0c60, 0x0c61, 0x0c62, 0x0c63, 0x0c64, 0x0c65, 0x0c66, 0x0c67, + 0x0c68, 0x0c69, 0x0c6a, 0x0c6b, 0x0c6c, 0x0c6d, 0x0c6e, 0x0c6f, + 0x0c70, 0x0c71, 0x0c72, 0x0c73, 0x0c74, 0x0c75, 0x0c76, 0x0c77, + 0x0c78, 0x0c79, 0x0c7a, 0x0c7b, 0x0c7c, 0x0c7d, 0x0c7e, 0x0c7f, + }; + +static const unsigned short gNormalizeTable0cc0[] = { + /* U+0cc0 */ + 0x0cbf, 0x0cc1, 0x0cc2, 0x0cc3, 0x0cc4, 0x0cc5, 0x0cc6, 0x0cc6, + 0x0cc6, 0x0cc9, 0x0cc6, 0x0cc6, 0x0ccc, 0x0ccd, 0x0cce, 0x0ccf, + 0x0cd0, 0x0cd1, 0x0cd2, 0x0cd3, 0x0cd4, 0x0cd5, 0x0cd6, 0x0cd7, + 0x0cd8, 0x0cd9, 0x0cda, 0x0cdb, 0x0cdc, 0x0cdd, 0x0cde, 0x0cdf, + 0x0ce0, 0x0ce1, 0x0ce2, 0x0ce3, 0x0ce4, 0x0ce5, 0x0ce6, 0x0ce7, + 0x0ce8, 0x0ce9, 0x0cea, 0x0ceb, 0x0cec, 0x0ced, 0x0cee, 0x0cef, + 0x0cf0, 0x0cf1, 0x0cf2, 0x0cf3, 0x0cf4, 0x0cf5, 0x0cf6, 0x0cf7, + 0x0cf8, 0x0cf9, 0x0cfa, 0x0cfb, 0x0cfc, 0x0cfd, 0x0cfe, 0x0cff, + }; + +static const unsigned short gNormalizeTable0d40[] = { + /* U+0d40 */ + 0x0d40, 0x0d41, 0x0d42, 0x0d43, 0x0d44, 0x0d45, 0x0d46, 0x0d47, + 0x0d48, 0x0d49, 0x0d46, 0x0d47, 0x0d46, 0x0d4d, 0x0d4e, 0x0d4f, + 0x0d50, 0x0d51, 0x0d52, 0x0d53, 0x0d54, 0x0d55, 0x0d56, 0x0d57, + 0x0d58, 0x0d59, 0x0d5a, 0x0d5b, 0x0d5c, 0x0d5d, 0x0d5e, 0x0d5f, + 0x0d60, 0x0d61, 0x0d62, 0x0d63, 0x0d64, 0x0d65, 0x0d66, 0x0d67, + 0x0d68, 0x0d69, 0x0d6a, 0x0d6b, 0x0d6c, 0x0d6d, 0x0d6e, 0x0d6f, + 0x0d70, 0x0d71, 0x0d72, 0x0d73, 0x0d74, 0x0d75, 0x0d76, 0x0d77, + 0x0d78, 0x0d79, 0x0d7a, 0x0d7b, 0x0d7c, 0x0d7d, 0x0d7e, 0x0d7f, + }; + +static const unsigned short gNormalizeTable0dc0[] = { + /* U+0dc0 */ + 0x0dc0, 0x0dc1, 0x0dc2, 0x0dc3, 0x0dc4, 0x0dc5, 0x0dc6, 0x0dc7, + 0x0dc8, 0x0dc9, 0x0dca, 0x0dcb, 0x0dcc, 0x0dcd, 0x0dce, 0x0dcf, + 0x0dd0, 0x0dd1, 0x0dd2, 0x0dd3, 0x0dd4, 0x0dd5, 0x0dd6, 0x0dd7, + 0x0dd8, 0x0dd9, 0x0dd9, 0x0ddb, 0x0dd9, 0x0dd9, 0x0dd9, 0x0ddf, + 0x0de0, 0x0de1, 0x0de2, 0x0de3, 0x0de4, 0x0de5, 0x0de6, 0x0de7, + 0x0de8, 0x0de9, 0x0dea, 0x0deb, 0x0dec, 0x0ded, 0x0dee, 0x0def, + 0x0df0, 0x0df1, 0x0df2, 0x0df3, 0x0df4, 0x0df5, 0x0df6, 0x0df7, + 0x0df8, 0x0df9, 0x0dfa, 0x0dfb, 0x0dfc, 0x0dfd, 0x0dfe, 0x0dff, + }; + +static const unsigned short gNormalizeTable0e00[] = { + /* U+0e00 */ + 0x0e00, 0x0e01, 0x0e02, 0x0e03, 0x0e04, 0x0e05, 0x0e06, 0x0e07, + 0x0e08, 0x0e09, 0x0e0a, 0x0e0b, 0x0e0c, 0x0e0d, 0x0e0e, 0x0e0f, + 0x0e10, 0x0e11, 0x0e12, 0x0e13, 0x0e14, 0x0e15, 0x0e16, 0x0e17, + 0x0e18, 0x0e19, 0x0e1a, 0x0e1b, 0x0e1c, 0x0e1d, 0x0e1e, 0x0e1f, + 0x0e20, 0x0e21, 0x0e22, 0x0e23, 0x0e24, 0x0e25, 0x0e26, 0x0e27, + 0x0e28, 0x0e29, 0x0e2a, 0x0e2b, 0x0e2c, 0x0e2d, 0x0e2e, 0x0e2f, + 0x0e30, 0x0e31, 0x0e32, 0x0e4d, 0x0e34, 0x0e35, 0x0e36, 0x0e37, + 0x0e38, 0x0e39, 0x0e3a, 0x0e3b, 0x0e3c, 0x0e3d, 0x0e3e, 0x0e3f, + }; + +static const unsigned short gNormalizeTable0e80[] = { + /* U+0e80 */ + 0x0e80, 0x0e81, 0x0e82, 0x0e83, 0x0e84, 0x0e85, 0x0e86, 0x0e87, + 0x0e88, 0x0e89, 0x0e8a, 0x0e8b, 0x0e8c, 0x0e8d, 0x0e8e, 0x0e8f, + 0x0e90, 0x0e91, 0x0e92, 0x0e93, 0x0e94, 0x0e95, 0x0e96, 0x0e97, + 0x0e98, 0x0e99, 0x0e9a, 0x0e9b, 0x0e9c, 0x0e9d, 0x0e9e, 0x0e9f, + 0x0ea0, 0x0ea1, 0x0ea2, 0x0ea3, 0x0ea4, 0x0ea5, 0x0ea6, 0x0ea7, + 0x0ea8, 0x0ea9, 0x0eaa, 0x0eab, 0x0eac, 0x0ead, 0x0eae, 0x0eaf, + 0x0eb0, 0x0eb1, 0x0eb2, 0x0ecd, 0x0eb4, 0x0eb5, 0x0eb6, 0x0eb7, + 0x0eb8, 0x0eb9, 0x0eba, 0x0ebb, 0x0ebc, 0x0ebd, 0x0ebe, 0x0ebf, + }; + +static const unsigned short gNormalizeTable0ec0[] = { + /* U+0ec0 */ + 0x0ec0, 0x0ec1, 0x0ec2, 0x0ec3, 0x0ec4, 0x0ec5, 0x0ec6, 0x0ec7, + 0x0ec8, 0x0ec9, 0x0eca, 0x0ecb, 0x0ecc, 0x0ecd, 0x0ece, 0x0ecf, + 0x0ed0, 0x0ed1, 0x0ed2, 0x0ed3, 0x0ed4, 0x0ed5, 0x0ed6, 0x0ed7, + 0x0ed8, 0x0ed9, 0x0eda, 0x0edb, 0x0eab, 0x0eab, 0x0ede, 0x0edf, + 0x0ee0, 0x0ee1, 0x0ee2, 0x0ee3, 0x0ee4, 0x0ee5, 0x0ee6, 0x0ee7, + 0x0ee8, 0x0ee9, 0x0eea, 0x0eeb, 0x0eec, 0x0eed, 0x0eee, 0x0eef, + 0x0ef0, 0x0ef1, 0x0ef2, 0x0ef3, 0x0ef4, 0x0ef5, 0x0ef6, 0x0ef7, + 0x0ef8, 0x0ef9, 0x0efa, 0x0efb, 0x0efc, 0x0efd, 0x0efe, 0x0eff, + }; + +static const unsigned short gNormalizeTable0f00[] = { + /* U+0f00 */ + 0x0f00, 0x0f01, 0x0f02, 0x0f03, 0x0f04, 0x0f05, 0x0f06, 0x0f07, + 0x0f08, 0x0f09, 0x0f0a, 0x0f0b, 0x0f0b, 0x0f0d, 0x0f0e, 0x0f0f, + 0x0f10, 0x0f11, 0x0f12, 0x0f13, 0x0f14, 0x0f15, 0x0f16, 0x0f17, + 0x0f18, 0x0f19, 0x0f1a, 0x0f1b, 0x0f1c, 0x0f1d, 0x0f1e, 0x0f1f, + 0x0f20, 0x0f21, 0x0f22, 0x0f23, 0x0f24, 0x0f25, 0x0f26, 0x0f27, + 0x0f28, 0x0f29, 0x0f2a, 0x0f2b, 0x0f2c, 0x0f2d, 0x0f2e, 0x0f2f, + 0x0f30, 0x0f31, 0x0f32, 0x0f33, 0x0f34, 0x0f35, 0x0f36, 0x0f37, + 0x0f38, 0x0f39, 0x0f3a, 0x0f3b, 0x0f3c, 0x0f3d, 0x0f3e, 0x0f3f, + }; + +static const unsigned short gNormalizeTable0f40[] = { + /* U+0f40 */ + 0x0f40, 0x0f41, 0x0f42, 0x0f42, 0x0f44, 0x0f45, 0x0f46, 0x0f47, + 0x0f48, 0x0f49, 0x0f4a, 0x0f4b, 0x0f4c, 0x0f4c, 0x0f4e, 0x0f4f, + 0x0f50, 0x0f51, 0x0f51, 0x0f53, 0x0f54, 0x0f55, 0x0f56, 0x0f56, + 0x0f58, 0x0f59, 0x0f5a, 0x0f5b, 0x0f5b, 0x0f5d, 0x0f5e, 0x0f5f, + 0x0f60, 0x0f61, 0x0f62, 0x0f63, 0x0f64, 0x0f65, 0x0f66, 0x0f67, + 0x0f68, 0x0f40, 0x0f6a, 0x0f6b, 0x0f6c, 0x0f6d, 0x0f6e, 0x0f6f, + 0x0f70, 0x0f71, 0x0f72, 0x0f71, 0x0f74, 0x0f71, 0x0fb2, 0x0fb2, + 0x0fb3, 0x0fb3, 0x0f7a, 0x0f7b, 0x0f7c, 0x0f7d, 0x0f7e, 0x0f7f, + }; + +static const unsigned short gNormalizeTable0f80[] = { + /* U+0f80 */ + 0x0f80, 0x0f71, 0x0f82, 0x0f83, 0x0f84, 0x0f85, 0x0f86, 0x0f87, + 0x0f88, 0x0f89, 0x0f8a, 0x0f8b, 0x0f8c, 0x0f8d, 0x0f8e, 0x0f8f, + 0x0f90, 0x0f91, 0x0f92, 0x0f92, 0x0f94, 0x0f95, 0x0f96, 0x0f97, + 0x0f98, 0x0f99, 0x0f9a, 0x0f9b, 0x0f9c, 0x0f9c, 0x0f9e, 0x0f9f, + 0x0fa0, 0x0fa1, 0x0fa1, 0x0fa3, 0x0fa4, 0x0fa5, 0x0fa6, 0x0fa6, + 0x0fa8, 0x0fa9, 0x0faa, 0x0fab, 0x0fab, 0x0fad, 0x0fae, 0x0faf, + 0x0fb0, 0x0fb1, 0x0fb2, 0x0fb3, 0x0fb4, 0x0fb5, 0x0fb6, 0x0fb7, + 0x0fb8, 0x0f90, 0x0fba, 0x0fbb, 0x0fbc, 0x0fbd, 0x0fbe, 0x0fbf, + }; + +static const unsigned short gNormalizeTable1000[] = { + /* U+1000 */ + 0x1000, 0x1001, 0x1002, 0x1003, 0x1004, 0x1005, 0x1006, 0x1007, + 0x1008, 0x1009, 0x100a, 0x100b, 0x100c, 0x100d, 0x100e, 0x100f, + 0x1010, 0x1011, 0x1012, 0x1013, 0x1014, 0x1015, 0x1016, 0x1017, + 0x1018, 0x1019, 0x101a, 0x101b, 0x101c, 0x101d, 0x101e, 0x101f, + 0x1020, 0x1021, 0x1022, 0x1023, 0x1024, 0x1025, 0x1025, 0x1027, + 0x1028, 0x1029, 0x102a, 0x102b, 0x102c, 0x102d, 0x102e, 0x102f, + 0x1030, 0x1031, 0x1032, 0x1033, 0x1034, 0x1035, 0x1036, 0x1037, + 0x1038, 0x1039, 0x103a, 0x103b, 0x103c, 0x103d, 0x103e, 0x103f, + }; + +static const unsigned short gNormalizeTable1080[] = { + /* U+1080 */ + 0x1080, 0x1081, 0x1082, 0x1083, 0x1084, 0x1085, 0x1086, 0x1087, + 0x1088, 0x1089, 0x108a, 0x108b, 0x108c, 0x108d, 0x108e, 0x108f, + 0x1090, 0x1091, 0x1092, 0x1093, 0x1094, 0x1095, 0x1096, 0x1097, + 0x1098, 0x1099, 0x109a, 0x109b, 0x109c, 0x109d, 0x109e, 0x109f, + 0x2d00, 0x2d01, 0x2d02, 0x2d03, 0x2d04, 0x2d05, 0x2d06, 0x2d07, + 0x2d08, 0x2d09, 0x2d0a, 0x2d0b, 0x2d0c, 0x2d0d, 0x2d0e, 0x2d0f, + 0x2d10, 0x2d11, 0x2d12, 0x2d13, 0x2d14, 0x2d15, 0x2d16, 0x2d17, + 0x2d18, 0x2d19, 0x2d1a, 0x2d1b, 0x2d1c, 0x2d1d, 0x2d1e, 0x2d1f, + }; + +static const unsigned short gNormalizeTable10c0[] = { + /* U+10c0 */ + 0x2d20, 0x2d21, 0x2d22, 0x2d23, 0x2d24, 0x2d25, 0x10c6, 0x10c7, + 0x10c8, 0x10c9, 0x10ca, 0x10cb, 0x10cc, 0x10cd, 0x10ce, 0x10cf, + 0x10d0, 0x10d1, 0x10d2, 0x10d3, 0x10d4, 0x10d5, 0x10d6, 0x10d7, + 0x10d8, 0x10d9, 0x10da, 0x10db, 0x10dc, 0x10dd, 0x10de, 0x10df, + 0x10e0, 0x10e1, 0x10e2, 0x10e3, 0x10e4, 0x10e5, 0x10e6, 0x10e7, + 0x10e8, 0x10e9, 0x10ea, 0x10eb, 0x10ec, 0x10ed, 0x10ee, 0x10ef, + 0x10f0, 0x10f1, 0x10f2, 0x10f3, 0x10f4, 0x10f5, 0x10f6, 0x10f7, + 0x10f8, 0x10f9, 0x10fa, 0x10fb, 0x10dc, 0x10fd, 0x10fe, 0x10ff, + }; + +static const unsigned short gNormalizeTable1140[] = { + /* U+1140 */ + 0x1140, 0x1141, 0x1142, 0x1143, 0x1144, 0x1145, 0x1146, 0x1147, + 0x1148, 0x1149, 0x114a, 0x114b, 0x114c, 0x114d, 0x114e, 0x114f, + 0x1150, 0x1151, 0x1152, 0x1153, 0x1154, 0x1155, 0x1156, 0x1157, + 0x1158, 0x1159, 0x115a, 0x115b, 0x115c, 0x115d, 0x115e, 0x0020, + 0x0020, 0x1161, 0x1162, 0x1163, 0x1164, 0x1165, 0x1166, 0x1167, + 0x1168, 0x1169, 0x116a, 0x116b, 0x116c, 0x116d, 0x116e, 0x116f, + 0x1170, 0x1171, 0x1172, 0x1173, 0x1174, 0x1175, 0x1176, 0x1177, + 0x1178, 0x1179, 0x117a, 0x117b, 0x117c, 0x117d, 0x117e, 0x117f, + }; + +static const unsigned short gNormalizeTable1780[] = { + /* U+1780 */ + 0x1780, 0x1781, 0x1782, 0x1783, 0x1784, 0x1785, 0x1786, 0x1787, + 0x1788, 0x1789, 0x178a, 0x178b, 0x178c, 0x178d, 0x178e, 0x178f, + 0x1790, 0x1791, 0x1792, 0x1793, 0x1794, 0x1795, 0x1796, 0x1797, + 0x1798, 0x1799, 0x179a, 0x179b, 0x179c, 0x179d, 0x179e, 0x179f, + 0x17a0, 0x17a1, 0x17a2, 0x17a3, 0x17a4, 0x17a5, 0x17a6, 0x17a7, + 0x17a8, 0x17a9, 0x17aa, 0x17ab, 0x17ac, 0x17ad, 0x17ae, 0x17af, + 0x17b0, 0x17b1, 0x17b2, 0x17b3, 0x0020, 0x0020, 0x17b6, 0x17b7, + 0x17b8, 0x17b9, 0x17ba, 0x17bb, 0x17bc, 0x17bd, 0x17be, 0x17bf, + }; + +static const unsigned short gNormalizeTable1800[] = { + /* U+1800 */ + 0x1800, 0x1801, 0x1802, 0x1803, 0x1804, 0x1805, 0x1806, 0x1807, + 0x1808, 0x1809, 0x180a, 0x0020, 0x0020, 0x0020, 0x180e, 0x180f, + 0x1810, 0x1811, 0x1812, 0x1813, 0x1814, 0x1815, 0x1816, 0x1817, + 0x1818, 0x1819, 0x181a, 0x181b, 0x181c, 0x181d, 0x181e, 0x181f, + 0x1820, 0x1821, 0x1822, 0x1823, 0x1824, 0x1825, 0x1826, 0x1827, + 0x1828, 0x1829, 0x182a, 0x182b, 0x182c, 0x182d, 0x182e, 0x182f, + 0x1830, 0x1831, 0x1832, 0x1833, 0x1834, 0x1835, 0x1836, 0x1837, + 0x1838, 0x1839, 0x183a, 0x183b, 0x183c, 0x183d, 0x183e, 0x183f, + }; + +static const unsigned short gNormalizeTable1b00[] = { + /* U+1b00 */ + 0x1b00, 0x1b01, 0x1b02, 0x1b03, 0x1b04, 0x1b05, 0x1b05, 0x1b07, + 0x1b07, 0x1b09, 0x1b09, 0x1b0b, 0x1b0b, 0x1b0d, 0x1b0d, 0x1b0f, + 0x1b10, 0x1b11, 0x1b11, 0x1b13, 0x1b14, 0x1b15, 0x1b16, 0x1b17, + 0x1b18, 0x1b19, 0x1b1a, 0x1b1b, 0x1b1c, 0x1b1d, 0x1b1e, 0x1b1f, + 0x1b20, 0x1b21, 0x1b22, 0x1b23, 0x1b24, 0x1b25, 0x1b26, 0x1b27, + 0x1b28, 0x1b29, 0x1b2a, 0x1b2b, 0x1b2c, 0x1b2d, 0x1b2e, 0x1b2f, + 0x1b30, 0x1b31, 0x1b32, 0x1b33, 0x1b34, 0x1b35, 0x1b36, 0x1b37, + 0x1b38, 0x1b39, 0x1b3a, 0x1b3a, 0x1b3c, 0x1b3c, 0x1b3e, 0x1b3f, + }; + +static const unsigned short gNormalizeTable1b40[] = { + /* U+1b40 */ + 0x1b3e, 0x1b3f, 0x1b42, 0x1b42, 0x1b44, 0x1b45, 0x1b46, 0x1b47, + 0x1b48, 0x1b49, 0x1b4a, 0x1b4b, 0x1b4c, 0x1b4d, 0x1b4e, 0x1b4f, + 0x1b50, 0x1b51, 0x1b52, 0x1b53, 0x1b54, 0x1b55, 0x1b56, 0x1b57, + 0x1b58, 0x1b59, 0x1b5a, 0x1b5b, 0x1b5c, 0x1b5d, 0x1b5e, 0x1b5f, + 0x1b60, 0x1b61, 0x1b62, 0x1b63, 0x1b64, 0x1b65, 0x1b66, 0x1b67, + 0x1b68, 0x1b69, 0x1b6a, 0x1b6b, 0x1b6c, 0x1b6d, 0x1b6e, 0x1b6f, + 0x1b70, 0x1b71, 0x1b72, 0x1b73, 0x1b74, 0x1b75, 0x1b76, 0x1b77, + 0x1b78, 0x1b79, 0x1b7a, 0x1b7b, 0x1b7c, 0x1b7d, 0x1b7e, 0x1b7f, + }; + +static const unsigned short gNormalizeTable1d00[] = { + /* U+1d00 */ + 0x1d00, 0x1d01, 0x1d02, 0x1d03, 0x1d04, 0x1d05, 0x1d06, 0x1d07, + 0x1d08, 0x1d09, 0x1d0a, 0x1d0b, 0x1d0c, 0x1d0d, 0x1d0e, 0x1d0f, + 0x1d10, 0x1d11, 0x1d12, 0x1d13, 0x1d14, 0x1d15, 0x1d16, 0x1d17, + 0x1d18, 0x1d19, 0x1d1a, 0x1d1b, 0x1d1c, 0x1d1d, 0x1d1e, 0x1d1f, + 0x1d20, 0x1d21, 0x1d22, 0x1d23, 0x1d24, 0x1d25, 0x1d26, 0x1d27, + 0x1d28, 0x1d29, 0x1d2a, 0x1d2b, 0x0061, 0x00e6, 0x0062, 0x1d2f, + 0x0064, 0x0065, 0x01dd, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x1d3b, 0x006f, 0x0223, 0x0070, 0x0072, + }; + +static const unsigned short gNormalizeTable1d40[] = { + /* U+1d40 */ + 0x0074, 0x0075, 0x0077, 0x0061, 0x0250, 0x0251, 0x1d02, 0x0062, + 0x0064, 0x0065, 0x0259, 0x025b, 0x025c, 0x0067, 0x1d4e, 0x006b, + 0x006d, 0x014b, 0x006f, 0x0254, 0x1d16, 0x1d17, 0x0070, 0x0074, + 0x0075, 0x1d1d, 0x026f, 0x0076, 0x1d25, 0x03b2, 0x03b3, 0x03b4, + 0x03c6, 0x03c7, 0x0069, 0x0072, 0x0075, 0x0076, 0x03b2, 0x03b3, + 0x03c1, 0x03c6, 0x03c7, 0x1d6b, 0x1d6c, 0x1d6d, 0x1d6e, 0x1d6f, + 0x1d70, 0x1d71, 0x1d72, 0x1d73, 0x1d74, 0x1d75, 0x1d76, 0x1d77, + 0x043d, 0x1d79, 0x1d7a, 0x1d7b, 0x1d7c, 0x1d7d, 0x1d7e, 0x1d7f, + }; + +static const unsigned short gNormalizeTable1d80[] = { + /* U+1d80 */ + 0x1d80, 0x1d81, 0x1d82, 0x1d83, 0x1d84, 0x1d85, 0x1d86, 0x1d87, + 0x1d88, 0x1d89, 0x1d8a, 0x1d8b, 0x1d8c, 0x1d8d, 0x1d8e, 0x1d8f, + 0x1d90, 0x1d91, 0x1d92, 0x1d93, 0x1d94, 0x1d95, 0x1d96, 0x1d97, + 0x1d98, 0x1d99, 0x1d9a, 0x0252, 0x0063, 0x0255, 0x00f0, 0x025c, + 0x0066, 0x025f, 0x0261, 0x0265, 0x0268, 0x0269, 0x026a, 0x1d7b, + 0x029d, 0x026d, 0x1d85, 0x029f, 0x0271, 0x0270, 0x0272, 0x0273, + 0x0274, 0x0275, 0x0278, 0x0282, 0x0283, 0x01ab, 0x0289, 0x028a, + 0x1d1c, 0x028b, 0x028c, 0x007a, 0x0290, 0x0291, 0x0292, 0x03b8, + }; + +static const unsigned short gNormalizeTable1e00[] = { + /* U+1e00 */ + 0x0061, 0x0061, 0x0062, 0x0062, 0x0062, 0x0062, 0x0062, 0x0062, + 0x0063, 0x0063, 0x0064, 0x0064, 0x0064, 0x0064, 0x0064, 0x0064, + 0x0064, 0x0064, 0x0064, 0x0064, 0x0065, 0x0065, 0x0065, 0x0065, + 0x0065, 0x0065, 0x0065, 0x0065, 0x0065, 0x0065, 0x0066, 0x0066, + 0x0067, 0x0067, 0x0068, 0x0068, 0x0068, 0x0068, 0x0068, 0x0068, + 0x0068, 0x0068, 0x0068, 0x0068, 0x0069, 0x0069, 0x0069, 0x0069, + 0x006b, 0x006b, 0x006b, 0x006b, 0x006b, 0x006b, 0x006c, 0x006c, + 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006d, 0x006d, + }; + +static const unsigned short gNormalizeTable1e40[] = { + /* U+1e40 */ + 0x006d, 0x006d, 0x006d, 0x006d, 0x006e, 0x006e, 0x006e, 0x006e, + 0x006e, 0x006e, 0x006e, 0x006e, 0x006f, 0x006f, 0x006f, 0x006f, + 0x006f, 0x006f, 0x006f, 0x006f, 0x0070, 0x0070, 0x0070, 0x0070, + 0x0072, 0x0072, 0x0072, 0x0072, 0x0072, 0x0072, 0x0072, 0x0072, + 0x0073, 0x0073, 0x0073, 0x0073, 0x0073, 0x0073, 0x0073, 0x0073, + 0x0073, 0x0073, 0x0074, 0x0074, 0x0074, 0x0074, 0x0074, 0x0074, + 0x0074, 0x0074, 0x0075, 0x0075, 0x0075, 0x0075, 0x0075, 0x0075, + 0x0075, 0x0075, 0x0075, 0x0075, 0x0076, 0x0076, 0x0076, 0x0076, + }; + +static const unsigned short gNormalizeTable1e80[] = { + /* U+1e80 */ + 0x0077, 0x0077, 0x0077, 0x0077, 0x0077, 0x0077, 0x0077, 0x0077, + 0x0077, 0x0077, 0x0078, 0x0078, 0x0078, 0x0078, 0x0079, 0x0079, + 0x007a, 0x007a, 0x007a, 0x007a, 0x007a, 0x007a, 0x0068, 0x0074, + 0x0077, 0x0079, 0x0061, 0x0073, 0x1e9c, 0x1e9d, 0x0073, 0x1e9f, + 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, + 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, + 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, + 0x0065, 0x0065, 0x0065, 0x0065, 0x0065, 0x0065, 0x0065, 0x0065, + }; + +static const unsigned short gNormalizeTable1ec0[] = { + /* U+1ec0 */ + 0x0065, 0x0065, 0x0065, 0x0065, 0x0065, 0x0065, 0x0065, 0x0065, + 0x0069, 0x0069, 0x0069, 0x0069, 0x006f, 0x006f, 0x006f, 0x006f, + 0x006f, 0x006f, 0x006f, 0x006f, 0x006f, 0x006f, 0x006f, 0x006f, + 0x006f, 0x006f, 0x006f, 0x006f, 0x006f, 0x006f, 0x006f, 0x006f, + 0x006f, 0x006f, 0x006f, 0x006f, 0x0075, 0x0075, 0x0075, 0x0075, + 0x0075, 0x0075, 0x0075, 0x0075, 0x0075, 0x0075, 0x0075, 0x0075, + 0x0075, 0x0075, 0x0079, 0x0079, 0x0079, 0x0079, 0x0079, 0x0079, + 0x0079, 0x0079, 0x1efb, 0x1efb, 0x1efd, 0x1efd, 0x1eff, 0x1eff, + }; + +static const unsigned short gNormalizeTable1f00[] = { + /* U+1f00 */ + 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, + 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, + 0x03b5, 0x03b5, 0x03b5, 0x03b5, 0x03b5, 0x03b5, 0x1f16, 0x1f17, + 0x03b5, 0x03b5, 0x03b5, 0x03b5, 0x03b5, 0x03b5, 0x1f1e, 0x1f1f, + 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, + 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, + 0x03b9, 0x03b9, 0x03b9, 0x03b9, 0x03b9, 0x03b9, 0x03b9, 0x03b9, + 0x03b9, 0x03b9, 0x03b9, 0x03b9, 0x03b9, 0x03b9, 0x03b9, 0x03b9, + }; + +static const unsigned short gNormalizeTable1f40[] = { + /* U+1f40 */ + 0x03bf, 0x03bf, 0x03bf, 0x03bf, 0x03bf, 0x03bf, 0x1f46, 0x1f47, + 0x03bf, 0x03bf, 0x03bf, 0x03bf, 0x03bf, 0x03bf, 0x1f4e, 0x1f4f, + 0x03c5, 0x03c5, 0x03c5, 0x03c5, 0x03c5, 0x03c5, 0x03c5, 0x03c5, + 0x1f58, 0x03c5, 0x1f5a, 0x03c5, 0x1f5c, 0x03c5, 0x1f5e, 0x03c5, + 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, + 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, + 0x03b1, 0x03b1, 0x03b5, 0x03b5, 0x03b7, 0x03b7, 0x03b9, 0x03b9, + 0x03bf, 0x03bf, 0x03c5, 0x03c5, 0x03c9, 0x03c9, 0x1f7e, 0x1f7f, + }; + +static const unsigned short gNormalizeTable1f80[] = { + /* U+1f80 */ + 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, + 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, + 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, + 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x03b7, + 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, + 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, 0x03c9, + 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x1fb5, 0x03b1, 0x03b1, + 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x03b1, 0x0020, 0x03b9, 0x0020, + }; + +static const unsigned short gNormalizeTable1fc0[] = { + /* U+1fc0 */ + 0x0020, 0x0020, 0x03b7, 0x03b7, 0x03b7, 0x1fc5, 0x03b7, 0x03b7, + 0x03b5, 0x03b5, 0x03b7, 0x03b7, 0x03b7, 0x0020, 0x0020, 0x0020, + 0x03b9, 0x03b9, 0x03b9, 0x03b9, 0x1fd4, 0x1fd5, 0x03b9, 0x03b9, + 0x03b9, 0x03b9, 0x03b9, 0x03b9, 0x1fdc, 0x0020, 0x0020, 0x0020, + 0x03c5, 0x03c5, 0x03c5, 0x03c5, 0x03c1, 0x03c1, 0x03c5, 0x03c5, + 0x03c5, 0x03c5, 0x03c5, 0x03c5, 0x03c1, 0x0020, 0x0020, 0x0060, + 0x1ff0, 0x1ff1, 0x03c9, 0x03c9, 0x03c9, 0x1ff5, 0x03c9, 0x03c9, + 0x03bf, 0x03bf, 0x03c9, 0x03c9, 0x03c9, 0x0020, 0x0020, 0x1fff, + }; + +static const unsigned short gNormalizeTable2000[] = { + /* U+2000 */ + 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, + 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, + 0x2010, 0x2010, 0x2012, 0x2013, 0x2014, 0x2015, 0x2016, 0x0020, + 0x2018, 0x2019, 0x201a, 0x201b, 0x201c, 0x201d, 0x201e, 0x201f, + 0x2020, 0x2021, 0x2022, 0x2023, 0x002e, 0x002e, 0x002e, 0x2027, + 0x2028, 0x2029, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, + 0x2030, 0x2031, 0x2032, 0x2032, 0x2032, 0x2035, 0x2035, 0x2035, + 0x2038, 0x2039, 0x203a, 0x203b, 0x0021, 0x203d, 0x0020, 0x203f, + }; + +static const unsigned short gNormalizeTable2040[] = { + /* U+2040 */ + 0x2040, 0x2041, 0x2042, 0x2043, 0x2044, 0x2045, 0x2046, 0x003f, + 0x003f, 0x0021, 0x204a, 0x204b, 0x204c, 0x204d, 0x204e, 0x204f, + 0x2050, 0x2051, 0x2052, 0x2053, 0x2054, 0x2055, 0x2056, 0x2032, + 0x2058, 0x2059, 0x205a, 0x205b, 0x205c, 0x205d, 0x205e, 0x0020, + 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, + 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, + 0x0030, 0x0069, 0x2072, 0x2073, 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x002b, 0x2212, 0x003d, 0x0028, 0x0029, 0x006e, + }; + +static const unsigned short gNormalizeTable2080[] = { + /* U+2080 */ + 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x002b, 0x2212, 0x003d, 0x0028, 0x0029, 0x208f, + 0x0061, 0x0065, 0x006f, 0x0078, 0x0259, 0x2095, 0x2096, 0x2097, + 0x2098, 0x2099, 0x209a, 0x209b, 0x209c, 0x209d, 0x209e, 0x209f, + 0x20a0, 0x20a1, 0x20a2, 0x20a3, 0x20a4, 0x20a5, 0x20a6, 0x20a7, + 0x0072, 0x20a9, 0x20aa, 0x20ab, 0x20ac, 0x20ad, 0x20ae, 0x20af, + 0x20b0, 0x20b1, 0x20b2, 0x20b3, 0x20b4, 0x20b5, 0x20b6, 0x20b7, + 0x20b8, 0x20b9, 0x20ba, 0x20bb, 0x20bc, 0x20bd, 0x20be, 0x20bf, + }; + +static const unsigned short gNormalizeTable2100[] = { + /* U+2100 */ + 0x0061, 0x0061, 0x0063, 0x00b0, 0x2104, 0x0063, 0x0063, 0x025b, + 0x2108, 0x00b0, 0x0067, 0x0068, 0x0068, 0x0068, 0x0068, 0x0127, + 0x0069, 0x0069, 0x006c, 0x006c, 0x2114, 0x006e, 0x006e, 0x2117, + 0x2118, 0x0070, 0x0071, 0x0072, 0x0072, 0x0072, 0x211e, 0x211f, + 0x0073, 0x0074, 0x0074, 0x2123, 0x007a, 0x2125, 0x03c9, 0x2127, + 0x007a, 0x2129, 0x006b, 0x0061, 0x0062, 0x0063, 0x212e, 0x0065, + 0x0065, 0x0066, 0x214e, 0x006d, 0x006f, 0x05d0, 0x05d1, 0x05d2, + 0x05d3, 0x0069, 0x213a, 0x0066, 0x03c0, 0x03b3, 0x03b3, 0x03c0, + }; + +static const unsigned short gNormalizeTable2140[] = { + /* U+2140 */ + 0x2211, 0x2141, 0x2142, 0x2143, 0x2144, 0x0064, 0x0064, 0x0065, + 0x0069, 0x006a, 0x214a, 0x214b, 0x214c, 0x214d, 0x214e, 0x214f, + 0x0031, 0x0031, 0x0031, 0x0031, 0x0032, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0031, 0x0035, 0x0031, 0x0033, 0x0035, 0x0037, 0x0031, + 0x0069, 0x0069, 0x0069, 0x0069, 0x0076, 0x0076, 0x0076, 0x0076, + 0x0069, 0x0078, 0x0078, 0x0078, 0x006c, 0x0063, 0x0064, 0x006d, + 0x0069, 0x0069, 0x0069, 0x0069, 0x0076, 0x0076, 0x0076, 0x0076, + 0x0069, 0x0078, 0x0078, 0x0078, 0x006c, 0x0063, 0x0064, 0x006d, + }; + +static const unsigned short gNormalizeTable2180[] = { + /* U+2180 */ + 0x2180, 0x2181, 0x2182, 0x2184, 0x2184, 0x2185, 0x2186, 0x2187, + 0x2188, 0x0030, 0x218a, 0x218b, 0x218c, 0x218d, 0x218e, 0x218f, + 0x2190, 0x2191, 0x2192, 0x2193, 0x2194, 0x2195, 0x2196, 0x2197, + 0x2198, 0x2199, 0x2190, 0x2192, 0x219c, 0x219d, 0x219e, 0x219f, + 0x21a0, 0x21a1, 0x21a2, 0x21a3, 0x21a4, 0x21a5, 0x21a6, 0x21a7, + 0x21a8, 0x21a9, 0x21aa, 0x21ab, 0x21ac, 0x21ad, 0x2194, 0x21af, + 0x21b0, 0x21b1, 0x21b2, 0x21b3, 0x21b4, 0x21b5, 0x21b6, 0x21b7, + 0x21b8, 0x21b9, 0x21ba, 0x21bb, 0x21bc, 0x21bd, 0x21be, 0x21bf, + }; + +static const unsigned short gNormalizeTable21c0[] = { + /* U+21c0 */ + 0x21c0, 0x21c1, 0x21c2, 0x21c3, 0x21c4, 0x21c5, 0x21c6, 0x21c7, + 0x21c8, 0x21c9, 0x21ca, 0x21cb, 0x21cc, 0x21d0, 0x21d4, 0x21d2, + 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x21d5, 0x21d6, 0x21d7, + 0x21d8, 0x21d9, 0x21da, 0x21db, 0x21dc, 0x21dd, 0x21de, 0x21df, + 0x21e0, 0x21e1, 0x21e2, 0x21e3, 0x21e4, 0x21e5, 0x21e6, 0x21e7, + 0x21e8, 0x21e9, 0x21ea, 0x21eb, 0x21ec, 0x21ed, 0x21ee, 0x21ef, + 0x21f0, 0x21f1, 0x21f2, 0x21f3, 0x21f4, 0x21f5, 0x21f6, 0x21f7, + 0x21f8, 0x21f9, 0x21fa, 0x21fb, 0x21fc, 0x21fd, 0x21fe, 0x21ff, + }; + +static const unsigned short gNormalizeTable2200[] = { + /* U+2200 */ + 0x2200, 0x2201, 0x2202, 0x2203, 0x2203, 0x2205, 0x2206, 0x2207, + 0x2208, 0x2208, 0x220a, 0x220b, 0x220b, 0x220d, 0x220e, 0x220f, + 0x2210, 0x2211, 0x2212, 0x2213, 0x2214, 0x2215, 0x2216, 0x2217, + 0x2218, 0x2219, 0x221a, 0x221b, 0x221c, 0x221d, 0x221e, 0x221f, + 0x2220, 0x2221, 0x2222, 0x2223, 0x2223, 0x2225, 0x2225, 0x2227, + 0x2228, 0x2229, 0x222a, 0x222b, 0x222b, 0x222b, 0x222e, 0x222e, + 0x222e, 0x2231, 0x2232, 0x2233, 0x2234, 0x2235, 0x2236, 0x2237, + 0x2238, 0x2239, 0x223a, 0x223b, 0x223c, 0x223d, 0x223e, 0x223f, + }; + +static const unsigned short gNormalizeTable2240[] = { + /* U+2240 */ + 0x2240, 0x223c, 0x2242, 0x2243, 0x2243, 0x2245, 0x2246, 0x2245, + 0x2248, 0x2248, 0x224a, 0x224b, 0x224c, 0x224d, 0x224e, 0x224f, + 0x2250, 0x2251, 0x2252, 0x2253, 0x2254, 0x2255, 0x2256, 0x2257, + 0x2258, 0x2259, 0x225a, 0x225b, 0x225c, 0x225d, 0x225e, 0x225f, + 0x003d, 0x2261, 0x2261, 0x2263, 0x2264, 0x2265, 0x2266, 0x2267, + 0x2268, 0x2269, 0x226a, 0x226b, 0x226c, 0x224d, 0x003c, 0x003e, + 0x2264, 0x2265, 0x2272, 0x2273, 0x2272, 0x2273, 0x2276, 0x2277, + 0x2276, 0x2277, 0x227a, 0x227b, 0x227c, 0x227d, 0x227e, 0x227f, + }; + +static const unsigned short gNormalizeTable2280[] = { + /* U+2280 */ + 0x227a, 0x227b, 0x2282, 0x2283, 0x2282, 0x2283, 0x2286, 0x2287, + 0x2286, 0x2287, 0x228a, 0x228b, 0x228c, 0x228d, 0x228e, 0x228f, + 0x2290, 0x2291, 0x2292, 0x2293, 0x2294, 0x2295, 0x2296, 0x2297, + 0x2298, 0x2299, 0x229a, 0x229b, 0x229c, 0x229d, 0x229e, 0x229f, + 0x22a0, 0x22a1, 0x22a2, 0x22a3, 0x22a4, 0x22a5, 0x22a6, 0x22a7, + 0x22a8, 0x22a9, 0x22aa, 0x22ab, 0x22a2, 0x22a8, 0x22a9, 0x22ab, + 0x22b0, 0x22b1, 0x22b2, 0x22b3, 0x22b4, 0x22b5, 0x22b6, 0x22b7, + 0x22b8, 0x22b9, 0x22ba, 0x22bb, 0x22bc, 0x22bd, 0x22be, 0x22bf, + }; + +static const unsigned short gNormalizeTable22c0[] = { + /* U+22c0 */ + 0x22c0, 0x22c1, 0x22c2, 0x22c3, 0x22c4, 0x22c5, 0x22c6, 0x22c7, + 0x22c8, 0x22c9, 0x22ca, 0x22cb, 0x22cc, 0x22cd, 0x22ce, 0x22cf, + 0x22d0, 0x22d1, 0x22d2, 0x22d3, 0x22d4, 0x22d5, 0x22d6, 0x22d7, + 0x22d8, 0x22d9, 0x22da, 0x22db, 0x22dc, 0x22dd, 0x22de, 0x22df, + 0x227c, 0x227d, 0x2291, 0x2292, 0x22e4, 0x22e5, 0x22e6, 0x22e7, + 0x22e8, 0x22e9, 0x22b2, 0x22b3, 0x22b4, 0x22b5, 0x22ee, 0x22ef, + 0x22f0, 0x22f1, 0x22f2, 0x22f3, 0x22f4, 0x22f5, 0x22f6, 0x22f7, + 0x22f8, 0x22f9, 0x22fa, 0x22fb, 0x22fc, 0x22fd, 0x22fe, 0x22ff, + }; + +static const unsigned short gNormalizeTable2300[] = { + /* U+2300 */ + 0x2300, 0x2301, 0x2302, 0x2303, 0x2304, 0x2305, 0x2306, 0x2307, + 0x2308, 0x2309, 0x230a, 0x230b, 0x230c, 0x230d, 0x230e, 0x230f, + 0x2310, 0x2311, 0x2312, 0x2313, 0x2314, 0x2315, 0x2316, 0x2317, + 0x2318, 0x2319, 0x231a, 0x231b, 0x231c, 0x231d, 0x231e, 0x231f, + 0x2320, 0x2321, 0x2322, 0x2323, 0x2324, 0x2325, 0x2326, 0x2327, + 0x2328, 0x3008, 0x3009, 0x232b, 0x232c, 0x232d, 0x232e, 0x232f, + 0x2330, 0x2331, 0x2332, 0x2333, 0x2334, 0x2335, 0x2336, 0x2337, + 0x2338, 0x2339, 0x233a, 0x233b, 0x233c, 0x233d, 0x233e, 0x233f, + }; + +static const unsigned short gNormalizeTable2440[] = { + /* U+2440 */ + 0x2440, 0x2441, 0x2442, 0x2443, 0x2444, 0x2445, 0x2446, 0x2447, + 0x2448, 0x2449, 0x244a, 0x244b, 0x244c, 0x244d, 0x244e, 0x244f, + 0x2450, 0x2451, 0x2452, 0x2453, 0x2454, 0x2455, 0x2456, 0x2457, + 0x2458, 0x2459, 0x245a, 0x245b, 0x245c, 0x245d, 0x245e, 0x245f, + 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, + 0x0039, 0x0031, 0x0031, 0x0031, 0x0031, 0x0031, 0x0031, 0x0031, + 0x0031, 0x0031, 0x0031, 0x0032, 0x0028, 0x0028, 0x0028, 0x0028, + 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, + }; + +static const unsigned short gNormalizeTable2480[] = { + /* U+2480 */ + 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, + 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, + 0x0039, 0x0031, 0x0031, 0x0031, 0x0031, 0x0031, 0x0031, 0x0031, + 0x0031, 0x0031, 0x0031, 0x0032, 0x0028, 0x0028, 0x0028, 0x0028, + 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, + 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, + 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0061, 0x0062, + 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, + }; + +static const unsigned short gNormalizeTable24c0[] = { + /* U+24c0 */ + 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070, 0x0071, 0x0072, + 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, + 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, + 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070, + 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, + 0x0079, 0x007a, 0x0030, 0x24eb, 0x24ec, 0x24ed, 0x24ee, 0x24ef, + 0x24f0, 0x24f1, 0x24f2, 0x24f3, 0x24f4, 0x24f5, 0x24f6, 0x24f7, + 0x24f8, 0x24f9, 0x24fa, 0x24fb, 0x24fc, 0x24fd, 0x24fe, 0x24ff, + }; + +static const unsigned short gNormalizeTable2a00[] = { + /* U+2a00 */ + 0x2a00, 0x2a01, 0x2a02, 0x2a03, 0x2a04, 0x2a05, 0x2a06, 0x2a07, + 0x2a08, 0x2a09, 0x2a0a, 0x2a0b, 0x222b, 0x2a0d, 0x2a0e, 0x2a0f, + 0x2a10, 0x2a11, 0x2a12, 0x2a13, 0x2a14, 0x2a15, 0x2a16, 0x2a17, + 0x2a18, 0x2a19, 0x2a1a, 0x2a1b, 0x2a1c, 0x2a1d, 0x2a1e, 0x2a1f, + 0x2a20, 0x2a21, 0x2a22, 0x2a23, 0x2a24, 0x2a25, 0x2a26, 0x2a27, + 0x2a28, 0x2a29, 0x2a2a, 0x2a2b, 0x2a2c, 0x2a2d, 0x2a2e, 0x2a2f, + 0x2a30, 0x2a31, 0x2a32, 0x2a33, 0x2a34, 0x2a35, 0x2a36, 0x2a37, + 0x2a38, 0x2a39, 0x2a3a, 0x2a3b, 0x2a3c, 0x2a3d, 0x2a3e, 0x2a3f, + }; + +static const unsigned short gNormalizeTable2a40[] = { + /* U+2a40 */ + 0x2a40, 0x2a41, 0x2a42, 0x2a43, 0x2a44, 0x2a45, 0x2a46, 0x2a47, + 0x2a48, 0x2a49, 0x2a4a, 0x2a4b, 0x2a4c, 0x2a4d, 0x2a4e, 0x2a4f, + 0x2a50, 0x2a51, 0x2a52, 0x2a53, 0x2a54, 0x2a55, 0x2a56, 0x2a57, + 0x2a58, 0x2a59, 0x2a5a, 0x2a5b, 0x2a5c, 0x2a5d, 0x2a5e, 0x2a5f, + 0x2a60, 0x2a61, 0x2a62, 0x2a63, 0x2a64, 0x2a65, 0x2a66, 0x2a67, + 0x2a68, 0x2a69, 0x2a6a, 0x2a6b, 0x2a6c, 0x2a6d, 0x2a6e, 0x2a6f, + 0x2a70, 0x2a71, 0x2a72, 0x2a73, 0x003a, 0x003d, 0x003d, 0x2a77, + 0x2a78, 0x2a79, 0x2a7a, 0x2a7b, 0x2a7c, 0x2a7d, 0x2a7e, 0x2a7f, + }; + +static const unsigned short gNormalizeTable2ac0[] = { + /* U+2ac0 */ + 0x2ac0, 0x2ac1, 0x2ac2, 0x2ac3, 0x2ac4, 0x2ac5, 0x2ac6, 0x2ac7, + 0x2ac8, 0x2ac9, 0x2aca, 0x2acb, 0x2acc, 0x2acd, 0x2ace, 0x2acf, + 0x2ad0, 0x2ad1, 0x2ad2, 0x2ad3, 0x2ad4, 0x2ad5, 0x2ad6, 0x2ad7, + 0x2ad8, 0x2ad9, 0x2ada, 0x2adb, 0x2add, 0x2add, 0x2ade, 0x2adf, + 0x2ae0, 0x2ae1, 0x2ae2, 0x2ae3, 0x2ae4, 0x2ae5, 0x2ae6, 0x2ae7, + 0x2ae8, 0x2ae9, 0x2aea, 0x2aeb, 0x2aec, 0x2aed, 0x2aee, 0x2aef, + 0x2af0, 0x2af1, 0x2af2, 0x2af3, 0x2af4, 0x2af5, 0x2af6, 0x2af7, + 0x2af8, 0x2af9, 0x2afa, 0x2afb, 0x2afc, 0x2afd, 0x2afe, 0x2aff, + }; + +static const unsigned short gNormalizeTable2c00[] = { + /* U+2c00 */ + 0x2c30, 0x2c31, 0x2c32, 0x2c33, 0x2c34, 0x2c35, 0x2c36, 0x2c37, + 0x2c38, 0x2c39, 0x2c3a, 0x2c3b, 0x2c3c, 0x2c3d, 0x2c3e, 0x2c3f, + 0x2c40, 0x2c41, 0x2c42, 0x2c43, 0x2c44, 0x2c45, 0x2c46, 0x2c47, + 0x2c48, 0x2c49, 0x2c4a, 0x2c4b, 0x2c4c, 0x2c4d, 0x2c4e, 0x2c4f, + 0x2c50, 0x2c51, 0x2c52, 0x2c53, 0x2c54, 0x2c55, 0x2c56, 0x2c57, + 0x2c58, 0x2c59, 0x2c5a, 0x2c5b, 0x2c5c, 0x2c5d, 0x2c5e, 0x2c2f, + 0x2c30, 0x2c31, 0x2c32, 0x2c33, 0x2c34, 0x2c35, 0x2c36, 0x2c37, + 0x2c38, 0x2c39, 0x2c3a, 0x2c3b, 0x2c3c, 0x2c3d, 0x2c3e, 0x2c3f, + }; + +static const unsigned short gNormalizeTable2c40[] = { + /* U+2c40 */ + 0x2c40, 0x2c41, 0x2c42, 0x2c43, 0x2c44, 0x2c45, 0x2c46, 0x2c47, + 0x2c48, 0x2c49, 0x2c4a, 0x2c4b, 0x2c4c, 0x2c4d, 0x2c4e, 0x2c4f, + 0x2c50, 0x2c51, 0x2c52, 0x2c53, 0x2c54, 0x2c55, 0x2c56, 0x2c57, + 0x2c58, 0x2c59, 0x2c5a, 0x2c5b, 0x2c5c, 0x2c5d, 0x2c5e, 0x2c5f, + 0x2c61, 0x2c61, 0x026b, 0x1d7d, 0x027d, 0x2c65, 0x2c66, 0x2c68, + 0x2c68, 0x2c6a, 0x2c6a, 0x2c6c, 0x2c6c, 0x0251, 0x0271, 0x0250, + 0x0252, 0x2c71, 0x2c73, 0x2c73, 0x2c74, 0x2c76, 0x2c76, 0x2c77, + 0x2c78, 0x2c79, 0x2c7a, 0x2c7b, 0x006a, 0x0076, 0x023f, 0x0240, + }; + +static const unsigned short gNormalizeTable2c80[] = { + /* U+2c80 */ + 0x2c81, 0x2c81, 0x2c83, 0x2c83, 0x2c85, 0x2c85, 0x2c87, 0x2c87, + 0x2c89, 0x2c89, 0x2c8b, 0x2c8b, 0x2c8d, 0x2c8d, 0x2c8f, 0x2c8f, + 0x2c91, 0x2c91, 0x2c93, 0x2c93, 0x2c95, 0x2c95, 0x2c97, 0x2c97, + 0x2c99, 0x2c99, 0x2c9b, 0x2c9b, 0x2c9d, 0x2c9d, 0x2c9f, 0x2c9f, + 0x2ca1, 0x2ca1, 0x2ca3, 0x2ca3, 0x2ca5, 0x2ca5, 0x2ca7, 0x2ca7, + 0x2ca9, 0x2ca9, 0x2cab, 0x2cab, 0x2cad, 0x2cad, 0x2caf, 0x2caf, + 0x2cb1, 0x2cb1, 0x2cb3, 0x2cb3, 0x2cb5, 0x2cb5, 0x2cb7, 0x2cb7, + 0x2cb9, 0x2cb9, 0x2cbb, 0x2cbb, 0x2cbd, 0x2cbd, 0x2cbf, 0x2cbf, + }; + +static const unsigned short gNormalizeTable2cc0[] = { + /* U+2cc0 */ + 0x2cc1, 0x2cc1, 0x2cc3, 0x2cc3, 0x2cc5, 0x2cc5, 0x2cc7, 0x2cc7, + 0x2cc9, 0x2cc9, 0x2ccb, 0x2ccb, 0x2ccd, 0x2ccd, 0x2ccf, 0x2ccf, + 0x2cd1, 0x2cd1, 0x2cd3, 0x2cd3, 0x2cd5, 0x2cd5, 0x2cd7, 0x2cd7, + 0x2cd9, 0x2cd9, 0x2cdb, 0x2cdb, 0x2cdd, 0x2cdd, 0x2cdf, 0x2cdf, + 0x2ce1, 0x2ce1, 0x2ce3, 0x2ce3, 0x2ce4, 0x2ce5, 0x2ce6, 0x2ce7, + 0x2ce8, 0x2ce9, 0x2cea, 0x2cec, 0x2cec, 0x2cee, 0x2cee, 0x2cef, + 0x2cf0, 0x2cf1, 0x2cf2, 0x2cf3, 0x2cf4, 0x2cf5, 0x2cf6, 0x2cf7, + 0x2cf8, 0x2cf9, 0x2cfa, 0x2cfb, 0x2cfc, 0x2cfd, 0x2cfe, 0x2cff, + }; + +static const unsigned short gNormalizeTable2d40[] = { + /* U+2d40 */ + 0x2d40, 0x2d41, 0x2d42, 0x2d43, 0x2d44, 0x2d45, 0x2d46, 0x2d47, + 0x2d48, 0x2d49, 0x2d4a, 0x2d4b, 0x2d4c, 0x2d4d, 0x2d4e, 0x2d4f, + 0x2d50, 0x2d51, 0x2d52, 0x2d53, 0x2d54, 0x2d55, 0x2d56, 0x2d57, + 0x2d58, 0x2d59, 0x2d5a, 0x2d5b, 0x2d5c, 0x2d5d, 0x2d5e, 0x2d5f, + 0x2d60, 0x2d61, 0x2d62, 0x2d63, 0x2d64, 0x2d65, 0x2d66, 0x2d67, + 0x2d68, 0x2d69, 0x2d6a, 0x2d6b, 0x2d6c, 0x2d6d, 0x2d6e, 0x2d61, + 0x2d70, 0x2d71, 0x2d72, 0x2d73, 0x2d74, 0x2d75, 0x2d76, 0x2d77, + 0x2d78, 0x2d79, 0x2d7a, 0x2d7b, 0x2d7c, 0x2d7d, 0x2d7e, 0x2d7f, + }; + +static const unsigned short gNormalizeTable2e80[] = { + /* U+2e80 */ + 0x2e80, 0x2e81, 0x2e82, 0x2e83, 0x2e84, 0x2e85, 0x2e86, 0x2e87, + 0x2e88, 0x2e89, 0x2e8a, 0x2e8b, 0x2e8c, 0x2e8d, 0x2e8e, 0x2e8f, + 0x2e90, 0x2e91, 0x2e92, 0x2e93, 0x2e94, 0x2e95, 0x2e96, 0x2e97, + 0x2e98, 0x2e99, 0x2e9a, 0x2e9b, 0x2e9c, 0x2e9d, 0x2e9e, 0x6bcd, + 0x2ea0, 0x2ea1, 0x2ea2, 0x2ea3, 0x2ea4, 0x2ea5, 0x2ea6, 0x2ea7, + 0x2ea8, 0x2ea9, 0x2eaa, 0x2eab, 0x2eac, 0x2ead, 0x2eae, 0x2eaf, + 0x2eb0, 0x2eb1, 0x2eb2, 0x2eb3, 0x2eb4, 0x2eb5, 0x2eb6, 0x2eb7, + 0x2eb8, 0x2eb9, 0x2eba, 0x2ebb, 0x2ebc, 0x2ebd, 0x2ebe, 0x2ebf, + }; + +static const unsigned short gNormalizeTable2ec0[] = { + /* U+2ec0 */ + 0x2ec0, 0x2ec1, 0x2ec2, 0x2ec3, 0x2ec4, 0x2ec5, 0x2ec6, 0x2ec7, + 0x2ec8, 0x2ec9, 0x2eca, 0x2ecb, 0x2ecc, 0x2ecd, 0x2ece, 0x2ecf, + 0x2ed0, 0x2ed1, 0x2ed2, 0x2ed3, 0x2ed4, 0x2ed5, 0x2ed6, 0x2ed7, + 0x2ed8, 0x2ed9, 0x2eda, 0x2edb, 0x2edc, 0x2edd, 0x2ede, 0x2edf, + 0x2ee0, 0x2ee1, 0x2ee2, 0x2ee3, 0x2ee4, 0x2ee5, 0x2ee6, 0x2ee7, + 0x2ee8, 0x2ee9, 0x2eea, 0x2eeb, 0x2eec, 0x2eed, 0x2eee, 0x2eef, + 0x2ef0, 0x2ef1, 0x2ef2, 0x9f9f, 0x2ef4, 0x2ef5, 0x2ef6, 0x2ef7, + 0x2ef8, 0x2ef9, 0x2efa, 0x2efb, 0x2efc, 0x2efd, 0x2efe, 0x2eff, + }; + +static const unsigned short gNormalizeTable2f00[] = { + /* U+2f00 */ + 0x4e00, 0x4e28, 0x4e36, 0x4e3f, 0x4e59, 0x4e85, 0x4e8c, 0x4ea0, + 0x4eba, 0x513f, 0x5165, 0x516b, 0x5182, 0x5196, 0x51ab, 0x51e0, + 0x51f5, 0x5200, 0x529b, 0x52f9, 0x5315, 0x531a, 0x5338, 0x5341, + 0x535c, 0x5369, 0x5382, 0x53b6, 0x53c8, 0x53e3, 0x56d7, 0x571f, + 0x58eb, 0x5902, 0x590a, 0x5915, 0x5927, 0x5973, 0x5b50, 0x5b80, + 0x5bf8, 0x5c0f, 0x5c22, 0x5c38, 0x5c6e, 0x5c71, 0x5ddb, 0x5de5, + 0x5df1, 0x5dfe, 0x5e72, 0x5e7a, 0x5e7f, 0x5ef4, 0x5efe, 0x5f0b, + 0x5f13, 0x5f50, 0x5f61, 0x5f73, 0x5fc3, 0x6208, 0x6236, 0x624b, + }; + +static const unsigned short gNormalizeTable2f40[] = { + /* U+2f40 */ + 0x652f, 0x6534, 0x6587, 0x6597, 0x65a4, 0x65b9, 0x65e0, 0x65e5, + 0x66f0, 0x6708, 0x6728, 0x6b20, 0x6b62, 0x6b79, 0x6bb3, 0x6bcb, + 0x6bd4, 0x6bdb, 0x6c0f, 0x6c14, 0x6c34, 0x706b, 0x722a, 0x7236, + 0x723b, 0x723f, 0x7247, 0x7259, 0x725b, 0x72ac, 0x7384, 0x7389, + 0x74dc, 0x74e6, 0x7518, 0x751f, 0x7528, 0x7530, 0x758b, 0x7592, + 0x7676, 0x767d, 0x76ae, 0x76bf, 0x76ee, 0x77db, 0x77e2, 0x77f3, + 0x793a, 0x79b8, 0x79be, 0x7a74, 0x7acb, 0x7af9, 0x7c73, 0x7cf8, + 0x7f36, 0x7f51, 0x7f8a, 0x7fbd, 0x8001, 0x800c, 0x8012, 0x8033, + }; + +static const unsigned short gNormalizeTable2f80[] = { + /* U+2f80 */ + 0x807f, 0x8089, 0x81e3, 0x81ea, 0x81f3, 0x81fc, 0x820c, 0x821b, + 0x821f, 0x826e, 0x8272, 0x8278, 0x864d, 0x866b, 0x8840, 0x884c, + 0x8863, 0x897e, 0x898b, 0x89d2, 0x8a00, 0x8c37, 0x8c46, 0x8c55, + 0x8c78, 0x8c9d, 0x8d64, 0x8d70, 0x8db3, 0x8eab, 0x8eca, 0x8f9b, + 0x8fb0, 0x8fb5, 0x9091, 0x9149, 0x91c6, 0x91cc, 0x91d1, 0x9577, + 0x9580, 0x961c, 0x96b6, 0x96b9, 0x96e8, 0x9751, 0x975e, 0x9762, + 0x9769, 0x97cb, 0x97ed, 0x97f3, 0x9801, 0x98a8, 0x98db, 0x98df, + 0x9996, 0x9999, 0x99ac, 0x9aa8, 0x9ad8, 0x9adf, 0x9b25, 0x9b2f, + }; + +static const unsigned short gNormalizeTable2fc0[] = { + /* U+2fc0 */ + 0x9b32, 0x9b3c, 0x9b5a, 0x9ce5, 0x9e75, 0x9e7f, 0x9ea5, 0x9ebb, + 0x9ec3, 0x9ecd, 0x9ed1, 0x9ef9, 0x9efd, 0x9f0e, 0x9f13, 0x9f20, + 0x9f3b, 0x9f4a, 0x9f52, 0x9f8d, 0x9f9c, 0x9fa0, 0x2fd6, 0x2fd7, + 0x2fd8, 0x2fd9, 0x2fda, 0x2fdb, 0x2fdc, 0x2fdd, 0x2fde, 0x2fdf, + 0x2fe0, 0x2fe1, 0x2fe2, 0x2fe3, 0x2fe4, 0x2fe5, 0x2fe6, 0x2fe7, + 0x2fe8, 0x2fe9, 0x2fea, 0x2feb, 0x2fec, 0x2fed, 0x2fee, 0x2fef, + 0x2ff0, 0x2ff1, 0x2ff2, 0x2ff3, 0x2ff4, 0x2ff5, 0x2ff6, 0x2ff7, + 0x2ff8, 0x2ff9, 0x2ffa, 0x2ffb, 0x2ffc, 0x2ffd, 0x2ffe, 0x2fff, + }; + +static const unsigned short gNormalizeTable3000[] = { + /* U+3000 */ + 0x0020, 0x3001, 0x3002, 0x3003, 0x3004, 0x3005, 0x3006, 0x3007, + 0x3008, 0x3009, 0x300a, 0x300b, 0x300c, 0x300d, 0x300e, 0x300f, + 0x3010, 0x3011, 0x3012, 0x3013, 0x3014, 0x3015, 0x3016, 0x3017, + 0x3018, 0x3019, 0x301a, 0x301b, 0x301c, 0x301d, 0x301e, 0x301f, + 0x3020, 0x3021, 0x3022, 0x3023, 0x3024, 0x3025, 0x3026, 0x3027, + 0x3028, 0x3029, 0x302a, 0x302b, 0x302c, 0x302d, 0x302e, 0x302f, + 0x3030, 0x3031, 0x3032, 0x3033, 0x3034, 0x3035, 0x3012, 0x3037, + 0x5341, 0x5344, 0x5345, 0x303b, 0x303c, 0x303d, 0x303e, 0x303f, + }; + +static const unsigned short gNormalizeTable3040[] = { + /* U+3040 */ + 0x3040, 0x3041, 0x3042, 0x3043, 0x3044, 0x3045, 0x3046, 0x3047, + 0x3048, 0x3049, 0x304a, 0x304b, 0x304b, 0x304d, 0x304d, 0x304f, + 0x304f, 0x3051, 0x3051, 0x3053, 0x3053, 0x3055, 0x3055, 0x3057, + 0x3057, 0x3059, 0x3059, 0x305b, 0x305b, 0x305d, 0x305d, 0x305f, + 0x305f, 0x3061, 0x3061, 0x3063, 0x3064, 0x3064, 0x3066, 0x3066, + 0x3068, 0x3068, 0x306a, 0x306b, 0x306c, 0x306d, 0x306e, 0x306f, + 0x306f, 0x306f, 0x3072, 0x3072, 0x3072, 0x3075, 0x3075, 0x3075, + 0x3078, 0x3078, 0x3078, 0x307b, 0x307b, 0x307b, 0x307e, 0x307f, + }; + +static const unsigned short gNormalizeTable3080[] = { + /* U+3080 */ + 0x3080, 0x3081, 0x3082, 0x3083, 0x3084, 0x3085, 0x3086, 0x3087, + 0x3088, 0x3089, 0x308a, 0x308b, 0x308c, 0x308d, 0x308e, 0x308f, + 0x3090, 0x3091, 0x3092, 0x3093, 0x3046, 0x3095, 0x3096, 0x3097, + 0x3098, 0x3099, 0x309a, 0x0020, 0x0020, 0x309d, 0x309d, 0x3088, + 0x30a0, 0x30a1, 0x30a2, 0x30a3, 0x30a4, 0x30a5, 0x30a6, 0x30a7, + 0x30a8, 0x30a9, 0x30aa, 0x30ab, 0x30ab, 0x30ad, 0x30ad, 0x30af, + 0x30af, 0x30b1, 0x30b1, 0x30b3, 0x30b3, 0x30b5, 0x30b5, 0x30b7, + 0x30b7, 0x30b9, 0x30b9, 0x30bb, 0x30bb, 0x30bd, 0x30bd, 0x30bf, + }; + +static const unsigned short gNormalizeTable30c0[] = { + /* U+30c0 */ + 0x30bf, 0x30c1, 0x30c1, 0x30c3, 0x30c4, 0x30c4, 0x30c6, 0x30c6, + 0x30c8, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd, 0x30ce, 0x30cf, + 0x30cf, 0x30cf, 0x30d2, 0x30d2, 0x30d2, 0x30d5, 0x30d5, 0x30d5, + 0x30d8, 0x30d8, 0x30d8, 0x30db, 0x30db, 0x30db, 0x30de, 0x30df, + 0x30e0, 0x30e1, 0x30e2, 0x30e3, 0x30e4, 0x30e5, 0x30e6, 0x30e7, + 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec, 0x30ed, 0x30ee, 0x30ef, + 0x30f0, 0x30f1, 0x30f2, 0x30f3, 0x30a6, 0x30f5, 0x30f6, 0x30ef, + 0x30f0, 0x30f1, 0x30f2, 0x30fb, 0x30fc, 0x30fd, 0x30fd, 0x30b3, + }; + +static const unsigned short gNormalizeTable3100[] = { + /* U+3100 */ + 0x3100, 0x3101, 0x3102, 0x3103, 0x3104, 0x3105, 0x3106, 0x3107, + 0x3108, 0x3109, 0x310a, 0x310b, 0x310c, 0x310d, 0x310e, 0x310f, + 0x3110, 0x3111, 0x3112, 0x3113, 0x3114, 0x3115, 0x3116, 0x3117, + 0x3118, 0x3119, 0x311a, 0x311b, 0x311c, 0x311d, 0x311e, 0x311f, + 0x3120, 0x3121, 0x3122, 0x3123, 0x3124, 0x3125, 0x3126, 0x3127, + 0x3128, 0x3129, 0x312a, 0x312b, 0x312c, 0x312d, 0x312e, 0x312f, + 0x3130, 0x1100, 0x1101, 0x11aa, 0x1102, 0x11ac, 0x11ad, 0x1103, + 0x1104, 0x1105, 0x11b0, 0x11b1, 0x11b2, 0x11b3, 0x11b4, 0x11b5, + }; + +static const unsigned short gNormalizeTable3140[] = { + /* U+3140 */ + 0x111a, 0x1106, 0x1107, 0x1108, 0x1121, 0x1109, 0x110a, 0x110b, + 0x110c, 0x110d, 0x110e, 0x110f, 0x1110, 0x1111, 0x1112, 0x1161, + 0x1162, 0x1163, 0x1164, 0x1165, 0x1166, 0x1167, 0x1168, 0x1169, + 0x116a, 0x116b, 0x116c, 0x116d, 0x116e, 0x116f, 0x1170, 0x1171, + 0x1172, 0x1173, 0x1174, 0x1175, 0x0020, 0x1114, 0x1115, 0x11c7, + 0x11c8, 0x11cc, 0x11ce, 0x11d3, 0x11d7, 0x11d9, 0x111c, 0x11dd, + 0x11df, 0x111d, 0x111e, 0x1120, 0x1122, 0x1123, 0x1127, 0x1129, + 0x112b, 0x112c, 0x112d, 0x112e, 0x112f, 0x1132, 0x1136, 0x1140, + }; + +static const unsigned short gNormalizeTable3180[] = { + /* U+3180 */ + 0x1147, 0x114c, 0x11f1, 0x11f2, 0x1157, 0x1158, 0x1159, 0x1184, + 0x1185, 0x1188, 0x1191, 0x1192, 0x1194, 0x119e, 0x11a1, 0x318f, + 0x3190, 0x3191, 0x4e00, 0x4e8c, 0x4e09, 0x56db, 0x4e0a, 0x4e2d, + 0x4e0b, 0x7532, 0x4e59, 0x4e19, 0x4e01, 0x5929, 0x5730, 0x4eba, + 0x31a0, 0x31a1, 0x31a2, 0x31a3, 0x31a4, 0x31a5, 0x31a6, 0x31a7, + 0x31a8, 0x31a9, 0x31aa, 0x31ab, 0x31ac, 0x31ad, 0x31ae, 0x31af, + 0x31b0, 0x31b1, 0x31b2, 0x31b3, 0x31b4, 0x31b5, 0x31b6, 0x31b7, + 0x31b8, 0x31b9, 0x31ba, 0x31bb, 0x31bc, 0x31bd, 0x31be, 0x31bf, + }; + +static const unsigned short gNormalizeTable3200[] = { + /* U+3200 */ + 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, + 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, + 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, + 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x321f, + 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, + 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, + 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, + 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, 0x0028, + }; + +static const unsigned short gNormalizeTable3240[] = { + /* U+3240 */ + 0x0028, 0x0028, 0x0028, 0x0028, 0x554f, 0x5e7c, 0x6587, 0x7b8f, + 0x3248, 0x3249, 0x324a, 0x324b, 0x324c, 0x324d, 0x324e, 0x324f, + 0x0070, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, + 0x0032, 0x0032, 0x0033, 0x0033, 0x0033, 0x0033, 0x0033, 0x0033, + 0x1100, 0x1102, 0x1103, 0x1105, 0x1106, 0x1107, 0x1109, 0x110b, + 0x110c, 0x110e, 0x110f, 0x1110, 0x1111, 0x1112, 0x1100, 0x1102, + 0x1103, 0x1105, 0x1106, 0x1107, 0x1109, 0x110b, 0x110c, 0x110e, + 0x110f, 0x1110, 0x1111, 0x1112, 0x110e, 0x110c, 0x110b, 0x327f, + }; + +static const unsigned short gNormalizeTable3280[] = { + /* U+3280 */ + 0x4e00, 0x4e8c, 0x4e09, 0x56db, 0x4e94, 0x516d, 0x4e03, 0x516b, + 0x4e5d, 0x5341, 0x6708, 0x706b, 0x6c34, 0x6728, 0x91d1, 0x571f, + 0x65e5, 0x682a, 0x6709, 0x793e, 0x540d, 0x7279, 0x8ca1, 0x795d, + 0x52b4, 0x79d8, 0x7537, 0x5973, 0x9069, 0x512a, 0x5370, 0x6ce8, + 0x9805, 0x4f11, 0x5199, 0x6b63, 0x4e0a, 0x4e2d, 0x4e0b, 0x5de6, + 0x53f3, 0x533b, 0x5b97, 0x5b66, 0x76e3, 0x4f01, 0x8cc7, 0x5354, + 0x591c, 0x0033, 0x0033, 0x0033, 0x0033, 0x0034, 0x0034, 0x0034, + 0x0034, 0x0034, 0x0034, 0x0034, 0x0034, 0x0034, 0x0034, 0x0035, + }; + +static const unsigned short gNormalizeTable32c0[] = { + /* U+32c0 */ + 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, + 0x0039, 0x0031, 0x0031, 0x0031, 0x0068, 0x0065, 0x0065, 0x006c, + 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab, 0x30ad, 0x30af, + 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd, 0x30bf, + 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd, + 0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de, 0x30df, + 0x30e0, 0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, + 0x30eb, 0x30ec, 0x30ed, 0x30ef, 0x30f0, 0x30f1, 0x30f2, 0x32ff, + }; + +static const unsigned short gNormalizeTable3300[] = { + /* U+3300 */ + 0x30a2, 0x30a2, 0x30a2, 0x30a2, 0x30a4, 0x30a4, 0x30a6, 0x30a8, + 0x30a8, 0x30aa, 0x30aa, 0x30ab, 0x30ab, 0x30ab, 0x30ab, 0x30ab, + 0x30ad, 0x30ad, 0x30ad, 0x30ad, 0x30ad, 0x30ad, 0x30ad, 0x30ad, + 0x30af, 0x30af, 0x30af, 0x30af, 0x30b1, 0x30b3, 0x30b3, 0x30b5, + 0x30b5, 0x30b7, 0x30bb, 0x30bb, 0x30bf, 0x30c6, 0x30c8, 0x30c8, + 0x30ca, 0x30ce, 0x30cf, 0x30cf, 0x30cf, 0x30cf, 0x30d2, 0x30d2, + 0x30d2, 0x30d2, 0x30d5, 0x30d5, 0x30d5, 0x30d5, 0x30d8, 0x30d8, + 0x30d8, 0x30d8, 0x30d8, 0x30d8, 0x30d8, 0x30db, 0x30db, 0x30db, + }; + +static const unsigned short gNormalizeTable3340[] = { + /* U+3340 */ + 0x30db, 0x30db, 0x30db, 0x30de, 0x30de, 0x30de, 0x30de, 0x30de, + 0x30df, 0x30df, 0x30df, 0x30e1, 0x30e1, 0x30e1, 0x30e4, 0x30e4, + 0x30e6, 0x30ea, 0x30ea, 0x30eb, 0x30eb, 0x30ec, 0x30ec, 0x30ef, + 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x0031, 0x0031, 0x0031, 0x0031, 0x0031, 0x0031, + 0x0031, 0x0031, 0x0031, 0x0031, 0x0032, 0x0032, 0x0032, 0x0032, + 0x0032, 0x0068, 0x0064, 0x0061, 0x0062, 0x006f, 0x0070, 0x0064, + 0x0064, 0x0064, 0x0069, 0x5e73, 0x662d, 0x5927, 0x660e, 0x682a, + }; + +static const unsigned short gNormalizeTable3380[] = { + /* U+3380 */ + 0x0070, 0x006e, 0x03bc, 0x006d, 0x006b, 0x006b, 0x006d, 0x0067, + 0x0063, 0x006b, 0x0070, 0x006e, 0x03bc, 0x03bc, 0x006d, 0x006b, + 0x0068, 0x006b, 0x006d, 0x0067, 0x0074, 0x03bc, 0x006d, 0x0064, + 0x006b, 0x0066, 0x006e, 0x03bc, 0x006d, 0x0063, 0x006b, 0x006d, + 0x0063, 0x006d, 0x006b, 0x006d, 0x0063, 0x006d, 0x006b, 0x006d, + 0x006d, 0x0070, 0x006b, 0x006d, 0x0067, 0x0072, 0x0072, 0x0072, + 0x0070, 0x006e, 0x03bc, 0x006d, 0x0070, 0x006e, 0x03bc, 0x006d, + 0x006b, 0x006d, 0x0070, 0x006e, 0x03bc, 0x006d, 0x006b, 0x006d, + }; + +static const unsigned short gNormalizeTable33c0[] = { + /* U+33c0 */ + 0x006b, 0x006d, 0x0061, 0x0062, 0x0063, 0x0063, 0x0063, 0x0063, + 0x0064, 0x0067, 0x0068, 0x0068, 0x0069, 0x006b, 0x006b, 0x006b, + 0x006c, 0x006c, 0x006c, 0x006c, 0x006d, 0x006d, 0x006d, 0x0070, + 0x0070, 0x0070, 0x0070, 0x0073, 0x0073, 0x0077, 0x0076, 0x0061, + 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, + 0x0039, 0x0031, 0x0031, 0x0031, 0x0031, 0x0031, 0x0031, 0x0031, + 0x0031, 0x0031, 0x0031, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, + 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0033, 0x0033, 0x0067, + }; + +static const unsigned short gNormalizeTablea640[] = { + /* U+a640 */ + 0xa641, 0xa641, 0xa643, 0xa643, 0xa645, 0xa645, 0xa647, 0xa647, + 0xa649, 0xa649, 0xa64b, 0xa64b, 0xa64d, 0xa64d, 0xa64f, 0xa64f, + 0xa651, 0xa651, 0xa653, 0xa653, 0xa655, 0xa655, 0xa657, 0xa657, + 0xa659, 0xa659, 0xa65b, 0xa65b, 0xa65d, 0xa65d, 0xa65f, 0xa65f, + 0xa660, 0xa661, 0xa663, 0xa663, 0xa665, 0xa665, 0xa667, 0xa667, + 0xa669, 0xa669, 0xa66b, 0xa66b, 0xa66d, 0xa66d, 0xa66e, 0xa66f, + 0xa670, 0xa671, 0xa672, 0xa673, 0xa674, 0xa675, 0xa676, 0xa677, + 0xa678, 0xa679, 0xa67a, 0xa67b, 0xa67c, 0xa67d, 0xa67e, 0xa67f, + }; + +static const unsigned short gNormalizeTablea680[] = { + /* U+a680 */ + 0xa681, 0xa681, 0xa683, 0xa683, 0xa685, 0xa685, 0xa687, 0xa687, + 0xa689, 0xa689, 0xa68b, 0xa68b, 0xa68d, 0xa68d, 0xa68f, 0xa68f, + 0xa691, 0xa691, 0xa693, 0xa693, 0xa695, 0xa695, 0xa697, 0xa697, + 0xa698, 0xa699, 0xa69a, 0xa69b, 0xa69c, 0xa69d, 0xa69e, 0xa69f, + 0xa6a0, 0xa6a1, 0xa6a2, 0xa6a3, 0xa6a4, 0xa6a5, 0xa6a6, 0xa6a7, + 0xa6a8, 0xa6a9, 0xa6aa, 0xa6ab, 0xa6ac, 0xa6ad, 0xa6ae, 0xa6af, + 0xa6b0, 0xa6b1, 0xa6b2, 0xa6b3, 0xa6b4, 0xa6b5, 0xa6b6, 0xa6b7, + 0xa6b8, 0xa6b9, 0xa6ba, 0xa6bb, 0xa6bc, 0xa6bd, 0xa6be, 0xa6bf, + }; + +static const unsigned short gNormalizeTablea700[] = { + /* U+a700 */ + 0xa700, 0xa701, 0xa702, 0xa703, 0xa704, 0xa705, 0xa706, 0xa707, + 0xa708, 0xa709, 0xa70a, 0xa70b, 0xa70c, 0xa70d, 0xa70e, 0xa70f, + 0xa710, 0xa711, 0xa712, 0xa713, 0xa714, 0xa715, 0xa716, 0xa717, + 0xa718, 0xa719, 0xa71a, 0xa71b, 0xa71c, 0xa71d, 0xa71e, 0xa71f, + 0xa720, 0xa721, 0xa723, 0xa723, 0xa725, 0xa725, 0xa727, 0xa727, + 0xa729, 0xa729, 0xa72b, 0xa72b, 0xa72d, 0xa72d, 0xa72f, 0xa72f, + 0xa730, 0xa731, 0xa733, 0xa733, 0xa735, 0xa735, 0xa737, 0xa737, + 0xa739, 0xa739, 0xa73b, 0xa73b, 0xa73d, 0xa73d, 0xa73f, 0xa73f, + }; + +static const unsigned short gNormalizeTablea740[] = { + /* U+a740 */ + 0xa741, 0xa741, 0xa743, 0xa743, 0xa745, 0xa745, 0xa747, 0xa747, + 0xa749, 0xa749, 0xa74b, 0xa74b, 0xa74d, 0xa74d, 0xa74f, 0xa74f, + 0xa751, 0xa751, 0xa753, 0xa753, 0xa755, 0xa755, 0xa757, 0xa757, + 0xa759, 0xa759, 0xa75b, 0xa75b, 0xa75d, 0xa75d, 0xa75f, 0xa75f, + 0xa761, 0xa761, 0xa763, 0xa763, 0xa765, 0xa765, 0xa767, 0xa767, + 0xa769, 0xa769, 0xa76b, 0xa76b, 0xa76d, 0xa76d, 0xa76f, 0xa76f, + 0xa76f, 0xa771, 0xa772, 0xa773, 0xa774, 0xa775, 0xa776, 0xa777, + 0xa778, 0xa77a, 0xa77a, 0xa77c, 0xa77c, 0x1d79, 0xa77f, 0xa77f, + }; + +static const unsigned short gNormalizeTablea780[] = { + /* U+a780 */ + 0xa781, 0xa781, 0xa783, 0xa783, 0xa785, 0xa785, 0xa787, 0xa787, + 0xa788, 0xa789, 0xa78a, 0xa78c, 0xa78c, 0xa78d, 0xa78e, 0xa78f, + 0xa790, 0xa791, 0xa792, 0xa793, 0xa794, 0xa795, 0xa796, 0xa797, + 0xa798, 0xa799, 0xa79a, 0xa79b, 0xa79c, 0xa79d, 0xa79e, 0xa79f, + 0xa7a0, 0xa7a1, 0xa7a2, 0xa7a3, 0xa7a4, 0xa7a5, 0xa7a6, 0xa7a7, + 0xa7a8, 0xa7a9, 0xa7aa, 0xa7ab, 0xa7ac, 0xa7ad, 0xa7ae, 0xa7af, + 0xa7b0, 0xa7b1, 0xa7b2, 0xa7b3, 0xa7b4, 0xa7b5, 0xa7b6, 0xa7b7, + 0xa7b8, 0xa7b9, 0xa7ba, 0xa7bb, 0xa7bc, 0xa7bd, 0xa7be, 0xa7bf, + }; + +static const unsigned short gNormalizeTablef900[] = { + /* U+f900 */ + 0x8c48, 0x66f4, 0x8eca, 0x8cc8, 0x6ed1, 0x4e32, 0x53e5, 0x9f9c, + 0x9f9c, 0x5951, 0x91d1, 0x5587, 0x5948, 0x61f6, 0x7669, 0x7f85, + 0x863f, 0x87ba, 0x88f8, 0x908f, 0x6a02, 0x6d1b, 0x70d9, 0x73de, + 0x843d, 0x916a, 0x99f1, 0x4e82, 0x5375, 0x6b04, 0x721b, 0x862d, + 0x9e1e, 0x5d50, 0x6feb, 0x85cd, 0x8964, 0x62c9, 0x81d8, 0x881f, + 0x5eca, 0x6717, 0x6d6a, 0x72fc, 0x90ce, 0x4f86, 0x51b7, 0x52de, + 0x64c4, 0x6ad3, 0x7210, 0x76e7, 0x8001, 0x8606, 0x865c, 0x8def, + 0x9732, 0x9b6f, 0x9dfa, 0x788c, 0x797f, 0x7da0, 0x83c9, 0x9304, + }; + +static const unsigned short gNormalizeTablef940[] = { + /* U+f940 */ + 0x9e7f, 0x8ad6, 0x58df, 0x5f04, 0x7c60, 0x807e, 0x7262, 0x78ca, + 0x8cc2, 0x96f7, 0x58d8, 0x5c62, 0x6a13, 0x6dda, 0x6f0f, 0x7d2f, + 0x7e37, 0x964b, 0x52d2, 0x808b, 0x51dc, 0x51cc, 0x7a1c, 0x7dbe, + 0x83f1, 0x9675, 0x8b80, 0x62cf, 0x6a02, 0x8afe, 0x4e39, 0x5be7, + 0x6012, 0x7387, 0x7570, 0x5317, 0x78fb, 0x4fbf, 0x5fa9, 0x4e0d, + 0x6ccc, 0x6578, 0x7d22, 0x53c3, 0x585e, 0x7701, 0x8449, 0x8aaa, + 0x6bba, 0x8fb0, 0x6c88, 0x62fe, 0x82e5, 0x63a0, 0x7565, 0x4eae, + 0x5169, 0x51c9, 0x6881, 0x7ce7, 0x826f, 0x8ad2, 0x91cf, 0x52f5, + }; + +static const unsigned short gNormalizeTablef980[] = { + /* U+f980 */ + 0x5442, 0x5973, 0x5eec, 0x65c5, 0x6ffe, 0x792a, 0x95ad, 0x9a6a, + 0x9e97, 0x9ece, 0x529b, 0x66c6, 0x6b77, 0x8f62, 0x5e74, 0x6190, + 0x6200, 0x649a, 0x6f23, 0x7149, 0x7489, 0x79ca, 0x7df4, 0x806f, + 0x8f26, 0x84ee, 0x9023, 0x934a, 0x5217, 0x52a3, 0x54bd, 0x70c8, + 0x88c2, 0x8aaa, 0x5ec9, 0x5ff5, 0x637b, 0x6bae, 0x7c3e, 0x7375, + 0x4ee4, 0x56f9, 0x5be7, 0x5dba, 0x601c, 0x73b2, 0x7469, 0x7f9a, + 0x8046, 0x9234, 0x96f6, 0x9748, 0x9818, 0x4f8b, 0x79ae, 0x91b4, + 0x96b8, 0x60e1, 0x4e86, 0x50da, 0x5bee, 0x5c3f, 0x6599, 0x6a02, + }; + +static const unsigned short gNormalizeTablef9c0[] = { + /* U+f9c0 */ + 0x71ce, 0x7642, 0x84fc, 0x907c, 0x9f8d, 0x6688, 0x962e, 0x5289, + 0x677b, 0x67f3, 0x6d41, 0x6e9c, 0x7409, 0x7559, 0x786b, 0x7d10, + 0x985e, 0x516d, 0x622e, 0x9678, 0x502b, 0x5d19, 0x6dea, 0x8f2a, + 0x5f8b, 0x6144, 0x6817, 0x7387, 0x9686, 0x5229, 0x540f, 0x5c65, + 0x6613, 0x674e, 0x68a8, 0x6ce5, 0x7406, 0x75e2, 0x7f79, 0x88cf, + 0x88e1, 0x91cc, 0x96e2, 0x533f, 0x6eba, 0x541d, 0x71d0, 0x7498, + 0x85fa, 0x96a3, 0x9c57, 0x9e9f, 0x6797, 0x6dcb, 0x81e8, 0x7acb, + 0x7b20, 0x7c92, 0x72c0, 0x7099, 0x8b58, 0x4ec0, 0x8336, 0x523a, + }; + +static const unsigned short gNormalizeTablefa00[] = { + /* U+fa00 */ + 0x5207, 0x5ea6, 0x62d3, 0x7cd6, 0x5b85, 0x6d1e, 0x66b4, 0x8f3b, + 0x884c, 0x964d, 0x898b, 0x5ed3, 0x5140, 0x55c0, 0xfa0e, 0xfa0f, + 0x585a, 0xfa11, 0x6674, 0xfa13, 0xfa14, 0x51de, 0x732a, 0x76ca, + 0x793c, 0x795e, 0x7965, 0x798f, 0x9756, 0x7cbe, 0x7fbd, 0xfa1f, + 0x8612, 0xfa21, 0x8af8, 0xfa23, 0xfa24, 0x9038, 0x90fd, 0xfa27, + 0xfa28, 0xfa29, 0x98ef, 0x98fc, 0x9928, 0x9db4, 0xfa2e, 0xfa2f, + 0x4fae, 0x50e7, 0x514d, 0x52c9, 0x52e4, 0x5351, 0x559d, 0x5606, + 0x5668, 0x5840, 0x58a8, 0x5c64, 0x5c6e, 0x6094, 0x6168, 0x618e, + }; + +static const unsigned short gNormalizeTablefa40[] = { + /* U+fa40 */ + 0x61f2, 0x654f, 0x65e2, 0x6691, 0x6885, 0x6d77, 0x6e1a, 0x6f22, + 0x716e, 0x722b, 0x7422, 0x7891, 0x793e, 0x7949, 0x7948, 0x7950, + 0x7956, 0x795d, 0x798d, 0x798e, 0x7a40, 0x7a81, 0x7bc0, 0x7df4, + 0x7e09, 0x7e41, 0x7f72, 0x8005, 0x81ed, 0x8279, 0x8279, 0x8457, + 0x8910, 0x8996, 0x8b01, 0x8b39, 0x8cd3, 0x8d08, 0x8fb6, 0x9038, + 0x96e3, 0x97ff, 0x983b, 0x6075, 0xfa6c, 0x8218, 0xfa6e, 0xfa6f, + 0x4e26, 0x51b5, 0x5168, 0x4f80, 0x5145, 0x5180, 0x52c7, 0x52fa, + 0x559d, 0x5555, 0x5599, 0x55e2, 0x585a, 0x58b3, 0x5944, 0x5954, + }; + +static const unsigned short gNormalizeTablefa80[] = { + /* U+fa80 */ + 0x5a62, 0x5b28, 0x5ed2, 0x5ed9, 0x5f69, 0x5fad, 0x60d8, 0x614e, + 0x6108, 0x618e, 0x6160, 0x61f2, 0x6234, 0x63c4, 0x641c, 0x6452, + 0x6556, 0x6674, 0x6717, 0x671b, 0x6756, 0x6b79, 0x6bba, 0x6d41, + 0x6edb, 0x6ecb, 0x6f22, 0x701e, 0x716e, 0x77a7, 0x7235, 0x72af, + 0x732a, 0x7471, 0x7506, 0x753b, 0x761d, 0x761f, 0x76ca, 0x76db, + 0x76f4, 0x774a, 0x7740, 0x78cc, 0x7ab1, 0x7bc0, 0x7c7b, 0x7d5b, + 0x7df4, 0x7f3e, 0x8005, 0x8352, 0x83ef, 0x8779, 0x8941, 0x8986, + 0x8996, 0x8abf, 0x8af8, 0x8acb, 0x8b01, 0x8afe, 0x8aed, 0x8b39, + }; + +static const unsigned short gNormalizeTablefac0[] = { + /* U+fac0 */ + 0x8b8a, 0x8d08, 0x8f38, 0x9072, 0x9199, 0x9276, 0x967c, 0x96e3, + 0x9756, 0x97db, 0x97ff, 0x980b, 0x983b, 0x9b12, 0x9f9c, 0xfacf, + 0xfad0, 0xfad1, 0x3b9d, 0x4018, 0x4039, 0xfad5, 0xfad6, 0xfad7, + 0x9f43, 0x9f8e, 0xfada, 0xfadb, 0xfadc, 0xfadd, 0xfade, 0xfadf, + 0xfae0, 0xfae1, 0xfae2, 0xfae3, 0xfae4, 0xfae5, 0xfae6, 0xfae7, + 0xfae8, 0xfae9, 0xfaea, 0xfaeb, 0xfaec, 0xfaed, 0xfaee, 0xfaef, + 0xfaf0, 0xfaf1, 0xfaf2, 0xfaf3, 0xfaf4, 0xfaf5, 0xfaf6, 0xfaf7, + 0xfaf8, 0xfaf9, 0xfafa, 0xfafb, 0xfafc, 0xfafd, 0xfafe, 0xfaff, + }; + +static const unsigned short gNormalizeTablefb00[] = { + /* U+fb00 */ + 0x0066, 0x0066, 0x0066, 0x0066, 0x0066, 0x0073, 0x0073, 0xfb07, + 0xfb08, 0xfb09, 0xfb0a, 0xfb0b, 0xfb0c, 0xfb0d, 0xfb0e, 0xfb0f, + 0xfb10, 0xfb11, 0xfb12, 0x0574, 0x0574, 0x0574, 0x057e, 0x0574, + 0xfb18, 0xfb19, 0xfb1a, 0xfb1b, 0xfb1c, 0x05d9, 0xfb1e, 0x05f2, + 0x05e2, 0x05d0, 0x05d3, 0x05d4, 0x05db, 0x05dc, 0x05dd, 0x05e8, + 0x05ea, 0x002b, 0x05e9, 0x05e9, 0x05e9, 0x05e9, 0x05d0, 0x05d0, + 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x05d5, 0x05d6, 0xfb37, + 0x05d8, 0x05d9, 0x05da, 0x05db, 0x05dc, 0xfb3d, 0x05de, 0xfb3f, + }; + +static const unsigned short gNormalizeTablefb40[] = { + /* U+fb40 */ + 0x05e0, 0x05e1, 0xfb42, 0x05e3, 0x05e4, 0xfb45, 0x05e6, 0x05e7, + 0x05e8, 0x05e9, 0x05ea, 0x05d5, 0x05d1, 0x05db, 0x05e4, 0x05d0, + 0x0671, 0x0671, 0x067b, 0x067b, 0x067b, 0x067b, 0x067e, 0x067e, + 0x067e, 0x067e, 0x0680, 0x0680, 0x0680, 0x0680, 0x067a, 0x067a, + 0x067a, 0x067a, 0x067f, 0x067f, 0x067f, 0x067f, 0x0679, 0x0679, + 0x0679, 0x0679, 0x06a4, 0x06a4, 0x06a4, 0x06a4, 0x06a6, 0x06a6, + 0x06a6, 0x06a6, 0x0684, 0x0684, 0x0684, 0x0684, 0x0683, 0x0683, + 0x0683, 0x0683, 0x0686, 0x0686, 0x0686, 0x0686, 0x0687, 0x0687, + }; + +static const unsigned short gNormalizeTablefb80[] = { + /* U+fb80 */ + 0x0687, 0x0687, 0x068d, 0x068d, 0x068c, 0x068c, 0x068e, 0x068e, + 0x0688, 0x0688, 0x0698, 0x0698, 0x0691, 0x0691, 0x06a9, 0x06a9, + 0x06a9, 0x06a9, 0x06af, 0x06af, 0x06af, 0x06af, 0x06b3, 0x06b3, + 0x06b3, 0x06b3, 0x06b1, 0x06b1, 0x06b1, 0x06b1, 0x06ba, 0x06ba, + 0x06bb, 0x06bb, 0x06bb, 0x06bb, 0x06d5, 0x06d5, 0x06c1, 0x06c1, + 0x06c1, 0x06c1, 0x06be, 0x06be, 0x06be, 0x06be, 0x06d2, 0x06d2, + 0x06d2, 0x06d2, 0xfbb2, 0xfbb3, 0xfbb4, 0xfbb5, 0xfbb6, 0xfbb7, + 0xfbb8, 0xfbb9, 0xfbba, 0xfbbb, 0xfbbc, 0xfbbd, 0xfbbe, 0xfbbf, + }; + +static const unsigned short gNormalizeTablefbc0[] = { + /* U+fbc0 */ + 0xfbc0, 0xfbc1, 0xfbc2, 0xfbc3, 0xfbc4, 0xfbc5, 0xfbc6, 0xfbc7, + 0xfbc8, 0xfbc9, 0xfbca, 0xfbcb, 0xfbcc, 0xfbcd, 0xfbce, 0xfbcf, + 0xfbd0, 0xfbd1, 0xfbd2, 0x06ad, 0x06ad, 0x06ad, 0x06ad, 0x06c7, + 0x06c7, 0x06c6, 0x06c6, 0x06c8, 0x06c8, 0x06c7, 0x06cb, 0x06cb, + 0x06c5, 0x06c5, 0x06c9, 0x06c9, 0x06d0, 0x06d0, 0x06d0, 0x06d0, + 0x0649, 0x0649, 0x064a, 0x064a, 0x064a, 0x064a, 0x064a, 0x064a, + 0x064a, 0x064a, 0x064a, 0x064a, 0x064a, 0x064a, 0x064a, 0x064a, + 0x064a, 0x064a, 0x064a, 0x064a, 0x06cc, 0x06cc, 0x06cc, 0x06cc, + }; + +static const unsigned short gNormalizeTablefc00[] = { + /* U+fc00 */ + 0x064a, 0x064a, 0x064a, 0x064a, 0x064a, 0x0628, 0x0628, 0x0628, + 0x0628, 0x0628, 0x0628, 0x062a, 0x062a, 0x062a, 0x062a, 0x062a, + 0x062a, 0x062b, 0x062b, 0x062b, 0x062b, 0x062c, 0x062c, 0x062d, + 0x062d, 0x062e, 0x062e, 0x062e, 0x0633, 0x0633, 0x0633, 0x0633, + 0x0635, 0x0635, 0x0636, 0x0636, 0x0636, 0x0636, 0x0637, 0x0637, + 0x0638, 0x0639, 0x0639, 0x063a, 0x063a, 0x0641, 0x0641, 0x0641, + 0x0641, 0x0641, 0x0641, 0x0642, 0x0642, 0x0642, 0x0642, 0x0643, + 0x0643, 0x0643, 0x0643, 0x0643, 0x0643, 0x0643, 0x0643, 0x0644, + }; + +static const unsigned short gNormalizeTablefc40[] = { + /* U+fc40 */ + 0x0644, 0x0644, 0x0644, 0x0644, 0x0644, 0x0645, 0x0645, 0x0645, + 0x0645, 0x0645, 0x0645, 0x0646, 0x0646, 0x0646, 0x0646, 0x0646, + 0x0646, 0x0647, 0x0647, 0x0647, 0x0647, 0x064a, 0x064a, 0x064a, + 0x064a, 0x064a, 0x064a, 0x0630, 0x0631, 0x0649, 0x0020, 0x0020, + 0x0020, 0x0020, 0x0020, 0x0020, 0x064a, 0x064a, 0x064a, 0x064a, + 0x064a, 0x064a, 0x0628, 0x0628, 0x0628, 0x0628, 0x0628, 0x0628, + 0x062a, 0x062a, 0x062a, 0x062a, 0x062a, 0x062a, 0x062b, 0x062b, + 0x062b, 0x062b, 0x062b, 0x062b, 0x0641, 0x0641, 0x0642, 0x0642, + }; + +static const unsigned short gNormalizeTablefc80[] = { + /* U+fc80 */ + 0x0643, 0x0643, 0x0643, 0x0643, 0x0643, 0x0644, 0x0644, 0x0644, + 0x0645, 0x0645, 0x0646, 0x0646, 0x0646, 0x0646, 0x0646, 0x0646, + 0x0649, 0x064a, 0x064a, 0x064a, 0x064a, 0x064a, 0x064a, 0x064a, + 0x064a, 0x064a, 0x064a, 0x064a, 0x0628, 0x0628, 0x0628, 0x0628, + 0x0628, 0x062a, 0x062a, 0x062a, 0x062a, 0x062a, 0x062b, 0x062c, + 0x062c, 0x062d, 0x062d, 0x062e, 0x062e, 0x0633, 0x0633, 0x0633, + 0x0633, 0x0635, 0x0635, 0x0635, 0x0636, 0x0636, 0x0636, 0x0636, + 0x0637, 0x0638, 0x0639, 0x0639, 0x063a, 0x063a, 0x0641, 0x0641, + }; + +static const unsigned short gNormalizeTablefcc0[] = { + /* U+fcc0 */ + 0x0641, 0x0641, 0x0642, 0x0642, 0x0643, 0x0643, 0x0643, 0x0643, + 0x0643, 0x0644, 0x0644, 0x0644, 0x0644, 0x0644, 0x0645, 0x0645, + 0x0645, 0x0645, 0x0646, 0x0646, 0x0646, 0x0646, 0x0646, 0x0647, + 0x0647, 0x0647, 0x064a, 0x064a, 0x064a, 0x064a, 0x064a, 0x064a, + 0x064a, 0x0628, 0x0628, 0x062a, 0x062a, 0x062b, 0x062b, 0x0633, + 0x0633, 0x0634, 0x0634, 0x0643, 0x0643, 0x0644, 0x0646, 0x0646, + 0x064a, 0x064a, 0x0640, 0x0640, 0x0640, 0x0637, 0x0637, 0x0639, + 0x0639, 0x063a, 0x063a, 0x0633, 0x0633, 0x0634, 0x0634, 0x062d, + }; + +static const unsigned short gNormalizeTablefd00[] = { + /* U+fd00 */ + 0x062d, 0x062c, 0x062c, 0x062e, 0x062e, 0x0635, 0x0635, 0x0636, + 0x0636, 0x0634, 0x0634, 0x0634, 0x0634, 0x0634, 0x0633, 0x0635, + 0x0636, 0x0637, 0x0637, 0x0639, 0x0639, 0x063a, 0x063a, 0x0633, + 0x0633, 0x0634, 0x0634, 0x062d, 0x062d, 0x062c, 0x062c, 0x062e, + 0x062e, 0x0635, 0x0635, 0x0636, 0x0636, 0x0634, 0x0634, 0x0634, + 0x0634, 0x0634, 0x0633, 0x0635, 0x0636, 0x0634, 0x0634, 0x0634, + 0x0634, 0x0633, 0x0634, 0x0637, 0x0633, 0x0633, 0x0633, 0x0634, + 0x0634, 0x0634, 0x0637, 0x0638, 0x0627, 0x0627, 0xfd3e, 0xfd3f, + }; + +static const unsigned short gNormalizeTablefd40[] = { + /* U+fd40 */ + 0xfd40, 0xfd41, 0xfd42, 0xfd43, 0xfd44, 0xfd45, 0xfd46, 0xfd47, + 0xfd48, 0xfd49, 0xfd4a, 0xfd4b, 0xfd4c, 0xfd4d, 0xfd4e, 0xfd4f, + 0x062a, 0x062a, 0x062a, 0x062a, 0x062a, 0x062a, 0x062a, 0x062a, + 0x062c, 0x062c, 0x062d, 0x062d, 0x0633, 0x0633, 0x0633, 0x0633, + 0x0633, 0x0633, 0x0633, 0x0633, 0x0635, 0x0635, 0x0635, 0x0634, + 0x0634, 0x0634, 0x0634, 0x0634, 0x0634, 0x0634, 0x0636, 0x0636, + 0x0636, 0x0637, 0x0637, 0x0637, 0x0637, 0x0639, 0x0639, 0x0639, + 0x0639, 0x063a, 0x063a, 0x063a, 0x0641, 0x0641, 0x0642, 0x0642, + }; + +static const unsigned short gNormalizeTablefd80[] = { + /* U+fd80 */ + 0x0644, 0x0644, 0x0644, 0x0644, 0x0644, 0x0644, 0x0644, 0x0644, + 0x0644, 0x0645, 0x0645, 0x0645, 0x0645, 0x0645, 0x0645, 0x0645, + 0xfd90, 0xfd91, 0x0645, 0x0647, 0x0647, 0x0646, 0x0646, 0x0646, + 0x0646, 0x0646, 0x0646, 0x0646, 0x064a, 0x064a, 0x0628, 0x062a, + 0x062a, 0x062a, 0x062a, 0x062a, 0x062a, 0x062c, 0x062c, 0x062c, + 0x0633, 0x0635, 0x0634, 0x0636, 0x0644, 0x0644, 0x064a, 0x064a, + 0x064a, 0x0645, 0x0642, 0x0646, 0x0642, 0x0644, 0x0639, 0x0643, + 0x0646, 0x0645, 0x0644, 0x0643, 0x0644, 0x0646, 0x062c, 0x062d, + }; + +static const unsigned short gNormalizeTablefdc0[] = { + /* U+fdc0 */ + 0x0645, 0x0641, 0x0628, 0x0643, 0x0639, 0x0635, 0x0633, 0x0646, + 0xfdc8, 0xfdc9, 0xfdca, 0xfdcb, 0xfdcc, 0xfdcd, 0xfdce, 0xfdcf, + 0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7, + 0xfdd8, 0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf, + 0xfde0, 0xfde1, 0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7, + 0xfde8, 0xfde9, 0xfdea, 0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef, + 0x0635, 0x0642, 0x0627, 0x0627, 0x0645, 0x0635, 0x0631, 0x0639, + 0x0648, 0x0635, 0x0635, 0x062c, 0x0631, 0xfdfd, 0xfdfe, 0xfdff, + }; + +static const unsigned short gNormalizeTablefe00[] = { + /* U+fe00 */ + 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, + 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, + 0x002c, 0x3001, 0x3002, 0x003a, 0x003b, 0x0021, 0x003f, 0x3016, + 0x3017, 0x002e, 0xfe1a, 0xfe1b, 0xfe1c, 0xfe1d, 0xfe1e, 0xfe1f, + 0xfe20, 0xfe21, 0xfe22, 0xfe23, 0xfe24, 0xfe25, 0xfe26, 0xfe27, + 0xfe28, 0xfe29, 0xfe2a, 0xfe2b, 0xfe2c, 0xfe2d, 0xfe2e, 0xfe2f, + 0x002e, 0x2014, 0x2013, 0x005f, 0x005f, 0x0028, 0x0029, 0x007b, + 0x007d, 0x3014, 0x3015, 0x3010, 0x3011, 0x300a, 0x300b, 0x3008, + }; + +static const unsigned short gNormalizeTablefe40[] = { + /* U+fe40 */ + 0x3009, 0x300c, 0x300d, 0x300e, 0x300f, 0xfe45, 0xfe46, 0x005b, + 0x005d, 0x0020, 0x0020, 0x0020, 0x0020, 0x005f, 0x005f, 0x005f, + 0x002c, 0x3001, 0x002e, 0xfe53, 0x003b, 0x003a, 0x003f, 0x0021, + 0x2014, 0x0028, 0x0029, 0x007b, 0x007d, 0x3014, 0x3015, 0x0023, + 0x0026, 0x002a, 0x002b, 0x002d, 0x003c, 0x003e, 0x003d, 0xfe67, + 0x005c, 0x0024, 0x0025, 0x0040, 0xfe6c, 0xfe6d, 0xfe6e, 0xfe6f, + 0x0020, 0x0640, 0x0020, 0xfe73, 0x0020, 0xfe75, 0x0020, 0x0640, + 0x0020, 0x0640, 0x0020, 0x0640, 0x0020, 0x0640, 0x0020, 0x0640, + }; + +static const unsigned short gNormalizeTablefe80[] = { + /* U+fe80 */ + 0x0621, 0x0627, 0x0627, 0x0627, 0x0627, 0x0648, 0x0648, 0x0627, + 0x0627, 0x064a, 0x064a, 0x064a, 0x064a, 0x0627, 0x0627, 0x0628, + 0x0628, 0x0628, 0x0628, 0x0629, 0x0629, 0x062a, 0x062a, 0x062a, + 0x062a, 0x062b, 0x062b, 0x062b, 0x062b, 0x062c, 0x062c, 0x062c, + 0x062c, 0x062d, 0x062d, 0x062d, 0x062d, 0x062e, 0x062e, 0x062e, + 0x062e, 0x062f, 0x062f, 0x0630, 0x0630, 0x0631, 0x0631, 0x0632, + 0x0632, 0x0633, 0x0633, 0x0633, 0x0633, 0x0634, 0x0634, 0x0634, + 0x0634, 0x0635, 0x0635, 0x0635, 0x0635, 0x0636, 0x0636, 0x0636, + }; + +static const unsigned short gNormalizeTablefec0[] = { + /* U+fec0 */ + 0x0636, 0x0637, 0x0637, 0x0637, 0x0637, 0x0638, 0x0638, 0x0638, + 0x0638, 0x0639, 0x0639, 0x0639, 0x0639, 0x063a, 0x063a, 0x063a, + 0x063a, 0x0641, 0x0641, 0x0641, 0x0641, 0x0642, 0x0642, 0x0642, + 0x0642, 0x0643, 0x0643, 0x0643, 0x0643, 0x0644, 0x0644, 0x0644, + 0x0644, 0x0645, 0x0645, 0x0645, 0x0645, 0x0646, 0x0646, 0x0646, + 0x0646, 0x0647, 0x0647, 0x0647, 0x0647, 0x0648, 0x0648, 0x0649, + 0x0649, 0x064a, 0x064a, 0x064a, 0x064a, 0x0644, 0x0644, 0x0644, + 0x0644, 0x0644, 0x0644, 0x0644, 0x0644, 0xfefd, 0xfefe, 0x0020, + }; + +static const unsigned short gNormalizeTableff00[] = { + /* U+ff00 */ + 0xff00, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, + 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, + 0x0040, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, + 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, + }; + +static const unsigned short gNormalizeTableff40[] = { + /* U+ff40 */ + 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, + 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x2985, + 0x2986, 0x3002, 0x300c, 0x300d, 0x3001, 0x30fb, 0x30f2, 0x30a1, + 0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5, 0x30e7, 0x30c3, + 0x30fc, 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab, 0x30ad, + 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd, + }; + +static const unsigned short gNormalizeTableff80[] = { + /* U+ff80 */ + 0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, + 0x30cd, 0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de, + 0x30df, 0x30e0, 0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, + 0x30ea, 0x30eb, 0x30ec, 0x30ed, 0x30ef, 0x30f3, 0x3099, 0x309a, + 0x0020, 0x1100, 0x1101, 0x11aa, 0x1102, 0x11ac, 0x11ad, 0x1103, + 0x1104, 0x1105, 0x11b0, 0x11b1, 0x11b2, 0x11b3, 0x11b4, 0x11b5, + 0x111a, 0x1106, 0x1107, 0x1108, 0x1121, 0x1109, 0x110a, 0x110b, + 0x110c, 0x110d, 0x110e, 0x110f, 0x1110, 0x1111, 0x1112, 0xffbf, + }; + +static const unsigned short gNormalizeTableffc0[] = { + /* U+ffc0 */ + 0xffc0, 0xffc1, 0x1161, 0x1162, 0x1163, 0x1164, 0x1165, 0x1166, + 0xffc8, 0xffc9, 0x1167, 0x1168, 0x1169, 0x116a, 0x116b, 0x116c, + 0xffd0, 0xffd1, 0x116d, 0x116e, 0x116f, 0x1170, 0x1171, 0x1172, + 0xffd8, 0xffd9, 0x1173, 0x1174, 0x1175, 0xffdd, 0xffde, 0xffdf, + 0x00a2, 0x00a3, 0x00ac, 0x0020, 0x00a6, 0x00a5, 0x20a9, 0xffe7, + 0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25a0, 0x25cb, 0xffef, + 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, + 0x0020, 0xfff9, 0xfffa, 0xfffb, 0xfffc, 0xfffd, 0xfffe, 0xffff, + }; + +static const unsigned short* gNormalizeTable[] = { + 0, gNormalizeTable0040, gNormalizeTable0080, gNormalizeTable00c0, + gNormalizeTable0100, gNormalizeTable0140, gNormalizeTable0180, gNormalizeTable01c0, + gNormalizeTable0200, gNormalizeTable0240, gNormalizeTable0280, gNormalizeTable02c0, + 0, gNormalizeTable0340, gNormalizeTable0380, gNormalizeTable03c0, + gNormalizeTable0400, gNormalizeTable0440, gNormalizeTable0480, gNormalizeTable04c0, + gNormalizeTable0500, gNormalizeTable0540, gNormalizeTable0580, 0, + gNormalizeTable0600, gNormalizeTable0640, 0, gNormalizeTable06c0, + 0, 0, 0, 0, + 0, 0, 0, 0, + gNormalizeTable0900, gNormalizeTable0940, 0, gNormalizeTable09c0, + gNormalizeTable0a00, gNormalizeTable0a40, 0, 0, + 0, gNormalizeTable0b40, gNormalizeTable0b80, gNormalizeTable0bc0, + 0, gNormalizeTable0c40, 0, gNormalizeTable0cc0, + 0, gNormalizeTable0d40, 0, gNormalizeTable0dc0, + gNormalizeTable0e00, 0, gNormalizeTable0e80, gNormalizeTable0ec0, + gNormalizeTable0f00, gNormalizeTable0f40, gNormalizeTable0f80, 0, + gNormalizeTable1000, 0, gNormalizeTable1080, gNormalizeTable10c0, + 0, gNormalizeTable1140, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, gNormalizeTable1780, 0, + gNormalizeTable1800, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + gNormalizeTable1b00, gNormalizeTable1b40, 0, 0, + 0, 0, 0, 0, + gNormalizeTable1d00, gNormalizeTable1d40, gNormalizeTable1d80, 0, + gNormalizeTable1e00, gNormalizeTable1e40, gNormalizeTable1e80, gNormalizeTable1ec0, + gNormalizeTable1f00, gNormalizeTable1f40, gNormalizeTable1f80, gNormalizeTable1fc0, + gNormalizeTable2000, gNormalizeTable2040, gNormalizeTable2080, 0, + gNormalizeTable2100, gNormalizeTable2140, gNormalizeTable2180, gNormalizeTable21c0, + gNormalizeTable2200, gNormalizeTable2240, gNormalizeTable2280, gNormalizeTable22c0, + gNormalizeTable2300, 0, 0, 0, + 0, gNormalizeTable2440, gNormalizeTable2480, gNormalizeTable24c0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + gNormalizeTable2a00, gNormalizeTable2a40, 0, gNormalizeTable2ac0, + 0, 0, 0, 0, + gNormalizeTable2c00, gNormalizeTable2c40, gNormalizeTable2c80, gNormalizeTable2cc0, + 0, gNormalizeTable2d40, 0, 0, + 0, 0, gNormalizeTable2e80, gNormalizeTable2ec0, + gNormalizeTable2f00, gNormalizeTable2f40, gNormalizeTable2f80, gNormalizeTable2fc0, + gNormalizeTable3000, gNormalizeTable3040, gNormalizeTable3080, gNormalizeTable30c0, + gNormalizeTable3100, gNormalizeTable3140, gNormalizeTable3180, 0, + gNormalizeTable3200, gNormalizeTable3240, gNormalizeTable3280, gNormalizeTable32c0, + gNormalizeTable3300, gNormalizeTable3340, gNormalizeTable3380, gNormalizeTable33c0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, gNormalizeTablea640, gNormalizeTablea680, 0, + gNormalizeTablea700, gNormalizeTablea740, gNormalizeTablea780, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + gNormalizeTablef900, gNormalizeTablef940, gNormalizeTablef980, gNormalizeTablef9c0, + gNormalizeTablefa00, gNormalizeTablefa40, gNormalizeTablefa80, gNormalizeTablefac0, + gNormalizeTablefb00, gNormalizeTablefb40, gNormalizeTablefb80, gNormalizeTablefbc0, + gNormalizeTablefc00, gNormalizeTablefc40, gNormalizeTablefc80, gNormalizeTablefcc0, + gNormalizeTablefd00, gNormalizeTablefd40, gNormalizeTablefd80, gNormalizeTablefdc0, + gNormalizeTablefe00, gNormalizeTablefe40, gNormalizeTablefe80, gNormalizeTablefec0, + gNormalizeTableff00, gNormalizeTableff40, gNormalizeTableff80, gNormalizeTableffc0, +}; + +unsigned int normalize_character(const unsigned int c) +{ + if (c >= 0x10000 || !gNormalizeTable[c >> 6]) + return c; + return gNormalizeTable[c >> 6][c & 0x3f]; +} + diff --git a/mailnews/extensions/fts3/src/README.mozilla b/mailnews/extensions/fts3/src/README.mozilla new file mode 100644 index 000000000..0bfe7deb3 --- /dev/null +++ b/mailnews/extensions/fts3/src/README.mozilla @@ -0,0 +1,3 @@ +fts3_porter.c code is from SQLite3. + +This customized tokenizer "mozporter" by Mozilla supports CJK indexing using bi-gram. So you have to use bi-gram search string if you wanto to search CJK character. diff --git a/mailnews/extensions/fts3/src/fts3_porter.c b/mailnews/extensions/fts3/src/fts3_porter.c new file mode 100644 index 000000000..2276244c1 --- /dev/null +++ b/mailnews/extensions/fts3/src/fts3_porter.c @@ -0,0 +1,1150 @@ +/* +** 2006 September 30 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +************************************************************************* +** Implementation of the full-text-search tokenizer that implements +** a Porter stemmer. +** +*/ + +/* + * This file is based on the SQLite FTS3 Porter Stemmer implementation. + * + * This is an attempt to provide some level of full-text search to users of + * Thunderbird who use languages that are not space/punctuation delimited. + * This is accomplished by performing bi-gram indexing of characters fall + * into the unicode space occupied by character sets used in such languages. + * + * Bi-gram indexing means that given the string "12345" we would index the + * pairs "12", "23", "34", and "45" (with position information). We do this + * because we are not sure where the word/semantic boundaries are in that + * string. Then, when a user searches for "234" the FTS3 engine tokenizes the + * search query into "23" and "34". Using special phrase-logic FTS3 requires + * the matches to have the tokens "23" and "34" adjacent to each other and in + * that order. In theory if the user searched for "2345" we we could just + * search for "23 NEAR/2 34". Unfortunately, NEAR does not imply ordering, + * so even though that would be more efficient, we would lose correctness + * and cannot do it. + * + * The efficiency and usability of bi-gram search assumes that the character + * space is large enough and actually observed bi-grams sufficiently + * distributed throughout the potential space so that the search bi-grams + * generated when the user issues a query find a 'reasonable' number of + * documents for each bi-gram match. + * + * Mozilla contributors: + * Makoto Kato + * Andrew Sutherland + */ + +/* +** The code in this file is only compiled if: +** +** * The FTS3 module is being built as an extension +** (in which case SQLITE_CORE is not defined), or +** +** * The FTS3 module is being built into the core of +** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). +*/ +#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) + + +#include +#include +#include +#include +#include + +#include "fts3_tokenizer.h" + +/* need some defined to compile without sqlite3 code */ + +#define sqlite3_malloc malloc +#define sqlite3_free free +#define sqlite3_realloc realloc + +static const unsigned char sqlite3Utf8Trans1[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00, +}; + +typedef unsigned char u8; + +/** + * SQLite helper macro from sqlite3.c (really utf.c) to encode a unicode + * character into utf8. + * + * @param zOut A pointer to the current write position that is updated by + * the routine. At entry it should point to one-past the last valid + * encoded byte. The same holds true at exit. + * @param c The character to encode; this should be an unsigned int. + */ +#define WRITE_UTF8(zOut, c) { \ + if( c<0x0080 ){ \ + *zOut++ = (u8)(c&0xff); \ + } \ + else if( c<0x0800 ){ \ + *zOut++ = 0xC0 + (u8)((c>>6) & 0x1F); \ + *zOut++ = 0x80 + (u8)(c & 0x3F); \ + } \ + else if( c<0x10000 ){ \ + *zOut++ = 0xE0 + (u8)((c>>12) & 0x0F); \ + *zOut++ = 0x80 + (u8)((c>>6) & 0x3F); \ + *zOut++ = 0x80 + (u8)(c & 0x3F); \ + }else{ \ + *zOut++ = 0xf0 + (u8)((c>>18) & 0x07); \ + *zOut++ = 0x80 + (u8)((c>>12) & 0x3F); \ + *zOut++ = 0x80 + (u8)((c>>6) & 0x3F); \ + *zOut++ = 0x80 + (u8)(c & 0x3F); \ + } \ +} + +/** + * Fudge factor to avoid buffer overwrites when WRITE_UTF8 is involved. + * + * Our normalization table includes entries that may result in a larger + * utf-8 encoding. Namely, 023a maps to 2c65. This is a growth from 2 bytes + * as utf-8 encoded to 3 bytes. This is currently the only transition possible + * because 1-byte encodings are known to stay 1-byte and our normalization + * table is 16-bit and so can't generate a 4-byte encoded output. + * + * For simplicity, we just multiple by 2 which covers the current case and + * potential growth for 2-byte to 4-byte growth. We can afford to do this + * because we're not talking about a lot of memory here as a rule. + */ +#define MAX_UTF8_GROWTH_FACTOR 2 + +/** + * Helper from sqlite3.c to read a single UTF8 character. + * + * The clever bit with multi-byte reading is that you keep going until you find + * a byte whose top bits are not '10'. A single-byte UTF8 character will have + * '00' or '01', and a multi-byte UTF8 character must start with '11'. + * + * In the event of illegal UTF-8 this macro may read an arbitrary number of + * characters but will never read past zTerm. The resulting character value + * of illegal UTF-8 can be anything, although efforts are made to return the + * illegal character (0xfffd) for UTF-16 surrogates. + * + * @param zIn A pointer to the current position that is updated by the routine, + * pointing at the start of the next character when the routine returns. + * @param zTerm A pointer one past the end of the buffer. + * @param c The 'unsigned int' to hold the resulting character value. Do not + * use a short or a char. + */ +#define READ_UTF8(zIn, zTerm, c) { \ + c = *(zIn++); \ + if( c>=0xc0 ){ \ + c = sqlite3Utf8Trans1[c-0xc0]; \ + while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){ \ + c = (c<<6) + (0x3f & *(zIn++)); \ + } \ + if( c<0x80 \ + || (c&0xFFFFF800)==0xD800 \ + || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \ + } \ +} + +/* end of compatible block to complie codes */ + +/* +** Class derived from sqlite3_tokenizer +*/ +typedef struct porter_tokenizer { + sqlite3_tokenizer base; /* Base class */ +} porter_tokenizer; + +/* +** Class derived from sqlit3_tokenizer_cursor +*/ +typedef struct porter_tokenizer_cursor { + sqlite3_tokenizer_cursor base; + const char *zInput; /* input we are tokenizing */ + int nInput; /* size of the input */ + int iOffset; /* current position in zInput */ + int iToken; /* index of next token to be returned */ + unsigned char *zToken; /* storage for current token */ + int nAllocated; /* space allocated to zToken buffer */ + /** + * Store the offset of the second character in the bi-gram pair that we just + * emitted so that we can consider it being the first character in a bi-gram + * pair. + * The value 0 indicates that there is no previous such character. This is + * an acceptable sentinel value because the 0th offset can never be the + * offset of the second in a bi-gram pair. + * + * For example, let us say we are tokenizing a string of 4 CJK characters + * represented by the byte-string "11223344" where each repeated digit + * indicates 2-bytes of storage used to encode the character in UTF-8. + * (It actually takes 3, btw.) Then on the passes to emit each token, + * the iOffset and iPrevGigramOffset values at entry will be: + * + * 1122: iOffset = 0, iPrevBigramOffset = 0 + * 2233: iOffset = 4, iPrevBigramOffset = 2 + * 3344: iOffset = 6, iPrevBigramOffset = 4 + * (nothing will be emitted): iOffset = 8, iPrevBigramOffset = 6 + */ + int iPrevBigramOffset; /* previous result was bi-gram */ +} porter_tokenizer_cursor; + + +/* Forward declaration */ +static const sqlite3_tokenizer_module porterTokenizerModule; + +/* from normalize.c */ +extern unsigned int normalize_character(const unsigned int c); + +/* +** Create a new tokenizer instance. +*/ +static int porterCreate( + int argc, const char * const *argv, + sqlite3_tokenizer **ppTokenizer +){ + porter_tokenizer *t; + t = (porter_tokenizer *) sqlite3_malloc(sizeof(*t)); + if( t==NULL ) return SQLITE_NOMEM; + memset(t, 0, sizeof(*t)); + *ppTokenizer = &t->base; + return SQLITE_OK; +} + +/* +** Destroy a tokenizer +*/ +static int porterDestroy(sqlite3_tokenizer *pTokenizer){ + sqlite3_free(pTokenizer); + return SQLITE_OK; +} + +/* +** Prepare to begin tokenizing a particular string. The input +** string to be tokenized is zInput[0..nInput-1]. A cursor +** used to incrementally tokenize this string is returned in +** *ppCursor. +*/ +static int porterOpen( + sqlite3_tokenizer *pTokenizer, /* The tokenizer */ + const char *zInput, int nInput, /* String to be tokenized */ + sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ +){ + porter_tokenizer_cursor *c; + + c = (porter_tokenizer_cursor *) sqlite3_malloc(sizeof(*c)); + if( c==NULL ) return SQLITE_NOMEM; + + c->zInput = zInput; + if( zInput==0 ){ + c->nInput = 0; + }else if( nInput<0 ){ + c->nInput = (int)strlen(zInput); + }else{ + c->nInput = nInput; + } + c->iOffset = 0; /* start tokenizing at the beginning */ + c->iToken = 0; + c->zToken = NULL; /* no space allocated, yet. */ + c->nAllocated = 0; + c->iPrevBigramOffset = 0; + + *ppCursor = &c->base; + return SQLITE_OK; +} + +/* +** Close a tokenization cursor previously opened by a call to +** porterOpen() above. +*/ +static int porterClose(sqlite3_tokenizer_cursor *pCursor){ + porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor; + sqlite3_free(c->zToken); + sqlite3_free(c); + return SQLITE_OK; +} +/* +** Vowel or consonant +*/ +static const char cType[] = { + 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, + 1, 1, 1, 2, 1 +}; + +/* +** isConsonant() and isVowel() determine if their first character in +** the string they point to is a consonant or a vowel, according +** to Porter ruls. +** +** A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'. +** 'Y' is a consonant unless it follows another consonant, +** in which case it is a vowel. +** +** In these routine, the letters are in reverse order. So the 'y' rule +** is that 'y' is a consonant unless it is followed by another +** consonent. +*/ +static int isVowel(const char*); +static int isConsonant(const char *z){ + int j; + char x = *z; + if( x==0 ) return 0; + assert( x>='a' && x<='z' ); + j = cType[x-'a']; + if( j<2 ) return j; + return z[1]==0 || isVowel(z + 1); +} +static int isVowel(const char *z){ + int j; + char x = *z; + if( x==0 ) return 0; + assert( x>='a' && x<='z' ); + j = cType[x-'a']; + if( j<2 ) return 1-j; + return isConsonant(z + 1); +} + +/* +** Let any sequence of one or more vowels be represented by V and let +** C be sequence of one or more consonants. Then every word can be +** represented as: +** +** [C] (VC){m} [V] +** +** In prose: A word is an optional consonant followed by zero or +** vowel-consonant pairs followed by an optional vowel. "m" is the +** number of vowel consonant pairs. This routine computes the value +** of m for the first i bytes of a word. +** +** Return true if the m-value for z is 1 or more. In other words, +** return true if z contains at least one vowel that is followed +** by a consonant. +** +** In this routine z[] is in reverse order. So we are really looking +** for an instance of of a consonant followed by a vowel. +*/ +static int m_gt_0(const char *z){ + while( isVowel(z) ){ z++; } + if( *z==0 ) return 0; + while( isConsonant(z) ){ z++; } + return *z!=0; +} + +/* Like mgt0 above except we are looking for a value of m which is +** exactly 1 +*/ +static int m_eq_1(const char *z){ + while( isVowel(z) ){ z++; } + if( *z==0 ) return 0; + while( isConsonant(z) ){ z++; } + if( *z==0 ) return 0; + while( isVowel(z) ){ z++; } + if( *z==0 ) return 1; + while( isConsonant(z) ){ z++; } + return *z==0; +} + +/* Like mgt0 above except we are looking for a value of m>1 instead +** or m>0 +*/ +static int m_gt_1(const char *z){ + while( isVowel(z) ){ z++; } + if( *z==0 ) return 0; + while( isConsonant(z) ){ z++; } + if( *z==0 ) return 0; + while( isVowel(z) ){ z++; } + if( *z==0 ) return 0; + while( isConsonant(z) ){ z++; } + return *z!=0; +} + +/* +** Return TRUE if there is a vowel anywhere within z[0..n-1] +*/ +static int hasVowel(const char *z){ + while( isConsonant(z) ){ z++; } + return *z!=0; +} + +/* +** Return TRUE if the word ends in a double consonant. +** +** The text is reversed here. So we are really looking at +** the first two characters of z[]. +*/ +static int doubleConsonant(const char *z){ + return isConsonant(z) && z[0]==z[1] && isConsonant(z+1); +} + +/* +** Return TRUE if the word ends with three letters which +** are consonant-vowel-consonent and where the final consonant +** is not 'w', 'x', or 'y'. +** +** The word is reversed here. So we are really checking the +** first three letters and the first one cannot be in [wxy]. +*/ +static int star_oh(const char *z){ + return + z[0]!=0 && isConsonant(z) && + z[0]!='w' && z[0]!='x' && z[0]!='y' && + z[1]!=0 && isVowel(z+1) && + z[2]!=0 && isConsonant(z+2); +} + +/* +** If the word ends with zFrom and xCond() is true for the stem +** of the word that preceeds the zFrom ending, then change the +** ending to zTo. +** +** The input word *pz and zFrom are both in reverse order. zTo +** is in normal order. +** +** Return TRUE if zFrom matches. Return FALSE if zFrom does not +** match. Not that TRUE is returned even if xCond() fails and +** no substitution occurs. +*/ +static int stem( + char **pz, /* The word being stemmed (Reversed) */ + const char *zFrom, /* If the ending matches this... (Reversed) */ + const char *zTo, /* ... change the ending to this (not reversed) */ + int (*xCond)(const char*) /* Condition that must be true */ +){ + char *z = *pz; + while( *zFrom && *zFrom==*z ){ z++; zFrom++; } + if( *zFrom!=0 ) return 0; + if( xCond && !xCond(z) ) return 1; + while( *zTo ){ + *(--z) = *(zTo++); + } + *pz = z; + return 1; +} + +/** + * Voiced sound mark is only on Japanese. It is like accent. It combines with + * previous character. Example, "サ" (Katakana) with "゛" (voiced sound mark) is + * "ザ". Although full-width character mapping has combined character like "ザ", + * there is no combined character on half-width Katanaka character mapping. + */ +static int isVoicedSoundMark(const unsigned int c) +{ + if (c == 0xff9e || c == 0xff9f || c == 0x3099 || c == 0x309a) + return 1; + return 0; +} + +/** + * How many unicode characters to take from the front and back of a term in + * |copy_stemmer|. + */ +#define COPY_STEMMER_COPY_HALF_LEN 10 + +/** + * Normalizing but non-stemming term copying. + * + * The original function would take 10 bytes from the front and 10 bytes from + * the back if there were no digits in the string and it was more than 20 + * bytes long. If there were digits involved that would decrease to 3 bytes + * from the front and 3 from the back. This would potentially corrupt utf-8 + * encoded characters, which is fine from the perspective of the FTS3 logic. + * + * In our revised form we now operate on a unicode character basis rather than + * a byte basis. Additionally we use the same length limit even if there are + * digits involved because it's not clear digit token-space reduction is saving + * us from anything and could be hurting. Specifically, if no one is ever + * going to search on things with digits, then we should just remove them. + * Right now, the space reduction is going to increase false positives when + * people do search on them and increase the number of collisions sufficiently + * to make it really expensive. The caveat is there will be some increase in + * index size which could be meaningful if people are receiving lots of emails + * full of distinct numbers. + * + * In order to do the copy-from-the-front and copy-from-the-back trick, once + * we reach N characters in, we set zFrontEnd to the current value of zOut + * (which represents the termination of the first part of the result string) + * and set zBackStart to the value of zOutStart. We then advanced zBackStart + * along a character at a time as we write more characters. Once we have + * traversed the entire string, if zBackStart > zFrontEnd, then we know + * the string should be shrunk using the characters in the two ranges. + * + * (It would be faster to scan from the back with specialized logic but that + * particular logic seems easy to screw up and we don't have unit tests in here + * to the extent required.) + * + * @param zIn Input string to normalize and potentially shrink. + * @param nBytesIn The number of bytes in zIn, distinct from the number of + * unicode characters encoded in zIn. + * @param zOut The string to write our output into. This must have at least + * nBytesIn * MAX_UTF8_GROWTH_FACTOR in order to compensate for + * normalization that results in a larger utf-8 encoding. + * @param pnBytesOut Integer to write the number of bytes in zOut into. + */ +static void copy_stemmer(const unsigned char *zIn, const int nBytesIn, + unsigned char *zOut, int *pnBytesOut){ + const unsigned char *zInTerm = zIn + nBytesIn; + unsigned char *zOutStart = zOut; + unsigned int c; + unsigned int charCount = 0; + unsigned char *zFrontEnd = NULL, *zBackStart = NULL; + unsigned int trashC; + + /* copy normalized character */ + while (zIn < zInTerm) { + READ_UTF8(zIn, zInTerm, c); + c = normalize_character(c); + + /* ignore voiced/semi-voiced sound mark */ + if (!isVoicedSoundMark(c)) { + /* advance one non-voiced sound mark character. */ + if (zBackStart) + READ_UTF8(zBackStart, zOut, trashC); + + WRITE_UTF8(zOut, c); + charCount++; + if (charCount == COPY_STEMMER_COPY_HALF_LEN) { + zFrontEnd = zOut; + zBackStart = zOutStart; + } + } + } + + /* if we need to shrink the string, transplant the back bytes */ + if (zBackStart > zFrontEnd) { /* this handles when both are null too */ + size_t backBytes = zOut - zBackStart; + memmove(zFrontEnd, zBackStart, backBytes); + zOut = zFrontEnd + backBytes; + } + *zOut = 0; + *pnBytesOut = zOut - zOutStart; +} + + +/* +** Stem the input word zIn[0..nIn-1]. Store the output in zOut. +** zOut is at least big enough to hold nIn bytes. Write the actual +** size of the output word (exclusive of the '\0' terminator) into *pnOut. +** +** Any upper-case characters in the US-ASCII character set ([A-Z]) +** are converted to lower case. Upper-case UTF characters are +** unchanged. +** +** Words that are longer than about 20 bytes are stemmed by retaining +** a few bytes from the beginning and the end of the word. If the +** word contains digits, 3 bytes are taken from the beginning and +** 3 bytes from the end. For long words without digits, 10 bytes +** are taken from each end. US-ASCII case folding still applies. +** +** If the input word contains not digits but does characters not +** in [a-zA-Z] then no stemming is attempted and this routine just +** copies the input into the input into the output with US-ASCII +** case folding. +** +** Stemming never increases the length of the word. So there is +** no chance of overflowing the zOut buffer. +*/ +static void porter_stemmer( + const unsigned char *zIn, + unsigned int nIn, + unsigned char *zOut, + int *pnOut +){ + unsigned int i, j, c; + char zReverse[28]; + char *z, *z2; + const unsigned char *zTerm = zIn + nIn; + const unsigned char *zTmp = zIn; + + if( nIn<3 || nIn>=sizeof(zReverse)-7 ){ + /* The word is too big or too small for the porter stemmer. + ** Fallback to the copy stemmer */ + copy_stemmer(zIn, nIn, zOut, pnOut); + return; + } + for (j = sizeof(zReverse) - 6; zTmp < zTerm; j--) { + READ_UTF8(zTmp, zTerm, c); + c = normalize_character(c); + if( c>='a' && c<='z' ){ + zReverse[j] = c; + }else{ + /* The use of a character not in [a-zA-Z] means that we fallback + ** to the copy stemmer */ + copy_stemmer(zIn, nIn, zOut, pnOut); + return; + } + } + memset(&zReverse[sizeof(zReverse)-5], 0, 5); + z = &zReverse[j+1]; + + + /* Step 1a */ + if( z[0]=='s' ){ + if( + !stem(&z, "sess", "ss", 0) && + !stem(&z, "sei", "i", 0) && + !stem(&z, "ss", "ss", 0) + ){ + z++; + } + } + + /* Step 1b */ + z2 = z; + if( stem(&z, "dee", "ee", m_gt_0) ){ + /* Do nothing. The work was all in the test */ + }else if( + (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel)) + && z!=z2 + ){ + if( stem(&z, "ta", "ate", 0) || + stem(&z, "lb", "ble", 0) || + stem(&z, "zi", "ize", 0) ){ + /* Do nothing. The work was all in the test */ + }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){ + z++; + }else if( m_eq_1(z) && star_oh(z) ){ + *(--z) = 'e'; + } + } + + /* Step 1c */ + if( z[0]=='y' && hasVowel(z+1) ){ + z[0] = 'i'; + } + + /* Step 2 */ + switch( z[1] ){ + case 'a': + (void) (stem(&z, "lanoita", "ate", m_gt_0) || + stem(&z, "lanoit", "tion", m_gt_0)); + break; + case 'c': + (void) (stem(&z, "icne", "ence", m_gt_0) || + stem(&z, "icna", "ance", m_gt_0)); + break; + case 'e': + (void) (stem(&z, "rezi", "ize", m_gt_0)); + break; + case 'g': + (void) (stem(&z, "igol", "log", m_gt_0)); + break; + case 'l': + (void) (stem(&z, "ilb", "ble", m_gt_0) || + stem(&z, "illa", "al", m_gt_0) || + stem(&z, "iltne", "ent", m_gt_0) || + stem(&z, "ile", "e", m_gt_0) || + stem(&z, "ilsuo", "ous", m_gt_0)); + break; + case 'o': + (void) (stem(&z, "noitazi", "ize", m_gt_0) || + stem(&z, "noita", "ate", m_gt_0) || + stem(&z, "rota", "ate", m_gt_0)); + break; + case 's': + (void) (stem(&z, "msila", "al", m_gt_0) || + stem(&z, "ssenevi", "ive", m_gt_0) || + stem(&z, "ssenluf", "ful", m_gt_0) || + stem(&z, "ssensuo", "ous", m_gt_0)); + break; + case 't': + (void) (stem(&z, "itila", "al", m_gt_0) || + stem(&z, "itivi", "ive", m_gt_0) || + stem(&z, "itilib", "ble", m_gt_0)); + break; + } + + /* Step 3 */ + switch( z[0] ){ + case 'e': + (void) (stem(&z, "etaci", "ic", m_gt_0) || + stem(&z, "evita", "", m_gt_0) || + stem(&z, "ezila", "al", m_gt_0)); + break; + case 'i': + (void) (stem(&z, "itici", "ic", m_gt_0)); + break; + case 'l': + (void) (stem(&z, "laci", "ic", m_gt_0) || + stem(&z, "luf", "", m_gt_0)); + break; + case 's': + (void) (stem(&z, "ssen", "", m_gt_0)); + break; + } + + /* Step 4 */ + switch( z[1] ){ + case 'a': + if( z[0]=='l' && m_gt_1(z+2) ){ + z += 2; + } + break; + case 'c': + if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e') && m_gt_1(z+4) ){ + z += 4; + } + break; + case 'e': + if( z[0]=='r' && m_gt_1(z+2) ){ + z += 2; + } + break; + case 'i': + if( z[0]=='c' && m_gt_1(z+2) ){ + z += 2; + } + break; + case 'l': + if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){ + z += 4; + } + break; + case 'n': + if( z[0]=='t' ){ + if( z[2]=='a' ){ + if( m_gt_1(z+3) ){ + z += 3; + } + }else if( z[2]=='e' ){ + (void) (stem(&z, "tneme", "", m_gt_1) || + stem(&z, "tnem", "", m_gt_1) || + stem(&z, "tne", "", m_gt_1)); + } + } + break; + case 'o': + if( z[0]=='u' ){ + if( m_gt_1(z+2) ){ + z += 2; + } + }else if( z[3]=='s' || z[3]=='t' ){ + (void) (stem(&z, "noi", "", m_gt_1)); + } + break; + case 's': + if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){ + z += 3; + } + break; + case 't': + (void) (stem(&z, "eta", "", m_gt_1) || + stem(&z, "iti", "", m_gt_1)); + break; + case 'u': + if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){ + z += 3; + } + break; + case 'v': + case 'z': + if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){ + z += 3; + } + break; + } + + /* Step 5a */ + if( z[0]=='e' ){ + if( m_gt_1(z+1) ){ + z++; + }else if( m_eq_1(z+1) && !star_oh(z+1) ){ + z++; + } + } + + /* Step 5b */ + if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){ + z++; + } + + /* z[] is now the stemmed word in reverse order. Flip it back + ** around into forward order and return. + */ + *pnOut = i = strlen(z); + zOut[i] = 0; + while( *z ){ + zOut[--i] = *(z++); + } +} + +/** + * Indicate whether characters in the 0x30 - 0x7f region can be part of a token. + * Letters and numbers can; punctuation (and 'del') can't. + */ +static const char porterIdChar[] = { +/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */ + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */ + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */ +}; + +/** + * Test whether a character is a (non-ascii) space character or not. isDelim + * uses the existing porter stemmer logic for anything in the ASCII (< 0x80) + * space which covers 0x20. + * + * 0x2000-0x206F is the general punctuation table. 0x2000 - 0x200b are spaces. + * The spaces 0x2000 - 0x200a are all defined as roughly equivalent to a + * standard 0x20 space. 0x200b is a "zero width space" (ZWSP) and not like an + * 0x20 space. 0x202f is a narrow no-break space and roughly equivalent to an + * 0x20 space. 0x205f is a "medium mathematical space" and defined as roughly + * equivalent to an 0x20 space. + */ +#define IS_UNI_SPACE(x) (((x)>=0x2000&&(x)<=0x200a) || (x)==0x202f || (x)==0x205f) +/** + * What we are checking for: + * - 0x3001: Ideographic comma (-> 0x2c ',') + * - 0x3002: Ideographic full stop (-> 0x2e '.') + * - 0xff0c: fullwidth comma (~ wide 0x2c ',') + * - 0xff0e: fullwidth full stop (~ wide 0x2e '.') + * - 0xff61: halfwidth ideographic full stop (~ narrow 0x3002) + * - 0xff64: halfwidth ideographic comma (~ narrow 0x3001) + * + * It is possible we should be treating other things as delimiters! + */ +#define IS_JA_DELIM(x) (((x)==0x3001)||((x)==0xFF64)||((x)==0xFF0E)||((x)==0x3002)||((x)==0xFF61)||((x)==0xFF0C)) + +/** + * The previous character was a delimeter (which includes the start of the + * string). + */ +#define BIGRAM_RESET 0 +/** + * The previous character was a CJK character and we have only seen one of them. + * If we had seen more than one in a row it would be the BIGRAM_USE state. + */ +#define BIGRAM_UNKNOWN 1 +/** + * We have seen two or more CJK characters in a row. + */ +#define BIGRAM_USE 2 +/** + * The previous character was ASCII or something in the unicode general scripts + * area that we do not believe is a delimeter. We call it 'alpha' as in + * alphabetic/alphanumeric and something that should be tokenized based on + * delimiters rather than on a bi-gram basis. + */ +#define BIGRAM_ALPHA 3 + +static int isDelim( + const unsigned char *zCur, /* IN: current pointer of token */ + const unsigned char *zTerm, /* IN: one character beyond end of token */ + int *len, /* OUT: analyzed bytes in this token */ + int *state /* IN/OUT: analyze state */ +){ + const unsigned char *zIn = zCur; + unsigned int c; + int delim; + + /* get the unicode character to analyze */ + READ_UTF8(zIn, zTerm, c); + c = normalize_character(c); + *len = zIn - zCur; + + /* ASCII character range has rule */ + if( c < 0x80 ){ + // This is original porter stemmer isDelim logic. + // 0x0 - 0x1f are all control characters, 0x20 is space, 0x21-0x2f are + // punctuation. + delim = (c < 0x30 || !porterIdChar[c - 0x30]); + // cases: "&a", "&." + if (*state == BIGRAM_USE || *state == BIGRAM_UNKNOWN ){ + /* previous maybe CJK and current is ascii */ + *state = BIGRAM_ALPHA; /*ascii*/ + delim = 1; /* must break */ + } else if (delim == 1) { + // cases: "a.", ".." + /* this is delimiter character */ + *state = BIGRAM_RESET; /*reset*/ + } else { + // cases: "aa", ".a" + *state = BIGRAM_ALPHA; /*ascii*/ + } + return delim; + } + + // (at this point we must be a non-ASCII character) + + /* voiced/semi-voiced sound mark is ignore */ + if (isVoicedSoundMark(c) && *state != BIGRAM_ALPHA) { + /* ignore this because it is combined with previous char */ + return 0; + } + + /* this isn't CJK range, so return as no delim */ + // Anything less than 0x2000 (except to U+0E00-U+0EFF and U+1780-U+17FF) + // is the general scripts area and should not be bi-gram indexed. + // 0xa000 - 0a4cf is the Yi area. It is apparently a phonetic language whose + // usage does not appear to have simple delimeter rules, so we're leaving it + // as bigram processed. This is a guess, if you know better, let us know. + // (We previously bailed on this range too.) + // Addition, U+0E00-U+0E7F is Thai, U+0E80-U+0EFF is Laos, + // and U+1780-U+17FF is Khmer. It is no easy way to break each word. + // So these should use bi-gram too. + // cases: "aa", ".a", "&a" + if (c < 0xe00 || + (c >= 0xf00 && c < 0x1780) || + (c >= 0x1800 && c < 0x2000)) { + *state = BIGRAM_ALPHA; /* not really ASCII but same idea; tokenize it */ + return 0; + } + + // (at this point we must be a bi-grammable char or delimiter) + + /* this is space character or delim character */ + // cases: "a.", "..", "&." + if( IS_UNI_SPACE(c) || IS_JA_DELIM(c) ){ + *state = BIGRAM_RESET; /* reset */ + return 1; /* it actually is a delimiter; report as such */ + } + + // (at this point we must be a bi-grammable char) + + // cases: "a&" + if( *state==BIGRAM_ALPHA ){ + /* Previous is ascii and current maybe CJK */ + *state = BIGRAM_UNKNOWN; /* mark as unknown */ + return 1; /* break to emit the ASCII token*/ + } + + /* We have no rule for CJK!. use bi-gram */ + // cases: "&&" + if( *state==BIGRAM_UNKNOWN || *state==BIGRAM_USE ){ + /* previous state is unknown. mark as bi-gram */ + *state = BIGRAM_USE; + return 1; /* break to emit the digram */ + } + + // cases: ".&" (*state == BIGRAM_RESET) + *state = BIGRAM_UNKNOWN; /* mark as unknown */ + return 0; /* no need to break; nothing to emit */ +} + +/** + * Generate a new token. There are basically three types of token we can + * generate: + * - A porter stemmed token. This is a word entirely comprised of ASCII + * characters. We run the porter stemmer algorithm against the word. + * Because we have no way to know what is and is not an English word + * (the only language for which the porter stemmer was designed), this + * could theoretically map multiple words that are not variations of the + * same word down to the same root, resulting in potentially unexpected + * result inclusions in the search results. We accept this result because + * there's not a lot we can do about it and false positives are much + * better than false negatives. + * - A copied token; case/accent-folded but not stemmed. We call the porter + * stemmer for all non-CJK cases and it diverts to the copy stemmer if it + * sees any non-ASCII characters (after folding) or if the string is too + * long. The copy stemmer will shrink the string if it is deemed too long. + * - A bi-gram token; two CJK-ish characters. For query reasons we generate a + * series of overlapping bi-grams. (We can't require the user to start their + * search based on the arbitrary context of the indexed documents.) + * + * It may be useful to think of this function as operating at the points between + * characters. While we are considering the 'current' character (the one after + * the 'point'), we are also interested in the 'previous' character (the one + * preceding the point). + * At any 'point', there are a number of possible situations which I will + * illustrate with pairs of characters. 'a' means alphanumeric ASCII or a + * non-ASCII character that is not bi-grammable or a delimeter, '.' + * means a delimiter (space or punctuation), '&' means a bi-grammable + * character. + * - aa: We are in the midst of a token. State remains BIGRAM_ALPHA. + * - a.: We will generate a porter stemmed or copied token. State was + * BIGRAM_ALPHA, gets set to BIGRAM_RESET. + * - a&: We will generate a porter stemmed or copied token; we will set our + * state to BIGRAM_UNKNOWN to indicate we have seen one bigram character + * but that it is not yet time to emit a bigram. + * - .a: We are starting a token. State was BIGRAM_RESET, gets set to + * BIGRAM_ALPHA. + * - ..: We skip/eat the delimeters. State stays BIGRAM_RESET. + * - .&: State set to BIGRAM_UNKNOWN to indicate we have seen one bigram char. + * - &a: If the state was BIGRAM_USE, we generate a bi-gram token. If the state + * was BIGRAM_UNKNOWN we had only seen one CJK character and so don't do + * anything. State is set to BIGRAM_ALPHA. + * - &.: Same as the "&a" case, but state is set to BIGRAM_RESET. + * - &&: We will generate a bi-gram token. State was either BIGRAM_UNKNOWN or + * BIGRAM_USE, gets set to BIGRAM_USE. + */ +static int porterNext( + sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */ + const char **pzToken, /* OUT: *pzToken is the token text */ + int *pnBytes, /* OUT: Number of bytes in token */ + int *piStartOffset, /* OUT: Starting offset of token */ + int *piEndOffset, /* OUT: Ending offset of token */ + int *piPosition /* OUT: Position integer of token */ +){ + porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor; + const unsigned char *z = (unsigned char *) c->zInput; + int len = 0; + int state; + + while( c->iOffset < c->nInput ){ + int iStartOffset, numChars; + + /* + * This loop basically has two modes of operation: + * - general processing (iPrevBigramOffset == 0 here) + * - CJK processing (iPrevBigramOffset != 0 here) + * + * In an general processing pass we skip over all the delimiters, leaving us + * at a character that promises to produce a token. This could be a CJK + * token (state == BIGRAM_USE) or an ALPHA token (state == BIGRAM_ALPHA). + * If it was a CJK token, we transition into CJK state for the next loop. + * If it was an alpha token, our current offset is pointing at a delimiter + * (which could be a CJK character), so it is good that our next pass + * through the function and loop will skip over any delimiters. If the + * delimiter we hit was a CJK character, the next time through we will + * not treat it as a delimiter though; the entry state for that scan is + * BIGRAM_RESET so the transition is not treated as a delimiter! + * + * The CJK pass always starts with the second character in a bi-gram emitted + * as a token in the previous step. No delimiter skipping is required + * because we know that first character might produce a token for us. It + * only 'might' produce a token because the previous pass performed no + * lookahead and cannot be sure it is followed by another CJK character. + * This is why + */ + + // If we have a previous bigram offset + if (c->iPrevBigramOffset == 0) { + /* Scan past delimiter characters */ + state = BIGRAM_RESET; /* reset */ + while (c->iOffset < c->nInput && + isDelim(z + c->iOffset, z + c->nInput, &len, &state)) { + c->iOffset += len; + } + + } else { + /* for bigram indexing, use previous offset */ + c->iOffset = c->iPrevBigramOffset; + } + + /* Count non-delimiter characters. */ + iStartOffset = c->iOffset; + numChars = 0; + + // Start from a reset state. This means the first character we see + // (which will not be a delimiter) determines which of ALPHA or CJK modes + // we are operating in. (It won't be a delimiter because in a 'general' + // pass as defined above, we will have eaten all the delimiters, and in + // a CJK pass we are guaranteed that the first character is CJK.) + state = BIGRAM_RESET; /* state is reset */ + // Advance until it is time to emit a token. + // For ALPHA characters, this means advancing until we encounter a delimiter + // or a CJK character. iOffset will be pointing at the delimiter or CJK + // character, aka one beyond the last ALPHA character. + // For CJK characters this means advancing until we encounter an ALPHA + // character, a delimiter, or we have seen two consecutive CJK + // characters. iOffset points at the ALPHA/delimiter in the first 2 cases + // and the second of two CJK characters in the last case. + // Because of the way this loop is structured, iOffset is only updated + // when we don't terminate. However, if we terminate, len still contains + // the number of bytes in the character found at iOffset. (This is useful + // in the CJK case.) + while (c->iOffset < c->nInput && + !isDelim(z + c->iOffset, z + c->nInput, &len, &state)) { + c->iOffset += len; + numChars++; + } + + if (state == BIGRAM_USE) { + /* Split word by bigram */ + // Right now iOffset is pointing at the second character in a pair. + // Save this offset so next-time through we start with that as the + // first character. + c->iPrevBigramOffset = c->iOffset; + // And now advance so that iOffset is pointing at the character after + // the second character in the bi-gram pair. Also count the char. + c->iOffset += len; + numChars++; + } else { + /* Reset bigram offset */ + c->iPrevBigramOffset = 0; + } + + /* We emit a token if: + * - there are two ideograms together, + * - there are three chars or more, + * - we think this is a query and wildcard magic is desired. + * We think is a wildcard query when we have a single character, it starts + * at the start of the buffer, it's CJK, our current offset is one shy of + * nInput and the character at iOffset is '*'. Because the state gets + * clobbered by the incidence of '*' our requirement for CJK is that the + * implied character length is at least 3 given that it takes at least 3 + * bytes to encode to 0x2000. + */ + // It is possible we have no token to emit here if iPrevBigramOffset was not + // 0 on entry and there was no second CJK character. iPrevBigramOffset + // will now be 0 if that is the case (and c->iOffset == iStartOffset). + if (// allow two-character words only if in bigram + (numChars == 2 && state == BIGRAM_USE) || + // otherwise, drop two-letter words (considered stop-words) + (numChars >=3) || + // wildcard case: + (numChars == 1 && iStartOffset == 0 && + (c->iOffset >= 3) && + (c->iOffset == c->nInput - 1) && + (z[c->iOffset] == '*'))) { + /* figure out the number of bytes to copy/stem */ + int n = c->iOffset - iStartOffset; + /* make sure there is enough buffer space */ + if (n * MAX_UTF8_GROWTH_FACTOR > c->nAllocated) { + c->nAllocated = n * MAX_UTF8_GROWTH_FACTOR + 20; + c->zToken = sqlite3_realloc(c->zToken, c->nAllocated); + if (c->zToken == NULL) + return SQLITE_NOMEM; + } + + if (state == BIGRAM_USE) { + /* This is by bigram. So it is unnecessary to convert word */ + copy_stemmer(&z[iStartOffset], n, c->zToken, pnBytes); + } else { + porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes); + } + *pzToken = (const char*)c->zToken; + *piStartOffset = iStartOffset; + *piEndOffset = c->iOffset; + *piPosition = c->iToken++; + return SQLITE_OK; + } + } + return SQLITE_DONE; +} + +/* +** The set of routines that implement the porter-stemmer tokenizer +*/ +static const sqlite3_tokenizer_module porterTokenizerModule = { + 0, + porterCreate, + porterDestroy, + porterOpen, + porterClose, + porterNext, +}; + +/* +** Allocate a new porter tokenizer. Return a pointer to the new +** tokenizer in *ppModule +*/ +void sqlite3Fts3PorterTokenizerModule( + sqlite3_tokenizer_module const**ppModule +){ + *ppModule = &porterTokenizerModule; +} + +#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ diff --git a/mailnews/extensions/fts3/src/fts3_tokenizer.h b/mailnews/extensions/fts3/src/fts3_tokenizer.h new file mode 100644 index 000000000..906303db4 --- /dev/null +++ b/mailnews/extensions/fts3/src/fts3_tokenizer.h @@ -0,0 +1,148 @@ +/* +** 2006 July 10 +** +** The author disclaims copyright to this source code. +** +************************************************************************* +** Defines the interface to tokenizers used by fulltext-search. There +** are three basic components: +** +** sqlite3_tokenizer_module is a singleton defining the tokenizer +** interface functions. This is essentially the class structure for +** tokenizers. +** +** sqlite3_tokenizer is used to define a particular tokenizer, perhaps +** including customization information defined at creation time. +** +** sqlite3_tokenizer_cursor is generated by a tokenizer to generate +** tokens from a particular input. +*/ +#ifndef _FTS3_TOKENIZER_H_ +#define _FTS3_TOKENIZER_H_ + +/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time. +** If tokenizers are to be allowed to call sqlite3_*() functions, then +** we will need a way to register the API consistently. +*/ +#include "sqlite3.h" + +/* +** Structures used by the tokenizer interface. When a new tokenizer +** implementation is registered, the caller provides a pointer to +** an sqlite3_tokenizer_module containing pointers to the callback +** functions that make up an implementation. +** +** When an fts3 table is created, it passes any arguments passed to +** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the +** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer +** implementation. The xCreate() function in turn returns an +** sqlite3_tokenizer structure representing the specific tokenizer to +** be used for the fts3 table (customized by the tokenizer clause arguments). +** +** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen() +** method is called. It returns an sqlite3_tokenizer_cursor object +** that may be used to tokenize a specific input buffer based on +** the tokenization rules supplied by a specific sqlite3_tokenizer +** object. +*/ +typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module; +typedef struct sqlite3_tokenizer sqlite3_tokenizer; +typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor; + +struct sqlite3_tokenizer_module { + + /* + ** Structure version. Should always be set to 0. + */ + int iVersion; + + /* + ** Create a new tokenizer. The values in the argv[] array are the + ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL + ** TABLE statement that created the fts3 table. For example, if + ** the following SQL is executed: + ** + ** CREATE .. USING fts3( ... , tokenizer arg1 arg2) + ** + ** then argc is set to 2, and the argv[] array contains pointers + ** to the strings "arg1" and "arg2". + ** + ** This method should return either SQLITE_OK (0), or an SQLite error + ** code. If SQLITE_OK is returned, then *ppTokenizer should be set + ** to point at the newly created tokenizer structure. The generic + ** sqlite3_tokenizer.pModule variable should not be initialised by + ** this callback. The caller will do so. + */ + int (*xCreate)( + int argc, /* Size of argv array */ + const char *const*argv, /* Tokenizer argument strings */ + sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */ + ); + + /* + ** Destroy an existing tokenizer. The fts3 module calls this method + ** exactly once for each successful call to xCreate(). + */ + int (*xDestroy)(sqlite3_tokenizer *pTokenizer); + + /* + ** Create a tokenizer cursor to tokenize an input buffer. The caller + ** is responsible for ensuring that the input buffer remains valid + ** until the cursor is closed (using the xClose() method). + */ + int (*xOpen)( + sqlite3_tokenizer *pTokenizer, /* Tokenizer object */ + const char *pInput, int nBytes, /* Input buffer */ + sqlite3_tokenizer_cursor **ppCursor /* OUT: Created tokenizer cursor */ + ); + + /* + ** Destroy an existing tokenizer cursor. The fts3 module calls this + ** method exactly once for each successful call to xOpen(). + */ + int (*xClose)(sqlite3_tokenizer_cursor *pCursor); + + /* + ** Retrieve the next token from the tokenizer cursor pCursor. This + ** method should either return SQLITE_OK and set the values of the + ** "OUT" variables identified below, or SQLITE_DONE to indicate that + ** the end of the buffer has been reached, or an SQLite error code. + ** + ** *ppToken should be set to point at a buffer containing the + ** normalized version of the token (i.e. after any case-folding and/or + ** stemming has been performed). *pnBytes should be set to the length + ** of this buffer in bytes. The input text that generated the token is + ** identified by the byte offsets returned in *piStartOffset and + ** *piEndOffset. *piStartOffset should be set to the index of the first + ** byte of the token in the input buffer. *piEndOffset should be set + ** to the index of the first byte just past the end of the token in + ** the input buffer. + ** + ** The buffer *ppToken is set to point at is managed by the tokenizer + ** implementation. It is only required to be valid until the next call + ** to xNext() or xClose(). + */ + /* TODO(shess) current implementation requires pInput to be + ** nul-terminated. This should either be fixed, or pInput/nBytes + ** should be converted to zInput. + */ + int (*xNext)( + sqlite3_tokenizer_cursor *pCursor, /* Tokenizer cursor */ + const char **ppToken, int *pnBytes, /* OUT: Normalized text for token */ + int *piStartOffset, /* OUT: Byte offset of token in input buffer */ + int *piEndOffset, /* OUT: Byte offset of end of token in input buffer */ + int *piPosition /* OUT: Number of tokens returned before this one */ + ); +}; + +struct sqlite3_tokenizer { + const sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */ + /* Tokenizer implementations will typically add additional fields */ +}; + +struct sqlite3_tokenizer_cursor { + sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. */ + /* Tokenizer implementations will typically add additional fields */ +}; + +#endif /* _FTS3_TOKENIZER_H_ */ diff --git a/mailnews/extensions/fts3/src/moz.build b/mailnews/extensions/fts3/src/moz.build new file mode 100644 index 000000000..a2b3c60d5 --- /dev/null +++ b/mailnews/extensions/fts3/src/moz.build @@ -0,0 +1,18 @@ +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +SOURCES += [ + 'fts3_porter.c', + 'Normalize.c', +] + +SOURCES += [ + 'nsFts3Tokenizer.cpp', + 'nsGlodaRankerFunction.cpp', +] + +FINAL_LIBRARY = 'mail' + +CXXFLAGS += CONFIG['SQLITE_CFLAGS'] diff --git a/mailnews/extensions/fts3/src/nsFts3Tokenizer.cpp b/mailnews/extensions/fts3/src/nsFts3Tokenizer.cpp new file mode 100644 index 000000000..12bd70ead --- /dev/null +++ b/mailnews/extensions/fts3/src/nsFts3Tokenizer.cpp @@ -0,0 +1,72 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsFts3Tokenizer.h" + +#include "nsGlodaRankerFunction.h" + +#include "nsIFts3Tokenizer.h" +#include "mozIStorageConnection.h" +#include "mozIStorageStatement.h" +#include "nsStringGlue.h" + +extern "C" void sqlite3Fts3PorterTokenizerModule( + sqlite3_tokenizer_module const**ppModule); + +extern "C" void glodaRankFunc(sqlite3_context *pCtx, + int nVal, + sqlite3_value **apVal); + +NS_IMPL_ISUPPORTS(nsFts3Tokenizer,nsIFts3Tokenizer) + +nsFts3Tokenizer::nsFts3Tokenizer() +{ +} + +nsFts3Tokenizer::~nsFts3Tokenizer() +{ +} + +NS_IMETHODIMP +nsFts3Tokenizer::RegisterTokenizer(mozIStorageConnection *connection) +{ + nsresult rv; + nsCOMPtr selectStatement; + + // -- register the tokenizer + rv = connection->CreateStatement(NS_LITERAL_CSTRING( + "SELECT fts3_tokenizer(?1, ?2)"), + getter_AddRefs(selectStatement)); + NS_ENSURE_SUCCESS(rv, rv); + + const sqlite3_tokenizer_module* module = nullptr; + sqlite3Fts3PorterTokenizerModule(&module); + if (!module) + return NS_ERROR_FAILURE; + + rv = selectStatement->BindUTF8StringParameter( + 0, NS_LITERAL_CSTRING("mozporter")); + NS_ENSURE_SUCCESS(rv, rv); + rv = selectStatement->BindBlobParameter(1, + (uint8_t*)&module, + sizeof(module)); + NS_ENSURE_SUCCESS(rv, rv); + + bool hasMore; + rv = selectStatement->ExecuteStep(&hasMore); + NS_ENSURE_SUCCESS(rv, rv); + + // -- register the ranking function + nsCOMPtr func = new nsGlodaRankerFunction(); + NS_ENSURE_TRUE(func, NS_ERROR_OUT_OF_MEMORY); + rv = connection->CreateFunction( + NS_LITERAL_CSTRING("glodaRank"), + -1, // variable argument support + func + ); + NS_ENSURE_SUCCESS(rv, rv); + + return rv; +} diff --git a/mailnews/extensions/fts3/src/nsFts3Tokenizer.h b/mailnews/extensions/fts3/src/nsFts3Tokenizer.h new file mode 100644 index 000000000..eb1e83bd5 --- /dev/null +++ b/mailnews/extensions/fts3/src/nsFts3Tokenizer.h @@ -0,0 +1,26 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef nsFts3Tokenizer_h__ +#define nsFts3Tokenizer_h__ + +#include "nsCOMPtr.h" +#include "nsIFts3Tokenizer.h" +#include "fts3_tokenizer.h" + +extern const sqlite3_tokenizer_module* getWindowsTokenizer(); + +class nsFts3Tokenizer final : public nsIFts3Tokenizer { +public: + NS_DECL_ISUPPORTS + NS_DECL_NSIFTS3TOKENIZER + + nsFts3Tokenizer(); + +private: + ~nsFts3Tokenizer(); +}; + +#endif diff --git a/mailnews/extensions/fts3/src/nsFts3TokenizerCID.h b/mailnews/extensions/fts3/src/nsFts3TokenizerCID.h new file mode 100644 index 000000000..1f9c101bc --- /dev/null +++ b/mailnews/extensions/fts3/src/nsFts3TokenizerCID.h @@ -0,0 +1,16 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef nsFts3TokenizerCID_h__ +#define nsFts3TokenizerCID_h__ + +#define NS_FTS3TOKENIZER_CONTRACTID \ + "@mozilla.org/messenger/fts3tokenizer;1" +#define NS_FTS3TOKENIZER_CID \ +{ /* a67d724d-0015-4e2e-8cad-b84775330924 */ \ + 0xa67d724d, 0x0015, 0x4e2e, \ + { 0x8c, 0xad, 0xb8, 0x47, 0x75, 0x33, 0x09, 0x24 }} + +#endif /* nsFts3TokenizerCID_h__ */ diff --git a/mailnews/extensions/fts3/src/nsGlodaRankerFunction.cpp b/mailnews/extensions/fts3/src/nsGlodaRankerFunction.cpp new file mode 100644 index 000000000..fb576685d --- /dev/null +++ b/mailnews/extensions/fts3/src/nsGlodaRankerFunction.cpp @@ -0,0 +1,145 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Thunderbird Global Database. + * + * The Initial Developer of the Original Code is the Mozilla Foundation. + * Portions created by the Initial Developer are Copyright (C) 2010 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Andrew Sutherland + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "nsGlodaRankerFunction.h" +#include "mozIStorageValueArray.h" + +#include "sqlite3.h" + +#include "nsCOMPtr.h" +#include "nsVariant.h" +#include "nsComponentManagerUtils.h" + +#ifndef SQLITE_VERSION_NUMBER +#error "We need SQLITE_VERSION_NUMBER defined!" +#endif + +NS_IMPL_ISUPPORTS(nsGlodaRankerFunction, mozIStorageFunction) + +nsGlodaRankerFunction::nsGlodaRankerFunction() +{ +} + +nsGlodaRankerFunction::~nsGlodaRankerFunction() +{ +} + +static uint32_t COLUMN_SATURATION[] = {10, 1, 1, 1, 1}; + +/** + * Our ranking function basically just multiplies the weight of the column + * against the number of (saturating) matches. + * + * The original code is a SQLite example ranking function, although somewhat + * rather modified at this point. All SQLite code is public domain, so we are + * subsuming it to MPL1.1/LGPL2/GPL2. + */ +NS_IMETHODIMP +nsGlodaRankerFunction::OnFunctionCall(mozIStorageValueArray *aArguments, + nsIVariant **_result) +{ + // all argument names are maintained from the original SQLite code. + uint32_t nVal; + nsresult rv = aArguments->GetNumEntries(&nVal); + NS_ENSURE_SUCCESS(rv, rv); + + /* Check that the number of arguments passed to this function is correct. + * If not, return an error. Set aArgsData to point to the array + * of unsigned integer values returned by FTS3 function. Set nPhrase + * to contain the number of reportable phrases in the users full-text + * query, and nCol to the number of columns in the table. + */ + if (nVal < 1) + return NS_ERROR_INVALID_ARG; + + uint32_t lenArgsData; + uint32_t *aArgsData = (uint32_t *)aArguments->AsSharedBlob(0, &lenArgsData); + + uint32_t nPhrase = aArgsData[0]; + uint32_t nCol = aArgsData[1]; + if (nVal != (1 + nCol)) + return NS_ERROR_INVALID_ARG; + + double score = 0.0; + + // SQLite 3.6.22 has a different matchinfo layout than SQLite 3.6.23+ +#if SQLITE_VERSION_NUMBER <= 3006022 + /* Iterate through each phrase in the users query. */ + for (uint32_t iPhrase = 0; iPhrase < nPhrase; iPhrase++) { + // in SQ + for (uint32_t iCol = 0; iCol < nCol; iCol++) { + uint32_t nHitCount = aArgsData[2 + (iPhrase+1)*nCol + iCol]; + double weight = aArguments->AsDouble(iCol+1); + if (nHitCount > 0) { + score += (nHitCount > COLUMN_SATURATION[iCol]) ? + (COLUMN_SATURATION[iCol] * weight) : + (nHitCount * weight); + } + } + } +#else + /* Iterate through each phrase in the users query. */ + for (uint32_t iPhrase = 0; iPhrase < nPhrase; iPhrase++) { + /* Now iterate through each column in the users query. For each column, + ** increment the relevancy score by: + ** + ** ( / ) * + ** + ** aPhraseinfo[] points to the start of the data for phrase iPhrase. So + ** the hit count and global hit counts for each column are found in + ** aPhraseinfo[iCol*3] and aPhraseinfo[iCol*3+1], respectively. + */ + uint32_t *aPhraseinfo = &aArgsData[2 + iPhrase*nCol*3]; + for (uint32_t iCol = 0; iCol < nCol; iCol++) { + uint32_t nHitCount = aPhraseinfo[3 * iCol]; + double weight = aArguments->AsDouble(iCol+1); + if (nHitCount > 0) { + score += (nHitCount > COLUMN_SATURATION[iCol]) ? + (COLUMN_SATURATION[iCol] * weight) : + (nHitCount * weight); + } + } + } +#endif + + nsCOMPtr result = new nsVariant(); + + rv = result->SetAsDouble(score); + NS_ENSURE_SUCCESS(rv, rv); + + NS_ADDREF(*_result = result); + return NS_OK; +} diff --git a/mailnews/extensions/fts3/src/nsGlodaRankerFunction.h b/mailnews/extensions/fts3/src/nsGlodaRankerFunction.h new file mode 100644 index 000000000..5c5c920d0 --- /dev/null +++ b/mailnews/extensions/fts3/src/nsGlodaRankerFunction.h @@ -0,0 +1,25 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef _nsGlodaRankerFunction_h_ +#define _nsGlodaRankerFunction_h_ + +#include "mozIStorageFunction.h" + +/** + * Basically a port of the example FTS3 ranking function to mozStorage's + * view of the universe. This might get fancier at some point. + */ +class nsGlodaRankerFunction final : public mozIStorageFunction +{ +public: + NS_DECL_ISUPPORTS + NS_DECL_MOZISTORAGEFUNCTION + + nsGlodaRankerFunction(); +private: + ~nsGlodaRankerFunction(); +}; + +#endif // _nsGlodaRankerFunction_h_ -- cgit v1.2.3