summaryrefslogtreecommitdiffstats
path: root/intl/lwbrk
diff options
context:
space:
mode:
Diffstat (limited to 'intl/lwbrk')
-rw-r--r--intl/lwbrk/crashtests/416721.html11
-rw-r--r--intl/lwbrk/crashtests/crashtests.list1
-rw-r--r--intl/lwbrk/gtest/TestLineBreak.cpp323
-rw-r--r--intl/lwbrk/gtest/moz.build12
-rw-r--r--intl/lwbrk/jisx4051class.h218
-rw-r--r--intl/lwbrk/jisx4051pairtable.txt286
-rw-r--r--intl/lwbrk/moz.build48
-rw-r--r--intl/lwbrk/nsCarbonBreaker.cpp44
-rw-r--r--intl/lwbrk/nsComplexBreaker.h19
-rw-r--r--intl/lwbrk/nsILineBreaker.h74
-rw-r--r--intl/lwbrk/nsISemanticUnitScanner.idl48
-rw-r--r--intl/lwbrk/nsIWordBreaker.h41
-rw-r--r--intl/lwbrk/nsJISx4051LineBreaker.cpp999
-rw-r--r--intl/lwbrk/nsJISx4051LineBreaker.h37
-rw-r--r--intl/lwbrk/nsLWBrkCIID.h22
-rw-r--r--intl/lwbrk/nsPangoBreaker.cpp60
-rw-r--r--intl/lwbrk/nsRuleBreaker.cpp20
-rw-r--r--intl/lwbrk/nsSampleWordBreaker.cpp150
-rw-r--r--intl/lwbrk/nsSampleWordBreaker.h42
-rw-r--r--intl/lwbrk/nsSemanticUnitScanner.cpp76
-rw-r--r--intl/lwbrk/nsSemanticUnitScanner.h27
-rw-r--r--intl/lwbrk/nsUniscribeBreaker.cpp58
-rw-r--r--intl/lwbrk/rulebrk.c376
-rw-r--r--intl/lwbrk/rulebrk.h26
-rw-r--r--intl/lwbrk/th_char.h54
-rw-r--r--intl/lwbrk/tools/anzx4051.html669
-rw-r--r--intl/lwbrk/tools/anzx4051.pl356
-rw-r--r--intl/lwbrk/tools/jisx4051class.txt159
-rw-r--r--intl/lwbrk/tools/jisx4051simp.txt24
-rw-r--r--intl/lwbrk/tools/spec_table.html127
30 files changed, 4407 insertions, 0 deletions
diff --git a/intl/lwbrk/crashtests/416721.html b/intl/lwbrk/crashtests/416721.html
new file mode 100644
index 000000000..0a6625ba8
--- /dev/null
+++ b/intl/lwbrk/crashtests/416721.html
@@ -0,0 +1,11 @@
+<!DOCTYPE html>
+<html>
+ <head>
+ <title>Testcase for bug 416721</title>
+ <meta http-equiv="content-type" content="text/html; charset=utf-8">
+ </head>
+ <body>
+ <p>กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛</p>
+ </body>
+</html>
+
diff --git a/intl/lwbrk/crashtests/crashtests.list b/intl/lwbrk/crashtests/crashtests.list
new file mode 100644
index 000000000..a7cb7a173
--- /dev/null
+++ b/intl/lwbrk/crashtests/crashtests.list
@@ -0,0 +1 @@
+load 416721.html
diff --git a/intl/lwbrk/gtest/TestLineBreak.cpp b/intl/lwbrk/gtest/TestLineBreak.cpp
new file mode 100644
index 000000000..5824bf70f
--- /dev/null
+++ b/intl/lwbrk/gtest/TestLineBreak.cpp
@@ -0,0 +1,323 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include <stdio.h>
+#include "nsXPCOM.h"
+#include "nsIComponentManager.h"
+#include "nsISupports.h"
+#include "nsServiceManagerUtils.h"
+#include "nsILineBreaker.h"
+#include "nsIWordBreaker.h"
+#include "nsLWBrkCIID.h"
+#include "nsString.h"
+#include "gtest/gtest.h"
+
+NS_DEFINE_CID(kLBrkCID, NS_LBRK_CID);
+NS_DEFINE_CID(kWBrkCID, NS_WBRK_CID);
+
+static char teng1[] =
+// 1 2 3 4 5 6 7
+//01234567890123456789012345678901234567890123456789012345678901234567890123456789
+ "This is a test to test(reasonable) line break. This 0.01123 = 45 x 48.";
+
+static uint32_t lexp1[] = {
+ 4,7,9,14,17,34,39,40,41,42,49,54,62,64,67,69,73
+};
+
+static uint32_t wexp1[] = {
+ 4,5,7,8,9,10,14,15,17,18,22,23,33,34,35,39,43,48,49,50,54,55,56,57,62,63,
+ 64,65,67,68,69,70,72
+};
+
+static char teng2[] =
+// 1 2 3 4 5 6 7
+//01234567890123456789012345678901234567890123456789012345678901234567890123456789
+ "()((reasonab(l)e) line break. .01123=45x48.";
+
+static uint32_t lexp2[] = {
+ 17,22,23,30,44
+};
+
+static uint32_t wexp2[] = {
+ 4,12,13,14,15,16,17,18,22,24,29,30,31,32,37,38,43
+};
+
+static char teng3[] =
+// 1 2 3 4 5 6 7
+//01234567890123456789012345678901234567890123456789012345678901234567890123456789
+ "It's a test to test(ronae ) line break....";
+
+static uint32_t lexp3[] = {
+ 4,6,11,14,25,27,32,42
+};
+
+static uint32_t wexp3[] = {
+ 2,3,4,5,6,7,11,12,14,15,19,20,25,26,27,28,32,33,38
+};
+
+static char ruler1[] =
+" 1 2 3 4 5 6 7 ";
+static char ruler2[] =
+"0123456789012345678901234567890123456789012345678901234567890123456789012";
+
+/**
+ * Compare the break offsets produced by a breaker with the expected offsets.
+ *
+ * @param in      the ASCII test string; only printed when a mismatch is found
+ * @param out     expected break offsets
+ * @param outlen  number of entries in |out|
+ * @param i       number of entries the breaker produced in |res|
+ * @param res     break offsets produced by the breaker under test
+ * @return true when both sequences have the same length and the same
+ *         contents; false otherwise, after dumping a diagnostic to stdout.
+ */
+bool
+Check(const char* in, const uint32_t* out, uint32_t outlen, uint32_t i,
+      uint32_t res[256])
+{
+  bool ok = true;
+
+  // Every count and offset here is a uint32_t, so it is printed with %u
+  // (the original %d mismatched the unsigned argument type).
+  if (i != outlen) {
+    ok = false;
+    printf("WARNING!!! return size wrong, expect %u but got %u \n",
+           outlen, i);
+  }
+
+  // Walk everything the breaker produced; entries past |outlen| are extras.
+  for (uint32_t j = 0; j < i; j++) {
+    if (j < outlen) {
+      if (res[j] != out[j]) {
+        ok = false;
+        printf("[%u] expect %u but got %u\n", j, out[j], res[j]);
+      }
+    } else {
+      ok = false;
+      printf("[%u] additional %u\n", j, res[j]);
+    }
+  }
+
+  if (!ok) {
+    // Print the input under a column ruler so offsets can be read off by eye.
+    printf("string = \n%s\n", in);
+    printf("%s\n", ruler1);
+    printf("%s\n", ruler2);
+
+    printf("Expect = \n");
+    for (uint32_t j = 0; j < outlen; j++) {
+      printf("%u,", out[j]);
+    }
+
+    printf("\nResult = \n");
+    for (uint32_t j = 0; j < i; j++) {
+      printf("%u,", res[j]);
+    }
+    printf("\n");
+  }
+
+  return ok;
+}
+
+/**
+ * Run the line breaker over an ASCII string and compare every break offset
+ * it reports with the expected list.
+ *
+ * @param lb      line breaker under test
+ * @param in      ASCII input text
+ * @param out     expected break offsets
+ * @param outlen  number of entries in |out|
+ * @return the result of Check() on the collected offsets
+ */
+bool
+TestASCIILB(nsILineBreaker *lb,
+            const char* in,
+            const uint32_t* out, uint32_t outlen)
+{
+  NS_ConvertASCIItoUTF16 text(in);
+  uint32_t breaks[256];
+  uint32_t count = 0;
+  int32_t pos = 0;
+
+  // Keep asking for the next break until the breaker runs out of text or the
+  // fixed-size result buffer is full.
+  while (pos != NS_LINEBREAKER_NEED_MORE_TEXT && count < 256) {
+    pos = lb->Next(text.get(), text.Length(), pos);
+    if (pos == NS_LINEBREAKER_NEED_MORE_TEXT) {
+      // The final "need more text" result is recorded as the text length.
+      breaks[count] = text.Length();
+    } else {
+      breaks[count] = pos;
+    }
+    count++;
+  }
+
+  return Check(in, out, outlen, count, breaks);
+}
+
+/**
+ * Run the word breaker over an ASCII string and compare every word boundary
+ * it reports with the expected list.
+ *
+ * @param lb      word breaker under test
+ * @param in      ASCII input text
+ * @param out     expected boundary offsets
+ * @param outlen  number of entries in |out|
+ * @return the result of Check() on the collected offsets
+ */
+bool
+TestASCIIWB(nsIWordBreaker *lb,
+            const char* in,
+            const uint32_t* out, uint32_t outlen)
+{
+  NS_ConvertASCIItoUTF16 text(in);
+  uint32_t breaks[256];
+  uint32_t count = 0;
+
+  // Unlike the line-breaker variant, the terminating "need more text" result
+  // is not stored: only real boundaries end up in |breaks|.
+  int32_t pos = lb->NextWord(text.get(), text.Length(), 0);
+  while (pos != NS_WORDBREAKER_NEED_MORE_TEXT && count < 256) {
+    breaks[count++] = pos;
+    pos = lb->NextWord(text.get(), text.Length(), pos);
+  }
+
+  return Check(in, out, outlen, count, breaks);
+}
+
+// Fetches the line breaker service, releases it, fetches it again, and then
+// checks the break offsets it reports for the three ASCII sample strings.
+TEST(LineBreak, LineBreaker)
+{
+  nsILineBreaker *t = nullptr;
+  nsresult res = CallGetService(kLBrkCID, &t);
+  ASSERT_TRUE(NS_SUCCEEDED(res) && t);
+  NS_IF_RELEASE(t);
+
+  res = CallGetService(kLBrkCID, &t);
+  ASSERT_TRUE(NS_SUCCEEDED(res) && t);
+
+  // Non-fatal EXPECT_* is used from here on: a fatal ASSERT_* would return
+  // from the test body before NS_RELEASE and leak the service reference.
+  EXPECT_TRUE(TestASCIILB(t, teng1, lexp1, sizeof(lexp1) / sizeof(lexp1[0])));
+  EXPECT_TRUE(TestASCIILB(t, teng2, lexp2, sizeof(lexp2) / sizeof(lexp2[0])));
+  EXPECT_TRUE(TestASCIILB(t, teng3, lexp3, sizeof(lexp3) / sizeof(lexp3[0])));
+
+  NS_RELEASE(t);
+}
+
+// Fetches the word breaker service, releases it, fetches it again, and then
+// checks the word boundaries it reports for the three ASCII sample strings.
+TEST(LineBreak, WordBreaker)
+{
+  nsIWordBreaker *t = nullptr;
+  nsresult res = CallGetService(kWBrkCID, &t);
+  ASSERT_TRUE(NS_SUCCEEDED(res) && t);
+  NS_IF_RELEASE(t);
+
+  res = CallGetService(kWBrkCID, &t);
+  ASSERT_TRUE(NS_SUCCEEDED(res) && t);
+
+  // Non-fatal EXPECT_* is used from here on: a fatal ASSERT_* would return
+  // from the test body before NS_RELEASE and leak the service reference.
+  EXPECT_TRUE(TestASCIIWB(t, teng1, wexp1, sizeof(wexp1) / sizeof(wexp1[0])));
+  EXPECT_TRUE(TestASCIIWB(t, teng2, wexp2, sizeof(wexp2) / sizeof(wexp2[0])));
+  EXPECT_TRUE(TestASCIIWB(t, teng3, wexp3, sizeof(wexp3) / sizeof(wexp3[0])));
+
+  NS_RELEASE(t);
+}
+
+// 012345678901234
+static const char wb0[] = "T";
+static const char wb1[] = "h";
+static const char wb2[] = "is is a int";
+static const char wb3[] = "ernationali";
+static const char wb4[] = "zation work.";
+
+static const char* wb[] = { wb0, wb1, wb2, wb3, wb4 };
+
+/**
+ * Walk the wb[] text fragments, appending each word found by NextWord() to a
+ * result string with '^' after every reported boundary, and appending '^'
+ * between two fragments when BreakInBetween() allows a break there. The
+ * accumulated string is compared with a fixed expected value.
+ */
+void
+TestPrintWordWithBreak()
+{
+  uint32_t numOfFragment = sizeof(wb) / sizeof(char*);
+  nsIWordBreaker* wbk = nullptr;
+
+  // Bail out (fatal failure) instead of dereferencing a null service pointer.
+  nsresult rv = CallGetService(kWBrkCID, &wbk);
+  ASSERT_TRUE(NS_SUCCEEDED(rv) && wbk);
+
+  nsAutoString result;
+
+  for (uint32_t i = 0; i < numOfFragment; i++) {
+    NS_ConvertASCIItoUTF16 fragText(wb[i]);
+
+    int32_t cur = 0;
+    cur = wbk->NextWord(fragText.get(), fragText.Length(), cur);
+    uint32_t start = 0;
+    while (cur != NS_WORDBREAKER_NEED_MORE_TEXT) {
+      result.Append(Substring(fragText, start, cur - start));
+      result.Append('^');
+      start = (cur >= 0 ? cur : cur - start);
+      cur = wbk->NextWord(fragText.get(), fragText.Length(), cur);
+    }
+
+    // NOTE(review): this takes the substring starting at (Length() - start),
+    // not at |start|. It looks unintended, but the expected string below
+    // encodes the output of exactly this expression, so it is left as is.
+    result.Append(Substring(fragText, fragText.Length() - start));
+
+    if (i != numOfFragment - 1) {
+      NS_ConvertASCIItoUTF16 nextFragText(wb[i+1]);
+
+      bool canBreak = wbk->BreakInBetween(fragText.get(),
+                                          fragText.Length(),
+                                          nextFragText.get(),
+                                          nextFragText.Length());
+      if (canBreak) {
+        result.Append('^');
+      }
+    }
+  }
+
+  // Non-fatal check so the service reference is always released afterwards.
+  EXPECT_STREQ("is^ ^is^ ^a^ ^ is a intzation^ ^work^ation work.",
+               NS_ConvertUTF16toUTF8(result).get());
+
+  NS_IF_RELEASE(wbk);
+}
+
+/**
+ * Find the word containing |offset| in fragment wb[fragN], extend it across
+ * neighbouring fragments while BreakInBetween() reports no break at the
+ * fragment boundary, and compare the assembled word with |expected|.
+ *
+ * @param fragN     index into wb[] of the fragment to start in
+ * @param offset    character offset inside that fragment
+ * @param expected  the word that should be assembled
+ */
+void
+TestFindWordBreakFromPosition(uint32_t fragN, uint32_t offset,
+                              const char* expected)
+{
+  uint32_t numOfFragment = sizeof(wb) / sizeof(char*);
+  // Guard the wb[fragN] access below.
+  ASSERT_LT(fragN, numOfFragment);
+
+  nsIWordBreaker* wbk = nullptr;
+
+  // Bail out (fatal failure) instead of dereferencing a null service pointer.
+  nsresult rv = CallGetService(kWBrkCID, &wbk);
+  ASSERT_TRUE(NS_SUCCEEDED(rv) && wbk);
+
+  NS_ConvertASCIItoUTF16 fragText(wb[fragN]);
+
+  nsWordRange res = wbk->FindWord(fragText.get(), fragText.Length(), offset);
+
+  bool canBreak;
+  nsAutoString result(Substring(fragText, res.mBegin, res.mEnd-res.mBegin));
+
+  if ((uint32_t)fragText.Length() == res.mEnd) {
+    // if we hit the end of the fragment
+    nsAutoString curFragText = fragText;
+    for (uint32_t p = fragN + 1; p < numOfFragment; p++) {
+      NS_ConvertASCIItoUTF16 nextFragText(wb[p]);
+      canBreak = wbk->BreakInBetween(curFragText.get(),
+                                     curFragText.Length(),
+                                     nextFragText.get(),
+                                     nextFragText.Length());
+      if (canBreak) {
+        break;
+      }
+      nsWordRange r = wbk->FindWord(nextFragText.get(), nextFragText.Length(),
+                                    0);
+
+      result.Append(Substring(nextFragText, r.mBegin, r.mEnd - r.mBegin));
+
+      if ((uint32_t)nextFragText.Length() != r.mEnd) {
+        break;
+      }
+      // Advance: the next boundary to test is after the fragment just used.
+      // (The original assigned in the opposite direction, into a local that
+      // was about to be destroyed, so curFragText never moved.)
+      curFragText.Assign(nextFragText);
+    }
+  }
+
+  if (0 == res.mBegin) {
+    // if we hit the beginning of the fragment
+    nsAutoString curFragText = fragText;
+    for (uint32_t p = fragN; p > 0; p--) {
+      NS_ConvertASCIItoUTF16 prevFragText(wb[p-1]);
+      canBreak = wbk->BreakInBetween(prevFragText.get(),
+                                     prevFragText.Length(),
+                                     curFragText.get(),
+                                     curFragText.Length());
+      if (canBreak) {
+        break;
+      }
+      nsWordRange r = wbk->FindWord(prevFragText.get(), prevFragText.Length(),
+                                    prevFragText.Length());
+
+      result.Insert(Substring(prevFragText, r.mBegin, r.mEnd - r.mBegin), 0);
+
+      if (0 != r.mBegin) {
+        break;
+      }
+      // Step back: the next boundary to test is before the fragment just used.
+      curFragText.Assign(prevFragText);
+    }
+  }
+
+  // Non-fatal check so the service reference is always released afterwards.
+  EXPECT_STREQ(expected, NS_ConvertUTF16toUTF8(result).get())
+    << "FindWordBreakFromPosition(" << fragN << ", " << offset << ")";
+
+  NS_IF_RELEASE(wbk);
+}
+
+// Exercises the word breaker across the wb[] fragments: first the
+// whole-text boundary dump, then a list of (fragment, offset) lookups with
+// the word each of them is expected to assemble.
+TEST(LineBreak, WordBreakUsage)
+{
+  TestPrintWordWithBreak();
+
+  static const struct {
+    uint32_t fragN;
+    uint32_t offset;
+    const char* expected;
+  } kLookups[] = {
+    { 0, 0,  "This" },
+    { 1, 0,  "his" },
+    { 2, 0,  "is" },
+    { 2, 1,  "is" },
+    { 2, 9,  " " },
+    { 2, 10, "internationalization" },
+    { 3, 4,  "ernationalization" },
+    { 3, 8,  "ernationalization" },
+    { 4, 6,  " " },
+    { 4, 7,  "work" },
+  };
+
+  for (const auto& lookup : kLookups) {
+    TestFindWordBreakFromPosition(lookup.fragN, lookup.offset,
+                                  lookup.expected);
+  }
+}
+
diff --git a/intl/lwbrk/gtest/moz.build b/intl/lwbrk/gtest/moz.build
new file mode 100644
index 000000000..64a3919cb
--- /dev/null
+++ b/intl/lwbrk/gtest/moz.build
@@ -0,0 +1,12 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+UNIFIED_SOURCES += [
+ 'TestLineBreak.cpp',
+]
+
+FINAL_LIBRARY = 'xul-gtest'
+
diff --git a/intl/lwbrk/jisx4051class.h b/intl/lwbrk/jisx4051class.h
new file mode 100644
index 000000000..70585ac51
--- /dev/null
+++ b/intl/lwbrk/jisx4051class.h
@@ -0,0 +1,218 @@
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+/*
+ DO NOT EDIT THIS DOCUMENT !!! THIS DOCUMENT IS GENERATED BY
+ mozilla/intl/lwbrk/tools/anzx4051.pl
+ */
+static const uint32_t gLBClass00[32] = {
+0x55555555, // U+0000 - U+0007
+0x55555555, // U+0008 - U+000F
+0x55555555, // U+0010 - U+0017
+0x55555555, // U+0018 - U+001F
+0x7AABAAA5, // U+0020 - U+0027
+0x7A7AAAA9, // U+0028 - U+002F
+0x66666666, // U+0030 - U+0037
+0xAAA9AA66, // U+0038 - U+003F
+0x77777777, // U+0040 - U+0047
+0x77777777, // U+0048 - U+004F
+0x77777777, // U+0050 - U+0057
+0x77AA9777, // U+0058 - U+005F
+0x77777777, // U+0060 - U+0067
+0x77777777, // U+0068 - U+006F
+0x77777777, // U+0070 - U+0077
+0x7AAA9777, // U+0078 - U+007F
+0x77777777, // U+0080 - U+0087
+0x77777777, // U+0088 - U+008F
+0x77777777, // U+0090 - U+0097
+0x77777777, // U+0098 - U+009F
+0xAA9A9AAB, // U+00A0 - U+00A7
+0x77A9777A, // U+00A8 - U+00AF
+0xAAAAAAAA, // U+00B0 - U+00B7
+0xAAAAAAAA, // U+00B8 - U+00BF
+0x77777777, // U+00C0 - U+00C7
+0x77777777, // U+00C8 - U+00CF
+0x77777777, // U+00D0 - U+00D7
+0x77777777, // U+00D8 - U+00DF
+0x77777777, // U+00E0 - U+00E7
+0x77777777, // U+00E8 - U+00EF
+0xA7777777, // U+00F0 - U+00F7
+0x77777777, // U+00F8 - U+00FF
+};
+
+static const uint32_t gLBClass20[32] = {
+0xB5555555, // U+2000 - U+2007
+0x77775555, // U+2008 - U+200F
+0x777277B7, // U+2010 - U+2017
+0x77A777A7, // U+2018 - U+201F
+0xAAAA7777, // U+2020 - U+2027
+0xB7777777, // U+2028 - U+202F
+0x77744444, // U+2030 - U+2037
+0x7A115107, // U+2038 - U+203F
+0x11017777, // U+2040 - U+2047
+0x77777711, // U+2048 - U+204F
+0x77777777, // U+2050 - U+2057
+0x57777777, // U+2058 - U+205F
+0x7777777B, // U+2060 - U+2067
+0x77777777, // U+2068 - U+206F
+0x77777777, // U+2070 - U+2077
+0x77777777, // U+2078 - U+207F
+0x77777777, // U+2080 - U+2087
+0x77777777, // U+2088 - U+208F
+0x77777777, // U+2090 - U+2097
+0x77777777, // U+2098 - U+209F
+0x77777777, // U+20A0 - U+20A7
+0x77777777, // U+20A8 - U+20AF
+0x77777777, // U+20B0 - U+20B7
+0x77777777, // U+20B8 - U+20BF
+0x77777777, // U+20C0 - U+20C7
+0x77777777, // U+20C8 - U+20CF
+0x77777777, // U+20D0 - U+20D7
+0x77777777, // U+20D8 - U+20DF
+0x77777777, // U+20E0 - U+20E7
+0x77777777, // U+20E8 - U+20EF
+0x77777777, // U+20F0 - U+20F7
+0x77777777, // U+20F8 - U+20FF
+};
+
+static const uint32_t gLBClass21[32] = {
+0x77777777, // U+2100 - U+2107
+0x77777777, // U+2108 - U+210F
+0x73777777, // U+2110 - U+2117
+0x77777777, // U+2118 - U+211F
+0x77777777, // U+2120 - U+2127
+0x77777777, // U+2128 - U+212F
+0x77777777, // U+2130 - U+2137
+0x77777777, // U+2138 - U+213F
+0x77777777, // U+2140 - U+2147
+0x77777777, // U+2148 - U+214F
+0x77777777, // U+2150 - U+2157
+0x77777777, // U+2158 - U+215F
+0x55555555, // U+2160 - U+2167
+0x55555555, // U+2168 - U+216F
+0x55555555, // U+2170 - U+2177
+0x55555555, // U+2178 - U+217F
+0x77777777, // U+2180 - U+2187
+0x77777777, // U+2188 - U+218F
+0x77777777, // U+2190 - U+2197
+0x77777777, // U+2198 - U+219F
+0x77777777, // U+21A0 - U+21A7
+0x77777777, // U+21A8 - U+21AF
+0x77777777, // U+21B0 - U+21B7
+0x77777777, // U+21B8 - U+21BF
+0x77777777, // U+21C0 - U+21C7
+0x77777777, // U+21C8 - U+21CF
+0x77777777, // U+21D0 - U+21D7
+0x77777777, // U+21D8 - U+21DF
+0x77777777, // U+21E0 - U+21E7
+0x77777777, // U+21E8 - U+21EF
+0x77777777, // U+21F0 - U+21F7
+0x77777777, // U+21F8 - U+21FF
+};
+
+static const uint32_t gLBClass30[32] = {
+0x55155115, // U+3000 - U+3007
+0x10101010, // U+3008 - U+300F
+0x10105510, // U+3010 - U+3017
+0x11011010, // U+3018 - U+301F
+0x55555555, // U+3020 - U+3027
+0x55555555, // U+3028 - U+302F
+0x55555555, // U+3030 - U+3037
+0x55555555, // U+3038 - U+303F
+0x15151515, // U+3040 - U+3047
+0x55555515, // U+3048 - U+304F
+0x55555555, // U+3050 - U+3057
+0x55555555, // U+3058 - U+305F
+0x55551555, // U+3060 - U+3067
+0x55555555, // U+3068 - U+306F
+0x55555555, // U+3070 - U+3077
+0x55555555, // U+3078 - U+307F
+0x15151555, // U+3080 - U+3087
+0x51555555, // U+3088 - U+308F
+0x55555555, // U+3090 - U+3097
+0x51111115, // U+3098 - U+309F
+0x15151515, // U+30A0 - U+30A7
+0x55555515, // U+30A8 - U+30AF
+0x55555555, // U+30B0 - U+30B7
+0x55555555, // U+30B8 - U+30BF
+0x55551555, // U+30C0 - U+30C7
+0x55555555, // U+30C8 - U+30CF
+0x55555555, // U+30D0 - U+30D7
+0x55555555, // U+30D8 - U+30DF
+0x15151555, // U+30E0 - U+30E7
+0x51555555, // U+30E8 - U+30EF
+0x51155555, // U+30F0 - U+30F7
+0x51111555, // U+30F8 - U+30FF
+};
+
+static const uint32_t gLBClass0E[32] = {
+0x88888888, // U+0E00 - U+0E07
+0x88888888, // U+0E08 - U+0E0F
+0x88888888, // U+0E10 - U+0E17
+0x88888888, // U+0E18 - U+0E1F
+0x88888888, // U+0E20 - U+0E27
+0x18888888, // U+0E28 - U+0E2F
+0x88888888, // U+0E30 - U+0E37
+0x08888888, // U+0E38 - U+0E3F
+0x81888888, // U+0E40 - U+0E47
+0x78888888, // U+0E48 - U+0E4F
+0x66666666, // U+0E50 - U+0E57
+0x88881166, // U+0E58 - U+0E5F
+0x88888888, // U+0E60 - U+0E67
+0x88888888, // U+0E68 - U+0E6F
+0x88888888, // U+0E70 - U+0E77
+0x88888888, // U+0E78 - U+0E7F
+0x88888888, // U+0E80 - U+0E87
+0x88888888, // U+0E88 - U+0E8F
+0x88888888, // U+0E90 - U+0E97
+0x88888888, // U+0E98 - U+0E9F
+0x88888888, // U+0EA0 - U+0EA7
+0x18888888, // U+0EA8 - U+0EAF
+0x88888888, // U+0EB0 - U+0EB7
+0x88888888, // U+0EB8 - U+0EBF
+0x81888888, // U+0EC0 - U+0EC7
+0x88888888, // U+0EC8 - U+0ECF
+0x66666666, // U+0ED0 - U+0ED7
+0x88888866, // U+0ED8 - U+0EDF
+0x88888888, // U+0EE0 - U+0EE7
+0x88888888, // U+0EE8 - U+0EEF
+0x88888888, // U+0EF0 - U+0EF7
+0x88888888, // U+0EF8 - U+0EFF
+};
+
+static const uint32_t gLBClass17[32] = {
+0x77777777, // U+1700 - U+1707
+0x77777777, // U+1708 - U+170F
+0x77777777, // U+1710 - U+1717
+0x77777777, // U+1718 - U+171F
+0x77777777, // U+1720 - U+1727
+0x77777777, // U+1728 - U+172F
+0x70077777, // U+1730 - U+1737
+0x77777777, // U+1738 - U+173F
+0x77777777, // U+1740 - U+1747
+0x77777777, // U+1748 - U+174F
+0x77777777, // U+1750 - U+1757
+0x77777777, // U+1758 - U+175F
+0x77777777, // U+1760 - U+1767
+0x77777777, // U+1768 - U+176F
+0x77777777, // U+1770 - U+1777
+0x77777777, // U+1778 - U+177F
+0x88888888, // U+1780 - U+1787
+0x88888888, // U+1788 - U+178F
+0x88888888, // U+1790 - U+1797
+0x88888888, // U+1798 - U+179F
+0x88888888, // U+17A0 - U+17A7
+0x88888888, // U+17A8 - U+17AF
+0x88888888, // U+17B0 - U+17B7
+0x88888888, // U+17B8 - U+17BF
+0x88888888, // U+17C0 - U+17C7
+0x88888888, // U+17C8 - U+17CF
+0x88118888, // U+17D0 - U+17D7
+0x77888181, // U+17D8 - U+17DF
+0x88888888, // U+17E0 - U+17E7
+0x77777788, // U+17E8 - U+17EF
+0x88888888, // U+17F0 - U+17F7
+0x77777788, // U+17F8 - U+17FF
+};
+
diff --git a/intl/lwbrk/jisx4051pairtable.txt b/intl/lwbrk/jisx4051pairtable.txt
new file mode 100644
index 000000000..2bae1b18f
--- /dev/null
+++ b/intl/lwbrk/jisx4051pairtable.txt
@@ -0,0 +1,286 @@
+
+
+
+/*
+
+ Simplification of Pair Table in JIS X 4051
+
+ 1. The Original Table - in 4.1.3
+
+ In JIS X 4051, the pair table is defined as below
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20
+ * # * #
+ 1 X X X X X X X X X X X X X X X X X X X X X E
+ 2 X X X X X X
+ 3 X X X X X X
+ 4 X X X X X X
+ 5 X X X X X X
+ 6 X X X X X X
+ 7 X X X X X X X
+ 8 X X X X X X E
+ 9 X X X X X X
+ 10 X X X X X X
+ 11 X X X X X X
+ 12 X X X X X X
+ 13 X X X X X X X
+ 14 X X X X X X X
+ 15 X X X X X X X X X
+ 16 X X X X X X X X
+ 17 X X X X X E
+ 18 X X X X X X X X X
+ 19 X E E E E E X X X X X X X X X X X X E X E E
+ 20 X X X X X E
+
+ * Same Char
+ # Other Char
+
+ 2. Simplified by removing the classes which we do not care about
+
+ However, since we do not care about class 13(Subscript), 14(Ruby),
+ 19(split line note begin quote), and 20(split line note end quote)
+ we can simplify this pair table into the following
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 2 3 4 5 6 7 8 9 10 11 12 15 16 17 18
+
+ 1 X X X X X X X X X X X X X X X X
+ 2 X X X X X
+ 3 X X X X X
+ 4 X X X X X
+ 5 X X X X X
+ 6 X X X X X
+ 7 X X X X X X
+ 8 X X X X X X
+ 9 X X X X X
+ 10 X X X X X
+ 11 X X X X X
+ 12 X X X X X
+ 15 X X X X X X X X
+ 16 X X X X X X X
+ 17 X X X X X
+ 18 X X X X X X X X
+
+ 3. Simplified by merging classes
+
+ After the second simplification, the pair table has some duplication
+ a. classes 2, 3, 4, 5, 6 are the same - we can merge them
+ b. classes 10, 11, 12, 17 are the same - we can merge them
+
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 [a] 7 8 9 [b]15 16 18
+
+ 1 X X X X X X X X X
+ [a] X
+ 7 X X
+ 8 X X
+ 9 X
+ [b] X
+ 15 X X X X
+ 16 X X X
+ 18 X X X X
+
+
+ 4. Now we use one bit to encode whether it is breakable, and use 2 bytes
+ for one row, then the bit table will look like:
+
+ 18 <- 1
+
+ 1 0000 0001 1111 1111 = 0x01FF
+ [a] 0000 0000 0000 0010 = 0x0002
+ 7 0000 0000 0000 0110 = 0x0006
+ 8 0000 0000 0100 0010 = 0x0042
+ 9 0000 0000 0000 0010 = 0x0002
+ [b] 0000 0000 0000 0010 = 0x0042
+ 15 0000 0001 0101 0010 = 0x0152
+ 16 0000 0001 1000 0010 = 0x0182
+ 18 0000 0001 1100 0010 = 0x01C2
+
+*/
+
+static uint16_t gJISx4051SimplifiedPair[9] = {
+ 0x01FF, 0x0002, 0x0006, 0x0042, 0x0002, 0x0042, 0x0152, 0x0182, 0x01C2
+};
+
+// Looks up the simplified pair table: returns whether the bit for the
+// trailing class aCls2 is set in the row of the leading class aCls1.
+// Both classes must be valid row/bit indices (0..8) of
+// gJISx4051SimplifiedPair.
+PRBool XXXX::ClassesToPair(nsJISx4051Cls aCls1, nsJISx4051Cls aCls2)
+{
+  NS_ASSERTION( (aCls1 < 9), "invalid class");
+  NS_ASSERTION( (aCls2 < 9), "invalid class");
+  return ( 0 != (gJISx4051SimplifiedPair[aCls1] & (1L << aCls2) ));
+}
+
+
+#define X4051_IS_DIGIT(u) ((0x0030 <= (u)) && ((u) <= 0x0039)) /* true for U+0030..U+0039 ('0'..'9'); the comparisons were inverted and never matched */
+
+// Returns the simplified JIS X 4051 class of aChar.
+// aBefore / aAfter are the neighbouring characters (0 when absent); they are
+// only consulted for ',' (0x2C) and '.' (0x2E), which are classified as
+// class 15 when they sit between two digits. Characters found in neither
+// lookup table also fall back to class 15.
+nsJISx4051Cls XXXX::GetClass(
+  PRUnichar aChar, PRUnichar aBefore = 0, PRUnichar aAfter = 0)
+{
+  // take care the special case in cls 15
+  if( ((0x2C == aChar) || (0x2E == aChar)) &&
+      X4051_IS_DIGIT(aBefore) && X4051_IS_DIGIT(aAfter))
+  {
+    return kJISx4051Cls_15;
+  }
+
+  nsJISx4051Cls cls;
+  if(gSingle->Lookup(aChar, &cls))
+    return cls;
+
+  if(gRange->Lookup(aChar, &cls))
+    return cls;
+
+  return kJISx4051Cls_15;
+}
+
+
+typedef enum {
+ kJISx4051Cls_1 = 0,
+ kJISx4051Cls_2 = 1,
+ kJISx4051Cls_3 = 1,
+ kJISx4051Cls_4 = 1,
+ kJISx4051Cls_5 = 1,
+ kJISx4051Cls_6 = 1,
+ kJISx4051Cls_7 = 2,
+ kJISx4051Cls_8 = 3,
+ kJISx4051Cls_9 = 4,
+ kJISx4051Cls_10 = 5,
+ kJISx4051Cls_11 = 5,
+ kJISx4051Cls_12 = 5,
+ // kJISx4051Cls_13 = 0,
+ // kJISx4051Cls_14 = 0,
+ kJISx4051Cls_15 = 6,
+ kJISx4051Cls_16 = 7,
+ kJISx4051Cls_17 = 5,
+ kJISx4051Cls_18 = 8,
+ // kJISx4051Cls_19 = 0,
+ // kJISx4051Cls_20 = 0
+} nsJISx4051Cls;
+
+
+ // Table 2
+ YYYY(kJISx4051Cls_1 , 0x0028),
+ YYYY(kJISx4051Cls_1 , 0x005B),
+ YYYY(kJISx4051Cls_1 , 0x007B),
+ YYYY(kJISx4051Cls_1 , 0x2018),
+ YYYY(kJISx4051Cls_1 , 0x201B),
+ YYYY(kJISx4051Cls_1 , 0x201C),
+ YYYY(kJISx4051Cls_1 , 0x201F),
+ YYYY(kJISx4051Cls_1 , 0x3008),
+ YYYY(kJISx4051Cls_1 , 0x300A),
+ YYYY(kJISx4051Cls_1 , 0x300C),
+ YYYY(kJISx4051Cls_1 , 0x300E),
+ YYYY(kJISx4051Cls_1 , 0x3010),
+ YYYY(kJISx4051Cls_1 , 0x3014),
+ YYYY(kJISx4051Cls_1 , 0x3016),
+ YYYY(kJISx4051Cls_1 , 0x3018),
+ YYYY(kJISx4051Cls_1 , 0x301A),
+ YYYY(kJISx4051Cls_1 , 0x301D),
+
+ // Table 3
+ YYYY(kJISx4051Cls_2 , 0x0029),
+ YYYY(kJISx4051Cls_2 , 0x002C),
+ YYYY(kJISx4051Cls_2 , 0x005D),
+ YYYY(kJISx4051Cls_2 , 0x007D),
+ YYYY(kJISx4051Cls_2 , 0x2019),
+ YYYY(kJISx4051Cls_2 , 0x201A),
+ YYYY(kJISx4051Cls_2 , 0x201D),
+ YYYY(kJISx4051Cls_2 , 0x201E),
+ YYYY(kJISx4051Cls_2 , 0x3001),
+ YYYY(kJISx4051Cls_2 , 0x3009),
+ YYYY(kJISx4051Cls_2 , 0x300B),
+ YYYY(kJISx4051Cls_2 , 0x300D),
+ YYYY(kJISx4051Cls_2 , 0x300F),
+ YYYY(kJISx4051Cls_2 , 0x3011),
+ YYYY(kJISx4051Cls_2 , 0x3015),
+ YYYY(kJISx4051Cls_2 , 0x3017),
+ YYYY(kJISx4051Cls_2 , 0x3019),
+ YYYY(kJISx4051Cls_2 , 0x301B),
+ YYYY(kJISx4051Cls_2 , 0x301E),
+ YYYY(kJISx4051Cls_2 , 0x301F),
+
+ // Table 4
+ YYYY(kJISx4051Cls_3 , 0x203C),
+ YYYY(kJISx4051Cls_3 , 0x2044),
+ YYYY(kJISx4051Cls_3 , 0x301C),
+ YYYY(kJISx4051Cls_3 , 0x3041),
+ YYYY(kJISx4051Cls_3 , 0x3043),
+ YYYY(kJISx4051Cls_3 , 0x3045),
+ YYYY(kJISx4051Cls_3 , 0x3047),
+ YYYY(kJISx4051Cls_3 , 0x3049),
+ YYYY(kJISx4051Cls_3 , 0x3063),
+ YYYY(kJISx4051Cls_3 , 0x3083),
+ YYYY(kJISx4051Cls_3 , 0x3085),
+ YYYY(kJISx4051Cls_3 , 0x3087),
+ YYYY(kJISx4051Cls_3 , 0x308E),
+ YYYY(kJISx4051Cls_3 , 0x309D),
+ YYYY(kJISx4051Cls_3 , 0x309E),
+ YYYY(kJISx4051Cls_3 , 0x30A1),
+ YYYY(kJISx4051Cls_3 , 0x30A3),
+ YYYY(kJISx4051Cls_3 , 0x30A5),
+ YYYY(kJISx4051Cls_3 , 0x30A7),
+ YYYY(kJISx4051Cls_3 , 0x30A9),
+ YYYY(kJISx4051Cls_3 , 0x30C3),
+ YYYY(kJISx4051Cls_3 , 0x30E3),
+ YYYY(kJISx4051Cls_3 , 0x30E5),
+ YYYY(kJISx4051Cls_3 , 0x30E7),
+ YYYY(kJISx4051Cls_3 , 0x30EE),
+ YYYY(kJISx4051Cls_3 , 0x30F5),
+ YYYY(kJISx4051Cls_3 , 0x30F6),
+ YYYY(kJISx4051Cls_3 , 0x30FC),
+ YYYY(kJISx4051Cls_3 , 0x30FD),
+ YYYY(kJISx4051Cls_3 , 0x30FE),
+
+ // Table 5
+ YYYY(kJISx4051Cls_4 , 0x0021),
+ YYYY(kJISx4051Cls_4 , 0x003F),
+
+ // Table 6
+ YYYY(kJISx4051Cls_5 , 0x003A),
+ YYYY(kJISx4051Cls_5 , 0x003B),
+ YYYY(kJISx4051Cls_5 , 0x30FB),
+
+ // Table 7
+ YYYY(kJISx4051Cls_6 , 0x002E),
+ YYYY(kJISx4051Cls_6 , 0x3002),
+
+ // Table 8
+ YYYY(kJISx4051Cls_7 , 0x2014),
+ YYYY(kJISx4051Cls_7 , 0x2024),
+ YYYY(kJISx4051Cls_7 , 0x2025),
+ YYYY(kJISx4051Cls_7 , 0x2026),
+
+ // Table 9
+ YYYY(kJISx4051Cls_8 , 0x0024),
+ YYYY(kJISx4051Cls_8 , 0x00A3),
+ YYYY(kJISx4051Cls_8 , 0x00A5),
+ YYYY(kJISx4051Cls_8 , 0x2116),
+
+ // Table 10
+ YYYY(kJISx4051Cls_9 , 0x0025),
+ YYYY(kJISx4051Cls_9 , 0x00A2),
+ YYYY(kJISx4051Cls_9 , 0x00B0),
+ YYYY(kJISx4051Cls_9 , 0x2030),
+ YYYY(kJISx4051Cls_9 , 0x2031),
+ YYYY(kJISx4051Cls_9 , 0x2032),
+ YYYY(kJISx4051Cls_9 , 0x2033),
+
+ // Table 1
+ YYYY(kJISx4051Cls_10, 0x3000),
+
+ // Table 1
+ ZZZZ(kJISx4051Cls_11, 0x3000),
+
+
+
+
diff --git a/intl/lwbrk/moz.build b/intl/lwbrk/moz.build
new file mode 100644
index 000000000..63ffbff8e
--- /dev/null
+++ b/intl/lwbrk/moz.build
@@ -0,0 +1,48 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+TEST_DIRS += ['gtest']
+
+XPIDL_SOURCES += [
+ 'nsISemanticUnitScanner.idl',
+]
+
+XPIDL_MODULE = 'lwbrk'
+
+EXPORTS += [
+ 'nsILineBreaker.h',
+ 'nsIWordBreaker.h',
+ 'nsLWBrkCIID.h',
+]
+
+UNIFIED_SOURCES += [
+ 'nsJISx4051LineBreaker.cpp',
+ 'nsSampleWordBreaker.cpp',
+ 'nsSemanticUnitScanner.cpp',
+]
+
+if 'gtk' in CONFIG['MOZ_WIDGET_TOOLKIT']:
+ SOURCES += [
+ 'nsPangoBreaker.cpp',
+ ]
+ CXXFLAGS += CONFIG['MOZ_PANGO_CFLAGS']
+elif CONFIG['MOZ_WIDGET_TOOLKIT'] == 'windows':
+ SOURCES += [
+ 'nsUniscribeBreaker.cpp',
+ ]
+elif CONFIG['MOZ_WIDGET_TOOLKIT'] == 'cocoa':
+ UNIFIED_SOURCES += [
+ 'nsCarbonBreaker.cpp',
+ ]
+else:
+ SOURCES += [
+ 'nsRuleBreaker.cpp',
+ ]
+ SOURCES += [
+ 'rulebrk.c',
+ ]
+
+FINAL_LIBRARY = 'xul'
diff --git a/intl/lwbrk/nsCarbonBreaker.cpp b/intl/lwbrk/nsCarbonBreaker.cpp
new file mode 100644
index 000000000..1b37bc129
--- /dev/null
+++ b/intl/lwbrk/nsCarbonBreaker.cpp
@@ -0,0 +1,44 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include <CoreFoundation/CoreFoundation.h>
+#include <stdint.h>
+#include "nsDebug.h"
+#include "nscore.h"
+
+/**
+ * Fill aBreakBefore[0..aLength) with line-break opportunities for aText as
+ * reported by CoreFoundation's string tokenizer in line-break mode.
+ * Every entry is first cleared; entries at the start offset of each token
+ * (except offset 0) are then set to true. If the CFString or the tokenizer
+ * cannot be created, the array is left all zero.
+ */
+void
+NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,
+                        uint8_t* aBreakBefore)
+{
+  NS_ASSERTION(aText, "aText shouldn't be null");
+
+  memset(aBreakBefore, 0, aLength * sizeof(uint8_t));
+
+  // Wrap the caller's buffer without copying it.
+  CFStringRef text =
+    ::CFStringCreateWithCharactersNoCopy(kCFAllocatorDefault,
+                                         reinterpret_cast<const UniChar*>(aText),
+                                         aLength, kCFAllocatorNull);
+  if (!text) {
+    return;
+  }
+
+  CFStringTokenizerRef tokenizer =
+    ::CFStringTokenizerCreate(kCFAllocatorDefault, text,
+                              ::CFRangeMake(0, aLength),
+                              kCFStringTokenizerUnitLineBreak,
+                              nullptr);
+  if (tokenizer) {
+    for (CFStringTokenizerTokenType token =
+           ::CFStringTokenizerAdvanceToNextToken(tokenizer);
+         token != kCFStringTokenizerTokenNone;
+         token = ::CFStringTokenizerAdvanceToNextToken(tokenizer)) {
+      CFRange tokenRange = ::CFStringTokenizerGetCurrentTokenRange(tokenizer);
+      // Ignore leading edge
+      if (tokenRange.location != 0) {
+        aBreakBefore[tokenRange.location] = true;
+      }
+    }
+    ::CFRelease(tokenizer);
+  }
+
+  ::CFRelease(text);
+}
diff --git a/intl/lwbrk/nsComplexBreaker.h b/intl/lwbrk/nsComplexBreaker.h
new file mode 100644
index 000000000..d4ebb3581
--- /dev/null
+++ b/intl/lwbrk/nsComplexBreaker.h
@@ -0,0 +1,19 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef nsComplexBreaker_h__
+#define nsComplexBreaker_h__
+
+#include "nsString.h"
+
+/**
+ * Find line break opportunities in aText[] of aLength characters,
+ * filling boolean values indicating line break opportunities for
+ * corresponding characters in aBreakBefore[] on return.
+ */
+void
+NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,
+ uint8_t* aBreakBefore);
+
+#endif /* nsComplexBreaker_h__ */
diff --git a/intl/lwbrk/nsILineBreaker.h b/intl/lwbrk/nsILineBreaker.h
new file mode 100644
index 000000000..19adbac10
--- /dev/null
+++ b/intl/lwbrk/nsILineBreaker.h
@@ -0,0 +1,74 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef nsILineBreaker_h__
+#define nsILineBreaker_h__
+
+#include "nsISupports.h"
+
+#include "nscore.h"
+
+#define NS_LINEBREAKER_NEED_MORE_TEXT -1
+
+// {0x4b0b9e04-6ffb-4647-aa5f-2fa2ebd883e8}
+#define NS_ILINEBREAKER_IID \
+{0x4b0b9e04, 0x6ffb, 0x4647, \
+ {0xaa, 0x5f, 0x2f, 0xa2, 0xeb, 0xd8, 0x83, 0xe8}}
+
+class nsILineBreaker : public nsISupports // abstract line-breaking interface: every method below is pure virtual
+{
+public:
+ NS_DECLARE_STATIC_IID_ACCESSOR(NS_ILINEBREAKER_IID)
+
+ enum { // NOTE(review): presumably the values accepted as aWordBreak by GetJISx4051Breaks() - confirm
+ kWordBreak_Normal = 0, // default
+ kWordBreak_BreakAll = 1, // break all
+ kWordBreak_KeepAll = 2 // always keep
+ };
+
+ virtual int32_t Next( const char16_t* aText, uint32_t aLen,
+ uint32_t aPos) = 0; // the gtest caller feeds each result back as aPos and stops on NS_LINEBREAKER_NEED_MORE_TEXT
+
+ virtual int32_t Prev( const char16_t* aText, uint32_t aLen,
+ uint32_t aPos) = 0; // NOTE(review): presumably the backward counterpart of Next() - confirm against the implementation
+
+ // Call this on a word with whitespace at either end. We will apply JISx4051
+ // rules to find breaks inside the word. aBreakBefore is set to the break-
+ // before status of each character; aBreakBefore[0] will always be false
+ // because we never return a break before the first character.
+ // aLength is the length of the aText array and also the length of the aBreakBefore
+ // output array.
+ virtual void GetJISx4051Breaks(const char16_t* aText, uint32_t aLength,
+ uint8_t aWordBreak,
+ uint8_t* aBreakBefore) = 0; // 16-bit (char16_t) text overload
+ virtual void GetJISx4051Breaks(const uint8_t* aText, uint32_t aLength,
+ uint8_t aWordBreak,
+ uint8_t* aBreakBefore) = 0; // 8-bit (uint8_t) text overload
+};
+
+NS_DEFINE_STATIC_IID_ACCESSOR(nsILineBreaker, NS_ILINEBREAKER_IID)
+
+static inline bool
+NS_IsSpace(char16_t u) // true when u is one of the space characters listed below
+{
+ return u == 0x0020 || // SPACE
+ u == 0x0009 || // CHARACTER TABULATION
+ u == 0x000D || // CARRIAGE RETURN
+ u == 0x1680 || // OGHAM SPACE MARK
+ (0x2000 <= u && u <= 0x2006) || // EN QUAD, EM QUAD, EN SPACE,
+ // EM SPACE, THREE-PER-EM SPACE,
+ // FOUR-PER-EM SPACE, SIX-PER-EM SPACE,
+ (0x2008 <= u && u <= 0x200B) || // PUNCTUATION SPACE, THIN SPACE,
+ // HAIR SPACE, ZERO WIDTH SPACE
+ u == 0x205F; // MEDIUM MATHEMATICAL SPACE
+}
+
+// Reports whether aChar lies in one of the two code-point ranges singled out
+// here: U+0E01..U+0FFF (Thai, Lao, Tibetan) or U+1780..U+17FF (Khmer).
+static inline bool
+NS_NeedsPlatformNativeHandling(char16_t aChar)
+{
+  const bool isThaiLaoTibetan = aChar >= 0x0e01 && aChar <= 0x0fff;
+  const bool isKhmer = aChar >= 0x1780 && aChar <= 0x17ff;
+  return isThaiLaoTibetan || isKhmer;
+}
+
+#endif /* nsILineBreaker_h__ */
diff --git a/intl/lwbrk/nsISemanticUnitScanner.idl b/intl/lwbrk/nsISemanticUnitScanner.idl
new file mode 100644
index 000000000..e6e99fc07
--- /dev/null
+++ b/intl/lwbrk/nsISemanticUnitScanner.idl
@@ -0,0 +1,48 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsISupports.idl"
+
+%{C++
+// {ADF42751-1CEF-4ad2-AA8E-BCB849D8D31F}
+#define NS_SEMANTICUNITSCANNER_CID { 0xadf42751, 0x1cef, 0x4ad2, { 0xaa, 0x8e, 0xbc, 0xb8, 0x49, 0xd8, 0xd3, 0x1f}}
+#define NS_SEMANTICUNITSCANNER_CONTRACTID "@mozilla.org/intl/semanticunitscanner;1"
+%}
+
+/**
+ * Provides a language independent way to break UNICODE
+ * text into meaningful semantic units (e.g. words).
+ */
+[scriptable, uuid(9f620be4-e535-11d6-b254-00039310a47a)]
+interface nsISemanticUnitScanner : nsISupports {
+ /**
+ * start()
+ *
+ * Starts up the semantic unit scanner with an optional
+ * character set, which acts as a hint to optimize the heuristics
+ * used to determine the language(s) of the processed text.
+ *
+ * @param characterSet the character set the text was originally
+ * encoded in (can be NULL)
+ */
+ void start(in string characterSet);
+
+ /**
+ * next()
+ * Get the begin / end offset of the next unit in the current text
+ *
+ * @param text the text to be scanned
+ * @param length the number of characters in the text to be processed
+ * @param pos the current position
+ * @param isLastBuffer true if the buffer is the last one
+ * @param begin the begin offset of the next unit
+ * @param end the end offset of the next unit
+ * @return has more unit in the current text
+ */
+ boolean next(in wstring text, in long length, in long pos,
+ in boolean isLastBuffer,
+ out long begin, out long end );
+
+};
diff --git a/intl/lwbrk/nsIWordBreaker.h b/intl/lwbrk/nsIWordBreaker.h
new file mode 100644
index 000000000..3867fba06
--- /dev/null
+++ b/intl/lwbrk/nsIWordBreaker.h
@@ -0,0 +1,41 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef nsIWordBreaker_h__
+#define nsIWordBreaker_h__
+
+#include "nsISupports.h"
+
+#include "nscore.h"
+
+#define NS_WORDBREAKER_NEED_MORE_TEXT -1
+
+// {E86B3379-BF89-11d2-B3AF-00805F8A6670}
+#define NS_IWORDBREAKER_IID \
+{ 0xe86b3379, 0xbf89, 0x11d2, \
+ { 0xb3, 0xaf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }
+
+// Pair of unsigned offsets returned by nsIWordBreaker::FindWord().
+// NOTE(review): presumably the begin/end offsets of the found word within the
+// text passed to FindWord(); whether mEnd is inclusive cannot be told from
+// this header -- confirm against the implementation.
+typedef struct {
+ uint32_t mBegin;
+ uint32_t mEnd;
+} nsWordRange;
+
+// Abstract word-breaking interface.  Every method is pure virtual; this
+// header only fixes the signatures.
+class nsIWordBreaker : public nsISupports
+{
+public:
+ NS_DECLARE_STATIC_IID_ACCESSOR(NS_IWORDBREAKER_IID)
+
+ // Takes two text buffers with their lengths and returns a bool.
+ // NOTE(review): presumably "is there a word break between the end of aText1
+ // and the start of aText2" -- confirm against the implementation.
+ virtual bool BreakInBetween(const char16_t* aText1 , uint32_t aTextLen1,
+ const char16_t* aText2 ,
+ uint32_t aTextLen2) = 0;
+ // Returns an nsWordRange for the given text and offset.
+ // NOTE(review): presumably the word containing aOffset -- confirm.
+ virtual nsWordRange FindWord(const char16_t* aText1 , uint32_t aTextLen1,
+ uint32_t aOffset) = 0;
+ // Returns a signed position for the given text and start position.
+ // NOTE(review): presumably NS_WORDBREAKER_NEED_MORE_TEXT (-1, defined above)
+ // is the "no further word in this buffer" result -- confirm.
+ virtual int32_t NextWord(const char16_t* aText, uint32_t aLen,
+ uint32_t aPos) = 0;
+
+};
+
+NS_DEFINE_STATIC_IID_ACCESSOR(nsIWordBreaker, NS_IWORDBREAKER_IID)
+
+#endif /* nsIWordBreaker_h__ */
diff --git a/intl/lwbrk/nsJISx4051LineBreaker.cpp b/intl/lwbrk/nsJISx4051LineBreaker.cpp
new file mode 100644
index 000000000..1b262fa2c
--- /dev/null
+++ b/intl/lwbrk/nsJISx4051LineBreaker.cpp
@@ -0,0 +1,999 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+
+
+#include "nsJISx4051LineBreaker.h"
+
+#include "jisx4051class.h"
+#include "nsComplexBreaker.h"
+#include "nsTArray.h"
+#include "nsUnicodeProperties.h"
+
+/*
+
+ Simplification of Pair Table in JIS X 4051
+
+ 1. The Original Table - in 4.1.3
+
+ In JIS X 4051, the pair table is defined as below
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20
+ * # * #
+ 1 X X X X X X X X X X X X X X X X X X X X X E
+ 2 X X X X X X
+ 3 X X X X X X
+ 4 X X X X X X
+ 5 X X X X X X
+ 6 X X X X X X
+ 7 X X X X X X X
+ 8 X X X X X X E
+ 9 X X X X X X
+ 10 X X X X X X
+ 11 X X X X X X
+ 12 X X X X X X
+ 13 X X X X X X X
+ 14 X X X X X X X
+ 15 X X X X X X X X X
+ 16 X X X X X X X X
+ 17 X X X X X E
+ 18 X X X X X X X X X
+ 19 X E E E E E X X X X X X X X X X X X E X E E
+ 20 X X X X X E
+
+ * Same Char
+ # Other Char
+
+ X Cannot Break
+
+ The classes mean:
+ 1: Open parenthesis
+ 2: Close parenthesis
+ 3: Prohibit a line break before
+ 4: Punctuation for sentence end (except Full stop, e.g., "!" and "?")
+ 5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT)
+ 6: Full stop
+ 7: Non-breakable between same characters
+ 8: Prefix (e.g., "$", "NO.")
+ 9: Postfix (e.g., "%")
+ 10: Ideographic space
+ 11: Hiragana
+ 12: Japanese characters (except class 11)
+ 13: Subscript
+ 14: Ruby
+ 15: Numeric
+ 16: Alphabet
+ 17: Space for Western language
+ 18: Western characters (except class 17)
+ 19: Split line note (Warichu) begin quote
+ 20: Split line note (Warichu) end quote
+
+ 2. Simplified by removing the classes which we do not care about
+
+ However, since we do not care about class 13 (Subscript), 14 (Ruby),
+ 16 (Alphabet), 19 (split line note begin quote), and 20 (split line note end
+ quote), we can simplify this pair table into the following
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 2 3 4 5 6 7 8 9 10 11 12 15 17 18
+
+ 1 X X X X X X X X X X X X X X X
+ 2 X X X X X
+ 3 X X X X X
+ 4 X X X X X
+ 5 X X X X X
+ 6 X X X X X
+ 7 X X X X X X
+ 8 X X X X X X
+ 9 X X X X X
+ 10 X X X X X
+ 11 X X X X X
+ 12 X X X X X
+ 15 X X X X X X X X
+ 17 X X X X X
+ 18 X X X X X X X
+
+ 3. Simplified by merging classes
+
+ After simplification 2, the pair table has some duplicated rows:
+ a. classes 2, 3, 4, 5, 6 are the same - we can merge them
+ b. classes 10, 11, 12, 17 are the same - we can merge them
+
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 [a] 7 8 9 [b]15 18
+
+ 1 X X X X X X X X
+ [a] X
+ 7 X X
+ 8 X X
+ 9 X
+ [b] X
+ 15 X X X X
+ 18 X X X
+
+
+ 4. We add COMPLEX characters and make them breakable with all other classes
+ except after class 1 and before class [a]
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 [a] 7 8 9 [b]15 18 COMPLEX
+
+ 1 X X X X X X X X X
+ [a] X
+ 7 X X
+ 8 X X
+ 9 X
+ [b] X
+ 15 X X X X
+ 18 X X X
+ COMPLEX X T
+
+ T : need special handling
+
+
+ 5. However, we need two special classes for some punctuation/parentheses
+ whose breaking rules are like those of character class (18); see bug 389056.
+ We also need them for punctuation-like characters that behave the same as
+ class 18 but are not letters of any language (e.g., '_').
+ [c]. Based on open parenthesis class (1), but it is not breakable after
+ character class (18) or numeric class (15).
+ [d]. Based on close parenthesis (or punctuation) class (2), but it is not
+ breakable before character class (18) or numeric class (15).
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d]
+
+ 1 X X X X X X X X X X X
+ [a] X X X
+ 7 X X
+ 8 X X
+ 9 X
+ [b] X X
+ 15 X X X X X X
+ 18 X X X X X
+ COMPLEX X T
+ [c] X X X X X X X X X X X
+ [d] X X X X
+
+
+ 6. And Unicode has "NON-BREAK" characters. Lines should not be broken around
+ them. JIS X 4051 has no such class; therefore, we create [e].
+
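+   Class of
+   Leading                 Class of Trailing Char Class
+   Char
+   (X: cannot break, T: need special handling, .: can break)
+
+              1 [a]   7   8   9 [b]  15  18 COMPLEX [c] [d] [e]
+
+   1          X   X   X   X   X   X   X   X       X   X   X   X
+   [a]        .   X   .   .   .   .   .   .       .   .   X   X
+   7          .   X   X   .   .   .   .   .       .   .   .   X
+   8          .   X   .   .   .   .   X   .       .   .   .   X
+   9          .   X   .   .   .   .   .   .       .   .   .   X
+   [b]        .   X   .   .   .   .   .   .       .   .   X   X
+   15         .   X   .   .   X   .   X   X       .   X   X   X
+   18         .   X   .   .   .   .   X   X       .   X   X   X
+   COMPLEX    .   X   .   .   .   .   .   .       T   .   .   X
+   [c]        X   X   X   X   X   X   X   X       X   X   X   X
+   [d]        .   X   .   .   .   .   X   X       .   .   X   X
+   [e]        X   X   X   X   X   X   X   X       X   X   X   X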
+
+
+ 7. Now we use one bit to encode whether it is breakable (bit set = cannot
+ break), and use 2 bytes for one row, then the bit table will look like:
+
+ 18 <- 1
+
+ 1 0000 1111 1111 1111 = 0x0FFF
+ [a] 0000 1100 0000 0010 = 0x0C02
+ 7 0000 1000 0000 0110 = 0x0806
+ 8 0000 1000 0100 0010 = 0x0842
+ 9 0000 1000 0000 0010 = 0x0802
+ [b] 0000 1100 0000 0010 = 0x0C02
+ 15 0000 1110 1101 0010 = 0x0ED2
+ 18 0000 1110 1100 0010 = 0x0EC2
+ COMPLEX 0000 1001 0000 0010 = 0x0902
+ [c] 0000 1111 1111 1111 = 0x0FFF
+ [d] 0000 1100 1100 0010 = 0x0CC2
+ [e] 0000 1111 1111 1111 = 0x0FFF
+*/
+
+// Number of character classes, i.e. number of rows in the pair tables below.
+#define MAX_CLASSES 12
+
+// Normal pair table: one 16-bit row per leading-character class (row index =
+// class number).  Bit n of a row corresponds to trailing-character class n;
+// GetPair() reports "break allowed" exactly when that bit is 0.
+static const uint16_t gPair[MAX_CLASSES] = {
+ 0x0FFF,
+ 0x0C02,
+ 0x0806,
+ 0x0842,
+ 0x0802,
+ 0x0C02,
+ 0x0ED2,
+ 0x0EC2,
+ 0x0902,
+ 0x0FFF,
+ 0x0CC2,
+ 0x0FFF
+};
+
+
+/*
+
+ 8. And if the character is not far enough from the word start, the word end
+ and the previous break point, we should not break in non-CJK languages.
+ I.e., don't break around 15, 18, [c] and [d], but don't change
+ that if they are related to [b].
+
+   Class of
+   Leading                 Class of Trailing Char Class
+   Char
+   (X: cannot break, T: need special handling, .: can break)
+
+              1 [a]   7   8   9 [b]  15  18 COMPLEX [c] [d] [e]
+
+   1          X   X   X   X   X   X   X   X       X   X   X   X
+   [a]        .   X   .   .   .   .   X   X       .   X   X   X
+   7          .   X   X   .   .   .   X   X       .   X   X   X
+   8          .   X   .   .   .   .   X   X       .   X   X   X
+   9          .   X   .   .   .   .   X   X       .   X   X   X
+   [b]        .   X   .   .   .   .   .   .       .   .   X   X
+   15         X   X   X   X   X   .   X   X       X   X   X   X
+   18         X   X   X   X   X   .   X   X       X   X   X   X
+   COMPLEX    .   X   .   .   .   .   X   X       T   X   X   X
+   [c]        X   X   X   X   X   X   X   X       X   X   X   X
+   [d]        X   X   X   X   X   .   X   X       X   X   X   X
+   [e]        X   X   X   X   X   X   X   X       X   X   X   X
+
+ 18 <- 1
+
+ 1 0000 1111 1111 1111 = 0x0FFF
+ [a] 0000 1110 1100 0010 = 0x0EC2
+ 7 0000 1110 1100 0110 = 0x0EC6
+ 8 0000 1110 1100 0010 = 0x0EC2
+ 9 0000 1110 1100 0010 = 0x0EC2
+ [b] 0000 1100 0000 0010 = 0x0C02
+ 15 0000 1111 1101 1111 = 0x0FDF
+ 18 0000 1111 1101 1111 = 0x0FDF
+ COMPLEX 0000 1111 1100 0010 = 0x0FC2
+ [c] 0000 1111 1111 1111 = 0x0FFF
+ [d] 0000 1111 1101 1111 = 0x0FDF
+ [e] 0000 1111 1111 1111 = 0x0FFF
+*/
+
+// Conservative pair table, same layout as gPair: one 16-bit row per
+// leading-character class, bit n = trailing-character class n, and
+// GetPairConservative() reports "break allowed" exactly when the bit is 0.
+static const uint16_t gPairConservative[MAX_CLASSES] = {
+ 0x0FFF,
+ 0x0EC2,
+ 0x0EC6,
+ 0x0EC2,
+ 0x0EC2,
+ 0x0C02,
+ 0x0FDF,
+ 0x0FDF,
+ 0x0FC2,
+ 0x0FFF,
+ 0x0FDF,
+ 0x0FFF
+};
+
+
+/*
+
+ 9. Now we map the class to number
+
+ 0: 1
+ 1: [a]- 2, 3, 4, 5, 6
+ 2: 7
+ 3: 8
+ 4: 9
+ 5: [b]- 10, 11, 12, 17
+ 6: 15
+ 7: 18
+ 8: COMPLEX
+ 9: [c]
+ A: [d]
+ B: [e]
+
+ and they mean:
+ 0: Open parenthesis
+ 1: Punctuation that prohibits break before
+ 2: Non-breakable between same classes
+ 3: Prefix
+ 4: Postfix
+ 5: Breakable character (Spaces and Most Japanese characters)
+ 6: Numeric
+ 7: Characters
+ 8: Need special handling characters (E.g., Thai)
+ 9: Open parentheses like Character (See bug 389056)
+ A: Close parenthese (or punctuations) like Character (See bug 389056)
+ B: Non breakable (See bug 390920)
+
+*/
+
+#define CLASS_NONE INT8_MAX
+
+#define CLASS_OPEN 0x00
+#define CLASS_CLOSE 0x01
+#define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02
+#define CLASS_PREFIX 0x03
+#define CLASS_POSTFFIX 0x04
+#define CLASS_BREAKABLE 0x05
+#define CLASS_NUMERIC 0x06
+#define CLASS_CHARACTER 0x07
+#define CLASS_COMPLEX 0x08
+#define CLASS_OPEN_LIKE_CHARACTER 0x09
+#define CLASS_CLOSE_LIKE_CHARACTER 0x0A
+#define CLASS_NON_BREAKABLE 0x0B
+
+#define U_NULL char16_t(0x0000)
+#define U_SLASH char16_t('/')
+#define U_SPACE char16_t(' ')
+#define U_HYPHEN char16_t('-')
+#define U_EQUAL char16_t('=')
+#define U_PERCENT char16_t('%')
+#define U_AMPERSAND char16_t('&')
+#define U_SEMICOLON char16_t(';')
+#define U_BACKSLASH char16_t('\\')
+#define U_OPEN_SINGLE_QUOTE char16_t(0x2018)
+#define U_OPEN_DOUBLE_QUOTE char16_t(0x201C)
+#define U_OPEN_GUILLEMET char16_t(0x00AB)
+
+// True for the characters whose class is decided by ContextualAnalysis()
+// rather than by a plain GetClass() lookup: hyphens, '/', '%', '&', ';', '\\'
+// and the three opening quotation marks.
+// Note: this is a macro and evaluates (c) several times, so the argument
+// must not have side effects.
+#define NEED_CONTEXTUAL_ANALYSIS(c) (IS_HYPHEN(c) || \
+ (c) == U_SLASH || \
+ (c) == U_PERCENT || \
+ (c) == U_AMPERSAND || \
+ (c) == U_SEMICOLON || \
+ (c) == U_BACKSLASH || \
+ (c) == U_OPEN_SINGLE_QUOTE || \
+ (c) == U_OPEN_DOUBLE_QUOTE || \
+ (c) == U_OPEN_GUILLEMET)
+
+// True for U+0030..U+0039 ('0'..'9').  Evaluates (u) twice.
+#define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039)
+
+// Looks up entry |l| in a packed class table: each 32-bit word of |t| holds
+// eight 4-bit entries, lowest nibble first.
+static inline int
+GETCLASSFROMTABLE(const uint32_t* t, uint16_t l)
+{
+  const uint32_t packedWord = t[l >> 3];          // which word: l / 8
+  const unsigned nibbleShift = (l & 0x0007) << 2; // which nibble: (l % 8) * 4
+  return (packedWord >> nibbleShift) & 0x000f;
+}
+
+// Non-zero when |u| is in U+FF66..U+FF70, the halfwidth characters that
+// GetClass() treats as JIS X 4051 class 3.
+static inline int
+IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u)
+{
+  // Unsigned wrap-around folds the two-sided range test into one compare.
+  return uint32_t(u) - 0xff66u <= 0xff70u - 0xff66u;
+}
+
+// Non-zero when |u| falls in one of the four ranges this file treats as CJK:
+// U+1100..U+11FF, U+2E80..U+D7FF, U+F900..U+FAFF or U+FF00..U+FFEF.
+static inline int
+IS_CJK_CHAR(char16_t u)
+{
+  // The ranges are disjoint and ascending, so walk them from low to high.
+  if (u < 0x1100) {
+    return 0;
+  }
+  if (u <= 0x11ff) {
+    return 1;
+  }
+  if (u < 0x2e80) {
+    return 0;
+  }
+  if (u <= 0xd7ff) {
+    return 1;
+  }
+  if (u < 0xf900) {
+    return 0;
+  }
+  if (u <= 0xfaff) {
+    return 1;
+  }
+  return 0xff00 <= u && u <= 0xffef;
+}
+
+// True for the two non-breaking spaces recognised here.
+static inline bool
+IS_NONBREAKABLE_SPACE(char16_t u)
+{
+  switch (u) {
+    case 0x00A0: // NO-BREAK SPACE
+    case 0x2007: // FIGURE SPACE
+      return true;
+    default:
+      return false;
+  }
+}
+
+// True when |u| is one of the hyphen/dash characters that get contextual
+// treatment: HYPHEN-MINUS, ARMENIAN HYPHEN, HYPHEN, FIGURE DASH or EN DASH.
+//
+// The parameter is a full 32-bit code point on purpose: this predicate is
+// applied (through NEED_CONTEXTUAL_ANALYSIS) to values that may have been
+// combined from a surrogate pair.  With a 16-bit parameter such a value would
+// be silently truncated, so e.g. U+1002D or U+12010 would be mistaken for
+// U+002D / U+2010.  16-bit callers are unaffected.
+static inline bool
+IS_HYPHEN(uint32_t u)
+{
+  return (u == U_HYPHEN ||
+          u == 0x058A || // ARMENIAN HYPHEN
+          u == 0x2010 || // HYPHEN
+          u == 0x2012 || // FIGURE DASH
+          u == 0x2013);  // EN DASH
+}
+
+// Maps a code point to one of the CLASS_* line-breaking classes.
+// BMP code points are first matched against the hand-written rules and the
+// packed gLBClass* tables below; anything that falls through (which includes
+// every code point >= 0x10000) is classified from its Unicode LineBreak
+// class through sUnicodeLineBreakToClass.
+static int8_t
+GetClass(uint32_t u)
+{
+ if (u < 0x10000) {
+ // h selects the 256-code-point page, l is the offset inside that page.
+ uint16_t h = u & 0xFF00;
+ uint16_t l = u & 0x00ff;
+
+ // Pages that have a dedicated lookup table are handled first.
+ if (0x0000 == h) {
+ return GETCLASSFROMTABLE(gLBClass00, l);
+ }
+ if (0x1700 == h) {
+ return GETCLASSFROMTABLE(gLBClass17, l);
+ }
+ // U+1780..U+17FF was already answered by the 0x1700 table above, so in
+ // effect this catches U+0E01..U+0FFF.
+ if (NS_NeedsPlatformNativeHandling(u)) {
+ return CLASS_COMPLEX;
+ }
+ // Only U+0E00 itself can get here; U+0E01 and up returned CLASS_COMPLEX.
+ if (0x0E00 == h) {
+ return GETCLASSFROMTABLE(gLBClass0E, l);
+ }
+ if (0x2000 == h) {
+ return GETCLASSFROMTABLE(gLBClass20, l);
+ }
+ if (0x2100 == h) {
+ return GETCLASSFROMTABLE(gLBClass21, l);
+ }
+ if (0x3000 == h) {
+ return GETCLASSFROMTABLE(gLBClass30, l);
+ }
+ if (0xff00 == h) {
+ // U+FF00..U+FF5F reuse the entry at offset l + 0x20 of the page-00 table.
+ if (l < 0x0060) { // Fullwidth ASCII variant
+ return GETCLASSFROMTABLE(gLBClass00, (l+0x20));
+ }
+ if (l < 0x00a0) { // Halfwidth Katakana variants
+ // A few halfwidth punctuation marks take the class of a specific
+ // U+30xx character; the rest are class 3 or class 11.
+ switch (l) {
+ case 0x61: return GetClass(0x3002);
+ case 0x62: return GetClass(0x300c);
+ case 0x63: return GetClass(0x300d);
+ case 0x64: return GetClass(0x3001);
+ case 0x65: return GetClass(0x30fb);
+ case 0x9e: return GetClass(0x309b);
+ case 0x9f: return GetClass(0x309c);
+ default:
+ if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u)) {
+ return CLASS_CLOSE; // jis x4051 class 3
+ }
+ return CLASS_BREAKABLE; // jis x4051 class 11
+ }
+ }
+ if (l < 0x00e0) {
+ return CLASS_CHARACTER; // Halfwidth Hangul variants
+ }
+ if (l < 0x00f0) {
+ // U+FFE0..U+FFEF: classify as the code point stored at the same index
+ // in this table (two entries are 0x0000).
+ static char16_t NarrowFFEx[16] = {
+ 0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000,
+ 0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000
+ };
+ return GetClass(NarrowFFEx[l - 0x00e0]);
+ }
+ // U+FFF0..U+FFFF fall through to the Unicode LineBreak mapping below.
+ } else if (0x3100 == h) {
+ if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun
+ // XXX: This is per UAX #14, but UAX #14 may change
+ // the line breaking rules about Kanbun and Bopomofo.
+ return CLASS_BREAKABLE;
+ }
+ if (l >= 0xf0) { // Katakana small letters for Ainu
+ return CLASS_CLOSE;
+ }
+ } else if (0x0300 == h) {
+ if (0x4F == l || (0x5C <= l && l <= 0x62)) {
+ return CLASS_NON_BREAKABLE;
+ }
+ } else if (0x0500 == h) {
+ // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14)
+ if (l == 0x8A) {
+ return GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN));
+ }
+ } else if (0x0F00 == h) {
+ // NOTE(review): unreachable as written -- every U+0Fxx code point has
+ // already returned CLASS_COMPLEX from the NS_NeedsPlatformNativeHandling
+ // check above.
+ if (0x08 == l || 0x0C == l || 0x12 == l) {
+ return CLASS_NON_BREAKABLE;
+ }
+ } else if (0x1800 == h) {
+ if (0x0E == l) {
+ return CLASS_NON_BREAKABLE;
+ }
+ } else if (0x1600 == h) {
+ if (0x80 == l) { // U+1680 OGHAM SPACE MARK
+ return CLASS_BREAKABLE;
+ }
+ } else if (u == 0xfeff) {
+ return CLASS_NON_BREAKABLE;
+ }
+ }
+
+ // Mapping for Unicode LineBreak.txt classes to the (simplified) set of
+ // character classes used here.
+ // XXX The mappings here were derived by comparing the Unicode LineBreak
+ // values of BMP characters to the classes our existing GetClass returns
+ // for the same codepoints; in cases where characters with the same
+ // LineBreak class mapped to various classes here, I picked what seemed
+ // the most prevalent equivalence.
+ // Some of these are unclear to me, but currently they are ONLY used
+ // for characters not handled by the old code above, so all the JISx4051
+ // special cases should already be accounted for.
+ static const int8_t sUnicodeLineBreakToClass[] = {
+ /* UNKNOWN = 0, [XX] */ CLASS_CHARACTER,
+ /* AMBIGUOUS = 1, [AI] */ CLASS_CHARACTER,
+ /* ALPHABETIC = 2, [AL] */ CLASS_CHARACTER,
+ /* BREAK_BOTH = 3, [B2] */ CLASS_CHARACTER,
+ /* BREAK_AFTER = 4, [BA] */ CLASS_CHARACTER,
+ /* BREAK_BEFORE = 5, [BB] */ CLASS_OPEN_LIKE_CHARACTER,
+ /* MANDATORY_BREAK = 6, [BK] */ CLASS_CHARACTER,
+ /* CONTINGENT_BREAK = 7, [CB] */ CLASS_CHARACTER,
+ /* CLOSE_PUNCTUATION = 8, [CL] */ CLASS_CHARACTER,
+ /* COMBINING_MARK = 9, [CM] */ CLASS_CHARACTER,
+ /* CARRIAGE_RETURN = 10, [CR] */ CLASS_BREAKABLE,
+ /* EXCLAMATION = 11, [EX] */ CLASS_CHARACTER,
+ /* GLUE = 12, [GL] */ CLASS_NON_BREAKABLE,
+ /* HYPHEN = 13, [HY] */ CLASS_CHARACTER,
+ /* IDEOGRAPHIC = 14, [ID] */ CLASS_BREAKABLE,
+ /* INSEPARABLE = 15, [IN] */ CLASS_CLOSE_LIKE_CHARACTER,
+ /* INFIX_NUMERIC = 16, [IS] */ CLASS_CHARACTER,
+ /* LINE_FEED = 17, [LF] */ CLASS_BREAKABLE,
+ /* NONSTARTER = 18, [NS] */ CLASS_CLOSE_LIKE_CHARACTER,
+ /* NUMERIC = 19, [NU] */ CLASS_CHARACTER,
+ /* OPEN_PUNCTUATION = 20, [OP] */ CLASS_CHARACTER,
+ /* POSTFIX_NUMERIC = 21, [PO] */ CLASS_CHARACTER,
+ /* PREFIX_NUMERIC = 22, [PR] */ CLASS_CHARACTER,
+ /* QUOTATION = 23, [QU] */ CLASS_CHARACTER,
+ /* COMPLEX_CONTEXT = 24, [SA] */ CLASS_CHARACTER,
+ /* SURROGATE = 25, [SG] */ CLASS_CHARACTER,
+ /* SPACE = 26, [SP] */ CLASS_BREAKABLE,
+ /* BREAK_SYMBOLS = 27, [SY] */ CLASS_CHARACTER,
+ /* ZWSPACE = 28, [ZW] */ CLASS_BREAKABLE,
+ /* NEXT_LINE = 29, [NL] */ CLASS_CHARACTER,
+ /* WORD_JOINER = 30, [WJ] */ CLASS_NON_BREAKABLE,
+ /* H2 = 31, [H2] */ CLASS_BREAKABLE,
+ /* H3 = 32, [H3] */ CLASS_BREAKABLE,
+ /* JL = 33, [JL] */ CLASS_CHARACTER,
+ /* JT = 34, [JT] */ CLASS_CHARACTER,
+ /* JV = 35, [JV] */ CLASS_CHARACTER,
+ /* CLOSE_PARENTHESIS = 36, [CP] */ CLASS_CLOSE_LIKE_CHARACTER,
+ /* CONDITIONAL_JAPANESE_STARTER = 37, [CJ] */ CLASS_CLOSE,
+ /* HEBREW_LETTER = 38, [HL] */ CLASS_CHARACTER,
+ /* REGIONAL_INDICATOR = 39, [RI] */ CLASS_CHARACTER,
+ /* E_BASE = 40, [EB] */ CLASS_BREAKABLE,
+ /* E_MODIFIER = 41, [EM] */ CLASS_CHARACTER,
+ /* ZWJ = 42, [ZWJ]*/ CLASS_CHARACTER
+ };
+
+ // With ICU available, check at compile time that the table has exactly one
+ // entry per ICU line-break class.
+#if ENABLE_INTL_API
+ static_assert(U_LB_COUNT == mozilla::ArrayLength(sUnicodeLineBreakToClass),
+ "Gecko vs ICU LineBreak class mismatch");
+#endif
+
+ auto cls = mozilla::unicode::GetLineBreakClass(u);
+ MOZ_ASSERT(cls < mozilla::ArrayLength(sUnicodeLineBreakToClass));
+ return sUnicodeLineBreakToClass[cls];
+}
+
+// Returns true when a break is allowed between a leading character of class
+// |c1| and a trailing character of class |c2| under the normal pair table,
+// i.e. when bit |c2| of row |c1| is clear.
+static bool
+GetPair(int8_t c1, int8_t c2)
+{
+  NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");
+  NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");
+
+  const uint16_t row = gPair[c1];
+  const bool breakProhibited = ((row >> c2) & 0x0001) != 0;
+  return !breakProhibited;
+}
+
+// Same lookup as GetPair(), but against the conservative pair table: returns
+// true when bit |c2| of row |c1| in gPairConservative is clear.
+static bool
+GetPairConservative(int8_t c1, int8_t c2)
+{
+  NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");
+  NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");
+
+  const uint16_t row = gPairConservative[c1];
+  const bool breakProhibited = ((row >> c2) & 0x0001) != 0;
+  return !breakProhibited;
+}
+
+// The breaker keeps no per-instance state, so construction and destruction
+// have nothing to do.
+nsJISx4051LineBreaker::nsJISx4051LineBreaker()
+{
+}
+
+nsJISx4051LineBreaker::~nsJISx4051LineBreaker()
+{
+}
+
+// nsISupports implementation exposing the nsILineBreaker interface.
+NS_IMPL_ISUPPORTS(nsJISx4051LineBreaker, nsILineBreaker)
+
+// Bookkeeping shared between GetJISx4051Breaks() and ContextualAnalysis()
+// while one piece of text is scanned: the current index, the index of the
+// last break found, a few "have we seen X" flags, and two whole-text facts
+// computed up front by Init() (contains a CJK char / contains a non-breaking
+// space).  Wraps either 16-bit or 8-bit text; exactly one of mUniText and
+// mText is non-null.
+class ContextState {
+public:
+ // 16-bit text.
+ ContextState(const char16_t* aText, uint32_t aLength) {
+ mUniText = aText;
+ mText = nullptr;
+ mLength = aLength;
+ Init();
+ }
+
+ // 8-bit text.
+ ContextState(const uint8_t* aText, uint32_t aLength) {
+ mUniText = nullptr;
+ mText = aText;
+ mLength = aLength;
+ Init();
+ }
+
+ uint32_t Length() { return mLength; }
+ uint32_t Index() { return mIndex; }
+
+ // Returns the code unit at aIndex, widening 8-bit text to char16_t.
+ // The range is only asserted, not enforced.
+ char16_t GetCharAt(uint32_t aIndex) {
+ NS_ASSERTION(aIndex < mLength, "Out of range!");
+ return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]);
+ }
+
+ void AdvanceIndex() {
+ ++mIndex;
+ }
+
+ // Records the current index as the most recent break position.
+ void NotifyBreakBefore() { mLastBreakIndex = mIndex; }
+
+// A word of western language should not be broken. But even if the word has
+// only ASCII characters, non-natural context words should be broken, e.g.,
+// URL and file path. For protecting the natural words, we should use
+// conservative breaking rules at following conditions:
+// 1. at near the start of word
+// 2. at near the end of word
+// 3. at near the latest broken point
+// CONSERVATIVE_BREAK_RANGE define the 'near' in characters.
+#define CONSERVATIVE_BREAK_RANGE 6
+
+ // Decides whether the conservative pair table should be used at position
+ // mIndex + aOffset.  Always false for text containing a CJK character.
+ // Otherwise true when that position is within CONSERVATIVE_BREAK_RANGE of
+ // the text start, the text end or the last break, or (if the text has a
+ // non-breaking space at all) when one lies within that range on either side.
+ bool UseConservativeBreaking(uint32_t aOffset = 0) {
+ if (mHasCJKChar)
+ return false;
+ uint32_t index = mIndex + aOffset;
+ bool result = (index < CONSERVATIVE_BREAK_RANGE ||
+ mLength - index < CONSERVATIVE_BREAK_RANGE ||
+ index - mLastBreakIndex < CONSERVATIVE_BREAK_RANGE);
+ if (result || !mHasNonbreakableSpace)
+ return result;
+
+ // This text has no-breakable space, we need to check whether the index
+ // is near it.
+
+ // The unsigned arithmetic in both loops relies on |result| being false
+ // here, i.e. on index being at least CONSERVATIVE_BREAK_RANGE away from
+ // both ends of the text.
+
+ // Note that index is always larger than CONSERVATIVE_BREAK_RANGE here.
+ for (uint32_t i = index; index - CONSERVATIVE_BREAK_RANGE < i; --i) {
+ if (IS_NONBREAKABLE_SPACE(GetCharAt(i - 1)))
+ return true;
+ }
+ // Note that index is always less than mLength - CONSERVATIVE_BREAK_RANGE.
+ for (uint32_t i = index + 1; i < index + CONSERVATIVE_BREAK_RANGE; ++i) {
+ if (IS_NONBREAKABLE_SPACE(GetCharAt(i)))
+ return true;
+ }
+ return false;
+ }
+
+ // The "seen" flags below are sticky: once set they stay set for the
+ // lifetime of this object.
+ bool HasPreviousEqualsSign() const {
+ return mHasPreviousEqualsSign;
+ }
+ void NotifySeenEqualsSign() {
+ mHasPreviousEqualsSign = true;
+ }
+
+ bool HasPreviousSlash() const {
+ return mHasPreviousSlash;
+ }
+ void NotifySeenSlash() {
+ mHasPreviousSlash = true;
+ }
+
+ bool HasPreviousBackslash() const {
+ return mHasPreviousBackslash;
+ }
+ void NotifySeenBackslash() {
+ mHasPreviousBackslash = true;
+ }
+
+ // Last character reported through NotifyNonHyphenCharacter(); U_NULL if
+ // none has been reported yet.
+ uint32_t GetPreviousNonHyphenCharacter() const {
+ return mPreviousNonHyphenCharacter;
+ }
+ void NotifyNonHyphenCharacter(uint32_t ch) {
+ mPreviousNonHyphenCharacter = ch;
+ }
+
+private:
+ // Resets the scan state and pre-scans the whole text once to set
+ // mHasNonbreakableSpace and mHasCJKChar.  The CJK test is only made for
+ // 16-bit text, so 8-bit text never sets mHasCJKChar.
+ void Init() {
+ mIndex = 0;
+ mLastBreakIndex = 0;
+ mPreviousNonHyphenCharacter = U_NULL;
+ mHasCJKChar = 0;
+ mHasNonbreakableSpace = 0;
+ mHasPreviousEqualsSign = false;
+ mHasPreviousSlash = false;
+ mHasPreviousBackslash = false;
+
+ for (uint32_t i = 0; i < mLength; ++i) {
+ char16_t u = GetCharAt(i);
+ if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u))
+ mHasNonbreakableSpace = 1;
+ else if (mUniText && !mHasCJKChar && IS_CJK_CHAR(u))
+ mHasCJKChar = 1;
+ }
+ }
+
+ const char16_t* mUniText;
+ const uint8_t* mText;
+
+ uint32_t mIndex;
+ uint32_t mLength; // length of text
+ uint32_t mLastBreakIndex;
+ uint32_t mPreviousNonHyphenCharacter; // The last character we have seen
+ // which is not U_HYPHEN
+ bool mHasCJKChar; // if the text has CJK character, this is true.
+ bool mHasNonbreakableSpace; // if the text has no-breakable space,
+ // this is true.
+ bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL
+ bool mHasPreviousSlash; // True if we have seen a U_SLASH
+ bool mHasPreviousBackslash; // True if we have seen a U_BACKSLASH
+};
+
+// Chooses the class of |cur| for the characters selected by
+// NEED_CONTEXTUAL_ANALYSIS, looking at the neighbouring code units |prev| and
+// |next| (U_NULL when there is none) and at what |aState| has recorded so
+// far.  Falls back to GetClass(cur) when no contextual rule applies.
+//
+// Side effects on |aState|: every non-hyphen |cur| is recorded with
+// NotifyNonHyphenCharacter(), and slashes/backslashes set the matching
+// "seen" flag.
+static int8_t
+ContextualAnalysis(char16_t prev, char16_t cur, char16_t next,
+                   ContextState &aState)
+{
+  // CLASS_OPEN/CLASS_CLOSE are only ever returned when
+  // aState.UseConservativeBreaking() is false.
+
+  if (IS_HYPHEN(cur)) {
+    // If next character is hyphen, we don't need to break between them.
+    if (IS_HYPHEN(next))
+      return CLASS_CHARACTER;
+    // If prev and next characters are numeric, it may be in Math context.
+    // So, we should not break here.
+    bool prevIsNum = IS_ASCII_DIGIT(prev);
+    bool nextIsNum = IS_ASCII_DIGIT(next);
+    if (prevIsNum && nextIsNum)
+      return CLASS_NUMERIC;
+    // If one side is numeric and the other is a character, or if both sides are
+    // characters, the hyphen should be breakable.
+    if (!aState.UseConservativeBreaking(1)) {
+      // Keep the full 32-bit value: the state may have recorded a
+      // supplementary-plane character, and narrowing it to 16 bits would
+      // make GetClass() classify an unrelated BMP code point.
+      uint32_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter();
+      if (prevOfHyphen && next) {
+        int8_t prevClass = GetClass(prevOfHyphen);
+        int8_t nextClass = GetClass(next);
+        bool prevIsNumOrCharOrClose =
+          prevIsNum ||
+          (prevClass == CLASS_CHARACTER &&
+           !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) ||
+          prevClass == CLASS_CLOSE ||
+          prevClass == CLASS_CLOSE_LIKE_CHARACTER;
+        bool nextIsNumOrCharOrOpen =
+          nextIsNum ||
+          (nextClass == CLASS_CHARACTER && !NEED_CONTEXTUAL_ANALYSIS(next)) ||
+          nextClass == CLASS_OPEN ||
+          nextClass == CLASS_OPEN_LIKE_CHARACTER ||
+          next == U_OPEN_SINGLE_QUOTE ||
+          next == U_OPEN_DOUBLE_QUOTE ||
+          next == U_OPEN_GUILLEMET;
+        if (prevIsNumOrCharOrClose && nextIsNumOrCharOrOpen) {
+          return CLASS_CLOSE;
+        }
+      }
+    }
+  } else {
+    aState.NotifyNonHyphenCharacter(cur);
+    if (cur == U_SLASH || cur == U_BACKSLASH) {
+      // If this is immediately after same char, we should not break here.
+      if (prev == cur)
+        return CLASS_CHARACTER;
+      // If this text has two or more (BACK)SLASHs, this may be file path or URL.
+      // Make sure to compute shouldReturn before we notify on this slash.
+      bool shouldReturn = !aState.UseConservativeBreaking() &&
+        (cur == U_SLASH ?
+         aState.HasPreviousSlash() : aState.HasPreviousBackslash());
+
+      if (cur == U_SLASH) {
+        aState.NotifySeenSlash();
+      } else {
+        aState.NotifySeenBackslash();
+      }
+
+      if (shouldReturn)
+        return CLASS_OPEN;
+    } else if (cur == U_PERCENT) {
+      // If this is a part of the param of URL, we should break before.
+      // ("Part of" = another '%' exactly three code units before or after.)
+      if (!aState.UseConservativeBreaking()) {
+        if (aState.Index() >= 3 &&
+            aState.GetCharAt(aState.Index() - 3) == U_PERCENT)
+          return CLASS_OPEN;
+        if (aState.Index() + 3 < aState.Length() &&
+            aState.GetCharAt(aState.Index() + 3) == U_PERCENT)
+          return CLASS_OPEN;
+      }
+    } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) {
+      // If this may be a separator of params of URL, we should break after.
+      if (!aState.UseConservativeBreaking(1) &&
+          aState.HasPreviousEqualsSign())
+        return CLASS_CLOSE;
+    } else if (cur == U_OPEN_SINGLE_QUOTE ||
+               cur == U_OPEN_DOUBLE_QUOTE ||
+               cur == U_OPEN_GUILLEMET) {
+      // for CJK usage, we treat these as openers to allow a break before them,
+      // but otherwise treat them as normal characters because quote mark usage
+      // in various Western languages varies too much; see bug #450088 discussion.
+      if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next))
+        return CLASS_OPEN;
+    } else {
+      NS_ERROR("Forgot to handle the current character!");
+    }
+  }
+  return GetClass(cur);
+}
+
+
+// Shared implementation of Next() (aDirection == 1) and Prev()
+// (aDirection == -1).
+//
+// Finds the run [begin, end) of non-space characters around aPos.  If that
+// run contains no CJK / complex-script character, the result is simply the
+// run boundary in the requested direction.  Otherwise the run is analysed
+// with GetJISx4051Breaks() and the nearest break position in the requested
+// direction (or the run boundary, if there is none) is returned.
+//
+// Prev() may legitimately pass aPos == aLen.  aText[aLen] is not part of the
+// text, so the scan below never reads it and the run is never allowed to
+// extend past aLen.
+int32_t
+nsJISx4051LineBreaker::WordMove(const char16_t* aText, uint32_t aLen,
+                                uint32_t aPos, int8_t aDirection)
+{
+  bool textNeedsJISx4051 = false;
+  int32_t begin, end;
+  const int32_t len = int32_t(aLen);
+
+  for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) {
+    // begin == len only happens on the first iteration when aPos == aLen.
+    if (begin < len &&
+        (IS_CJK_CHAR(aText[begin]) ||
+         NS_NeedsPlatformNativeHandling(aText[begin]))) {
+      textNeedsJISx4051 = true;
+    }
+  }
+  for (end = aPos + 1; end < len && !NS_IsSpace(aText[end]); ++end) {
+    if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) {
+      textNeedsJISx4051 = true;
+    }
+  }
+  // aPos + 1 overshoots the buffer when aPos == aLen; without this clamp the
+  // break analysis below would be handed one element beyond the text.
+  if (end > len) {
+    end = len;
+  }
+
+  int32_t ret;
+  AutoTArray<uint8_t, 2000> breakState;
+  if (!textNeedsJISx4051 || !breakState.AppendElements(end - begin)) {
+    // No complex text character, do not try to do complex line break.
+    // (This is required for serializers. See Bug #344816.)
+    // Also fall back to this when out of memory.
+    if (aDirection < 0) {
+      ret = (begin == int32_t(aPos)) ? begin - 1 : begin;
+    } else {
+      ret = end;
+    }
+  } else {
+    GetJISx4051Breaks(aText + begin, end - begin, nsILineBreaker::kWordBreak_Normal,
+                      breakState.Elements());
+
+    // Step from aPos in the requested direction until a break position is
+    // found or the run boundary is reached.
+    ret = aPos;
+    do {
+      ret += aDirection;
+    } while (begin < ret && ret < end && !breakState[ret - begin]);
+  }
+
+  return ret;
+}
+
+// Returns the next break position after aPos, or
+// NS_LINEBREAKER_NEED_MORE_TEXT when WordMove() finds none before the end of
+// the buffer.
+int32_t
+nsJISx4051LineBreaker::Next(const char16_t* aText, uint32_t aLen,
+                            uint32_t aPos)
+{
+  NS_ASSERTION(aText, "aText shouldn't be null");
+  NS_ASSERTION(aLen > aPos, "Bad position passed to nsJISx4051LineBreaker::Next");
+
+  const int32_t candidate = WordMove(aText, aLen, aPos, 1);
+  if (candidate >= int32_t(aLen)) {
+    return NS_LINEBREAKER_NEED_MORE_TEXT;
+  }
+  return candidate;
+}
+
+// Returns the previous break position before aPos, or
+// NS_LINEBREAKER_NEED_MORE_TEXT when WordMove() finds none after the start
+// of the buffer.
+int32_t
+nsJISx4051LineBreaker::Prev(const char16_t* aText, uint32_t aLen,
+                            uint32_t aPos)
+{
+  NS_ASSERTION(aText, "aText shouldn't be null");
+  NS_ASSERTION(aLen >= aPos && aPos > 0,
+               "Bad position passed to nsJISx4051LineBreaker::Prev");
+
+  const int32_t candidate = WordMove(aText, aLen, aPos, -1);
+  if (candidate <= 0) {
+    return NS_LINEBREAKER_NEED_MORE_TEXT;
+  }
+  return candidate;
+}
+
+// 16-bit variant.  Fills aBreakBefore[0 .. aLength-1] with the break-before
+// status of every code unit of aChars.  aBreakBefore[0] is always false.
+// aWordBreak is one of the nsILineBreaker::kWordBreak_* values: Normal uses
+// the pair tables, BreakAll allows a break before every position after the
+// first, and any other value (KeepAll) allows none.  Runs of CLASS_COMPLEX
+// characters are delegated to NS_GetComplexLineBreaks().
+void
+nsJISx4051LineBreaker::GetJISx4051Breaks(const char16_t* aChars, uint32_t aLength,
+ uint8_t aWordBreak,
+ uint8_t* aBreakBefore)
+{
+ uint32_t cur;
+ int8_t lastClass = CLASS_NONE;
+ ContextState state(aChars, aLength);
+
+ for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
+ // Combine a valid surrogate pair into one code point; an unpaired high
+ // surrogate is kept as it is.
+ uint32_t ch = aChars[cur];
+ if (NS_IS_HIGH_SURROGATE(ch)) {
+ if (cur + 1 < aLength && NS_IS_LOW_SURROGATE(aChars[cur + 1])) {
+ ch = SURROGATE_TO_UCS4(ch, aChars[cur + 1]);
+ }
+ }
+ int8_t cl;
+
+ // Classify the character, either from its context or by plain lookup.
+ if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
+ cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
+ ch,
+ cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
+ state);
+ } else {
+ if (ch == U_EQUAL)
+ state.NotifySeenEqualsSign();
+ state.NotifyNonHyphenCharacter(ch);
+ cl = GetClass(ch);
+ }
+
+ // Decide whether a break is allowed between the previous character
+ // (lastClass) and this one (cl).  Never before the first character.
+ bool allowBreak = false;
+ if (cur > 0) {
+ NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl,
+ "Loop should have prevented adjacent complex chars here");
+ if (aWordBreak == nsILineBreaker::kWordBreak_Normal) {
+ allowBreak = (state.UseConservativeBreaking()) ?
+ GetPairConservative(lastClass, cl) : GetPair(lastClass, cl);
+ } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) {
+ allowBreak = true;
+ }
+ }
+ aBreakBefore[cur] = allowBreak;
+ if (allowBreak)
+ state.NotifyBreakBefore();
+ lastClass = cl;
+ if (CLASS_COMPLEX == cl) {
+ // Collect the whole run of complex characters starting at cur and let
+ // the complex breaker fill in the breaks for it.
+ uint32_t end = cur + 1;
+
+ while (end < aLength && CLASS_COMPLEX == GetClass(aChars[end])) {
+ ++end;
+ }
+
+ NS_GetComplexLineBreaks(aChars + cur, end - cur, aBreakBefore + cur);
+
+ // We have to consider word-break value again for complex characters
+ if (aWordBreak != nsILineBreaker::kWordBreak_Normal) {
+ // Respect word-break property
+ for (uint32_t i = cur; i < end; i++)
+ aBreakBefore[i] = (aWordBreak == nsILineBreaker::kWordBreak_BreakAll);
+ }
+
+ // restore breakability at chunk begin, which was always set to false
+ // by the complex line breaker
+ aBreakBefore[cur] = allowBreak;
+
+ // Continue after the run; the loop increment moves cur on to |end|.
+ // NOTE(review): the state's index is advanced only once per loop
+ // iteration, so after a run longer than one character state.Index() lags
+ // behind cur.  Confirm whether that is intended.
+ cur = end - 1;
+ }
+
+ if (ch > 0xffff) {
+ // Supplementary-plane character: mark that we cannot break before the
+ // trailing low surrogate, and advance past it.
+ ++cur;
+ aBreakBefore[cur] = false;
+ state.AdvanceIndex();
+ }
+ }
+}
+
+// 8-bit variant.  Fills aBreakBefore[0 .. aLength-1] with the break-before
+// status of every character of aChars; aBreakBefore[0] is always false.
+// Each byte is widened to char16_t and classified on its own.  Unlike the
+// 16-bit variant there is no surrogate handling and no hand-off to the
+// complex line breaker.
+void
+nsJISx4051LineBreaker::GetJISx4051Breaks(const uint8_t* aChars, uint32_t aLength,
+ uint8_t aWordBreak,
+ uint8_t* aBreakBefore)
+{
+ uint32_t cur;
+ int8_t lastClass = CLASS_NONE;
+ ContextState state(aChars, aLength);
+
+ for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
+ char16_t ch = aChars[cur];
+ int8_t cl;
+
+ // Classify the character, either from its context or by plain lookup.
+ if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
+ cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
+ ch,
+ cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
+ state);
+ } else {
+ if (ch == U_EQUAL)
+ state.NotifySeenEqualsSign();
+ state.NotifyNonHyphenCharacter(ch);
+ cl = GetClass(ch);
+ }
+
+ // Normal: consult the pair tables.  BreakAll: always break.  Any other
+ // value (KeepAll): never break.  Never before the first character.
+ bool allowBreak = false;
+ if (cur > 0) {
+ if (aWordBreak == nsILineBreaker::kWordBreak_Normal) {
+ allowBreak = (state.UseConservativeBreaking()) ?
+ GetPairConservative(lastClass, cl) : GetPair(lastClass, cl);
+ } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) {
+ allowBreak = true;
+ }
+ }
+ aBreakBefore[cur] = allowBreak;
+ if (allowBreak)
+ state.NotifyBreakBefore();
+ lastClass = cl;
+ }
+}
diff --git a/intl/lwbrk/nsJISx4051LineBreaker.h b/intl/lwbrk/nsJISx4051LineBreaker.h
new file mode 100644
index 000000000..6b41f80df
--- /dev/null
+++ b/intl/lwbrk/nsJISx4051LineBreaker.h
@@ -0,0 +1,37 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef nsJISx4051LineBreaker_h__
+#define nsJISx4051LineBreaker_h__
+
+
+#include "nsILineBreaker.h"
+
+// nsILineBreaker implementation declared for the JIS X 4051 based breaker.
+// Reference counted (NS_DECL_ISUPPORTS); the destructor is private.
+class nsJISx4051LineBreaker : public nsILineBreaker
+{
+ NS_DECL_ISUPPORTS
+
+private:
+ virtual ~nsJISx4051LineBreaker();
+
+public:
+ nsJISx4051LineBreaker();
+
+ // Break position after / before aPos in aText (aLen code units).
+ int32_t Next( const char16_t* aText, uint32_t aLen, uint32_t aPos) override;
+
+ int32_t Prev( const char16_t* aText, uint32_t aLen, uint32_t aPos) override;
+
+ // Fill aBreakBefore[0 .. aLength-1] with the break-before status of each
+ // character of aText.  aBreakMode corresponds to the aWordBreak parameter
+ // of the nsILineBreaker declaration.
+ virtual void GetJISx4051Breaks(const char16_t* aText, uint32_t aLength,
+ uint8_t aBreakMode,
+ uint8_t* aBreakBefore) override;
+ virtual void GetJISx4051Breaks(const uint8_t* aText, uint32_t aLength,
+ uint8_t aBreakMode,
+ uint8_t* aBreakBefore) override;
+
+private:
+ // Helper behind Next() and Prev(); aDirection is +1 or -1 respectively.
+ int32_t WordMove(const char16_t* aText, uint32_t aLen, uint32_t aPos,
+ int8_t aDirection);
+};
+
+#endif /* nsJISx4051LineBreaker_h__ */
diff --git a/intl/lwbrk/nsLWBrkCIID.h b/intl/lwbrk/nsLWBrkCIID.h
new file mode 100644
index 000000000..75e280058
--- /dev/null
+++ b/intl/lwbrk/nsLWBrkCIID.h
@@ -0,0 +1,22 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef nsLWBrkCIID_h__
+#define nsLWBrkCIID_h__
+
+// {2BF64764-997F-450D-AF96-3028D1A902B0}
+#define NS_LBRK_CID \
+{ 0x2bf64764, 0x997f, 0x450d, \
+ { 0xaf, 0x96, 0x30, 0x28, 0xd1, 0xa9, 0x2, 0xb0 } }
+
+#define NS_LBRK_CONTRACTID "@mozilla.org/intl/lbrk;1"
+
+// {2BF64765-997F-450D-AF96-3028D1A902B0}
+#define NS_WBRK_CID \
+{ 0x2bf64765, 0x997f, 0x450d, \
+ { 0xaf, 0x96, 0x30, 0x28, 0xd1, 0xa9, 0x2, 0xb0 } }
+
+#define NS_WBRK_CONTRACTID "@mozilla.org/intl/wbrk;1"
+
+#endif
diff --git a/intl/lwbrk/nsPangoBreaker.cpp b/intl/lwbrk/nsPangoBreaker.cpp
new file mode 100644
index 000000000..c6fcb37cf
--- /dev/null
+++ b/intl/lwbrk/nsPangoBreaker.cpp
@@ -0,0 +1,60 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsComplexBreaker.h"
+
+#include <pango/pango-break.h>
+#include "nsUTF8Utils.h"
+#include "nsString.h"
+#include "nsTArray.h"
+
+/**
+ * Compute line-break opportunities for aText with Pango.
+ *
+ * @param aText        UTF-16 text, aLength code units long (must not be null).
+ * @param aLength      number of UTF-16 code units in aText.
+ * @param aBreakBefore output array of aLength entries; entry i is set to
+ *                     non-zero when Pango reports a line break before the
+ *                     character starting at code unit i.  The trailing half
+ *                     of a surrogate pair is always set to false.
+ *
+ * The text is converted to UTF-8 for Pango, so the loop below walks the
+ * UTF-8 string character by character while keeping u16Offset in step with
+ * the corresponding UTF-16 position.
+ */
+void
+NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,
+                        uint8_t* aBreakBefore)
+{
+  NS_ASSERTION(aText, "aText shouldn't be null");
+
+  memset(aBreakBefore, false, aLength * sizeof(uint8_t));
+
+  AutoTArray<PangoLogAttr, 2000> attrBuffer;
+  if (!attrBuffer.AppendElements(aLength + 1))
+    return;
+
+  NS_ConvertUTF16toUTF8 aUTF8(aText, aLength);
+
+  const gchar* p = aUTF8.Data();
+  const gchar* end = p + aUTF8.Length();
+  uint32_t u16Offset = 0;
+
+  static PangoLanguage* language = pango_language_from_string("en");
+
+  // The u16Offset bound is purely defensive: it keeps every write inside
+  // aBreakBefore even if the UTF-8 and UTF-16 walks ever disagree.
+  while (p < end && u16Offset < aLength)
+  {
+    PangoLogAttr* attr = attrBuffer.Elements();
+    pango_get_log_attrs(p, end - p, -1, language, attr, attrBuffer.Length());
+
+    while (p < end && u16Offset < aLength)
+    {
+      aBreakBefore[u16Offset] = attr->is_line_break;
+      // One UTF-8 character corresponds to two UTF-16 code units only when
+      // it is a well-formed surrogate pair, i.e. a HIGH surrogate followed
+      // by a LOW one.  Never break inside the pair, and never index past
+      // the end of the buffers.
+      if (NS_IS_HIGH_SURROGATE(aText[u16Offset]) &&
+          u16Offset + 1 < aLength &&
+          NS_IS_LOW_SURROGATE(aText[u16Offset + 1]))
+        aBreakBefore[++u16Offset] = false; // Skip low surrogate
+      ++u16Offset;
+
+      bool err;
+      uint32_t ch = UTF8CharEnumerator::NextChar(&p, end, &err);
+      ++attr;
+
+      if (ch == 0 || err) {
+        // pango_break (pango 1.16.2) only analyses text before the
+        // first NUL (but sets one extra attr). Workaround loop to call
+        // pango_break again to analyse after the NUL is done somewhere else
+        // (gfx/thebes/gfxFontconfigFonts.cpp: SetupClusterBoundaries()).
+        // So, we do the same here for pango_get_log_attrs.
+        break;
+      }
+    }
+  }
+}
+
diff --git a/intl/lwbrk/nsRuleBreaker.cpp b/intl/lwbrk/nsRuleBreaker.cpp
new file mode 100644
index 000000000..035996873
--- /dev/null
+++ b/intl/lwbrk/nsRuleBreaker.cpp
@@ -0,0 +1,20 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsComplexBreaker.h"
+
+#define TH_UNICODE
+#include "rulebrk.h"
+
+// Rule-based complex line breaker: asks TrbWordBreakPos() about every
+// position in turn and records a break before position i exactly when it
+// answers 0 for that position.
+void
+NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,
+                        uint8_t* aBreakBefore)
+{
+  NS_ASSERTION(aText, "aText shouldn't be null");
+
+  uint32_t offset = 0;
+  while (offset < aLength) {
+    const int breakPos =
+      TrbWordBreakPos(aText, offset, aText + offset, aLength - offset);
+    aBreakBefore[offset] = (breakPos == 0);
+    ++offset;
+  }
+}
+
diff --git a/intl/lwbrk/nsSampleWordBreaker.cpp b/intl/lwbrk/nsSampleWordBreaker.cpp
new file mode 100644
index 000000000..fa54adeda
--- /dev/null
+++ b/intl/lwbrk/nsSampleWordBreaker.cpp
@@ -0,0 +1,150 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+
+#include "nsSampleWordBreaker.h"
+
+// Nothing to set up: the constructor body is intentionally empty.
+nsSampleWordBreaker::nsSampleWordBreaker()
+{
+}
+// Nothing to tear down: the destructor body is intentionally empty.
+nsSampleWordBreaker::~nsSampleWordBreaker()
+{
+}
+
+NS_IMPL_ISUPPORTS(nsSampleWordBreaker, nsIWordBreaker)
+
+// Report whether a word boundary lies between the end of aText1 and the
+// start of aText2: true when the last character of aText1 and the first
+// character of aText2 belong to different character classes.  Returns false
+// when either buffer is null or empty.
+bool nsSampleWordBreaker::BreakInBetween(
+  const char16_t* aText1 , uint32_t aTextLen1,
+  const char16_t* aText2 , uint32_t aTextLen2)
+{
+  NS_PRECONDITION( nullptr != aText1, "null ptr");
+  NS_PRECONDITION( nullptr != aText2, "null ptr");
+
+  const bool haveBothSides =
+    aText1 && aText2 && (aTextLen1 != 0) && (aTextLen2 != 0);
+  if (!haveBothSides) {
+    return false;
+  }
+
+  const uint8_t classBefore = this->GetClass(aText1[aTextLen1 - 1]);
+  const uint8_t classAfter = this->GetClass(aText2[0]);
+  return classBefore != classAfter;
+}
+
+
+// Character-range predicates used by GetClass().  Every macro is fully
+// parenthesized so it can be combined safely with !, && and ||.
+#define IS_ASCII(c) (0 == ( 0xFF80 & (c)))
+#define ASCII_IS_ALPHA(c) ((( 'a' <= (c)) && ((c) <= 'z')) || (( 'A' <= (c)) && ((c) <= 'Z')))
+#define ASCII_IS_DIGIT(c) (( '0' <= (c)) && ((c) <= '9'))
+#define ASCII_IS_SPACE(c) (( ' ' == (c)) || ( '\t' == (c)) || ( '\r' == (c)) || ( '\n' == (c)))
+#define IS_ALPHABETICAL_SCRIPT(c) ((c) < 0x2E80)
+
+// we change the beginning of IS_HAN from 0x4e00 to 0x3400 to reflect Unicode 3.0
+#define IS_HAN(c) ((( 0x3400 <= (c)) && ((c) <= 0x9fff))||(( 0xf900 <= (c)) && ((c) <= 0xfaff)))
+#define IS_KATAKANA(c) (( 0x30A0 <= (c)) && ((c) <= 0x30FF))
+#define IS_HIRAGANA(c) (( 0x3040 <= (c)) && ((c) <= 0x309F))
+#define IS_HALFWIDTHKATAKANA(c) (( 0xFF60 <= (c)) && ((c) <= 0xFF9F))
+#define IS_THAI(c) (0x0E00 == (0xFF80 & (c) )) // Look at the highest 9 bits
+
+// Map a UTF-16 code unit to one of the wb_class values.  This is a coarse,
+// range-based classification ("hack" in the original author's words), not a
+// full Unicode property lookup.
+uint8_t nsSampleWordBreaker::GetClass(char16_t c)
+{
+  // U+2E80 and above: distinguish the CJK scripts, everything else counts
+  // as a letter.
+  if (!IS_ALPHABETICAL_SCRIPT(c)) {
+    if (IS_HAN(c)) {
+      return kWbClassHanLetter;
+    }
+    if (IS_KATAKANA(c)) {
+      return kWbClassKatakanaLetter;
+    }
+    if (IS_HIRAGANA(c)) {
+      return kWbClassHiraganaLetter;
+    }
+    if (IS_HALFWIDTHKATAKANA(c)) {
+      return kWbClassHWKatakanaLetter;
+    }
+    return kWbClassAlphaLetter;
+  }
+
+  // Below U+2E80 but outside ASCII: Thai, NBSP, or a generic letter.
+  if (!IS_ASCII(c)) {
+    if (IS_THAI(c)) {
+      return kWbClassThaiLetter;
+    }
+    if (c == 0x00A0/*NBSP*/) {
+      return kWbClassSpace;
+    }
+    return kWbClassAlphaLetter;
+  }
+
+  // Plain ASCII.
+  if (ASCII_IS_SPACE(c)) {
+    return kWbClassSpace;
+  }
+  if (ASCII_IS_ALPHA(c) || ASCII_IS_DIGIT(c)) {
+    return kWbClassAlphaLetter;
+  }
+  return kWbClassPunct;
+}
+
+// Find the run of same-class characters ("word") that contains aOffset.
+//
+// Returns [mBegin, mEnd) as offsets into aText.  When aText is null or
+// aOffset is beyond aTextLen, both members are set to aTextLen + 1 to signal
+// failure.
+nsWordRange nsSampleWordBreaker::FindWord(
+  const char16_t* aText , uint32_t aTextLen,
+  uint32_t aOffset)
+{
+  nsWordRange range;
+  NS_PRECONDITION( nullptr != aText, "null ptr");
+  NS_PRECONDITION( 0 != aTextLen, "len = 0");
+  NS_PRECONDITION( aOffset <= aTextLen, "aOffset > aTextLen");
+
+  range.mBegin = aTextLen + 1;
+  range.mEnd = aTextLen + 1;
+
+  if(!aText || aOffset > aTextLen)
+    return range;
+
+  // NOTE(review): aOffset == aTextLen is accepted above, so this reads
+  // aText[aTextLen]; callers must supply a buffer that is readable there
+  // (e.g. NUL-terminated).  Left unchanged to preserve existing results.
+  uint8_t c = this->GetClass(aText[aOffset]);
+  uint32_t i;
+  // Scan forward.  The default end is aTextLen; the scan stops before
+  // aTextLen because a class change "at" aTextLen would yield the same end
+  // anyway, and this avoids reading one element past the text.
+  range.mEnd--;
+  for(i = aOffset +1;i < aTextLen; i++)
+  {
+    if( c != this->GetClass(aText[i]))
+    {
+      range.mEnd = i;
+      break;
+    }
+  }
+
+  // Scan backward
+  range.mBegin = 0;
+  for(i = aOffset ;i > 0; i--)
+  {
+    if( c != this->GetClass(aText[i-1]))
+    {
+      range.mBegin = i;
+      break;
+    }
+  }
+  // TODO: when c is kWbClassThaiLetter, pass the whole Thai segment to the
+  // Thai word breaker to find a shorter answer.
+  return range;
+}
+
+// Return the offset of the first character after aPos whose class differs
+// from the class of aText[aPos], or NS_WORDBREAKER_NEED_MORE_TEXT when the
+// run extends to the end of the buffer (or aPos is not inside the buffer).
+int32_t nsSampleWordBreaker::NextWord(
+  const char16_t* aText, uint32_t aLen, uint32_t aPos)
+{
+  uint32_t cur = aPos;
+  // >= rather than ==: a position past the end must not be dereferenced.
+  if (cur >= aLen)
+    return NS_WORDBREAKER_NEED_MORE_TEXT;
+
+  // GetClass() returns uint8_t; keep the same type for the comparison.
+  const uint8_t startClass = this->GetClass(aText[cur]);
+
+  for(cur++; cur <aLen; cur++)
+  {
+    if(this->GetClass(aText[cur]) != startClass)
+      break;
+  }
+  // TODO: when startClass is kWbClassThaiLetter, pass the whole Thai segment
+  // to the Thai word breaker to find a shorter answer.
+  if (cur == aLen)
+    return NS_WORDBREAKER_NEED_MORE_TEXT;
+  return cur;
+}
diff --git a/intl/lwbrk/nsSampleWordBreaker.h b/intl/lwbrk/nsSampleWordBreaker.h
new file mode 100644
index 000000000..51e17daa7
--- /dev/null
+++ b/intl/lwbrk/nsSampleWordBreaker.h
@@ -0,0 +1,42 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef nsSampleWordBreaker_h__
+#define nsSampleWordBreaker_h__
+
+
+#include "nsIWordBreaker.h"
+
+// Character classes produced by nsSampleWordBreaker::GetClass().  The ranges
+// quoted below are the ones tested in nsSampleWordBreaker.cpp.
+typedef enum {
+ // ASCII space, tab, CR, LF and U+00A0 (NBSP)
+ kWbClassSpace = 0,
+ // ASCII letters and digits, plus every character not matched by another class
+ kWbClassAlphaLetter,
+ // remaining ASCII characters
+ kWbClassPunct,
+ // U+3400..U+9FFF and U+F900..U+FAFF
+ kWbClassHanLetter,
+ // U+30A0..U+30FF
+ kWbClassKatakanaLetter,
+ // U+3040..U+309F
+ kWbClassHiraganaLetter,
+ // U+FF60..U+FF9F
+ kWbClassHWKatakanaLetter,
+ // U+0E00..U+0E7F
+ kWbClassThaiLetter
+} wb_class;
+
+// nsIWordBreaker implementation that treats a "word" as a maximal run of
+// characters sharing the same wb_class.
+class nsSampleWordBreaker : public nsIWordBreaker
+{
+ NS_DECL_ISUPPORTS
+public:
+
+ nsSampleWordBreaker() ;
+
+ // True when the last character of aText1 and the first character of aText2
+ // fall in different classes; false for null or empty input.
+ bool BreakInBetween(const char16_t* aText1 , uint32_t aTextLen1,
+ const char16_t* aText2 , uint32_t aTextLen2) override;
+ // Range of the same-class run around aOffset; both members are
+ // aTextLen1 + 1 when the arguments are unusable.
+ nsWordRange FindWord(const char16_t* aText1 , uint32_t aTextLen1,
+ uint32_t aOffset) override;
+
+ // Offset of the next class change after aPos, or
+ // NS_WORDBREAKER_NEED_MORE_TEXT when the run reaches the end of the buffer.
+ int32_t NextWord(const char16_t* aText, uint32_t aLen, uint32_t aPos) override;
+
+protected:
+ // Classify one UTF-16 code unit as a wb_class value.
+ uint8_t GetClass(char16_t aChar);
+
+ virtual ~nsSampleWordBreaker();
+};
+
+#endif /* nsSampleWordBreaker_h__ */
diff --git a/intl/lwbrk/nsSemanticUnitScanner.cpp b/intl/lwbrk/nsSemanticUnitScanner.cpp
new file mode 100644
index 000000000..8feb738a8
--- /dev/null
+++ b/intl/lwbrk/nsSemanticUnitScanner.cpp
@@ -0,0 +1,76 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsSemanticUnitScanner.h"
+
+NS_IMPL_ISUPPORTS_INHERITED(nsSemanticUnitScanner, nsSampleWordBreaker, nsISemanticUnitScanner)
+
+// Only forwards to the nsSampleWordBreaker constructor; the body is empty.
+nsSemanticUnitScanner::nsSemanticUnitScanner() : nsSampleWordBreaker()
+{
+ /* member initializers and constructor code */
+}
+
+// The destructor body is empty.
+nsSemanticUnitScanner::~nsSemanticUnitScanner()
+{
+ /* destructor code */
+}
+
+
+// Placeholder: characterSet is currently ignored and the call always
+// succeeds with NS_OK.
+NS_IMETHODIMP nsSemanticUnitScanner::Start(const char *characterSet)
+{
+ // do nothing for now.
+ return NS_OK;
+}
+
+// Find the next semantic unit in |text| at or after |pos|.
+//
+// On return, *begin / *end delimit the unit and *_retval tells whether a
+// unit was found:
+//  - at or past |length|: *begin == *end == pos, *_retval == false;
+//  - a Han character is returned on its own (one code unit);
+//  - runs of spaces or punctuation are skipped;
+//  - when the current run reaches the end of the buffer, the unit is only
+//    reported if |isLastBuffer| is true (then *end == length), otherwise
+//    *begin == *end and *_retval == false so the caller can supply more text.
+// Always returns NS_OK.
+NS_IMETHODIMP nsSemanticUnitScanner::Next(const char16_t *text, int32_t length, int32_t pos, bool isLastBuffer, int32_t *begin, int32_t *end, bool *_retval)
+{
+  // XXX still needs bullet-proofing: text, begin, end and _retval are
+  // dereferenced without null checks, and pos is assumed non-negative.
+
+  // Space/punctuation runs are skipped by looping rather than by recursing,
+  // so the stack depth stays constant however many runs precede the unit.
+  for (;;) {
+    // if we reach the end, just return
+    if (pos >= length) {
+      *begin = pos;
+      *end = pos;
+      *_retval = false;
+      return NS_OK;
+    }
+
+    uint8_t char_class = nsSampleWordBreaker::GetClass(text[pos]);
+
+    // if we are in chinese mode, return one han letter at a time
+    // we should not do this if we are in Japanese or Korean mode
+    if (kWbClassHanLetter == char_class) {
+      *begin = pos;
+      *end = pos+1;
+      *_retval = true;
+      return NS_OK;
+    }
+
+    // find the next "word"
+    int32_t next = NextWord(text, (uint32_t) length, (uint32_t) pos);
+
+    // if we don't have enough text to make decision, return
+    if (next == NS_WORDBREAKER_NEED_MORE_TEXT) {
+      *begin = pos;
+      *end = isLastBuffer ? length : pos;
+      *_retval = isLastBuffer;
+      return NS_OK;
+    }
+
+    // anything but space or punct is a unit: report it
+    if ((char_class != kWbClassSpace) && (char_class != kWbClassPunct)) {
+      *begin = pos;
+      *end = next;
+      *_retval = true;
+      return NS_OK;
+    }
+
+    // what we got is space or punct: look at the next break
+    pos = next;
+  }
+}
+
diff --git a/intl/lwbrk/nsSemanticUnitScanner.h b/intl/lwbrk/nsSemanticUnitScanner.h
new file mode 100644
index 000000000..5e13fe78c
--- /dev/null
+++ b/intl/lwbrk/nsSemanticUnitScanner.h
@@ -0,0 +1,27 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef nsSemanticUnitScanner_h__
+#define nsSemanticUnitScanner_h__
+
+#include "nsSampleWordBreaker.h"
+#include "nsISemanticUnitScanner.h"
+
+
+// Implements nsISemanticUnitScanner on top of the class-based word breaking
+// inherited from nsSampleWordBreaker.
+class nsSemanticUnitScanner : public nsISemanticUnitScanner
+ , public nsSampleWordBreaker
+{
+public:
+ NS_DECL_ISUPPORTS_INHERITED
+ // Declares the nsISemanticUnitScanner methods (Start and Next are defined
+ // in nsSemanticUnitScanner.cpp).
+ NS_DECL_NSISEMANTICUNITSCANNER
+
+ nsSemanticUnitScanner();
+
+private:
+ virtual ~nsSemanticUnitScanner();
+ /* additional members */
+};
+
+#endif
diff --git a/intl/lwbrk/nsUniscribeBreaker.cpp b/intl/lwbrk/nsUniscribeBreaker.cpp
new file mode 100644
index 000000000..2a1b69b22
--- /dev/null
+++ b/intl/lwbrk/nsUniscribeBreaker.cpp
@@ -0,0 +1,58 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsComplexBreaker.h"
+
+#include <windows.h>
+
+#include <usp10.h>
+
+#include "nsUTF8Utils.h"
+#include "nsString.h"
+#include "nsTArray.h"
+
+/**
+ * Compute line-break opportunities for aText with Uniscribe.
+ *
+ * @param aText        UTF-16 text, aLength code units long (must not be null).
+ * @param aLength      number of UTF-16 code units in aText.
+ * @param aBreakBefore output array of aLength entries; entry i receives the
+ *                     fSoftBreak flag Uniscribe reports for code unit i.
+ *
+ * The array is cleared first, so on any failure the entries not yet
+ * processed stay false.
+ */
+void
+NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,
+                        uint8_t* aBreakBefore)
+{
+  NS_ASSERTION(aText, "aText shouldn't be null");
+
+  int outItems = 0;
+  HRESULT result;
+  AutoTArray<SCRIPT_ITEM, 64> items;
+  char16ptr_t text = aText;
+
+  memset(aBreakBefore, false, aLength);
+
+  if (!items.AppendElements(64))
+    return;
+
+  do {
+    // ScriptItemize() stores up to cMaxItems items followed by a terminal
+    // item, so the buffer must hold cMaxItems + 1 entries: advertise one
+    // entry less than the buffer actually has.
+    result = ScriptItemize(text, aLength, items.Length() - 1, nullptr, nullptr,
+                           items.Elements(), &outItems);
+
+    if (result == E_OUTOFMEMORY) {
+      // Too many items for the buffer: double it and retry.
+      if (!items.AppendElements(items.Length()))
+        return;
+    }
+  } while (result == E_OUTOFMEMORY);
+
+  for (int iItem = 0; iItem < outItems; ++iItem) {
+    uint32_t endOffset = (iItem + 1 == outItems ? aLength : items[iItem + 1].iCharPos);
+    uint32_t startOffset = items[iItem].iCharPos;
+    AutoTArray<SCRIPT_LOGATTR, 64> sla;
+
+    if (!sla.AppendElements(endOffset - startOffset))
+      return;
+
+    if (ScriptBreak(text + startOffset, endOffset - startOffset,
+                    &items[iItem].a, sla.Elements()) < 0)
+      return;
+
+    for (uint32_t j=0; j+startOffset < endOffset; ++j) {
+      aBreakBefore[j+startOffset] = sla[j].fSoftBreak;
+    }
+  }
+}
diff --git a/intl/lwbrk/rulebrk.c b/intl/lwbrk/rulebrk.c
new file mode 100644
index 000000000..0c9e86e82
--- /dev/null
+++ b/intl/lwbrk/rulebrk.c
@@ -0,0 +1,376 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#define TH_UNICODE
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <assert.h>
+#include "th_char.h"
+#define th_isalpha(c) (((c)>='a'&&(c)<='z')||((c)>='A'&&(c)<='Z'))
+#define th_isspace(c) ((c)==' '||(c)=='\t')
+
+
+/*
+/////////////////////////////////////////////////
+// Thai character type array
+*/
+
+typedef unsigned short twb_t;
+extern const twb_t _TwbType[0x100-0xa0];
+
+/*
+// bit definition
+*/
+
+#define VRS 0x0001
+#define VRE 0x0002
+#define VRX 0x0004
+
+#define VRA 0x0008
+
+#define VLA 0x0010
+#define VLO 0x0020
+#define VLI 0x0040
+
+#define VC 0x0080
+
+#define CC 0x0100
+#define CS 0x0200
+
+#define C2 0x0400
+#define CHB 0x0800
+#define CHE 0x1000
+
+#define MT 0x2000
+/*
+//_#define me 0x2000
+*/
+#define M 0x4000
+
+#define T 0x8000
+
+#define VL (VLA|VLO|VLI)
+#define VR (VRS|VRE|VRX)
+#define NE (VL|VRS)
+#define NB (VR|M)
+#define V (VL|VR)
+#define CX (CC|CS)
+#define C (CX|VC)
+#define A (C|V|M)
+
+#define twbtype(c) (_TwbType[th_zcode(c)])
+
+#ifndef TRUE
+#define TRUE 1
+#define FALSE 0
+#endif
+#define RETURN(b) return (b)
+
+
+/*
+/////////////////////////////////////////////////
+*/
+
+/*
+ * Rule-based test for a Thai word break at the boundary just before rstr[0].
+ *
+ * pstr, left  : start of the text and the number of characters available to
+ *               the left of the boundary (lstr = pstr + left is the boundary).
+ * rstr, right : text starting at the boundary and the number of characters
+ *               available from there.
+ *
+ * Returns -1 when no break is reported at this boundary (including bad
+ * arguments and a non-Thai rstr[0]), 0 for a break at the boundary itself,
+ * and a positive count when the break lies that many characters further to
+ * the right (TrbFollowing() advances by the returned amount).
+ *
+ * c(i) / t(i) hold the character and its _TwbType bits for offset i relative
+ * to the boundary, i in -3..2, with 0 meaning "nothing there".  The backward
+ * scan may overwrite a tone-mark slot with the character before it, so
+ * c(-2) and c(-3) are not always the literal neighbours.
+ */
+int TrbWordBreakPos(const th_char *pstr, int left,
+ const th_char *rstr, int right)
+/* const ThBreakIterator *it, const th_char **p)*/
+{
+ /*
+ //int left, right;
+ //const th_char *s = *p;
+ */
+ const th_char *lstr = pstr + left;
+ th_char _c[6];
+ twb_t _t[6];
+ #define c(i) (_c[(i)+3])
+ #define t(i) (_t[(i)+3])
+ int i, j;
+
+ /*
+ //left = s - it->begin;
+ */
+ if(left < 0) return -1;
+ /*
+ //right = (it->end == NULL) ? 4 : it->begin - s;
+ */
+ if(right < 1) return -1;
+
+ /*
+ // get c(0), t(0)
+ */
+ c(0) = rstr[0]; /* may be '\0' */
+ if(!th_isthai(c(0))) return -1;
+ t(0) = twbtype(c(0));
+ if(!(t(0) & A)) return -1;
+
+ /*
+ // get c(-1), t(-1)
+ */
+ if(left >= 1) {
+ c(-1) = lstr[-1];
+ if(!th_isthai(c(-1))) return 0;
+ t(-1) = twbtype(c(-1));
+ if(!(t(-1) & A)) return 0; /* handle punctuation marks here */
+ } else { c(-1) = 0; t(-1) = 0; }
+
+ /*
+ // get c(1..2), t(1..2)
+ */
+ /* A non-Thai or non-A character truncates `right` to i and re-runs the */
+ /* same i, so that slot (and any later one) is zero-filled. */
+ for(i = 1; i <= 2; i++) {
+ if(i >= right) { c(i) = 0; t(i) = 0; }
+ else {
+ c(i) = rstr[i]; /* may be '\0'; */
+ if(!th_isthai(c(i))) right = i--;
+ else {
+ t(i) = twbtype(c(i));
+ if(!(t(i) & A)) right = i--;
+ }
+ }
+ }
+ /*
+ // get c(-2..-3), t(-2..-3)
+ */
+ /* j walks the text leftwards; i is the slot being filled. Setting */
+ /* left = 0 makes the next iteration zero-fill the current slot. */
+ for(i = -2, j = -2; i >= -3 ; j--) {
+ if(j < -left) { c(i) = 0; t(i) = 0; i--; }
+ else {
+ c(i) = lstr[j];
+ if(!th_isthai(c(i))) left = 0;
+ else {
+ t(i) = (twb_t)(th_isthai(c(i)) ? twbtype(c(i)) : 0);
+ if(!(t(i) & A)) left = 0;
+ else {
+ if((t(i+1) & MT) && ((t(i) & VR) || (t(i+2) & VR))) {
+ c(i+1) = c(i); t(i+1) = t(i);
+ } else i--;
+ }
+ }
+ }
+ }
+
+ /*
+ // prohibit the unlikely
+ */
+ if((t(-1) & C) && (t(0) & C)) {
+ if((t(-1) & CHE) || (t(0) & CHB)) return -1;
+ }
+ /*
+ // special case : vlao, C/ sara_a|aa, !sara_a
+ */
+ /* NOTE(review): c(-0) below is the same slot as c(0); the comment above */
+ /* suggests c(-1) may have been intended. Confirm before changing -- it */
+ /* would alter break results. */
+ if((t(-3) & (VLA|VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) &&
+ (c(-1) == TH_SARA_A || c(-0) == TH_SARA_AA)) return 0;
+
+ /*
+ // prohibit break
+ */
+ if(t(0) & NB) return -1;
+ if(t(-1) & NE) return -1;
+
+
+ /*
+ // apply 100% rules
+ */
+ if(t(-1) & VRE) {
+ if(c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0;
+ return -1; /* usually too short syllable, part of word */
+ }
+
+ if(t(-2) & VRE) return -1;
+
+ if((t(0) & C) && (t(1) & (VR|MT)) && (c(2) != TH_THANTHAKHAT)) { /*?C, NB */
+ if((t(-1) & (VRS|VRX)) && c(1) == TH_SARA_I) return -1; /* exception */
+ if(t(-1) & (V|M)) return 0; /* !C/ C, NB */
+ if(t(-2) & VRS) return 0; /* VRS, C / C, NB */
+ if(!(t(0) & C2) && c(1) == TH_SARA_I) { /* / !C2 or /c, sara_i */
+ if(t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */
+ if(t(-2) & VC) return 0; /* VC, C / C, NB ? 100% */
+ }
+ }
+ if((t(-1) & VRX) && (t(0) & CC)) return 0; /* VRX/ CC */
+ if((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V|M))) return 0;/* VRS, C/ !C */
+
+
+ if((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) {
+ if((t(-2) & A) && (t(-1) & CX)) return 0; /* A, CX / CX, C2 */
+ if((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */
+ }
+ /*
+ // apply 90% rules
+ */
+ if(t(0) & VL) return 0;
+ if(t(1) & VL) return -1;
+ if(c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING) return 0;
+
+ /*
+ //return -1;
+ // apply 80% rules
+ */
+ if(t(0) & CHE) {
+ if((t(-2) & VRS) && (t(-1) & C)) return 0; /* VRS, C/ CHE */
+ /*if(t(-1) & VRX) return 0; // VRX/ CHE */
+ if(t(-1) & VC) return 0; /* VC/ CHE */
+ }
+ if(t(-1) & CHB) {
+ if((t(0) & C) && (t(1) & VR)) return 0; /* CHB/ CC, VR */
+ if(t(0) & VC) return 0; /* CHB/ VC */
+ }
+
+ if((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */
+ if(t(-2) & VLI) return 0; /* VLI,C/C,VR .*/
+ else { /* vlao, C ? C , VR */
+ if(c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */
+ if(t(-2) & VLO) return 0; /* VLO, C/ C, !sara_a */
+ if(!(t(1) & VRA)) return 0; /* VLA, C/ C, !vca */
+ }
+ }
+ /* C,MT,C */
+ if((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1;
+
+ return -1;
+}
+
+
+/*
+ * Return the offset (relative to begin) of the next break position at or
+ * after `offset` in the `length`-character text starting at `begin`.
+ * Scanning also stops at a NUL character.
+ */
+int TrbFollowing(const th_char *begin, int length, int offset)
+/*
+//(ThBreakIterator *this, int offset)
+*/
+{
+ const th_char *w = begin + offset;
+ const th_char *end = begin + length;
+ /* skip leading spaces/tabs */
+ while(w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++;
+
+ /* a run of non-Thai, non-space characters: stop after it if it contained */
+ /* an ASCII letter or is followed by the end of text or a space */
+ if(w < end && *w && !th_isthai(*w)) {
+ int english = FALSE;
+ while(w < end && *w && !th_isthai(*w) && !th_isspace(*w)) {
+ if(th_isalpha(*w)) english = TRUE;
+ w++;
+ }
+ if(english || w == end ||
+ (!th_isthai(*w) && th_isspace(*w))) return w - begin;
+ }
+ if(w == end || *w == 0 || !th_isthai(*w)) return w - begin;
+ /* w is on a Thai character: step over it, then advance until */
+ /* TrbWordBreakPos() reports a break or the Thai run ends */
+ w++;
+ if(w < end && *w && th_isthai(*w)) {
+ int brk = TrbWordBreakPos(begin, w-begin, w, end-w);
+ while (brk < 0) {
+ w++;
+ if(w == end || *w == 0 || !th_isthai(*w)) break;
+ brk = TrbWordBreakPos(begin, w-begin, w, end-w);
+ }
+ /* a positive result means the break is that many characters ahead */
+ if (brk > 0) w += brk;
+ }
+ /* absorb trailing characters that are neither Thai, ASCII letters nor spaces */
+ if(w < end && *w && !th_isthai(*w)) {
+ while(w < end && *w && !th_isthai(*w) &&
+ !th_isalpha(*w) && !th_isspace(*w)) w++;
+ }
+ return w - begin;
+}
+
+
+/*
+/////////////////////////////////////////////////
+*/
+/*
+ * Type bits (VRS, CC, MT, ... defined above) for each Thai character,
+ * indexed by th_zcode(c) through the twbtype() macro. The label on each row
+ * is 0xa0 + index, i.e. the character's code in the 0xa0-based (tis_char)
+ * numbering; with TH_UNICODE the same index is c - 0x0e00. The rows under
+ * "#if 0" are disabled and not part of the table.
+ */
+const twb_t _TwbType[0x100-0xa0] = {
+#if 0
+/* 80 */ T,
+/* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+/* 90 */ T,
+/* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+#endif
+/* a0 */ 0,
+/* a1 */ CS,
+/* a2 */ CS | CHE,
+/* a3 */ CC | CHE,
+/* a4 */ CS | CHE,
+/* a5 */ CC | CHE,
+/* a6 */ CS,
+/* a7 */ CS | CHB,
+/* a8 */ CS,
+/* a9 */ CC | CHE,
+/* aa */ CS,
+/* ab */ CC | CHE,
+/* ac */ CC | CHB | CHE,
+/* ad */ CS | CHB,
+/* ae */ CS | CHB,
+/* af */ CS | CHB,
+/* b0 */ CS,
+/* b1 */ CS | CHB | CHE,
+/* b2 */ CS | CHB | CHE,
+/* b3 */ CS | CHB,
+/* b4 */ CS,
+/* b5 */ CS,
+/* b6 */ CS,
+/* b7 */ CS,
+/* b8 */ CS,
+/* b9 */ CS,
+/* ba */ CS,
+/* bb */ CS,
+/* bc */ CC | CHE,
+/* bd */ CC | CHE,
+/* be */ CS,
+/* bf */ CS,
+/* c0 */ CS | CHE,
+/* c1 */ CS,
+/* c2 */ CS,
+/* c3 */ CS | C2 | CHE, /* ? add CHE */
+/* c4 */ VC | CHE,
+/* c5 */ CS | C2,
+/* c6 */ VC | CHE,
+/* c7 */ VC | C2,
+/* c8 */ CS,
+/* c9 */ CS | CHB,
+/* ca */ CS | CHE,
+/* cb */ CC | CHE,
+/* cc */ CS | CHB | CHE,
+/* cd */ VC,
+/* ce */ CC | CHE,
+/* cf */ T,
+/* d0 */ VRE | VRA,
+/* d1 */ VRS,
+/* d2 */ VRX | VRA,
+/* d3 */ VRE,
+/* d4 */ VRX | VRA,
+/* d5 */ VRX | VRA,
+/* d6 */ VRS,
+/* d7 */ VRS | VRA,
+/* d8 */ VRX,
+/* d9 */ VRX,
+/* da */ T,
+/* db */ 0,
+/* dc */ 0,
+/* dd */ 0,
+/* de */ 0,
+/* df */ T,
+/* e0 */ VLA,
+/* e1 */ VLO,
+/* e2 */ VLO,
+/* e3 */ VLI,
+/* e4 */ VLI,
+/* e5 */ VRE,
+/* e6 */ M,
+/* e7 */ M,
+/* e8 */ M | MT,
+/* e9 */ M | MT,
+/* ea */ M | MT,
+/* eb */ M | MT,
+/* ec */ M,
+/* ed */ T,
+/* ee */ T,
+/* ef */ T,
+/* f0 */ T,
+/* f1 */ T,
+/* f2 */ T,
+/* f3 */ T,
+/* f4 */ T,
+/* f5 */ T,
+/* f6 */ T,
+/* f7 */ T,
+/* f8 */ T,
+/* f9 */ T,
+/* fa */ T,
+/* fb */ T,
+/* fc */ 0,
+/* fd */ 0,
+/* fe */ 0,
+/* ff */ 0
+};
diff --git a/intl/lwbrk/rulebrk.h b/intl/lwbrk/rulebrk.h
new file mode 100644
index 000000000..edc88651b
--- /dev/null
+++ b/intl/lwbrk/rulebrk.h
@@ -0,0 +1,26 @@
+/*
+Copyright (c) 1999 Samphan Raruenrom <samphan@thai.com>
+Permission to use, copy, modify, distribute and sell this software
+and its documentation for any purpose is hereby granted without fee,
+provided that the above copyright notice appear in all copies and
+that both that copyright notice and this permission notice appear
+in supporting documentation. Samphan Raruenrom makes no
+representations about the suitability of this software for any
+purpose. It is provided "as is" without express or implied warranty.
+*/
+#ifndef __RULEBRK_H__
+#define __RULEBRK_H__
+#include "th_char.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int TrbWordBreakPos(const th_char *pstr, int left,
+ const th_char *rstr, int right);
+int TrbFollowing(const th_char *begin, int length, int offset);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/intl/lwbrk/th_char.h b/intl/lwbrk/th_char.h
new file mode 100644
index 000000000..c6d7420f4
--- /dev/null
+++ b/intl/lwbrk/th_char.h
@@ -0,0 +1,54 @@
+/*
+Copyright (c) 1999 Samphan Raruenrom <samphan@thai.com>
+Permission to use, copy, modify, distribute and sell this software
+and its documentation for any purpose is hereby granted without fee,
+provided that the above copyright notice appear in all copies and
+that both that copyright notice and this permission notice appear
+in supporting documentation. Samphan Raruenrom makes no
+representations about the suitability of this software for any
+purpose. It is provided "as is" without express or implied warranty.
+*/
+#ifndef __TH_CHAR_H__
+#define __TH_CHAR_H__
+
+
+typedef unsigned char tis_char;
+
+#ifdef TH_UNICODE
+/*
+ * The char16_t type is only usable in C++ code, so we need this ugly hack to
+ * select a binary compatible C type for the C code (rulebrk.c) to use.
+ */
+#ifdef __cplusplus
+typedef char16_t th_char;
+#else
+typedef uint16_t th_char;
+#endif
+#define TH_THAIBEGIN_ 0x0e00
+#define th_isthai(c) (0x0e00 <= (c) && (c) <= 0x0e5f)
+#else
+typedef tis_char th_char;
+#define TH_THAIBEGIN_ 0xa0
+#define th_isthai(c) ((c) >= 0xa0)
+#endif
+#define th_zcode(c) ((c) - TH_THAIBEGIN_)
+
+enum TH_CHARNAME {
+TH_THAIBEGIN = TH_THAIBEGIN_,
+TH_KOKAI,TH_KHOKHAI,TH_KHOKHUAT,TH_KHOKHWAI,TH_KHOKHON,TH_KHORAKHANG,
+TH_NGONGU,TH_CHOCHAN,TH_CHOCHING,TH_CHOCHANG,TH_SOSO,TH_CHOCHOE,TH_YOYING,
+TH_DOCHADA,TH_TOPATAK,TH_THOTHAN,TH_THONANGMONTHO,TH_THOPHUTHAO,TH_NONEN,
+TH_DODEK,TH_TOTAO,TH_THOTHUNG,TH_THOTHAHAN,TH_THOTHONG,TH_NONU,TH_BOBAIMAI,
+TH_POPLA,TH_PHOPHUNG,TH_FOFA,TH_PHOPHAN,TH_FOFAN,TH_PHOSAMPHAO,TH_MOMA,
+TH_YOYAK,TH_RORUA,TH_RU,TH_LOLING,TH_LU,TH_WOWAEN,TH_SOSALA,TH_SORUSI,
+TH_SOSUA,TH_HOHIP,TH_LOCHULA,TH_OANG,TH_HONOKHUK,TH_PAIYANNOI,TH_SARA_A,
+TH_MAIHANAKAT,TH_SARA_AA,TH_SARA_AM,TH_SARA_I,TH_SARA_II,TH_SARA_UE,
+TH_SARA_UEE,TH_SARA_U,TH_SARA_UU,TH_PHINTHU,TH_REM_CHERNG_,TH_TAC_WBRK_,
+TH_UNDEF_DD,TH_UNDEF_DE,TH_BAHT,TH_SARA_E,TH_SARA_AE,TH_SARA_O,TH_MAIMUAN,
+TH_MAIMALAI,TH_LAKKHANGYAO,TH_MAIYAMOK,TH_MAITAIKHU,TH_MAIEK,TH_MAITHO,
+TH_MAITRI,TH_MAICHATTAWA,TH_THANTHAKHAT,TH_NIKHAHIT,TH_YAMAKKAN,TH_FONGMAN,
+TH_THAIZERO,TH_THAIONE,TH_THAITWO,TH_THAITHREE,TH_THAIFOUR,TH_THAIFIVE,
+TH_THAISIX,TH_THAISEVEN,TH_THAIEIGHT,TH_THAININE,TH_ANGKHANKHU,TH_KHOMUT,
+TH_UNDEF_FC,TH_UNDEF_FD,TH_UNDEF_FE,TH_THAIEND
+};
+#endif
diff --git a/intl/lwbrk/tools/anzx4051.html b/intl/lwbrk/tools/anzx4051.html
new file mode 100644
index 000000000..d894ce811
--- /dev/null
+++ b/intl/lwbrk/tools/anzx4051.html
@@ -0,0 +1,669 @@
+<!-- This Source Code Form is subject to the terms of the Mozilla Public
+ - License, v. 2.0. If a copy of the MPL was not distributed with this
+ - file, You can obtain one at http://mozilla.org/MPL/2.0/. -->
+
+<HTML>
+<HEAD>
+<TITLE>
+Analysis of JIS X 4051 to Unicode General Category Mapping
+</TITLE>
+</HEAD>
+<BODY>
+<H1>
+Analysis of JIS X 4051 to Unicode General Category Mapping
+</H1>
+<TABLE BORDER=3>
+<TR BGCOLOR=blue><TH><TH>
+<TD BGCOLOR=red>C</TD>
+<TD BGCOLOR=red>L</TD>
+<TD BGCOLOR=red>M</TD>
+<TD BGCOLOR=red>N</TD>
+<TD BGCOLOR=red>P</TD>
+<TD BGCOLOR=red>S</TD>
+<TD BGCOLOR=red>Z</TD>
+<TD BGCOLOR=white>Total</TD>
+<TD BGCOLOR=yellow>Cc</TD>
+<TD BGCOLOR=yellow>Cf</TD>
+<TD BGCOLOR=yellow>Co</TD>
+<TD BGCOLOR=yellow>Cs</TD>
+<TD BGCOLOR=yellow>Ll</TD>
+<TD BGCOLOR=yellow>Lm</TD>
+<TD BGCOLOR=yellow>Lo</TD>
+<TD BGCOLOR=yellow>Lt</TD>
+<TD BGCOLOR=yellow>Lu</TD>
+<TD BGCOLOR=yellow>Mc</TD>
+<TD BGCOLOR=yellow>Me</TD>
+<TD BGCOLOR=yellow>Mn</TD>
+<TD BGCOLOR=yellow>Nd</TD>
+<TD BGCOLOR=yellow>Nl</TD>
+<TD BGCOLOR=yellow>No</TD>
+<TD BGCOLOR=yellow>Pc</TD>
+<TD BGCOLOR=yellow>Pd</TD>
+<TD BGCOLOR=yellow>Pe</TD>
+<TD BGCOLOR=yellow>Pf</TD>
+<TD BGCOLOR=yellow>Pi</TD>
+<TD BGCOLOR=yellow>Po</TD>
+<TD BGCOLOR=yellow>Ps</TD>
+<TD BGCOLOR=yellow>Sc</TD>
+<TD BGCOLOR=yellow>Sk</TD>
+<TD BGCOLOR=yellow>Sm</TD>
+<TD BGCOLOR=yellow>So</TD>
+<TD BGCOLOR=yellow>Zl</TD>
+<TD BGCOLOR=yellow>Zp</TD>
+<TD BGCOLOR=yellow>Zs</TD>
+</TR>
+<TR><TH>00_1<TH>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>14</TD>
+<TD>1</TD>
+<TD></TD>
+<TD BGCOLOR=white>15</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD>2</TD>
+<TD>11</TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>01_[a]<TH>
+<TD></TD>
+<TD>32</TD>
+<TD>2</TD>
+<TD></TD>
+<TD>28</TD>
+<TD>3</TD>
+<TD></TD>
+<TD BGCOLOR=white>65</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>8</TD>
+<TD>24</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>2</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD>12</TD>
+<TD>1</TD>
+<TD></TD>
+<TD>14</TD>
+<TD></TD>
+<TD></TD>
+<TD>2</TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>02_7<TH>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD BGCOLOR=white>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>03_8<TH>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD BGCOLOR=white>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>04_9<TH>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>5</TD>
+<TD></TD>
+<TD></TD>
+<TD BGCOLOR=white>5</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>5</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>05_[b]<TH>
+<TD>33</TD>
+<TD>153</TD>
+<TD></TD>
+<TD>33</TD>
+<TD>2</TD>
+<TD>5</TD>
+<TD>13</TD>
+<TD BGCOLOR=white>239</TD>
+<TD>32</TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>153</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>33</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>2</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>5</TD>
+<TD></TD>
+<TD></TD>
+<TD>13</TD>
+</TR>
+<TR><TH>06_15<TH>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>30</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD BGCOLOR=white>30</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>30</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>07_18<TH>
+<TD>18</TD>
+<TD>157</TD>
+<TD></TD>
+<TD>33</TD>
+<TD>56</TD>
+<TD>125</TD>
+<TD>2</TD>
+<TD BGCOLOR=white>391</TD>
+<TD></TD>
+<TD>18</TD>
+<TD></TD>
+<TD></TD>
+<TD>64</TD>
+<TD>7</TD>
+<TD>5</TD>
+<TD></TD>
+<TD>81</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>3</TD>
+<TD>30</TD>
+<TD>4</TD>
+<TD>5</TD>
+<TD>2</TD>
+<TD></TD>
+<TD>5</TD>
+<TD>36</TD>
+<TD>4</TD>
+<TD></TD>
+<TD>3</TD>
+<TD>24</TD>
+<TD>98</TD>
+<TD>1</TD>
+<TD>1</TD>
+<TD></TD>
+</TR>
+<TR><TH>08_COMPLEX<TH>
+<TD></TD>
+<TD>54</TD>
+<TD>33</TD>
+<TD>20</TD>
+<TD>2</TD>
+<TD>1</TD>
+<TD></TD>
+<TD BGCOLOR=white>110</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD>53</TD>
+<TD></TD>
+<TD></TD>
+<TD>11</TD>
+<TD></TD>
+<TD>22</TD>
+<TD>10</TD>
+<TD></TD>
+<TD>10</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>2</TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>09_[c]<TH>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>3</TD>
+<TD>4</TD>
+<TD></TD>
+<TD BGCOLOR=white>7</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>3</TD>
+<TD>2</TD>
+<TD></TD>
+<TD>2</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>0A_[d]<TH>
+<TD>1</TD>
+<TD>2</TD>
+<TD></TD>
+<TD>6</TD>
+<TD>28</TD>
+<TD>14</TD>
+<TD></TD>
+<TD BGCOLOR=white>51</TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>6</TD>
+<TD></TD>
+<TD></TD>
+<TD>3</TD>
+<TD>3</TD>
+<TD></TD>
+<TD>22</TD>
+<TD></TD>
+<TD>2</TD>
+<TD>3</TD>
+<TD>7</TD>
+<TD>2</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>0B_[e]<TH>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD>1</TD>
+<TD>3</TD>
+<TD BGCOLOR=white>6</TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>3</TD>
+</TR>
+<TR><TH>X<TH>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD BGCOLOR=white>0</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+</TABLE>
+<TABLE BORDER=3>
+<TR BGCOLOR=blue><TH><TH>
+<TD BGCOLOR=red>00_1</TD>
+<TD BGCOLOR=red>01_[a]</TD>
+<TD BGCOLOR=red>02_7</TD>
+<TD BGCOLOR=red>03_8</TD>
+<TD BGCOLOR=red>04_9</TD>
+<TD BGCOLOR=red>05_[b]</TD>
+<TD BGCOLOR=red>06_15</TD>
+<TD BGCOLOR=red>07_18</TD>
+<TD BGCOLOR=red>08_COMPLEX</TD>
+<TD BGCOLOR=red>09_[c]</TD>
+<TD BGCOLOR=red>0A_[d]</TD>
+<TD BGCOLOR=red>0B_[e]</TD>
+<TD BGCOLOR=red>X</TD>
+</TR>
+<TR><TH>00<TH>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>33</TD>
+<TD>10</TD>
+<TD>127</TD>
+<TD></TD>
+<TD>7</TD>
+<TD>44</TD>
+<TD>2</TD>
+<TD></TD>
+</TR>
+<TR><TH>0E<TH>
+<TD>1</TD>
+<TD>6</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>20</TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>17<TH>
+<TD>2</TD>
+<TD>4</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>110</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>20<TH>
+<TD>2</TD>
+<TD>8</TD>
+<TD>1</TD>
+<TD></TD>
+<TD>5</TD>
+<TD>13</TD>
+<TD></TD>
+<TD>100</TD>
+<TD></TD>
+<TD></TD>
+<TD>7</TD>
+<TD>4</TD>
+<TD></TD>
+</TR>
+<TR><TH>21<TH>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD>32</TD>
+<TD></TD>
+<TD>163</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>30<TH>
+<TD>10</TD>
+<TD>47</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>161</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+</TABLE>
diff --git a/intl/lwbrk/tools/anzx4051.pl b/intl/lwbrk/tools/anzx4051.pl
new file mode 100644
index 000000000..b13315b38
--- /dev/null
+++ b/intl/lwbrk/tools/anzx4051.pl
@@ -0,0 +1,356 @@
+#!/usr/bin/perl
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+######################################################################
+# Initialize global variables
+# NOTE(review): %utot, $ui and $li are not referenced anywhere else in
+# this script.
+######################################################################
+%utot = ();
+$ui=0;
+$li=0;
+
+######################################################################
+#
+# Open the Unicode database file (input, ';'-separated fields)
+#
+######################################################################
+open ( UNICODATA , "< ../../unicharutil/tools/UnicodeData-Latest.txt")
+ || die "cannot find UnicodeData-Latest.txt";
+
+######################################################################
+#
+# Open the JIS X 4051 class file (input, "first;last;class" per line)
+#
+######################################################################
+open ( CLASS , "< jisx4051class.txt")
+ || die "cannot find jisx4051class.txt";
+
+######################################################################
+#
+# Open the simplified mapping (input, "class;simplified class" per line)
+#
+######################################################################
+open ( SIMP , "< jisx4051simp.txt")
+ || die "cannot find jisx4051simp.txt";
+
+######################################################################
+#
+# Open the HTML report output file
+#
+######################################################################
+open ( OUT , "> anzx4051.html")
+ || die "cannot open output anzx4051.html file";
+
+######################################################################
+#
+# Open the generated C header; it sits one directory above tools/
+#
+######################################################################
+open ( HEADER , "> ../jisx4051class.h")
+ || die "cannot open output ../jisx4051class.h file";
+
+######################################################################
+#
+# Emit the license comment and the opening markup of the HTML report.
+# The here-document is printed directly, without a temporary variable.
+#
+######################################################################
+print OUT <<END_OF_HTML;
+<!-- This Source Code Form is subject to the terms of the Mozilla Public
+ - License, v. 2.0. If a copy of the MPL was not distributed with this
+ - file, You can obtain one at http://mozilla.org/MPL/2.0/. -->
+
+<HTML>
+<HEAD>
+<TITLE>
+Analysis of JIS X 4051 to Unicode General Category Mapping
+</TITLE>
+</HEAD>
+<BODY>
+<H1>
+Analysis of JIS X 4051 to Unicode General Category Mapping
+</H1>
+END_OF_HTML
+
+######################################################################
+#
+# Emit the license and "generated file" comments of the C header.
+# The here-document is printed directly, without a temporary variable.
+#
+######################################################################
+print HEADER <<END_OF_NPL;
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+/*
+ DO NOT EDIT THIS DOCUMENT !!! THIS DOCUMENT IS GENERATED BY
+ mozilla/intl/lwbrk/tools/anzx4051.pl
+ */
+END_OF_NPL
+
+%occ = ();        # code point -> "simplified class | JIS class"
+%gcat = ();       # hex code point -> General Category
+%dcat = ();       # hex code point -> first letter of the General Category
+%simp = ();       # JIS class -> simplified class
+%gcount = ();     # General Category -> number of database lines
+%dcount = ();     # category first letter -> number of database lines
+%sccount = ();    # simplified class -> number of JIS classes mapped to it
+%rangecount = (); # " HH:simplified class" -> code points with hex prefix HH
+
+######################################################################
+#
+# Read the Unicode database line by line and record, per code point,
+# its category (third field) and that category's first letter.
+######################################################################
+while(<UNICODATA>) {
+  chomp;
+  ######################################################################
+  #
+  # Get value from fields
+  #
+  ######################################################################
+  @f = split(/;/ , $_);
+  $c = $f[0]; # The unicode value (hex string)
+  $g = $f[2]; # The category
+  $d = substr($g, 0, 1); # Its first letter
+
+  $gcat{$c} = $g;
+  $dcat{$c} = $d;
+  $gcount{$g}++;
+  $dcount{$d}++;
+}
+close(UNICODATA);
+
+while(<SIMP>) {
+  # chomp, not chop: a final line without a newline must keep its last
+  # character, which here is part of the simplified class name.
+  chomp;
+  #
+  # Each line is "<JIS class>;<simplified class>"
+  #
+  @f = split(/;/ , $_);
+
+  $simp{$f[0]} = $f[1];
+  $sccount{$f[1]}++;
+}
+close(SIMP);
+
+# Return the General Category recorded in %gcat for the integer code
+# point $u.  Code points missing from %gcat fall back to the hard-coded
+# ranges below; anything else prints a warning and yields "".
+sub GetClass{
+  my ($u) = @_;
+  my $hex = DecToHex($u);
+  my $cat = $gcat{$hex}; # lexical, so the global $g is left alone
+  if(defined($cat) && $cat ne "") {
+    return $cat;
+  } elsif (( 0x3400 <= $u) && ( $u <= 0x9fa5 ) ) {
+    return "Han";
+  } elsif (( 0xac00 <= $u) && ( $u <= 0xd7a3 ) ) {
+    return "Lo";
+  } elsif (( 0xd800 <= $u) && ( $u <= 0xdfff ) ) {
+    return "Cs"; # one test for three adjacent ranges with the same result
+  } elsif (( 0xe000 <= $u) && ( $u <= 0xf8ff ) ) {
+    return "Co";
+  }
+  printf "WARNING !!!! Cannot find General Category for U+%s \n" , $hex;
+  # Explicit result; falling off the end would return printf's status.
+  return "";
+}
+# Return the category first letter recorded in %dcat for the integer code
+# point $u.  Code points missing from %dcat fall back to the hard-coded
+# ranges below; anything else prints a warning and yields "".
+sub GetDClass{
+  my ($u) = @_;
+  my $hex = DecToHex($u);
+  my $cat = $dcat{$hex}; # lexical, so the global $g is left alone
+  if(defined($cat) && $cat ne "") {
+    return $cat;
+  } elsif (( 0x3400 <= $u) && ( $u <= 0x9fa5 ) ) {
+    return "Han";
+  } elsif (( 0xac00 <= $u) && ( $u <= 0xd7a3 ) ) {
+    return "L";
+  } elsif (( 0xd800 <= $u) && ( $u <= 0xdfff ) ) {
+    return "C"; # one test for three adjacent ranges with the same result
+  } elsif (( 0xe000 <= $u) && ( $u <= 0xf8ff ) ) {
+    return "C";
+  }
+  printf "WARNING !!!! Cannot find Detailed General Category for U+%s \n" , $hex;
+  # Explicit result; falling off the end would return printf's status.
+  return "";
+}
+# Format an integer as uppercase hex, zero-padded to at least four digits.
+sub DecToHex{
+  return sprintf("%04X", $_[0]);
+}
+%gtotal = (); # simplified class . General Category -> code point count
+%dtotal = (); # simplified class . category first letter -> code point count
+while(<CLASS>) {
+  chomp; # chop would eat a digit of the class on an unterminated last line
+  ######################################################################
+  # Each line is "first;last;class" with first/last in hex; an empty
+  # "last" means a single code point.  Lines whose class starts with
+  # "a" are skipped.
+  ######################################################################
+  @f = split(/;/ , $_);
+
+  if( substr($f[2], 0, 1) ne "a")
+  {
+    $sc = $simp{$f[2]};
+    $l = hex($f[0]);
+    if($f[1] eq "")
+    {
+      $h = $l;
+    } else {
+      $h = hex($f[1]);
+    }
+    for($k = $l; $k <= $h ; $k++)
+    {
+      if( exists($occ{$k}))
+      {
+        # Earlier lines win: an already classified code point is kept.
+        # printf "WARNING !! Conflicting definition!!! U+%s -> [%s] [%s | %s]\n",
+        # DecToHex($k), $occ{$k} , $f[2] , $sc;
+      } else {
+        $occ{$k} = $sc . " | " . $f[2];
+        $gclass = GetClass($k);
+        $dclass = GetDClass($k);
+        $gtotal{$sc . $gclass}++;
+        $dtotal{$sc . $dclass}++;
+        $u = DecToHex($k);
+        # Tally by the first two hex digits of the code point and the class.
+        $rk = " " . substr($u,0,2) . ":" . $sc;
+        $rangecount{$rk}++;
+      }
+    }
+  }
+}
+
+#print %gtotal;
+#print %dtotal;
+
+# Write two HTML tables summarising the classification to OUT.
+#   Table 1 has one row per simplified class: a count for each category
+#   first letter, the row total, then a count for each General Category.
+#   Table 2 has one row per two-hex-digit code point prefix and one
+#   column per simplified class.
+# Reads %dcount, %gcount, %sccount, %dtotal, %gtotal and %rangecount.
+sub printreport
+{
+  my @letters = sort(keys %dcount);  # first letters of the categories
+  my @cats = sort(keys %gcount);     # General Categories
+  my @classes = sort(keys %sccount); # simplified classes
+
+  # ---- Table 1: simplified class (rows) against category (columns) ----
+  print OUT "<TABLE BORDER=3>\n";
+  print OUT "<TR BGCOLOR=blue><TH><TH>\n";
+  for my $letter (@letters) {
+    print OUT "<TD BGCOLOR=red>$letter</TD>\n";
+  }
+  print OUT "<TD BGCOLOR=white>Total</TD>\n";
+  for my $cat (@cats) {
+    print OUT "<TD BGCOLOR=yellow>$cat</TD>\n";
+  }
+  print OUT "</TR>\n";
+
+  for my $class (@classes) {
+    print OUT "<TR><TH>$class<TH>\n";
+
+    # Per-letter counts, accumulated into the row total as they are printed.
+    my $rowtotal = 0;
+    for my $letter (@letters) {
+      my $n = $dtotal{$class . $letter};
+      $rowtotal += $n;
+      print OUT "<TD>$n</TD>\n";
+    }
+    print OUT "<TD BGCOLOR=white>$rowtotal</TD>\n";
+
+    # Per-category counts; an unseen combination prints as an empty cell.
+    for my $cat (@cats) {
+      my $n = $gtotal{$class . $cat};
+      print OUT "<TD>$n</TD>\n";
+    }
+    print OUT "</TR>\n";
+  }
+  print OUT "</TABLE>\n";
+
+  # ---- Table 2: two-hex-digit prefix (rows) against simplified class ----
+  print OUT "<TABLE BORDER=3>\n";
+  print OUT "<TR BGCOLOR=blue><TH><TH>\n";
+  for my $class (@classes) {
+    print OUT "<TD BGCOLOR=red>$class</TD>\n";
+  }
+  print OUT "</TR>\n";
+
+  # Prefixes 00 through 4E; a row is emitted only if some class has a
+  # non-zero count for that prefix.
+  for my $page (0 .. 0x4e) {
+    my $prefix = sprintf("%02X" , $page);
+    my $row = "<TR><TH>" . $prefix . "<TH>\n";
+    my $hits = 0;
+
+    for my $class (@classes) {
+      my $n = $rangecount{ " " . $prefix . ":" . $class};
+      $row .= sprintf("<TD>%s</TD>\n", $n);
+      $hits += $n;
+    }
+    $row .= "</TR>\n";
+
+    print OUT $row if $hits != 0;
+  }
+  print OUT "</TABLE>\n";
+}
+printreport();
+
+# Print to HEADER a 32-entry table covering the 256 code points that
+# start at hex($r) * 256.  Each entry packs eight one-character class
+# digits, highest code point first; $def is used where %occ has no entry.
+# One output line per entry:
+#   0xDDDDDDDD, // U+xxx0 - U+xxx7
+sub printarray
+{
+  my($r, $def) = @_;
+  printf "[%s || %s]\n", $r, $def;   # progress line on the default handle
+
+  my $base = hex($r) * 256;
+  printf HEADER "static const uint32_t gLBClass%s[32] = {\n", $r;
+
+  for (my $first = $base; $first < $base + 256; $first += 8)
+  {
+    # Digits are collected from $first + 7 down to $first.
+    my $digits = "";
+    foreach my $cp (reverse($first .. $first + 7))
+    {
+      # Second character of the stored class string, else the default.
+      $digits .= exists($occ{$cp}) ? substr($occ{$cp}, 1,1) : $def;
+    }
+
+    printf HEADER "0x%s", $digits;
+    printf HEADER ", // U+%04X - U+%04X\n", $first , $first + 7;
+  }
+
+  print HEADER "};\n\n";
+}
+# One table per 256-code-point page, emitted in this fixed order; the
+# second value is the class digit for code points left unassigned.
+foreach my $page (["00", "7"], ["20", "7"], ["21", "7"],
+                  ["30", "5"], ["0E", "8"], ["17", "7"]) {
+  printarray(@$page);
+}
+
+#print %rangecount;
+
+######################################################################
+#
+# Close files; a failed close on an output handle means lost data
+#
+######################################################################
+close(HEADER) || die "cannot finish writing jisx4051class.h";
+close(CLASS);
+close(OUT) || die "cannot finish writing anzx4051.html";
+
diff --git a/intl/lwbrk/tools/jisx4051class.txt b/intl/lwbrk/tools/jisx4051class.txt
new file mode 100644
index 000000000..5b26b7267
--- /dev/null
+++ b/intl/lwbrk/tools/jisx4051class.txt
@@ -0,0 +1,159 @@
+0000;001f;17
+0020;;17
+0024;;24
+0027;;18
+0028;;22
+002D;;18
+002F;;18
+0021;002F;23
+0030;0039;15
+003C;;22
+003A;003F;23
+0040;;18
+0041;005A;18
+005B;;22
+005E;;18
+005F;;18
+005B;005F;23
+0060;;18
+0061;007A;18
+007B;;22
+007B;007E;23
+00A0;;24
+00A3;;22
+00A5;;22
+00A9;;18
+00AA;;18
+00AB;;18
+00AC;;22
+00AE;;18
+00AF;;18
+00A1;00BF;23
+00B0;;18
+00F7;;23
+00C0;00FF;18
+0E3F;;1
+0E2F;;4
+0E46;;4
+0E5A;0E5B;4
+0E50;0E59;15
+0E4F;;18
+0EAF;;4
+0EC6;;4
+0ED0;0ED9;15
+1735;1736;1
+17D4;17D5;4
+17D8;;4
+17DA;;4
+1780;17DD;21
+17E0;17E9;21
+17F0;17F9;21
+2007;;24
+2000;200B;17
+200C;200F;18
+2010;;18
+2011;;24
+2012;2013;18
+2014;;7
+2015;;18
+2016;2017;18
+2019;;23
+201D;;23
+2018;201F;18
+2020;2023;18
+2024;2026;23
+2027;;23
+2028;202E;18
+202F;;24
+2030;2034;9
+2035;2038;18
+2039;;1
+203A;;2
+203B;;12
+203C;203D;3
+203E;;23
+203F;2043;18
+2044;;3
+2045;;1
+2046;;2
+2047;2049;3
+204A;205E;18
+205F;;17
+2060;;24
+2061;2063;18
+206A;206F;18
+2070;2071;18
+2074;208E;18
+2090;2094;18
+2116;;8
+2160;217F;12
+2190;21EA;a12
+2126;;18
+2100;2138;18
+2153;2182;18
+2190;21EA;18
+3008;;1
+300A;;1
+300C;;1
+300E;;1
+3010;;1
+3014;;1
+3016;;1
+3018;;1
+301A;;1
+301D;;1
+3001;;2
+3009;;2
+300B;;2
+300D;;2
+300F;;2
+3011;;2
+3015;;2
+3017;;2
+3019;;2
+301B;;2
+301E;;2
+301F;;2
+3005;;3
+301C;;3
+3041;;3
+3043;;3
+3045;;3
+3047;;3
+3049;;3
+3063;;3
+3083;;3
+3085;;3
+3087;;3
+308E;;3
+309D;;3
+309E;;3
+30A1;;3
+30A3;;3
+30A5;;3
+30A7;;3
+30A9;;3
+30C3;;3
+30E3;;3
+30E5;;3
+30E7;;3
+30EE;;3
+30F5;;3
+30F6;;3
+30FC;;3
+30FD;;3
+30FE;;3
+30FB;;5
+3002;;6
+3000;;10
+3042;3094;11
+3099;309E;3
+3003;;12
+3004;;12
+3006;;12
+3007;;12
+3012;;12
+3013;;12
+3020;;12
+3036;;12
+30A2;30FA;12
diff --git a/intl/lwbrk/tools/jisx4051simp.txt b/intl/lwbrk/tools/jisx4051simp.txt
new file mode 100644
index 000000000..e12a7fd80
--- /dev/null
+++ b/intl/lwbrk/tools/jisx4051simp.txt
@@ -0,0 +1,24 @@
+1;00_1
+2;01_[a]
+3;01_[a]
+4;01_[a]
+5;01_[a]
+6;01_[a]
+7;02_7
+8;03_8
+9;04_9
+10;05_[b]
+11;05_[b]
+12;05_[b]
+13;X
+14;X
+15;06_15
+16;X
+17;05_[b]
+18;07_18
+19;X
+20;X
+21;08_COMPLEX
+22;09_[c]
+23;0A_[d]
+24;0B_[e]
diff --git a/intl/lwbrk/tools/spec_table.html b/intl/lwbrk/tools/spec_table.html
new file mode 100644
index 000000000..519f98c53
--- /dev/null
+++ b/intl/lwbrk/tools/spec_table.html
@@ -0,0 +1,127 @@
+<!-- This Source Code Form is subject to the terms of the Mozilla Public
+ - License, v. 2.0. If a copy of the MPL was not distributed with this
+ - file, You can obtain one at http://mozilla.org/MPL/2.0/. -->
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+<title></title>
+<style type="text/css">
+table {
+ border: solid 1px;
+ border-collapse: collapse;
+}
+tbody, tfoot {
+ border-top: solid 2px;
+}
+td, th {
+ border: solid 1px;
+}
+td {
+ text-align: center;
+}
+</style>
+</head>
+<body>
+<p>This is a specification table for line breaking.</p>
+<p>The values of IE7 and Opera9: 'A' means that the line is breakable After the character, and 'B' means Before. 'BA' means Before and After.</p>
+<p>The (C) suffix on the IE7 and Opera9 column headings means Character; (N) means Numeric.
+It indicates what surrounded the character under test. E.g., "a$a" is a testcase for (C), "0$0" is a testcase for (N).</p>
+<p>Gecko does not break lines in most Western-language contexts. But in file paths, URLs and very long words that are joined by hyphens,
+some characters might be breakable. They are marked 'breakable' in the table. However, they are not always breakable;
+whether they are <em>depends on the context</em> within the word.</p>
+<table border="1">
+<thead>
+<tr><th colspan="2">character</th><th>Gecko</th><th>IE7(C)</th><th>IE7(N)</th><th>Opera9.2(C)</th><th>Opera9.2(N)</th></tr>
+</thead>
+<tfoot>
+<tr><th colspan="2">character</th><th>Gecko</th><th>IE7(C)</th><th>IE7(N)</th><th>Opera9.2(C)</th><th>Opera9.2(N)</th></tr>
+</tfoot>
+<tbody>
+<tr><th>0x21</th><th>&#x21;</th><td></td><td>A</td><td>A</td><td></td><td></td></tr>
+<tr><th>0x22</th><th>&#x22;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x23</th><th>&#x23;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x24</th><th>&#x24;</th><td></td><td></td><td>B</td><td></td><td></td></tr>
+<tr><th>0x25</th><th>&#x25;</th><td>breakable</td><td>A</td><td>A</td><td></td><td></td></tr>
+<tr><th>0x26</th><th>&#x26;</th><td>breakable</td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x27</th><th>&#x27;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x28</th><th>&#x28;</th><td></td><td>B</td><td>B</td><td></td><td></td></tr>
+<tr><th>0x29</th><th>&#x29;</th><td></td><td>A</td><td>A</td><td></td><td></td></tr>
+<tr><th>0x2A</th><th>&#x2A;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x2B</th><th>&#x2B;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x2C</th><th>&#x2C;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x2D</th><th>&#x2D;</th><td>breakable</td><td>BA</td><td>BA</td><td>A</td><td>A</td></tr>
+<tr><th>0x2E</th><th>&#x2E;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x2F</th><th>&#x2F;</th><td>breakable</td><td></td><td></td><td>A</td><td>A</td></tr>
+</tbody>
+<tbody>
+<tr><th>0x3A</th><th>&#x3A;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x3B</th><th>&#x3B;</th><td>breakable</td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x3C</th><th>&#x3C;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x3D</th><th>&#x3D;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x3E</th><th>&#x3E;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x3F</th><th>&#x3F;</th><td></td><td>A</td><td>A</td><td></td><td></td></tr>
+</tbody>
+<tbody>
+<tr><th>0x40</th><th>&#x40;</th><td></td><td></td><td></td><td></td><td></td></tr>
+</tbody>
+<tbody>
+<tr><th>0x5B</th><th>&#x5B;</th><td></td><td>B</td><td>B</td><td></td><td></td></tr>
+<tr><th>0x5C</th><th>&#x5C;</th><td>breakable</td><td></td><td>B</td><td></td><td></td></tr>
+<tr><th>0x5D</th><th>&#x5D;</th><td></td><td>A</td><td>A</td><td></td><td></td></tr>
+<tr><th>0x5E</th><th>&#x5E;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x5F</th><th>&#x5F;</th><td></td><td></td><td></td><td></td><td></td></tr>
+</tbody>
+<tbody>
+<tr><th>0x60</th><th>&#x60;</th><td></td><td></td><td></td><td></td><td></td></tr>
+</tbody>
+<tbody>
+<tr><th>0x7B</th><th>&#x7B;</th><td></td><td>B</td><td>B</td><td></td><td></td></tr>
+<tr><th>0x7C</th><th>&#x7C;</th><td></td><td></td><td></td><td>A</td><td>A</td></tr>
+<tr><th>0x7D</th><th>&#x7D;</th><td></td><td>A</td><td>A</td><td></td><td></td></tr>
+<tr><th>0x7E</th><th>&#x7E;</th><td></td><td></td><td></td><td></td><td></td></tr>
+</tbody>
+<tbody>
+<tr><th>0xA1</th><th>&#xA1;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xA2</th><th>&#xA2;</th><td></td><td>A</td><td>A</td><td></td><td></td></tr>
+<tr><th>0xA3</th><th>&#xA3;</th><td></td><td></td><td>B</td><td></td><td></td></tr>
+<tr><th>0xA4</th><th>&#xA4;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xA5</th><th>&#xA5;</th><td></td><td></td><td>B</td><td></td><td></td></tr>
+<tr><th>0xA6</th><th>&#xA6;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xA7</th><th>&#xA7;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xA8</th><th>&#xA8;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xA9</th><th>&#xA9;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xAA</th><th>&#xAA;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xAB</th><th>&#xAB;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xAC</th><th>&#xAC;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xAE</th><th>&#xAE;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xAF</th><th>&#xAF;</th><td></td><td></td><td></td><td></td><td></td></tr>
+</tbody>
+<tbody>
+<tr><th>0xB0</th><th>&#xB0;</th><td></td><td>A</td><td>A</td><td></td><td></td></tr>
+<tr><th>0xB1</th><th>&#xB1;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xB2</th><th>&#xB2;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xB3</th><th>&#xB3;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xB4</th><th>&#xB4;</th><td></td><td></td><td></td><td>B</td><td>B</td></tr>
+<tr><th>0xB5</th><th>&#xB5;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xB6</th><th>&#xB6;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xB7</th><th>&#xB7;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xB8</th><th>&#xB8;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xB9</th><th>&#xB9;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xBA</th><th>&#xBA;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xBB</th><th>&#xBB;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xBC</th><th>&#xBC;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xBD</th><th>&#xBD;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xBE</th><th>&#xBE;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xBF</th><th>&#xBF;</th><td></td><td></td><td></td><td></td><td></td></tr>
+</tbody>
+<tbody>
+<tr><th>0xD7</th><th>&#xD7;</th><td></td><td></td><td></td><td></td><td></td></tr>
+</tbody>
+<tbody>
+<tr><th>0xF7</th><th>&#xF7;</th><td></td><td></td><td></td><td></td><td></td></tr>
+</tbody>
+</table>
+</body>
+</html>