diff options
Diffstat (limited to 'intl/lwbrk')
30 files changed, 4407 insertions, 0 deletions
diff --git a/intl/lwbrk/crashtests/416721.html b/intl/lwbrk/crashtests/416721.html new file mode 100644 index 000000000..0a6625ba8 --- /dev/null +++ b/intl/lwbrk/crashtests/416721.html @@ -0,0 +1,11 @@ +<!DOCTYPE html> +<html> + <head> + <title>Testcase for bug 416721</title> + <meta http-equiv="content-type" content="text/html; charset=utf-8"> + </head> + <body> + <p>กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛</p> + </body> +</html> + diff --git a/intl/lwbrk/crashtests/crashtests.list b/intl/lwbrk/crashtests/crashtests.list new file mode 100644 index 000000000..a7cb7a173 --- /dev/null +++ b/intl/lwbrk/crashtests/crashtests.list @@ -0,0 +1 @@ +load 416721.html
diff --git a/intl/lwbrk/gtest/TestLineBreak.cpp b/intl/lwbrk/gtest/TestLineBreak.cpp new file mode 100644 index 000000000..5824bf70f --- /dev/null +++ b/intl/lwbrk/gtest/TestLineBreak.cpp @@ -0,0 +1,323 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include <stdio.h> +#include "nsXPCOM.h" +#include "nsIComponentManager.h" +#include "nsISupports.h" +#include "nsServiceManagerUtils.h" +#include "nsILineBreaker.h" +#include "nsIWordBreaker.h" +#include "nsLWBrkCIID.h" +#include "nsString.h" +#include "gtest/gtest.h" + +NS_DEFINE_CID(kLBrkCID, NS_LBRK_CID); +NS_DEFINE_CID(kWBrkCID, NS_WBRK_CID); + +static char teng1[] = +// 1 2 3 4 5 6 7 +//01234567890123456789012345678901234567890123456789012345678901234567890123456789 + "This is a test to test(reasonable) line break. This 0.01123 = 45 x 48."; + +static uint32_t lexp1[] = { + 4,7,9,14,17,34,39,40,41,42,49,54,62,64,67,69,73 +}; + +static uint32_t wexp1[] = { + 4,5,7,8,9,10,14,15,17,18,22,23,33,34,35,39,43,48,49,50,54,55,56,57,62,63, + 64,65,67,68,69,70,72 +}; + +static char teng2[] = +// 1 2 3 4 5 6 7 +//01234567890123456789012345678901234567890123456789012345678901234567890123456789 + "()((reasonab(l)e) line break. 
.01123=45x48."; + +static uint32_t lexp2[] = { + 17,22,23,30,44 +}; + +static uint32_t wexp2[] = { + 4,12,13,14,15,16,17,18,22,24,29,30,31,32,37,38,43 +}; + +static char teng3[] = +// 1 2 3 4 5 6 7 +//01234567890123456789012345678901234567890123456789012345678901234567890123456789 + "It's a test to test(ronae ) line break...."; + +static uint32_t lexp3[] = { + 4,6,11,14,25,27,32,42 +}; + +static uint32_t wexp3[] = { + 2,3,4,5,6,7,11,12,14,15,19,20,25,26,27,28,32,33,38 +}; + +static char ruler1[] = +" 1 2 3 4 5 6 7 "; +static char ruler2[] = +"0123456789012345678901234567890123456789012345678901234567890123456789012"; + +bool +Check(const char* in, const uint32_t* out, uint32_t outlen, uint32_t i, + uint32_t res[256]) +{ + bool ok = true; + + if (i != outlen) { + ok = false; + printf("WARNING!!! return size wrong, expect %d but got %d \n", + outlen, i); + } + + for (uint32_t j = 0; j < i; j++) { + if (j < outlen) { + if (res[j] != out[j]) { + ok = false; + printf("[%d] expect %d but got %d\n", j, out[j], res[j]); + } + } else { + ok = false; + printf("[%d] additional %d\n", j, res[j]); + } + } + + if (!ok) { + printf("string = \n%s\n", in); + printf("%s\n", ruler1); + printf("%s\n", ruler2); + + printf("Expect = \n"); + for (uint32_t j = 0; j < outlen; j++) { + printf("%d,", out[j]); + } + + printf("\nResult = \n"); + for (uint32_t j = 0; j < i; j++) { + printf("%d,", res[j]); + } + printf("\n"); + } + + return ok; +} + +bool +TestASCIILB(nsILineBreaker *lb, + const char* in, + const uint32_t* out, uint32_t outlen) +{ + NS_ConvertASCIItoUTF16 eng1(in); + uint32_t i; + uint32_t res[256]; + int32_t curr; + + for (i = 0, curr = 0; + curr != NS_LINEBREAKER_NEED_MORE_TEXT && i < 256; + i++) { + curr = lb->Next(eng1.get(), eng1.Length(), curr); + res[i] = curr != NS_LINEBREAKER_NEED_MORE_TEXT ? 
curr : eng1.Length(); + } + + return Check(in, out, outlen, i, res); +} + +bool +TestASCIIWB(nsIWordBreaker *lb, + const char* in, + const uint32_t* out, uint32_t outlen) +{ + NS_ConvertASCIItoUTF16 eng1(in); + + uint32_t i; + uint32_t res[256]; + int32_t curr = 0; + + for (i = 0, curr = lb->NextWord(eng1.get(), eng1.Length(), curr); + curr != NS_WORDBREAKER_NEED_MORE_TEXT && i < 256; + curr = lb->NextWord(eng1.get(), eng1.Length(), curr), i++) { + res [i] = curr != NS_WORDBREAKER_NEED_MORE_TEXT ? curr : eng1.Length(); + } + + return Check(in, out, outlen, i, res); +} + +TEST(LineBreak, LineBreaker) +{ + nsILineBreaker *t = nullptr; + nsresult res = CallGetService(kLBrkCID, &t); + ASSERT_TRUE(NS_SUCCEEDED(res) && t); + NS_IF_RELEASE(t); + + res = CallGetService(kLBrkCID, &t); + ASSERT_TRUE(NS_SUCCEEDED(res) && t); + + ASSERT_TRUE(TestASCIILB(t, teng1, lexp1, sizeof(lexp1) / sizeof(uint32_t))); + ASSERT_TRUE(TestASCIILB(t, teng2, lexp2, sizeof(lexp2) / sizeof(uint32_t))); + ASSERT_TRUE(TestASCIILB(t, teng3, lexp3, sizeof(lexp3) / sizeof(uint32_t))); + + NS_RELEASE(t); +} + +TEST(LineBreak, WordBreaker) +{ + nsIWordBreaker *t = nullptr; + nsresult res = CallGetService(kWBrkCID, &t); + ASSERT_TRUE(NS_SUCCEEDED(res) && t); + NS_IF_RELEASE(t); + + res = CallGetService(kWBrkCID, &t); + ASSERT_TRUE(NS_SUCCEEDED(res) && t); + + ASSERT_TRUE(TestASCIIWB(t, teng1, wexp1, sizeof(wexp1) / sizeof(uint32_t))); + ASSERT_TRUE(TestASCIIWB(t, teng2, wexp2, sizeof(wexp2) / sizeof(uint32_t))); + ASSERT_TRUE(TestASCIIWB(t, teng3, wexp3, sizeof(wexp3) / sizeof(uint32_t))); + + NS_RELEASE(t); +} + +// 012345678901234 +static const char wb0[] = "T"; +static const char wb1[] = "h"; +static const char wb2[] = "is is a int"; +static const char wb3[] = "ernationali"; +static const char wb4[] = "zation work."; + +static const char* wb[] = { wb0, wb1, wb2, wb3, wb4 }; + +void +TestPrintWordWithBreak() +{ + uint32_t numOfFragment = sizeof(wb) / sizeof(char*); + nsIWordBreaker* wbk = nullptr; + + 
CallGetService(kWBrkCID, &wbk); + + nsAutoString result; + + for (uint32_t i = 0; i < numOfFragment; i++) { + NS_ConvertASCIItoUTF16 fragText(wb[i]); + + int32_t cur = 0; + cur = wbk->NextWord(fragText.get(), fragText.Length(), cur); + uint32_t start = 0; + for (uint32_t j = 0; cur != NS_WORDBREAKER_NEED_MORE_TEXT; j++) { + result.Append(Substring(fragText, start, cur - start)); + result.Append('^'); + start = (cur >= 0 ? cur : cur - start); + cur = wbk->NextWord(fragText.get(), fragText.Length(), cur); + } + + result.Append(Substring(fragText, fragText.Length() - start)); + + if (i != numOfFragment - 1) { + NS_ConvertASCIItoUTF16 nextFragText(wb[i+1]); + + bool canBreak = true; + canBreak = wbk->BreakInBetween(fragText.get(), + fragText.Length(), + nextFragText.get(), + nextFragText.Length()); + if (canBreak) { + result.Append('^'); + } + fragText.Assign(nextFragText); + } + } + ASSERT_STREQ("is^ ^is^ ^a^ ^ is a intzation^ ^work^ation work.", + NS_ConvertUTF16toUTF8(result).get()); + + NS_IF_RELEASE(wbk); +} + +void +TestFindWordBreakFromPosition(uint32_t fragN, uint32_t offset, + const char* expected) +{ + uint32_t numOfFragment = sizeof(wb) / sizeof(char*); + nsIWordBreaker* wbk = nullptr; + + CallGetService(kWBrkCID, &wbk); + + NS_ConvertASCIItoUTF16 fragText(wb[fragN]); + + nsWordRange res = wbk->FindWord(fragText.get(), fragText.Length(), offset); + + bool canBreak; + nsAutoString result(Substring(fragText, res.mBegin, res.mEnd-res.mBegin)); + + if ((uint32_t)fragText.Length() == res.mEnd) { + // if we hit the end of the fragment + nsAutoString curFragText = fragText; + for(uint32_t p = fragN +1; p < numOfFragment ;p++) + { + NS_ConvertASCIItoUTF16 nextFragText(wb[p]); + canBreak = wbk->BreakInBetween(curFragText.get(), + curFragText.Length(), + nextFragText.get(), + nextFragText.Length()); + if (canBreak) { + break; + } + nsWordRange r = wbk->FindWord(nextFragText.get(), nextFragText.Length(), + 0); + + result.Append(Substring(nextFragText, r.mBegin, r.mEnd 
- r.mBegin)); + + if ((uint32_t)nextFragText.Length() != r.mEnd) { + break; + } + nextFragText.Assign(curFragText); + } + } + + if (0 == res.mBegin) { + // if we hit the beginning of the fragment + nsAutoString curFragText = fragText; + for (uint32_t p = fragN; p > 0; p--) { + NS_ConvertASCIItoUTF16 prevFragText(wb[p-1]); + canBreak = wbk->BreakInBetween(prevFragText.get(), + prevFragText.Length(), + curFragText.get(), + curFragText.Length()); + if (canBreak) { + break; + } + nsWordRange r = wbk->FindWord(prevFragText.get(), prevFragText.Length(), + prevFragText.Length()); + + result.Insert(Substring(prevFragText, r.mBegin, r.mEnd - r.mBegin), 0); + + if (0 != r.mBegin) { + break; + } + prevFragText.Assign(curFragText); + } + } + + ASSERT_STREQ(expected, NS_ConvertUTF16toUTF8(result).get()) + << "FindWordBreakFromPosition(" << fragN << ", " << offset << ")"; + + NS_IF_RELEASE(wbk); +} + +TEST(LineBreak, WordBreakUsage) +{ + TestPrintWordWithBreak(); + TestFindWordBreakFromPosition(0, 0, "This"); + TestFindWordBreakFromPosition(1, 0, "his"); + TestFindWordBreakFromPosition(2, 0, "is"); + TestFindWordBreakFromPosition(2, 1, "is"); + TestFindWordBreakFromPosition(2, 9, " "); + TestFindWordBreakFromPosition(2, 10, "internationalization"); + TestFindWordBreakFromPosition(3, 4, "ernationalization"); + TestFindWordBreakFromPosition(3, 8, "ernationalization"); + TestFindWordBreakFromPosition(4, 6, " "); + TestFindWordBreakFromPosition(4, 7, "work"); +} + diff --git a/intl/lwbrk/gtest/moz.build b/intl/lwbrk/gtest/moz.build new file mode 100644 index 000000000..64a3919cb --- /dev/null +++ b/intl/lwbrk/gtest/moz.build @@ -0,0 +1,12 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +UNIFIED_SOURCES += [ + 'TestLineBreak.cpp', +] + +FINAL_LIBRARY = 'xul-gtest' + diff --git a/intl/lwbrk/jisx4051class.h b/intl/lwbrk/jisx4051class.h new file mode 100644 index 000000000..70585ac51 --- /dev/null +++ b/intl/lwbrk/jisx4051class.h @@ -0,0 +1,218 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +/* + DO NOT EDIT THIS DOCUMENT !!! THIS DOCUMENT IS GENERATED BY + mozilla/intl/lwbrk/tools/anzx4051.pl + */ +static const uint32_t gLBClass00[32] = { +0x55555555, // U+0000 - U+0007 +0x55555555, // U+0008 - U+000F +0x55555555, // U+0010 - U+0017 +0x55555555, // U+0018 - U+001F +0x7AABAAA5, // U+0020 - U+0027 +0x7A7AAAA9, // U+0028 - U+002F +0x66666666, // U+0030 - U+0037 +0xAAA9AA66, // U+0038 - U+003F +0x77777777, // U+0040 - U+0047 +0x77777777, // U+0048 - U+004F +0x77777777, // U+0050 - U+0057 +0x77AA9777, // U+0058 - U+005F +0x77777777, // U+0060 - U+0067 +0x77777777, // U+0068 - U+006F +0x77777777, // U+0070 - U+0077 +0x7AAA9777, // U+0078 - U+007F +0x77777777, // U+0080 - U+0087 +0x77777777, // U+0088 - U+008F +0x77777777, // U+0090 - U+0097 +0x77777777, // U+0098 - U+009F +0xAA9A9AAB, // U+00A0 - U+00A7 +0x77A9777A, // U+00A8 - U+00AF +0xAAAAAAAA, // U+00B0 - U+00B7 +0xAAAAAAAA, // U+00B8 - U+00BF +0x77777777, // U+00C0 - U+00C7 +0x77777777, // U+00C8 - U+00CF +0x77777777, // U+00D0 - U+00D7 +0x77777777, // U+00D8 - U+00DF +0x77777777, // U+00E0 - U+00E7 +0x77777777, // U+00E8 - U+00EF +0xA7777777, // U+00F0 - U+00F7 +0x77777777, // U+00F8 - U+00FF +}; + +static const uint32_t gLBClass20[32] = { +0xB5555555, // U+2000 - U+2007 +0x77775555, // U+2008 - U+200F +0x777277B7, // U+2010 - U+2017 +0x77A777A7, // U+2018 - U+201F +0xAAAA7777, // U+2020 - U+2027 +0xB7777777, // U+2028 - U+202F +0x77744444, // U+2030 - U+2037 
+0x7A115107, // U+2038 - U+203F +0x11017777, // U+2040 - U+2047 +0x77777711, // U+2048 - U+204F +0x77777777, // U+2050 - U+2057 +0x57777777, // U+2058 - U+205F +0x7777777B, // U+2060 - U+2067 +0x77777777, // U+2068 - U+206F +0x77777777, // U+2070 - U+2077 +0x77777777, // U+2078 - U+207F +0x77777777, // U+2080 - U+2087 +0x77777777, // U+2088 - U+208F +0x77777777, // U+2090 - U+2097 +0x77777777, // U+2098 - U+209F +0x77777777, // U+20A0 - U+20A7 +0x77777777, // U+20A8 - U+20AF +0x77777777, // U+20B0 - U+20B7 +0x77777777, // U+20B8 - U+20BF +0x77777777, // U+20C0 - U+20C7 +0x77777777, // U+20C8 - U+20CF +0x77777777, // U+20D0 - U+20D7 +0x77777777, // U+20D8 - U+20DF +0x77777777, // U+20E0 - U+20E7 +0x77777777, // U+20E8 - U+20EF +0x77777777, // U+20F0 - U+20F7 +0x77777777, // U+20F8 - U+20FF +}; + +static const uint32_t gLBClass21[32] = { +0x77777777, // U+2100 - U+2107 +0x77777777, // U+2108 - U+210F +0x73777777, // U+2110 - U+2117 +0x77777777, // U+2118 - U+211F +0x77777777, // U+2120 - U+2127 +0x77777777, // U+2128 - U+212F +0x77777777, // U+2130 - U+2137 +0x77777777, // U+2138 - U+213F +0x77777777, // U+2140 - U+2147 +0x77777777, // U+2148 - U+214F +0x77777777, // U+2150 - U+2157 +0x77777777, // U+2158 - U+215F +0x55555555, // U+2160 - U+2167 +0x55555555, // U+2168 - U+216F +0x55555555, // U+2170 - U+2177 +0x55555555, // U+2178 - U+217F +0x77777777, // U+2180 - U+2187 +0x77777777, // U+2188 - U+218F +0x77777777, // U+2190 - U+2197 +0x77777777, // U+2198 - U+219F +0x77777777, // U+21A0 - U+21A7 +0x77777777, // U+21A8 - U+21AF +0x77777777, // U+21B0 - U+21B7 +0x77777777, // U+21B8 - U+21BF +0x77777777, // U+21C0 - U+21C7 +0x77777777, // U+21C8 - U+21CF +0x77777777, // U+21D0 - U+21D7 +0x77777777, // U+21D8 - U+21DF +0x77777777, // U+21E0 - U+21E7 +0x77777777, // U+21E8 - U+21EF +0x77777777, // U+21F0 - U+21F7 +0x77777777, // U+21F8 - U+21FF +}; + +static const uint32_t gLBClass30[32] = { +0x55155115, // U+3000 - U+3007 +0x10101010, // U+3008 - U+300F +0x10105510, // 
U+3010 - U+3017 +0x11011010, // U+3018 - U+301F +0x55555555, // U+3020 - U+3027 +0x55555555, // U+3028 - U+302F +0x55555555, // U+3030 - U+3037 +0x55555555, // U+3038 - U+303F +0x15151515, // U+3040 - U+3047 +0x55555515, // U+3048 - U+304F +0x55555555, // U+3050 - U+3057 +0x55555555, // U+3058 - U+305F +0x55551555, // U+3060 - U+3067 +0x55555555, // U+3068 - U+306F +0x55555555, // U+3070 - U+3077 +0x55555555, // U+3078 - U+307F +0x15151555, // U+3080 - U+3087 +0x51555555, // U+3088 - U+308F +0x55555555, // U+3090 - U+3097 +0x51111115, // U+3098 - U+309F +0x15151515, // U+30A0 - U+30A7 +0x55555515, // U+30A8 - U+30AF +0x55555555, // U+30B0 - U+30B7 +0x55555555, // U+30B8 - U+30BF +0x55551555, // U+30C0 - U+30C7 +0x55555555, // U+30C8 - U+30CF +0x55555555, // U+30D0 - U+30D7 +0x55555555, // U+30D8 - U+30DF +0x15151555, // U+30E0 - U+30E7 +0x51555555, // U+30E8 - U+30EF +0x51155555, // U+30F0 - U+30F7 +0x51111555, // U+30F8 - U+30FF +}; + +static const uint32_t gLBClass0E[32] = { +0x88888888, // U+0E00 - U+0E07 +0x88888888, // U+0E08 - U+0E0F +0x88888888, // U+0E10 - U+0E17 +0x88888888, // U+0E18 - U+0E1F +0x88888888, // U+0E20 - U+0E27 +0x18888888, // U+0E28 - U+0E2F +0x88888888, // U+0E30 - U+0E37 +0x08888888, // U+0E38 - U+0E3F +0x81888888, // U+0E40 - U+0E47 +0x78888888, // U+0E48 - U+0E4F +0x66666666, // U+0E50 - U+0E57 +0x88881166, // U+0E58 - U+0E5F +0x88888888, // U+0E60 - U+0E67 +0x88888888, // U+0E68 - U+0E6F +0x88888888, // U+0E70 - U+0E77 +0x88888888, // U+0E78 - U+0E7F +0x88888888, // U+0E80 - U+0E87 +0x88888888, // U+0E88 - U+0E8F +0x88888888, // U+0E90 - U+0E97 +0x88888888, // U+0E98 - U+0E9F +0x88888888, // U+0EA0 - U+0EA7 +0x18888888, // U+0EA8 - U+0EAF +0x88888888, // U+0EB0 - U+0EB7 +0x88888888, // U+0EB8 - U+0EBF +0x81888888, // U+0EC0 - U+0EC7 +0x88888888, // U+0EC8 - U+0ECF +0x66666666, // U+0ED0 - U+0ED7 +0x88888866, // U+0ED8 - U+0EDF +0x88888888, // U+0EE0 - U+0EE7 +0x88888888, // U+0EE8 - U+0EEF +0x88888888, // U+0EF0 - U+0EF7 +0x88888888, // 
U+0EF8 - U+0EFF +}; + +static const uint32_t gLBClass17[32] = { +0x77777777, // U+1700 - U+1707 +0x77777777, // U+1708 - U+170F +0x77777777, // U+1710 - U+1717 +0x77777777, // U+1718 - U+171F +0x77777777, // U+1720 - U+1727 +0x77777777, // U+1728 - U+172F +0x70077777, // U+1730 - U+1737 +0x77777777, // U+1738 - U+173F +0x77777777, // U+1740 - U+1747 +0x77777777, // U+1748 - U+174F +0x77777777, // U+1750 - U+1757 +0x77777777, // U+1758 - U+175F +0x77777777, // U+1760 - U+1767 +0x77777777, // U+1768 - U+176F +0x77777777, // U+1770 - U+1777 +0x77777777, // U+1778 - U+177F +0x88888888, // U+1780 - U+1787 +0x88888888, // U+1788 - U+178F +0x88888888, // U+1790 - U+1797 +0x88888888, // U+1798 - U+179F +0x88888888, // U+17A0 - U+17A7 +0x88888888, // U+17A8 - U+17AF +0x88888888, // U+17B0 - U+17B7 +0x88888888, // U+17B8 - U+17BF +0x88888888, // U+17C0 - U+17C7 +0x88888888, // U+17C8 - U+17CF +0x88118888, // U+17D0 - U+17D7 +0x77888181, // U+17D8 - U+17DF +0x88888888, // U+17E0 - U+17E7 +0x77777788, // U+17E8 - U+17EF +0x88888888, // U+17F0 - U+17F7 +0x77777788, // U+17F8 - U+17FF +}; + diff --git a/intl/lwbrk/jisx4051pairtable.txt b/intl/lwbrk/jisx4051pairtable.txt new file mode 100644 index 000000000..2bae1b18f --- /dev/null +++ b/intl/lwbrk/jisx4051pairtable.txt @@ -0,0 +1,286 @@ + + + +/* + + Simplification of Pair Table in JIS X 4051 + + 1. The Origion Table - in 4.1.3 + + In JIS x 4051. 
The pair table is defined as below + + Class of + Leading Class of Trailing Char Class + Char + + 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20 + * # * # + 1 X X X X X X X X X X X X X X X X X X X X X E + 2 X X X X X X + 3 X X X X X X + 4 X X X X X X + 5 X X X X X X + 6 X X X X X X + 7 X X X X X X X + 8 X X X X X X E + 9 X X X X X X + 10 X X X X X X + 11 X X X X X X + 12 X X X X X X + 13 X X X X X X X + 14 X X X X X X X + 15 X X X X X X X X X + 16 X X X X X X X X + 17 X X X X X E + 18 X X X X X X X X X + 19 X E E E E E X X X X X X X X X X X X E X E E + 20 X X X X X E + + * Same Char + # Other Char + + 2. Simplified by remove the class which we do not care + + However, since we do not care about class 13(Subscript), 14(Ruby), + 19(split line note begin quote), and 20(split line note end quote) + we can simplify this par table into the following + + Class of + Leading Class of Trailing Char Class + Char + + 1 2 3 4 5 6 7 8 9 10 11 12 15 16 17 18 + + 1 X X X X X X X X X X X X X X X X + 2 X X X X X + 3 X X X X X + 4 X X X X X + 5 X X X X X + 6 X X X X X + 7 X X X X X X + 8 X X X X X X + 9 X X X X X + 10 X X X X X + 11 X X X X X + 12 X X X X X + 15 X X X X X X X X + 16 X X X X X X X + 17 X X X X X + 18 X X X X X X X X + + 3. Simplified by merged classes + + After the 2 simplification, the pair table have some duplication + a. class 2, 3, 4, 5, 6, are the same- we can merged them + b. class 10, 11, 12, 17 are the same- we can merged them + + + Class of + Leading Class of Trailing Char Class + Char + + 1 [a] 7 8 9 [b]15 16 18 + + 1 X X X X X X X X X + [a] X + 7 X X + 8 X X + 9 X + [b] X + 15 X X X X + 16 X X X + 18 X X X X + + + 4. 
Now we use one bit to encode weather it is breakable, and use 2 bytes + for one row, then the bit table will look like: + + 18 <- 1 + + 1 0000 0001 1111 1111 = 0x01FF + [a] 0000 0000 0000 0010 = 0x0002 + 7 0000 0000 0000 0110 = 0x0006 + 8 0000 0000 0100 0010 = 0x0042 + 9 0000 0000 0000 0010 = 0x0002 + [b] 0000 0000 0000 0010 = 0x0042 + 15 0000 0001 0101 0010 = 0x0152 + 16 0000 0001 1000 0010 = 0x0182 + 17 0000 0001 1100 0010 = 0x01C2 + +*/ + +static uint16_t gJISx4051SimplifiedPair[9] = { + 0x01FF, 0x0002, 0x0006, 0x0042, 0x0002, 0x0042, 0x0152, 0x0182, 0x01C2 +}; + +PRBool XXXX::ClassesToPair(nsJISx4051Cls aCls1, nsJISx4051Cls aCls1) +{ + NS_ASSERTION( (aCls1 < 9) "invalid class"); + NS_ASSERTION( (aCls2 < 9) "invalid class"); + return ( 0 != (gJISx4051SimplifiedPair[aCls1] & (1L << aCls2) )); +} + + +#define X4051_IS_DIGIT(u) ((0x0030 >= (u)) && ((u) >= 0x0039)) + +nsJISx4051Cls XXXX::GetClass( + PRUnichar aChar, PRUnichar aBefore = 0, PRUnichar aAfter = 0) +{ + // take care the special case in cls 15 + if( ((0x2C == aChar) || (0x2E == aChar)) && + (X4051_IS_DIGIT(aBefore)) && X4051_IS_DIGIT(aAfter))) + { + return kJISx4051Cls_15; + } + + nsJISx4051Cls cls; + if(gSingle->Lookup(aChar, &cls)) + return cls; + + if(gRange->Lookup(aChar, &cls)) + return cls; + + return kJISx4051Cls_15; +} + + +typedef enum { + kJISx4051Cls_1 = 0, + kJISx4051Cls_2 = 1, + kJISx4051Cls_3 = 1, + kJISx4051Cls_4 = 1, + kJISx4051Cls_5 = 1, + kJISx4051Cls_6 = 1, + kJISx4051Cls_7 = 2, + kJISx4051Cls_8 = 3, + kJISx4051Cls_9 = 4, + kJISx4051Cls_10 = 5, + kJISx4051Cls_11 = 5, + kJISx4051Cls_12 = 5, + // kJISx4051Cls_13 = 0, + // kJISx4051Cls_14 = 0, + kJISx4051Cls_15 = 6, + kJISx4051Cls_16 = 7, + kJISx4051Cls_17 = 5, + kJISx4051Cls_18 = 8, + // kJISx4051Cls_19 = 0, + // kJISx4051Cls_20 = 0 +} nsJISx4051Cls; + + + // Table 2 + YYYY(kJISx4051Cls_1 , 0x0028), + YYYY(kJISx4051Cls_1 , 0x005B), + YYYY(kJISx4051Cls_1 , 0x007B), + YYYY(kJISx4051Cls_1 , 0x2018), + YYYY(kJISx4051Cls_1 , 0x201B), + 
YYYY(kJISx4051Cls_1 , 0x201C), + YYYY(kJISx4051Cls_1 , 0x201F), + YYYY(kJISx4051Cls_1 , 0x3008), + YYYY(kJISx4051Cls_1 , 0x300A), + YYYY(kJISx4051Cls_1 , 0x300C), + YYYY(kJISx4051Cls_1 , 0x300E), + YYYY(kJISx4051Cls_1 , 0x3010), + YYYY(kJISx4051Cls_1 , 0x3014), + YYYY(kJISx4051Cls_1 , 0x3016), + YYYY(kJISx4051Cls_1 , 0x3018), + YYYY(kJISx4051Cls_1 , 0x301A), + YYYY(kJISx4051Cls_1 , 0x301D), + + // Table 3 + YYYY(kJISx4051Cls_2 , 0x0029), + YYYY(kJISx4051Cls_2 , 0x002C), + YYYY(kJISx4051Cls_2 , 0x005D), + YYYY(kJISx4051Cls_2 , 0x007D), + YYYY(kJISx4051Cls_2 , 0x2019), + YYYY(kJISx4051Cls_2 , 0x201A), + YYYY(kJISx4051Cls_2 , 0x201D), + YYYY(kJISx4051Cls_2 , 0x201E), + YYYY(kJISx4051Cls_2 , 0x3001), + YYYY(kJISx4051Cls_2 , 0x3009), + YYYY(kJISx4051Cls_2 , 0x300B), + YYYY(kJISx4051Cls_2 , 0x300D), + YYYY(kJISx4051Cls_2 , 0x300F), + YYYY(kJISx4051Cls_2 , 0x3011), + YYYY(kJISx4051Cls_2 , 0x3015), + YYYY(kJISx4051Cls_2 , 0x3017), + YYYY(kJISx4051Cls_2 , 0x3019), + YYYY(kJISx4051Cls_2 , 0x301B), + YYYY(kJISx4051Cls_2 , 0x301E), + YYYY(kJISx4051Cls_2 , 0x301F), + + // Table 4 + YYYY(kJISx4051Cls_3 , 0x203C), + YYYY(kJISx4051Cls_3 , 0x2044), + YYYY(kJISx4051Cls_3 , 0x301C), + YYYY(kJISx4051Cls_3 , 0x3041), + YYYY(kJISx4051Cls_3 , 0x3043), + YYYY(kJISx4051Cls_3 , 0x3045), + YYYY(kJISx4051Cls_3 , 0x3047), + YYYY(kJISx4051Cls_3 , 0x3049), + YYYY(kJISx4051Cls_3 , 0x3063), + YYYY(kJISx4051Cls_3 , 0x3083), + YYYY(kJISx4051Cls_3 , 0x3085), + YYYY(kJISx4051Cls_3 , 0x3087), + YYYY(kJISx4051Cls_3 , 0x308E), + YYYY(kJISx4051Cls_3 , 0x309D), + YYYY(kJISx4051Cls_3 , 0x309E), + YYYY(kJISx4051Cls_3 , 0x30A1), + YYYY(kJISx4051Cls_3 , 0x30A3), + YYYY(kJISx4051Cls_3 , 0x30A5), + YYYY(kJISx4051Cls_3 , 0x30A7), + YYYY(kJISx4051Cls_3 , 0x30A9), + YYYY(kJISx4051Cls_3 , 0x30C3), + YYYY(kJISx4051Cls_3 , 0x30E3), + YYYY(kJISx4051Cls_3 , 0x30E5), + YYYY(kJISx4051Cls_3 , 0x30E7), + YYYY(kJISx4051Cls_3 , 0x30EE), + YYYY(kJISx4051Cls_3 , 0x30F5), + YYYY(kJISx4051Cls_3 , 0x30F6), + YYYY(kJISx4051Cls_3 , 
0x30FC), + YYYY(kJISx4051Cls_3 , 0x30FD), + YYYY(kJISx4051Cls_3 , 0x30FE), + + // Table 5 + YYYY(kJISx4051Cls_4 , 0x0021), + YYYY(kJISx4051Cls_4 , 0x003F), + + // Table 6 + YYYY(kJISx4051Cls_5 , 0x003A), + YYYY(kJISx4051Cls_5 , 0x003B), + YYYY(kJISx4051Cls_5 , 0x30FB), + + // Table 7 + YYYY(kJISx4051Cls_6 , 0x002E), + YYYY(kJISx4051Cls_6 , 0x3002), + + // Table 8 + YYYY(kJISx4051Cls_7 , 0x2014), + YYYY(kJISx4051Cls_7 , 0x2024), + YYYY(kJISx4051Cls_7 , 0x2025), + YYYY(kJISx4051Cls_7 , 0x2026), + + // Table 9 + YYYY(kJISx4051Cls_8 , 0x0024), + YYYY(kJISx4051Cls_8 , 0x00A3), + YYYY(kJISx4051Cls_8 , 0x00A5), + YYYY(kJISx4051Cls_8 , 0x2116), + + // Table 10 + YYYY(kJISx4051Cls_9 , 0x0025), + YYYY(kJISx4051Cls_9 , 0x00A2), + YYYY(kJISx4051Cls_9 , 0x00B0), + YYYY(kJISx4051Cls_9 , 0x2030), + YYYY(kJISx4051Cls_9 , 0x2031), + YYYY(kJISx4051Cls_9 , 0x2032), + YYYY(kJISx4051Cls_9 , 0x2033), + + // Table 1 + YYYY(kJISx4051Cls_10, 0x3000), + + // Table 1 + ZZZZ(kJISx4051Cls_11, 0x3000), + + + + diff --git a/intl/lwbrk/moz.build b/intl/lwbrk/moz.build new file mode 100644 index 000000000..63ffbff8e --- /dev/null +++ b/intl/lwbrk/moz.build @@ -0,0 +1,48 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +TEST_DIRS += ['gtest'] + +XPIDL_SOURCES += [ + 'nsISemanticUnitScanner.idl', +] + +XPIDL_MODULE = 'lwbrk' + +EXPORTS += [ + 'nsILineBreaker.h', + 'nsIWordBreaker.h', + 'nsLWBrkCIID.h', +] + +UNIFIED_SOURCES += [ + 'nsJISx4051LineBreaker.cpp', + 'nsSampleWordBreaker.cpp', + 'nsSemanticUnitScanner.cpp', +] + +if 'gtk' in CONFIG['MOZ_WIDGET_TOOLKIT']: + SOURCES += [ + 'nsPangoBreaker.cpp', + ] + CXXFLAGS += CONFIG['MOZ_PANGO_CFLAGS'] +elif CONFIG['MOZ_WIDGET_TOOLKIT'] == 'windows': + SOURCES += [ + 'nsUniscribeBreaker.cpp', + ] +elif CONFIG['MOZ_WIDGET_TOOLKIT'] == 'cocoa': + UNIFIED_SOURCES += [ + 'nsCarbonBreaker.cpp', + ] +else: + SOURCES += [ + 'nsRuleBreaker.cpp', + ] + SOURCES += [ + 'rulebrk.c', + ] + +FINAL_LIBRARY = 'xul' diff --git a/intl/lwbrk/nsCarbonBreaker.cpp b/intl/lwbrk/nsCarbonBreaker.cpp new file mode 100644 index 000000000..1b37bc129 --- /dev/null +++ b/intl/lwbrk/nsCarbonBreaker.cpp @@ -0,0 +1,44 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include <CoreFoundation/CoreFoundation.h> +#include <stdint.h> +#include "nsDebug.h" +#include "nscore.h" + +void +NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength, + uint8_t* aBreakBefore) +{ + NS_ASSERTION(aText, "aText shouldn't be null"); + + memset(aBreakBefore, 0, aLength * sizeof(uint8_t)); + + CFStringRef str = ::CFStringCreateWithCharactersNoCopy(kCFAllocatorDefault, reinterpret_cast<const UniChar*>(aText), aLength, kCFAllocatorNull); + if (!str) { + return; + } + + CFStringTokenizerRef st = ::CFStringTokenizerCreate(kCFAllocatorDefault, str, + ::CFRangeMake(0, aLength), + kCFStringTokenizerUnitLineBreak, + nullptr); + if (!st) { + ::CFRelease(str); + return; + } + + CFStringTokenizerTokenType tt = ::CFStringTokenizerAdvanceToNextToken(st); + while (tt != kCFStringTokenizerTokenNone) { + CFRange r = ::CFStringTokenizerGetCurrentTokenRange(st); + if (r.location != 0) { // Ignore leading edge + aBreakBefore[r.location] = true; + } + tt = CFStringTokenizerAdvanceToNextToken(st); + } + + ::CFRelease(st); + ::CFRelease(str); +} diff --git a/intl/lwbrk/nsComplexBreaker.h b/intl/lwbrk/nsComplexBreaker.h new file mode 100644 index 000000000..d4ebb3581 --- /dev/null +++ b/intl/lwbrk/nsComplexBreaker.h @@ -0,0 +1,19 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef nsComplexBreaker_h__ +#define nsComplexBreaker_h__ + +#include "nsString.h" + +/** + * Find line break opportunities in aText[] of aLength characters, + * filling boolean values indicating line break opportunities for + * corresponding characters in aBreakBefore[] on return. 
+ */ +void +NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength, + uint8_t* aBreakBefore); + +#endif /* nsComplexBreaker_h__ */ diff --git a/intl/lwbrk/nsILineBreaker.h b/intl/lwbrk/nsILineBreaker.h new file mode 100644 index 000000000..19adbac10 --- /dev/null +++ b/intl/lwbrk/nsILineBreaker.h @@ -0,0 +1,74 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef nsILineBreaker_h__ +#define nsILineBreaker_h__ + +#include "nsISupports.h" + +#include "nscore.h" + +#define NS_LINEBREAKER_NEED_MORE_TEXT -1 + +// {0x4b0b9e04-6ffb-4647-aa5f-2fa2ebd883e8} +#define NS_ILINEBREAKER_IID \ +{0x4b0b9e04, 0x6ffb, 0x4647, \ + {0xaa, 0x5f, 0x2f, 0xa2, 0xeb, 0xd8, 0x83, 0xe8}} + +class nsILineBreaker : public nsISupports +{ +public: + NS_DECLARE_STATIC_IID_ACCESSOR(NS_ILINEBREAKER_IID) + + enum { + kWordBreak_Normal = 0, // default + kWordBreak_BreakAll = 1, // break all + kWordBreak_KeepAll = 2 // always keep + }; + + virtual int32_t Next( const char16_t* aText, uint32_t aLen, + uint32_t aPos) = 0; + + virtual int32_t Prev( const char16_t* aText, uint32_t aLen, + uint32_t aPos) = 0; + + // Call this on a word with whitespace at either end. We will apply JISx4051 + // rules to find breaks inside the word. aBreakBefore is set to the break- + // before status of each character; aBreakBefore[0] will always be false + // because we never return a break before the first character. + // aLength is the length of the aText array and also the length of the aBreakBefore + // output array. 
+ virtual void GetJISx4051Breaks(const char16_t* aText, uint32_t aLength, + uint8_t aWordBreak, + uint8_t* aBreakBefore) = 0; + virtual void GetJISx4051Breaks(const uint8_t* aText, uint32_t aLength, + uint8_t aWordBreak, + uint8_t* aBreakBefore) = 0; +}; + +NS_DEFINE_STATIC_IID_ACCESSOR(nsILineBreaker, NS_ILINEBREAKER_IID) + +static inline bool +NS_IsSpace(char16_t u) +{ + return u == 0x0020 || // SPACE + u == 0x0009 || // CHARACTER TABULATION + u == 0x000D || // CARRIAGE RETURN + u == 0x1680 || // OGHAM SPACE MARK + (0x2000 <= u && u <= 0x2006) || // EN QUAD, EM QUAD, EN SPACE, + // EM SPACE, THREE-PER-EM SPACE, + // FOUR-PER-EM SPACE, SIX-PER-EM SPACE, + (0x2008 <= u && u <= 0x200B) || // PUNCTUATION SPACE, THIN SPACE, + // HAIR SPACE, ZERO WIDTH SPACE + u == 0x205F; // MEDIUM MATHEMATICAL SPACE +} + +static inline bool +NS_NeedsPlatformNativeHandling(char16_t aChar) +{ + return (0x0e01 <= aChar && aChar <= 0x0fff) || // Thai, Lao, Tibetan + (0x1780 <= aChar && aChar <= 0x17ff); // Khmer +} + +#endif /* nsILineBreaker_h__ */ diff --git a/intl/lwbrk/nsISemanticUnitScanner.idl b/intl/lwbrk/nsISemanticUnitScanner.idl new file mode 100644 index 000000000..e6e99fc07 --- /dev/null +++ b/intl/lwbrk/nsISemanticUnitScanner.idl @@ -0,0 +1,48 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsISupports.idl" + +%{C++ +// {ADF42751-1CEF-4ad2-AA8E-BCB849D8D31F} +#define NS_SEMANTICUNITSCANNER_CID { 0xadf42751, 0x1cef, 0x4ad2, { 0xaa, 0x8e, 0xbc, 0xb8, 0x49, 0xd8, 0xd3, 0x1f}} +#define NS_SEMANTICUNITSCANNER_CONTRACTID "@mozilla.org/intl/semanticunitscanner;1" +%} + +/** + * Provides a language independent way to break UNICODE + * text into meaningful semantic units (e.g. words). 
+ */ +[scriptable, uuid(9f620be4-e535-11d6-b254-00039310a47a)] +interface nsISemanticUnitScanner : nsISupports { + /** + * start() + * + * Starts up the semantic unit scanner with an optional + * character set, which acts as a hint to optimize the heuristics + * used to determine the language(s) of the processed text. + * + * @param characterSet the character set the text was originally + * encoded in (can be NULL) + */ + void start(in string characterSet); + + /** + * next() + * Get the begin / end offset of the next unit in the current text + * + * @param text the text to be scanned + * @param length the number of characters in the text to be processed + * @param pos the current position + * @param isLastBuffer whether the buffer is the last one + * @param begin the begin offset of the next unit + * @param end the end offset of the next unit + * @return whether there are more units in the current text + */ + boolean next(in wstring text, in long length, in long pos, + in boolean isLastBuffer, + out long begin, out long end ); + +}; diff --git a/intl/lwbrk/nsIWordBreaker.h b/intl/lwbrk/nsIWordBreaker.h new file mode 100644 index 000000000..3867fba06 --- /dev/null +++ b/intl/lwbrk/nsIWordBreaker.h @@ -0,0 +1,41 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ +#ifndef nsIWordBreaker_h__ +#define nsIWordBreaker_h__ + +#include "nsISupports.h" + +#include "nscore.h" + +#define NS_WORDBREAKER_NEED_MORE_TEXT -1 + +// {E86B3379-BF89-11d2-B3AF-00805F8A6670} +#define NS_IWORDBREAKER_IID \ +{ 0xe86b3379, 0xbf89, 0x11d2, \ + { 0xb3, 0xaf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } + +typedef struct { + uint32_t mBegin; + uint32_t mEnd; +} nsWordRange; + +class nsIWordBreaker : public nsISupports +{ +public: + NS_DECLARE_STATIC_IID_ACCESSOR(NS_IWORDBREAKER_IID) + + virtual bool BreakInBetween(const char16_t* aText1 , uint32_t aTextLen1, + const char16_t* aText2 , + uint32_t aTextLen2) = 0; + virtual nsWordRange FindWord(const char16_t* aText1 , uint32_t aTextLen1, + uint32_t aOffset) = 0; + virtual int32_t NextWord(const char16_t* aText, uint32_t aLen, + uint32_t aPos) = 0; + +}; + +NS_DEFINE_STATIC_IID_ACCESSOR(nsIWordBreaker, NS_IWORDBREAKER_IID) + +#endif /* nsIWordBreaker_h__ */ diff --git a/intl/lwbrk/nsJISx4051LineBreaker.cpp b/intl/lwbrk/nsJISx4051LineBreaker.cpp new file mode 100644 index 000000000..1b262fa2c --- /dev/null +++ b/intl/lwbrk/nsJISx4051LineBreaker.cpp @@ -0,0 +1,999 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + + + +#include "nsJISx4051LineBreaker.h" + +#include "jisx4051class.h" +#include "nsComplexBreaker.h" +#include "nsTArray.h" +#include "nsUnicodeProperties.h" + +/* + + Simplification of Pair Table in JIS X 4051 + + 1. The Origion Table - in 4.1.3 + + In JIS x 4051. 
The pair table is defined as below + + Class of + Leading Class of Trailing Char Class + Char + + 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20 + * # * # + 1 X X X X X X X X X X X X X X X X X X X X X E + 2 X X X X X X + 3 X X X X X X + 4 X X X X X X + 5 X X X X X X + 6 X X X X X X + 7 X X X X X X X + 8 X X X X X X E + 9 X X X X X X + 10 X X X X X X + 11 X X X X X X + 12 X X X X X X + 13 X X X X X X X + 14 X X X X X X X + 15 X X X X X X X X X + 16 X X X X X X X X + 17 X X X X X E + 18 X X X X X X X X X + 19 X E E E E E X X X X X X X X X X X X E X E E + 20 X X X X X E + + * Same Char + # Other Char + + X Cannot Break + + The classes mean: + 1: Open parenthesis + 2: Close parenthesis + 3: Prohibit a line break before + 4: Punctuation for sentence end (except Full stop, e.g., "!" and "?") + 5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT) + 6: Full stop + 7: Non-breakable between same characters + 8: Prefix (e.g., "$", "NO.") + 9: Postfix (e.g., "%") + 10: Ideographic space + 11: Hiragana + 12: Japanese characters (except class 11) + 13: Subscript + 14: Ruby + 15: Numeric + 16: Alphabet + 17: Space for Western language + 18: Western characters (except class 17) + 19: Split line note (Warichu) begin quote + 20: Split line note (Warichu) end quote + + 2. Simplified by remove the class which we do not care + + However, since we do not care about class 13(Subscript), 14(Ruby), + 16 (Aphabet), 19(split line note begin quote), and 20(split line note end + quote) we can simplify this par table into the following + + Class of + Leading Class of Trailing Char Class + Char + + 1 2 3 4 5 6 7 8 9 10 11 12 15 17 18 + + 1 X X X X X X X X X X X X X X X + 2 X X X X X + 3 X X X X X + 4 X X X X X + 5 X X X X X + 6 X X X X X + 7 X X X X X X + 8 X X X X X X + 9 X X X X X + 10 X X X X X + 11 X X X X X + 12 X X X X X + 15 X X X X X X X X + 17 X X X X X + 18 X X X X X X X + + 3. Simplified by merged classes + + After the 2 simplification, the pair table have some duplication + a. 
class 2, 3, 4, 5, 6, are the same- we can merged them + b. class 10, 11, 12, 17 are the same- we can merged them + + + Class of + Leading Class of Trailing Char Class + Char + + 1 [a] 7 8 9 [b]15 18 + + 1 X X X X X X X X + [a] X + 7 X X + 8 X X + 9 X + [b] X + 15 X X X X + 18 X X X + + + 4. We add COMPLEX characters and make it breakable w/ all ther class + except after class 1 and before class [a] + + Class of + Leading Class of Trailing Char Class + Char + + 1 [a] 7 8 9 [b]15 18 COMPLEX + + 1 X X X X X X X X X + [a] X + 7 X X + 8 X X + 9 X + [b] X + 15 X X X X + 18 X X X + COMPLEX X T + + T : need special handling + + + 5. However, we need two special class for some punctuations/parentheses, + theirs breaking rules like character class (18), see bug 389056. + And also we need character like punctuation that is same behavior with 18, + but the characters are not letters of all languages. (e.g., '_') + [c]. Based on open parenthesis class (1), but it is not breakable after + character class (18) or numeric class (15). + [d]. Based on close parenthesis (or punctuation) class (2), but it is not + breakable before character class (18) or numeric class (15). + + Class of + Leading Class of Trailing Char Class + Char + + 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] + + 1 X X X X X X X X X X X + [a] X X X + 7 X X + 8 X X + 9 X + [b] X X + 15 X X X X X X + 18 X X X X X + COMPLEX X T + [c] X X X X X X X X X X X + [d] X X X X + + + 6. And Unicode has "NON-BREAK" characters. The lines should be broken around + them. But in JIS X 4051, such class is not, therefore, we create [e]. + + Class of + Leading Class of Trailing Char Class + Char + + 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e] + + 1 X X X X X X X X X X X X + [a] X X X + 7 X X X + 8 X X X + 9 X X + [b] X X X + 15 X X X X X X X + 18 X X X X X X + COMPLEX X T X + [c] X X X X X X X X X X X X + [d] X X X X X + [e] X X X X X X X X X X X X + + + 7. 
Now we use one bit to encode weather it is breakable, and use 2 bytes + for one row, then the bit table will look like: + + 18 <- 1 + + 1 0000 1111 1111 1111 = 0x0FFF + [a] 0000 1100 0000 0010 = 0x0C02 + 7 0000 1000 0000 0110 = 0x0806 + 8 0000 1000 0100 0010 = 0x0842 + 9 0000 1000 0000 0010 = 0x0802 + [b] 0000 1100 0000 0010 = 0x0C02 + 15 0000 1110 1101 0010 = 0x0ED2 + 18 0000 1110 1100 0010 = 0x0EC2 + COMPLEX 0000 1001 0000 0010 = 0x0902 + [c] 0000 1111 1111 1111 = 0x0FFF + [d] 0000 1100 1100 0010 = 0x0CC2 + [e] 0000 1111 1111 1111 = 0x0FFF +*/ + +#define MAX_CLASSES 12 + +static const uint16_t gPair[MAX_CLASSES] = { + 0x0FFF, + 0x0C02, + 0x0806, + 0x0842, + 0x0802, + 0x0C02, + 0x0ED2, + 0x0EC2, + 0x0902, + 0x0FFF, + 0x0CC2, + 0x0FFF +}; + + +/* + + 8. And if the character is not enough far from word start, word end and + another break point, we should not break in non-CJK languages. + I.e., Don't break around 15, 18, [c] and [d], but don't change + that if they are related to [b]. + + Class of + Leading Class of Trailing Char Class + Char + + 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e] + + 1 X X X X X X X X X X X X + [a] X X X X X X + 7 X X X X X X X + 8 X X X X X X + 9 X X X X X X + [b] X X X + 15 X X X X X X X X X X X + 18 X X X X X X X X X X X + COMPLEX X X X T X X X + [c] X X X X X X X X X X X X + [d] X X X X X X X X X X X + [e] X X X X X X X X X X X X + + 18 <- 1 + + 1 0000 1111 1111 1111 = 0x0FFF + [a] 0000 1110 1100 0010 = 0x0EC2 + 7 0000 1110 1100 0110 = 0x0EC6 + 8 0000 1110 1100 0010 = 0x0EC2 + 9 0000 1110 1100 0010 = 0x0EC2 + [b] 0000 1100 0000 0010 = 0x0C02 + 15 0000 1111 1101 1111 = 0x0FDF + 18 0000 1111 1101 1111 = 0x0FDF + COMPLEX 0000 1111 1100 0010 = 0x0FC2 + [c] 0000 1111 1111 1111 = 0x0FFF + [d] 0000 1111 1101 1111 = 0x0FDF + [e] 0000 1111 1111 1111 = 0x0FFF +*/ + +static const uint16_t gPairConservative[MAX_CLASSES] = { + 0x0FFF, + 0x0EC2, + 0x0EC6, + 0x0EC2, + 0x0EC2, + 0x0C02, + 0x0FDF, + 0x0FDF, + 0x0FC2, + 0x0FFF, + 0x0FDF, + 0x0FFF +}; + + 
+/* + + 9. Now we map the class to number + + 0: 1 + 1: [a]- 2, 3, 4, 5, 6 + 2: 7 + 3: 8 + 4: 9 + 5: [b]- 10, 11, 12, 17 + 6: 15 + 7: 18 + 8: COMPLEX + 9: [c] + A: [d] + B: [e] + + and they mean: + 0: Open parenthesis + 1: Punctuation that prohibits break before + 2: Non-breakable between same classes + 3: Prefix + 4: Postfix + 5: Breakable character (Spaces and Most Japanese characters) + 6: Numeric + 7: Characters + 8: Need special handling characters (E.g., Thai) + 9: Open parentheses like Character (See bug 389056) + A: Close parenthese (or punctuations) like Character (See bug 389056) + B: Non breakable (See bug 390920) + +*/ + +#define CLASS_NONE INT8_MAX + +#define CLASS_OPEN 0x00 +#define CLASS_CLOSE 0x01 +#define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02 +#define CLASS_PREFIX 0x03 +#define CLASS_POSTFFIX 0x04 +#define CLASS_BREAKABLE 0x05 +#define CLASS_NUMERIC 0x06 +#define CLASS_CHARACTER 0x07 +#define CLASS_COMPLEX 0x08 +#define CLASS_OPEN_LIKE_CHARACTER 0x09 +#define CLASS_CLOSE_LIKE_CHARACTER 0x0A +#define CLASS_NON_BREAKABLE 0x0B + +#define U_NULL char16_t(0x0000) +#define U_SLASH char16_t('/') +#define U_SPACE char16_t(' ') +#define U_HYPHEN char16_t('-') +#define U_EQUAL char16_t('=') +#define U_PERCENT char16_t('%') +#define U_AMPERSAND char16_t('&') +#define U_SEMICOLON char16_t(';') +#define U_BACKSLASH char16_t('\\') +#define U_OPEN_SINGLE_QUOTE char16_t(0x2018) +#define U_OPEN_DOUBLE_QUOTE char16_t(0x201C) +#define U_OPEN_GUILLEMET char16_t(0x00AB) + +#define NEED_CONTEXTUAL_ANALYSIS(c) (IS_HYPHEN(c) || \ + (c) == U_SLASH || \ + (c) == U_PERCENT || \ + (c) == U_AMPERSAND || \ + (c) == U_SEMICOLON || \ + (c) == U_BACKSLASH || \ + (c) == U_OPEN_SINGLE_QUOTE || \ + (c) == U_OPEN_DOUBLE_QUOTE || \ + (c) == U_OPEN_GUILLEMET) + +#define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039) + +static inline int +GETCLASSFROMTABLE(const uint32_t* t, uint16_t l) +{ + return ((((t)[(l>>3)]) >> ((l & 0x0007)<<2)) & 0x000f); +} + +static inline int 
+IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u) +{ + return ((0xff66 <= (u)) && ((u) <= 0xff70)); +} + +static inline int +IS_CJK_CHAR(char16_t u) +{ + return ((0x1100 <= (u) && (u) <= 0x11ff) || + (0x2e80 <= (u) && (u) <= 0xd7ff) || + (0xf900 <= (u) && (u) <= 0xfaff) || + (0xff00 <= (u) && (u) <= 0xffef) ); +} + +static inline bool +IS_NONBREAKABLE_SPACE(char16_t u) +{ + return u == 0x00A0 || u == 0x2007; // NO-BREAK SPACE, FIGURE SPACE +} + +static inline bool +IS_HYPHEN(char16_t u) +{ + return (u == U_HYPHEN || + u == 0x058A || // ARMENIAN HYPHEN + u == 0x2010 || // HYPHEN + u == 0x2012 || // FIGURE DASH + u == 0x2013); // EN DASH +} + +static int8_t +GetClass(uint32_t u) +{ + if (u < 0x10000) { + uint16_t h = u & 0xFF00; + uint16_t l = u & 0x00ff; + + // Handle 3 range table first + if (0x0000 == h) { + return GETCLASSFROMTABLE(gLBClass00, l); + } + if (0x1700 == h) { + return GETCLASSFROMTABLE(gLBClass17, l); + } + if (NS_NeedsPlatformNativeHandling(u)) { + return CLASS_COMPLEX; + } + if (0x0E00 == h) { + return GETCLASSFROMTABLE(gLBClass0E, l); + } + if (0x2000 == h) { + return GETCLASSFROMTABLE(gLBClass20, l); + } + if (0x2100 == h) { + return GETCLASSFROMTABLE(gLBClass21, l); + } + if (0x3000 == h) { + return GETCLASSFROMTABLE(gLBClass30, l); + } + if (0xff00 == h) { + if (l < 0x0060) { // Fullwidth ASCII variant + return GETCLASSFROMTABLE(gLBClass00, (l+0x20)); + } + if (l < 0x00a0) { // Halfwidth Katakana variants + switch (l) { + case 0x61: return GetClass(0x3002); + case 0x62: return GetClass(0x300c); + case 0x63: return GetClass(0x300d); + case 0x64: return GetClass(0x3001); + case 0x65: return GetClass(0x30fb); + case 0x9e: return GetClass(0x309b); + case 0x9f: return GetClass(0x309c); + default: + if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u)) { + return CLASS_CLOSE; // jis x4051 class 3 + } + return CLASS_BREAKABLE; // jis x4051 class 11 + } + } + if (l < 0x00e0) { + return CLASS_CHARACTER; // Halfwidth Hangul variants + } + if (l < 0x00f0) { + static 
char16_t NarrowFFEx[16] = { + 0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000, + 0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000 + }; + return GetClass(NarrowFFEx[l - 0x00e0]); + } + } else if (0x3100 == h) { + if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun + // XXX: This is per UAX #14, but UAX #14 may change + // the line breaking rules about Kanbun and Bopomofo. + return CLASS_BREAKABLE; + } + if (l >= 0xf0) { // Katakana small letters for Ainu + return CLASS_CLOSE; + } + } else if (0x0300 == h) { + if (0x4F == l || (0x5C <= l && l <= 0x62)) { + return CLASS_NON_BREAKABLE; + } + } else if (0x0500 == h) { + // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14) + if (l == 0x8A) { + return GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN)); + } + } else if (0x0F00 == h) { + if (0x08 == l || 0x0C == l || 0x12 == l) { + return CLASS_NON_BREAKABLE; + } + } else if (0x1800 == h) { + if (0x0E == l) { + return CLASS_NON_BREAKABLE; + } + } else if (0x1600 == h) { + if (0x80 == l) { // U+1680 OGHAM SPACE MARK + return CLASS_BREAKABLE; + } + } else if (u == 0xfeff) { + return CLASS_NON_BREAKABLE; + } + } + + // Mapping for Unicode LineBreak.txt classes to the (simplified) set of + // character classes used here. + // XXX The mappings here were derived by comparing the Unicode LineBreak + // values of BMP characters to the classes our existing GetClass returns + // for the same codepoints; in cases where characters with the same + // LineBreak class mapped to various classes here, I picked what seemed + // the most prevalent equivalence. + // Some of these are unclear to me, but currently they are ONLY used + // for characters not handled by the old code above, so all the JISx405 + // special cases should already be accounted for. 
+ static const int8_t sUnicodeLineBreakToClass[] = { + /* UNKNOWN = 0, [XX] */ CLASS_CHARACTER, + /* AMBIGUOUS = 1, [AI] */ CLASS_CHARACTER, + /* ALPHABETIC = 2, [AL] */ CLASS_CHARACTER, + /* BREAK_BOTH = 3, [B2] */ CLASS_CHARACTER, + /* BREAK_AFTER = 4, [BA] */ CLASS_CHARACTER, + /* BREAK_BEFORE = 5, [BB] */ CLASS_OPEN_LIKE_CHARACTER, + /* MANDATORY_BREAK = 6, [BK] */ CLASS_CHARACTER, + /* CONTINGENT_BREAK = 7, [CB] */ CLASS_CHARACTER, + /* CLOSE_PUNCTUATION = 8, [CL] */ CLASS_CHARACTER, + /* COMBINING_MARK = 9, [CM] */ CLASS_CHARACTER, + /* CARRIAGE_RETURN = 10, [CR] */ CLASS_BREAKABLE, + /* EXCLAMATION = 11, [EX] */ CLASS_CHARACTER, + /* GLUE = 12, [GL] */ CLASS_NON_BREAKABLE, + /* HYPHEN = 13, [HY] */ CLASS_CHARACTER, + /* IDEOGRAPHIC = 14, [ID] */ CLASS_BREAKABLE, + /* INSEPARABLE = 15, [IN] */ CLASS_CLOSE_LIKE_CHARACTER, + /* INFIX_NUMERIC = 16, [IS] */ CLASS_CHARACTER, + /* LINE_FEED = 17, [LF] */ CLASS_BREAKABLE, + /* NONSTARTER = 18, [NS] */ CLASS_CLOSE_LIKE_CHARACTER, + /* NUMERIC = 19, [NU] */ CLASS_CHARACTER, + /* OPEN_PUNCTUATION = 20, [OP] */ CLASS_CHARACTER, + /* POSTFIX_NUMERIC = 21, [PO] */ CLASS_CHARACTER, + /* PREFIX_NUMERIC = 22, [PR] */ CLASS_CHARACTER, + /* QUOTATION = 23, [QU] */ CLASS_CHARACTER, + /* COMPLEX_CONTEXT = 24, [SA] */ CLASS_CHARACTER, + /* SURROGATE = 25, [SG] */ CLASS_CHARACTER, + /* SPACE = 26, [SP] */ CLASS_BREAKABLE, + /* BREAK_SYMBOLS = 27, [SY] */ CLASS_CHARACTER, + /* ZWSPACE = 28, [ZW] */ CLASS_BREAKABLE, + /* NEXT_LINE = 29, [NL] */ CLASS_CHARACTER, + /* WORD_JOINER = 30, [WJ] */ CLASS_NON_BREAKABLE, + /* H2 = 31, [H2] */ CLASS_BREAKABLE, + /* H3 = 32, [H3] */ CLASS_BREAKABLE, + /* JL = 33, [JL] */ CLASS_CHARACTER, + /* JT = 34, [JT] */ CLASS_CHARACTER, + /* JV = 35, [JV] */ CLASS_CHARACTER, + /* CLOSE_PARENTHESIS = 36, [CP] */ CLASS_CLOSE_LIKE_CHARACTER, + /* CONDITIONAL_JAPANESE_STARTER = 37, [CJ] */ CLASS_CLOSE, + /* HEBREW_LETTER = 38, [HL] */ CLASS_CHARACTER, + /* REGIONAL_INDICATOR = 39, [RI] */ CLASS_CHARACTER, + 
/* E_BASE = 40, [EB] */ CLASS_BREAKABLE, + /* E_MODIFIER = 41, [EM] */ CLASS_CHARACTER, + /* ZWJ = 42, [ZWJ]*/ CLASS_CHARACTER + }; + +#if ENABLE_INTL_API + static_assert(U_LB_COUNT == mozilla::ArrayLength(sUnicodeLineBreakToClass), + "Gecko vs ICU LineBreak class mismatch"); +#endif + + auto cls = mozilla::unicode::GetLineBreakClass(u); + MOZ_ASSERT(cls < mozilla::ArrayLength(sUnicodeLineBreakToClass)); + return sUnicodeLineBreakToClass[cls]; +} + +static bool +GetPair(int8_t c1, int8_t c2) +{ + NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1"); + NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2"); + + return (0 == ((gPair[c1] >> c2) & 0x0001)); +} + +static bool +GetPairConservative(int8_t c1, int8_t c2) +{ + NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1"); + NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2"); + + return (0 == ((gPairConservative[c1] >> c2) & 0x0001)); +} + +nsJISx4051LineBreaker::nsJISx4051LineBreaker() +{ +} + +nsJISx4051LineBreaker::~nsJISx4051LineBreaker() +{ +} + +NS_IMPL_ISUPPORTS(nsJISx4051LineBreaker, nsILineBreaker) + +class ContextState { +public: + ContextState(const char16_t* aText, uint32_t aLength) { + mUniText = aText; + mText = nullptr; + mLength = aLength; + Init(); + } + + ContextState(const uint8_t* aText, uint32_t aLength) { + mUniText = nullptr; + mText = aText; + mLength = aLength; + Init(); + } + + uint32_t Length() { return mLength; } + uint32_t Index() { return mIndex; } + + char16_t GetCharAt(uint32_t aIndex) { + NS_ASSERTION(aIndex < mLength, "Out of range!"); + return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]); + } + + void AdvanceIndex() { + ++mIndex; + } + + void NotifyBreakBefore() { mLastBreakIndex = mIndex; } + +// A word of western language should not be broken. But even if the word has +// only ASCII characters, non-natural context words should be broken, e.g., +// URL and file path. For protecting the natural words, we should use +// conservative breaking rules at following conditions: +// 1. 
at near the start of word +// 2. at near the end of word +// 3. at near the latest broken point +// CONSERVATIVE_BREAK_RANGE define the 'near' in characters. +#define CONSERVATIVE_BREAK_RANGE 6 + + bool UseConservativeBreaking(uint32_t aOffset = 0) { + if (mHasCJKChar) + return false; + uint32_t index = mIndex + aOffset; + bool result = (index < CONSERVATIVE_BREAK_RANGE || + mLength - index < CONSERVATIVE_BREAK_RANGE || + index - mLastBreakIndex < CONSERVATIVE_BREAK_RANGE); + if (result || !mHasNonbreakableSpace) + return result; + + // This text has no-breakable space, we need to check whether the index + // is near it. + + // Note that index is always larger than CONSERVATIVE_BREAK_RANGE here. + for (uint32_t i = index; index - CONSERVATIVE_BREAK_RANGE < i; --i) { + if (IS_NONBREAKABLE_SPACE(GetCharAt(i - 1))) + return true; + } + // Note that index is always less than mLength - CONSERVATIVE_BREAK_RANGE. + for (uint32_t i = index + 1; i < index + CONSERVATIVE_BREAK_RANGE; ++i) { + if (IS_NONBREAKABLE_SPACE(GetCharAt(i))) + return true; + } + return false; + } + + bool HasPreviousEqualsSign() const { + return mHasPreviousEqualsSign; + } + void NotifySeenEqualsSign() { + mHasPreviousEqualsSign = true; + } + + bool HasPreviousSlash() const { + return mHasPreviousSlash; + } + void NotifySeenSlash() { + mHasPreviousSlash = true; + } + + bool HasPreviousBackslash() const { + return mHasPreviousBackslash; + } + void NotifySeenBackslash() { + mHasPreviousBackslash = true; + } + + uint32_t GetPreviousNonHyphenCharacter() const { + return mPreviousNonHyphenCharacter; + } + void NotifyNonHyphenCharacter(uint32_t ch) { + mPreviousNonHyphenCharacter = ch; + } + +private: + void Init() { + mIndex = 0; + mLastBreakIndex = 0; + mPreviousNonHyphenCharacter = U_NULL; + mHasCJKChar = 0; + mHasNonbreakableSpace = 0; + mHasPreviousEqualsSign = false; + mHasPreviousSlash = false; + mHasPreviousBackslash = false; + + for (uint32_t i = 0; i < mLength; ++i) { + char16_t u = GetCharAt(i); 
+ if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u)) + mHasNonbreakableSpace = 1; + else if (mUniText && !mHasCJKChar && IS_CJK_CHAR(u)) + mHasCJKChar = 1; + } + } + + const char16_t* mUniText; + const uint8_t* mText; + + uint32_t mIndex; + uint32_t mLength; // length of text + uint32_t mLastBreakIndex; + uint32_t mPreviousNonHyphenCharacter; // The last character we have seen + // which is not U_HYPHEN + bool mHasCJKChar; // if the text has CJK character, this is true. + bool mHasNonbreakableSpace; // if the text has no-breakable space, + // this is true. + bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL + bool mHasPreviousSlash; // True if we have seen a U_SLASH + bool mHasPreviousBackslash; // True if we have seen a U_BACKSLASH +}; + +static int8_t +ContextualAnalysis(char16_t prev, char16_t cur, char16_t next, + ContextState &aState) +{ + // Don't return CLASS_OPEN/CLASS_CLOSE if aState.UseJISX4051 is FALSE. + + if (IS_HYPHEN(cur)) { + // If next character is hyphen, we don't need to break between them. + if (IS_HYPHEN(next)) + return CLASS_CHARACTER; + // If prev and next characters are numeric, it may be in Math context. + // So, we should not break here. + bool prevIsNum = IS_ASCII_DIGIT(prev); + bool nextIsNum = IS_ASCII_DIGIT(next); + if (prevIsNum && nextIsNum) + return CLASS_NUMERIC; + // If one side is numeric and the other is a character, or if both sides are + // characters, the hyphen should be breakable. 
+ if (!aState.UseConservativeBreaking(1)) { + char16_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter(); + if (prevOfHyphen && next) { + int8_t prevClass = GetClass(prevOfHyphen); + int8_t nextClass = GetClass(next); + bool prevIsNumOrCharOrClose = + prevIsNum || + (prevClass == CLASS_CHARACTER && + !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) || + prevClass == CLASS_CLOSE || + prevClass == CLASS_CLOSE_LIKE_CHARACTER; + bool nextIsNumOrCharOrOpen = + nextIsNum || + (nextClass == CLASS_CHARACTER && !NEED_CONTEXTUAL_ANALYSIS(next)) || + nextClass == CLASS_OPEN || + nextClass == CLASS_OPEN_LIKE_CHARACTER || + next == U_OPEN_SINGLE_QUOTE || + next == U_OPEN_DOUBLE_QUOTE || + next == U_OPEN_GUILLEMET; + if (prevIsNumOrCharOrClose && nextIsNumOrCharOrOpen) { + return CLASS_CLOSE; + } + } + } + } else { + aState.NotifyNonHyphenCharacter(cur); + if (cur == U_SLASH || cur == U_BACKSLASH) { + // If this is immediately after same char, we should not break here. + if (prev == cur) + return CLASS_CHARACTER; + // If this text has two or more (BACK)SLASHs, this may be file path or URL. + // Make sure to compute shouldReturn before we notify on this slash. + bool shouldReturn = !aState.UseConservativeBreaking() && + (cur == U_SLASH ? + aState.HasPreviousSlash() : aState.HasPreviousBackslash()); + + if (cur == U_SLASH) { + aState.NotifySeenSlash(); + } else { + aState.NotifySeenBackslash(); + } + + if (shouldReturn) + return CLASS_OPEN; + } else if (cur == U_PERCENT) { + // If this is a part of the param of URL, we should break before. + if (!aState.UseConservativeBreaking()) { + if (aState.Index() >= 3 && + aState.GetCharAt(aState.Index() - 3) == U_PERCENT) + return CLASS_OPEN; + if (aState.Index() + 3 < aState.Length() && + aState.GetCharAt(aState.Index() + 3) == U_PERCENT) + return CLASS_OPEN; + } + } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) { + // If this may be a separator of params of URL, we should break after. 
+      if (!aState.UseConservativeBreaking(1) &&
+          aState.HasPreviousEqualsSign())
+        return CLASS_CLOSE;
+    } else if (cur == U_OPEN_SINGLE_QUOTE ||
+               cur == U_OPEN_DOUBLE_QUOTE ||
+               cur == U_OPEN_GUILLEMET) {
+      // for CJK usage, we treat these as openers to allow a break before them,
+      // but otherwise treat them as normal characters because quote mark usage
+      // in various Western languages varies too much; see bug #450088 discussion.
+      if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next))
+        return CLASS_OPEN;
+    } else {
+      NS_ERROR("Forgot to handle the current character!");
+    }
+  }
+  return GetClass(cur);
+}
+
+
+// Shared implementation of Next() (aDirection == 1) and Prev()
+// (aDirection == -1).
+//
+// The run of non-space characters around aPos is located with NS_IsSpace().
+// If nothing examined in that run is CJK or needs platform-native handling
+// (or the scratch buffer cannot be allocated), the result is simply the edge
+// of the run. Otherwise the run is passed to GetJISx4051Breaks() and the
+// nearest break opportunity in the requested direction is returned.
+//
+// Prev() may pass aPos == aLen, so aText[aPos] is not necessarily readable;
+// every read below is kept strictly inside [0, aLen).
+int32_t
+nsJISx4051LineBreaker::WordMove(const char16_t* aText, uint32_t aLen,
+                                uint32_t aPos, int8_t aDirection)
+{
+  bool textNeedsJISx4051 = false;
+  int32_t begin, end;
+
+  // NOTE(review): the character at the final value of |begin| (the first
+  // character of the run) is never examined by this loop; this pre-existing
+  // behaviour is preserved.
+  for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) {
+    // Skip the one-past-the-end slot when aPos == aLen.
+    if (begin < int32_t(aLen) &&
+        (IS_CJK_CHAR(aText[begin]) || NS_NeedsPlatformNativeHandling(aText[begin]))) {
+      textNeedsJISx4051 = true;
+    }
+  }
+  // Never let |end| exceed aLen: it sizes the range given to
+  // GetJISx4051Breaks() below.
+  end = (aPos < aLen) ? int32_t(aPos) + 1 : int32_t(aLen);
+  for (; end < int32_t(aLen) && !NS_IsSpace(aText[end]); ++end) {
+    if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) {
+      textNeedsJISx4051 = true;
+    }
+  }
+
+  int32_t ret;
+  AutoTArray<uint8_t, 2000> breakState;
+  if (!textNeedsJISx4051 || !breakState.AppendElements(end - begin)) {
+    // No complex text character, do not try to do complex line break.
+    // (This is required for serializers. See Bug #344816.)
+    // Also fall back to this when out of memory.
+    if (aDirection < 0) {
+      ret = (begin == int32_t(aPos)) ?
begin - 1 : begin; + } else { + ret = end; + } + } else { + GetJISx4051Breaks(aText + begin, end - begin, nsILineBreaker::kWordBreak_Normal, + breakState.Elements()); + + ret = aPos; + do { + ret += aDirection; + } while (begin < ret && ret < end && !breakState[ret - begin]); + } + + return ret; +} + +int32_t +nsJISx4051LineBreaker::Next(const char16_t* aText, uint32_t aLen, + uint32_t aPos) +{ + NS_ASSERTION(aText, "aText shouldn't be null"); + NS_ASSERTION(aLen > aPos, "Bad position passed to nsJISx4051LineBreaker::Next"); + + int32_t nextPos = WordMove(aText, aLen, aPos, 1); + return nextPos < int32_t(aLen) ? nextPos : NS_LINEBREAKER_NEED_MORE_TEXT; +} + +int32_t +nsJISx4051LineBreaker::Prev(const char16_t* aText, uint32_t aLen, + uint32_t aPos) +{ + NS_ASSERTION(aText, "aText shouldn't be null"); + NS_ASSERTION(aLen >= aPos && aPos > 0, + "Bad position passed to nsJISx4051LineBreaker::Prev"); + + int32_t prevPos = WordMove(aText, aLen, aPos, -1); + return prevPos > 0 ? prevPos : NS_LINEBREAKER_NEED_MORE_TEXT; +} + +void +nsJISx4051LineBreaker::GetJISx4051Breaks(const char16_t* aChars, uint32_t aLength, + uint8_t aWordBreak, + uint8_t* aBreakBefore) +{ + uint32_t cur; + int8_t lastClass = CLASS_NONE; + ContextState state(aChars, aLength); + + for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) { + uint32_t ch = aChars[cur]; + if (NS_IS_HIGH_SURROGATE(ch)) { + if (cur + 1 < aLength && NS_IS_LOW_SURROGATE(aChars[cur + 1])) { + ch = SURROGATE_TO_UCS4(ch, aChars[cur + 1]); + } + } + int8_t cl; + + if (NEED_CONTEXTUAL_ANALYSIS(ch)) { + cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL, + ch, + cur + 1 < aLength ? 
aChars[cur + 1] : U_NULL, + state); + } else { + if (ch == U_EQUAL) + state.NotifySeenEqualsSign(); + state.NotifyNonHyphenCharacter(ch); + cl = GetClass(ch); + } + + bool allowBreak = false; + if (cur > 0) { + NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl, + "Loop should have prevented adjacent complex chars here"); + if (aWordBreak == nsILineBreaker::kWordBreak_Normal) { + allowBreak = (state.UseConservativeBreaking()) ? + GetPairConservative(lastClass, cl) : GetPair(lastClass, cl); + } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) { + allowBreak = true; + } + } + aBreakBefore[cur] = allowBreak; + if (allowBreak) + state.NotifyBreakBefore(); + lastClass = cl; + if (CLASS_COMPLEX == cl) { + uint32_t end = cur + 1; + + while (end < aLength && CLASS_COMPLEX == GetClass(aChars[end])) { + ++end; + } + + NS_GetComplexLineBreaks(aChars + cur, end - cur, aBreakBefore + cur); + + // We have to consider word-break value again for complex characters + if (aWordBreak != nsILineBreaker::kWordBreak_Normal) { + // Respect word-break property + for (uint32_t i = cur; i < end; i++) + aBreakBefore[i] = (aWordBreak == nsILineBreaker::kWordBreak_BreakAll); + } + + // restore breakability at chunk begin, which was always set to false + // by the complex line breaker + aBreakBefore[cur] = allowBreak; + + cur = end - 1; + } + + if (ch > 0xffff) { + // Supplementary-plane character: mark that we cannot break before the + // trailing low surrogate, and advance past it. + ++cur; + aBreakBefore[cur] = false; + state.AdvanceIndex(); + } + } +} + +void +nsJISx4051LineBreaker::GetJISx4051Breaks(const uint8_t* aChars, uint32_t aLength, + uint8_t aWordBreak, + uint8_t* aBreakBefore) +{ + uint32_t cur; + int8_t lastClass = CLASS_NONE; + ContextState state(aChars, aLength); + + for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) { + char16_t ch = aChars[cur]; + int8_t cl; + + if (NEED_CONTEXTUAL_ANALYSIS(ch)) { + cl = ContextualAnalysis(cur > 0 ? 
aChars[cur - 1] : U_NULL, + ch, + cur + 1 < aLength ? aChars[cur + 1] : U_NULL, + state); + } else { + if (ch == U_EQUAL) + state.NotifySeenEqualsSign(); + state.NotifyNonHyphenCharacter(ch); + cl = GetClass(ch); + } + + bool allowBreak = false; + if (cur > 0) { + if (aWordBreak == nsILineBreaker::kWordBreak_Normal) { + allowBreak = (state.UseConservativeBreaking()) ? + GetPairConservative(lastClass, cl) : GetPair(lastClass, cl); + } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) { + allowBreak = true; + } + } + aBreakBefore[cur] = allowBreak; + if (allowBreak) + state.NotifyBreakBefore(); + lastClass = cl; + } +} diff --git a/intl/lwbrk/nsJISx4051LineBreaker.h b/intl/lwbrk/nsJISx4051LineBreaker.h new file mode 100644 index 000000000..6b41f80df --- /dev/null +++ b/intl/lwbrk/nsJISx4051LineBreaker.h @@ -0,0 +1,37 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/
+#ifndef nsJISx4051LineBreaker_h__
+#define nsJISx4051LineBreaker_h__
+
+
+#include "nsILineBreaker.h"
+
+// nsILineBreaker implementation built on the simplified JIS X 4051 pair
+// tables described in nsJISx4051LineBreaker.cpp.
+class nsJISx4051LineBreaker : public nsILineBreaker
+{
+  NS_DECL_ISUPPORTS
+
+private:
+  virtual ~nsJISx4051LineBreaker();
+
+public:
+  nsJISx4051LineBreaker();
+
+  // Returns the next break position after aPos, or
+  // NS_LINEBREAKER_NEED_MORE_TEXT when none lies before aLen.
+  // Requires aPos < aLen.
+  int32_t Next( const char16_t* aText, uint32_t aLen, uint32_t aPos) override;
+
+  // Returns the previous break position before aPos, or
+  // NS_LINEBREAKER_NEED_MORE_TEXT when none lies after offset 0.
+  // Requires 0 < aPos <= aLen.
+  int32_t Prev( const char16_t* aText, uint32_t aLen, uint32_t aPos) override;
+
+  // Writes one entry per input unit into aBreakBefore: non-zero when a line
+  // break is allowed before that unit. aBreakMode is compared against
+  // nsILineBreaker::kWordBreak_Normal / kWordBreak_BreakAll (the interface
+  // and the .cpp name this parameter aWordBreak).
+  virtual void GetJISx4051Breaks(const char16_t* aText, uint32_t aLength,
+                                 uint8_t aBreakMode,
+                                 uint8_t* aBreakBefore) override;
+  virtual void GetJISx4051Breaks(const uint8_t* aText, uint32_t aLength,
+                                 uint8_t aBreakMode,
+                                 uint8_t* aBreakBefore) override;
+
+private:
+  // Common helper behind Next() and Prev(); aDirection is +1 or -1.
+  int32_t WordMove(const char16_t* aText, uint32_t aLen, uint32_t aPos,
+                   int8_t aDirection);
+};
+
+#endif  /* nsJISx4051LineBreaker_h__ */
diff --git a/intl/lwbrk/nsLWBrkCIID.h b/intl/lwbrk/nsLWBrkCIID.h
new file mode 100644
index 000000000..75e280058
--- /dev/null
+++ b/intl/lwbrk/nsLWBrkCIID.h
@@ -0,0 +1,22 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
+#ifndef nsLWBrkCIID_h__
+#define nsLWBrkCIID_h__
+
+// {2BF64764-997F-450D-AF96-3028D1A902B0}
+#define NS_LBRK_CID \
+{ 0x2bf64764, 0x997f, 0x450d, \
+    { 0xaf, 0x96, 0x30, 0x28, 0xd1, 0xa9, 0x2, 0xb0 } }
+
+#define NS_LBRK_CONTRACTID "@mozilla.org/intl/lbrk;1"
+
+// {2BF64765-997F-450D-AF96-3028D1A902B0}
+#define NS_WBRK_CID \
+{ 0x2bf64765, 0x997f, 0x450d, \
+    { 0xaf, 0x96, 0x30, 0x28, 0xd1, 0xa9, 0x2, 0xb0 } }
+
+#define NS_WBRK_CONTRACTID "@mozilla.org/intl/wbrk;1"
+
+#endif
diff --git a/intl/lwbrk/nsPangoBreaker.cpp b/intl/lwbrk/nsPangoBreaker.cpp
new file mode 100644
index 000000000..c6fcb37cf
--- /dev/null
+++ b/intl/lwbrk/nsPangoBreaker.cpp
@@ -0,0 +1,60 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsComplexBreaker.h"
+
+#include <pango/pango-break.h>
+#include "nsUTF8Utils.h"
+#include "nsString.h"
+#include "nsTArray.h"
+
+// Pango-backed implementation: converts aText to UTF-8, asks Pango for its
+// logical attributes and copies each character's is_line_break flag into
+// aBreakBefore, which is indexed by UTF-16 code unit.
+//
+// aBreakBefore must hold aLength entries; it is cleared first, so it is left
+// all-false if the attribute buffer cannot be allocated.
+void
+NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,
+                        uint8_t* aBreakBefore)
+{
+  NS_ASSERTION(aText, "aText shouldn't be null");
+
+  memset(aBreakBefore, false, aLength * sizeof(uint8_t));
+
+  AutoTArray<PangoLogAttr, 2000> attrBuffer;
+  if (!attrBuffer.AppendElements(aLength + 1))
+    return;
+
+  NS_ConvertUTF16toUTF8 aUTF8(aText, aLength);
+
+  const gchar* p = aUTF8.Data();
+  const gchar* end = p + aUTF8.Length();
+  uint32_t u16Offset = 0;
+
+  static PangoLanguage* language = pango_language_from_string("en");
+
+  // The u16Offset guards keep every aBreakBefore/aText access inside
+  // [0, aLength) even if the UTF-8 and UTF-16 walks were to disagree.
+  while (p < end && u16Offset < aLength)
+  {
+    PangoLogAttr* attr = attrBuffer.Elements();
+    pango_get_log_attrs(p, end - p, -1, language, attr, attrBuffer.Length());
+
+    while (p < end && u16Offset < aLength)
+    {
+      aBreakBefore[u16Offset] = attr->is_line_break;
+      // One Pango attribute covers a whole character. When that character is
+      // a surrogate pair, u16Offset is on the high surrogate here, so step
+      // over the low surrogate and never allow a break inside the pair.
+      if (NS_IS_HIGH_SURROGATE(aText[u16Offset]) &&
+          u16Offset + 1 < aLength &&
+          NS_IS_LOW_SURROGATE(aText[u16Offset + 1]))
+        aBreakBefore[++u16Offset] = false; // Skip low surrogate
+      ++u16Offset;
+
+      bool err;
+      uint32_t ch = UTF8CharEnumerator::NextChar(&p, end, &err);
+      ++attr;
+
+      if (ch == 0 || err) {
+        // pango_break (pango 1.16.2) only analyses text before the
+        // first NUL (but sets one extra attr). Workaround loop to call
+        // pango_break again to analyse after the NUL is done somewhere else
+        // (gfx/thebes/gfxFontconfigFonts.cpp: SetupClusterBoundaries()).
+        // So, we do the same here for pango_get_log_attrs.
+        break;
+      }
+    }
+  }
+}
+
diff --git a/intl/lwbrk/nsRuleBreaker.cpp b/intl/lwbrk/nsRuleBreaker.cpp
new file mode 100644
index 000000000..035996873
--- /dev/null
+++ b/intl/lwbrk/nsRuleBreaker.cpp
@@ -0,0 +1,20 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsComplexBreaker.h"
+
+#define TH_UNICODE
+#include "rulebrk.h"
+
+// Rule-based implementation: aBreakBefore[i] is set when TrbWordBreakPos()
+// returns 0 for position i.
+void
+NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,
+                        uint8_t* aBreakBefore)
+{
+  NS_ASSERTION(aText, "aText shouldn't be null");
+
+  for (uint32_t i = 0; i < aLength; i++)
+    aBreakBefore[i] = (0 == TrbWordBreakPos(aText, i, aText + i, aLength - i));
+}
+
diff --git a/intl/lwbrk/nsSampleWordBreaker.cpp b/intl/lwbrk/nsSampleWordBreaker.cpp
new file mode 100644
index 000000000..fa54adeda
--- /dev/null
+++ b/intl/lwbrk/nsSampleWordBreaker.cpp
@@ -0,0 +1,150 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/ + + +#include "nsSampleWordBreaker.h" + +nsSampleWordBreaker::nsSampleWordBreaker() +{ +} +nsSampleWordBreaker::~nsSampleWordBreaker() +{ +} + +NS_IMPL_ISUPPORTS(nsSampleWordBreaker, nsIWordBreaker) + +bool nsSampleWordBreaker::BreakInBetween( + const char16_t* aText1 , uint32_t aTextLen1, + const char16_t* aText2 , uint32_t aTextLen2) +{ + NS_PRECONDITION( nullptr != aText1, "null ptr"); + NS_PRECONDITION( nullptr != aText2, "null ptr"); + + if(!aText1 || !aText2 || (0 == aTextLen1) || (0 == aTextLen2)) + return false; + + return (this->GetClass(aText1[aTextLen1-1]) != this->GetClass(aText2[0])); +} + + +#define IS_ASCII(c) (0 == ( 0xFF80 & (c))) +#define ASCII_IS_ALPHA(c) ((( 'a' <= (c)) && ((c) <= 'z')) || (( 'A' <= (c)) && ((c) <= 'Z'))) +#define ASCII_IS_DIGIT(c) (( '0' <= (c)) && ((c) <= '9')) +#define ASCII_IS_SPACE(c) (( ' ' == (c)) || ( '\t' == (c)) || ( '\r' == (c)) || ( '\n' == (c))) +#define IS_ALPHABETICAL_SCRIPT(c) ((c) < 0x2E80) + +// we change the beginning of IS_HAN from 0x4e00 to 0x3400 to relfect Unicode 3.0 +#define IS_HAN(c) (( 0x3400 <= (c)) && ((c) <= 0x9fff))||(( 0xf900 <= (c)) && ((c) <= 0xfaff)) +#define IS_KATAKANA(c) (( 0x30A0 <= (c)) && ((c) <= 0x30FF)) +#define IS_HIRAGANA(c) (( 0x3040 <= (c)) && ((c) <= 0x309F)) +#define IS_HALFWIDTHKATAKANA(c) (( 0xFF60 <= (c)) && ((c) <= 0xFF9F)) +#define IS_THAI(c) (0x0E00 == (0xFF80 & (c) )) // Look at the higest 9 bits + +uint8_t nsSampleWordBreaker::GetClass(char16_t c) +{ + // begin of the hack + + if (IS_ALPHABETICAL_SCRIPT(c)) { + if(IS_ASCII(c)) { + if(ASCII_IS_SPACE(c)) { + return kWbClassSpace; + } else if(ASCII_IS_ALPHA(c) || ASCII_IS_DIGIT(c)) { + return kWbClassAlphaLetter; + } else { + return kWbClassPunct; + } + } else if(IS_THAI(c)) { + return kWbClassThaiLetter; + } else if (c == 0x00A0/*NBSP*/) { + return kWbClassSpace; + } else { + return kWbClassAlphaLetter; + } + } else { + if(IS_HAN(c)) { + return kWbClassHanLetter; + } else if(IS_KATAKANA(c)) { + return 
kWbClassKatakanaLetter; + } else if(IS_HIRAGANA(c)) { + return kWbClassHiraganaLetter; + } else if(IS_HALFWIDTHKATAKANA(c)) { + return kWbClassHWKatakanaLetter; + } else { + return kWbClassAlphaLetter; + } + } + return 0; +} + +nsWordRange nsSampleWordBreaker::FindWord( + const char16_t* aText , uint32_t aTextLen, + uint32_t aOffset) +{ + nsWordRange range; + NS_PRECONDITION( nullptr != aText, "null ptr"); + NS_PRECONDITION( 0 != aTextLen, "len = 0"); + NS_PRECONDITION( aOffset <= aTextLen, "aOffset > aTextLen"); + + range.mBegin = aTextLen + 1; + range.mEnd = aTextLen + 1; + + if(!aText || aOffset > aTextLen) + return range; + + uint8_t c = this->GetClass(aText[aOffset]); + uint32_t i; + // Scan forward + range.mEnd--; + for(i = aOffset +1;i <= aTextLen; i++) + { + if( c != this->GetClass(aText[i])) + { + range.mEnd = i; + break; + } + } + + // Scan backward + range.mBegin = 0; + for(i = aOffset ;i > 0; i--) + { + if( c != this->GetClass(aText[i-1])) + { + range.mBegin = i; + break; + } + } + if(kWbClassThaiLetter == c) + { + // need to call Thai word breaker from here + // we should pass the whole Thai segment to the thai word breaker to find a shorter answer + } + return range; +} + +int32_t nsSampleWordBreaker::NextWord( + const char16_t* aText, uint32_t aLen, uint32_t aPos) +{ + int8_t c1, c2; + uint32_t cur = aPos; + if (cur == aLen) + return NS_WORDBREAKER_NEED_MORE_TEXT; + c1 = this->GetClass(aText[cur]); + + for(cur++; cur <aLen; cur++) + { + c2 = this->GetClass(aText[cur]); + if(c2 != c1) + break; + } + if(kWbClassThaiLetter == c1) + { + // need to call Thai word breaker from here + // we should pass the whole Thai segment to the thai word breaker to find a shorter answer + } + if (cur == aLen) + return NS_WORDBREAKER_NEED_MORE_TEXT; + return cur; +} diff --git a/intl/lwbrk/nsSampleWordBreaker.h b/intl/lwbrk/nsSampleWordBreaker.h new file mode 100644 index 000000000..51e17daa7 --- /dev/null +++ b/intl/lwbrk/nsSampleWordBreaker.h @@ -0,0 +1,42 @@ +/* -*- 
Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsSampleWordBreaker_h__
#define nsSampleWordBreaker_h__


#include "nsIWordBreaker.h"

// Character classes produced by nsSampleWordBreaker::GetClass(). Word
// boundaries are reported wherever two neighbouring characters have
// different classes.
typedef enum {
  kWbClassSpace = 0,
  kWbClassAlphaLetter,
  kWbClassPunct,
  kWbClassHanLetter,
  kWbClassKatakanaLetter,
  kWbClassHiraganaLetter,
  kWbClassHWKatakanaLetter,
  kWbClassThaiLetter
} wb_class;

// Simple nsIWordBreaker that splits text into runs of characters sharing the
// same wb_class.
class nsSampleWordBreaker : public nsIWordBreaker
{
  NS_DECL_ISUPPORTS
public:

  nsSampleWordBreaker() ;

  // True when the last character of aText1 and the first character of aText2
  // belong to different classes; false for null or empty input.
  bool BreakInBetween(const char16_t* aText1 , uint32_t aTextLen1,
                      const char16_t* aText2 , uint32_t aTextLen2) override;
  // Range of the same-class run containing aOffset. Both members are
  // aTextLen1 + 1 when aText1 is null or aOffset > aTextLen1.
  nsWordRange FindWord(const char16_t* aText1 , uint32_t aTextLen1,
                       uint32_t aOffset) override;

  // Offset of the first class change after aPos, or
  // NS_WORDBREAKER_NEED_MORE_TEXT when the run reaches the end of the buffer.
  int32_t NextWord(const char16_t* aText, uint32_t aLen, uint32_t aPos) override;

protected:
  // Classifies one UTF-16 code unit; the result is a wb_class value.
  uint8_t  GetClass(char16_t aChar);

  virtual ~nsSampleWordBreaker();
};

#endif  /* nsSampleWordBreaker_h__ */
diff --git a/intl/lwbrk/nsSemanticUnitScanner.cpp b/intl/lwbrk/nsSemanticUnitScanner.cpp new file mode 100644 index 000000000..8feb738a8 --- /dev/null +++ b/intl/lwbrk/nsSemanticUnitScanner.cpp @@ -0,0 +1,76 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "nsSemanticUnitScanner.h" + +NS_IMPL_ISUPPORTS_INHERITED(nsSemanticUnitScanner, nsSampleWordBreaker, nsISemanticUnitScanner) + +nsSemanticUnitScanner::nsSemanticUnitScanner() : nsSampleWordBreaker() +{ + /* member initializers and constructor code */ +} + +nsSemanticUnitScanner::~nsSemanticUnitScanner() +{ + /* destructor code */ +} + + +NS_IMETHODIMP nsSemanticUnitScanner::Start(const char *characterSet) +{ + // do nothing for now. + return NS_OK; +} + +NS_IMETHODIMP nsSemanticUnitScanner::Next(const char16_t *text, int32_t length, int32_t pos, bool isLastBuffer, int32_t *begin, int32_t *end, bool *_retval) +{ + // xxx need to bullet proff and check input pointer + // make sure begin, end and _retval is not nullptr here + + // if we reach the end, just return + if (pos >= length) { + *begin = pos; + *end = pos; + *_retval = false; + return NS_OK; + } + + uint8_t char_class = nsSampleWordBreaker::GetClass(text[pos]); + + // if we are in chinese mode, return one han letter at a time + // we should not do this if we are in Japanese or Korean mode + if (kWbClassHanLetter == char_class) { + *begin = pos; + *end = pos+1; + *_retval = true; + return NS_OK; + } + + int32_t next; + // find the next "word" + next = NextWord(text, (uint32_t) length, (uint32_t) pos); + + // if we don't have enough text to make decision, return + if (next == NS_WORDBREAKER_NEED_MORE_TEXT) { + *begin = pos; + *end = isLastBuffer ? 
length : pos; + *_retval = isLastBuffer; + return NS_OK; + } + + // if what we got is space or punct, look at the next break + if ((char_class == kWbClassSpace) || (char_class == kWbClassPunct)) { + // if the next "word" is not letters, + // call itself recursively with the new pos + return Next(text, length, next, isLastBuffer, begin, end, _retval); + } + + // for the rest, return + *begin = pos; + *end = next; + *_retval = true; + return NS_OK; +} + diff --git a/intl/lwbrk/nsSemanticUnitScanner.h b/intl/lwbrk/nsSemanticUnitScanner.h new file mode 100644 index 000000000..5e13fe78c --- /dev/null +++ b/intl/lwbrk/nsSemanticUnitScanner.h @@ -0,0 +1,27 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef nsSemanticUnitScanner_h__ +#define nsSemanticUnitScanner_h__ + +#include "nsSampleWordBreaker.h" +#include "nsISemanticUnitScanner.h" + + +class nsSemanticUnitScanner : public nsISemanticUnitScanner + , public nsSampleWordBreaker +{ +public: + NS_DECL_ISUPPORTS_INHERITED + NS_DECL_NSISEMANTICUNITSCANNER + + nsSemanticUnitScanner(); + +private: + virtual ~nsSemanticUnitScanner(); + /* additional members */ +}; + +#endif diff --git a/intl/lwbrk/nsUniscribeBreaker.cpp b/intl/lwbrk/nsUniscribeBreaker.cpp new file mode 100644 index 000000000..2a1b69b22 --- /dev/null +++ b/intl/lwbrk/nsUniscribeBreaker.cpp @@ -0,0 +1,58 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "nsComplexBreaker.h" + +#include <windows.h> + +#include <usp10.h> + +#include "nsUTF8Utils.h" +#include "nsString.h" +#include "nsTArray.h" + +void +NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength, + uint8_t* aBreakBefore) +{ + NS_ASSERTION(aText, "aText shouldn't be null"); + + int outItems = 0; + HRESULT result; + AutoTArray<SCRIPT_ITEM, 64> items; + char16ptr_t text = aText; + + memset(aBreakBefore, false, aLength); + + if (!items.AppendElements(64)) + return; + + do { + result = ScriptItemize(text, aLength, items.Length(), nullptr, nullptr, + items.Elements(), &outItems); + + if (result == E_OUTOFMEMORY) { + if (!items.AppendElements(items.Length())) + return; + } + } while (result == E_OUTOFMEMORY); + + for (int iItem = 0; iItem < outItems; ++iItem) { + uint32_t endOffset = (iItem + 1 == outItems ? aLength : items[iItem + 1].iCharPos); + uint32_t startOffset = items[iItem].iCharPos; + AutoTArray<SCRIPT_LOGATTR, 64> sla; + + if (!sla.AppendElements(endOffset - startOffset)) + return; + + if (ScriptBreak(text + startOffset, endOffset - startOffset, + &items[iItem].a, sla.Elements()) < 0) + return; + + for (uint32_t j=0; j+startOffset < endOffset; ++j) { + aBreakBefore[j+startOffset] = sla[j].fSoftBreak; + } + } +} diff --git a/intl/lwbrk/rulebrk.c b/intl/lwbrk/rulebrk.c new file mode 100644 index 000000000..0c9e86e82 --- /dev/null +++ b/intl/lwbrk/rulebrk.c @@ -0,0 +1,376 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
 */
#define TH_UNICODE

#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include "th_char.h"
/* ASCII-only helpers: Latin letters, and blank/tab as "space". */
#define th_isalpha(c) (((c)>='a'&&(c)<='z')||((c)>='A'&&(c)<='Z'))
#define th_isspace(c) ((c)==' '||(c)=='\t')


/*
/////////////////////////////////////////////////
// Thai character type array
*/

/* Bit set describing one Thai character; see the flag definitions below. */
typedef unsigned short twb_t;
extern const twb_t _TwbType[0x100-0xa0];

/*
// bit definition
*/

#define VRS 0x0001
#define VRE 0x0002
#define VRX 0x0004

#define VRA 0x0008

#define VLA 0x0010
#define VLO 0x0020
#define VLI 0x0040

#define VC 0x0080

#define CC 0x0100
#define CS 0x0200

#define C2 0x0400
#define CHB 0x0800
#define CHE 0x1000

#define MT 0x2000
/*
//_#define me 0x2000
*/
#define M 0x4000

#define T 0x8000

/* Composite masks built from the single-bit flags above. */
#define VL (VLA|VLO|VLI)
#define VR (VRS|VRE|VRX)
#define NE (VL|VRS)
#define NB (VR|M)
#define V (VL|VR)
#define CX (CC|CS)
#define C (CX|VC)
#define A (C|V|M)

/* Type bits of a Thai character; c must satisfy th_isthai(c). */
#define twbtype(c) (_TwbType[th_zcode(c)])

#ifndef TRUE
#define TRUE 1
#define FALSE 0
#endif
#define RETURN(b) return (b)


/*
/////////////////////////////////////////////////
*/

/*
 * Decides whether a word break is allowed at a position inside Thai text.
 *
 *   pstr  start of the text; lstr = pstr + left is the break candidate and
 *         up to `left` characters before it are examined
 *   left  number of characters available before the candidate
 *   rstr  pointer to the character right after the candidate
 *   right number of characters available starting at rstr
 *
 * The function builds a six-character window c(-3)..c(2) around the
 * candidate, with the matching type bits in t(-3)..t(2), and runs an ordered
 * list of rules over it. The order of the rules matters.
 *
 * Return value as used by the callers in this directory:
 *    0  break at this position
 *   >0  break that many characters further right
 *   <0  no break here
 */
int TrbWordBreakPos(const th_char *pstr, int left,
                    const th_char *rstr, int right)
/*                 const ThBreakIterator *it, const th_char **p)*/
{
  /*
  //int left, right;
  //const th_char *s = *p;
  */
  const th_char *lstr = pstr + left;
  th_char _c[6];
  twb_t _t[6];
  /* Window accessors: index -3..2 maps onto array slots 0..5. */
  #define c(i) (_c[(i)+3])
  #define t(i) (_t[(i)+3])
  int i, j;

  /*
  //left = s - it->begin;
  */
  if(left < 0) return -1;
  /*
  //right = (it->end == NULL) ? 4 : it->begin - s;
  */
  if(right < 1) return -1;

  /*
  // get c(0), t(0)
  */
  c(0) = rstr[0]; /* may be '\0' */
  if(!th_isthai(c(0))) return -1;
  t(0) = twbtype(c(0));
  if(!(t(0) & A)) return -1;

  /*
  // get c(-1), t(-1)
  */
  if(left >= 1) {
    c(-1) = lstr[-1];
    if(!th_isthai(c(-1))) return 0;
    t(-1) = twbtype(c(-1));
    if(!(t(-1) & A)) return 0; /* handle punctuation marks here */
  } else { c(-1) = 0; t(-1) = 0; }

  /*
  // get c(1..2), t(1..2)
  */
  for(i = 1; i <= 2; i++) {
    if(i >= right) { c(i) = 0; t(i) = 0; }
    else {
      c(i) = rstr[i]; /* may be '\0'; */
      /* "right = i--" truncates the right context at i and repeats the same
         slot on the next pass, which then takes the zero-fill branch. */
      if(!th_isthai(c(i))) right = i--;
      else {
        t(i) = twbtype(c(i));
        if(!(t(i) & A)) right = i--;
      }
    }
  }
  /*
  // get c(-2..-3), t(-2..-3)
  */
  /* i is the window slot being filled, j the offset read from lstr. Setting
     left = 0 makes every later pass take the zero-fill branch. */
  for(i = -2, j = -2; i >= -3 ; j--) {
    if(j < -left) { c(i) = 0; t(i) = 0; i--; }
    else {
      c(i) = lstr[j];
      if(!th_isthai(c(i))) left = 0;
      else {
        t(i) = (twb_t)(th_isthai(c(i)) ? twbtype(c(i)) : 0);
        if(!(t(i) & A)) left = 0;
        else {
          /* When slot i+1 holds an MT character next to a VR character, it
             is overwritten with the character just read and slot i is filled
             again from the next character to the left. */
          if((t(i+1) & MT) && ((t(i) & VR) || (t(i+2) & VR))) {
            c(i+1) = c(i); t(i+1) = t(i);
          } else i--;
        }
      }
    }
  }

  /*
  // prohibit the unlikely
  */
  if((t(-1) & C) && (t(0) & C)) {
    if((t(-1) & CHE) || (t(0) & CHB)) return -1;
  }
  /*
  // special case : vlao, C/ sara_a|aa, !sara_a
  */
  /* NOTE(review): c(-0) is the same element as c(0). The rule description
     above suggests c(-1) may have been intended -- confirm before changing,
     since it alters break results. */
  if((t(-3) & (VLA|VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) &&
    (c(-1) == TH_SARA_A || c(-0) == TH_SARA_AA)) return 0;

  /*
  // prohibit break
  */
  if(t(0) & NB) return -1;
  if(t(-1) & NE) return -1;


  /*
  // apply 100% rules
  */
  if(t(-1) & VRE) {
    if(c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0;
    return -1; /* usually too short syllable, part of word */
  }

  if(t(-2) & VRE) return -1;

  if((t(0) & C) && (t(1) & (VR|MT)) && (c(2) != TH_THANTHAKHAT)) { /*?C, NB */
    if((t(-1) & (VRS|VRX)) && c(1) == TH_SARA_I) return -1; /* exception */
    if(t(-1) & (V|M)) return 0; /* !C/ C, NB */
    if(t(-2) & VRS) return 0; /* VRS, C / C, NB */
    if(!(t(0) & C2) && c(1) == TH_SARA_I) { /* / !C2 or /c, sara_i */
      if(t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */
      if(t(-2) & VC) return 0; /* VC, C / C, NB ? 100% */
    }
  }
  if((t(-1) & VRX) && (t(0) & CC)) return 0; /* VRX/ CC */
  if((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V|M))) return 0;/* VRS, C/ !C */


  if((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) {
    if((t(-2) & A) && (t(-1) & CX)) return 0; /* A, CX / CX, C2 */
    if((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */
  }
  /*
  // apply 90% rules
  */
  if(t(0) & VL) return 0;
  if(t(1) & VL) return -1;
  if(c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING) return 0;

  /*
  //return -1;
  // apply 80% rules
  */
  if(t(0) & CHE) {
    if((t(-2) & VRS) && (t(-1) & C)) return 0; /* VRS, C/ CHE */
    /*if(t(-1) & VRX) return 0; // VRX/ CHE */
    if(t(-1) & VC) return 0; /* VC/ CHE */
  }
  if(t(-1) & CHB) {
    if((t(0) & C) && (t(1) & VR)) return 0; /* CHB/ CC, VR */
    if(t(0) & VC) return 0; /* CHB/ VC */
  }

  if((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */
    if(t(-2) & VLI) return 0; /* VLI,C/C,VR .*/
    else { /* vlao, C ? C , VR */
      if(c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */
      if(t(-2) & VLO) return 0; /* VLO, C/ C, !sara_a */
      if(!(t(1) & VRA)) return 0; /* VLA, C/ C, !vca */
    }
  }
  /* C,MT,C */
  if((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1;

  return -1;
}


/*
 * Returns the offset (relative to begin) of the next boundary at or after
 * `offset` in a buffer of `length` characters. A NUL character ends the scan
 * like the end of the buffer does.
 *
 * Leading blanks/tabs are skipped first. A non-Thai run is then consumed up
 * to the next blank or Thai character; inside Thai text TrbWordBreakPos() is
 * asked position by position until it reports a break.
 */
int TrbFollowing(const th_char *begin, int length, int offset)
/*
//(ThBreakIterator *this, int offset)
*/
{
  const th_char *w = begin + offset;
  const th_char *end = begin + length;
  /* Skip leading blanks and tabs. */
  while(w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++;

  if(w < end && *w && !th_isthai(*w)) {
    int english = FALSE;
    /* Consume a non-Thai, non-space run, noting whether it has a letter. */
    while(w < end && *w && !th_isthai(*w) && !th_isspace(*w)) {
      if(th_isalpha(*w)) english = TRUE;
      w++;
    }
    if(english || w == end ||
      (!th_isthai(*w) && th_isspace(*w))) return w - begin;
  }
  if(w == end || *w == 0 || !th_isthai(*w)) return w - begin;
  w++;
  if(w < end && *w && th_isthai(*w)) {
    int brk = TrbWordBreakPos(begin, w-begin, w, end-w);
    /* Negative means "no break here": move one character right and retry. */
    while (brk < 0) {
      w++;
      if(w == end || *w == 0 || !th_isthai(*w)) break;
      brk = TrbWordBreakPos(begin, w-begin, w, end-w);
    }
    /* A positive result is a break that many characters further right. */
    if (brk > 0) w += brk;
  }
  if(w < end && *w && !th_isthai(*w)) {
    /* Attach trailing characters that are neither letters nor blanks. */
    while(w < end && *w && !th_isthai(*w) &&
      !th_isalpha(*w) && !th_isspace(*w)) w++;
  }
  return w - begin;
}


/*
/////////////////////////////////////////////////
*/
/*
 * Type bits for each Thai character, indexed by th_zcode(c). The hex labels
 * on the entries run from a0 to ff; with TH_UNICODE defined the index is
 * c - 0x0e00, so entry "a1" describes U+0E01 and so on.
 */
const twb_t _TwbType[0x100-0xa0] = {
#if 0
/* 80 */ T,
/* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
/* 90 */ T,
/* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#endif
/* a0 */ 0,
/* a1 */ CS,
/* a2 */ CS | CHE,
/* a3 */ CC | CHE,
/* a4 */ CS | CHE,
/* a5 */ CC | CHE,
/* a6 */ CS,
/* a7 */ CS | CHB,
/* a8 */ CS,
/* a9 */ CC | CHE,
/* aa */ CS,
/* ab */ CC | CHE,
/* ac */ CC | CHB | CHE,
/* ad */ CS | CHB,
/* ae */ CS | CHB,
/* af */ CS | CHB,
/* b0 */ CS,
/* b1 */ CS | CHB | CHE,
/* b2 */ CS | CHB | CHE,
/* b3 */ CS | CHB,
/* b4 */ CS,
/* b5 */ CS,
/* b6 */ CS,
/* b7 */ CS,
/* b8 */ CS,
/* b9 */ CS,
/* ba */ CS,
/* bb */ CS,
/* bc */ CC | CHE,
/* bd */ CC | CHE,
/* be */ CS,
/* bf */ CS,
/* c0 */ CS | CHE,
/* c1 */ CS,
/* c2 */ CS,
/* c3 */ CS | C2 | CHE, /* ? add CHE */
/* c4 */ VC | CHE,
/* c5 */ CS | C2,
/* c6 */ VC | CHE,
/* c7 */ VC | C2,
/* c8 */ CS,
/* c9 */ CS | CHB,
/* ca */ CS | CHE,
/* cb */ CC | CHE,
/* CC */ CS | CHB | CHE,
/* cd */ VC,
/* ce */ CC | CHE,
/* cf */ T,
/* d0 */ VRE | VRA,
/* d1 */ VRS,
/* d2 */ VRX | VRA,
/* d3 */ VRE,
/* d4 */ VRX | VRA,
/* d5 */ VRX | VRA,
/* d6 */ VRS,
/* d7 */ VRS | VRA,
/* d8 */ VRX,
/* d9 */ VRX,
/* da */ T,
/* db */ 0,
/* dc */ 0,
/* dd */ 0,
/* de */ 0,
/* df */ T,
/* e0 */ VLA,
/* e1 */ VLO,
/* e2 */ VLO,
/* e3 */ VLI,
/* e4 */ VLI,
/* e5 */ VRE,
/* e6 */ M,
/* e7 */ M,
/* e8 */ M | MT,
/* e9 */ M | MT,
/* ea */ M | MT,
/* eb */ M | MT,
/* ec */ M,
/* ed */ T,
/* ee */ T,
/* ef */ T,
/* f0 */ T,
/* f1 */ T,
/* f2 */ T,
/* f3 */ T,
/* f4 */ T,
/* f5 */ T,
/* f6 */ T,
/* f7 */ T,
/* f8 */ T,
/* f9 */ T,
/* fa */ T,
/* fb */ T,
/* fc */ 0,
/* fd */ 0,
/* fe */ 0,
/* ff */ 0
};
diff --git a/intl/lwbrk/rulebrk.h b/intl/lwbrk/rulebrk.h new file mode 100644 index 000000000..edc88651b --- /dev/null +++ b/intl/lwbrk/rulebrk.h @@ -0,0 +1,26 @@
/*
Copyright (c) 1999 Samphan Raruenrom <samphan@thai.com>
Permission to use, copy, modify, distribute and sell this software
and its documentation for any purpose is hereby granted without fee,
provided that the above copyright notice appear in all copies and
that both that copyright notice and this permission notice appear
in supporting documentation. Samphan Raruenrom makes no
representations about the suitability of this software for any
purpose. It is provided "as is" without express or implied warranty. 
+*/ +#ifndef __RULEBRK_H__ +#define __RULEBRK_H__ +#include "th_char.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int TrbWordBreakPos(const th_char *pstr, int left, + const th_char *rstr, int right); +int TrbFollowing(const th_char *begin, int length, int offset); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/intl/lwbrk/th_char.h b/intl/lwbrk/th_char.h new file mode 100644 index 000000000..c6d7420f4 --- /dev/null +++ b/intl/lwbrk/th_char.h @@ -0,0 +1,54 @@ +/* +Copyright (c) 1999 Samphan Raruenrom <samphan@thai.com> +Permission to use, copy, modify, distribute and sell this software +and its documentation for any purpose is hereby granted without fee, +provided that the above copyright notice appear in all copies and +that both that copyright notice and this permission notice appear +in supporting documentation. Samphan Raruenrom makes no +representations about the suitability of this software for any +purpose. It is provided "as is" without express or implied warranty. +*/ +#ifndef __TH_CHAR_H__ +#define __TH_CHAR_H__ + + +typedef unsigned char tis_char; + +#ifdef TH_UNICODE +/* + * The char16_t type is only usable in C++ code, so we need this ugly hack to + * select a binary compatible C type for the expat C code to use. 
+ */ +#ifdef __cplusplus +typedef char16_t th_char; +#else +typedef uint16_t th_char; +#endif +#define TH_THAIBEGIN_ 0x0e00 +#define th_isthai(c) (0x0e00 <= (c) && (c) <= 0x0e5f) +#else +typedef tis_char th_char; +#define TH_THAIBEGIN_ 0xa0 +#define th_isthai(c) ((c) >= 0xa0) +#endif +#define th_zcode(c) ((c) - TH_THAIBEGIN_) + +enum TH_CHARNAME { +TH_THAIBEGIN = TH_THAIBEGIN_, +TH_KOKAI,TH_KHOKHAI,TH_KHOKHUAT,TH_KHOKHWAI,TH_KHOKHON,TH_KHORAKHANG, +TH_NGONGU,TH_CHOCHAN,TH_CHOCHING,TH_CHOCHANG,TH_SOSO,TH_CHOCHOE,TH_YOYING, +TH_DOCHADA,TH_TOPATAK,TH_THOTHAN,TH_THONANGMONTHO,TH_THOPHUTHAO,TH_NONEN, +TH_DODEK,TH_TOTAO,TH_THOTHUNG,TH_THOTHAHAN,TH_THOTHONG,TH_NONU,TH_BOBAIMAI, +TH_POPLA,TH_PHOPHUNG,TH_FOFA,TH_PHOPHAN,TH_FOFAN,TH_PHOSAMPHAO,TH_MOMA, +TH_YOYAK,TH_RORUA,TH_RU,TH_LOLING,TH_LU,TH_WOWAEN,TH_SOSALA,TH_SORUSI, +TH_SOSUA,TH_HOHIP,TH_LOCHULA,TH_OANG,TH_HONOKHUK,TH_PAIYANNOI,TH_SARA_A, +TH_MAIHANAKAT,TH_SARA_AA,TH_SARA_AM,TH_SARA_I,TH_SARA_II,TH_SARA_UE, +TH_SARA_UEE,TH_SARA_U,TH_SARA_UU,TH_PHINTHU,TH_REM_CHERNG_,TH_TAC_WBRK_, +TH_UNDEF_DD,TH_UNDEF_DE,TH_BAHT,TH_SARA_E,TH_SARA_AE,TH_SARA_O,TH_MAIMUAN, +TH_MAIMALAI,TH_LAKKHANGYAO,TH_MAIYAMOK,TH_MAITAIKHU,TH_MAIEK,TH_MAITHO, +TH_MAITRI,TH_MAICHATTAWA,TH_THANTHAKHAT,TH_NIKHAHIT,TH_YAMAKKAN,TH_FONGMAN, +TH_THAIZERO,TH_THAIONE,TH_THAITWO,TH_THAITHREE,TH_THAIFOUR,TH_THAIFIVE, +TH_THAISIX,TH_THAISEVEN,TH_THAIEIGHT,TH_THAININE,TH_ANGKHANKHU,TH_KHOMUT, +TH_UNDEF_FC,TH_UNDEF_FD,TH_UNDEF_FE,TH_THAIEND +}; +#endif diff --git a/intl/lwbrk/tools/anzx4051.html b/intl/lwbrk/tools/anzx4051.html new file mode 100644 index 000000000..d894ce811 --- /dev/null +++ b/intl/lwbrk/tools/anzx4051.html @@ -0,0 +1,669 @@ +<!-- This Source Code Form is subject to the terms of the Mozilla Public + - License, v. 2.0. If a copy of the MPL was not distributed with this + - file, You can obtain one at http://mozilla.org/MPL/2.0/. 
--> + +<HTML> +<HEAD> +<TITLE> +Analysis of JIS X 4051 to Unicode General Category Mapping +</TITLE> +</HEAD> +<BODY> +<H1> +Analysis of JIS X 4051 to Unicode General Category Mapping +</H1> +<TABLE BORDER=3> +<TR BGCOLOR=blue><TH><TH> +<TD BGCOLOR=red>C</TD> +<TD BGCOLOR=red>L</TD> +<TD BGCOLOR=red>M</TD> +<TD BGCOLOR=red>N</TD> +<TD BGCOLOR=red>P</TD> +<TD BGCOLOR=red>S</TD> +<TD BGCOLOR=red>Z</TD> +<TD BGCOLOR=white>Total</TD> +<TD BGCOLOR=yellow>Cc</TD> +<TD BGCOLOR=yellow>Cf</TD> +<TD BGCOLOR=yellow>Co</TD> +<TD BGCOLOR=yellow>Cs</TD> +<TD BGCOLOR=yellow>Ll</TD> +<TD BGCOLOR=yellow>Lm</TD> +<TD BGCOLOR=yellow>Lo</TD> +<TD BGCOLOR=yellow>Lt</TD> +<TD BGCOLOR=yellow>Lu</TD> +<TD BGCOLOR=yellow>Mc</TD> +<TD BGCOLOR=yellow>Me</TD> +<TD BGCOLOR=yellow>Mn</TD> +<TD BGCOLOR=yellow>Nd</TD> +<TD BGCOLOR=yellow>Nl</TD> +<TD BGCOLOR=yellow>No</TD> +<TD BGCOLOR=yellow>Pc</TD> +<TD BGCOLOR=yellow>Pd</TD> +<TD BGCOLOR=yellow>Pe</TD> +<TD BGCOLOR=yellow>Pf</TD> +<TD BGCOLOR=yellow>Pi</TD> +<TD BGCOLOR=yellow>Po</TD> +<TD BGCOLOR=yellow>Ps</TD> +<TD BGCOLOR=yellow>Sc</TD> +<TD BGCOLOR=yellow>Sk</TD> +<TD BGCOLOR=yellow>Sm</TD> +<TD BGCOLOR=yellow>So</TD> +<TD BGCOLOR=yellow>Zl</TD> +<TD BGCOLOR=yellow>Zp</TD> +<TD BGCOLOR=yellow>Zs</TD> +</TR> +<TR><TH>00_1<TH> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>14</TD> +<TD>1</TD> +<TD></TD> +<TD BGCOLOR=white>15</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD>2</TD> +<TD>11</TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>01_[a]<TH> +<TD></TD> +<TD>32</TD> +<TD>2</TD> +<TD></TD> +<TD>28</TD> +<TD>3</TD> +<TD></TD> +<TD BGCOLOR=white>65</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>8</TD> +<TD>24</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>2</TD> +<TD></TD> +<TD></TD> 
+<TD></TD> +<TD></TD> +<TD>1</TD> +<TD>12</TD> +<TD>1</TD> +<TD></TD> +<TD>14</TD> +<TD></TD> +<TD></TD> +<TD>2</TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>02_7<TH> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD BGCOLOR=white>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>03_8<TH> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD BGCOLOR=white>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>04_9<TH> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>5</TD> +<TD></TD> +<TD></TD> +<TD BGCOLOR=white>5</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>5</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>05_[b]<TH> +<TD>33</TD> +<TD>153</TD> +<TD></TD> +<TD>33</TD> +<TD>2</TD> +<TD>5</TD> +<TD>13</TD> +<TD BGCOLOR=white>239</TD> +<TD>32</TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>153</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>33</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>2</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>5</TD> +<TD></TD> +<TD></TD> +<TD>13</TD> 
+</TR> +<TR><TH>06_15<TH> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>30</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD BGCOLOR=white>30</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>30</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>07_18<TH> +<TD>18</TD> +<TD>157</TD> +<TD></TD> +<TD>33</TD> +<TD>56</TD> +<TD>125</TD> +<TD>2</TD> +<TD BGCOLOR=white>391</TD> +<TD></TD> +<TD>18</TD> +<TD></TD> +<TD></TD> +<TD>64</TD> +<TD>7</TD> +<TD>5</TD> +<TD></TD> +<TD>81</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>3</TD> +<TD>30</TD> +<TD>4</TD> +<TD>5</TD> +<TD>2</TD> +<TD></TD> +<TD>5</TD> +<TD>36</TD> +<TD>4</TD> +<TD></TD> +<TD>3</TD> +<TD>24</TD> +<TD>98</TD> +<TD>1</TD> +<TD>1</TD> +<TD></TD> +</TR> +<TR><TH>08_COMPLEX<TH> +<TD></TD> +<TD>54</TD> +<TD>33</TD> +<TD>20</TD> +<TD>2</TD> +<TD>1</TD> +<TD></TD> +<TD BGCOLOR=white>110</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD>53</TD> +<TD></TD> +<TD></TD> +<TD>11</TD> +<TD></TD> +<TD>22</TD> +<TD>10</TD> +<TD></TD> +<TD>10</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>2</TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>09_[c]<TH> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>3</TD> +<TD>4</TD> +<TD></TD> +<TD BGCOLOR=white>7</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>3</TD> +<TD>2</TD> +<TD></TD> +<TD>2</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>0A_[d]<TH> +<TD>1</TD> +<TD>2</TD> +<TD></TD> +<TD>6</TD> +<TD>28</TD> +<TD>14</TD> +<TD></TD> +<TD 
BGCOLOR=white>51</TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>6</TD> +<TD></TD> +<TD></TD> +<TD>3</TD> +<TD>3</TD> +<TD></TD> +<TD>22</TD> +<TD></TD> +<TD>2</TD> +<TD>3</TD> +<TD>7</TD> +<TD>2</TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>0B_[e]<TH> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD>1</TD> +<TD>3</TD> +<TD BGCOLOR=white>6</TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>3</TD> +</TR> +<TR><TH>X<TH> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD BGCOLOR=white>0</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +</TABLE> +<TABLE BORDER=3> +<TR BGCOLOR=blue><TH><TH> +<TD BGCOLOR=red>00_1</TD> +<TD BGCOLOR=red>01_[a]</TD> +<TD BGCOLOR=red>02_7</TD> +<TD BGCOLOR=red>03_8</TD> +<TD BGCOLOR=red>04_9</TD> +<TD BGCOLOR=red>05_[b]</TD> +<TD BGCOLOR=red>06_15</TD> +<TD BGCOLOR=red>07_18</TD> +<TD BGCOLOR=red>08_COMPLEX</TD> +<TD BGCOLOR=red>09_[c]</TD> +<TD BGCOLOR=red>0A_[d]</TD> +<TD BGCOLOR=red>0B_[e]</TD> +<TD BGCOLOR=red>X</TD> +</TR> +<TR><TH>00<TH> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>33</TD> +<TD>10</TD> +<TD>127</TD> +<TD></TD> +<TD>7</TD> +<TD>44</TD> +<TD>2</TD> +<TD></TD> +</TR> +<TR><TH>0E<TH> +<TD>1</TD> +<TD>6</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>20</TD> +<TD>1</TD> +<TD></TD> +<TD></TD> 
+<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>17<TH> +<TD>2</TD> +<TD>4</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>110</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>20<TH> +<TD>2</TD> +<TD>8</TD> +<TD>1</TD> +<TD></TD> +<TD>5</TD> +<TD>13</TD> +<TD></TD> +<TD>100</TD> +<TD></TD> +<TD></TD> +<TD>7</TD> +<TD>4</TD> +<TD></TD> +</TR> +<TR><TH>21<TH> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD>32</TD> +<TD></TD> +<TD>163</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>30<TH> +<TD>10</TD> +<TD>47</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>161</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +</TABLE> diff --git a/intl/lwbrk/tools/anzx4051.pl b/intl/lwbrk/tools/anzx4051.pl new file mode 100644 index 000000000..b13315b38 --- /dev/null +++ b/intl/lwbrk/tools/anzx4051.pl @@ -0,0 +1,356 @@ +#!/usr/bin/perl +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +###################################################################### +# +# Initialize global variables +# (%utot, $ui and $li are not referenced again in the visible part of this script) +# +###################################################################### +%utot = (); +$ui=0; +$li=0; + +###################################################################### +# +# Open the Unicode database file (input) +# +###################################################################### +open ( UNICODATA , "< ../../unicharutil/tools/UnicodeData-Latest.txt") + || die "cannot find UnicodeData-Latest.txt"; + +###################################################################### +# +# Open the JIS X 4051 class file (input) +# +###################################################################### +open ( CLASS , "< jisx4051class.txt") + || die "cannot find jisx4051class.txt"; + +###################################################################### +# +# Open the JIS X 4051 class to simplified class mapping (input) +# +###################################################################### +open ( SIMP , "< jisx4051simp.txt") + || die "cannot find jisx4051simp.txt"; + +###################################################################### +# +# Open the HTML report output file +# +###################################################################### +open ( OUT , "> anzx4051.html") + || die "cannot open output anzx4051.html file"; + +###################################################################### +# +# Open the generated C header output file +# +###################################################################### +open ( HEADER , "> ../src/jisx4051class.h") + || die "cannot open output ../src/jisx4051class.h file"; + +###################################################################### +# +# Generate the license comment and HTML heading of the report +# +###################################################################### +$hthmlheader = <<END_OF_HTML; +<!-- This Source Code Form is subject to the terms of the Mozilla Public + - License, v. 2.0. 
If a copy of the MPL was not distributed with this + - file, You can obtain one at http://mozilla.org/MPL/2.0/. --> + +<HTML> +<HEAD> +<TITLE> +Analysis of JIS X 4051 to Unicode General Category Mapping +</TITLE> +</HEAD> +<BODY> +<H1> +Analysis of JIS X 4051 to Unicode General Category Mapping +</H1> +END_OF_HTML +print OUT $hthmlheader; + +###################################################################### +# +# Generate the license comment and the do-not-edit banner of the C header +# +###################################################################### +$npl = <<END_OF_NPL; +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +/* + DO NOT EDIT THIS DOCUMENT !!! THIS DOCUMENT IS GENERATED BY + mozilla/intl/lwbrk/tools/anzx4051.pl + */ +END_OF_NPL +print HEADER $npl; + +# Lookup tables and counters that are filled in while the input files are read +%occ = (); +%gcat = (); +%dcat = (); +%simp = (); +%gcount = (); +%dcount = (); +%sccount = (); +%rangecount = (); + +###################################################################### +# +# Process the Unicode database line by line: remember the General +# Category of every code point (and its first letter), and count how +# often each one occurs +# +###################################################################### +while(<UNICODATA>) { + # chomp strips only the line terminator; chop would also eat a real + # character if the last line of the file is not newline-terminated + chomp; + ###################################################################### + # + # Get value from fields + # + ###################################################################### + @f = split(/;/ , $_); + $c = $f[0]; # The unicode value + $g = $f[2]; + $d = substr($g, 0, 1); + + $gcat{$c} = $g; + $dcat{$c} = $d; + $gcount{$g}++; + $dcount{$d}++; +} +# The handle was opened as UNICODATA; closing the misspelled UNIDATA +# matched no open handle and left the file open +close(UNICODATA); + +###################################################################### +# +# Load the JIS X 4051 class to simplified class mapping and count how +# many classes map to each simplified class +# +###################################################################### +while(<SIMP>) { + # chomp, not chop: the simplified class is the last field of the line + chomp; + ###################################################################### + # + # Get value from fields + # + ###################################################################### + @f = split(/;/ , $_); + + $simp{$f[0]} = $f[1]; + $sccount{$f[1]}++; +} +close(SIMP); + +sub GetClass{ + my ($u) 
= @_; + my $hex = DecToHex($u); + $g = $gcat{$hex}; + if($g ne "") { + return $g; + } elsif (( 0x3400 <= $u) && ( $u <= 0x9fa5 ) ) { + return "Han"; + } elsif (( 0xac00 <= $u) && ( $u <= 0xd7a3 ) ) { + return "Lo"; + } elsif (( 0xd800 <= $u) && ( $u <= 0xdb7f ) ) { + return "Cs"; + } elsif (( 0xdb80 <= $u) && ( $u <= 0xdbff ) ) { + return "Cs"; + } elsif (( 0xdc00 <= $u) && ( $u <= 0xdfff ) ) { + return "Cs"; + } elsif (( 0xe000 <= $u) && ( $u <= 0xf8ff ) ) { + return "Co"; + } else { + printf "WARNING !!!! Cannot find General Category for U+%s \n" , $hex; + } +} +sub GetDClass{ + my ($u) = @_; + my $hex = DecToHex($u); + $g = $dcat{$hex}; + if($g ne "") { + return $g; + } elsif (( 0x3400 <= $u) && ( $u <= 0x9fa5 ) ) { + return "Han"; + } elsif (( 0xac00 <= $u) && ( $u <= 0xd7a3 ) ) { + return "L"; + } elsif (( 0xd800 <= $u) && ( $u <= 0xdb7f ) ) { + return "C"; + } elsif (( 0xdb80 <= $u) && ( $u <= 0xdbff ) ) { + return "C"; + } elsif (( 0xdc00 <= $u) && ( $u <= 0xdfff ) ) { + return "C"; + } elsif (( 0xe000 <= $u) && ( $u <= 0xf8ff ) ) { + return "C"; + } else { + printf "WARNING !!!! Cannot find Detailed General Category for U+%s \n" , $hex; + } +} +sub DecToHex{ + my ($d) = @_; + return sprintf("%04X", $d); +} +%gtotal = (); +%dtotal = (); +while(<CLASS>) { + chop; + ###################################################################### + # + # Get value from fields + # + ###################################################################### + @f = split(/;/ , $_); + + if( substr($f[2], 0, 1) ne "a") + { + $sc = $simp{$f[2]}; + $l = hex($f[0]); + if($f[1] eq "") + { + $h = $l; + } else { + $h = hex($f[1]); + } + for($k = $l; $k <= $h ; $k++) + { + if( exists($occ{$k})) + { + # printf "WARNING !! Conflict defination!!! U+%s -> [%s] [%s | %s]\n", + # DecToHex($k), $occ{$k} , $f[2] , $sc; + } + else + { + $occ{$k} = $sc . " | " . $f[2]; + $gclass = GetClass($k); + $dclass = GetDClass($k); + $gtotal{$sc . $gclass}++; + $dtotal{$sc . 
$dclass}++; + $u = DecToHex($k); + $rk = " " . substr($u,0,2) . ":" . $sc; + $rangecount{$rk}++; + } + } + } +} + +#print %gtotal; +#print %dtotal; + +sub printreport +{ + print OUT "<TABLE BORDER=3>\n"; + print OUT "<TR BGCOLOR=blue><TH><TH>\n"; + + foreach $d (sort(keys %dcount)) { + print OUT "<TD BGCOLOR=red>$d</TD>\n"; + } + + print OUT "<TD BGCOLOR=white>Total</TD>\n"; + foreach $g (sort(keys %gcount)) { + print OUT "<TD BGCOLOR=yellow>$g</TD>\n"; + } + print OUT "</TR>\n"; + foreach $sc (sort(keys %sccount)) { + + print OUT "<TR><TH>$sc<TH>\n"; + + $total = 0; + foreach $d (sort (keys %dcount)) { + $count = $dtotal{$sc . $d}; + $total += $count; + print OUT "<TD>$count</TD>\n"; + } + + print OUT "<TD BGCOLOR=white>$total</TD>\n"; + + foreach $g (sort(keys %gcount)) { + $count = $gtotal{$sc . $g}; + print OUT "<TD>$count</TD>\n"; + } + + + print OUT "</TR>\n"; + } + print OUT "</TABLE>\n"; + + + print OUT "<TABLE BORDER=3>\n"; + print OUT "<TR BGCOLOR=blue><TH><TH>\n"; + + foreach $sc (sort(keys %sccount)) + { + print OUT "<TD BGCOLOR=red>$sc</TD>\n"; + } + + print OUT "</TR>\n"; + + + for($rr = 0; $rr < 0x4f; $rr++) + { + $empty = 0; + $r = sprintf("%02X" , $rr) ; + $tmp = "<TR><TH>" . $r . "<TH>\n"; + + foreach $sc (sort(keys %sccount)) { + $count = $rangecount{ " " .$r . 
":" .$sc}; + $tmp .= sprintf("<TD>%s</TD>\n", $count); + $empty += $count; + } + + $tmp .= "</TR>\n"; + + if($empty ne 0) + { + print OUT $tmp; + } + } + print OUT "</TABLE>\n"; + +} +printreport(); + +sub printarray +{ + my($r, $def) = @_; +printf "[%s || %s]\n", $r, $def; + $k = hex($r) * 256; + printf HEADER "static const uint32_t gLBClass%s[32] = {\n", $r; + for($i = 0 ; $i < 256; $i+= 8) + { + for($j = 7 ; $j >= 0; $j-- ) + { + $v = $k + $i + $j; + if( exists($occ{$v})) + { + $p = substr($occ{$v}, 1,1); + } else { + $p = $def; + } + + if($j eq 7 ) + { + printf HEADER "0x%s" , $p; + } else { + printf HEADER "%s", $p ; + } + } + printf HEADER ", // U+%04X - U+%04X\n", $k + $i ,( $k + $i + 7); + } + print HEADER "};\n\n"; +} +printarray("00", "7"); +printarray("20", "7"); +printarray("21", "7"); +printarray("30", "5"); +printarray("0E", "8"); +printarray("17", "7"); + +#print %rangecount; + +###################################################################### +# +# Close files +# +###################################################################### +close(HEADER); +close(CLASS); +close(OUT); + diff --git a/intl/lwbrk/tools/jisx4051class.txt b/intl/lwbrk/tools/jisx4051class.txt new file mode 100644 index 000000000..5b26b7267 --- /dev/null +++ b/intl/lwbrk/tools/jisx4051class.txt @@ -0,0 +1,159 @@ +0000;001f;17 +0020;;17 +0024;;24 +0027;;18 +0028;;22 +002D;;18 +002F;;18 +0021;002F;23 +0030;0039;15 +003C;;22 +003A;003F;23 +0040;;18 +0041;005A;18 +005B;;22 +005E;;18 +005F;;18 +005B;005F;23 +0060;;18 +0061;007A;18 +007B;;22 +007B;007E;23 +00A0;;24 +00A3;;22 +00A5;;22 +00A9;;18 +00AA;;18 +00AB;;18 +00AC;;22 +00AE;;18 +00AF;;18 +00A1;00BF;23 +00B0;;18 +00F7;;23 +00C0;00FF;18 +0E3F;;1 +0E2F;;4 +0E46;;4 +0E5A;0E5B;4 +0E50;0E59;15 +0E4F;;18 +0EAF;;4 +0EC6;;4 +0ED0;0ED9;15 +1735;1736;1 +17D4;17D5;4 +17D8;;4 +17DA;;4 +1780;17DD;21 +17E0;17E9;21 +17F0;17F9;21 +2007;;24 +2000;200B;17 +200C;200F;18 +2010;;18 +2011;;24 +2012;2013;18 +2014;;7 +2015;;18 +2016;2017;18 +2019;;23 
+201D;;23 +2018;201F;18 +2020;2023;18 +2024;2026;23 +2027;;23 +2028;202E;18 +202F;;24 +2030;2034;9 +2035;2038;18 +2039;;1 +203A;;2 +203B;;12 +203C;203D;3 +203E;;23 +203F;2043;18 +2044;;3 +2045;;1 +2046;;2 +2047;2049;3 +204A;205E;18 +205F;;17 +2060;;24 +2061;2063;18 +206A;206F;18 +2070;2071;18 +2074;208E;18 +2090;2094;18 +2116;;8 +2160;217F;12 +2190;21EA;a12 +2126;;18 +2100;2138;18 +2153;2182;18 +2190;21EA;18 +3008;;1 +300A;;1 +300C;;1 +300E;;1 +3010;;1 +3014;;1 +3016;;1 +3018;;1 +301A;;1 +301D;;1 +3001;;2 +3009;;2 +300B;;2 +300D;;2 +300F;;2 +3011;;2 +3015;;2 +3017;;2 +3019;;2 +301B;;2 +301E;;2 +301F;;2 +3005;;3 +301C;;3 +3041;;3 +3043;;3 +3045;;3 +3047;;3 +3049;;3 +3063;;3 +3083;;3 +3085;;3 +3087;;3 +308E;;3 +309D;;3 +309E;;3 +30A1;;3 +30A3;;3 +30A5;;3 +30A7;;3 +30A9;;3 +30C3;;3 +30E3;;3 +30E5;;3 +30E7;;3 +30EE;;3 +30F5;;3 +30F6;;3 +30FC;;3 +30FD;;3 +30FE;;3 +30FB;;5 +3002;;6 +3000;;10 +3042;3094;11 +3099;309E;3 +3003;;12 +3004;;12 +3006;;12 +3007;;12 +3012;;12 +3013;;12 +3020;;12 +3036;;12 +30A2;30FA;12 diff --git a/intl/lwbrk/tools/jisx4051simp.txt b/intl/lwbrk/tools/jisx4051simp.txt new file mode 100644 index 000000000..e12a7fd80 --- /dev/null +++ b/intl/lwbrk/tools/jisx4051simp.txt @@ -0,0 +1,24 @@ +1;00_1 +2;01_[a] +3;01_[a] +4;01_[a] +5;01_[a] +6;01_[a] +7;02_7 +8;03_8 +9;04_9 +10;05_[b] +11;05_[b] +12;05_[b] +13;X +14;X +15;06_15 +16;X +17;05_[b] +18;07_18 +19;X +20;X +21;08_COMPLEX +22;09_[c] +23;0A_[d] +24;0B_[e] diff --git a/intl/lwbrk/tools/spec_table.html b/intl/lwbrk/tools/spec_table.html new file mode 100644 index 000000000..519f98c53 --- /dev/null +++ b/intl/lwbrk/tools/spec_table.html @@ -0,0 +1,127 @@ +<!-- This Source Code Form is subject to the terms of the Mozilla Public + - License, v. 2.0. If a copy of the MPL was not distributed with this + - file, You can obtain one at http://mozilla.org/MPL/2.0/. 
--> + +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> +<html> +<head> +<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> +<title></title> +<style type="text/css"> +table { + border: solid 1px; + border-collapse: collapse; +} +tbody, tfoot { + border-top: solid 2px; +} +td, th { + border: solid 1px; +} +td { + text-align: center; +} +</style> +</head> +<body> +<p>This is a specification table for line breaking.</p> +<p>In the IE7 and Opera9 columns, 'A' means that the line is breakable After the character, 'B' means Before, and 'BA' means both Before and After.</p> +<p>The (C) suffix on the IE7 and Opera9 columns means Character, and (N) means Numeric. +They indicate which kind of character surrounded the tested character. E.g., "a$a" is a testcase for (C), "0$0" is a testcase for (N).</p> +<p>Gecko does not break lines in most Western-language contexts. But for file paths, URLs and very long words that are joined by hyphens, +some characters might be breakable. They are marked 'breakable' in the table. 
However, they are not always breakable, +they <em>depend on the context</em> in the word.</p> +<table border="1"> +<thead> +<tr><th colspan="2">character</th><th>Gecko</th><th>IE7(C)</th><th>IE7(N)</th><th>Opera9.2(C)</th><th>Opera9.2(N)</th></tr> +</thead> +<tfoot> +<tr><th colspan="2">character</th><th>Gecko</th><th>IE7(C)</th><th>IE7(N)</th><th>Opera9.2(C)</th><th>Opera9.2(N)</th></tr> +</tfoot> +<tbody> +<tr><th>0x21</th><th>!</th><td></td><td>A</td><td>A</td><td></td><td></td></tr> +<tr><th>0x22</th><th>"</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x23</th><th>#</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x24</th><th>$</th><td></td><td></td><td>B</td><td></td><td></td></tr> +<tr><th>0x25</th><th>%</th><td>breakable</td><td>A</td><td>A</td><td></td><td></td></tr> +<tr><th>0x26</th><th>&</th><td>breakable</td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x27</th><th>'</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x28</th><th>(</th><td></td><td>B</td><td>B</td><td></td><td></td></tr> +<tr><th>0x29</th><th>)</th><td></td><td>A</td><td>A</td><td></td><td></td></tr> +<tr><th>0x2A</th><th>*</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x2B</th><th>+</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x2C</th><th>,</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x2D</th><th>-</th><td>breakable</td><td>BA</td><td>BA</td><td>A</td><td>A</td></tr> +<tr><th>0x2E</th><th>.</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x2F</th><th>/</th><td>breakable</td><td></td><td></td><td>A</td><td>A</td></tr> +</tbody> +<tbody> +<tr><th>0x3A</th><th>:</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x3B</th><th>;</th><td>breakable</td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x3C</th><th><</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x3D</th><th>=</th><td></td><td></td><td></td><td></td><td></td></tr> 
+<tr><th>0x3E</th><th>></th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x3F</th><th>?</th><td></td><td>A</td><td>A</td><td></td><td></td></tr> +</tbody> +<tbody> +<tr><th>0x40</th><th>@</th><td></td><td></td><td></td><td></td><td></td></tr> +</tbody> +<tbody> +<tr><th>0x5B</th><th>[</th><td></td><td>B</td><td>B</td><td></td><td></td></tr> +<tr><th>0x5C</th><th>\</th><td>breakable</td><td></td><td>B</td><td></td><td></td></tr> +<tr><th>0x5D</th><th>]</th><td></td><td>A</td><td>A</td><td></td><td></td></tr> +<tr><th>0x5E</th><th>^</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x5F</th><th>_</th><td></td><td></td><td></td><td></td><td></td></tr> +</tbody> +<tbody> +<tr><th>0x60</th><th>`</th><td></td><td></td><td></td><td></td><td></td></tr> +</tbody> +<tbody> +<tr><th>0x7B</th><th>{</th><td></td><td>B</td><td>B</td><td></td><td></td></tr> +<tr><th>0x7C</th><th>|</th><td></td><td></td><td></td><td>A</td><td>A</td></tr> +<tr><th>0x7D</th><th>}</th><td></td><td>A</td><td>A</td><td></td><td></td></tr> +<tr><th>0x7E</th><th>~</th><td></td><td></td><td></td><td></td><td></td></tr> +</tbody> +<tbody> +<tr><th>0xA1</th><th>¡</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xA2</th><th>¢</th><td></td><td>A</td><td>A</td><td></td><td></td></tr> +<tr><th>0xA3</th><th>£</th><td></td><td></td><td>B</td><td></td><td></td></tr> +<tr><th>0xA4</th><th>¤</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xA5</th><th>¥</th><td></td><td></td><td>B</td><td></td><td></td></tr> +<tr><th>0xA6</th><th>¦</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xA7</th><th>§</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xA8</th><th>¨</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xA9</th><th>©</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xAA</th><th>ª</th><td></td><td></td><td></td><td></td><td></td></tr> 
+<tr><th>0xAB</th><th>«</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xAC</th><th>¬</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xAE</th><th>®</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xAF</th><th>¯</th><td></td><td></td><td></td><td></td><td></td></tr> +</tbody> +<tbody> +<tr><th>0xB0</th><th>°</th><td></td><td>A</td><td>A</td><td></td><td></td></tr> +<tr><th>0xB1</th><th>±</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xB2</th><th>²</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xB3</th><th>³</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xB4</th><th>´</th><td></td><td></td><td></td><td>B</td><td>B</td></tr> +<tr><th>0xB5</th><th>µ</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xB6</th><th>¶</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xB7</th><th>·</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xB8</th><th>¸</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xB9</th><th>¹</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xBA</th><th>º</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xBB</th><th>»</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xBC</th><th>¼</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xBD</th><th>½</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xBE</th><th>¾</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xBF</th><th>¿</th><td></td><td></td><td></td><td></td><td></td></tr> +</tbody> +<tbody> +<tr><th>0xD7</th><th>×</th><td></td><td></td><td></td><td></td><td></td></tr> +</tbody> +<tbody> +<tr><th>0xF7</th><th>÷</th><td></td><td></td><td></td><td></td><td></td></tr> +</tbody> +</table> +</body> +</html> |