summaryrefslogtreecommitdiffstats
path: root/intl/unicharutil/nsUnicodeNormalizer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'intl/unicharutil/nsUnicodeNormalizer.cpp')
-rw-r--r--intl/unicharutil/nsUnicodeNormalizer.cpp704
1 files changed, 704 insertions, 0 deletions
diff --git a/intl/unicharutil/nsUnicodeNormalizer.cpp b/intl/unicharutil/nsUnicodeNormalizer.cpp
new file mode 100644
index 000000000..16c3a969c
--- /dev/null
+++ b/intl/unicharutil/nsUnicodeNormalizer.cpp
@@ -0,0 +1,704 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+
+/* This file is modified from JPNIC's mDNKit, it is under both MPL and
+ * JPNIC's license.
+ */
+
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/*
+ * Copyright (c) 2000,2002 Japan Network Information Center.
+ * All rights reserved.
+ *
+ * By using this file, you agree to the terms and conditions set forth bellow.
+ *
+ * LICENSE TERMS AND CONDITIONS
+ *
+ * The following License Terms and Conditions apply, unless a different
+ * license is obtained from Japan Network Information Center ("JPNIC"),
+ * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
+ * Chiyoda-ku, Tokyo 101-0047, Japan.
+ *
+ * 1. Use, Modification and Redistribution (including distribution of any
+ * modified or derived work) in source and/or binary forms is permitted
+ * under this License Terms and Conditions.
+ *
+ * 2. Redistribution of source code must retain the copyright notices as they
+ * appear in each source code file, this License Terms and Conditions.
+ *
+ * 3. Redistribution in binary form must reproduce the Copyright Notice,
+ * this License Terms and Conditions, in the documentation and/or other
+ * materials provided with the distribution. For the purposes of binary
+ * distribution the "Copyright Notice" refers to the following language:
+ * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved."
+ *
+ * 4. The name of JPNIC may not be used to endorse or promote products
+ * derived from this Software without specific prior written approval of
+ * JPNIC.
+ *
+ * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+ */
+
+#include <string.h>
+
+#include "nsMemory.h"
+#include "nsUnicodeNormalizer.h"
+#include "nsString.h"
+#include "mozilla/BinarySearch.h"
+
+NS_IMPL_ISUPPORTS(nsUnicodeNormalizer, nsIUnicodeNormalizer)
+
+
+nsUnicodeNormalizer::nsUnicodeNormalizer()
+{
+}
+
+nsUnicodeNormalizer::~nsUnicodeNormalizer()
+{
+}
+
+
+
+#define END_BIT 0x80000000
+
+
+/*
+ * Some constants for Hangul decomposition/composition.
+ * These things were taken from unicode book.
+ */
+#define SBase 0xac00
+#define LBase 0x1100
+#define VBase 0x1161
+#define TBase 0x11a7
+#define LCount 19
+#define VCount 21
+#define TCount 28
+#define SLast (SBase + LCount * VCount * TCount)
+
+struct composition {
+ uint32_t c2; /* 2nd character */
+ uint32_t comp; /* composed character */
+};
+
+
+#include "normalization_data.h"
+
+/*
+ * Macro for multi-level index table.
+ */
+#define LOOKUPTBL(vprefix, mprefix, v) \
+ DMAP(vprefix)[\
+ IMAP(vprefix)[\
+ IMAP(vprefix)[IDX0(mprefix, v)] + IDX1(mprefix, v)\
+ ]\
+ ].tbl[IDX2(mprefix, v)]
+
+#define IDX0(mprefix, v) IDX_0(v, BITS1(mprefix), BITS2(mprefix))
+#define IDX1(mprefix, v) IDX_1(v, BITS1(mprefix), BITS2(mprefix))
+#define IDX2(mprefix, v) IDX_2(v, BITS1(mprefix), BITS2(mprefix))
+
+#define IDX_0(v, bits1, bits2) ((v) >> ((bits1) + (bits2)))
+#define IDX_1(v, bits1, bits2) (((v) >> (bits2)) & ((1 << (bits1)) - 1))
+#define IDX_2(v, bits1, bits2) ((v) & ((1 << (bits2)) - 1))
+
+#define BITS1(mprefix) mprefix ## _BITS_1
+#define BITS2(mprefix) mprefix ## _BITS_2
+
+#define IMAP(vprefix) vprefix ## _imap
+#define DMAP(vprefix) vprefix ## _table
+#define SEQ(vprefix) vprefix ## _seq
+
+static int32_t
+canonclass(uint32_t c) {
+ /* Look up canonicalclass table. */
+ return (LOOKUPTBL(canon_class, CANON_CLASS, c));
+}
+
+static int32_t
+decompose_char(uint32_t c, const uint32_t **seqp)
+{
+ /* Look up decomposition table. */
+ int32_t seqidx = LOOKUPTBL(decompose, DECOMP, c);
+ *seqp = SEQ(decompose) + (seqidx & ~DECOMP_COMPAT);
+ return (seqidx);
+}
+
+static int32_t
+compose_char(uint32_t c,
+ const struct composition **compp)
+{
+ /* Look up composition table. */
+ int32_t seqidx = LOOKUPTBL(compose, CANON_COMPOSE, c);
+ *compp = SEQ(compose) + (seqidx & 0xffff);
+ return (seqidx >> 16);
+}
+
+static nsresult
+mdn__unicode_decompose(int32_t compat, uint32_t *v, size_t vlen,
+ uint32_t c, int32_t *decomp_lenp)
+{
+ uint32_t *vorg = v;
+ int32_t seqidx;
+ const uint32_t *seq;
+
+ //assert(v != nullptr && vlen >= 0 && decomp_lenp != nullptr);
+
+ /*
+ * First, check for Hangul.
+ */
+ if (SBase <= c && c < SLast) {
+ int32_t idx, t_offset, v_offset, l_offset;
+
+ idx = c - SBase;
+ t_offset = idx % TCount;
+ idx /= TCount;
+ v_offset = idx % VCount;
+ l_offset = idx / VCount;
+ if ((t_offset == 0 && vlen < 2) || (t_offset > 0 && vlen < 3))
+ return (NS_ERROR_UNORM_MOREOUTPUT);
+ *v++ = LBase + l_offset;
+ *v++ = VBase + v_offset;
+ if (t_offset > 0)
+ *v++ = TBase + t_offset;
+ *decomp_lenp = v - vorg;
+ return (NS_OK);
+ }
+
+ /*
+ * Look up decomposition table. If no decomposition is defined
+ * or if it is a compatibility decomosition when canonical
+ * decomposition requested, return 'NS_SUCCESS_UNORM_NOTFOUND'.
+ */
+ seqidx = decompose_char(c, &seq);
+ if (seqidx == 0 || (compat == 0 && (seqidx & DECOMP_COMPAT) != 0))
+ return (NS_SUCCESS_UNORM_NOTFOUND);
+
+ /*
+ * Copy the decomposed sequence. The end of the sequence are
+ * marked with END_BIT.
+ */
+ do {
+ uint32_t c;
+ int32_t dlen;
+ nsresult r;
+
+ c = *seq & ~END_BIT;
+
+ /* Decompose recursively. */
+ r = mdn__unicode_decompose(compat, v, vlen, c, &dlen);
+ if (r == NS_OK) {
+ v += dlen;
+ vlen -= dlen;
+ } else if (r == NS_SUCCESS_UNORM_NOTFOUND) {
+ if (vlen < 1)
+ return (NS_ERROR_UNORM_MOREOUTPUT);
+ *v++ = c;
+ vlen--;
+ } else {
+ return (r);
+ }
+
+ } while ((*seq++ & END_BIT) == 0);
+
+ *decomp_lenp = v - vorg;
+
+ return (NS_OK);
+}
+
+static int32_t
+mdn__unicode_iscompositecandidate(uint32_t c)
+{
+ const struct composition *dummy;
+
+ /* Check for Hangul */
+ if ((LBase <= c && c < LBase + LCount) || (SBase <= c && c < SLast))
+ return (1);
+
+ /*
+ * Look up composition table. If there are no composition
+ * that begins with the given character, it is not a
+ * composition candidate.
+ */
+ if (compose_char(c, &dummy) == 0)
+ return (0);
+ else
+ return (1);
+}
+
+namespace {
+
+struct SequenceAdaptor
+{
+ const composition* const mSequence;
+ explicit SequenceAdaptor(const composition* aSequence) : mSequence(aSequence) {}
+ uint32_t operator[](size_t aIdx) const {
+ return mSequence[aIdx].c2;
+ }
+};
+
+} // namespace
+
+static nsresult
+mdn__unicode_compose(uint32_t c1, uint32_t c2, uint32_t *compp)
+{
+ int32_t n;
+ const struct composition *cseq;
+
+ //assert(compp != nullptr);
+
+ /*
+ * Check for Hangul.
+ */
+ if (LBase <= c1 && c1 < LBase + LCount &&
+ VBase <= c2 && c2 < VBase + VCount) {
+ /*
+ * Hangul L and V.
+ */
+ *compp = SBase +
+ ((c1 - LBase) * VCount + (c2 - VBase)) * TCount;
+ return (NS_OK);
+ } else if (SBase <= c1 && c1 < SLast &&
+ TBase <= c2 && c2 < TBase + TCount &&
+ (c1 - SBase) % TCount == 0) {
+ /*
+ * Hangul LV and T.
+ */
+ *compp = c1 + (c2 - TBase);
+ return (NS_OK);
+ }
+
+ /*
+ * Look up composition table. If the result is 0, no composition
+ * is defined. Otherwise, upper 16bits of the result contains
+ * the number of composition that begins with 'c1', and the lower
+ * 16bits is the offset in 'compose_seq'.
+ */
+ if ((n = compose_char(c1, &cseq)) == 0)
+ return (NS_SUCCESS_UNORM_NOTFOUND);
+
+ /*
+ * The composite sequences are sorted by the 2nd character 'c2'.
+ * So we can use binary search.
+ */
+ size_t idx;
+ if (mozilla::BinarySearch(SequenceAdaptor(cseq), 0, n, c2, &idx)) {
+ *compp = cseq[idx].comp;
+ return (NS_OK);
+ }
+
+ return (NS_SUCCESS_UNORM_NOTFOUND);
+}
+
+
+#define WORKBUF_SIZE 128
+#define WORKBUF_SIZE_MAX 10000
+
+typedef struct {
+ int32_t cur; /* pointing now processing character */
+ int32_t last; /* pointing just after the last character */
+ int32_t size; /* size of UCS and CLASS array */
+ uint32_t *ucs; /* UCS-4 characters */
+ int32_t *cclass; /* and their canonical classes */
+ uint32_t ucs_buf[WORKBUF_SIZE]; /* local buffer */
+ int32_t class_buf[WORKBUF_SIZE]; /* ditto */
+} workbuf_t;
+
+static nsresult decompose(workbuf_t *wb, uint32_t c, int32_t compat);
+static void get_class(workbuf_t *wb);
+static void reorder(workbuf_t *wb);
+static void compose(workbuf_t *wb);
+static nsresult flush_before_cur(workbuf_t *wb, nsAString& aToStr);
+static void workbuf_init(workbuf_t *wb);
+static void workbuf_free(workbuf_t *wb);
+static nsresult workbuf_extend(workbuf_t *wb);
+static nsresult workbuf_append(workbuf_t *wb, uint32_t c);
+static void workbuf_shift(workbuf_t *wb, int32_t shift);
+static void workbuf_removevoid(workbuf_t *wb);
+
+
+static nsresult
+mdn_normalize(bool do_composition, bool compat,
+ const nsAString& aSrcStr, nsAString& aToStr)
+{
+ workbuf_t wb;
+ nsresult r = NS_OK;
+ /*
+ * Initialize working buffer.
+ */
+ workbuf_init(&wb);
+
+ nsAString::const_iterator start, end;
+ aSrcStr.BeginReading(start);
+ aSrcStr.EndReading(end);
+
+ while (start != end) {
+ uint32_t c;
+ char16_t curChar;
+
+ //assert(wb.cur == wb.last);
+
+ /*
+ * Get one character from 'from'.
+ */
+ curChar= *start++;
+
+ if (NS_IS_HIGH_SURROGATE(curChar) && start != end && NS_IS_LOW_SURROGATE(*(start)) ) {
+ c = SURROGATE_TO_UCS4(curChar, *start);
+ ++start;
+ } else {
+ c = curChar;
+ }
+
+ /*
+ * Decompose it.
+ */
+ if ((r = decompose(&wb, c, compat)) != NS_OK)
+ break;
+
+ /*
+ * Get canonical class.
+ */
+ get_class(&wb);
+
+ /*
+ * Reorder & compose.
+ */
+ for (; wb.cur < wb.last; wb.cur++) {
+ if (wb.cur == 0) {
+ continue;
+ } else if (wb.cclass[wb.cur] > 0) {
+ /*
+ * This is not a starter. Try reordering.
+ * Note that characters up to it are
+ * already in canonical order.
+ */
+ reorder(&wb);
+ continue;
+ }
+
+ /*
+ * This is a starter character, and there are
+ * some characters before it. Those characters
+ * have been reordered properly, and
+ * ready for composition.
+ */
+ if (do_composition && wb.cclass[0] == 0)
+ compose(&wb);
+
+ /*
+ * If CUR points to a starter character,
+ * then process of characters before CUR are
+ * already finished, because any further
+ * reordering/composition for them are blocked
+ * by the starter CUR points.
+ */
+ if (wb.cur > 0 && wb.cclass[wb.cur] == 0) {
+ /* Flush everything before CUR. */
+ r = flush_before_cur(&wb, aToStr);
+ if (r != NS_OK)
+ break;
+ }
+ }
+ }
+
+ if (r == NS_OK) {
+ if (do_composition && wb.cur > 0 && wb.cclass[0] == 0) {
+ /*
+ * There is some characters left in WB.
+ * They are ordered, but not composed yet.
+ * Now CUR points just after the last character in WB,
+ * and since compose() tries to compose characters
+ * between top and CUR inclusive, we must make CUR
+ * one character back during compose().
+ */
+ wb.cur--;
+ compose(&wb);
+ wb.cur++;
+ }
+ /*
+ * Call this even when WB.CUR == 0, to make TO
+ * NUL-terminated.
+ */
+ r = flush_before_cur(&wb, aToStr);
+ }
+
+ workbuf_free(&wb);
+
+ return (r);
+}
+
+static nsresult
+decompose(workbuf_t *wb, uint32_t c, int32_t compat) {
+ nsresult r;
+ int32_t dec_len;
+
+again:
+ r = mdn__unicode_decompose(compat, wb->ucs + wb->last,
+ wb->size - wb->last, c, &dec_len);
+ switch (r) {
+ case NS_OK:
+ wb->last += dec_len;
+ return (NS_OK);
+ case NS_SUCCESS_UNORM_NOTFOUND:
+ return (workbuf_append(wb, c));
+ case NS_ERROR_UNORM_MOREOUTPUT:
+ if ((r = workbuf_extend(wb)) != NS_OK)
+ return (r);
+ if (wb->size > WORKBUF_SIZE_MAX) {
+ // "mdn__unormalize_form*: " "working buffer too large\n"
+ return (NS_ERROR_FAILURE);
+ }
+ goto again;
+ default:
+ return (r);
+ }
+ /* NOTREACHED */
+}
+
+static void
+get_class(workbuf_t *wb) {
+ int32_t i;
+
+ for (i = wb->cur; i < wb->last; i++)
+ wb->cclass[i] = canonclass(wb->ucs[i]);
+}
+
+static void
+reorder(workbuf_t *wb) {
+ uint32_t c;
+ int32_t i;
+ int32_t cclass;
+
+ //assert(wb != nullptr);
+
+ i = wb->cur;
+ c = wb->ucs[i];
+ cclass = wb->cclass[i];
+
+ while (i > 0 && wb->cclass[i - 1] > cclass) {
+ wb->ucs[i] = wb->ucs[i - 1];
+ wb->cclass[i] =wb->cclass[i - 1];
+ i--;
+ wb->ucs[i] = c;
+ wb->cclass[i] = cclass;
+ }
+}
+
+static void
+compose(workbuf_t *wb) {
+ int32_t cur;
+ uint32_t *ucs;
+ int32_t *cclass;
+ int32_t last_class;
+ int32_t nvoids;
+ int32_t i;
+
+ //assert(wb != nullptr && wb->cclass[0] == 0);
+
+ cur = wb->cur;
+ ucs = wb->ucs;
+ cclass = wb->cclass;
+
+ /*
+ * If there are no decomposition sequence that begins with
+ * the top character, composition is impossible.
+ */
+ if (!mdn__unicode_iscompositecandidate(ucs[0]))
+ return;
+
+ last_class = 0;
+ nvoids = 0;
+ for (i = 1; i <= cur; i++) {
+ uint32_t c;
+ int32_t cl = cclass[i];
+
+ if ((last_class < cl || cl == 0) &&
+ mdn__unicode_compose(ucs[0], ucs[i],
+ &c) == NS_OK) {
+ /*
+ * Replace the top character with the composed one.
+ */
+ ucs[0] = c;
+ cclass[0] = canonclass(c);
+
+ cclass[i] = -1; /* void this character */
+ nvoids++;
+ } else {
+ last_class = cl;
+ }
+ }
+
+ /* Purge void characters, if any. */
+ if (nvoids > 0)
+ workbuf_removevoid(wb);
+}
+
+static nsresult
+flush_before_cur(workbuf_t *wb, nsAString& aToStr)
+{
+ int32_t i;
+
+ for (i = 0; i < wb->cur; i++) {
+ if (!IS_IN_BMP(wb->ucs[i])) {
+ aToStr.Append((char16_t)H_SURROGATE(wb->ucs[i]));
+ aToStr.Append((char16_t)L_SURROGATE(wb->ucs[i]));
+ } else {
+ aToStr.Append((char16_t)(wb->ucs[i]));
+ }
+ }
+
+ workbuf_shift(wb, wb->cur);
+
+ return (NS_OK);
+}
+
+static void
+workbuf_init(workbuf_t *wb) {
+ wb->cur = 0;
+ wb->last = 0;
+ wb->size = WORKBUF_SIZE;
+ wb->ucs = wb->ucs_buf;
+ wb->cclass = wb->class_buf;
+}
+
+static void
+workbuf_free(workbuf_t *wb) {
+ if (wb->ucs != wb->ucs_buf) {
+ free(wb->ucs);
+ free(wb->cclass);
+ }
+}
+
+static nsresult
+workbuf_extend(workbuf_t *wb) {
+ int32_t newsize = wb->size * 3;
+
+ if (wb->ucs == wb->ucs_buf) {
+ wb->ucs = (uint32_t*)moz_xmalloc(sizeof(wb->ucs[0]) * newsize);
+ if (!wb->ucs)
+ return NS_ERROR_OUT_OF_MEMORY;
+ wb->cclass = (int32_t*)moz_xmalloc(sizeof(wb->cclass[0]) * newsize);
+ if (!wb->cclass) {
+ free(wb->ucs);
+ wb->ucs = nullptr;
+ return NS_ERROR_OUT_OF_MEMORY;
+ }
+ } else {
+ void* buf = moz_xrealloc(wb->ucs, sizeof(wb->ucs[0]) * newsize);
+ if (!buf)
+ return NS_ERROR_OUT_OF_MEMORY;
+ wb->ucs = (uint32_t*)buf;
+ buf = moz_xrealloc(wb->cclass, sizeof(wb->cclass[0]) * newsize);
+ if (!buf)
+ return NS_ERROR_OUT_OF_MEMORY;
+ wb->cclass = (int32_t*)buf;
+ }
+ return (NS_OK);
+}
+
+static nsresult
+workbuf_append(workbuf_t *wb, uint32_t c) {
+ nsresult r;
+
+ if (wb->last >= wb->size && (r = workbuf_extend(wb)) != NS_OK)
+ return (r);
+ wb->ucs[wb->last++] = c;
+ return (NS_OK);
+}
+
+static void
+workbuf_shift(workbuf_t *wb, int32_t shift) {
+ int32_t nmove;
+
+ //assert(wb != nullptr && wb->cur >= shift);
+
+ nmove = wb->last - shift;
+ memmove(&wb->ucs[0], &wb->ucs[shift],
+ nmove * sizeof(wb->ucs[0]));
+ memmove(&wb->cclass[0], &wb->cclass[shift],
+ nmove * sizeof(wb->cclass[0]));
+ wb->cur -= shift;
+ wb->last -= shift;
+}
+
+static void
+workbuf_removevoid(workbuf_t *wb) {
+ int32_t i, j;
+ int32_t last = wb->last;
+
+ for (i = j = 0; i < last; i++) {
+ if (wb->cclass[i] >= 0) {
+ if (j < i) {
+ wb->ucs[j] = wb->ucs[i];
+ wb->cclass[j] = wb->cclass[i];
+ }
+ j++;
+ }
+ }
+ wb->cur -= last - j;
+ wb->last = j;
+}
+
+nsresult
+nsUnicodeNormalizer::NormalizeUnicodeNFD( const nsAString& aSrc, nsAString& aDest)
+{
+ return mdn_normalize(false, false, aSrc, aDest);
+}
+
+nsresult
+nsUnicodeNormalizer::NormalizeUnicodeNFC( const nsAString& aSrc, nsAString& aDest)
+{
+ return mdn_normalize(true, false, aSrc, aDest);
+}
+
+nsresult
+nsUnicodeNormalizer::NormalizeUnicodeNFKD( const nsAString& aSrc, nsAString& aDest)
+{
+ return mdn_normalize(false, true, aSrc, aDest);
+}
+
+nsresult
+nsUnicodeNormalizer::NormalizeUnicodeNFKC( const nsAString& aSrc, nsAString& aDest)
+{
+ return mdn_normalize(true, true, aSrc, aDest);
+}
+
+bool
+nsUnicodeNormalizer::Compose(uint32_t a, uint32_t b, uint32_t *ab)
+{
+ return mdn__unicode_compose(a, b, ab) == NS_OK;
+}
+
+bool
+nsUnicodeNormalizer::DecomposeNonRecursively(uint32_t c, uint32_t *c1, uint32_t *c2)
+{
+ // We can't use mdn__unicode_decompose here, because that does a recursive
+ // decomposition that may yield more than two characters, but the harfbuzz
+ // callback wants just a single-step decomp that is guaranteed to produce
+ // no more than two characters. So we do a low-level lookup in the table
+ // of decomp sequences.
+ const uint32_t *seq;
+ uint32_t seqidx = decompose_char(c, &seq);
+ if (seqidx == 0 || ((seqidx & DECOMP_COMPAT) != 0)) {
+ return false;
+ }
+ *c1 = *seq & ~END_BIT;
+ if (*seq & END_BIT) {
+ *c2 = 0;
+ } else {
+ *c2 = *++seq & ~END_BIT;
+ }
+ return true;
+}