diff options
Diffstat (limited to 'intl/hyphenation/glue/nsHyphenator.cpp')
-rw-r--r-- | intl/hyphenation/glue/nsHyphenator.cpp | 159 |
1 files changed, 159 insertions, 0 deletions
diff --git a/intl/hyphenation/glue/nsHyphenator.cpp b/intl/hyphenation/glue/nsHyphenator.cpp new file mode 100644 index 000000000..bcb87baf6 --- /dev/null +++ b/intl/hyphenation/glue/nsHyphenator.cpp @@ -0,0 +1,159 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsHyphenator.h" +#include "nsIFile.h" +#include "nsUTF8Utils.h" +#include "nsUnicodeProperties.h" +#include "nsUnicharUtilCIID.h" +#include "nsIURI.h" + +#include "hyphen.h" + +nsHyphenator::nsHyphenator(nsIURI *aURI) + : mDict(nullptr) +{ + nsCString uriSpec; + nsresult rv = aURI->GetSpec(uriSpec); + if (NS_FAILED(rv)) { + return; + } + mDict = hnj_hyphen_load(uriSpec.get()); +#ifdef DEBUG + if (mDict) { + printf("loaded hyphenation patterns from %s\n", uriSpec.get()); + } +#endif +} + +nsHyphenator::~nsHyphenator() +{ + if (mDict != nullptr) { + hnj_hyphen_free((HyphenDict*)mDict); + mDict = nullptr; + } +} + +bool +nsHyphenator::IsValid() +{ + return (mDict != nullptr); +} + +nsresult +nsHyphenator::Hyphenate(const nsAString& aString, nsTArray<bool>& aHyphens) +{ + if (!aHyphens.SetLength(aString.Length(), mozilla::fallible)) { + return NS_ERROR_OUT_OF_MEMORY; + } + memset(aHyphens.Elements(), false, aHyphens.Length() * sizeof(bool)); + + bool inWord = false; + uint32_t wordStart = 0, wordLimit = 0; + uint32_t chLen; + for (uint32_t i = 0; i < aString.Length(); i += chLen) { + uint32_t ch = aString[i]; + chLen = 1; + + if (NS_IS_HIGH_SURROGATE(ch)) { + if (i + 1 < aString.Length() && NS_IS_LOW_SURROGATE(aString[i+1])) { + ch = SURROGATE_TO_UCS4(ch, aString[i+1]); + chLen = 2; + } else { + NS_WARNING("unpaired surrogate found during hyphenation"); + } + } + + nsIUGenCategory::nsUGenCategory cat = mozilla::unicode::GetGenCategory(ch); + if (cat == nsIUGenCategory::kLetter || cat == nsIUGenCategory::kMark) { + if (!inWord) { + inWord = true; + wordStart = i; + } + wordLimit = i + chLen; + if (i + chLen < aString.Length()) { + continue; + } + } + + if (inWord) { + // Convert the word to utf-8 for libhyphen, lowercasing it as we go + // so that it will match the (lowercased) patterns (bug 1105644). + nsAutoCString utf8; + const char16_t* const begin = aString.BeginReading(); + const char16_t *cur = begin + wordStart; + const char16_t *end = begin + wordLimit; + while (cur < end) { + uint32_t ch = *cur++; + + if (NS_IS_HIGH_SURROGATE(ch)) { + if (cur < end && NS_IS_LOW_SURROGATE(*cur)) { + ch = SURROGATE_TO_UCS4(ch, *cur++); + } else { + ch = 0xfffd; // unpaired surrogate, treat as REPLACEMENT CHAR + } + } else if (NS_IS_LOW_SURROGATE(ch)) { + ch = 0xfffd; // unpaired surrogate + } + + // XXX What about language-specific casing? Consider Turkish I/i... + // In practice, it looks like the current patterns will not be + // affected by this, as they treat dotted and undotted i similarly. + ch = ToLowerCase(ch); + + if (ch < 0x80) { // U+0000 - U+007F + utf8.Append(ch); + } else if (ch < 0x0800) { // U+0100 - U+07FF + utf8.Append(0xC0 | (ch >> 6)); + utf8.Append(0x80 | (0x003F & ch)); + } else if (ch < 0x10000) { // U+0800 - U+D7FF,U+E000 - U+FFFF + utf8.Append(0xE0 | (ch >> 12)); + utf8.Append(0x80 | (0x003F & (ch >> 6))); + utf8.Append(0x80 | (0x003F & ch)); + } else { + utf8.Append(0xF0 | (ch >> 18)); + utf8.Append(0x80 | (0x003F & (ch >> 12))); + utf8.Append(0x80 | (0x003F & (ch >> 6))); + utf8.Append(0x80 | (0x003F & ch)); + } + } + + AutoTArray<char,200> utf8hyphens; + utf8hyphens.SetLength(utf8.Length() + 5); + char **rep = nullptr; + int *pos = nullptr; + int *cut = nullptr; + int err = hnj_hyphen_hyphenate2((HyphenDict*)mDict, + utf8.BeginReading(), utf8.Length(), + utf8hyphens.Elements(), nullptr, + &rep, &pos, &cut); + if (!err) { + // Surprisingly, hnj_hyphen_hyphenate2 converts the 'hyphens' buffer + // from utf8 code unit indexing (which would match the utf8 input + // string directly) to Unicode character indexing. + // We then need to convert this to utf16 code unit offsets for Gecko. + const char *hyphPtr = utf8hyphens.Elements(); + const char16_t *cur = begin + wordStart; + const char16_t *end = begin + wordLimit; + while (cur < end) { + if (*hyphPtr & 0x01) { + aHyphens[cur - begin] = true; + } + cur++; + if (cur < end && NS_IS_LOW_SURROGATE(*cur) && + NS_IS_HIGH_SURROGATE(*(cur-1))) + { + cur++; + } + hyphPtr++; + } + } + } + + inWord = false; + } + + return NS_OK; +} |