summaryrefslogtreecommitdiffstats
path: root/xpcom/string/nsUTF8Utils.h
diff options
context:
space:
mode:
Diffstat (limited to 'xpcom/string/nsUTF8Utils.h')
-rw-r--r--xpcom/string/nsUTF8Utils.h742
1 files changed, 742 insertions, 0 deletions
diff --git a/xpcom/string/nsUTF8Utils.h b/xpcom/string/nsUTF8Utils.h
new file mode 100644
index 000000000..9f38fa555
--- /dev/null
+++ b/xpcom/string/nsUTF8Utils.h
@@ -0,0 +1,742 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef nsUTF8Utils_h_
+#define nsUTF8Utils_h_
+
+// This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
+// file will provide signatures for the Mozilla abstract string types. It will
+// use XPCOM assertion/debugging macros, etc.
+
+#include "nscore.h"
+#include "mozilla/Assertions.h"
+#include "mozilla/SSE.h"
+#include "mozilla/TypeTraits.h"
+
+#include "nsCharTraits.h"
+
+/**
+ * Predicates that classify a single UTF-8 code unit by the fixed bit
+ * pattern in its high-order bits.
+ */
+class UTF8traits
+{
+public:
+  // 0xxxxxxx
+  static bool isASCII(char aChar) { return hasPrefix(aChar, 0x80, 0x00); }
+
+  // 10xxxxxx (continuation byte)
+  static bool isInSeq(char aChar) { return hasPrefix(aChar, 0xC0, 0x80); }
+
+  // 110xxxxx
+  static bool is2byte(char aChar) { return hasPrefix(aChar, 0xE0, 0xC0); }
+
+  // 1110xxxx
+  static bool is3byte(char aChar) { return hasPrefix(aChar, 0xF0, 0xE0); }
+
+  // 11110xxx
+  static bool is4byte(char aChar) { return hasPrefix(aChar, 0xF8, 0xF0); }
+
+  // 111110xx
+  static bool is5byte(char aChar) { return hasPrefix(aChar, 0xFC, 0xF8); }
+
+  // 1111110x
+  static bool is6byte(char aChar) { return hasPrefix(aChar, 0xFE, 0xFC); }
+
+private:
+  // True when the bits of aChar selected by aMask equal aPattern.
+  static bool hasPrefix(char aChar, int aMask, int aPattern)
+  {
+    return (aChar & aMask) == aPattern;
+  }
+};
+
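+/**
+ * Extract the next UCS-4 character from the buffer and return it. On
+ * success the pointer passed in is advanced to the start of the next
+ * character in the buffer. The err parameter must be non-null: it is set
+ * to true if the input is exhausted or malformed (0 is returned and the
+ * pointer is left unchanged) and to false otherwise. Overlong sequences,
+ * surrogates and code points outside the Unicode range are not reported
+ * as errors; they are returned as UCS2_REPLACEMENT_CHAR.
+ */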
+
+class UTF8CharEnumerator
+{
+public:
+ // Decodes one UTF-8 sequence starting at *aBuffer, never reading at or
+ // beyond aEnd. aErr is written unconditionally, so it must not be null.
+ //
+ // On success *aErr is false, *aBuffer is advanced past the sequence and
+ // the decoded value is returned. Overlong encodings, surrogates and
+ // values >= UCS_END are returned as UCS2_REPLACEMENT_CHAR and are not
+ // flagged through *aErr.
+ //
+ // On failure (no input, invalid lead byte, sequence truncated by aEnd, or
+ // a byte that is not a continuation byte) *aErr is true, 0 is returned
+ // and *aBuffer is left unchanged.
+ static uint32_t NextChar(const char** aBuffer, const char* aEnd, bool* aErr)
+ {
+ NS_ASSERTION(aBuffer && *aBuffer, "null buffer!");
+
+ const char* p = *aBuffer;
+ *aErr = false;
+
+ if (p >= aEnd) {
+ *aErr = true;
+
+ return 0;
+ }
+
+ char c = *p++;
+
+ // Fast path: a single-byte character needs no further decoding.
+ if (UTF8traits::isASCII(c)) {
+ *aBuffer = p;
+ return c;
+ }
+
+ uint32_t ucs4;
+ uint32_t minUcs4;
+ // Number of continuation bytes still expected; set by CalcState.
+ int32_t state = 0;
+
+ if (!CalcState(c, ucs4, minUcs4, state)) {
+ NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
+ *aErr = true;
+
+ return 0;
+ }
+
+ // The post-decrement means that inside the body |state| is the number of
+ // continuation bytes that follow the one being consumed.
+ while (state--) {
+ if (p == aEnd) {
+ *aErr = true;
+
+ return 0;
+ }
+
+ c = *p++;
+
+ if (!AddByte(c, state, ucs4)) {
+ *aErr = true;
+
+ return 0;
+ }
+ }
+
+ if (ucs4 < minUcs4) {
+ // Overlong sequence
+ ucs4 = UCS2_REPLACEMENT_CHAR;
+ } else if (ucs4 >= 0xD800 &&
+ (ucs4 <= 0xDFFF || ucs4 >= UCS_END)) {
+ // Surrogates and code points outside the Unicode range.
+ ucs4 = UCS2_REPLACEMENT_CHAR;
+ }
+
+ *aBuffer = p;
+ return ucs4;
+ }
+
+private:
+ // Classifies the lead byte aChar of a multi-byte sequence. On success it
+ // stores the lead byte's payload bits, shifted to their final position,
+ // in aUcs4; the number of continuation bytes to read in aState; and the
+ // smallest value that needs a sequence of this length in aMinUcs4 (used
+ // by the caller to detect overlong encodings). Returns false, leaving the
+ // outputs untouched, if aChar is not a 2- to 6-byte lead byte.
+ static bool CalcState(char aChar, uint32_t& aUcs4, uint32_t& aMinUcs4,
+ int32_t& aState)
+ {
+ if (UTF8traits::is2byte(aChar)) {
+ aUcs4 = (uint32_t(aChar) << 6) & 0x000007C0L;
+ aState = 1;
+ aMinUcs4 = 0x00000080;
+ } else if (UTF8traits::is3byte(aChar)) {
+ aUcs4 = (uint32_t(aChar) << 12) & 0x0000F000L;
+ aState = 2;
+ aMinUcs4 = 0x00000800;
+ } else if (UTF8traits::is4byte(aChar)) {
+ aUcs4 = (uint32_t(aChar) << 18) & 0x001F0000L;
+ aState = 3;
+ aMinUcs4 = 0x00010000;
+ } else if (UTF8traits::is5byte(aChar)) {
+ aUcs4 = (uint32_t(aChar) << 24) & 0x03000000L;
+ aState = 4;
+ aMinUcs4 = 0x00200000;
+ } else if (UTF8traits::is6byte(aChar)) {
+ aUcs4 = (uint32_t(aChar) << 30) & 0x40000000L;
+ aState = 5;
+ aMinUcs4 = 0x04000000;
+ } else {
+ return false;
+ }
+
+ return true;
+ }
+
+ // Merges the six payload bits of continuation byte aChar into aUcs4.
+ // aState is the number of continuation bytes that come after this one, so
+ // the payload lands at bit position aState * 6. Returns false, without
+ // modifying aUcs4, if aChar is not a continuation byte.
+ static bool AddByte(char aChar, int32_t aState, uint32_t& aUcs4)
+ {
+ if (UTF8traits::isInSeq(aChar)) {
+ int32_t shift = aState * 6;
+ aUcs4 |= (uint32_t(aChar) & 0x3F) << shift;
+ return true;
+ }
+
+ return false;
+ }
+};
+
+
+/**
+ * Extract the next UCS-4 character from the buffer and return it. The
+ * pointer passed in is advanced to the start of the next character in the
+ * buffer. If non-null, the err parameter is filled in if an error occurs.
+ *
+ * If an error occurs that causes UCS2_REPLACEMENT_CHAR to be returned, then
+ * the buffer will be updated to move only a single UCS-2 character.
+ *
+ * Any other error returns 0 and does not move the buffer position.
+ */
+
+
+class UTF16CharEnumerator
+{
+public:
+ // Decodes one code point from the UTF-16 data at *aBuffer, reading no
+ // code unit at or beyond aEnd. aErr is optional; when non-null it is set
+ // to false on success and true on every error path.
+ //
+ // Outcomes:
+ // - no input (*aBuffer >= aEnd): returns 0, *aBuffer unchanged.
+ // - non-surrogate, or a valid high/low pair: returns the code point and
+ // advances *aBuffer past it (one or two code units).
+ // - unpaired surrogate: returns 0xFFFD and advances *aBuffer by exactly
+ // one code unit.
+ static uint32_t NextChar(const char16_t** aBuffer, const char16_t* aEnd,
+ bool* aErr = nullptr)
+ {
+ NS_ASSERTION(aBuffer && *aBuffer, "null buffer!");
+
+ const char16_t* p = *aBuffer;
+
+ if (p >= aEnd) {
+ NS_ERROR("No input to work with");
+ if (aErr) {
+ *aErr = true;
+ }
+
+ return 0;
+ }
+
+ char16_t c = *p++;
+
+ if (!IS_SURROGATE(c)) { // U+0000 - U+D7FF,U+E000 - U+FFFF
+ if (aErr) {
+ *aErr = false;
+ }
+ *aBuffer = p;
+ return c;
+ } else if (NS_IS_HIGH_SURROGATE(c)) { // U+D800 - U+DBFF
+ if (p == aEnd) {
+ // Found a high surrogate at the end of the buffer. Flag this
+ // as an error and return the Unicode replacement
+ // character 0xFFFD.
+
+ NS_WARNING("Unexpected end of buffer after high surrogate");
+
+ if (aErr) {
+ *aErr = true;
+ }
+ *aBuffer = p;
+ return 0xFFFD;
+ }
+
+ // D800- DBFF - High Surrogate
+ char16_t h = c;
+
+ c = *p++;
+
+ if (NS_IS_LOW_SURROGATE(c)) {
+ // DC00- DFFF - Low Surrogate
+ // N = (H - D800) *400 + 10000 + (L - DC00)
+ uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
+ if (aErr) {
+ *aErr = false;
+ }
+ *aBuffer = p;
+ return ucs4;
+ } else {
+ // Found a high surrogate followed by something other than
+ // a low surrogate. Flag this as an error and return the
+ // Unicode replacement character 0xFFFD. Note that the
+ // pointer to the next character points to the second 16-bit
+ // value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10,
+ // only the first code unit of an illegal sequence must be
+ // treated as an illegally terminated code unit sequence
+ // (also Chapter 3 D91, "isolated [not paired and ill-formed]
+ // UTF-16 code units in the range D800..DFFF are ill-formed").
+ NS_WARNING("got a High Surrogate but no low surrogate");
+
+ if (aErr) {
+ *aErr = true;
+ }
+ // Step back so the unit that followed the high surrogate is decoded
+ // again by the next call.
+ *aBuffer = p - 1;
+ return 0xFFFD;
+ }
+ } else { // U+DC00 - U+DFFF
+ // DC00- DFFF - Low Surrogate
+
+ // Found a low surrogate w/o a preceding high surrogate. Flag
+ // this as an error and return the Unicode replacement
+ // character 0xFFFD.
+
+ NS_WARNING("got a low Surrogate but no high surrogate");
+ if (aErr) {
+ *aErr = true;
+ }
+ *aBuffer = p;
+ return 0xFFFD;
+ }
+
+ // Every branch above returns, so control never gets here.
+ MOZ_ASSERT_UNREACHABLE("Impossible UCS-2 character value.");
+ }
+};
+
+
+/**
+ * A character sink (see |copy_string| in nsAlgorithm.h) for converting
+ * UTF-8 to UTF-16
+ */
+class ConvertUTF8toUTF16
+{
+public:
+  typedef char value_type;
+  typedef char16_t buffer_type;
+
+  // aBuffer is the destination for the UTF-16 output. No capacity is
+  // passed in and write() performs no bounds checks.
+  explicit ConvertUTF8toUTF16(buffer_type* aBuffer)
+    : mStart(aBuffer)
+    , mBuffer(aBuffer)
+    , mErrorEncountered(false)
+  {
+  }
+
+  // Number of UTF-16 code units written so far.
+  size_t Length() const { return mBuffer - mStart; }
+
+  // True once any write() call has hit undecodable input.
+  bool ErrorEncountered() const { return mErrorEncountered; }
+
+  // Decodes the aN bytes at aStart and appends the result to the output.
+  // Each fragment must contain only whole UTF-8 sequences. After the first
+  // decoding error the sink keeps what was written before the error and
+  // ignores every later call.
+  void write(const value_type* aStart, uint32_t aN)
+  {
+    if (mErrorEncountered) {
+      return;
+    }
+
+    const value_type* cursor = aStart;
+    const value_type* const limit = aStart + aN;
+    buffer_type* dest = mBuffer;
+
+    while (cursor != limit) {
+      bool failed;
+      const uint32_t codePoint =
+        UTF8CharEnumerator::NextChar(&cursor, limit, &failed);
+
+      if (failed) {
+        mErrorEncountered = true;
+        break;
+      }
+
+      if (codePoint < PLANE1_BASE) {
+        // Fits in a single code unit.
+        *dest++ = codePoint;
+      } else {
+        // Needs a surrogate pair.
+        *dest++ = (buffer_type)H_SURROGATE(codePoint);
+        *dest++ = (buffer_type)L_SURROGATE(codePoint);
+      }
+    }
+
+    mBuffer = dest;
+  }
+
+  // Stores a terminating zero at the current position without advancing.
+  void write_terminator()
+  {
+    *mBuffer = buffer_type(0);
+  }
+
+private:
+  buffer_type* const mStart;
+  buffer_type* mBuffer;
+  bool mErrorEncountered;
+};
+
+/**
+ * A character sink (see |copy_string| in nsAlgorithm.h) for computing
+ * the length of the UTF-16 string equivalent to a UTF-8 string.
+ */
+class CalculateUTF8Length
+{
+public:
+ typedef char value_type;
+
+ CalculateUTF8Length()
+ : mLength(0), mErrorEncountered(false)
+ {
+ }
+
+ // Number of UTF-16 code units counted so far.
+ size_t Length() const
+ {
+ return mLength;
+ }
+
+ // Adds to the running length the UTF-16 code units needed for the aN
+ // bytes at aStart. Only lead bytes are inspected to decide how far to
+ // skip; continuation bytes are not validated here. If a byte that cannot
+ // start a sequence is found, or the final sequence runs past the end of
+ // the fragment, the error flag is set and all later calls are ignored.
+ void write(const value_type* aStart, uint32_t aN)
+ {
+ // ignore any further requests
+ if (mErrorEncountered) {
+ return;
+ }
+
+ // algorithm assumes utf8 units won't
+ // be spread across fragments
+ const value_type* p = aStart;
+ const value_type* end = aStart + aN;
+ // mLength is bumped once per sequence by the loop's increment
+ // expression, which a |break| skips.
+ for (; p < end /* && *p */; ++mLength) {
+ if (UTF8traits::isASCII(*p)) {
+ p += 1;
+ } else if (UTF8traits::is2byte(*p)) {
+ p += 2;
+ } else if (UTF8traits::is3byte(*p)) {
+ p += 3;
+ } else if (UTF8traits::is4byte(*p)) {
+ // Because a UTF-8 sequence of 4 bytes represents a codepoint
+ // greater than 0xFFFF, it will become a surrogate pair in the
+ // UTF-16 string, so add 1 more to mLength.
+ // This doesn't happen with is5byte and is6byte because they
+ // are illegal UTF-8 sequences (greater than 0x10FFFF) so get
+ // converted to a single replacement character.
+
+ // However, there is one case when a 4 byte UTF-8 sequence will
+ // only generate 2 UTF-16 bytes. If we have a properly encoded
+ // sequence, but with an invalid value (too small or too big),
+ // that will result in a replacement character being written
+ // This replacement character is encoded as just 1 single
+ // UTF-16 character, which is 2 bytes.
+
+ // The below code therefore only adds 1 to mLength if the UTF8
+ // data will produce a decoded character which is greater than
+ // or equal to 0x010000 and less than 0x0110000.
+
+ // A 4byte UTF8 character is encoded as
+ // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ // Bit 1-3 on the first byte, and bit 5-6 on the second byte,
+ // map to bit 17-21 in the final result. If these bits are
+ // between 0x01 and 0x11, that means that the final result is
+ // between 0x010000 and 0x110000. The below code reads these
+ // bits out and assigns them to c, but shifted up 4 bits to
+ // avoid having to shift twice.
+
+ // It doesn't matter what to do in the case where p + 4 > end
+ // since no UTF16 characters will be written in that case by
+ // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if
+ // any of the surrogate bits are wrong since no UTF16
+ // characters will be written in that case either.
+
+ if (p + 4 <= end) {
+ uint32_t c = ((uint32_t)(p[0] & 0x07)) << 6 |
+ ((uint32_t)(p[1] & 0x30));
+ if (c >= 0x010 && c < 0x110) {
+ ++mLength;
+ }
+ }
+
+ p += 4;
+ } else if (UTF8traits::is5byte(*p)) {
+ p += 5;
+ } else if (UTF8traits::is6byte(*p)) {
+ p += 6;
+ } else { // error
+ ++mLength; // to account for the decrement below
+ break;
+ }
+ }
+ // p == end only when every sequence fitted exactly; p > end means the
+ // last sequence was truncated, p < end means the loop hit the error
+ // branch above.
+ if (p != end) {
+ NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
+ --mLength; // The last multi-byte char wasn't complete, discard it.
+ mErrorEncountered = true;
+ }
+ }
+
+private:
+ size_t mLength; // running count of UTF-16 code units
+ bool mErrorEncountered; // latched on the first malformed fragment
+};
+
+/**
+ * A character sink (see |copy_string| in nsAlgorithm.h) for
+ * converting UTF-16 to UTF-8. Treats invalid UTF-16 data as 0xFFFD
+ * (0xEFBFBD in UTF-8).
+ */
+class ConvertUTF16toUTF8
+{
+public:
+ typedef char16_t value_type;
+ typedef char buffer_type;
+
+ // The error handling here is more lenient than that in
+ // |ConvertUTF8toUTF16|, but it's that way for backwards
+ // compatibility.
+
+ // aBuffer is the destination for the UTF-8 output. No capacity is passed
+ // in and write() performs no bounds checks.
+ explicit ConvertUTF16toUTF8(buffer_type* aBuffer)
+ : mStart(aBuffer), mBuffer(aBuffer)
+ {
+ }
+
+ // Number of bytes written so far.
+ size_t Size() const
+ {
+ return mBuffer - mStart;
+ }
+
+ // Encodes the aN UTF-16 code units at aStart as UTF-8 and appends them to
+ // the output. Unpaired surrogates are emitted as U+FFFD rather than
+ // reported. A high surrogate that is the last unit of the fragment is
+ // treated as unpaired.
+ void write(const value_type* aStart, uint32_t aN)
+ {
+ buffer_type* out = mBuffer; // gcc isn't smart enough to do this!
+
+ for (const value_type* p = aStart, *end = aStart + aN; p < end; ++p) {
+ value_type c = *p;
+ if (!(c & 0xFF80)) { // U+0000 - U+007F
+ *out++ = (char)c;
+ } else if (!(c & 0xF800)) { // U+0080 - U+07FF
+ *out++ = 0xC0 | (char)(c >> 6);
+ *out++ = 0x80 | (char)(0x003F & c);
+ } else if (!IS_SURROGATE(c)) { // U+0800 - U+D7FF,U+E000 - U+FFFF
+ *out++ = 0xE0 | (char)(c >> 12);
+ *out++ = 0x80 | (char)(0x003F & (c >> 6));
+ *out++ = 0x80 | (char)(0x003F & c);
+ } else if (NS_IS_HIGH_SURROGATE(c)) { // U+D800 - U+DBFF
+ // D800- DBFF - High Surrogate
+ value_type h = c;
+
+ // Look at the unit that should be the low half of the pair.
+ ++p;
+ if (p == end) {
+ // Treat broken characters as the Unicode
+ // replacement character 0xFFFD (0xEFBFBD in
+ // UTF-8)
+ *out++ = '\xEF';
+ *out++ = '\xBF';
+ *out++ = '\xBD';
+
+ NS_WARNING("String ending in half a surrogate pair!");
+
+ break;
+ }
+ c = *p;
+
+ if (NS_IS_LOW_SURROGATE(c)) {
+ // DC00- DFFF - Low Surrogate
+ // N = (H - D800) *400 + 10000 + ( L - DC00 )
+ uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
+
+ // 0001 0000-0010 FFFF
+ *out++ = 0xF0 | (char)(ucs4 >> 18);
+ *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
+ *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
+ *out++ = 0x80 | (char)(0x003F & ucs4);
+ } else {
+ // Treat broken characters as the Unicode
+ // replacement character 0xFFFD (0xEFBFBD in
+ // UTF-8)
+ *out++ = '\xEF';
+ *out++ = '\xBF';
+ *out++ = '\xBD';
+
+ // The pointer to the next character points to the second
+ // 16-bit value, not beyond it, as per Unicode 5.0.0
+ // Chapter 3 C10, only the first code unit of an illegal
+ // sequence must be treated as an illegally terminated
+ // code unit sequence (also Chapter 3 D91, "isolated [not
+ // paired and ill-formed] UTF-16 code units in the range
+ // D800..DFFF are ill-formed").
+ p--;
+
+ NS_WARNING("got a High Surrogate but no low surrogate");
+ }
+ } else { // U+DC00 - U+DFFF
+ // Treat broken characters as the Unicode replacement
+ // character 0xFFFD (0xEFBFBD in UTF-8)
+ *out++ = '\xEF';
+ *out++ = '\xBF';
+ *out++ = '\xBD';
+
+ // DC00- DFFF - Low Surrogate
+ NS_WARNING("got a low Surrogate but no high surrogate");
+ }
+ }
+
+ mBuffer = out;
+ }
+
+ // Stores a terminating zero at the current position without advancing.
+ void write_terminator()
+ {
+ *mBuffer = buffer_type(0);
+ }
+
+private:
+ buffer_type* const mStart; // start of the output, used by Size()
+ buffer_type* mBuffer; // next byte to be written
+};
+
+/**
+ * A character sink (see |copy_string| in nsAlgorithm.h) for computing
+ * the number of bytes a UTF-16 would occupy in UTF-8. Treats invalid
+ * UTF-16 data as 0xFFFD (0xEFBFBD in UTF-8).
+ */
+class CalculateUTF8Size
+{
+public:
+ typedef char16_t value_type;
+
+ CalculateUTF8Size()
+ : mSize(0)
+ {
+ }
+
+ // Number of UTF-8 bytes counted so far.
+ size_t Size() const
+ {
+ return mSize;
+ }
+
+ // Adds to the running size the UTF-8 bytes needed for the aN UTF-16 code
+ // units at aStart. An unpaired surrogate is counted as three bytes (the
+ // size of U+FFFD in UTF-8), and a high surrogate that is the last unit of
+ // the fragment is treated as unpaired.
+ void write(const value_type* aStart, uint32_t aN)
+ {
+ // Assume UCS2 surrogate pairs won't be spread across fragments.
+ for (const value_type* p = aStart, *end = aStart + aN; p < end; ++p) {
+ value_type c = *p;
+ if (!(c & 0xFF80)) { // U+0000 - U+007F
+ mSize += 1;
+ } else if (!(c & 0xF800)) { // U+0080 - U+07FF
+ mSize += 2;
+ } else if (0xD800 != (0xF800 & c)) { // U+0800 - U+D7FF,U+E000 - U+FFFF
+ mSize += 3;
+ } else if (0xD800 == (0xFC00 & c)) { // U+D800 - U+DBFF
+ // Look at the unit that should be the low half of the pair.
+ ++p;
+ if (p == end) {
+ // Treat broken characters as the Unicode
+ // replacement character 0xFFFD (0xEFBFBD in
+ // UTF-8)
+ mSize += 3;
+
+ NS_WARNING("String ending in half a surrogate pair!");
+
+ break;
+ }
+ c = *p;
+
+ if (0xDC00 == (0xFC00 & c)) {
+ mSize += 4;
+ } else {
+ // Treat broken characters as the Unicode
+ // replacement character 0xFFFD (0xEFBFBD in
+ // UTF-8)
+ mSize += 3;
+
+ // The next code unit is the second 16-bit value, not
+ // the one beyond it, as per Unicode 5.0.0 Chapter 3 C10,
+ // only the first code unit of an illegal sequence must
+ // be treated as an illegally terminated code unit
+ // sequence (also Chapter 3 D91, "isolated [not paired and
+ // ill-formed] UTF-16 code units in the range D800..DFFF
+ // are ill-formed").
+ p--;
+
+ NS_WARNING("got a high Surrogate but no low surrogate");
+ }
+ } else { // U+DC00 - U+DFFF
+ // Treat broken characters as the Unicode replacement
+ // character 0xFFFD (0xEFBFBD in UTF-8)
+ mSize += 3;
+
+ NS_WARNING("got a low Surrogate but no high surrogate");
+ }
+ }
+ }
+
+private:
+ size_t mSize; // running count of UTF-8 bytes
+};
+
+#ifdef MOZILLA_INTERNAL_API
+/**
+ * A character sink that performs a |reinterpret_cast|-style conversion
+ * from char to char16_t.
+ */
+class LossyConvertEncoding8to16
+{
+public:
+  typedef char value_type;
+  typedef char input_type;
+  typedef char16_t output_type;
+
+  // aDestination receives the widened code units. No capacity is passed in
+  // and write() performs no bounds checks.
+  explicit LossyConvertEncoding8to16(char16_t* aDestination)
+    : mDestination(aDestination)
+  {
+  }
+
+  // Appends aSourceLength bytes from aSource, each zero-extended to a
+  // char16_t. When SSE2 may be available and is detected at run time, the
+  // work is handed to write_sse2() instead.
+  void write(const char* aSource, uint32_t aSourceLength)
+  {
+#ifdef MOZILLA_MAY_SUPPORT_SSE2
+    if (mozilla::supports_sse2()) {
+      write_sse2(aSource, aSourceLength);
+      return;
+    }
+#endif
+    for (uint32_t i = 0; i < aSourceLength; ++i) {
+      // Go through unsigned char so bytes >= 0x80 are not sign-extended.
+      *mDestination++ = (char16_t)(unsigned char)aSource[i];
+    }
+  }
+
+  // SSE2 variant of write(); defined out of line.
+  void write_sse2(const char* aSource, uint32_t aSourceLength);
+
+  // Stores a terminating zero at the current position without advancing.
+  void write_terminator()
+  {
+    *mDestination = (char16_t)(0);
+  }
+
+private:
+  char16_t* mDestination;
+};
+
+/**
+ * A character sink that performs a |reinterpret_cast|-style conversion
+ * from char16_t to char.
+ */
+class LossyConvertEncoding16to8
+{
+public:
+  typedef char16_t value_type;
+  typedef char16_t input_type;
+  typedef char output_type;
+
+  // aDestination receives the narrowed bytes. No capacity is passed in and
+  // write() performs no bounds checks.
+  explicit LossyConvertEncoding16to8(char* aDestination)
+    : mDestination(aDestination)
+  {
+  }
+
+  // Appends aSourceLength code units from aSource, each cast down to a
+  // char. When SSE2 may be available and is detected at run time, the work
+  // is handed to write_sse2() instead.
+  void write(const char16_t* aSource, uint32_t aSourceLength)
+  {
+#ifdef MOZILLA_MAY_SUPPORT_SSE2
+    if (mozilla::supports_sse2()) {
+      write_sse2(aSource, aSourceLength);
+      return;
+    }
+#endif
+    for (uint32_t i = 0; i < aSourceLength; ++i) {
+      *mDestination++ = (char)(aSource[i]);
+    }
+  }
+
+#ifdef MOZILLA_MAY_SUPPORT_SSE2
+  // SSE2 variant of write(); defined out of line.
+  void write_sse2(const char16_t* aSource, uint32_t aSourceLength);
+#endif
+
+  // Stores a terminating NUL at the current position without advancing.
+  void write_terminator()
+  {
+    *mDestination = '\0';
+  }
+
+private:
+  char* mDestination;
+};
+#endif // MOZILLA_INTERNAL_API
+
+
+/**
+ * Starting at |index|, step backwards over UTF-8 continuation bytes
+ * (10xxxxxx) and return the index of the first byte that is not one, or 0
+ * if the start of the buffer is reached first. The byte at index 0 is never
+ * inspected.
+ */
+template<typename Char, typename UnsignedT>
+inline UnsignedT
+RewindToPriorUTF8Codepoint(const Char* utf8Chars, UnsignedT index)
+{
+  static_assert(mozilla::IsSame<Char, char>::value ||
+                mozilla::IsSame<Char, unsigned char>::value ||
+                mozilla::IsSame<Char, signed char>::value,
+                "UTF-8 data must be in 8-bit units");
+  static_assert(mozilla::IsUnsigned<UnsignedT>::value, "index type must be unsigned");
+
+  for (; index > 0; --index) {
+    const bool isContinuation = (utf8Chars[index] & 0xC0) == 0x80;
+    if (!isContinuation) {
+      break;
+    }
+  }
+
+  return index;
+}
+
+#endif /* !defined(nsUTF8Utils_h_) */