summaryrefslogtreecommitdiffstats
path: root/intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp')
-rw-r--r--intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp361
1 files changed, 361 insertions, 0 deletions
diff --git a/intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp b/intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp
new file mode 100644
index 000000000..56c88ff3e
--- /dev/null
+++ b/intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp
@@ -0,0 +1,361 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsUTF16ToUnicode.h"
+#include "nsCharTraits.h"
+#include "mozilla/CheckedInt.h"
+#include "mozilla/EndianUtils.h"
+
+enum { // decoder states stored in mState between conversion calls
+ STATE_NORMAL = 0, // no partial code unit is pending
+ STATE_HALF_CODE_POINT = 1, // first byte of a 16-bit unit is held in mOddByte
+ STATE_FIRST_CALL = 2, // set by Reset(); UTF16ConvertToUnicode skips two leading bytes in this state
+ STATE_SECOND_BYTE = STATE_FIRST_CALL | STATE_HALF_CODE_POINT, // one leading byte already seen; UTF16ConvertToUnicode skips one more
+ STATE_ODD_SURROGATE_PAIR = 4 // a surrogate pair is parked in mOddHighSurrogate/mOddLowSurrogate awaiting output room
+};
+/**
+ * Decodes a buffer of UTF-16 bytes into char16_t code units, resuming from
+ * the state (mState, mOddByte, mOddHighSurrogate, mOddLowSurrogate) left
+ * behind by the previous call.
+ *
+ * @param aSrc        input bytes.
+ * @param aSrcLength  in: number of bytes available at aSrc; out: number of
+ *                    bytes consumed (not rewritten on the early return taken
+ *                    when no input remains after the state prologue).
+ * @param aDest       output buffer.
+ * @param aDestLength in: capacity of aDest in char16_t units; out: number of
+ *                    units written.
+ * @param aSwapBytes  when true, the two bytes of every loaded code unit are
+ *                    exchanged before the unit is examined.
+ * @return NS_OK when the input ended in STATE_NORMAL with no pending high
+ *         surrogate, NS_OK_UDEC_MOREINPUT when a partial unit or a pending
+ *         high surrogate remains, NS_OK_UDEC_MOREOUTPUT when aDest filled
+ *         up, and NS_ERROR_ILLEGAL_INPUT when an unpaired surrogate is met
+ *         while mErrBehavior == kOnError_Signal.
+ */
+nsresult
+nsUTF16ToUnicodeBase::UTF16ConvertToUnicode(const char * aSrc,
+ int32_t * aSrcLength,
+ char16_t * aDest,
+ int32_t * aDestLength,
+ bool aSwapBytes)
+{
+ const char* src = aSrc;
+ const char* srcEnd = aSrc + *aSrcLength;
+ char16_t* dest = aDest;
+ char16_t* destEnd = aDest + *aDestLength;
+ char16_t oddHighSurrogate; // local copy of mOddHighSurrogate, written back at the normal exit
+
+ switch(mState) { // prologue: deal with whatever the previous call left pending
+ case STATE_FIRST_CALL: // the Convert() callers in this file leave this state set only after seeing a BOM
+ NS_ASSERTION(*aSrcLength > 1, "buffer too short");
+ src+=2; // skip the two leading bytes
+ mState = STATE_NORMAL;
+ break;
+
+ case STATE_SECOND_BYTE: // the first leading byte was consumed by an earlier call
+ NS_ASSERTION(*aSrcLength > 0, "buffer too short");
+ src++; // skip the remaining leading byte
+ mState = STATE_NORMAL;
+ break;
+
+ case STATE_ODD_SURROGATE_PAIR: // a complete pair was parked because the output was full
+ if (*aDestLength < 2)
+ goto error; // still no room for both units
+ else {
+ *dest++ = mOddHighSurrogate;
+ *dest++ = mOddLowSurrogate;
+ mOddHighSurrogate = mOddLowSurrogate = 0;
+ mState = STATE_NORMAL;
+ }
+ break;
+
+ case STATE_NORMAL:
+ case STATE_HALF_CODE_POINT:
+ default:
+ break;
+ }
+
+ oddHighSurrogate = mOddHighSurrogate;
+
+ if (src == srcEnd) { // nothing left to decode; *aSrcLength is left as the caller passed it
+ *aDestLength = dest - aDest;
+ return (mState != STATE_NORMAL || oddHighSurrogate) ?
+ NS_OK_UDEC_MOREINPUT : NS_OK;
+ }
+
+ const char* srcEvenEnd; // end of the part of the input that forms whole 16-bit units
+
+ char16_t u; // the code unit currently being processed
+ if (mState == STATE_HALF_CODE_POINT) {
+ if (dest == destEnd)
+ goto error;
+
+ // the 1st byte of a 16-bit code unit was stored in |mOddByte| in the
+ // previous run while the 2nd byte has to come from |*src|.
+ mState = STATE_NORMAL;
+#if MOZ_BIG_ENDIAN
+ u = (mOddByte << 8) | uint8_t(*src++); // safe, we know we have at least one byte.
+#else
+ u = (*src++ << 8) | mOddByte; // safe, we know we have at least one byte.
+#endif
+ srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
+ goto have_codepoint; // jump into the loop body with |u| already assembled
+ } else {
+ srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
+ }
+
+ while (src != srcEvenEnd) {
+ if (dest == destEnd)
+ goto error;
+
+#if !defined(__sparc__) && !defined(__arm__)
+ u = *(const char16_t*)src; // direct 16-bit load
+#else
+ memcpy(&u, src, 2); // NOTE(review): presumably avoids unaligned loads on these targets - confirm
+#endif
+ src += 2;
+
+have_codepoint:
+ if (aSwapBytes)
+ u = u << 8 | u >> 8; // exchange the high and low byte
+
+ if (!IS_SURROGATE(u)) {
+ if (oddHighSurrogate) { // the pending high surrogate is not followed by a low one
+ if (mErrBehavior == kOnError_Signal) {
+ goto error2;
+ }
+ *dest++ = UCS2_REPLACEMENT_CHAR;
+ if (dest == destEnd)
+ goto error; // NOTE(review): |u| is already consumed from src but not yet written here - confirm it is not lost
+ oddHighSurrogate = 0;
+ }
+ *dest++ = u;
+ } else if (NS_IS_HIGH_SURROGATE(u)) {
+ if (oddHighSurrogate) { // two high surrogates in a row: the earlier one is unpaired
+ if (mErrBehavior == kOnError_Signal) {
+ goto error2;
+ }
+ *dest++ = UCS2_REPLACEMENT_CHAR;
+ if (dest == destEnd)
+ goto error; // NOTE(review): the new high surrogate in |u| is not recorded on this path - confirm
+ }
+ oddHighSurrogate = u; // hold it until the next unit shows whether it is paired
+ }
+ else /* if (NS_IS_LOW_SURROGATE(u)) */ {
+ if (oddHighSurrogate && *aDestLength > 1) { // pair is complete and the buffer can hold two units at all
+ if (dest + 1 >= destEnd) { // not enough room left for both halves in this call
+ mOddLowSurrogate = u;
+ mOddHighSurrogate = oddHighSurrogate;
+ mState = STATE_ODD_SURROGATE_PAIR; // emitted by the prologue of the next call
+ goto error;
+ }
+ *dest++ = oddHighSurrogate;
+ *dest++ = u;
+ } else { // low surrogate without a usable preceding high surrogate
+ if (mErrBehavior == kOnError_Signal) {
+ goto error2;
+ }
+ *dest++ = UCS2_REPLACEMENT_CHAR;
+ }
+ oddHighSurrogate = 0;
+ }
+ }
+ if (src != srcEnd) { // exactly one trailing byte remains
+ // store the lead byte of a 16-bit unit for the next run.
+ mOddByte = *src++;
+ mState = STATE_HALF_CODE_POINT;
+ }
+
+ mOddHighSurrogate = oddHighSurrogate; // carry an unfinished pair over to the next call
+
+ *aDestLength = dest - aDest;
+ *aSrcLength = src - aSrc;
+ return (mState != STATE_NORMAL || oddHighSurrogate) ?
+ NS_OK_UDEC_MOREINPUT : NS_OK;
+
+error: // output exhausted; NOTE(review): the local oddHighSurrogate is not written back to mOddHighSurrogate here - confirm intended
+ *aDestLength = dest - aDest;
+ *aSrcLength = src - aSrc;
+ return NS_OK_UDEC_MOREOUTPUT;
+
+error2: // unpaired surrogate with kOnError_Signal
+ *aDestLength = dest - aDest;
+ *aSrcLength = --src - aSrc; // backs src up by one byte before reporting the consumed length
+ return NS_ERROR_ILLEGAL_INPUT;
+}
+
+/**
+ * Puts the decoder back into its initial state so that the next conversion
+ * call is treated as the first one. Always returns NS_OK.
+ */
+NS_IMETHODIMP
+nsUTF16ToUnicodeBase::Reset()
+{
+  // Discard any partial code unit or surrogate carried over between calls.
+  mOddLowSurrogate = 0;
+  mOddHighSurrogate = 0;
+  mOddByte = 0;
+
+  // The next buffer is handled as the start of a new stream.
+  mState = STATE_FIRST_CALL;
+  return NS_OK;
+}
+
+/**
+ * Computes an upper bound on the number of char16_t units that converting
+ * aSrcLength more bytes can produce, given the data already pending in the
+ * decoder.
+ *
+ * @param aSrc        unused by this implementation.
+ * @param aSrcLength  number of input bytes about to be converted.
+ * @param aDestLength out: the computed bound.
+ * @return NS_OK, or NS_ERROR_OUT_OF_MEMORY when the byte count overflows
+ *         a 32-bit integer.
+ */
+NS_IMETHODIMP
+nsUTF16ToUnicodeBase::GetMaxLength(const char * aSrc, int32_t aSrcLength,
+                                   int32_t * aDestLength)
+{
+  mozilla::CheckedInt32 byteCount = aSrcLength;
+
+  // A byte held over from the previous run completes a unit with new input.
+  if (mState & STATE_HALF_CODE_POINT) {
+    byteCount += 1;
+  }
+
+  if (!byteCount.isValid()) {
+    return NS_ERROR_OUT_OF_MEMORY;
+  }
+
+  // Two bytes per code unit, plus one slot for each surrogate still pending
+  // from the previous run.
+  int32_t maxUnits = byteCount.value() / 2;
+  maxUnits += mOddHighSurrogate ? 1 : 0;
+  maxUnits += mOddLowSurrogate ? 1 : 0;
+
+  *aDestLength = maxUnits;
+  return NS_OK;
+}
+
+
+/**
+ * Decodes big-endian UTF-16 input.
+ *
+ * Until the first two bytes of the stream have been seen, this checks for a
+ * big-endian byte order mark (bytes 0xFE 0xFF). When the mark is present
+ * mState is left at STATE_FIRST_CALL / STATE_SECOND_BYTE so that
+ * UTF16ConvertToUnicode skips it; otherwise the bytes are treated as data.
+ *
+ * @param aSrc        input bytes.
+ * @param aSrcLength  in: bytes available; out: bytes consumed.
+ * @param aDest       output buffer.
+ * @param aDestLength in: capacity in char16_t units; out: units written.
+ * @return the result of UTF16ConvertToUnicode, NS_OK for empty first input,
+ *         or NS_OK_UDEC_MOREINPUT while the mark is still incomplete.
+ */
+NS_IMETHODIMP
+nsUTF16BEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
+                            char16_t * aDest, int32_t * aDestLength)
+{
+  switch (mState) {
+    case STATE_FIRST_CALL:
+      if (*aSrcLength < 2) {
+        if (*aSrcLength < 1) {
+          *aDestLength = 0;
+          return NS_OK;
+        }
+        if (uint8_t(*aSrc) != 0xFE) {
+          // A single byte that cannot start the mark is ordinary data.
+          mState = STATE_NORMAL;
+          break;
+        }
+        // First half of a possible mark; wait for the second byte.
+        *aDestLength = 0;
+        mState = STATE_SECOND_BYTE;
+        return NS_OK_UDEC_MOREINPUT;
+      }
+      // Compare the mark byte by byte. aSrc carries no alignment guarantee,
+      // so it must not be dereferenced as a char16_t; byte comparison is
+      // also independent of the host byte order.
+      if (uint8_t(aSrc[0]) != 0xFE || uint8_t(aSrc[1]) != 0xFF) {
+        mState = STATE_NORMAL;
+      }
+      break;
+
+    case STATE_SECOND_BYTE:
+      if (*aSrcLength < 1) {
+        *aDestLength = 0;
+        return NS_OK_UDEC_MOREINPUT;
+      }
+      if (uint8_t(*aSrc) != 0xFF) {
+        // Not a mark after all: the 0xFE seen earlier is the lead byte of a
+        // regular code unit.
+        mOddByte = 0xFE;
+        mState = STATE_HALF_CODE_POINT;
+      }
+      break;
+  }
+
+  // Big-endian data needs byte swapping on little-endian hosts only.
+  return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
+                               bool(MOZ_LITTLE_ENDIAN));
+}
+
+/**
+ * Decodes little-endian UTF-16 input.
+ *
+ * Until the first two bytes of the stream have been seen, this checks for a
+ * little-endian byte order mark (bytes 0xFF 0xFE). When the mark is present
+ * mState is left at STATE_FIRST_CALL / STATE_SECOND_BYTE so that
+ * UTF16ConvertToUnicode skips it; otherwise the bytes are treated as data.
+ *
+ * @param aSrc        input bytes.
+ * @param aSrcLength  in: bytes available; out: bytes consumed.
+ * @param aDest       output buffer.
+ * @param aDestLength in: capacity in char16_t units; out: units written.
+ * @return the result of UTF16ConvertToUnicode, NS_OK for empty first input,
+ *         or NS_OK_UDEC_MOREINPUT while the mark is still incomplete.
+ */
+NS_IMETHODIMP
+nsUTF16LEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
+                            char16_t * aDest, int32_t * aDestLength)
+{
+  switch (mState) {
+    case STATE_FIRST_CALL:
+      if (*aSrcLength < 2) {
+        if (*aSrcLength < 1) {
+          *aDestLength = 0;
+          return NS_OK;
+        }
+        if (uint8_t(*aSrc) != 0xFF) {
+          // A single byte that cannot start the mark is ordinary data.
+          mState = STATE_NORMAL;
+          break;
+        }
+        // First half of a possible mark; wait for the second byte.
+        *aDestLength = 0;
+        mState = STATE_SECOND_BYTE;
+        return NS_OK_UDEC_MOREINPUT;
+      }
+      // Compare the mark byte by byte. aSrc carries no alignment guarantee,
+      // so it must not be dereferenced as a char16_t; byte comparison is
+      // also independent of the host byte order.
+      if (uint8_t(aSrc[0]) != 0xFF || uint8_t(aSrc[1]) != 0xFE) {
+        mState = STATE_NORMAL;
+      }
+      break;
+
+    case STATE_SECOND_BYTE:
+      if (*aSrcLength < 1) {
+        *aDestLength = 0;
+        return NS_OK_UDEC_MOREINPUT;
+      }
+      if (uint8_t(*aSrc) != 0xFE) {
+        // Not a mark after all: the 0xFF seen earlier is the lead byte of a
+        // regular code unit.
+        mOddByte = 0xFF;
+        mState = STATE_HALF_CODE_POINT;
+      }
+      break;
+  }
+
+  // Little-endian data needs byte swapping on big-endian hosts only.
+  return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
+                               bool(MOZ_BIG_ENDIAN));
+}
+
+/**
+ * Forgets the detected byte order and BOM flag, then resets the shared
+ * decoder state through the base class.
+ */
+NS_IMETHODIMP
+nsUTF16ToUnicode::Reset()
+{
+  mFoundBOM = false;
+  mEndian = kUnknown;
+
+  const nsresult rv = nsUTF16ToUnicodeBase::Reset();
+  return rv;
+}
+
+/**
+ * Decodes UTF-16 input whose byte order is not known in advance.
+ *
+ * On the first call the byte order is taken from a leading BOM when there is
+ * one, and otherwise guessed from the first two bytes. Later calls reuse the
+ * order chosen then.
+ *
+ * @param aSrc        input bytes.
+ * @param aSrcLength  in: bytes available; out: bytes consumed.
+ * @param aDest       output buffer.
+ * @param aDestLength in: capacity in char16_t units; out: units written.
+ * @return NS_ERROR_ILLEGAL_INPUT when the first call gets exactly one byte,
+ *         NS_OK_UDEC_NOBOMFOUND in place of NS_OK when no BOM was seen, and
+ *         otherwise whatever UTF16ConvertToUnicode returns.
+ */
+NS_IMETHODIMP
+nsUTF16ToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
+                          char16_t * aDest, int32_t * aDestLength)
+{
+  if (STATE_FIRST_CALL == mState) {
+    // Two bytes are needed to look at the start of the stream.
+    if (*aSrcLength < 2) {
+      const nsresult res =
+        (*aSrcLength == 0) ? NS_OK : NS_ERROR_ILLEGAL_INPUT;
+      *aSrcLength = 0;
+      *aDestLength = 0;
+      return res;
+    }
+
+    const uint8_t first = uint8_t(aSrc[0]);
+    const uint8_t second = uint8_t(aSrc[1]);
+
+    if (0xFF == first && 0xFE == second) {
+      // Little-endian BOM; mState stays STATE_FIRST_CALL so it gets skipped.
+      mEndian = kLittleEndian;
+      mFoundBOM = true;
+    } else if (0xFE == first && 0xFF == second) {
+      // Big-endian BOM; mState stays STATE_FIRST_CALL so it gets skipped.
+      mEndian = kBigEndian;
+      mFoundBOM = true;
+    } else {
+      // No BOM. Use a simple heuristic that assumes the first character is
+      // in [U+0001, U+00FF] (likely for html/xml/css): 0xhh 0x00 with
+      // hh != 00 means little endian and 0x00 0xhh means big endian.
+      // Anything else is assumed to be BE, following the Unicode standard,
+      // letting any garbage show up in the browser (bug 246194).
+      mState = STATE_NORMAL;
+      mEndian = (first && !second) ? kLittleEndian : kBigEndian;
+    }
+  }
+
+  // Swap bytes whenever the stream's order differs from the host's.
+#if MOZ_BIG_ENDIAN
+  const bool swapBytes = (mEndian == kLittleEndian);
+#else
+  const bool swapBytes = (mEndian == kBigEndian);
+#endif
+
+  nsresult rv = UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
+                                      swapBytes);
+
+  // A plain success is reported as "no BOM found" when that is the case;
+  // every other result is passed through unchanged.
+  if (rv == NS_OK && !mFoundBOM) {
+    return NS_OK_UDEC_NOBOMFOUND;
+  }
+  return rv;
+}