1 files changed, 361 insertions, 0 deletions
diff --git a/intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp b/intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp
new file mode 100644
index 000000000..56c88ff3e
--- /dev/null
+++ b/intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp
@@ -0,0 +1,361 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsUTF16ToUnicode.h"
+#include "nsCharTraits.h"
+#include "mozilla/CheckedInt.h"
+#include "mozilla/EndianUtils.h"
+
+enum {  // values of |mState|; bit 0 is also tested on its own as a flag
+  STATE_NORMAL             = 0,       // nothing pending in |mState|
+  STATE_HALF_CODE_POINT    = 1 << 0,  // first byte of a unit is in |mOddByte|
+  STATE_FIRST_CALL         = 1 << 1,  // state right after Reset()
+  STATE_SECOND_BYTE        = STATE_HALF_CODE_POINT | STATE_FIRST_CALL,  // 1st BOM byte seen
+  STATE_ODD_SURROGATE_PAIR = 1 << 2   // a surrogate pair awaits output room
+};
+
+nsresult
+nsUTF16ToUnicodeBase::UTF16ConvertToUnicode(const char * aSrc,
+                                            int32_t * aSrcLength,
+                                            char16_t * aDest,
+                                            int32_t * aDestLength,
+                                            bool aSwapBytes)
+{
+  // Shared UTF-16 decoding loop.  Reads 16-bit units from |aSrc| (byte
+  // swapped first when |aSwapBytes| is set) and writes them to |aDest|.
+  // Non-surrogates and high/low surrogate pairs are copied through; an
+  // unpaired surrogate is replaced by UCS2_REPLACEMENT_CHAR, or stops the
+  // conversion with NS_ERROR_ILLEGAL_INPUT when mErrBehavior is
+  // kOnError_Signal.
+  //
+  // In/out: |*aSrcLength| is the number of bytes available and is updated to
+  // the number consumed (on NS_ERROR_ILLEGAL_INPUT: one byte short of the
+  // read position); |*aDestLength| is the capacity of |aDest| in char16_t
+  // units and is updated to the number written.
+  //
+  // Returns NS_OK when nothing is pending, NS_OK_UDEC_MOREINPUT when a
+  // partial unit or a lone high surrogate is held for the next call, and
+  // NS_OK_UDEC_MOREOUTPUT when |aDest| filled up.  Whatever is held back
+  // lives in mState, mOddByte, mOddHighSurrogate and mOddLowSurrogate.
+  const char* src = aSrc;
+  const char* srcEnd = aSrc + *aSrcLength;
+  char16_t* dest = aDest;
+  char16_t* destEnd = aDest + *aDestLength;
+  // Initialised here, not only after the switch: the |error| exit stores it
+  // back into the member and the switch below can already jump there.
+  char16_t oddHighSurrogate = mOddHighSurrogate;
+  // Number of |src| bytes consumed for the unit currently in |u|: 1 when its
+  // first byte came from |mOddByte|, otherwise 2.
+  int32_t unitBytes = 2;
+
+  switch(mState) {
+    case STATE_FIRST_CALL:
+      // The callers in this file leave this state set only when the input
+      // starts with a BOM; skip its two bytes.
+      NS_ASSERTION(*aSrcLength > 1, "buffer too short");
+      src+=2;
+      mState = STATE_NORMAL;
+      break;
+
+    case STATE_SECOND_BYTE:
+      // Second byte of a BOM that was split across two calls.
+      NS_ASSERTION(*aSrcLength > 0, "buffer too short");
+      src++;
+      mState = STATE_NORMAL;
+      break;
+
+    case STATE_ODD_SURROGATE_PAIR:
+      // Flush the pair that did not fit at the end of the previous call.
+      if (*aDestLength < 2)
+        goto error;
+      else {
+        *dest++ = mOddHighSurrogate;
+        *dest++ = mOddLowSurrogate;
+        mOddHighSurrogate = mOddLowSurrogate = 0;
+        mState = STATE_NORMAL;
+      }
+      break;
+
+    case STATE_NORMAL:
+    case STATE_HALF_CODE_POINT:
+    default:
+      break;
+  }
+
+  // Re-read: flushing a parked pair above clears the member.
+  oddHighSurrogate = mOddHighSurrogate;
+
+  if (src == srcEnd) {
+    *aDestLength = dest - aDest;
+    return (mState != STATE_NORMAL || oddHighSurrogate) ?
+           NS_OK_UDEC_MOREINPUT : NS_OK;
+  }
+
+  const char* srcEvenEnd;
+
+  char16_t u;
+  if (mState == STATE_HALF_CODE_POINT) {
+    if (dest == destEnd)
+      goto error;
+
+    // the 1st byte of a 16-bit code unit was stored in |mOddByte| in the
+    // previous run while the 2nd byte has to come from |*src|.
+    mState = STATE_NORMAL;
+    // The byte taken from |src| goes through uint8_t first so that a
+    // negative |char| is never sign-extended and left-shifted.
+#if MOZ_BIG_ENDIAN
+    u = (mOddByte << 8) | uint8_t(*src++); // safe, we know we have at least one byte.
+#else
+    u = (uint8_t(*src++) << 8) | mOddByte; // safe, we know we have at least one byte.
+#endif
+    unitBytes = 1;
+    srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
+    goto have_codepoint;
+  } else {
+    srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
+  }
+
+  while (src != srcEvenEnd) {
+    if (dest == destEnd)
+      goto error;
+
+    // memcpy rather than a char16_t* cast: |src| is a byte pointer and may
+    // not be 2-byte aligned.
+    memcpy(&u, src, 2);
+    src += 2;
+    unitBytes = 2;
+
+have_codepoint:
+    if (aSwapBytes)
+      u = u << 8 | u >> 8;
+
+    if (!IS_SURROGATE(u)) {
+      if (oddHighSurrogate) {
+        // The pending high surrogate turned out to be unpaired.
+        if (mErrBehavior == kOnError_Signal) {
+          goto error2;
+        }
+        *dest++ = UCS2_REPLACEMENT_CHAR;
+        oddHighSurrogate = 0;
+        if (dest == destEnd) {
+          // No room left for |u| itself: un-read it so that the next call
+          // decodes it again instead of silently dropping it.
+          src -= unitBytes;
+          if (unitBytes == 1)
+            mState = STATE_HALF_CODE_POINT; // |mOddByte| still holds its 1st byte
+          goto error;
+        }
+      }
+      *dest++ = u;
+    } else if (NS_IS_HIGH_SURROGATE(u)) {
+      if (oddHighSurrogate) {
+        if (mErrBehavior == kOnError_Signal) {
+          goto error2;
+        }
+        *dest++ = UCS2_REPLACEMENT_CHAR;
+      }
+      // Remembering |u| needs no output room, so do it even when |aDest| has
+      // just become full; the check at the top of the loop reports that.
+      oddHighSurrogate = u;
+    }
+    else /* if (NS_IS_LOW_SURROGATE(u)) */ {
+      if (oddHighSurrogate && *aDestLength > 1) {
+        if (dest + 1 >= destEnd) {
+          // Only one slot left: park the whole pair for the next call.
+          mOddLowSurrogate = u;
+          mOddHighSurrogate = oddHighSurrogate;
+          mState = STATE_ODD_SURROGATE_PAIR;
+          goto error;
+        }
+        *dest++ = oddHighSurrogate;
+        *dest++ = u;
+      } else {
+        if (mErrBehavior == kOnError_Signal) {
+          goto error2;
+        }
+        *dest++ = UCS2_REPLACEMENT_CHAR;
+      }
+      oddHighSurrogate = 0;
+    }
+  }
+  if (src != srcEnd) {
+    // store the lead byte of a 16-bit unit for the next run.
+    mOddByte = *src++;
+    mState = STATE_HALF_CODE_POINT;
+  }
+
+  mOddHighSurrogate = oddHighSurrogate;
+
+  *aDestLength = dest - aDest;
+  *aSrcLength =  src  - aSrc;
+  return (mState != STATE_NORMAL || oddHighSurrogate) ?
+         NS_OK_UDEC_MOREINPUT : NS_OK;
+
+error:
+  // Keep the member in step with the input consumed so far; otherwise the
+  // next call would start from a stale (already handled) high surrogate.
+  mOddHighSurrogate = oddHighSurrogate;
+  *aDestLength = dest - aDest;
+  *aSrcLength =  src  - aSrc;
+  return  NS_OK_UDEC_MOREOUTPUT;
+
+error2:
+  *aDestLength = dest - aDest;
+  *aSrcLength = --src - aSrc;
+  return  NS_ERROR_ILLEGAL_INPUT;
+}
+
+NS_IMETHODIMP
+nsUTF16ToUnicodeBase::Reset()
+{
+  // Discard any carried-over surrogate or byte, then return to the
+  // first-call state.
+  mOddHighSurrogate = mOddLowSurrogate = 0;
+  mOddByte = 0;
+  mState = STATE_FIRST_CALL;
+  return NS_OK;
+}
+
+NS_IMETHODIMP
+nsUTF16ToUnicodeBase::GetMaxLength(const char * aSrc, int32_t aSrcLength,
+                                   int32_t * aDestLength)
+{
+  // Computes the number of char16_t units to reserve for converting
+  // |aSrcLength| more bytes, counting data left over from the previous run.
+  // |aSrc| itself is not inspected.  Fails with NS_ERROR_OUT_OF_MEMORY if
+  // the byte count overflows int32_t.
+  mozilla::CheckedInt32 byteCount(aSrcLength);
+  if ((mState & STATE_HALF_CODE_POINT) != 0) {
+    byteCount += 1;  // one extra byte is accounted for in this state
+  }
+  if (!byteCount.isValid()) {
+    return NS_ERROR_OUT_OF_MEMORY;
+  }
+
+  int32_t units = byteCount.value() / 2;  // two bytes per 16-bit unit
+  // Surrogates carried over from the previous run take one slot each.
+  units += mOddHighSurrogate ? 1 : 0;
+  units += mOddLowSurrogate ? 1 : 0;
+  *aDestLength = units;
+  return NS_OK;
+}
+
+
+NS_IMETHODIMP
+nsUTF16BEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
+                            char16_t * aDest, int32_t * aDestLength)
+{
+  // Decodes big-endian UTF-16, stripping a leading BOM (bytes FE FF) when
+  // there is one.  The BOM may be split across two calls: a lone leading
+  // 0xFE is remembered as STATE_SECOND_BYTE until the next byte shows
+  // whether it was a BOM or the first half of an ordinary code unit.
+  switch (mState) {
+    case STATE_FIRST_CALL:
+      if (*aSrcLength < 1) {
+        // Nothing to look at yet; stay in STATE_FIRST_CALL.
+        *aDestLength = 0;
+        return NS_OK;
+      }
+      if (*aSrcLength < 2) {
+        if (uint8_t(*aSrc) != 0xFE) {
+          // Cannot be the start of a BOM; decode it as ordinary data.
+          mState = STATE_NORMAL;
+          break;
+        }
+        *aDestLength = 0;
+        mState = STATE_SECOND_BYTE;
+        return NS_OK_UDEC_MOREINPUT;
+      }
+      // Compare byte by byte.  |aSrc| is a byte buffer that need not be
+      // 2-byte aligned, so it must not be read through a char16_t*; this
+      // form is also independent of the host byte order.
+      if (uint8_t(aSrc[0]) != 0xFE || uint8_t(aSrc[1]) != 0xFF) {
+        mState = STATE_NORMAL;
+      }
+      // Otherwise mState stays STATE_FIRST_CALL, which makes
+      // UTF16ConvertToUnicode() skip the two BOM bytes.
+      break;
+
+    case STATE_SECOND_BYTE:
+      if (*aSrcLength < 1) {
+        *aDestLength = 0;
+        return NS_OK_UDEC_MOREINPUT;
+      }
+      if (uint8_t(*aSrc) != 0xFF) {
+        // Not a BOM after all: replay the remembered 0xFE as the first byte
+        // of a code unit.
+        mOddByte = 0xFE;
+        mState = STATE_HALF_CODE_POINT;
+      }
+      break;
+  }
+
+  // Big-endian data needs byte swapping only on little-endian hosts.
+  return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
+                               bool(MOZ_LITTLE_ENDIAN));
+}
+
+NS_IMETHODIMP
+nsUTF16LEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
+                            char16_t * aDest, int32_t * aDestLength)
+{
+  // Decodes little-endian UTF-16, stripping a leading BOM (bytes FF FE) when
+  // there is one.  The BOM may be split across two calls: a lone leading
+  // 0xFF is remembered as STATE_SECOND_BYTE until the next byte shows
+  // whether it was a BOM or the first half of an ordinary code unit.
+  switch (mState) {
+    case STATE_FIRST_CALL:
+      if (*aSrcLength < 1) {
+        // Nothing to look at yet; stay in STATE_FIRST_CALL.
+        *aDestLength = 0;
+        return NS_OK;
+      }
+      if (*aSrcLength < 2) {
+        if (uint8_t(*aSrc) != 0xFF) {
+          // Cannot be the start of a BOM; decode it as ordinary data.
+          mState = STATE_NORMAL;
+          break;
+        }
+        *aDestLength = 0;
+        mState = STATE_SECOND_BYTE;
+        return NS_OK_UDEC_MOREINPUT;
+      }
+      // Compare byte by byte.  |aSrc| is a byte buffer that need not be
+      // 2-byte aligned, so it must not be read through a char16_t*; this
+      // form is also independent of the host byte order.
+      if (uint8_t(aSrc[0]) != 0xFF || uint8_t(aSrc[1]) != 0xFE) {
+        mState = STATE_NORMAL;
+      }
+      // Otherwise mState stays STATE_FIRST_CALL, which makes
+      // UTF16ConvertToUnicode() skip the two BOM bytes.
+      break;
+
+    case STATE_SECOND_BYTE:
+      if (*aSrcLength < 1) {
+        *aDestLength = 0;
+        return NS_OK_UDEC_MOREINPUT;
+      }
+      if (uint8_t(*aSrc) != 0xFE) {
+        // Not a BOM after all: replay the remembered 0xFF as the first byte
+        // of a code unit.
+        mOddByte = 0xFF;
+        mState = STATE_HALF_CODE_POINT;
+      }
+      break;
+  }
+
+  // Little-endian data needs byte swapping only on big-endian hosts.
+  return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
+                               bool(MOZ_BIG_ENDIAN));
+}
+
+NS_IMETHODIMP
+nsUTF16ToUnicode::Reset()
+{
+  mFoundBOM = false;    // no BOM recorded
+  mEndian = kUnknown;   // byte order undecided again
+  return nsUTF16ToUnicodeBase::Reset();  // then reset the shared decoder state
+}
+
+NS_IMETHODIMP
+nsUTF16ToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
+                          char16_t * aDest, int32_t * aDestLength)
+{
+  // Decodes UTF-16 whose byte order is not known in advance.  On the first
+  // call the leading two bytes settle the byte order (a BOM if present,
+  // otherwise a guess); every call then runs the shared
+  // UTF16ConvertToUnicode() decoder.
+  if (mState == STATE_FIRST_CALL) {
+    if (*aSrcLength < 2) {
+      // Too short to sniff: empty input is fine, a single byte is rejected.
+      // Either way nothing is consumed and nothing is produced.
+      const nsresult status =
+        (*aSrcLength == 0) ? NS_OK : NS_ERROR_ILLEGAL_INPUT;
+      *aSrcLength = 0;
+      *aDestLength = 0;
+      return status;
+    }
+
+    const uint8_t first = uint8_t(aSrc[0]);
+    const uint8_t second = uint8_t(aSrc[1]);
+    if (first == 0xFF && second == 0xFE) {
+      // Little-endian BOM.  mState stays STATE_FIRST_CALL so that the
+      // decoder below skips these two bytes.
+      mEndian = kLittleEndian;
+      mFoundBOM = true;
+    } else if (first == 0xFE && second == 0xFF) {
+      // Big-endian BOM, skipped the same way.
+      mEndian = kBigEndian;
+      mFoundBOM = true;
+    } else {
+      // No BOM.  Use a simple heuristic that assumes the first character is
+      // in [U+0001, U+00FF] -- not always valid, but very likely to hold for
+      // html/xml/css: "hh 00" (hh != 00) means little-endian.  "00 hh" and
+      // every other pattern are taken as big-endian (following the Unicode
+      // standard), letting the garbage show up in the browser.
+      // (security concern?) (bug 246194)
+      mState = STATE_NORMAL;
+      mEndian = (first != 0 && second == 0) ? kLittleEndian : kBigEndian;
+    }
+  }
+
+  // Swap bytes whenever the stream's byte order differs from the host's.
+#if MOZ_BIG_ENDIAN
+  const bool swapBytes = (mEndian == kLittleEndian);
+#else
+  const bool swapBytes = (mEndian == kBigEndian);
+#endif
+  const nsresult rv =
+    UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength, swapBytes);
+
+  // A plain NS_OK is reported as NS_OK_UDEC_NOBOMFOUND when no BOM was seen;
+  // any other result of the decoder is passed through unchanged.
+  if (rv == NS_OK && !mFoundBOM) {
+    return NS_OK_UDEC_NOBOMFOUND;
+  }
+  return rv;
+}