diff options
Diffstat (limited to 'intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp')
-rw-r--r-- | intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp | 361 |
1 files changed, 361 insertions, 0 deletions
diff --git a/intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp b/intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp new file mode 100644 index 000000000..56c88ff3e --- /dev/null +++ b/intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp @@ -0,0 +1,361 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsUTF16ToUnicode.h" +#include "nsCharTraits.h" +#include "mozilla/CheckedInt.h" +#include "mozilla/EndianUtils.h" + +enum { + STATE_NORMAL = 0, + STATE_HALF_CODE_POINT = 1, + STATE_FIRST_CALL = 2, + STATE_SECOND_BYTE = STATE_FIRST_CALL | STATE_HALF_CODE_POINT, + STATE_ODD_SURROGATE_PAIR = 4 +}; + +nsresult +nsUTF16ToUnicodeBase::UTF16ConvertToUnicode(const char * aSrc, + int32_t * aSrcLength, + char16_t * aDest, + int32_t * aDestLength, + bool aSwapBytes) +{ + const char* src = aSrc; + const char* srcEnd = aSrc + *aSrcLength; + char16_t* dest = aDest; + char16_t* destEnd = aDest + *aDestLength; + char16_t oddHighSurrogate; + + switch(mState) { + case STATE_FIRST_CALL: + NS_ASSERTION(*aSrcLength > 1, "buffer too short"); + src+=2; + mState = STATE_NORMAL; + break; + + case STATE_SECOND_BYTE: + NS_ASSERTION(*aSrcLength > 0, "buffer too short"); + src++; + mState = STATE_NORMAL; + break; + + case STATE_ODD_SURROGATE_PAIR: + if (*aDestLength < 2) + goto error; + else { + *dest++ = mOddHighSurrogate; + *dest++ = mOddLowSurrogate; + mOddHighSurrogate = mOddLowSurrogate = 0; + mState = STATE_NORMAL; + } + break; + + case STATE_NORMAL: + case STATE_HALF_CODE_POINT: + default: + break; + } + + oddHighSurrogate = mOddHighSurrogate; + + if (src == srcEnd) { + *aDestLength = dest - aDest; + return (mState != STATE_NORMAL || oddHighSurrogate) ? + NS_OK_UDEC_MOREINPUT : NS_OK; + } + + const char* srcEvenEnd; + + char16_t u; + if (mState == STATE_HALF_CODE_POINT) { + if (dest == destEnd) + goto error; + + // the 1st byte of a 16-bit code unit was stored in |mOddByte| in the + // previous run while the 2nd byte has to come from |*src|. + mState = STATE_NORMAL; +#if MOZ_BIG_ENDIAN + u = (mOddByte << 8) | uint8_t(*src++); // safe, we know we have at least one byte. +#else + u = (*src++ << 8) | mOddByte; // safe, we know we have at least one byte. +#endif + srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop + goto have_codepoint; + } else { + srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop + } + + while (src != srcEvenEnd) { + if (dest == destEnd) + goto error; + +#if !defined(__sparc__) && !defined(__arm__) + u = *(const char16_t*)src; +#else + memcpy(&u, src, 2); +#endif + src += 2; + +have_codepoint: + if (aSwapBytes) + u = u << 8 | u >> 8; + + if (!IS_SURROGATE(u)) { + if (oddHighSurrogate) { + if (mErrBehavior == kOnError_Signal) { + goto error2; + } + *dest++ = UCS2_REPLACEMENT_CHAR; + if (dest == destEnd) + goto error; + oddHighSurrogate = 0; + } + *dest++ = u; + } else if (NS_IS_HIGH_SURROGATE(u)) { + if (oddHighSurrogate) { + if (mErrBehavior == kOnError_Signal) { + goto error2; + } + *dest++ = UCS2_REPLACEMENT_CHAR; + if (dest == destEnd) + goto error; + } + oddHighSurrogate = u; + } + else /* if (NS_IS_LOW_SURROGATE(u)) */ { + if (oddHighSurrogate && *aDestLength > 1) { + if (dest + 1 >= destEnd) { + mOddLowSurrogate = u; + mOddHighSurrogate = oddHighSurrogate; + mState = STATE_ODD_SURROGATE_PAIR; + goto error; + } + *dest++ = oddHighSurrogate; + *dest++ = u; + } else { + if (mErrBehavior == kOnError_Signal) { + goto error2; + } + *dest++ = UCS2_REPLACEMENT_CHAR; + } + oddHighSurrogate = 0; + } + } + if (src != srcEnd) { + // store the lead byte of a 16-bit unit for the next run. + mOddByte = *src++; + mState = STATE_HALF_CODE_POINT; + } + + mOddHighSurrogate = oddHighSurrogate; + + *aDestLength = dest - aDest; + *aSrcLength = src - aSrc; + return (mState != STATE_NORMAL || oddHighSurrogate) ? + NS_OK_UDEC_MOREINPUT : NS_OK; + +error: + *aDestLength = dest - aDest; + *aSrcLength = src - aSrc; + return NS_OK_UDEC_MOREOUTPUT; + +error2: + *aDestLength = dest - aDest; + *aSrcLength = --src - aSrc; + return NS_ERROR_ILLEGAL_INPUT; +} + +NS_IMETHODIMP +nsUTF16ToUnicodeBase::Reset() +{ + mState = STATE_FIRST_CALL; + mOddByte = 0; + mOddHighSurrogate = 0; + mOddLowSurrogate = 0; + return NS_OK; +} + +NS_IMETHODIMP +nsUTF16ToUnicodeBase::GetMaxLength(const char * aSrc, int32_t aSrcLength, + int32_t * aDestLength) +{ + mozilla::CheckedInt32 length = aSrcLength; + + if (STATE_HALF_CODE_POINT & mState) { + length += 1; + } + + if (!length.isValid()) { + return NS_ERROR_OUT_OF_MEMORY; + } + + // the left-over data of the previous run have to be taken into account. + *aDestLength = length.value() / 2; + if (mOddHighSurrogate) + (*aDestLength)++; + if (mOddLowSurrogate) + (*aDestLength)++; + return NS_OK; +} + + +NS_IMETHODIMP +nsUTF16BEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength, + char16_t * aDest, int32_t * aDestLength) +{ + switch (mState) { + case STATE_FIRST_CALL: + if (*aSrcLength < 2) { + if (*aSrcLength < 1) { + *aDestLength = 0; + return NS_OK; + } + if (uint8_t(*aSrc) != 0xFE) { + mState = STATE_NORMAL; + break; + } + *aDestLength = 0; + mState = STATE_SECOND_BYTE; + return NS_OK_UDEC_MOREINPUT; + } +#if MOZ_LITTLE_ENDIAN + // on LE machines, BE BOM is 0xFFFE + if (0xFFFE != *((char16_t*)aSrc)) { + mState = STATE_NORMAL; + } +#else + if (0xFEFF != *((char16_t*)aSrc)) { + mState = STATE_NORMAL; + } +#endif + break; + + case STATE_SECOND_BYTE: + if (*aSrcLength < 1) { + *aDestLength = 0; + return NS_OK_UDEC_MOREINPUT; + } + if (uint8_t(*aSrc) != 0xFF) { + mOddByte = 0xFE; + mState = STATE_HALF_CODE_POINT; + } + break; + } + + return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength, + bool(MOZ_LITTLE_ENDIAN)); +} + +NS_IMETHODIMP +nsUTF16LEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength, + char16_t * aDest, int32_t * aDestLength) +{ + switch (mState) { + case STATE_FIRST_CALL: + if (*aSrcLength < 2) { + if (*aSrcLength < 1) { + *aDestLength = 0; + return NS_OK; + } + if (uint8_t(*aSrc) != 0xFF) { + mState = STATE_NORMAL; + break; + } + *aDestLength = 0; + mState = STATE_SECOND_BYTE; + return NS_OK_UDEC_MOREINPUT; + } +#if MOZ_BIG_ENDIAN + // on BE machines, LE BOM is 0xFFFE + if (0xFFFE != *((char16_t*)aSrc)) { + mState = STATE_NORMAL; + } +#else + if (0xFEFF != *((char16_t*)aSrc)) { + mState = STATE_NORMAL; + } +#endif + break; + + case STATE_SECOND_BYTE: + if (*aSrcLength < 1) { + *aDestLength = 0; + return NS_OK_UDEC_MOREINPUT; + } + if (uint8_t(*aSrc) != 0xFE) { + mOddByte = 0xFF; + mState = STATE_HALF_CODE_POINT; + } + break; + } + + return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength, + bool(MOZ_BIG_ENDIAN)); +} + +NS_IMETHODIMP +nsUTF16ToUnicode::Reset() +{ + mEndian = kUnknown; + mFoundBOM = false; + return nsUTF16ToUnicodeBase::Reset(); +} + +NS_IMETHODIMP +nsUTF16ToUnicode::Convert(const char * aSrc, int32_t * aSrcLength, + char16_t * aDest, int32_t * aDestLength) +{ + if(STATE_FIRST_CALL == mState && *aSrcLength < 2) + { + nsresult res = (*aSrcLength == 0) ? NS_OK : NS_ERROR_ILLEGAL_INPUT; + *aSrcLength=0; + *aDestLength=0; + return res; + } + if(STATE_FIRST_CALL == mState) // first time called + { + // check if BOM (0xFEFF) is at the beginning, remove it if found, and + // set mEndian accordingly. + if(0xFF == uint8_t(aSrc[0]) && 0xFE == uint8_t(aSrc[1])) { + mEndian = kLittleEndian; + mFoundBOM = true; + } + else if(0xFE == uint8_t(aSrc[0]) && 0xFF == uint8_t(aSrc[1])) { + mEndian = kBigEndian; + mFoundBOM = true; + } + // BOM is not found, but we can use a simple heuristic to determine + // the endianness. Assume the first character is [U+0001, U+00FF]. + // Not always valid, but it's very likely to hold for html/xml/css. + else if(!aSrc[0] && aSrc[1]) { // 0x00 0xhh (hh != 00) + mState = STATE_NORMAL; + mEndian = kBigEndian; + } + else if(aSrc[0] && !aSrc[1]) { // 0xhh 0x00 (hh != 00) + mState = STATE_NORMAL; + mEndian = kLittleEndian; + } + else { // Neither BOM nor 'plausible' byte patterns at the beginning. + // Just assume it's BE (following Unicode standard) + // and let the garbage show up in the browser. (security concern?) + // (bug 246194) + mState = STATE_NORMAL; + mEndian = kBigEndian; + } + } + + nsresult rv = UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength, +#if MOZ_BIG_ENDIAN + (mEndian == kLittleEndian) +#else + (mEndian == kBigEndian) +#endif + ); + + // If BOM is not found and we're to return NS_OK, signal that BOM + // is not found. Otherwise, return |rv| from |UTF16ConvertToUnicode| + return (rv == NS_OK && !mFoundBOM) ? NS_OK_UDEC_NOBOMFOUND : rv; +} |