diff options
author | Matt A. Tobin <mattatobin@localhost.localdomain> | 2018-02-02 04:16:08 -0500 |
---|---|---|
committer | Matt A. Tobin <mattatobin@localhost.localdomain> | 2018-02-02 04:16:08 -0500 |
commit | 5f8de423f190bbb79a62f804151bc24824fa32d8 (patch) | |
tree | 10027f336435511475e392454359edea8e25895d /intl/uconv/ucvcn/nsGBKToUnicode.cpp | |
parent | 49ee0794b5d912db1f95dce6eb52d781dc210db5 (diff) | |
download | UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.gz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.lz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.xz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.zip |
Add m-esr52 at 52.6.0
Diffstat (limited to 'intl/uconv/ucvcn/nsGBKToUnicode.cpp')
-rw-r--r-- | intl/uconv/ucvcn/nsGBKToUnicode.cpp | 244 |
1 files changed, 244 insertions, 0 deletions
diff --git a/intl/uconv/ucvcn/nsGBKToUnicode.cpp b/intl/uconv/ucvcn/nsGBKToUnicode.cpp new file mode 100644 index 000000000..eba94a5d3 --- /dev/null +++ b/intl/uconv/ucvcn/nsGBKToUnicode.cpp @@ -0,0 +1,244 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +/** + * A character set converter from GBK to Unicode. + * + * + * @created 07/Sept/1999 + * @author Yueheng Xu, Yueheng.Xu@intel.com + */ + +#include "nsGBKToUnicode.h" +#include "gbku.h" +#include "nsUnicodeDecodeHelper.h" + +static const uint16_t g_utGB18030Unique2Bytes[] = { +#include "gb18030uniq2b.ut" +}; + +static const uint16_t g_utGB18030Unique4Bytes[] = { +#include "gb180304bytes.ut" +}; + +//---------------------------------------------------------------------- +// Class nsGB18030ToUnicode [implementation] + +//---------------------------------------------------------------------- +// Subclassing of nsBufferDecoderSupport class [implementation] + +#define LEGAL_GBK_MULTIBYTE_FIRST_BYTE(c) \ + (UINT8_IN_RANGE(0x81, (c), 0xFE)) +#define FIRST_BYTE_IS_SURROGATE(c) \ + (UINT8_IN_RANGE(0x90, (c), 0xFE)) +#define LEGAL_GBK_2BYTE_SECOND_BYTE(c) \ + (UINT8_IN_RANGE(0x40, (c), 0x7E)|| UINT8_IN_RANGE(0x80, (c), 0xFE)) +#define LEGAL_GBK_4BYTE_SECOND_BYTE(c) \ + (UINT8_IN_RANGE(0x30, (c), 0x39)) +#define LEGAL_GBK_4BYTE_THIRD_BYTE(c) \ + (UINT8_IN_RANGE(0x81, (c), 0xFE)) +#define LEGAL_GBK_4BYTE_FORTH_BYTE(c) \ + (UINT8_IN_RANGE(0x30, (c), 0x39)) + +NS_IMETHODIMP nsGB18030ToUnicode::ConvertNoBuff(const char* aSrc, + int32_t * aSrcLength, + char16_t *aDest, + int32_t * aDestLength) +{ + int32_t i=0; + int32_t iSrcLength = (*aSrcLength); + int32_t iDestlen = 0; + nsresult rv=NS_OK; + *aSrcLength = 0; + + for (i=0;i<iSrcLength;i++) + { + if ( iDestlen >= (*aDestLength) ) + { + rv = NS_OK_UDEC_MOREOUTPUT; + break; + } + // The valid range for the 1st byte is [0x81,0xFE] + if(LEGAL_GBK_MULTIBYTE_FIRST_BYTE(*aSrc)) + { + if(i+1 >= iSrcLength) + { + rv = NS_OK_UDEC_MOREINPUT; + break; + } + // To make sure, the second byte has to be checked as well. + // In GBK, the second byte range is [0x40,0x7E] and [0x80,0XFE] + if(LEGAL_GBK_2BYTE_SECOND_BYTE(aSrc[1])) + { + // Valid GBK code + *aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]); + if(UCS2_NO_MAPPING == *aDest) + { + // We cannot map in the common mapping, let's call the + // delegate 2 byte decoder to decode the gbk or gb18030 unique + // 2 byte mapping + if(! TryExtensionDecoder(aSrc, aDest)) + { + *aDest = UCS2_NO_MAPPING; + } + } + aSrc += 2; + i++; + } + else if (LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1])) + { + // from the first 2 bytes, it looks like a 4 byte GB18030 + if(i+3 >= iSrcLength) // make sure we got 4 bytes + { + rv = NS_OK_UDEC_MOREINPUT; + break; + } + // 4 bytes patten + // [0x81-0xfe][0x30-0x39][0x81-0xfe][0x30-0x39] + // preset the + + if (LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]) && + LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3])) + { + if ( ! FIRST_BYTE_IS_SURROGATE(aSrc[0])) + { + // let's call the delegated 4 byte gb18030 converter to convert it + if (!Try4BytesDecoder(aSrc, aDest)) { + *aDest = UCS2_NO_MAPPING; + } + // Swapped character in GB18030-2005 + if (*aDest == 0x1E3F) { + *aDest = 0xE7C7; + } + } else { + // let's try supplement mapping + if ( (iDestlen+1) < (*aDestLength) ) + { + if(DecodeToSurrogate(aSrc, aDest)) + { + // surrogte two char16_t + iDestlen++; + aDest++; + } else { + *aDest = UCS2_NO_MAPPING; + } + } else { + if (*aDestLength < 2) { + NS_ERROR("insufficient space in output buffer"); + *aDest = UCS2_NO_MAPPING; + } else { + rv = NS_OK_UDEC_MOREOUTPUT; + break; + } + } + } + aSrc += 4; + i += 3; + } else { + *aDest = UCS2_NO_MAPPING; + // If the third and fourth bytes are not in the legal ranges for + // a four-byte sequnce, resynchronize on the second byte + // (which we know is in the range of LEGAL_GBK_4BYTE_SECOND_BYTE, + // 0x30-0x39) + aSrc++; + } + } + else if ((uint8_t) aSrc[0] == (uint8_t)0xA0 ) + { + // stand-alone (not followed by a valid second byte) 0xA0 ! + // treat it as valid a la Netscape 4.x + *aDest = CAST_CHAR_TO_UNICHAR(*aSrc); + aSrc++; + } else { + // Invalid GBK code point (second byte should be 0x40 or higher) + *aDest = UCS2_NO_MAPPING; + aSrc++; + } + } else { + if(IS_ASCII(*aSrc)) + { + // The source is an ASCII + *aDest = CAST_CHAR_TO_UNICHAR(*aSrc); + aSrc++; + } else { + if(IS_GBK_EURO(*aSrc)) { + *aDest = UCS2_EURO; + } else { + *aDest = UCS2_NO_MAPPING; + } + aSrc++; + } + } + iDestlen++; + aDest++; + *aSrcLength = i+1; + } + *aDestLength = iDestlen; + return rv; +} + +bool nsGB18030ToUnicode::DecodeToSurrogate(const char* aSrc, char16_t* aOut) +{ + NS_ASSERTION(FIRST_BYTE_IS_SURROGATE(aSrc[0]), "illegal first byte"); + NS_ASSERTION(LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]), "illegal second byte"); + NS_ASSERTION(LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]), "illegal third byte"); + NS_ASSERTION(LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]), "illegal forth byte"); + if(! FIRST_BYTE_IS_SURROGATE(aSrc[0])) + return false; + if(! LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1])) + return false; + if(! LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2])) + return false; + if(! LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3])) + return false; + + uint8_t a1 = (uint8_t) aSrc[0]; + uint8_t a2 = (uint8_t) aSrc[1]; + uint8_t a3 = (uint8_t) aSrc[2]; + uint8_t a4 = (uint8_t) aSrc[3]; + a1 -= (uint8_t)0x90; + a2 -= (uint8_t)0x30; + a3 -= (uint8_t)0x81; + a4 -= (uint8_t)0x30; + uint32_t idx = (((a1 * 10 + a2 ) * 126 + a3) * 10) + a4; + // idx == ucs4Codepoint - 0x10000 + if (idx > 0x000FFFFF) + return false; + + *aOut++ = 0xD800 | (idx >> 10); + *aOut = 0xDC00 | (0x000003FF & idx); + + return true; +} +bool nsGB18030ToUnicode::TryExtensionDecoder(const char* aSrc, char16_t* aOut) +{ + int32_t len = 2; + int32_t dstlen = 1; + nsresult res = + nsUnicodeDecodeHelper::ConvertByTable(aSrc, &len, aOut, &dstlen, + u2BytesCharset, nullptr, + (uMappingTable*) &g_utGB18030Unique2Bytes, + false); + NS_ASSERTION(NS_FAILED(res) || ((len==2) && (dstlen == 1)), + "some strange conversion result"); + // if we failed, we then just use the 0xfffd + // therefore, we ignore the res here. + return NS_SUCCEEDED(res); +} + +bool nsGB18030ToUnicode::Try4BytesDecoder(const char* aSrc, char16_t* aOut) +{ + int32_t len = 4; + int32_t dstlen = 1; + nsresult res = + nsUnicodeDecodeHelper::ConvertByTable(aSrc, &len, aOut, &dstlen, + u4BytesGB18030Charset, nullptr, + (uMappingTable*) &g_utGB18030Unique4Bytes, + false); + NS_ASSERTION(NS_FAILED(res) || ((len==4) && (dstlen == 1)), + "some strange conversion result"); + // if we failed, we then just use the 0xfffd + // therefore, we ignore the res here. + return NS_SUCCEEDED(res); +} |