summaryrefslogtreecommitdiffstats
path: root/intl/uconv/ucvcn/nsGBKToUnicode.cpp
blob: eba94a5d39b32afb81e89d4aa5fa6a6a5d9e4051 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/**
 * A character set converter from GBK to Unicode.
 * 
 *
 * @created         07/Sept/1999
 * @author  Yueheng Xu, Yueheng.Xu@intel.com
 */

#include "nsGBKToUnicode.h"
#include "gbku.h"
#include "nsUnicodeDecodeHelper.h"

static const uint16_t g_utGB18030Unique2Bytes[] = {
#include "gb18030uniq2b.ut"
};

static const uint16_t g_utGB18030Unique4Bytes[] = {
#include "gb180304bytes.ut"
};

//----------------------------------------------------------------------
// Class nsGB18030ToUnicode [implementation]

//----------------------------------------------------------------------
// Subclassing of nsBufferDecoderSupport class [implementation]

#define LEGAL_GBK_MULTIBYTE_FIRST_BYTE(c)  \
      (UINT8_IN_RANGE(0x81, (c), 0xFE))
#define FIRST_BYTE_IS_SURROGATE(c)  \
      (UINT8_IN_RANGE(0x90, (c), 0xFE))
#define LEGAL_GBK_2BYTE_SECOND_BYTE(c) \
      (UINT8_IN_RANGE(0x40, (c), 0x7E)|| UINT8_IN_RANGE(0x80, (c), 0xFE))
#define LEGAL_GBK_4BYTE_SECOND_BYTE(c) \
      (UINT8_IN_RANGE(0x30, (c), 0x39))
#define LEGAL_GBK_4BYTE_THIRD_BYTE(c)  \
      (UINT8_IN_RANGE(0x81, (c), 0xFE))
#define LEGAL_GBK_4BYTE_FORTH_BYTE(c) \
      (UINT8_IN_RANGE(0x30, (c), 0x39))

NS_IMETHODIMP nsGB18030ToUnicode::ConvertNoBuff(const char* aSrc,
                                                int32_t * aSrcLength,
                                                char16_t *aDest,
                                                int32_t * aDestLength)
{
  int32_t i=0;
  int32_t iSrcLength = (*aSrcLength);
  int32_t iDestlen = 0;
  nsresult rv=NS_OK;
  *aSrcLength = 0;
  
  for (i=0;i<iSrcLength;i++)
  {
    if ( iDestlen >= (*aDestLength) )
    {
      rv = NS_OK_UDEC_MOREOUTPUT;
      break;
    }
    // The valid range for the 1st byte is [0x81,0xFE] 
    if(LEGAL_GBK_MULTIBYTE_FIRST_BYTE(*aSrc))
    {
      if(i+1 >= iSrcLength) 
      {
        rv = NS_OK_UDEC_MOREINPUT;
        break;
      }
      // To make sure, the second byte has to be checked as well.
      // In GBK, the second byte range is [0x40,0x7E] and [0x80,0XFE]
      if(LEGAL_GBK_2BYTE_SECOND_BYTE(aSrc[1]))
      {
        // Valid GBK code
        *aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]);
        if(UCS2_NO_MAPPING == *aDest)
        { 
          // We cannot map in the common mapping, let's call the
          // delegate 2 byte decoder to decode the gbk or gb18030 unique 
          // 2 byte mapping
          if(! TryExtensionDecoder(aSrc, aDest))
          {
            *aDest = UCS2_NO_MAPPING;
          }
        }
        aSrc += 2;
        i++;
      }
      else if (LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]))
      {
        // from the first 2 bytes, it looks like a 4 byte GB18030
        if(i+3 >= iSrcLength)  // make sure we got 4 bytes
        {
          rv = NS_OK_UDEC_MOREINPUT;
          break;
        }
        // 4 bytes patten
        // [0x81-0xfe][0x30-0x39][0x81-0xfe][0x30-0x39]
        // preset the 
 
        if (LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]) &&
            LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]))
        {
           if ( ! FIRST_BYTE_IS_SURROGATE(aSrc[0])) 
           {
             // let's call the delegated 4 byte gb18030 converter to convert it
             if (!Try4BytesDecoder(aSrc, aDest)) {
               *aDest = UCS2_NO_MAPPING;
             }
             // Swapped character in GB18030-2005
             if (*aDest == 0x1E3F) {
               *aDest = 0xE7C7;
             }
           } else {
              // let's try supplement mapping
             if ( (iDestlen+1) < (*aDestLength) )
             {
               if(DecodeToSurrogate(aSrc, aDest))
               {
                 // surrogte two char16_t
                 iDestlen++;
                 aDest++;
               }  else {
                 *aDest = UCS2_NO_MAPPING;
              }
             } else {
               if (*aDestLength < 2) {
                 NS_ERROR("insufficient space in output buffer");
                 *aDest = UCS2_NO_MAPPING;
               } else {
                 rv = NS_OK_UDEC_MOREOUTPUT;
                 break;
               }
             }
           }
           aSrc += 4;
           i += 3;
        } else {
          *aDest = UCS2_NO_MAPPING; 
          // If the third and fourth bytes are not in the legal ranges for
          // a four-byte sequnce, resynchronize on the second byte
          // (which we know is in the range of LEGAL_GBK_4BYTE_SECOND_BYTE,
          //  0x30-0x39)
          aSrc++;
        }
      }
      else if ((uint8_t) aSrc[0] == (uint8_t)0xA0 )
      {
        // stand-alone (not followed by a valid second byte) 0xA0 !
        // treat it as valid a la Netscape 4.x
        *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
        aSrc++;
      } else {
        // Invalid GBK code point (second byte should be 0x40 or higher)
        *aDest = UCS2_NO_MAPPING;
        aSrc++;
      }
    } else {
      if(IS_ASCII(*aSrc))
      {
        // The source is an ASCII
        *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
        aSrc++;
      } else {
        if(IS_GBK_EURO(*aSrc)) {
          *aDest = UCS2_EURO;
        } else {
          *aDest = UCS2_NO_MAPPING;
        }
        aSrc++;
      }
    }
    iDestlen++;
    aDest++;
    *aSrcLength = i+1;
  }
  *aDestLength = iDestlen;
  return rv;
}

bool nsGB18030ToUnicode::DecodeToSurrogate(const char* aSrc, char16_t* aOut)
{
  NS_ASSERTION(FIRST_BYTE_IS_SURROGATE(aSrc[0]),       "illegal first byte");
  NS_ASSERTION(LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]),   "illegal second byte");
  NS_ASSERTION(LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]),    "illegal third byte");
  NS_ASSERTION(LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]),    "illegal forth byte");
  if(! FIRST_BYTE_IS_SURROGATE(aSrc[0]))
    return false;
  if(! LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]))
    return false;
  if(! LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]))
    return false;
  if(! LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]))
    return false;

  uint8_t a1 = (uint8_t) aSrc[0];
  uint8_t a2 = (uint8_t) aSrc[1];
  uint8_t a3 = (uint8_t) aSrc[2];
  uint8_t a4 = (uint8_t) aSrc[3];
  a1 -= (uint8_t)0x90;
  a2 -= (uint8_t)0x30;
  a3 -= (uint8_t)0x81;
  a4 -= (uint8_t)0x30;
  uint32_t idx = (((a1 * 10 + a2 ) * 126 + a3) * 10) + a4;
  // idx == ucs4Codepoint - 0x10000
  if (idx > 0x000FFFFF)
    return false;

  *aOut++ = 0xD800 | (idx >> 10);
  *aOut = 0xDC00 | (0x000003FF & idx);

  return true;
}
bool nsGB18030ToUnicode::TryExtensionDecoder(const char* aSrc, char16_t* aOut)
{
  int32_t len = 2;
  int32_t dstlen = 1;
  nsresult res =
    nsUnicodeDecodeHelper::ConvertByTable(aSrc, &len, aOut, &dstlen,
                                          u2BytesCharset, nullptr,
                                          (uMappingTable*) &g_utGB18030Unique2Bytes,
                                          false);
  NS_ASSERTION(NS_FAILED(res) || ((len==2) && (dstlen == 1)),
               "some strange conversion result");
  // if we failed, we then just use the 0xfffd
  // therefore, we ignore the res here.
  return NS_SUCCEEDED(res);
}

bool nsGB18030ToUnicode::Try4BytesDecoder(const char* aSrc, char16_t* aOut)
{
  int32_t len = 4;
  int32_t dstlen = 1;
  nsresult res =
    nsUnicodeDecodeHelper::ConvertByTable(aSrc, &len, aOut, &dstlen,
                                          u4BytesGB18030Charset, nullptr,
                                          (uMappingTable*) &g_utGB18030Unique4Bytes,
                                          false); 
  NS_ASSERTION(NS_FAILED(res) || ((len==4) && (dstlen == 1)),
               "some strange conversion result");
  // if we failed, we then just use the 0xfffd
  // therefore, we ignore the res here.
  return NS_SUCCEEDED(res);
}