diff options
Diffstat (limited to 'intl/uconv/util/uscan.c')
-rw-r--r-- | intl/uconv/util/uscan.c | 759 |
1 files changed, 759 insertions, 0 deletions
diff --git a/intl/uconv/util/uscan.c b/intl/uconv/util/uscan.c new file mode 100644 index 000000000..0abdd0c00 --- /dev/null +++ b/intl/uconv/util/uscan.c @@ -0,0 +1,759 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#include "unicpriv.h" +#define CHK_GR94(b) ( (uint8_t) 0xa0 < (uint8_t) (b) && (uint8_t) (b) < (uint8_t) 0xff ) +#define CHK_GR94_2Byte(b1,b2) (CHK_GR94(b1) && CHK_GR94(b2)) +/*================================================================================= + +=================================================================================*/ +typedef int (*uSubScannerFunc) (unsigned char* in, uint16_t* out); +/*================================================================================= + +=================================================================================*/ + +typedef int (*uScannerFunc) ( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ); + +int uScan( + uScanClassID scanClass, + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ); + +#define uSubScanner(sub,in,out) (* m_subscanner[sub])((in),(out)) + +int uCheckAndScanAlways1Byte( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ); +int uCheckAndScanAlways2Byte( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ); +int uCheckAndScanAlways2ByteShiftGR( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ); +int uCheckAndScanAlways2ByteGR128( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ); +int uScanShift( + uShiftInTable *shift, + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ); + +int uCheckAndScan2ByteGRPrefix8F( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ); +int uCheckAndScan2ByteGRPrefix8EA2( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ); +int uCheckAndScan2ByteGRPrefix8EA3( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ); +int uCheckAndScan2ByteGRPrefix8EA4( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ); +int uCheckAndScan2ByteGRPrefix8EA5( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ); +int uCheckAndScan2ByteGRPrefix8EA6( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ); +int uCheckAndScan2ByteGRPrefix8EA7( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ); +int uCnSAlways8BytesDecomposedHangul( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ); +int uCheckAndScanJohabHangul( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ); +int uCheckAndScanJohabSymbol( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ); + +int uCheckAndScan4BytesGB18030( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ); + +int uScanAlways2Byte( + unsigned char* in, + uint16_t* out + ); +int uScanAlways2ByteShiftGR( + unsigned char* in, + uint16_t* out + ); +int uScanAlways1Byte( + unsigned char* in, + uint16_t* out + ); +int uScanAlways1BytePrefix8E( + unsigned char* in, + uint16_t* out + ); +/*================================================================================= + +=================================================================================*/ +const uScannerFunc m_scanner[uNumOfCharsetType] = +{ + uCheckAndScanAlways1Byte, + uCheckAndScanAlways2Byte, + uCheckAndScanAlways2ByteShiftGR, + uCheckAndScan2ByteGRPrefix8F, + uCheckAndScan2ByteGRPrefix8EA2, + uCheckAndScan2ByteGRPrefix8EA3, + uCheckAndScan2ByteGRPrefix8EA4, + uCheckAndScan2ByteGRPrefix8EA5, + uCheckAndScan2ByteGRPrefix8EA6, + uCheckAndScan2ByteGRPrefix8EA7, + uCnSAlways8BytesDecomposedHangul, + uCheckAndScanJohabHangul, + uCheckAndScanJohabSymbol, + uCheckAndScan4BytesGB18030, + uCheckAndScanAlways2ByteGR128 +}; + +/*================================================================================= + +=================================================================================*/ + +const uSubScannerFunc m_subscanner[uNumOfCharType] = +{ + uScanAlways1Byte, + uScanAlways2Byte, + uScanAlways2ByteShiftGR, + uScanAlways1BytePrefix8E +}; +/*================================================================================= + +=================================================================================*/ +int uScan( + uScanClassID scanClass, + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ) +{ + return (* m_scanner[scanClass]) (state,in,out,inbuflen,inscanlen); +} +/*================================================================================= + +=================================================================================*/ +int uScanAlways1Byte( + unsigned char* in, + uint16_t* out + ) +{ + *out = (uint16_t) in[0]; + return 1; +} + +/*================================================================================= + +=================================================================================*/ +int uScanAlways2Byte( + unsigned char* in, + uint16_t* out + ) +{ + *out = (uint16_t) (( in[0] << 8) | (in[1])); + return 1; +} +/*================================================================================= + +=================================================================================*/ +int uScanAlways2ByteShiftGR( + unsigned char* in, + uint16_t* out + ) +{ + *out = (uint16_t) ((( in[0] << 8) | (in[1])) & 0x7F7F); + return 1; +} + +/*================================================================================= + +=================================================================================*/ +int uScanAlways1BytePrefix8E( + unsigned char* in, + uint16_t* out + ) +{ + *out = (uint16_t) in[1]; + return 1; +} +/*================================================================================= + +=================================================================================*/ +int uCheckAndScanAlways1Byte( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ) +{ + /* Don't check inlen. The caller should ensure it is larger than 0 */ + *inscanlen = 1; + *out = (uint16_t) in[0]; + + return 1; +} + +/*================================================================================= + +=================================================================================*/ +int uCheckAndScanAlways2Byte( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ) +{ + if(inbuflen < 2) + return 0; + else + { + *inscanlen = 2; + *out = ((in[0] << 8) | ( in[1])) ; + return 1; + } +} +/*================================================================================= + +=================================================================================*/ +int uCheckAndScanAlways2ByteShiftGR( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ) +{ + /* + * Both bytes should be in the range of [0xa1,0xfe] for 94x94 character sets + * invoked on GR. No encoding implemented in Mozilla uses 96x96 char. sets. + * Only 2nd byte range needs to be checked because + * 1st byte is checked before calling this in nsUnicodeDecoerHelper.cpp + */ + if(inbuflen < 2) /* will lead to NS_OK_UDEC_MOREINPUT */ + return 0; + else if (! CHK_GR94(in[1])) + { + *inscanlen = 2; + *out = 0xFF; /* for 2-byte table, uMap() is guaranteed to fail for 0xFF. */ + return 1; + } + else + { + *inscanlen = 2; + *out = (((in[0] << 8) | ( in[1])) & 0x7F7F); + return 1; + } +} +/*================================================================================= + +=================================================================================*/ +int uCheckAndScanAlways2ByteGR128( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ) +{ + /* + * The first byte should be in [0xa1,0xfe] + * and the second byte in [0x41,0xfe] + * Used by CP949 -> Unicode converter. + * Only 2nd byte range needs to be checked because + * 1st byte is checked before calling this in nsUnicodeDecoderHelper.cpp + */ + if(inbuflen < 2) /* will lead to NS_OK_UDEC_MOREINPUT */ + return 0; + else if (in[1] < 0x41) /* 2nd byte range check */ + { + *inscanlen = 2; + *out = 0xFF; /* for 2-byte table, uMap() is guaranteed to fail for 0xFF. */ + return 1; + } + else + { + *inscanlen = 2; + *out = (in[0] << 8) | in[1]; + return 1; + } +} +/*================================================================================= + +=================================================================================*/ +int uScanShift( + uShiftInTable *shift, + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ) +{ + int16_t i; + const uShiftInCell* cell = &(shift->shiftcell[0]); + int16_t itemnum = shift->numOfItem; + for(i=0;i<itemnum;i++) + { + if( ( in[0] >= cell[i].shiftin_Min) && + ( in[0] <= cell[i].shiftin_Max)) + { + if(inbuflen < cell[i].reserveLen) + return 0; + else + { + *inscanlen = cell[i].reserveLen; + return (uSubScanner(cell[i].classID,in,out)); + } + } + } + return 0; +} +/*================================================================================= + +=================================================================================*/ +int uCheckAndScan2ByteGRPrefix8F( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ) +{ + if((inbuflen < 3) ||(in[0] != 0x8F)) + return 0; + else if (! CHK_GR94(in[1])) /* 2nd byte range check */ + { + *inscanlen = 2; + *out = 0xFF; /* for 2-byte table, uMap() is guaranteed to fail for 0xFF. */ + return 1; + } + else if (! CHK_GR94(in[2])) /* 3rd byte range check */ + { + *inscanlen = 3; + *out = 0xFF; /* for 2-byte table, uMap() is guaranteed to fail for 0xFF. */ + return 1; + } + else + { + *inscanlen = 3; + *out = (((in[1] << 8) | ( in[2])) & 0x7F7F); + return 1; + } +} +/*================================================================================= + +=================================================================================*/ + +/* Macro definition to use for uCheckAndScan2ByteGRPrefix8EAX() + * where X is 2,3,4,5,6,7 + */ +#define CNS_8EAX_4BYTE(PREFIX) \ + if((inbuflen < 4) || (in[0] != 0x8E)) \ + return 0; \ + else if((in[1] != (PREFIX))) \ + { \ + *inscanlen = 2; \ + *out = 0xFF; \ + return 1; \ + } \ + else if(! CHK_GR94(in[2])) \ + { \ + *inscanlen = 3; \ + *out = 0xFF; \ + return 1; \ + } \ + else if(! CHK_GR94(in[3])) \ + { \ + *inscanlen = 4; \ + *out = 0xFF; \ + return 1; \ + } \ + else \ + { \ + *inscanlen = 4; \ + *out = (((in[2] << 8) | ( in[3])) & 0x7F7F); \ + return 1; \ + } + +int uCheckAndScan2ByteGRPrefix8EA2( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ) +{ + CNS_8EAX_4BYTE(0xA2) +} + +/*================================================================================= + +=================================================================================*/ +int uCheckAndScan2ByteGRPrefix8EA3( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ) +{ + CNS_8EAX_4BYTE(0xA3) +} +/*================================================================================= + +=================================================================================*/ +int uCheckAndScan2ByteGRPrefix8EA4( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ) +{ + CNS_8EAX_4BYTE(0xA4) +} +/*================================================================================= + +=================================================================================*/ +int uCheckAndScan2ByteGRPrefix8EA5( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ) +{ + CNS_8EAX_4BYTE(0xA5) +} +/*================================================================================= + +=================================================================================*/ +int uCheckAndScan2ByteGRPrefix8EA6( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ) +{ + CNS_8EAX_4BYTE(0xA6) +} +/*================================================================================= + +=================================================================================*/ +int uCheckAndScan2ByteGRPrefix8EA7( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ) +{ + CNS_8EAX_4BYTE(0xA7) +} +/*================================================================================= + +=================================================================================*/ +#define SBase 0xAC00 +#define SCount 11172 +#define LCount 19 +#define VCount 21 +#define TCount 28 +#define NCount (VCount * TCount) + +int uCnSAlways8BytesDecomposedHangul( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ) +{ + + uint16_t LIndex, VIndex, TIndex; + /* no 8 bytes, not in a4 range, or the first 2 byte are not a4d4 */ + if((inbuflen < 8) || (0xa4 != in[0]) || (0xd4 != in[1]) || + (0xa4 != in[2] ) || (0xa4 != in[4]) || (0xa4 != in[6])) + return 0; + + /* Compute LIndex */ + if((in[3] < 0xa1) || (in[3] > 0xbe)) { /* illegal leading consonant */ + return 0; + } + else { + static const uint8_t lMap[] = { + /* A1 A2 A3 A4 A5 A6 A7 */ + 0, 1,0xff, 2,0xff,0xff, 3, + /* A8 A9 AA AB AC AD AE AF */ + 4, 5,0xff,0xff,0xff,0xff,0xff,0xff, + /* B0 B1 B2 B3 B4 B5 B6 B7 */ + 0xff, 6, 7, 8,0xff, 9, 10, 11, + /* B8 B9 BA BB BC BD BE */ + 12, 13, 14, 15, 16, 17, 18 + }; + + LIndex = lMap[in[3] - 0xa1]; + if(0xff == (0xff & LIndex)) + return 0; + } + + /* Compute VIndex */ + if((in[5] < 0xbf) || (in[5] > 0xd3)) { /* illegal medial vowel */ + return 0; + } + else { + VIndex = in[5] - 0xbf; + } + + /* Compute TIndex */ + if(0xd4 == in[7]) + { + TIndex = 0; + } + else if((in[7] < 0xa1) || (in[7] > 0xbe)) {/* illegal trailing consonant */ + return 0; + } + else { + static const uint8_t tMap[] = { + /* A1 A2 A3 A4 A5 A6 A7 */ + 1, 2, 3, 4, 5, 6, 7, + /* A8 A9 AA AB AC AD AE AF */ + 0xff, 8, 9, 10, 11, 12, 13, 14, + /* B0 B1 B2 B3 B4 B5 B6 B7 */ + 15, 16, 17,0xff, 18, 19, 20, 21, + /* B8 B9 BA BB BC BD BE */ + 22,0xff, 23, 24, 25, 26, 27 + }; + TIndex = tMap[in[7] - 0xa1]; + if(0xff == (0xff & TIndex)) + return 0; + } + + *inscanlen = 8; + /* the following line is from Unicode 2.0 page 3-13 item 5 */ + *out = ( LIndex * VCount + VIndex) * TCount + TIndex + SBase; + + return 1; +} +/*================================================================================= + +=================================================================================*/ + +int uCheckAndScanJohabHangul( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ) +{ +/* since we don't have code to convert Johab to Unicode right now * + * make this part of code #if 0 to save space until we fully test it */ + if(inbuflen < 2) + return 0; + else { + /* + * See Table 4-45 Johab Encoding's Five-Bit Binary Patterns in page 183 + * of "CJKV Information Processing" for details + */ + static const uint8_t lMap[32]={ /* totaly 19 */ + 0xff,0xff,0, 1, 2, 3, 4, 5, /* 0-7 */ + 6, 7, 8, 9, 10, 11, 12, 13, /* 8-15 */ + 14, 15, 16, 17, 18, 0xff,0xff,0xff, /* 16-23 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff /* 24-31 */ + }; + static const uint8_t vMap[32]={ /* totaly 21 */ + 0xff,0xff,0xff,0, 1, 2, 3, 4, /* 0-7 */ + 0xff,0xff,5, 6, 7, 8, 9, 10, /* 8-15 */ + 0xff,0xff,11, 12, 13, 14, 15, 16, /* 16-23 */ + 0xff,0xff,17, 18, 19, 20, 0xff,0xff /* 24-31 */ + }; + static const uint8_t tMap[32]={ /* totaly 29 */ + 0xff,0, 1, 2, 3, 4, 5, 6, /* 0-7 */ + 7, 8, 9, 10, 11, 12, 13, 14, /* 8-15 */ + 15, 16, 0xff,17, 18, 19, 20, 21, /* 16-23 */ + 22, 23, 24, 25, 26, 27, 0xff,0xff /* 24-31 */ + }; + uint16_t ch = (in[0] << 8) | in[1]; + uint16_t LIndex, VIndex, TIndex; + if(0 == (0x8000 & ch)) + return 0; + LIndex=lMap[(ch>>10)& 0x1F]; + VIndex=vMap[(ch>>5) & 0x1F]; + TIndex=tMap[(ch>>0) & 0x1F]; + if((0xff==(LIndex)) || + (0xff==(VIndex)) || + (0xff==(TIndex))) + return 0; + /* the following line is from Unicode 2.0 page 3-13 item 5 */ + *out = ( LIndex * VCount + VIndex) * TCount + TIndex + SBase; + *inscanlen = 2; + return 1; + } +} +int uCheckAndScanJohabSymbol( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ) +{ + if(inbuflen < 2) + return 0; + else { + /* + * The following code are based on the Perl code lised under + * "Johab to ISO-2022-KR or EUC-KR Conversion" in page 1014 of + * "CJKV Information Processing" by Ken Lunde <lunde@adobe.com> + * + * sub johab2ks ($) { # Convert Johab to ISO-2022-KR + * my @johab = unpack("C*", $_[0]); + * my ($offset, $d8_off) = (0,0); + * my @out = (); + * while(($hi, $lo) = splice($johab, 0, 2)) { + * $offset = 1 if ($hi > 223 and $hi < 250); + * $d8_off = ($hi == 216 and ($lo > 160 ? 94 : 42)); + * push (@out, (((($hi - ($hi < 223 ? 200 : 187)) << 1) - + * ($lo < 161 ? 1 : 0) + $offset) + $d8_off), + * $lo - ($lo < 161 ? ($lo > 126 ? 34 : 16) : 128 )); + * } + * return pack ("C*", @out); + * } + * additional comments from Ken Lunde + * $d8_off = ($hi == 216 and ($lo > 160 ? 94 : 42)); + * has three possible return values: + * 0 if $hi is not equal to 216 + * 94 if $hi is euqal to 216 and if $lo is greater than 160 + * 42 if $hi is euqal to 216 and if $lo is not greater than 160 + */ + unsigned char hi = in[0]; + unsigned char lo = in[1]; + uint16_t offset = (( hi > 223 ) && ( hi < 250)) ? 1 : 0; + uint16_t d8_off = 0; + if(216 == hi) { + if( lo > 160) + d8_off = 94; + else + d8_off = 42; + } + + *out = (((((hi - ((hi < 223) ? 200 : 187)) << 1) - + (lo < 161 ? 1 : 0) + offset) + d8_off) << 8 ) | + (lo - ((lo < 161) ? ((lo > 126) ? 34 : 16) : + 128)); + *inscanlen = 2; + return 1; + } +} +int uCheckAndScan4BytesGB18030( + int32_t* state, + unsigned char *in, + uint16_t *out, + uint32_t inbuflen, + uint32_t* inscanlen + ) +{ + uint32_t data; + if(inbuflen < 4) + return 0; + + if((in[0] < 0x81 ) || (0xfe < in[0])) + return 0; + if((in[1] < 0x30 ) || (0x39 < in[1])) + return 0; + if((in[2] < 0x81 ) || (0xfe < in[2])) + return 0; + if((in[3] < 0x30 ) || (0x39 < in[3])) + return 0; + + data = (((((in[0] - 0x81) * 10 + (in[1] - 0x30)) * 126) + + (in[2] - 0x81)) * 10 ) + (in[3] - 0x30); + + *inscanlen = 4; + *out = (data < 0x00010000) ? data : 0xFFFD; + return 1; +} |