diff options
Diffstat (limited to 'security/nss/lib/util/utf8.c')
-rw-r--r-- | security/nss/lib/util/utf8.c | 445 |
1 files changed, 445 insertions, 0 deletions
diff --git a/security/nss/lib/util/utf8.c b/security/nss/lib/util/utf8.c new file mode 100644 index 000000000..7bdd71482 --- /dev/null +++ b/security/nss/lib/util/utf8.c @@ -0,0 +1,445 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "seccomon.h" +#include "secport.h" + +/* + * From RFC 2044: + * + * UCS-4 range (hex.) UTF-8 octet sequence (binary) + * 0000 0000-0000 007F 0xxxxxxx + * 0000 0080-0000 07FF 110xxxxx 10xxxxxx + * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx + * 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + * 0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * 0400 0000-7FFF FFFF 1111110x 10xxxxxx ... 10xxxxxx + */ + +/* + * From http://www.imc.org/draft-hoffman-utf16 + * + * For U on [0x00010000,0x0010FFFF]: Let U' = U - 0x00010000 + * + * U' = yyyyyyyyyyxxxxxxxxxx + * W1 = 110110yyyyyyyyyy + * W2 = 110111xxxxxxxxxx + */ + +/* + * This code is assuming NETWORK BYTE ORDER for the 16- and 32-bit + * character values. If you wish to use this code for working with + * host byte order values, define the following: + * + * #if IS_BIG_ENDIAN + * #define L_0 0 + * #define L_1 1 + * #define L_2 2 + * #define L_3 3 + * #define H_0 0 + * #define H_1 1 + * #else / * not everyone has elif * / + * #if IS_LITTLE_ENDIAN + * #define L_0 3 + * #define L_1 2 + * #define L_2 1 + * #define L_3 0 + * #define H_0 1 + * #define H_1 0 + * #else + * #error "PDP and NUXI support deferred" + * #endif / * IS_LITTLE_ENDIAN * / + * #endif / * IS_BIG_ENDIAN * / + */ + +#define L_0 0 +#define L_1 1 +#define L_2 2 +#define L_3 3 +#define H_0 0 +#define H_1 1 + +#define BAD_UTF8 ((PRUint32)-1) + +/* + * Parse a single UTF-8 character per the spec. in section 3.9 (D36) + * of Unicode 4.0.0. + * + * Parameters: + * index - Points to the byte offset in inBuf of character to read. On success, + * updated to the offset of the following character. + * inBuf - Input buffer, UTF-8 encoded + * inbufLen - Length of input buffer, in bytes. + * + * Returns: + * Success - The UCS4 encoded character + * Failure - BAD_UTF8 + */ +static PRUint32 +sec_port_read_utf8(unsigned int *index, unsigned char *inBuf, unsigned int inBufLen) +{ + PRUint32 result; + unsigned int i = *index; + int bytes_left; + PRUint32 min_value; + + PORT_Assert(i < inBufLen); + + if ((inBuf[i] & 0x80) == 0x00) { + result = inBuf[i++]; + bytes_left = 0; + min_value = 0; + } else if ((inBuf[i] & 0xE0) == 0xC0) { + result = inBuf[i++] & 0x1F; + bytes_left = 1; + min_value = 0x80; + } else if ((inBuf[i] & 0xF0) == 0xE0) { + result = inBuf[i++] & 0x0F; + bytes_left = 2; + min_value = 0x800; + } else if ((inBuf[i] & 0xF8) == 0xF0) { + result = inBuf[i++] & 0x07; + bytes_left = 3; + min_value = 0x10000; + } else { + return BAD_UTF8; + } + + while (bytes_left--) { + if (i >= inBufLen || (inBuf[i] & 0xC0) != 0x80) + return BAD_UTF8; + result = (result << 6) | (inBuf[i++] & 0x3F); + } + + /* Check for overlong sequences, surrogates, and outside unicode range */ + if (result < min_value || (result & 0xFFFFF800) == 0xD800 || result > 0x10FFFF) { + return BAD_UTF8; + } + + *index = i; + return result; +} + +PRBool +sec_port_ucs4_utf8_conversion_function( + PRBool toUnicode, + unsigned char *inBuf, + unsigned int inBufLen, + unsigned char *outBuf, + unsigned int maxOutBufLen, + unsigned int *outBufLen) +{ + PORT_Assert((unsigned int *)NULL != outBufLen); + + if (toUnicode) { + unsigned int i, len = 0; + + for (i = 0; i < inBufLen;) { + if ((inBuf[i] & 0x80) == 0x00) + i += 1; + else if ((inBuf[i] & 0xE0) == 0xC0) + i += 2; + else if ((inBuf[i] & 0xF0) == 0xE0) + i += 3; + else if ((inBuf[i] & 0xF8) == 0xF0) + i += 4; + else + return PR_FALSE; + + len += 4; + } + + if (len > maxOutBufLen) { + *outBufLen = len; + return PR_FALSE; + } + + len = 0; + + for (i = 0; i < inBufLen;) { + PRUint32 ucs4 = sec_port_read_utf8(&i, inBuf, inBufLen); + + if (ucs4 == BAD_UTF8) + return PR_FALSE; + + outBuf[len + L_0] = 0x00; + outBuf[len + L_1] = (unsigned char)(ucs4 >> 16); + outBuf[len + L_2] = (unsigned char)(ucs4 >> 8); + outBuf[len + L_3] = (unsigned char)ucs4; + + len += 4; + } + + *outBufLen = len; + return PR_TRUE; + } else { + unsigned int i, len = 0; + PORT_Assert((inBufLen % 4) == 0); + if ((inBufLen % 4) != 0) { + *outBufLen = 0; + return PR_FALSE; + } + + for (i = 0; i < inBufLen; i += 4) { + if ((inBuf[i + L_0] > 0x00) || (inBuf[i + L_1] > 0x10)) { + *outBufLen = 0; + return PR_FALSE; + } else if (inBuf[i + L_1] >= 0x01) + len += 4; + else if (inBuf[i + L_2] >= 0x08) + len += 3; + else if ((inBuf[i + L_2] > 0x00) || (inBuf[i + L_3] >= 0x80)) + len += 2; + else + len += 1; + } + + if (len > maxOutBufLen) { + *outBufLen = len; + return PR_FALSE; + } + + len = 0; + + for (i = 0; i < inBufLen; i += 4) { + if (inBuf[i + L_1] >= 0x01) { + /* 0001 0000-001F FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ + /* 00000000 000abcde fghijklm nopqrstu -> + 11110abc 10defghi 10jklmno 10pqrstu */ + + outBuf[len + 0] = 0xF0 | ((inBuf[i + L_1] & 0x1C) >> 2); + outBuf[len + 1] = 0x80 | ((inBuf[i + L_1] & 0x03) << 4) | ((inBuf[i + L_2] & 0xF0) >> 4); + outBuf[len + 2] = 0x80 | ((inBuf[i + L_2] & 0x0F) << 2) | ((inBuf[i + L_3] & 0xC0) >> 6); + outBuf[len + 3] = 0x80 | ((inBuf[i + L_3] & 0x3F) >> 0); + + len += 4; + } else if (inBuf[i + L_2] >= 0x08) { + /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */ + /* 00000000 00000000 abcdefgh ijklmnop -> + 1110abcd 10efghij 10klmnop */ + + outBuf[len + 0] = 0xE0 | ((inBuf[i + L_2] & 0xF0) >> 4); + outBuf[len + 1] = 0x80 | ((inBuf[i + L_2] & 0x0F) << 2) | ((inBuf[i + L_3] & 0xC0) >> 6); + outBuf[len + 2] = 0x80 | ((inBuf[i + L_3] & 0x3F) >> 0); + + len += 3; + } else if ((inBuf[i + L_2] > 0x00) || (inBuf[i + L_3] >= 0x80)) { + /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */ + /* 00000000 00000000 00000abc defghijk -> + 110abcde 10fghijk */ + + outBuf[len + 0] = 0xC0 | ((inBuf[i + L_2] & 0x07) << 2) | ((inBuf[i + L_3] & 0xC0) >> 6); + outBuf[len + 1] = 0x80 | ((inBuf[i + L_3] & 0x3F) >> 0); + + len += 2; + } else { + /* 0000 0000-0000 007F -> 0xxxxxx */ + /* 00000000 00000000 00000000 0abcdefg -> + 0abcdefg */ + + outBuf[len + 0] = (inBuf[i + L_3] & 0x7F); + + len += 1; + } + } + + *outBufLen = len; + return PR_TRUE; + } +} + +PRBool +sec_port_ucs2_utf8_conversion_function( + PRBool toUnicode, + unsigned char *inBuf, + unsigned int inBufLen, + unsigned char *outBuf, + unsigned int maxOutBufLen, + unsigned int *outBufLen) +{ + PORT_Assert((unsigned int *)NULL != outBufLen); + + if (toUnicode) { + unsigned int i, len = 0; + + for (i = 0; i < inBufLen;) { + if ((inBuf[i] & 0x80) == 0x00) { + i += 1; + len += 2; + } else if ((inBuf[i] & 0xE0) == 0xC0) { + i += 2; + len += 2; + } else if ((inBuf[i] & 0xF0) == 0xE0) { + i += 3; + len += 2; + } else if ((inBuf[i] & 0xF8) == 0xF0) { + i += 4; + len += 4; + } else + return PR_FALSE; + } + + if (len > maxOutBufLen) { + *outBufLen = len; + return PR_FALSE; + } + + len = 0; + + for (i = 0; i < inBufLen;) { + PRUint32 ucs4 = sec_port_read_utf8(&i, inBuf, inBufLen); + + if (ucs4 == BAD_UTF8) + return PR_FALSE; + + if (ucs4 < 0x10000) { + outBuf[len + H_0] = (unsigned char)(ucs4 >> 8); + outBuf[len + H_1] = (unsigned char)ucs4; + len += 2; + } else { + ucs4 -= 0x10000; + outBuf[len + 0 + H_0] = (unsigned char)(0xD8 | ((ucs4 >> 18) & 0x3)); + outBuf[len + 0 + H_1] = (unsigned char)(ucs4 >> 10); + outBuf[len + 2 + H_0] = (unsigned char)(0xDC | ((ucs4 >> 8) & 0x3)); + outBuf[len + 2 + H_1] = (unsigned char)ucs4; + len += 4; + } + } + + *outBufLen = len; + return PR_TRUE; + } else { + unsigned int i, len = 0; + PORT_Assert((inBufLen % 2) == 0); + if ((inBufLen % 2) != 0) { + *outBufLen = 0; + return PR_FALSE; + } + + for (i = 0; i < inBufLen; i += 2) { + if ((inBuf[i + H_0] == 0x00) && ((inBuf[i + H_1] & 0x80) == 0x00)) + len += 1; + else if (inBuf[i + H_0] < 0x08) + len += 2; + else if (((inBuf[i + H_0] & 0xFC) == 0xD8)) { + if (((inBufLen - i) > 2) && ((inBuf[i + 2 + H_0] & 0xFC) == 0xDC)) { + i += 2; + len += 4; + } else { + return PR_FALSE; + } + } else if ((inBuf[i + H_0] & 0xFC) == 0xDC) { + return PR_FALSE; + } else { + len += 3; + } + } + + if (len > maxOutBufLen) { + *outBufLen = len; + return PR_FALSE; + } + + len = 0; + + for (i = 0; i < inBufLen; i += 2) { + if ((inBuf[i + H_0] == 0x00) && ((inBuf[i + H_1] & 0x80) == 0x00)) { + /* 0000-007F -> 0xxxxxx */ + /* 00000000 0abcdefg -> 0abcdefg */ + + outBuf[len] = inBuf[i + H_1] & 0x7F; + + len += 1; + } else if (inBuf[i + H_0] < 0x08) { + /* 0080-07FF -> 110xxxxx 10xxxxxx */ + /* 00000abc defghijk -> 110abcde 10fghijk */ + + outBuf[len + 0] = 0xC0 | ((inBuf[i + H_0] & 0x07) << 2) | ((inBuf[i + H_1] & 0xC0) >> 6); + outBuf[len + 1] = 0x80 | ((inBuf[i + H_1] & 0x3F) >> 0); + + len += 2; + } else if ((inBuf[i + H_0] & 0xFC) == 0xD8) { + int abcde, BCDE; + + PORT_Assert(((inBufLen - i) > 2) && ((inBuf[i + 2 + H_0] & 0xFC) == 0xDC)); + + /* D800-DBFF DC00-DFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ + /* 110110BC DEfghijk 110111lm nopqrstu -> + { Let abcde = BCDE + 1 } + 11110abc 10defghi 10jklmno 10pqrstu */ + + BCDE = ((inBuf[i + H_0] & 0x03) << 2) | ((inBuf[i + H_1] & 0xC0) >> 6); + abcde = BCDE + 1; + + outBuf[len + 0] = 0xF0 | ((abcde & 0x1C) >> 2); + outBuf[len + 1] = 0x80 | ((abcde & 0x03) << 4) | ((inBuf[i + 0 + H_1] & 0x3C) >> 2); + outBuf[len + 2] = 0x80 | ((inBuf[i + 0 + H_1] & 0x03) << 4) | ((inBuf[i + 2 + H_0] & 0x03) << 2) | ((inBuf[i + 2 + H_1] & 0xC0) >> 6); + outBuf[len + 3] = 0x80 | ((inBuf[i + 2 + H_1] & 0x3F) >> 0); + + i += 2; + len += 4; + } else { + /* 0800-FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */ + /* abcdefgh ijklmnop -> 1110abcd 10efghij 10klmnop */ + + outBuf[len + 0] = 0xE0 | ((inBuf[i + H_0] & 0xF0) >> 4); + outBuf[len + 1] = 0x80 | ((inBuf[i + H_0] & 0x0F) << 2) | ((inBuf[i + H_1] & 0xC0) >> 6); + outBuf[len + 2] = 0x80 | ((inBuf[i + H_1] & 0x3F) >> 0); + + len += 3; + } + } + + *outBufLen = len; + return PR_TRUE; + } +} + +PRBool +sec_port_iso88591_utf8_conversion_function( + const unsigned char *inBuf, + unsigned int inBufLen, + unsigned char *outBuf, + unsigned int maxOutBufLen, + unsigned int *outBufLen) +{ + unsigned int i, len = 0; + + PORT_Assert((unsigned int *)NULL != outBufLen); + + for (i = 0; i < inBufLen; i++) { + if ((inBuf[i] & 0x80) == 0x00) + len += 1; + else + len += 2; + } + + if (len > maxOutBufLen) { + *outBufLen = len; + return PR_FALSE; + } + + len = 0; + + for (i = 0; i < inBufLen; i++) { + if ((inBuf[i] & 0x80) == 0x00) { + /* 00-7F -> 0xxxxxxx */ + /* 0abcdefg -> 0abcdefg */ + + outBuf[len] = inBuf[i]; + len += 1; + } else { + /* 80-FF <- 110xxxxx 10xxxxxx */ + /* 00000000 abcdefgh -> 110000ab 10cdefgh */ + + outBuf[len + 0] = 0xC0 | ((inBuf[i] & 0xC0) >> 6); + outBuf[len + 1] = 0x80 | ((inBuf[i] & 0x3F) >> 0); + + len += 2; + } + } + + *outBufLen = len; + return PR_TRUE; +} |