diff options
Diffstat (limited to 'intl/uconv/tests/unit/test_charset_conversion.js')
-rw-r--r-- | intl/uconv/tests/unit/test_charset_conversion.js | 428 |
1 files changed, 428 insertions, 0 deletions
// Recovered from the diff that adds
// intl/uconv/tests/unit/test_charset_conversion.js (new file, blob 4111a85fa).
// The leading '+' diff markers have been stripped and line breaks restored.
//
// xpcshell test: writes Unicode strings through nsIConverterOutputStream in
// several encodings and checks the produced bytes with the hand-written
// decoders at the bottom of this file, then cross-checks the sample data files
// against each other through nsIConverterInputStream.

var Cc = Components.classes;
var Ci = Components.interfaces;

const NS_ERROR_ILLEGAL_VALUE = Components.results.NS_ERROR_ILLEGAL_VALUE;

// XPCOM constructors; all of them are assigned in run_test().
var BIS, BOS, _Pipe, COS, FIS, _SS, CIS;

// Directory containing the unicode-conversion.* sample files; assigned in
// run_test().
var dataDir;

/**
 * Test entry point: builds the XPCOM constructors used throughout the file,
 * locates the data directory and runs every sub-test in order.
 */
function run_test()
{
  BIS = Components.Constructor("@mozilla.org/binaryinputstream;1",
                               "nsIBinaryInputStream",
                               "setInputStream");
  BOS = Components.Constructor("@mozilla.org/binaryoutputstream;1",
                               "nsIBinaryOutputStream",
                               "setOutputStream");
  _Pipe = Components.Constructor("@mozilla.org/pipe;1",
                                 "nsIPipe",
                                 "init");
  COS = Components.Constructor("@mozilla.org/intl/converter-output-stream;1",
                               "nsIConverterOutputStream",
                               "init");
  FIS = Components.Constructor("@mozilla.org/network/file-input-stream;1",
                               "nsIFileInputStream",
                               "init");
  _SS = Components.Constructor("@mozilla.org/storagestream;1",
                               "nsIStorageStream",
                               "init");
  CIS = Components.Constructor("@mozilla.org/intl/converter-input-stream;1",
                               "nsIConverterInputStream",
                               "init");

  dataDir = do_get_file("data/");

  test_utf8_1();
  test_utf16_1();
  test_utf16_2();
  test_utf16_3();
  test_cross_conversion();
}

// Strings written through the converter output stream by the test_utf*
// functions.
const UNICODE_STRINGS =
  [
    '\u00BD + \u00BE == \u00BD\u00B2 + \u00BC + \u00BE',

    'AZaz09 \u007F ' +               // U+000000 to U+00007F
    '\u0080 \u0398 \u03BB \u0725 ' + // U+000080 to U+0007FF
    '\u0964 \u0F5F \u20AC \uFFFB'    // U+000800 to U+00FFFF

    // No strings containing non-BMP code points are listed here yet.  JS
    // strings are sequences of 16-bit code units, so such a string would be
    // spelled as surrogate pairs; stringToCodePoints() combines those pairs
    // into code points, which is the form the UTF8/UTF16 readers return.
    // TODO: add non-BMP samples once verified against the converters.
  ];

// test conversion equality -- keys are names of files containing equivalent
// Unicode data, values are the encoding of the file in the format expected by
// nsIConverter(In|Out)putStream.init
const UNICODE_FILES =
  {
    "unicode-conversion.utf8.txt":    "UTF-8",
    "unicode-conversion.utf16.txt":   "UTF-16",
    "unicode-conversion.utf16le.txt": "UTF-16LE",
    "unicode-conversion.utf16be.txt": "UTF-16BE"
  };

/**
 * Shared body of the test_utf* functions: writes every UNICODE_STRINGS entry
 * through a converter output stream for |charset| into a fresh pipe, then
 * decodes the pipe's bytes with a reader and compares the result with the
 * string's code points.
 *
 * @param charset
 *   charset name handed to the converter output stream
 * @param makeReader
 *   function taking the pipe's input stream and returning an object with a
 *   readUnit() method (a UTF8 or UTF16 instance); it is called only after the
 *   converter has been closed, so the encoded data is already in the pipe
 */
function checkConvertedStrings(charset, makeReader)
{
  for (var i = 0; i < UNICODE_STRINGS.length; i++)
  {
    var pipe = Pipe();
    var conv = new COS(pipe.outputStream, charset, 1024, 0x0);
    do_check_true(conv.writeString(UNICODE_STRINGS[i]));
    conv.close();

    if (!equalStreams(makeReader(pipe.inputStream),
                      stringToCodePoints(UNICODE_STRINGS[i])))
      do_throw("UNICODE_STRINGS[" + i + "] not handled correctly");
  }
}

/** Checks conversion of UNICODE_STRINGS to UTF-8. */
function test_utf8_1()
{
  checkConvertedStrings("UTF-8", function(stream)
  {
    return new UTF8(stream);
  });
}

/** Checks conversion of UNICODE_STRINGS to UTF-16 with a byte-order mark. */
function test_utf16_1()
{
  checkConvertedStrings("UTF-16", function(stream)
  {
    // single argument on purpose: UTF16 then reads the BOM itself
    return new UTF16(stream);
  });
}

/** Checks conversion of UNICODE_STRINGS to UTF-16LE. */
function test_utf16_2()
{
  checkConvertedStrings("UTF-16LE", function(stream)
  {
    return new UTF16(stream, false);
  });
}

/** Checks conversion of UNICODE_STRINGS to UTF-16BE. */
function test_utf16_3()
{
  checkConvertedStrings("UTF-16BE", function(stream)
  {
    return new UTF16(stream, true);
  });
}


/**
 * For every ordered pair of files in UNICODE_FILES, decodes the raw bytes of
 * the first file (buffered in a storage stream) using its declared encoding
 * and checks that the resulting text equals the decoded text of the second
 * file.
 */
function test_cross_conversion()
{
  for (var fn1 in UNICODE_FILES)
  {
    // copy the raw bytes of fn1 into a storage stream so they can be re-read
    // once per fn2 below
    var fin = getBinaryInputStream(fn1);
    var ss = StorageStream();

    var bos = new BOS(ss.getOutputStream(0));
    var av;
    while ((av = fin.available()) > 0)
    {
      var data = fin.readByteArray(av);
      bos.writeByteArray(data, data.length);
    }
    fin.close();
    bos.close();

    for (var fn2 in UNICODE_FILES)
    {
      var fin2 = getUnicharInputStream(fn2, UNICODE_FILES[fn2]);
      var unichar = new CIS(ss.newInputStream(0),
                            UNICODE_FILES[fn1], 8192, 0x0);

      if (!equalUnicharStreams(unichar, fin2))
        do_throw("unequal streams: " +
                 UNICODE_FILES[fn1] + ", " +
                 UNICODE_FILES[fn2]);
    }
  }
}


// utility functions

/**
 * Creates a new storage stream, initialised with the arguments
 * (8192, 2^32 - 1, null).
 */
function StorageStream()
{
  return new _SS(8192, Math.pow(2, 32) - 1, null);
}

/**
 * Opens the named file from the data directory as a read-only file input
 * stream that closes itself at end of file.
 *
 * @param filename
 *   leaf name of a file inside dataDir
 * @returns the newly opened nsIFileInputStream
 */
function openDataFile(filename)
{
  var file = dataDir.clone();
  file.append(filename);

  const PR_RDONLY = 0x1;
  // 0o644 replaces the legacy octal literal 0644, which is a syntax error in
  // strict-mode code; the value is unchanged.
  return new FIS(file, PR_RDONLY, 0o644, Ci.nsIFileInputStream.CLOSE_ON_EOF);
}

/**
 * Opens a data file as a stream of decoded characters.
 *
 * @param filename
 *   leaf name of a file inside dataDir
 * @param encoding
 *   charset name used to decode the file
 * @returns a converter input stream wrapping the file
 */
function getUnicharInputStream(filename, encoding)
{
  return new CIS(openDataFile(filename), encoding, 8192, 0x0);
}

/**
 * Opens a data file as a stream of raw bytes.
 *
 * @param filename
 *   leaf name of a file inside dataDir
 * @param encoding
 *   unused; kept so existing callers need not change
 * @returns a binary input stream wrapping the file
 */
function getBinaryInputStream(filename, encoding)
{
  return new BIS(openDataFile(filename));
}

/**
 * Compares the values produced by a reader with an expected array.
 *
 * @param stream
 *   object whose readUnit() returns the next code point, or a negative value
 *   at end of stream (a UTF8 or UTF16 instance)
 * @param codePoints
 *   array of expected numeric code points
 * @returns true if the reader yields exactly the values in codePoints and
 *   then ends, false otherwise
 */
function equalStreams(stream, codePoints)
{
  var currIndex = 0;
  while (true)
  {
    var unit = stream.readUnit();
    if (unit < 0)
      return currIndex == codePoints.length;
    // reading past the end of codePoints compares against undefined, which
    // never equals a number, so a too-long stream is reported as unequal
    if (unit !== codePoints[currIndex++])
      return false;
  }
}

/**
 * Compares two character streams chunk by chunk.
 *
 * @param s1
 *   first stream; must provide readString(count, outparam)
 * @param s2
 *   second stream; must provide readString(count, outparam)
 * @returns true if both streams return the same counts and the same text on
 *   every read until both report 0, false (after printing the mismatching
 *   counts and lengths) otherwise
 */
function equalUnicharStreams(s1, s2)
{
  var r1, r2;
  var str1 = {}, str2 = {};
  while (true)
  {
    r1 = s1.readString(1024, str1);
    r2 = s2.readString(1024, str2);

    if (r1 != r2 || str1.value != str2.value)
    {
      print("r1: " + r1 + ", r2: " + r2);
      print(str1.value.length);
      print(str2.value.length);
      return false;
    }
    if (r1 == 0 && r2 == 0)
      return true;
  }
}

/**
 * Converts a JS string to an array of Unicode code points.
 *
 * A high surrogate immediately followed by a low surrogate is combined into
 * the single non-BMP code point the pair encodes, matching what the UTF8 and
 * UTF16 readers return.  Every other code unit, including an unpaired
 * surrogate, is passed through unchanged, so BMP-only strings produce exactly
 * one entry per character.
 *
 * @param str
 *   the string to convert
 * @returns array of numeric code points
 */
function stringToCodePoints(str)
{
  var codePoints = [];
  for (var i = 0; i < str.length; i++)
  {
    var unit = str.charCodeAt(i);
    if (unit >= 0xD800 && unit <= 0xDBFF && i + 1 < str.length)
    {
      var next = str.charCodeAt(i + 1);
      if (next >= 0xDC00 && next <= 0xDFFF)
      {
        codePoints.push(0x10000 + ((unit - 0xD800) << 10) + (next - 0xDC00));
        i++; // the low surrogate has been consumed as well
        continue;
      }
    }
    codePoints.push(unit);
  }
  return codePoints;
}

/**
 * Returns a mask with the n lowest bits set (2^n - 1).
 */
function lowbits(n)
{
  return Math.pow(2, n) - 1;
}

/**
 * Creates a new pipe, initialised with the arguments
 * (false, false, 1024, 10, null).
 */
function Pipe()
{
  return new _Pipe(false, false, 1024, 10, null);
}


// complex charset readers

/**
 * Wraps a UTF-8 stream to allow access to the Unicode code points in it.
 *
 * @param stream
 *   the stream to wrap
 */
function UTF8(stream)
{
  this._stream = new BIS(stream);
}
UTF8.prototype =
  {
    // returns numeric code point at front of stream encoded in UTF-8, -1 if at
    // end of stream, or throws if valid (and properly encoded!) code point not
    // found
    readUnit: function()
    {
      var str = this._stream;

      var c, c2, c3, c4, rv;

      // if at end of stream, must distinguish failure to read any bytes
      // (correct behavior) from failure to read some byte after the first
      // in the character
      try
      {
        c = str.read8();
      }
      catch (e)
      {
        return -1;
      }

      if (c < 0x80)
        return c;

      if (c < 0xC0) // c < 11000000
      {
        // byte doesn't have enough leading ones (must be at least two)
        throw NS_ERROR_ILLEGAL_VALUE;
      }


      c2 = str.read8();
      if (c2 >= 0xC0 || c2 < 0x80)
        throw NS_ERROR_ILLEGAL_VALUE; // not 10xxxxxx

      if (c < 0xE0) // c < 11100000
      {
        // two-byte between U+000080 and U+0007FF
        rv = ((lowbits(5) & c) << 6) +
              (lowbits(6) & c2);
        // no upper bounds-check needed, by previous lines; the lower bound
        // rejects overlong encodings
        if (rv >= 0x80)
          return rv;
        throw NS_ERROR_ILLEGAL_VALUE;
      }


      c3 = str.read8();
      if (c3 >= 0xC0 || c3 < 0x80)
        throw NS_ERROR_ILLEGAL_VALUE; // not 10xxxxxx

      if (c < 0xF0) // c < 11110000
      {
        // three-byte between U+000800 and U+00FFFF
        rv = ((lowbits(4) & c) << 12) +
             ((lowbits(6) & c2) << 6) +
              (lowbits(6) & c3);
        // no upper bounds-check needed, by previous lines; values below
        // 0x800 (overlong) and in the surrogate range D800-DFFF are rejected
        if (rv >= 0xE000 ||
            (rv >= 0x800 && rv <= 0xD7FF))
          return rv;
        throw NS_ERROR_ILLEGAL_VALUE;
      }


      c4 = str.read8();
      if (c4 >= 0xC0 || c4 < 0x80)
        throw NS_ERROR_ILLEGAL_VALUE; // not 10xxxxxx

      if (c < 0xF8) // c < 11111000
      {
        // four-byte between U+010000 and U+10FFFF
        rv = ((lowbits(3) & c) << 18) +
             ((lowbits(6) & c2) << 12) +
             ((lowbits(6) & c3) << 6) +
              (lowbits(6) & c4);
        // need an upper bounds-check since 0x10FFFF isn't (2**n - 1)
        if (rv >= 0x10000 && rv <= 0x10FFFF)
          return rv;
        throw NS_ERROR_ILLEGAL_VALUE;
      }

      // 11111000 or greater -- no UTF-8 mapping
      throw NS_ERROR_ILLEGAL_VALUE;
    }
  };

/**
 * Wraps a UTF-16 stream to allow access to the Unicode code points in it.
 *
 * @param stream
 *   the stream to wrap
 * @param bigEndian
 *   true for UTF-16BE, false for UTF-16LE, not present at all for UTF-16 with
 *   a byte-order mark
 */
function UTF16(stream, bigEndian)
{
  this._stream = new BIS(stream);
  if (arguments.length > 1)
  {
    this._bigEndian = bigEndian;
  }
  else
  {
    // no explicit byte order: the first 16-bit value must be a byte-order
    // mark; 0xFEFF selects big-endian, 0xFFFE little-endian
    var bom = this._stream.read16();
    if (bom == 0xFEFF)
      this._bigEndian = true;
    else if (bom == 0xFFFE)
      this._bigEndian = false;
    else
      do_throw("missing BOM: " + bom.toString(16).toUpperCase());
  }
}
UTF16.prototype =
  {
    // combines two bytes, given in stream order, into one 16-bit code unit
    // according to this reader's byte order
    _toCodeUnit: function(b1, b2)
    {
      return this._bigEndian
           ? (b1 << 8) + b2
           : (b2 << 8) + b1;
    },

    // returns numeric code point at front of stream encoded in UTF-16,
    // -1 if at end of stream, or throws if UTF-16 code point not found
    readUnit: function()
    {
      var str = this._stream;
      var b1, b2;

      // if at end of stream, must distinguish failure to read any bytes
      // (correct behavior) from failure to read some byte after the first
      // in the character
      try
      {
        b1 = str.read8();
      }
      catch (e)
      {
        return -1;
      }

      b2 = str.read8();

      var w1 = this._toCodeUnit(b1, b2);

      if (w1 > 0xDBFF && w1 < 0xE000)
      {
        // second surrogate, but expecting none or first
        throw NS_ERROR_ILLEGAL_VALUE;
      }

      if (w1 > 0xD7FF && w1 < 0xDC00)
      {
        // non-BMP, use surrogate pair
        b1 = str.read8();
        b2 = str.read8();
        var w2 = this._toCodeUnit(b1, b2);
        if (w2 < 0xDC00 || w2 > 0xDFFF)
          throw NS_ERROR_ILLEGAL_VALUE;

        // The high surrogate (w1) supplies the upper ten bits and the low
        // surrogate (w2) the lower ten bits of the offset from U+10000.  The
        // result is at most 0x10000 + 0xFFC00 + 0x3FF == 0x10FFFF, so no
        // upper bounds-check is needed.
        return 0x10000 +
               ((lowbits(10) & w1) << 10) +
                (lowbits(10) & w2);
      }

      // non-surrogate
      return w1;
    }
  };