diff options
Diffstat (limited to 'js/src/vm/CharacterEncoding.cpp')
-rw-r--r-- | js/src/vm/CharacterEncoding.cpp | 531 |
1 files changed, 531 insertions, 0 deletions
diff --git a/js/src/vm/CharacterEncoding.cpp b/js/src/vm/CharacterEncoding.cpp new file mode 100644 index 000000000..4644b0a36 --- /dev/null +++ b/js/src/vm/CharacterEncoding.cpp @@ -0,0 +1,531 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- + * vim: set ts=8 sts=4 et sw=4 tw=99: + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "js/CharacterEncoding.h" + +#include "mozilla/Range.h" +#include "mozilla/Sprintf.h" + +#include <algorithm> +#include <type_traits> + +#include "jscntxt.h" +#include "jsprf.h" + +using namespace js; + +Latin1CharsZ +JS::LossyTwoByteCharsToNewLatin1CharsZ(js::ExclusiveContext* cx, + const mozilla::Range<const char16_t> tbchars) +{ + MOZ_ASSERT(cx); + size_t len = tbchars.length(); + unsigned char* latin1 = cx->pod_malloc<unsigned char>(len + 1); + if (!latin1) + return Latin1CharsZ(); + for (size_t i = 0; i < len; ++i) + latin1[i] = static_cast<unsigned char>(tbchars[i]); + latin1[len] = '\0'; + return Latin1CharsZ(latin1, len); +} + +template <typename CharT> +static size_t +GetDeflatedUTF8StringLength(const CharT* chars, size_t nchars) +{ + size_t nbytes = nchars; + for (const CharT* end = chars + nchars; chars < end; chars++) { + char16_t c = *chars; + if (c < 0x80) + continue; + uint32_t v; + if (0xD800 <= c && c <= 0xDFFF) { + /* nbytes sets 1 length since this is surrogate pair. */ + if (c >= 0xDC00 || (chars + 1) == end) { + nbytes += 2; /* Bad Surrogate */ + continue; + } + char16_t c2 = chars[1]; + if (c2 < 0xDC00 || c2 > 0xDFFF) { + nbytes += 2; /* Bad Surrogate */ + continue; + } + v = ((c - 0xD800) << 10) + (c2 - 0xDC00) + 0x10000; + nbytes--; + chars++; + } else { + v = c; + } + v >>= 11; + nbytes++; + while (v) { + v >>= 5; + nbytes++; + } + } + return nbytes; +} + +JS_PUBLIC_API(size_t) +JS::GetDeflatedUTF8StringLength(JSFlatString* s) +{ + JS::AutoCheckCannotGC nogc; + return s->hasLatin1Chars() + ? ::GetDeflatedUTF8StringLength(s->latin1Chars(nogc), s->length()) + : ::GetDeflatedUTF8StringLength(s->twoByteChars(nogc), s->length()); +} + +static const char16_t UTF8_REPLACEMENT_CHAR = 0xFFFD; + +template <typename CharT> +static void +DeflateStringToUTF8Buffer(const CharT* src, size_t srclen, mozilla::RangedPtr<char> dst, + size_t* dstlenp = nullptr, size_t* numcharsp = nullptr) +{ + size_t capacity = 0; + if (dstlenp) { + capacity = *dstlenp; + *dstlenp = 0; + } + if (numcharsp) + *numcharsp = 0; + + while (srclen) { + uint32_t v; + char16_t c = *src++; + srclen--; + if (c >= 0xDC00 && c <= 0xDFFF) { + v = UTF8_REPLACEMENT_CHAR; + } else if (c < 0xD800 || c > 0xDBFF) { + v = c; + } else { + if (srclen < 1) { + v = UTF8_REPLACEMENT_CHAR; + } else { + char16_t c2 = *src; + if (c2 < 0xDC00 || c2 > 0xDFFF) { + v = UTF8_REPLACEMENT_CHAR; + } else { + src++; + srclen--; + v = ((c - 0xD800) << 10) + (c2 - 0xDC00) + 0x10000; + } + } + } + + size_t utf8Len; + if (v < 0x0080) { + /* no encoding necessary - performance hack */ + if (dstlenp && *dstlenp + 1 > capacity) + return; + *dst++ = char(v); + utf8Len = 1; + } else { + uint8_t utf8buf[4]; + utf8Len = OneUcs4ToUtf8Char(utf8buf, v); + if (dstlenp && *dstlenp + utf8Len > capacity) + return; + for (size_t i = 0; i < utf8Len; i++) + *dst++ = char(utf8buf[i]); + } + + if (dstlenp) + *dstlenp += utf8Len; + if (numcharsp) + (*numcharsp)++; + } +} + +JS_PUBLIC_API(void) +JS::DeflateStringToUTF8Buffer(JSFlatString* src, mozilla::RangedPtr<char> dst, + size_t* dstlenp, size_t* numcharsp) +{ + JS::AutoCheckCannotGC nogc; + return src->hasLatin1Chars() + ? ::DeflateStringToUTF8Buffer(src->latin1Chars(nogc), src->length(), dst, + dstlenp, numcharsp) + : ::DeflateStringToUTF8Buffer(src->twoByteChars(nogc), src->length(), dst, + dstlenp, numcharsp); +} + +template <typename CharT> +UTF8CharsZ +JS::CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx, const mozilla::Range<CharT> chars) +{ + /* Get required buffer size. */ + const CharT* str = chars.begin().get(); + size_t len = ::GetDeflatedUTF8StringLength(str, chars.length()); + + /* Allocate buffer. */ + char* utf8; + if (maybeCx) + utf8 = maybeCx->pod_malloc<char>(len + 1); + else + utf8 = js_pod_malloc<char>(len + 1); + if (!utf8) + return UTF8CharsZ(); + + /* Encode to UTF8. */ + ::DeflateStringToUTF8Buffer(str, chars.length(), mozilla::RangedPtr<char>(utf8, len)); + utf8[len] = '\0'; + + return UTF8CharsZ(utf8, len); +} + +template UTF8CharsZ +JS::CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx, + const mozilla::Range<Latin1Char> chars); + +template UTF8CharsZ +JS::CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx, + const mozilla::Range<char16_t> chars); + +template UTF8CharsZ +JS::CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx, + const mozilla::Range<const Latin1Char> chars); + +template UTF8CharsZ +JS::CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx, + const mozilla::Range<const char16_t> chars); + +static const uint32_t INVALID_UTF8 = UINT32_MAX; + +/* + * Convert a utf8 character sequence into a UCS-4 character and return that + * character. It is assumed that the caller already checked that the sequence + * is valid. + */ +uint32_t +JS::Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length) +{ + MOZ_ASSERT(1 <= utf8Length && utf8Length <= 4); + + if (utf8Length == 1) { + MOZ_ASSERT(!(*utf8Buffer & 0x80)); + return *utf8Buffer; + } + + /* from Unicode 3.1, non-shortest form is illegal */ + static const uint32_t minucs4Table[] = { 0x80, 0x800, 0x10000 }; + + MOZ_ASSERT((*utf8Buffer & (0x100 - (1 << (7 - utf8Length)))) == + (0x100 - (1 << (8 - utf8Length)))); + uint32_t ucs4Char = *utf8Buffer++ & ((1 << (7 - utf8Length)) - 1); + uint32_t minucs4Char = minucs4Table[utf8Length - 2]; + while (--utf8Length) { + MOZ_ASSERT((*utf8Buffer & 0xC0) == 0x80); + ucs4Char = (ucs4Char << 6) | (*utf8Buffer++ & 0x3F); + } + + if (MOZ_UNLIKELY(ucs4Char < minucs4Char || (ucs4Char >= 0xD800 && ucs4Char <= 0xDFFF))) + return INVALID_UTF8; + + return ucs4Char; +} + +static void +ReportInvalidCharacter(JSContext* cx, uint32_t offset) +{ + char buffer[10]; + SprintfLiteral(buffer, "%u", offset); + JS_ReportErrorFlagsAndNumberASCII(cx, JSREPORT_ERROR, GetErrorMessage, nullptr, + JSMSG_MALFORMED_UTF8_CHAR, buffer); +} + +static void +ReportInvalidCharacter(js::ExclusiveContext* cx, uint32_t offset) +{ +} + +static void +ReportBufferTooSmall(JSContext* cx, uint32_t dummy) +{ + JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, JSMSG_BUFFER_TOO_SMALL); +} + +static void +ReportBufferTooSmall(js::ExclusiveContext* cx, uint32_t dummy) +{ +} + +static void +ReportTooBigCharacter(JSContext* cx, uint32_t v) +{ + char buffer[10]; + SprintfLiteral(buffer, "0x%x", v + 0x10000); + JS_ReportErrorFlagsAndNumberASCII(cx, JSREPORT_ERROR, GetErrorMessage, nullptr, + JSMSG_UTF8_CHAR_TOO_LARGE, buffer); +} + +static void +ReportTooBigCharacter(js::ExclusiveContext* cx, uint32_t v) +{ +} + +enum InflateUTF8Action { + CountAndReportInvalids, + CountAndIgnoreInvalids, + AssertNoInvalids, + Copy, + FindEncoding +}; + +static const char16_t REPLACE_UTF8 = 0xFFFD; +static const Latin1Char REPLACE_UTF8_LATIN1 = '?'; + +// If making changes to this algorithm, make sure to also update +// LossyConvertUTF8toUTF16() in dom/wifi/WifiUtils.cpp +template <InflateUTF8Action Action, typename CharT, class ContextT> +static bool +InflateUTF8StringToBuffer(ContextT* cx, const UTF8Chars src, CharT* dst, size_t* dstlenp, + JS::SmallestEncoding *smallestEncoding) +{ + if (Action != AssertNoInvalids) + *smallestEncoding = JS::SmallestEncoding::ASCII; + auto RequireLatin1 = [&smallestEncoding]{ + *smallestEncoding = std::max(JS::SmallestEncoding::Latin1, *smallestEncoding); + }; + auto RequireUTF16 = [&smallestEncoding]{ + *smallestEncoding = JS::SmallestEncoding::UTF16; + }; + + // Count how many code units need to be in the inflated string. + // |i| is the index into |src|, and |j| is the the index into |dst|. + size_t srclen = src.length(); + uint32_t j = 0; + for (uint32_t i = 0; i < srclen; i++, j++) { + uint32_t v = uint32_t(src[i]); + if (!(v & 0x80)) { + // ASCII code unit. Simple copy. + if (Action == Copy) + dst[j] = CharT(v); + + } else { + // Non-ASCII code unit. Determine its length in bytes (n). + uint32_t n = 1; + while (v & (0x80 >> n)) + n++; + + #define INVALID(report, arg, n2) \ + do { \ + if (Action == CountAndReportInvalids) { \ + report(cx, arg); \ + return false; \ + } else if (Action == AssertNoInvalids) { \ + MOZ_CRASH("invalid UTF-8 string: " # report); \ + } else { \ + if (Action == Copy) { \ + if (std::is_same<decltype(dst[0]), Latin1Char>::value) \ + dst[j] = CharT(REPLACE_UTF8_LATIN1); \ + else \ + dst[j] = CharT(REPLACE_UTF8); \ + } else { \ + MOZ_ASSERT(Action == CountAndIgnoreInvalids || \ + Action == FindEncoding); \ + } \ + n = n2; \ + goto invalidMultiByteCodeUnit; \ + } \ + } while (0) + + // Check the leading byte. + if (n < 2 || n > 4) + INVALID(ReportInvalidCharacter, i, 1); + + // Check that |src| is large enough to hold an n-byte code unit. + if (i + n > srclen) + INVALID(ReportBufferTooSmall, /* dummy = */ 0, 1); + + // Check the second byte. From Unicode Standard v6.2, Table 3-7 + // Well-Formed UTF-8 Byte Sequences. + if ((v == 0xE0 && ((uint8_t)src[i + 1] & 0xE0) != 0xA0) || // E0 A0~BF + (v == 0xED && ((uint8_t)src[i + 1] & 0xE0) != 0x80) || // ED 80~9F + (v == 0xF0 && ((uint8_t)src[i + 1] & 0xF0) == 0x80) || // F0 90~BF + (v == 0xF4 && ((uint8_t)src[i + 1] & 0xF0) != 0x80)) // F4 80~8F + { + INVALID(ReportInvalidCharacter, i, 1); + } + + // Check the continuation bytes. + for (uint32_t m = 1; m < n; m++) { + if ((src[i + m] & 0xC0) != 0x80) + INVALID(ReportInvalidCharacter, i, m); + } + + // Determine the code unit's length in CharT and act accordingly. + v = JS::Utf8ToOneUcs4Char((uint8_t*)&src[i], n); + if (Action != AssertNoInvalids) { + if (v > 0xff) { + RequireUTF16(); + if (Action == FindEncoding) { + MOZ_ASSERT(dst == nullptr); + return true; + } + } else { + RequireLatin1(); + } + } + if (v < 0x10000) { + // The n-byte UTF8 code unit will fit in a single CharT. + if (Action == Copy) + dst[j] = CharT(v); + } else { + v -= 0x10000; + if (v <= 0xFFFFF) { + // The n-byte UTF8 code unit will fit in two CharT units. + if (Action == Copy) + dst[j] = CharT((v >> 10) + 0xD800); + j++; + if (Action == Copy) + dst[j] = CharT((v & 0x3FF) + 0xDC00); + + } else { + // The n-byte UTF8 code unit won't fit in two CharT units. + INVALID(ReportTooBigCharacter, v, 1); + } + } + + invalidMultiByteCodeUnit: + // Move i to the last byte of the multi-byte code unit; the loop + // header will do the final i++ to move to the start of the next + // code unit. + i += n - 1; + if (Action != AssertNoInvalids) + RequireUTF16(); + } + } + + if (Action != AssertNoInvalids && Action != FindEncoding) + *dstlenp = j; + + return true; +} + +template <InflateUTF8Action Action, typename CharsT, class ContextT> +static CharsT +InflateUTF8StringHelper(ContextT* cx, const UTF8Chars src, size_t* outlen) +{ + using CharT = typename CharsT::CharT; + *outlen = 0; + + JS::SmallestEncoding encoding; + if (!InflateUTF8StringToBuffer<Action, CharT>(cx, src, /* dst = */ nullptr, outlen, &encoding)) + return CharsT(); + + CharT* dst = cx->template pod_malloc<CharT>(*outlen + 1); // +1 for NUL + if (!dst) { + ReportOutOfMemory(cx); + return CharsT(); + } + + if (encoding == JS::SmallestEncoding::ASCII) { + size_t srclen = src.length(); + MOZ_ASSERT(*outlen == srclen); + for (uint32_t i = 0; i < srclen; i++) + dst[i] = CharT(src[i]); + } else { + MOZ_ALWAYS_TRUE((InflateUTF8StringToBuffer<Copy, CharT>(cx, src, dst, outlen, &encoding))); + } + + dst[*outlen] = 0; // NUL char + + return CharsT(dst, *outlen); +} + +TwoByteCharsZ +JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen) +{ + return InflateUTF8StringHelper<CountAndReportInvalids, TwoByteCharsZ>(cx, utf8, outlen); +} + +TwoByteCharsZ +JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen) +{ + UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str())); + return InflateUTF8StringHelper<CountAndReportInvalids, TwoByteCharsZ>(cx, chars, outlen); +} + +TwoByteCharsZ +js::LossyUTF8CharsToNewTwoByteCharsZ(js::ExclusiveContext* cx, const JS::UTF8Chars utf8, size_t* outlen) +{ + return InflateUTF8StringHelper<CountAndIgnoreInvalids, TwoByteCharsZ>(cx, utf8, outlen); +} + +TwoByteCharsZ +JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen) +{ + return js::LossyUTF8CharsToNewTwoByteCharsZ(cx, utf8, outlen); +} + +TwoByteCharsZ +js::LossyUTF8CharsToNewTwoByteCharsZ(js::ExclusiveContext* cx, const JS::ConstUTF8CharsZ& utf8, size_t* outlen) +{ + UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str())); + return InflateUTF8StringHelper<CountAndIgnoreInvalids, TwoByteCharsZ>(cx, chars, outlen); +} + +TwoByteCharsZ +JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen) +{ + return js::LossyUTF8CharsToNewTwoByteCharsZ(cx, utf8, outlen); +} + +JS::SmallestEncoding +JS::FindSmallestEncoding(UTF8Chars utf8) +{ + JS::SmallestEncoding encoding; + MOZ_ALWAYS_TRUE((InflateUTF8StringToBuffer<FindEncoding, char16_t, JSContext>( + /* cx = */ nullptr, + utf8, + /* dst = */ nullptr, + /* dstlen = */ nullptr, + &encoding))); + return encoding; +} + +Latin1CharsZ +JS::UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen) +{ + return InflateUTF8StringHelper<CountAndReportInvalids, Latin1CharsZ>(cx, utf8, outlen); +} + +Latin1CharsZ +js::LossyUTF8CharsToNewLatin1CharsZ(js::ExclusiveContext* cx, const JS::UTF8Chars utf8, size_t* outlen) +{ + return InflateUTF8StringHelper<CountAndIgnoreInvalids, Latin1CharsZ>(cx, utf8, outlen); +} + +Latin1CharsZ +JS::LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen) +{ + return js::LossyUTF8CharsToNewLatin1CharsZ(cx, utf8, outlen); +} + +#ifdef DEBUG +void +JS::ConstUTF8CharsZ::validate(size_t aLength) +{ + MOZ_ASSERT(data_); + UTF8Chars chars(data_, aLength); + InflateUTF8StringToBuffer<AssertNoInvalids, char16_t, JSContext>( + /* cx = */ nullptr, + chars, + /* dst = */ nullptr, + /* dstlen = */ nullptr, + /* smallestEncoding = */ nullptr); +} +#endif + +bool +JS::StringIsASCII(const char* s) +{ + while (*s) { + if (*s & 0x80) + return false; + s++; + } + return true; +} |