summaryrefslogtreecommitdiffstats
path: root/js/src/vm/CharacterEncoding.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'js/src/vm/CharacterEncoding.cpp')
-rw-r--r--js/src/vm/CharacterEncoding.cpp531
1 files changed, 531 insertions, 0 deletions
diff --git a/js/src/vm/CharacterEncoding.cpp b/js/src/vm/CharacterEncoding.cpp
new file mode 100644
index 000000000..4644b0a36
--- /dev/null
+++ b/js/src/vm/CharacterEncoding.cpp
@@ -0,0 +1,531 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * vim: set ts=8 sts=4 et sw=4 tw=99:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "js/CharacterEncoding.h"
+
+#include "mozilla/Range.h"
+#include "mozilla/Sprintf.h"
+
+#include <algorithm>
+#include <type_traits>
+
+#include "jscntxt.h"
+#include "jsprf.h"
+
+using namespace js;
+
+/*
+ * Allocate a NUL-terminated Latin1 copy of |tbchars|. Each UTF-16 code unit is
+ * narrowed with a static_cast to unsigned char, so only its low eight bits
+ * survive. Returns an empty Latin1CharsZ when the allocation fails.
+ */
+Latin1CharsZ
+JS::LossyTwoByteCharsToNewLatin1CharsZ(js::ExclusiveContext* cx,
+                                       const mozilla::Range<const char16_t> tbchars)
+{
+    MOZ_ASSERT(cx);
+
+    const size_t length = tbchars.length();
+    unsigned char* const narrowed = cx->pod_malloc<unsigned char>(length + 1);
+    if (!narrowed)
+        return Latin1CharsZ();
+
+    size_t index = 0;
+    while (index < length) {
+        narrowed[index] = static_cast<unsigned char>(tbchars[index]);
+        index++;
+    }
+    narrowed[length] = '\0';
+
+    return Latin1CharsZ(narrowed, length);
+}
+
+/*
+ * Compute how many UTF-8 bytes are needed to encode |nchars| code units
+ * starting at |chars|. The running total starts at one byte per code unit and
+ * is adjusted for every non-ASCII unit. An unpaired surrogate is sized as a
+ * three-byte sequence.
+ */
+template <typename CharT>
+static size_t
+GetDeflatedUTF8StringLength(const CharT* chars, size_t nchars)
+{
+    size_t total = nchars;
+    const CharT* const limit = chars + nchars;
+
+    for (; chars < limit; chars++) {
+        const char16_t unit = *chars;
+        if (unit < 0x80)
+            continue;
+
+        uint32_t codePoint;
+        if (unit < 0xD800 || unit > 0xDFFF) {
+            codePoint = unit;
+        } else {
+            /* A trail surrogate on its own, or a lead with nothing after it. */
+            if (unit >= 0xDC00 || chars + 1 == limit) {
+                total += 2; /* Bad Surrogate */
+                continue;
+            }
+            const char16_t trail = chars[1];
+            if (trail < 0xDC00 || trail > 0xDFFF) {
+                total += 2; /* Bad Surrogate */
+                continue;
+            }
+            codePoint = ((unit - 0xD800) << 10) + (trail - 0xDC00) + 0x10000;
+            /* Both halves were pre-counted as one byte each; consume the trail. */
+            total--;
+            chars++;
+        }
+
+        /* One extra byte for the two-byte form, then one per further 5 bits. */
+        total++;
+        for (codePoint >>= 11; codePoint; codePoint >>= 5)
+            total++;
+    }
+
+    return total;
+}
+
+/*
+ * Public entry point: dispatch to the character-width specific length
+ * computation depending on how |s| stores its characters.
+ */
+JS_PUBLIC_API(size_t)
+JS::GetDeflatedUTF8StringLength(JSFlatString* s)
+{
+    JS::AutoCheckCannotGC nogc;
+    if (s->hasLatin1Chars())
+        return ::GetDeflatedUTF8StringLength(s->latin1Chars(nogc), s->length());
+    return ::GetDeflatedUTF8StringLength(s->twoByteChars(nogc), s->length());
+}
+
+static const char16_t UTF8_REPLACEMENT_CHAR = 0xFFFD;
+
+/*
+ * Encode |srclen| code units from |src| as UTF-8 into |dst|.
+ *
+ * When |dstlenp| is non-null its incoming value is the capacity of |dst| in
+ * bytes; on return it holds the number of bytes written, and encoding stops
+ * before the first code point that would not fit. When |numcharsp| is non-null
+ * it receives the number of code points written. Unpaired surrogates are
+ * emitted as UTF8_REPLACEMENT_CHAR.
+ */
+template <typename CharT>
+static void
+DeflateStringToUTF8Buffer(const CharT* src, size_t srclen, mozilla::RangedPtr<char> dst,
+                          size_t* dstlenp = nullptr, size_t* numcharsp = nullptr)
+{
+    const size_t capacity = dstlenp ? *dstlenp : 0;
+    if (dstlenp)
+        *dstlenp = 0;
+    if (numcharsp)
+        *numcharsp = 0;
+
+    while (srclen > 0) {
+        const char16_t unit = *src++;
+        srclen--;
+
+        /* Turn the next one or two code units into a single code point. */
+        uint32_t codePoint = unit;
+        const bool isTrail = unit >= 0xDC00 && unit <= 0xDFFF;
+        const bool isLead = unit >= 0xD800 && unit <= 0xDBFF;
+        if (isTrail) {
+            codePoint = UTF8_REPLACEMENT_CHAR;
+        } else if (isLead) {
+            const bool havePair = srclen >= 1 && *src >= 0xDC00 && *src <= 0xDFFF;
+            if (havePair) {
+                const char16_t trail = *src++;
+                srclen--;
+                codePoint = ((unit - 0xD800) << 10) + (trail - 0xDC00) + 0x10000;
+            } else {
+                codePoint = UTF8_REPLACEMENT_CHAR;
+            }
+        }
+
+        /* Encode into a scratch buffer; ASCII skips the general encoder. */
+        uint8_t encoded[4];
+        size_t encodedLen;
+        if (codePoint < 0x0080) {
+            encoded[0] = uint8_t(codePoint);
+            encodedLen = 1;
+        } else {
+            encodedLen = OneUcs4ToUtf8Char(encoded, codePoint);
+        }
+
+        /* Capacity is only enforced when the caller supplied |dstlenp|. */
+        if (dstlenp && *dstlenp + encodedLen > capacity)
+            return;
+        for (size_t k = 0; k < encodedLen; k++)
+            *dst++ = char(encoded[k]);
+
+        if (dstlenp)
+            *dstlenp += encodedLen;
+        if (numcharsp)
+            (*numcharsp)++;
+    }
+}
+
+/*
+ * Public entry point: encode |src| as UTF-8 into |dst|, picking the Latin1 or
+ * two-byte encoder depending on how |src| stores its characters. |dstlenp| and
+ * |numcharsp| are forwarded unchanged.
+ */
+JS_PUBLIC_API(void)
+JS::DeflateStringToUTF8Buffer(JSFlatString* src, mozilla::RangedPtr<char> dst,
+                              size_t* dstlenp, size_t* numcharsp)
+{
+    JS::AutoCheckCannotGC nogc;
+    if (src->hasLatin1Chars()) {
+        ::DeflateStringToUTF8Buffer(src->latin1Chars(nogc), src->length(), dst,
+                                    dstlenp, numcharsp);
+        return;
+    }
+    ::DeflateStringToUTF8Buffer(src->twoByteChars(nogc), src->length(), dst,
+                                dstlenp, numcharsp);
+}
+
+/*
+ * Allocate and return a NUL-terminated UTF-8 encoding of |chars|. The buffer
+ * comes from |maybeCx| when it is non-null and from js_pod_malloc otherwise.
+ * Returns an empty UTF8CharsZ when the allocation fails.
+ */
+template <typename CharT>
+UTF8CharsZ
+JS::CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx, const mozilla::Range<CharT> chars)
+{
+    const CharT* const units = chars.begin().get();
+    const size_t unitCount = chars.length();
+
+    /* Size the output exactly before allocating. */
+    const size_t byteCount = ::GetDeflatedUTF8StringLength(units, unitCount);
+
+    char* const buffer = maybeCx
+                         ? maybeCx->pod_malloc<char>(byteCount + 1)
+                         : js_pod_malloc<char>(byteCount + 1);
+    if (!buffer)
+        return UTF8CharsZ();
+
+    ::DeflateStringToUTF8Buffer(units, unitCount, mozilla::RangedPtr<char>(buffer, byteCount));
+    buffer[byteCount] = '\0';
+
+    return UTF8CharsZ(buffer, byteCount);
+}
+
+template UTF8CharsZ // Explicit instantiation for ranges of mutable Latin1Char.
+JS::CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx,
+ const mozilla::Range<Latin1Char> chars);
+
+template UTF8CharsZ // Explicit instantiation for ranges of mutable char16_t.
+JS::CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx,
+ const mozilla::Range<char16_t> chars);
+
+template UTF8CharsZ // Explicit instantiation for ranges of const Latin1Char.
+JS::CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx,
+ const mozilla::Range<const Latin1Char> chars);
+
+template UTF8CharsZ // Explicit instantiation for ranges of const char16_t.
+JS::CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx,
+ const mozilla::Range<const char16_t> chars);
+
+static const uint32_t INVALID_UTF8 = UINT32_MAX;
+
+/*
+ * Decode the |utf8Length|-byte UTF-8 sequence at |utf8Buffer| into a single
+ * UCS-4 value. The caller is expected to have checked the lead and
+ * continuation bytes already; that is only asserted here. INVALID_UTF8 is
+ * returned for a non-shortest-form encoding or a value in the surrogate range.
+ */
+uint32_t
+JS::Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length)
+{
+    MOZ_ASSERT(1 <= utf8Length && utf8Length <= 4);
+
+    if (utf8Length == 1) {
+        MOZ_ASSERT(!(*utf8Buffer & 0x80));
+        return *utf8Buffer;
+    }
+
+    /* from Unicode 3.1, non-shortest form is illegal */
+    static const uint32_t minucs4Table[] = { 0x80, 0x800, 0x10000 };
+    const uint32_t smallestAllowed = minucs4Table[utf8Length - 2];
+
+    /* The lead byte contributes its low (7 - length) bits. */
+    const uint8_t lead = utf8Buffer[0];
+    MOZ_ASSERT((lead & (0x100 - (1 << (7 - utf8Length)))) ==
+               (0x100 - (1 << (8 - utf8Length))));
+    uint32_t decoded = lead & ((1 << (7 - utf8Length)) - 1);
+
+    /* Every continuation byte contributes six more bits. */
+    for (int k = 1; k < utf8Length; k++) {
+        const uint8_t continuation = utf8Buffer[k];
+        MOZ_ASSERT((continuation & 0xC0) == 0x80);
+        decoded = (decoded << 6) | (continuation & 0x3F);
+    }
+
+    const bool overlong = decoded < smallestAllowed;
+    const bool surrogate = decoded >= 0xD800 && decoded <= 0xDFFF;
+    if (MOZ_UNLIKELY(overlong || surrogate))
+        return INVALID_UTF8;
+
+    return decoded;
+}
+
+/*
+ * Report JSMSG_MALFORMED_UTF8_CHAR on |cx|, passing the decimal rendering of
+ * |offset| as the message argument.
+ */
+static void
+ReportInvalidCharacter(JSContext* cx, uint32_t offset)
+{
+    // UINT32_MAX prints as ten decimal digits, so eleven bytes are needed to
+    // hold the text plus its terminating NUL without truncation.
+    char buffer[11];
+    SprintfLiteral(buffer, "%u", offset);
+    JS_ReportErrorFlagsAndNumberASCII(cx, JSREPORT_ERROR, GetErrorMessage, nullptr,
+                                      JSMSG_MALFORMED_UTF8_CHAR, buffer);
+}
+
+/*
+ * Overload selected when only a js::ExclusiveContext is available: nothing is
+ * reported.
+ */
+static void
+ReportInvalidCharacter(js::ExclusiveContext* cx, uint32_t offset)
+{
+}
+
+/*
+ * Report JSMSG_BUFFER_TOO_SMALL on |cx|. The second parameter is unused; it
+ * keeps the signature parallel to the other report helpers in this file.
+ */
+static void
+ReportBufferTooSmall(JSContext* cx, uint32_t /* dummy */)
+{
+    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, JSMSG_BUFFER_TOO_SMALL);
+}
+
+/*
+ * Overload selected when only a js::ExclusiveContext is available: nothing is
+ * reported.
+ */
+static void
+ReportBufferTooSmall(js::ExclusiveContext* /* cx */, uint32_t /* dummy */)
+{
+}
+
+/*
+ * Report JSMSG_UTF8_CHAR_TOO_LARGE on |cx|. |v| is the decoded value with
+ * 0x10000 already subtracted, so that bias is added back before the value is
+ * formatted as hexadecimal.
+ */
+static void
+ReportTooBigCharacter(JSContext* cx, uint32_t v)
+{
+    // "0x" plus up to eight hex digits plus the NUL needs eleven bytes. The
+    // widest case is reachable: v + 0x10000 can be 0xFFFFFFFF when the decoder
+    // handed back UINT32_MAX for the sequence.
+    char buffer[11];
+    SprintfLiteral(buffer, "0x%x", v + 0x10000);
+    JS_ReportErrorFlagsAndNumberASCII(cx, JSREPORT_ERROR, GetErrorMessage, nullptr,
+                                      JSMSG_UTF8_CHAR_TOO_LARGE, buffer);
+}
+
+/*
+ * Overload selected when only a js::ExclusiveContext is available: nothing is
+ * reported.
+ */
+static void
+ReportTooBigCharacter(js::ExclusiveContext* cx, uint32_t v)
+{
+}
+
+enum InflateUTF8Action { // Template selector for what InflateUTF8StringToBuffer does.
+ CountAndReportInvalids, // Count output units; on malformed input call the report helper and return false.
+ CountAndIgnoreInvalids, // Count output units; malformed input is counted as one unit and skipped.
+ AssertNoInvalids, // Malformed input triggers MOZ_CRASH.
+ Copy, // Write decoded units into dst, substituting a replacement character for malformed input.
+ FindEncoding // Only compute the smallest encoding; returns early once a value above 0xFF is decoded.
+};
+
+static const char16_t REPLACE_UTF8 = 0xFFFD;
+static const Latin1Char REPLACE_UTF8_LATIN1 = '?';
+
+// If making changes to this algorithm, make sure to also update
+// LossyConvertUTF8toUTF16() in dom/wifi/WifiUtils.cpp
+//
+// Walk the UTF-8 bytes in |src| once, validating each sequence and acting on
+// it according to |Action|:
+//   - Copy writes the decoded code units into |dst|.
+//   - The Count* actions only count how many code units would be written.
+//   - FindEncoding only determines the smallest encoding.
+//   - AssertNoInvalids crashes on malformed input.
+//
+// |*dstlenp| receives the number of output code units unless Action is
+// AssertNoInvalids or FindEncoding. |*smallestEncoding| is updated unless
+// Action is AssertNoInvalids (which may pass nullptr). It ends up ASCII when
+// only ASCII was seen, Latin1 when every decoded value is at most 0xFF, and
+// UTF16 otherwise, including whenever a malformed sequence is tolerated.
+//
+// Returns false only for CountAndReportInvalids, after the error was reported.
+template <InflateUTF8Action Action, typename CharT, class ContextT>
+static bool
+InflateUTF8StringToBuffer(ContextT* cx, const UTF8Chars src, CharT* dst, size_t* dstlenp,
+                          JS::SmallestEncoding *smallestEncoding)
+{
+    if (Action != AssertNoInvalids)
+        *smallestEncoding = JS::SmallestEncoding::ASCII;
+    auto RequireLatin1 = [&smallestEncoding]{
+        *smallestEncoding = std::max(JS::SmallestEncoding::Latin1, *smallestEncoding);
+    };
+    auto RequireUTF16 = [&smallestEncoding]{
+        *smallestEncoding = JS::SmallestEncoding::UTF16;
+    };
+
+    // Count how many code units need to be in the inflated string.
+    // |i| is the index into |src|, and |j| is the index into |dst|.
+    size_t srclen = src.length();
+    uint32_t j = 0;
+    for (uint32_t i = 0; i < srclen; i++, j++) {
+        uint32_t v = uint32_t(src[i]);
+        if (!(v & 0x80)) {
+            // ASCII code unit.  Simple copy.
+            if (Action == Copy)
+                dst[j] = CharT(v);
+
+        } else {
+            // Non-ASCII code unit.  Determine its length in bytes (n).
+            uint32_t n = 1;
+            while (v & (0x80 >> n))
+                n++;
+
+            // Handle a malformed sequence. Depending on Action this reports
+            // and fails, crashes, or substitutes a replacement character and
+            // resumes |n2| bytes further on. The replacement is chosen from
+            // CharT itself: decltype(dst[0]) is a reference type and would
+            // never compare equal to Latin1Char. Only this tolerated-invalid
+            // path forces UTF16; valid sequences keep the encoding they
+            // computed.
+            #define INVALID(report, arg, n2)                                    \
+            do {                                                                \
+                if (Action == CountAndReportInvalids) {                         \
+                    report(cx, arg);                                            \
+                    return false;                                               \
+                } else if (Action == AssertNoInvalids) {                        \
+                    MOZ_CRASH("invalid UTF-8 string: " # report);               \
+                } else {                                                        \
+                    if (Action == Copy) {                                       \
+                        if (std::is_same<CharT, Latin1Char>::value)             \
+                            dst[j] = CharT(REPLACE_UTF8_LATIN1);                \
+                        else                                                    \
+                            dst[j] = CharT(REPLACE_UTF8);                       \
+                    } else {                                                    \
+                        MOZ_ASSERT(Action == CountAndIgnoreInvalids ||          \
+                                   Action == FindEncoding);                     \
+                    }                                                           \
+                    RequireUTF16();                                             \
+                    n = n2;                                                     \
+                    goto invalidMultiByteCodeUnit;                              \
+                }                                                               \
+            } while (0)
+
+            // Check the leading byte.
+            if (n < 2 || n > 4)
+                INVALID(ReportInvalidCharacter, i, 1);
+
+            // Check that |src| is large enough to hold an n-byte code unit.
+            if (i + n > srclen)
+                INVALID(ReportBufferTooSmall, /* dummy = */ 0, 1);
+
+            // Check the second byte.  From Unicode Standard v6.2, Table 3-7
+            // Well-Formed UTF-8 Byte Sequences.
+            if ((v == 0xE0 && ((uint8_t)src[i + 1] & 0xE0) != 0xA0) ||  // E0 A0~BF
+                (v == 0xED && ((uint8_t)src[i + 1] & 0xE0) != 0x80) ||  // ED 80~9F
+                (v == 0xF0 && ((uint8_t)src[i + 1] & 0xF0) == 0x80) ||  // F0 90~BF
+                (v == 0xF4 && ((uint8_t)src[i + 1] & 0xF0) != 0x80))    // F4 80~8F
+            {
+                INVALID(ReportInvalidCharacter, i, 1);
+            }
+
+            // Check the continuation bytes.
+            for (uint32_t m = 1; m < n; m++) {
+                if ((src[i + m] & 0xC0) != 0x80)
+                    INVALID(ReportInvalidCharacter, i, m);
+            }
+
+            // Determine the code unit's length in CharT and act accordingly.
+            v = JS::Utf8ToOneUcs4Char((uint8_t*)&src[i], n);
+            if (Action != AssertNoInvalids) {
+                if (v > 0xff) {
+                    RequireUTF16();
+                    if (Action == FindEncoding) {
+                        MOZ_ASSERT(dst == nullptr);
+                        return true;
+                    }
+                } else {
+                    RequireLatin1();
+                }
+            }
+            if (v < 0x10000) {
+                // The n-byte UTF8 code unit will fit in a single CharT.
+                if (Action == Copy)
+                    dst[j] = CharT(v);
+            } else {
+                v -= 0x10000;
+                if (v <= 0xFFFFF) {
+                    // The n-byte UTF8 code unit will fit in two CharT units.
+                    if (Action == Copy)
+                        dst[j] = CharT((v >> 10) + 0xD800);
+                    j++;
+                    if (Action == Copy)
+                        dst[j] = CharT((v & 0x3FF) + 0xDC00);
+
+                } else {
+                    // The n-byte UTF8 code unit won't fit in two CharT units.
+                    INVALID(ReportTooBigCharacter, v, 1);
+                }
+            }
+
+          invalidMultiByteCodeUnit:
+            // Reached both by fall-through after a valid sequence and by the
+            // goto in INVALID. Move i to the last byte of the multi-byte code
+            // unit; the loop header will do the final i++ to move to the start
+            // of the next code unit.
+            i += n - 1;
+        }
+    }
+
+    if (Action != AssertNoInvalids && Action != FindEncoding)
+        *dstlenp = j;
+
+    return true;
+}
+
+// INVALID is only meaningful inside InflateUTF8StringToBuffer.
+#undef INVALID
+
+/*
+ * Inflate |src| into a freshly allocated, NUL-terminated buffer of
+ * CharsT::CharT and store its length (without the NUL) in |*outlen|.
+ *
+ * A first pass with |Action| measures and validates the input; a second pass
+ * fills the buffer. Returns an empty CharsT when the first pass fails or when
+ * allocation fails (after reporting out-of-memory on |cx|).
+ */
+template <InflateUTF8Action Action, typename CharsT, class ContextT>
+static CharsT
+InflateUTF8StringHelper(ContextT* cx, const UTF8Chars src, size_t* outlen)
+{
+    using CharT = typename CharsT::CharT;
+    *outlen = 0;
+
+    JS::SmallestEncoding encoding;
+    const bool measured =
+        InflateUTF8StringToBuffer<Action, CharT>(cx, src, /* dst = */ nullptr, outlen, &encoding);
+    if (!measured)
+        return CharsT();
+
+    CharT* const buffer = cx->template pod_malloc<CharT>(*outlen + 1);  // +1 for NUL
+    if (!buffer) {
+        ReportOutOfMemory(cx);
+        return CharsT();
+    }
+
+    if (encoding != JS::SmallestEncoding::ASCII) {
+        MOZ_ALWAYS_TRUE((InflateUTF8StringToBuffer<Copy, CharT>(cx, src, buffer, outlen,
+                                                                &encoding)));
+    } else {
+        // Pure ASCII input maps one byte to one code unit, so widen directly.
+        const size_t srclen = src.length();
+        MOZ_ASSERT(*outlen == srclen);
+        for (uint32_t k = 0; k < srclen; k++)
+            buffer[k] = CharT(src[k]);
+    }
+
+    buffer[*outlen] = 0;  // NUL char
+
+    return CharsT(buffer, *outlen);
+}
+
+/*
+ * Inflate |utf8| to a new NUL-terminated two-byte string, reporting an error
+ * and failing on malformed input.
+ */
+TwoByteCharsZ
+JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
+{
+    TwoByteCharsZ inflated =
+        InflateUTF8StringHelper<CountAndReportInvalids, TwoByteCharsZ>(cx, utf8, outlen);
+    return inflated;
+}
+
+/*
+ * Same conversion for a NUL-terminated input; its length is taken with strlen.
+ */
+TwoByteCharsZ
+JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen)
+{
+    const char* const text = utf8.c_str();
+    UTF8Chars chars(text, strlen(text));
+    TwoByteCharsZ inflated =
+        InflateUTF8StringHelper<CountAndReportInvalids, TwoByteCharsZ>(cx, chars, outlen);
+    return inflated;
+}
+
+/*
+ * Inflate |utf8| to a new NUL-terminated two-byte string. Malformed input is
+ * tolerated rather than reported.
+ */
+TwoByteCharsZ
+js::LossyUTF8CharsToNewTwoByteCharsZ(js::ExclusiveContext* cx, const JS::UTF8Chars utf8,
+                                     size_t* outlen)
+{
+    TwoByteCharsZ inflated =
+        InflateUTF8StringHelper<CountAndIgnoreInvalids, TwoByteCharsZ>(cx, utf8, outlen);
+    return inflated;
+}
+
+/* Public JSContext flavour; forwards to the js:: implementation. */
+TwoByteCharsZ
+JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
+{
+    TwoByteCharsZ inflated = js::LossyUTF8CharsToNewTwoByteCharsZ(cx, utf8, outlen);
+    return inflated;
+}
+
+/*
+ * Same lossy conversion for a NUL-terminated input; its length is taken with
+ * strlen.
+ */
+TwoByteCharsZ
+js::LossyUTF8CharsToNewTwoByteCharsZ(js::ExclusiveContext* cx, const JS::ConstUTF8CharsZ& utf8,
+                                     size_t* outlen)
+{
+    const char* const text = utf8.c_str();
+    UTF8Chars chars(text, strlen(text));
+    TwoByteCharsZ inflated =
+        InflateUTF8StringHelper<CountAndIgnoreInvalids, TwoByteCharsZ>(cx, chars, outlen);
+    return inflated;
+}
+
+/* Public JSContext flavour; forwards to the js:: implementation. */
+TwoByteCharsZ
+JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen)
+{
+    TwoByteCharsZ inflated = js::LossyUTF8CharsToNewTwoByteCharsZ(cx, utf8, outlen);
+    return inflated;
+}
+
+/*
+ * Run the FindEncoding scan over |utf8| and return the encoding it computed.
+ * No context, destination buffer or length out-parameter is supplied.
+ */
+JS::SmallestEncoding
+JS::FindSmallestEncoding(UTF8Chars utf8)
+{
+    JSContext* const noContext = nullptr;
+    char16_t* const noDestination = nullptr;
+    size_t* const noLength = nullptr;
+
+    JS::SmallestEncoding result;
+    MOZ_ALWAYS_TRUE((InflateUTF8StringToBuffer<FindEncoding, char16_t, JSContext>(
+                         noContext, utf8, noDestination, noLength, &result)));
+    return result;
+}
+
+/*
+ * Inflate |utf8| to a new NUL-terminated Latin1 string, reporting an error and
+ * failing on malformed input.
+ */
+Latin1CharsZ
+JS::UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
+{
+    Latin1CharsZ inflated =
+        InflateUTF8StringHelper<CountAndReportInvalids, Latin1CharsZ>(cx, utf8, outlen);
+    return inflated;
+}
+
+/*
+ * Inflate |utf8| to a new NUL-terminated Latin1 string. Malformed input is
+ * tolerated rather than reported.
+ */
+Latin1CharsZ
+js::LossyUTF8CharsToNewLatin1CharsZ(js::ExclusiveContext* cx, const JS::UTF8Chars utf8,
+                                    size_t* outlen)
+{
+    Latin1CharsZ inflated =
+        InflateUTF8StringHelper<CountAndIgnoreInvalids, Latin1CharsZ>(cx, utf8, outlen);
+    return inflated;
+}
+
+/* Public JSContext flavour; forwards to the js:: implementation. */
+Latin1CharsZ
+JS::LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
+{
+    Latin1CharsZ inflated = js::LossyUTF8CharsToNewLatin1CharsZ(cx, utf8, outlen);
+    return inflated;
+}
+
+#ifdef DEBUG
+/*
+ * Debug-only check that the first |aLength| bytes of data_ are well-formed
+ * UTF-8. The scan runs in AssertNoInvalids mode, which crashes on malformed
+ * input and needs no context, destination or out-parameters.
+ */
+void
+JS::ConstUTF8CharsZ::validate(size_t aLength)
+{
+    MOZ_ASSERT(data_);
+
+    JSContext* const noContext = nullptr;
+    char16_t* const noDestination = nullptr;
+    size_t* const noLength = nullptr;
+    JS::SmallestEncoding* const noEncoding = nullptr;
+
+    UTF8Chars chars(data_, aLength);
+    InflateUTF8StringToBuffer<AssertNoInvalids, char16_t, JSContext>(
+        noContext, chars, noDestination, noLength, noEncoding);
+}
+#endif
+
+/*
+ * Return true when no byte of the NUL-terminated string |s| has its high bit
+ * set.
+ */
+bool
+JS::StringIsASCII(const char* s)
+{
+    for (const char* cursor = s; *cursor; ++cursor) {
+        if (*cursor & 0x80)
+            return false;
+    }
+    return true;
+}