1 files changed, 531 insertions, 0 deletions
diff --git a/js/src/vm/CharacterEncoding.cpp b/js/src/vm/CharacterEncoding.cpp
new file mode 100644
index 000000000..4644b0a36
--- /dev/null
+++ b/js/src/vm/CharacterEncoding.cpp
@@ -0,0 +1,531 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * vim: set ts=8 sts=4 et sw=4 tw=99:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "js/CharacterEncoding.h"
+
+#include "mozilla/Range.h"
+#include "mozilla/Sprintf.h"
+
+#include <algorithm>
+#include <type_traits>
+
+#include "jscntxt.h"
+#include "jsprf.h"
+
+using namespace js;
+
+Latin1CharsZ
+JS::LossyTwoByteCharsToNewLatin1CharsZ(js::ExclusiveContext* cx,
+                                       const mozilla::Range<const char16_t> tbchars)
+{
+    MOZ_ASSERT(cx);
+    size_t len = tbchars.length();
+    unsigned char* latin1 = cx->pod_malloc<unsigned char>(len + 1);
+    if (!latin1)
+        return Latin1CharsZ();
+    for (size_t i = 0; i < len; ++i)
+        latin1[i] = static_cast<unsigned char>(tbchars[i]);
+    latin1[len] = '\0';
+    return Latin1CharsZ(latin1, len);
+}
+
+template <typename CharT>
+static size_t
+GetDeflatedUTF8StringLength(const CharT* chars, size_t nchars)
+{
+    size_t nbytes = nchars;
+    for (const CharT* end = chars + nchars; chars < end; chars++) {
+        char16_t c = *chars;
+        if (c < 0x80)
+            continue;
+        uint32_t v;
+        if (0xD800 <= c && c <= 0xDFFF) {
+            /* nbytes sets 1 length since this is surrogate pair. */
+            if (c >= 0xDC00 || (chars + 1) == end) {
+                nbytes += 2; /* Bad Surrogate */
+                continue;
+            }
+            char16_t c2 = chars[1];
+            if (c2 < 0xDC00 || c2 > 0xDFFF) {
+                nbytes += 2; /* Bad Surrogate */
+                continue;
+            }
+            v = ((c - 0xD800) << 10) + (c2 - 0xDC00) + 0x10000;
+            nbytes--;
+            chars++;
+        } else {
+            v = c;
+        }
+        v >>= 11;
+        nbytes++;
+        while (v) {
+            v >>= 5;
+            nbytes++;
+        }
+    }
+    return nbytes;
+}
+
+JS_PUBLIC_API(size_t)
+JS::GetDeflatedUTF8StringLength(JSFlatString* s)
+{
+    JS::AutoCheckCannotGC nogc;
+    return s->hasLatin1Chars()
+           ? ::GetDeflatedUTF8StringLength(s->latin1Chars(nogc), s->length())
+           : ::GetDeflatedUTF8StringLength(s->twoByteChars(nogc), s->length());
+}
+
+static const char16_t UTF8_REPLACEMENT_CHAR = 0xFFFD;
+
+template <typename CharT>
+static void
+DeflateStringToUTF8Buffer(const CharT* src, size_t srclen, mozilla::RangedPtr<char> dst,
+                          size_t* dstlenp = nullptr, size_t* numcharsp = nullptr)
+{
+    size_t capacity = 0;
+    if (dstlenp) {
+        capacity = *dstlenp;
+        *dstlenp = 0;
+    }
+    if (numcharsp)
+        *numcharsp = 0;
+
+    while (srclen) {
+        uint32_t v;
+        char16_t c = *src++;
+        srclen--;
+        if (c >= 0xDC00 && c <= 0xDFFF) {
+            v = UTF8_REPLACEMENT_CHAR;
+        } else if (c < 0xD800 || c > 0xDBFF) {
+            v = c;
+        } else {
+            if (srclen < 1) {
+                v = UTF8_REPLACEMENT_CHAR;
+            } else {
+                char16_t c2 = *src;
+                if (c2 < 0xDC00 || c2 > 0xDFFF) {
+                    v = UTF8_REPLACEMENT_CHAR;
+                } else {
+                    src++;
+                    srclen--;
+                    v = ((c - 0xD800) << 10) + (c2 - 0xDC00) + 0x10000;
+                }
+            }
+        }
+
+        size_t utf8Len;
+        if (v < 0x0080) {
+            /* no encoding necessary - performance hack */
+            if (dstlenp && *dstlenp + 1 > capacity)
+                return;
+            *dst++ = char(v);
+            utf8Len = 1;
+        } else {
+            uint8_t utf8buf[4];
+            utf8Len = OneUcs4ToUtf8Char(utf8buf, v);
+            if (dstlenp && *dstlenp + utf8Len > capacity)
+                return;
+            for (size_t i = 0; i < utf8Len; i++)
+                *dst++ = char(utf8buf[i]);
+        }
+
+        if (dstlenp)
+            *dstlenp += utf8Len;
+        if (numcharsp)
+            (*numcharsp)++;
+    }
+}
+
+JS_PUBLIC_API(void)
+JS::DeflateStringToUTF8Buffer(JSFlatString* src, mozilla::RangedPtr<char> dst,
+                              size_t* dstlenp, size_t* numcharsp)
+{
+    JS::AutoCheckCannotGC nogc;
+    return src->hasLatin1Chars()
+           ? ::DeflateStringToUTF8Buffer(src->latin1Chars(nogc), src->length(), dst,
+                                         dstlenp, numcharsp)
+           : ::DeflateStringToUTF8Buffer(src->twoByteChars(nogc), src->length(), dst,
+                                         dstlenp, numcharsp);
+}
+
+template <typename CharT>
+UTF8CharsZ
+JS::CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx, const mozilla::Range<CharT> chars)
+{
+    /* Get required buffer size. */
+    const CharT* str = chars.begin().get();
+    size_t len = ::GetDeflatedUTF8StringLength(str, chars.length());
+
+    /* Allocate buffer. */
+    char* utf8;
+    if (maybeCx)
+        utf8 = maybeCx->pod_malloc<char>(len + 1);
+    else
+        utf8 = js_pod_malloc<char>(len + 1);
+    if (!utf8)
+        return UTF8CharsZ();
+
+    /* Encode to UTF8. */
+    ::DeflateStringToUTF8Buffer(str, chars.length(), mozilla::RangedPtr<char>(utf8, len));
+    utf8[len] = '\0';
+
+    return UTF8CharsZ(utf8, len);
+}
+
+template UTF8CharsZ
+JS::CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx,
+                         const mozilla::Range<Latin1Char> chars);
+
+template UTF8CharsZ
+JS::CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx,
+                         const mozilla::Range<char16_t> chars);
+
+template UTF8CharsZ
+JS::CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx,
+                         const mozilla::Range<const Latin1Char> chars);
+
+template UTF8CharsZ
+JS::CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx,
+                         const mozilla::Range<const char16_t> chars);
+
+static const uint32_t INVALID_UTF8 = UINT32_MAX;
+
+/*
+ * Convert a utf8 character sequence into a UCS-4 character and return that
+ * character.  It is assumed that the caller already checked that the sequence
+ * is valid.
+ */
+uint32_t
+JS::Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length)
+{
+    MOZ_ASSERT(1 <= utf8Length && utf8Length <= 4);
+
+    if (utf8Length == 1) {
+        MOZ_ASSERT(!(*utf8Buffer & 0x80));
+        return *utf8Buffer;
+    }
+
+    /* from Unicode 3.1, non-shortest form is illegal */
+    static const uint32_t minucs4Table[] = { 0x80, 0x800, 0x10000 };
+
+    MOZ_ASSERT((*utf8Buffer & (0x100 - (1 << (7 - utf8Length)))) ==
+               (0x100 - (1 << (8 - utf8Length))));
+    uint32_t ucs4Char = *utf8Buffer++ & ((1 << (7 - utf8Length)) - 1);
+    uint32_t minucs4Char = minucs4Table[utf8Length - 2];
+    while (--utf8Length) {
+        MOZ_ASSERT((*utf8Buffer & 0xC0) == 0x80);
+        ucs4Char = (ucs4Char << 6) | (*utf8Buffer++ & 0x3F);
+    }
+
+    if (MOZ_UNLIKELY(ucs4Char < minucs4Char || (ucs4Char >= 0xD800 && ucs4Char <= 0xDFFF)))
+        return INVALID_UTF8;
+
+    return ucs4Char;
+}
+
+static void
+ReportInvalidCharacter(JSContext* cx, uint32_t offset)
+{
+    char buffer[10];
+    SprintfLiteral(buffer, "%u", offset);
+    JS_ReportErrorFlagsAndNumberASCII(cx, JSREPORT_ERROR, GetErrorMessage, nullptr,
+                                      JSMSG_MALFORMED_UTF8_CHAR, buffer);
+}
+
+static void
+ReportInvalidCharacter(js::ExclusiveContext* cx, uint32_t offset)
+{
+}
+
+static void
+ReportBufferTooSmall(JSContext* cx, uint32_t dummy)
+{
+    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, JSMSG_BUFFER_TOO_SMALL);
+}
+
+static void
+ReportBufferTooSmall(js::ExclusiveContext* cx, uint32_t dummy)
+{
+}
+
+static void
+ReportTooBigCharacter(JSContext* cx, uint32_t v)
+{
+    char buffer[10];
+    SprintfLiteral(buffer, "0x%x", v + 0x10000);
+    JS_ReportErrorFlagsAndNumberASCII(cx, JSREPORT_ERROR, GetErrorMessage, nullptr,
+                                      JSMSG_UTF8_CHAR_TOO_LARGE, buffer);
+}
+
+static void
+ReportTooBigCharacter(js::ExclusiveContext* cx, uint32_t v)
+{
+}
+
+enum InflateUTF8Action {
+    CountAndReportInvalids,
+    CountAndIgnoreInvalids,
+    AssertNoInvalids,
+    Copy,
+    FindEncoding
+};
+
+static const char16_t REPLACE_UTF8 = 0xFFFD;
+static const Latin1Char REPLACE_UTF8_LATIN1 = '?';
+
+// If making changes to this algorithm, make sure to also update
+// LossyConvertUTF8toUTF16() in dom/wifi/WifiUtils.cpp
+template <InflateUTF8Action Action, typename CharT, class ContextT>
+static bool
+InflateUTF8StringToBuffer(ContextT* cx, const UTF8Chars src, CharT* dst, size_t* dstlenp,
+                          JS::SmallestEncoding *smallestEncoding)
+{
+    if (Action != AssertNoInvalids)
+        *smallestEncoding = JS::SmallestEncoding::ASCII;
+    auto RequireLatin1 = [&smallestEncoding]{
+        *smallestEncoding = std::max(JS::SmallestEncoding::Latin1, *smallestEncoding);
+    };
+    auto RequireUTF16 = [&smallestEncoding]{
+        *smallestEncoding = JS::SmallestEncoding::UTF16;
+    };
+
+    // Count how many code units need to be in the inflated string.
+    // |i| is the index into |src|, and |j| is the the index into |dst|.
+    size_t srclen = src.length();
+    uint32_t j = 0;
+    for (uint32_t i = 0; i < srclen; i++, j++) {
+        uint32_t v = uint32_t(src[i]);
+        if (!(v & 0x80)) {
+            // ASCII code unit.  Simple copy.
+            if (Action == Copy)
+                dst[j] = CharT(v);
+
+        } else {
+            // Non-ASCII code unit.  Determine its length in bytes (n).
+            uint32_t n = 1;
+            while (v & (0x80 >> n))
+                n++;
+
+        #define INVALID(report, arg, n2)                                \
+            do {                                                        \
+                if (Action == CountAndReportInvalids) {                 \
+                    report(cx, arg);                                    \
+                    return false;                                       \
+                } else if (Action == AssertNoInvalids) {                \
+                    MOZ_CRASH("invalid UTF-8 string: " # report);       \
+                } else {                                                \
+                    if (Action == Copy) {                               \
+                        if (std::is_same<decltype(dst[0]), Latin1Char>::value) \
+                            dst[j] = CharT(REPLACE_UTF8_LATIN1);        \
+                        else                                            \
+                            dst[j] = CharT(REPLACE_UTF8);               \
+                    } else {                                            \
+                        MOZ_ASSERT(Action == CountAndIgnoreInvalids ||  \
+                                   Action == FindEncoding);             \
+                    }                                                   \
+                    n = n2;                                             \
+                    goto invalidMultiByteCodeUnit;                      \
+                }                                                       \
+            } while (0)
+
+            // Check the leading byte.
+            if (n < 2 || n > 4)
+                INVALID(ReportInvalidCharacter, i, 1);
+
+            // Check that |src| is large enough to hold an n-byte code unit.
+            if (i + n > srclen)
+                INVALID(ReportBufferTooSmall, /* dummy = */ 0, 1);
+
+            // Check the second byte.  From Unicode Standard v6.2, Table 3-7
+            // Well-Formed UTF-8 Byte Sequences.
+            if ((v == 0xE0 && ((uint8_t)src[i + 1] & 0xE0) != 0xA0) ||  // E0 A0~BF
+                (v == 0xED && ((uint8_t)src[i + 1] & 0xE0) != 0x80) ||  // ED 80~9F
+                (v == 0xF0 && ((uint8_t)src[i + 1] & 0xF0) == 0x80) ||  // F0 90~BF
+                (v == 0xF4 && ((uint8_t)src[i + 1] & 0xF0) != 0x80))    // F4 80~8F
+            {
+                INVALID(ReportInvalidCharacter, i, 1);
+            }
+
+            // Check the continuation bytes.
+            for (uint32_t m = 1; m < n; m++) {
+                if ((src[i + m] & 0xC0) != 0x80)
+                    INVALID(ReportInvalidCharacter, i, m);
+            }
+
+            // Determine the code unit's length in CharT and act accordingly.
+            v = JS::Utf8ToOneUcs4Char((uint8_t*)&src[i], n);
+            if (Action != AssertNoInvalids) {
+                if (v > 0xff) {
+                    RequireUTF16();
+                    if (Action == FindEncoding) {
+                        MOZ_ASSERT(dst == nullptr);
+                        return true;
+                    }
+                } else {
+                    RequireLatin1();
+                }
+            }
+            if (v < 0x10000) {
+                // The n-byte UTF8 code unit will fit in a single CharT.
+                if (Action == Copy)
+                    dst[j] = CharT(v);
+            } else {
+                v -= 0x10000;
+                if (v <= 0xFFFFF) {
+                    // The n-byte UTF8 code unit will fit in two CharT units.
+                    if (Action == Copy)
+                        dst[j] = CharT((v >> 10) + 0xD800);
+                    j++;
+                    if (Action == Copy)
+                        dst[j] = CharT((v & 0x3FF) + 0xDC00);
+
+                } else {
+                    // The n-byte UTF8 code unit won't fit in two CharT units.
+                    INVALID(ReportTooBigCharacter, v, 1);
+                }
+            }
+
+          invalidMultiByteCodeUnit:
+            // Move i to the last byte of the multi-byte code unit;  the loop
+            // header will do the final i++ to move to the start of the next
+            // code unit.
+            i += n - 1;
+            if (Action != AssertNoInvalids)
+                RequireUTF16();
+        }
+    }
+
+    if (Action != AssertNoInvalids && Action != FindEncoding)
+        *dstlenp = j;
+
+    return true;
+}
+
+template <InflateUTF8Action Action, typename CharsT, class ContextT>
+static CharsT
+InflateUTF8StringHelper(ContextT* cx, const UTF8Chars src, size_t* outlen)
+{
+    using CharT = typename CharsT::CharT;
+    *outlen = 0;
+
+    JS::SmallestEncoding encoding;
+    if (!InflateUTF8StringToBuffer<Action, CharT>(cx, src, /* dst = */ nullptr, outlen, &encoding))
+        return CharsT();
+
+    CharT* dst = cx->template pod_malloc<CharT>(*outlen + 1);  // +1 for NUL
+    if (!dst) {
+        ReportOutOfMemory(cx);
+        return CharsT();
+    }
+
+    if (encoding == JS::SmallestEncoding::ASCII) {
+        size_t srclen = src.length();
+        MOZ_ASSERT(*outlen == srclen);
+        for (uint32_t i = 0; i < srclen; i++)
+            dst[i] = CharT(src[i]);
+    } else {
+        MOZ_ALWAYS_TRUE((InflateUTF8StringToBuffer<Copy, CharT>(cx, src, dst, outlen, &encoding)));
+    }
+
+    dst[*outlen] = 0;    // NUL char
+
+    return CharsT(dst, *outlen);
+}
+
+TwoByteCharsZ
+JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
+{
+    return InflateUTF8StringHelper<CountAndReportInvalids, TwoByteCharsZ>(cx, utf8, outlen);
+}
+
+TwoByteCharsZ
+JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen)
+{
+    UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));
+    return InflateUTF8StringHelper<CountAndReportInvalids, TwoByteCharsZ>(cx, chars, outlen);
+}
+
+TwoByteCharsZ
+js::LossyUTF8CharsToNewTwoByteCharsZ(js::ExclusiveContext* cx, const JS::UTF8Chars utf8, size_t* outlen)
+{
+    return InflateUTF8StringHelper<CountAndIgnoreInvalids, TwoByteCharsZ>(cx, utf8, outlen);
+}
+
+TwoByteCharsZ
+JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
+{
+    return js::LossyUTF8CharsToNewTwoByteCharsZ(cx, utf8, outlen);
+}
+
+TwoByteCharsZ
+js::LossyUTF8CharsToNewTwoByteCharsZ(js::ExclusiveContext* cx, const JS::ConstUTF8CharsZ& utf8, size_t* outlen)
+{
+    UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));
+    return InflateUTF8StringHelper<CountAndIgnoreInvalids, TwoByteCharsZ>(cx, chars, outlen);
+}
+
+TwoByteCharsZ
+JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen)
+{
+    return js::LossyUTF8CharsToNewTwoByteCharsZ(cx, utf8, outlen);
+}
+
+JS::SmallestEncoding
+JS::FindSmallestEncoding(UTF8Chars utf8)
+{
+    JS::SmallestEncoding encoding;
+    MOZ_ALWAYS_TRUE((InflateUTF8StringToBuffer<FindEncoding, char16_t, JSContext>(
+                         /* cx = */ nullptr,
+                         utf8,
+                         /* dst = */ nullptr,
+                         /* dstlen = */ nullptr,
+                         &encoding)));
+    return encoding;
+}
+
+Latin1CharsZ
+JS::UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
+{
+    return InflateUTF8StringHelper<CountAndReportInvalids, Latin1CharsZ>(cx, utf8, outlen);
+}
+
+Latin1CharsZ
+js::LossyUTF8CharsToNewLatin1CharsZ(js::ExclusiveContext* cx, const JS::UTF8Chars utf8, size_t* outlen)
+{
+    return InflateUTF8StringHelper<CountAndIgnoreInvalids, Latin1CharsZ>(cx, utf8, outlen);
+}
+
+Latin1CharsZ
+JS::LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
+{
+    return js::LossyUTF8CharsToNewLatin1CharsZ(cx, utf8, outlen);
+}
+
+#ifdef DEBUG
+void
+JS::ConstUTF8CharsZ::validate(size_t aLength)
+{
+    MOZ_ASSERT(data_);
+    UTF8Chars chars(data_, aLength);
+    InflateUTF8StringToBuffer<AssertNoInvalids, char16_t, JSContext>(
+        /* cx = */ nullptr,
+        chars,
+        /* dst = */ nullptr,
+        /* dstlen = */ nullptr,
+        /* smallestEncoding = */ nullptr);
+}
+#endif
+
+bool
+JS::StringIsASCII(const char* s)
+{
+    while (*s) {
+        if (*s & 0x80)
+            return false;
+        s++;
+    }
+    return true;
+}