/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 * vim: set ts=8 sts=4 et sw=4 tw=99:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "js/CharacterEncoding.h"

#include "mozilla/Range.h"
#include "mozilla/Sprintf.h"

#include <algorithm>
#include <type_traits>

#include "jscntxt.h"
#include "jsprf.h"

using namespace js;

Latin1CharsZ
JS::LossyTwoByteCharsToNewLatin1CharsZ(js::ExclusiveContext* cx,
                                       const mozilla::Range<const char16_t> tbchars)
{
    MOZ_ASSERT(cx);
    size_t len = tbchars.length();
    unsigned char* latin1 = cx->pod_malloc<unsigned char>(len + 1);
    if (!latin1)
        return Latin1CharsZ();
    for (size_t i = 0; i < len; ++i)
        latin1[i] = static_cast<unsigned char>(tbchars[i]);
    latin1[len] = '\0';
    return Latin1CharsZ(latin1, len);
}

template <typename CharT>
static size_t
GetDeflatedUTF8StringLength(const CharT* chars, size_t nchars)
{
    size_t nbytes = nchars;
    for (const CharT* end = chars + nchars; chars < end; chars++) {
        char16_t c = *chars;
        if (c < 0x80)
            continue;
        uint32_t v;
        if (0xD800 <= c && c <= 0xDFFF) {
            /* nbytes sets 1 length since this is surrogate pair. */
            if (c >= 0xDC00 || (chars + 1) == end) {
                nbytes += 2; /* Bad Surrogate */
                continue;
            }
            char16_t c2 = chars[1];
            if (c2 < 0xDC00 || c2 > 0xDFFF) {
                nbytes += 2; /* Bad Surrogate */
                continue;
            }
            v = ((c - 0xD800) << 10) + (c2 - 0xDC00) + 0x10000;
            nbytes--;
            chars++;
        } else {
            v = c;
        }
        v >>= 11;
        nbytes++;
        while (v) {
            v >>= 5;
            nbytes++;
        }
    }
    return nbytes;
}

JS_PUBLIC_API(size_t)
JS::GetDeflatedUTF8StringLength(JSFlatString* s)
{
    JS::AutoCheckCannotGC nogc;
    return s->hasLatin1Chars()
           ? ::GetDeflatedUTF8StringLength(s->latin1Chars(nogc), s->length())
           : ::GetDeflatedUTF8StringLength(s->twoByteChars(nogc), s->length());
}

static const char16_t UTF8_REPLACEMENT_CHAR = 0xFFFD;

template <typename CharT>
static void
DeflateStringToUTF8Buffer(const CharT* src, size_t srclen, mozilla::RangedPtr<char> dst,
                          size_t* dstlenp = nullptr, size_t* numcharsp = nullptr)
{
    size_t capacity = 0;
    if (dstlenp) {
        capacity = *dstlenp;
        *dstlenp = 0;
    }
    if (numcharsp)
        *numcharsp = 0;

    while (srclen) {
        uint32_t v;
        char16_t c = *src++;
        srclen--;
        if (c >= 0xDC00 && c <= 0xDFFF) {
            v = UTF8_REPLACEMENT_CHAR;
        } else if (c < 0xD800 || c > 0xDBFF) {
            v = c;
        } else {
            if (srclen < 1) {
                v = UTF8_REPLACEMENT_CHAR;
            } else {
                char16_t c2 = *src;
                if (c2 < 0xDC00 || c2 > 0xDFFF) {
                    v = UTF8_REPLACEMENT_CHAR;
                } else {
                    src++;
                    srclen--;
                    v = ((c - 0xD800) << 10) + (c2 - 0xDC00) + 0x10000;
                }
            }
        }

        size_t utf8Len;
        if (v < 0x0080) {
            /* no encoding necessary - performance hack */
            if (dstlenp && *dstlenp + 1 > capacity)
                return;
            *dst++ = char(v);
            utf8Len = 1;
        } else {
            uint8_t utf8buf[4];
            utf8Len = OneUcs4ToUtf8Char(utf8buf, v);
            if (dstlenp && *dstlenp + utf8Len > capacity)
                return;
            for (size_t i = 0; i < utf8Len; i++)
                *dst++ = char(utf8buf[i]);
        }

        if (dstlenp)
            *dstlenp += utf8Len;
        if (numcharsp)
            (*numcharsp)++;
    }
}

JS_PUBLIC_API(void)
JS::DeflateStringToUTF8Buffer(JSFlatString* src, mozilla::RangedPtr<char> dst,
                              size_t* dstlenp, size_t* numcharsp)
{
    JS::AutoCheckCannotGC nogc;
    return src->hasLatin1Chars()
           ? ::DeflateStringToUTF8Buffer(src->latin1Chars(nogc), src->length(), dst,
                                         dstlenp, numcharsp)
           : ::DeflateStringToUTF8Buffer(src->twoByteChars(nogc), src->length(), dst,
                                         dstlenp, numcharsp);
}

template <typename CharT>
UTF8CharsZ
JS::CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx, const mozilla::Range<CharT> chars)
{
    /* Get required buffer size. */
    const CharT* str = chars.begin().get();
    size_t len = ::GetDeflatedUTF8StringLength(str, chars.length());

    /* Allocate buffer. */
    char* utf8;
    if (maybeCx)
        utf8 = maybeCx->pod_malloc<char>(len + 1);
    else
        utf8 = js_pod_malloc<char>(len + 1);
    if (!utf8)
        return UTF8CharsZ();

    /* Encode to UTF8. */
    ::DeflateStringToUTF8Buffer(str, chars.length(), mozilla::RangedPtr<char>(utf8, len));
    utf8[len] = '\0';

    return UTF8CharsZ(utf8, len);
}

template UTF8CharsZ
JS::CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx,
                         const mozilla::Range<Latin1Char> chars);

template UTF8CharsZ
JS::CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx,
                         const mozilla::Range<char16_t> chars);

template UTF8CharsZ
JS::CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx,
                         const mozilla::Range<const Latin1Char> chars);

template UTF8CharsZ
JS::CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx,
                         const mozilla::Range<const char16_t> chars);

static const uint32_t INVALID_UTF8 = UINT32_MAX;

/*
 * Convert a utf8 character sequence into a UCS-4 character and return that
 * character.  It is assumed that the caller already checked that the sequence
 * is valid.
 */
uint32_t
JS::Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length)
{
    MOZ_ASSERT(1 <= utf8Length && utf8Length <= 4);

    if (utf8Length == 1) {
        MOZ_ASSERT(!(*utf8Buffer & 0x80));
        return *utf8Buffer;
    }

    /* from Unicode 3.1, non-shortest form is illegal */
    static const uint32_t minucs4Table[] = { 0x80, 0x800, 0x10000 };

    MOZ_ASSERT((*utf8Buffer & (0x100 - (1 << (7 - utf8Length)))) ==
               (0x100 - (1 << (8 - utf8Length))));
    uint32_t ucs4Char = *utf8Buffer++ & ((1 << (7 - utf8Length)) - 1);
    uint32_t minucs4Char = minucs4Table[utf8Length - 2];
    while (--utf8Length) {
        MOZ_ASSERT((*utf8Buffer & 0xC0) == 0x80);
        ucs4Char = (ucs4Char << 6) | (*utf8Buffer++ & 0x3F);
    }

    if (MOZ_UNLIKELY(ucs4Char < minucs4Char || (ucs4Char >= 0xD800 && ucs4Char <= 0xDFFF)))
        return INVALID_UTF8;

    return ucs4Char;
}

static void
ReportInvalidCharacter(JSContext* cx, uint32_t offset)
{
    char buffer[10];
    SprintfLiteral(buffer, "%u", offset);
    JS_ReportErrorFlagsAndNumberASCII(cx, JSREPORT_ERROR, GetErrorMessage, nullptr,
                                      JSMSG_MALFORMED_UTF8_CHAR, buffer);
}

static void
ReportInvalidCharacter(js::ExclusiveContext* cx, uint32_t offset)
{
}

static void
ReportBufferTooSmall(JSContext* cx, uint32_t dummy)
{
    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, JSMSG_BUFFER_TOO_SMALL);
}

static void
ReportBufferTooSmall(js::ExclusiveContext* cx, uint32_t dummy)
{
}

static void
ReportTooBigCharacter(JSContext* cx, uint32_t v)
{
    char buffer[10];
    SprintfLiteral(buffer, "0x%x", v + 0x10000);
    JS_ReportErrorFlagsAndNumberASCII(cx, JSREPORT_ERROR, GetErrorMessage, nullptr,
                                      JSMSG_UTF8_CHAR_TOO_LARGE, buffer);
}

static void
ReportTooBigCharacter(js::ExclusiveContext* cx, uint32_t v)
{
}

enum InflateUTF8Action {
    CountAndReportInvalids,
    CountAndIgnoreInvalids,
    AssertNoInvalids,
    Copy,
    FindEncoding
};

static const char16_t REPLACE_UTF8 = 0xFFFD;
static const Latin1Char REPLACE_UTF8_LATIN1 = '?';

// If making changes to this algorithm, make sure to also update
// LossyConvertUTF8toUTF16() in dom/wifi/WifiUtils.cpp
template <InflateUTF8Action Action, typename CharT, class ContextT>
static bool
InflateUTF8StringToBuffer(ContextT* cx, const UTF8Chars src, CharT* dst, size_t* dstlenp,
                          JS::SmallestEncoding *smallestEncoding)
{
    if (Action != AssertNoInvalids)
        *smallestEncoding = JS::SmallestEncoding::ASCII;
    auto RequireLatin1 = [&smallestEncoding]{
        *smallestEncoding = std::max(JS::SmallestEncoding::Latin1, *smallestEncoding);
    };
    auto RequireUTF16 = [&smallestEncoding]{
        *smallestEncoding = JS::SmallestEncoding::UTF16;
    };

    // Count how many code units need to be in the inflated string.
    // |i| is the index into |src|, and |j| is the the index into |dst|.
    size_t srclen = src.length();
    uint32_t j = 0;
    for (uint32_t i = 0; i < srclen; i++, j++) {
        uint32_t v = uint32_t(src[i]);
        if (!(v & 0x80)) {
            // ASCII code unit.  Simple copy.
            if (Action == Copy)
                dst[j] = CharT(v);

        } else {
            // Non-ASCII code unit.  Determine its length in bytes (n).
            uint32_t n = 1;
            while (v & (0x80 >> n))
                n++;

        #define INVALID(report, arg, n2)                                \
            do {                                                        \
                if (Action == CountAndReportInvalids) {                 \
                    report(cx, arg);                                    \
                    return false;                                       \
                } else if (Action == AssertNoInvalids) {                \
                    MOZ_CRASH("invalid UTF-8 string: " # report);       \
                } else {                                                \
                    if (Action == Copy) {                               \
                        if (std::is_same<decltype(dst[0]), Latin1Char>::value) \
                            dst[j] = CharT(REPLACE_UTF8_LATIN1);        \
                        else                                            \
                            dst[j] = CharT(REPLACE_UTF8);               \
                    } else {                                            \
                        MOZ_ASSERT(Action == CountAndIgnoreInvalids ||  \
                                   Action == FindEncoding);             \
                    }                                                   \
                    n = n2;                                             \
                    goto invalidMultiByteCodeUnit;                      \
                }                                                       \
            } while (0)

            // Check the leading byte.
            if (n < 2 || n > 4)
                INVALID(ReportInvalidCharacter, i, 1);

            // Check that |src| is large enough to hold an n-byte code unit.
            if (i + n > srclen)
                INVALID(ReportBufferTooSmall, /* dummy = */ 0, 1);

            // Check the second byte.  From Unicode Standard v6.2, Table 3-7
            // Well-Formed UTF-8 Byte Sequences.
            if ((v == 0xE0 && ((uint8_t)src[i + 1] & 0xE0) != 0xA0) ||  // E0 A0~BF
                (v == 0xED && ((uint8_t)src[i + 1] & 0xE0) != 0x80) ||  // ED 80~9F
                (v == 0xF0 && ((uint8_t)src[i + 1] & 0xF0) == 0x80) ||  // F0 90~BF
                (v == 0xF4 && ((uint8_t)src[i + 1] & 0xF0) != 0x80))    // F4 80~8F
            {
                INVALID(ReportInvalidCharacter, i, 1);
            }

            // Check the continuation bytes.
            for (uint32_t m = 1; m < n; m++) {
                if ((src[i + m] & 0xC0) != 0x80)
                    INVALID(ReportInvalidCharacter, i, m);
            }

            // Determine the code unit's length in CharT and act accordingly.
            v = JS::Utf8ToOneUcs4Char((uint8_t*)&src[i], n);
            if (Action != AssertNoInvalids) {
                if (v > 0xff) {
                    RequireUTF16();
                    if (Action == FindEncoding) {
                        MOZ_ASSERT(dst == nullptr);
                        return true;
                    }
                } else {
                    RequireLatin1();
                }
            }
            if (v < 0x10000) {
                // The n-byte UTF8 code unit will fit in a single CharT.
                if (Action == Copy)
                    dst[j] = CharT(v);
            } else {
                v -= 0x10000;
                if (v <= 0xFFFFF) {
                    // The n-byte UTF8 code unit will fit in two CharT units.
                    if (Action == Copy)
                        dst[j] = CharT((v >> 10) + 0xD800);
                    j++;
                    if (Action == Copy)
                        dst[j] = CharT((v & 0x3FF) + 0xDC00);

                } else {
                    // The n-byte UTF8 code unit won't fit in two CharT units.
                    INVALID(ReportTooBigCharacter, v, 1);
                }
            }

          invalidMultiByteCodeUnit:
            // Move i to the last byte of the multi-byte code unit;  the loop
            // header will do the final i++ to move to the start of the next
            // code unit.
            i += n - 1;
            if (Action != AssertNoInvalids)
                RequireUTF16();
        }
    }

    if (Action != AssertNoInvalids && Action != FindEncoding)
        *dstlenp = j;

    return true;
}

template <InflateUTF8Action Action, typename CharsT, class ContextT>
static CharsT
InflateUTF8StringHelper(ContextT* cx, const UTF8Chars src, size_t* outlen)
{
    using CharT = typename CharsT::CharT;
    *outlen = 0;

    JS::SmallestEncoding encoding;
    if (!InflateUTF8StringToBuffer<Action, CharT>(cx, src, /* dst = */ nullptr, outlen, &encoding))
        return CharsT();

    CharT* dst = cx->template pod_malloc<CharT>(*outlen + 1);  // +1 for NUL
    if (!dst) {
        ReportOutOfMemory(cx);
        return CharsT();
    }

    if (encoding == JS::SmallestEncoding::ASCII) {
        size_t srclen = src.length();
        MOZ_ASSERT(*outlen == srclen);
        for (uint32_t i = 0; i < srclen; i++)
            dst[i] = CharT(src[i]);
    } else {
        MOZ_ALWAYS_TRUE((InflateUTF8StringToBuffer<Copy, CharT>(cx, src, dst, outlen, &encoding)));
    }

    dst[*outlen] = 0;    // NUL char

    return CharsT(dst, *outlen);
}

TwoByteCharsZ
JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
{
    return InflateUTF8StringHelper<CountAndReportInvalids, TwoByteCharsZ>(cx, utf8, outlen);
}

TwoByteCharsZ
JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen)
{
    UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));
    return InflateUTF8StringHelper<CountAndReportInvalids, TwoByteCharsZ>(cx, chars, outlen);
}

TwoByteCharsZ
js::LossyUTF8CharsToNewTwoByteCharsZ(js::ExclusiveContext* cx, const JS::UTF8Chars utf8, size_t* outlen)
{
    return InflateUTF8StringHelper<CountAndIgnoreInvalids, TwoByteCharsZ>(cx, utf8, outlen);
}

TwoByteCharsZ
JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
{
    return js::LossyUTF8CharsToNewTwoByteCharsZ(cx, utf8, outlen);
}

TwoByteCharsZ
js::LossyUTF8CharsToNewTwoByteCharsZ(js::ExclusiveContext* cx, const JS::ConstUTF8CharsZ& utf8, size_t* outlen)
{
    UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));
    return InflateUTF8StringHelper<CountAndIgnoreInvalids, TwoByteCharsZ>(cx, chars, outlen);
}

TwoByteCharsZ
JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen)
{
    return js::LossyUTF8CharsToNewTwoByteCharsZ(cx, utf8, outlen);
}

JS::SmallestEncoding
JS::FindSmallestEncoding(UTF8Chars utf8)
{
    JS::SmallestEncoding encoding;
    MOZ_ALWAYS_TRUE((InflateUTF8StringToBuffer<FindEncoding, char16_t, JSContext>(
                         /* cx = */ nullptr,
                         utf8,
                         /* dst = */ nullptr,
                         /* dstlen = */ nullptr,
                         &encoding)));
    return encoding;
}

Latin1CharsZ
JS::UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
{
    return InflateUTF8StringHelper<CountAndReportInvalids, Latin1CharsZ>(cx, utf8, outlen);
}

Latin1CharsZ
js::LossyUTF8CharsToNewLatin1CharsZ(js::ExclusiveContext* cx, const JS::UTF8Chars utf8, size_t* outlen)
{
    return InflateUTF8StringHelper<CountAndIgnoreInvalids, Latin1CharsZ>(cx, utf8, outlen);
}

Latin1CharsZ
JS::LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
{
    return js::LossyUTF8CharsToNewLatin1CharsZ(cx, utf8, outlen);
}

#ifdef DEBUG
void
JS::ConstUTF8CharsZ::validate(size_t aLength)
{
    MOZ_ASSERT(data_);
    UTF8Chars chars(data_, aLength);
    InflateUTF8StringToBuffer<AssertNoInvalids, char16_t, JSContext>(
        /* cx = */ nullptr,
        chars,
        /* dst = */ nullptr,
        /* dstlen = */ nullptr,
        /* smallestEncoding = */ nullptr);
}
#endif

bool
JS::StringIsASCII(const char* s)
{
    while (*s) {
        if (*s & 0x80)
            return false;
        s++;
    }
    return true;
}