summaryrefslogtreecommitdiffstats
path: root/js/public/CharacterEncoding.h
diff options
context:
space:
mode:
Diffstat (limited to 'js/public/CharacterEncoding.h')
-rw-r--r--js/public/CharacterEncoding.h338
1 files changed, 338 insertions, 0 deletions
diff --git a/js/public/CharacterEncoding.h b/js/public/CharacterEncoding.h
new file mode 100644
index 000000000..90a31d188
--- /dev/null
+++ b/js/public/CharacterEncoding.h
@@ -0,0 +1,338 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * vim: set ts=8 sts=4 et sw=4 tw=99:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef js_CharacterEncoding_h
+#define js_CharacterEncoding_h
+
+#include "mozilla/Range.h"
+
+#include "js/TypeDecls.h"
+#include "js/Utility.h"
+
+namespace js {
+class ExclusiveContext; // Forward declaration: this header only uses js::ExclusiveContext*.
+} // namespace js
+
+class JSFlatString; // Forward declaration: this header only uses JSFlatString*.
+
+namespace JS {
+
+/*
+ * A range of Latin-1 characters.
+ *
+ * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI
+ * are treated as ISO/IEC 8859-1, also known as Latin-1: each byte is treated
+ * as a 2-byte character, so there is no way to pass in a string containing
+ * characters beyond U+00FF.
+ *
+ * The underlying Range is over non-const Latin1Char, so the constructors that
+ * accept const pointers cast the constness away before storing them.
+ */
+class Latin1Chars : public mozilla::Range<Latin1Char>
+{
+    using Base = mozilla::Range<Latin1Char>;
+
+  public:
+    using CharT = Latin1Char;
+
+    /* Default-constructs the underlying Range. */
+    Latin1Chars()
+      : Base()
+    {}
+
+    /* View |aLength| chars starting at |aBytes| as Latin1Char. */
+    Latin1Chars(char* aBytes, size_t aLength)
+      : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength)
+    {}
+
+    /* Same, for a const Latin1Char buffer (constness is cast away). */
+    Latin1Chars(const Latin1Char* aBytes, size_t aLength)
+      : Base(const_cast<Latin1Char*>(aBytes), aLength)
+    {}
+
+    /* Same, for a const char buffer (constness is cast away). */
+    Latin1Chars(const char* aBytes, size_t aLength)
+      : Base(reinterpret_cast<Latin1Char*>(const_cast<char*>(aBytes)), aLength)
+    {}
+};
+
+/*
+ * The null-terminated counterpart of Latin1Chars, for C compatibility.
+ *
+ * Unlike Latin1Chars, this derives from RangedPtr rather than Range. The
+ * pointer-taking constructors assert that the element at index |aLength| is
+ * '\0', i.e. that the terminator directly follows the |aLength| characters.
+ */
+class Latin1CharsZ : public mozilla::RangedPtr<Latin1Char>
+{
+    using Base = mozilla::RangedPtr<Latin1Char>;
+
+  public:
+    using CharT = Latin1Char;
+
+    /* A null pointer with length zero. */
+    Latin1CharsZ()
+      : Base(nullptr, 0)
+    {}
+
+    /* Wrap a char buffer, viewing it as Latin1Char. */
+    Latin1CharsZ(char* aBytes, size_t aLength)
+      : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength)
+    {
+        MOZ_ASSERT(aBytes[aLength] == '\0');
+    }
+
+    /* Wrap a Latin1Char buffer directly. */
+    Latin1CharsZ(Latin1Char* aBytes, size_t aLength)
+      : Base(aBytes, aLength)
+    {
+        MOZ_ASSERT(aBytes[aLength] == '\0');
+    }
+
+    using Base::operator=;
+
+    /* The wrapped pointer, reinterpreted as a plain char*. */
+    char* c_str()
+    {
+        return reinterpret_cast<char*>(Base::get());
+    }
+};
+
+/*
+ * A range of UTF-8 bytes, stored as unsigned char.
+ *
+ * The constructors only reinterpret the given char pointer as unsigned char
+ * (casting constness away for the const overload, since the underlying Range
+ * is over non-const unsigned char) and forward it with |aLength| to Range.
+ */
+class UTF8Chars : public mozilla::Range<unsigned char>
+{
+    using Base = mozilla::Range<unsigned char>;
+
+  public:
+    using CharT = unsigned char;
+
+    /* Default-constructs the underlying Range. */
+    UTF8Chars()
+      : Base()
+    {}
+
+    /* View |aLength| chars starting at |aBytes| as unsigned char. */
+    UTF8Chars(char* aBytes, size_t aLength)
+      : Base(reinterpret_cast<unsigned char*>(aBytes), aLength)
+    {}
+
+    /* Same, for a const char buffer (constness is cast away). */
+    UTF8Chars(const char* aBytes, size_t aLength)
+      : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)), aLength)
+    {}
+};
+
+/*
+ * SpiderMonkey also deals directly with UTF-8 encoded text in some places.
+ *
+ * This is a RangedPtr over unsigned char for null-terminated UTF-8 text. The
+ * pointer-taking constructors assert that the element at index |aLength| is
+ * '\0', i.e. that the terminator directly follows the |aLength| bytes.
+ */
+class UTF8CharsZ : public mozilla::RangedPtr<unsigned char>
+{
+    using Base = mozilla::RangedPtr<unsigned char>;
+
+  public:
+    using CharT = unsigned char;
+
+    /* A null pointer with length zero. */
+    UTF8CharsZ()
+      : Base(nullptr, 0)
+    {}
+
+    /* Wrap a char buffer, viewing it as unsigned char. */
+    UTF8CharsZ(char* aBytes, size_t aLength)
+      : Base(reinterpret_cast<unsigned char*>(aBytes), aLength)
+    {
+        MOZ_ASSERT(aBytes[aLength] == '\0');
+    }
+
+    /* Wrap an unsigned char buffer directly. */
+    UTF8CharsZ(unsigned char* aBytes, size_t aLength)
+      : Base(aBytes, aLength)
+    {
+        MOZ_ASSERT(aBytes[aLength] == '\0');
+    }
+
+    using Base::operator=;
+
+    /* The wrapped pointer, reinterpreted as a plain char*. */
+    char* c_str()
+    {
+        return reinterpret_cast<char*>(Base::get());
+    }
+};
+
+/*
+ * A non-owning wrapper for a "const char*" that is encoded using UTF-8;
+ * managing the lifetime of the data is left to others. This differs from
+ * UTF8CharsZ in that the chars are const and it allows assignment.
+ */
+class JS_PUBLIC_API(ConstUTF8CharsZ)
+{
+    const char* data_;
+
+  public:
+    using CharT = unsigned char;
+
+    /* Wraps no data; the resulting object converts to false. */
+    ConstUTF8CharsZ()
+      : data_(nullptr)
+    {}
+
+    /*
+     * Wraps |aBytes|, asserting that the byte at index |aLength| is the null
+     * terminator. When DEBUG is defined, validate() is additionally called
+     * with |aLength|.
+     */
+    ConstUTF8CharsZ(const char* aBytes, size_t aLength)
+      : data_(aBytes)
+    {
+        MOZ_ASSERT(aBytes[aLength] == '\0');
+#ifdef DEBUG
+        validate(aLength);
+#endif
+    }
+
+    /* The wrapped pointer as an untyped const pointer. */
+    const void* get() const
+    {
+        return data_;
+    }
+
+    /* The wrapped pointer as a C string. */
+    const char* c_str() const
+    {
+        return data_;
+    }
+
+    /* True when a non-null pointer is wrapped. */
+    explicit operator bool() const
+    {
+        return data_ != nullptr;
+    }
+
+  private:
+#ifdef DEBUG
+    /* Declared only when DEBUG is defined; the definition is not in this header. */
+    void validate(size_t aLength);
+#endif
+};
+
+/*
+ * A range of 2-byte (char16_t) characters.
+ *
+ * SpiderMonkey uses a 2-byte character representation: it is a
+ * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2,
+ * but unlike UCS-2, UTF-16 extension bytes are not stripped. A sufficiently
+ * dedicated JavaScript program can therefore be fully Unicode-aware by
+ * manually interpreting the UTF-16 extension characters embedded in the JS
+ * string.
+ */
+class TwoByteChars : public mozilla::Range<char16_t>
+{
+    using Base = mozilla::Range<char16_t>;
+
+  public:
+    using CharT = char16_t;
+
+    /* Default-constructs the underlying Range. */
+    TwoByteChars()
+      : Base()
+    {}
+
+    /* A range of |aLength| characters starting at |aChars|. */
+    TwoByteChars(char16_t* aChars, size_t aLength)
+      : Base(aChars, aLength)
+    {}
+
+    /* Same, for a const buffer (constness is cast away). */
+    TwoByteChars(const char16_t* aChars, size_t aLength)
+      : Base(const_cast<char16_t*>(aChars), aLength)
+    {}
+};
+
+/*
+ * A TwoByteChars, but \0 terminated for compatibility with JSFlatString.
+ *
+ * This derives from RangedPtr rather than Range. The pointer-taking
+ * constructor asserts that the element at index |length| is '\0'.
+ */
+class TwoByteCharsZ : public mozilla::RangedPtr<char16_t>
+{
+    using Base = mozilla::RangedPtr<char16_t>;
+
+  public:
+    using CharT = char16_t;
+
+    /* A null pointer with length zero. */
+    TwoByteCharsZ()
+      : Base(nullptr, 0)
+    {}
+
+    /* Wrap |length| characters at |chars|, followed by a terminator. */
+    TwoByteCharsZ(char16_t* chars, size_t length)
+      : Base(chars, length)
+    {
+        MOZ_ASSERT(chars[length] == '\0');
+    }
+
+    using Base::operator=;
+};
+
+/* A mozilla::RangedPtr over const 2-byte (char16_t) characters. */
+using ConstCharPtr = mozilla::RangedPtr<const char16_t>;
+
+/*
+ * Like TwoByteChars, but the chars are const: the underlying Range is over
+ * const char16_t, so no constness has to be cast away.
+ */
+class ConstTwoByteChars : public mozilla::Range<const char16_t>
+{
+    using Base = mozilla::Range<const char16_t>;
+
+  public:
+    using CharT = char16_t;
+
+    /* Default-constructs the underlying Range. */
+    ConstTwoByteChars()
+      : Base()
+    {}
+
+    /* A range of |aLength| characters starting at |aChars|. */
+    ConstTwoByteChars(const char16_t* aChars, size_t aLength)
+      : Base(aChars, aLength)
+    {}
+};
+
+/*
+ * Convert a 2-byte character sequence to "ISO-Latin-1". This works by
+ * truncating each 2-byte character in the sequence to a single byte. If the
+ * source contains any UTF-16 extension characters, then this may give invalid
+ * Latin1 output.
+ *
+ * The returned string is zero terminated. It must be freed either by passing
+ * the returned string to JS_free, or by passing the returned string's |get()|
+ * pointer to js_free. If allocation fails, an OOM error will be set and the
+ * function will return a null Latin1CharsZ (which can be tested for with the
+ * ! operator). This function cannot trigger GC.
+ */
+extern Latin1CharsZ
+LossyTwoByteCharsToNewLatin1CharsZ(js::ExclusiveContext* cx,
+ const mozilla::Range<const char16_t> tbchars);
+
+/*
+ * Convenience overload: wraps |begin| and |length| in a
+ * mozilla::Range<const char16_t> and forwards to the Range-taking overload.
+ */
+inline Latin1CharsZ
+LossyTwoByteCharsToNewLatin1CharsZ(js::ExclusiveContext* cx, const char16_t* begin, size_t length)
+{
+    return JS::LossyTwoByteCharsToNewLatin1CharsZ(
+        cx, mozilla::Range<const char16_t>(begin, length));
+}
+
+/*
+ * Declaration of a conversion template, parameterized on the source character
+ * type |CharT|, that takes a range of |CharT| and returns a UTF8CharsZ. Only
+ * the declaration is visible in this header.
+ *
+ * NOTE(review): the name suggests the result is a newly allocated,
+ * null-terminated UTF-8 copy, and |maybeCx| suggests the context may be null
+ * -- confirm both against the definition.
+ */
+template <typename CharT>
+extern UTF8CharsZ CharsToNewUTF8CharsZ(js::ExclusiveContext* maybeCx,
+                                       const mozilla::Range<CharT> chars);
+
+/*
+ * Takes a buffer of UTF-8 bytes and its length and returns a uint32_t.
+ *
+ * NOTE(review): presumably this decodes the single UTF-8 sequence occupying
+ * |utf8Buffer[0 .. utf8Length)| into one UCS-4 code point; how malformed
+ * input is reported is not visible in this header -- confirm against the
+ * definition.
+ */
+JS_PUBLIC_API(uint32_t) Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length);
+
+/*
+ * Inflate bytes in UTF-8 encoding to char16_t.
+ * - On error, returns an empty TwoByteCharsZ.
+ * - On success, returns a malloc'd TwoByteCharsZ and stores its length in
+ *   |*outlen|; the length value excludes the trailing null.
+ *
+ * NOTE(review): this header declares JS_free overloads only for Latin1CharsZ
+ * and UTF8CharsZ, so the result presumably has to be released by passing its
+ * |get()| pointer to js_free -- confirm.
+ */
+extern JS_PUBLIC_API(TwoByteCharsZ)
+UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
+
+/*
+ * Like the UTF8Chars overload of UTF8CharsToNewTwoByteCharsZ, but the input
+ * is a null-terminated ConstUTF8CharsZ rather than a UTF8Chars range.
+ */
+extern JS_PUBLIC_API(TwoByteCharsZ)
+UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen);
+
+/*
+ * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8
+ * characters will be replaced by \uFFFD. No exception will be thrown for
+ * malformed UTF-8 input.
+ *
+ * Two overloads are declared below, mirroring UTF8CharsToNewTwoByteCharsZ:
+ * one taking a UTF8Chars range and one taking a ConstUTF8CharsZ.
+ */
+extern JS_PUBLIC_API(TwoByteCharsZ)
+LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
+
+extern JS_PUBLIC_API(TwoByteCharsZ)
+LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen);
+
+/*
+ * Returns the length of the char buffer required to encode |s| as UTF-8.
+ * The returned length does not include the null terminator.
+ */
+JS_PUBLIC_API(size_t)
+GetDeflatedUTF8StringLength(JSFlatString* s);
+
+/*
+ * Encode |src| as UTF-8 into |dst|. The caller must either ensure |dst| has
+ * enough space to encode the entire string, or pass the length of the buffer
+ * as |dstlenp|, in which case the function will encode characters from the
+ * string until the buffer is exhausted. Does not write the null terminator.
+ *
+ * If |dstlenp| is provided, it will be updated to hold the number of bytes
+ * written to the buffer. If |numcharsp| is provided, it will be updated to
+ * hold the number of Unicode characters written to the buffer (which can be
+ * less than the length of the string, if the buffer is exhausted before the
+ * string is fully encoded).
+ *
+ * Both |dstlenp| and |numcharsp| are optional and default to nullptr.
+ */
+JS_PUBLIC_API(void)
+DeflateStringToUTF8Buffer(JSFlatString* src, mozilla::RangedPtr<char> dst,
+ size_t* dstlenp = nullptr, size_t* numcharsp = nullptr);
+
+/*
+ * The smallest character encoding capable of fully representing a particular
+ * string. This is the result type of FindSmallestEncoding().
+ */
+enum class SmallestEncoding {
+ ASCII, // every code point is < 128
+ Latin1, // not ASCII, but every code point is < 256
+ UTF16 // at least one code point is >= 256
+};
+
+/*
+ * Returns the smallest encoding possible for the given UTF-8 string: ASCII if
+ * all code points are < 128, otherwise Latin1 if all code points are < 256,
+ * otherwise UTF16.
+ */
+JS_PUBLIC_API(SmallestEncoding)
+FindSmallestEncoding(UTF8Chars utf8);
+
+/*
+ * Return a null-terminated Latin-1 string copied from the UTF-8 input |utf8|,
+ * storing its length (excluding the null terminator) in |*outlen|. If the
+ * input contains non-Latin-1 code points, report an error and fail. Returns
+ * Latin1CharsZ() on failure.
+ */
+extern JS_PUBLIC_API(Latin1CharsZ)
+UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
+
+/*
+ * Return a null-terminated Latin-1 string copied from the UTF-8 input |utf8|,
+ * storing its length (excluding the null terminator) in |*outlen|. Unlike
+ * UTF8CharsToNewLatin1CharsZ, non-Latin-1 code points do not cause a failure:
+ * they are replaced by '?'. Returns Latin1CharsZ() on failure.
+ *
+ * NOTE(review): the remaining failure conditions (presumably allocation
+ * failure) are not spelled out in this header -- confirm.
+ */
+extern JS_PUBLIC_API(Latin1CharsZ)
+LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
+
+/*
+ * Returns true if every character in the null-terminated string |s| is ASCII
+ * (i.e. < 0x80), and false otherwise.
+ */
+extern JS_PUBLIC_API(bool)
+StringIsASCII(const char* s);
+
+} // namespace JS
+
+/* Frees a Latin1CharsZ by passing its raw |get()| pointer to js_free. */
+inline void
+JS_free(JS::Latin1CharsZ& ptr)
+{
+    js_free(static_cast<void*>(ptr.get()));
+}
+/* Frees a UTF8CharsZ by passing its raw |get()| pointer to js_free. */
+inline void
+JS_free(JS::UTF8CharsZ& ptr)
+{
+    js_free(static_cast<void*>(ptr.get()));
+}
+
+#endif /* js_CharacterEncoding_h */