diff options
author | wolfbeast <mcwerewolf@wolfbeast.com> | 2019-11-14 09:07:29 +0100 |
---|---|---|
committer | wolfbeast <mcwerewolf@wolfbeast.com> | 2019-11-14 09:07:29 +0100 |
commit | 56de283899bc91f7110aba58a3ca174c10852683 (patch) | |
tree | 779e6501bbbe4f015509c423ab44f2f40ea97cc8 /modules/brotli/enc/utf8_util.c | |
parent | ce0dd36a78814c59950fde6c19413c1f7ea85ee1 (diff) | |
download | UXP-56de283899bc91f7110aba58a3ca174c10852683.tar UXP-56de283899bc91f7110aba58a3ca174c10852683.tar.gz UXP-56de283899bc91f7110aba58a3ca174c10852683.tar.lz UXP-56de283899bc91f7110aba58a3ca174c10852683.tar.xz UXP-56de283899bc91f7110aba58a3ca174c10852683.zip |
Issue #1288 - Part 1a: Update brotli to 1.0.7
This also reorganizes the exports in the build system to use `brotli/`
as include directory.
Diffstat (limited to 'modules/brotli/enc/utf8_util.c')
-rw-r--r-- | modules/brotli/enc/utf8_util.c | 85 |
1 files changed, 85 insertions, 0 deletions
diff --git a/modules/brotli/enc/utf8_util.c b/modules/brotli/enc/utf8_util.c new file mode 100644 index 000000000..04a780516 --- /dev/null +++ b/modules/brotli/enc/utf8_util.c @@ -0,0 +1,85 @@ +/* Copyright 2013 Google Inc. All Rights Reserved. + + Distributed under MIT license. + See file LICENSE for detail or copy at https://opensource.org/licenses/MIT +*/ + +/* Heuristics for deciding about the UTF8-ness of strings. */ + +#include "./utf8_util.h" + +#include <brotli/types.h> + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +static size_t BrotliParseAsUTF8( + int* symbol, const uint8_t* input, size_t size) { + /* ASCII */ + if ((input[0] & 0x80) == 0) { + *symbol = input[0]; + if (*symbol > 0) { + return 1; + } + } + /* 2-byte UTF8 */ + if (size > 1u && + (input[0] & 0xE0) == 0xC0 && + (input[1] & 0xC0) == 0x80) { + *symbol = (((input[0] & 0x1F) << 6) | + (input[1] & 0x3F)); + if (*symbol > 0x7F) { + return 2; + } + } + /* 3-byte UFT8 */ + if (size > 2u && + (input[0] & 0xF0) == 0xE0 && + (input[1] & 0xC0) == 0x80 && + (input[2] & 0xC0) == 0x80) { + *symbol = (((input[0] & 0x0F) << 12) | + ((input[1] & 0x3F) << 6) | + (input[2] & 0x3F)); + if (*symbol > 0x7FF) { + return 3; + } + } + /* 4-byte UFT8 */ + if (size > 3u && + (input[0] & 0xF8) == 0xF0 && + (input[1] & 0xC0) == 0x80 && + (input[2] & 0xC0) == 0x80 && + (input[3] & 0xC0) == 0x80) { + *symbol = (((input[0] & 0x07) << 18) | + ((input[1] & 0x3F) << 12) | + ((input[2] & 0x3F) << 6) | + (input[3] & 0x3F)); + if (*symbol > 0xFFFF && *symbol <= 0x10FFFF) { + return 4; + } + } + /* Not UTF8, emit a special symbol above the UTF8-code space */ + *symbol = 0x110000 | input[0]; + return 1; +} + +/* Returns 1 if at least min_fraction of the data is UTF8-encoded.*/ +BROTLI_BOOL BrotliIsMostlyUTF8( + const uint8_t* data, const size_t pos, const size_t mask, + const size_t length, const double min_fraction) { + size_t size_utf8 = 0; + size_t i = 0; + while (i < length) { + int symbol; + size_t bytes_read = + BrotliParseAsUTF8(&symbol, &data[(pos + i) & mask], length - i); + i += bytes_read; + if (symbol < 0x110000) size_utf8 += bytes_read; + } + return TO_BROTLI_BOOL(size_utf8 > min_fraction * (double)length); +} + +#if defined(__cplusplus) || defined(c_plusplus) +} /* extern "C" */ +#endif |