summaryrefslogtreecommitdiffstats
path: root/modules/brotli/enc/utf8_util.c
blob: 04a7805161b87db8fe7cf0c8ab92702c95e13e20 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
/* Copyright 2013 Google Inc. All Rights Reserved.

   Distributed under MIT license.
   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
*/

/* Heuristics for deciding about the UTF8-ness of strings. */

#include "./utf8_util.h"

#include <brotli/types.h>

#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif

/* Decodes one symbol from the start of |input|, looking at no more than
   |size| bytes beyond what the lead byte requires (the first byte is always
   read).

   On a well-formed, shortest-form 1..4 byte sequence the decoded code point
   is stored in |*symbol| and the sequence length is returned. A zero byte,
   a truncated sequence, a bad continuation byte, an overlong encoding or a
   value above 0x10FFFF is not accepted: in that case |*symbol| receives
   0x110000 | input[0] (a marker outside the Unicode range) and 1 is returned.

   Note: 3-byte sequences are only range-checked against 0x7FF, so encoded
   surrogates (0xD800..0xDFFF) are reported as regular symbols. */
static size_t BrotliParseAsUTF8(
    int* symbol, const uint8_t* input, size_t size) {
  const int lead = input[0];
  int code;

  if ((lead & 0x80) == 0) {
    /* 0xxxxxxx: one byte; NUL falls through to the "not UTF8" path. */
    if (lead != 0) {
      *symbol = lead;
      return 1;
    }
  } else if ((lead & 0xE0) == 0xC0) {
    /* 110xxxxx 10xxxxxx: two bytes. */
    if (size >= 2u && (input[1] & 0xC0) == 0x80) {
      code = ((lead & 0x1F) << 6) | (input[1] & 0x3F);
      /* Values that fit in one byte are overlong here. */
      if (code >= 0x80) {
        *symbol = code;
        return 2;
      }
    }
  } else if ((lead & 0xF0) == 0xE0) {
    /* 1110xxxx 10xxxxxx 10xxxxxx: three bytes. */
    if (size >= 3u &&
        (input[1] & 0xC0) == 0x80 &&
        (input[2] & 0xC0) == 0x80) {
      code = ((lead & 0x0F) << 12) |
             ((input[1] & 0x3F) << 6) |
             (input[2] & 0x3F);
      /* Values that fit in two bytes are overlong here. */
      if (code >= 0x800) {
        *symbol = code;
        return 3;
      }
    }
  } else if ((lead & 0xF8) == 0xF0) {
    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: four bytes. */
    if (size >= 4u &&
        (input[1] & 0xC0) == 0x80 &&
        (input[2] & 0xC0) == 0x80 &&
        (input[3] & 0xC0) == 0x80) {
      code = ((lead & 0x07) << 18) |
             ((input[1] & 0x3F) << 12) |
             ((input[2] & 0x3F) << 6) |
             (input[3] & 0x3F);
      /* Reject overlong forms and anything past the last code point. */
      if (code >= 0x10000 && code < 0x110000) {
        *symbol = code;
        return 4;
      }
    }
  }

  /* Not UTF8: emit a special symbol above the Unicode code space that still
     carries the offending byte, and consume just that byte. */
  *symbol = 0x110000 | lead;
  return 1;
}

/* Heuristic check of how much of a byte range parses as UTF8.

   Walks |length| bytes, where the i-th symbol starts at
   data[(pos + i) & mask], decoding one symbol at a time with
   BrotliParseAsUTF8 and summing the byte lengths of the symbols it accepts.

   Returns BROTLI_TRUE when that byte count is strictly greater than
   min_fraction * length, BROTLI_FALSE otherwise. */
BROTLI_BOOL BrotliIsMostlyUTF8(
    const uint8_t* data, const size_t pos, const size_t mask,
    const size_t length, const double min_fraction) {
  size_t utf8_bytes = 0;
  size_t offset;
  for (offset = 0; offset < length;) {
    int code_point;
    const size_t consumed = BrotliParseAsUTF8(
        &code_point, &data[(pos + offset) & mask], length - offset);
    /* Symbols at or above 0x110000 mark bytes that failed to parse. */
    if (code_point < 0x110000) {
      utf8_bytes += consumed;
    }
    offset += consumed;
  }
  return TO_BROTLI_BOOL(utf8_bytes > min_fraction * (double)length);
}

#if defined(__cplusplus) || defined(c_plusplus)
}  /* extern "C" */
#endif