js/src/regexp/special-case.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79

// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_REGEXP_SPECIAL_CASE_H_
#define V8_REGEXP_SPECIAL_CASE_H_

#ifdef V8_INTL_SUPPORT
#include "unicode/uversion.h"
// Forward declaration of ICU's UnicodeSet, which the declarations below use as
// their return type. U_ICU_NAMESPACE is presumably supplied by
// "unicode/uversion.h" (included above) -- confirm against the ICU headers.
namespace U_ICU_NAMESPACE {
class UnicodeSet;
}  //  namespace U_ICU_NAMESPACE

namespace v8 {
namespace internal {

// Functions to build special sets of Unicode characters that need special
// handling under "i" mode that cannot use closeOver(USET_CASE_INSENSITIVE).
//
// For the characters in the "ignore set", the process should not treat other
// characters in the result of closeOver(USET_CASE_INSENSITIVE) as case
// equivalent under the ECMA262 RegExp "i" mode, because these characters are
// themselves uppercase and no other character in the set uppercases to them.
//
// For the characters in the "special add set", the process should add only
// those characters in the result of closeOver(USET_CASE_INSENSITIVE) which are
// not uppercase characters as case equivalent under the ECMA262 RegExp "i"
// mode, and also the ONE uppercase character that the other non-uppercase
// characters uppercase into. Other uppercase characters in the result of
// closeOver(USET_CASE_INSENSITIVE) should not be considered, because ECMA262
// RegExp "i" mode considers two characters "case equivalent" if both
// characters uppercase to the same character.
//
// For example, consider the following case equivalent set defined by the
// Unicode standard. Notice there is more than one uppercase character in this
// set:
//  U+212B Å Angstrom Sign - an uppercase character.
//  U+00C5 Å Latin Capital Letter A with Ring Above - an uppercase character.
//  U+00E5 å Latin Small Letter A with Ring Above - a lowercase character which
//    uppercases to U+00C5.
// This case equivalent set is a special set and needs special handling when
// considering "case equivalent" under the ECMA262 RegExp "i" mode, which
// differs from the Unicode Standard:
//  * U+212B should be included in the "ignore" set because, under the ECMA262
//    "i" mode, no other characters are considered "case equivalent" to it:
//    U+212B is itself uppercase, but neither U+00C5 nor U+00E5 uppercases to
//    U+212B.
//  * U+00C5 and U+00E5 will both be included in the "special add" set. While
//    calculating the "equivalent set" under ECMA262 "i" mode, the process will
//    add U+00E5, because it is not an uppercase character in the set. The
//    process will also add U+00C5, because it is the uppercase character which
//    the other non-uppercase character, U+00E5, uppercases into.
//
// For characters not included in "ignore set" and "special add set", the
// process will just use closeOver(USET_CASE_INSENSITIVE) to calculate, which is
// much faster.
//
// Under Unicode 12.0, there are only 7 characters in the "special add set" and
// 4 characters in the "ignore set", so even though the special add process is
// slower, it is limited to a small set of cases only.
//
// The implementation of these two functions will be generated by calling ICU
// icu::UnicodeSet during the build time into gen/src/regexp/special-case.cc by
// the code in src/regexp/gen-regexp-special-case.cc.
//
// These two functions will be used with the LazyInstance<> template to
// generate globally sharable sets, to reduce memory usage and speed up
// performance.

// Builds and returns, by value, the "ignore set": the characters for which the
// other members of closeOver(USET_CASE_INSENSITIVE) must not be treated as case
// equivalent under ECMA262 RegExp "i" mode. See the comment block earlier in
// this header for the exact rules and an example.
icu::UnicodeSet BuildIgnoreSet();

// Builds and returns, by value, the "special add set": the characters whose
// case equivalents under ECMA262 RegExp "i" mode are computed by the slower
// special add process instead of plain closeOver(USET_CASE_INSENSITIVE). See
// the comment block earlier in this header for the exact rules and an example.
icu::UnicodeSet BuildSpecialAddSet();

}  // namespace internal
}  // namespace v8

#endif  // V8_INTL_SUPPORT

#endif  // V8_REGEXP_SPECIAL_CASE_H_