diff options
Diffstat (limited to 'js/src/regexp/special-case.h')
-rw-r--r-- | js/src/regexp/special-case.h | 140 |
1 files changed, 89 insertions, 51 deletions
diff --git a/js/src/regexp/special-case.h b/js/src/regexp/special-case.h index 1ccec5d31..3aca98302 100644 --- a/js/src/regexp/special-case.h +++ b/js/src/regexp/special-case.h @@ -6,70 +6,108 @@ #define V8_REGEXP_SPECIAL_CASE_H_ #ifdef V8_INTL_SUPPORT -#include "unicode/uversion.h" -namespace U_ICU_NAMESPACE { -class UnicodeSet; -} // namespace U_ICU_NAMESPACE +#include "regexp/regexp-shim.h" + +#include "unicode/uchar.h" +#include "unicode/uniset.h" +#include "unicode/unistr.h" namespace v8 { namespace internal { -// Functions to build special sets of Unicode characters that need special -// handling under "i" mode that cannot use closeOver(USET_CASE_INSENSITIVE). +// Sets of Unicode characters that need special handling under "i" mode + +// For non-unicode ignoreCase matches (aka "i", not "iu"), ECMA 262 +// defines slightly different case-folding rules than Unicode. An +// input character should match a pattern character if the result of +// the Canonicalize algorithm is the same for both characters. // -// For the characters in the "ignore set", the process should not treat other -// characters in the result of closeOver(USET_CASE_INSENSITIVE) as case -// equivlant under the ECMA262 RegExp "i" mode because these characters are -// uppercase themselves that no other characters in the set uppercase to. +// Roughly speaking, for "i" regexps, Canonicalize(c) is the same as +// c.toUpperCase(), unless a) c.toUpperCase() is a multi-character +// string, or b) c is non-ASCII, and c.toUpperCase() is ASCII. See +// https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch for +// the precise definition. // -// For the characters in the "special add set", the proecess should add only -// those characters in the result of closeOver(USET_CASE_INSENSITIVE) which is -// not uppercase characters as case equivlant under the ECMA262 RegExp "i" mode -// and also that ONE uppercase character that other non uppercase character -// uppercase into to the set. Other uppercase characters in the result of -// closeOver(USET_CASE_INSENSITIVE) should not be considered because ECMA262 -// RegExp "i" mode consider two characters as "case equivlant" if both -// characters uppercase to the same character. +// While compiling such regular expressions, we need to compute the +// set of characters that should match a given input character. (See +// GetCaseIndependentLetters and CharacterRange::AddCaseEquivalents.) +// For almost all characters, this can be efficiently computed using +// UnicodeSet::closeOver(USET_CASE_INSENSITIVE). These sets represent +// the remaining special cases. // -// For example, consider the following case equivalent set defined by Unicode -// standard. Notice there are more than one uppercase characters in this set: -// U+212B Å Angstrom Sign - an uppercase character. -// U+00C5 Å Latin Capital Letter A with Ring Above - an uppercase character. -// U+00E5 å Latin Small Letter A with Ring Above - a lowercase character which -// uppercase to U+00C5. -// In this case equivlant set is a special set and need special handling while -// considering "case equivlant" under the ECMA262 RegExp "i" mode which is -// different than Unicode Standard: -// * U+212B should be included into the "ignore" set because there are no other -// characters, under the ECMA262 "i" mode, are considered as "case equivlant" -// to it because U+212B is itself an uppercase but neither U+00C5 nor U+00E5 -// uppercase to U+212B. -// * U+00C5 and U+00E5 will both be included into the "special add" set. While -// calculate the "equivlant set" under ECMA262 "i" mode, the process will -// add U+00E5, because it is not an uppercase character in the set. The -// process will also add U+00C5, because it is the uppercase character which -// other non uppercase character, U+00C5, uppercase into. +// For a character c, the rules are as follows: // -// For characters not included in "ignore set" and "special add set", the -// process will just use closeOver(USET_CASE_INSENSITIVE) to calcualte, which is -// much faster. +// 1. If c is in neither IgnoreSet nor SpecialAddSet, then calling +// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) on a UnicodeSet +// containing c will produce the set of characters that should +// match /c/i (or /[c]/i), and only those characters. // -// Under Unicode 12.0, there are only 7 characters in the "special add set" and -// 4 characters in "ignore set" so even the special add process is slower, it is -// limited to a small set of cases only. +// 2. If c is in IgnoreSet, then the only character it should match is +// itself. However, closeOver will add additional incorrect +// matches. For example, consider SHARP S: 'ß' (U+00DF) and 'ẞ' +// (U+1E9E). Although closeOver('ß') = "ßẞ", uppercase('ß') is +// "SS". Step 3.e therefore requires that 'ß' canonicalizes to +// itself, and should not match 'ẞ'. In these cases, we can skip +// the closeOver entirely, because it will never add an equivalent +// character. // -// The implementation of these two function will be generated by calling ICU -// icu::UnicodeSet during the build time into gen/src/regexp/special-case.cc by -// the code in src/regexp/gen-regexp-special-case.cc. +// 3. If c is in SpecialAddSet, then it should match at least one +// character other than itself. However, closeOver will add at +// least one additional incorrect match. For example, consider the +// letter 'k'. Closing over 'k' gives "kKK" (lowercase k, uppercase +// K, U+212A KELVIN SIGN). However, because of step 3.g, KELVIN +// SIGN should not match either of the other two characters. As a +// result, "k" and "K" are in SpecialAddSet (and KELVIN SIGN is in +// IgnoreSet). To find the correct matches for characters in +// SpecialAddSet, we closeOver the original character, but filter +// out the results that do not have the same canonical value. // -// These two function will be used with LazyInstance<> template to generate -// global sharable set to reduce memory usage and speed up performance. +// The contents of these sets are calculated at build time by +// src/regexp/gen-regexp-special-case.cc, which generates +// gen/src/regexp/special-case.cc. This is done by iterating over the +// result of closeOver for each BMP character, and finding sets for +// which at least one character has a different canonical value than +// another character. Characters that match no other characters in +// their equivalence class are added to IgnoreSet. Characters that +// match at least one other character are added to SpecialAddSet. + +class RegExpCaseFolding final : public AllStatic { + public: + static const icu::UnicodeSet& IgnoreSet(); + static const icu::UnicodeSet& SpecialAddSet(); + + // This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics: + // Canonicalize) step 3, which is used to determine whether + // characters match when ignoreCase is true and unicode is false. + static UChar32 Canonicalize(UChar32 ch) { + // a. Assert: ch is a UTF-16 code unit. + CHECK_LE(ch, 0xffff); + + // b. Let s be the String value consisting of the single code unit ch. + icu::UnicodeString s(ch); + + // c. Let u be the same result produced as if by performing the algorithm + // for String.prototype.toUpperCase using s as the this value. + // d. Assert: Type(u) is String. + icu::UnicodeString& u = s.toUpper(); + + // e. If u does not consist of a single code unit, return ch. + if (u.length() != 1) { + return ch; + } + + // f. Let cu be u's single code unit element. + UChar32 cu = u.char32At(0); -// Function to build and return the Ignore set. -icu::UnicodeSet BuildIgnoreSet(); + // g. If the value of ch >= 128 and the value of cu < 128, return ch. + if (ch >= 128 && cu < 128) { + return ch; + } -// Function to build and return the Special Add set. -icu::UnicodeSet BuildSpecialAddSet(); + // h. Return cu. + return cu; + } +}; } // namespace internal } // namespace v8 |