1 files changed, 89 insertions, 51 deletions
diff --git a/js/src/regexp/special-case.h b/js/src/regexp/special-case.h
index 1ccec5d31..3aca98302 100644
--- a/js/src/regexp/special-case.h
+++ b/js/src/regexp/special-case.h
@@ -6,70 +6,108 @@
 #define V8_REGEXP_SPECIAL_CASE_H_
 
 #ifdef V8_INTL_SUPPORT
-#include "unicode/uversion.h"
-namespace U_ICU_NAMESPACE {
-class UnicodeSet;
-}  //  namespace U_ICU_NAMESPACE
+#include "regexp/regexp-shim.h"
+#include "unicode/locid.h"
+#include "unicode/uchar.h"
+#include "unicode/uniset.h"
+#include "unicode/unistr.h"
 
 namespace v8 {
 namespace internal {
 
-// Functions to build special sets of Unicode characters that need special
-// handling under "i" mode that cannot use closeOver(USET_CASE_INSENSITIVE).
+// Sets of Unicode characters that need special handling under "i" mode
+
+// For non-unicode ignoreCase matches (aka "i", not "iu"), ECMA 262
+// defines slightly different case-folding rules than Unicode. An
+// input character should match a pattern character if the result of
+// the Canonicalize algorithm is the same for both characters.
 //
-// For the characters in the "ignore set", the process should not treat other
-// characters in the result of closeOver(USET_CASE_INSENSITIVE) as case
-// equivlant under the ECMA262 RegExp "i" mode because these characters are
-// uppercase themselves that no other characters in the set uppercase to.
+// Roughly speaking, for "i" regexps, Canonicalize(c) is the same as
+// c.toUpperCase(), unless a) c.toUpperCase() is a multi-character
+// string, or b) c is non-ASCII, and c.toUpperCase() is ASCII. See
+// https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch for
+// the precise definition.
 //
-// For the characters in the "special add set", the proecess should add only
-// those characters in the result of closeOver(USET_CASE_INSENSITIVE) which is
-// not uppercase characters as case equivlant under the ECMA262 RegExp "i" mode
-// and also that ONE uppercase character that other non uppercase character
-// uppercase into to the set. Other uppercase characters in the result of
-// closeOver(USET_CASE_INSENSITIVE) should not be considered because ECMA262
-// RegExp "i" mode consider two characters as "case equivlant" if both
-// characters uppercase to the same character.
+// While compiling such regular expressions, we need to compute the
+// set of characters that should match a given input character. (See
+// GetCaseIndependentLetters and CharacterRange::AddCaseEquivalents.)
+// For almost all characters, this can be efficiently computed using
+// UnicodeSet::closeOver(USET_CASE_INSENSITIVE). These sets represent
+// the remaining special cases.
 //
-// For example, consider the following case equivalent set defined by Unicode
-// standard. Notice there are more than one uppercase characters in this set:
-//  U+212B Å Angstrom Sign - an uppercase character.
-//  U+00C5 Å Latin Capital Letter A with Ring Above - an uppercase character.
-//  U+00E5 å Latin Small Letter A with Ring Above - a lowercase character which
-//    uppercase to U+00C5.
-// In this case equivlant set is a special set and need special handling while
-// considering "case equivlant" under the ECMA262 RegExp "i" mode which is
-// different than Unicode Standard:
-//  * U+212B should be included into the "ignore" set because there are no other
-//    characters, under the ECMA262 "i" mode, are considered as "case equivlant"
-//    to it because U+212B is itself an uppercase but neither U+00C5 nor U+00E5
-//    uppercase to U+212B.
-//  * U+00C5 and U+00E5 will both be included into the "special add" set. While
-//    calculate the "equivlant set" under ECMA262 "i" mode, the process will
-//    add U+00E5, because it is not an uppercase character in the set. The
-//    process will also add U+00C5, because it is the uppercase character which
-//    other non uppercase character, U+00C5, uppercase into.
+// For a character c, the rules are as follows:
 //
-// For characters not included in "ignore set" and "special add set", the
-// process will just use closeOver(USET_CASE_INSENSITIVE) to calcualte, which is
-// much faster.
+// 1. If c is in neither IgnoreSet nor SpecialAddSet, then calling
+//    UnicodeSet::closeOver(USET_CASE_INSENSITIVE) on a UnicodeSet
+//    containing c will produce the set of characters that should
+//    match /c/i (or /[c]/i), and only those characters.
 //
-// Under Unicode 12.0, there are only 7 characters in the "special add set" and
-// 4 characters in "ignore set" so even the special add process is slower, it is
-// limited to a small set of cases only.
+// 2. If c is in IgnoreSet, then the only character it should match is
+//    itself. However, closeOver will add additional incorrect
+//    matches. For example, consider SHARP S: 'ß' (U+00DF) and 'ẞ'
+//    (U+1E9E). Although closeOver('ß') = "ßẞ", uppercase('ß') is
+//    "SS".  Step 3.e of Canonicalize therefore makes 'ß' canonicalize
+//    to itself, so it should not match 'ẞ'. In these cases, we can skip
+//    the closeOver entirely, because it will never add an equivalent
+//    character.
 //
-// The implementation of these two function will be generated by calling ICU
-// icu::UnicodeSet during the build time into gen/src/regexp/special-case.cc by
-// the code in src/regexp/gen-regexp-special-case.cc.
+// 3. If c is in SpecialAddSet, then it should match at least one
+//    character other than itself. However, closeOver will add at
+//    least one additional incorrect match. For example, consider the
+//    letter 'k'. Closing over 'k' gives "kKK" (lowercase k, uppercase
+//    K, U+212A KELVIN SIGN). However, because of Canonicalize step 3.g,
+//    KELVIN SIGN should not match either of the other two characters. As a
+//    result, "k" and "K" are in SpecialAddSet (and KELVIN SIGN is in
+//    IgnoreSet). To find the correct matches for characters in
+//    SpecialAddSet, we closeOver the original character, but filter
+//    out the results that do not have the same canonical value.
 //
-// These two function will be used with LazyInstance<> template to generate
-// global sharable set to reduce memory usage and speed up performance.
+// The contents of these sets are calculated at build time by
+// src/regexp/gen-regexp-special-case.cc, which generates
+// gen/src/regexp/special-case.cc. This is done by iterating over the
+// result of closeOver for each BMP character, and finding sets for
+// which at least one character has a different canonical value than
+// another character. Characters that match no other characters in
+// their equivalence class are added to IgnoreSet. Characters that
+// match at least one other character are added to SpecialAddSet.
+
+class RegExpCaseFolding final : public AllStatic {
+ public:
+  static const icu::UnicodeSet& IgnoreSet();
+  static const icu::UnicodeSet& SpecialAddSet();
+
+  // Implements step 3 of ECMAScript 2020 21.2.2.8.2 (Runtime Semantics:
+  // Canonicalize) for ignoreCase without unicode. |ch| must be a UTF-16
+  // code unit; returns the code unit it is compared as when matching.
+  static UChar32 Canonicalize(UChar32 ch) {
+    // a. Assert: ch is a UTF-16 code unit.
+    CHECK_LE(ch, 0xffff);
+
+    // b. Let s be the String value consisting of the single code unit ch.
+    icu::UnicodeString s(ch);
+
+    // c, d. Let u be the String that String.prototype.toUpperCase produces
+    // for s. That algorithm is locale-insensitive, so use the root locale:
+    // the default locale may be tr/az, where "i" uppercases to U+0130.
+    icu::UnicodeString& u = s.toUpper(icu::Locale::getRoot());
+
+    // e. If u does not consist of a single code unit, return ch.
+    if (u.length() != 1) {
+      return ch;
+    }
+
+    // f. Let cu be u's single code unit element.
+    UChar32 cu = u.char32At(0);
 
-// Function to build and return the Ignore set.
-icu::UnicodeSet BuildIgnoreSet();
+    // g. If the value of ch >= 128 and the value of cu < 128, return ch.
+    if (ch >= 128 && cu < 128) {
+      return ch;
+    }
 
-// Function to build and return the Special Add set.
-icu::UnicodeSet BuildSpecialAddSet();
+    // h. Return cu.
+    return cu;
+  }
+};
 
 }  // namespace internal
 }  // namespace v8