1 files changed, 95 insertions, 54 deletions
diff --git a/js/src/regexp/gen-regexp-special-case.cc b/js/src/regexp/gen-regexp-special-case.cc
index 337743f53..b4a8c3da4 100644
--- a/js/src/regexp/gen-regexp-special-case.cc
+++ b/js/src/regexp/gen-regexp-special-case.cc
@@ -1,4 +1,4 @@
-// Copyright 2019 the V8 project authors. All rights reserved.
+// Copyright 2020 the V8 project authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
@@ -7,18 +7,19 @@
 #include <iostream>
 #include <sstream>
 
-#include "unicode/uchar.h"
-#include "unicode/uniset.h"
+#include "regexp/special-case.h"
 
 namespace v8 {
 namespace internal {
 
-// The following code generates BuildSpecialAddSet() and BuildIgnoreSet()
-// functions into "src/regexp/special-case.cc".
-// See more details in http://shorturl.at/adfO5
-void PrintSet(std::ofstream& out, const char* func_name,
+static const uc32 kSurrogateStart = 0xd800;
+static const uc32 kSurrogateEnd = 0xdfff;
+static const uc32 kNonBmpStart = 0x10000;
+
+// The following code generates "src/regexp/special-case.cc".
+void PrintSet(std::ofstream& out, const char* name,
               const icu::UnicodeSet& set) {
-  out << "icu::UnicodeSet " << func_name << "() {\n"
+  out << "icu::UnicodeSet Build" << name << "() {\n"
       << "  icu::UnicodeSet set;\n";
   for (int32_t i = 0; i < set.getRangeCount(); i++) {
     if (set.getRangeStart(i) == set.getRangeEnd(i)) {
@@ -30,73 +31,113 @@ void PrintSet(std::ofstream& out, const char* func_name,
   }
   out << "  set.freeze();\n"
       << "  return set;\n"
-      << "}\n";
+      << "}\n\n";
+
+  out << "struct " << name << "Data {\n"
+      << "  " << name << "Data() : set(Build" << name << "()) {}\n"
+      << "  const icu::UnicodeSet set;\n"
+      << "};\n\n";
+
+  out << "//static\n"
+      << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n"
+      << "  static base::LazyInstance<" << name << "Data>::type set =\n"
+      << "      LAZY_INSTANCE_INITIALIZER;\n"
+      << "  return set.Pointer()->set;\n"
+      << "}\n\n";
 }
 
 void PrintSpecial(std::ofstream& out) {
   icu::UnicodeSet current;
-  icu::UnicodeSet processed(0xd800, 0xdbff);  // Ignore surrogate range.
   icu::UnicodeSet special_add;
   icu::UnicodeSet ignore;
   UErrorCode status = U_ZERO_ERROR;
   icu::UnicodeSet upper("[\\p{Lu}]", status);
   CHECK(U_SUCCESS(status));
-  // Iterate through all chars in BMP except ASCII and Surrogate.
-  for (UChar32 i = 0x80; i < 0x010000; i++) {
-    // Ignore those characters which is already processed.
-    if (!processed.contains(i)) {
-      current.set(i, i);
-      current.closeOver(USET_CASE_INSENSITIVE);
 
-      // Remember we already processed current.
-      processed.addAll(current);
-
-      // All uppercase characters in current.
-      icu::UnicodeSet keep_upper(current);
-      keep_upper.retainAll(upper);
-
-      // Check if we have more than one uppercase character in current.
-      // If there are more than one uppercase character, then it is a special
-      // set which need to be added into either "Special Add" set or "Ignore"
-      // set.
-      int32_t number_of_upper = 0;
-      for (int32_t i = 0; i < keep_upper.getRangeCount() && i <= 1; i++) {
-        number_of_upper +=
-            keep_upper.getRangeEnd(i) - keep_upper.getRangeStart(i) + 1;
+  // Iterate through all chars in BMP except surrogates.
+  for (UChar32 i = 0; i < kNonBmpStart; i++) {
+    if (i >= kSurrogateStart && i <= kSurrogateEnd) {
+      continue;  // Ignore surrogate range
+    }
+    current.set(i, i);
+    current.closeOver(USET_CASE_INSENSITIVE);
+
+    // Check whether every character in i's case-folding equivalence
+    // class, as defined by UnicodeSet::closeOver, maps to the same
+    // canonical value as i.
+    UChar32 canonical = RegExpCaseFolding::Canonicalize(i);
+    bool class_has_matching_canonical_char = false;
+    bool class_has_non_matching_canonical_char = false;
+    for (int32_t j = 0; j < current.getRangeCount(); j++) {
+      for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j);
+           c++) {
+        if (c == i) {
+          continue;
+        }
+        UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c);
+        if (canonical == other_canonical) {
+          class_has_matching_canonical_char = true;
+        } else {
+          class_has_non_matching_canonical_char = true;
+        }
+      }
+    }
+    // Classify i by comparing its canonical value with the canonical
+    // values of the other characters in its equivalence class. If
+    // they all match, i needs no special handling. Otherwise, i is
+    // added to the ignore set when no other character shares its
+    // canonical value, and to the special-add set when at least one
+    // other character does.
+    if (class_has_non_matching_canonical_char) {
+      if (class_has_matching_canonical_char) {
+        special_add.add(i);
+      } else {
+        ignore.add(i);
       }
-      if (number_of_upper > 1) {
-        // Add all non uppercase characters (could be Ll or Mn) to special add
-        // set.
-        current.removeAll(upper);
-        special_add.addAll(current);
-
-        // Add the uppercase characters of non uppercase character to
-        // special add set.
-        CHECK_GT(current.getRangeCount(), 0);
-        UChar32 main_upper = u_toupper(current.getRangeStart(0));
-        special_add.add(main_upper);
-
-        // Add all uppercase except the main upper to ignore set.
-        keep_upper.remove(main_upper);
-        ignore.addAll(keep_upper);
+    }
+  }
+
+  // Verify that no Unicode equivalence class contains two non-trivial
+  // JS equivalence classes. Every character in SpecialAddSet has the
+  // same canonical value as every other non-IgnoreSet character in
+  // its Unicode equivalence class. Therefore, if we call closeOver on
+  // a set containing no IgnoreSet characters, the only characters
+  // that must be removed from the result are in IgnoreSet. This fact
+  // is used in CharacterRange::AddCaseEquivalents.
+  for (int32_t i = 0; i < special_add.getRangeCount(); i++) {
+    for (UChar32 c = special_add.getRangeStart(i);
+         c <= special_add.getRangeEnd(i); c++) {
+      UChar32 canonical = RegExpCaseFolding::Canonicalize(c);
+      current.set(c, c);
+      current.closeOver(USET_CASE_INSENSITIVE);
+      current.removeAll(ignore);
+      for (int32_t j = 0; j < current.getRangeCount(); j++) {
+        for (UChar32 c2 = current.getRangeStart(j);
+             c2 <= current.getRangeEnd(j); c2++) {
+          CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2));
+        }
       }
     }
   }
 
-  // Remove any ASCII
-  special_add.remove(0x0000, 0x007f);
-  PrintSet(out, "BuildIgnoreSet", ignore);
-  PrintSet(out, "BuildSpecialAddSet", special_add);
+  PrintSet(out, "IgnoreSet", ignore);
+  PrintSet(out, "SpecialAddSet", special_add);
 }
 
 void WriteHeader(const char* header_filename) {
   std::ofstream out(header_filename);
   out << std::hex << std::setfill('0') << std::setw(4);
-
-  out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n"
-      << "// The following functions are used to build icu::UnicodeSet\n"
-      << "// for specical cases different between Unicode and ECMA262.\n"
+  out << "// Copyright 2020 the V8 project authors. All rights reserved.\n"
+      << "// Use of this source code is governed by a BSD-style license that\n"
+      << "// can be found in the LICENSE file.\n\n"
+      << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n"
+      << "// The following functions are used to build UnicodeSets\n"
+      << "// for special cases where the case-folding algorithm used by\n"
+      << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n"
+      << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n"
+      << "// Semantics: Canonicalize) step 3.\n\n"
       << "#ifdef V8_INTL_SUPPORT\n"
+      << "#include \"src/base/lazy-instance.h\"\n\n"
       << "#include \"src/regexp/special-case.h\"\n\n"
       << "#include \"unicode/uniset.h\"\n"
       << "namespace v8 {\n"