summary | refs | log | tree | commit | diff | stats
path: root/js/src/regexp/gen-regexp-special-case.cc
diff options
context:
space:
mode:
Diffstat (limited to 'js/src/regexp/gen-regexp-special-case.cc')
-rw-r--r-- js/src/regexp/gen-regexp-special-case.cc 149
1 file changed, 95 insertions, 54 deletions
diff --git a/js/src/regexp/gen-regexp-special-case.cc b/js/src/regexp/gen-regexp-special-case.cc
index 337743f53..b4a8c3da4 100644
--- a/js/src/regexp/gen-regexp-special-case.cc
+++ b/js/src/regexp/gen-regexp-special-case.cc
@@ -1,4 +1,4 @@
-// Copyright 2019 the V8 project authors. All rights reserved.
+// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
@@ -7,18 +7,19 @@
#include <iostream>
#include <sstream>
-#include "unicode/uchar.h"
-#include "unicode/uniset.h"
+#include "regexp/special-case.h"
namespace v8 {
namespace internal {
-// The following code generates BuildSpecialAddSet() and BuildIgnoreSet()
-// functions into "src/regexp/special-case.cc".
-// See more details in http://shorturl.at/adfO5
-void PrintSet(std::ofstream& out, const char* func_name,
+static const uc32 kSurrogateStart = 0xd800;
+static const uc32 kSurrogateEnd = 0xdfff;
+static const uc32 kNonBmpStart = 0x10000;
+
+// The following code generates "src/regexp/special-case.cc".
+void PrintSet(std::ofstream& out, const char* name,
const icu::UnicodeSet& set) {
- out << "icu::UnicodeSet " << func_name << "() {\n"
+ out << "icu::UnicodeSet Build" << name << "() {\n"
<< " icu::UnicodeSet set;\n";
for (int32_t i = 0; i < set.getRangeCount(); i++) {
if (set.getRangeStart(i) == set.getRangeEnd(i)) {
@@ -30,73 +31,113 @@ void PrintSet(std::ofstream& out, const char* func_name,
}
out << " set.freeze();\n"
<< " return set;\n"
- << "}\n";
+ << "}\n\n";
+
+ out << "struct " << name << "Data {\n"
+ << " " << name << "Data() : set(Build" << name << "()) {}\n"
+ << " const icu::UnicodeSet set;\n"
+ << "};\n\n";
+
+ out << "//static\n"
+ << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n"
+ << " static base::LazyInstance<" << name << "Data>::type set =\n"
+ << " LAZY_INSTANCE_INITIALIZER;\n"
+ << " return set.Pointer()->set;\n"
+ << "}\n\n";
}
void PrintSpecial(std::ofstream& out) {
icu::UnicodeSet current;
- icu::UnicodeSet processed(0xd800, 0xdbff); // Ignore surrogate range.
icu::UnicodeSet special_add;
icu::UnicodeSet ignore;
UErrorCode status = U_ZERO_ERROR;
icu::UnicodeSet upper("[\\p{Lu}]", status);
CHECK(U_SUCCESS(status));
- // Iterate through all chars in BMP except ASCII and Surrogate.
- for (UChar32 i = 0x80; i < 0x010000; i++) {
- // Ignore those characters which is already processed.
- if (!processed.contains(i)) {
- current.set(i, i);
- current.closeOver(USET_CASE_INSENSITIVE);
- // Remember we already processed current.
- processed.addAll(current);
-
- // All uppercase characters in current.
- icu::UnicodeSet keep_upper(current);
- keep_upper.retainAll(upper);
-
- // Check if we have more than one uppercase character in current.
- // If there are more than one uppercase character, then it is a special
- // set which need to be added into either "Special Add" set or "Ignore"
- // set.
- int32_t number_of_upper = 0;
- for (int32_t i = 0; i < keep_upper.getRangeCount() && i <= 1; i++) {
- number_of_upper +=
- keep_upper.getRangeEnd(i) - keep_upper.getRangeStart(i) + 1;
+ // Iterate through all chars in BMP except surrogates.
+ for (UChar32 i = 0; i < kNonBmpStart; i++) {
+ if (i >= kSurrogateStart && i <= kSurrogateEnd) {
+ continue; // Ignore surrogate range
+ }
+ current.set(i, i);
+ current.closeOver(USET_CASE_INSENSITIVE);
+
+ // Check to see if all characters in the case-folding equivalence
+ // class as defined by UnicodeSet::closeOver all map to the same
+ // canonical value.
+ UChar32 canonical = RegExpCaseFolding::Canonicalize(i);
+ bool class_has_matching_canonical_char = false;
+ bool class_has_non_matching_canonical_char = false;
+ for (int32_t j = 0; j < current.getRangeCount(); j++) {
+ for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j);
+ c++) {
+ if (c == i) {
+ continue;
+ }
+ UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c);
+ if (canonical == other_canonical) {
+ class_has_matching_canonical_char = true;
+ } else {
+ class_has_non_matching_canonical_char = true;
+ }
+ }
+ }
+ // If any other character in i's equivalence class has a
+ // different canonical value, then i needs special handling. If
+ // no other character shares a canonical value with i, we can
+ // ignore i when adding alternatives for case-independent
+ // comparison. If at least one other character shares a
+ // canonical value, then i needs special handling.
+ if (class_has_non_matching_canonical_char) {
+ if (class_has_matching_canonical_char) {
+ special_add.add(i);
+ } else {
+ ignore.add(i);
}
- if (number_of_upper > 1) {
- // Add all non uppercase characters (could be Ll or Mn) to special add
- // set.
- current.removeAll(upper);
- special_add.addAll(current);
-
- // Add the uppercase characters of non uppercase character to
- // special add set.
- CHECK_GT(current.getRangeCount(), 0);
- UChar32 main_upper = u_toupper(current.getRangeStart(0));
- special_add.add(main_upper);
-
- // Add all uppercase except the main upper to ignore set.
- keep_upper.remove(main_upper);
- ignore.addAll(keep_upper);
+ }
+ }
+
+ // Verify that no Unicode equivalence class contains two non-trivial
+ // JS equivalence classes. Every character in SpecialAddSet has the
+ // same canonical value as every other non-IgnoreSet character in
+ // its Unicode equivalence class. Therefore, if we call closeOver on
+ // a set containing no IgnoreSet characters, the only characters
+ // that must be removed from the result are in IgnoreSet. This fact
+ // is used in CharacterRange::AddCaseEquivalents.
+ for (int32_t i = 0; i < special_add.getRangeCount(); i++) {
+ for (UChar32 c = special_add.getRangeStart(i);
+ c <= special_add.getRangeEnd(i); c++) {
+ UChar32 canonical = RegExpCaseFolding::Canonicalize(c);
+ current.set(c, c);
+ current.closeOver(USET_CASE_INSENSITIVE);
+ current.removeAll(ignore);
+ for (int32_t j = 0; j < current.getRangeCount(); j++) {
+ for (UChar32 c2 = current.getRangeStart(j);
+ c2 <= current.getRangeEnd(j); c2++) {
+ CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2));
+ }
}
}
}
- // Remove any ASCII
- special_add.remove(0x0000, 0x007f);
- PrintSet(out, "BuildIgnoreSet", ignore);
- PrintSet(out, "BuildSpecialAddSet", special_add);
+ PrintSet(out, "IgnoreSet", ignore);
+ PrintSet(out, "SpecialAddSet", special_add);
}
void WriteHeader(const char* header_filename) {
std::ofstream out(header_filename);
out << std::hex << std::setfill('0') << std::setw(4);
-
- out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n"
- << "// The following functions are used to build icu::UnicodeSet\n"
- << "// for specical cases different between Unicode and ECMA262.\n"
+ out << "// Copyright 2020 the V8 project authors. All rights reserved.\n"
+ << "// Use of this source code is governed by a BSD-style license that\n"
+ << "// can be found in the LICENSE file.\n\n"
+ << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n"
+ << "// The following functions are used to build UnicodeSets\n"
+ << "// for special cases where the case-folding algorithm used by\n"
+ << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n"
+ << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n"
+ << "// Semantics: Canonicalize) step 3.\n\n"
<< "#ifdef V8_INTL_SUPPORT\n"
+ << "#include \"src/base/lazy-instance.h\"\n\n"
<< "#include \"src/regexp/special-case.h\"\n\n"
<< "#include \"unicode/uniset.h\"\n"
<< "namespace v8 {\n"