summary refs log tree commit diff stats
path: root/js/src/regexp
diff options
context:
space:
mode:
author Matt A. Tobin <email@mattatobin.com> 2020-11-09 20:37:05 -0500
committer Matt A. Tobin <email@mattatobin.com> 2020-11-09 20:37:05 -0500
commit 51468e998c8e7191ddecacec3944c806b29dd590 (patch)
tree c713f075c54781868ec119ea5c5f3c9369af3576 /js/src/regexp
parent 77746f1d900a35eceb23bd760983e95de7b4a547 (diff)
download UXP-51468e998c8e7191ddecacec3944c806b29dd590.tar
UXP-51468e998c8e7191ddecacec3944c806b29dd590.tar.gz
UXP-51468e998c8e7191ddecacec3944c806b29dd590.tar.lz
UXP-51468e998c8e7191ddecacec3944c806b29dd590.tar.xz
UXP-51468e998c8e7191ddecacec3944c806b29dd590.zip
Issue #1677 - Part 5: "Simplify" regexp re-import process (and re-import from later revision)
I am going on record to say Mozilla are utter fucking assholes for pulling this as part of their progression.
Diffstat (limited to 'js/src/regexp')
-rw-r--r-- js/src/regexp/VERSION 5
-rw-r--r-- js/src/regexp/gen-regexp-special-case.cc 149
-rw-r--r-- js/src/regexp/import-irregexp.py 143
-rw-r--r-- js/src/regexp/regexp-ast.h 29
-rw-r--r-- js/src/regexp/regexp-bytecode-generator.cc 8
-rw-r--r-- js/src/regexp/regexp-bytecode-generator.h 2
-rw-r--r-- js/src/regexp/regexp-bytecode-peephole.cc 1
-rw-r--r-- js/src/regexp/regexp-bytecodes.h 8
-rw-r--r-- js/src/regexp/regexp-compiler-tonode.cc 114
-rw-r--r-- js/src/regexp/regexp-compiler.cc 115
-rw-r--r-- js/src/regexp/regexp-compiler.h 16
-rw-r--r-- js/src/regexp/regexp-error.cc 22
-rw-r--r-- js/src/regexp/regexp-error.h 56
-rw-r--r-- js/src/regexp/regexp-interpreter.cc 65
-rw-r--r-- js/src/regexp/regexp-macro-assembler-tracer.cc 10
-rw-r--r-- js/src/regexp/regexp-macro-assembler-tracer.h 1
-rw-r--r-- js/src/regexp/regexp-macro-assembler.cc 47
-rw-r--r-- js/src/regexp/regexp-macro-assembler.h 10
-rw-r--r-- js/src/regexp/regexp-parser.cc 142
-rw-r--r-- js/src/regexp/regexp-parser.h 20
-rw-r--r-- js/src/regexp/regexp-shim.h 4
-rw-r--r-- js/src/regexp/regexp-stack.h 6
-rw-r--r-- js/src/regexp/regexp.h 7
-rw-r--r-- js/src/regexp/special-case.cc 60
-rw-r--r-- js/src/regexp/special-case.h 140
-rw-r--r-- js/src/regexp/update-headers.py 38
26 files changed, 706 insertions, 512 deletions
diff --git a/js/src/regexp/VERSION b/js/src/regexp/VERSION
index 3a0935dea..c7d35a2bb 100644
--- a/js/src/regexp/VERSION
+++ b/js/src/regexp/VERSION
@@ -1,3 +1,2 @@
-This code was most recently imported from the following version of V8:
-
-https://github.com/v8/v8/tree/2599d3cc208a3a4873be517285220abd8416c3d7/src/regexp
+Imported using import-irregexp.py from:
+https://github.com/v8/v8/tree/560f2d8bb3f3a72d78e1a7d7654235d53fdcc83c/src/regexp
diff --git a/js/src/regexp/gen-regexp-special-case.cc b/js/src/regexp/gen-regexp-special-case.cc
index 337743f53..b4a8c3da4 100644
--- a/js/src/regexp/gen-regexp-special-case.cc
+++ b/js/src/regexp/gen-regexp-special-case.cc
@@ -1,4 +1,4 @@
-// Copyright 2019 the V8 project authors. All rights reserved.
+// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
@@ -7,18 +7,19 @@
#include <iostream>
#include <sstream>
-#include "unicode/uchar.h"
-#include "unicode/uniset.h"
+#include "regexp/special-case.h"
namespace v8 {
namespace internal {
-// The following code generates BuildSpecialAddSet() and BuildIgnoreSet()
-// functions into "src/regexp/special-case.cc".
-// See more details in http://shorturl.at/adfO5
-void PrintSet(std::ofstream& out, const char* func_name,
+static const uc32 kSurrogateStart = 0xd800;
+static const uc32 kSurrogateEnd = 0xdfff;
+static const uc32 kNonBmpStart = 0x10000;
+
+// The following code generates "src/regexp/special-case.cc".
+void PrintSet(std::ofstream& out, const char* name,
const icu::UnicodeSet& set) {
- out << "icu::UnicodeSet " << func_name << "() {\n"
+ out << "icu::UnicodeSet Build" << name << "() {\n"
<< " icu::UnicodeSet set;\n";
for (int32_t i = 0; i < set.getRangeCount(); i++) {
if (set.getRangeStart(i) == set.getRangeEnd(i)) {
@@ -30,73 +31,113 @@ void PrintSet(std::ofstream& out, const char* func_name,
}
out << " set.freeze();\n"
<< " return set;\n"
- << "}\n";
+ << "}\n\n";
+
+ out << "struct " << name << "Data {\n"
+ << " " << name << "Data() : set(Build" << name << "()) {}\n"
+ << " const icu::UnicodeSet set;\n"
+ << "};\n\n";
+
+ out << "//static\n"
+ << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n"
+ << " static base::LazyInstance<" << name << "Data>::type set =\n"
+ << " LAZY_INSTANCE_INITIALIZER;\n"
+ << " return set.Pointer()->set;\n"
+ << "}\n\n";
}
void PrintSpecial(std::ofstream& out) {
icu::UnicodeSet current;
- icu::UnicodeSet processed(0xd800, 0xdbff); // Ignore surrogate range.
icu::UnicodeSet special_add;
icu::UnicodeSet ignore;
UErrorCode status = U_ZERO_ERROR;
icu::UnicodeSet upper("[\\p{Lu}]", status);
CHECK(U_SUCCESS(status));
- // Iterate through all chars in BMP except ASCII and Surrogate.
- for (UChar32 i = 0x80; i < 0x010000; i++) {
- // Ignore those characters which is already processed.
- if (!processed.contains(i)) {
- current.set(i, i);
- current.closeOver(USET_CASE_INSENSITIVE);
- // Remember we already processed current.
- processed.addAll(current);
-
- // All uppercase characters in current.
- icu::UnicodeSet keep_upper(current);
- keep_upper.retainAll(upper);
-
- // Check if we have more than one uppercase character in current.
- // If there are more than one uppercase character, then it is a special
- // set which need to be added into either "Special Add" set or "Ignore"
- // set.
- int32_t number_of_upper = 0;
- for (int32_t i = 0; i < keep_upper.getRangeCount() && i <= 1; i++) {
- number_of_upper +=
- keep_upper.getRangeEnd(i) - keep_upper.getRangeStart(i) + 1;
+ // Iterate through all chars in BMP except surrogates.
+ for (UChar32 i = 0; i < kNonBmpStart; i++) {
+ if (i >= kSurrogateStart && i <= kSurrogateEnd) {
+ continue; // Ignore surrogate range
+ }
+ current.set(i, i);
+ current.closeOver(USET_CASE_INSENSITIVE);
+
+ // Check to see if all characters in the case-folding equivalence
+ // class as defined by UnicodeSet::closeOver all map to the same
+ // canonical value.
+ UChar32 canonical = RegExpCaseFolding::Canonicalize(i);
+ bool class_has_matching_canonical_char = false;
+ bool class_has_non_matching_canonical_char = false;
+ for (int32_t j = 0; j < current.getRangeCount(); j++) {
+ for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j);
+ c++) {
+ if (c == i) {
+ continue;
+ }
+ UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c);
+ if (canonical == other_canonical) {
+ class_has_matching_canonical_char = true;
+ } else {
+ class_has_non_matching_canonical_char = true;
+ }
+ }
+ }
+ // If any other character in i's equivalence class has a
+ // different canonical value, then i needs special handling:
+ // if no other character shares a canonical value with i, we can
+ // ignore i when adding alternatives for case-independent
+ // comparison (IgnoreSet); if at least one other character shares
+ // a canonical value with i, then i is added to SpecialAddSet.
+ if (class_has_non_matching_canonical_char) {
+ if (class_has_matching_canonical_char) {
+ special_add.add(i);
+ } else {
+ ignore.add(i);
}
- if (number_of_upper > 1) {
- // Add all non uppercase characters (could be Ll or Mn) to special add
- // set.
- current.removeAll(upper);
- special_add.addAll(current);
-
- // Add the uppercase characters of non uppercase character to
- // special add set.
- CHECK_GT(current.getRangeCount(), 0);
- UChar32 main_upper = u_toupper(current.getRangeStart(0));
- special_add.add(main_upper);
-
- // Add all uppercase except the main upper to ignore set.
- keep_upper.remove(main_upper);
- ignore.addAll(keep_upper);
+ }
+ }
+
+ // Verify that no Unicode equivalence class contains two non-trivial
+ // JS equivalence classes. Every character in SpecialAddSet has the
+ // same canonical value as every other non-IgnoreSet character in
+ // its Unicode equivalence class. Therefore, if we call closeOver on
+ // a set containing no IgnoreSet characters, the only characters
+ // that must be removed from the result are in IgnoreSet. This fact
+ // is used in CharacterRange::AddCaseEquivalents.
+ for (int32_t i = 0; i < special_add.getRangeCount(); i++) {
+ for (UChar32 c = special_add.getRangeStart(i);
+ c <= special_add.getRangeEnd(i); c++) {
+ UChar32 canonical = RegExpCaseFolding::Canonicalize(c);
+ current.set(c, c);
+ current.closeOver(USET_CASE_INSENSITIVE);
+ current.removeAll(ignore);
+ for (int32_t j = 0; j < current.getRangeCount(); j++) {
+ for (UChar32 c2 = current.getRangeStart(j);
+ c2 <= current.getRangeEnd(j); c2++) {
+ CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2));
+ }
}
}
}
- // Remove any ASCII
- special_add.remove(0x0000, 0x007f);
- PrintSet(out, "BuildIgnoreSet", ignore);
- PrintSet(out, "BuildSpecialAddSet", special_add);
+ PrintSet(out, "IgnoreSet", ignore);
+ PrintSet(out, "SpecialAddSet", special_add);
}
void WriteHeader(const char* header_filename) {
std::ofstream out(header_filename);
out << std::hex << std::setfill('0') << std::setw(4);
-
- out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n"
- << "// The following functions are used to build icu::UnicodeSet\n"
- << "// for specical cases different between Unicode and ECMA262.\n"
+ out << "// Copyright 2020 the V8 project authors. All rights reserved.\n"
+ << "// Use of this source code is governed by a BSD-style license that\n"
+ << "// can be found in the LICENSE file.\n\n"
+ << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n"
+ << "// The following functions are used to build UnicodeSets\n"
+ << "// for special cases where the case-folding algorithm used by\n"
+ << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n"
+ << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n"
+ << "// Semantics: Canonicalize) step 3.\n\n"
<< "#ifdef V8_INTL_SUPPORT\n"
+ << "#include \"src/base/lazy-instance.h\"\n\n"
<< "#include \"src/regexp/special-case.h\"\n\n"
<< "#include \"unicode/uniset.h\"\n"
<< "namespace v8 {\n"
diff --git a/js/src/regexp/import-irregexp.py b/js/src/regexp/import-irregexp.py
new file mode 100644
index 000000000..870387232
--- /dev/null
+++ b/js/src/regexp/import-irregexp.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# This script handles all the mechanical steps of importing irregexp from v8:
+#
+# 1. Acquire the source: either from github, or optionally from a local copy of v8.
+# 2. Copy the contents of v8/src/regexp into js/src/regexp
+# - Exclude files that we have chosen not to import.
+# 3. While doing so, update #includes:
+# - Change "src/regexp/*" to "regexp/*".
+# - Remove other v8-specific headers completely.
+# 4. Add '#include "regexp/regexp-shim.h"' in the necessary places.
+# 5. Update the VERSION file to include the correct git hash.
+#
+# Usage:
+# cd path/to/js/src/regexp
+# ./import-irregexp.py --path path/to/v8/src/regexp
+#
+# Alternatively, without the --path argument, import-irregexp.py will
+# clone v8 from github into a temporary directory.
+#
+# After running this script, changes to the shim code may be necessary
+# to account for changes in upstream irregexp.
+
+import os
+import re
+import subprocess
+import sys
+from pathlib import Path
+
+
+def get_hash(path):
+ # Get the hash for the current git revision
+ cwd = os.getcwd()
+ os.chdir(path)
+ command = ['git', 'rev-parse', 'HEAD']
+ result = subprocess.check_output(command, encoding='utf-8')
+ os.chdir(cwd)
+ return result.rstrip()
+
+
+def copy_and_update_includes(src_path, dst_path):
+ # List of header files that need to include the shim header
+ need_shim = ['property-sequences.h',
+ 'regexp-ast.h',
+ 'regexp-bytecode-peephole.h',
+ 'regexp-bytecodes.h',
+ 'regexp-dotprinter.h',
+ 'regexp.h',
+ 'regexp-macro-assembler.h',
+ 'regexp-stack.h',
+ 'special-case.h']
+
+ src = open(str(src_path), 'r')
+ dst = open(str(dst_path), 'w')
+
+ # 1. Rewrite includes of V8 regexp headers:
+ regexp_include = re.compile('#include "src/regexp')
+ regexp_include_new = '#include "regexp'
+
+ # 2. Remove includes of other V8 headers
+ other_include = re.compile('#include "src/')
+
+ # 3. If needed, add '#include "regexp/regexp-shim.h"'.
+ # Note: We get a little fancy to ensure that header files are
+ # in alphabetic order. `need_to_add_shim` is true if we still
+ # have to add the shim header in this file. `adding_shim_now`
+ # is true if we have found a '#include "src/*' and we are just
+ # waiting to find something alphabetically smaller (or an empty
+ # line) so that we can insert the shim header in the right place.
+ need_to_add_shim = src_path.name in need_shim
+ adding_shim_now = False
+
+ for line in src:
+ if adding_shim_now:
+ if (line == '\n' or line > '#include "src/regexp/regexp-shim.h"'):
+ dst.write('#include "regexp/regexp-shim.h"\n')
+ need_to_add_shim = False
+ adding_shim_now = False
+
+ if regexp_include.search(line):
+ dst.write(re.sub(regexp_include, regexp_include_new, line))
+ elif other_include.search(line):
+ if need_to_add_shim:
+ adding_shim_now = True
+ else:
+ dst.write(line)
+
+
+def import_from(srcdir, dstdir):
+ excluded = ['OWNERS',
+ 'regexp.cc',
+ 'regexp-utils.cc',
+ 'regexp-utils.h',
+ 'regexp-macro-assembler-arch.h']
+
+ for file in srcdir.iterdir():
+ if file.is_dir():
+ continue
+ if str(file.name) in excluded:
+ continue
+ copy_and_update_includes(file, dstdir / file.name)
+
+ # Update VERSION file
+ hash = get_hash(srcdir)
+ version_file = open(str(dstdir / 'VERSION'), 'w')
+ version_file.write('Imported using import-irregexp.py from:\n')
+ version_file.write('https://github.com/v8/v8/tree/%s/src/regexp\n' % hash)
+
+
+if __name__ == '__main__':
+ import argparse
+ import tempfile
+
+ # This script should be run from js/src/regexp to work correctly.
+ current_path = Path(os.getcwd())
+ expected_path = 'js/src/regexp'
+ if not current_path.match(expected_path):
+ raise RuntimeError('%s must be run from %s' % (sys.argv[0],
+ expected_path))
+
+ parser = argparse.ArgumentParser(description='Import irregexp from v8')
+ parser.add_argument('-p', '--path', help='path to v8/src/regexp')
+ args = parser.parse_args()
+
+ if args.path:
+ src_path = Path(args.path)
+
+ if not (src_path / 'regexp.h').exists():
+ print('Usage:\n import-irregexp.py --path <path/to/v8/src/regexp>')
+ sys.exit(1)
+ import_from(src_path, current_path)
+ sys.exit(0)
+
+ with tempfile.TemporaryDirectory() as tempdir:
+ v8_git = 'https://github.com/v8/v8.git'
+ clone = 'git clone --depth 1 %s %s' % (v8_git, tempdir)
+ os.system(clone)
+ src_path = Path(tempdir) / 'src/regexp'
+ import_from(src_path, current_path)
diff --git a/js/src/regexp/regexp-ast.h b/js/src/regexp/regexp-ast.h
index fe6913e1d..311929d0b 100644
--- a/js/src/regexp/regexp-ast.h
+++ b/js/src/regexp/regexp-ast.h
@@ -458,7 +458,11 @@ class RegExpQuantifier final : public RegExpTree {
class RegExpCapture final : public RegExpTree {
public:
explicit RegExpCapture(int index)
- : body_(nullptr), index_(index), name_(nullptr) {}
+ : body_(nullptr),
+ index_(index),
+ min_match_(0),
+ max_match_(0),
+ name_(nullptr) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
static RegExpNode* ToNode(RegExpTree* body, int index,
@@ -468,10 +472,14 @@ class RegExpCapture final : public RegExpTree {
bool IsAnchoredAtEnd() override;
Interval CaptureRegisters() override;
bool IsCapture() override;
- int min_match() override { return body_->min_match(); }
- int max_match() override { return body_->max_match(); }
+ int min_match() override { return min_match_; }
+ int max_match() override { return max_match_; }
RegExpTree* body() { return body_; }
- void set_body(RegExpTree* body) { body_ = body; }
+ void set_body(RegExpTree* body) {
+ body_ = body;
+ min_match_ = body->min_match();
+ max_match_ = body->max_match();
+ }
int index() const { return index_; }
const ZoneVector<uc16>* name() const { return name_; }
void set_name(const ZoneVector<uc16>* name) { name_ = name; }
@@ -481,12 +489,17 @@ class RegExpCapture final : public RegExpTree {
private:
RegExpTree* body_;
int index_;
+ int min_match_;
+ int max_match_;
const ZoneVector<uc16>* name_;
};
class RegExpGroup final : public RegExpTree {
public:
- explicit RegExpGroup(RegExpTree* body) : body_(body) {}
+ explicit RegExpGroup(RegExpTree* body)
+ : body_(body),
+ min_match_(body->min_match()),
+ max_match_(body->max_match()) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) override {
@@ -496,13 +509,15 @@ class RegExpGroup final : public RegExpTree {
bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); }
bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); }
bool IsGroup() override;
- int min_match() override { return body_->min_match(); }
- int max_match() override { return body_->max_match(); }
+ int min_match() override { return min_match_; }
+ int max_match() override { return max_match_; }
Interval CaptureRegisters() override { return body_->CaptureRegisters(); }
RegExpTree* body() { return body_; }
private:
RegExpTree* body_;
+ int min_match_;
+ int max_match_;
};
class RegExpLookaround final : public RegExpTree {
diff --git a/js/src/regexp/regexp-bytecode-generator.cc b/js/src/regexp/regexp-bytecode-generator.cc
index 239b27605..db151de85 100644
--- a/js/src/regexp/regexp-bytecode-generator.cc
+++ b/js/src/regexp/regexp-bytecode-generator.cc
@@ -327,13 +327,11 @@ void RegExpBytecodeGenerator::CheckNotBackReference(int start_reg,
}
void RegExpBytecodeGenerator::CheckNotBackReferenceIgnoreCase(
- int start_reg, bool read_backward, bool unicode, Label* on_not_equal) {
+ int start_reg, bool read_backward, Label* on_not_equal) {
DCHECK_LE(0, start_reg);
DCHECK_GE(kMaxRegister, start_reg);
- Emit(read_backward ? (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD
- : BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD)
- : (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE
- : BC_CHECK_NOT_BACK_REF_NO_CASE),
+ Emit(read_backward ? BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD
+ : BC_CHECK_NOT_BACK_REF_NO_CASE,
start_reg);
EmitOrLink(on_not_equal);
}
diff --git a/js/src/regexp/regexp-bytecode-generator.h b/js/src/regexp/regexp-bytecode-generator.h
index 15fbda8ec..f5502464d 100644
--- a/js/src/regexp/regexp-bytecode-generator.h
+++ b/js/src/regexp/regexp-bytecode-generator.h
@@ -69,7 +69,7 @@ class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler {
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
- bool read_backward, bool unicode,
+ bool read_backward,
Label* on_no_match);
virtual void IfRegisterLT(int register_index, int comparand, Label* if_lt);
virtual void IfRegisterGE(int register_index, int comparand, Label* if_ge);
diff --git a/js/src/regexp/regexp-bytecode-peephole.cc b/js/src/regexp/regexp-bytecode-peephole.cc
index 2bc1b5aa2..4266b4a80 100644
--- a/js/src/regexp/regexp-bytecode-peephole.cc
+++ b/js/src/regexp/regexp-bytecode-peephole.cc
@@ -428,7 +428,6 @@ BytecodeArgumentMapping BytecodeSequenceNode::ArgumentMapping(
size_t index) const {
DCHECK(IsSequence());
DCHECK(argument_mapping_ != nullptr);
- DCHECK_GE(index, 0);
DCHECK_LT(index, argument_mapping_->size());
return argument_mapping_->at(index);
diff --git a/js/src/regexp/regexp-bytecodes.h b/js/src/regexp/regexp-bytecodes.h
index 24d6925db..1cfef1b2d 100644
--- a/js/src/regexp/regexp-bytecodes.h
+++ b/js/src/regexp/regexp-bytecodes.h
@@ -100,12 +100,12 @@ STATIC_ASSERT(1 << BYTECODE_SHIFT > BYTECODE_MASK);
V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128 */ \
V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32 */ \
V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32 */ \
- V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \
- V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \
- V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) \
+ V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \
+ V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \
+ V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) /* UNUSED */ \
V(CHECK_NOT_BACK_REF_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 41, 8) /* bc8 reg_idx24 addr32 */ \
- V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8) \
+ V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8) /* UNUSED */ \
V(CHECK_NOT_REGS_EQUAL, 43, 12) /* bc8 regidx24 reg_idx32 addr32 */ \
V(CHECK_REGISTER_LT, 44, 12) /* bc8 reg_idx24 value32 addr32 */ \
V(CHECK_REGISTER_GE, 45, 12) /* bc8 reg_idx24 value32 addr32 */ \
diff --git a/js/src/regexp/regexp-compiler-tonode.cc b/js/src/regexp/regexp-compiler-tonode.cc
index fc734ac7c..257030589 100644
--- a/js/src/regexp/regexp-compiler-tonode.cc
+++ b/js/src/regexp/regexp-compiler-tonode.cc
@@ -1137,39 +1137,6 @@ Vector<const int> CharacterRange::GetWordBounds() {
return Vector<const int>(kWordRanges, kWordRangeCount - 1);
}
-#ifdef V8_INTL_SUPPORT
-struct IgnoreSet {
- IgnoreSet() : set(BuildIgnoreSet()) {}
- const icu::UnicodeSet set;
-};
-
-struct SpecialAddSet {
- SpecialAddSet() : set(BuildSpecialAddSet()) {}
- const icu::UnicodeSet set;
-};
-
-icu::UnicodeSet BuildAsciiAToZSet() {
- icu::UnicodeSet set('a', 'z');
- set.add('A', 'Z');
- set.freeze();
- return set;
-}
-
-struct AsciiAToZSet {
- AsciiAToZSet() : set(BuildAsciiAToZSet()) {}
- const icu::UnicodeSet set;
-};
-
-static base::LazyInstance<IgnoreSet>::type ignore_set =
- LAZY_INSTANCE_INITIALIZER;
-
-static base::LazyInstance<SpecialAddSet>::type special_add_set =
- LAZY_INSTANCE_INITIALIZER;
-
-static base::LazyInstance<AsciiAToZSet>::type ascii_a_to_z_set =
- LAZY_INSTANCE_INITIALIZER;
-#endif // V8_INTL_SUPPORT
-
// static
void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
ZoneList<CharacterRange>* ranges,
@@ -1192,75 +1159,22 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
others.add(from, to);
}
- // Set of characters already added to ranges that do not need to be added
- // again.
+ // Compute the set of additional characters that should be added,
+ // using UnicodeSet::closeOver. ECMA 262 defines slightly different
+ // case-folding rules than Unicode, so some characters that are
+ // added by closeOver do not match anything other than themselves in
+ // JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the
+ // same case-insensitive character as 's' or 'S' according to
+ // Unicode, but does not match any other character in JS. To handle
+ // this case, we add such characters to the IgnoreSet and filter
+ // them out. We filter twice: once before calling closeOver (to
+ // prevent 'ſ' from adding 's'), and once after calling closeOver
+ // (to prevent 's' from adding 'ſ'). See regexp/special-case.h for
+ // more information.
icu::UnicodeSet already_added(others);
-
- // Set of characters in ranges that are in the 52 ASCII characters [a-zA-Z].
- icu::UnicodeSet in_ascii_a_to_z(others);
- in_ascii_a_to_z.retainAll(ascii_a_to_z_set.Pointer()->set);
-
- // Remove all chars in [a-zA-Z] from others.
- others.removeAll(in_ascii_a_to_z);
-
- // Set of characters in ranges that are overlapping with special add set.
- icu::UnicodeSet in_special_add(others);
- in_special_add.retainAll(special_add_set.Pointer()->set);
-
- others.removeAll(in_special_add);
-
- // Ignore all chars in ignore set.
- others.removeAll(ignore_set.Pointer()->set);
-
- // For most of the chars in ranges that is still in others, find the case
- // equivlant set by calling closeOver(USET_CASE_INSENSITIVE).
+ others.removeAll(RegExpCaseFolding::IgnoreSet());
others.closeOver(USET_CASE_INSENSITIVE);
-
- // Because closeOver(USET_CASE_INSENSITIVE) may add ASCII [a-zA-Z] to others,
- // but ECMA262 "i" mode won't consider that, remove them from others.
- // Ex: U+017F add 'S' and 's' to others.
- others.removeAll(ascii_a_to_z_set.Pointer()->set);
-
- // Special handling for in_ascii_a_to_z.
- for (int32_t i = 0; i < in_ascii_a_to_z.getRangeCount(); i++) {
- UChar32 start = in_ascii_a_to_z.getRangeStart(i);
- UChar32 end = in_ascii_a_to_z.getRangeEnd(i);
- // Check if it is uppercase A-Z by checking bit 6.
- if (start & 0x0020) {
- // Add the lowercases
- others.add(start & 0x005F, end & 0x005F);
- } else {
- // Add the uppercases
- others.add(start | 0x0020, end | 0x0020);
- }
- }
-
- // Special handling for chars in "Special Add" set.
- for (int32_t i = 0; i < in_special_add.getRangeCount(); i++) {
- UChar32 end = in_special_add.getRangeEnd(i);
- for (UChar32 ch = in_special_add.getRangeStart(i); ch <= end; ch++) {
- // Add the uppercase of this character if itself is not an uppercase
- // character.
- // Note: The if condiction cannot be u_islower(ch) because ch could be
- // neither uppercase nor lowercase but Mn.
- if (!u_isupper(ch)) {
- others.add(u_toupper(ch));
- }
- icu::UnicodeSet candidates(ch, ch);
- candidates.closeOver(USET_CASE_INSENSITIVE);
- for (int32_t j = 0; j < candidates.getRangeCount(); j++) {
- UChar32 end2 = candidates.getRangeEnd(j);
- for (UChar32 ch2 = candidates.getRangeStart(j); ch2 <= end2; ch2++) {
- // Add character that is not uppercase to others.
- if (!u_isupper(ch2)) {
- others.add(ch2);
- }
- }
- }
- }
- }
-
- // Remove all characters which already in the ranges.
+ others.removeAll(RegExpCaseFolding::IgnoreSet());
others.removeAll(already_added);
// Add others to the ranges
diff --git a/js/src/regexp/regexp-compiler.cc b/js/src/regexp/regexp-compiler.cc
index 9a2aa30dc..c0070061f 100644
--- a/js/src/regexp/regexp-compiler.cc
+++ b/js/src/regexp/regexp-compiler.cc
@@ -5,7 +5,9 @@
#include "regexp/regexp-compiler.h"
#include "regexp/regexp-macro-assembler-arch.h"
-#include "regexp/regexp-macro-assembler-tracer.h"
+#ifdef V8_INTL_SUPPORT
+#include "regexp/special-case.h"
+#endif // V8_INTL_SUPPORT
#ifdef V8_INTL_SUPPORT
#include "unicode/locid.h"
@@ -237,20 +239,15 @@ RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
RegExpCompiler::CompilationResult RegExpCompiler::Assemble(
Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start,
int capture_count, Handle<String> pattern) {
-#ifdef DEBUG
- if (FLAG_trace_regexp_assembler)
- macro_assembler_ = new RegExpMacroAssemblerTracer(isolate, macro_assembler);
- else
-#endif
- macro_assembler_ = macro_assembler;
+ macro_assembler_ = macro_assembler;
- std::vector<RegExpNode*> work_list;
+ ZoneVector<RegExpNode*> work_list(zone());
work_list_ = &work_list;
Label fail;
macro_assembler_->PushBacktrack(&fail);
Trace new_trace;
start->Emit(this, &new_trace);
- macro_assembler_->Bind(&fail);
+ macro_assembler_->BindJumpTarget(&fail);
macro_assembler_->Fail();
while (!work_list.empty()) {
RegExpNode* node = work_list.back();
@@ -264,14 +261,9 @@ RegExpCompiler::CompilationResult RegExpCompiler::Assemble(
}
Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
- isolate->IncreaseTotalRegexpCodeGenerated(code->Size());
+ isolate->IncreaseTotalRegexpCodeGenerated(code);
work_list_ = nullptr;
-#ifdef DEBUG
- if (FLAG_trace_regexp_assembler) {
- delete macro_assembler_;
- }
-#endif
return {*code, next_register_};
}
@@ -557,7 +549,7 @@ void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
}
// On backtrack we need to restore state.
- assembler->Bind(&undo);
+ assembler->BindJumpTarget(&undo);
RestoreAffectedRegisters(assembler, max_register, registers_to_pop,
registers_to_clear);
if (backtrack() == nullptr) {
@@ -720,32 +712,34 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
unibrow::uchar* letters,
int letter_length) {
#ifdef V8_INTL_SUPPORT
- // Special case for U+017F which has upper case in ASCII range.
- if (character == 0x017f) {
+ if (RegExpCaseFolding::IgnoreSet().contains(character)) {
letters[0] = character;
return 1;
}
+ bool in_special_add_set =
+ RegExpCaseFolding::SpecialAddSet().contains(character);
+
icu::UnicodeSet set;
set.add(character);
set = set.closeOver(USET_CASE_INSENSITIVE);
+
+ UChar32 canon = 0;
+ if (in_special_add_set) {
+ canon = RegExpCaseFolding::Canonicalize(character);
+ }
+
int32_t range_count = set.getRangeCount();
int items = 0;
for (int32_t i = 0; i < range_count; i++) {
UChar32 start = set.getRangeStart(i);
UChar32 end = set.getRangeEnd(i);
CHECK(end - start + items <= letter_length);
- // Only add to the output if character is not in ASCII range
- // or the case equivalent character is in ASCII range.
- // #sec-runtime-semantics-canonicalize-ch
- // 3.g If the numeric value of ch ≥ 128 and the numeric value of cu < 128,
- // return ch.
- if (!((start >= 128) && (character < 128))) {
- // No range have start and end span across code point 128.
- DCHECK((start >= 128) == (end >= 128));
- for (UChar32 cu = start; cu <= end; cu++) {
- if (one_byte_subject && cu > String::kMaxOneByteCharCode) break;
- letters[items++] = (unibrow::uchar)(cu);
+ for (UChar32 cu = start; cu <= end; cu++) {
+ if (one_byte_subject && cu > String::kMaxOneByteCharCode) break;
+ if (in_special_add_set && RegExpCaseFolding::Canonicalize(cu) != canon) {
+ continue;
}
+ letters[items++] = (unibrow::uchar)(cu);
}
}
return items;
@@ -852,10 +846,6 @@ static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
return false;
}
-using EmitCharacterFunction = bool(Isolate* isolate, RegExpCompiler* compiler,
- uc16 c, Label* on_failure, int cp_offset,
- bool check, bool preloaded);
-
// Only emits letters (things that have case). Only used for case independent
// matches.
static inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler,
@@ -1843,13 +1833,13 @@ RegExpNode* TextNode::FilterOneByte(int depth) {
if (elm.text_type() == TextElement::ATOM) {
Vector<const uc16> quarks = elm.atom()->data();
for (int j = 0; j < quarks.length(); j++) {
- uint16_t c = quarks[j];
+ uc16 c = quarks[j];
if (elm.atom()->ignore_case()) {
c = unibrow::Latin1::TryConvertToLatin1(c);
}
if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
// Replace quark in case we converted to Latin-1.
- uint16_t* writable_quarks = const_cast<uint16_t*>(quarks.begin());
+ uc16* writable_quarks = const_cast<uc16*>(quarks.begin());
writable_quarks[j] = c;
}
} else {
@@ -2304,7 +2294,6 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
if (first_element_checked && i == 0 && j == 0) continue;
if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
- EmitCharacterFunction* emit_function = nullptr;
uc16 quark = quarks[j];
if (elm.atom()->ignore_case()) {
// Everywhere else we assume that a non-Latin-1 character cannot match
@@ -2312,6 +2301,9 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
// invalid by using the Latin1 equivalent instead.
quark = unibrow::Latin1::TryConvertToLatin1(quark);
}
+ bool needs_bounds_check =
+ *checked_up_to < cp_offset + j || read_backward();
+ bool bounds_checked = false;
switch (pass) {
case NON_LATIN1_MATCH:
DCHECK(one_byte);
@@ -2321,24 +2313,24 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
}
break;
case NON_LETTER_CHARACTER_MATCH:
- emit_function = &EmitAtomNonLetter;
+ bounds_checked =
+ EmitAtomNonLetter(isolate, compiler, quark, backtrack,
+ cp_offset + j, needs_bounds_check, preloaded);
break;
case SIMPLE_CHARACTER_MATCH:
- emit_function = &EmitSimpleCharacter;
+ bounds_checked = EmitSimpleCharacter(isolate, compiler, quark,
+ backtrack, cp_offset + j,
+ needs_bounds_check, preloaded);
break;
case CASE_CHARACTER_MATCH:
- emit_function = &EmitAtomLetter;
+ bounds_checked =
+ EmitAtomLetter(isolate, compiler, quark, backtrack,
+ cp_offset + j, needs_bounds_check, preloaded);
break;
default:
break;
}
- if (emit_function != nullptr) {
- bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
- bool bound_checked =
- emit_function(isolate, compiler, quark, backtrack, cp_offset + j,
- bounds_check, preloaded);
- if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
- }
+ if (bounds_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
}
} else {
DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
@@ -3424,8 +3416,8 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
DCHECK_EQ(start_reg_ + 1, end_reg_);
if (IgnoreCase(flags_)) {
- assembler->CheckNotBackReferenceIgnoreCase(
- start_reg_, read_backward(), IsUnicode(flags_), trace->backtrack());
+ assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(),
+ trace->backtrack());
} else {
assembler->CheckNotBackReference(start_reg_, read_backward(),
trace->backtrack());
@@ -3597,12 +3589,17 @@ template <typename... Propagators>
class Analysis : public NodeVisitor {
public:
Analysis(Isolate* isolate, bool is_one_byte)
- : isolate_(isolate), is_one_byte_(is_one_byte), error_message_(nullptr) {}
+ : isolate_(isolate),
+ is_one_byte_(is_one_byte),
+ error_(RegExpError::kNone) {}
void EnsureAnalyzed(RegExpNode* that) {
StackLimitCheck check(isolate());
if (check.HasOverflowed()) {
- fail("Stack overflow");
+ if (FLAG_correctness_fuzzer_suppressions) {
+ FATAL("Analysis: Aborting on stack overflow");
+ }
+ fail(RegExpError::kAnalysisStackOverflow);
return;
}
if (that->info()->been_analyzed || that->info()->being_analyzed) return;
@@ -3612,12 +3609,12 @@ class Analysis : public NodeVisitor {
that->info()->been_analyzed = true;
}
- bool has_failed() { return error_message_ != nullptr; }
- const char* error_message() {
- DCHECK(error_message_ != nullptr);
- return error_message_;
+ bool has_failed() { return error_ != RegExpError::kNone; }
+ RegExpError error() {
+ DCHECK(error_ != RegExpError::kNone);
+ return error_;
}
- void fail(const char* error_message) { error_message_ = error_message; }
+ void fail(RegExpError error) { error_ = error; }
Isolate* isolate() const { return isolate_; }
@@ -3702,19 +3699,19 @@ class Analysis : public NodeVisitor {
private:
Isolate* isolate_;
bool is_one_byte_;
- const char* error_message_;
+ RegExpError error_;
DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
};
-const char* AnalyzeRegExp(Isolate* isolate, bool is_one_byte,
+RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte,
RegExpNode* node) {
Analysis<AssertionPropagator, EatsAtLeastPropagator> analysis(isolate,
is_one_byte);
DCHECK_EQ(node->info()->been_analyzed, false);
analysis.EnsureAnalyzed(node);
- DCHECK_IMPLIES(analysis.has_failed(), analysis.error_message() != nullptr);
- return analysis.has_failed() ? analysis.error_message() : nullptr;
+ DCHECK_IMPLIES(analysis.has_failed(), analysis.error() != RegExpError::kNone);
+ return analysis.has_failed() ? analysis.error() : RegExpError::kNone;
}
void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
diff --git a/js/src/regexp/regexp-compiler.h b/js/src/regexp/regexp-compiler.h
index 192b3284d..1954f1a4c 100644
--- a/js/src/regexp/regexp-compiler.h
+++ b/js/src/regexp/regexp-compiler.h
@@ -422,10 +422,7 @@ struct PreloadState {
// Analysis performs assertion propagation and computes eats_at_least_ values.
// See the comments on AssertionPropagator and EatsAtLeastPropagator for more
// details.
-//
-// This method returns nullptr on success or a null-terminated failure message
-// on failure.
-const char* AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node);
+RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node);
class FrequencyCollator {
public:
@@ -502,18 +499,17 @@ class RegExpCompiler {
}
struct CompilationResult final {
- explicit CompilationResult(const char* error_message)
- : error_message(error_message) {}
+ explicit CompilationResult(RegExpError err) : error(err) {}
CompilationResult(Object code, int registers)
: code(code), num_registers(registers) {}
static CompilationResult RegExpTooBig() {
- return CompilationResult("RegExp too big");
+ return CompilationResult(RegExpError::kTooLarge);
}
- bool Succeeded() const { return error_message == nullptr; }
+ bool Succeeded() const { return error == RegExpError::kNone; }
- const char* const error_message = nullptr;
+ const RegExpError error = RegExpError::kNone;
Object code;
int num_registers = 0;
};
@@ -575,7 +571,7 @@ class RegExpCompiler {
int next_register_;
int unicode_lookaround_stack_register_;
int unicode_lookaround_position_register_;
- std::vector<RegExpNode*>* work_list_;
+ ZoneVector<RegExpNode*>* work_list_;
int recursion_depth_;
RegExpMacroAssembler* macro_assembler_;
bool one_byte_;
diff --git a/js/src/regexp/regexp-error.cc b/js/src/regexp/regexp-error.cc
new file mode 100644
index 000000000..3906f9d9f
--- /dev/null
+++ b/js/src/regexp/regexp-error.cc
@@ -0,0 +1,22 @@
+// Copyright 2020 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "regexp/regexp-error.h"
+
+namespace v8 {
+namespace internal {
+
+const char* kRegExpErrorStrings[] = {
+#define TEMPLATE(NAME, STRING) STRING,
+ REGEXP_ERROR_MESSAGES(TEMPLATE)
+#undef TEMPLATE
+};
+
+const char* RegExpErrorString(RegExpError error) {
+ DCHECK_LT(error, RegExpError::NumErrors);
+ return kRegExpErrorStrings[static_cast<int>(error)];
+}
+
+} // namespace internal
+} // namespace v8
diff --git a/js/src/regexp/regexp-error.h b/js/src/regexp/regexp-error.h
new file mode 100644
index 000000000..ef9d037dd
--- /dev/null
+++ b/js/src/regexp/regexp-error.h
@@ -0,0 +1,56 @@
+// Copyright 2020 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_REGEXP_ERROR_H_
+#define V8_REGEXP_REGEXP_ERROR_H_
+
+
+namespace v8 {
+namespace internal {
+
+#define REGEXP_ERROR_MESSAGES(T) \
+ T(None, "") \
+ T(StackOverflow, "Maximum call stack size exceeded") \
+ T(AnalysisStackOverflow, "Stack overflow") \
+ T(TooLarge, "Regular expression too large") \
+ T(UnterminatedGroup, "Unterminated group") \
+ T(UnmatchedParen, "Unmatched ')'") \
+ T(EscapeAtEndOfPattern, "\\ at end of pattern") \
+ T(InvalidPropertyName, "Invalid property name") \
+ T(InvalidEscape, "Invalid escape") \
+ T(InvalidDecimalEscape, "Invalid decimal escape") \
+ T(InvalidUnicodeEscape, "Invalid Unicode escape") \
+ T(NothingToRepeat, "Nothing to repeat") \
+ T(LoneQuantifierBrackets, "Lone quantifier brackets") \
+ T(RangeOutOfOrder, "numbers out of order in {} quantifier") \
+ T(IncompleteQuantifier, "Incomplete quantifier") \
+ T(InvalidQuantifier, "Invalid quantifier") \
+ T(InvalidGroup, "Invalid group") \
+ T(MultipleFlagDashes, "Multiple dashes in flag group") \
+ T(RepeatedFlag, "Repeated flag in flag group") \
+ T(InvalidFlagGroup, "Invalid flag group") \
+ T(TooManyCaptures, "Too many captures") \
+ T(InvalidCaptureGroupName, "Invalid capture group name") \
+ T(DuplicateCaptureGroupName, "Duplicate capture group name") \
+ T(InvalidNamedReference, "Invalid named reference") \
+ T(InvalidNamedCaptureReference, "Invalid named capture referenced") \
+ T(InvalidClassEscape, "Invalid class escape") \
+ T(InvalidClassPropertyName, "Invalid property name in character class") \
+ T(InvalidCharacterClass, "Invalid character class") \
+ T(UnterminatedCharacterClass, "Unterminated character class") \
+ T(OutOfOrderCharacterClass, "Range out of order in character class")
+
+enum class RegExpError : uint32_t {
+#define TEMPLATE(NAME, STRING) k##NAME,
+ REGEXP_ERROR_MESSAGES(TEMPLATE)
+#undef TEMPLATE
+ NumErrors
+};
+
+V8_EXPORT_PRIVATE const char* RegExpErrorString(RegExpError error);
+
+} // namespace internal
+} // namespace v8
+
+#endif // V8_REGEXP_REGEXP_ERROR_H_
diff --git a/js/src/regexp/regexp-interpreter.cc b/js/src/regexp/regexp-interpreter.cc
index 6632cd729..7735d6885 100644
--- a/js/src/regexp/regexp-interpreter.cc
+++ b/js/src/regexp/regexp-interpreter.cc
@@ -28,18 +28,18 @@ namespace internal {
namespace {
bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len,
- Vector<const uc16> subject, bool unicode) {
+ Vector<const uc16> subject) {
Address offset_a =
reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(from)));
Address offset_b =
reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(current)));
size_t length = len * kUC16Size;
- return RegExpMacroAssembler::CaseInsensitiveCompareUC16(
- offset_a, offset_b, length, unicode ? nullptr : isolate) == 1;
+ return RegExpMacroAssembler::CaseInsensitiveCompareUC16(offset_a, offset_b,
+ length, isolate) == 1;
}
bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len,
- Vector<const uint8_t> subject, bool unicode) {
+ Vector<const uint8_t> subject) {
// For Latin1 characters the unicode flag makes no difference.
for (int i = 0; i < len; i++) {
unsigned int old_char = subject[from++];
@@ -82,11 +82,17 @@ int32_t Load32Aligned(const byte* pc) {
return *reinterpret_cast<const int32_t*>(pc);
}
-int32_t Load16Aligned(const byte* pc) {
+// TODO(jgruber): Rename to Load16AlignedUnsigned.
+uint32_t Load16Aligned(const byte* pc) {
DCHECK_EQ(0, reinterpret_cast<intptr_t>(pc) & 1);
return *reinterpret_cast<const uint16_t*>(pc);
}
+int32_t Load16AlignedSigned(const byte* pc) {
+ DCHECK_EQ(0, reinterpret_cast<intptr_t>(pc) & 1);
+ return *reinterpret_cast<const int16_t*>(pc);
+}
+
// A simple abstraction over the backtracking stack used by the interpreter.
//
// Despite the name 'backtracking' stack, it's actually used as a generic stack
@@ -734,26 +740,14 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
DISPATCH();
}
BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE) {
- int from = registers[insn >> BYTECODE_SHIFT];
- int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
- if (from >= 0 && len > 0) {
- if (current + len > subject.length() ||
- !BackRefMatchesNoCase(isolate, from, current, len, subject, true)) {
- SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
- DISPATCH();
- }
- current += len;
- }
- ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE);
- DISPATCH();
+ UNREACHABLE(); // TODO(jgruber): Remove this unused bytecode.
}
BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) {
int from = registers[insn >> BYTECODE_SHIFT];
int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
if (from >= 0 && len > 0) {
if (current + len > subject.length() ||
- !BackRefMatchesNoCase(isolate, from, current, len, subject,
- false)) {
+ !BackRefMatchesNoCase(isolate, from, current, len, subject)) {
SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
DISPATCH();
}
@@ -763,27 +757,14 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
DISPATCH();
}
BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD) {
- int from = registers[insn >> BYTECODE_SHIFT];
- int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
- if (from >= 0 && len > 0) {
- if (current - len < 0 ||
- !BackRefMatchesNoCase(isolate, from, current - len, len, subject,
- true)) {
- SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
- DISPATCH();
- }
- current -= len;
- }
- ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD);
- DISPATCH();
+ UNREACHABLE(); // TODO(jgruber): Remove this unused bytecode.
}
BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) {
int from = registers[insn >> BYTECODE_SHIFT];
int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
if (from >= 0 && len > 0) {
if (current - len < 0 ||
- !BackRefMatchesNoCase(isolate, from, current - len, len, subject,
- false)) {
+ !BackRefMatchesNoCase(isolate, from, current - len, len, subject)) {
SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
DISPATCH();
}
@@ -828,7 +809,7 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
}
BYTECODE(SKIP_UNTIL_CHAR) {
int load_offset = (insn >> BYTECODE_SHIFT);
- uint32_t advance = Load16Aligned(pc + 4);
+ int32_t advance = Load16AlignedSigned(pc + 4);
uint32_t c = Load16Aligned(pc + 6);
while (static_cast<uintptr_t>(current + load_offset) <
static_cast<uintptr_t>(subject.length())) {
@@ -844,7 +825,7 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
}
BYTECODE(SKIP_UNTIL_CHAR_AND) {
int load_offset = (insn >> BYTECODE_SHIFT);
- uint16_t advance = Load16Aligned(pc + 4);
+ int32_t advance = Load16AlignedSigned(pc + 4);
uint16_t c = Load16Aligned(pc + 6);
uint32_t mask = Load32Aligned(pc + 8);
int32_t maximum_offset = Load32Aligned(pc + 12);
@@ -862,7 +843,7 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
}
BYTECODE(SKIP_UNTIL_CHAR_POS_CHECKED) {
int load_offset = (insn >> BYTECODE_SHIFT);
- uint16_t advance = Load16Aligned(pc + 4);
+ int32_t advance = Load16AlignedSigned(pc + 4);
uint16_t c = Load16Aligned(pc + 6);
int32_t maximum_offset = Load32Aligned(pc + 8);
while (static_cast<uintptr_t>(current + maximum_offset) <=
@@ -879,7 +860,7 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
}
BYTECODE(SKIP_UNTIL_BIT_IN_TABLE) {
int load_offset = (insn >> BYTECODE_SHIFT);
- uint32_t advance = Load16Aligned(pc + 4);
+ int32_t advance = Load16AlignedSigned(pc + 4);
const byte* table = pc + 8;
while (static_cast<uintptr_t>(current + load_offset) <
static_cast<uintptr_t>(subject.length())) {
@@ -895,7 +876,7 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
}
BYTECODE(SKIP_UNTIL_GT_OR_NOT_BIT_IN_TABLE) {
int load_offset = (insn >> BYTECODE_SHIFT);
- uint16_t advance = Load16Aligned(pc + 4);
+ int32_t advance = Load16AlignedSigned(pc + 4);
uint16_t limit = Load16Aligned(pc + 6);
const byte* table = pc + 8;
while (static_cast<uintptr_t>(current + load_offset) <
@@ -916,7 +897,7 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
}
BYTECODE(SKIP_UNTIL_CHAR_OR_CHAR) {
int load_offset = (insn >> BYTECODE_SHIFT);
- uint32_t advance = Load32Aligned(pc + 4);
+ int32_t advance = Load32Aligned(pc + 4);
uint16_t c = Load16Aligned(pc + 8);
uint16_t c2 = Load16Aligned(pc + 10);
while (static_cast<uintptr_t>(current + load_offset) <
@@ -1016,6 +997,8 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal(
}
}
+#ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
+
// This method is called through an external reference from RegExpExecInternal
// builtin.
IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromJs(
@@ -1042,6 +1025,8 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromJs(
start_position, call_origin);
}
+#endif // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
+
IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromRuntime(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject_string,
int* registers, int registers_length, int start_position) {
diff --git a/js/src/regexp/regexp-macro-assembler-tracer.cc b/js/src/regexp/regexp-macro-assembler-tracer.cc
index 331c57d1a..b71a0f48e 100644
--- a/js/src/regexp/regexp-macro-assembler-tracer.cc
+++ b/js/src/regexp/regexp-macro-assembler-tracer.cc
@@ -349,17 +349,15 @@ void RegExpMacroAssemblerTracer::CheckNotBackReference(int start_reg,
assembler_->CheckNotBackReference(start_reg, read_backward, on_no_match);
}
-
void RegExpMacroAssemblerTracer::CheckNotBackReferenceIgnoreCase(
- int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
- PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s %s, label[%08x]);\n",
+ int start_reg, bool read_backward, Label* on_no_match) {
+ PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s, label[%08x]);\n",
start_reg, read_backward ? "backward" : "forward",
- unicode ? "unicode" : "non-unicode", LabelToInt(on_no_match));
- assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward, unicode,
+ LabelToInt(on_no_match));
+ assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward,
on_no_match);
}
-
void RegExpMacroAssemblerTracer::CheckPosition(int cp_offset,
Label* on_outside_input) {
PrintF(" CheckPosition(cp_offset=%d, label[%08x]);\n", cp_offset,
diff --git a/js/src/regexp/regexp-macro-assembler-tracer.h b/js/src/regexp/regexp-macro-assembler-tracer.h
index 938f84796..5332e59b8 100644
--- a/js/src/regexp/regexp-macro-assembler-tracer.h
+++ b/js/src/regexp/regexp-macro-assembler-tracer.h
@@ -33,7 +33,6 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler {
void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match) override;
void CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward,
- bool unicode,
Label* on_no_match) override;
void CheckNotCharacter(unsigned c, Label* on_not_equal) override;
void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with,
diff --git a/js/src/regexp/regexp-macro-assembler.cc b/js/src/regexp/regexp-macro-assembler.cc
index 4a8dcd3ce..7f8de2543 100644
--- a/js/src/regexp/regexp-macro-assembler.cc
+++ b/js/src/regexp/regexp-macro-assembler.cc
@@ -110,34 +110,7 @@ bool NativeRegExpMacroAssembler::CanReadUnaligned() {
return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
}
-const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
- String subject, int start_index, const DisallowHeapAllocation& no_gc) {
- if (subject.IsConsString()) {
- subject = ConsString::cast(subject).first();
- } else if (subject.IsSlicedString()) {
- start_index += SlicedString::cast(subject).offset();
- subject = SlicedString::cast(subject).parent();
- }
- if (subject.IsThinString()) {
- subject = ThinString::cast(subject).actual();
- }
- DCHECK_LE(0, start_index);
- DCHECK_LE(start_index, subject.length());
- if (subject.IsSeqOneByteString()) {
- return reinterpret_cast<const byte*>(
- SeqOneByteString::cast(subject).GetChars(no_gc) + start_index);
- } else if (subject.IsSeqTwoByteString()) {
- return reinterpret_cast<const byte*>(
- SeqTwoByteString::cast(subject).GetChars(no_gc) + start_index);
- } else if (subject.IsExternalOneByteString()) {
- return reinterpret_cast<const byte*>(
- ExternalOneByteString::cast(subject).GetChars() + start_index);
- } else {
- DCHECK(subject.IsExternalTwoByteString());
- return reinterpret_cast<const byte*>(
- ExternalTwoByteString::cast(subject).GetChars() + start_index);
- }
-}
+#ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
// This method may only be called after an interrupt.
int NativeRegExpMacroAssembler::CheckStackGuardState(
@@ -145,9 +118,10 @@ int NativeRegExpMacroAssembler::CheckStackGuardState(
Address* return_address, Code re_code, Address* subject,
const byte** input_start, const byte** input_end) {
DisallowHeapAllocation no_gc;
+ Address old_pc = PointerAuthentication::AuthenticatePC(return_address, 0);
+ DCHECK_LE(re_code.raw_instruction_start(), old_pc);
+ DCHECK_LE(old_pc, re_code.raw_instruction_end());
- DCHECK(re_code.raw_instruction_start() <= *return_address);
- DCHECK(*return_address <= re_code.raw_instruction_end());
StackLimitCheck check(isolate);
bool js_has_overflowed = check.JsHasOverflowed();
@@ -189,9 +163,11 @@ int NativeRegExpMacroAssembler::CheckStackGuardState(
}
if (*code_handle != re_code) { // Return address no longer valid
- intptr_t delta = code_handle->address() - re_code.address();
// Overwrite the return address on the stack.
- *return_address += delta;
+ intptr_t delta = code_handle->address() - re_code.address();
+ Address new_pc = old_pc + delta;
+ // TODO(v8:10026): avoid replacing a signed pointer.
+ PointerAuthentication::ReplacePC(return_address, new_pc, 0);
}
// If we continue, we need to update the subject string addresses.
@@ -206,8 +182,7 @@ int NativeRegExpMacroAssembler::CheckStackGuardState(
} else {
*subject = subject_handle->ptr();
intptr_t byte_length = *input_end - *input_start;
- *input_start =
- StringCharacterPosition(*subject_handle, start_index, no_gc);
+ *input_start = subject_handle->AddressOfCharacterAt(start_index, no_gc);
*input_end = *input_start + byte_length;
}
}
@@ -255,7 +230,7 @@ int NativeRegExpMacroAssembler::Match(Handle<JSRegExp> regexp,
DisallowHeapAllocation no_gc;
const byte* input_start =
- StringCharacterPosition(subject_ptr, start_offset + slice_offset, no_gc);
+ subject_ptr.AddressOfCharacterAt(start_offset + slice_offset, no_gc);
int byte_length = char_length << char_size_shift;
const byte* input_end = input_start + byte_length;
return Execute(*subject, start_offset, input_start, input_end, offsets_vector,
@@ -301,6 +276,8 @@ int NativeRegExpMacroAssembler::Execute(
return result;
}
+#endif // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
+
// clang-format off
const byte NativeRegExpMacroAssembler::word_character_map[] = {
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
diff --git a/js/src/regexp/regexp-macro-assembler.h b/js/src/regexp/regexp-macro-assembler.h
index dd059a43d..ef3961a70 100644
--- a/js/src/regexp/regexp-macro-assembler.h
+++ b/js/src/regexp/regexp-macro-assembler.h
@@ -87,7 +87,7 @@ class RegExpMacroAssembler {
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match) = 0;
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
- bool read_backward, bool unicode,
+ bool read_backward,
Label* on_no_match) = 0;
// Check the current character for a match with a literal character. If we
// fail to match then goto the on_failure label. End of input always
@@ -122,6 +122,11 @@ class RegExpMacroAssembler {
// not have custom support.
// May clobber the current loaded character.
virtual bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match);
+
+ // Control-flow integrity:
+ // Define a jump target and bind a label.
+ virtual void BindJumpTarget(Label* label) { Bind(label); }
+
virtual void Fail() = 0;
virtual Handle<HeapObject> GetCode(Handle<String> source) = 0;
virtual void GoTo(Label* label) = 0;
@@ -246,9 +251,6 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
static Address GrowStack(Address stack_pointer, Address* stack_top,
Isolate* isolate);
- static const byte* StringCharacterPosition(
- String subject, int start_index, const DisallowHeapAllocation& no_gc);
-
static int CheckStackGuardState(Isolate* isolate, int start_index,
RegExp::CallOrigin call_origin,
Address* return_address, Code re_code,
diff --git a/js/src/regexp/regexp-parser.cc b/js/src/regexp/regexp-parser.cc
index 377b94247..e2bbb6ed0 100644
--- a/js/src/regexp/regexp-parser.cc
+++ b/js/src/regexp/regexp-parser.cc
@@ -17,11 +17,10 @@
namespace v8 {
namespace internal {
-RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
- JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
+RegExpParser::RegExpParser(FlatStringReader* in, JSRegExp::Flags flags,
+ Isolate* isolate, Zone* zone)
: isolate_(isolate),
zone_(zone),
- error_(error),
captures_(nullptr),
named_captures_(nullptr),
named_back_references_(nullptr),
@@ -74,13 +73,12 @@ void RegExpParser::Advance() {
if (FLAG_correctness_fuzzer_suppressions) {
FATAL("Aborting on stack overflow");
}
- ReportError(CStrVector(
- MessageFormatter::TemplateString(MessageTemplate::kStackOverflow)));
+ ReportError(RegExpError::kStackOverflow);
} else if (zone()->excess_allocation()) {
if (FLAG_correctness_fuzzer_suppressions) {
FATAL("Aborting on excess zone allocation");
}
- ReportError(CStrVector("Regular expression too large"));
+ ReportError(RegExpError::kTooLarge);
} else {
current_ = ReadNext<true>();
}
@@ -132,15 +130,12 @@ bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
return false;
}
-
-RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
+RegExpTree* RegExpParser::ReportError(RegExpError error) {
if (failed_) return nullptr; // Do not overwrite any existing error.
failed_ = true;
- *error_ = isolate()
- ->factory()
- ->NewStringFromOneByte(Vector<const uint8_t>::cast(message))
- .ToHandleChecked();
- // Zip to the end to make sure the no more input is read.
+ error_ = error;
+ error_pos_ = position();
+ // Zip to the end to make sure no more input is read.
current_ = kEndMarker;
next_pos_ = in()->length();
return nullptr;
@@ -187,14 +182,14 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case kEndMarker:
if (state->IsSubexpression()) {
// Inside a parenthesized group when hitting end of input.
- return ReportError(CStrVector("Unterminated group"));
+ return ReportError(RegExpError::kUnterminatedGroup);
}
DCHECK_EQ(INITIAL, state->group_type());
// Parsing completed successfully.
return builder->ToRegExp();
case ')': {
if (!state->IsSubexpression()) {
- return ReportError(CStrVector("Unmatched ')'"));
+ return ReportError(RegExpError::kUnmatchedParen);
}
DCHECK_NE(INITIAL, state->group_type());
@@ -245,7 +240,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '*':
case '+':
case '?':
- return ReportError(CStrVector("Nothing to repeat"));
+ return ReportError(RegExpError::kNothingToRepeat);
case '^': {
Advance();
if (builder->multiline()) {
@@ -300,7 +295,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '\\':
switch (Next()) {
case kEndMarker:
- return ReportError(CStrVector("\\ at end of pattern"));
+ return ReportError(RegExpError::kEscapeAtEndOfPattern);
case 'b':
Advance(2);
builder->AddAssertion(new (zone()) RegExpAssertion(
@@ -340,7 +335,8 @@ RegExpTree* RegExpParser::ParseDisjunction() {
if (unicode()) {
ZoneList<CharacterRange>* ranges =
new (zone()) ZoneList<CharacterRange>(2, zone());
- std::vector<char> name_1, name_2;
+ ZoneVector<char> name_1(zone());
+ ZoneVector<char> name_2(zone());
if (ParsePropertyClassName(&name_1, &name_2)) {
if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) {
RegExpCharacterClass* cc = new (zone())
@@ -356,7 +352,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
}
}
}
- return ReportError(CStrVector("Invalid property name"));
+ return ReportError(RegExpError::kInvalidPropertyName);
} else {
builder->AddCharacter(p);
}
@@ -392,7 +388,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
// With /u, no identity escapes except for syntax characters
// are allowed. Otherwise, all identity escapes are allowed.
if (unicode()) {
- return ReportError(CStrVector("Invalid escape"));
+ return ReportError(RegExpError::kInvalidEscape);
}
uc32 first_digit = Next();
if (first_digit == '8' || first_digit == '9') {
@@ -406,7 +402,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
Advance();
if (unicode() && Next() >= '0' && Next() <= '9') {
// With /u, decimal escape with leading 0 are not parsed as octal.
- return ReportError(CStrVector("Invalid decimal escape"));
+ return ReportError(RegExpError::kInvalidDecimalEscape);
}
uc32 octal = ParseOctalLiteral();
builder->AddCharacter(octal);
@@ -447,7 +443,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
// ES#prod-annexB-ExtendedPatternCharacter
if (unicode()) {
// With /u, invalid escapes are not treated as identity escapes.
- return ReportError(CStrVector("Invalid unicode escape"));
+ return ReportError(RegExpError::kInvalidUnicodeEscape);
}
builder->AddCharacter('\\');
} else {
@@ -465,7 +461,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
builder->AddCharacter('x');
} else {
// With /u, invalid escapes are not treated as identity escapes.
- return ReportError(CStrVector("Invalid escape"));
+ return ReportError(RegExpError::kInvalidEscape);
}
break;
}
@@ -478,7 +474,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
builder->AddCharacter('u');
} else {
// With /u, invalid escapes are not treated as identity escapes.
- return ReportError(CStrVector("Invalid Unicode escape"));
+ return ReportError(RegExpError::kInvalidUnicodeEscape);
}
break;
}
@@ -502,7 +498,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
builder->AddCharacter(current());
Advance();
} else {
- return ReportError(CStrVector("Invalid escape"));
+ return ReportError(RegExpError::kInvalidEscape);
}
break;
}
@@ -510,13 +506,13 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '{': {
int dummy;
bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
- if (parsed) return ReportError(CStrVector("Nothing to repeat"));
+ if (parsed) return ReportError(RegExpError::kNothingToRepeat);
V8_FALLTHROUGH;
}
case '}':
case ']':
if (unicode()) {
- return ReportError(CStrVector("Lone quantifier brackets"));
+ return ReportError(RegExpError::kLoneQuantifierBrackets);
}
V8_FALLTHROUGH;
default:
@@ -551,13 +547,12 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '{':
if (ParseIntervalQuantifier(&min, &max)) {
if (max < min) {
- return ReportError(
- CStrVector("numbers out of order in {} quantifier"));
+ return ReportError(RegExpError::kRangeOutOfOrder);
}
break;
} else if (unicode()) {
// With /u, incomplete quantifiers are not allowed.
- return ReportError(CStrVector("Incomplete quantifier"));
+ return ReportError(RegExpError::kIncompleteQuantifier);
}
continue;
default:
@@ -573,7 +568,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
Advance();
}
if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
- return ReportError(CStrVector("Invalid quantifier"));
+ return ReportError(RegExpError::kInvalidQuantifier);
}
}
}
@@ -608,7 +603,7 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
case 's':
case 'm': {
if (!FLAG_regexp_mode_modifiers) {
- ReportError(CStrVector("Invalid group"));
+ ReportError(RegExpError::kInvalidGroup);
return nullptr;
}
Advance();
@@ -617,7 +612,7 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
switch (current()) {
case '-':
if (!flags_sense) {
- ReportError(CStrVector("Multiple dashes in flag group"));
+ ReportError(RegExpError::kMultipleFlagDashes);
return nullptr;
}
flags_sense = false;
@@ -631,7 +626,7 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
if (current() == 'm') bit = JSRegExp::kMultiline;
if (current() == 's') bit = JSRegExp::kDotAll;
if (((switch_on | switch_off) & bit) != 0) {
- ReportError(CStrVector("Repeated flag in flag group"));
+ ReportError(RegExpError::kRepeatedFlag);
return nullptr;
}
if (flags_sense) {
@@ -659,7 +654,7 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
subexpr_type = GROUPING; // Will break us out of the outer loop.
continue;
default:
- ReportError(CStrVector("Invalid flag group"));
+ ReportError(RegExpError::kInvalidFlagGroup);
return nullptr;
}
}
@@ -683,13 +678,13 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
Advance();
break;
default:
- ReportError(CStrVector("Invalid group"));
+ ReportError(RegExpError::kInvalidGroup);
return nullptr;
}
}
if (subexpr_type == CAPTURE) {
if (captures_started_ >= JSRegExp::kMaxCaptures) {
- ReportError(CStrVector("Too many captures"));
+ ReportError(RegExpError::kTooManyCaptures);
return nullptr;
}
captures_started_++;
@@ -838,20 +833,20 @@ const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
if (c == '\\' && current() == 'u') {
Advance();
if (!ParseUnicodeEscape(&c)) {
- ReportError(CStrVector("Invalid Unicode escape sequence"));
+ ReportError(RegExpError::kInvalidUnicodeEscape);
return nullptr;
}
}
// The backslash char is misclassified as both ID_Start and ID_Continue.
if (c == '\\') {
- ReportError(CStrVector("Invalid capture group name"));
+ ReportError(RegExpError::kInvalidCaptureGroupName);
return nullptr;
}
if (at_start) {
if (!IsIdentifierStart(c)) {
- ReportError(CStrVector("Invalid capture group name"));
+ ReportError(RegExpError::kInvalidCaptureGroupName);
return nullptr;
}
push_code_unit(name, c);
@@ -862,7 +857,7 @@ const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
} else if (IsIdentifierPart(c)) {
push_code_unit(name, c);
} else {
- ReportError(CStrVector("Invalid capture group name"));
+ ReportError(RegExpError::kInvalidCaptureGroupName);
return nullptr;
}
}
@@ -889,7 +884,7 @@ bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
const auto& named_capture_it = named_captures_->find(capture);
if (named_capture_it != named_captures_->end()) {
- ReportError(CStrVector("Duplicate capture group name"));
+ ReportError(RegExpError::kDuplicateCaptureGroupName);
return false;
}
}
@@ -903,7 +898,7 @@ bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
RegExpParserState* state) {
// The parser is assumed to be on the '<' in \k<name>.
if (current() != '<') {
- ReportError(CStrVector("Invalid named reference"));
+ ReportError(RegExpError::kInvalidNamedReference);
return false;
}
@@ -936,7 +931,7 @@ void RegExpParser::PatchNamedBackReferences() {
if (named_back_references_ == nullptr) return;
if (named_captures_ == nullptr) {
- ReportError(CStrVector("Invalid named capture referenced"));
+ ReportError(RegExpError::kInvalidNamedCaptureReference);
return;
}
@@ -957,7 +952,7 @@ void RegExpParser::PatchNamedBackReferences() {
if (capture_it != named_captures_->end()) {
index = (*capture_it)->index();
} else {
- ReportError(CStrVector("Invalid named capture referenced"));
+ ReportError(RegExpError::kInvalidNamedCaptureReference);
return;
}
@@ -1378,8 +1373,8 @@ bool IsUnicodePropertyValueCharacter(char c) {
} // anonymous namespace
-bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
- std::vector<char>* name_2) {
+bool RegExpParser::ParsePropertyClassName(ZoneVector<char>* name_1,
+ ZoneVector<char>* name_2) {
DCHECK(name_1->empty());
DCHECK(name_2->empty());
// Parse the property class as follows:
@@ -1418,8 +1413,8 @@ bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
bool negate,
- const std::vector<char>& name_1,
- const std::vector<char>& name_2) {
+ const ZoneVector<char>& name_1,
+ const ZoneVector<char>& name_2) {
if (name_2.empty()) {
// First attempt to interpret as general category property value name.
const char* name = name_1.data();
@@ -1456,7 +1451,7 @@ bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
}
}
-RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name_1) {
+RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector<char>& name_1) {
if (!FLAG_harmony_regexp_sequence) return nullptr;
const char* name = name_1.data();
const uc32* sequence_list = nullptr;
@@ -1522,19 +1517,19 @@ RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name_1) {
#else // V8_INTL_SUPPORT
-bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
- std::vector<char>* name_2) {
+bool RegExpParser::ParsePropertyClassName(ZoneVector<char>* name_1,
+ ZoneVector<char>* name_2) {
return false;
}
bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
bool negate,
- const std::vector<char>& name_1,
- const std::vector<char>& name_2) {
+ const ZoneVector<char>& name_1,
+ const ZoneVector<char>& name_2) {
return false;
}
-RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name) {
+RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector<char>& name) {
return nullptr;
}
@@ -1598,7 +1593,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
}
if (unicode()) {
// With /u, invalid escapes are not treated as identity escapes.
- ReportError(CStrVector("Invalid class escape"));
+ ReportError(RegExpError::kInvalidClassEscape);
return 0;
}
if ((controlLetter >= '0' && controlLetter <= '9') ||
@@ -1631,7 +1626,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
// ES#prod-annexB-LegacyOctalEscapeSequence
if (unicode()) {
// With /u, decimal escape is not interpreted as octal character code.
- ReportError(CStrVector("Invalid class escape"));
+ ReportError(RegExpError::kInvalidClassEscape);
return 0;
}
return ParseOctalLiteral();
@@ -1641,7 +1636,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
if (ParseHexEscape(2, &value)) return value;
if (unicode()) {
// With /u, invalid escapes are not treated as identity escapes.
- ReportError(CStrVector("Invalid escape"));
+ ReportError(RegExpError::kInvalidEscape);
return 0;
}
// If \x is not followed by a two-digit hexadecimal, treat it
@@ -1654,7 +1649,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
if (ParseUnicodeEscape(&value)) return value;
if (unicode()) {
// With /u, invalid escapes are not treated as identity escapes.
- ReportError(CStrVector("Invalid unicode escape"));
+ ReportError(RegExpError::kInvalidUnicodeEscape);
return 0;
}
// If \u is not followed by a two-digit hexadecimal, treat it
@@ -1669,11 +1664,11 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
Advance();
return result;
}
- ReportError(CStrVector("Invalid escape"));
+ ReportError(RegExpError::kInvalidEscape);
return 0;
}
}
- return 0;
+ UNREACHABLE();
}
void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
@@ -1696,17 +1691,18 @@ void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
return;
}
case kEndMarker:
- ReportError(CStrVector("\\ at end of pattern"));
+ ReportError(RegExpError::kEscapeAtEndOfPattern);
return;
case 'p':
case 'P':
if (unicode()) {
bool negate = Next() == 'P';
Advance(2);
- std::vector<char> name_1, name_2;
+ ZoneVector<char> name_1(zone);
+ ZoneVector<char> name_2(zone);
if (!ParsePropertyClassName(&name_1, &name_2) ||
!AddPropertyClassRange(ranges, negate, name_1, name_2)) {
- ReportError(CStrVector("Invalid property name in character class"));
+ ReportError(RegExpError::kInvalidClassPropertyName);
}
*is_class_escape = true;
return;
@@ -1725,10 +1721,6 @@ void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
}
RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
- static const char* kUnterminated = "Unterminated character class";
- static const char* kRangeInvalid = "Invalid character class";
- static const char* kRangeOutOfOrder = "Range out of order in character class";
-
DCHECK_EQ(current(), '[');
Advance();
bool is_negated = false;
@@ -1761,7 +1753,7 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
// Either end is an escaped character class. Treat the '-' verbatim.
if (unicode()) {
// ES2015 21.2.2.15.1 step 1.
- return ReportError(CStrVector(kRangeInvalid));
+ return ReportError(RegExpError::kInvalidCharacterClass);
}
if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
ranges->Add(CharacterRange::Singleton('-'), zone());
@@ -1770,7 +1762,7 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
}
// ES2015 21.2.2.15.1 step 6.
if (char_1 > char_2) {
- return ReportError(CStrVector(kRangeOutOfOrder));
+ return ReportError(RegExpError::kOutOfOrderCharacterClass);
}
ranges->Add(CharacterRange::Range(char_1, char_2), zone());
} else {
@@ -1778,7 +1770,7 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
}
}
if (!has_more()) {
- return ReportError(CStrVector(kUnterminated));
+ return ReportError(RegExpError::kUnterminatedCharacterClass);
}
Advance();
RegExpCharacterClass::CharacterClassFlags character_class_flags;
@@ -1795,14 +1787,16 @@ bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
FlatStringReader* input, JSRegExp::Flags flags,
RegExpCompileData* result) {
DCHECK(result != nullptr);
- RegExpParser parser(input, &result->error, flags, isolate, zone);
+ RegExpParser parser(input, flags, isolate, zone);
RegExpTree* tree = parser.ParsePattern();
if (parser.failed()) {
DCHECK(tree == nullptr);
- DCHECK(!result->error.is_null());
+ DCHECK(parser.error_ != RegExpError::kNone);
+ result->error = parser.error_;
+ result->error_pos = parser.error_pos_;
} else {
DCHECK(tree != nullptr);
- DCHECK(result->error.is_null());
+ DCHECK(parser.error_ == RegExpError::kNone);
if (FLAG_trace_regexp_parser) {
StdoutStream os;
tree->Print(os, zone);
diff --git a/js/src/regexp/regexp-parser.h b/js/src/regexp/regexp-parser.h
index 91677d6c3..131d12161 100644
--- a/js/src/regexp/regexp-parser.h
+++ b/js/src/regexp/regexp-parser.h
@@ -6,6 +6,7 @@
#define V8_REGEXP_REGEXP_PARSER_H_
#include "regexp/regexp-ast.h"
+#include "regexp/regexp-error.h"
namespace v8 {
namespace internal {
@@ -150,8 +151,8 @@ class RegExpBuilder : public ZoneObject {
class V8_EXPORT_PRIVATE RegExpParser {
public:
- RegExpParser(FlatStringReader* in, Handle<String>* error,
- JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
+ RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, Isolate* isolate,
+ Zone* zone);
static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
JSRegExp::Flags flags, RegExpCompileData* result);
@@ -174,13 +175,13 @@ class V8_EXPORT_PRIVATE RegExpParser {
bool ParseUnicodeEscape(uc32* value);
bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
- bool ParsePropertyClassName(std::vector<char>* name_1,
- std::vector<char>* name_2);
+ bool ParsePropertyClassName(ZoneVector<char>* name_1,
+ ZoneVector<char>* name_2);
bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
- const std::vector<char>& name_1,
- const std::vector<char>& name_2);
+ const ZoneVector<char>& name_1,
+ const ZoneVector<char>& name_2);
- RegExpTree* GetPropertySequence(const std::vector<char>& name_1);
+ RegExpTree* GetPropertySequence(const ZoneVector<char>& name_1);
RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
uc32 ParseOctalLiteral();
@@ -199,7 +200,7 @@ class V8_EXPORT_PRIVATE RegExpParser {
char ParseClassEscape();
- RegExpTree* ReportError(Vector<const char> message);
+ RegExpTree* ReportError(RegExpError error);
void Advance();
void Advance(int dist);
void Reset(int pos);
@@ -332,7 +333,8 @@ class V8_EXPORT_PRIVATE RegExpParser {
Isolate* isolate_;
Zone* zone_;
- Handle<String>* error_;
+ RegExpError error_ = RegExpError::kNone;
+ int error_pos_ = 0;
ZoneList<RegExpCapture*>* captures_;
ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_;
ZoneList<RegExpBackReference*>* named_back_references_;
diff --git a/js/src/regexp/regexp-shim.h b/js/src/regexp/regexp-shim.h
index 38b035727..462e396f4 100644
--- a/js/src/regexp/regexp-shim.h
+++ b/js/src/regexp/regexp-shim.h
@@ -60,6 +60,7 @@ class RegExpStack;
#define DCHECK_NOT_NULL(val) MOZ_ASSERT((val) != nullptr)
#define DCHECK_IMPLIES(lhs, rhs) MOZ_ASSERT_IF(lhs, rhs)
#define CHECK MOZ_RELEASE_ASSERT
+#define CHECK_LE(lhs, rhs) MOZ_RELEASE_ASSERT((lhs) <= (rhs))
template <class T>
static constexpr inline T Min(T t1, T t2) {
@@ -1009,7 +1010,7 @@ private:
public:
// An empty stub for telemetry we don't support
- void IncreaseTotalRegexpCodeGenerated(int size) {}
+ void IncreaseTotalRegexpCodeGenerated(Handle<HeapObject> code) {}
Counters* counters() { return &counters_; }
@@ -1155,6 +1156,7 @@ extern bool FLAG_trace_regexp_parser;
extern bool FLAG_trace_regexp_peephole_optimization;
#define V8_USE_COMPUTED_GOTO 1
+#define COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
} // namespace internal
} // namespace v8
diff --git a/js/src/regexp/regexp-stack.h b/js/src/regexp/regexp-stack.h
index 812195ad1..0b452c005 100644
--- a/js/src/regexp/regexp-stack.h
+++ b/js/src/regexp/regexp-stack.h
@@ -36,6 +36,9 @@ class RegExpStackScope {
class RegExpStack {
public:
+ RegExpStack();
+ ~RegExpStack();
+
// Number of allocated locations on the stack below the limit.
// No sequence of pushes must be longer that this without doing a stack-limit
// check.
@@ -75,9 +78,6 @@ class RegExpStack {
static constexpr size_t kMaximumStackSize = 64 * MB;
private:
- RegExpStack();
- ~RegExpStack();
-
// Artificial limit used when the thread-local state has been destroyed.
static const Address kMemoryTop =
static_cast<Address>(static_cast<uintptr_t>(-1));
diff --git a/js/src/regexp/regexp.h b/js/src/regexp/regexp.h
index cce58da38..a36662b78 100644
--- a/js/src/regexp/regexp.h
+++ b/js/src/regexp/regexp.h
@@ -5,6 +5,7 @@
#ifndef V8_REGEXP_REGEXP_H_
#define V8_REGEXP_REGEXP_H_
+#include "regexp/regexp-error.h"
#include "regexp/regexp-shim.h"
namespace v8 {
@@ -42,7 +43,11 @@ struct RegExpCompileData {
// The error message. Only used if an error occurred during parsing or
// compilation.
- Handle<String> error;
+ RegExpError error = RegExpError::kNone;
+
+ // The position at which the error was detected. Only used if an
+ // error occurred.
+ int error_pos = 0;
// The number of capture groups, without the global capture \0.
int capture_count = 0;
diff --git a/js/src/regexp/special-case.cc b/js/src/regexp/special-case.cc
index d60b98764..6b12d28d7 100644
--- a/js/src/regexp/special-case.cc
+++ b/js/src/regexp/special-case.cc
@@ -1,10 +1,15 @@
-// Copyright 2019 the V8 project authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
+// Copyright 2020 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that
+// can be found in the LICENSE file.
// Automatically generated by regexp/gen-regexp-special-case.cc
-// The following functions are used to build icu::UnicodeSet
-// for specical cases different between Unicode and ECMA262.
+
+// The following functions are used to build UnicodeSets
+// for special cases where the case-folding algorithm used by
+// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match
+// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime
+// Semantics: Canonicalize) step 3.
+
#ifdef V8_INTL_SUPPORT
#include "regexp/special-case.h"
@@ -14,14 +19,46 @@ namespace internal {
icu::UnicodeSet BuildIgnoreSet() {
icu::UnicodeSet set;
+ set.add(0xdf);  // U+00DF 'ß' (sharp s)
+ set.add(0x17f);
+ set.add(0x390);
+ set.add(0x3b0);
set.add(0x3f4);
+ set.add(0x1e9e);  // U+1E9E 'ẞ' (capital sharp s)
+ set.add(0x1f80, 0x1faf);  // two-argument add() takes a range: start, end
+ set.add(0x1fb3);
+ set.add(0x1fbc);
+ set.add(0x1fc3);
+ set.add(0x1fcc);
+ set.add(0x1fd3);
+ set.add(0x1fe3);
+ set.add(0x1ff3);
+ set.add(0x1ffc);
set.add(0x2126);
set.add(0x212a, 0x212b);
+ set.add(0xfb05, 0xfb06);  // last entry before the set is frozen and returned
set.freeze();
return set;
}
+
+struct IgnoreSetData {  // Default-constructible holder for the ignore set.
+ IgnoreSetData() : set(BuildIgnoreSet()) {}  // Fills `set` from BuildIgnoreSet().
+ const icu::UnicodeSet set;  // const member: not reassigned after construction.
+};
+
+// static. Returns a reference to the UnicodeSet owned by a function-local LazyInstance<IgnoreSetData>.
+const icu::UnicodeSet& RegExpCaseFolding::IgnoreSet() {
+ static base::LazyInstance<IgnoreSetData>::type set =
+ LAZY_INSTANCE_INITIALIZER;
+ return set.Pointer()->set;  // Pointer() yields the IgnoreSetData; presumably built on first use (confirm in LazyInstance).
+}
+
icu::UnicodeSet BuildSpecialAddSet() {
icu::UnicodeSet set;
+ set.add(0x4b);
+ set.add(0x53);
+ set.add(0x6b);
+ set.add(0x73);
set.add(0xc5);
set.add(0xe5);
set.add(0x398);
@@ -33,6 +70,19 @@ icu::UnicodeSet BuildSpecialAddSet() {
return set;
}
+struct SpecialAddSetData {  // Default-constructible holder for the special-add set.
+ SpecialAddSetData() : set(BuildSpecialAddSet()) {}  // Fills `set` from BuildSpecialAddSet().
+ const icu::UnicodeSet set;  // const member: not reassigned after construction.
+};
+
+// static. Returns a reference to the UnicodeSet owned by a function-local LazyInstance<SpecialAddSetData>.
+const icu::UnicodeSet& RegExpCaseFolding::SpecialAddSet() {
+ static base::LazyInstance<SpecialAddSetData>::type set =
+ LAZY_INSTANCE_INITIALIZER;
+ return set.Pointer()->set;  // Pointer() yields the SpecialAddSetData; presumably built on first use (confirm in LazyInstance).
+}
+
+
} // namespace internal
} // namespace v8
#endif // V8_INTL_SUPPORT
diff --git a/js/src/regexp/special-case.h b/js/src/regexp/special-case.h
index 1ccec5d31..3aca98302 100644
--- a/js/src/regexp/special-case.h
+++ b/js/src/regexp/special-case.h
@@ -6,70 +6,108 @@
#define V8_REGEXP_SPECIAL_CASE_H_
#ifdef V8_INTL_SUPPORT
-#include "unicode/uversion.h"
-namespace U_ICU_NAMESPACE {
-class UnicodeSet;
-} // namespace U_ICU_NAMESPACE
+#include "regexp/regexp-shim.h"
+
+#include "unicode/locid.h"
+#include "unicode/uchar.h"
+#include "unicode/uniset.h"
+#include "unicode/unistr.h"
namespace v8 {
namespace internal {
-// Functions to build special sets of Unicode characters that need special
-// handling under "i" mode that cannot use closeOver(USET_CASE_INSENSITIVE).
+// Sets of Unicode characters that need special handling under "i" mode.
+
+// For non-unicode ignoreCase matches (aka "i", not "iu"), ECMA 262
+// defines slightly different case-folding rules than Unicode. An
+// input character should match a pattern character if the result of
+// the Canonicalize algorithm is the same for both characters.
//
-// For the characters in the "ignore set", the process should not treat other
-// characters in the result of closeOver(USET_CASE_INSENSITIVE) as case
-// equivlant under the ECMA262 RegExp "i" mode because these characters are
-// uppercase themselves that no other characters in the set uppercase to.
+// Roughly speaking, for "i" regexps, Canonicalize(c) is the same as
+// c.toUpperCase(), unless a) c.toUpperCase() is a multi-character
+// string, or b) c is non-ASCII, and c.toUpperCase() is ASCII. See
+// https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch for
+// the precise definition.
//
-// For the characters in the "special add set", the proecess should add only
-// those characters in the result of closeOver(USET_CASE_INSENSITIVE) which is
-// not uppercase characters as case equivlant under the ECMA262 RegExp "i" mode
-// and also that ONE uppercase character that other non uppercase character
-// uppercase into to the set. Other uppercase characters in the result of
-// closeOver(USET_CASE_INSENSITIVE) should not be considered because ECMA262
-// RegExp "i" mode consider two characters as "case equivlant" if both
-// characters uppercase to the same character.
+// While compiling such regular expressions, we need to compute the
+// set of characters that should match a given input character. (See
+// GetCaseIndependentLetters and CharacterRange::AddCaseEquivalents.)
+// For almost all characters, this can be efficiently computed using
+// UnicodeSet::closeOver(USET_CASE_INSENSITIVE). These sets represent
+// the remaining special cases.
//
-// For example, consider the following case equivalent set defined by Unicode
-// standard. Notice there are more than one uppercase characters in this set:
-// U+212B Å Angstrom Sign - an uppercase character.
-// U+00C5 Å Latin Capital Letter A with Ring Above - an uppercase character.
-// U+00E5 å Latin Small Letter A with Ring Above - a lowercase character which
-// uppercase to U+00C5.
-// In this case equivlant set is a special set and need special handling while
-// considering "case equivlant" under the ECMA262 RegExp "i" mode which is
-// different than Unicode Standard:
-// * U+212B should be included into the "ignore" set because there are no other
-// characters, under the ECMA262 "i" mode, are considered as "case equivlant"
-// to it because U+212B is itself an uppercase but neither U+00C5 nor U+00E5
-// uppercase to U+212B.
-// * U+00C5 and U+00E5 will both be included into the "special add" set. While
-// calculate the "equivlant set" under ECMA262 "i" mode, the process will
-// add U+00E5, because it is not an uppercase character in the set. The
-// process will also add U+00C5, because it is the uppercase character which
-// other non uppercase character, U+00C5, uppercase into.
+// For a character c, the rules are as follows:
//
-// For characters not included in "ignore set" and "special add set", the
-// process will just use closeOver(USET_CASE_INSENSITIVE) to calcualte, which is
-// much faster.
+// 1. If c is in neither IgnoreSet nor SpecialAddSet, then calling
+// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) on a UnicodeSet
+// containing c will produce the set of characters that should
+// match /c/i (or /[c]/i), and only those characters.
//
-// Under Unicode 12.0, there are only 7 characters in the "special add set" and
-// 4 characters in "ignore set" so even the special add process is slower, it is
-// limited to a small set of cases only.
+// 2. If c is in IgnoreSet, then the only character it should match is
+// itself. However, closeOver will add additional incorrect
+// matches. For example, consider SHARP S: 'ß' (U+00DF) and 'ẞ'
+// (U+1E9E). Although closeOver('ß') = "ßẞ", uppercase('ß') is
+// "SS". Step 3.e therefore requires that 'ß' canonicalizes to
+// itself, and should not match 'ẞ'. In these cases, we can skip
+// the closeOver entirely, because it will never add an equivalent
+// character.
//
-// The implementation of these two function will be generated by calling ICU
-// icu::UnicodeSet during the build time into gen/src/regexp/special-case.cc by
-// the code in src/regexp/gen-regexp-special-case.cc.
+// 3. If c is in SpecialAddSet, then it should match at least one
+// character other than itself. However, closeOver will add at
+// least one additional incorrect match. For example, consider the
+// letter 'k'. Closing over 'k' gives "kKK" (lowercase k, uppercase
+// K, U+212A KELVIN SIGN). However, because of step 3.g, KELVIN
+// SIGN should not match either of the other two characters. As a
+// result, "k" and "K" are in SpecialAddSet (and KELVIN SIGN is in
+// IgnoreSet). To find the correct matches for characters in
+// SpecialAddSet, we closeOver the original character, but filter
+// out the results that do not have the same canonical value.
//
-// These two function will be used with LazyInstance<> template to generate
-// global sharable set to reduce memory usage and speed up performance.
+// The contents of these sets are calculated at build time by
+// src/regexp/gen-regexp-special-case.cc, which generates
+// gen/src/regexp/special-case.cc. This is done by iterating over the
+// result of closeOver for each BMP character, and finding sets for
+// which at least one character has a different canonical value than
+// another character. Characters that match no other characters in
+// their equivalence class are added to IgnoreSet. Characters that
+// match at least one other character are added to SpecialAddSet.
+
+// Static helpers for non-unicode ignoreCase ("i" without "u") matching.
+class RegExpCaseFolding final : public AllStatic {
+ public:
+  // Accessors for the two special-case sets; defined in special-case.cc.
+  static const icu::UnicodeSet& IgnoreSet();
+  static const icu::UnicodeSet& SpecialAddSet();
+
+  // This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics:
+  // Canonicalize) step 3, which is used to determine whether
+  // characters match when ignoreCase is true and unicode is false.
+  //
+  // |ch| must be a single UTF-16 code unit (<= 0xffff); this is
+  // release-asserted. Returns the canonical code unit for |ch|, or |ch|
+  // itself when its uppercase form is not a single code unit or would
+  // map a non-ASCII character into ASCII.
+  static UChar32 Canonicalize(UChar32 ch) {
+    // a. Assert: ch is a UTF-16 code unit.
+    CHECK_LE(ch, 0xffff);
+
+    // b. Let s be the String value consisting of the single code unit ch.
+    icu::UnicodeString s(ch);
+
+    // c. Let u be the same result produced as if by performing the algorithm
+    // for String.prototype.toUpperCase using s as the this value.
+    // d. Assert: Type(u) is String.
+    //
+    // String.prototype.toUpperCase is locale-insensitive, so the root
+    // locale is passed explicitly. The no-argument toUpper() follows the
+    // process default locale, which would e.g. map 'i' to U+0130 when the
+    // default locale is Turkish and so change which characters match.
+    icu::UnicodeString& u = s.toUpper(icu::Locale::getRoot());
+
+    // e. If u does not consist of a single code unit, return ch.
+    if (u.length() != 1) {
+      return ch;
+    }
+
+    // f. Let cu be u's single code unit element.
+    UChar32 cu = u.char32At(0);
-// Function to build and return the Ignore set.
-icu::UnicodeSet BuildIgnoreSet();
+    // g. If the value of ch >= 128 and the value of cu < 128, return ch.
+    if (ch >= 128 && cu < 128) {
+      return ch;
+    }
-// Function to build and return the Special Add set.
-icu::UnicodeSet BuildSpecialAddSet();
+    // h. Return cu.
+    return cu;
+  }
+};
} // namespace internal
} // namespace v8
diff --git a/js/src/regexp/update-headers.py b/js/src/regexp/update-headers.py
deleted file mode 100644
index 0cff9d6ae..000000000
--- a/js/src/regexp/update-headers.py
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/usr/bin/env python
-
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this file,
-# You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#
-# This script modifies V8 regexp source files to make them suitable for
-# inclusion in SpiderMonkey. Specifically, it:
-#
-# 1. Rewrites all #includes of V8 regexp headers to point to their location in
-# the SM tree: src/regexp/* --> regexp/*
-# 2. Removes all #includes of other V8 src/* headers. The required definitions
-# will be provided by regexp-shim.h.
-#
-# Usage:
-# cd js/src/regexp
-# find . -name "*.h" -o -name "*.cc" | xargs ./update_headers.py
-#
-
-import fileinput
-import re
-import sys
-
-# 1. Rewrite includes of V8 regexp headers
-regexp_include = re.compile('#include "src/regexp')
-regexp_include_new = '#include "regexp'
-
-# 2. Remove includes of other V8 headers
-other_include = re.compile('#include "src/')
-
-for line in fileinput.input(inplace=1):
- if regexp_include.search(line):
- sys.stdout.write(re.sub(regexp_include, regexp_include_new, line))
- elif other_include.search(line):
- pass
- else:
- sys.stdout.write(line)