diff options
author | Matt A. Tobin <email@mattatobin.com> | 2020-11-09 20:37:05 -0500 |
---|---|---|
committer | Matt A. Tobin <email@mattatobin.com> | 2020-11-09 20:37:05 -0500 |
commit | 51468e998c8e7191ddecacec3944c806b29dd590 (patch) | |
tree | c713f075c54781868ec119ea5c5f3c9369af3576 | |
parent | 77746f1d900a35eceb23bd760983e95de7b4a547 (diff) | |
download | UXP-51468e998c8e7191ddecacec3944c806b29dd590.tar UXP-51468e998c8e7191ddecacec3944c806b29dd590.tar.gz UXP-51468e998c8e7191ddecacec3944c806b29dd590.tar.lz UXP-51468e998c8e7191ddecacec3944c806b29dd590.tar.xz UXP-51468e998c8e7191ddecacec3944c806b29dd590.zip |
Issue #1677 - Part 5: "Simplify" regexp re-import process (and re-import from later revision)
I am going on record to say Mozilla are utter fucking assholes for pulling this as part of their progression.
26 files changed, 706 insertions, 512 deletions
diff --git a/js/src/regexp/VERSION b/js/src/regexp/VERSION index 3a0935dea..c7d35a2bb 100644 --- a/js/src/regexp/VERSION +++ b/js/src/regexp/VERSION @@ -1,3 +1,2 @@ -This code was most recently imported from the following version of V8: - -https://github.com/v8/v8/tree/2599d3cc208a3a4873be517285220abd8416c3d7/src/regexp +Imported using import-irregexp.py from: +https://github.com/v8/v8/tree/560f2d8bb3f3a72d78e1a7d7654235d53fdcc83c/src/regexp diff --git a/js/src/regexp/gen-regexp-special-case.cc b/js/src/regexp/gen-regexp-special-case.cc index 337743f53..b4a8c3da4 100644 --- a/js/src/regexp/gen-regexp-special-case.cc +++ b/js/src/regexp/gen-regexp-special-case.cc @@ -1,4 +1,4 @@ -// Copyright 2019 the V8 project authors. All rights reserved. +// Copyright 2020 the V8 project authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -7,18 +7,19 @@ #include <iostream> #include <sstream> -#include "unicode/uchar.h" -#include "unicode/uniset.h" +#include "regexp/special-case.h" namespace v8 { namespace internal { -// The following code generates BuildSpecialAddSet() and BuildIgnoreSet() -// functions into "src/regexp/special-case.cc". -// See more details in http://shorturl.at/adfO5 -void PrintSet(std::ofstream& out, const char* func_name, +static const uc32 kSurrogateStart = 0xd800; +static const uc32 kSurrogateEnd = 0xdfff; +static const uc32 kNonBmpStart = 0x10000; + +// The following code generates "src/regexp/special-case.cc". +void PrintSet(std::ofstream& out, const char* name, const icu::UnicodeSet& set) { - out << "icu::UnicodeSet " << func_name << "() {\n" + out << "icu::UnicodeSet Build" << name << "() {\n" << " icu::UnicodeSet set;\n"; for (int32_t i = 0; i < set.getRangeCount(); i++) { if (set.getRangeStart(i) == set.getRangeEnd(i)) { @@ -30,73 +31,113 @@ void PrintSet(std::ofstream& out, const char* func_name, } out << " set.freeze();\n" << " return set;\n" - << "}\n"; + << "}\n\n"; + + out << "struct " << name << "Data {\n" + << " " << name << "Data() : set(Build" << name << "()) {}\n" + << " const icu::UnicodeSet set;\n" + << "};\n\n"; + + out << "//static\n" + << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n" + << " static base::LazyInstance<" << name << "Data>::type set =\n" + << " LAZY_INSTANCE_INITIALIZER;\n" + << " return set.Pointer()->set;\n" + << "}\n\n"; } void PrintSpecial(std::ofstream& out) { icu::UnicodeSet current; - icu::UnicodeSet processed(0xd800, 0xdbff); // Ignore surrogate range. icu::UnicodeSet special_add; icu::UnicodeSet ignore; UErrorCode status = U_ZERO_ERROR; icu::UnicodeSet upper("[\\p{Lu}]", status); CHECK(U_SUCCESS(status)); - // Iterate through all chars in BMP except ASCII and Surrogate. - for (UChar32 i = 0x80; i < 0x010000; i++) { - // Ignore those characters which is already processed. - if (!processed.contains(i)) { - current.set(i, i); - current.closeOver(USET_CASE_INSENSITIVE); - // Remember we already processed current. - processed.addAll(current); - - // All uppercase characters in current. - icu::UnicodeSet keep_upper(current); - keep_upper.retainAll(upper); - - // Check if we have more than one uppercase character in current. - // If there are more than one uppercase character, then it is a special - // set which need to be added into either "Special Add" set or "Ignore" - // set. - int32_t number_of_upper = 0; - for (int32_t i = 0; i < keep_upper.getRangeCount() && i <= 1; i++) { - number_of_upper += - keep_upper.getRangeEnd(i) - keep_upper.getRangeStart(i) + 1; + // Iterate through all chars in BMP except surrogates. + for (UChar32 i = 0; i < kNonBmpStart; i++) { + if (i >= kSurrogateStart && i <= kSurrogateEnd) { + continue; // Ignore surrogate range + } + current.set(i, i); + current.closeOver(USET_CASE_INSENSITIVE); + + // Check to see if all characters in the case-folding equivalence + // class as defined by UnicodeSet::closeOver all map to the same + // canonical value. + UChar32 canonical = RegExpCaseFolding::Canonicalize(i); + bool class_has_matching_canonical_char = false; + bool class_has_non_matching_canonical_char = false; + for (int32_t j = 0; j < current.getRangeCount(); j++) { + for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j); + c++) { + if (c == i) { + continue; + } + UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c); + if (canonical == other_canonical) { + class_has_matching_canonical_char = true; + } else { + class_has_non_matching_canonical_char = true; + } + } + } + // If any other character in i's equivalence class has a + // different canonical value, then i needs special handling. If + // no other character shares a canonical value with i, we can + // ignore i when adding alternatives for case-independent + // comparison. If at least one other character shares a + // canonical value, then i needs special handling. + if (class_has_non_matching_canonical_char) { + if (class_has_matching_canonical_char) { + special_add.add(i); + } else { + ignore.add(i); } - if (number_of_upper > 1) { - // Add all non uppercase characters (could be Ll or Mn) to special add - // set. - current.removeAll(upper); - special_add.addAll(current); - - // Add the uppercase characters of non uppercase character to - // special add set. - CHECK_GT(current.getRangeCount(), 0); - UChar32 main_upper = u_toupper(current.getRangeStart(0)); - special_add.add(main_upper); - - // Add all uppercase except the main upper to ignore set. - keep_upper.remove(main_upper); - ignore.addAll(keep_upper); + } + } + + // Verify that no Unicode equivalence class contains two non-trivial + // JS equivalence classes. Every character in SpecialAddSet has the + // same canonical value as every other non-IgnoreSet character in + // its Unicode equivalence class. Therefore, if we call closeOver on + // a set containing no IgnoreSet characters, the only characters + // that must be removed from the result are in IgnoreSet. This fact + // is used in CharacterRange::AddCaseEquivalents. + for (int32_t i = 0; i < special_add.getRangeCount(); i++) { + for (UChar32 c = special_add.getRangeStart(i); + c <= special_add.getRangeEnd(i); c++) { + UChar32 canonical = RegExpCaseFolding::Canonicalize(c); + current.set(c, c); + current.closeOver(USET_CASE_INSENSITIVE); + current.removeAll(ignore); + for (int32_t j = 0; j < current.getRangeCount(); j++) { + for (UChar32 c2 = current.getRangeStart(j); + c2 <= current.getRangeEnd(j); c2++) { + CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2)); + } } } } - // Remove any ASCII - special_add.remove(0x0000, 0x007f); - PrintSet(out, "BuildIgnoreSet", ignore); - PrintSet(out, "BuildSpecialAddSet", special_add); + PrintSet(out, "IgnoreSet", ignore); + PrintSet(out, "SpecialAddSet", special_add); } void WriteHeader(const char* header_filename) { std::ofstream out(header_filename); out << std::hex << std::setfill('0') << std::setw(4); - - out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n" - << "// The following functions are used to build icu::UnicodeSet\n" - << "// for specical cases different between Unicode and ECMA262.\n" + out << "// Copyright 2020 the V8 project authors. All rights reserved.\n" + << "// Use of this source code is governed by a BSD-style license that\n" + << "// can be found in the LICENSE file.\n\n" + << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n" + << "// The following functions are used to build UnicodeSets\n" + << "// for special cases where the case-folding algorithm used by\n" + << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n" + << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n" + << "// Semantics: Canonicalize) step 3.\n\n" << "#ifdef V8_INTL_SUPPORT\n" + << "#include \"src/base/lazy-instance.h\"\n\n" << "#include \"src/regexp/special-case.h\"\n\n" << "#include \"unicode/uniset.h\"\n" << "namespace v8 {\n" diff --git a/js/src/regexp/import-irregexp.py b/js/src/regexp/import-irregexp.py new file mode 100644 index 000000000..870387232 --- /dev/null +++ b/js/src/regexp/import-irregexp.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python3 + +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +# This script handles all the mechanical steps of importing irregexp from v8: +# +# 1. Acquire the source: either from github, or optionally from a local copy of v8. +# 2. Copy the contents of v8/src/regexp into js/src/regexp +# - Exclude files that we have chosen not to import. +# 3. While doing so, update #includes: +# - Change "src/regexp/*" to "regexp/*". +# - Remove other v8-specific headers completely. +# 4. Add '#include "regexp/regexp-shim.h" in the necessary places. +# 5. Update the VERSION file to include the correct git hash. +# +# Usage: +# cd path/to/js/src/regexp +# ./import-irregexp.py --path path/to/v8/src/regexp +# +# Alternatively, without the --path argument, import-irregexp.py will +# clone v8 from github into a temporary directory. +# +# After running this script, changes to the shim code may be necessary +# to account for changes in upstream irregexp. + +import os +import re +import subprocess +import sys +from pathlib import Path + + +def get_hash(path): + # Get the hash for the current git revision + cwd = os.getcwd() + os.chdir(path) + command = ['git', 'rev-parse', 'HEAD'] + result = subprocess.check_output(command, encoding='utf-8') + os.chdir(cwd) + return result.rstrip() + + +def copy_and_update_includes(src_path, dst_path): + # List of header files that need to include the shim header + need_shim = ['property-sequences.h', + 'regexp-ast.h', + 'regexp-bytecode-peephole.h', + 'regexp-bytecodes.h', + 'regexp-dotprinter.h', + 'regexp.h', + 'regexp-macro-assembler.h', + 'regexp-stack.h', + 'special-case.h'] + + src = open(str(src_path), 'r') + dst = open(str(dst_path), 'w') + + # 1. Rewrite includes of V8 regexp headers: + regexp_include = re.compile('#include "src/regexp') + regexp_include_new = '#include "regexp' + + # 2. Remove includes of other V8 headers + other_include = re.compile('#include "src/') + + # 3. If needed, add '#include "regexp/regexp-shim.h"'. + # Note: We get a little fancy to ensure that header files are + # in alphabetic order. `need_to_add_shim` is true if we still + # have to add the shim header in this file. `adding_shim_now` + # is true if we have found a '#include "src/*' and we are just + # waiting to find something alphabetically smaller (or an empty + # line) so that we can insert the shim header in the right place. + need_to_add_shim = src_path.name in need_shim + adding_shim_now = False + + for line in src: + if adding_shim_now: + if (line == '\n' or line > '#include "src/regexp/regexp-shim.h"'): + dst.write('#include "regexp/regexp-shim.h"\n') + need_to_add_shim = False + adding_shim_now = False + + if regexp_include.search(line): + dst.write(re.sub(regexp_include, regexp_include_new, line)) + elif other_include.search(line): + if need_to_add_shim: + adding_shim_now = True + else: + dst.write(line) + + +def import_from(srcdir, dstdir): + excluded = ['OWNERS', + 'regexp.cc', + 'regexp-utils.cc', + 'regexp-utils.h', + 'regexp-macro-assembler-arch.h'] + + for file in srcdir.iterdir(): + if file.is_dir(): + continue + if str(file.name) in excluded: + continue + copy_and_update_includes(file, dstdir / file.name) + + # Update VERSION file + hash = get_hash(srcdir) + version_file = open(str(dstdir / 'VERSION'), 'w') + version_file.write('Imported using import-irregexp.py from:\n') + version_file.write('https://github.com/v8/v8/tree/%s/src/regexp\n' % hash) + + +if __name__ == '__main__': + import argparse + import tempfile + + # This script should be run from js/src/regexp to work correctly. + current_path = Path(os.getcwd()) + expected_path = 'js/src/regexp' + if not current_path.match(expected_path): + raise RuntimeError('%s must be run from %s' % (sys.argv[0], + expected_path)) + + parser = argparse.ArgumentParser(description='Import irregexp from v8') + parser.add_argument('-p', '--path', help='path to v8/src/regexp') + args = parser.parse_args() + + if args.path: + src_path = Path(args.path) + + if not (src_path / 'regexp.h').exists(): + print('Usage:\n import-irregexp.py --path <path/to/v8/src/regexp>') + sys.exit(1) + import_from(src_path, current_path) + sys.exit(0) + + with tempfile.TemporaryDirectory() as tempdir: + v8_git = 'https://github.com/v8/v8.git' + clone = 'git clone --depth 1 %s %s' % (v8_git, tempdir) + os.system(clone) + src_path = Path(tempdir) / 'src/regexp' + import_from(src_path, current_path) diff --git a/js/src/regexp/regexp-ast.h b/js/src/regexp/regexp-ast.h index fe6913e1d..311929d0b 100644 --- a/js/src/regexp/regexp-ast.h +++ b/js/src/regexp/regexp-ast.h @@ -458,7 +458,11 @@ class RegExpQuantifier final : public RegExpTree { class RegExpCapture final : public RegExpTree { public: explicit RegExpCapture(int index) - : body_(nullptr), index_(index), name_(nullptr) {} + : body_(nullptr), + index_(index), + min_match_(0), + max_match_(0), + name_(nullptr) {} void* Accept(RegExpVisitor* visitor, void* data) override; RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; static RegExpNode* ToNode(RegExpTree* body, int index, @@ -468,10 +472,14 @@ class RegExpCapture final : public RegExpTree { bool IsAnchoredAtEnd() override; Interval CaptureRegisters() override; bool IsCapture() override; - int min_match() override { return body_->min_match(); } - int max_match() override { return body_->max_match(); } + int min_match() override { return min_match_; } + int max_match() override { return max_match_; } RegExpTree* body() { return body_; } - void set_body(RegExpTree* body) { body_ = body; } + void set_body(RegExpTree* body) { + body_ = body; + min_match_ = body->min_match(); + max_match_ = body->max_match(); + } int index() const { return index_; } const ZoneVector<uc16>* name() const { return name_; } void set_name(const ZoneVector<uc16>* name) { name_ = name; } @@ -481,12 +489,17 @@ class RegExpCapture final : public RegExpTree { private: RegExpTree* body_; int index_; + int min_match_; + int max_match_; const ZoneVector<uc16>* name_; }; class RegExpGroup final : public RegExpTree { public: - explicit RegExpGroup(RegExpTree* body) : body_(body) {} + explicit RegExpGroup(RegExpTree* body) + : body_(body), + min_match_(body->min_match()), + max_match_(body->max_match()) {} void* Accept(RegExpVisitor* visitor, void* data) override; RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override { @@ -496,13 +509,15 @@ class RegExpGroup final : public RegExpTree { bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); } bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); } bool IsGroup() override; - int min_match() override { return body_->min_match(); } - int max_match() override { return body_->max_match(); } + int min_match() override { return min_match_; } + int max_match() override { return max_match_; } Interval CaptureRegisters() override { return body_->CaptureRegisters(); } RegExpTree* body() { return body_; } private: RegExpTree* body_; + int min_match_; + int max_match_; }; class RegExpLookaround final : public RegExpTree { diff --git a/js/src/regexp/regexp-bytecode-generator.cc b/js/src/regexp/regexp-bytecode-generator.cc index 239b27605..db151de85 100644 --- a/js/src/regexp/regexp-bytecode-generator.cc +++ b/js/src/regexp/regexp-bytecode-generator.cc @@ -327,13 +327,11 @@ void RegExpBytecodeGenerator::CheckNotBackReference(int start_reg, } void RegExpBytecodeGenerator::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, bool unicode, Label* on_not_equal) { + int start_reg, bool read_backward, Label* on_not_equal) { DCHECK_LE(0, start_reg); DCHECK_GE(kMaxRegister, start_reg); - Emit(read_backward ? (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD - : BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) - : (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE - : BC_CHECK_NOT_BACK_REF_NO_CASE), + Emit(read_backward ? BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD + : BC_CHECK_NOT_BACK_REF_NO_CASE, start_reg); EmitOrLink(on_not_equal); } diff --git a/js/src/regexp/regexp-bytecode-generator.h b/js/src/regexp/regexp-bytecode-generator.h index 15fbda8ec..f5502464d 100644 --- a/js/src/regexp/regexp-bytecode-generator.h +++ b/js/src/regexp/regexp-bytecode-generator.h @@ -69,7 +69,7 @@ class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler { virtual void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match); virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, bool unicode, + bool read_backward, Label* on_no_match); virtual void IfRegisterLT(int register_index, int comparand, Label* if_lt); virtual void IfRegisterGE(int register_index, int comparand, Label* if_ge); diff --git a/js/src/regexp/regexp-bytecode-peephole.cc b/js/src/regexp/regexp-bytecode-peephole.cc index 2bc1b5aa2..4266b4a80 100644 --- a/js/src/regexp/regexp-bytecode-peephole.cc +++ b/js/src/regexp/regexp-bytecode-peephole.cc @@ -428,7 +428,6 @@ BytecodeArgumentMapping BytecodeSequenceNode::ArgumentMapping( size_t index) const { DCHECK(IsSequence()); DCHECK(argument_mapping_ != nullptr); - DCHECK_GE(index, 0); DCHECK_LT(index, argument_mapping_->size()); return argument_mapping_->at(index); diff --git a/js/src/regexp/regexp-bytecodes.h b/js/src/regexp/regexp-bytecodes.h index 24d6925db..1cfef1b2d 100644 --- a/js/src/regexp/regexp-bytecodes.h +++ b/js/src/regexp/regexp-bytecodes.h @@ -100,12 +100,12 @@ STATIC_ASSERT(1 << BYTECODE_SHIFT > BYTECODE_MASK); V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128 */ \ V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32 */ \ V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32 */ \ - V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \ - V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \ - V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) \ + V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \ + V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \ + V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) /* UNUSED */ \ V(CHECK_NOT_BACK_REF_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32 */ \ V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 41, 8) /* bc8 reg_idx24 addr32 */ \ - V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8) \ + V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8) /* UNUSED */ \ V(CHECK_NOT_REGS_EQUAL, 43, 12) /* bc8 regidx24 reg_idx32 addr32 */ \ V(CHECK_REGISTER_LT, 44, 12) /* bc8 reg_idx24 value32 addr32 */ \ V(CHECK_REGISTER_GE, 45, 12) /* bc8 reg_idx24 value32 addr32 */ \ diff --git a/js/src/regexp/regexp-compiler-tonode.cc b/js/src/regexp/regexp-compiler-tonode.cc index fc734ac7c..257030589 100644 --- a/js/src/regexp/regexp-compiler-tonode.cc +++ b/js/src/regexp/regexp-compiler-tonode.cc @@ -1137,39 +1137,6 @@ Vector<const int> CharacterRange::GetWordBounds() { return Vector<const int>(kWordRanges, kWordRangeCount - 1); } -#ifdef V8_INTL_SUPPORT -struct IgnoreSet { - IgnoreSet() : set(BuildIgnoreSet()) {} - const icu::UnicodeSet set; -}; - -struct SpecialAddSet { - SpecialAddSet() : set(BuildSpecialAddSet()) {} - const icu::UnicodeSet set; -}; - -icu::UnicodeSet BuildAsciiAToZSet() { - icu::UnicodeSet set('a', 'z'); - set.add('A', 'Z'); - set.freeze(); - return set; -} - -struct AsciiAToZSet { - AsciiAToZSet() : set(BuildAsciiAToZSet()) {} - const icu::UnicodeSet set; -}; - -static base::LazyInstance<IgnoreSet>::type ignore_set = - LAZY_INSTANCE_INITIALIZER; - -static base::LazyInstance<SpecialAddSet>::type special_add_set = - LAZY_INSTANCE_INITIALIZER; - -static base::LazyInstance<AsciiAToZSet>::type ascii_a_to_z_set = - LAZY_INSTANCE_INITIALIZER; -#endif // V8_INTL_SUPPORT - // static void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, ZoneList<CharacterRange>* ranges, @@ -1192,75 +1159,22 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, others.add(from, to); } - // Set of characters already added to ranges that do not need to be added - // again. + // Compute the set of additional characters that should be added, + // using UnicodeSet::closeOver. ECMA 262 defines slightly different + // case-folding rules than Unicode, so some characters that are + // added by closeOver do not match anything other than themselves in + // JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the + // same case-insensitive character as 's' or 'S' according to + // Unicode, but does not match any other character in JS. To handle + // this case, we add such characters to the IgnoreSet and filter + // them out. We filter twice: once before calling closeOver (to + // prevent 'ſ' from adding 's'), and once after calling closeOver + // (to prevent 's' from adding 'ſ'). See regexp/special-case.h for + // more information. icu::UnicodeSet already_added(others); - - // Set of characters in ranges that are in the 52 ASCII characters [a-zA-Z]. - icu::UnicodeSet in_ascii_a_to_z(others); - in_ascii_a_to_z.retainAll(ascii_a_to_z_set.Pointer()->set); - - // Remove all chars in [a-zA-Z] from others. - others.removeAll(in_ascii_a_to_z); - - // Set of characters in ranges that are overlapping with special add set. - icu::UnicodeSet in_special_add(others); - in_special_add.retainAll(special_add_set.Pointer()->set); - - others.removeAll(in_special_add); - - // Ignore all chars in ignore set. - others.removeAll(ignore_set.Pointer()->set); - - // For most of the chars in ranges that is still in others, find the case - // equivlant set by calling closeOver(USET_CASE_INSENSITIVE). + others.removeAll(RegExpCaseFolding::IgnoreSet()); others.closeOver(USET_CASE_INSENSITIVE); - - // Because closeOver(USET_CASE_INSENSITIVE) may add ASCII [a-zA-Z] to others, - // but ECMA262 "i" mode won't consider that, remove them from others. - // Ex: U+017F add 'S' and 's' to others. - others.removeAll(ascii_a_to_z_set.Pointer()->set); - - // Special handling for in_ascii_a_to_z. - for (int32_t i = 0; i < in_ascii_a_to_z.getRangeCount(); i++) { - UChar32 start = in_ascii_a_to_z.getRangeStart(i); - UChar32 end = in_ascii_a_to_z.getRangeEnd(i); - // Check if it is uppercase A-Z by checking bit 6. - if (start & 0x0020) { - // Add the lowercases - others.add(start & 0x005F, end & 0x005F); - } else { - // Add the uppercases - others.add(start | 0x0020, end | 0x0020); - } - } - - // Special handling for chars in "Special Add" set. - for (int32_t i = 0; i < in_special_add.getRangeCount(); i++) { - UChar32 end = in_special_add.getRangeEnd(i); - for (UChar32 ch = in_special_add.getRangeStart(i); ch <= end; ch++) { - // Add the uppercase of this character if itself is not an uppercase - // character. - // Note: The if condiction cannot be u_islower(ch) because ch could be - // neither uppercase nor lowercase but Mn. - if (!u_isupper(ch)) { - others.add(u_toupper(ch)); - } - icu::UnicodeSet candidates(ch, ch); - candidates.closeOver(USET_CASE_INSENSITIVE); - for (int32_t j = 0; j < candidates.getRangeCount(); j++) { - UChar32 end2 = candidates.getRangeEnd(j); - for (UChar32 ch2 = candidates.getRangeStart(j); ch2 <= end2; ch2++) { - // Add character that is not uppercase to others. - if (!u_isupper(ch2)) { - others.add(ch2); - } - } - } - } - } - - // Remove all characters which already in the ranges. + others.removeAll(RegExpCaseFolding::IgnoreSet()); others.removeAll(already_added); // Add others to the ranges diff --git a/js/src/regexp/regexp-compiler.cc b/js/src/regexp/regexp-compiler.cc index 9a2aa30dc..c0070061f 100644 --- a/js/src/regexp/regexp-compiler.cc +++ b/js/src/regexp/regexp-compiler.cc @@ -5,7 +5,9 @@ #include "regexp/regexp-compiler.h" #include "regexp/regexp-macro-assembler-arch.h" -#include "regexp/regexp-macro-assembler-tracer.h" +#ifdef V8_INTL_SUPPORT +#include "regexp/special-case.h" +#endif // V8_INTL_SUPPORT #ifdef V8_INTL_SUPPORT #include "unicode/locid.h" @@ -237,20 +239,15 @@ RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, RegExpCompiler::CompilationResult RegExpCompiler::Assemble( Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start, int capture_count, Handle<String> pattern) { -#ifdef DEBUG - if (FLAG_trace_regexp_assembler) - macro_assembler_ = new RegExpMacroAssemblerTracer(isolate, macro_assembler); - else -#endif - macro_assembler_ = macro_assembler; + macro_assembler_ = macro_assembler; - std::vector<RegExpNode*> work_list; + ZoneVector<RegExpNode*> work_list(zone()); work_list_ = &work_list; Label fail; macro_assembler_->PushBacktrack(&fail); Trace new_trace; start->Emit(this, &new_trace); - macro_assembler_->Bind(&fail); + macro_assembler_->BindJumpTarget(&fail); macro_assembler_->Fail(); while (!work_list.empty()) { RegExpNode* node = work_list.back(); @@ -264,14 +261,9 @@ RegExpCompiler::CompilationResult RegExpCompiler::Assemble( } Handle<HeapObject> code = macro_assembler_->GetCode(pattern); - isolate->IncreaseTotalRegexpCodeGenerated(code->Size()); + isolate->IncreaseTotalRegexpCodeGenerated(code); work_list_ = nullptr; -#ifdef DEBUG - if (FLAG_trace_regexp_assembler) { - delete macro_assembler_; - } -#endif return {*code, next_register_}; } @@ -557,7 +549,7 @@ void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) { } // On backtrack we need to restore state. - assembler->Bind(&undo); + assembler->BindJumpTarget(&undo); RestoreAffectedRegisters(assembler, max_register, registers_to_pop, registers_to_clear); if (backtrack() == nullptr) { @@ -720,32 +712,34 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, unibrow::uchar* letters, int letter_length) { #ifdef V8_INTL_SUPPORT - // Special case for U+017F which has upper case in ASCII range. - if (character == 0x017f) { + if (RegExpCaseFolding::IgnoreSet().contains(character)) { letters[0] = character; return 1; } + bool in_special_add_set = + RegExpCaseFolding::SpecialAddSet().contains(character); + icu::UnicodeSet set; set.add(character); set = set.closeOver(USET_CASE_INSENSITIVE); + + UChar32 canon = 0; + if (in_special_add_set) { + canon = RegExpCaseFolding::Canonicalize(character); + } + int32_t range_count = set.getRangeCount(); int items = 0; for (int32_t i = 0; i < range_count; i++) { UChar32 start = set.getRangeStart(i); UChar32 end = set.getRangeEnd(i); CHECK(end - start + items <= letter_length); - // Only add to the output if character is not in ASCII range - // or the case equivalent character is in ASCII range. - // #sec-runtime-semantics-canonicalize-ch - // 3.g If the numeric value of ch ≥ 128 and the numeric value of cu < 128, - // return ch. - if (!((start >= 128) && (character < 128))) { - // No range have start and end span across code point 128. - DCHECK((start >= 128) == (end >= 128)); - for (UChar32 cu = start; cu <= end; cu++) { - if (one_byte_subject && cu > String::kMaxOneByteCharCode) break; - letters[items++] = (unibrow::uchar)(cu); + for (UChar32 cu = start; cu <= end; cu++) { + if (one_byte_subject && cu > String::kMaxOneByteCharCode) break; + if (in_special_add_set && RegExpCaseFolding::Canonicalize(cu) != canon) { + continue; } + letters[items++] = (unibrow::uchar)(cu); } } return items; @@ -852,10 +846,6 @@ static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, return false; } -using EmitCharacterFunction = bool(Isolate* isolate, RegExpCompiler* compiler, - uc16 c, Label* on_failure, int cp_offset, - bool check, bool preloaded); - // Only emits letters (things that have case). Only used for case independent // matches. static inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler, @@ -1843,13 +1833,13 @@ RegExpNode* TextNode::FilterOneByte(int depth) { if (elm.text_type() == TextElement::ATOM) { Vector<const uc16> quarks = elm.atom()->data(); for (int j = 0; j < quarks.length(); j++) { - uint16_t c = quarks[j]; + uc16 c = quarks[j]; if (elm.atom()->ignore_case()) { c = unibrow::Latin1::TryConvertToLatin1(c); } if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr); // Replace quark in case we converted to Latin-1. - uint16_t* writable_quarks = const_cast<uint16_t*>(quarks.begin()); + uc16* writable_quarks = const_cast<uc16*>(quarks.begin()); writable_quarks[j] = c; } } else { @@ -2304,7 +2294,6 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass, for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { if (first_element_checked && i == 0 && j == 0) continue; if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; - EmitCharacterFunction* emit_function = nullptr; uc16 quark = quarks[j]; if (elm.atom()->ignore_case()) { // Everywhere else we assume that a non-Latin-1 character cannot match @@ -2312,6 +2301,9 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass, // invalid by using the Latin1 equivalent instead. quark = unibrow::Latin1::TryConvertToLatin1(quark); } + bool needs_bounds_check = + *checked_up_to < cp_offset + j || read_backward(); + bool bounds_checked = false; switch (pass) { case NON_LATIN1_MATCH: DCHECK(one_byte); @@ -2321,24 +2313,24 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass, } break; case NON_LETTER_CHARACTER_MATCH: - emit_function = &EmitAtomNonLetter; + bounds_checked = + EmitAtomNonLetter(isolate, compiler, quark, backtrack, + cp_offset + j, needs_bounds_check, preloaded); break; case SIMPLE_CHARACTER_MATCH: - emit_function = &EmitSimpleCharacter; + bounds_checked = EmitSimpleCharacter(isolate, compiler, quark, + backtrack, cp_offset + j, + needs_bounds_check, preloaded); break; case CASE_CHARACTER_MATCH: - emit_function = &EmitAtomLetter; + bounds_checked = + EmitAtomLetter(isolate, compiler, quark, backtrack, + cp_offset + j, needs_bounds_check, preloaded); break; default: break; } - if (emit_function != nullptr) { - bool bounds_check = *checked_up_to < cp_offset + j || read_backward(); - bool bound_checked = - emit_function(isolate, compiler, quark, backtrack, cp_offset + j, - bounds_check, preloaded); - if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); - } + if (bounds_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); } } else { DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type()); @@ -3424,8 +3416,8 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { DCHECK_EQ(start_reg_ + 1, end_reg_); if (IgnoreCase(flags_)) { - assembler->CheckNotBackReferenceIgnoreCase( - start_reg_, read_backward(), IsUnicode(flags_), trace->backtrack()); + assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(), + trace->backtrack()); } else { assembler->CheckNotBackReference(start_reg_, read_backward(), trace->backtrack()); @@ -3597,12 +3589,17 @@ template <typename... Propagators> class Analysis : public NodeVisitor { public: Analysis(Isolate* isolate, bool is_one_byte) - : isolate_(isolate), is_one_byte_(is_one_byte), error_message_(nullptr) {} + : isolate_(isolate), + is_one_byte_(is_one_byte), + error_(RegExpError::kNone) {} void EnsureAnalyzed(RegExpNode* that) { StackLimitCheck check(isolate()); if (check.HasOverflowed()) { - fail("Stack overflow"); + if (FLAG_correctness_fuzzer_suppressions) { + FATAL("Analysis: Aborting on stack overflow"); + } + fail(RegExpError::kAnalysisStackOverflow); return; } if (that->info()->been_analyzed || that->info()->being_analyzed) return; @@ -3612,12 +3609,12 @@ class Analysis : public NodeVisitor { that->info()->been_analyzed = true; } - bool has_failed() { return error_message_ != nullptr; } - const char* error_message() { - DCHECK(error_message_ != nullptr); - return error_message_; + bool has_failed() { return error_ != RegExpError::kNone; } + RegExpError error() { + DCHECK(error_ != RegExpError::kNone); + return error_; } - void fail(const char* error_message) { error_message_ = error_message; } + void fail(RegExpError error) { error_ = error; } Isolate* isolate() const { return isolate_; } @@ -3702,19 +3699,19 @@ class Analysis : public NodeVisitor { private: Isolate* isolate_; bool is_one_byte_; - const char* error_message_; + RegExpError error_; DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis); }; -const char* AnalyzeRegExp(Isolate* isolate, bool is_one_byte, +RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node) { Analysis<AssertionPropagator, EatsAtLeastPropagator> analysis(isolate, is_one_byte); DCHECK_EQ(node->info()->been_analyzed, false); analysis.EnsureAnalyzed(node); - DCHECK_IMPLIES(analysis.has_failed(), analysis.error_message() != nullptr); - return analysis.has_failed() ? analysis.error_message() : nullptr; + DCHECK_IMPLIES(analysis.has_failed(), analysis.error() != RegExpError::kNone); + return analysis.has_failed() ? analysis.error() : RegExpError::kNone; } void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, diff --git a/js/src/regexp/regexp-compiler.h b/js/src/regexp/regexp-compiler.h index 192b3284d..1954f1a4c 100644 --- a/js/src/regexp/regexp-compiler.h +++ b/js/src/regexp/regexp-compiler.h @@ -422,10 +422,7 @@ struct PreloadState { // Analysis performs assertion propagation and computes eats_at_least_ values. // See the comments on AssertionPropagator and EatsAtLeastPropagator for more // details. -// -// This method returns nullptr on success or a null-terminated failure message -// on failure. -const char* AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node); +RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node); class FrequencyCollator { public: @@ -502,18 +499,17 @@ class RegExpCompiler { } struct CompilationResult final { - explicit CompilationResult(const char* error_message) - : error_message(error_message) {} + explicit CompilationResult(RegExpError err) : error(err) {} CompilationResult(Object code, int registers) : code(code), num_registers(registers) {} static CompilationResult RegExpTooBig() { - return CompilationResult("RegExp too big"); + return CompilationResult(RegExpError::kTooLarge); } - bool Succeeded() const { return error_message == nullptr; } + bool Succeeded() const { return error == RegExpError::kNone; } - const char* const error_message = nullptr; + const RegExpError error = RegExpError::kNone; Object code; int num_registers = 0; }; @@ -575,7 +571,7 @@ class RegExpCompiler { int next_register_; int unicode_lookaround_stack_register_; int unicode_lookaround_position_register_; - std::vector<RegExpNode*>* work_list_; + ZoneVector<RegExpNode*>* work_list_; int recursion_depth_; RegExpMacroAssembler* macro_assembler_; bool one_byte_; diff --git a/js/src/regexp/regexp-error.cc b/js/src/regexp/regexp-error.cc new file mode 100644 index 000000000..3906f9d9f --- /dev/null +++ b/js/src/regexp/regexp-error.cc @@ -0,0 +1,22 @@ +// Copyright 2020 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "regexp/regexp-error.h" + +namespace v8 { +namespace internal { + +const char* kRegExpErrorStrings[] = { +#define TEMPLATE(NAME, STRING) STRING, + REGEXP_ERROR_MESSAGES(TEMPLATE) +#undef TEMPLATE +}; + +const char* RegExpErrorString(RegExpError error) { + DCHECK_LT(error, RegExpError::NumErrors); + return kRegExpErrorStrings[static_cast<int>(error)]; +} + +} // namespace internal +} // namespace v8 diff --git a/js/src/regexp/regexp-error.h b/js/src/regexp/regexp-error.h new file mode 100644 index 000000000..ef9d037dd --- /dev/null +++ b/js/src/regexp/regexp-error.h @@ -0,0 +1,56 @@ +// Copyright 2020 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_REGEXP_REGEXP_ERROR_H_ +#define V8_REGEXP_REGEXP_ERROR_H_ + + +namespace v8 { +namespace internal { + +#define REGEXP_ERROR_MESSAGES(T) \ + T(None, "") \ + T(StackOverflow, "Maximum call stack size exceeded") \ + T(AnalysisStackOverflow, "Stack overflow") \ + T(TooLarge, "Regular expression too large") \ + T(UnterminatedGroup, "Unterminated group") \ + T(UnmatchedParen, "Unmatched ')'") \ + T(EscapeAtEndOfPattern, "\\ at end of pattern") \ + T(InvalidPropertyName, "Invalid property name") \ + T(InvalidEscape, "Invalid escape") \ + T(InvalidDecimalEscape, "Invalid decimal escape") \ + T(InvalidUnicodeEscape, "Invalid Unicode escape") \ + T(NothingToRepeat, "Nothing to repeat") \ + T(LoneQuantifierBrackets, "Lone quantifier brackets") \ + T(RangeOutOfOrder, "numbers out of order in {} quantifier") \ + T(IncompleteQuantifier, "Incomplete quantifier") \ + T(InvalidQuantifier, "Invalid quantifier") \ + T(InvalidGroup, "Invalid group") \ + T(MultipleFlagDashes, "Multiple dashes in flag group") \ + T(RepeatedFlag, "Repeated flag in flag group") \ + T(InvalidFlagGroup, "Invalid flag group") \ + T(TooManyCaptures, "Too many captures") \ + T(InvalidCaptureGroupName, "Invalid capture group name") \ + T(DuplicateCaptureGroupName, "Duplicate capture group name") \ + T(InvalidNamedReference, "Invalid named reference") \ + T(InvalidNamedCaptureReference, "Invalid named capture referenced") \ + T(InvalidClassEscape, "Invalid class escape") \ + T(InvalidClassPropertyName, "Invalid property name in character class") \ + T(InvalidCharacterClass, "Invalid character class") \ + T(UnterminatedCharacterClass, "Unterminated character class") \ + T(OutOfOrderCharacterClass, "Range out of order in character class") + +enum class RegExpError : uint32_t { +#define TEMPLATE(NAME, STRING) k##NAME, + REGEXP_ERROR_MESSAGES(TEMPLATE) +#undef TEMPLATE + NumErrors +}; + +V8_EXPORT_PRIVATE const char* RegExpErrorString(RegExpError error); + +} // namespace internal +} // namespace v8 + +#endif // V8_REGEXP_REGEXP_ERROR_H_ diff --git a/js/src/regexp/regexp-interpreter.cc b/js/src/regexp/regexp-interpreter.cc index 6632cd729..7735d6885 100644 --- a/js/src/regexp/regexp-interpreter.cc +++ b/js/src/regexp/regexp-interpreter.cc @@ -28,18 +28,18 @@ namespace internal { namespace { bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len, - Vector<const uc16> subject, bool unicode) { + Vector<const uc16> subject) { Address offset_a = reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(from))); Address offset_b = reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(current))); size_t length = len * kUC16Size; - return RegExpMacroAssembler::CaseInsensitiveCompareUC16( - offset_a, offset_b, length, unicode ? nullptr : isolate) == 1; + return RegExpMacroAssembler::CaseInsensitiveCompareUC16(offset_a, offset_b, + length, isolate) == 1; } bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len, - Vector<const uint8_t> subject, bool unicode) { + Vector<const uint8_t> subject) { // For Latin1 characters the unicode flag makes no difference. for (int i = 0; i < len; i++) { unsigned int old_char = subject[from++]; @@ -82,11 +82,17 @@ int32_t Load32Aligned(const byte* pc) { return *reinterpret_cast<const int32_t*>(pc); } -int32_t Load16Aligned(const byte* pc) { +// TODO(jgruber): Rename to Load16AlignedUnsigned. +uint32_t Load16Aligned(const byte* pc) { DCHECK_EQ(0, reinterpret_cast<intptr_t>(pc) & 1); return *reinterpret_cast<const uint16_t*>(pc); } +int32_t Load16AlignedSigned(const byte* pc) { + DCHECK_EQ(0, reinterpret_cast<intptr_t>(pc) & 1); + return *reinterpret_cast<const int16_t*>(pc); +} + // A simple abstraction over the backtracking stack used by the interpreter. // // Despite the name 'backtracking' stack, it's actually used as a generic stack @@ -734,26 +740,14 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array, DISPATCH(); } BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE) { - int from = registers[insn >> BYTECODE_SHIFT]; - int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; - if (from >= 0 && len > 0) { - if (current + len > subject.length() || - !BackRefMatchesNoCase(isolate, from, current, len, subject, true)) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - DISPATCH(); - } - current += len; - } - ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE); - DISPATCH(); + UNREACHABLE(); // TODO(jgruber): Remove this unused bytecode. } BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) { int from = registers[insn >> BYTECODE_SHIFT]; int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; if (from >= 0 && len > 0) { if (current + len > subject.length() || - !BackRefMatchesNoCase(isolate, from, current, len, subject, - false)) { + !BackRefMatchesNoCase(isolate, from, current, len, subject)) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); DISPATCH(); } @@ -763,27 +757,14 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array, DISPATCH(); } BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD) { - int from = registers[insn >> BYTECODE_SHIFT]; - int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; - if (from >= 0 && len > 0) { - if (current - len < 0 || - !BackRefMatchesNoCase(isolate, from, current - len, len, subject, - true)) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - DISPATCH(); - } - current -= len; - } - ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD); - DISPATCH(); + UNREACHABLE(); // TODO(jgruber): Remove this unused bytecode. } BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) { int from = registers[insn >> BYTECODE_SHIFT]; int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; if (from >= 0 && len > 0) { if (current - len < 0 || - !BackRefMatchesNoCase(isolate, from, current - len, len, subject, - false)) { + !BackRefMatchesNoCase(isolate, from, current - len, len, subject)) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); DISPATCH(); } @@ -828,7 +809,7 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array, } BYTECODE(SKIP_UNTIL_CHAR) { int load_offset = (insn >> BYTECODE_SHIFT); - uint32_t advance = Load16Aligned(pc + 4); + int32_t advance = Load16AlignedSigned(pc + 4); uint32_t c = Load16Aligned(pc + 6); while (static_cast<uintptr_t>(current + load_offset) < static_cast<uintptr_t>(subject.length())) { @@ -844,7 +825,7 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array, } BYTECODE(SKIP_UNTIL_CHAR_AND) { int load_offset = (insn >> BYTECODE_SHIFT); - uint16_t advance = Load16Aligned(pc + 4); + int32_t advance = Load16AlignedSigned(pc + 4); uint16_t c = Load16Aligned(pc + 6); uint32_t mask = Load32Aligned(pc + 8); int32_t maximum_offset = Load32Aligned(pc + 12); @@ -862,7 +843,7 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array, } BYTECODE(SKIP_UNTIL_CHAR_POS_CHECKED) { int load_offset = (insn >> BYTECODE_SHIFT); - uint16_t advance = Load16Aligned(pc + 4); + int32_t advance = Load16AlignedSigned(pc + 4); uint16_t c = Load16Aligned(pc + 6); int32_t maximum_offset = Load32Aligned(pc + 8); while (static_cast<uintptr_t>(current + maximum_offset) <= @@ -879,7 +860,7 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array, } BYTECODE(SKIP_UNTIL_BIT_IN_TABLE) { int load_offset = (insn >> BYTECODE_SHIFT); - uint32_t advance = Load16Aligned(pc + 4); + int32_t advance = Load16AlignedSigned(pc + 4); const byte* table = pc + 8; while (static_cast<uintptr_t>(current + load_offset) < static_cast<uintptr_t>(subject.length())) { @@ -895,7 +876,7 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array, } BYTECODE(SKIP_UNTIL_GT_OR_NOT_BIT_IN_TABLE) { int load_offset = (insn >> BYTECODE_SHIFT); - uint16_t advance = Load16Aligned(pc + 4); + int32_t advance = Load16AlignedSigned(pc + 4); uint16_t limit = Load16Aligned(pc + 6); const byte* table = pc + 8; while (static_cast<uintptr_t>(current + load_offset) < @@ -916,7 +897,7 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array, } BYTECODE(SKIP_UNTIL_CHAR_OR_CHAR) { int load_offset = (insn >> BYTECODE_SHIFT); - uint32_t advance = Load32Aligned(pc + 4); + int32_t advance = Load32Aligned(pc + 4); uint16_t c = Load16Aligned(pc + 8); uint16_t c2 = Load16Aligned(pc + 10); while (static_cast<uintptr_t>(current + load_offset) < @@ -1016,6 +997,8 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal( } } +#ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER + // This method is called through an external reference from RegExpExecInternal // builtin. IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromJs( @@ -1042,6 +1025,8 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromJs( start_position, call_origin); } +#endif // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER + IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromRuntime( Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject_string, int* registers, int registers_length, int start_position) { diff --git a/js/src/regexp/regexp-macro-assembler-tracer.cc b/js/src/regexp/regexp-macro-assembler-tracer.cc index 331c57d1a..b71a0f48e 100644 --- a/js/src/regexp/regexp-macro-assembler-tracer.cc +++ b/js/src/regexp/regexp-macro-assembler-tracer.cc @@ -349,17 +349,15 @@ void RegExpMacroAssemblerTracer::CheckNotBackReference(int start_reg, assembler_->CheckNotBackReference(start_reg, read_backward, on_no_match); } - void RegExpMacroAssemblerTracer::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, bool unicode, Label* on_no_match) { - PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s %s, label[%08x]);\n", + int start_reg, bool read_backward, Label* on_no_match) { + PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s, label[%08x]);\n", start_reg, read_backward ? "backward" : "forward", - unicode ? "unicode" : "non-unicode", LabelToInt(on_no_match)); - assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward, unicode, + LabelToInt(on_no_match)); + assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward, on_no_match); } - void RegExpMacroAssemblerTracer::CheckPosition(int cp_offset, Label* on_outside_input) { PrintF(" CheckPosition(cp_offset=%d, label[%08x]);\n", cp_offset, diff --git a/js/src/regexp/regexp-macro-assembler-tracer.h b/js/src/regexp/regexp-macro-assembler-tracer.h index 938f84796..5332e59b8 100644 --- a/js/src/regexp/regexp-macro-assembler-tracer.h +++ b/js/src/regexp/regexp-macro-assembler-tracer.h @@ -33,7 +33,6 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler { void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match) override; void CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward, - bool unicode, Label* on_no_match) override; void CheckNotCharacter(unsigned c, Label* on_not_equal) override; void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with, diff --git a/js/src/regexp/regexp-macro-assembler.cc b/js/src/regexp/regexp-macro-assembler.cc index 4a8dcd3ce..7f8de2543 100644 --- a/js/src/regexp/regexp-macro-assembler.cc +++ b/js/src/regexp/regexp-macro-assembler.cc @@ -110,34 +110,7 @@ bool NativeRegExpMacroAssembler::CanReadUnaligned() { return FLAG_enable_regexp_unaligned_accesses && !slow_safe(); } -const byte* NativeRegExpMacroAssembler::StringCharacterPosition( - String subject, int start_index, const DisallowHeapAllocation& no_gc) { - if (subject.IsConsString()) { - subject = ConsString::cast(subject).first(); - } else if (subject.IsSlicedString()) { - start_index += SlicedString::cast(subject).offset(); - subject = SlicedString::cast(subject).parent(); - } - if (subject.IsThinString()) { - subject = ThinString::cast(subject).actual(); - } - DCHECK_LE(0, start_index); - DCHECK_LE(start_index, subject.length()); - if (subject.IsSeqOneByteString()) { - return reinterpret_cast<const byte*>( - SeqOneByteString::cast(subject).GetChars(no_gc) + start_index); - } else if (subject.IsSeqTwoByteString()) { - return reinterpret_cast<const byte*>( - SeqTwoByteString::cast(subject).GetChars(no_gc) + start_index); - } else if (subject.IsExternalOneByteString()) { - return reinterpret_cast<const byte*>( - ExternalOneByteString::cast(subject).GetChars() + start_index); - } else { - DCHECK(subject.IsExternalTwoByteString()); - return reinterpret_cast<const byte*>( - ExternalTwoByteString::cast(subject).GetChars() + start_index); - } -} +#ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER // This method may only be called after an interrupt. int NativeRegExpMacroAssembler::CheckStackGuardState( @@ -145,9 +118,10 @@ int NativeRegExpMacroAssembler::CheckStackGuardState( Address* return_address, Code re_code, Address* subject, const byte** input_start, const byte** input_end) { DisallowHeapAllocation no_gc; + Address old_pc = PointerAuthentication::AuthenticatePC(return_address, 0); + DCHECK_LE(re_code.raw_instruction_start(), old_pc); + DCHECK_LE(old_pc, re_code.raw_instruction_end()); - DCHECK(re_code.raw_instruction_start() <= *return_address); - DCHECK(*return_address <= re_code.raw_instruction_end()); StackLimitCheck check(isolate); bool js_has_overflowed = check.JsHasOverflowed(); @@ -189,9 +163,11 @@ int NativeRegExpMacroAssembler::CheckStackGuardState( } if (*code_handle != re_code) { // Return address no longer valid - intptr_t delta = code_handle->address() - re_code.address(); // Overwrite the return address on the stack. - *return_address += delta; + intptr_t delta = code_handle->address() - re_code.address(); + Address new_pc = old_pc + delta; + // TODO(v8:10026): avoid replacing a signed pointer. + PointerAuthentication::ReplacePC(return_address, new_pc, 0); } // If we continue, we need to update the subject string addresses. @@ -206,8 +182,7 @@ int NativeRegExpMacroAssembler::CheckStackGuardState( } else { *subject = subject_handle->ptr(); intptr_t byte_length = *input_end - *input_start; - *input_start = - StringCharacterPosition(*subject_handle, start_index, no_gc); + *input_start = subject_handle->AddressOfCharacterAt(start_index, no_gc); *input_end = *input_start + byte_length; } } @@ -255,7 +230,7 @@ int NativeRegExpMacroAssembler::Match(Handle<JSRegExp> regexp, DisallowHeapAllocation no_gc; const byte* input_start = - StringCharacterPosition(subject_ptr, start_offset + slice_offset, no_gc); + subject_ptr.AddressOfCharacterAt(start_offset + slice_offset, no_gc); int byte_length = char_length << char_size_shift; const byte* input_end = input_start + byte_length; return Execute(*subject, start_offset, input_start, input_end, offsets_vector, @@ -301,6 +276,8 @@ int NativeRegExpMacroAssembler::Execute( return result; } +#endif // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER + // clang-format off const byte NativeRegExpMacroAssembler::word_character_map[] = { 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, diff --git a/js/src/regexp/regexp-macro-assembler.h b/js/src/regexp/regexp-macro-assembler.h index dd059a43d..ef3961a70 100644 --- a/js/src/regexp/regexp-macro-assembler.h +++ b/js/src/regexp/regexp-macro-assembler.h @@ -87,7 +87,7 @@ class RegExpMacroAssembler { virtual void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match) = 0; virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, bool unicode, + bool read_backward, Label* on_no_match) = 0; // Check the current character for a match with a literal character. If we // fail to match then goto the on_failure label. End of input always @@ -122,6 +122,11 @@ class RegExpMacroAssembler { // not have custom support. // May clobber the current loaded character. virtual bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match); + + // Control-flow integrity: + // Define a jump target and bind a label. + virtual void BindJumpTarget(Label* label) { Bind(label); } + virtual void Fail() = 0; virtual Handle<HeapObject> GetCode(Handle<String> source) = 0; virtual void GoTo(Label* label) = 0; @@ -246,9 +251,6 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler { static Address GrowStack(Address stack_pointer, Address* stack_top, Isolate* isolate); - static const byte* StringCharacterPosition( - String subject, int start_index, const DisallowHeapAllocation& no_gc); - static int CheckStackGuardState(Isolate* isolate, int start_index, RegExp::CallOrigin call_origin, Address* return_address, Code re_code, diff --git a/js/src/regexp/regexp-parser.cc b/js/src/regexp/regexp-parser.cc index 377b94247..e2bbb6ed0 100644 --- a/js/src/regexp/regexp-parser.cc +++ b/js/src/regexp/regexp-parser.cc @@ -17,11 +17,10 @@ namespace v8 { namespace internal { -RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, - JSRegExp::Flags flags, Isolate* isolate, Zone* zone) +RegExpParser::RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, + Isolate* isolate, Zone* zone) : isolate_(isolate), zone_(zone), - error_(error), captures_(nullptr), named_captures_(nullptr), named_back_references_(nullptr), @@ -74,13 +73,12 @@ void RegExpParser::Advance() { if (FLAG_correctness_fuzzer_suppressions) { FATAL("Aborting on stack overflow"); } - ReportError(CStrVector( - MessageFormatter::TemplateString(MessageTemplate::kStackOverflow))); + ReportError(RegExpError::kStackOverflow); } else if (zone()->excess_allocation()) { if (FLAG_correctness_fuzzer_suppressions) { FATAL("Aborting on excess zone allocation"); } - ReportError(CStrVector("Regular expression too large")); + ReportError(RegExpError::kTooLarge); } else { current_ = ReadNext<true>(); } @@ -132,15 +130,12 @@ bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { return false; } - -RegExpTree* RegExpParser::ReportError(Vector<const char> message) { +RegExpTree* RegExpParser::ReportError(RegExpError error) { if (failed_) return nullptr; // Do not overwrite any existing error. failed_ = true; - *error_ = isolate() - ->factory() - ->NewStringFromOneByte(Vector<const uint8_t>::cast(message)) - .ToHandleChecked(); - // Zip to the end to make sure the no more input is read. + error_ = error; + error_pos_ = position(); + // Zip to the end to make sure no more input is read. current_ = kEndMarker; next_pos_ = in()->length(); return nullptr; @@ -187,14 +182,14 @@ RegExpTree* RegExpParser::ParseDisjunction() { case kEndMarker: if (state->IsSubexpression()) { // Inside a parenthesized group when hitting end of input. - return ReportError(CStrVector("Unterminated group")); + return ReportError(RegExpError::kUnterminatedGroup); } DCHECK_EQ(INITIAL, state->group_type()); // Parsing completed successfully. return builder->ToRegExp(); case ')': { if (!state->IsSubexpression()) { - return ReportError(CStrVector("Unmatched ')'")); + return ReportError(RegExpError::kUnmatchedParen); } DCHECK_NE(INITIAL, state->group_type()); @@ -245,7 +240,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { case '*': case '+': case '?': - return ReportError(CStrVector("Nothing to repeat")); + return ReportError(RegExpError::kNothingToRepeat); case '^': { Advance(); if (builder->multiline()) { @@ -300,7 +295,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { case '\\': switch (Next()) { case kEndMarker: - return ReportError(CStrVector("\\ at end of pattern")); + return ReportError(RegExpError::kEscapeAtEndOfPattern); case 'b': Advance(2); builder->AddAssertion(new (zone()) RegExpAssertion( @@ -340,7 +335,8 @@ RegExpTree* RegExpParser::ParseDisjunction() { if (unicode()) { ZoneList<CharacterRange>* ranges = new (zone()) ZoneList<CharacterRange>(2, zone()); - std::vector<char> name_1, name_2; + ZoneVector<char> name_1(zone()); + ZoneVector<char> name_2(zone()); if (ParsePropertyClassName(&name_1, &name_2)) { if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) { RegExpCharacterClass* cc = new (zone()) @@ -356,7 +352,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { } } } - return ReportError(CStrVector("Invalid property name")); + return ReportError(RegExpError::kInvalidPropertyName); } else { builder->AddCharacter(p); } @@ -392,7 +388,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { // With /u, no identity escapes except for syntax characters // are allowed. Otherwise, all identity escapes are allowed. if (unicode()) { - return ReportError(CStrVector("Invalid escape")); + return ReportError(RegExpError::kInvalidEscape); } uc32 first_digit = Next(); if (first_digit == '8' || first_digit == '9') { @@ -406,7 +402,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { Advance(); if (unicode() && Next() >= '0' && Next() <= '9') { // With /u, decimal escape with leading 0 are not parsed as octal. - return ReportError(CStrVector("Invalid decimal escape")); + return ReportError(RegExpError::kInvalidDecimalEscape); } uc32 octal = ParseOctalLiteral(); builder->AddCharacter(octal); @@ -447,7 +443,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { // ES#prod-annexB-ExtendedPatternCharacter if (unicode()) { // With /u, invalid escapes are not treated as identity escapes. - return ReportError(CStrVector("Invalid unicode escape")); + return ReportError(RegExpError::kInvalidUnicodeEscape); } builder->AddCharacter('\\'); } else { @@ -465,7 +461,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { builder->AddCharacter('x'); } else { // With /u, invalid escapes are not treated as identity escapes. - return ReportError(CStrVector("Invalid escape")); + return ReportError(RegExpError::kInvalidEscape); } break; } @@ -478,7 +474,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { builder->AddCharacter('u'); } else { // With /u, invalid escapes are not treated as identity escapes. - return ReportError(CStrVector("Invalid Unicode escape")); + return ReportError(RegExpError::kInvalidUnicodeEscape); } break; } @@ -502,7 +498,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { builder->AddCharacter(current()); Advance(); } else { - return ReportError(CStrVector("Invalid escape")); + return ReportError(RegExpError::kInvalidEscape); } break; } @@ -510,13 +506,13 @@ RegExpTree* RegExpParser::ParseDisjunction() { case '{': { int dummy; bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED); - if (parsed) return ReportError(CStrVector("Nothing to repeat")); + if (parsed) return ReportError(RegExpError::kNothingToRepeat); V8_FALLTHROUGH; } case '}': case ']': if (unicode()) { - return ReportError(CStrVector("Lone quantifier brackets")); + return ReportError(RegExpError::kLoneQuantifierBrackets); } V8_FALLTHROUGH; default: @@ -551,13 +547,12 @@ RegExpTree* RegExpParser::ParseDisjunction() { case '{': if (ParseIntervalQuantifier(&min, &max)) { if (max < min) { - return ReportError( - CStrVector("numbers out of order in {} quantifier")); + return ReportError(RegExpError::kRangeOutOfOrder); } break; } else if (unicode()) { // With /u, incomplete quantifiers are not allowed. - return ReportError(CStrVector("Incomplete quantifier")); + return ReportError(RegExpError::kIncompleteQuantifier); } continue; default: @@ -573,7 +568,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { Advance(); } if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) { - return ReportError(CStrVector("Invalid quantifier")); + return ReportError(RegExpError::kInvalidQuantifier); } } } @@ -608,7 +603,7 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( case 's': case 'm': { if (!FLAG_regexp_mode_modifiers) { - ReportError(CStrVector("Invalid group")); + ReportError(RegExpError::kInvalidGroup); return nullptr; } Advance(); @@ -617,7 +612,7 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( switch (current()) { case '-': if (!flags_sense) { - ReportError(CStrVector("Multiple dashes in flag group")); + ReportError(RegExpError::kMultipleFlagDashes); return nullptr; } flags_sense = false; @@ -631,7 +626,7 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( if (current() == 'm') bit = JSRegExp::kMultiline; if (current() == 's') bit = JSRegExp::kDotAll; if (((switch_on | switch_off) & bit) != 0) { - ReportError(CStrVector("Repeated flag in flag group")); + ReportError(RegExpError::kRepeatedFlag); return nullptr; } if (flags_sense) { @@ -659,7 +654,7 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( subexpr_type = GROUPING; // Will break us out of the outer loop. continue; default: - ReportError(CStrVector("Invalid flag group")); + ReportError(RegExpError::kInvalidFlagGroup); return nullptr; } } @@ -683,13 +678,13 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( Advance(); break; default: - ReportError(CStrVector("Invalid group")); + ReportError(RegExpError::kInvalidGroup); return nullptr; } } if (subexpr_type == CAPTURE) { if (captures_started_ >= JSRegExp::kMaxCaptures) { - ReportError(CStrVector("Too many captures")); + ReportError(RegExpError::kTooManyCaptures); return nullptr; } captures_started_++; @@ -838,20 +833,20 @@ const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { if (c == '\\' && current() == 'u') { Advance(); if (!ParseUnicodeEscape(&c)) { - ReportError(CStrVector("Invalid Unicode escape sequence")); + ReportError(RegExpError::kInvalidUnicodeEscape); return nullptr; } } // The backslash char is misclassified as both ID_Start and ID_Continue. if (c == '\\') { - ReportError(CStrVector("Invalid capture group name")); + ReportError(RegExpError::kInvalidCaptureGroupName); return nullptr; } if (at_start) { if (!IsIdentifierStart(c)) { - ReportError(CStrVector("Invalid capture group name")); + ReportError(RegExpError::kInvalidCaptureGroupName); return nullptr; } push_code_unit(name, c); @@ -862,7 +857,7 @@ const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { } else if (IsIdentifierPart(c)) { push_code_unit(name, c); } else { - ReportError(CStrVector("Invalid capture group name")); + ReportError(RegExpError::kInvalidCaptureGroupName); return nullptr; } } @@ -889,7 +884,7 @@ bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, const auto& named_capture_it = named_captures_->find(capture); if (named_capture_it != named_captures_->end()) { - ReportError(CStrVector("Duplicate capture group name")); + ReportError(RegExpError::kDuplicateCaptureGroupName); return false; } } @@ -903,7 +898,7 @@ bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, RegExpParserState* state) { // The parser is assumed to be on the '<' in \k<name>. if (current() != '<') { - ReportError(CStrVector("Invalid named reference")); + ReportError(RegExpError::kInvalidNamedReference); return false; } @@ -936,7 +931,7 @@ void RegExpParser::PatchNamedBackReferences() { if (named_back_references_ == nullptr) return; if (named_captures_ == nullptr) { - ReportError(CStrVector("Invalid named capture referenced")); + ReportError(RegExpError::kInvalidNamedCaptureReference); return; } @@ -957,7 +952,7 @@ void RegExpParser::PatchNamedBackReferences() { if (capture_it != named_captures_->end()) { index = (*capture_it)->index(); } else { - ReportError(CStrVector("Invalid named capture referenced")); + ReportError(RegExpError::kInvalidNamedCaptureReference); return; } @@ -1378,8 +1373,8 @@ bool IsUnicodePropertyValueCharacter(char c) { } // anonymous namespace -bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1, - std::vector<char>* name_2) { +bool RegExpParser::ParsePropertyClassName(ZoneVector<char>* name_1, + ZoneVector<char>* name_2) { DCHECK(name_1->empty()); DCHECK(name_2->empty()); // Parse the property class as follows: @@ -1418,8 +1413,8 @@ bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1, bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate, - const std::vector<char>& name_1, - const std::vector<char>& name_2) { + const ZoneVector<char>& name_1, + const ZoneVector<char>& name_2) { if (name_2.empty()) { // First attempt to interpret as general category property value name. const char* name = name_1.data(); @@ -1456,7 +1451,7 @@ bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to, } } -RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name_1) { +RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector<char>& name_1) { if (!FLAG_harmony_regexp_sequence) return nullptr; const char* name = name_1.data(); const uc32* sequence_list = nullptr; @@ -1522,19 +1517,19 @@ RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name_1) { #else // V8_INTL_SUPPORT -bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1, - std::vector<char>* name_2) { +bool RegExpParser::ParsePropertyClassName(ZoneVector<char>* name_1, + ZoneVector<char>* name_2) { return false; } bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate, - const std::vector<char>& name_1, - const std::vector<char>& name_2) { + const ZoneVector<char>& name_1, + const ZoneVector<char>& name_2) { return false; } -RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name) { +RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector<char>& name) { return nullptr; } @@ -1598,7 +1593,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() { } if (unicode()) { // With /u, invalid escapes are not treated as identity escapes. - ReportError(CStrVector("Invalid class escape")); + ReportError(RegExpError::kInvalidClassEscape); return 0; } if ((controlLetter >= '0' && controlLetter <= '9') || @@ -1631,7 +1626,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() { // ES#prod-annexB-LegacyOctalEscapeSequence if (unicode()) { // With /u, decimal escape is not interpreted as octal character code. - ReportError(CStrVector("Invalid class escape")); + ReportError(RegExpError::kInvalidClassEscape); return 0; } return ParseOctalLiteral(); @@ -1641,7 +1636,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() { if (ParseHexEscape(2, &value)) return value; if (unicode()) { // With /u, invalid escapes are not treated as identity escapes. - ReportError(CStrVector("Invalid escape")); + ReportError(RegExpError::kInvalidEscape); return 0; } // If \x is not followed by a two-digit hexadecimal, treat it @@ -1654,7 +1649,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() { if (ParseUnicodeEscape(&value)) return value; if (unicode()) { // With /u, invalid escapes are not treated as identity escapes. - ReportError(CStrVector("Invalid unicode escape")); + ReportError(RegExpError::kInvalidUnicodeEscape); return 0; } // If \u is not followed by a two-digit hexadecimal, treat it @@ -1669,11 +1664,11 @@ uc32 RegExpParser::ParseClassCharacterEscape() { Advance(); return result; } - ReportError(CStrVector("Invalid escape")); + ReportError(RegExpError::kInvalidEscape); return 0; } } - return 0; + UNREACHABLE(); } void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges, @@ -1696,17 +1691,18 @@ void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges, return; } case kEndMarker: - ReportError(CStrVector("\\ at end of pattern")); + ReportError(RegExpError::kEscapeAtEndOfPattern); return; case 'p': case 'P': if (unicode()) { bool negate = Next() == 'P'; Advance(2); - std::vector<char> name_1, name_2; + ZoneVector<char> name_1(zone); + ZoneVector<char> name_2(zone); if (!ParsePropertyClassName(&name_1, &name_2) || !AddPropertyClassRange(ranges, negate, name_1, name_2)) { - ReportError(CStrVector("Invalid property name in character class")); + ReportError(RegExpError::kInvalidClassPropertyName); } *is_class_escape = true; return; @@ -1725,10 +1721,6 @@ void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges, } RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { - static const char* kUnterminated = "Unterminated character class"; - static const char* kRangeInvalid = "Invalid character class"; - static const char* kRangeOutOfOrder = "Range out of order in character class"; - DCHECK_EQ(current(), '['); Advance(); bool is_negated = false; @@ -1761,7 +1753,7 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { // Either end is an escaped character class. Treat the '-' verbatim. if (unicode()) { // ES2015 21.2.2.15.1 step 1. - return ReportError(CStrVector(kRangeInvalid)); + return ReportError(RegExpError::kInvalidCharacterClass); } if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); ranges->Add(CharacterRange::Singleton('-'), zone()); @@ -1770,7 +1762,7 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { } // ES2015 21.2.2.15.1 step 6. if (char_1 > char_2) { - return ReportError(CStrVector(kRangeOutOfOrder)); + return ReportError(RegExpError::kOutOfOrderCharacterClass); } ranges->Add(CharacterRange::Range(char_1, char_2), zone()); } else { @@ -1778,7 +1770,7 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { } } if (!has_more()) { - return ReportError(CStrVector(kUnterminated)); + return ReportError(RegExpError::kUnterminatedCharacterClass); } Advance(); RegExpCharacterClass::CharacterClassFlags character_class_flags; @@ -1795,14 +1787,16 @@ bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input, JSRegExp::Flags flags, RegExpCompileData* result) { DCHECK(result != nullptr); - RegExpParser parser(input, &result->error, flags, isolate, zone); + RegExpParser parser(input, flags, isolate, zone); RegExpTree* tree = parser.ParsePattern(); if (parser.failed()) { DCHECK(tree == nullptr); - DCHECK(!result->error.is_null()); + DCHECK(parser.error_ != RegExpError::kNone); + result->error = parser.error_; + result->error_pos = parser.error_pos_; } else { DCHECK(tree != nullptr); - DCHECK(result->error.is_null()); + DCHECK(parser.error_ == RegExpError::kNone); if (FLAG_trace_regexp_parser) { StdoutStream os; tree->Print(os, zone); diff --git a/js/src/regexp/regexp-parser.h b/js/src/regexp/regexp-parser.h index 91677d6c3..131d12161 100644 --- a/js/src/regexp/regexp-parser.h +++ b/js/src/regexp/regexp-parser.h @@ -6,6 +6,7 @@ #define V8_REGEXP_REGEXP_PARSER_H_ #include "regexp/regexp-ast.h" +#include "regexp/regexp-error.h" namespace v8 { namespace internal { @@ -150,8 +151,8 @@ class RegExpBuilder : public ZoneObject { class V8_EXPORT_PRIVATE RegExpParser { public: - RegExpParser(FlatStringReader* in, Handle<String>* error, - JSRegExp::Flags flags, Isolate* isolate, Zone* zone); + RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, Isolate* isolate, + Zone* zone); static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input, JSRegExp::Flags flags, RegExpCompileData* result); @@ -174,13 +175,13 @@ class V8_EXPORT_PRIVATE RegExpParser { bool ParseUnicodeEscape(uc32* value); bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value); - bool ParsePropertyClassName(std::vector<char>* name_1, - std::vector<char>* name_2); + bool ParsePropertyClassName(ZoneVector<char>* name_1, + ZoneVector<char>* name_2); bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate, - const std::vector<char>& name_1, - const std::vector<char>& name_2); + const ZoneVector<char>& name_1, + const ZoneVector<char>& name_2); - RegExpTree* GetPropertySequence(const std::vector<char>& name_1); + RegExpTree* GetPropertySequence(const ZoneVector<char>& name_1); RegExpTree* ParseCharacterClass(const RegExpBuilder* state); uc32 ParseOctalLiteral(); @@ -199,7 +200,7 @@ class V8_EXPORT_PRIVATE RegExpParser { char ParseClassEscape(); - RegExpTree* ReportError(Vector<const char> message); + RegExpTree* ReportError(RegExpError error); void Advance(); void Advance(int dist); void Reset(int pos); @@ -332,7 +333,8 @@ class V8_EXPORT_PRIVATE RegExpParser { Isolate* isolate_; Zone* zone_; - Handle<String>* error_; + RegExpError error_ = RegExpError::kNone; + int error_pos_ = 0; ZoneList<RegExpCapture*>* captures_; ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_; ZoneList<RegExpBackReference*>* named_back_references_; diff --git a/js/src/regexp/regexp-shim.h b/js/src/regexp/regexp-shim.h index 38b035727..462e396f4 100644 --- a/js/src/regexp/regexp-shim.h +++ b/js/src/regexp/regexp-shim.h @@ -60,6 +60,7 @@ class RegExpStack; #define DCHECK_NOT_NULL(val) MOZ_ASSERT((val) != nullptr) #define DCHECK_IMPLIES(lhs, rhs) MOZ_ASSERT_IF(lhs, rhs) #define CHECK MOZ_RELEASE_ASSERT +#define CHECK_LE(lhs, rhs) MOZ_RELEASE_ASSERT((lhs) <= (rhs)) template <class T> static constexpr inline T Min(T t1, T t2) { @@ -1009,7 +1010,7 @@ private: public: // An empty stub for telemetry we don't support - void IncreaseTotalRegexpCodeGenerated(int size) {} + void IncreaseTotalRegexpCodeGenerated(Handle<HeapObject> code) {} Counters* counters() { return &counters_; } @@ -1155,6 +1156,7 @@ extern bool FLAG_trace_regexp_parser; extern bool FLAG_trace_regexp_peephole_optimization; #define V8_USE_COMPUTED_GOTO 1 +#define COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER } // namespace internal } // namespace v8 diff --git a/js/src/regexp/regexp-stack.h b/js/src/regexp/regexp-stack.h index 812195ad1..0b452c005 100644 --- a/js/src/regexp/regexp-stack.h +++ b/js/src/regexp/regexp-stack.h @@ -36,6 +36,9 @@ class RegExpStackScope { class RegExpStack { public: + RegExpStack(); + ~RegExpStack(); + // Number of allocated locations on the stack below the limit. // No sequence of pushes must be longer that this without doing a stack-limit // check. @@ -75,9 +78,6 @@ class RegExpStack { static constexpr size_t kMaximumStackSize = 64 * MB; private: - RegExpStack(); - ~RegExpStack(); - // Artificial limit used when the thread-local state has been destroyed. static const Address kMemoryTop = static_cast<Address>(static_cast<uintptr_t>(-1)); diff --git a/js/src/regexp/regexp.h b/js/src/regexp/regexp.h index cce58da38..a36662b78 100644 --- a/js/src/regexp/regexp.h +++ b/js/src/regexp/regexp.h @@ -5,6 +5,7 @@ #ifndef V8_REGEXP_REGEXP_H_ #define V8_REGEXP_REGEXP_H_ +#include "regexp/regexp-error.h" #include "regexp/regexp-shim.h" namespace v8 { @@ -42,7 +43,11 @@ struct RegExpCompileData { // The error message. Only used if an error occurred during parsing or // compilation. - Handle<String> error; + RegExpError error = RegExpError::kNone; + + // The position at which the error was detected. Only used if an + // error occurred. + int error_pos = 0; // The number of capture groups, without the global capture \0. int capture_count = 0; diff --git a/js/src/regexp/special-case.cc b/js/src/regexp/special-case.cc index d60b98764..6b12d28d7 100644 --- a/js/src/regexp/special-case.cc +++ b/js/src/regexp/special-case.cc @@ -1,10 +1,15 @@ -// Copyright 2019 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. +// Copyright 2020 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that +// can be found in the LICENSE file. // Automatically generated by regexp/gen-regexp-special-case.cc -// The following functions are used to build icu::UnicodeSet -// for specical cases different between Unicode and ECMA262. + +// The following functions are used to build UnicodeSets +// for special cases where the case-folding algorithm used by +// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match +// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime +// Semantics: Canonicalize) step 3. + #ifdef V8_INTL_SUPPORT #include "regexp/special-case.h" @@ -14,14 +19,46 @@ namespace internal { icu::UnicodeSet BuildIgnoreSet() { icu::UnicodeSet set; + set.add(0xdf); + set.add(0x17f); + set.add(0x390); + set.add(0x3b0); set.add(0x3f4); + set.add(0x1e9e); + set.add(0x1f80, 0x1faf); + set.add(0x1fb3); + set.add(0x1fbc); + set.add(0x1fc3); + set.add(0x1fcc); + set.add(0x1fd3); + set.add(0x1fe3); + set.add(0x1ff3); + set.add(0x1ffc); set.add(0x2126); set.add(0x212a, 0x212b); + set.add(0xfb05, 0xfb06); set.freeze(); return set; } + +struct IgnoreSetData { + IgnoreSetData() : set(BuildIgnoreSet()) {} + const icu::UnicodeSet set; +}; + +//static +const icu::UnicodeSet& RegExpCaseFolding::IgnoreSet() { + static base::LazyInstance<IgnoreSetData>::type set = + LAZY_INSTANCE_INITIALIZER; + return set.Pointer()->set; +} + icu::UnicodeSet BuildSpecialAddSet() { icu::UnicodeSet set; + set.add(0x4b); + set.add(0x53); + set.add(0x6b); + set.add(0x73); set.add(0xc5); set.add(0xe5); set.add(0x398); @@ -33,6 +70,19 @@ icu::UnicodeSet BuildSpecialAddSet() { return set; } +struct SpecialAddSetData { + SpecialAddSetData() : set(BuildSpecialAddSet()) {} + const icu::UnicodeSet set; +}; + +//static +const icu::UnicodeSet& RegExpCaseFolding::SpecialAddSet() { + static base::LazyInstance<SpecialAddSetData>::type set = + LAZY_INSTANCE_INITIALIZER; + return set.Pointer()->set; +} + + } // namespace internal } // namespace v8 #endif // V8_INTL_SUPPORT diff --git a/js/src/regexp/special-case.h b/js/src/regexp/special-case.h index 1ccec5d31..3aca98302 100644 --- a/js/src/regexp/special-case.h +++ b/js/src/regexp/special-case.h @@ -6,70 +6,108 @@ #define V8_REGEXP_SPECIAL_CASE_H_ #ifdef V8_INTL_SUPPORT -#include "unicode/uversion.h" -namespace U_ICU_NAMESPACE { -class UnicodeSet; -} // namespace U_ICU_NAMESPACE +#include "regexp/regexp-shim.h" + +#include "unicode/uchar.h" +#include "unicode/uniset.h" +#include "unicode/unistr.h" namespace v8 { namespace internal { -// Functions to build special sets of Unicode characters that need special -// handling under "i" mode that cannot use closeOver(USET_CASE_INSENSITIVE). +// Sets of Unicode characters that need special handling under "i" mode + +// For non-unicode ignoreCase matches (aka "i", not "iu"), ECMA 262 +// defines slightly different case-folding rules than Unicode. An +// input character should match a pattern character if the result of +// the Canonicalize algorithm is the same for both characters. // -// For the characters in the "ignore set", the process should not treat other -// characters in the result of closeOver(USET_CASE_INSENSITIVE) as case -// equivlant under the ECMA262 RegExp "i" mode because these characters are -// uppercase themselves that no other characters in the set uppercase to. +// Roughly speaking, for "i" regexps, Canonicalize(c) is the same as +// c.toUpperCase(), unless a) c.toUpperCase() is a multi-character +// string, or b) c is non-ASCII, and c.toUpperCase() is ASCII. See +// https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch for +// the precise definition. // -// For the characters in the "special add set", the proecess should add only -// those characters in the result of closeOver(USET_CASE_INSENSITIVE) which is -// not uppercase characters as case equivlant under the ECMA262 RegExp "i" mode -// and also that ONE uppercase character that other non uppercase character -// uppercase into to the set. Other uppercase characters in the result of -// closeOver(USET_CASE_INSENSITIVE) should not be considered because ECMA262 -// RegExp "i" mode consider two characters as "case equivlant" if both -// characters uppercase to the same character. +// While compiling such regular expressions, we need to compute the +// set of characters that should match a given input character. (See +// GetCaseIndependentLetters and CharacterRange::AddCaseEquivalents.) +// For almost all characters, this can be efficiently computed using +// UnicodeSet::closeOver(USET_CASE_INSENSITIVE). These sets represent +// the remaining special cases. // -// For example, consider the following case equivalent set defined by Unicode -// standard. Notice there are more than one uppercase characters in this set: -// U+212B Å Angstrom Sign - an uppercase character. -// U+00C5 Å Latin Capital Letter A with Ring Above - an uppercase character. -// U+00E5 å Latin Small Letter A with Ring Above - a lowercase character which -// uppercase to U+00C5. -// In this case equivlant set is a special set and need special handling while -// considering "case equivlant" under the ECMA262 RegExp "i" mode which is -// different than Unicode Standard: -// * U+212B should be included into the "ignore" set because there are no other -// characters, under the ECMA262 "i" mode, are considered as "case equivlant" -// to it because U+212B is itself an uppercase but neither U+00C5 nor U+00E5 -// uppercase to U+212B. -// * U+00C5 and U+00E5 will both be included into the "special add" set. While -// calculate the "equivlant set" under ECMA262 "i" mode, the process will -// add U+00E5, because it is not an uppercase character in the set. The -// process will also add U+00C5, because it is the uppercase character which -// other non uppercase character, U+00C5, uppercase into. +// For a character c, the rules are as follows: // -// For characters not included in "ignore set" and "special add set", the -// process will just use closeOver(USET_CASE_INSENSITIVE) to calcualte, which is -// much faster. +// 1. If c is in neither IgnoreSet nor SpecialAddSet, then calling +// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) on a UnicodeSet +// containing c will produce the set of characters that should +// match /c/i (or /[c]/i), and only those characters. // -// Under Unicode 12.0, there are only 7 characters in the "special add set" and -// 4 characters in "ignore set" so even the special add process is slower, it is -// limited to a small set of cases only. +// 2. If c is in IgnoreSet, then the only character it should match is +// itself. However, closeOver will add additional incorrect +// matches. For example, consider SHARP S: 'ß' (U+00DF) and 'ẞ' +// (U+1E9E). Although closeOver('ß') = "ßẞ", uppercase('ß') is +// "SS". Step 3.e therefore requires that 'ß' canonicalizes to +// itself, and should not match 'ẞ'. In these cases, we can skip +// the closeOver entirely, because it will never add an equivalent +// character. // -// The implementation of these two function will be generated by calling ICU -// icu::UnicodeSet during the build time into gen/src/regexp/special-case.cc by -// the code in src/regexp/gen-regexp-special-case.cc. +// 3. If c is in SpecialAddSet, then it should match at least one +// character other than itself. However, closeOver will add at +// least one additional incorrect match. For example, consider the +// letter 'k'. Closing over 'k' gives "kKK" (lowercase k, uppercase +// K, U+212A KELVIN SIGN). However, because of step 3.g, KELVIN +// SIGN should not match either of the other two characters. As a +// result, "k" and "K" are in SpecialAddSet (and KELVIN SIGN is in +// IgnoreSet). To find the correct matches for characters in +// SpecialAddSet, we closeOver the original character, but filter +// out the results that do not have the same canonical value. // -// These two function will be used with LazyInstance<> template to generate -// global sharable set to reduce memory usage and speed up performance. +// The contents of these sets are calculated at build time by +// src/regexp/gen-regexp-special-case.cc, which generates +// gen/src/regexp/special-case.cc. This is done by iterating over the +// result of closeOver for each BMP character, and finding sets for +// which at least one character has a different canonical value than +// another character. Characters that match no other characters in +// their equivalence class are added to IgnoreSet. Characters that +// match at least one other character are added to SpecialAddSet. + +class RegExpCaseFolding final : public AllStatic { + public: + static const icu::UnicodeSet& IgnoreSet(); + static const icu::UnicodeSet& SpecialAddSet(); + + // This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics: + // Canonicalize) step 3, which is used to determine whether + // characters match when ignoreCase is true and unicode is false. + static UChar32 Canonicalize(UChar32 ch) { + // a. Assert: ch is a UTF-16 code unit. + CHECK_LE(ch, 0xffff); + + // b. Let s be the String value consisting of the single code unit ch. + icu::UnicodeString s(ch); + + // c. Let u be the same result produced as if by performing the algorithm + // for String.prototype.toUpperCase using s as the this value. + // d. Assert: Type(u) is String. + icu::UnicodeString& u = s.toUpper(); + + // e. If u does not consist of a single code unit, return ch. + if (u.length() != 1) { + return ch; + } + + // f. Let cu be u's single code unit element. + UChar32 cu = u.char32At(0); -// Function to build and return the Ignore set. -icu::UnicodeSet BuildIgnoreSet(); + // g. If the value of ch >= 128 and the value of cu < 128, return ch. + if (ch >= 128 && cu < 128) { + return ch; + } -// Function to build and return the Special Add set. -icu::UnicodeSet BuildSpecialAddSet(); + // h. Return cu. + return cu; + } +}; } // namespace internal } // namespace v8 diff --git a/js/src/regexp/update-headers.py b/js/src/regexp/update-headers.py deleted file mode 100644 index 0cff9d6ae..000000000 --- a/js/src/regexp/update-headers.py +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python - -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this file, -# You can obtain one at http://mozilla.org/MPL/2.0/. - -# -# This script modifies V8 regexp source files to make them suitable for -# inclusion in SpiderMonkey. Specifically, it: -# -# 1. Rewrites all #includes of V8 regexp headers to point to their location in -# the SM tree: src/regexp/* --> regexp/* -# 2. Removes all #includes of other V8 src/* headers. The required definitions -# will be provided by regexp-shim.h. -# -# Usage: -# cd js/src/regexp -# find . -name "*.h" -o -name "*.cc" | xargs ./update_headers.py -# - -import fileinput -import re -import sys - -# 1. Rewrite includes of V8 regexp headers -regexp_include = re.compile('#include "src/regexp') -regexp_include_new = '#include "regexp' - -# 2. Remove includes of other V8 headers -other_include = re.compile('#include "src/') - -for line in fileinput.input(inplace=1): - if regexp_include.search(line): - sys.stdout.write(re.sub(regexp_include, regexp_include_new, line)) - elif other_include.search(line): - pass - else: - sys.stdout.write(line) |