summaryrefslogtreecommitdiffstats
path: root/js/src/regexp/regexp-parser.cc
diff options
context:
space:
mode:
authorMatt A. Tobin <email@mattatobin.com>2020-11-04 19:46:11 -0500
committerMatt A. Tobin <email@mattatobin.com>2020-11-04 20:27:57 -0500
commit78b3a722b4b91c2482fed60d7e970a3f57645456 (patch)
tree717c2e8f2e1a110295f525e9cca666469dbe8049 /js/src/regexp/regexp-parser.cc
parent2e07199197e94ed02926c77bd3bd10d187b352b0 (diff)
downloadUXP-78b3a722b4b91c2482fed60d7e970a3f57645456.tar
UXP-78b3a722b4b91c2482fed60d7e970a3f57645456.tar.gz
UXP-78b3a722b4b91c2482fed60d7e970a3f57645456.tar.lz
UXP-78b3a722b4b91c2482fed60d7e970a3f57645456.tar.xz
UXP-78b3a722b4b91c2482fed60d7e970a3f57645456.zip
Issue #1677 - Part 1: Import new V8 regexp code with Mozilla's header modifications
Diffstat (limited to 'js/src/regexp/regexp-parser.cc')
-rw-r--r--js/src/regexp/regexp-parser.cc2115
1 files changed, 2115 insertions, 0 deletions
diff --git a/js/src/regexp/regexp-parser.cc b/js/src/regexp/regexp-parser.cc
new file mode 100644
index 000000000..377b94247
--- /dev/null
+++ b/js/src/regexp/regexp-parser.cc
@@ -0,0 +1,2115 @@
+// Copyright 2016 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "regexp/regexp-parser.h"
+
+#include <vector>
+
+#include "regexp/property-sequences.h"
+#include "regexp/regexp-macro-assembler.h"
+#include "regexp/regexp.h"
+
+#ifdef V8_INTL_SUPPORT
+#include "unicode/uniset.h"
+#endif // V8_INTL_SUPPORT
+
+namespace v8 {
+namespace internal {
+
+RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
+ JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
+ : isolate_(isolate),
+ zone_(zone),
+ error_(error),
+ captures_(nullptr),
+ named_captures_(nullptr),
+ named_back_references_(nullptr),
+ in_(in),
+ current_(kEndMarker),
+ top_level_flags_(flags),
+ next_pos_(0),
+ captures_started_(0),
+ capture_count_(0),
+ has_more_(true),
+ simple_(false),
+ contains_anchor_(false),
+ is_scanned_for_captures_(false),
+ has_named_captures_(false),
+ failed_(false) {
+ Advance();
+}
+
+template <bool update_position>
+inline uc32 RegExpParser::ReadNext() {
+ int position = next_pos_;
+ uc32 c0 = in()->Get(position);
+ position++;
+ // Read the whole surrogate pair in case of unicode flag, if possible.
+ if (unicode() && position < in()->length() &&
+ unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
+ uc16 c1 = in()->Get(position);
+ if (unibrow::Utf16::IsTrailSurrogate(c1)) {
+ c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
+ position++;
+ }
+ }
+ if (update_position) next_pos_ = position;
+ return c0;
+}
+
+
+uc32 RegExpParser::Next() {
+ if (has_next()) {
+ return ReadNext<false>();
+ } else {
+ return kEndMarker;
+ }
+}
+
+void RegExpParser::Advance() {
+ if (has_next()) {
+ StackLimitCheck check(isolate());
+ if (check.HasOverflowed()) {
+ if (FLAG_correctness_fuzzer_suppressions) {
+ FATAL("Aborting on stack overflow");
+ }
+ ReportError(CStrVector(
+ MessageFormatter::TemplateString(MessageTemplate::kStackOverflow)));
+ } else if (zone()->excess_allocation()) {
+ if (FLAG_correctness_fuzzer_suppressions) {
+ FATAL("Aborting on excess zone allocation");
+ }
+ ReportError(CStrVector("Regular expression too large"));
+ } else {
+ current_ = ReadNext<true>();
+ }
+ } else {
+ current_ = kEndMarker;
+ // Advance so that position() points to 1-after-the-last-character. This is
+ // important so that Reset() to this position works correctly.
+ next_pos_ = in()->length() + 1;
+ has_more_ = false;
+ }
+}
+
+
+void RegExpParser::Reset(int pos) {
+ next_pos_ = pos;
+ has_more_ = (pos < in()->length());
+ Advance();
+}
+
+void RegExpParser::Advance(int dist) {
+ next_pos_ += dist - 1;
+ Advance();
+}
+
+
+bool RegExpParser::simple() { return simple_; }
+
+bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
+ switch (c) {
+ case '^':
+ case '$':
+ case '\\':
+ case '.':
+ case '*':
+ case '+':
+ case '?':
+ case '(':
+ case ')':
+ case '[':
+ case ']':
+ case '{':
+ case '}':
+ case '|':
+ case '/':
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+
+RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
+ if (failed_) return nullptr; // Do not overwrite any existing error.
+ failed_ = true;
+ *error_ = isolate()
+ ->factory()
+ ->NewStringFromOneByte(Vector<const uint8_t>::cast(message))
+ .ToHandleChecked();
+ // Zip to the end to make sure the no more input is read.
+ current_ = kEndMarker;
+ next_pos_ = in()->length();
+ return nullptr;
+}
+
+#define CHECK_FAILED /**/); \
+ if (failed_) return nullptr; \
+ ((void)0
+
+// Pattern ::
+// Disjunction
+RegExpTree* RegExpParser::ParsePattern() {
+ RegExpTree* result = ParseDisjunction(CHECK_FAILED);
+ PatchNamedBackReferences(CHECK_FAILED);
+ DCHECK(!has_more());
+ // If the result of parsing is a literal string atom, and it has the
+ // same length as the input, then the atom is identical to the input.
+ if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
+ simple_ = true;
+ }
+ return result;
+}
+
+
+// Disjunction ::
+// Alternative
+// Alternative | Disjunction
+// Alternative ::
+// [empty]
+// Term Alternative
+// Term ::
+// Assertion
+// Atom
+// Atom Quantifier
+RegExpTree* RegExpParser::ParseDisjunction() {
+ // Used to store current state while parsing subexpressions.
+ RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
+ 0, nullptr, top_level_flags_, zone());
+ RegExpParserState* state = &initial_state;
+ // Cache the builder in a local variable for quick access.
+ RegExpBuilder* builder = initial_state.builder();
+ while (true) {
+ switch (current()) {
+ case kEndMarker:
+ if (state->IsSubexpression()) {
+ // Inside a parenthesized group when hitting end of input.
+ return ReportError(CStrVector("Unterminated group"));
+ }
+ DCHECK_EQ(INITIAL, state->group_type());
+ // Parsing completed successfully.
+ return builder->ToRegExp();
+ case ')': {
+ if (!state->IsSubexpression()) {
+ return ReportError(CStrVector("Unmatched ')'"));
+ }
+ DCHECK_NE(INITIAL, state->group_type());
+
+ Advance();
+ // End disjunction parsing and convert builder content to new single
+ // regexp atom.
+ RegExpTree* body = builder->ToRegExp();
+
+ int end_capture_index = captures_started();
+
+ int capture_index = state->capture_index();
+ SubexpressionType group_type = state->group_type();
+
+ // Build result of subexpression.
+ if (group_type == CAPTURE) {
+ if (state->IsNamedCapture()) {
+ CreateNamedCaptureAtIndex(state->capture_name(),
+ capture_index CHECK_FAILED);
+ }
+ RegExpCapture* capture = GetCapture(capture_index);
+ capture->set_body(body);
+ body = capture;
+ } else if (group_type == GROUPING) {
+ body = new (zone()) RegExpGroup(body);
+ } else {
+ DCHECK(group_type == POSITIVE_LOOKAROUND ||
+ group_type == NEGATIVE_LOOKAROUND);
+ bool is_positive = (group_type == POSITIVE_LOOKAROUND);
+ body = new (zone()) RegExpLookaround(
+ body, is_positive, end_capture_index - capture_index,
+ capture_index, state->lookaround_type());
+ }
+
+ // Restore previous state.
+ state = state->previous_state();
+ builder = state->builder();
+
+ builder->AddAtom(body);
+ // For compatibility with JSC and ES3, we allow quantifiers after
+ // lookaheads, and break in all cases.
+ break;
+ }
+ case '|': {
+ Advance();
+ builder->NewAlternative();
+ continue;
+ }
+ case '*':
+ case '+':
+ case '?':
+ return ReportError(CStrVector("Nothing to repeat"));
+ case '^': {
+ Advance();
+ if (builder->multiline()) {
+ builder->AddAssertion(new (zone()) RegExpAssertion(
+ RegExpAssertion::START_OF_LINE, builder->flags()));
+ } else {
+ builder->AddAssertion(new (zone()) RegExpAssertion(
+ RegExpAssertion::START_OF_INPUT, builder->flags()));
+ set_contains_anchor();
+ }
+ continue;
+ }
+ case '$': {
+ Advance();
+ RegExpAssertion::AssertionType assertion_type =
+ builder->multiline() ? RegExpAssertion::END_OF_LINE
+ : RegExpAssertion::END_OF_INPUT;
+ builder->AddAssertion(
+ new (zone()) RegExpAssertion(assertion_type, builder->flags()));
+ continue;
+ }
+ case '.': {
+ Advance();
+ ZoneList<CharacterRange>* ranges =
+ new (zone()) ZoneList<CharacterRange>(2, zone());
+
+ if (builder->dotall()) {
+ // Everything.
+ CharacterRange::AddClassEscape('*', ranges, false, zone());
+ } else {
+ // Everything except \x0A, \x0D, \u2028 and \u2029
+ CharacterRange::AddClassEscape('.', ranges, false, zone());
+ }
+
+ RegExpCharacterClass* cc =
+ new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags());
+ builder->AddCharacterClass(cc);
+ break;
+ }
+ case '(': {
+ state = ParseOpenParenthesis(state CHECK_FAILED);
+ builder = state->builder();
+ continue;
+ }
+ case '[': {
+ RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
+ builder->AddCharacterClass(cc->AsCharacterClass());
+ break;
+ }
+ // Atom ::
+ // \ AtomEscape
+ case '\\':
+ switch (Next()) {
+ case kEndMarker:
+ return ReportError(CStrVector("\\ at end of pattern"));
+ case 'b':
+ Advance(2);
+ builder->AddAssertion(new (zone()) RegExpAssertion(
+ RegExpAssertion::BOUNDARY, builder->flags()));
+ continue;
+ case 'B':
+ Advance(2);
+ builder->AddAssertion(new (zone()) RegExpAssertion(
+ RegExpAssertion::NON_BOUNDARY, builder->flags()));
+ continue;
+ // AtomEscape ::
+ // CharacterClassEscape
+ //
+ // CharacterClassEscape :: one of
+ // d D s S w W
+ case 'd':
+ case 'D':
+ case 's':
+ case 'S':
+ case 'w':
+ case 'W': {
+ uc32 c = Next();
+ Advance(2);
+ ZoneList<CharacterRange>* ranges =
+ new (zone()) ZoneList<CharacterRange>(2, zone());
+ CharacterRange::AddClassEscape(
+ c, ranges, unicode() && builder->ignore_case(), zone());
+ RegExpCharacterClass* cc = new (zone())
+ RegExpCharacterClass(zone(), ranges, builder->flags());
+ builder->AddCharacterClass(cc);
+ break;
+ }
+ case 'p':
+ case 'P': {
+ uc32 p = Next();
+ Advance(2);
+ if (unicode()) {
+ ZoneList<CharacterRange>* ranges =
+ new (zone()) ZoneList<CharacterRange>(2, zone());
+ std::vector<char> name_1, name_2;
+ if (ParsePropertyClassName(&name_1, &name_2)) {
+ if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) {
+ RegExpCharacterClass* cc = new (zone())
+ RegExpCharacterClass(zone(), ranges, builder->flags());
+ builder->AddCharacterClass(cc);
+ break;
+ }
+ if (p == 'p' && name_2.empty()) {
+ RegExpTree* sequence = GetPropertySequence(name_1);
+ if (sequence != nullptr) {
+ builder->AddAtom(sequence);
+ break;
+ }
+ }
+ }
+ return ReportError(CStrVector("Invalid property name"));
+ } else {
+ builder->AddCharacter(p);
+ }
+ break;
+ }
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9': {
+ int index = 0;
+ bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
+ if (is_backref) {
+ if (state->IsInsideCaptureGroup(index)) {
+ // The back reference is inside the capture group it refers to.
+ // Nothing can possibly have been captured yet, so we use empty
+ // instead. This ensures that, when checking a back reference,
+ // the capture registers of the referenced capture are either
+ // both set or both cleared.
+ builder->AddEmpty();
+ } else {
+ RegExpCapture* capture = GetCapture(index);
+ RegExpTree* atom =
+ new (zone()) RegExpBackReference(capture, builder->flags());
+ builder->AddAtom(atom);
+ }
+ break;
+ }
+ // With /u, no identity escapes except for syntax characters
+ // are allowed. Otherwise, all identity escapes are allowed.
+ if (unicode()) {
+ return ReportError(CStrVector("Invalid escape"));
+ }
+ uc32 first_digit = Next();
+ if (first_digit == '8' || first_digit == '9') {
+ builder->AddCharacter(first_digit);
+ Advance(2);
+ break;
+ }
+ V8_FALLTHROUGH;
+ }
+ case '0': {
+ Advance();
+ if (unicode() && Next() >= '0' && Next() <= '9') {
+ // With /u, decimal escape with leading 0 are not parsed as octal.
+ return ReportError(CStrVector("Invalid decimal escape"));
+ }
+ uc32 octal = ParseOctalLiteral();
+ builder->AddCharacter(octal);
+ break;
+ }
+ // ControlEscape :: one of
+ // f n r t v
+ case 'f':
+ Advance(2);
+ builder->AddCharacter('\f');
+ break;
+ case 'n':
+ Advance(2);
+ builder->AddCharacter('\n');
+ break;
+ case 'r':
+ Advance(2);
+ builder->AddCharacter('\r');
+ break;
+ case 't':
+ Advance(2);
+ builder->AddCharacter('\t');
+ break;
+ case 'v':
+ Advance(2);
+ builder->AddCharacter('\v');
+ break;
+ case 'c': {
+ Advance();
+ uc32 controlLetter = Next();
+ // Special case if it is an ASCII letter.
+ // Convert lower case letters to uppercase.
+ uc32 letter = controlLetter & ~('a' ^ 'A');
+ if (letter < 'A' || 'Z' < letter) {
+ // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
+ // Read the backslash as a literal character instead of as
+ // starting an escape.
+ // ES#prod-annexB-ExtendedPatternCharacter
+ if (unicode()) {
+ // With /u, invalid escapes are not treated as identity escapes.
+ return ReportError(CStrVector("Invalid unicode escape"));
+ }
+ builder->AddCharacter('\\');
+ } else {
+ Advance(2);
+ builder->AddCharacter(controlLetter & 0x1F);
+ }
+ break;
+ }
+ case 'x': {
+ Advance(2);
+ uc32 value;
+ if (ParseHexEscape(2, &value)) {
+ builder->AddCharacter(value);
+ } else if (!unicode()) {
+ builder->AddCharacter('x');
+ } else {
+ // With /u, invalid escapes are not treated as identity escapes.
+ return ReportError(CStrVector("Invalid escape"));
+ }
+ break;
+ }
+ case 'u': {
+ Advance(2);
+ uc32 value;
+ if (ParseUnicodeEscape(&value)) {
+ builder->AddEscapedUnicodeCharacter(value);
+ } else if (!unicode()) {
+ builder->AddCharacter('u');
+ } else {
+ // With /u, invalid escapes are not treated as identity escapes.
+ return ReportError(CStrVector("Invalid Unicode escape"));
+ }
+ break;
+ }
+ case 'k':
+ // Either an identity escape or a named back-reference. The two
+ // interpretations are mutually exclusive: '\k' is interpreted as
+ // an identity escape for non-Unicode patterns without named
+ // capture groups, and as the beginning of a named back-reference
+ // in all other cases.
+ if (unicode() || HasNamedCaptures()) {
+ Advance(2);
+ ParseNamedBackReference(builder, state CHECK_FAILED);
+ break;
+ }
+ V8_FALLTHROUGH;
+ default:
+ Advance();
+ // With /u, no identity escapes except for syntax characters
+ // are allowed. Otherwise, all identity escapes are allowed.
+ if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
+ builder->AddCharacter(current());
+ Advance();
+ } else {
+ return ReportError(CStrVector("Invalid escape"));
+ }
+ break;
+ }
+ break;
+ case '{': {
+ int dummy;
+ bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
+ if (parsed) return ReportError(CStrVector("Nothing to repeat"));
+ V8_FALLTHROUGH;
+ }
+ case '}':
+ case ']':
+ if (unicode()) {
+ return ReportError(CStrVector("Lone quantifier brackets"));
+ }
+ V8_FALLTHROUGH;
+ default:
+ builder->AddUnicodeCharacter(current());
+ Advance();
+ break;
+ } // end switch(current())
+
+ int min;
+ int max;
+ switch (current()) {
+ // QuantifierPrefix ::
+ // *
+ // +
+ // ?
+ // {
+ case '*':
+ min = 0;
+ max = RegExpTree::kInfinity;
+ Advance();
+ break;
+ case '+':
+ min = 1;
+ max = RegExpTree::kInfinity;
+ Advance();
+ break;
+ case '?':
+ min = 0;
+ max = 1;
+ Advance();
+ break;
+ case '{':
+ if (ParseIntervalQuantifier(&min, &max)) {
+ if (max < min) {
+ return ReportError(
+ CStrVector("numbers out of order in {} quantifier"));
+ }
+ break;
+ } else if (unicode()) {
+ // With /u, incomplete quantifiers are not allowed.
+ return ReportError(CStrVector("Incomplete quantifier"));
+ }
+ continue;
+ default:
+ continue;
+ }
+ RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
+ if (current() == '?') {
+ quantifier_type = RegExpQuantifier::NON_GREEDY;
+ Advance();
+ } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
+ // FLAG_regexp_possessive_quantifier is a debug-only flag.
+ quantifier_type = RegExpQuantifier::POSSESSIVE;
+ Advance();
+ }
+ if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
+ return ReportError(CStrVector("Invalid quantifier"));
+ }
+ }
+}
+
+RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
+ RegExpParserState* state) {
+ RegExpLookaround::Type lookaround_type = state->lookaround_type();
+ bool is_named_capture = false;
+ JSRegExp::Flags switch_on = JSRegExp::kNone;
+ JSRegExp::Flags switch_off = JSRegExp::kNone;
+ const ZoneVector<uc16>* capture_name = nullptr;
+ SubexpressionType subexpr_type = CAPTURE;
+ Advance();
+ if (current() == '?') {
+ switch (Next()) {
+ case ':':
+ Advance(2);
+ subexpr_type = GROUPING;
+ break;
+ case '=':
+ Advance(2);
+ lookaround_type = RegExpLookaround::LOOKAHEAD;
+ subexpr_type = POSITIVE_LOOKAROUND;
+ break;
+ case '!':
+ Advance(2);
+ lookaround_type = RegExpLookaround::LOOKAHEAD;
+ subexpr_type = NEGATIVE_LOOKAROUND;
+ break;
+ case '-':
+ case 'i':
+ case 's':
+ case 'm': {
+ if (!FLAG_regexp_mode_modifiers) {
+ ReportError(CStrVector("Invalid group"));
+ return nullptr;
+ }
+ Advance();
+ bool flags_sense = true; // Switching on flags.
+ while (subexpr_type != GROUPING) {
+ switch (current()) {
+ case '-':
+ if (!flags_sense) {
+ ReportError(CStrVector("Multiple dashes in flag group"));
+ return nullptr;
+ }
+ flags_sense = false;
+ Advance();
+ continue;
+ case 's':
+ case 'i':
+ case 'm': {
+ JSRegExp::Flags bit = JSRegExp::kUnicode;
+ if (current() == 'i') bit = JSRegExp::kIgnoreCase;
+ if (current() == 'm') bit = JSRegExp::kMultiline;
+ if (current() == 's') bit = JSRegExp::kDotAll;
+ if (((switch_on | switch_off) & bit) != 0) {
+ ReportError(CStrVector("Repeated flag in flag group"));
+ return nullptr;
+ }
+ if (flags_sense) {
+ switch_on |= bit;
+ } else {
+ switch_off |= bit;
+ }
+ Advance();
+ continue;
+ }
+ case ')': {
+ Advance();
+ state->builder()
+ ->FlushText(); // Flush pending text using old flags.
+ // These (?i)-style flag switches don't put us in a subexpression
+ // at all, they just modify the flags in the rest of the current
+ // subexpression.
+ JSRegExp::Flags flags =
+ (state->builder()->flags() | switch_on) & ~switch_off;
+ state->builder()->set_flags(flags);
+ return state;
+ }
+ case ':':
+ Advance();
+ subexpr_type = GROUPING; // Will break us out of the outer loop.
+ continue;
+ default:
+ ReportError(CStrVector("Invalid flag group"));
+ return nullptr;
+ }
+ }
+ break;
+ }
+ case '<':
+ Advance();
+ if (Next() == '=') {
+ Advance(2);
+ lookaround_type = RegExpLookaround::LOOKBEHIND;
+ subexpr_type = POSITIVE_LOOKAROUND;
+ break;
+ } else if (Next() == '!') {
+ Advance(2);
+ lookaround_type = RegExpLookaround::LOOKBEHIND;
+ subexpr_type = NEGATIVE_LOOKAROUND;
+ break;
+ }
+ is_named_capture = true;
+ has_named_captures_ = true;
+ Advance();
+ break;
+ default:
+ ReportError(CStrVector("Invalid group"));
+ return nullptr;
+ }
+ }
+ if (subexpr_type == CAPTURE) {
+ if (captures_started_ >= JSRegExp::kMaxCaptures) {
+ ReportError(CStrVector("Too many captures"));
+ return nullptr;
+ }
+ captures_started_++;
+
+ if (is_named_capture) {
+ capture_name = ParseCaptureGroupName(CHECK_FAILED);
+ }
+ }
+ JSRegExp::Flags flags = (state->builder()->flags() | switch_on) & ~switch_off;
+ // Store current state and begin new disjunction parsing.
+ return new (zone())
+ RegExpParserState(state, subexpr_type, lookaround_type, captures_started_,
+ capture_name, flags, zone());
+}
+
+#ifdef DEBUG
+// Currently only used in an DCHECK.
+static bool IsSpecialClassEscape(uc32 c) {
+ switch (c) {
+ case 'd':
+ case 'D':
+ case 's':
+ case 'S':
+ case 'w':
+ case 'W':
+ return true;
+ default:
+ return false;
+ }
+}
+#endif
+
+
+// In order to know whether an escape is a backreference or not we have to scan
+// the entire regexp and find the number of capturing parentheses. However we
+// don't want to scan the regexp twice unless it is necessary. This mini-parser
+// is called when needed. It can see the difference between capturing and
+// noncapturing parentheses and can skip character classes and backslash-escaped
+// characters.
+void RegExpParser::ScanForCaptures() {
+ DCHECK(!is_scanned_for_captures_);
+ const int saved_position = position();
+ // Start with captures started previous to current position
+ int capture_count = captures_started();
+ // Add count of captures after this position.
+ int n;
+ while ((n = current()) != kEndMarker) {
+ Advance();
+ switch (n) {
+ case '\\':
+ Advance();
+ break;
+ case '[': {
+ int c;
+ while ((c = current()) != kEndMarker) {
+ Advance();
+ if (c == '\\') {
+ Advance();
+ } else {
+ if (c == ']') break;
+ }
+ }
+ break;
+ }
+ case '(':
+ if (current() == '?') {
+ // At this point we could be in
+ // * a non-capturing group '(:',
+ // * a lookbehind assertion '(?<=' '(?<!'
+ // * or a named capture '(?<'.
+ //
+ // Of these, only named captures are capturing groups.
+
+ Advance();
+ if (current() != '<') break;
+
+ Advance();
+ if (current() == '=' || current() == '!') break;
+
+ // Found a possible named capture. It could turn out to be a syntax
+ // error (e.g. an unterminated or invalid name), but that distinction
+ // does not matter for our purposes.
+ has_named_captures_ = true;
+ }
+ capture_count++;
+ break;
+ }
+ }
+ capture_count_ = capture_count;
+ is_scanned_for_captures_ = true;
+ Reset(saved_position);
+}
+
+
+bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
+ DCHECK_EQ('\\', current());
+ DCHECK('1' <= Next() && Next() <= '9');
+ // Try to parse a decimal literal that is no greater than the total number
+ // of left capturing parentheses in the input.
+ int start = position();
+ int value = Next() - '0';
+ Advance(2);
+ while (true) {
+ uc32 c = current();
+ if (IsDecimalDigit(c)) {
+ value = 10 * value + (c - '0');
+ if (value > JSRegExp::kMaxCaptures) {
+ Reset(start);
+ return false;
+ }
+ Advance();
+ } else {
+ break;
+ }
+ }
+ if (value > captures_started()) {
+ if (!is_scanned_for_captures_) ScanForCaptures();
+ if (value > capture_count_) {
+ Reset(start);
+ return false;
+ }
+ }
+ *index_out = value;
+ return true;
+}
+
+static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
+ if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
+ v->push_back(code_unit);
+ } else {
+ v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
+ v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
+ }
+}
+
+const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
+ ZoneVector<uc16>* name =
+ new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
+
+ bool at_start = true;
+ while (true) {
+ uc32 c = current();
+ Advance();
+
+ // Convert unicode escapes.
+ if (c == '\\' && current() == 'u') {
+ Advance();
+ if (!ParseUnicodeEscape(&c)) {
+ ReportError(CStrVector("Invalid Unicode escape sequence"));
+ return nullptr;
+ }
+ }
+
+ // The backslash char is misclassified as both ID_Start and ID_Continue.
+ if (c == '\\') {
+ ReportError(CStrVector("Invalid capture group name"));
+ return nullptr;
+ }
+
+ if (at_start) {
+ if (!IsIdentifierStart(c)) {
+ ReportError(CStrVector("Invalid capture group name"));
+ return nullptr;
+ }
+ push_code_unit(name, c);
+ at_start = false;
+ } else {
+ if (c == '>') {
+ break;
+ } else if (IsIdentifierPart(c)) {
+ push_code_unit(name, c);
+ } else {
+ ReportError(CStrVector("Invalid capture group name"));
+ return nullptr;
+ }
+ }
+ }
+
+ return name;
+}
+
+bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
+ int index) {
+ DCHECK(0 < index && index <= captures_started_);
+ DCHECK_NOT_NULL(name);
+
+ RegExpCapture* capture = GetCapture(index);
+ DCHECK_NULL(capture->name());
+
+ capture->set_name(name);
+
+ if (named_captures_ == nullptr) {
+ named_captures_ = new (zone_->New(sizeof(*named_captures_)))
+ ZoneSet<RegExpCapture*, RegExpCaptureNameLess>(zone());
+ } else {
+ // Check for duplicates and bail if we find any.
+
+ const auto& named_capture_it = named_captures_->find(capture);
+ if (named_capture_it != named_captures_->end()) {
+ ReportError(CStrVector("Duplicate capture group name"));
+ return false;
+ }
+ }
+
+ named_captures_->emplace(capture);
+
+ return true;
+}
+
+bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
+ RegExpParserState* state) {
+ // The parser is assumed to be on the '<' in \k<name>.
+ if (current() != '<') {
+ ReportError(CStrVector("Invalid named reference"));
+ return false;
+ }
+
+ Advance();
+ const ZoneVector<uc16>* name = ParseCaptureGroupName();
+ if (name == nullptr) {
+ return false;
+ }
+
+ if (state->IsInsideCaptureGroup(name)) {
+ builder->AddEmpty();
+ } else {
+ RegExpBackReference* atom =
+ new (zone()) RegExpBackReference(builder->flags());
+ atom->set_name(name);
+
+ builder->AddAtom(atom);
+
+ if (named_back_references_ == nullptr) {
+ named_back_references_ =
+ new (zone()) ZoneList<RegExpBackReference*>(1, zone());
+ }
+ named_back_references_->Add(atom, zone());
+ }
+
+ return true;
+}
+
+void RegExpParser::PatchNamedBackReferences() {
+ if (named_back_references_ == nullptr) return;
+
+ if (named_captures_ == nullptr) {
+ ReportError(CStrVector("Invalid named capture referenced"));
+ return;
+ }
+
+ // Look up and patch the actual capture for each named back reference.
+
+ for (int i = 0; i < named_back_references_->length(); i++) {
+ RegExpBackReference* ref = named_back_references_->at(i);
+
+ // Capture used to search the named_captures_ by name, index of the
+ // capture is never used.
+ static const int kInvalidIndex = 0;
+ RegExpCapture* search_capture = new (zone()) RegExpCapture(kInvalidIndex);
+ DCHECK_NULL(search_capture->name());
+ search_capture->set_name(ref->name());
+
+ int index = -1;
+ const auto& capture_it = named_captures_->find(search_capture);
+ if (capture_it != named_captures_->end()) {
+ index = (*capture_it)->index();
+ } else {
+ ReportError(CStrVector("Invalid named capture referenced"));
+ return;
+ }
+
+ ref->set_capture(GetCapture(index));
+ }
+}
+
+RegExpCapture* RegExpParser::GetCapture(int index) {
+ // The index for the capture groups are one-based. Its index in the list is
+ // zero-based.
+ int know_captures =
+ is_scanned_for_captures_ ? capture_count_ : captures_started_;
+ DCHECK(index <= know_captures);
+ if (captures_ == nullptr) {
+ captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
+ }
+ while (captures_->length() < know_captures) {
+ captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
+ }
+ return captures_->at(index - 1);
+}
+
+namespace {
+
+struct RegExpCaptureIndexLess {
+ bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
+ DCHECK_NOT_NULL(lhs);
+ DCHECK_NOT_NULL(rhs);
+ return lhs->index() < rhs->index();
+ }
+};
+
+} // namespace
+
+Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
+ if (named_captures_ == nullptr || named_captures_->empty()) {
+ return Handle<FixedArray>();
+ }
+
+ // Named captures are sorted by name (because the set is used to ensure
+ // name uniqueness). But the capture name map must to be sorted by index.
+
+ ZoneVector<RegExpCapture*> sorted_named_captures(
+ named_captures_->begin(), named_captures_->end(), zone());
+ std::sort(sorted_named_captures.begin(), sorted_named_captures.end(),
+ RegExpCaptureIndexLess{});
+ DCHECK_EQ(sorted_named_captures.size(), named_captures_->size());
+
+ Factory* factory = isolate()->factory();
+
+ int len = static_cast<int>(sorted_named_captures.size()) * 2;
+ Handle<FixedArray> array = factory->NewFixedArray(len);
+
+ int i = 0;
+ for (const auto& capture : sorted_named_captures) {
+ Vector<const uc16> capture_name(capture->name()->data(),
+ capture->name()->size());
+ // CSA code in ConstructNewResultFromMatchInfo requires these strings to be
+ // internalized so they can be used as property names in the 'exec' results.
+ Handle<String> name = factory->InternalizeString(capture_name);
+ array->set(i * 2, *name);
+ array->set(i * 2 + 1, Smi::FromInt(capture->index()));
+
+ i++;
+ }
+ DCHECK_EQ(i * 2, len);
+
+ return array;
+}
+
+bool RegExpParser::HasNamedCaptures() {
+ if (has_named_captures_ || is_scanned_for_captures_) {
+ return has_named_captures_;
+ }
+
+ ScanForCaptures();
+ DCHECK(is_scanned_for_captures_);
+ return has_named_captures_;
+}
+
+bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
+ for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
+ if (s->group_type() != CAPTURE) continue;
+ // Return true if we found the matching capture index.
+ if (index == s->capture_index()) return true;
+ // Abort if index is larger than what has been parsed up till this state.
+ if (index > s->capture_index()) return false;
+ }
+ return false;
+}
+
+bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
+ const ZoneVector<uc16>* name) {
+ DCHECK_NOT_NULL(name);
+ for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
+ if (s->capture_name() == nullptr) continue;
+ if (*s->capture_name() == *name) return true;
+ }
+ return false;
+}
+
+// QuantifierPrefix ::
+// { DecimalDigits }
+// { DecimalDigits , }
+// { DecimalDigits , DecimalDigits }
+//
+// Returns true if parsing succeeds, and set the min_out and max_out
+// values. Values are truncated to RegExpTree::kInfinity if they overflow.
+bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
+ DCHECK_EQ(current(), '{');
+ int start = position();
+ Advance();
+ int min = 0;
+ if (!IsDecimalDigit(current())) {
+ Reset(start);
+ return false;
+ }
+ while (IsDecimalDigit(current())) {
+ int next = current() - '0';
+ if (min > (RegExpTree::kInfinity - next) / 10) {
+ // Overflow. Skip past remaining decimal digits and return -1.
+ do {
+ Advance();
+ } while (IsDecimalDigit(current()));
+ min = RegExpTree::kInfinity;
+ break;
+ }
+ min = 10 * min + next;
+ Advance();
+ }
+ int max = 0;
+ if (current() == '}') {
+ max = min;
+ Advance();
+ } else if (current() == ',') {
+ Advance();
+ if (current() == '}') {
+ max = RegExpTree::kInfinity;
+ Advance();
+ } else {
+ while (IsDecimalDigit(current())) {
+ int next = current() - '0';
+ if (max > (RegExpTree::kInfinity - next) / 10) {
+ do {
+ Advance();
+ } while (IsDecimalDigit(current()));
+ max = RegExpTree::kInfinity;
+ break;
+ }
+ max = 10 * max + next;
+ Advance();
+ }
+ if (current() != '}') {
+ Reset(start);
+ return false;
+ }
+ Advance();
+ }
+ } else {
+ Reset(start);
+ return false;
+ }
+ *min_out = min;
+ *max_out = max;
+ return true;
+}
+
+
+uc32 RegExpParser::ParseOctalLiteral() {
+ DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
+ // For compatibility with some other browsers (not all), we parse
+ // up to three octal digits with a value below 256.
+ // ES#prod-annexB-LegacyOctalEscapeSequence
+ uc32 value = current() - '0';
+ Advance();
+ if ('0' <= current() && current() <= '7') {
+ value = value * 8 + current() - '0';
+ Advance();
+ if (value < 32 && '0' <= current() && current() <= '7') {
+ value = value * 8 + current() - '0';
+ Advance();
+ }
+ }
+ return value;
+}
+
+
+bool RegExpParser::ParseHexEscape(int length, uc32* value) {
+ int start = position();
+ uc32 val = 0;
+ for (int i = 0; i < length; ++i) {
+ uc32 c = current();
+ int d = HexValue(c);
+ if (d < 0) {
+ Reset(start);
+ return false;
+ }
+ val = val * 16 + d;
+ Advance();
+ }
+ *value = val;
+ return true;
+}
+
+// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
+bool RegExpParser::ParseUnicodeEscape(uc32* value) {
+ // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
+ // allowed). In the latter case, the number of hex digits between { } is
+ // arbitrary. \ and u have already been read.
+ if (current() == '{' && unicode()) {
+ int start = position();
+ Advance();
+ if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {
+ if (current() == '}') {
+ Advance();
+ return true;
+ }
+ }
+ Reset(start);
+ return false;
+ }
+ // \u but no {, or \u{...} escapes not allowed.
+ bool result = ParseHexEscape(4, value);
+ if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
+ current() == '\\') {
+ // Attempt to read trail surrogate.
+ int start = position();
+ if (Next() == 'u') {
+ Advance(2);
+ uc32 trail;
+ if (ParseHexEscape(4, &trail) &&
+ unibrow::Utf16::IsTrailSurrogate(trail)) {
+ *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
+ static_cast<uc16>(trail));
+ return true;
+ }
+ }
+ Reset(start);
+ }
+ return result;
+}
+
+#ifdef V8_INTL_SUPPORT
+
+namespace {
+
+bool IsExactPropertyAlias(const char* property_name, UProperty property) {
+ const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
+ if (short_name != nullptr && strcmp(property_name, short_name) == 0)
+ return true;
+ for (int i = 0;; i++) {
+ const char* long_name = u_getPropertyName(
+ property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
+ if (long_name == nullptr) break;
+ if (strcmp(property_name, long_name) == 0) return true;
+ }
+ return false;
+}
+
+bool IsExactPropertyValueAlias(const char* property_value_name,
+ UProperty property, int32_t property_value) {
+ const char* short_name =
+ u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
+ if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
+ return true;
+ }
+ for (int i = 0;; i++) {
+ const char* long_name = u_getPropertyValueName(
+ property, property_value,
+ static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
+ if (long_name == nullptr) break;
+ if (strcmp(property_value_name, long_name) == 0) return true;
+ }
+ return false;
+}
+
+bool LookupPropertyValueName(UProperty property,
+ const char* property_value_name, bool negate,
+ ZoneList<CharacterRange>* result, Zone* zone) {
+ UProperty property_for_lookup = property;
+ if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
+ // For the property Script_Extensions, we have to do the property value
+ // name lookup as if the property is Script.
+ property_for_lookup = UCHAR_SCRIPT;
+ }
+ int32_t property_value =
+ u_getPropertyValueEnum(property_for_lookup, property_value_name);
+ if (property_value == UCHAR_INVALID_CODE) return false;
+
+ // We require the property name to match exactly to one of the property value
+ // aliases. However, u_getPropertyValueEnum uses loose matching.
+ if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
+ property_value)) {
+ return false;
+ }
+
+ UErrorCode ec = U_ZERO_ERROR;
+ icu::UnicodeSet set;
+ set.applyIntPropertyValue(property, property_value, ec);
+ bool success = ec == U_ZERO_ERROR && !set.isEmpty();
+
+ if (success) {
+ set.removeAllStrings();
+ if (negate) set.complement();
+ for (int i = 0; i < set.getRangeCount(); i++) {
+ result->Add(
+ CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
+ zone);
+ }
+ }
+ return success;
+}
+
+template <size_t N>
+inline bool NameEquals(const char* name, const char (&literal)[N]) {
+ return strncmp(name, literal, N + 1) == 0;
+}
+
+bool LookupSpecialPropertyValueName(const char* name,
+ ZoneList<CharacterRange>* result,
+ bool negate, Zone* zone) {
+ if (NameEquals(name, "Any")) {
+ if (negate) {
+ // Leave the list of character ranges empty, since the negation of 'Any'
+ // is the empty set.
+ } else {
+ result->Add(CharacterRange::Everything(), zone);
+ }
+ } else if (NameEquals(name, "ASCII")) {
+ result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
+ : CharacterRange::Range(0x0, 0x7F),
+ zone);
+ } else if (NameEquals(name, "Assigned")) {
+ return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
+ !negate, result, zone);
+ } else {
+ return false;
+ }
+ return true;
+}
+
+// Explicitly whitelist supported binary properties. The spec forbids supporting
+// properties outside of this set to ensure interoperability.
+bool IsSupportedBinaryProperty(UProperty property) {
+ switch (property) {
+ case UCHAR_ALPHABETIC:
+ // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
+ // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
+ case UCHAR_ASCII_HEX_DIGIT:
+ // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
+ case UCHAR_BIDI_CONTROL:
+ case UCHAR_BIDI_MIRRORED:
+ case UCHAR_CASE_IGNORABLE:
+ case UCHAR_CASED:
+ case UCHAR_CHANGES_WHEN_CASEFOLDED:
+ case UCHAR_CHANGES_WHEN_CASEMAPPED:
+ case UCHAR_CHANGES_WHEN_LOWERCASED:
+ case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
+ case UCHAR_CHANGES_WHEN_TITLECASED:
+ case UCHAR_CHANGES_WHEN_UPPERCASED:
+ case UCHAR_DASH:
+ case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
+ case UCHAR_DEPRECATED:
+ case UCHAR_DIACRITIC:
+ case UCHAR_EMOJI:
+ case UCHAR_EMOJI_COMPONENT:
+ case UCHAR_EMOJI_MODIFIER_BASE:
+ case UCHAR_EMOJI_MODIFIER:
+ case UCHAR_EMOJI_PRESENTATION:
+ case UCHAR_EXTENDED_PICTOGRAPHIC:
+ case UCHAR_EXTENDER:
+ case UCHAR_GRAPHEME_BASE:
+ case UCHAR_GRAPHEME_EXTEND:
+ case UCHAR_HEX_DIGIT:
+ case UCHAR_ID_CONTINUE:
+ case UCHAR_ID_START:
+ case UCHAR_IDEOGRAPHIC:
+ case UCHAR_IDS_BINARY_OPERATOR:
+ case UCHAR_IDS_TRINARY_OPERATOR:
+ case UCHAR_JOIN_CONTROL:
+ case UCHAR_LOGICAL_ORDER_EXCEPTION:
+ case UCHAR_LOWERCASE:
+ case UCHAR_MATH:
+ case UCHAR_NONCHARACTER_CODE_POINT:
+ case UCHAR_PATTERN_SYNTAX:
+ case UCHAR_PATTERN_WHITE_SPACE:
+ case UCHAR_QUOTATION_MARK:
+ case UCHAR_RADICAL:
+ case UCHAR_REGIONAL_INDICATOR:
+ case UCHAR_S_TERM:
+ case UCHAR_SOFT_DOTTED:
+ case UCHAR_TERMINAL_PUNCTUATION:
+ case UCHAR_UNIFIED_IDEOGRAPH:
+ case UCHAR_UPPERCASE:
+ case UCHAR_VARIATION_SELECTOR:
+ case UCHAR_WHITE_SPACE:
+ case UCHAR_XID_CONTINUE:
+ case UCHAR_XID_START:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+bool IsUnicodePropertyValueCharacter(char c) {
+ // https://tc39.github.io/proposal-regexp-unicode-property-escapes/
+ //
+ // Note that using this to validate each parsed char is quite conservative.
+ // A possible alternative solution would be to only ensure the parsed
+ // property name/value candidate string does not contain '\0' characters and
+ // let ICU lookups trigger the final failure.
+ if ('a' <= c && c <= 'z') return true;
+ if ('A' <= c && c <= 'Z') return true;
+ if ('0' <= c && c <= '9') return true;
+ return (c == '_');
+}
+
+} // anonymous namespace
+
+bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
+ std::vector<char>* name_2) {
+ DCHECK(name_1->empty());
+ DCHECK(name_2->empty());
+ // Parse the property class as follows:
+ // - In \p{name}, 'name' is interpreted
+ // - either as a general category property value name.
+ // - or as a binary property name.
+ // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
+ // and 'value' is interpreted as one of the available property value names.
+ // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
+ // - Loose matching is not applied.
+ if (current() == '{') {
+ // Parse \p{[PropertyName=]PropertyNameValue}
+ for (Advance(); current() != '}' && current() != '='; Advance()) {
+ if (!IsUnicodePropertyValueCharacter(current())) return false;
+ if (!has_next()) return false;
+ name_1->push_back(static_cast<char>(current()));
+ }
+ if (current() == '=') {
+ for (Advance(); current() != '}'; Advance()) {
+ if (!IsUnicodePropertyValueCharacter(current())) return false;
+ if (!has_next()) return false;
+ name_2->push_back(static_cast<char>(current()));
+ }
+ name_2->push_back(0); // null-terminate string.
+ }
+ } else {
+ return false;
+ }
+ Advance();
+ name_1->push_back(0); // null-terminate string.
+
+ DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));
+ DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));
+ return true;
+}
+
+bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
+ bool negate,
+ const std::vector<char>& name_1,
+ const std::vector<char>& name_2) {
+ if (name_2.empty()) {
+ // First attempt to interpret as general category property value name.
+ const char* name = name_1.data();
+ if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
+ add_to, zone())) {
+ return true;
+ }
+ // Interpret "Any", "ASCII", and "Assigned".
+ if (LookupSpecialPropertyValueName(name, add_to, negate, zone())) {
+ return true;
+ }
+ // Then attempt to interpret as binary property name with value name 'Y'.
+ UProperty property = u_getPropertyEnum(name);
+ if (!IsSupportedBinaryProperty(property)) return false;
+ if (!IsExactPropertyAlias(name, property)) return false;
+ return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to,
+ zone());
+ } else {
+ // Both property name and value name are specified. Attempt to interpret
+ // the property name as enumerated property.
+ const char* property_name = name_1.data();
+ const char* value_name = name_2.data();
+ UProperty property = u_getPropertyEnum(property_name);
+ if (!IsExactPropertyAlias(property_name, property)) return false;
+ if (property == UCHAR_GENERAL_CATEGORY) {
+ // We want to allow aggregate value names such as "Letter".
+ property = UCHAR_GENERAL_CATEGORY_MASK;
+ } else if (property != UCHAR_SCRIPT &&
+ property != UCHAR_SCRIPT_EXTENSIONS) {
+ return false;
+ }
+ return LookupPropertyValueName(property, value_name, negate, add_to,
+ zone());
+ }
+}
+
+RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name_1) {
+ if (!FLAG_harmony_regexp_sequence) return nullptr;
+ const char* name = name_1.data();
+ const uc32* sequence_list = nullptr;
+ JSRegExp::Flags flags = JSRegExp::kUnicode;
+ if (NameEquals(name, "Emoji_Flag_Sequence")) {
+ sequence_list = UnicodePropertySequences::kEmojiFlagSequences;
+ } else if (NameEquals(name, "Emoji_Tag_Sequence")) {
+ sequence_list = UnicodePropertySequences::kEmojiTagSequences;
+ } else if (NameEquals(name, "Emoji_ZWJ_Sequence")) {
+ sequence_list = UnicodePropertySequences::kEmojiZWJSequences;
+ }
+ if (sequence_list != nullptr) {
+ // TODO(yangguo): this creates huge regexp code. Alternative to this is
+ // to create a new operator that checks for these sequences at runtime.
+ RegExpBuilder builder(zone(), flags);
+ while (true) { // Iterate through list of sequences.
+ while (*sequence_list != 0) { // Iterate through sequence.
+ builder.AddUnicodeCharacter(*sequence_list);
+ sequence_list++;
+ }
+ sequence_list++;
+ if (*sequence_list == 0) break;
+ builder.NewAlternative();
+ }
+ return builder.ToRegExp();
+ }
+
+ if (NameEquals(name, "Emoji_Keycap_Sequence")) {
+ // https://unicode.org/reports/tr51/#def_emoji_keycap_sequence
+ // emoji_keycap_sequence := [0-9#*] \x{FE0F 20E3}
+ RegExpBuilder builder(zone(), flags);
+ ZoneList<CharacterRange>* prefix_ranges =
+ new (zone()) ZoneList<CharacterRange>(2, zone());
+ prefix_ranges->Add(CharacterRange::Range('0', '9'), zone());
+ prefix_ranges->Add(CharacterRange::Singleton('#'), zone());
+ prefix_ranges->Add(CharacterRange::Singleton('*'), zone());
+ builder.AddCharacterClass(
+ new (zone()) RegExpCharacterClass(zone(), prefix_ranges, flags));
+ builder.AddCharacter(0xFE0F);
+ builder.AddCharacter(0x20E3);
+ return builder.ToRegExp();
+ } else if (NameEquals(name, "Emoji_Modifier_Sequence")) {
+ // https://unicode.org/reports/tr51/#def_emoji_modifier_sequence
+ // emoji_modifier_sequence := emoji_modifier_base emoji_modifier
+ RegExpBuilder builder(zone(), flags);
+ ZoneList<CharacterRange>* modifier_base_ranges =
+ new (zone()) ZoneList<CharacterRange>(2, zone());
+ LookupPropertyValueName(UCHAR_EMOJI_MODIFIER_BASE, "Y", false,
+ modifier_base_ranges, zone());
+ builder.AddCharacterClass(
+ new (zone()) RegExpCharacterClass(zone(), modifier_base_ranges, flags));
+ ZoneList<CharacterRange>* modifier_ranges =
+ new (zone()) ZoneList<CharacterRange>(2, zone());
+ LookupPropertyValueName(UCHAR_EMOJI_MODIFIER, "Y", false, modifier_ranges,
+ zone());
+ builder.AddCharacterClass(
+ new (zone()) RegExpCharacterClass(zone(), modifier_ranges, flags));
+ return builder.ToRegExp();
+ }
+
+ return nullptr;
+}
+
+#else // V8_INTL_SUPPORT
+
+bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
+ std::vector<char>* name_2) {
+ return false;
+}
+
+bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
+ bool negate,
+ const std::vector<char>& name_1,
+ const std::vector<char>& name_2) {
+ return false;
+}
+
+RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name) {
+ return nullptr;
+}
+
+#endif // V8_INTL_SUPPORT
+
+bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
+ uc32 x = 0;
+ int d = HexValue(current());
+ if (d < 0) {
+ return false;
+ }
+ while (d >= 0) {
+ x = x * 16 + d;
+ if (x > max_value) {
+ return false;
+ }
+ Advance();
+ d = HexValue(current());
+ }
+ *value = x;
+ return true;
+}
+
+
+uc32 RegExpParser::ParseClassCharacterEscape() {
+ DCHECK_EQ('\\', current());
+ DCHECK(has_next() && !IsSpecialClassEscape(Next()));
+ Advance();
+ switch (current()) {
+ case 'b':
+ Advance();
+ return '\b';
+ // ControlEscape :: one of
+ // f n r t v
+ case 'f':
+ Advance();
+ return '\f';
+ case 'n':
+ Advance();
+ return '\n';
+ case 'r':
+ Advance();
+ return '\r';
+ case 't':
+ Advance();
+ return '\t';
+ case 'v':
+ Advance();
+ return '\v';
+ case 'c': {
+ uc32 controlLetter = Next();
+ uc32 letter = controlLetter & ~('A' ^ 'a');
+ // Inside a character class, we also accept digits and underscore as
+ // control characters, unless with /u. See Annex B:
+ // ES#prod-annexB-ClassControlLetter
+ if (letter >= 'A' && letter <= 'Z') {
+ Advance(2);
+ // Control letters mapped to ASCII control characters in the range
+ // 0x00-0x1F.
+ return controlLetter & 0x1F;
+ }
+ if (unicode()) {
+ // With /u, invalid escapes are not treated as identity escapes.
+ ReportError(CStrVector("Invalid class escape"));
+ return 0;
+ }
+ if ((controlLetter >= '0' && controlLetter <= '9') ||
+ controlLetter == '_') {
+ Advance(2);
+ return controlLetter & 0x1F;
+ }
+ // We match JSC in reading the backslash as a literal
+ // character instead of as starting an escape.
+ // TODO(v8:6201): Not yet covered by the spec.
+ return '\\';
+ }
+ case '0':
+ // With /u, \0 is interpreted as NUL if not followed by another digit.
+ if (unicode() && !(Next() >= '0' && Next() <= '9')) {
+ Advance();
+ return 0;
+ }
+ V8_FALLTHROUGH;
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ // For compatibility, we interpret a decimal escape that isn't
+ // a back reference (and therefore either \0 or not valid according
+ // to the specification) as a 1..3 digit octal character code.
+ // ES#prod-annexB-LegacyOctalEscapeSequence
+ if (unicode()) {
+ // With /u, decimal escape is not interpreted as octal character code.
+ ReportError(CStrVector("Invalid class escape"));
+ return 0;
+ }
+ return ParseOctalLiteral();
+ case 'x': {
+ Advance();
+ uc32 value;
+ if (ParseHexEscape(2, &value)) return value;
+ if (unicode()) {
+ // With /u, invalid escapes are not treated as identity escapes.
+ ReportError(CStrVector("Invalid escape"));
+ return 0;
+ }
+ // If \x is not followed by a two-digit hexadecimal, treat it
+ // as an identity escape.
+ return 'x';
+ }
+ case 'u': {
+ Advance();
+ uc32 value;
+ if (ParseUnicodeEscape(&value)) return value;
+ if (unicode()) {
+ // With /u, invalid escapes are not treated as identity escapes.
+ ReportError(CStrVector("Invalid unicode escape"));
+ return 0;
+ }
+ // If \u is not followed by a two-digit hexadecimal, treat it
+ // as an identity escape.
+ return 'u';
+ }
+ default: {
+ uc32 result = current();
+ // With /u, no identity escapes except for syntax characters and '-' are
+ // allowed. Otherwise, all identity escapes are allowed.
+ if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
+ Advance();
+ return result;
+ }
+ ReportError(CStrVector("Invalid escape"));
+ return 0;
+ }
+ }
+ return 0;
+}
+
+void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
+ Zone* zone,
+ bool add_unicode_case_equivalents,
+ uc32* char_out, bool* is_class_escape) {
+ uc32 current_char = current();
+ if (current_char == '\\') {
+ switch (Next()) {
+ case 'w':
+ case 'W':
+ case 'd':
+ case 'D':
+ case 's':
+ case 'S': {
+ CharacterRange::AddClassEscape(static_cast<char>(Next()), ranges,
+ add_unicode_case_equivalents, zone);
+ Advance(2);
+ *is_class_escape = true;
+ return;
+ }
+ case kEndMarker:
+ ReportError(CStrVector("\\ at end of pattern"));
+ return;
+ case 'p':
+ case 'P':
+ if (unicode()) {
+ bool negate = Next() == 'P';
+ Advance(2);
+ std::vector<char> name_1, name_2;
+ if (!ParsePropertyClassName(&name_1, &name_2) ||
+ !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
+ ReportError(CStrVector("Invalid property name in character class"));
+ }
+ *is_class_escape = true;
+ return;
+ }
+ break;
+ default:
+ break;
+ }
+ *char_out = ParseClassCharacterEscape();
+ *is_class_escape = false;
+ } else {
+ Advance();
+ *char_out = current_char;
+ *is_class_escape = false;
+ }
+}
+
+RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
+ static const char* kUnterminated = "Unterminated character class";
+ static const char* kRangeInvalid = "Invalid character class";
+ static const char* kRangeOutOfOrder = "Range out of order in character class";
+
+ DCHECK_EQ(current(), '[');
+ Advance();
+ bool is_negated = false;
+ if (current() == '^') {
+ is_negated = true;
+ Advance();
+ }
+ ZoneList<CharacterRange>* ranges =
+ new (zone()) ZoneList<CharacterRange>(2, zone());
+ bool add_unicode_case_equivalents = unicode() && builder->ignore_case();
+ while (has_more() && current() != ']') {
+ uc32 char_1, char_2;
+ bool is_class_1, is_class_2;
+ ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
+ &is_class_1 CHECK_FAILED);
+ if (current() == '-') {
+ Advance();
+ if (current() == kEndMarker) {
+ // If we reach the end we break out of the loop and let the
+ // following code report an error.
+ break;
+ } else if (current() == ']') {
+ if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
+ ranges->Add(CharacterRange::Singleton('-'), zone());
+ break;
+ }
+ ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
+ &is_class_2 CHECK_FAILED);
+ if (is_class_1 || is_class_2) {
+ // Either end is an escaped character class. Treat the '-' verbatim.
+ if (unicode()) {
+ // ES2015 21.2.2.15.1 step 1.
+ return ReportError(CStrVector(kRangeInvalid));
+ }
+ if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
+ ranges->Add(CharacterRange::Singleton('-'), zone());
+ if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
+ continue;
+ }
+ // ES2015 21.2.2.15.1 step 6.
+ if (char_1 > char_2) {
+ return ReportError(CStrVector(kRangeOutOfOrder));
+ }
+ ranges->Add(CharacterRange::Range(char_1, char_2), zone());
+ } else {
+ if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
+ }
+ }
+ if (!has_more()) {
+ return ReportError(CStrVector(kUnterminated));
+ }
+ Advance();
+ RegExpCharacterClass::CharacterClassFlags character_class_flags;
+ if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED;
+ return new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags(),
+ character_class_flags);
+}
+
+
+#undef CHECK_FAILED
+
+
+bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
+ FlatStringReader* input, JSRegExp::Flags flags,
+ RegExpCompileData* result) {
+ DCHECK(result != nullptr);
+ RegExpParser parser(input, &result->error, flags, isolate, zone);
+ RegExpTree* tree = parser.ParsePattern();
+ if (parser.failed()) {
+ DCHECK(tree == nullptr);
+ DCHECK(!result->error.is_null());
+ } else {
+ DCHECK(tree != nullptr);
+ DCHECK(result->error.is_null());
+ if (FLAG_trace_regexp_parser) {
+ StdoutStream os;
+ tree->Print(os, zone);
+ os << "\n";
+ }
+ result->tree = tree;
+ int capture_count = parser.captures_started();
+ result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
+ result->contains_anchor = parser.contains_anchor();
+ result->capture_name_map = parser.CreateCaptureNameMap();
+ result->capture_count = capture_count;
+ }
+ return !parser.failed();
+}
+
+RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags)
+ : zone_(zone),
+ pending_empty_(false),
+ flags_(flags),
+ characters_(nullptr),
+ pending_surrogate_(kNoPendingSurrogate),
+ terms_(),
+ alternatives_()
+#ifdef DEBUG
+ ,
+ last_added_(ADD_NONE)
+#endif
+{
+}
+
+
+void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
+ DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
+ FlushPendingSurrogate();
+ // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
+ pending_surrogate_ = lead_surrogate;
+}
+
+
+void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
+ DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
+ if (pending_surrogate_ != kNoPendingSurrogate) {
+ uc16 lead_surrogate = pending_surrogate_;
+ pending_surrogate_ = kNoPendingSurrogate;
+ DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
+ uc32 combined =
+ unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
+ if (NeedsDesugaringForIgnoreCase(combined)) {
+ AddCharacterClassForDesugaring(combined);
+ } else {
+ ZoneList<uc16> surrogate_pair(2, zone());
+ surrogate_pair.Add(lead_surrogate, zone());
+ surrogate_pair.Add(trail_surrogate, zone());
+ RegExpAtom* atom =
+ new (zone()) RegExpAtom(surrogate_pair.ToConstVector(), flags_);
+ AddAtom(atom);
+ }
+ } else {
+ pending_surrogate_ = trail_surrogate;
+ FlushPendingSurrogate();
+ }
+}
+
+
+void RegExpBuilder::FlushPendingSurrogate() {
+ if (pending_surrogate_ != kNoPendingSurrogate) {
+ DCHECK(unicode());
+ uc32 c = pending_surrogate_;
+ pending_surrogate_ = kNoPendingSurrogate;
+ AddCharacterClassForDesugaring(c);
+ }
+}
+
+
+void RegExpBuilder::FlushCharacters() {
+ FlushPendingSurrogate();
+ pending_empty_ = false;
+ if (characters_ != nullptr) {
+ RegExpTree* atom =
+ new (zone()) RegExpAtom(characters_->ToConstVector(), flags_);
+ characters_ = nullptr;
+ text_.Add(atom, zone());
+ LAST(ADD_ATOM);
+ }
+}
+
+
+void RegExpBuilder::FlushText() {
+ FlushCharacters();
+ int num_text = text_.length();
+ if (num_text == 0) {
+ return;
+ } else if (num_text == 1) {
+ terms_.Add(text_.last(), zone());
+ } else {
+ RegExpText* text = new (zone()) RegExpText(zone());
+ for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
+ terms_.Add(text, zone());
+ }
+ text_.Clear();
+}
+
+
+void RegExpBuilder::AddCharacter(uc16 c) {
+ FlushPendingSurrogate();
+ pending_empty_ = false;
+ if (NeedsDesugaringForIgnoreCase(c)) {
+ AddCharacterClassForDesugaring(c);
+ } else {
+ if (characters_ == nullptr) {
+ characters_ = new (zone()) ZoneList<uc16>(4, zone());
+ }
+ characters_->Add(c, zone());
+ LAST(ADD_CHAR);
+ }
+}
+
+
+void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
+ if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
+ DCHECK(unicode());
+ AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
+ AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
+ } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
+ AddLeadSurrogate(c);
+ } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
+ AddTrailSurrogate(c);
+ } else {
+ AddCharacter(static_cast<uc16>(c));
+ }
+}
+
+void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
+ // A lead or trail surrogate parsed via escape sequence will not
+ // pair up with any preceding lead or following trail surrogate.
+ FlushPendingSurrogate();
+ AddUnicodeCharacter(character);
+ FlushPendingSurrogate();
+}
+
+void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
+
+
+void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
+ if (NeedsDesugaringForUnicode(cc)) {
+ // With /u, character class needs to be desugared, so it
+ // must be a standalone term instead of being part of a RegExpText.
+ AddTerm(cc);
+ } else {
+ AddAtom(cc);
+ }
+}
+
+void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
+ AddTerm(new (zone()) RegExpCharacterClass(
+ zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)),
+ flags_));
+}
+
+
+void RegExpBuilder::AddAtom(RegExpTree* term) {
+ if (term->IsEmpty()) {
+ AddEmpty();
+ return;
+ }
+ if (term->IsTextElement()) {
+ FlushCharacters();
+ text_.Add(term, zone());
+ } else {
+ FlushText();
+ terms_.Add(term, zone());
+ }
+ LAST(ADD_ATOM);
+}
+
+
+void RegExpBuilder::AddTerm(RegExpTree* term) {
+ FlushText();
+ terms_.Add(term, zone());
+ LAST(ADD_ATOM);
+}
+
+
+void RegExpBuilder::AddAssertion(RegExpTree* assert) {
+ FlushText();
+ terms_.Add(assert, zone());
+ LAST(ADD_ASSERT);
+}
+
+
+void RegExpBuilder::NewAlternative() { FlushTerms(); }
+
+
+void RegExpBuilder::FlushTerms() {
+ FlushText();
+ int num_terms = terms_.length();
+ RegExpTree* alternative;
+ if (num_terms == 0) {
+ alternative = new (zone()) RegExpEmpty();
+ } else if (num_terms == 1) {
+ alternative = terms_.last();
+ } else {
+ alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
+ }
+ alternatives_.Add(alternative, zone());
+ terms_.Clear();
+ LAST(ADD_NONE);
+}
+
+
+bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
+ if (!unicode()) return false;
+ // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
+ // necessarily mean that we need to desugar. It's probably nicer to have a
+ // separate pass to figure out unicode desugarings.
+ if (ignore_case()) return true;
+ ZoneList<CharacterRange>* ranges = cc->ranges(zone());
+ CharacterRange::Canonicalize(ranges);
+ for (int i = ranges->length() - 1; i >= 0; i--) {
+ uc32 from = ranges->at(i).from();
+ uc32 to = ranges->at(i).to();
+ // Check for non-BMP characters.
+ if (to >= kNonBmpStart) return true;
+ // Check for lone surrogates.
+ if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
+ }
+ return false;
+}
+
+
+bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
+#ifdef V8_INTL_SUPPORT
+ if (unicode() && ignore_case()) {
+ icu::UnicodeSet set(c, c);
+ set.closeOver(USET_CASE_INSENSITIVE);
+ set.removeAllStrings();
+ return set.size() > 1;
+ }
+ // In the case where ICU is not included, we act as if the unicode flag is
+ // not set, and do not desugar.
+#endif // V8_INTL_SUPPORT
+ return false;
+}
+
+
+RegExpTree* RegExpBuilder::ToRegExp() {
+ FlushTerms();
+ int num_alternatives = alternatives_.length();
+ if (num_alternatives == 0) return new (zone()) RegExpEmpty();
+ if (num_alternatives == 1) return alternatives_.last();
+ return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
+}
+
+bool RegExpBuilder::AddQuantifierToAtom(
+ int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
+ FlushPendingSurrogate();
+ if (pending_empty_) {
+ pending_empty_ = false;
+ return true;
+ }
+ RegExpTree* atom;
+ if (characters_ != nullptr) {
+ DCHECK(last_added_ == ADD_CHAR);
+ // Last atom was character.
+ Vector<const uc16> char_vector = characters_->ToConstVector();
+ int num_chars = char_vector.length();
+ if (num_chars > 1) {
+ Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
+ text_.Add(new (zone()) RegExpAtom(prefix, flags_), zone());
+ char_vector = char_vector.SubVector(num_chars - 1, num_chars);
+ }
+ characters_ = nullptr;
+ atom = new (zone()) RegExpAtom(char_vector, flags_);
+ FlushText();
+ } else if (text_.length() > 0) {
+ DCHECK(last_added_ == ADD_ATOM);
+ atom = text_.RemoveLast();
+ FlushText();
+ } else if (terms_.length() > 0) {
+ DCHECK(last_added_ == ADD_ATOM);
+ atom = terms_.RemoveLast();
+ if (atom->IsLookaround()) {
+ // With /u, lookarounds are not quantifiable.
+ if (unicode()) return false;
+ // Lookbehinds are not quantifiable.
+ if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {
+ return false;
+ }
+ }
+ if (atom->max_match() == 0) {
+ // Guaranteed to only match an empty string.
+ LAST(ADD_TERM);
+ if (min == 0) {
+ return true;
+ }
+ terms_.Add(atom, zone());
+ return true;
+ }
+ } else {
+ // Only call immediately after adding an atom or character!
+ UNREACHABLE();
+ }
+ terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
+ zone());
+ LAST(ADD_TERM);
+ return true;
+}
+
+} // namespace internal
+} // namespace v8