From fa473930f424bf17a9e545b601c84dd2e61364e3 Mon Sep 17 00:00:00 2001 From: wolfbeast Date: Mon, 11 Nov 2019 23:37:35 +0100 Subject: Issue #1279 - Implement regular expression lookbehind Based on Tom Schuster's work, with extra minters for unicode. --- js/src/irregexp/RegExpParser.cpp | 130 +++++++++++++++++++++++++++------------ 1 file changed, 89 insertions(+), 41 deletions(-) (limited to 'js/src/irregexp/RegExpParser.cpp') diff --git a/js/src/irregexp/RegExpParser.cpp b/js/src/irregexp/RegExpParser.cpp index 8bd88047a..9ef9fe3e2 100644 --- a/js/src/irregexp/RegExpParser.cpp +++ b/js/src/irregexp/RegExpParser.cpp @@ -227,6 +227,7 @@ RegExpParser::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc, alloc(alloc), captures_(nullptr), next_pos_(chars), + captures_started_(0), end_(end), current_(kEndMarker), capture_count_(0), @@ -418,7 +419,8 @@ RangeAtom(LifoAlloc* alloc, char16_t from, char16_t to) static inline RegExpTree* NegativeLookahead(LifoAlloc* alloc, char16_t from, char16_t to) { - return alloc->newInfallible(RangeAtom(alloc, from, to), false, 0, 0); + return alloc->newInfallible(RangeAtom(alloc, from, to), false, + 0, 0, RegExpLookaround::LOOKAHEAD); } static bool @@ -1213,6 +1215,38 @@ RegExpParser::ParseBackReferenceIndex(int* index_out) return true; } +template +RegExpCapture* +RegExpParser::GetCapture(int index) { + // The index for the capture groups are one-based. Its index in the list is + // zero-based. + int known_captures = + is_scanned_for_captures_ ? capture_count_ : captures_started_; + MOZ_ASSERT(index <= known_captures); + if (captures_ == NULL) { + captures_ = alloc->newInfallible(*alloc); + } + while ((int)captures_->length() < known_captures) { + RegExpCapture* capture = alloc->newInfallible(nullptr, captures_->length() + 1); + captures_->append(capture); + } + return (*captures_)[index - 1]; +} + + +template +bool +RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) { + for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) { + if (s->group_type() != CAPTURE) continue; + // Return true if we found the matching capture index. + if (index == s->capture_index()) return true; + // Abort if index is larger than what has been parsed up till this state. + if (index > s->capture_index()) return false; + } + return false; +} + // QuantifierPrefix :: // { DecimalDigits } // { DecimalDigits , } @@ -1423,24 +1457,24 @@ RegExpTree* RegExpParser::ParseDisjunction() { // Used to store current state while parsing subexpressions. - RegExpParserState initial_state(alloc, nullptr, INITIAL, 0); - RegExpParserState* stored_state = &initial_state; + RegExpParserState initial_state(alloc, nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, 0); + RegExpParserState* state = &initial_state; // Cache the builder in a local variable for quick access. RegExpBuilder* builder = initial_state.builder(); while (true) { switch (current()) { case kEndMarker: - if (stored_state->IsSubexpression()) { + if (state->IsSubexpression()) { // Inside a parenthesized group when hitting end of input. return ReportError(JSMSG_MISSING_PAREN); } - MOZ_ASSERT(INITIAL == stored_state->group_type()); + MOZ_ASSERT(INITIAL == state->group_type()); // Parsing completed successfully. return builder->ToRegExp(); case ')': { - if (!stored_state->IsSubexpression()) + if (!state->IsSubexpression()) return ReportError(JSMSG_UNMATCHED_RIGHT_PAREN); - MOZ_ASSERT(INITIAL != stored_state->group_type()); + MOZ_ASSERT(INITIAL != state->group_type()); Advance(); // End disjunction parsing and convert builder content to new single @@ -1449,29 +1483,30 @@ RegExpParser::ParseDisjunction() int end_capture_index = captures_started(); - int capture_index = stored_state->capture_index(); - SubexpressionType group_type = stored_state->group_type(); - - // Restore previous state. - stored_state = stored_state->previous_state(); - builder = stored_state->builder(); + int capture_index = state->capture_index(); + SubexpressionType group_type = state->group_type(); // Build result of subexpression. if (group_type == CAPTURE) { - RegExpCapture* capture = alloc->newInfallible(body, capture_index); - (*captures_)[capture_index - 1] = capture; + RegExpCapture* capture = GetCapture(capture_index); + capture->set_body(body); body = capture; } else if (group_type != GROUPING) { - MOZ_ASSERT(group_type == POSITIVE_LOOKAHEAD || - group_type == NEGATIVE_LOOKAHEAD); - bool is_positive = (group_type == POSITIVE_LOOKAHEAD); - body = alloc->newInfallible(body, + MOZ_ASSERT(group_type == POSITIVE_LOOKAROUND || + group_type == NEGATIVE_LOOKAROUND); + bool is_positive = (group_type == POSITIVE_LOOKAROUND); + body = alloc->newInfallible(body, is_positive, end_capture_index - capture_index, - capture_index); + capture_index, + state->lookaround_type()); } + + // Restore previous state. + state = state->previous_state(); + builder = state->builder(); builder->AddAtom(body); - if (unicode_ && (group_type == POSITIVE_LOOKAHEAD || group_type == NEGATIVE_LOOKAHEAD)) + if (unicode_ && (group_type == POSITIVE_LOOKAROUND || group_type == NEGATIVE_LOOKAROUND)) continue; // For compatability with JSC and ES3, we allow quantifiers after // lookaheads, and break in all cases. @@ -1519,6 +1554,7 @@ RegExpParser::ParseDisjunction() } case '(': { SubexpressionType subexpr_type = CAPTURE; + RegExpLookaround::Type lookaround_type = state->lookaround_type(); Advance(); if (current() == '?') { switch (Next()) { @@ -1526,26 +1562,39 @@ RegExpParser::ParseDisjunction() subexpr_type = GROUPING; break; case '=': - subexpr_type = POSITIVE_LOOKAHEAD; + lookaround_type = RegExpLookaround::LOOKAHEAD; + subexpr_type = POSITIVE_LOOKAROUND; break; case '!': - subexpr_type = NEGATIVE_LOOKAHEAD; + lookaround_type = RegExpLookaround::LOOKAHEAD; + subexpr_type = NEGATIVE_LOOKAROUND; break; + case '<': + Advance(); + lookaround_type = RegExpLookaround::LOOKBEHIND; + if (Next() == '=') { + subexpr_type = POSITIVE_LOOKAROUND; + break; + } else if (Next() == '!') { + subexpr_type = NEGATIVE_LOOKAROUND; + break; + } + // We didn't get a positive or negative after '<'. + // That's an error. + return ReportError(JSMSG_INVALID_GROUP); default: return ReportError(JSMSG_INVALID_GROUP); } Advance(2); } else { - if (captures_ == nullptr) - captures_ = alloc->newInfallible(*alloc); if (captures_started() >= kMaxCaptures) return ReportError(JSMSG_TOO_MANY_PARENS); - captures_->append((RegExpCapture*) nullptr); + captures_started_++; } // Store current state and begin new disjunction parsing. - stored_state = alloc->newInfallible(alloc, stored_state, subexpr_type, - captures_started()); - builder = stored_state->builder(); + state = alloc->newInfallible(alloc, state, subexpr_type, + lookaround_type, captures_started_); + builder = state->builder(); continue; } case '[': { @@ -1600,19 +1649,18 @@ RegExpParser::ParseDisjunction() case '7': case '8': case '9': { int index = 0; if (ParseBackReferenceIndex(&index)) { - RegExpCapture* capture = nullptr; - if (captures_ != nullptr && index <= (int) captures_->length()) { - capture = (*captures_)[index - 1]; - } - if (capture == nullptr) { - builder->AddEmpty(); - break; + if (state->IsInsideCaptureGroup(index)) { + // The backreference is inside the capture group it refers to. + // Nothing can possibly have been captured yet. + builder->AddEmpty(); + } else { + RegExpCapture* capture = GetCapture(index); + RegExpTree* atom = alloc->newInfallible(capture); + if (unicode_) + builder->AddAtom(UnicodeBackReferenceAtom(alloc, atom)); + else + builder->AddAtom(atom); } - RegExpTree* atom = alloc->newInfallible(capture); - if (unicode_) - builder->AddAtom(UnicodeBackReferenceAtom(alloc, atom)); - else - builder->AddAtom(atom); break; } if (unicode_) -- cgit v1.2.3