From fa473930f424bf17a9e545b601c84dd2e61364e3 Mon Sep 17 00:00:00 2001
From: wolfbeast <mcwerewolf@wolfbeast.com>
Date: Mon, 11 Nov 2019 23:37:35 +0100
Subject: Issue #1279 - Implement regular expression lookbehind

Based on Tom Schuster's work, with extra minters for unicode.
---
 js/src/irregexp/RegExpParser.cpp | 130 +++++++++++++++++++++++++++------------
 1 file changed, 89 insertions(+), 41 deletions(-)

(limited to 'js/src/irregexp/RegExpParser.cpp')
diff --git a/js/src/irregexp/RegExpParser.cpp b/js/src/irregexp/RegExpParser.cpp
index 8bd88047a..9ef9fe3e2 100644
--- a/js/src/irregexp/RegExpParser.cpp
+++ b/js/src/irregexp/RegExpParser.cpp
@@ -227,6 +227,7 @@ RegExpParser<CharT>::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
     alloc(alloc),
     captures_(nullptr),
     next_pos_(chars),
+    captures_started_(0),
     end_(end),
     current_(kEndMarker),
     capture_count_(0),
@@ -418,7 +419,8 @@ RangeAtom(LifoAlloc* alloc, char16_t from, char16_t to)
 static inline RegExpTree*
 NegativeLookahead(LifoAlloc* alloc, char16_t from, char16_t to)
 {
-    return alloc->newInfallible<RegExpLookahead>(RangeAtom(alloc, from, to), false, 0, 0);
+    return alloc->newInfallible<RegExpLookaround>(RangeAtom(alloc, from, to), false,
+                                                  0, 0, RegExpLookaround::LOOKAHEAD);
 }
 
 static bool
@@ -1213,6 +1215,38 @@ RegExpParser<CharT>::ParseBackReferenceIndex(int* index_out)
     return true;
 }
 
+template <typename CharT>
+RegExpCapture*
+RegExpParser<CharT>::GetCapture(int index) {
+  // The index for the capture groups are one-based. Its index in the list is
+  // zero-based.
+  int known_captures =
+      is_scanned_for_captures_ ? capture_count_ : captures_started_;
+  MOZ_ASSERT(index <= known_captures);
+  if (captures_ == NULL) {
+    captures_ = alloc->newInfallible<RegExpCaptureVector>(*alloc);
+  }
+  while ((int)captures_->length() < known_captures) {
+    RegExpCapture* capture = alloc->newInfallible<RegExpCapture>(nullptr, captures_->length() + 1);
+    captures_->append(capture);
+  }
+  return (*captures_)[index - 1];
+}
+
+
+template <typename CharT>
+bool
+RegExpParser<CharT>::RegExpParserState::IsInsideCaptureGroup(int index) {
+  for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
+    if (s->group_type() != CAPTURE) continue;
+    // Return true if we found the matching capture index.
+    if (index == s->capture_index()) return true;
+    // Abort if index is larger than what has been parsed up till this state.
+    if (index > s->capture_index()) return false;
+  }
+  return false;
+}
+
 // QuantifierPrefix ::
 //   { DecimalDigits }
 //   { DecimalDigits , }
@@ -1423,24 +1457,24 @@ RegExpTree*
 RegExpParser<CharT>::ParseDisjunction()
 {
     // Used to store current state while parsing subexpressions.
-    RegExpParserState initial_state(alloc, nullptr, INITIAL, 0);
-    RegExpParserState* stored_state = &initial_state;
+    RegExpParserState initial_state(alloc, nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, 0);
+    RegExpParserState* state = &initial_state;
     // Cache the builder in a local variable for quick access.
     RegExpBuilder* builder = initial_state.builder();
     while (true) {
         switch (current()) {
           case kEndMarker:
-            if (stored_state->IsSubexpression()) {
+            if (state->IsSubexpression()) {
                 // Inside a parenthesized group when hitting end of input.
                 return ReportError(JSMSG_MISSING_PAREN);
             }
-            MOZ_ASSERT(INITIAL == stored_state->group_type());
+            MOZ_ASSERT(INITIAL == state->group_type());
             // Parsing completed successfully.
             return builder->ToRegExp();
           case ')': {
-            if (!stored_state->IsSubexpression())
+            if (!state->IsSubexpression())
                 return ReportError(JSMSG_UNMATCHED_RIGHT_PAREN);
-            MOZ_ASSERT(INITIAL != stored_state->group_type());
+            MOZ_ASSERT(INITIAL != state->group_type());
 
             Advance();
             // End disjunction parsing and convert builder content to new single
@@ -1449,29 +1483,30 @@ RegExpParser<CharT>::ParseDisjunction()
 
             int end_capture_index = captures_started();
 
-            int capture_index = stored_state->capture_index();
-            SubexpressionType group_type = stored_state->group_type();
-
-            // Restore previous state.
-            stored_state = stored_state->previous_state();
-            builder = stored_state->builder();
+            int capture_index = state->capture_index();
+            SubexpressionType group_type = state->group_type();
 
             // Build result of subexpression.
             if (group_type == CAPTURE) {
-                RegExpCapture* capture = alloc->newInfallible<RegExpCapture>(body, capture_index);
-                (*captures_)[capture_index - 1] = capture;
+                RegExpCapture* capture = GetCapture(capture_index);
+                capture->set_body(body);
                 body = capture;
             } else if (group_type != GROUPING) {
-                MOZ_ASSERT(group_type == POSITIVE_LOOKAHEAD ||
-                           group_type == NEGATIVE_LOOKAHEAD);
-                bool is_positive = (group_type == POSITIVE_LOOKAHEAD);
-                body = alloc->newInfallible<RegExpLookahead>(body,
+                MOZ_ASSERT(group_type == POSITIVE_LOOKAROUND ||
+                           group_type == NEGATIVE_LOOKAROUND);
+                bool is_positive = (group_type == POSITIVE_LOOKAROUND);
+                body = alloc->newInfallible<RegExpLookaround>(body,
                                                    is_positive,
                                                    end_capture_index - capture_index,
-                                                   capture_index);
+                                                   capture_index,
+                                                   state->lookaround_type());
             }
+
+            // Restore previous state.
+            state = state->previous_state();
+            builder = state->builder();
             builder->AddAtom(body);
-            if (unicode_ && (group_type == POSITIVE_LOOKAHEAD || group_type == NEGATIVE_LOOKAHEAD))
+            if (unicode_ && (group_type == POSITIVE_LOOKAROUND || group_type == NEGATIVE_LOOKAROUND))
                 continue;
             // For compatability with JSC and ES3, we allow quantifiers after
             // lookaheads, and break in all cases.
@@ -1519,6 +1554,7 @@ RegExpParser<CharT>::ParseDisjunction()
           }
           case '(': {
             SubexpressionType subexpr_type = CAPTURE;
+            RegExpLookaround::Type lookaround_type = state->lookaround_type();
             Advance();
             if (current() == '?') {
                 switch (Next()) {
@@ -1526,26 +1562,39 @@ RegExpParser<CharT>::ParseDisjunction()
                     subexpr_type = GROUPING;
                     break;
                   case '=':
-                    subexpr_type = POSITIVE_LOOKAHEAD;
+                    lookaround_type = RegExpLookaround::LOOKAHEAD;
+                    subexpr_type = POSITIVE_LOOKAROUND;
                     break;
                   case '!':
-                    subexpr_type = NEGATIVE_LOOKAHEAD;
+                    lookaround_type = RegExpLookaround::LOOKAHEAD;
+                    subexpr_type = NEGATIVE_LOOKAROUND;
                     break;
+                  case '<':
+                    Advance();
+                    lookaround_type = RegExpLookaround::LOOKBEHIND;
+                    if (Next() == '=') {
+                      subexpr_type = POSITIVE_LOOKAROUND;
+                      break;
+                    } else if (Next() == '!') {
+                      subexpr_type = NEGATIVE_LOOKAROUND;
+                      break;
+                    }
+                    // We didn't get a positive or negative after '<'.
+                    // That's an error.
+                    return ReportError(JSMSG_INVALID_GROUP);
                   default:
                     return ReportError(JSMSG_INVALID_GROUP);
                 }
                 Advance(2);
             } else {
-                if (captures_ == nullptr)
-                    captures_ = alloc->newInfallible<RegExpCaptureVector>(*alloc);
                 if (captures_started() >= kMaxCaptures)
                     return ReportError(JSMSG_TOO_MANY_PARENS);
-                captures_->append((RegExpCapture*) nullptr);
+                captures_started_++;
             }
             // Store current state and begin new disjunction parsing.
-            stored_state = alloc->newInfallible<RegExpParserState>(alloc, stored_state, subexpr_type,
-                                                                   captures_started());
-            builder = stored_state->builder();
+            state = alloc->newInfallible<RegExpParserState>(alloc, state, subexpr_type,
+                                                            lookaround_type, captures_started_);
+            builder = state->builder();
             continue;
           }
           case '[': {
@@ -1600,19 +1649,18 @@ RegExpParser<CharT>::ParseDisjunction()
               case '7': case '8': case '9': {
                 int index = 0;
                 if (ParseBackReferenceIndex(&index)) {
-                    RegExpCapture* capture = nullptr;
-                    if (captures_ != nullptr && index <= (int) captures_->length()) {
-                        capture = (*captures_)[index - 1];
-                    }
-                    if (capture == nullptr) {
-                        builder->AddEmpty();
-                        break;
+                    if (state->IsInsideCaptureGroup(index)) {
+                      // The backreference is inside the capture group it refers to.
+                      // Nothing can possibly have been captured yet.
+                      builder->AddEmpty();
+                    } else {
+                      RegExpCapture* capture = GetCapture(index);
+                      RegExpTree* atom = alloc->newInfallible<RegExpBackReference>(capture);
+                      if (unicode_)
+                          builder->AddAtom(UnicodeBackReferenceAtom(alloc, atom));
+                      else
+                          builder->AddAtom(atom);
                     }
-                    RegExpTree* atom = alloc->newInfallible<RegExpBackReference>(capture);
-                    if (unicode_)
-                        builder->AddAtom(UnicodeBackReferenceAtom(alloc, atom));
-                    else
-                        builder->AddAtom(atom);
                     break;
                 }
                 if (unicode_)
-- 
cgit v1.2.3