summaryrefslogtreecommitdiffstats
path: root/js/src/irregexp/RegExpParser.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'js/src/irregexp/RegExpParser.cpp')
-rw-r--r--js/src/irregexp/RegExpParser.cpp1905
1 files changed, 1905 insertions, 0 deletions
diff --git a/js/src/irregexp/RegExpParser.cpp b/js/src/irregexp/RegExpParser.cpp
new file mode 100644
index 000000000..ccc6ae3eb
--- /dev/null
+++ b/js/src/irregexp/RegExpParser.cpp
@@ -0,0 +1,1905 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * vim: set ts=8 sts=4 et sw=4 tw=99: */
+
+// Copyright 2012 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "irregexp/RegExpParser.h"
+
+#include "frontend/TokenStream.h"
+
+using namespace js;
+using namespace js::irregexp;
+
+// ----------------------------------------------------------------------------
+// RegExpBuilder
+
+RegExpBuilder::RegExpBuilder(LifoAlloc* alloc)
+ : alloc(alloc),
+ pending_empty_(false),
+ characters_(nullptr)
+#ifdef DEBUG
+ , last_added_(ADD_NONE)
+#endif
+{}
+
+void
+RegExpBuilder::FlushCharacters()
+{
+ pending_empty_ = false;
+ if (characters_ != nullptr) {
+ RegExpTree* atom = alloc->newInfallible<RegExpAtom>(characters_);
+ characters_ = nullptr;
+ text_.Add(alloc, atom);
+#ifdef DEBUG
+ last_added_ = ADD_ATOM;
+#endif
+ }
+}
+
+void
+RegExpBuilder::FlushText()
+{
+ FlushCharacters();
+ int num_text = text_.length();
+ if (num_text == 0)
+ return;
+ if (num_text == 1) {
+ terms_.Add(alloc, text_.last());
+ } else {
+ RegExpText* text = alloc->newInfallible<RegExpText>(alloc);
+ for (int i = 0; i < num_text; i++)
+ text_.Get(i)->AppendToText(text);
+ terms_.Add(alloc, text);
+ }
+ text_.Clear();
+}
+
+void
+RegExpBuilder::AddCharacter(char16_t c)
+{
+ pending_empty_ = false;
+ if (characters_ == nullptr)
+ characters_ = alloc->newInfallible<CharacterVector>(*alloc);
+ characters_->append(c);
+#ifdef DEBUG
+ last_added_ = ADD_CHAR;
+#endif
+}
+
+void
+RegExpBuilder::AddEmpty()
+{
+ pending_empty_ = true;
+}
+
+void
+RegExpBuilder::AddAtom(RegExpTree* term)
+{
+ if (term->IsEmpty()) {
+ AddEmpty();
+ return;
+ }
+ if (term->IsTextElement()) {
+ FlushCharacters();
+ text_.Add(alloc, term);
+ } else {
+ FlushText();
+ terms_.Add(alloc, term);
+ }
+#ifdef DEBUG
+ last_added_ = ADD_ATOM;
+#endif
+}
+
+void
+RegExpBuilder::AddAssertion(RegExpTree* assert)
+{
+ FlushText();
+ terms_.Add(alloc, assert);
+#ifdef DEBUG
+ last_added_ = ADD_ASSERT;
+#endif
+}
+
+void
+RegExpBuilder::NewAlternative()
+{
+ FlushTerms();
+}
+
+void
+RegExpBuilder::FlushTerms()
+{
+ FlushText();
+ int num_terms = terms_.length();
+ RegExpTree* alternative;
+ if (num_terms == 0)
+ alternative = RegExpEmpty::GetInstance();
+ else if (num_terms == 1)
+ alternative = terms_.last();
+ else
+ alternative = alloc->newInfallible<RegExpAlternative>(terms_.GetList(alloc));
+ alternatives_.Add(alloc, alternative);
+ terms_.Clear();
+#ifdef DEBUG
+ last_added_ = ADD_NONE;
+#endif
+}
+
+RegExpTree*
+RegExpBuilder::ToRegExp()
+{
+ FlushTerms();
+ int num_alternatives = alternatives_.length();
+ if (num_alternatives == 0) {
+ return RegExpEmpty::GetInstance();
+ }
+ if (num_alternatives == 1) {
+ return alternatives_.last();
+ }
+ return alloc->newInfallible<RegExpDisjunction>(alternatives_.GetList(alloc));
+}
+
+void
+RegExpBuilder::AddQuantifierToAtom(int min, int max,
+ RegExpQuantifier::QuantifierType quantifier_type)
+{
+ if (pending_empty_) {
+ pending_empty_ = false;
+ return;
+ }
+ RegExpTree* atom;
+ if (characters_ != nullptr) {
+ MOZ_ASSERT(last_added_ == ADD_CHAR);
+ // Last atom was character.
+ CharacterVector* char_vector = characters_;
+ int num_chars = char_vector->length();
+ if (num_chars > 1) {
+ CharacterVector* prefix = alloc->newInfallible<CharacterVector>(*alloc);
+ prefix->append(char_vector->begin(), num_chars - 1);
+ text_.Add(alloc, alloc->newInfallible<RegExpAtom>(prefix));
+ char_vector = alloc->newInfallible<CharacterVector>(*alloc);
+ char_vector->append((*characters_)[num_chars - 1]);
+ }
+ characters_ = nullptr;
+ atom = alloc->newInfallible<RegExpAtom>(char_vector);
+ FlushText();
+ } else if (text_.length() > 0) {
+ MOZ_ASSERT(last_added_ == ADD_ATOM);
+ atom = text_.RemoveLast();
+ FlushText();
+ } else if (terms_.length() > 0) {
+ MOZ_ASSERT(last_added_ == ADD_ATOM);
+ atom = terms_.RemoveLast();
+ if (atom->max_match() == 0) {
+ // Guaranteed to only match an empty string.
+#ifdef DEBUG
+ last_added_ = ADD_TERM;
+#endif
+ if (min == 0)
+ return;
+ terms_.Add(alloc, atom);
+ return;
+ }
+ } else {
+ // Only call immediately after adding an atom or character!
+ MOZ_CRASH("Bad call");
+ }
+ terms_.Add(alloc, alloc->newInfallible<RegExpQuantifier>(min, max, quantifier_type, atom));
+#ifdef DEBUG
+ last_added_ = ADD_TERM;
+#endif
+}
+
+// ----------------------------------------------------------------------------
+// RegExpParser
+
+template <typename CharT>
+RegExpParser<CharT>::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
+ const CharT* chars, const CharT* end, bool multiline_mode,
+ bool unicode, bool ignore_case)
+ : ts(ts),
+ alloc(alloc),
+ captures_(nullptr),
+ next_pos_(chars),
+ end_(end),
+ current_(kEndMarker),
+ capture_count_(0),
+ has_more_(true),
+ multiline_(multiline_mode),
+ unicode_(unicode),
+ ignore_case_(ignore_case),
+ simple_(false),
+ contains_anchor_(false),
+ is_scanned_for_captures_(false)
+{
+ Advance();
+}
+
+template <typename CharT>
+RegExpTree*
+RegExpParser<CharT>::ReportError(unsigned errorNumber)
+{
+ gc::AutoSuppressGC suppressGC(ts.context());
+ ts.reportError(errorNumber);
+ return nullptr;
+}
+
+template <typename CharT>
+void
+RegExpParser<CharT>::Advance()
+{
+ if (next_pos_ < end_) {
+ current_ = *next_pos_;
+ next_pos_++;
+ } else {
+ current_ = kEndMarker;
+ has_more_ = false;
+ }
+}
+
+// Returns the value (0 .. 15) of a hexadecimal character c.
+// If c is not a legal hexadecimal character, returns a value < 0.
+inline int
+HexValue(uint32_t c)
+{
+ c -= '0';
+ if (static_cast<unsigned>(c) <= 9) return c;
+ c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.
+ if (static_cast<unsigned>(c) <= 5) return c + 10;
+ return -1;
+}
+
+template <typename CharT>
+widechar
+RegExpParser<CharT>::ParseOctalLiteral()
+{
+ MOZ_ASSERT('0' <= current() && current() <= '7');
+ // For compatibility with some other browsers (not all), we parse
+ // up to three octal digits with a value below 256.
+ widechar value = current() - '0';
+ Advance();
+ if ('0' <= current() && current() <= '7') {
+ value = value * 8 + current() - '0';
+ Advance();
+ if (value < 32 && '0' <= current() && current() <= '7') {
+ value = value * 8 + current() - '0';
+ Advance();
+ }
+ }
+ return value;
+}
+
+template <typename CharT>
+bool
+RegExpParser<CharT>::ParseHexEscape(int length, widechar* value)
+{
+ const CharT* start = position();
+ uint32_t val = 0;
+ bool done = false;
+ for (int i = 0; !done; i++) {
+ widechar c = current();
+ int d = HexValue(c);
+ if (d < 0) {
+ Reset(start);
+ return false;
+ }
+ val = val * 16 + d;
+ Advance();
+ if (i == length - 1) {
+ done = true;
+ }
+ }
+ *value = val;
+ return true;
+}
+
+template <typename CharT>
+bool
+RegExpParser<CharT>::ParseBracedHexEscape(widechar* value)
+{
+ MOZ_ASSERT(current() == '{');
+ Advance();
+
+ bool first = true;
+ uint32_t code = 0;
+ while (true) {
+ widechar c = current();
+ if (c == kEndMarker) {
+ ReportError(JSMSG_INVALID_UNICODE_ESCAPE);
+ return false;
+ }
+ if (c == '}') {
+ if (first) {
+ ReportError(JSMSG_INVALID_UNICODE_ESCAPE);
+ return false;
+ }
+ Advance();
+ break;
+ }
+
+ int d = HexValue(c);
+ if (d < 0) {
+ ReportError(JSMSG_INVALID_UNICODE_ESCAPE);
+ return false;
+ }
+ code = (code << 4) | d;
+ if (code > unicode::NonBMPMax) {
+ ReportError(JSMSG_UNICODE_OVERFLOW);
+ return false;
+ }
+ Advance();
+ first = false;
+ }
+
+ *value = code;
+ return true;
+}
+
+template <typename CharT>
+bool
+RegExpParser<CharT>::ParseTrailSurrogate(widechar* value)
+{
+ if (current() != '\\')
+ return false;
+
+ const CharT* start = position();
+ Advance();
+ if (current() != 'u') {
+ Reset(start);
+ return false;
+ }
+ Advance();
+ if (!ParseHexEscape(4, value)) {
+ Reset(start);
+ return false;
+ }
+ if (!unicode::IsTrailSurrogate(*value)) {
+ Reset(start);
+ return false;
+ }
+ return true;
+}
+
+template <typename CharT>
+bool
+RegExpParser<CharT>::ParseRawSurrogatePair(char16_t* lead, char16_t* trail)
+{
+ widechar c1 = current();
+ if (!unicode::IsLeadSurrogate(c1))
+ return false;
+
+ const CharT* start = position();
+ Advance();
+ widechar c2 = current();
+ if (!unicode::IsTrailSurrogate(c2)) {
+ Reset(start);
+ return false;
+ }
+ Advance();
+ *lead = c1;
+ *trail = c2;
+ return true;
+}
+
+static inline RegExpTree*
+RangeAtom(LifoAlloc* alloc, char16_t from, char16_t to)
+{
+ CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+ ranges->append(CharacterRange::Range(from, to));
+ return alloc->newInfallible<RegExpCharacterClass>(ranges, false);
+}
+
+static inline RegExpTree*
+NegativeLookahead(LifoAlloc* alloc, char16_t from, char16_t to)
+{
+ return alloc->newInfallible<RegExpLookahead>(RangeAtom(alloc, from, to), false, 0, 0);
+}
+
+static bool
+IsSyntaxCharacter(widechar c)
+{
+ switch (c) {
+ case '^':
+ case '$':
+ case '\\':
+ case '.':
+ case '*':
+ case '+':
+ case '?':
+ case '(':
+ case ')':
+ case '[':
+ case ']':
+ case '{':
+ case '}':
+ case '|':
+ case '/':
+ return true;
+ default:
+ return false;
+ }
+}
+
+#ifdef DEBUG
+// Currently only used in an assert.kASSERT.
+static bool
+IsSpecialClassEscape(widechar c)
+{
+ switch (c) {
+ case 'd': case 'D':
+ case 's': case 'S':
+ case 'w': case 'W':
+ return true;
+ default:
+ return false;
+ }
+}
+#endif
+
+template <typename CharT>
+bool
+RegExpParser<CharT>::ParseClassCharacterEscape(widechar* code)
+{
+ MOZ_ASSERT(current() == '\\');
+ MOZ_ASSERT(has_next() && !IsSpecialClassEscape(Next()));
+ Advance();
+ switch (current()) {
+ case 'b':
+ Advance();
+ *code = '\b';
+ return true;
+ // ControlEscape :: one of
+ // f n r t v
+ case 'f':
+ Advance();
+ *code = '\f';
+ return true;
+ case 'n':
+ Advance();
+ *code = '\n';
+ return true;
+ case 'r':
+ Advance();
+ *code = '\r';
+ return true;
+ case 't':
+ Advance();
+ *code = '\t';
+ return true;
+ case 'v':
+ Advance();
+ *code = '\v';
+ return true;
+ case 'c': {
+ widechar controlLetter = Next();
+ widechar letter = controlLetter & ~('A' ^ 'a');
+ // For compatibility with JSC, inside a character class
+ // we also accept digits and underscore as control characters,
+ // but only in non-unicode mode
+ if ((!unicode_ &&
+ ((controlLetter >= '0' && controlLetter <= '9') ||
+ controlLetter == '_')) ||
+ (letter >= 'A' && letter <= 'Z'))
+ {
+ Advance(2);
+ // Control letters mapped to ASCII control characters in the range
+ // 0x00-0x1f.
+ *code = controlLetter & 0x1f;
+ return true;
+ }
+ if (unicode_) {
+ ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
+ return false;
+ }
+ // We match JSC in reading the backslash as a literal
+ // character instead of as starting an escape.
+ *code = '\\';
+ return true;
+ }
+ case '0': case '1': case '2': case '3': case '4': case '5':
+ case '6': case '7':
+ if (unicode_) {
+ if (current() == '0') {
+ Advance();
+ *code = 0;
+ return true;
+ }
+ ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
+ return false;
+ }
+ // For compatibility, outside of unicode mode, we interpret a decimal
+ // escape that isn't a back reference (and therefore either \0 or not
+ // valid according to the specification) as a 1..3 digit octal
+ // character code.
+ *code = ParseOctalLiteral();
+ return true;
+ case 'x': {
+ Advance();
+ widechar value;
+ if (ParseHexEscape(2, &value)) {
+ *code = value;
+ return true;
+ }
+ if (unicode_) {
+ ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
+ return false;
+ }
+ // If \x is not followed by a two-digit hexadecimal, treat it
+ // as an identity escape in non-unicode mode.
+ *code = 'x';
+ return true;
+ }
+ case 'u': {
+ Advance();
+ widechar value;
+ if (unicode_) {
+ if (current() == '{') {
+ if (!ParseBracedHexEscape(&value))
+ return false;
+ *code = value;
+ return true;
+ }
+ if (ParseHexEscape(4, &value)) {
+ if (unicode::IsLeadSurrogate(value)) {
+ widechar trail;
+ if (ParseTrailSurrogate(&trail)) {
+ *code = unicode::UTF16Decode(value, trail);
+ return true;
+ }
+ }
+ *code = value;
+ return true;
+ }
+ ReportError(JSMSG_INVALID_UNICODE_ESCAPE);
+ return false;
+ }
+ if (ParseHexEscape(4, &value)) {
+ *code = value;
+ return true;
+ }
+ // If \u is not followed by a four-digit or braced hexadecimal, treat it
+ // as an identity escape.
+ *code = 'u';
+ return true;
+ }
+ default: {
+ // Extended identity escape (non-unicode only). We accept any character
+ // that hasn't been matched by a more specific case, not just the subset
+ // required by the ECMAScript specification.
+ widechar result = current();
+ if (unicode_ && result != '-' && !IsSyntaxCharacter(result)) {
+ ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
+ return false;
+ }
+ Advance();
+ *code = result;
+ return true;
+ }
+ }
+ return true;
+}
+
+class WideCharRange
+{
+ public:
+ WideCharRange()
+ : from_(0), to_(0)
+ {}
+
+ WideCharRange(widechar from, widechar to)
+ : from_(from), to_(to)
+ {}
+
+ static inline WideCharRange Singleton(widechar value) {
+ return WideCharRange(value, value);
+ }
+ static inline WideCharRange Range(widechar from, widechar to) {
+ MOZ_ASSERT(from <= to);
+ return WideCharRange(from, to);
+ }
+
+ bool Contains(widechar i) const { return from_ <= i && i <= to_; }
+ widechar from() const { return from_; }
+ widechar to() const { return to_; }
+
+ private:
+ widechar from_;
+ widechar to_;
+};
+
+typedef InfallibleVector<WideCharRange, 1> WideCharRangeVector;
+
+static inline CharacterRange
+LeadSurrogateRange()
+{
+ return CharacterRange::Range(unicode::LeadSurrogateMin, unicode::LeadSurrogateMax);
+}
+
+static inline CharacterRange
+TrailSurrogateRange()
+{
+ return CharacterRange::Range(unicode::TrailSurrogateMin, unicode::TrailSurrogateMax);
+}
+
+static inline WideCharRange
+NonBMPRange()
+{
+ return WideCharRange::Range(unicode::NonBMPMin, unicode::NonBMPMax);
+}
+
+static const char16_t kNoCharClass = 0;
+
+// Adds a character or pre-defined character class to character ranges.
+// If char_class is not kInvalidClass, it's interpreted as a class
+// escape (i.e., 's' means whitespace, from '\s').
+static inline void
+AddCharOrEscape(LifoAlloc* alloc,
+ CharacterRangeVector* ranges,
+ char16_t char_class,
+ widechar c)
+{
+ if (char_class != kNoCharClass)
+ CharacterRange::AddClassEscape(alloc, char_class, ranges);
+ else
+ ranges->append(CharacterRange::Singleton(c));
+}
+
+static inline void
+AddCharOrEscapeUnicode(LifoAlloc* alloc,
+ CharacterRangeVector* ranges,
+ CharacterRangeVector* lead_ranges,
+ CharacterRangeVector* trail_ranges,
+ WideCharRangeVector* wide_ranges,
+ char16_t char_class,
+ widechar c,
+ bool ignore_case)
+{
+ if (char_class != kNoCharClass) {
+ CharacterRange::AddClassEscapeUnicode(alloc, char_class, ranges, ignore_case);
+ switch (char_class) {
+ case 'S':
+ case 'W':
+ case 'D':
+ lead_ranges->append(LeadSurrogateRange());
+ trail_ranges->append(TrailSurrogateRange());
+ wide_ranges->append(NonBMPRange());
+ break;
+ case '.':
+ MOZ_CRASH("Bad char_class!");
+ }
+ return;
+ }
+
+ if (unicode::IsLeadSurrogate(c))
+ lead_ranges->append(CharacterRange::Singleton(c));
+ else if (unicode::IsTrailSurrogate(c))
+ trail_ranges->append(CharacterRange::Singleton(c));
+ else if (c >= unicode::NonBMPMin)
+ wide_ranges->append(WideCharRange::Singleton(c));
+ else
+ ranges->append(CharacterRange::Singleton(c));
+}
+
+static inline void
+AddUnicodeRange(LifoAlloc* alloc,
+ CharacterRangeVector* ranges,
+ CharacterRangeVector* lead_ranges,
+ CharacterRangeVector* trail_ranges,
+ WideCharRangeVector* wide_ranges,
+ widechar first,
+ widechar next)
+{
+ MOZ_ASSERT(first <= next);
+ if (first < unicode::LeadSurrogateMin) {
+ if (next < unicode::LeadSurrogateMin) {
+ ranges->append(CharacterRange::Range(first, next));
+ return;
+ }
+ ranges->append(CharacterRange::Range(first, unicode::LeadSurrogateMin - 1));
+ first = unicode::LeadSurrogateMin;
+ }
+ if (first <= unicode::LeadSurrogateMax) {
+ if (next <= unicode::LeadSurrogateMax) {
+ lead_ranges->append(CharacterRange::Range(first, next));
+ return;
+ }
+ lead_ranges->append(CharacterRange::Range(first, unicode::LeadSurrogateMax));
+ first = unicode::LeadSurrogateMax + 1;
+ }
+ MOZ_ASSERT(unicode::LeadSurrogateMax + 1 == unicode::TrailSurrogateMin);
+ if (first <= unicode::TrailSurrogateMax) {
+ if (next <= unicode::TrailSurrogateMax) {
+ trail_ranges->append(CharacterRange::Range(first, next));
+ return;
+ }
+ trail_ranges->append(CharacterRange::Range(first, unicode::TrailSurrogateMax));
+ first = unicode::TrailSurrogateMax + 1;
+ }
+ if (first <= unicode::UTF16Max) {
+ if (next <= unicode::UTF16Max) {
+ ranges->append(CharacterRange::Range(first, next));
+ return;
+ }
+ ranges->append(CharacterRange::Range(first, unicode::UTF16Max));
+ first = unicode::NonBMPMin;
+ }
+ MOZ_ASSERT(unicode::UTF16Max + 1 == unicode::NonBMPMin);
+ wide_ranges->append(WideCharRange::Range(first, next));
+}
+
+// Negate a vector of ranges by subtracting its ranges from a range
+// encompassing the full range of possible values.
+template <typename RangeType>
+static inline void
+NegateUnicodeRanges(LifoAlloc* alloc, InfallibleVector<RangeType, 1>** ranges,
+ RangeType full_range)
+{
+ typedef InfallibleVector<RangeType, 1> RangeVector;
+ RangeVector* tmp_ranges = alloc->newInfallible<RangeVector>(*alloc);
+ tmp_ranges->append(full_range);
+ RangeVector* result_ranges = alloc->newInfallible<RangeVector>(*alloc);
+
+ // Perform the following calculation:
+ // result_ranges = tmp_ranges - ranges
+ // with the following steps:
+ // result_ranges = tmp_ranges - ranges[0]
+ // SWAP(result_ranges, tmp_ranges)
+ // result_ranges = tmp_ranges - ranges[1]
+ // SWAP(result_ranges, tmp_ranges)
+ // ...
+ // result_ranges = tmp_ranges - ranges[N-1]
+ // SWAP(result_ranges, tmp_ranges)
+ // The last SWAP is just for simplicity of the loop.
+ for (size_t i = 0; i < (*ranges)->length(); i++) {
+ result_ranges->clear();
+
+ const RangeType& range = (**ranges)[i];
+ for (size_t j = 0; j < tmp_ranges->length(); j++) {
+ const RangeType& tmpRange = (*tmp_ranges)[j];
+ auto from1 = tmpRange.from();
+ auto to1 = tmpRange.to();
+ auto from2 = range.from();
+ auto to2 = range.to();
+
+ if (from1 < from2) {
+ if (to1 < from2) {
+ result_ranges->append(tmpRange);
+ } else if (to1 <= to2) {
+ result_ranges->append(RangeType::Range(from1, from2 - 1));
+ } else {
+ result_ranges->append(RangeType::Range(from1, from2 - 1));
+ result_ranges->append(RangeType::Range(to2 + 1, to1));
+ }
+ } else if (from1 <= to2) {
+ if (to1 > to2)
+ result_ranges->append(RangeType::Range(to2 + 1, to1));
+ } else {
+ result_ranges->append(tmpRange);
+ }
+ }
+
+ auto tmp = tmp_ranges;
+ tmp_ranges = result_ranges;
+ result_ranges = tmp;
+ }
+
+ // After the loop, result is pointed at by tmp_ranges, instead of
+ // result_ranges.
+ *ranges = tmp_ranges;
+}
+
+static bool
+WideCharRangesContain(WideCharRangeVector* wide_ranges, widechar c)
+{
+ for (size_t i = 0; i < wide_ranges->length(); i++) {
+ const WideCharRange& range = (*wide_ranges)[i];
+ if (range.Contains(c))
+ return true;
+ }
+ return false;
+}
+
+static void
+CalculateCaseInsensitiveRanges(LifoAlloc* alloc, widechar from, widechar to, int32_t diff,
+ WideCharRangeVector* wide_ranges,
+ WideCharRangeVector** tmp_wide_ranges)
+{
+ widechar contains_from = 0;
+ widechar contains_to = 0;
+ for (widechar c = from; c <= to; c++) {
+ if (WideCharRangesContain(wide_ranges, c) &&
+ !WideCharRangesContain(wide_ranges, c + diff))
+ {
+ if (contains_from == 0)
+ contains_from = c;
+ contains_to = c;
+ } else if (contains_from != 0) {
+ if (!*tmp_wide_ranges)
+ *tmp_wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc);
+
+ (*tmp_wide_ranges)->append(WideCharRange::Range(contains_from + diff,
+ contains_to + diff));
+ contains_from = 0;
+ }
+ }
+
+ if (contains_from != 0) {
+ if (!*tmp_wide_ranges)
+ *tmp_wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc);
+
+ (*tmp_wide_ranges)->append(WideCharRange::Range(contains_from + diff,
+ contains_to + diff));
+ }
+}
+
+static RegExpTree*
+UnicodeRangesAtom(LifoAlloc* alloc,
+ CharacterRangeVector* ranges,
+ CharacterRangeVector* lead_ranges,
+ CharacterRangeVector* trail_ranges,
+ WideCharRangeVector* wide_ranges,
+ bool is_negated,
+ bool ignore_case)
+{
+ // Calculate case folding for non-BMP first and negate the range if needed.
+ if (ignore_case) {
+ WideCharRangeVector* tmp_wide_ranges = nullptr;
+#define CALL_CALC(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \
+ CalculateCaseInsensitiveRanges(alloc, FROM, TO, DIFF, wide_ranges, &tmp_wide_ranges);
+ FOR_EACH_NON_BMP_CASE_FOLDING(CALL_CALC)
+ FOR_EACH_NON_BMP_REV_CASE_FOLDING(CALL_CALC)
+#undef CALL_CALC
+
+ if (tmp_wide_ranges) {
+ for (size_t i = 0; i < tmp_wide_ranges->length(); i++)
+ wide_ranges->append((*tmp_wide_ranges)[i]);
+ }
+ }
+
+ if (is_negated) {
+ NegateUnicodeRanges(alloc, &lead_ranges, LeadSurrogateRange());
+ NegateUnicodeRanges(alloc, &trail_ranges, TrailSurrogateRange());
+ NegateUnicodeRanges(alloc, &wide_ranges, NonBMPRange());
+ }
+
+ RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
+
+ bool added = false;
+
+ if (is_negated) {
+ ranges->append(LeadSurrogateRange());
+ ranges->append(TrailSurrogateRange());
+ }
+ if (ranges->length() > 0) {
+ builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, is_negated));
+ added = true;
+ }
+
+ if (lead_ranges->length() > 0) {
+ if (added)
+ builder->NewAlternative();
+ builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(lead_ranges, false));
+ builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin,
+ unicode::TrailSurrogateMax));
+ added = true;
+ }
+
+ if (trail_ranges->length() > 0) {
+ if (added)
+ builder->NewAlternative();
+ builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
+ RegExpAssertion::NOT_AFTER_LEAD_SURROGATE));
+ builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(trail_ranges, false));
+ added = true;
+ }
+
+ for (size_t i = 0; i < wide_ranges->length(); i++) {
+ if (added)
+ builder->NewAlternative();
+
+ const WideCharRange& range = (*wide_ranges)[i];
+ widechar from = range.from();
+ widechar to = range.to();
+ char16_t from_lead, from_trail;
+ char16_t to_lead, to_trail;
+
+ unicode::UTF16Encode(from, &from_lead, &from_trail);
+ if (from == to) {
+ builder->AddCharacter(from_lead);
+ builder->AddCharacter(from_trail);
+ } else {
+ unicode::UTF16Encode(to, &to_lead, &to_trail);
+ if (from_lead == to_lead) {
+ MOZ_ASSERT(from_trail != to_trail);
+ builder->AddCharacter(from_lead);
+ builder->AddAtom(RangeAtom(alloc, from_trail, to_trail));
+ } else if (from_trail == unicode::TrailSurrogateMin &&
+ to_trail == unicode::TrailSurrogateMax)
+ {
+ builder->AddAtom(RangeAtom(alloc, from_lead, to_lead));
+ builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin,
+ unicode::TrailSurrogateMax));
+ } else if (from_lead + 1 == to_lead) {
+ builder->AddCharacter(from_lead);
+ builder->AddAtom(RangeAtom(alloc, from_trail, unicode::TrailSurrogateMax));
+
+ builder->NewAlternative();
+
+ builder->AddCharacter(to_lead);
+ builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, to_trail));
+ } else if (from_lead + 2 == to_lead) {
+ builder->AddCharacter(from_lead);
+ builder->AddAtom(RangeAtom(alloc, from_trail, unicode::TrailSurrogateMax));
+
+ builder->NewAlternative();
+
+ builder->AddCharacter(from_lead + 1);
+ builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin,
+ unicode::TrailSurrogateMax));
+
+ builder->NewAlternative();
+
+ builder->AddCharacter(to_lead);
+ builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, to_trail));
+ } else {
+ builder->AddCharacter(from_lead);
+ builder->AddAtom(RangeAtom(alloc, from_trail, unicode::TrailSurrogateMax));
+
+ builder->NewAlternative();
+
+ builder->AddAtom(RangeAtom(alloc, from_lead + 1, to_lead - 1));
+ builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin,
+ unicode::TrailSurrogateMax));
+
+ builder->NewAlternative();
+
+ builder->AddCharacter(to_lead);
+ builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, to_trail));
+ }
+ }
+ added = true;
+ }
+
+ return builder->ToRegExp();
+}
+
+template <typename CharT>
+RegExpTree*
+RegExpParser<CharT>::ParseCharacterClass()
+{
+ MOZ_ASSERT(current() == '[');
+ Advance();
+ bool is_negated = false;
+ if (current() == '^') {
+ is_negated = true;
+ Advance();
+ }
+ CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+ CharacterRangeVector* lead_ranges = nullptr;
+ CharacterRangeVector* trail_ranges = nullptr;
+ WideCharRangeVector* wide_ranges = nullptr;
+
+ if (unicode_) {
+ lead_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+ trail_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+ wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc);
+ }
+
+ while (has_more() && current() != ']') {
+ char16_t char_class = kNoCharClass;
+ widechar first = 0;
+ if (!ParseClassAtom(&char_class, &first))
+ return nullptr;
+ if (current() == '-') {
+ Advance();
+ if (current() == kEndMarker) {
+ // If we reach the end we break out of the loop and let the
+ // following code report an error.
+ break;
+ } else if (current() == ']') {
+ if (unicode_) {
+ AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges,
+ char_class, first, ignore_case_);
+ } else {
+ AddCharOrEscape(alloc, ranges, char_class, first);
+ }
+ ranges->append(CharacterRange::Singleton('-'));
+ break;
+ }
+ char16_t char_class_2 = kNoCharClass;
+ widechar next = 0;
+ if (!ParseClassAtom(&char_class_2, &next))
+ return nullptr;
+ if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
+ if (unicode_)
+ return ReportError(JSMSG_RANGE_WITH_CLASS_ESCAPE);
+
+ // Either end is an escaped character class. Treat the '-' verbatim.
+ AddCharOrEscape(alloc, ranges, char_class, first);
+ ranges->append(CharacterRange::Singleton('-'));
+ AddCharOrEscape(alloc, ranges, char_class_2, next);
+ continue;
+ }
+ if (first > next)
+ return ReportError(JSMSG_BAD_CLASS_RANGE);
+ if (unicode_)
+ AddUnicodeRange(alloc, ranges, lead_ranges, trail_ranges,wide_ranges, first, next);
+ else
+ ranges->append(CharacterRange::Range(first, next));
+ } else {
+ if (unicode_) {
+ AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges,
+ char_class, first, ignore_case_);
+ } else {
+ AddCharOrEscape(alloc, ranges, char_class, first);
+ }
+ }
+ }
+ if (!has_more())
+ return ReportError(JSMSG_UNTERM_CLASS);
+ Advance();
+ if (!unicode_) {
+ if (ranges->length() == 0) {
+ ranges->append(CharacterRange::Everything());
+ is_negated = !is_negated;
+ }
+ return alloc->newInfallible<RegExpCharacterClass>(ranges, is_negated);
+ }
+
+ if (!is_negated && ranges->length() == 0 && lead_ranges->length() == 0 &&
+ trail_ranges->length() == 0 && wide_ranges->length() == 0)
+ {
+ ranges->append(CharacterRange::Everything());
+ return alloc->newInfallible<RegExpCharacterClass>(ranges, true);
+ }
+
+ return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, is_negated,
+ ignore_case_);
+}
+
+template <typename CharT>
+bool
+RegExpParser<CharT>::ParseClassAtom(char16_t* char_class, widechar* value)
+{
+ MOZ_ASSERT(*char_class == kNoCharClass);
+ widechar first = current();
+ if (first == '\\') {
+ switch (Next()) {
+ case 'w': case 'W': case 'd': case 'D': case 's': case 'S': {
+ *char_class = Next();
+ Advance(2);
+ return true;
+ }
+ case kEndMarker:
+ return ReportError(JSMSG_ESCAPE_AT_END_OF_REGEXP);
+ default:
+ if (!ParseClassCharacterEscape(value))
+ return false;
+ return true;
+ }
+ } else {
+ if (unicode_) {
+ char16_t lead, trail;
+ if (ParseRawSurrogatePair(&lead, &trail)) {
+ *value = unicode::UTF16Decode(lead, trail);
+ return true;
+ }
+ }
+ Advance();
+ *value = first;
+ return true;
+ }
+}
+
+// In order to know whether an escape is a backreference or not we have to scan
+// the entire regexp and find the number of capturing parentheses. However we
+// don't want to scan the regexp twice unless it is necessary. This mini-parser
+// is called when needed. It can see the difference between capturing and
+// noncapturing parentheses and can skip character classes and backslash-escaped
+// characters.
+template <typename CharT>
+void
+RegExpParser<CharT>::ScanForCaptures()
+{
+ // Start with captures started previous to current position
+ int capture_count = captures_started();
+ // Add count of captures after this position.
+ widechar n;
+ while ((n = current()) != kEndMarker) {
+ Advance();
+ switch (n) {
+ case '\\':
+ Advance();
+ break;
+ case '[': {
+ widechar c;
+ while ((c = current()) != kEndMarker) {
+ Advance();
+ if (c == '\\') {
+ Advance();
+ } else {
+ if (c == ']') break;
+ }
+ }
+ break;
+ }
+ case '(':
+ if (current() != '?') capture_count++;
+ break;
+ }
+ }
+ capture_count_ = capture_count;
+ is_scanned_for_captures_ = true;
+}
+
+inline bool
+IsInRange(int value, int lower_limit, int higher_limit)
+{
+ MOZ_ASSERT(lower_limit <= higher_limit);
+ return static_cast<unsigned int>(value - lower_limit) <=
+ static_cast<unsigned int>(higher_limit - lower_limit);
+}
+
+inline bool
+IsDecimalDigit(widechar c)
+{
+ // ECMA-262, 3rd, 7.8.3 (p 16)
+ return IsInRange(c, '0', '9');
+}
+
+template <typename CharT>
+bool
+RegExpParser<CharT>::ParseBackReferenceIndex(int* index_out)
+{
+ MOZ_ASSERT('\\' == current());
+ MOZ_ASSERT('1' <= Next() && Next() <= '9');
+
+ // Try to parse a decimal literal that is no greater than the total number
+ // of left capturing parentheses in the input.
+ const CharT* start = position();
+ int value = Next() - '0';
+ Advance(2);
+ while (true) {
+ widechar c = current();
+ if (IsDecimalDigit(c)) {
+ value = 10 * value + (c - '0');
+ if (value > kMaxCaptures) {
+ Reset(start);
+ return false;
+ }
+ Advance();
+ } else {
+ break;
+ }
+ }
+ if (value > captures_started()) {
+ if (!is_scanned_for_captures_) {
+ const CharT* saved_position = position();
+ ScanForCaptures();
+ Reset(saved_position);
+ }
+ if (value > capture_count_) {
+ Reset(start);
+ return false;
+ }
+ }
+ *index_out = value;
+ return true;
+}
+
+// QuantifierPrefix ::
+// { DecimalDigits }
+// { DecimalDigits , }
+// { DecimalDigits , DecimalDigits }
+//
+// Returns true if parsing succeeds, and set the min_out and max_out
+// values. Values are truncated to RegExpTree::kInfinity if they overflow.
+template <typename CharT>
+bool
+RegExpParser<CharT>::ParseIntervalQuantifier(int* min_out, int* max_out)
+{
+ MOZ_ASSERT(current() == '{');
+ const CharT* start = position();
+ Advance();
+ int min = 0;
+ if (!IsDecimalDigit(current())) {
+ Reset(start);
+ return false;
+ }
+ while (IsDecimalDigit(current())) {
+ int next = current() - '0';
+ if (min > (RegExpTree::kInfinity - next) / 10) {
+ // Overflow. Skip past remaining decimal digits and return -1.
+ do {
+ Advance();
+ } while (IsDecimalDigit(current()));
+ min = RegExpTree::kInfinity;
+ break;
+ }
+ min = 10 * min + next;
+ Advance();
+ }
+ int max = 0;
+ if (current() == '}') {
+ max = min;
+ Advance();
+ } else if (current() == ',') {
+ Advance();
+ if (current() == '}') {
+ max = RegExpTree::kInfinity;
+ Advance();
+ } else {
+ while (IsDecimalDigit(current())) {
+ int next = current() - '0';
+ if (max > (RegExpTree::kInfinity - next) / 10) {
+ do {
+ Advance();
+ } while (IsDecimalDigit(current()));
+ max = RegExpTree::kInfinity;
+ break;
+ }
+ max = 10 * max + next;
+ Advance();
+ }
+ if (current() != '}') {
+ Reset(start);
+ return false;
+ }
+ Advance();
+ }
+ } else {
+ Reset(start);
+ return false;
+ }
+ *min_out = min;
+ *max_out = max;
+ return true;
+}
+
+// Pattern ::
+// Disjunction
+template <typename CharT>
+RegExpTree*
+RegExpParser<CharT>::ParsePattern()
+{
+ RegExpTree* result = ParseDisjunction();
+ MOZ_ASSERT_IF(result, !has_more());
+ return result;
+}
+
+static inline RegExpTree*
+CaseFoldingSurrogatePairAtom(LifoAlloc* alloc, char16_t lead, char16_t trail, int32_t diff)
+{
+ RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
+
+ builder->AddCharacter(lead);
+ CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+ ranges->append(CharacterRange::Range(trail, trail));
+ ranges->append(CharacterRange::Range(trail + diff, trail + diff));
+ builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, false));
+
+ return builder->ToRegExp();
+}
+
+static inline RegExpTree*
+SurrogatePairAtom(LifoAlloc* alloc, char16_t lead, char16_t trail, bool ignore_case)
+{
+ if (ignore_case) {
+#define CALL_ATOM(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \
+ if (lead == LEAD &&trail >= TRAIL_FROM && trail <= TRAIL_TO) \
+ return CaseFoldingSurrogatePairAtom(alloc, lead, trail, DIFF);
+ FOR_EACH_NON_BMP_CASE_FOLDING(CALL_ATOM)
+ FOR_EACH_NON_BMP_REV_CASE_FOLDING(CALL_ATOM)
+#undef CALL_ATOM
+ }
+
+ RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
+ builder->AddCharacter(lead);
+ builder->AddCharacter(trail);
+ return builder->ToRegExp();
+}
+
+static inline RegExpTree*
+LeadSurrogateAtom(LifoAlloc* alloc, char16_t value)
+{
+ RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
+ builder->AddCharacter(value);
+ builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin,
+ unicode::TrailSurrogateMax));
+ return builder->ToRegExp();
+}
+
+static inline RegExpTree*
+TrailSurrogateAtom(LifoAlloc* alloc, char16_t value)
+{
+ RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
+ builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
+ RegExpAssertion::NOT_AFTER_LEAD_SURROGATE));
+ builder->AddCharacter(value);
+ return builder->ToRegExp();
+}
+
+static inline RegExpTree*
+UnicodeEverythingAtom(LifoAlloc* alloc)
+{
+ RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
+
+ // everything except \x0a, \x0d, \u2028 and \u2029
+
+ CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+ ranges->append(CharacterRange::Range(0x0, 0x09));
+ ranges->append(CharacterRange::Range(0x0b, 0x0c));
+ ranges->append(CharacterRange::Range(0x0e, 0x2027));
+ ranges->append(CharacterRange::Range(0x202A, unicode::LeadSurrogateMin - 1));
+ ranges->append(CharacterRange::Range(unicode::TrailSurrogateMax + 1, unicode::UTF16Max));
+ builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, false));
+
+ builder->NewAlternative();
+
+ builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
+ builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin,
+ unicode::TrailSurrogateMax));
+
+ builder->NewAlternative();
+
+ builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
+ RegExpAssertion::NOT_AFTER_LEAD_SURROGATE));
+ builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
+
+ builder->NewAlternative();
+
+ builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
+ builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
+
+ return builder->ToRegExp();
+}
+
+RegExpTree*
+UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class, bool ignore_case)
+{
+ CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+ CharacterRangeVector* lead_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+ CharacterRangeVector* trail_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+ WideCharRangeVector* wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc);
+ AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, char_class, 0,
+ ignore_case);
+
+ return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, false, false);
+}
+
+static inline RegExpTree*
+UnicodeBackReferenceAtom(LifoAlloc* alloc, RegExpTree* atom)
+{
+ // If a back reference has a standalone lead surrogate as its last
+ // character, then that lead surrogate shouldn't match lead surrogates that
+ // are paired with a corresponding trail surrogate.
+ RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
+
+ builder->AddAtom(atom);
+ builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
+ RegExpAssertion::NOT_IN_SURROGATE_PAIR));
+
+ return builder->ToRegExp();
+}
+
+// Disjunction ::
+// Alternative
+// Alternative | Disjunction
+// Alternative ::
+// [empty]
+// Term Alternative
+// Term ::
+// Assertion
+// Atom
+// Atom Quantifier
+template <typename CharT>
+RegExpTree*
+RegExpParser<CharT>::ParseDisjunction()
+{
+ // Used to store current state while parsing subexpressions.
+ RegExpParserState initial_state(alloc, nullptr, INITIAL, 0);
+ RegExpParserState* stored_state = &initial_state;
+ // Cache the builder in a local variable for quick access.
+ RegExpBuilder* builder = initial_state.builder();
+ while (true) {
+ switch (current()) {
+ case kEndMarker:
+ if (stored_state->IsSubexpression()) {
+ // Inside a parenthesized group when hitting end of input.
+ return ReportError(JSMSG_MISSING_PAREN);
+ }
+ MOZ_ASSERT(INITIAL == stored_state->group_type());
+ // Parsing completed successfully.
+ return builder->ToRegExp();
+ case ')': {
+ if (!stored_state->IsSubexpression())
+ return ReportError(JSMSG_UNMATCHED_RIGHT_PAREN);
+ MOZ_ASSERT(INITIAL != stored_state->group_type());
+
+ Advance();
+ // End disjunction parsing and convert builder content to new single
+ // regexp atom.
+ RegExpTree* body = builder->ToRegExp();
+
+ int end_capture_index = captures_started();
+
+ int capture_index = stored_state->capture_index();
+ SubexpressionType group_type = stored_state->group_type();
+
+ // Restore previous state.
+ stored_state = stored_state->previous_state();
+ builder = stored_state->builder();
+
+ // Build result of subexpression.
+ if (group_type == CAPTURE) {
+ RegExpCapture* capture = alloc->newInfallible<RegExpCapture>(body, capture_index);
+ (*captures_)[capture_index - 1] = capture;
+ body = capture;
+ } else if (group_type != GROUPING) {
+ MOZ_ASSERT(group_type == POSITIVE_LOOKAHEAD ||
+ group_type == NEGATIVE_LOOKAHEAD);
+ bool is_positive = (group_type == POSITIVE_LOOKAHEAD);
+ body = alloc->newInfallible<RegExpLookahead>(body,
+ is_positive,
+ end_capture_index - capture_index,
+ capture_index);
+ }
+ builder->AddAtom(body);
+ if (unicode_ && (group_type == POSITIVE_LOOKAHEAD || group_type == NEGATIVE_LOOKAHEAD))
+ continue;
+ // For compatability with JSC and ES3, we allow quantifiers after
+ // lookaheads, and break in all cases.
+ break;
+ }
+ case '|': {
+ Advance();
+ builder->NewAlternative();
+ continue;
+ }
+ case '*':
+ case '+':
+ case '?':
+ return ReportError(JSMSG_NOTHING_TO_REPEAT);
+ case '^': {
+ Advance();
+ if (multiline_) {
+ builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::START_OF_LINE));
+ } else {
+ builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::START_OF_INPUT));
+ set_contains_anchor();
+ }
+ continue;
+ }
+ case '$': {
+ Advance();
+ RegExpAssertion::AssertionType assertion_type =
+ multiline_ ? RegExpAssertion::END_OF_LINE :
+ RegExpAssertion::END_OF_INPUT;
+ builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(assertion_type));
+ continue;
+ }
+ case '.': {
+ Advance();
+ // everything except \x0a, \x0d, \u2028 and \u2029
+ if (unicode_) {
+ builder->AddAtom(UnicodeEverythingAtom(alloc));
+ break;
+ }
+ CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+ CharacterRange::AddClassEscape(alloc, '.', ranges);
+ RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false);
+ builder->AddAtom(atom);
+ break;
+ }
+ case '(': {
+ SubexpressionType subexpr_type = CAPTURE;
+ Advance();
+ if (current() == '?') {
+ switch (Next()) {
+ case ':':
+ subexpr_type = GROUPING;
+ break;
+ case '=':
+ subexpr_type = POSITIVE_LOOKAHEAD;
+ break;
+ case '!':
+ subexpr_type = NEGATIVE_LOOKAHEAD;
+ break;
+ default:
+ return ReportError(JSMSG_INVALID_GROUP);
+ }
+ Advance(2);
+ } else {
+ if (captures_ == nullptr)
+ captures_ = alloc->newInfallible<RegExpCaptureVector>(*alloc);
+ if (captures_started() >= kMaxCaptures)
+ return ReportError(JSMSG_TOO_MANY_PARENS);
+ captures_->append((RegExpCapture*) nullptr);
+ }
+ // Store current state and begin new disjunction parsing.
+ stored_state = alloc->newInfallible<RegExpParserState>(alloc, stored_state, subexpr_type,
+ captures_started());
+ builder = stored_state->builder();
+ continue;
+ }
+ case '[': {
+ RegExpTree* atom = ParseCharacterClass();
+ if (!atom)
+ return nullptr;
+ builder->AddAtom(atom);
+ break;
+ }
+ // Atom ::
+ // \ AtomEscape
+ case '\\':
+ switch (Next()) {
+ case kEndMarker:
+ return ReportError(JSMSG_ESCAPE_AT_END_OF_REGEXP);
+ case 'b':
+ Advance(2);
+ builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::BOUNDARY));
+ continue;
+ case 'B':
+ Advance(2);
+ builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::NON_BOUNDARY));
+ continue;
+ // AtomEscape ::
+ // CharacterClassEscape
+ //
+ // CharacterClassEscape :: one of
+ // d D s S w W
+ case 'D': case 'S': case 'W':
+ if (unicode_) {
+ Advance();
+ builder->AddAtom(UnicodeCharacterClassEscapeAtom(alloc, current(),
+ ignore_case_));
+ Advance();
+ break;
+ }
+ MOZ_FALLTHROUGH;
+ case 'd': case 's': case 'w': {
+ widechar c = Next();
+ Advance(2);
+ CharacterRangeVector* ranges =
+ alloc->newInfallible<CharacterRangeVector>(*alloc);
+ if (unicode_)
+ CharacterRange::AddClassEscapeUnicode(alloc, c, ranges, ignore_case_);
+ else
+ CharacterRange::AddClassEscape(alloc, c, ranges);
+ RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false);
+ builder->AddAtom(atom);
+ break;
+ }
+ case '1': case '2': case '3': case '4': case '5': case '6':
+ case '7': case '8': case '9': {
+ int index = 0;
+ if (ParseBackReferenceIndex(&index)) {
+ RegExpCapture* capture = nullptr;
+ if (captures_ != nullptr && index <= (int) captures_->length()) {
+ capture = (*captures_)[index - 1];
+ }
+ if (capture == nullptr) {
+ builder->AddEmpty();
+ break;
+ }
+ RegExpTree* atom = alloc->newInfallible<RegExpBackReference>(capture);
+ if (unicode_)
+ builder->AddAtom(UnicodeBackReferenceAtom(alloc, atom));
+ else
+ builder->AddAtom(atom);
+ break;
+ }
+ if (unicode_)
+ return ReportError(JSMSG_BACK_REF_OUT_OF_RANGE);
+ widechar first_digit = Next();
+ if (first_digit == '8' || first_digit == '9') {
+ // Treat as identity escape
+ builder->AddCharacter(first_digit);
+ Advance(2);
+ break;
+ }
+ MOZ_FALLTHROUGH;
+ }
+ case '0': {
+ if (unicode_) {
+ Advance(2);
+ if (IsDecimalDigit(current()))
+ return ReportError(JSMSG_INVALID_DECIMAL_ESCAPE);
+ builder->AddCharacter(0);
+ break;
+ }
+
+ Advance();
+ widechar octal = ParseOctalLiteral();
+ builder->AddCharacter(octal);
+ break;
+ }
+ // ControlEscape :: one of
+ // f n r t v
+ case 'f':
+ Advance(2);
+ builder->AddCharacter('\f');
+ break;
+ case 'n':
+ Advance(2);
+ builder->AddCharacter('\n');
+ break;
+ case 'r':
+ Advance(2);
+ builder->AddCharacter('\r');
+ break;
+ case 't':
+ Advance(2);
+ builder->AddCharacter('\t');
+ break;
+ case 'v':
+ Advance(2);
+ builder->AddCharacter('\v');
+ break;
+ case 'c': {
+ Advance();
+ widechar controlLetter = Next();
+ // Special case if it is an ASCII letter.
+ // Convert lower case letters to uppercase.
+ widechar letter = controlLetter & ~('a' ^ 'A');
+ if (letter < 'A' || 'Z' < letter) {
+ if (unicode_)
+ return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
+ // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
+ // This is outside the specification. We match JSC in
+ // reading the backslash as a literal character instead
+ // of as starting an escape.
+ builder->AddCharacter('\\');
+ } else {
+ Advance(2);
+ builder->AddCharacter(controlLetter & 0x1f);
+ }
+ break;
+ }
+ case 'x': {
+ Advance(2);
+ widechar value;
+ if (ParseHexEscape(2, &value)) {
+ builder->AddCharacter(value);
+ } else {
+ if (unicode_)
+ return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
+ builder->AddCharacter('x');
+ }
+ break;
+ }
+ case 'u': {
+ Advance(2);
+ widechar value;
+ if (unicode_) {
+ if (current() == '{') {
+ if (!ParseBracedHexEscape(&value))
+ return nullptr;
+ if (unicode::IsLeadSurrogate(value)) {
+ builder->AddAtom(LeadSurrogateAtom(alloc, value));
+ } else if (unicode::IsTrailSurrogate(value)) {
+ builder->AddAtom(TrailSurrogateAtom(alloc, value));
+ } else if (value >= unicode::NonBMPMin) {
+ char16_t lead, trail;
+ unicode::UTF16Encode(value, &lead, &trail);
+ builder->AddAtom(SurrogatePairAtom(alloc, lead, trail,
+ ignore_case_));
+ } else {
+ builder->AddCharacter(value);
+ }
+ } else if (ParseHexEscape(4, &value)) {
+ if (unicode::IsLeadSurrogate(value)) {
+ widechar trail;
+ if (ParseTrailSurrogate(&trail)) {
+ builder->AddAtom(SurrogatePairAtom(alloc, value, trail,
+ ignore_case_));
+ } else {
+ builder->AddAtom(LeadSurrogateAtom(alloc, value));
+ }
+ } else if (unicode::IsTrailSurrogate(value)) {
+ builder->AddAtom(TrailSurrogateAtom(alloc, value));
+ } else {
+ builder->AddCharacter(value);
+ }
+ } else {
+ return ReportError(JSMSG_INVALID_UNICODE_ESCAPE);
+ }
+ break;
+ }
+ if (ParseHexEscape(4, &value)) {
+ builder->AddCharacter(value);
+ } else {
+ builder->AddCharacter('u');
+ }
+ break;
+ }
+ default:
+ // Identity escape.
+ if (unicode_ && !IsSyntaxCharacter(Next()))
+ return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
+ builder->AddCharacter(Next());
+ Advance(2);
+ break;
+ }
+ break;
+ case '{': {
+ if (unicode_)
+ return ReportError(JSMSG_RAW_BRACE_IN_REGEP);
+ int dummy;
+ if (ParseIntervalQuantifier(&dummy, &dummy))
+ return ReportError(JSMSG_NOTHING_TO_REPEAT);
+ MOZ_FALLTHROUGH;
+ }
+ default:
+ if (unicode_) {
+ char16_t lead, trail;
+ if (ParseRawSurrogatePair(&lead, &trail)) {
+ builder->AddAtom(SurrogatePairAtom(alloc, lead, trail, ignore_case_));
+ } else {
+ widechar c = current();
+ if (unicode::IsLeadSurrogate(c))
+ builder->AddAtom(LeadSurrogateAtom(alloc, c));
+ else if (unicode::IsTrailSurrogate(c))
+ builder->AddAtom(TrailSurrogateAtom(alloc, c));
+ else if (c == ']')
+ return ReportError(JSMSG_RAW_BRACKET_IN_REGEP);
+ else if (c == '}')
+ return ReportError(JSMSG_RAW_BRACE_IN_REGEP);
+ else
+ builder->AddCharacter(c);
+ Advance();
+ }
+ break;
+ }
+ builder->AddCharacter(current());
+ Advance();
+ break;
+ } // end switch(current())
+
+ int min;
+ int max;
+ switch (current()) {
+ // QuantifierPrefix ::
+ // *
+ // +
+ // ?
+ // {
+ case '*':
+ min = 0;
+ max = RegExpTree::kInfinity;
+ Advance();
+ break;
+ case '+':
+ min = 1;
+ max = RegExpTree::kInfinity;
+ Advance();
+ break;
+ case '?':
+ min = 0;
+ max = 1;
+ Advance();
+ break;
+ case '{':
+ if (ParseIntervalQuantifier(&min, &max)) {
+ if (max < min)
+ return ReportError(JSMSG_NUMBERS_OUT_OF_ORDER);
+ break;
+ } else {
+ continue;
+ }
+ default:
+ continue;
+ }
+ RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
+ if (current() == '?') {
+ quantifier_type = RegExpQuantifier::NON_GREEDY;
+ Advance();
+ }
+ builder->AddQuantifierToAtom(min, max, quantifier_type);
+ }
+}
+
+template class irregexp::RegExpParser<Latin1Char>;
+template class irregexp::RegExpParser<char16_t>;
+
+template <typename CharT>
+static bool
+ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, size_t length,
+ bool multiline, bool match_only, bool unicode, bool ignore_case,
+ bool global, bool sticky, RegExpCompileData* data)
+{
+ if (match_only) {
+ // Try to strip a leading '.*' from the RegExp, but only if it is not
+ // followed by a '?' (which will affect how the .* is parsed). This
+ // pattern will affect the captures produced by the RegExp, but not
+ // whether there is a match or not.
+ if (length >= 3 && chars[0] == '.' && chars[1] == '*' && chars[2] != '?') {
+ chars += 2;
+ length -= 2;
+ }
+
+ // Try to strip a trailing '.*' from the RegExp, which as above will
+ // affect the captures but not whether there is a match. Only do this
+ // when the following conditions are met:
+ // 1. there are no other meta characters in the RegExp, so that we
+ // are sure this will not affect how the RegExp is parsed
+ // 2. global and sticky flags are not set, as lastIndex needs to be
+ // set properly on global or sticky match
+ if (length >= 3 && !HasRegExpMetaChars(chars, length - 2) &&
+ !global && !sticky &&
+ chars[length - 2] == '.' && chars[length - 1] == '*')
+ {
+ length -= 2;
+ }
+ }
+
+ RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, multiline, unicode, ignore_case);
+ data->tree = parser.ParsePattern();
+ if (!data->tree)
+ return false;
+
+ data->simple = parser.simple();
+ data->contains_anchor = parser.contains_anchor();
+ data->capture_count = parser.captures_started();
+ return true;
+}
+
+bool
+irregexp::ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
+ bool multiline, bool match_only, bool unicode, bool ignore_case,
+ bool global, bool sticky, RegExpCompileData* data)
+{
+ JS::AutoCheckCannotGC nogc;
+ return str->hasLatin1Chars()
+ ? ::ParsePattern(ts, alloc, str->latin1Chars(nogc), str->length(),
+ multiline, match_only, unicode, ignore_case, global, sticky, data)
+ : ::ParsePattern(ts, alloc, str->twoByteChars(nogc), str->length(),
+ multiline, match_only, unicode, ignore_case, global, sticky, data);
+}
+
+template <typename CharT>
+static bool
+ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, size_t length,
+ bool unicode)
+{
+ LifoAllocScope scope(&alloc);
+
+ RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, false, unicode, false);
+ return parser.ParsePattern() != nullptr;
+}
+
+bool
+irregexp::ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
+ bool unicode)
+{
+ JS::AutoCheckCannotGC nogc;
+ return str->hasLatin1Chars()
+ ? ::ParsePatternSyntax(ts, alloc, str->latin1Chars(nogc), str->length(), unicode)
+ : ::ParsePatternSyntax(ts, alloc, str->twoByteChars(nogc), str->length(), unicode);
+}