Add m-esr52 at 52.6.0

author: Matt A. Tobin <mattatobin@localhost.localdomain> 2018-02-02 04:16:08 -0500
committer: Matt A. Tobin <mattatobin@localhost.localdomain> 2018-02-02 04:16:08 -0500
commit: 5f8de423f190bbb79a62f804151bc24824fa32d8 (patch)
tree: 10027f336435511475e392454359edea8e25895d /js/src/irregexp/RegExpParser.cpp
parent: 49ee0794b5d912db1f95dce6eb52d781dc210db5 (diff)
download: UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.gz
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.lz
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.xz
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.zip
1 files changed, 1905 insertions, 0 deletions
diff --git a/js/src/irregexp/RegExpParser.cpp b/js/src/irregexp/RegExpParser.cpp
new file mode 100644
index 000000000..ccc6ae3eb
--- /dev/null
+++ b/js/src/irregexp/RegExpParser.cpp
@@ -0,0 +1,1905 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * vim: set ts=8 sts=4 et sw=4 tw=99: */
+
+// Copyright 2012 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "irregexp/RegExpParser.h"
+
+#include "frontend/TokenStream.h"
+
+using namespace js;
+using namespace js::irregexp;
+
+// ----------------------------------------------------------------------------
+// RegExpBuilder
+
+RegExpBuilder::RegExpBuilder(LifoAlloc* alloc)
+  : alloc(alloc),
+    pending_empty_(false),
+    characters_(nullptr)
+#ifdef DEBUG
+  , last_added_(ADD_NONE)
+#endif
+{}
+
+void
+RegExpBuilder::FlushCharacters()
+{
+    pending_empty_ = false;
+    if (characters_ != nullptr) {
+        RegExpTree* atom = alloc->newInfallible<RegExpAtom>(characters_);
+        characters_ = nullptr;
+        text_.Add(alloc, atom);
+#ifdef DEBUG
+        last_added_ = ADD_ATOM;
+#endif
+    }
+}
+
+void
+RegExpBuilder::FlushText()
+{
+    FlushCharacters();
+    int num_text = text_.length();
+    if (num_text == 0)
+        return;
+    if (num_text == 1) {
+        terms_.Add(alloc, text_.last());
+    } else {
+        RegExpText* text = alloc->newInfallible<RegExpText>(alloc);
+        for (int i = 0; i < num_text; i++)
+            text_.Get(i)->AppendToText(text);
+        terms_.Add(alloc, text);
+    }
+    text_.Clear();
+}
+
+void
+RegExpBuilder::AddCharacter(char16_t c)
+{
+    pending_empty_ = false;
+    if (characters_ == nullptr)
+        characters_ = alloc->newInfallible<CharacterVector>(*alloc);
+    characters_->append(c);
+#ifdef DEBUG
+    last_added_ = ADD_CHAR;
+#endif
+}
+
+void
+RegExpBuilder::AddEmpty()
+{
+    pending_empty_ = true;
+}
+
+void
+RegExpBuilder::AddAtom(RegExpTree* term)
+{
+    if (term->IsEmpty()) {
+        AddEmpty();
+        return;
+    }
+    if (term->IsTextElement()) {
+        FlushCharacters();
+        text_.Add(alloc, term);
+    } else {
+        FlushText();
+        terms_.Add(alloc, term);
+    }
+#ifdef DEBUG
+    last_added_ = ADD_ATOM;
+#endif
+}
+
+void
+RegExpBuilder::AddAssertion(RegExpTree* assert)
+{
+    FlushText();
+    terms_.Add(alloc, assert);
+#ifdef DEBUG
+    last_added_ = ADD_ASSERT;
+#endif
+}
+
+void
+RegExpBuilder::NewAlternative()
+{
+    FlushTerms();
+}
+
+void
+RegExpBuilder::FlushTerms()
+{
+    FlushText();
+    int num_terms = terms_.length();
+    RegExpTree* alternative;
+    if (num_terms == 0)
+        alternative = RegExpEmpty::GetInstance();
+    else if (num_terms == 1)
+        alternative = terms_.last();
+    else
+        alternative = alloc->newInfallible<RegExpAlternative>(terms_.GetList(alloc));
+    alternatives_.Add(alloc, alternative);
+    terms_.Clear();
+#ifdef DEBUG
+    last_added_ = ADD_NONE;
+#endif
+}
+
+RegExpTree*
+RegExpBuilder::ToRegExp()
+{
+    FlushTerms();
+    int num_alternatives = alternatives_.length();
+    if (num_alternatives == 0) {
+        return RegExpEmpty::GetInstance();
+    }
+    if (num_alternatives == 1) {
+        return alternatives_.last();
+    }
+    return alloc->newInfallible<RegExpDisjunction>(alternatives_.GetList(alloc));
+}
+
+void
+RegExpBuilder::AddQuantifierToAtom(int min, int max,
+                                   RegExpQuantifier::QuantifierType quantifier_type)
+{
+    if (pending_empty_) {
+        pending_empty_ = false;
+        return;
+    }
+    RegExpTree* atom;
+    if (characters_ != nullptr) {
+        MOZ_ASSERT(last_added_ == ADD_CHAR);
+        // Last atom was character.
+        CharacterVector* char_vector = characters_;
+        int num_chars = char_vector->length();
+        if (num_chars > 1) {
+            CharacterVector* prefix = alloc->newInfallible<CharacterVector>(*alloc);
+            prefix->append(char_vector->begin(), num_chars - 1);
+            text_.Add(alloc, alloc->newInfallible<RegExpAtom>(prefix));
+            char_vector = alloc->newInfallible<CharacterVector>(*alloc);
+            char_vector->append((*characters_)[num_chars - 1]);
+        }
+        characters_ = nullptr;
+        atom = alloc->newInfallible<RegExpAtom>(char_vector);
+        FlushText();
+    } else if (text_.length() > 0) {
+        MOZ_ASSERT(last_added_ == ADD_ATOM);
+        atom = text_.RemoveLast();
+        FlushText();
+    } else if (terms_.length() > 0) {
+        MOZ_ASSERT(last_added_ == ADD_ATOM);
+        atom = terms_.RemoveLast();
+        if (atom->max_match() == 0) {
+            // Guaranteed to only match an empty string.
+#ifdef DEBUG
+            last_added_ = ADD_TERM;
+#endif
+            if (min == 0)
+                return;
+            terms_.Add(alloc, atom);
+            return;
+        }
+    } else {
+        // Only call immediately after adding an atom or character!
+        MOZ_CRASH("Bad call");
+    }
+    terms_.Add(alloc, alloc->newInfallible<RegExpQuantifier>(min, max, quantifier_type, atom));
+#ifdef DEBUG
+    last_added_ = ADD_TERM;
+#endif
+}
+
+// ----------------------------------------------------------------------------
+// RegExpParser
+
+template <typename CharT>
+RegExpParser<CharT>::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
+                                  const CharT* chars, const CharT* end, bool multiline_mode,
+                                  bool unicode, bool ignore_case)
+  : ts(ts),
+    alloc(alloc),
+    captures_(nullptr),
+    next_pos_(chars),
+    end_(end),
+    current_(kEndMarker),
+    capture_count_(0),
+    has_more_(true),
+    multiline_(multiline_mode),
+    unicode_(unicode),
+    ignore_case_(ignore_case),
+    simple_(false),
+    contains_anchor_(false),
+    is_scanned_for_captures_(false)
+{
+    Advance();
+}
+
+template <typename CharT>
+RegExpTree*
+RegExpParser<CharT>::ReportError(unsigned errorNumber)
+{
+    gc::AutoSuppressGC suppressGC(ts.context());
+    ts.reportError(errorNumber);
+    return nullptr;
+}
+
+template <typename CharT>
+void
+RegExpParser<CharT>::Advance()
+{
+    if (next_pos_ < end_) {
+        current_ = *next_pos_;
+        next_pos_++;
+    } else {
+        current_ = kEndMarker;
+        has_more_ = false;
+    }
+}
+
+// Returns the value (0 .. 15) of a hexadecimal character c.
+// If c is not a legal hexadecimal character, returns a value < 0.
+inline int
+HexValue(uint32_t c)
+{
+    c -= '0';
+    if (static_cast<unsigned>(c) <= 9) return c;
+    c = (c | 0x20) - ('a' - '0');  // detect 0x11..0x16 and 0x31..0x36.
+    if (static_cast<unsigned>(c) <= 5) return c + 10;
+    return -1;
+}
+
+template <typename CharT>
+widechar
+RegExpParser<CharT>::ParseOctalLiteral()
+{
+    MOZ_ASSERT('0' <= current() && current() <= '7');
+    // For compatibility with some other browsers (not all), we parse
+    // up to three octal digits with a value below 256.
+    widechar value = current() - '0';
+    Advance();
+    if ('0' <= current() && current() <= '7') {
+        value = value * 8 + current() - '0';
+        Advance();
+        if (value < 32 && '0' <= current() && current() <= '7') {
+            value = value * 8 + current() - '0';
+            Advance();
+        }
+    }
+    return value;
+}
+
+template <typename CharT>
+bool
+RegExpParser<CharT>::ParseHexEscape(int length, widechar* value)
+{
+    const CharT* start = position();
+    uint32_t val = 0;
+    bool done = false;
+    for (int i = 0; !done; i++) {
+        widechar c = current();
+        int d = HexValue(c);
+        if (d < 0) {
+            Reset(start);
+            return false;
+        }
+        val = val * 16 + d;
+        Advance();
+        if (i == length - 1) {
+            done = true;
+        }
+    }
+    *value = val;
+    return true;
+}
+
+template <typename CharT>
+bool
+RegExpParser<CharT>::ParseBracedHexEscape(widechar* value)
+{
+    MOZ_ASSERT(current() == '{');
+    Advance();
+
+    bool first = true;
+    uint32_t code = 0;
+    while (true) {
+        widechar c = current();
+        if (c == kEndMarker) {
+            ReportError(JSMSG_INVALID_UNICODE_ESCAPE);
+            return false;
+        }
+        if (c == '}') {
+            if (first) {
+                ReportError(JSMSG_INVALID_UNICODE_ESCAPE);
+                return false;
+            }
+            Advance();
+            break;
+        }
+
+        int d = HexValue(c);
+        if (d < 0) {
+            ReportError(JSMSG_INVALID_UNICODE_ESCAPE);
+            return false;
+        }
+        code = (code << 4) | d;
+        if (code > unicode::NonBMPMax) {
+            ReportError(JSMSG_UNICODE_OVERFLOW);
+            return false;
+        }
+        Advance();
+        first = false;
+    }
+
+    *value = code;
+    return true;
+}
+
+template <typename CharT>
+bool
+RegExpParser<CharT>::ParseTrailSurrogate(widechar* value)
+{
+    if (current() != '\\')
+        return false;
+
+    const CharT* start = position();
+    Advance();
+    if (current() != 'u') {
+        Reset(start);
+        return false;
+    }
+    Advance();
+    if (!ParseHexEscape(4, value)) {
+        Reset(start);
+        return false;
+    }
+    if (!unicode::IsTrailSurrogate(*value)) {
+        Reset(start);
+        return false;
+    }
+    return true;
+}
+
+template <typename CharT>
+bool
+RegExpParser<CharT>::ParseRawSurrogatePair(char16_t* lead, char16_t* trail)
+{
+    widechar c1 = current();
+    if (!unicode::IsLeadSurrogate(c1))
+        return false;
+
+    const CharT* start = position();
+    Advance();
+    widechar c2 = current();
+    if (!unicode::IsTrailSurrogate(c2)) {
+        Reset(start);
+        return false;
+    }
+    Advance();
+    *lead = c1;
+    *trail = c2;
+    return true;
+}
+
+static inline RegExpTree*
+RangeAtom(LifoAlloc* alloc, char16_t from, char16_t to)
+{
+    CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+    ranges->append(CharacterRange::Range(from, to));
+    return alloc->newInfallible<RegExpCharacterClass>(ranges, false);
+}
+
+static inline RegExpTree*
+NegativeLookahead(LifoAlloc* alloc, char16_t from, char16_t to)
+{
+    return alloc->newInfallible<RegExpLookahead>(RangeAtom(alloc, from, to), false, 0, 0);
+}
+
+static bool
+IsSyntaxCharacter(widechar c)
+{
+  switch (c) {
+    case '^':
+    case '$':
+    case '\\':
+    case '.':
+    case '*':
+    case '+':
+    case '?':
+    case '(':
+    case ')':
+    case '[':
+    case ']':
+    case '{':
+    case '}':
+    case '|':
+    case '/':
+      return true;
+    default:
+      return false;
+  }
+}
+
+#ifdef DEBUG
+// Currently only used in an assert.kASSERT.
+static bool
+IsSpecialClassEscape(widechar c)
+{
+  switch (c) {
+    case 'd': case 'D':
+    case 's': case 'S':
+    case 'w': case 'W':
+      return true;
+    default:
+      return false;
+  }
+}
+#endif
+
+template <typename CharT>
+bool
+RegExpParser<CharT>::ParseClassCharacterEscape(widechar* code)
+{
+    MOZ_ASSERT(current() == '\\');
+    MOZ_ASSERT(has_next() && !IsSpecialClassEscape(Next()));
+    Advance();
+    switch (current()) {
+      case 'b':
+        Advance();
+        *code = '\b';
+        return true;
+      // ControlEscape :: one of
+      //   f n r t v
+      case 'f':
+        Advance();
+        *code = '\f';
+        return true;
+      case 'n':
+        Advance();
+        *code = '\n';
+        return true;
+      case 'r':
+        Advance();
+        *code = '\r';
+        return true;
+      case 't':
+        Advance();
+        *code = '\t';
+        return true;
+      case 'v':
+        Advance();
+        *code = '\v';
+        return true;
+      case 'c': {
+        widechar controlLetter = Next();
+        widechar letter = controlLetter & ~('A' ^ 'a');
+        // For compatibility with JSC, inside a character class
+        // we also accept digits and underscore as control characters,
+        // but only in non-unicode mode
+        if ((!unicode_ &&
+             ((controlLetter >= '0' && controlLetter <= '9') ||
+              controlLetter == '_')) ||
+            (letter >= 'A' && letter <= 'Z'))
+        {
+            Advance(2);
+            // Control letters mapped to ASCII control characters in the range
+            // 0x00-0x1f.
+            *code = controlLetter & 0x1f;
+            return true;
+        }
+        if (unicode_) {
+            ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
+            return false;
+        }
+        // We match JSC in reading the backslash as a literal
+        // character instead of as starting an escape.
+        *code = '\\';
+        return true;
+      }
+      case '0': case '1': case '2': case '3': case '4': case '5':
+      case '6': case '7':
+        if (unicode_) {
+            if (current() == '0') {
+                Advance();
+                *code = 0;
+                return true;
+            }
+            ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
+            return false;
+        }
+        // For compatibility, outside of unicode mode, we interpret a decimal
+        // escape that isn't a back reference (and therefore either \0 or not
+        // valid according to the specification) as a 1..3 digit octal
+        // character code.
+        *code = ParseOctalLiteral();
+        return true;
+      case 'x': {
+        Advance();
+        widechar value;
+        if (ParseHexEscape(2, &value)) {
+            *code = value;
+            return true;
+        }
+        if (unicode_) {
+            ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
+            return false;
+        }
+        // If \x is not followed by a two-digit hexadecimal, treat it
+        // as an identity escape in non-unicode mode.
+        *code = 'x';
+        return true;
+      }
+      case 'u': {
+        Advance();
+        widechar value;
+        if (unicode_) {
+            if (current() == '{') {
+                if (!ParseBracedHexEscape(&value))
+                    return false;
+                *code = value;
+                return true;
+            }
+            if (ParseHexEscape(4, &value)) {
+                if (unicode::IsLeadSurrogate(value)) {
+                    widechar trail;
+                    if (ParseTrailSurrogate(&trail)) {
+                        *code = unicode::UTF16Decode(value, trail);
+                        return true;
+                    }
+                }
+                *code = value;
+                return true;
+            }
+            ReportError(JSMSG_INVALID_UNICODE_ESCAPE);
+            return false;
+        }
+        if (ParseHexEscape(4, &value)) {
+            *code = value;
+            return true;
+        }
+        // If \u is not followed by a four-digit or braced hexadecimal, treat it
+        // as an identity escape.
+        *code = 'u';
+        return true;
+      }
+      default: {
+        // Extended identity escape (non-unicode only). We accept any character
+        // that hasn't been matched by a more specific case, not just the subset
+        // required by the ECMAScript specification.
+        widechar result = current();
+        if (unicode_ && result != '-' && !IsSyntaxCharacter(result)) {
+            ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
+            return false;
+        }
+        Advance();
+        *code = result;
+        return true;
+      }
+    }
+    return true;
+}
+
+class WideCharRange
+{
+  public:
+    WideCharRange()
+      : from_(0), to_(0)
+    {}
+
+    WideCharRange(widechar from, widechar to)
+      : from_(from), to_(to)
+    {}
+
+    static inline WideCharRange Singleton(widechar value) {
+        return WideCharRange(value, value);
+    }
+    static inline WideCharRange Range(widechar from, widechar to) {
+        MOZ_ASSERT(from <= to);
+        return WideCharRange(from, to);
+    }
+
+    bool Contains(widechar i) const { return from_ <= i && i <= to_; }
+    widechar from() const { return from_; }
+    widechar to() const { return to_; }
+
+  private:
+    widechar from_;
+    widechar to_;
+};
+
+typedef InfallibleVector<WideCharRange, 1> WideCharRangeVector;
+
+static inline CharacterRange
+LeadSurrogateRange()
+{
+    return CharacterRange::Range(unicode::LeadSurrogateMin, unicode::LeadSurrogateMax);
+}
+
+static inline CharacterRange
+TrailSurrogateRange()
+{
+    return CharacterRange::Range(unicode::TrailSurrogateMin, unicode::TrailSurrogateMax);
+}
+
+static inline WideCharRange
+NonBMPRange()
+{
+    return WideCharRange::Range(unicode::NonBMPMin, unicode::NonBMPMax);
+}
+
+static const char16_t kNoCharClass = 0;
+
+// Adds a character or pre-defined character class to character ranges.
+// If char_class is not kInvalidClass, it's interpreted as a class
+// escape (i.e., 's' means whitespace, from '\s').
+static inline void
+AddCharOrEscape(LifoAlloc* alloc,
+                CharacterRangeVector* ranges,
+                char16_t char_class,
+                widechar c)
+{
+    if (char_class != kNoCharClass)
+        CharacterRange::AddClassEscape(alloc, char_class, ranges);
+    else
+        ranges->append(CharacterRange::Singleton(c));
+}
+
+static inline void
+AddCharOrEscapeUnicode(LifoAlloc* alloc,
+                       CharacterRangeVector* ranges,
+                       CharacterRangeVector* lead_ranges,
+                       CharacterRangeVector* trail_ranges,
+                       WideCharRangeVector* wide_ranges,
+                       char16_t char_class,
+                       widechar c,
+                       bool ignore_case)
+{
+    if (char_class != kNoCharClass) {
+        CharacterRange::AddClassEscapeUnicode(alloc, char_class, ranges, ignore_case);
+        switch (char_class) {
+          case 'S':
+          case 'W':
+          case 'D':
+            lead_ranges->append(LeadSurrogateRange());
+            trail_ranges->append(TrailSurrogateRange());
+            wide_ranges->append(NonBMPRange());
+            break;
+          case '.':
+            MOZ_CRASH("Bad char_class!");
+        }
+        return;
+    }
+
+    if (unicode::IsLeadSurrogate(c))
+        lead_ranges->append(CharacterRange::Singleton(c));
+    else if (unicode::IsTrailSurrogate(c))
+        trail_ranges->append(CharacterRange::Singleton(c));
+    else if (c >= unicode::NonBMPMin)
+        wide_ranges->append(WideCharRange::Singleton(c));
+    else
+        ranges->append(CharacterRange::Singleton(c));
+}
+
+static inline void
+AddUnicodeRange(LifoAlloc* alloc,
+                CharacterRangeVector* ranges,
+                CharacterRangeVector* lead_ranges,
+                CharacterRangeVector* trail_ranges,
+                WideCharRangeVector* wide_ranges,
+                widechar first,
+                widechar next)
+{
+    MOZ_ASSERT(first <= next);
+    if (first < unicode::LeadSurrogateMin) {
+        if (next < unicode::LeadSurrogateMin) {
+            ranges->append(CharacterRange::Range(first, next));
+            return;
+        }
+        ranges->append(CharacterRange::Range(first, unicode::LeadSurrogateMin - 1));
+        first = unicode::LeadSurrogateMin;
+    }
+    if (first <= unicode::LeadSurrogateMax) {
+        if (next <= unicode::LeadSurrogateMax) {
+            lead_ranges->append(CharacterRange::Range(first, next));
+            return;
+        }
+        lead_ranges->append(CharacterRange::Range(first, unicode::LeadSurrogateMax));
+        first = unicode::LeadSurrogateMax + 1;
+    }
+    MOZ_ASSERT(unicode::LeadSurrogateMax + 1 == unicode::TrailSurrogateMin);
+    if (first <= unicode::TrailSurrogateMax) {
+        if (next <= unicode::TrailSurrogateMax) {
+            trail_ranges->append(CharacterRange::Range(first, next));
+            return;
+        }
+        trail_ranges->append(CharacterRange::Range(first, unicode::TrailSurrogateMax));
+        first = unicode::TrailSurrogateMax + 1;
+    }
+    if (first <= unicode::UTF16Max) {
+        if (next <= unicode::UTF16Max) {
+            ranges->append(CharacterRange::Range(first, next));
+            return;
+        }
+        ranges->append(CharacterRange::Range(first, unicode::UTF16Max));
+        first = unicode::NonBMPMin;
+    }
+    MOZ_ASSERT(unicode::UTF16Max + 1 == unicode::NonBMPMin);
+    wide_ranges->append(WideCharRange::Range(first, next));
+}
+
+// Negate a vector of ranges by subtracting its ranges from a range
+// encompassing the full range of possible values.
+template <typename RangeType>
+static inline void
+NegateUnicodeRanges(LifoAlloc* alloc, InfallibleVector<RangeType, 1>** ranges,
+                    RangeType full_range)
+{
+    typedef InfallibleVector<RangeType, 1> RangeVector;
+    RangeVector* tmp_ranges = alloc->newInfallible<RangeVector>(*alloc);
+    tmp_ranges->append(full_range);
+    RangeVector* result_ranges = alloc->newInfallible<RangeVector>(*alloc);
+
+    // Perform the following calculation:
+    //   result_ranges = tmp_ranges - ranges
+    // with the following steps:
+    //   result_ranges = tmp_ranges - ranges[0]
+    //   SWAP(result_ranges, tmp_ranges)
+    //   result_ranges = tmp_ranges - ranges[1]
+    //   SWAP(result_ranges, tmp_ranges)
+    //   ...
+    //   result_ranges = tmp_ranges - ranges[N-1]
+    //   SWAP(result_ranges, tmp_ranges)
+    // The last SWAP is just for simplicity of the loop.
+    for (size_t i = 0; i < (*ranges)->length(); i++) {
+        result_ranges->clear();
+
+        const RangeType& range = (**ranges)[i];
+        for (size_t j = 0; j < tmp_ranges->length(); j++) {
+            const RangeType& tmpRange = (*tmp_ranges)[j];
+            auto from1 = tmpRange.from();
+            auto to1 = tmpRange.to();
+            auto from2 = range.from();
+            auto to2 = range.to();
+
+            if (from1 < from2) {
+                if (to1 < from2) {
+                    result_ranges->append(tmpRange);
+                } else if (to1 <= to2) {
+                    result_ranges->append(RangeType::Range(from1, from2 - 1));
+                } else {
+                    result_ranges->append(RangeType::Range(from1, from2 - 1));
+                    result_ranges->append(RangeType::Range(to2 + 1, to1));
+                }
+            } else if (from1 <= to2) {
+                if (to1 > to2)
+                    result_ranges->append(RangeType::Range(to2 + 1, to1));
+            } else {
+                result_ranges->append(tmpRange);
+            }
+        }
+
+        auto tmp = tmp_ranges;
+        tmp_ranges = result_ranges;
+        result_ranges = tmp;
+    }
+
+    // After the loop, result is pointed at by tmp_ranges, instead of
+    // result_ranges.
+    *ranges = tmp_ranges;
+}
+
+static bool
+WideCharRangesContain(WideCharRangeVector* wide_ranges, widechar c)
+{
+    for (size_t i = 0; i < wide_ranges->length(); i++) {
+        const WideCharRange& range = (*wide_ranges)[i];
+        if (range.Contains(c))
+            return true;
+    }
+    return false;
+}
+
+static void
+CalculateCaseInsensitiveRanges(LifoAlloc* alloc, widechar from, widechar to, int32_t diff,
+                               WideCharRangeVector* wide_ranges,
+                               WideCharRangeVector** tmp_wide_ranges)
+{
+    widechar contains_from = 0;
+    widechar contains_to = 0;
+    for (widechar c = from; c <= to; c++) {
+        if (WideCharRangesContain(wide_ranges, c) &&
+            !WideCharRangesContain(wide_ranges, c + diff))
+        {
+            if (contains_from == 0)
+                contains_from = c;
+            contains_to = c;
+        } else if (contains_from != 0) {
+            if (!*tmp_wide_ranges)
+                *tmp_wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc);
+
+            (*tmp_wide_ranges)->append(WideCharRange::Range(contains_from + diff,
+                                                            contains_to + diff));
+            contains_from = 0;
+        }
+    }
+
+    if (contains_from != 0) {
+        if (!*tmp_wide_ranges)
+            *tmp_wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc);
+
+        (*tmp_wide_ranges)->append(WideCharRange::Range(contains_from + diff,
+                                                        contains_to + diff));
+    }
+}
+
+static RegExpTree*
+UnicodeRangesAtom(LifoAlloc* alloc,
+                  CharacterRangeVector* ranges,
+                  CharacterRangeVector* lead_ranges,
+                  CharacterRangeVector* trail_ranges,
+                  WideCharRangeVector* wide_ranges,
+                  bool is_negated,
+                  bool ignore_case)
+{
+    // Calculate case folding for non-BMP first and negate the range if needed.
+    if (ignore_case) {
+        WideCharRangeVector* tmp_wide_ranges = nullptr;
+#define CALL_CALC(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \
+        CalculateCaseInsensitiveRanges(alloc, FROM, TO, DIFF, wide_ranges, &tmp_wide_ranges);
+        FOR_EACH_NON_BMP_CASE_FOLDING(CALL_CALC)
+        FOR_EACH_NON_BMP_REV_CASE_FOLDING(CALL_CALC)
+#undef CALL_CALC
+
+        if (tmp_wide_ranges) {
+            for (size_t i = 0; i < tmp_wide_ranges->length(); i++)
+                wide_ranges->append((*tmp_wide_ranges)[i]);
+        }
+    }
+
+    if (is_negated) {
+        NegateUnicodeRanges(alloc, &lead_ranges, LeadSurrogateRange());
+        NegateUnicodeRanges(alloc, &trail_ranges, TrailSurrogateRange());
+        NegateUnicodeRanges(alloc, &wide_ranges, NonBMPRange());
+    }
+
+    RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
+
+    bool added = false;
+
+    if (is_negated) {
+        ranges->append(LeadSurrogateRange());
+        ranges->append(TrailSurrogateRange());
+    }
+    if (ranges->length() > 0) {
+        builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, is_negated));
+        added = true;
+    }
+
+    if (lead_ranges->length() > 0) {
+        if (added)
+            builder->NewAlternative();
+        builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(lead_ranges, false));
+        builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin,
+                                           unicode::TrailSurrogateMax));
+        added = true;
+    }
+
+    if (trail_ranges->length() > 0) {
+        if (added)
+            builder->NewAlternative();
+        builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
+            RegExpAssertion::NOT_AFTER_LEAD_SURROGATE));
+        builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(trail_ranges, false));
+        added = true;
+    }
+
+    for (size_t i = 0; i < wide_ranges->length(); i++) {
+        if (added)
+            builder->NewAlternative();
+
+        const WideCharRange& range = (*wide_ranges)[i];
+        widechar from = range.from();
+        widechar to = range.to();
+        char16_t from_lead, from_trail;
+        char16_t to_lead, to_trail;
+
+        unicode::UTF16Encode(from, &from_lead, &from_trail);
+        if (from == to) {
+            builder->AddCharacter(from_lead);
+            builder->AddCharacter(from_trail);
+        } else {
+            unicode::UTF16Encode(to, &to_lead, &to_trail);
+            if (from_lead == to_lead) {
+                MOZ_ASSERT(from_trail != to_trail);
+                builder->AddCharacter(from_lead);
+                builder->AddAtom(RangeAtom(alloc, from_trail, to_trail));
+            } else if (from_trail == unicode::TrailSurrogateMin &&
+                       to_trail == unicode::TrailSurrogateMax)
+            {
+                builder->AddAtom(RangeAtom(alloc, from_lead, to_lead));
+                builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin,
+                                           unicode::TrailSurrogateMax));
+            } else if (from_lead + 1 == to_lead) {
+                builder->AddCharacter(from_lead);
+                builder->AddAtom(RangeAtom(alloc, from_trail, unicode::TrailSurrogateMax));
+
+                builder->NewAlternative();
+
+                builder->AddCharacter(to_lead);
+                builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, to_trail));
+            } else if (from_lead + 2 == to_lead) {
+                builder->AddCharacter(from_lead);
+                builder->AddAtom(RangeAtom(alloc, from_trail, unicode::TrailSurrogateMax));
+
+                builder->NewAlternative();
+
+                builder->AddCharacter(from_lead + 1);
+                builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin,
+                                           unicode::TrailSurrogateMax));
+
+                builder->NewAlternative();
+
+                builder->AddCharacter(to_lead);
+                builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, to_trail));
+            } else {
+                builder->AddCharacter(from_lead);
+                builder->AddAtom(RangeAtom(alloc, from_trail, unicode::TrailSurrogateMax));
+
+                builder->NewAlternative();
+
+                builder->AddAtom(RangeAtom(alloc, from_lead + 1, to_lead - 1));
+                builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin,
+                                           unicode::TrailSurrogateMax));
+
+                builder->NewAlternative();
+
+                builder->AddCharacter(to_lead);
+                builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, to_trail));
+            }
+        }
+        added = true;
+    }
+
+    return builder->ToRegExp();
+}
+
+template <typename CharT>
+RegExpTree*
+RegExpParser<CharT>::ParseCharacterClass()
+{
+    MOZ_ASSERT(current() == '[');
+    Advance();
+    bool is_negated = false;
+    if (current() == '^') {
+        is_negated = true;
+        Advance();
+    }
+    CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+    CharacterRangeVector* lead_ranges = nullptr;
+    CharacterRangeVector* trail_ranges = nullptr;
+    WideCharRangeVector* wide_ranges = nullptr;
+
+    if (unicode_) {
+        lead_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+        trail_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+        wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc);
+    }
+
+    while (has_more() && current() != ']') {
+        char16_t char_class = kNoCharClass;
+        widechar first = 0;
+        if (!ParseClassAtom(&char_class, &first))
+            return nullptr;
+        if (current() == '-') {
+            Advance();
+            if (current() == kEndMarker) {
+                // If we reach the end we break out of the loop and let the
+                // following code report an error.
+                break;
+            } else if (current() == ']') {
+                if (unicode_) {
+                    AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges,
+                                           char_class, first, ignore_case_);
+                } else {
+                    AddCharOrEscape(alloc, ranges, char_class, first);
+                }
+                ranges->append(CharacterRange::Singleton('-'));
+                break;
+            }
+            char16_t char_class_2 = kNoCharClass;
+            widechar next = 0;
+            if (!ParseClassAtom(&char_class_2, &next))
+                return nullptr;
+            if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
+                if (unicode_)
+                    return ReportError(JSMSG_RANGE_WITH_CLASS_ESCAPE);
+
+                // Either end is an escaped character class. Treat the '-' verbatim.
+                AddCharOrEscape(alloc, ranges, char_class, first);
+                ranges->append(CharacterRange::Singleton('-'));
+                AddCharOrEscape(alloc, ranges, char_class_2, next);
+                continue;
+            }
+            if (first > next)
+                return ReportError(JSMSG_BAD_CLASS_RANGE);
+            if (unicode_)
+                AddUnicodeRange(alloc, ranges, lead_ranges, trail_ranges,wide_ranges, first, next);
+            else
+                ranges->append(CharacterRange::Range(first, next));
+        } else {
+            if (unicode_) {
+                AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges,
+                                       char_class, first, ignore_case_);
+            } else {
+                AddCharOrEscape(alloc, ranges, char_class, first);
+            }
+        }
+    }
+    if (!has_more())
+        return ReportError(JSMSG_UNTERM_CLASS);
+    Advance();
+    if (!unicode_) {
+        if (ranges->length() == 0) {
+            ranges->append(CharacterRange::Everything());
+            is_negated = !is_negated;
+        }
+        return alloc->newInfallible<RegExpCharacterClass>(ranges, is_negated);
+    }
+
+    if (!is_negated && ranges->length() == 0 && lead_ranges->length() == 0 &&
+        trail_ranges->length() == 0 && wide_ranges->length() == 0)
+    {
+        ranges->append(CharacterRange::Everything());
+        return alloc->newInfallible<RegExpCharacterClass>(ranges, true);
+    }
+
+    return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, is_negated,
+                             ignore_case_);
+}
+
+template <typename CharT>
+bool
+RegExpParser<CharT>::ParseClassAtom(char16_t* char_class, widechar* value)
+{
+    MOZ_ASSERT(*char_class == kNoCharClass);
+    widechar first = current();
+    if (first == '\\') {
+        switch (Next()) {
+          case 'w': case 'W': case 'd': case 'D': case 's': case 'S': {
+            *char_class = Next();
+            Advance(2);
+            return true;
+          }
+          case kEndMarker:
+            return ReportError(JSMSG_ESCAPE_AT_END_OF_REGEXP);
+          default:
+            if (!ParseClassCharacterEscape(value))
+                return false;
+            return true;
+        }
+    } else {
+        if (unicode_) {
+            char16_t lead, trail;
+            if (ParseRawSurrogatePair(&lead, &trail)) {
+                *value = unicode::UTF16Decode(lead, trail);
+                return true;
+            }
+        }
+        Advance();
+        *value = first;
+        return true;
+    }
+}
+
+// In order to know whether an escape is a backreference or not we have to scan
+// the entire regexp and find the number of capturing parentheses.  However we
+// don't want to scan the regexp twice unless it is necessary.  This mini-parser
+// is called when needed.  It can see the difference between capturing and
+// noncapturing parentheses and can skip character classes and backslash-escaped
+// characters.
+template <typename CharT>
+void
+RegExpParser<CharT>::ScanForCaptures()
+{
+    // Start with captures started previous to current position
+    int capture_count = captures_started();
+    // Add count of captures after this position.
+    widechar n;
+    while ((n = current()) != kEndMarker) {
+        Advance();
+        switch (n) {
+          case '\\':
+            Advance();
+            break;
+          case '[': {
+            widechar c;
+            while ((c = current()) != kEndMarker) {
+                Advance();
+                if (c == '\\') {
+                    Advance();
+                } else {
+                    if (c == ']') break;
+                }
+            }
+            break;
+          }
+          case '(':
+            if (current() != '?') capture_count++;
+            break;
+        }
+    }
+    capture_count_ = capture_count;
+    is_scanned_for_captures_ = true;
+}
+
+inline bool
+IsInRange(int value, int lower_limit, int higher_limit)
+{
+    MOZ_ASSERT(lower_limit <= higher_limit);
+    return static_cast<unsigned int>(value - lower_limit) <=
+           static_cast<unsigned int>(higher_limit - lower_limit);
+}
+
+inline bool
+IsDecimalDigit(widechar c)
+{
+    // ECMA-262, 3rd, 7.8.3 (p 16)
+    return IsInRange(c, '0', '9');
+}
+
+template <typename CharT>
+bool
+RegExpParser<CharT>::ParseBackReferenceIndex(int* index_out)
+{
+    MOZ_ASSERT('\\' == current());
+    MOZ_ASSERT('1' <= Next() && Next() <= '9');
+
+    // Try to parse a decimal literal that is no greater than the total number
+    // of left capturing parentheses in the input.
+    const CharT* start = position();
+    int value = Next() - '0';
+    Advance(2);
+    while (true) {
+        widechar c = current();
+        if (IsDecimalDigit(c)) {
+            value = 10 * value + (c - '0');
+            if (value > kMaxCaptures) {
+                Reset(start);
+                return false;
+            }
+            Advance();
+        } else {
+            break;
+        }
+    }
+    if (value > captures_started()) {
+        if (!is_scanned_for_captures_) {
+            const CharT* saved_position = position();
+            ScanForCaptures();
+            Reset(saved_position);
+        }
+        if (value > capture_count_) {
+            Reset(start);
+            return false;
+        }
+    }
+    *index_out = value;
+    return true;
+}
+
+// QuantifierPrefix ::
+//   { DecimalDigits }
+//   { DecimalDigits , }
+//   { DecimalDigits , DecimalDigits }
+//
+// Returns true if parsing succeeds, and set the min_out and max_out
+// values. Values are truncated to RegExpTree::kInfinity if they overflow.
+template <typename CharT>
+bool
+RegExpParser<CharT>::ParseIntervalQuantifier(int* min_out, int* max_out)
+{
+    MOZ_ASSERT(current() == '{');
+    const CharT* start = position();
+    Advance();
+    int min = 0;
+    if (!IsDecimalDigit(current())) {
+        Reset(start);
+        return false;
+    }
+    while (IsDecimalDigit(current())) {
+        int next = current() - '0';
+        if (min > (RegExpTree::kInfinity - next) / 10) {
+            // Overflow. Skip past remaining decimal digits and return -1.
+            do {
+                Advance();
+            } while (IsDecimalDigit(current()));
+            min = RegExpTree::kInfinity;
+            break;
+        }
+        min = 10 * min + next;
+        Advance();
+    }
+    int max = 0;
+    if (current() == '}') {
+        max = min;
+        Advance();
+    } else if (current() == ',') {
+        Advance();
+        if (current() == '}') {
+            max = RegExpTree::kInfinity;
+            Advance();
+        } else {
+            while (IsDecimalDigit(current())) {
+                int next = current() - '0';
+                if (max > (RegExpTree::kInfinity - next) / 10) {
+                    do {
+                        Advance();
+                    } while (IsDecimalDigit(current()));
+                    max = RegExpTree::kInfinity;
+                    break;
+                }
+                max = 10 * max + next;
+                Advance();
+            }
+            if (current() != '}') {
+                Reset(start);
+                return false;
+            }
+            Advance();
+        }
+    } else {
+        Reset(start);
+        return false;
+    }
+    *min_out = min;
+    *max_out = max;
+    return true;
+}
+
+// Pattern ::
+//   Disjunction
+template <typename CharT>
+RegExpTree*
+RegExpParser<CharT>::ParsePattern()
+{
+    RegExpTree* result = ParseDisjunction();
+    MOZ_ASSERT_IF(result, !has_more());
+    return result;
+}
+
+static inline RegExpTree*
+CaseFoldingSurrogatePairAtom(LifoAlloc* alloc, char16_t lead, char16_t trail, int32_t diff)
+{
+    RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
+
+    builder->AddCharacter(lead);
+    CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+    ranges->append(CharacterRange::Range(trail, trail));
+    ranges->append(CharacterRange::Range(trail + diff, trail + diff));
+    builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, false));
+
+    return builder->ToRegExp();
+}
+
+static inline RegExpTree*
+SurrogatePairAtom(LifoAlloc* alloc, char16_t lead, char16_t trail, bool ignore_case)
+{
+    if (ignore_case) {
+#define CALL_ATOM(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \
+        if (lead == LEAD &&trail >= TRAIL_FROM && trail <= TRAIL_TO) \
+            return CaseFoldingSurrogatePairAtom(alloc, lead, trail, DIFF);
+        FOR_EACH_NON_BMP_CASE_FOLDING(CALL_ATOM)
+        FOR_EACH_NON_BMP_REV_CASE_FOLDING(CALL_ATOM)
+#undef CALL_ATOM
+    }
+
+    RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
+    builder->AddCharacter(lead);
+    builder->AddCharacter(trail);
+    return builder->ToRegExp();
+}
+
+static inline RegExpTree*
+LeadSurrogateAtom(LifoAlloc* alloc, char16_t value)
+{
+    RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
+    builder->AddCharacter(value);
+    builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin,
+                                       unicode::TrailSurrogateMax));
+    return builder->ToRegExp();
+}
+
+static inline RegExpTree*
+TrailSurrogateAtom(LifoAlloc* alloc, char16_t value)
+{
+    RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
+    builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
+        RegExpAssertion::NOT_AFTER_LEAD_SURROGATE));
+    builder->AddCharacter(value);
+    return builder->ToRegExp();
+}
+
+static inline RegExpTree*
+UnicodeEverythingAtom(LifoAlloc* alloc)
+{
+    RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
+
+    // everything except \x0a, \x0d, \u2028 and \u2029
+
+    CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+    ranges->append(CharacterRange::Range(0x0, 0x09));
+    ranges->append(CharacterRange::Range(0x0b, 0x0c));
+    ranges->append(CharacterRange::Range(0x0e, 0x2027));
+    ranges->append(CharacterRange::Range(0x202A, unicode::LeadSurrogateMin - 1));
+    ranges->append(CharacterRange::Range(unicode::TrailSurrogateMax + 1, unicode::UTF16Max));
+    builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, false));
+
+    builder->NewAlternative();
+
+    builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
+    builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin,
+                                       unicode::TrailSurrogateMax));
+
+    builder->NewAlternative();
+
+    builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
+        RegExpAssertion::NOT_AFTER_LEAD_SURROGATE));
+    builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
+
+    builder->NewAlternative();
+
+    builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
+    builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
+
+    return builder->ToRegExp();
+}
+
+RegExpTree*
+UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class, bool ignore_case)
+{
+    CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+    CharacterRangeVector* lead_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+    CharacterRangeVector* trail_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+    WideCharRangeVector* wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc);
+    AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, char_class, 0,
+                           ignore_case);
+
+    return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, false, false);
+}
+
+static inline RegExpTree*
+UnicodeBackReferenceAtom(LifoAlloc* alloc, RegExpTree* atom)
+{
+    // If a back reference has a standalone lead surrogate as its last
+    // character, then that lead surrogate shouldn't match lead surrogates that
+    // are paired with a corresponding trail surrogate.
+    RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
+
+    builder->AddAtom(atom);
+    builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
+        RegExpAssertion::NOT_IN_SURROGATE_PAIR));
+
+    return builder->ToRegExp();
+}
+
+// Disjunction ::
+//   Alternative
+//   Alternative | Disjunction
+// Alternative ::
+//   [empty]
+//   Term Alternative
+// Term ::
+//   Assertion
+//   Atom
+//   Atom Quantifier
+template <typename CharT>
+RegExpTree*
+RegExpParser<CharT>::ParseDisjunction()
+{
+    // Used to store current state while parsing subexpressions.
+    RegExpParserState initial_state(alloc, nullptr, INITIAL, 0);
+    RegExpParserState* stored_state = &initial_state;
+    // Cache the builder in a local variable for quick access.
+    RegExpBuilder* builder = initial_state.builder();
+    while (true) {
+        switch (current()) {
+          case kEndMarker:
+            if (stored_state->IsSubexpression()) {
+                // Inside a parenthesized group when hitting end of input.
+                return ReportError(JSMSG_MISSING_PAREN);
+            }
+            MOZ_ASSERT(INITIAL == stored_state->group_type());
+            // Parsing completed successfully.
+            return builder->ToRegExp();
+          case ')': {
+            if (!stored_state->IsSubexpression())
+                return ReportError(JSMSG_UNMATCHED_RIGHT_PAREN);
+            MOZ_ASSERT(INITIAL != stored_state->group_type());
+
+            Advance();
+            // End disjunction parsing and convert builder content to new single
+            // regexp atom.
+            RegExpTree* body = builder->ToRegExp();
+
+            int end_capture_index = captures_started();
+
+            int capture_index = stored_state->capture_index();
+            SubexpressionType group_type = stored_state->group_type();
+
+            // Restore previous state.
+            stored_state = stored_state->previous_state();
+            builder = stored_state->builder();
+
+            // Build result of subexpression.
+            if (group_type == CAPTURE) {
+                RegExpCapture* capture = alloc->newInfallible<RegExpCapture>(body, capture_index);
+                (*captures_)[capture_index - 1] = capture;
+                body = capture;
+            } else if (group_type != GROUPING) {
+                MOZ_ASSERT(group_type == POSITIVE_LOOKAHEAD ||
+                           group_type == NEGATIVE_LOOKAHEAD);
+                bool is_positive = (group_type == POSITIVE_LOOKAHEAD);
+                body = alloc->newInfallible<RegExpLookahead>(body,
+                                                   is_positive,
+                                                   end_capture_index - capture_index,
+                                                   capture_index);
+            }
+            builder->AddAtom(body);
+            if (unicode_ && (group_type == POSITIVE_LOOKAHEAD || group_type == NEGATIVE_LOOKAHEAD))
+                continue;
+            // For compatability with JSC and ES3, we allow quantifiers after
+            // lookaheads, and break in all cases.
+            break;
+          }
+          case '|': {
+            Advance();
+            builder->NewAlternative();
+            continue;
+          }
+          case '*':
+          case '+':
+          case '?':
+            return ReportError(JSMSG_NOTHING_TO_REPEAT);
+          case '^': {
+            Advance();
+            if (multiline_) {
+                builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::START_OF_LINE));
+            } else {
+                builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::START_OF_INPUT));
+                set_contains_anchor();
+            }
+            continue;
+          }
+          case '$': {
+            Advance();
+            RegExpAssertion::AssertionType assertion_type =
+                multiline_ ? RegExpAssertion::END_OF_LINE :
+                RegExpAssertion::END_OF_INPUT;
+            builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(assertion_type));
+            continue;
+          }
+          case '.': {
+            Advance();
+            // everything except \x0a, \x0d, \u2028 and \u2029
+            if (unicode_) {
+                builder->AddAtom(UnicodeEverythingAtom(alloc));
+                break;
+            }
+            CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+            CharacterRange::AddClassEscape(alloc, '.', ranges);
+            RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false);
+            builder->AddAtom(atom);
+            break;
+          }
+          case '(': {
+            SubexpressionType subexpr_type = CAPTURE;
+            Advance();
+            if (current() == '?') {
+                switch (Next()) {
+                  case ':':
+                    subexpr_type = GROUPING;
+                    break;
+                  case '=':
+                    subexpr_type = POSITIVE_LOOKAHEAD;
+                    break;
+                  case '!':
+                    subexpr_type = NEGATIVE_LOOKAHEAD;
+                    break;
+                  default:
+                    return ReportError(JSMSG_INVALID_GROUP);
+                }
+                Advance(2);
+            } else {
+                if (captures_ == nullptr)
+                    captures_ = alloc->newInfallible<RegExpCaptureVector>(*alloc);
+                if (captures_started() >= kMaxCaptures)
+                    return ReportError(JSMSG_TOO_MANY_PARENS);
+                captures_->append((RegExpCapture*) nullptr);
+            }
+            // Store current state and begin new disjunction parsing.
+            stored_state = alloc->newInfallible<RegExpParserState>(alloc, stored_state, subexpr_type,
+                                                                   captures_started());
+            builder = stored_state->builder();
+            continue;
+          }
+          case '[': {
+            RegExpTree* atom = ParseCharacterClass();
+            if (!atom)
+                return nullptr;
+            builder->AddAtom(atom);
+            break;
+          }
+            // Atom ::
+            //   \ AtomEscape
+          case '\\':
+            switch (Next()) {
+              case kEndMarker:
+                return ReportError(JSMSG_ESCAPE_AT_END_OF_REGEXP);
+              case 'b':
+                Advance(2);
+                builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::BOUNDARY));
+                continue;
+              case 'B':
+                Advance(2);
+                builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::NON_BOUNDARY));
+                continue;
+                // AtomEscape ::
+                //   CharacterClassEscape
+                //
+                // CharacterClassEscape :: one of
+                //   d D s S w W
+              case 'D': case 'S': case 'W':
+                if (unicode_) {
+                    Advance();
+                    builder->AddAtom(UnicodeCharacterClassEscapeAtom(alloc, current(),
+                                                                     ignore_case_));
+                    Advance();
+                    break;
+                }
+                MOZ_FALLTHROUGH;
+              case 'd': case 's': case 'w': {
+                widechar c = Next();
+                Advance(2);
+                CharacterRangeVector* ranges =
+                    alloc->newInfallible<CharacterRangeVector>(*alloc);
+                if (unicode_)
+                    CharacterRange::AddClassEscapeUnicode(alloc, c, ranges, ignore_case_);
+                else
+                    CharacterRange::AddClassEscape(alloc, c, ranges);
+                RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false);
+                builder->AddAtom(atom);
+                break;
+              }
+              case '1': case '2': case '3': case '4': case '5': case '6':
+              case '7': case '8': case '9': {
+                int index = 0;
+                if (ParseBackReferenceIndex(&index)) {
+                    RegExpCapture* capture = nullptr;
+                    if (captures_ != nullptr && index <= (int) captures_->length()) {
+                        capture = (*captures_)[index - 1];
+                    }
+                    if (capture == nullptr) {
+                        builder->AddEmpty();
+                        break;
+                    }
+                    RegExpTree* atom = alloc->newInfallible<RegExpBackReference>(capture);
+                    if (unicode_)
+                        builder->AddAtom(UnicodeBackReferenceAtom(alloc, atom));
+                    else
+                        builder->AddAtom(atom);
+                    break;
+                }
+                if (unicode_)
+                    return ReportError(JSMSG_BACK_REF_OUT_OF_RANGE);
+                widechar first_digit = Next();
+                if (first_digit == '8' || first_digit == '9') {
+                    // Treat as identity escape
+                    builder->AddCharacter(first_digit);
+                    Advance(2);
+                    break;
+                }
+                MOZ_FALLTHROUGH;
+              }
+              case '0': {
+                if (unicode_) {
+                    Advance(2);
+                    if (IsDecimalDigit(current()))
+                        return ReportError(JSMSG_INVALID_DECIMAL_ESCAPE);
+                    builder->AddCharacter(0);
+                    break;
+                }
+
+                Advance();
+                widechar octal = ParseOctalLiteral();
+                builder->AddCharacter(octal);
+                break;
+              }
+                // ControlEscape :: one of
+                //   f n r t v
+              case 'f':
+                Advance(2);
+                builder->AddCharacter('\f');
+                break;
+              case 'n':
+                Advance(2);
+                builder->AddCharacter('\n');
+                break;
+              case 'r':
+                Advance(2);
+                builder->AddCharacter('\r');
+                break;
+              case 't':
+                Advance(2);
+                builder->AddCharacter('\t');
+                break;
+              case 'v':
+                Advance(2);
+                builder->AddCharacter('\v');
+                break;
+              case 'c': {
+                Advance();
+                widechar controlLetter = Next();
+                // Special case if it is an ASCII letter.
+                // Convert lower case letters to uppercase.
+                widechar letter = controlLetter & ~('a' ^ 'A');
+                if (letter < 'A' || 'Z' < letter) {
+                    if (unicode_)
+                        return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
+                    // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
+                    // This is outside the specification. We match JSC in
+                    // reading the backslash as a literal character instead
+                    // of as starting an escape.
+                    builder->AddCharacter('\\');
+                } else {
+                    Advance(2);
+                    builder->AddCharacter(controlLetter & 0x1f);
+                }
+                break;
+              }
+              case 'x': {
+                Advance(2);
+                widechar value;
+                if (ParseHexEscape(2, &value)) {
+                    builder->AddCharacter(value);
+                } else {
+                    if (unicode_)
+                        return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
+                    builder->AddCharacter('x');
+                }
+                break;
+              }
+              case 'u': {
+                Advance(2);
+                widechar value;
+                if (unicode_) {
+                    if (current() == '{') {
+                        if (!ParseBracedHexEscape(&value))
+                            return nullptr;
+                        if (unicode::IsLeadSurrogate(value)) {
+                            builder->AddAtom(LeadSurrogateAtom(alloc, value));
+                        } else if (unicode::IsTrailSurrogate(value)) {
+                            builder->AddAtom(TrailSurrogateAtom(alloc, value));
+                        } else if (value >= unicode::NonBMPMin) {
+                            char16_t lead, trail;
+                            unicode::UTF16Encode(value, &lead, &trail);
+                            builder->AddAtom(SurrogatePairAtom(alloc, lead, trail,
+                                                               ignore_case_));
+                        } else {
+                            builder->AddCharacter(value);
+                        }
+                    } else if (ParseHexEscape(4, &value)) {
+                        if (unicode::IsLeadSurrogate(value)) {
+                            widechar trail;
+                            if (ParseTrailSurrogate(&trail)) {
+                                builder->AddAtom(SurrogatePairAtom(alloc, value, trail,
+                                                                   ignore_case_));
+                            } else {
+                                builder->AddAtom(LeadSurrogateAtom(alloc, value));
+                            }
+                        } else if (unicode::IsTrailSurrogate(value)) {
+                            builder->AddAtom(TrailSurrogateAtom(alloc, value));
+                        } else {
+                            builder->AddCharacter(value);
+                        }
+                    } else {
+                        return ReportError(JSMSG_INVALID_UNICODE_ESCAPE);
+                    }
+                    break;
+                }
+                if (ParseHexEscape(4, &value)) {
+                    builder->AddCharacter(value);
+                } else {
+                    builder->AddCharacter('u');
+                }
+                break;
+              }
+              default:
+                // Identity escape.
+                if (unicode_ && !IsSyntaxCharacter(Next()))
+                    return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
+                builder->AddCharacter(Next());
+                Advance(2);
+                break;
+            }
+            break;
+          case '{': {
+            if (unicode_)
+                return ReportError(JSMSG_RAW_BRACE_IN_REGEP);
+            int dummy;
+            if (ParseIntervalQuantifier(&dummy, &dummy))
+                return ReportError(JSMSG_NOTHING_TO_REPEAT);
+            MOZ_FALLTHROUGH;
+          }
+          default:
+            if (unicode_) {
+                char16_t lead, trail;
+                if (ParseRawSurrogatePair(&lead, &trail)) {
+                    builder->AddAtom(SurrogatePairAtom(alloc, lead, trail, ignore_case_));
+                } else {
+                    widechar c = current();
+                    if (unicode::IsLeadSurrogate(c))
+                        builder->AddAtom(LeadSurrogateAtom(alloc, c));
+                    else if (unicode::IsTrailSurrogate(c))
+                        builder->AddAtom(TrailSurrogateAtom(alloc, c));
+                    else if (c == ']')
+                        return ReportError(JSMSG_RAW_BRACKET_IN_REGEP);
+                    else if (c == '}')
+                        return ReportError(JSMSG_RAW_BRACE_IN_REGEP);
+                    else
+                        builder->AddCharacter(c);
+                    Advance();
+                }
+                break;
+            }
+            builder->AddCharacter(current());
+            Advance();
+            break;
+        }  // end switch(current())
+
+        int min;
+        int max;
+        switch (current()) {
+            // QuantifierPrefix ::
+            //   *
+            //   +
+            //   ?
+            //   {
+          case '*':
+            min = 0;
+            max = RegExpTree::kInfinity;
+            Advance();
+            break;
+          case '+':
+            min = 1;
+            max = RegExpTree::kInfinity;
+            Advance();
+            break;
+          case '?':
+            min = 0;
+            max = 1;
+            Advance();
+            break;
+          case '{':
+            if (ParseIntervalQuantifier(&min, &max)) {
+                if (max < min)
+                    return ReportError(JSMSG_NUMBERS_OUT_OF_ORDER);
+                break;
+            } else {
+                continue;
+            }
+          default:
+            continue;
+        }
+        RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
+        if (current() == '?') {
+            quantifier_type = RegExpQuantifier::NON_GREEDY;
+            Advance();
+        }
+        builder->AddQuantifierToAtom(min, max, quantifier_type);
+    }
+}
+
+template class irregexp::RegExpParser<Latin1Char>;
+template class irregexp::RegExpParser<char16_t>;
+
+template <typename CharT>
+static bool
+ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, size_t length,
+             bool multiline, bool match_only, bool unicode, bool ignore_case,
+             bool global, bool sticky, RegExpCompileData* data)
+{
+    if (match_only) {
+        // Try to strip a leading '.*' from the RegExp, but only if it is not
+        // followed by a '?' (which will affect how the .* is parsed). This
+        // pattern will affect the captures produced by the RegExp, but not
+        // whether there is a match or not.
+        if (length >= 3 && chars[0] == '.' && chars[1] == '*' && chars[2] != '?') {
+            chars += 2;
+            length -= 2;
+        }
+
+        // Try to strip a trailing '.*' from the RegExp, which as above will
+        // affect the captures but not whether there is a match. Only do this
+        // when the following conditions are met:
+        //   1. there are no other meta characters in the RegExp, so that we
+        //      are sure this will not affect how the RegExp is parsed
+        //   2. global and sticky flags are not set, as lastIndex needs to be
+        //      set properly on global or sticky match
+        if (length >= 3 && !HasRegExpMetaChars(chars, length - 2) &&
+            !global && !sticky &&
+            chars[length - 2] == '.' && chars[length - 1] == '*')
+        {
+            length -= 2;
+        }
+    }
+
+    RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, multiline, unicode, ignore_case);
+    data->tree = parser.ParsePattern();
+    if (!data->tree)
+        return false;
+
+    data->simple = parser.simple();
+    data->contains_anchor = parser.contains_anchor();
+    data->capture_count = parser.captures_started();
+    return true;
+}
+
+bool
+irregexp::ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
+                       bool multiline, bool match_only, bool unicode, bool ignore_case,
+                       bool global, bool sticky, RegExpCompileData* data)
+{
+    JS::AutoCheckCannotGC nogc;
+    return str->hasLatin1Chars()
+           ? ::ParsePattern(ts, alloc, str->latin1Chars(nogc), str->length(),
+                            multiline, match_only, unicode, ignore_case, global, sticky, data)
+           : ::ParsePattern(ts, alloc, str->twoByteChars(nogc), str->length(),
+                            multiline, match_only, unicode, ignore_case, global, sticky, data);
+}
+
+template <typename CharT>
+static bool
+ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, size_t length,
+                   bool unicode)
+{
+    LifoAllocScope scope(&alloc);
+
+    RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, false, unicode, false);
+    return parser.ParsePattern() != nullptr;
+}
+
+bool
+irregexp::ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
+                             bool unicode)
+{
+    JS::AutoCheckCannotGC nogc;
+    return str->hasLatin1Chars()
+           ? ::ParsePatternSyntax(ts, alloc, str->latin1Chars(nogc), str->length(), unicode)
+           : ::ParsePatternSyntax(ts, alloc, str->twoByteChars(nogc), str->length(), unicode);
+}
author	Matt A. Tobin <mattatobin@localhost.localdomain>	2018-02-02 04:16:08 -0500
committer	Matt A. Tobin <mattatobin@localhost.localdomain>	2018-02-02 04:16:08 -0500
commit	5f8de423f190bbb79a62f804151bc24824fa32d8 (patch)
tree	10027f336435511475e392454359edea8e25895d /js/src/irregexp/RegExpParser.cpp
parent	49ee0794b5d912db1f95dce6eb52d781dc210db5 (diff)
download	UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.gz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.lz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.xz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.zip