diff options
author | Matt A. Tobin <mattatobin@localhost.localdomain> | 2018-02-02 04:16:08 -0500 |
---|---|---|
committer | Matt A. Tobin <mattatobin@localhost.localdomain> | 2018-02-02 04:16:08 -0500 |
commit | 5f8de423f190bbb79a62f804151bc24824fa32d8 (patch) | |
tree | 10027f336435511475e392454359edea8e25895d /js/src/irregexp/RegExpParser.cpp | |
parent | 49ee0794b5d912db1f95dce6eb52d781dc210db5 (diff) | |
download | UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.gz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.lz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.xz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.zip |
Add m-esr52 at 52.6.0
Diffstat (limited to 'js/src/irregexp/RegExpParser.cpp')
-rw-r--r-- | js/src/irregexp/RegExpParser.cpp | 1905 |
1 files changed, 1905 insertions, 0 deletions
diff --git a/js/src/irregexp/RegExpParser.cpp b/js/src/irregexp/RegExpParser.cpp new file mode 100644 index 000000000..ccc6ae3eb --- /dev/null +++ b/js/src/irregexp/RegExpParser.cpp @@ -0,0 +1,1905 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- + * vim: set ts=8 sts=4 et sw=4 tw=99: */ + +// Copyright 2012 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "irregexp/RegExpParser.h" + +#include "frontend/TokenStream.h" + +using namespace js; +using namespace js::irregexp; + +// ---------------------------------------------------------------------------- +// RegExpBuilder + +RegExpBuilder::RegExpBuilder(LifoAlloc* alloc) + : alloc(alloc), + pending_empty_(false), + characters_(nullptr) +#ifdef DEBUG + , last_added_(ADD_NONE) +#endif +{} + +void +RegExpBuilder::FlushCharacters() +{ + pending_empty_ = false; + if (characters_ != nullptr) { + RegExpTree* atom = alloc->newInfallible<RegExpAtom>(characters_); + characters_ = nullptr; + text_.Add(alloc, atom); +#ifdef DEBUG + last_added_ = ADD_ATOM; +#endif + } +} + +void +RegExpBuilder::FlushText() +{ + FlushCharacters(); + int num_text = text_.length(); + if (num_text == 0) + return; + if (num_text == 1) { + terms_.Add(alloc, text_.last()); + } else { + RegExpText* text = alloc->newInfallible<RegExpText>(alloc); + for (int i = 0; i < num_text; i++) + text_.Get(i)->AppendToText(text); + terms_.Add(alloc, text); + } + text_.Clear(); +} + +void +RegExpBuilder::AddCharacter(char16_t c) +{ + pending_empty_ = false; + if (characters_ == nullptr) + characters_ = alloc->newInfallible<CharacterVector>(*alloc); + characters_->append(c); +#ifdef DEBUG + last_added_ = ADD_CHAR; +#endif +} + +void +RegExpBuilder::AddEmpty() +{ + pending_empty_ = true; +} + +void +RegExpBuilder::AddAtom(RegExpTree* term) +{ + if (term->IsEmpty()) { + AddEmpty(); + return; + } + if (term->IsTextElement()) { + FlushCharacters(); + text_.Add(alloc, term); + } else { + FlushText(); + terms_.Add(alloc, term); + } +#ifdef DEBUG + last_added_ = ADD_ATOM; +#endif +} + +void +RegExpBuilder::AddAssertion(RegExpTree* assert) +{ + FlushText(); + terms_.Add(alloc, assert); +#ifdef DEBUG + last_added_ = ADD_ASSERT; +#endif +} + +void +RegExpBuilder::NewAlternative() +{ + FlushTerms(); +} + +void +RegExpBuilder::FlushTerms() +{ + FlushText(); + int num_terms = terms_.length(); + RegExpTree* alternative; + if (num_terms == 0) + alternative = RegExpEmpty::GetInstance(); + else if (num_terms == 1) + alternative = terms_.last(); + else + alternative = alloc->newInfallible<RegExpAlternative>(terms_.GetList(alloc)); + alternatives_.Add(alloc, alternative); + terms_.Clear(); +#ifdef DEBUG + last_added_ = ADD_NONE; +#endif +} + +RegExpTree* +RegExpBuilder::ToRegExp() +{ + FlushTerms(); + int num_alternatives = alternatives_.length(); + if (num_alternatives == 0) { + return RegExpEmpty::GetInstance(); + } + if (num_alternatives == 1) { + return alternatives_.last(); + } + return alloc->newInfallible<RegExpDisjunction>(alternatives_.GetList(alloc)); +} + +void +RegExpBuilder::AddQuantifierToAtom(int min, int max, + RegExpQuantifier::QuantifierType quantifier_type) +{ + if (pending_empty_) { + pending_empty_ = false; + return; + } + RegExpTree* atom; + if (characters_ != nullptr) { + MOZ_ASSERT(last_added_ == ADD_CHAR); + // Last atom was character. + CharacterVector* char_vector = characters_; + int num_chars = char_vector->length(); + if (num_chars > 1) { + CharacterVector* prefix = alloc->newInfallible<CharacterVector>(*alloc); + prefix->append(char_vector->begin(), num_chars - 1); + text_.Add(alloc, alloc->newInfallible<RegExpAtom>(prefix)); + char_vector = alloc->newInfallible<CharacterVector>(*alloc); + char_vector->append((*characters_)[num_chars - 1]); + } + characters_ = nullptr; + atom = alloc->newInfallible<RegExpAtom>(char_vector); + FlushText(); + } else if (text_.length() > 0) { + MOZ_ASSERT(last_added_ == ADD_ATOM); + atom = text_.RemoveLast(); + FlushText(); + } else if (terms_.length() > 0) { + MOZ_ASSERT(last_added_ == ADD_ATOM); + atom = terms_.RemoveLast(); + if (atom->max_match() == 0) { + // Guaranteed to only match an empty string. +#ifdef DEBUG + last_added_ = ADD_TERM; +#endif + if (min == 0) + return; + terms_.Add(alloc, atom); + return; + } + } else { + // Only call immediately after adding an atom or character! + MOZ_CRASH("Bad call"); + } + terms_.Add(alloc, alloc->newInfallible<RegExpQuantifier>(min, max, quantifier_type, atom)); +#ifdef DEBUG + last_added_ = ADD_TERM; +#endif +} + +// ---------------------------------------------------------------------------- +// RegExpParser + +template <typename CharT> +RegExpParser<CharT>::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc, + const CharT* chars, const CharT* end, bool multiline_mode, + bool unicode, bool ignore_case) + : ts(ts), + alloc(alloc), + captures_(nullptr), + next_pos_(chars), + end_(end), + current_(kEndMarker), + capture_count_(0), + has_more_(true), + multiline_(multiline_mode), + unicode_(unicode), + ignore_case_(ignore_case), + simple_(false), + contains_anchor_(false), + is_scanned_for_captures_(false) +{ + Advance(); +} + +template <typename CharT> +RegExpTree* +RegExpParser<CharT>::ReportError(unsigned errorNumber) +{ + gc::AutoSuppressGC suppressGC(ts.context()); + ts.reportError(errorNumber); + return nullptr; +} + +template <typename CharT> +void +RegExpParser<CharT>::Advance() +{ + if (next_pos_ < end_) { + current_ = *next_pos_; + next_pos_++; + } else { + current_ = kEndMarker; + has_more_ = false; + } +} + +// Returns the value (0 .. 15) of a hexadecimal character c. +// If c is not a legal hexadecimal character, returns a value < 0. +inline int +HexValue(uint32_t c) +{ + c -= '0'; + if (static_cast<unsigned>(c) <= 9) return c; + c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36. + if (static_cast<unsigned>(c) <= 5) return c + 10; + return -1; +} + +template <typename CharT> +widechar +RegExpParser<CharT>::ParseOctalLiteral() +{ + MOZ_ASSERT('0' <= current() && current() <= '7'); + // For compatibility with some other browsers (not all), we parse + // up to three octal digits with a value below 256. + widechar value = current() - '0'; + Advance(); + if ('0' <= current() && current() <= '7') { + value = value * 8 + current() - '0'; + Advance(); + if (value < 32 && '0' <= current() && current() <= '7') { + value = value * 8 + current() - '0'; + Advance(); + } + } + return value; +} + +template <typename CharT> +bool +RegExpParser<CharT>::ParseHexEscape(int length, widechar* value) +{ + const CharT* start = position(); + uint32_t val = 0; + bool done = false; + for (int i = 0; !done; i++) { + widechar c = current(); + int d = HexValue(c); + if (d < 0) { + Reset(start); + return false; + } + val = val * 16 + d; + Advance(); + if (i == length - 1) { + done = true; + } + } + *value = val; + return true; +} + +template <typename CharT> +bool +RegExpParser<CharT>::ParseBracedHexEscape(widechar* value) +{ + MOZ_ASSERT(current() == '{'); + Advance(); + + bool first = true; + uint32_t code = 0; + while (true) { + widechar c = current(); + if (c == kEndMarker) { + ReportError(JSMSG_INVALID_UNICODE_ESCAPE); + return false; + } + if (c == '}') { + if (first) { + ReportError(JSMSG_INVALID_UNICODE_ESCAPE); + return false; + } + Advance(); + break; + } + + int d = HexValue(c); + if (d < 0) { + ReportError(JSMSG_INVALID_UNICODE_ESCAPE); + return false; + } + code = (code << 4) | d; + if (code > unicode::NonBMPMax) { + ReportError(JSMSG_UNICODE_OVERFLOW); + return false; + } + Advance(); + first = false; + } + + *value = code; + return true; +} + +template <typename CharT> +bool +RegExpParser<CharT>::ParseTrailSurrogate(widechar* value) +{ + if (current() != '\\') + return false; + + const CharT* start = position(); + Advance(); + if (current() != 'u') { + Reset(start); + return false; + } + Advance(); + if (!ParseHexEscape(4, value)) { + Reset(start); + return false; + } + if (!unicode::IsTrailSurrogate(*value)) { + Reset(start); + return false; + } + return true; +} + +template <typename CharT> +bool +RegExpParser<CharT>::ParseRawSurrogatePair(char16_t* lead, char16_t* trail) +{ + widechar c1 = current(); + if (!unicode::IsLeadSurrogate(c1)) + return false; + + const CharT* start = position(); + Advance(); + widechar c2 = current(); + if (!unicode::IsTrailSurrogate(c2)) { + Reset(start); + return false; + } + Advance(); + *lead = c1; + *trail = c2; + return true; +} + +static inline RegExpTree* +RangeAtom(LifoAlloc* alloc, char16_t from, char16_t to) +{ + CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); + ranges->append(CharacterRange::Range(from, to)); + return alloc->newInfallible<RegExpCharacterClass>(ranges, false); +} + +static inline RegExpTree* +NegativeLookahead(LifoAlloc* alloc, char16_t from, char16_t to) +{ + return alloc->newInfallible<RegExpLookahead>(RangeAtom(alloc, from, to), false, 0, 0); +} + +static bool +IsSyntaxCharacter(widechar c) +{ + switch (c) { + case '^': + case '$': + case '\\': + case '.': + case '*': + case '+': + case '?': + case '(': + case ')': + case '[': + case ']': + case '{': + case '}': + case '|': + case '/': + return true; + default: + return false; + } +} + +#ifdef DEBUG +// Currently only used in an assert.kASSERT. +static bool +IsSpecialClassEscape(widechar c) +{ + switch (c) { + case 'd': case 'D': + case 's': case 'S': + case 'w': case 'W': + return true; + default: + return false; + } +} +#endif + +template <typename CharT> +bool +RegExpParser<CharT>::ParseClassCharacterEscape(widechar* code) +{ + MOZ_ASSERT(current() == '\\'); + MOZ_ASSERT(has_next() && !IsSpecialClassEscape(Next())); + Advance(); + switch (current()) { + case 'b': + Advance(); + *code = '\b'; + return true; + // ControlEscape :: one of + // f n r t v + case 'f': + Advance(); + *code = '\f'; + return true; + case 'n': + Advance(); + *code = '\n'; + return true; + case 'r': + Advance(); + *code = '\r'; + return true; + case 't': + Advance(); + *code = '\t'; + return true; + case 'v': + Advance(); + *code = '\v'; + return true; + case 'c': { + widechar controlLetter = Next(); + widechar letter = controlLetter & ~('A' ^ 'a'); + // For compatibility with JSC, inside a character class + // we also accept digits and underscore as control characters, + // but only in non-unicode mode + if ((!unicode_ && + ((controlLetter >= '0' && controlLetter <= '9') || + controlLetter == '_')) || + (letter >= 'A' && letter <= 'Z')) + { + Advance(2); + // Control letters mapped to ASCII control characters in the range + // 0x00-0x1f. + *code = controlLetter & 0x1f; + return true; + } + if (unicode_) { + ReportError(JSMSG_INVALID_IDENTITY_ESCAPE); + return false; + } + // We match JSC in reading the backslash as a literal + // character instead of as starting an escape. + *code = '\\'; + return true; + } + case '0': case '1': case '2': case '3': case '4': case '5': + case '6': case '7': + if (unicode_) { + if (current() == '0') { + Advance(); + *code = 0; + return true; + } + ReportError(JSMSG_INVALID_IDENTITY_ESCAPE); + return false; + } + // For compatibility, outside of unicode mode, we interpret a decimal + // escape that isn't a back reference (and therefore either \0 or not + // valid according to the specification) as a 1..3 digit octal + // character code. + *code = ParseOctalLiteral(); + return true; + case 'x': { + Advance(); + widechar value; + if (ParseHexEscape(2, &value)) { + *code = value; + return true; + } + if (unicode_) { + ReportError(JSMSG_INVALID_IDENTITY_ESCAPE); + return false; + } + // If \x is not followed by a two-digit hexadecimal, treat it + // as an identity escape in non-unicode mode. + *code = 'x'; + return true; + } + case 'u': { + Advance(); + widechar value; + if (unicode_) { + if (current() == '{') { + if (!ParseBracedHexEscape(&value)) + return false; + *code = value; + return true; + } + if (ParseHexEscape(4, &value)) { + if (unicode::IsLeadSurrogate(value)) { + widechar trail; + if (ParseTrailSurrogate(&trail)) { + *code = unicode::UTF16Decode(value, trail); + return true; + } + } + *code = value; + return true; + } + ReportError(JSMSG_INVALID_UNICODE_ESCAPE); + return false; + } + if (ParseHexEscape(4, &value)) { + *code = value; + return true; + } + // If \u is not followed by a four-digit or braced hexadecimal, treat it + // as an identity escape. + *code = 'u'; + return true; + } + default: { + // Extended identity escape (non-unicode only). We accept any character + // that hasn't been matched by a more specific case, not just the subset + // required by the ECMAScript specification. + widechar result = current(); + if (unicode_ && result != '-' && !IsSyntaxCharacter(result)) { + ReportError(JSMSG_INVALID_IDENTITY_ESCAPE); + return false; + } + Advance(); + *code = result; + return true; + } + } + return true; +} + +class WideCharRange +{ + public: + WideCharRange() + : from_(0), to_(0) + {} + + WideCharRange(widechar from, widechar to) + : from_(from), to_(to) + {} + + static inline WideCharRange Singleton(widechar value) { + return WideCharRange(value, value); + } + static inline WideCharRange Range(widechar from, widechar to) { + MOZ_ASSERT(from <= to); + return WideCharRange(from, to); + } + + bool Contains(widechar i) const { return from_ <= i && i <= to_; } + widechar from() const { return from_; } + widechar to() const { return to_; } + + private: + widechar from_; + widechar to_; +}; + +typedef InfallibleVector<WideCharRange, 1> WideCharRangeVector; + +static inline CharacterRange +LeadSurrogateRange() +{ + return CharacterRange::Range(unicode::LeadSurrogateMin, unicode::LeadSurrogateMax); +} + +static inline CharacterRange +TrailSurrogateRange() +{ + return CharacterRange::Range(unicode::TrailSurrogateMin, unicode::TrailSurrogateMax); +} + +static inline WideCharRange +NonBMPRange() +{ + return WideCharRange::Range(unicode::NonBMPMin, unicode::NonBMPMax); +} + +static const char16_t kNoCharClass = 0; + +// Adds a character or pre-defined character class to character ranges. +// If char_class is not kInvalidClass, it's interpreted as a class +// escape (i.e., 's' means whitespace, from '\s'). +static inline void +AddCharOrEscape(LifoAlloc* alloc, + CharacterRangeVector* ranges, + char16_t char_class, + widechar c) +{ + if (char_class != kNoCharClass) + CharacterRange::AddClassEscape(alloc, char_class, ranges); + else + ranges->append(CharacterRange::Singleton(c)); +} + +static inline void +AddCharOrEscapeUnicode(LifoAlloc* alloc, + CharacterRangeVector* ranges, + CharacterRangeVector* lead_ranges, + CharacterRangeVector* trail_ranges, + WideCharRangeVector* wide_ranges, + char16_t char_class, + widechar c, + bool ignore_case) +{ + if (char_class != kNoCharClass) { + CharacterRange::AddClassEscapeUnicode(alloc, char_class, ranges, ignore_case); + switch (char_class) { + case 'S': + case 'W': + case 'D': + lead_ranges->append(LeadSurrogateRange()); + trail_ranges->append(TrailSurrogateRange()); + wide_ranges->append(NonBMPRange()); + break; + case '.': + MOZ_CRASH("Bad char_class!"); + } + return; + } + + if (unicode::IsLeadSurrogate(c)) + lead_ranges->append(CharacterRange::Singleton(c)); + else if (unicode::IsTrailSurrogate(c)) + trail_ranges->append(CharacterRange::Singleton(c)); + else if (c >= unicode::NonBMPMin) + wide_ranges->append(WideCharRange::Singleton(c)); + else + ranges->append(CharacterRange::Singleton(c)); +} + +static inline void +AddUnicodeRange(LifoAlloc* alloc, + CharacterRangeVector* ranges, + CharacterRangeVector* lead_ranges, + CharacterRangeVector* trail_ranges, + WideCharRangeVector* wide_ranges, + widechar first, + widechar next) +{ + MOZ_ASSERT(first <= next); + if (first < unicode::LeadSurrogateMin) { + if (next < unicode::LeadSurrogateMin) { + ranges->append(CharacterRange::Range(first, next)); + return; + } + ranges->append(CharacterRange::Range(first, unicode::LeadSurrogateMin - 1)); + first = unicode::LeadSurrogateMin; + } + if (first <= unicode::LeadSurrogateMax) { + if (next <= unicode::LeadSurrogateMax) { + lead_ranges->append(CharacterRange::Range(first, next)); + return; + } + lead_ranges->append(CharacterRange::Range(first, unicode::LeadSurrogateMax)); + first = unicode::LeadSurrogateMax + 1; + } + MOZ_ASSERT(unicode::LeadSurrogateMax + 1 == unicode::TrailSurrogateMin); + if (first <= unicode::TrailSurrogateMax) { + if (next <= unicode::TrailSurrogateMax) { + trail_ranges->append(CharacterRange::Range(first, next)); + return; + } + trail_ranges->append(CharacterRange::Range(first, unicode::TrailSurrogateMax)); + first = unicode::TrailSurrogateMax + 1; + } + if (first <= unicode::UTF16Max) { + if (next <= unicode::UTF16Max) { + ranges->append(CharacterRange::Range(first, next)); + return; + } + ranges->append(CharacterRange::Range(first, unicode::UTF16Max)); + first = unicode::NonBMPMin; + } + MOZ_ASSERT(unicode::UTF16Max + 1 == unicode::NonBMPMin); + wide_ranges->append(WideCharRange::Range(first, next)); +} + +// Negate a vector of ranges by subtracting its ranges from a range +// encompassing the full range of possible values. +template <typename RangeType> +static inline void +NegateUnicodeRanges(LifoAlloc* alloc, InfallibleVector<RangeType, 1>** ranges, + RangeType full_range) +{ + typedef InfallibleVector<RangeType, 1> RangeVector; + RangeVector* tmp_ranges = alloc->newInfallible<RangeVector>(*alloc); + tmp_ranges->append(full_range); + RangeVector* result_ranges = alloc->newInfallible<RangeVector>(*alloc); + + // Perform the following calculation: + // result_ranges = tmp_ranges - ranges + // with the following steps: + // result_ranges = tmp_ranges - ranges[0] + // SWAP(result_ranges, tmp_ranges) + // result_ranges = tmp_ranges - ranges[1] + // SWAP(result_ranges, tmp_ranges) + // ... + // result_ranges = tmp_ranges - ranges[N-1] + // SWAP(result_ranges, tmp_ranges) + // The last SWAP is just for simplicity of the loop. + for (size_t i = 0; i < (*ranges)->length(); i++) { + result_ranges->clear(); + + const RangeType& range = (**ranges)[i]; + for (size_t j = 0; j < tmp_ranges->length(); j++) { + const RangeType& tmpRange = (*tmp_ranges)[j]; + auto from1 = tmpRange.from(); + auto to1 = tmpRange.to(); + auto from2 = range.from(); + auto to2 = range.to(); + + if (from1 < from2) { + if (to1 < from2) { + result_ranges->append(tmpRange); + } else if (to1 <= to2) { + result_ranges->append(RangeType::Range(from1, from2 - 1)); + } else { + result_ranges->append(RangeType::Range(from1, from2 - 1)); + result_ranges->append(RangeType::Range(to2 + 1, to1)); + } + } else if (from1 <= to2) { + if (to1 > to2) + result_ranges->append(RangeType::Range(to2 + 1, to1)); + } else { + result_ranges->append(tmpRange); + } + } + + auto tmp = tmp_ranges; + tmp_ranges = result_ranges; + result_ranges = tmp; + } + + // After the loop, result is pointed at by tmp_ranges, instead of + // result_ranges. + *ranges = tmp_ranges; +} + +static bool +WideCharRangesContain(WideCharRangeVector* wide_ranges, widechar c) +{ + for (size_t i = 0; i < wide_ranges->length(); i++) { + const WideCharRange& range = (*wide_ranges)[i]; + if (range.Contains(c)) + return true; + } + return false; +} + +static void +CalculateCaseInsensitiveRanges(LifoAlloc* alloc, widechar from, widechar to, int32_t diff, + WideCharRangeVector* wide_ranges, + WideCharRangeVector** tmp_wide_ranges) +{ + widechar contains_from = 0; + widechar contains_to = 0; + for (widechar c = from; c <= to; c++) { + if (WideCharRangesContain(wide_ranges, c) && + !WideCharRangesContain(wide_ranges, c + diff)) + { + if (contains_from == 0) + contains_from = c; + contains_to = c; + } else if (contains_from != 0) { + if (!*tmp_wide_ranges) + *tmp_wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc); + + (*tmp_wide_ranges)->append(WideCharRange::Range(contains_from + diff, + contains_to + diff)); + contains_from = 0; + } + } + + if (contains_from != 0) { + if (!*tmp_wide_ranges) + *tmp_wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc); + + (*tmp_wide_ranges)->append(WideCharRange::Range(contains_from + diff, + contains_to + diff)); + } +} + +static RegExpTree* +UnicodeRangesAtom(LifoAlloc* alloc, + CharacterRangeVector* ranges, + CharacterRangeVector* lead_ranges, + CharacterRangeVector* trail_ranges, + WideCharRangeVector* wide_ranges, + bool is_negated, + bool ignore_case) +{ + // Calculate case folding for non-BMP first and negate the range if needed. + if (ignore_case) { + WideCharRangeVector* tmp_wide_ranges = nullptr; +#define CALL_CALC(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \ + CalculateCaseInsensitiveRanges(alloc, FROM, TO, DIFF, wide_ranges, &tmp_wide_ranges); + FOR_EACH_NON_BMP_CASE_FOLDING(CALL_CALC) + FOR_EACH_NON_BMP_REV_CASE_FOLDING(CALL_CALC) +#undef CALL_CALC + + if (tmp_wide_ranges) { + for (size_t i = 0; i < tmp_wide_ranges->length(); i++) + wide_ranges->append((*tmp_wide_ranges)[i]); + } + } + + if (is_negated) { + NegateUnicodeRanges(alloc, &lead_ranges, LeadSurrogateRange()); + NegateUnicodeRanges(alloc, &trail_ranges, TrailSurrogateRange()); + NegateUnicodeRanges(alloc, &wide_ranges, NonBMPRange()); + } + + RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc); + + bool added = false; + + if (is_negated) { + ranges->append(LeadSurrogateRange()); + ranges->append(TrailSurrogateRange()); + } + if (ranges->length() > 0) { + builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, is_negated)); + added = true; + } + + if (lead_ranges->length() > 0) { + if (added) + builder->NewAlternative(); + builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(lead_ranges, false)); + builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin, + unicode::TrailSurrogateMax)); + added = true; + } + + if (trail_ranges->length() > 0) { + if (added) + builder->NewAlternative(); + builder->AddAssertion(alloc->newInfallible<RegExpAssertion>( + RegExpAssertion::NOT_AFTER_LEAD_SURROGATE)); + builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(trail_ranges, false)); + added = true; + } + + for (size_t i = 0; i < wide_ranges->length(); i++) { + if (added) + builder->NewAlternative(); + + const WideCharRange& range = (*wide_ranges)[i]; + widechar from = range.from(); + widechar to = range.to(); + char16_t from_lead, from_trail; + char16_t to_lead, to_trail; + + unicode::UTF16Encode(from, &from_lead, &from_trail); + if (from == to) { + builder->AddCharacter(from_lead); + builder->AddCharacter(from_trail); + } else { + unicode::UTF16Encode(to, &to_lead, &to_trail); + if (from_lead == to_lead) { + MOZ_ASSERT(from_trail != to_trail); + builder->AddCharacter(from_lead); + builder->AddAtom(RangeAtom(alloc, from_trail, to_trail)); + } else if (from_trail == unicode::TrailSurrogateMin && + to_trail == unicode::TrailSurrogateMax) + { + builder->AddAtom(RangeAtom(alloc, from_lead, to_lead)); + builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, + unicode::TrailSurrogateMax)); + } else if (from_lead + 1 == to_lead) { + builder->AddCharacter(from_lead); + builder->AddAtom(RangeAtom(alloc, from_trail, unicode::TrailSurrogateMax)); + + builder->NewAlternative(); + + builder->AddCharacter(to_lead); + builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, to_trail)); + } else if (from_lead + 2 == to_lead) { + builder->AddCharacter(from_lead); + builder->AddAtom(RangeAtom(alloc, from_trail, unicode::TrailSurrogateMax)); + + builder->NewAlternative(); + + builder->AddCharacter(from_lead + 1); + builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, + unicode::TrailSurrogateMax)); + + builder->NewAlternative(); + + builder->AddCharacter(to_lead); + builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, to_trail)); + } else { + builder->AddCharacter(from_lead); + builder->AddAtom(RangeAtom(alloc, from_trail, unicode::TrailSurrogateMax)); + + builder->NewAlternative(); + + builder->AddAtom(RangeAtom(alloc, from_lead + 1, to_lead - 1)); + builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, + unicode::TrailSurrogateMax)); + + builder->NewAlternative(); + + builder->AddCharacter(to_lead); + builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, to_trail)); + } + } + added = true; + } + + return builder->ToRegExp(); +} + +template <typename CharT> +RegExpTree* +RegExpParser<CharT>::ParseCharacterClass() +{ + MOZ_ASSERT(current() == '['); + Advance(); + bool is_negated = false; + if (current() == '^') { + is_negated = true; + Advance(); + } + CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); + CharacterRangeVector* lead_ranges = nullptr; + CharacterRangeVector* trail_ranges = nullptr; + WideCharRangeVector* wide_ranges = nullptr; + + if (unicode_) { + lead_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); + trail_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); + wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc); + } + + while (has_more() && current() != ']') { + char16_t char_class = kNoCharClass; + widechar first = 0; + if (!ParseClassAtom(&char_class, &first)) + return nullptr; + if (current() == '-') { + Advance(); + if (current() == kEndMarker) { + // If we reach the end we break out of the loop and let the + // following code report an error. + break; + } else if (current() == ']') { + if (unicode_) { + AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, + char_class, first, ignore_case_); + } else { + AddCharOrEscape(alloc, ranges, char_class, first); + } + ranges->append(CharacterRange::Singleton('-')); + break; + } + char16_t char_class_2 = kNoCharClass; + widechar next = 0; + if (!ParseClassAtom(&char_class_2, &next)) + return nullptr; + if (char_class != kNoCharClass || char_class_2 != kNoCharClass) { + if (unicode_) + return ReportError(JSMSG_RANGE_WITH_CLASS_ESCAPE); + + // Either end is an escaped character class. Treat the '-' verbatim. + AddCharOrEscape(alloc, ranges, char_class, first); + ranges->append(CharacterRange::Singleton('-')); + AddCharOrEscape(alloc, ranges, char_class_2, next); + continue; + } + if (first > next) + return ReportError(JSMSG_BAD_CLASS_RANGE); + if (unicode_) + AddUnicodeRange(alloc, ranges, lead_ranges, trail_ranges,wide_ranges, first, next); + else + ranges->append(CharacterRange::Range(first, next)); + } else { + if (unicode_) { + AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, + char_class, first, ignore_case_); + } else { + AddCharOrEscape(alloc, ranges, char_class, first); + } + } + } + if (!has_more()) + return ReportError(JSMSG_UNTERM_CLASS); + Advance(); + if (!unicode_) { + if (ranges->length() == 0) { + ranges->append(CharacterRange::Everything()); + is_negated = !is_negated; + } + return alloc->newInfallible<RegExpCharacterClass>(ranges, is_negated); + } + + if (!is_negated && ranges->length() == 0 && lead_ranges->length() == 0 && + trail_ranges->length() == 0 && wide_ranges->length() == 0) + { + ranges->append(CharacterRange::Everything()); + return alloc->newInfallible<RegExpCharacterClass>(ranges, true); + } + + return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, is_negated, + ignore_case_); +} + +template <typename CharT> +bool +RegExpParser<CharT>::ParseClassAtom(char16_t* char_class, widechar* value) +{ + MOZ_ASSERT(*char_class == kNoCharClass); + widechar first = current(); + if (first == '\\') { + switch (Next()) { + case 'w': case 'W': case 'd': case 'D': case 's': case 'S': { + *char_class = Next(); + Advance(2); + return true; + } + case kEndMarker: + return ReportError(JSMSG_ESCAPE_AT_END_OF_REGEXP); + default: + if (!ParseClassCharacterEscape(value)) + return false; + return true; + } + } else { + if (unicode_) { + char16_t lead, trail; + if (ParseRawSurrogatePair(&lead, &trail)) { + *value = unicode::UTF16Decode(lead, trail); + return true; + } + } + Advance(); + *value = first; + return true; + } +} + +// In order to know whether an escape is a backreference or not we have to scan +// the entire regexp and find the number of capturing parentheses. However we +// don't want to scan the regexp twice unless it is necessary. This mini-parser +// is called when needed. It can see the difference between capturing and +// noncapturing parentheses and can skip character classes and backslash-escaped +// characters. +template <typename CharT> +void +RegExpParser<CharT>::ScanForCaptures() +{ + // Start with captures started previous to current position + int capture_count = captures_started(); + // Add count of captures after this position. + widechar n; + while ((n = current()) != kEndMarker) { + Advance(); + switch (n) { + case '\\': + Advance(); + break; + case '[': { + widechar c; + while ((c = current()) != kEndMarker) { + Advance(); + if (c == '\\') { + Advance(); + } else { + if (c == ']') break; + } + } + break; + } + case '(': + if (current() != '?') capture_count++; + break; + } + } + capture_count_ = capture_count; + is_scanned_for_captures_ = true; +} + +inline bool +IsInRange(int value, int lower_limit, int higher_limit) +{ + MOZ_ASSERT(lower_limit <= higher_limit); + return static_cast<unsigned int>(value - lower_limit) <= + static_cast<unsigned int>(higher_limit - lower_limit); +} + +inline bool +IsDecimalDigit(widechar c) +{ + // ECMA-262, 3rd, 7.8.3 (p 16) + return IsInRange(c, '0', '9'); +} + +template <typename CharT> +bool +RegExpParser<CharT>::ParseBackReferenceIndex(int* index_out) +{ + MOZ_ASSERT('\\' == current()); + MOZ_ASSERT('1' <= Next() && Next() <= '9'); + + // Try to parse a decimal literal that is no greater than the total number + // of left capturing parentheses in the input. + const CharT* start = position(); + int value = Next() - '0'; + Advance(2); + while (true) { + widechar c = current(); + if (IsDecimalDigit(c)) { + value = 10 * value + (c - '0'); + if (value > kMaxCaptures) { + Reset(start); + return false; + } + Advance(); + } else { + break; + } + } + if (value > captures_started()) { + if (!is_scanned_for_captures_) { + const CharT* saved_position = position(); + ScanForCaptures(); + Reset(saved_position); + } + if (value > capture_count_) { + Reset(start); + return false; + } + } + *index_out = value; + return true; +} + +// QuantifierPrefix :: +// { DecimalDigits } +// { DecimalDigits , } +// { DecimalDigits , DecimalDigits } +// +// Returns true if parsing succeeds, and set the min_out and max_out +// values. Values are truncated to RegExpTree::kInfinity if they overflow. +template <typename CharT> +bool +RegExpParser<CharT>::ParseIntervalQuantifier(int* min_out, int* max_out) +{ + MOZ_ASSERT(current() == '{'); + const CharT* start = position(); + Advance(); + int min = 0; + if (!IsDecimalDigit(current())) { + Reset(start); + return false; + } + while (IsDecimalDigit(current())) { + int next = current() - '0'; + if (min > (RegExpTree::kInfinity - next) / 10) { + // Overflow. Skip past remaining decimal digits and return -1. + do { + Advance(); + } while (IsDecimalDigit(current())); + min = RegExpTree::kInfinity; + break; + } + min = 10 * min + next; + Advance(); + } + int max = 0; + if (current() == '}') { + max = min; + Advance(); + } else if (current() == ',') { + Advance(); + if (current() == '}') { + max = RegExpTree::kInfinity; + Advance(); + } else { + while (IsDecimalDigit(current())) { + int next = current() - '0'; + if (max > (RegExpTree::kInfinity - next) / 10) { + do { + Advance(); + } while (IsDecimalDigit(current())); + max = RegExpTree::kInfinity; + break; + } + max = 10 * max + next; + Advance(); + } + if (current() != '}') { + Reset(start); + return false; + } + Advance(); + } + } else { + Reset(start); + return false; + } + *min_out = min; + *max_out = max; + return true; +} + +// Pattern :: +// Disjunction +template <typename CharT> +RegExpTree* +RegExpParser<CharT>::ParsePattern() +{ + RegExpTree* result = ParseDisjunction(); + MOZ_ASSERT_IF(result, !has_more()); + return result; +} + +static inline RegExpTree* +CaseFoldingSurrogatePairAtom(LifoAlloc* alloc, char16_t lead, char16_t trail, int32_t diff) +{ + RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc); + + builder->AddCharacter(lead); + CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); + ranges->append(CharacterRange::Range(trail, trail)); + ranges->append(CharacterRange::Range(trail + diff, trail + diff)); + builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, false)); + + return builder->ToRegExp(); +} + +static inline RegExpTree* +SurrogatePairAtom(LifoAlloc* alloc, char16_t lead, char16_t trail, bool ignore_case) +{ + if (ignore_case) { +#define CALL_ATOM(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \ + if (lead == LEAD &&trail >= TRAIL_FROM && trail <= TRAIL_TO) \ + return CaseFoldingSurrogatePairAtom(alloc, lead, trail, DIFF); + FOR_EACH_NON_BMP_CASE_FOLDING(CALL_ATOM) + FOR_EACH_NON_BMP_REV_CASE_FOLDING(CALL_ATOM) +#undef CALL_ATOM + } + + RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc); + builder->AddCharacter(lead); + builder->AddCharacter(trail); + return builder->ToRegExp(); +} + +static inline RegExpTree* +LeadSurrogateAtom(LifoAlloc* alloc, char16_t value) +{ + RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc); + builder->AddCharacter(value); + builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin, + unicode::TrailSurrogateMax)); + return builder->ToRegExp(); +} + +static inline RegExpTree* +TrailSurrogateAtom(LifoAlloc* alloc, char16_t value) +{ + RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc); + builder->AddAssertion(alloc->newInfallible<RegExpAssertion>( + RegExpAssertion::NOT_AFTER_LEAD_SURROGATE)); + builder->AddCharacter(value); + return builder->ToRegExp(); +} + +static inline RegExpTree* +UnicodeEverythingAtom(LifoAlloc* alloc) +{ + RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc); + + // everything except \x0a, \x0d, \u2028 and \u2029 + + CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); + ranges->append(CharacterRange::Range(0x0, 0x09)); + ranges->append(CharacterRange::Range(0x0b, 0x0c)); + ranges->append(CharacterRange::Range(0x0e, 0x2027)); + ranges->append(CharacterRange::Range(0x202A, unicode::LeadSurrogateMin - 1)); + ranges->append(CharacterRange::Range(unicode::TrailSurrogateMax + 1, unicode::UTF16Max)); + builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, false)); + + builder->NewAlternative(); + + builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax)); + builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin, + unicode::TrailSurrogateMax)); + + builder->NewAlternative(); + + builder->AddAssertion(alloc->newInfallible<RegExpAssertion>( + RegExpAssertion::NOT_AFTER_LEAD_SURROGATE)); + builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax)); + + builder->NewAlternative(); + + builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax)); + builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax)); + + return builder->ToRegExp(); +} + +RegExpTree* +UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class, bool ignore_case) +{ + CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); + CharacterRangeVector* lead_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); + CharacterRangeVector* trail_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); + WideCharRangeVector* wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc); + AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, char_class, 0, + ignore_case); + + return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, false, false); +} + +static inline RegExpTree* +UnicodeBackReferenceAtom(LifoAlloc* alloc, RegExpTree* atom) +{ + // If a back reference has a standalone lead surrogate as its last + // character, then that lead surrogate shouldn't match lead surrogates that + // are paired with a corresponding trail surrogate. + RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc); + + builder->AddAtom(atom); + builder->AddAssertion(alloc->newInfallible<RegExpAssertion>( + RegExpAssertion::NOT_IN_SURROGATE_PAIR)); + + return builder->ToRegExp(); +} + +// Disjunction :: +// Alternative +// Alternative | Disjunction +// Alternative :: +// [empty] +// Term Alternative +// Term :: +// Assertion +// Atom +// Atom Quantifier +template <typename CharT> +RegExpTree* +RegExpParser<CharT>::ParseDisjunction() +{ + // Used to store current state while parsing subexpressions. + RegExpParserState initial_state(alloc, nullptr, INITIAL, 0); + RegExpParserState* stored_state = &initial_state; + // Cache the builder in a local variable for quick access. + RegExpBuilder* builder = initial_state.builder(); + while (true) { + switch (current()) { + case kEndMarker: + if (stored_state->IsSubexpression()) { + // Inside a parenthesized group when hitting end of input. + return ReportError(JSMSG_MISSING_PAREN); + } + MOZ_ASSERT(INITIAL == stored_state->group_type()); + // Parsing completed successfully. + return builder->ToRegExp(); + case ')': { + if (!stored_state->IsSubexpression()) + return ReportError(JSMSG_UNMATCHED_RIGHT_PAREN); + MOZ_ASSERT(INITIAL != stored_state->group_type()); + + Advance(); + // End disjunction parsing and convert builder content to new single + // regexp atom. + RegExpTree* body = builder->ToRegExp(); + + int end_capture_index = captures_started(); + + int capture_index = stored_state->capture_index(); + SubexpressionType group_type = stored_state->group_type(); + + // Restore previous state. + stored_state = stored_state->previous_state(); + builder = stored_state->builder(); + + // Build result of subexpression. + if (group_type == CAPTURE) { + RegExpCapture* capture = alloc->newInfallible<RegExpCapture>(body, capture_index); + (*captures_)[capture_index - 1] = capture; + body = capture; + } else if (group_type != GROUPING) { + MOZ_ASSERT(group_type == POSITIVE_LOOKAHEAD || + group_type == NEGATIVE_LOOKAHEAD); + bool is_positive = (group_type == POSITIVE_LOOKAHEAD); + body = alloc->newInfallible<RegExpLookahead>(body, + is_positive, + end_capture_index - capture_index, + capture_index); + } + builder->AddAtom(body); + if (unicode_ && (group_type == POSITIVE_LOOKAHEAD || group_type == NEGATIVE_LOOKAHEAD)) + continue; + // For compatability with JSC and ES3, we allow quantifiers after + // lookaheads, and break in all cases. + break; + } + case '|': { + Advance(); + builder->NewAlternative(); + continue; + } + case '*': + case '+': + case '?': + return ReportError(JSMSG_NOTHING_TO_REPEAT); + case '^': { + Advance(); + if (multiline_) { + builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::START_OF_LINE)); + } else { + builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::START_OF_INPUT)); + set_contains_anchor(); + } + continue; + } + case '$': { + Advance(); + RegExpAssertion::AssertionType assertion_type = + multiline_ ? RegExpAssertion::END_OF_LINE : + RegExpAssertion::END_OF_INPUT; + builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(assertion_type)); + continue; + } + case '.': { + Advance(); + // everything except \x0a, \x0d, \u2028 and \u2029 + if (unicode_) { + builder->AddAtom(UnicodeEverythingAtom(alloc)); + break; + } + CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); + CharacterRange::AddClassEscape(alloc, '.', ranges); + RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false); + builder->AddAtom(atom); + break; + } + case '(': { + SubexpressionType subexpr_type = CAPTURE; + Advance(); + if (current() == '?') { + switch (Next()) { + case ':': + subexpr_type = GROUPING; + break; + case '=': + subexpr_type = POSITIVE_LOOKAHEAD; + break; + case '!': + subexpr_type = NEGATIVE_LOOKAHEAD; + break; + default: + return ReportError(JSMSG_INVALID_GROUP); + } + Advance(2); + } else { + if (captures_ == nullptr) + captures_ = alloc->newInfallible<RegExpCaptureVector>(*alloc); + if (captures_started() >= kMaxCaptures) + return ReportError(JSMSG_TOO_MANY_PARENS); + captures_->append((RegExpCapture*) nullptr); + } + // Store current state and begin new disjunction parsing. + stored_state = alloc->newInfallible<RegExpParserState>(alloc, stored_state, subexpr_type, + captures_started()); + builder = stored_state->builder(); + continue; + } + case '[': { + RegExpTree* atom = ParseCharacterClass(); + if (!atom) + return nullptr; + builder->AddAtom(atom); + break; + } + // Atom :: + // \ AtomEscape + case '\\': + switch (Next()) { + case kEndMarker: + return ReportError(JSMSG_ESCAPE_AT_END_OF_REGEXP); + case 'b': + Advance(2); + builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::BOUNDARY)); + continue; + case 'B': + Advance(2); + builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::NON_BOUNDARY)); + continue; + // AtomEscape :: + // CharacterClassEscape + // + // CharacterClassEscape :: one of + // d D s S w W + case 'D': case 'S': case 'W': + if (unicode_) { + Advance(); + builder->AddAtom(UnicodeCharacterClassEscapeAtom(alloc, current(), + ignore_case_)); + Advance(); + break; + } + MOZ_FALLTHROUGH; + case 'd': case 's': case 'w': { + widechar c = Next(); + Advance(2); + CharacterRangeVector* ranges = + alloc->newInfallible<CharacterRangeVector>(*alloc); + if (unicode_) + CharacterRange::AddClassEscapeUnicode(alloc, c, ranges, ignore_case_); + else + CharacterRange::AddClassEscape(alloc, c, ranges); + RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false); + builder->AddAtom(atom); + break; + } + case '1': case '2': case '3': case '4': case '5': case '6': + case '7': case '8': case '9': { + int index = 0; + if (ParseBackReferenceIndex(&index)) { + RegExpCapture* capture = nullptr; + if (captures_ != nullptr && index <= (int) captures_->length()) { + capture = (*captures_)[index - 1]; + } + if (capture == nullptr) { + builder->AddEmpty(); + break; + } + RegExpTree* atom = alloc->newInfallible<RegExpBackReference>(capture); + if (unicode_) + builder->AddAtom(UnicodeBackReferenceAtom(alloc, atom)); + else + builder->AddAtom(atom); + break; + } + if (unicode_) + return ReportError(JSMSG_BACK_REF_OUT_OF_RANGE); + widechar first_digit = Next(); + if (first_digit == '8' || first_digit == '9') { + // Treat as identity escape + builder->AddCharacter(first_digit); + Advance(2); + break; + } + MOZ_FALLTHROUGH; + } + case '0': { + if (unicode_) { + Advance(2); + if (IsDecimalDigit(current())) + return ReportError(JSMSG_INVALID_DECIMAL_ESCAPE); + builder->AddCharacter(0); + break; + } + + Advance(); + widechar octal = ParseOctalLiteral(); + builder->AddCharacter(octal); + break; + } + // ControlEscape :: one of + // f n r t v + case 'f': + Advance(2); + builder->AddCharacter('\f'); + break; + case 'n': + Advance(2); + builder->AddCharacter('\n'); + break; + case 'r': + Advance(2); + builder->AddCharacter('\r'); + break; + case 't': + Advance(2); + builder->AddCharacter('\t'); + break; + case 'v': + Advance(2); + builder->AddCharacter('\v'); + break; + case 'c': { + Advance(); + widechar controlLetter = Next(); + // Special case if it is an ASCII letter. + // Convert lower case letters to uppercase. + widechar letter = controlLetter & ~('a' ^ 'A'); + if (letter < 'A' || 'Z' < letter) { + if (unicode_) + return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE); + // controlLetter is not in range 'A'-'Z' or 'a'-'z'. + // This is outside the specification. We match JSC in + // reading the backslash as a literal character instead + // of as starting an escape. + builder->AddCharacter('\\'); + } else { + Advance(2); + builder->AddCharacter(controlLetter & 0x1f); + } + break; + } + case 'x': { + Advance(2); + widechar value; + if (ParseHexEscape(2, &value)) { + builder->AddCharacter(value); + } else { + if (unicode_) + return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE); + builder->AddCharacter('x'); + } + break; + } + case 'u': { + Advance(2); + widechar value; + if (unicode_) { + if (current() == '{') { + if (!ParseBracedHexEscape(&value)) + return nullptr; + if (unicode::IsLeadSurrogate(value)) { + builder->AddAtom(LeadSurrogateAtom(alloc, value)); + } else if (unicode::IsTrailSurrogate(value)) { + builder->AddAtom(TrailSurrogateAtom(alloc, value)); + } else if (value >= unicode::NonBMPMin) { + char16_t lead, trail; + unicode::UTF16Encode(value, &lead, &trail); + builder->AddAtom(SurrogatePairAtom(alloc, lead, trail, + ignore_case_)); + } else { + builder->AddCharacter(value); + } + } else if (ParseHexEscape(4, &value)) { + if (unicode::IsLeadSurrogate(value)) { + widechar trail; + if (ParseTrailSurrogate(&trail)) { + builder->AddAtom(SurrogatePairAtom(alloc, value, trail, + ignore_case_)); + } else { + builder->AddAtom(LeadSurrogateAtom(alloc, value)); + } + } else if (unicode::IsTrailSurrogate(value)) { + builder->AddAtom(TrailSurrogateAtom(alloc, value)); + } else { + builder->AddCharacter(value); + } + } else { + return ReportError(JSMSG_INVALID_UNICODE_ESCAPE); + } + break; + } + if (ParseHexEscape(4, &value)) { + builder->AddCharacter(value); + } else { + builder->AddCharacter('u'); + } + break; + } + default: + // Identity escape. + if (unicode_ && !IsSyntaxCharacter(Next())) + return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE); + builder->AddCharacter(Next()); + Advance(2); + break; + } + break; + case '{': { + if (unicode_) + return ReportError(JSMSG_RAW_BRACE_IN_REGEP); + int dummy; + if (ParseIntervalQuantifier(&dummy, &dummy)) + return ReportError(JSMSG_NOTHING_TO_REPEAT); + MOZ_FALLTHROUGH; + } + default: + if (unicode_) { + char16_t lead, trail; + if (ParseRawSurrogatePair(&lead, &trail)) { + builder->AddAtom(SurrogatePairAtom(alloc, lead, trail, ignore_case_)); + } else { + widechar c = current(); + if (unicode::IsLeadSurrogate(c)) + builder->AddAtom(LeadSurrogateAtom(alloc, c)); + else if (unicode::IsTrailSurrogate(c)) + builder->AddAtom(TrailSurrogateAtom(alloc, c)); + else if (c == ']') + return ReportError(JSMSG_RAW_BRACKET_IN_REGEP); + else if (c == '}') + return ReportError(JSMSG_RAW_BRACE_IN_REGEP); + else + builder->AddCharacter(c); + Advance(); + } + break; + } + builder->AddCharacter(current()); + Advance(); + break; + } // end switch(current()) + + int min; + int max; + switch (current()) { + // QuantifierPrefix :: + // * + // + + // ? + // { + case '*': + min = 0; + max = RegExpTree::kInfinity; + Advance(); + break; + case '+': + min = 1; + max = RegExpTree::kInfinity; + Advance(); + break; + case '?': + min = 0; + max = 1; + Advance(); + break; + case '{': + if (ParseIntervalQuantifier(&min, &max)) { + if (max < min) + return ReportError(JSMSG_NUMBERS_OUT_OF_ORDER); + break; + } else { + continue; + } + default: + continue; + } + RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; + if (current() == '?') { + quantifier_type = RegExpQuantifier::NON_GREEDY; + Advance(); + } + builder->AddQuantifierToAtom(min, max, quantifier_type); + } +} + +template class irregexp::RegExpParser<Latin1Char>; +template class irregexp::RegExpParser<char16_t>; + +template <typename CharT> +static bool +ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, size_t length, + bool multiline, bool match_only, bool unicode, bool ignore_case, + bool global, bool sticky, RegExpCompileData* data) +{ + if (match_only) { + // Try to strip a leading '.*' from the RegExp, but only if it is not + // followed by a '?' (which will affect how the .* is parsed). This + // pattern will affect the captures produced by the RegExp, but not + // whether there is a match or not. + if (length >= 3 && chars[0] == '.' && chars[1] == '*' && chars[2] != '?') { + chars += 2; + length -= 2; + } + + // Try to strip a trailing '.*' from the RegExp, which as above will + // affect the captures but not whether there is a match. Only do this + // when the following conditions are met: + // 1. there are no other meta characters in the RegExp, so that we + // are sure this will not affect how the RegExp is parsed + // 2. global and sticky flags are not set, as lastIndex needs to be + // set properly on global or sticky match + if (length >= 3 && !HasRegExpMetaChars(chars, length - 2) && + !global && !sticky && + chars[length - 2] == '.' && chars[length - 1] == '*') + { + length -= 2; + } + } + + RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, multiline, unicode, ignore_case); + data->tree = parser.ParsePattern(); + if (!data->tree) + return false; + + data->simple = parser.simple(); + data->contains_anchor = parser.contains_anchor(); + data->capture_count = parser.captures_started(); + return true; +} + +bool +irregexp::ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str, + bool multiline, bool match_only, bool unicode, bool ignore_case, + bool global, bool sticky, RegExpCompileData* data) +{ + JS::AutoCheckCannotGC nogc; + return str->hasLatin1Chars() + ? ::ParsePattern(ts, alloc, str->latin1Chars(nogc), str->length(), + multiline, match_only, unicode, ignore_case, global, sticky, data) + : ::ParsePattern(ts, alloc, str->twoByteChars(nogc), str->length(), + multiline, match_only, unicode, ignore_case, global, sticky, data); +} + +template <typename CharT> +static bool +ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, size_t length, + bool unicode) +{ + LifoAllocScope scope(&alloc); + + RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, false, unicode, false); + return parser.ParsePattern() != nullptr; +} + +bool +irregexp::ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str, + bool unicode) +{ + JS::AutoCheckCannotGC nogc; + return str->hasLatin1Chars() + ? ::ParsePatternSyntax(ts, alloc, str->latin1Chars(nogc), str->length(), unicode) + : ::ParsePatternSyntax(ts, alloc, str->twoByteChars(nogc), str->length(), unicode); +} |