/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- * vim: set ts=8 sts=4 et sw=4 tw=99: */ // Copyright 2012 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "irregexp/RegExpParser.h" #include "frontend/TokenStream.h" using namespace js; using namespace js::irregexp; // ---------------------------------------------------------------------------- // RegExpBuilder RegExpBuilder::RegExpBuilder(LifoAlloc* alloc) : alloc(alloc), pending_empty_(false), characters_(nullptr) #ifdef DEBUG , last_added_(ADD_NONE) #endif {} void RegExpBuilder::FlushCharacters() { pending_empty_ = false; if (characters_ != nullptr) { RegExpTree* atom = alloc->newInfallible<RegExpAtom>(characters_); characters_ = nullptr; text_.Add(alloc, atom); #ifdef DEBUG last_added_ = ADD_ATOM; #endif } } void RegExpBuilder::FlushText() { FlushCharacters(); int num_text = text_.length(); if (num_text == 0) return; if (num_text == 1) { terms_.Add(alloc, text_.last()); } else { RegExpText* text = alloc->newInfallible<RegExpText>(alloc); for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text); terms_.Add(alloc, text); } text_.Clear(); } void RegExpBuilder::AddCharacter(char16_t c) { pending_empty_ = false; if (characters_ == nullptr) characters_ = alloc->newInfallible<CharacterVector>(*alloc); characters_->append(c); #ifdef DEBUG last_added_ = ADD_CHAR; #endif } void RegExpBuilder::AddEmpty() { pending_empty_ = true; } void RegExpBuilder::AddAtom(RegExpTree* term) { if (term->IsEmpty()) { AddEmpty(); return; } if (term->IsTextElement()) { FlushCharacters(); text_.Add(alloc, term); } else { FlushText(); terms_.Add(alloc, term); } #ifdef DEBUG last_added_ = ADD_ATOM; #endif } void RegExpBuilder::AddAssertion(RegExpTree* assert) { FlushText(); terms_.Add(alloc, assert); #ifdef DEBUG last_added_ = ADD_ASSERT; #endif } void RegExpBuilder::NewAlternative() { FlushTerms(); } void RegExpBuilder::FlushTerms() { FlushText(); int num_terms = terms_.length(); RegExpTree* alternative; if (num_terms == 0) alternative = RegExpEmpty::GetInstance(); else if (num_terms == 1) alternative = terms_.last(); else alternative = alloc->newInfallible<RegExpAlternative>(terms_.GetList(alloc)); alternatives_.Add(alloc, alternative); terms_.Clear(); #ifdef DEBUG last_added_ = ADD_NONE; #endif } RegExpTree* RegExpBuilder::ToRegExp() { FlushTerms(); int num_alternatives = alternatives_.length(); if (num_alternatives == 0) { return RegExpEmpty::GetInstance(); } if (num_alternatives == 1) { return alternatives_.last(); } return alloc->newInfallible<RegExpDisjunction>(alternatives_.GetList(alloc)); } void RegExpBuilder::AddQuantifierToAtom(int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { if (pending_empty_) { pending_empty_ = false; return; } RegExpTree* atom; if (characters_ != nullptr) { MOZ_ASSERT(last_added_ == ADD_CHAR); // Last atom was character. CharacterVector* char_vector = characters_; int num_chars = char_vector->length(); if (num_chars > 1) { CharacterVector* prefix = alloc->newInfallible<CharacterVector>(*alloc); prefix->append(char_vector->begin(), num_chars - 1); text_.Add(alloc, alloc->newInfallible<RegExpAtom>(prefix)); char_vector = alloc->newInfallible<CharacterVector>(*alloc); char_vector->append((*characters_)[num_chars - 1]); } characters_ = nullptr; atom = alloc->newInfallible<RegExpAtom>(char_vector); FlushText(); } else if (text_.length() > 0) { MOZ_ASSERT(last_added_ == ADD_ATOM); atom = text_.RemoveLast(); FlushText(); } else if (terms_.length() > 0) { MOZ_ASSERT(last_added_ == ADD_ATOM); atom = terms_.RemoveLast(); if (atom->max_match() == 0) { // Guaranteed to only match an empty string. #ifdef DEBUG last_added_ = ADD_TERM; #endif if (min == 0) return; terms_.Add(alloc, atom); return; } } else { // Only call immediately after adding an atom or character! MOZ_CRASH("Bad call"); } terms_.Add(alloc, alloc->newInfallible<RegExpQuantifier>(min, max, quantifier_type, atom)); #ifdef DEBUG last_added_ = ADD_TERM; #endif } // ---------------------------------------------------------------------------- // RegExpParser template <typename CharT> RegExpParser<CharT>::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc, const CharT* chars, const CharT* end, bool multiline_mode, bool unicode, bool ignore_case) : ts(ts), alloc(alloc), captures_(nullptr), next_pos_(chars), end_(end), current_(kEndMarker), capture_count_(0), has_more_(true), multiline_(multiline_mode), unicode_(unicode), ignore_case_(ignore_case), simple_(false), contains_anchor_(false), is_scanned_for_captures_(false) { Advance(); } template <typename CharT> RegExpTree* RegExpParser<CharT>::ReportError(unsigned errorNumber) { gc::AutoSuppressGC suppressGC(ts.context()); ts.reportError(errorNumber); return nullptr; } template <typename CharT> void RegExpParser<CharT>::Advance() { if (next_pos_ < end_) { current_ = *next_pos_; next_pos_++; } else { current_ = kEndMarker; has_more_ = false; } } // Returns the value (0 .. 15) of a hexadecimal character c. // If c is not a legal hexadecimal character, returns a value < 0. inline int HexValue(uint32_t c) { c -= '0'; if (static_cast<unsigned>(c) <= 9) return c; c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36. if (static_cast<unsigned>(c) <= 5) return c + 10; return -1; } template <typename CharT> widechar RegExpParser<CharT>::ParseOctalLiteral() { MOZ_ASSERT('0' <= current() && current() <= '7'); // For compatibility with some other browsers (not all), we parse // up to three octal digits with a value below 256. widechar value = current() - '0'; Advance(); if ('0' <= current() && current() <= '7') { value = value * 8 + current() - '0'; Advance(); if (value < 32 && '0' <= current() && current() <= '7') { value = value * 8 + current() - '0'; Advance(); } } return value; } template <typename CharT> bool RegExpParser<CharT>::ParseHexEscape(int length, widechar* value) { const CharT* start = position(); uint32_t val = 0; bool done = false; for (int i = 0; !done; i++) { widechar c = current(); int d = HexValue(c); if (d < 0) { Reset(start); return false; } val = val * 16 + d; Advance(); if (i == length - 1) { done = true; } } *value = val; return true; } template <typename CharT> bool RegExpParser<CharT>::ParseBracedHexEscape(widechar* value) { MOZ_ASSERT(current() == '{'); Advance(); bool first = true; uint32_t code = 0; while (true) { widechar c = current(); if (c == kEndMarker) { ReportError(JSMSG_INVALID_UNICODE_ESCAPE); return false; } if (c == '}') { if (first) { ReportError(JSMSG_INVALID_UNICODE_ESCAPE); return false; } Advance(); break; } int d = HexValue(c); if (d < 0) { ReportError(JSMSG_INVALID_UNICODE_ESCAPE); return false; } code = (code << 4) | d; if (code > unicode::NonBMPMax) { ReportError(JSMSG_UNICODE_OVERFLOW); return false; } Advance(); first = false; } *value = code; return true; } template <typename CharT> bool RegExpParser<CharT>::ParseTrailSurrogate(widechar* value) { if (current() != '\\') return false; const CharT* start = position(); Advance(); if (current() != 'u') { Reset(start); return false; } Advance(); if (!ParseHexEscape(4, value)) { Reset(start); return false; } if (!unicode::IsTrailSurrogate(*value)) { Reset(start); return false; } return true; } template <typename CharT> bool RegExpParser<CharT>::ParseRawSurrogatePair(char16_t* lead, char16_t* trail) { widechar c1 = current(); if (!unicode::IsLeadSurrogate(c1)) return false; const CharT* start = position(); Advance(); widechar c2 = current(); if (!unicode::IsTrailSurrogate(c2)) { Reset(start); return false; } Advance(); *lead = c1; *trail = c2; return true; } static inline RegExpTree* RangeAtom(LifoAlloc* alloc, char16_t from, char16_t to) { CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); ranges->append(CharacterRange::Range(from, to)); return alloc->newInfallible<RegExpCharacterClass>(ranges, false); } static inline RegExpTree* NegativeLookahead(LifoAlloc* alloc, char16_t from, char16_t to) { return alloc->newInfallible<RegExpLookahead>(RangeAtom(alloc, from, to), false, 0, 0); } static bool IsSyntaxCharacter(widechar c) { switch (c) { case '^': case '$': case '\\': case '.': case '*': case '+': case '?': case '(': case ')': case '[': case ']': case '{': case '}': case '|': case '/': return true; default: return false; } } #ifdef DEBUG // Currently only used in an assert.kASSERT. static bool IsSpecialClassEscape(widechar c) { switch (c) { case 'd': case 'D': case 's': case 'S': case 'w': case 'W': return true; default: return false; } } #endif template <typename CharT> bool RegExpParser<CharT>::ParseClassCharacterEscape(widechar* code) { MOZ_ASSERT(current() == '\\'); MOZ_ASSERT(has_next() && !IsSpecialClassEscape(Next())); Advance(); switch (current()) { case 'b': Advance(); *code = '\b'; return true; // ControlEscape :: one of // f n r t v case 'f': Advance(); *code = '\f'; return true; case 'n': Advance(); *code = '\n'; return true; case 'r': Advance(); *code = '\r'; return true; case 't': Advance(); *code = '\t'; return true; case 'v': Advance(); *code = '\v'; return true; case 'c': { widechar controlLetter = Next(); widechar letter = controlLetter & ~('A' ^ 'a'); // For compatibility with JSC, inside a character class // we also accept digits and underscore as control characters, // but only in non-unicode mode if ((!unicode_ && ((controlLetter >= '0' && controlLetter <= '9') || controlLetter == '_')) || (letter >= 'A' && letter <= 'Z')) { Advance(2); // Control letters mapped to ASCII control characters in the range // 0x00-0x1f. *code = controlLetter & 0x1f; return true; } if (unicode_) { ReportError(JSMSG_INVALID_IDENTITY_ESCAPE); return false; } // We match JSC in reading the backslash as a literal // character instead of as starting an escape. *code = '\\'; return true; } case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': if (unicode_) { if (current() == '0') { Advance(); *code = 0; return true; } ReportError(JSMSG_INVALID_IDENTITY_ESCAPE); return false; } // For compatibility, outside of unicode mode, we interpret a decimal // escape that isn't a back reference (and therefore either \0 or not // valid according to the specification) as a 1..3 digit octal // character code. *code = ParseOctalLiteral(); return true; case 'x': { Advance(); widechar value; if (ParseHexEscape(2, &value)) { *code = value; return true; } if (unicode_) { ReportError(JSMSG_INVALID_IDENTITY_ESCAPE); return false; } // If \x is not followed by a two-digit hexadecimal, treat it // as an identity escape in non-unicode mode. *code = 'x'; return true; } case 'u': { Advance(); widechar value; if (unicode_) { if (current() == '{') { if (!ParseBracedHexEscape(&value)) return false; *code = value; return true; } if (ParseHexEscape(4, &value)) { if (unicode::IsLeadSurrogate(value)) { widechar trail; if (ParseTrailSurrogate(&trail)) { *code = unicode::UTF16Decode(value, trail); return true; } } *code = value; return true; } ReportError(JSMSG_INVALID_UNICODE_ESCAPE); return false; } if (ParseHexEscape(4, &value)) { *code = value; return true; } // If \u is not followed by a four-digit or braced hexadecimal, treat it // as an identity escape. *code = 'u'; return true; } default: { // Extended identity escape (non-unicode only). We accept any character // that hasn't been matched by a more specific case, not just the subset // required by the ECMAScript specification. widechar result = current(); if (unicode_ && result != '-' && !IsSyntaxCharacter(result)) { ReportError(JSMSG_INVALID_IDENTITY_ESCAPE); return false; } Advance(); *code = result; return true; } } return true; } class WideCharRange { public: WideCharRange() : from_(0), to_(0) {} WideCharRange(widechar from, widechar to) : from_(from), to_(to) {} static inline WideCharRange Singleton(widechar value) { return WideCharRange(value, value); } static inline WideCharRange Range(widechar from, widechar to) { MOZ_ASSERT(from <= to); return WideCharRange(from, to); } bool Contains(widechar i) const { return from_ <= i && i <= to_; } widechar from() const { return from_; } widechar to() const { return to_; } private: widechar from_; widechar to_; }; typedef InfallibleVector<WideCharRange, 1> WideCharRangeVector; static inline CharacterRange LeadSurrogateRange() { return CharacterRange::Range(unicode::LeadSurrogateMin, unicode::LeadSurrogateMax); } static inline CharacterRange TrailSurrogateRange() { return CharacterRange::Range(unicode::TrailSurrogateMin, unicode::TrailSurrogateMax); } static inline WideCharRange NonBMPRange() { return WideCharRange::Range(unicode::NonBMPMin, unicode::NonBMPMax); } static const char16_t kNoCharClass = 0; // Adds a character or pre-defined character class to character ranges. // If char_class is not kInvalidClass, it's interpreted as a class // escape (i.e., 's' means whitespace, from '\s'). static inline void AddCharOrEscape(LifoAlloc* alloc, CharacterRangeVector* ranges, char16_t char_class, widechar c) { if (char_class != kNoCharClass) CharacterRange::AddClassEscape(alloc, char_class, ranges); else ranges->append(CharacterRange::Singleton(c)); } static inline void AddCharOrEscapeUnicode(LifoAlloc* alloc, CharacterRangeVector* ranges, CharacterRangeVector* lead_ranges, CharacterRangeVector* trail_ranges, WideCharRangeVector* wide_ranges, char16_t char_class, widechar c, bool ignore_case) { if (char_class != kNoCharClass) { CharacterRange::AddClassEscapeUnicode(alloc, char_class, ranges, ignore_case); switch (char_class) { case 'S': case 'W': case 'D': lead_ranges->append(LeadSurrogateRange()); trail_ranges->append(TrailSurrogateRange()); wide_ranges->append(NonBMPRange()); break; case '.': MOZ_CRASH("Bad char_class!"); } return; } if (unicode::IsLeadSurrogate(c)) lead_ranges->append(CharacterRange::Singleton(c)); else if (unicode::IsTrailSurrogate(c)) trail_ranges->append(CharacterRange::Singleton(c)); else if (c >= unicode::NonBMPMin) wide_ranges->append(WideCharRange::Singleton(c)); else ranges->append(CharacterRange::Singleton(c)); } static inline void AddUnicodeRange(LifoAlloc* alloc, CharacterRangeVector* ranges, CharacterRangeVector* lead_ranges, CharacterRangeVector* trail_ranges, WideCharRangeVector* wide_ranges, widechar first, widechar next) { MOZ_ASSERT(first <= next); if (first < unicode::LeadSurrogateMin) { if (next < unicode::LeadSurrogateMin) { ranges->append(CharacterRange::Range(first, next)); return; } ranges->append(CharacterRange::Range(first, unicode::LeadSurrogateMin - 1)); first = unicode::LeadSurrogateMin; } if (first <= unicode::LeadSurrogateMax) { if (next <= unicode::LeadSurrogateMax) { lead_ranges->append(CharacterRange::Range(first, next)); return; } lead_ranges->append(CharacterRange::Range(first, unicode::LeadSurrogateMax)); first = unicode::LeadSurrogateMax + 1; } MOZ_ASSERT(unicode::LeadSurrogateMax + 1 == unicode::TrailSurrogateMin); if (first <= unicode::TrailSurrogateMax) { if (next <= unicode::TrailSurrogateMax) { trail_ranges->append(CharacterRange::Range(first, next)); return; } trail_ranges->append(CharacterRange::Range(first, unicode::TrailSurrogateMax)); first = unicode::TrailSurrogateMax + 1; } if (first <= unicode::UTF16Max) { if (next <= unicode::UTF16Max) { ranges->append(CharacterRange::Range(first, next)); return; } ranges->append(CharacterRange::Range(first, unicode::UTF16Max)); first = unicode::NonBMPMin; } MOZ_ASSERT(unicode::UTF16Max + 1 == unicode::NonBMPMin); wide_ranges->append(WideCharRange::Range(first, next)); } // Negate a vector of ranges by subtracting its ranges from a range // encompassing the full range of possible values. template <typename RangeType> static inline void NegateUnicodeRanges(LifoAlloc* alloc, InfallibleVector<RangeType, 1>** ranges, RangeType full_range) { typedef InfallibleVector<RangeType, 1> RangeVector; RangeVector* tmp_ranges = alloc->newInfallible<RangeVector>(*alloc); tmp_ranges->append(full_range); RangeVector* result_ranges = alloc->newInfallible<RangeVector>(*alloc); // Perform the following calculation: // result_ranges = tmp_ranges - ranges // with the following steps: // result_ranges = tmp_ranges - ranges[0] // SWAP(result_ranges, tmp_ranges) // result_ranges = tmp_ranges - ranges[1] // SWAP(result_ranges, tmp_ranges) // ... // result_ranges = tmp_ranges - ranges[N-1] // SWAP(result_ranges, tmp_ranges) // The last SWAP is just for simplicity of the loop. for (size_t i = 0; i < (*ranges)->length(); i++) { result_ranges->clear(); const RangeType& range = (**ranges)[i]; for (size_t j = 0; j < tmp_ranges->length(); j++) { const RangeType& tmpRange = (*tmp_ranges)[j]; auto from1 = tmpRange.from(); auto to1 = tmpRange.to(); auto from2 = range.from(); auto to2 = range.to(); if (from1 < from2) { if (to1 < from2) { result_ranges->append(tmpRange); } else if (to1 <= to2) { result_ranges->append(RangeType::Range(from1, from2 - 1)); } else { result_ranges->append(RangeType::Range(from1, from2 - 1)); result_ranges->append(RangeType::Range(to2 + 1, to1)); } } else if (from1 <= to2) { if (to1 > to2) result_ranges->append(RangeType::Range(to2 + 1, to1)); } else { result_ranges->append(tmpRange); } } auto tmp = tmp_ranges; tmp_ranges = result_ranges; result_ranges = tmp; } // After the loop, result is pointed at by tmp_ranges, instead of // result_ranges. *ranges = tmp_ranges; } static bool WideCharRangesContain(WideCharRangeVector* wide_ranges, widechar c) { for (size_t i = 0; i < wide_ranges->length(); i++) { const WideCharRange& range = (*wide_ranges)[i]; if (range.Contains(c)) return true; } return false; } static void CalculateCaseInsensitiveRanges(LifoAlloc* alloc, widechar from, widechar to, int32_t diff, WideCharRangeVector* wide_ranges, WideCharRangeVector** tmp_wide_ranges) { widechar contains_from = 0; widechar contains_to = 0; for (widechar c = from; c <= to; c++) { if (WideCharRangesContain(wide_ranges, c) && !WideCharRangesContain(wide_ranges, c + diff)) { if (contains_from == 0) contains_from = c; contains_to = c; } else if (contains_from != 0) { if (!*tmp_wide_ranges) *tmp_wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc); (*tmp_wide_ranges)->append(WideCharRange::Range(contains_from + diff, contains_to + diff)); contains_from = 0; } } if (contains_from != 0) { if (!*tmp_wide_ranges) *tmp_wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc); (*tmp_wide_ranges)->append(WideCharRange::Range(contains_from + diff, contains_to + diff)); } } static RegExpTree* UnicodeRangesAtom(LifoAlloc* alloc, CharacterRangeVector* ranges, CharacterRangeVector* lead_ranges, CharacterRangeVector* trail_ranges, WideCharRangeVector* wide_ranges, bool is_negated, bool ignore_case) { // Calculate case folding for non-BMP first and negate the range if needed. if (ignore_case) { WideCharRangeVector* tmp_wide_ranges = nullptr; #define CALL_CALC(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \ CalculateCaseInsensitiveRanges(alloc, FROM, TO, DIFF, wide_ranges, &tmp_wide_ranges); FOR_EACH_NON_BMP_CASE_FOLDING(CALL_CALC) FOR_EACH_NON_BMP_REV_CASE_FOLDING(CALL_CALC) #undef CALL_CALC if (tmp_wide_ranges) { for (size_t i = 0; i < tmp_wide_ranges->length(); i++) wide_ranges->append((*tmp_wide_ranges)[i]); } } if (is_negated) { NegateUnicodeRanges(alloc, &lead_ranges, LeadSurrogateRange()); NegateUnicodeRanges(alloc, &trail_ranges, TrailSurrogateRange()); NegateUnicodeRanges(alloc, &wide_ranges, NonBMPRange()); } RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc); bool added = false; if (is_negated) { ranges->append(LeadSurrogateRange()); ranges->append(TrailSurrogateRange()); } if (ranges->length() > 0) { builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, is_negated)); added = true; } if (lead_ranges->length() > 0) { if (added) builder->NewAlternative(); builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(lead_ranges, false)); builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax)); added = true; } if (trail_ranges->length() > 0) { if (added) builder->NewAlternative(); builder->AddAssertion(alloc->newInfallible<RegExpAssertion>( RegExpAssertion::NOT_AFTER_LEAD_SURROGATE)); builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(trail_ranges, false)); added = true; } for (size_t i = 0; i < wide_ranges->length(); i++) { if (added) builder->NewAlternative(); const WideCharRange& range = (*wide_ranges)[i]; widechar from = range.from(); widechar to = range.to(); char16_t from_lead, from_trail; char16_t to_lead, to_trail; unicode::UTF16Encode(from, &from_lead, &from_trail); if (from == to) { builder->AddCharacter(from_lead); builder->AddCharacter(from_trail); } else { unicode::UTF16Encode(to, &to_lead, &to_trail); if (from_lead == to_lead) { MOZ_ASSERT(from_trail != to_trail); builder->AddCharacter(from_lead); builder->AddAtom(RangeAtom(alloc, from_trail, to_trail)); } else if (from_trail == unicode::TrailSurrogateMin && to_trail == unicode::TrailSurrogateMax) { builder->AddAtom(RangeAtom(alloc, from_lead, to_lead)); builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax)); } else if (from_lead + 1 == to_lead) { builder->AddCharacter(from_lead); builder->AddAtom(RangeAtom(alloc, from_trail, unicode::TrailSurrogateMax)); builder->NewAlternative(); builder->AddCharacter(to_lead); builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, to_trail)); } else if (from_lead + 2 == to_lead) { builder->AddCharacter(from_lead); builder->AddAtom(RangeAtom(alloc, from_trail, unicode::TrailSurrogateMax)); builder->NewAlternative(); builder->AddCharacter(from_lead + 1); builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax)); builder->NewAlternative(); builder->AddCharacter(to_lead); builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, to_trail)); } else { builder->AddCharacter(from_lead); builder->AddAtom(RangeAtom(alloc, from_trail, unicode::TrailSurrogateMax)); builder->NewAlternative(); builder->AddAtom(RangeAtom(alloc, from_lead + 1, to_lead - 1)); builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax)); builder->NewAlternative(); builder->AddCharacter(to_lead); builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, to_trail)); } } added = true; } return builder->ToRegExp(); } template <typename CharT> RegExpTree* RegExpParser<CharT>::ParseCharacterClass() { MOZ_ASSERT(current() == '['); Advance(); bool is_negated = false; if (current() == '^') { is_negated = true; Advance(); } CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); CharacterRangeVector* lead_ranges = nullptr; CharacterRangeVector* trail_ranges = nullptr; WideCharRangeVector* wide_ranges = nullptr; if (unicode_) { lead_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); trail_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc); } while (has_more() && current() != ']') { char16_t char_class = kNoCharClass; widechar first = 0; if (!ParseClassAtom(&char_class, &first)) return nullptr; if (current() == '-') { Advance(); if (current() == kEndMarker) { // If we reach the end we break out of the loop and let the // following code report an error. break; } else if (current() == ']') { if (unicode_) { AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, char_class, first, ignore_case_); } else { AddCharOrEscape(alloc, ranges, char_class, first); } ranges->append(CharacterRange::Singleton('-')); break; } char16_t char_class_2 = kNoCharClass; widechar next = 0; if (!ParseClassAtom(&char_class_2, &next)) return nullptr; if (char_class != kNoCharClass || char_class_2 != kNoCharClass) { if (unicode_) return ReportError(JSMSG_RANGE_WITH_CLASS_ESCAPE); // Either end is an escaped character class. Treat the '-' verbatim. AddCharOrEscape(alloc, ranges, char_class, first); ranges->append(CharacterRange::Singleton('-')); AddCharOrEscape(alloc, ranges, char_class_2, next); continue; } if (first > next) return ReportError(JSMSG_BAD_CLASS_RANGE); if (unicode_) AddUnicodeRange(alloc, ranges, lead_ranges, trail_ranges,wide_ranges, first, next); else ranges->append(CharacterRange::Range(first, next)); } else { if (unicode_) { AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, char_class, first, ignore_case_); } else { AddCharOrEscape(alloc, ranges, char_class, first); } } } if (!has_more()) return ReportError(JSMSG_UNTERM_CLASS); Advance(); if (!unicode_) { if (ranges->length() == 0) { ranges->append(CharacterRange::Everything()); is_negated = !is_negated; } return alloc->newInfallible<RegExpCharacterClass>(ranges, is_negated); } if (!is_negated && ranges->length() == 0 && lead_ranges->length() == 0 && trail_ranges->length() == 0 && wide_ranges->length() == 0) { ranges->append(CharacterRange::Everything()); return alloc->newInfallible<RegExpCharacterClass>(ranges, true); } return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, is_negated, ignore_case_); } template <typename CharT> bool RegExpParser<CharT>::ParseClassAtom(char16_t* char_class, widechar* value) { MOZ_ASSERT(*char_class == kNoCharClass); widechar first = current(); if (first == '\\') { switch (Next()) { case 'w': case 'W': case 'd': case 'D': case 's': case 'S': { *char_class = Next(); Advance(2); return true; } case kEndMarker: return ReportError(JSMSG_ESCAPE_AT_END_OF_REGEXP); default: if (!ParseClassCharacterEscape(value)) return false; return true; } } else { if (unicode_) { char16_t lead, trail; if (ParseRawSurrogatePair(&lead, &trail)) { *value = unicode::UTF16Decode(lead, trail); return true; } } Advance(); *value = first; return true; } } // In order to know whether an escape is a backreference or not we have to scan // the entire regexp and find the number of capturing parentheses. However we // don't want to scan the regexp twice unless it is necessary. This mini-parser // is called when needed. It can see the difference between capturing and // noncapturing parentheses and can skip character classes and backslash-escaped // characters. template <typename CharT> void RegExpParser<CharT>::ScanForCaptures() { // Start with captures started previous to current position int capture_count = captures_started(); // Add count of captures after this position. widechar n; while ((n = current()) != kEndMarker) { Advance(); switch (n) { case '\\': Advance(); break; case '[': { widechar c; while ((c = current()) != kEndMarker) { Advance(); if (c == '\\') { Advance(); } else { if (c == ']') break; } } break; } case '(': if (current() != '?') capture_count++; break; } } capture_count_ = capture_count; is_scanned_for_captures_ = true; } inline bool IsInRange(int value, int lower_limit, int higher_limit) { MOZ_ASSERT(lower_limit <= higher_limit); return static_cast<unsigned int>(value - lower_limit) <= static_cast<unsigned int>(higher_limit - lower_limit); } inline bool IsDecimalDigit(widechar c) { // ECMA-262, 3rd, 7.8.3 (p 16) return IsInRange(c, '0', '9'); } template <typename CharT> bool RegExpParser<CharT>::ParseBackReferenceIndex(int* index_out) { MOZ_ASSERT('\\' == current()); MOZ_ASSERT('1' <= Next() && Next() <= '9'); // Try to parse a decimal literal that is no greater than the total number // of left capturing parentheses in the input. const CharT* start = position(); int value = Next() - '0'; Advance(2); while (true) { widechar c = current(); if (IsDecimalDigit(c)) { value = 10 * value + (c - '0'); if (value > kMaxCaptures) { Reset(start); return false; } Advance(); } else { break; } } if (value > captures_started()) { if (!is_scanned_for_captures_) { const CharT* saved_position = position(); ScanForCaptures(); Reset(saved_position); } if (value > capture_count_) { Reset(start); return false; } } *index_out = value; return true; } // QuantifierPrefix :: // { DecimalDigits } // { DecimalDigits , } // { DecimalDigits , DecimalDigits } // // Returns true if parsing succeeds, and set the min_out and max_out // values. Values are truncated to RegExpTree::kInfinity if they overflow. template <typename CharT> bool RegExpParser<CharT>::ParseIntervalQuantifier(int* min_out, int* max_out) { MOZ_ASSERT(current() == '{'); const CharT* start = position(); Advance(); int min = 0; if (!IsDecimalDigit(current())) { Reset(start); return false; } while (IsDecimalDigit(current())) { int next = current() - '0'; if (min > (RegExpTree::kInfinity - next) / 10) { // Overflow. Skip past remaining decimal digits and return -1. do { Advance(); } while (IsDecimalDigit(current())); min = RegExpTree::kInfinity; break; } min = 10 * min + next; Advance(); } int max = 0; if (current() == '}') { max = min; Advance(); } else if (current() == ',') { Advance(); if (current() == '}') { max = RegExpTree::kInfinity; Advance(); } else { while (IsDecimalDigit(current())) { int next = current() - '0'; if (max > (RegExpTree::kInfinity - next) / 10) { do { Advance(); } while (IsDecimalDigit(current())); max = RegExpTree::kInfinity; break; } max = 10 * max + next; Advance(); } if (current() != '}') { Reset(start); return false; } Advance(); } } else { Reset(start); return false; } *min_out = min; *max_out = max; return true; } // Pattern :: // Disjunction template <typename CharT> RegExpTree* RegExpParser<CharT>::ParsePattern() { RegExpTree* result = ParseDisjunction(); MOZ_ASSERT_IF(result, !has_more()); return result; } static inline RegExpTree* CaseFoldingSurrogatePairAtom(LifoAlloc* alloc, char16_t lead, char16_t trail, int32_t diff) { RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc); builder->AddCharacter(lead); CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); ranges->append(CharacterRange::Range(trail, trail)); ranges->append(CharacterRange::Range(trail + diff, trail + diff)); builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, false)); return builder->ToRegExp(); } static inline RegExpTree* SurrogatePairAtom(LifoAlloc* alloc, char16_t lead, char16_t trail, bool ignore_case) { if (ignore_case) { #define CALL_ATOM(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \ if (lead == LEAD &&trail >= TRAIL_FROM && trail <= TRAIL_TO) \ return CaseFoldingSurrogatePairAtom(alloc, lead, trail, DIFF); FOR_EACH_NON_BMP_CASE_FOLDING(CALL_ATOM) FOR_EACH_NON_BMP_REV_CASE_FOLDING(CALL_ATOM) #undef CALL_ATOM } RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc); builder->AddCharacter(lead); builder->AddCharacter(trail); return builder->ToRegExp(); } static inline RegExpTree* LeadSurrogateAtom(LifoAlloc* alloc, char16_t value) { RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc); builder->AddCharacter(value); builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax)); return builder->ToRegExp(); } static inline RegExpTree* TrailSurrogateAtom(LifoAlloc* alloc, char16_t value) { RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc); builder->AddAssertion(alloc->newInfallible<RegExpAssertion>( RegExpAssertion::NOT_AFTER_LEAD_SURROGATE)); builder->AddCharacter(value); return builder->ToRegExp(); } static inline RegExpTree* UnicodeEverythingAtom(LifoAlloc* alloc) { RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc); // everything except \x0a, \x0d, \u2028 and \u2029 CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); ranges->append(CharacterRange::Range(0x0, 0x09)); ranges->append(CharacterRange::Range(0x0b, 0x0c)); ranges->append(CharacterRange::Range(0x0e, 0x2027)); ranges->append(CharacterRange::Range(0x202A, unicode::LeadSurrogateMin - 1)); ranges->append(CharacterRange::Range(unicode::TrailSurrogateMax + 1, unicode::UTF16Max)); builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, false)); builder->NewAlternative(); builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax)); builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax)); builder->NewAlternative(); builder->AddAssertion(alloc->newInfallible<RegExpAssertion>( RegExpAssertion::NOT_AFTER_LEAD_SURROGATE)); builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax)); builder->NewAlternative(); builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax)); builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax)); return builder->ToRegExp(); } RegExpTree* UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class, bool ignore_case) { CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); CharacterRangeVector* lead_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); CharacterRangeVector* trail_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); WideCharRangeVector* wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc); AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, char_class, 0, ignore_case); return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, false, false); } static inline RegExpTree* UnicodeBackReferenceAtom(LifoAlloc* alloc, RegExpTree* atom) { // If a back reference has a standalone lead surrogate as its last // character, then that lead surrogate shouldn't match lead surrogates that // are paired with a corresponding trail surrogate. RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc); builder->AddAtom(atom); builder->AddAssertion(alloc->newInfallible<RegExpAssertion>( RegExpAssertion::NOT_IN_SURROGATE_PAIR)); return builder->ToRegExp(); } // Disjunction :: // Alternative // Alternative | Disjunction // Alternative :: // [empty] // Term Alternative // Term :: // Assertion // Atom // Atom Quantifier template <typename CharT> RegExpTree* RegExpParser<CharT>::ParseDisjunction() { // Used to store current state while parsing subexpressions. RegExpParserState initial_state(alloc, nullptr, INITIAL, 0); RegExpParserState* stored_state = &initial_state; // Cache the builder in a local variable for quick access. RegExpBuilder* builder = initial_state.builder(); while (true) { switch (current()) { case kEndMarker: if (stored_state->IsSubexpression()) { // Inside a parenthesized group when hitting end of input. return ReportError(JSMSG_MISSING_PAREN); } MOZ_ASSERT(INITIAL == stored_state->group_type()); // Parsing completed successfully. return builder->ToRegExp(); case ')': { if (!stored_state->IsSubexpression()) return ReportError(JSMSG_UNMATCHED_RIGHT_PAREN); MOZ_ASSERT(INITIAL != stored_state->group_type()); Advance(); // End disjunction parsing and convert builder content to new single // regexp atom. RegExpTree* body = builder->ToRegExp(); int end_capture_index = captures_started(); int capture_index = stored_state->capture_index(); SubexpressionType group_type = stored_state->group_type(); // Restore previous state. stored_state = stored_state->previous_state(); builder = stored_state->builder(); // Build result of subexpression. if (group_type == CAPTURE) { RegExpCapture* capture = alloc->newInfallible<RegExpCapture>(body, capture_index); (*captures_)[capture_index - 1] = capture; body = capture; } else if (group_type != GROUPING) { MOZ_ASSERT(group_type == POSITIVE_LOOKAHEAD || group_type == NEGATIVE_LOOKAHEAD); bool is_positive = (group_type == POSITIVE_LOOKAHEAD); body = alloc->newInfallible<RegExpLookahead>(body, is_positive, end_capture_index - capture_index, capture_index); } builder->AddAtom(body); if (unicode_ && (group_type == POSITIVE_LOOKAHEAD || group_type == NEGATIVE_LOOKAHEAD)) continue; // For compatability with JSC and ES3, we allow quantifiers after // lookaheads, and break in all cases. break; } case '|': { Advance(); builder->NewAlternative(); continue; } case '*': case '+': case '?': return ReportError(JSMSG_NOTHING_TO_REPEAT); case '^': { Advance(); if (multiline_) { builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::START_OF_LINE)); } else { builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::START_OF_INPUT)); set_contains_anchor(); } continue; } case '$': { Advance(); RegExpAssertion::AssertionType assertion_type = multiline_ ? RegExpAssertion::END_OF_LINE : RegExpAssertion::END_OF_INPUT; builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(assertion_type)); continue; } case '.': { Advance(); // everything except \x0a, \x0d, \u2028 and \u2029 if (unicode_) { builder->AddAtom(UnicodeEverythingAtom(alloc)); break; } CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); CharacterRange::AddClassEscape(alloc, '.', ranges); RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false); builder->AddAtom(atom); break; } case '(': { SubexpressionType subexpr_type = CAPTURE; Advance(); if (current() == '?') { switch (Next()) { case ':': subexpr_type = GROUPING; break; case '=': subexpr_type = POSITIVE_LOOKAHEAD; break; case '!': subexpr_type = NEGATIVE_LOOKAHEAD; break; default: return ReportError(JSMSG_INVALID_GROUP); } Advance(2); } else { if (captures_ == nullptr) captures_ = alloc->newInfallible<RegExpCaptureVector>(*alloc); if (captures_started() >= kMaxCaptures) return ReportError(JSMSG_TOO_MANY_PARENS); captures_->append((RegExpCapture*) nullptr); } // Store current state and begin new disjunction parsing. stored_state = alloc->newInfallible<RegExpParserState>(alloc, stored_state, subexpr_type, captures_started()); builder = stored_state->builder(); continue; } case '[': { RegExpTree* atom = ParseCharacterClass(); if (!atom) return nullptr; builder->AddAtom(atom); break; } // Atom :: // \ AtomEscape case '\\': switch (Next()) { case kEndMarker: return ReportError(JSMSG_ESCAPE_AT_END_OF_REGEXP); case 'b': Advance(2); builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::BOUNDARY)); continue; case 'B': Advance(2); builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::NON_BOUNDARY)); continue; // AtomEscape :: // CharacterClassEscape // // CharacterClassEscape :: one of // d D s S w W case 'D': case 'S': case 'W': if (unicode_) { Advance(); builder->AddAtom(UnicodeCharacterClassEscapeAtom(alloc, current(), ignore_case_)); Advance(); break; } MOZ_FALLTHROUGH; case 'd': case 's': case 'w': { widechar c = Next(); Advance(2); CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); if (unicode_) CharacterRange::AddClassEscapeUnicode(alloc, c, ranges, ignore_case_); else CharacterRange::AddClassEscape(alloc, c, ranges); RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false); builder->AddAtom(atom); break; } case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { int index = 0; if (ParseBackReferenceIndex(&index)) { RegExpCapture* capture = nullptr; if (captures_ != nullptr && index <= (int) captures_->length()) { capture = (*captures_)[index - 1]; } if (capture == nullptr) { builder->AddEmpty(); break; } RegExpTree* atom = alloc->newInfallible<RegExpBackReference>(capture); if (unicode_) builder->AddAtom(UnicodeBackReferenceAtom(alloc, atom)); else builder->AddAtom(atom); break; } if (unicode_) return ReportError(JSMSG_BACK_REF_OUT_OF_RANGE); widechar first_digit = Next(); if (first_digit == '8' || first_digit == '9') { // Treat as identity escape builder->AddCharacter(first_digit); Advance(2); break; } MOZ_FALLTHROUGH; } case '0': { if (unicode_) { Advance(2); if (IsDecimalDigit(current())) return ReportError(JSMSG_INVALID_DECIMAL_ESCAPE); builder->AddCharacter(0); break; } Advance(); widechar octal = ParseOctalLiteral(); builder->AddCharacter(octal); break; } // ControlEscape :: one of // f n r t v case 'f': Advance(2); builder->AddCharacter('\f'); break; case 'n': Advance(2); builder->AddCharacter('\n'); break; case 'r': Advance(2); builder->AddCharacter('\r'); break; case 't': Advance(2); builder->AddCharacter('\t'); break; case 'v': Advance(2); builder->AddCharacter('\v'); break; case 'c': { Advance(); widechar controlLetter = Next(); // Special case if it is an ASCII letter. // Convert lower case letters to uppercase. widechar letter = controlLetter & ~('a' ^ 'A'); if (letter < 'A' || 'Z' < letter) { if (unicode_) return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE); // controlLetter is not in range 'A'-'Z' or 'a'-'z'. // This is outside the specification. We match JSC in // reading the backslash as a literal character instead // of as starting an escape. builder->AddCharacter('\\'); } else { Advance(2); builder->AddCharacter(controlLetter & 0x1f); } break; } case 'x': { Advance(2); widechar value; if (ParseHexEscape(2, &value)) { builder->AddCharacter(value); } else { if (unicode_) return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE); builder->AddCharacter('x'); } break; } case 'u': { Advance(2); widechar value; if (unicode_) { if (current() == '{') { if (!ParseBracedHexEscape(&value)) return nullptr; if (unicode::IsLeadSurrogate(value)) { builder->AddAtom(LeadSurrogateAtom(alloc, value)); } else if (unicode::IsTrailSurrogate(value)) { builder->AddAtom(TrailSurrogateAtom(alloc, value)); } else if (value >= unicode::NonBMPMin) { char16_t lead, trail; unicode::UTF16Encode(value, &lead, &trail); builder->AddAtom(SurrogatePairAtom(alloc, lead, trail, ignore_case_)); } else { builder->AddCharacter(value); } } else if (ParseHexEscape(4, &value)) { if (unicode::IsLeadSurrogate(value)) { widechar trail; if (ParseTrailSurrogate(&trail)) { builder->AddAtom(SurrogatePairAtom(alloc, value, trail, ignore_case_)); } else { builder->AddAtom(LeadSurrogateAtom(alloc, value)); } } else if (unicode::IsTrailSurrogate(value)) { builder->AddAtom(TrailSurrogateAtom(alloc, value)); } else { builder->AddCharacter(value); } } else { return ReportError(JSMSG_INVALID_UNICODE_ESCAPE); } break; } if (ParseHexEscape(4, &value)) { builder->AddCharacter(value); } else { builder->AddCharacter('u'); } break; } default: // Identity escape. if (unicode_ && !IsSyntaxCharacter(Next())) return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE); builder->AddCharacter(Next()); Advance(2); break; } break; case '{': { if (unicode_) return ReportError(JSMSG_RAW_BRACE_IN_REGEP); int dummy; if (ParseIntervalQuantifier(&dummy, &dummy)) return ReportError(JSMSG_NOTHING_TO_REPEAT); MOZ_FALLTHROUGH; } default: if (unicode_) { char16_t lead, trail; if (ParseRawSurrogatePair(&lead, &trail)) { builder->AddAtom(SurrogatePairAtom(alloc, lead, trail, ignore_case_)); } else { widechar c = current(); if (unicode::IsLeadSurrogate(c)) builder->AddAtom(LeadSurrogateAtom(alloc, c)); else if (unicode::IsTrailSurrogate(c)) builder->AddAtom(TrailSurrogateAtom(alloc, c)); else if (c == ']') return ReportError(JSMSG_RAW_BRACKET_IN_REGEP); else if (c == '}') return ReportError(JSMSG_RAW_BRACE_IN_REGEP); else builder->AddCharacter(c); Advance(); } break; } builder->AddCharacter(current()); Advance(); break; } // end switch(current()) int min; int max; switch (current()) { // QuantifierPrefix :: // * // + // ? // { case '*': min = 0; max = RegExpTree::kInfinity; Advance(); break; case '+': min = 1; max = RegExpTree::kInfinity; Advance(); break; case '?': min = 0; max = 1; Advance(); break; case '{': if (ParseIntervalQuantifier(&min, &max)) { if (max < min) return ReportError(JSMSG_NUMBERS_OUT_OF_ORDER); break; } else { continue; } default: continue; } RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; if (current() == '?') { quantifier_type = RegExpQuantifier::NON_GREEDY; Advance(); } builder->AddQuantifierToAtom(min, max, quantifier_type); } } template class irregexp::RegExpParser<Latin1Char>; template class irregexp::RegExpParser<char16_t>; template <typename CharT> static bool ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, size_t length, bool multiline, bool match_only, bool unicode, bool ignore_case, bool global, bool sticky, RegExpCompileData* data) { if (match_only) { // Try to strip a leading '.*' from the RegExp, but only if it is not // followed by a '?' (which will affect how the .* is parsed). This // pattern will affect the captures produced by the RegExp, but not // whether there is a match or not. if (length >= 3 && chars[0] == '.' && chars[1] == '*' && chars[2] != '?') { chars += 2; length -= 2; } // Try to strip a trailing '.*' from the RegExp, which as above will // affect the captures but not whether there is a match. Only do this // when the following conditions are met: // 1. there are no other meta characters in the RegExp, so that we // are sure this will not affect how the RegExp is parsed // 2. global and sticky flags are not set, as lastIndex needs to be // set properly on global or sticky match if (length >= 3 && !HasRegExpMetaChars(chars, length - 2) && !global && !sticky && chars[length - 2] == '.' && chars[length - 1] == '*') { length -= 2; } } RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, multiline, unicode, ignore_case); data->tree = parser.ParsePattern(); if (!data->tree) return false; data->simple = parser.simple(); data->contains_anchor = parser.contains_anchor(); data->capture_count = parser.captures_started(); return true; } bool irregexp::ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str, bool multiline, bool match_only, bool unicode, bool ignore_case, bool global, bool sticky, RegExpCompileData* data) { JS::AutoCheckCannotGC nogc; return str->hasLatin1Chars() ? ::ParsePattern(ts, alloc, str->latin1Chars(nogc), str->length(), multiline, match_only, unicode, ignore_case, global, sticky, data) : ::ParsePattern(ts, alloc, str->twoByteChars(nogc), str->length(), multiline, match_only, unicode, ignore_case, global, sticky, data); } template <typename CharT> static bool ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, size_t length, bool unicode) { LifoAllocScope scope(&alloc); RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, false, unicode, false); return parser.ParsePattern() != nullptr; } bool irregexp::ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str, bool unicode) { JS::AutoCheckCannotGC nogc; return str->hasLatin1Chars() ? ::ParsePatternSyntax(ts, alloc, str->latin1Chars(nogc), str->length(), unicode) : ::ParsePatternSyntax(ts, alloc, str->twoByteChars(nogc), str->length(), unicode); }