diff options
Diffstat (limited to 'js/src/irregexp/RegExpParser.h')
-rw-r--r-- | js/src/irregexp/RegExpParser.h | 310 |
1 files changed, 310 insertions, 0 deletions
diff --git a/js/src/irregexp/RegExpParser.h b/js/src/irregexp/RegExpParser.h new file mode 100644 index 000000000..b5228a86f --- /dev/null +++ b/js/src/irregexp/RegExpParser.h @@ -0,0 +1,310 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- + * vim: set ts=8 sts=4 et sw=4 tw=99: */ + +// Copyright 2012 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef V8_PARSER_H_ +#define V8_PARSER_H_ + +#include "irregexp/RegExpAST.h" + +namespace js { + +namespace frontend { + class TokenStream; +} + +namespace irregexp { + +bool +ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str, + bool multiline, bool match_only, bool unicode, bool ignore_case, + bool global, bool sticky, RegExpCompileData* data); + +bool +ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str, + bool unicode); + +// A BufferedVector is an automatically growing list, just like (and backed +// by) a Vector, that is optimized for the case of adding and removing +// a single element. The last element added is stored outside the backing list, +// and if no more than one element is ever added, the ZoneList isn't even +// allocated. +// Elements must not be nullptr pointers. +template <typename T, int initial_size> +class BufferedVector +{ + public: + typedef InfallibleVector<T*, 1> VectorType; + + BufferedVector() : list_(nullptr), last_(nullptr) {} + + // Adds element at end of list. This element is buffered and can + // be read using last() or removed using RemoveLast until a new Add or until + // RemoveLast or GetList has been called. + void Add(LifoAlloc* alloc, T* value) { + if (last_ != nullptr) { + if (list_ == nullptr) { + list_ = alloc->newInfallible<VectorType>(*alloc); + list_->reserve(initial_size); + } + list_->append(last_); + } + last_ = value; + } + + T* last() { + MOZ_ASSERT(last_ != nullptr); + return last_; + } + + T* RemoveLast() { + MOZ_ASSERT(last_ != nullptr); + T* result = last_; + if ((list_ != nullptr) && (list_->length() > 0)) + last_ = list_->popCopy(); + else + last_ = nullptr; + return result; + } + + T* Get(int i) { + MOZ_ASSERT((0 <= i) && (i < length())); + if (list_ == nullptr) { + MOZ_ASSERT(0 == i); + return last_; + } else { + if (size_t(i) == list_->length()) { + MOZ_ASSERT(last_ != nullptr); + return last_; + } else { + return (*list_)[i]; + } + } + } + + void Clear() { + list_ = nullptr; + last_ = nullptr; + } + + int length() { + int length = (list_ == nullptr) ? 0 : list_->length(); + return length + ((last_ == nullptr) ? 0 : 1); + } + + VectorType* GetList(LifoAlloc* alloc) { + if (list_ == nullptr) + list_ = alloc->newInfallible<VectorType>(*alloc); + if (last_ != nullptr) { + list_->append(last_); + last_ = nullptr; + } + return list_; + } + + private: + VectorType* list_; + T* last_; +}; + + +// Accumulates RegExp atoms and assertions into lists of terms and alternatives. +class RegExpBuilder +{ + public: + explicit RegExpBuilder(LifoAlloc* alloc); + void AddCharacter(char16_t character); + // "Adds" an empty expression. Does nothing except consume a + // following quantifier + void AddEmpty(); + void AddAtom(RegExpTree* tree); + void AddAssertion(RegExpTree* tree); + void NewAlternative(); // '|' + void AddQuantifierToAtom(int min, int max, RegExpQuantifier::QuantifierType type); + RegExpTree* ToRegExp(); + + private: + void FlushCharacters(); + void FlushText(); + void FlushTerms(); + + LifoAlloc* alloc; + bool pending_empty_; + CharacterVector* characters_; + BufferedVector<RegExpTree, 2> terms_; + BufferedVector<RegExpTree, 2> text_; + BufferedVector<RegExpTree, 2> alternatives_; + + enum LastAdded { + ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM + }; +#ifdef DEBUG + LastAdded last_added_; +#endif +}; + +// Characters parsed by RegExpParser can be either char16_t or kEndMarker. +typedef uint32_t widechar; + +template <typename CharT> +class RegExpParser +{ + public: + RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc, + const CharT* chars, const CharT* end, bool multiline_mode, bool unicode, + bool ignore_case); + + RegExpTree* ParsePattern(); + RegExpTree* ParseDisjunction(); + RegExpTree* ParseCharacterClass(); + + // Parses a {...,...} quantifier and stores the range in the given + // out parameters. + bool ParseIntervalQuantifier(int* min_out, int* max_out); + + // Tries to parse the input as a single escaped character. If successful + // it stores the result in the output parameter and returns true. + // Otherwise it throws an error and returns false. The character must not + // be 'b' or 'B' since they are usually handled specially. + bool ParseClassCharacterEscape(widechar* code); + + // Checks whether the following is a length-digit hexadecimal number, + // and sets the value if it is. + bool ParseHexEscape(int length, widechar* value); + + bool ParseBracedHexEscape(widechar* value); + bool ParseTrailSurrogate(widechar* value); + bool ParseRawSurrogatePair(char16_t* lead, char16_t* trail); + + widechar ParseOctalLiteral(); + + // Tries to parse the input as a back reference. If successful it + // stores the result in the output parameter and returns true. If + // it fails it will push back the characters read so the same characters + // can be reparsed. + bool ParseBackReferenceIndex(int* index_out); + + bool ParseClassAtom(char16_t* char_class, widechar *value); + RegExpTree* ReportError(unsigned errorNumber); + void Advance(); + void Advance(int dist) { + next_pos_ += dist - 1; + Advance(); + } + + void Reset(const CharT* pos) { + next_pos_ = pos; + has_more_ = (pos < end_); + Advance(); + } + + // Reports whether the pattern might be used as a literal search string. + // Only use if the result of the parse is a single atom node. + bool simple() { return simple_; } + bool contains_anchor() { return contains_anchor_; } + void set_contains_anchor() { contains_anchor_ = true; } + int captures_started() { return captures_ == nullptr ? 0 : captures_->length(); } + const CharT* position() { return next_pos_ - 1; } + + static const int kMaxCaptures = 1 << 16; + static const widechar kEndMarker = (1 << 21); + + private: + enum SubexpressionType { + INITIAL, + CAPTURE, // All positive values represent captures. + POSITIVE_LOOKAHEAD, + NEGATIVE_LOOKAHEAD, + GROUPING + }; + + class RegExpParserState { + public: + RegExpParserState(LifoAlloc* alloc, + RegExpParserState* previous_state, + SubexpressionType group_type, + int disjunction_capture_index) + : previous_state_(previous_state), + builder_(alloc->newInfallible<RegExpBuilder>(alloc)), + group_type_(group_type), + disjunction_capture_index_(disjunction_capture_index) + {} + // Parser state of containing expression, if any. + RegExpParserState* previous_state() { return previous_state_; } + bool IsSubexpression() { return previous_state_ != nullptr; } + // RegExpBuilder building this regexp's AST. + RegExpBuilder* builder() { return builder_; } + // Type of regexp being parsed (parenthesized group or entire regexp). + SubexpressionType group_type() { return group_type_; } + // Index in captures array of first capture in this sub-expression, if any. + // Also the capture index of this sub-expression itself, if group_type + // is CAPTURE. + int capture_index() { return disjunction_capture_index_; } + + private: + // Linked list implementation of stack of states. + RegExpParserState* previous_state_; + // Builder for the stored disjunction. + RegExpBuilder* builder_; + // Stored disjunction type (capture, look-ahead or grouping), if any. + SubexpressionType group_type_; + // Stored disjunction's capture index (if any). + int disjunction_capture_index_; + }; + + widechar current() { return current_; } + bool has_more() { return has_more_; } + bool has_next() { return next_pos_ < end_; } + widechar Next() { + if (has_next()) + return *next_pos_; + return kEndMarker; + } + void ScanForCaptures(); + + frontend::TokenStream& ts; + LifoAlloc* alloc; + RegExpCaptureVector* captures_; + const CharT* next_pos_; + const CharT* end_; + widechar current_; + // The capture count is only valid after we have scanned for captures. + int capture_count_; + bool has_more_; + bool multiline_; + bool unicode_; + bool ignore_case_; + bool simple_; + bool contains_anchor_; + bool is_scanned_for_captures_; +}; + +} } // namespace js::irregexp + +#endif // V8_PARSER_H_ |