1 files changed, 310 insertions, 0 deletions
diff --git a/js/src/irregexp/RegExpParser.h b/js/src/irregexp/RegExpParser.h
new file mode 100644
index 000000000..b5228a86f
--- /dev/null
+++ b/js/src/irregexp/RegExpParser.h
@@ -0,0 +1,310 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * vim: set ts=8 sts=4 et sw=4 tw=99: */
+
+// Copyright 2012 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef V8_PARSER_H_
+#define V8_PARSER_H_
+
+#include "irregexp/RegExpAST.h"
+
+namespace js {
+
+namespace frontend {
+    class TokenStream;
+}
+
+namespace irregexp {
+
+bool
+ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
+             bool multiline, bool match_only, bool unicode, bool ignore_case,
+             bool global, bool sticky, RegExpCompileData* data);
+
+bool
+ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
+                   bool unicode);
+
+// A BufferedVector is an automatically growing list, just like (and backed
+// by) a Vector, that is optimized for the case of adding and removing
+// a single element. The last element added is stored outside the backing list,
+// and if no more than one element is ever added, the ZoneList isn't even
+// allocated.
+// Elements must not be nullptr pointers.
+template <typename T, int initial_size>
+class BufferedVector
+{
+  public:
+    typedef InfallibleVector<T*, 1> VectorType;
+
+    BufferedVector() : list_(nullptr), last_(nullptr) {}
+
+    // Adds element at end of list. This element is buffered and can
+    // be read using last() or removed using RemoveLast until a new Add or until
+    // RemoveLast or GetList has been called.
+    void Add(LifoAlloc* alloc, T* value) {
+        if (last_ != nullptr) {
+            if (list_ == nullptr) {
+                list_ = alloc->newInfallible<VectorType>(*alloc);
+                list_->reserve(initial_size);
+            }
+            list_->append(last_);
+        }
+        last_ = value;
+    }
+
+    T* last() {
+        MOZ_ASSERT(last_ != nullptr);
+        return last_;
+    }
+
+    T* RemoveLast() {
+        MOZ_ASSERT(last_ != nullptr);
+        T* result = last_;
+        if ((list_ != nullptr) && (list_->length() > 0))
+            last_ = list_->popCopy();
+        else
+            last_ = nullptr;
+        return result;
+    }
+
+    T* Get(int i) {
+        MOZ_ASSERT((0 <= i) && (i < length()));
+        if (list_ == nullptr) {
+            MOZ_ASSERT(0 == i);
+            return last_;
+        } else {
+            if (size_t(i) == list_->length()) {
+                MOZ_ASSERT(last_ != nullptr);
+                return last_;
+            } else {
+                return (*list_)[i];
+            }
+        }
+    }
+
+    void Clear() {
+        list_ = nullptr;
+        last_ = nullptr;
+    }
+
+    int length() {
+        int length = (list_ == nullptr) ? 0 : list_->length();
+        return length + ((last_ == nullptr) ? 0 : 1);
+    }
+
+    VectorType* GetList(LifoAlloc* alloc) {
+        if (list_ == nullptr)
+            list_ = alloc->newInfallible<VectorType>(*alloc);
+        if (last_ != nullptr) {
+            list_->append(last_);
+            last_ = nullptr;
+        }
+        return list_;
+    }
+
+  private:
+    VectorType* list_;
+    T* last_;
+};
+
+
+// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
+class RegExpBuilder
+{
+  public:
+    explicit RegExpBuilder(LifoAlloc* alloc);
+    void AddCharacter(char16_t character);
+    // "Adds" an empty expression. Does nothing except consume a
+    // following quantifier
+    void AddEmpty();
+    void AddAtom(RegExpTree* tree);
+    void AddAssertion(RegExpTree* tree);
+    void NewAlternative();  // '|'
+    void AddQuantifierToAtom(int min, int max, RegExpQuantifier::QuantifierType type);
+    RegExpTree* ToRegExp();
+
+  private:
+    void FlushCharacters();
+    void FlushText();
+    void FlushTerms();
+
+    LifoAlloc* alloc;
+    bool pending_empty_;
+    CharacterVector* characters_;
+    BufferedVector<RegExpTree, 2> terms_;
+    BufferedVector<RegExpTree, 2> text_;
+    BufferedVector<RegExpTree, 2> alternatives_;
+
+    enum LastAdded {
+        ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM
+    };
+#ifdef DEBUG
+    LastAdded last_added_;
+#endif
+};
+
+// Characters parsed by RegExpParser can be either char16_t or kEndMarker.
+typedef uint32_t widechar;
+
+template <typename CharT>
+class RegExpParser
+{
+  public:
+    RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
+                 const CharT* chars, const CharT* end, bool multiline_mode, bool unicode,
+                 bool ignore_case);
+
+    RegExpTree* ParsePattern();
+    RegExpTree* ParseDisjunction();
+    RegExpTree* ParseCharacterClass();
+
+    // Parses a {...,...} quantifier and stores the range in the given
+    // out parameters.
+    bool ParseIntervalQuantifier(int* min_out, int* max_out);
+
+    // Tries to parse the input as a single escaped character.  If successful
+    // it stores the result in the output parameter and returns true.
+    // Otherwise it throws an error and returns false.  The character must not
+    // be 'b' or 'B' since they are usually handled specially.
+    bool ParseClassCharacterEscape(widechar* code);
+
+    // Checks whether the following is a length-digit hexadecimal number,
+    // and sets the value if it is.
+    bool ParseHexEscape(int length, widechar* value);
+
+    bool ParseBracedHexEscape(widechar* value);
+    bool ParseTrailSurrogate(widechar* value);
+    bool ParseRawSurrogatePair(char16_t* lead, char16_t* trail);
+
+    widechar ParseOctalLiteral();
+
+    // Tries to parse the input as a back reference.  If successful it
+    // stores the result in the output parameter and returns true.  If
+    // it fails it will push back the characters read so the same characters
+    // can be reparsed.
+    bool ParseBackReferenceIndex(int* index_out);
+
+    bool ParseClassAtom(char16_t* char_class, widechar *value);
+    RegExpTree* ReportError(unsigned errorNumber);
+    void Advance();
+    void Advance(int dist) {
+        next_pos_ += dist - 1;
+        Advance();
+    }
+
+    void Reset(const CharT* pos) {
+        next_pos_ = pos;
+        has_more_ = (pos < end_);
+        Advance();
+    }
+
+    // Reports whether the pattern might be used as a literal search string.
+    // Only use if the result of the parse is a single atom node.
+    bool simple() { return simple_; }
+    bool contains_anchor() { return contains_anchor_; }
+    void set_contains_anchor() { contains_anchor_ = true; }
+    int captures_started() { return captures_ == nullptr ? 0 : captures_->length(); }
+    const CharT* position() { return next_pos_ - 1; }
+
+    static const int kMaxCaptures = 1 << 16;
+    static const widechar kEndMarker = (1 << 21);
+
+  private:
+    enum SubexpressionType {
+        INITIAL,
+        CAPTURE,  // All positive values represent captures.
+        POSITIVE_LOOKAHEAD,
+        NEGATIVE_LOOKAHEAD,
+        GROUPING
+    };
+
+    class RegExpParserState {
+      public:
+        RegExpParserState(LifoAlloc* alloc,
+                          RegExpParserState* previous_state,
+                          SubexpressionType group_type,
+                          int disjunction_capture_index)
+            : previous_state_(previous_state),
+              builder_(alloc->newInfallible<RegExpBuilder>(alloc)),
+              group_type_(group_type),
+              disjunction_capture_index_(disjunction_capture_index)
+        {}
+        // Parser state of containing expression, if any.
+        RegExpParserState* previous_state() { return previous_state_; }
+        bool IsSubexpression() { return previous_state_ != nullptr; }
+        // RegExpBuilder building this regexp's AST.
+        RegExpBuilder* builder() { return builder_; }
+        // Type of regexp being parsed (parenthesized group or entire regexp).
+        SubexpressionType group_type() { return group_type_; }
+        // Index in captures array of first capture in this sub-expression, if any.
+        // Also the capture index of this sub-expression itself, if group_type
+        // is CAPTURE.
+        int capture_index() { return disjunction_capture_index_; }
+
+      private:
+        // Linked list implementation of stack of states.
+        RegExpParserState* previous_state_;
+        // Builder for the stored disjunction.
+        RegExpBuilder* builder_;
+        // Stored disjunction type (capture, look-ahead or grouping), if any.
+        SubexpressionType group_type_;
+        // Stored disjunction's capture index (if any).
+        int disjunction_capture_index_;
+    };
+
+    widechar current() { return current_; }
+    bool has_more() { return has_more_; }
+    bool has_next() { return next_pos_ < end_; }
+    widechar Next() {
+        if (has_next())
+            return *next_pos_;
+        return kEndMarker;
+    }
+    void ScanForCaptures();
+
+    frontend::TokenStream& ts;
+    LifoAlloc* alloc;
+    RegExpCaptureVector* captures_;
+    const CharT* next_pos_;
+    const CharT* end_;
+    widechar current_;
+    // The capture count is only valid after we have scanned for captures.
+    int capture_count_;
+    bool has_more_;
+    bool multiline_;
+    bool unicode_;
+    bool ignore_case_;
+    bool simple_;
+    bool contains_anchor_;
+    bool is_scanned_for_captures_;
+};
+
+} } // namespace js::irregexp
+
+#endif // V8_PARSER_H_