summaryrefslogtreecommitdiffstats
path: root/js/src/regexp/regexp-parser.h
blob: 131d12161f62222b59d65c7feb7fc413d886e28c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_REGEXP_REGEXP_PARSER_H_
#define V8_REGEXP_REGEXP_PARSER_H_

#include "regexp/regexp-ast.h"
#include "regexp/regexp-error.h"

namespace v8 {
namespace internal {

struct RegExpCompileData;

// A BufferedZoneList is an automatically growing list, just like (and backed
// by) a ZoneList, that is optimized for the case of adding and removing
// a single element. The last element added is stored outside the backing list,
// and if no more than one element is ever added, the ZoneList isn't even
// allocated.
// Elements must not be nullptr pointers.
template <typename T, int initial_size>
class BufferedZoneList {
 public:
  BufferedZoneList() : list_(nullptr), last_(nullptr) {}

  // Adds element at end of list. This element is buffered and can
  // be read using last() or removed using RemoveLast until a new Add or until
  // RemoveLast or GetList has been called.
  void Add(T* value, Zone* zone) {
    if (last_ != nullptr) {
      if (list_ == nullptr) {
        list_ = new (zone) ZoneList<T*>(initial_size, zone);
      }
      list_->Add(last_, zone);
    }
    last_ = value;
  }

  T* last() {
    DCHECK(last_ != nullptr);
    return last_;
  }

  T* RemoveLast() {
    DCHECK(last_ != nullptr);
    T* result = last_;
    if ((list_ != nullptr) && (list_->length() > 0))
      last_ = list_->RemoveLast();
    else
      last_ = nullptr;
    return result;
  }

  T* Get(int i) {
    DCHECK((0 <= i) && (i < length()));
    if (list_ == nullptr) {
      DCHECK_EQ(0, i);
      return last_;
    } else {
      if (i == list_->length()) {
        DCHECK(last_ != nullptr);
        return last_;
      } else {
        return list_->at(i);
      }
    }
  }

  void Clear() {
    list_ = nullptr;
    last_ = nullptr;
  }

  int length() {
    int length = (list_ == nullptr) ? 0 : list_->length();
    return length + ((last_ == nullptr) ? 0 : 1);
  }

  ZoneList<T*>* GetList(Zone* zone) {
    if (list_ == nullptr) {
      list_ = new (zone) ZoneList<T*>(initial_size, zone);
    }
    if (last_ != nullptr) {
      list_->Add(last_, zone);
      last_ = nullptr;
    }
    return list_;
  }

 private:
  ZoneList<T*>* list_;
  T* last_;
};


// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
class RegExpBuilder : public ZoneObject {
 public:
  RegExpBuilder(Zone* zone, JSRegExp::Flags flags);
  void AddCharacter(uc16 character);
  void AddUnicodeCharacter(uc32 character);
  void AddEscapedUnicodeCharacter(uc32 character);
  // "Adds" an empty expression. Does nothing except consume a
  // following quantifier
  void AddEmpty();
  void AddCharacterClass(RegExpCharacterClass* cc);
  void AddCharacterClassForDesugaring(uc32 c);
  void AddAtom(RegExpTree* tree);
  void AddTerm(RegExpTree* tree);
  void AddAssertion(RegExpTree* tree);
  void NewAlternative();  // '|'
  bool AddQuantifierToAtom(int min, int max,
                           RegExpQuantifier::QuantifierType type);
  void FlushText();
  RegExpTree* ToRegExp();
  JSRegExp::Flags flags() const { return flags_; }
  void set_flags(JSRegExp::Flags flags) { flags_ = flags; }

  bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
  bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
  bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; }

 private:
  static const uc16 kNoPendingSurrogate = 0;
  void AddLeadSurrogate(uc16 lead_surrogate);
  void AddTrailSurrogate(uc16 trail_surrogate);
  void FlushPendingSurrogate();
  void FlushCharacters();
  void FlushTerms();
  bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
  bool NeedsDesugaringForIgnoreCase(uc32 c);
  Zone* zone() const { return zone_; }
  bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }

  Zone* zone_;
  bool pending_empty_;
  JSRegExp::Flags flags_;
  ZoneList<uc16>* characters_;
  uc16 pending_surrogate_;
  BufferedZoneList<RegExpTree, 2> terms_;
  BufferedZoneList<RegExpTree, 2> text_;
  BufferedZoneList<RegExpTree, 2> alternatives_;
#ifdef DEBUG
  enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
#define LAST(x) last_added_ = x;
#else
#define LAST(x)
#endif
};

class V8_EXPORT_PRIVATE RegExpParser {
 public:
  RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, Isolate* isolate,
               Zone* zone);

  static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
                          JSRegExp::Flags flags, RegExpCompileData* result);

  RegExpTree* ParsePattern();
  RegExpTree* ParseDisjunction();
  RegExpTree* ParseGroup();

  // Parses a {...,...} quantifier and stores the range in the given
  // out parameters.
  bool ParseIntervalQuantifier(int* min_out, int* max_out);

  // Parses and returns a single escaped character.  The character
  // must not be 'b' or 'B' since they are usually handle specially.
  uc32 ParseClassCharacterEscape();

  // Checks whether the following is a length-digit hexadecimal number,
  // and sets the value if it is.
  bool ParseHexEscape(int length, uc32* value);
  bool ParseUnicodeEscape(uc32* value);
  bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);

  bool ParsePropertyClassName(ZoneVector<char>* name_1,
                              ZoneVector<char>* name_2);
  bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
                             const ZoneVector<char>& name_1,
                             const ZoneVector<char>& name_2);

  RegExpTree* GetPropertySequence(const ZoneVector<char>& name_1);
  RegExpTree* ParseCharacterClass(const RegExpBuilder* state);

  uc32 ParseOctalLiteral();

  // Tries to parse the input as a back reference.  If successful it
  // stores the result in the output parameter and returns true.  If
  // it fails it will push back the characters read so the same characters
  // can be reparsed.
  bool ParseBackReferenceIndex(int* index_out);

  // Parse inside a class. Either add escaped class to the range, or return
  // false and pass parsed single character through |char_out|.
  void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
                        bool add_unicode_case_equivalents, uc32* char_out,
                        bool* is_class_escape);

  char ParseClassEscape();

  RegExpTree* ReportError(RegExpError error);
  void Advance();
  void Advance(int dist);
  void Reset(int pos);

  // Reports whether the pattern might be used as a literal search string.
  // Only use if the result of the parse is a single atom node.
  bool simple();
  bool contains_anchor() { return contains_anchor_; }
  void set_contains_anchor() { contains_anchor_ = true; }
  int captures_started() { return captures_started_; }
  int position() { return next_pos_ - 1; }
  bool failed() { return failed_; }
  // The Unicode flag can't be changed using in-regexp syntax, so it's OK to
  // just read the initial flag value here.
  bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; }

  static bool IsSyntaxCharacterOrSlash(uc32 c);

  static const uc32 kEndMarker = (1 << 21);

 private:
  enum SubexpressionType {
    INITIAL,
    CAPTURE,  // All positive values represent captures.
    POSITIVE_LOOKAROUND,
    NEGATIVE_LOOKAROUND,
    GROUPING
  };

  class RegExpParserState : public ZoneObject {
   public:
    // Push a state on the stack.
    RegExpParserState(RegExpParserState* previous_state,
                      SubexpressionType group_type,
                      RegExpLookaround::Type lookaround_type,
                      int disjunction_capture_index,
                      const ZoneVector<uc16>* capture_name,
                      JSRegExp::Flags flags, Zone* zone)
        : previous_state_(previous_state),
          builder_(new (zone) RegExpBuilder(zone, flags)),
          group_type_(group_type),
          lookaround_type_(lookaround_type),
          disjunction_capture_index_(disjunction_capture_index),
          capture_name_(capture_name) {}
    // Parser state of containing expression, if any.
    RegExpParserState* previous_state() const { return previous_state_; }
    bool IsSubexpression() { return previous_state_ != nullptr; }
    // RegExpBuilder building this regexp's AST.
    RegExpBuilder* builder() const { return builder_; }
    // Type of regexp being parsed (parenthesized group or entire regexp).
    SubexpressionType group_type() const { return group_type_; }
    // Lookahead or Lookbehind.
    RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
    // Index in captures array of first capture in this sub-expression, if any.
    // Also the capture index of this sub-expression itself, if group_type
    // is CAPTURE.
    int capture_index() const { return disjunction_capture_index_; }
    // The name of the current sub-expression, if group_type is CAPTURE. Only
    // used for named captures.
    const ZoneVector<uc16>* capture_name() const { return capture_name_; }

    bool IsNamedCapture() const { return capture_name_ != nullptr; }

    // Check whether the parser is inside a capture group with the given index.
    bool IsInsideCaptureGroup(int index);
    // Check whether the parser is inside a capture group with the given name.
    bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);

   private:
    // Linked list implementation of stack of states.
    RegExpParserState* const previous_state_;
    // Builder for the stored disjunction.
    RegExpBuilder* const builder_;
    // Stored disjunction type (capture, look-ahead or grouping), if any.
    const SubexpressionType group_type_;
    // Stored read direction.
    const RegExpLookaround::Type lookaround_type_;
    // Stored disjunction's capture index (if any).
    const int disjunction_capture_index_;
    // Stored capture name (if any).
    const ZoneVector<uc16>* const capture_name_;
  };

  // Return the 1-indexed RegExpCapture object, allocate if necessary.
  RegExpCapture* GetCapture(int index);

  // Creates a new named capture at the specified index. Must be called exactly
  // once for each named capture. Fails if a capture with the same name is
  // encountered.
  bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);

  // Parses the name of a capture group (?<name>pattern). The name must adhere
  // to IdentifierName in the ECMAScript standard.
  const ZoneVector<uc16>* ParseCaptureGroupName();

  bool ParseNamedBackReference(RegExpBuilder* builder,
                               RegExpParserState* state);
  RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);

  // After the initial parsing pass, patch corresponding RegExpCapture objects
  // into all RegExpBackReferences. This is done after initial parsing in order
  // to avoid complicating cases in which references comes before the capture.
  void PatchNamedBackReferences();

  Handle<FixedArray> CreateCaptureNameMap();

  // Returns true iff the pattern contains named captures. May call
  // ScanForCaptures to look ahead at the remaining pattern.
  bool HasNamedCaptures();

  Isolate* isolate() { return isolate_; }
  Zone* zone() const { return zone_; }

  uc32 current() { return current_; }
  bool has_more() { return has_more_; }
  bool has_next() { return next_pos_ < in()->length(); }
  uc32 Next();
  template <bool update_position>
  uc32 ReadNext();
  FlatStringReader* in() { return in_; }
  void ScanForCaptures();

  struct RegExpCaptureNameLess {
    bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
      DCHECK_NOT_NULL(lhs);
      DCHECK_NOT_NULL(rhs);
      return *lhs->name() < *rhs->name();
    }
  };

  Isolate* isolate_;
  Zone* zone_;
  RegExpError error_ = RegExpError::kNone;
  int error_pos_ = 0;
  ZoneList<RegExpCapture*>* captures_;
  ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_;
  ZoneList<RegExpBackReference*>* named_back_references_;
  FlatStringReader* in_;
  uc32 current_;
  // These are the flags specified outside the regexp syntax ie after the
  // terminating '/' or in the second argument to the constructor.  The current
  // flags are stored on the RegExpBuilder.
  JSRegExp::Flags top_level_flags_;
  int next_pos_;
  int captures_started_;
  int capture_count_;  // Only valid after we have scanned for captures.
  bool has_more_;
  bool simple_;
  bool contains_anchor_;
  bool is_scanned_for_captures_;
  bool has_named_captures_;  // Only valid after we have scanned for captures.
  bool failed_;
};

}  // namespace internal
}  // namespace v8

#endif  // V8_REGEXP_REGEXP_PARSER_H_