summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/i18n/regexcmp.h
blob: 578fdcf73502b99eb1163228d7326ed7e35c8bcb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
//  regexcmp.h
//
//  Copyright (C) 2002-2016, International Business Machines Corporation and others.
//  All Rights Reserved.
//
//  This file contains declarations for the class RegexCompile
//
//  This class is internal to the regular expression implementation.
//  For the public Regular Expression API, see the file "unicode/regex.h"
//


#ifndef RBBISCAN_H
#define RBBISCAN_H

#include "unicode/utypes.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS

#include "unicode/parseerr.h"
#include "unicode/uniset.h"
#include "unicode/uobject.h"
#include "unicode/utext.h"
#include "uhash.h"
#include "uvector.h"
#include "uvectr32.h"



U_NAMESPACE_BEGIN


//--------------------------------------------------------------------------------
//
//  class RegexCompile    Contains the regular expression compiler.
//
//--------------------------------------------------------------------------------
struct  RegexTableEl;
class   RegexPattern;


class U_I18N_API RegexCompile : public UMemory {
public:

    enum {
        kStackSize = 100            // The size of the state stack for
    };                              //   pattern parsing.  Corresponds roughly
                                    //   to the depth of parentheses nesting
                                    //   that is allowed in the rules.

    struct RegexPatternChar {
        UChar32             fChar;
        UBool               fQuoted;
    };

    RegexCompile(RegexPattern *rp, UErrorCode &e);

    void       compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e);
    void       compile(UText *pat, UParseError &pp, UErrorCode &e);
    

    virtual    ~RegexCompile();

    void        nextChar(RegexPatternChar &c);      // Get the next char from the input stream.

    static void cleanup();                       // Memory cleanup



    // Categories of parentheses in pattern.
    //   The category is saved in the compile-time parentheses stack frame, and
    //   determines the code to be generated when the matching close ) is encountered.
    enum EParenClass {
        plain        = -1,               // No special handling
        capturing    = -2,
        atomic       = -3,
        lookAhead    = -4,
        negLookAhead = -5,
        flags        = -6,
        lookBehind   = -7,
        lookBehindN  = -8
    };

private:


    UBool       doParseActions(int32_t a);
    void        error(UErrorCode e);                   // error reporting convenience function.

    UChar32     nextCharLL();
    UChar32     peekCharLL();
    UnicodeSet  *scanProp();
    UnicodeSet  *scanPosixProp();
    void        handleCloseParen();
    int32_t     blockTopLoc(UBool reserve);          // Locate a position in the compiled pattern
                                                     //  at the top of the just completed block
                                                     //  or operation, and optionally ensure that
                                                     //  there is space to add an opcode there.
    void        compileSet(UnicodeSet *theSet);      // Generate the compiled pattern for
                                                     //   a reference to a UnicodeSet.
    void        compileInterval(int32_t InitOp,      // Generate the code for a {min,max} quantifier.
                               int32_t LoopOp);
    UBool       compileInlineInterval();             // Generate inline code for a {min,max} quantifier
    void        literalChar(UChar32 c);              // Compile a literal char
    void        fixLiterals(UBool split=FALSE);      // Generate code for pending literal characters.
    void        insertOp(int32_t where);             // Open up a slot for a new op in the
                                                     //   generated code at the specified location.
    void        appendOp(int32_t op);                // Append a new op to the compiled pattern.
    void        appendOp(int32_t type, int32_t val); // Build & append a new op to the compiled pattern.
    int32_t     buildOp(int32_t type, int32_t val);  // Construct a new pcode instruction.
    int32_t     allocateData(int32_t size);          // Allocate space in the matcher data area.
                                                     //   Return index of the newly allocated data.
    int32_t     allocateStackData(int32_t size);     // Allocate space in the match back-track stack frame.
                                                     //   Return offset index in the frame.
    int32_t     minMatchLength(int32_t start,
                               int32_t end);
    int32_t     maxMatchLength(int32_t start,
                               int32_t end);
    void        matchStartType();
    void        stripNOPs();

    void        setEval(int32_t op);
    void        setPushOp(int32_t op);
    UChar32     scanNamedChar();
    UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated);

public:   // Public for testing only.
    static void U_EXPORT2 findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars);
private:


    UErrorCode                    *fStatus;
    RegexPattern                  *fRXPat;
    UParseError                   *fParseErr;

    //
    //  Data associated with low level character scanning
    //
    int64_t                       fScanIndex;        // Index of current character being processed
                                                     //   in the rule input string.
    UBool                         fQuoteMode;        // Scan is in a \Q...\E quoted region
    UBool                         fInBackslashQuote; // Scan is between a '\' and the following char.
    UBool                         fEOLComments;      // When scan is just after '(?',  inhibit #... to
                                                     //   end of line comments, in favor of (?#...) comments.
    int64_t                       fLineNum;          // Line number in input file.
    int64_t                       fCharNum;          // Char position within the line.
    UChar32                       fLastChar;         // Previous char, needed to count CR-LF
                                                     //   as a single line, not two.
    UChar32                       fPeekChar;         // Saved char, if we've scanned ahead.


    RegexPatternChar              fC;                // Current char for parse state machine
                                                     //   processing.

    //
    //   Data for the state machine that parses the regular expression.
    //
    RegexTableEl                  **fStateTable;     // State Transition Table for regex Rule
                                                     //   parsing.  index by p[state][char-class]

    uint16_t                      fStack[kStackSize];  // State stack, holds state pushes
    int32_t                       fStackPtr;           //  and pops as specified in the state
                                                       //  transition rules.

    //
    //  Data associated with the generation of the pcode for the match engine
    //
    int32_t                       fModeFlags;        // Match Flags.  (Case Insensitive, etc.)
                                                     //   Always has high bit (31) set so that flag values
                                                     //   on the paren stack are distinguished from relocatable
                                                     //   pcode addresses.
    int32_t                       fNewModeFlags;     // New flags, while compiling (?i, holds state
                                                     //   until last flag is scanned.
    UBool                         fSetModeFlag;      // true for (?ismx, false for (?-ismx

    UnicodeString                 fLiteralChars;     // Literal chars or strings from the pattern are accumulated here.
                                                     //   Once completed, meaning that some non-literal pattern
                                                     //   construct is encountered, the appropriate opcodes
                                                     //   to match the literal will be generated, and this
                                                     //   string will be cleared.

    int64_t                       fPatternLength;    // Length of the input pattern string.
    
    UVector32                     fParenStack;       // parentheses stack.  Each frame consists of
                                                     //   the positions of compiled pattern operations
                                                     //   needing fixup, followed by negative value.  The
                                                     //   first entry in each frame is the position of the
                                                     //   spot reserved for use when a quantifier
                                                     //   needs to add a SAVE at the start of a (block)
                                                     //   The negative value (-1, -2,...) indicates
                                                     //   the kind of paren that opened the frame.  Some
                                                     //   need special handling on close.


    int32_t                       fMatchOpenParen;   // The position in the compiled pattern
                                                     //   of the slot reserved for a state save
                                                     //   at the start of the most recently processed
                                                     //   parenthesized block. Updated when processing
                                                     //   a close to the location for the corresponding open.

    int32_t                       fMatchCloseParen;  // The position in the pattern of the first
                                                     //   location after the most recently processed
                                                     //   parenthesized block.

    int32_t                       fIntervalLow;      // {lower, upper} interval quantifier values.
    int32_t                       fIntervalUpper;    // Placed here temporarily, when pattern is
                                                     //   initially scanned.  Each new interval
                                                     //   encountered overwrites these values.
                                                     //   -1 for the upper interval value means none
                                                     //   was specified (unlimited occurences.)

    int64_t                       fNameStartPos;     // Starting position of a \N{NAME} name in a
                                                     //   pattern, valid while remainder of name is
                                                     //   scanned.

    UStack                        fSetStack;         // Stack of UnicodeSets, used while evaluating
                                                     //   (at compile time) set expressions within
                                                     //   the pattern.
    UStack                        fSetOpStack;       // Stack of pending set operators (&&, --, union)

    UChar32                       fLastSetLiteral;   // The last single code point added to a set.
                                                     //   needed when "-y" is scanned, and we need
                                                     //   to turn "x-y" into a range.

    UnicodeString                *fCaptureName;      // Named Capture, the group name is built up
                                                     //   in this string while being scanned.
};

// Constant values to be pushed onto fSetOpStack while scanning & evalueating [set expressions]
//   The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself.

enum SetOperations {
    setStart         = 0 << 16 | 1,
    setEnd           = 1 << 16 | 2,
    setNegation      = 2 << 16 | 3,
    setCaseClose     = 2 << 16 | 9,
    setDifference2   = 3 << 16 | 4,    // '--' set difference operator
    setIntersection2 = 3 << 16 | 5,    // '&&' set intersection operator
    setUnion         = 4 << 16 | 6,    // implicit union of adjacent items
    setDifference1   = 4 << 16 | 7,    // '-', single dash difference op, for compatibility with old UnicodeSet.
    setIntersection1 = 4 << 16 | 8     // '&', single amp intersection op, for compatibility with old UnicodeSet.
    };

U_NAMESPACE_END
#endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
#endif   // RBBISCAN_H