1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
|
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
#*****************************************************************************
#
# Copyright (C) 2002-2015, International Business Machines Corporation and others.
# All Rights Reserved.
#
#*****************************************************************************
#
# file: regexcst.txt
# ICU Regular Expression Parser State Table
#
# This state table is used when reading and parsing a regular expression pattern.
# The pattern parser uses a state machine; the data in this file define the
# state transitions that occur for each input character.
#
# *** This file defines the regex pattern grammar. This is it.
# *** The determination of what is accepted is here.
#
# This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
# that are then built with the rule parser.
#
#
# Here is the syntax of the state definitions in this file:
#
#
#StateName:
#   input-char           n next-state           ^push-state     action
#   input-char           n next-state           ^push-state     action
#       |                |   |                      |             |
#       |                |   |                      |             |--- action to be performed by state machine
#       |                |   |                      |                  See function RegexCompile::doParseActions()
#       |                |   |                      |
#       |                |   |                      |--- Push this named state onto the state stack.
#       |                |   |                            Later, when next state is specified as "pop",
#       |                |   |                            the pushed state will become the current state.
#       |                |   |
#       |                |   |--- Transition to this state if the current input character matches the input
#       |                |        character or char class in the left hand column.  "pop" causes the next
#       |                |        state to be popped from the state stack.
#       |                |
#       |                |--- When making the state transition specified on this line, advance to the next
#       |                     character from the input only if 'n' appears here.
#       |
#       |--- Character or named character classes to test for.  If the current character being scanned
#            matches, perform the actions and go to the state specified on this line.
#            The input character is tested sequentially, in the order written.  The characters and
#            character classes tested for do not need to be mutually exclusive.  The first match wins.
#
#
# start state, scan position is at the beginning of the pattern.
# The single row matches any input, runs doPatStart and moves to "term".
# There is no 'n' on the row, so the first pattern character is left for "term" to examine.
#
start:
default term doPatStart
#
# term. At a position where we can accept the start of most items in a pattern.
# Rows are tried top to bottom and the first match wins, so the "default"
# row at the bottom only fires for input that none of the rows above matched.
#
term:
quoted n expr-quant doLiteralChar
rule_char n expr-quant doLiteralChar
'[' n set-open ^set-finish doSetBegin # start of a [set]; set-finish is resumed by the "pop" at the set's closing ']'
'(' n open-paren
'.' n expr-quant doDotAny
'^' n expr-quant doCaret
'$' n expr-quant doDollar
'\' n backslash
'|' n term doOrOperator # alternation; the next alternative starts with a fresh term
')' n pop doCloseParen # return to the state pushed when the group was opened
eof term doPatFinish # NOTE(review): stays in term without consuming input; presumably doPatFinish ends the scan - confirm
default errorDeath doRuleError
#
# expr-quant We've just finished scanning a term, now look for the optional
# trailing quantifier - *, +, ?, *?, etc.
# If no quantifier follows, the default row hands the (unconsumed) character
# on to expr-cont.
#
expr-quant:
'*' n quant-star
'+' n quant-plus
'?' n quant-opt
'{' n interval-open doIntervalInit
'(' n open-paren-quant # may be a (?# comment ) sitting between the term and its quantifier
default expr-cont
#
# expr-cont Expression, continuation. At a point where additional terms are
# allowed, but not required. No quantifier rows appear in this state.
#
expr-cont:
'|' n term doOrOperator
')' n pop doCloseParen # return to the state pushed when the group was opened
default term # anything else is handed to term; the character is not consumed here
#
# open-paren-quant Special case handling for comments appearing before a quantifier,
# e.g. x(?#comment )*
# Open parens from expr-quant come here; anything but a (?# comment
# branches into the normal parenthesis sequence as quickly as possible.
#
open-paren-quant:
'?' n open-paren-quant2 doSuppressComments
default open-paren # plain '(' : continue exactly as an open paren reached from term
# open-paren-quant2 Scanned "(?" directly after a term.
# For "(?#", expr-quant is pushed so that the "pop" at the comment's closing ')'
# comes back to look for the quantifier. Anything else joins open-paren-extended.
open-paren-quant2:
'#' n paren-comment ^expr-quant
default open-paren-extended
#
# open-paren We've got an open paren. We need to scan further to
# determine what kind of group it is - plain (, (?:, (?>, or whatever.
#
open-paren:
'?' n open-paren-extended doSuppressComments
default term ^expr-quant doOpenCaptureParen # plain capturing group; the character after '(' is not consumed here
# open-paren-extended Scanned "(?". The next character selects the construct.
# Groups that push expr-quant are followed by a quantifier check after their ')'.
# The look-ahead / look-behind groups push expr-cont, which has no quantifier rows.
open-paren-extended:
':' n term ^expr-quant doOpenNonCaptureParen # (?:
'>' n term ^expr-quant doOpenAtomicParen # (?>
'=' n term ^expr-cont doOpenLookAhead # (?=
'!' n term ^expr-cont doOpenLookAheadNeg # (?!
'<' n open-paren-lookbehind
'#' n paren-comment ^term
'i' paren-flag doBeginMatchMode
'd' paren-flag doBeginMatchMode
'm' paren-flag doBeginMatchMode
's' paren-flag doBeginMatchMode
'u' paren-flag doBeginMatchMode
'w' paren-flag doBeginMatchMode
'x' paren-flag doBeginMatchMode
'-' paren-flag doBeginMatchMode
'(' n errorDeath doConditionalExpr
'{' n errorDeath doPerlInline
default errorDeath doBadOpenParenType
# open-paren-lookbehind Scanned "(?<".
# Either a look-behind, (?<= or (?<!, or the start of a named capture group.
# The first letter of a name is not consumed here; named-capture reads it.
open-paren-lookbehind:
'=' n term ^expr-cont doOpenLookBehind # (?<=
'!' n term ^expr-cont doOpenLookBehindNeg # (?<!
ascii_letter named-capture doBeginNamedCapture # (?<name
default errorDeath doBadOpenParenType
#
# paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')'
# The closing ')' pops back to whichever state was pushed on the way in:
# expr-quant when coming from open-paren-quant2, term when coming from open-paren-extended.
#
paren-comment:
')' n pop
eof errorDeath doMismatchedParenErr
default n paren-comment
#
# paren-flag Scanned a (?ismx-ismx flag setting
# The flag characters accepted by the rows below are i d m s u w x, plus '-'.
# open-paren-extended enters this state without consuming the first flag
# character, so every flag character is consumed here.
#
paren-flag:
'i' n paren-flag doMatchMode
'd' n paren-flag doMatchMode
'm' n paren-flag doMatchMode
's' n paren-flag doMatchMode
'u' n paren-flag doMatchMode
'w' n paren-flag doMatchMode
'x' n paren-flag doMatchMode
'-' n paren-flag doMatchMode
')' n term doSetMatchMode # (?flags)   nothing is pushed, no group is opened
':' n term ^expr-quant doMatchModeParen # (?flags: ... )   pushes expr-quant like the other groups
default errorDeath doBadModeFlag
#
# named-capture (?<name> ... ), position currently on the name.
# Entered from open-paren-lookbehind only when the first character is an
# ascii_letter; subsequent characters may be ASCII letters or digits, up to '>'.
#
named-capture:
ascii_letter n named-capture doContinueNamedCapture
digit_char n named-capture doContinueNamedCapture
'>' n term ^expr-quant doOpenCaptureParen # common w non-named capture.
default errorDeath doBadNamedCapture
#
# quant-star Scanning a '*' quantifier. Need to look ahead to decide
# between plain '*', '*?', '*+'
# All three rows continue in expr-cont.
#
quant-star:
'?' n expr-cont doNGStar # *?
'+' n expr-cont doPossessiveStar # *+
default expr-cont doStar # *    the look-ahead character is not consumed
#
# quant-plus Scanning a '+' quantifier. Need to look ahead to decide
# between plain '+', '+?', '++'
# All three rows continue in expr-cont.
#
quant-plus:
'?' n expr-cont doNGPlus # +?
'+' n expr-cont doPossessivePlus # ++
default expr-cont doPlus # +    the look-ahead character is not consumed
#
# quant-opt Scanning a '?' quantifier. Need to look ahead to decide
# between plain '?', '??', '?+'
# All three rows continue in expr-cont.
#
quant-opt:
'?' n expr-cont doNGOpt # ??
'+' n expr-cont doPossessiveOpt # ?+
default expr-cont doOpt # ?    the look-ahead character is not consumed
#
# Interval scanning a '{', the opening delimiter for an interval specification
# {number} or {min, max} or {min,}
# interval-open   requires a digit to start the lower bound
# interval-lower  consumes the digits of the lower bound
# interval-upper  consumes the digits (possibly none) of the upper bound
# interval-type   looks ahead for a '?' or '+' after the closing '}'
#
interval-open:
digit_char interval-lower # the first digit is checked but not consumed here
default errorDeath doIntervalError
interval-lower:
digit_char n interval-lower doIntevalLowerDigit # NOTE(review): "Inteval" is the spelling of the action name as referenced here; rename only together with the action code
',' n interval-upper
'}' n interval-type doIntervalSame # {n}
default errorDeath doIntervalError
interval-upper:
digit_char n interval-upper doIntervalUpperDigit
'}' n interval-type # {n,} or {n,m}
default errorDeath doIntervalError
interval-type:
'?' n expr-cont doNGInterval # {n,m}?
'+' n expr-cont doPossessiveInterval # {n,m}+
default expr-cont doInterval # {n,m}
#
# backslash Scanned a '\'. Figure out which of the \thingies we have encountered.
# The low level next-char function will have preprocessed
# some of them already; those won't come here.
# Rows that continue in "term" (\A \B \b \G \Q \Z \z) skip the quantifier check;
# rows that continue in "expr-quant" are followed by one.
# The 'N', 'p' and 'P' rows have no 'n', so the table does not consume the letter.
# NOTE(review): presumably the action scans the rest of the escape itself - confirm.
#
backslash:
'A' n term doBackslashA
'B' n term doBackslashB
'b' n term doBackslashb
'd' n expr-quant doBackslashd
'D' n expr-quant doBackslashD
'G' n term doBackslashG
'h' n expr-quant doBackslashh
'H' n expr-quant doBackslashH
'k' n named-backref
'N' expr-quant doNamedChar # \N{NAME} named char
'p' expr-quant doProperty # \p{Lu} style property
'P' expr-quant doProperty # \P{Lu}, shares the action with \p
'R' n expr-quant doBackslashR
'Q' n term doEnterQuoteMode
'S' n expr-quant doBackslashS
's' n expr-quant doBackslashs
'v' n expr-quant doBackslashv
'V' n expr-quant doBackslashV
'W' n expr-quant doBackslashW
'w' n expr-quant doBackslashw
'X' n expr-quant doBackslashX
'Z' n term doBackslashZ
'z' n term doBackslashz
digit_char n expr-quant doBackRef # Will scan multiple digits
eof errorDeath doEscapeError
default n expr-quant doEscapedLiteralChar
# named-backref Scanned \k
# Leading to \k<captureName>
# Failure to get the full sequence is an error.
# named-backref    requires the opening '<'
# named-backref-2  requires the first name character to be an ascii_letter
# named-backref-3  accepts further ASCII letters or digits, then the closing '>'
#
named-backref:
'<' n named-backref-2 doBeginNamedBackRef
default errorDeath doBadNamedCapture
named-backref-2:
ascii_letter n named-backref-3 doContinueNamedBackRef
default errorDeath doBadNamedCapture
named-backref-3:
ascii_letter n named-backref-3 doContinueNamedBackRef
digit_char n named-backref-3 doContinueNamedBackRef
'>' n expr-quant doCompleteNamedBackRef
default errorDeath doBadNamedCapture
#
# [set expression] parsing,
# All states involved in parsing set expressions have names beginning with "set-"
#
# set-open Scanned the '[' that opens a set.
# A leading '^' negates the set. A leading ':' is handed, unconsumed, to
# doSetPosixProp and set-posix.
set-open:
'^' n set-open2 doSetNegate
':' set-posix doSetPosixProp
default set-open2
# set-open2 After "[" or "[^".
# A ']' in this position is added to the set as a literal; it does not close the set.
set-open2:
']' n set-after-lit doSetLiteral
default set-start
# set-posix:
# scanned a '[:' If it really is a [:property:], doSetPosixProp will have
# moved the scan to the closing ']'. If it wasn't a property
# expression, the scan will still be at the opening ':', which should
# be interpreted as a normal set expression.
set-posix:
']' n pop doSetEnd # was a [:property:]; consume its closing ']' and end the set
':' set-start # not a property; the ':' is left for set-start to handle
default errorDeath doRuleError # should not be possible.
#
# set-start after the [ and special case leading characters (^ and/or ]) but before
# everything else. A '-' is literal at this point.
# A single leading '&' is likewise a literal (see set-start-amp).
# NOTE(review): there is no explicit eof row here, unlike set-after-lit; eof
# would presumably take the default row - confirm this is handled.
#
set-start:
']' n pop doSetEnd
'[' n set-open ^set-after-set doSetBeginUnion
'\' n set-escape
'-' n set-start-dash
'&' n set-start-amp
default n set-after-lit doSetLiteral
# set-start-dash Turn "[--" into a syntax error.
# "[-x" is good, - and x are literals.
# The character after the '-' is not consumed here; set-after-lit handles it.
#
set-start-dash:
'-' errorDeath doRuleError
default set-after-lit doSetAddDash
# set-start-amp Turn "[&&" into a syntax error.
# "[&x" is good, & and x are literals.
# The character after the '&' is not consumed here; set-after-lit handles it.
#
set-start-amp:
'&' errorDeath doRuleError
default set-after-lit doSetAddAmp
#
# set-after-lit The last thing scanned was a literal character within a set.
# Can be followed by anything. Single '-' or '&' are
# literals in this context, not operators.
# A '-' may also turn out to begin a range or a "--" operator (see set-lit-dash);
# an '&' may turn out to begin an "&&" operator (see set-lit-amp).
set-after-lit:
']' n pop doSetEnd
'[' n set-open ^set-after-set doSetBeginUnion
'-' n set-lit-dash
'&' n set-lit-amp
'\' n set-escape
eof errorDeath doSetNoCloseError
default n set-after-lit doSetLiteral
# set-after-set The last thing scanned was a complete nested [set]
# (this state is pushed when a nested '[' is opened), or a \p / \P
# property (see set-escape).
# A following '-' or '&' is examined by set-set-dash / set-set-amp.
set-after-set:
']' n pop doSetEnd
'[' n set-open ^set-after-set doSetBeginUnion
'-' n set-set-dash
'&' n set-set-amp
'\' n set-escape
eof errorDeath doSetNoCloseError
default n set-after-lit doSetLiteral
# set-after-range The last thing scanned was a range such as a-b (see set-lit-dash)
# or a class escape such as \w (see set-escape).
# A following '-' or '&' is examined by set-range-dash / set-range-amp.
set-after-range:
']' n pop doSetEnd
'[' n set-open ^set-after-set doSetBeginUnion
'-' n set-range-dash
'&' n set-range-amp
'\' n set-escape
eof errorDeath doSetNoCloseError
default n set-after-lit doSetLiteral
# set-after-op
# After a -- or &&
# It is an error to close a set at this point.
# NOTE(review): there is no explicit eof row here, unlike set-after-lit; eof
# would presumably take the default row - confirm this is handled.
#
set-after-op:
'[' n set-open ^set-after-set doSetBeginUnion
']' errorDeath doSetOpError
'\' n set-escape
default n set-after-lit doSetLiteral
#
# set-set-amp
# Have scanned [[set]&
# Could be a '&' intersection operator, if a set follows.
# Could be the start of a '&&' operator.
# Otherwise is a literal.
set-set-amp:
'[' n set-open ^set-after-set doSetBeginIntersection1 # [set]&[set]
'&' n set-after-op doSetIntersection2 # [set]&&
default set-after-lit doSetAddAmp # literal '&'; the following character is not consumed here
# set-lit-amp Have scanned "[literals&"
# Could be a start of "&&" operator or a literal
# In [abc&[def]], the '&' is a literal
#
set-lit-amp:
'&' n set-after-op doSetIntersection2 # &&
default set-after-lit doSetAddAmp # literal '&'; the following character is not consumed here
#
# set-set-dash
# Have scanned [set]-
# Could be a '-' difference operator, if a [set] follows.
# Could be the start of a '--' operator.
# Otherwise is a literal.
set-set-dash:
'[' n set-open ^set-after-set doSetBeginDifference1 # [set]-[set]
'-' n set-after-op doSetDifference2 # [set]--
default set-after-lit doSetAddDash # literal '-'; the following character is not consumed here
#
# set-range-dash
# scanned a-b- or \w-
# any set or range like item where the trailing single '-' should
# be literal, not a set difference operation.
# A trailing "--" is still a difference operator.
set-range-dash:
'-' n set-after-op doSetDifference2 # --
default set-after-lit doSetAddDash # literal '-'; the following character is not consumed here
# set-range-amp
# scanned a-b& or \w&
# "&&" is an intersection operator; a single '&' is added as a literal.
set-range-amp:
'&' n set-after-op doSetIntersection2
default set-after-lit doSetAddAmp
# set-lit-dash
# Have scanned "[literals-" Could be a range or a -- operator or a literal
# In [abc-[def]], the '-' is a literal (confirmed with a Java test)
# [abc-\p{xx} the '-' is an error
# [abc-] the '-' is a literal
# [ab-xy] the '-' is a range
# NOTE(review): the "[abc-\p{xx}" case above is not visibly enforced by the
# table: set-lit-dash-escape has no 'p' or 'P' row. Confirm where, if anywhere,
# that error is raised.
#
set-lit-dash:
'-' n set-after-op doSetDifference2
'[' set-after-lit doSetAddDash
']' set-after-lit doSetAddDash
'\' n set-lit-dash-escape
default n set-after-range doSetRange
# set-lit-dash-escape
#
# scanned "[literal-\"
# Could be a range, if the \ introduces an escaped literal char or a named char.
# Otherwise it is an error.
# NOTE(review): only s S w W d D are rejected below. The other escapes that
# set-escape treats specially (p P h H v V) have no row here and so take the
# default doSetRange row. Confirm whether that is intended.
#
set-lit-dash-escape:
's' errorDeath doSetOpError
'S' errorDeath doSetOpError
'w' errorDeath doSetOpError
'W' errorDeath doSetOpError
'd' errorDeath doSetOpError
'D' errorDeath doSetOpError
'N' set-after-range doSetNamedRange
default n set-after-range doSetRange
#
# set-escape
# Common back-slash escape processing within set expressions
# The next state records what kind of item the escape produced:
# \p \P continue in set-after-set,
# \s \S \w \W \d \D \h \H \v \V continue in set-after-range,
# \N and any other escaped character continue in set-after-lit.
# The 'p', 'P' and 'N' rows have no 'n', so the table does not consume the letter.
#
set-escape:
'p' set-after-set doSetProp
'P' set-after-set doSetProp
'N' set-after-lit doSetNamedChar
's' n set-after-range doSetBackslash_s
'S' n set-after-range doSetBackslash_S
'w' n set-after-range doSetBackslash_w
'W' n set-after-range doSetBackslash_W
'd' n set-after-range doSetBackslash_d
'D' n set-after-range doSetBackslash_D
'h' n set-after-range doSetBackslash_h
'H' n set-after-range doSetBackslash_H
'v' n set-after-range doSetBackslash_v
'V' n set-after-range doSetBackslash_V
default n set-after-lit doSetLiteralEscaped
#
# set-finish
# Have just encountered the final ']' that completes a [set], and
# arrived here via a pop. From here, we exit the set parsing world, and go
# back to generic regular expression parsing.
# This is the state pushed by the '[' row of "term"; the completed set is
# followed by a quantifier check like any other term.
#
set-finish:
default expr-quant doSetFinish
#
# errorDeath. This state is specified as the next state whenever a syntax error
# in the source rules is detected. Barring bugs, the state machine will never
# actually get here, but will stop because of the action associated with the error.
# But, just in case, this state asks the state machine to exit.
# The single row consumes input and stays in this state.
errorDeath:
default n errorDeath doExit
|