intl/icu/source/i18n/regexcst.txt


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505

# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
#*****************************************************************************
#
#   Copyright (C) 2002-2015, International Business Machines Corporation and others.
#   All Rights Reserved.
#
#*****************************************************************************
#
#  file:  regexcst.txt
#  ICU Regular Expression Parser State Table
#
#     This state table is used when reading and parsing a regular expression pattern
#     The pattern parser uses a state machine; the data in this file define the
#     state transitions that occur for each input character.
#
#     *** This file defines the regex pattern grammar.   This is it.
#     *** The determination of what is accepted is here.
#
#     This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
#     that are then built with the rule parser.
#

#
# Here is the syntax of the state definitions in this file:
#
#
#StateName:
#   input-char           n next-state           ^push-state     action
#   input-char           n next-state           ^push-state     action
#       |                |   |                      |             |
#       |                |   |                      |             |--- action to be performed by state machine
#       |                |   |                      |                  See function RegexCompile::doParseActions()
#       |                |   |                      |
#       |                |   |                      |--- Push this named state onto the state stack.
#       |                |   |                           Later, when next state is specified as "pop",
#       |                |   |                           the pushed state will become the current state.
#       |                |   |
#       |                |   |--- Transition to this state if the current input character matches the input
#       |                |        character or char class in the left hand column.  "pop" causes the next
#       |                |        state to be popped from the state stack.
#       |                |
#       |                |--- When making the state transition specified on this line, advance to the next
#       |                     character from the input only if 'n' appears here.
#       |
#       |--- Character or named character classes to test for.  If the current character being scanned
#            matches, perform the actions and go to the state specified on this line.
#            The input character is tested sequentially, in the order written.  The characters and
#            character classes tested for do not need to be mutually exclusive.  The first match wins.
#


#
#  start state, scan position is at the beginning of the pattern.
#               The single default rule has no 'n', so no input is consumed here:
#               doPatStart runs and the first pattern character is examined by "term".
#
start:
   default                 term                                     doPatStart


#
#  term.  At a position where we can accept the start of most items in a pattern.
#         Items that can take a quantifier continue in expr-quant.
#         '|' and ')' are also accepted directly in this state, i.e. with no
#         preceding item.
#
term:
    quoted               n expr-quant                               doLiteralChar
    rule_char            n expr-quant                               doLiteralChar
    # set-finish is pushed so that the pop on the set's closing ']' lands there.
    '['                  n set-open       ^set-finish               doSetBegin
    '('                  n open-paren
    '.'                  n expr-quant                               doDotAny
    '^'                  n expr-quant                               doCaret
    '$'                  n expr-quant                               doDollar
    '\'                  n backslash
    '|'                  n  term                                    doOrOperator
    ')'                  n  pop                                     doCloseParen
    # No 'n' and next state is term again: presumably doPatFinish stops the
    # state machine, otherwise this rule would repeat forever - confirm in the action code.
    eof	                   term                                     doPatFinish
    default                errorDeath                               doRuleError


#
#   expr-quant    We've just finished scanning a term, now look for the optional
#                 trailing quantifier - *, +, ?, *?,  etc.
#                 The quantifier characters are consumed here; the quant-* and
#                 interval-* states then look at what follows to pick the variant.
#
expr-quant:
    '*'                  n  quant-star
    '+'                  n  quant-plus
    '?'                  n  quant-opt
    '{'                  n  interval-open                          doIntervalInit
    # A '(' here may begin a (?#comment) sitting between the term and its
    # quantifier, so it goes through open-paren-quant rather than open-paren.
    '('                  n  open-paren-quant
    # No quantifier: the current character is left for expr-cont to examine.
    default                 expr-cont


#
#  expr-cont      Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.  No Quantifiers
#                 '|' and ')' are consumed here; any other character is left
#                 unconsumed for "term" to examine.
#
expr-cont:
    '|'                  n  term                                    doOrOperator
    ')'                  n  pop                                     doCloseParen
    default                 term


#
#   open-paren-quant   Special case handling for comments appearing before a quantifier,
#                        e.g.   x(?#comment )*
#                      Open parens from expr-quant come here; anything but a (?# comment
#                      branches into the normal parenthesis sequence as quickly as possible.
#
open-paren-quant:
    '?'                  n  open-paren-quant2                      doSuppressComments
    # Plain '(': not consumed again here, open-paren re-examines the current character.
    default                 open-paren

#   open-paren-quant2  Have scanned "(?" after a term.
open-paren-quant2:
    # expr-quant is pushed so that, once paren-comment pops on the closing ')',
    # a quantifier following the comment is still looked for.
    '#'                  n  paren-comment   ^expr-quant
    default                 open-paren-extended


#
#   open-paren    We've got an open paren.  We need to scan further to
#                 determine what kind of group it is - plain (, (?:, (?>, or whatever.
#
open-paren:
    '?'                  n  open-paren-extended                     doSuppressComments
    # Plain capturing group.  The current character is not consumed; it is the
    # first character of the group body and is examined by term.  expr-quant is
    # pushed so the pop at the matching ')' goes on to look for a quantifier.
    default                 term            ^expr-quant             doOpenCaptureParen

#
#   open-paren-extended    Have scanned "(?".  The next character selects the group type.
#
open-paren-extended:
    ':'                  n  term            ^expr-quant             doOpenNonCaptureParen  #  (?:
    '>'                  n  term            ^expr-quant             doOpenAtomicParen      #  (?>
    # The look-ahead forms push expr-cont rather than expr-quant, so after their
    # closing ')' this table does not look for a quantifier.
    '='                  n  term            ^expr-cont              doOpenLookAhead        #  (?=
    '!'                  n  term            ^expr-cont              doOpenLookAheadNeg     #  (?!
    '<'                  n  open-paren-lookbehind
    '#'                  n  paren-comment   ^term
    # Flag letters and '-' are not consumed here (no 'n'); paren-flag re-reads
    # the same character and applies doMatchMode to it.
    'i'                     paren-flag                              doBeginMatchMode
    'd'                     paren-flag                              doBeginMatchMode
    'm'                     paren-flag                              doBeginMatchMode
    's'                     paren-flag                              doBeginMatchMode
    'u'                     paren-flag                              doBeginMatchMode
    'w'                     paren-flag                              doBeginMatchMode
    'x'                     paren-flag                              doBeginMatchMode
    '-'                     paren-flag                              doBeginMatchMode
    # "(?(" and "(?{" always end in errorDeath; judging by the action names these
    # are the conditional-expression and Perl inline-code forms - confirm in the action code.
    '('                  n  errorDeath                              doConditionalExpr
    '{'                  n  errorDeath                              doPerlInline
    default                 errorDeath                              doBadOpenParenType

#
#   open-paren-lookbehind    Have scanned "(?<".  Either a look-behind or a named capture group.
#
open-paren-lookbehind:
    '='                  n  term            ^expr-cont              doOpenLookBehind       #  (?<=
    '!'                  n  term            ^expr-cont              doOpenLookBehindNeg    #  (?<!
    # The first letter of the name is not consumed here; named-capture re-reads it.
    # Only ascii_letter is accepted at this position, so a name cannot start with a digit.
    ascii_letter            named-capture                           doBeginNamedCapture    #  (?<name
    default                 errorDeath                              doBadOpenParenType


#
#   paren-comment    We've got a (?# ... )  style comment.  Eat pattern text till we get to the ')'
#                    The pop returns to whichever state was pushed on entry
#                    (expr-quant from open-paren-quant2, term from open-paren-extended).
#
paren-comment:
    ')'                  n  pop
    eof		                errorDeath                              doMismatchedParenErr
    default              n  paren-comment

#
#  paren-flag    Scanned a (?ismx-ismx  flag setting
#                Each flag letter (and '-') is consumed and handed to doMatchMode.
#                The sequence ends with ')' or ':'.
#
paren-flag:
    'i'                  n  paren-flag                              doMatchMode
    'd'                  n  paren-flag                              doMatchMode
    'm'                  n  paren-flag                              doMatchMode
    's'                  n  paren-flag                              doMatchMode
    'u'                  n  paren-flag                              doMatchMode
    'w'                  n  paren-flag                              doMatchMode
    'x'                  n  paren-flag                              doMatchMode
    '-'                  n  paren-flag                              doMatchMode
    # (?flags)  - nothing is pushed; parsing simply continues in term.
    ')'                  n  term                                    doSetMatchMode
    # (?flags: ... )  - opens a group; expr-quant is pushed for its closing ')'.
    ':'                  n  term              ^expr-quant           doMatchModeParen
    default                 errorDeath                              doBadModeFlag

#
#  named-capture    (?<name> ... ), position currently on the name.
#                   Name characters are ascii letters and digits; '>' ends the name.
#                   Any other character is an error.
#
named-capture:
    ascii_letter         n  named-capture                           doContinueNamedCapture
    digit_char           n  named-capture                           doContinueNamedCapture
    '>'                  n  term               ^expr-quant          doOpenCaptureParen      # common w non-named capture.
    default                 errorDeath                              doBadNamedCapture

#
#  quant-star     Scanning a '*' quantifier.  Need to look ahead to decide
#                 between plain '*', '*?', '*+'
#                 For plain '*' the look-ahead character is not consumed.
#
quant-star:
     '?'                 n  expr-cont                               doNGStar               #  *?
     '+'                 n  expr-cont                               doPossessiveStar       #  *+
     default                expr-cont                               doStar                 #  *


#
#  quant-plus     Scanning a '+' quantifier.  Need to look ahead to decide
#                 between plain '+', '+?', '++'
#                 For plain '+' the look-ahead character is not consumed.
#
quant-plus:
     '?'                 n  expr-cont                               doNGPlus               #  +?
     '+'                 n  expr-cont                               doPossessivePlus       #  ++
     default                expr-cont                               doPlus                 #  +


#
#  quant-opt  Scanning a '?' quantifier.  Need to look ahead to decide
#                  between plain '?', '??', '?+'
#                  For plain '?' the look-ahead character is not consumed.
#
quant-opt:
     '?'                 n  expr-cont                               doNGOpt                 #  ??
     '+'                 n  expr-cont                               doPossessiveOpt         #  ?+
     default                expr-cont                               doOpt                   #  ?


#
#   Interval         scanning a '{', the opening delimiter for an interval specification
#                                   {number} or {min, max} or {min,}
#
#   interval-open    The '{' has been consumed.  The first character must be a digit;
#                    it is not consumed here but re-read by interval-lower.
interval-open:
    digit_char              interval-lower
    default                 errorDeath                              doIntervalError

#   interval-lower   Accumulating the digits of the lower bound.
interval-lower:
    # NOTE(review): the action name is spelled "Inteval" (sic).  It presumably has to
    # match the name used by the action code, so do not correct it in this file alone.
    digit_char           n  interval-lower                          doIntevalLowerDigit
    ','			         n  interval-upper
    '}'                  n  interval-type                           doIntervalSame             # {n}
    default                 errorDeath                              doIntervalError

#   interval-upper   After the ','.  Accumulating the digits of the upper bound.
#                    A '}' with no digits at all is accepted here, which is the {min,} form.
interval-upper:
    digit_char           n  interval-upper                          doIntervalUpperDigit
    '}'                  n  interval-type
    default                 errorDeath                              doIntervalError

#   interval-type    The closing '}' has been consumed.  Look ahead for a '?' or '+' suffix;
#                    for a plain interval the look-ahead character is not consumed.
interval-type:
    '?'                  n  expr-cont                               doNGInterval                # {n,m}?
    '+'                  n  expr-cont                               doPossessiveInterval        # {n,m}+
    default                 expr-cont                               doInterval                  # {n,m}


#
#  backslash        #  Backslash.  Figure out which of the \thingies we have encountered.
#                                  The low level next-char function will have preprocessed
#                                  some of them already; those won't come here.
#
#                   Escapes whose next state is expr-quant may be followed by a quantifier.
#                   \A \B \b \G \Q \Z \z return to term instead, so this table does not
#                   look for a quantifier directly after them.
#
backslash:
   'A'                   n  term                                    doBackslashA
   'B'                   n  term                                    doBackslashB
   'b'                   n  term                                    doBackslashb
   'd'                   n  expr-quant                              doBackslashd
   'D'                   n  expr-quant                              doBackslashD
   'G'                   n  term                                    doBackslashG
   'h'                   n  expr-quant                              doBackslashh
   'H'                   n  expr-quant                              doBackslashH
   'k'                   n  named-backref
   # N, p and P are not consumed by the state machine (no 'n'); presumably the
   # action itself scans the letter and the {...} that follows - confirm in the action code.
   'N'                      expr-quant                              doNamedChar      #   \N{NAME}  named char
   'p'                      expr-quant                              doProperty       #   \p{Lu}  style property
   'P'                      expr-quant                              doProperty
   'R'                   n  expr-quant                              doBackslashR
   'Q'                   n  term                                    doEnterQuoteMode
   'S'                   n  expr-quant                              doBackslashS
   's'                   n  expr-quant                              doBackslashs
   'v'                   n  expr-quant                              doBackslashv
   'V'                   n  expr-quant                              doBackslashV
   'W'                   n  expr-quant                              doBackslashW
   'w'                   n  expr-quant                              doBackslashw
   'X'                   n  expr-quant                              doBackslashX
   'Z'                   n  term                                    doBackslashZ
   'z'                   n  term                                    doBackslashz
   digit_char            n  expr-quant                              doBackRef         #  Will scan multiple digits
   # A backslash as the very last character of the pattern.
   eof                      errorDeath                              doEscapeError
   default               n  expr-quant                              doEscapedLiteralChar


#
# named-backref   Scanned \k
#                 Leading to \k<captureName>
#                 Failure to get the full sequence is an error.
#
named-backref:
    '<'                  n  named-backref-2                         doBeginNamedBackRef
    default                 errorDeath                              doBadNamedCapture

# named-backref-2   Scanned \k<
#                   The first character of the name must be an ascii letter.
named-backref-2:
    ascii_letter         n  named-backref-3                         doContinueNamedBackRef
    default                 errorDeath                              doBadNamedCapture

# named-backref-3   Scanned \k< plus at least one name character.
#                   Further letters or digits extend the name; '>' completes the reference.
named-backref-3:
    ascii_letter         n  named-backref-3                         doContinueNamedBackRef
    digit_char           n  named-backref-3                         doContinueNamedBackRef
    '>'                  n  expr-quant                              doCompleteNamedBackRef
    default                 errorDeath                              doBadNamedCapture


#
# [set expression] parsing,
#    All states involved in parsing set expressions have names beginning with "set-"
#

#  set-open     The opening '[' has been consumed.  Handle an optional leading '^'
#               and the possible start of a [:posix property:].
#               The ':' is not consumed here; see set-posix.
set-open:
   '^'                   n  set-open2                               doSetNegate
   ':'                      set-posix                               doSetPosixProp
   default                  set-open2

#  set-open2    After "[" or "[^".  A ']' in this position is consumed and passed to
#               doSetLiteral rather than doSetEnd, i.e. it does not close the set.
set-open2:
   ']'                   n  set-after-lit                           doSetLiteral
   default                  set-start

#  set-posix:
#                  scanned a '[:'  If it really is a [:property:], doSetPosixProp will have
#                  moved the scan to the closing ']'.  If it wasn't a property
#                  expression, the scan will still be at the opening ':', which should
#                  be interpreted as a normal set expression.
#                  In that second case the ':' is left unconsumed for set-start to handle.
set-posix:
    ']'                  n   pop                                    doSetEnd
    ':'                      set-start
    default                  errorDeath                             doRuleError  # should not be possible.

#
#   set-start   after the [ and special case leading characters (^ and/or ]) but before
#               everything else.   A '-' is literal at this point.
#               (A leading '-' or '&' is first checked by set-start-dash / set-start-amp,
#               which reject a doubled "--" or "&&" at the start of a set.)
#
set-start:
    ']'                  n  pop                                     doSetEnd
    # Nested set.  set-after-set is pushed so that the pop on the nested set's
    # closing ']' resumes there.
    '['                  n  set-open      ^set-after-set            doSetBeginUnion
    '\'                  n  set-escape
    '-'                  n  set-start-dash
    '&'                  n  set-start-amp
    default              n  set-after-lit                           doSetLiteral

#    set-start-dash    Turn "[--" into a syntax error.
#                           "[-x" is good, - and x are literals.
#                      The character after the '-' is not consumed here;
#                      set-after-lit examines it once the dash has been added.
#
set-start-dash:
    '-'                     errorDeath                              doRuleError
    default                 set-after-lit                           doSetAddDash

#    set-start-amp     Turn "[&&" into a syntax error.
#                           "[&x" is good, & and x are literals.
#                      The character after the '&' is not consumed here;
#                      set-after-lit examines it once the '&' has been added.
#
set-start-amp:
    '&'                     errorDeath                              doRuleError
    default                 set-after-lit                           doSetAddAmp

#
#   set-after-lit    The last thing scanned was a literal character within a set.
#                    Can be followed by anything.  Single '-' or '&' are
#                    literals in this context, not operators.
#                    (A '-' goes on to set-lit-dash, where it can also turn out to
#                    introduce a range or a "--" operator.)
#
set-after-lit:
    ']'                  n  pop                                     doSetEnd
    '['                  n  set-open      ^set-after-set            doSetBeginUnion
    '-'                  n  set-lit-dash
    '&'                  n  set-lit-amp
    '\'                  n  set-escape
    eof                     errorDeath                              doSetNoCloseError
    default              n  set-after-lit                           doSetLiteral

#
#   set-after-set    The last thing scanned was a nested [set] (this is the state pushed
#                    whenever a nested '[' is opened) or a \p / \P property from set-escape.
#                    Same rules as set-after-lit, except that '-' and '&' lead to
#                    set-set-dash / set-set-amp.
#
set-after-set:
    ']'                  n  pop                                     doSetEnd
    '['                  n  set-open      ^set-after-set            doSetBeginUnion
    '-'                  n  set-set-dash
    '&'                  n  set-set-amp
    '\'                  n  set-escape
    eof                     errorDeath                              doSetNoCloseError
    default              n  set-after-lit                           doSetLiteral

#
#   set-after-range  The last thing scanned was a range such as a-b, or a character class
#                    escape such as \w from set-escape.
#                    Same rules as set-after-lit, except that '-' and '&' lead to
#                    set-range-dash / set-range-amp.
#
set-after-range:
    ']'                  n  pop                                     doSetEnd
    '['                  n  set-open      ^set-after-set            doSetBeginUnion
    '-'                  n  set-range-dash
    '&'                  n  set-range-amp
    '\'                  n  set-escape
    eof                     errorDeath                              doSetNoCloseError
    default              n  set-after-lit                           doSetLiteral
    

# set-after-op
#     After a --  or &&
#     It is an error to close a set at this point.
#     An operand must follow: a nested [set], a backslash escape, or a literal character.
#
set-after-op:
    '['                  n  set-open         ^set-after-set         doSetBeginUnion
    ']'                     errorDeath                              doSetOpError
    '\'                  n  set-escape
    default              n  set-after-lit                           doSetLiteral

#
#   set-set-amp
#      Have scanned [[set]&
#      Could be a '&' intersection operator, if a set follows.
#      Could be the start of a '&&' operator.
#      Otherwise is a literal.
#      In the literal case the following character is not consumed here;
#      set-after-lit examines it.
set-set-amp:
    '['                  n  set-open      ^set-after-set           doSetBeginIntersection1
    '&'                  n  set-after-op                           doSetIntersection2
    default                 set-after-lit                          doSetAddAmp


# set-lit-amp   Have scanned "[literals&"
#               Could be a start of "&&" operator or a literal
#               In [abc&[def]],   the '&' is a literal
#               (there is no '[' rule here, so a following '[' takes the default rule).
#
set-lit-amp:
    '&'                  n  set-after-op                            doSetIntersection2
    default                 set-after-lit                           doSetAddAmp


#
#  set-set-dash
#      Have scanned [set]-
#      Could be a '-' difference operator, if a [set] follows.
#      Could be the start of a '--' operator.
#      Otherwise is a literal.
#      In the literal case the following character is not consumed here;
#      set-after-lit examines it.
set-set-dash:
    '['                  n  set-open      ^set-after-set           doSetBeginDifference1
    '-'                  n  set-after-op                           doSetDifference2
    default                 set-after-lit                          doSetAddDash


#
#  set-range-dash
#      scanned  a-b-  or \w-
#         any set or range like item where the trailing single '-' should
#         be literal, not a set difference operation.
#         A trailing "--" is still a difference operator.
set-range-dash:
    '-'                  n  set-after-op                           doSetDifference2
    default                 set-after-lit                          doSetAddDash


#
#  set-range-amp
#      scanned  a-b&  or \w&
#         The '&' counterpart of set-range-dash: a single '&' is added as a literal,
#         "&&" is the intersection operator.
set-range-amp:
    '&'                  n  set-after-op                           doSetIntersection2
    default                 set-after-lit                          doSetAddAmp


#  set-lit-dash
#     Have scanned "[literals-" Could be a range or a -- operator or a literal
#     In [abc-[def]], the '-' is a literal (confirmed with a Java test)
#        [abc-\p{xx}  the '-' is an error
#        [abc-]       the '-' is a literal
#        [ab-xy]      the '-' is a range
#
#     For '[' and ']' the dash is added as a literal and the bracket itself is left
#     unconsumed, to be handled by set-after-lit.
#
set-lit-dash:
    '-'                  n  set-after-op                            doSetDifference2
    '['                     set-after-lit                           doSetAddDash
    ']'                     set-after-lit                           doSetAddDash
    '\'                  n  set-lit-dash-escape
    default              n  set-after-range                         doSetRange

# set-lit-dash-escape
#
#    scanned "[literal-\"
#    Could be a range, if the \ introduces an escaped literal char or a named char.
#    Otherwise it is an error.
#
#    NOTE(review): only s S w W d D are rejected explicitly.  p, P, h, H, v and V have
#    no rule here, so as far as this table is concerned they take the default rule
#    (doSetRange).  Confirm where "[abc-\p{xx}" is actually rejected.
#
set-lit-dash-escape:
   's'                      errorDeath                             doSetOpError
   'S'                      errorDeath                             doSetOpError
   'w'                      errorDeath                             doSetOpError
   'W'                      errorDeath                             doSetOpError
   'd'                      errorDeath                             doSetOpError
   'D'                      errorDeath                             doSetOpError
   'N'                      set-after-range                        doSetNamedRange
   default               n  set-after-range                        doSetRange

   
#
#  set-escape
#       Common back-slash escape processing within set expressions
#
#       The next state records what kind of item the escape produced:
#         \p \P                 -> set-after-set
#         \N and other escapes  -> set-after-lit
#         \s \w \d \h \v and their upper case forms -> set-after-range
#       p, P and N are not consumed by the state machine (no 'n'); presumably the
#       action scans the letter and what follows it - confirm in the action code.
#
set-escape:
   'p'                      set-after-set                           doSetProp
   'P'                      set-after-set                           doSetProp
   'N'                      set-after-lit                           doSetNamedChar
   's'                   n  set-after-range                         doSetBackslash_s
   'S'                   n  set-after-range                         doSetBackslash_S
   'w'                   n  set-after-range                         doSetBackslash_w
   'W'                   n  set-after-range                         doSetBackslash_W
   'd'                   n  set-after-range                         doSetBackslash_d
   'D'                   n  set-after-range                         doSetBackslash_D
   'h'                   n  set-after-range                         doSetBackslash_h
   'H'                   n  set-after-range                         doSetBackslash_H
   'v'                   n  set-after-range                         doSetBackslash_v
   'V'                   n  set-after-range                         doSetBackslash_V
   default               n  set-after-lit                           doSetLiteralEscaped 

#
# set-finish
#     Have just encountered the final ']' that completes a [set], and
#     arrived here via a pop.  From here, we exit the set parsing world, and go
#     back to generic regular expression parsing.
#     (This state is pushed by the '[' rule in "term".)  No input is consumed here;
#     the character after the ']' is examined by expr-quant, so a set can be quantified.
#
set-finish:
    default                 expr-quant                              doSetFinish


#
# errorDeath.   This state is specified as the next state whenever a syntax error
#               in the pattern is detected.  Barring bugs, the state machine will never
#               actually get here, but will stop because of the action associated with the error.
#               But, just in case, this state asks the state machine to exit.
#
errorDeath:
    default              n errorDeath                               doExit