diff options
author | wolfbeast <mcwerewolf@gmail.com> | 2018-03-12 14:15:56 +0100 |
---|---|---|
committer | wolfbeast <mcwerewolf@gmail.com> | 2018-03-12 14:15:56 +0100 |
commit | b392e6d2ea60191615771900690b37b52b47bcd3 (patch) | |
tree | 64904a45624d866ff0232dc2f5b50478b17d69ba | |
parent | ba74a4174d2cd6ccbabbc5aa6f4ffdf74b48f45c (diff) | |
download | UXP-b392e6d2ea60191615771900690b37b52b47bcd3.tar UXP-b392e6d2ea60191615771900690b37b52b47bcd3.tar.gz UXP-b392e6d2ea60191615771900690b37b52b47bcd3.tar.lz UXP-b392e6d2ea60191615771900690b37b52b47bcd3.tar.xz UXP-b392e6d2ea60191615771900690b37b52b47bcd3.zip |
Generate irregexp character tables with make_unicode.py.
-rw-r--r-- | js/src/irregexp/RegExpCharacters-inl.h | 40 | ||||
-rw-r--r-- | js/src/irregexp/RegExpCharacters.cpp | 135 | ||||
-rw-r--r-- | js/src/irregexp/RegExpCharacters.h | 90 | ||||
-rw-r--r-- | js/src/irregexp/RegExpEngine.cpp | 115 | ||||
-rw-r--r-- | js/src/moz.build | 1 | ||||
-rwxr-xr-x | js/src/vm/make_unicode.py | 213 |
6 files changed, 485 insertions, 109 deletions
diff --git a/js/src/irregexp/RegExpCharacters-inl.h b/js/src/irregexp/RegExpCharacters-inl.h new file mode 100644 index 000000000..d001819fc --- /dev/null +++ b/js/src/irregexp/RegExpCharacters-inl.h @@ -0,0 +1,40 @@ +/* Generated by make_unicode.py DO NOT MODIFY */ +/* Unicode version: 9.0.0 */ +#ifndef V8_JSREGEXPCHARACTERS_INL_H_ +#define V8_JSREGEXPCHARACTERS_INL_H_ + +namespace js { + +namespace irregexp { + +static inline bool +RangeContainsLatin1Equivalents(CharacterRange range, bool unicode) +{ + if (unicode) { + // "LATIN SMALL LETTER LONG S" case folds to "LATIN SMALL LETTER S". + if (range.Contains(0x017F)) + return true; + // "LATIN CAPITAL LETTER SHARP S" case folds to "LATIN SMALL LETTER SHARP S". + if (range.Contains(0x1E9E)) + return true; + // "KELVIN SIGN" case folds to "LATIN SMALL LETTER K". + if (range.Contains(0x212A)) + return true; + // "ANGSTROM SIGN" case folds to "LATIN SMALL LETTER A WITH RING ABOVE". + if (range.Contains(0x212B)) + return true; + } + + // "GREEK CAPITAL LETTER MU" case maps to "MICRO SIGN". + // "GREEK SMALL LETTER MU" case maps to "MICRO SIGN". + if (range.Contains(0x039C) || range.Contains(0x03BC)) + return true; + // "LATIN CAPITAL LETTER Y WITH DIAERESIS" case maps to "LATIN SMALL LETTER Y WITH DIAERESIS". + if (range.Contains(0x0178)) + return true; + return false; +} + +} } // namespace js::irregexp + +#endif // V8_JSREGEXPCHARACTERS_INL_H_ diff --git a/js/src/irregexp/RegExpCharacters.cpp b/js/src/irregexp/RegExpCharacters.cpp new file mode 100644 index 000000000..096c02760 --- /dev/null +++ b/js/src/irregexp/RegExpCharacters.cpp @@ -0,0 +1,135 @@ +/* Generated by make_unicode.py DO NOT MODIFY */ +/* Unicode version: 9.0.0 */ +#include "irregexp/RegExpCharacters.h" + +#include "mozilla/Assertions.h" + +char16_t +js::irregexp::ConvertNonLatin1ToLatin1(char16_t c, bool unicode) +{ + MOZ_ASSERT(c > 0xFF, "Character mustn't be Latin1"); + if (unicode) { + // "LATIN SMALL LETTER LONG S" case folds to "LATIN SMALL LETTER S". + if (c == 0x017F) + return 0x73; + // "LATIN CAPITAL LETTER SHARP S" case folds to "LATIN SMALL LETTER SHARP S". + if (c == 0x1E9E) + return 0xDF; + // "KELVIN SIGN" case folds to "LATIN SMALL LETTER K". + if (c == 0x212A) + return 0x6B; + // "ANGSTROM SIGN" case folds to "LATIN SMALL LETTER A WITH RING ABOVE". + if (c == 0x212B) + return 0xE5; + } + + // "GREEK CAPITAL LETTER MU" case maps to "MICRO SIGN". + // "GREEK SMALL LETTER MU" case maps to "MICRO SIGN". + if (c == 0x039C || c == 0x03BC) + return 0xB5; + // "LATIN CAPITAL LETTER Y WITH DIAERESIS" case maps to "LATIN SMALL LETTER Y WITH DIAERESIS". + if (c == 0x0178) + return 0xFF; + return 0; +} + +const int js::irregexp::kSpaceRanges[] = { + 0x0009, 0x000D + 1, // CHARACTER TABULATION..CARRIAGE RETURN (CR) + 0x0020, 0x0020 + 1, // SPACE + 0x00A0, 0x00A0 + 1, // NO-BREAK SPACE + 0x1680, 0x1680 + 1, // OGHAM SPACE MARK + 0x2000, 0x200A + 1, // EN QUAD..HAIR SPACE + 0x2028, 0x2029 + 1, // LINE SEPARATOR..PARAGRAPH SEPARATOR + 0x202F, 0x202F + 1, // NARROW NO-BREAK SPACE + 0x205F, 0x205F + 1, // MEDIUM MATHEMATICAL SPACE + 0x3000, 0x3000 + 1, // IDEOGRAPHIC SPACE + 0xFEFF, 0xFEFF + 1, // ZERO WIDTH NO-BREAK SPACE + 0xFFFF + 1 +}; +const int js::irregexp::kSpaceRangeCount = 21; + +const int js::irregexp::kSpaceAndSurrogateRanges[] = { + 0x0009, 0x000D + 1, // CHARACTER TABULATION..CARRIAGE RETURN (CR) + 0x0020, 0x0020 + 1, // SPACE + 0x00A0, 0x00A0 + 1, // NO-BREAK SPACE + 0x1680, 0x1680 + 1, // OGHAM SPACE MARK + 0x2000, 0x200A + 1, // EN QUAD..HAIR SPACE + 0x2028, 0x2029 + 1, // LINE SEPARATOR..PARAGRAPH SEPARATOR + 0x202F, 0x202F + 1, // NARROW NO-BREAK SPACE + 0x205F, 0x205F + 1, // MEDIUM MATHEMATICAL SPACE + 0x3000, 0x3000 + 1, // IDEOGRAPHIC SPACE + 0xD800, 0xDFFF + 1, // <Lead Surrogate Min>..<Trail Surrogate Max> + 0xFEFF, 0xFEFF + 1, // ZERO WIDTH NO-BREAK SPACE + 0xFFFF + 1 +}; +const int js::irregexp::kSpaceAndSurrogateRangeCount = 23; + +const int js::irregexp::kWordRanges[] = { + 0x0030, 0x0039 + 1, // DIGIT ZERO..DIGIT NINE + 0x0041, 0x005A + 1, // LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z + 0x005F, 0x005F + 1, // LOW LINE + 0x0061, 0x007A + 1, // LATIN SMALL LETTER A..LATIN SMALL LETTER Z + 0xFFFF + 1 +}; +const int js::irregexp::kWordRangeCount = 9; + +const int js::irregexp::kIgnoreCaseWordRanges[] = { + 0x0030, 0x0039 + 1, // DIGIT ZERO..DIGIT NINE + 0x0041, 0x005A + 1, // LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z + 0x005F, 0x005F + 1, // LOW LINE + 0x0061, 0x007A + 1, // LATIN SMALL LETTER A..LATIN SMALL LETTER Z + 0x017F, 0x017F + 1, // LATIN SMALL LETTER LONG S + 0x212A, 0x212A + 1, // KELVIN SIGN + 0xFFFF + 1 +}; +const int js::irregexp::kIgnoreCaseWordRangeCount = 13; + +const int js::irregexp::kWordAndSurrogateRanges[] = { + 0x0030, 0x0039 + 1, // DIGIT ZERO..DIGIT NINE + 0x0041, 0x005A + 1, // LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z + 0x005F, 0x005F + 1, // LOW LINE + 0x0061, 0x007A + 1, // LATIN SMALL LETTER A..LATIN SMALL LETTER Z + 0xD800, 0xDFFF + 1, // <Lead Surrogate Min>..<Trail Surrogate Max> + 0xFFFF + 1 +}; +const int js::irregexp::kWordAndSurrogateRangeCount = 11; + +const int js::irregexp::kNegatedIgnoreCaseWordAndSurrogateRanges[] = { + 0x0000, 0x002F + 1, // NULL..SOLIDUS + 0x003A, 0x0040 + 1, // COLON..COMMERCIAL AT + 0x005B, 0x005E + 1, // LEFT SQUARE BRACKET..CIRCUMFLEX ACCENT + 0x0060, 0x0060 + 1, // GRAVE ACCENT + 0x007B, 0x017E + 1, // LEFT CURLY BRACKET..LATIN SMALL LETTER Z WITH CARON + 0x0180, 0x2129 + 1, // LATIN SMALL LETTER B WITH STROKE..TURNED GREEK SMALL LETTER IOTA + 0x212B, 0xD7FF + 1, // ANGSTROM SIGN..<Unused> + 0xE000, 0xFFFF + 1, // Private Use..<Unused> + 0xFFFF + 1 +}; +const int js::irregexp::kNegatedIgnoreCaseWordAndSurrogateRangeCount = 17; + +const int js::irregexp::kDigitRanges[] = { + 0x0030, 0x0039 + 1, // DIGIT ZERO..DIGIT NINE + 0xFFFF + 1 +}; +const int js::irregexp::kDigitRangeCount = 3; + +const int js::irregexp::kDigitAndSurrogateRanges[] = { + 0x0030, 0x0039 + 1, // DIGIT ZERO..DIGIT NINE + 0xD800, 0xDFFF + 1, // <Lead Surrogate Min>..<Trail Surrogate Max> + 0xFFFF + 1 +}; +const int js::irregexp::kDigitAndSurrogateRangeCount = 5; + +const int js::irregexp::kSurrogateRanges[] = { + 0xD800, 0xDFFF + 1, // <Lead Surrogate Min>..<Trail Surrogate Max> + 0xFFFF + 1 +}; +const int js::irregexp::kSurrogateRangeCount = 3; + +const int js::irregexp::kLineTerminatorRanges[] = { + 0x000A, 0x000A + 1, // LINE FEED (LF) + 0x000D, 0x000D + 1, // CARRIAGE RETURN (CR) + 0x2028, 0x2029 + 1, // LINE SEPARATOR..PARAGRAPH SEPARATOR + 0xFFFF + 1 +}; +const int js::irregexp::kLineTerminatorRangeCount = 7; diff --git a/js/src/irregexp/RegExpCharacters.h b/js/src/irregexp/RegExpCharacters.h new file mode 100644 index 000000000..0d3cf096f --- /dev/null +++ b/js/src/irregexp/RegExpCharacters.h @@ -0,0 +1,90 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- + * vim: set ts=8 sts=4 et sw=4 tw=99: */ + +// Copyright 2012 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef V8_JSREGEXPCHARACTERS_H_ +#define V8_JSREGEXPCHARACTERS_H_ + +namespace js { + +namespace irregexp { + +char16_t +ConvertNonLatin1ToLatin1(char16_t c, bool unicode); + +// ------------------------------------------------------------------- +// CharacterRange + +// The ranges have inclusive from and exclusive to. + +// This covers \s as defined in ES2016, 21.2.2.12 CharacterClassEscape, +// which includes WhiteSpace (11.2) and LineTerminator (11.3) values. +extern const int kSpaceRanges[]; +extern const int kSpaceRangeCount; + +// Characters in \s and additionally all surrogate characters. +extern const int kSpaceAndSurrogateRanges[]; +extern const int kSpaceAndSurrogateRangeCount; + +// This covers \w as defined in ES2016, 21.2.2.12 CharacterClassEscape. +extern const int kWordRanges[]; +extern const int kWordRangeCount; + +// Characters which case-fold to characters in \w. +extern const int kIgnoreCaseWordRanges[]; +extern const int kIgnoreCaseWordRangeCount; + +// Characters in \w and additionally all surrogate characters. +extern const int kWordAndSurrogateRanges[]; +extern const int kWordAndSurrogateRangeCount; + +// All characters excluding those which case-fold to \w and excluding all +// surrogate characters. +extern const int kNegatedIgnoreCaseWordAndSurrogateRanges[]; +extern const int kNegatedIgnoreCaseWordAndSurrogateRangeCount; + +// This covers \d as defined in ES2016, 21.2.2.12 CharacterClassEscape. +extern const int kDigitRanges[]; +extern const int kDigitRangeCount; + +// Characters in \d and additionally all surrogate characters. +extern const int kDigitAndSurrogateRanges[]; +extern const int kDigitAndSurrogateRangeCount; + +// The range of all surrogate characters. +extern const int kSurrogateRanges[]; +extern const int kSurrogateRangeCount; + +// Line terminators as defined in ES2016, 11.3 LineTerminator. +extern const int kLineTerminatorRanges[]; +extern const int kLineTerminatorRangeCount; + +} } // namespace js::irregexp + +#endif // V8_JSREGEXPCHARACTERS_H_ diff --git a/js/src/irregexp/RegExpEngine.cpp b/js/src/irregexp/RegExpEngine.cpp index 2e19065fd..0011e976f 100644 --- a/js/src/irregexp/RegExpEngine.cpp +++ b/js/src/irregexp/RegExpEngine.cpp @@ -31,10 +31,14 @@ #include "irregexp/RegExpEngine.h" #include "irregexp/NativeRegExpMacroAssembler.h" +#include "irregexp/RegExpCharacters.h" #include "irregexp/RegExpMacroAssembler.h" #include "jit/ExecutableAllocator.h" #include "jit/JitCommon.h" +// Generated table +#include "irregexp/RegExpCharacters-inl.h" + using namespace js; using namespace js::irregexp; @@ -61,61 +65,6 @@ RegExpNode::RegExpNode(LifoAlloc* alloc) bm_info_[0] = bm_info_[1] = nullptr; } -// ------------------------------------------------------------------- -// CharacterRange - -// The '2' variant has inclusive from and exclusive to. -// This covers \s as defined in ECMA-262 5.1, 15.10.2.12, -// which include WhiteSpace (7.2) or LineTerminator (7.3) values. -static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, - 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, - 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, - 0xFEFF, 0xFF00, 0x10000 }; -static const int kSpaceRangeCount = ArrayLength(kSpaceRanges); - -static const int kSpaceAndSurrogateRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, - 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, - 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, - unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1, - 0xFEFF, 0xFF00, 0x10000 }; -static const int kSpaceAndSurrogateRangeCount = ArrayLength(kSpaceAndSurrogateRanges); -static const int kWordRanges[] = { - '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 }; -static const int kWordRangeCount = ArrayLength(kWordRanges); -static const int kIgnoreCaseWordRanges[] = { - '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, - 0x017F, 0x017F + 1, 0x212A, 0x212A + 1, - 0x10000 }; -static const int kIgnoreCaseWordCount = ArrayLength(kIgnoreCaseWordRanges); -static const int kWordAndSurrogateRanges[] = { - '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, - unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1, - 0x10000 }; -static const int kWordAndSurrogateRangeCount = ArrayLength(kWordAndSurrogateRanges); -static const int kNegatedIgnoreCaseWordAndSurrogateRanges[] = { - 0, '0', '9' + 1, 'A', - 'Z' + 1, '_', '_' + 1, 'a', - 'z' + 1, 0x017F, - 0x017F + 1, 0x212A, - 0x212A + 1, unicode::LeadSurrogateMin, - unicode::TrailSurrogateMax + 1, 0x10000, - 0x10000 }; -static const int kNegatedIgnoreCaseWordAndSurrogateRangeCount = - ArrayLength(kNegatedIgnoreCaseWordAndSurrogateRanges); -static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 }; -static const int kDigitRangeCount = ArrayLength(kDigitRanges); -static const int kDigitAndSurrogateRanges[] = { - '0', '9' + 1, - unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1, - 0x10000 }; -static const int kDigitAndSurrogateRangeCount = ArrayLength(kDigitAndSurrogateRanges); -static const int kSurrogateRanges[] = { - unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1, - 0x10000 }; -static const int kSurrogateRangeCount = ArrayLength(kSurrogateRanges); -static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E, - 0x2028, 0x202A, 0x10000 }; -static const int kLineTerminatorRangeCount = ArrayLength(kLineTerminatorRanges); static const int kMaxOneByteCharCode = 0xff; static const int kMaxUtf16CodeUnit = 0xffff; @@ -213,7 +162,7 @@ CharacterRange::AddClassEscapeUnicode(LifoAlloc* alloc, char16_t type, break; case 'w': if (ignore_case) - AddClass(kIgnoreCaseWordRanges, kIgnoreCaseWordCount, ranges); + AddClass(kIgnoreCaseWordRanges, kIgnoreCaseWordRangeCount, ranges); else AddClassEscape(alloc, type, ranges); break; @@ -233,33 +182,6 @@ CharacterRange::AddClassEscapeUnicode(LifoAlloc* alloc, char16_t type, } } -#define FOR_EACH_NON_ASCII_TO_ASCII_FOLDING(macro) \ - /* LATIN CAPITAL LETTER Y WITH DIAERESIS */ \ - macro(0x0178, 0x00FF) \ - /* LATIN SMALL LETTER LONG S */ \ - macro(0x017F, 0x0073) \ - /* LATIN CAPITAL LETTER SHARP S */ \ - macro(0x1E9E, 0x00DF) \ - /* KELVIN SIGN */ \ - macro(0x212A, 0x006B) \ - /* ANGSTROM SIGN */ \ - macro(0x212B, 0x00E5) - -// We need to check for the following characters: 0x39c 0x3bc 0x178. -static inline bool -RangeContainsLatin1Equivalents(CharacterRange range, bool unicode) -{ - /* TODO(dcarney): this could be a lot more efficient. */ - if (unicode) { -#define CHECK_RANGE(C, F) \ - if (range.Contains(C)) return true; -FOR_EACH_NON_ASCII_TO_ASCII_FOLDING(CHECK_RANGE) -#undef CHECK_RANGE - } - - return range.Contains(0x39c) || range.Contains(0x3bc) || range.Contains(0x178); -} - static bool RangesContainLatin1Equivalents(const CharacterRangeVector& ranges, bool unicode) { @@ -336,7 +258,7 @@ GetCaseIndependentLetters(char16_t character, // step 3.g. // The standard requires that non-ASCII characters cannot have ASCII // character codes in their equivalence class, even though this - // situation occurs multiple times in the unicode tables. + // situation occurs multiple times in the Unicode tables. static const unsigned kMaxAsciiCharCode = 127; if (upper <= kMaxAsciiCharCode) { if (character > kMaxAsciiCharCode) { @@ -365,31 +287,6 @@ GetCaseIndependentLetters(char16_t character, choices, ArrayLength(choices), letters); } -static char16_t -ConvertNonLatin1ToLatin1(char16_t c, bool unicode) -{ - MOZ_ASSERT(c > kMaxOneByteCharCode); - if (unicode) { - switch (c) { -#define CONVERT(C, F) case C: return F; -FOR_EACH_NON_ASCII_TO_ASCII_FOLDING(CONVERT) -#undef CONVERT - } - } - - switch (c) { - // This are equivalent characters in unicode. - case 0x39c: - case 0x3bc: - return 0xb5; - // This is an uppercase of a Latin-1 character - // outside of Latin-1. - case 0x178: - return 0xff; - } - return 0; -} - void CharacterRange::AddCaseEquivalents(bool is_ascii, bool unicode, CharacterRangeVector* ranges) { diff --git a/js/src/moz.build b/js/src/moz.build index 77acb10b9..a18170a75 100644 --- a/js/src/moz.build +++ b/js/src/moz.build @@ -196,6 +196,7 @@ UNIFIED_SOURCES += [ 'gc/Zone.cpp', 'irregexp/NativeRegExpMacroAssembler.cpp', 'irregexp/RegExpAST.cpp', + 'irregexp/RegExpCharacters.cpp', 'irregexp/RegExpEngine.cpp', 'irregexp/RegExpInterpreter.cpp', 'irregexp/RegExpMacroAssembler.cpp', diff --git a/js/src/vm/make_unicode.py b/js/src/vm/make_unicode.py index 5565d7d14..73c090ac9 100755 --- a/js/src/vm/make_unicode.py +++ b/js/src/vm/make_unicode.py @@ -133,6 +133,17 @@ def read_derived_core_properties(derived_core_properties): for char in range(int(start, 16), int(end, 16) + 1): yield (char, char_property) +def int_ranges(ints): + """ Yields consecutive ranges (inclusive) from integer values. """ + from itertools import tee, izip_longest + + (a, b) = tee(sorted(ints)) + start = next(b) + for (curr, succ) in izip_longest(a, b): + if curr + 1 != succ: + yield (start, curr) + start = succ + def utf16_encode(code): NonBMPMin = 0x10000 LeadSurrogateMin = 0xD800 @@ -740,6 +751,204 @@ def splitbins(t): assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] return best +def make_irregexp_tables(version, + table, index, + folding_table, folding_index, + test_table): + import string + from functools import partial + from itertools import chain, ifilter, imap + + MAX_ASCII = 0x7F + MAX_LATIN1 = 0xFF + LEAD_SURROGATE_MIN = 0xD800 + TRAIL_SURROGATE_MAX = 0xDFFF + + def hex2(n): + assert 0 <= n and n < 16**2 + return '0x{:02X}'.format(n) + + def hex4(n): + assert 0 <= n and n < 16**4 + return '0x{:04X}'.format(n) + + def uhex4(n): + assert 0 <= n and n < 16**4 + return 'U+{:04X}'.format(n) + + def case_info(code): + assert 0 <= code and code <= MAX_BMP + (upper, lower, flags) = table[index[code]] + return ((code + upper) & 0xffff, (code + lower) & 0xffff, flags) + + def is_space(code): + (_, _, flags) = case_info(code) + return bool(flags & FLAG_SPACE) + + def to_upper(code): + (upper, _, _) = case_info(code) + return upper + + def casefold(code): + assert 0 <= code and code <= MAX_BMP + (folding, _, _, _) = folding_table[folding_index[code]] + return (code + folding) & 0xffff + + def casefolds_to_ascii(code): + return casefold(code) <= MAX_ASCII + + def casefolds_to_latin1(code): + return casefold(code) <= MAX_LATIN1 + + def casemaps_to_nonlatin1(code): + upper = to_upper(code) + return upper > MAX_LATIN1 + + def char_name(code): + assert 0 <= code and code <= MAX_BMP + if code not in test_table: + return '<Unused>' + if code == LEAD_SURROGATE_MIN: + return '<Lead Surrogate Min>' + if code == TRAIL_SURROGATE_MAX: + return '<Trail Surrogate Max>' + (_, _, name, alias) = test_table[code] + return name if not name.startswith('<') else alias + + def write_character_range(println, name, characters): + char_ranges = list(int_ranges(characters)) + println('') + println('const int js::irregexp::k{}Ranges[] = {{'.format(name)) + for (start, end) in char_ranges: + s_name = char_name(start) + e_name = char_name(end) + println(' {}, {} + 1, // {}'.format(hex4(start), hex4(end), + '{}..{}'.format(s_name, e_name) + if start != end else s_name)) + println(' {} + 1'.format(hex4(MAX_BMP))) + println('};') + println('const int js::irregexp::k{}RangeCount = {};'.format(name, + len(char_ranges) * 2 + 1)) + + def write_character_test(println, test, consequent, default): + # Latin1 characters which, when case-mapped through + # String.prototype.toUpperCase(), canonicalize to a non-Latin1 character. + # ES2017, §21.2.2.8.2 Runtime Semantics: Canonicalize + casemapped_to_nonlatin1 = ifilter(casemaps_to_nonlatin1, xrange(0, MAX_LATIN1 + 1)) + + def casemap_closure(ch): + upper = to_upper(ch) + return (ch, [c for c in xrange(MAX_LATIN1 + 1, MAX_BMP + 1) if upper == to_upper(c)]) + + # Mapping from Latin1 characters to the list of case map equivalent + # non-Latin1 characters. + casemap_for_latin1 = dict(chain(imap(casemap_closure, casemapped_to_nonlatin1))) + + # Non-latin1 characters which, when Unicode case-folded, canonicalize to + # a Latin1 character. + # ES2017, §21.2.2.8.2 Runtime Semantics: Canonicalize + casefolded_to_latin1 = ifilter(casefolds_to_latin1, xrange(MAX_LATIN1 + 1, MAX_BMP + 1)) + + println(' if (unicode) {') + for ch in casefolded_to_latin1: + casefolded = casefold(ch) + # Skip if also handled below for case mapping. + if casefolded in casemap_for_latin1 and ch in casemap_for_latin1[casefolded]: + continue + println(' // "{}" case folds to "{}".'.format(char_name(ch), + char_name(casefolded))) + println(' if ({})'.format(test(ch))) + println(' return {};'.format(consequent(casefolded))) + println(' }') + println('') + for (ch, casemapped_chars) in casemap_for_latin1.iteritems(): + for casemapped in casemapped_chars: + println(' // "{}" case maps to "{}".'.format(char_name(casemapped), + char_name(ch))) + println(' if ({})'.format(' || '.join(imap(test, casemapped_chars)))) + println(' return {};'.format(consequent(ch))) + println(' return {};'.format(default)) + + with io.open('../irregexp/RegExpCharacters-inl.h', 'wb') as chars_file: + write = partial(print, file=chars_file, sep='', end='') + println = partial(write, end='\n') + + write(warning_message) + write(unicode_version_message.format(version)) + + println('#ifndef V8_JSREGEXPCHARACTERS_INL_H_') + println('#define V8_JSREGEXPCHARACTERS_INL_H_') + println('') + println('namespace js {') + println('') + println('namespace irregexp {') + println('') + + println('static inline bool') + println('RangeContainsLatin1Equivalents(CharacterRange range, bool unicode)') + println('{') + write_character_test(println, lambda ch: 'range.Contains({})'.format(hex4(ch)), + lambda _: 'true', 'false') + println('}') + + println('') + println('} } // namespace js::irregexp') + println('') + println('#endif // V8_JSREGEXPCHARACTERS_INL_H_') + + with io.open('../irregexp/RegExpCharacters.cpp', 'wb') as chars_file: + write = partial(print, file=chars_file, sep='', end='') + println = partial(write, end='\n') + character_range = partial(write_character_range, println) + + # Characters in \s, 21.2.2.12 CharacterClassEscape. + space_chars = filter(is_space, xrange(0, MAX_BMP + 1)) + + # Characters in \d, 21.2.2.12 CharacterClassEscape. + digit_chars = map(ord, string.digits) + assert all(ch <= MAX_ASCII for ch in digit_chars) + + # Characters in \w, 21.2.2.12 CharacterClassEscape. + word_chars = map(ord, string.digits + string.ascii_letters + '_') + assert all(ch <= MAX_ASCII for ch in word_chars) + + # Characters which case-fold to characters in \w. + ignorecase_word_chars = (word_chars + + filter(casefolds_to_ascii, xrange(MAX_ASCII + 1, MAX_BMP + 1))) + + # Surrogate characters. + surrogate_chars = range(LEAD_SURROGATE_MIN, TRAIL_SURROGATE_MAX + 1) + + write(warning_message) + write(unicode_version_message.format(version)) + println('#include "irregexp/RegExpCharacters.h"') + println('') + println('#include "mozilla/Assertions.h"') + println('') + + println('char16_t') + println('js::irregexp::ConvertNonLatin1ToLatin1(char16_t c, bool unicode)') + println('{') + println(' MOZ_ASSERT(c > {}, "Character mustn\'t be Latin1");'.format(hex2(MAX_LATIN1))) + write_character_test(println, lambda ch: 'c == {}'.format(hex4(ch)), hex2, '0') + println('}') + + character_range('Space', space_chars) + character_range('SpaceAndSurrogate', space_chars + surrogate_chars) + + character_range('Word', word_chars) + character_range('IgnoreCaseWord', ignorecase_word_chars) + character_range('WordAndSurrogate', word_chars + surrogate_chars) + character_range('NegatedIgnoreCaseWordAndSurrogate', + set(xrange(0, MAX_BMP + 1)) - set(ignorecase_word_chars + surrogate_chars)) + + character_range('Digit', digit_chars) + character_range('DigitAndSurrogate', digit_chars + surrogate_chars) + + character_range('Surrogate', surrogate_chars) + + character_range('LineTerminator', line_terminator) + def update_unicode(args): import urllib2 @@ -807,6 +1016,10 @@ def update_unicode(args): make_non_bmp_file(unicode_version, non_bmp_lower_map, non_bmp_upper_map, non_bmp_folding_map, non_bmp_rev_folding_map) + make_irregexp_tables(unicode_version, + table, index, + folding_table, folding_index, + test_table) make_bmp_mapping_test(unicode_version, test_table) make_non_bmp_mapping_test(unicode_version, non_bmp_upper_map, non_bmp_lower_map) |