diff options
Diffstat (limited to 'intl/icu/source/i18n/collationruleparser.h')
-rw-r--r-- | intl/icu/source/i18n/collationruleparser.h | 197 |
1 files changed, 197 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/collationruleparser.h b/intl/icu/source/i18n/collationruleparser.h new file mode 100644 index 000000000..bd393c2b6 --- /dev/null +++ b/intl/icu/source/i18n/collationruleparser.h @@ -0,0 +1,197 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2013-2014, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* collationruleparser.h +* +* created on: 2013apr10 +* created by: Markus W. Scherer +*/ + +#ifndef __COLLATIONRULEPARSER_H__ +#define __COLLATIONRULEPARSER_H__ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_COLLATION + +#include "unicode/ucol.h" +#include "unicode/uniset.h" +#include "unicode/unistr.h" + +struct UParseError; + +U_NAMESPACE_BEGIN + +struct CollationData; +struct CollationTailoring; + +class Locale; +class Normalizer2; + +struct CollationSettings; + +class U_I18N_API CollationRuleParser : public UMemory { +public: + /** Special reset positions. */ + enum Position { + FIRST_TERTIARY_IGNORABLE, + LAST_TERTIARY_IGNORABLE, + FIRST_SECONDARY_IGNORABLE, + LAST_SECONDARY_IGNORABLE, + FIRST_PRIMARY_IGNORABLE, + LAST_PRIMARY_IGNORABLE, + FIRST_VARIABLE, + LAST_VARIABLE, + FIRST_REGULAR, + LAST_REGULAR, + FIRST_IMPLICIT, + LAST_IMPLICIT, + FIRST_TRAILING, + LAST_TRAILING + }; + + /** + * First character of contractions that encode special reset positions. + * U+FFFE cannot be tailored via rule syntax. + * + * The second contraction character is POS_BASE + Position. + */ + static const UChar POS_LEAD = 0xfffe; + /** + * Base for the second character of contractions that encode special reset positions. + * Braille characters U+28xx are printable and normalization-inert. + * @see POS_LEAD + */ + static const UChar POS_BASE = 0x2800; + + class U_I18N_API Sink : public UObject { + public: + virtual ~Sink(); + /** + * Adds a reset. + * strength=UCOL_IDENTICAL for &str. + * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3. + */ + virtual void addReset(int32_t strength, const UnicodeString &str, + const char *&errorReason, UErrorCode &errorCode) = 0; + /** + * Adds a relation with strength and prefix | str / extension. + */ + virtual void addRelation(int32_t strength, const UnicodeString &prefix, + const UnicodeString &str, const UnicodeString &extension, + const char *&errorReason, UErrorCode &errorCode) = 0; + + virtual void suppressContractions(const UnicodeSet &set, const char *&errorReason, + UErrorCode &errorCode); + + virtual void optimize(const UnicodeSet &set, const char *&errorReason, + UErrorCode &errorCode); + }; + + class U_I18N_API Importer : public UObject { + public: + virtual ~Importer(); + virtual void getRules( + const char *localeID, const char *collationType, + UnicodeString &rules, + const char *&errorReason, UErrorCode &errorCode) = 0; + }; + + /** + * Constructor. + * The Sink must be set before parsing. + * The Importer can be set, otherwise [import locale] syntax is not supported. + */ + CollationRuleParser(const CollationData *base, UErrorCode &errorCode); + ~CollationRuleParser(); + + /** + * Sets the pointer to a Sink object. + * The pointer is aliased: Pointer copy without cloning or taking ownership. + */ + void setSink(Sink *sinkAlias) { + sink = sinkAlias; + } + + /** + * Sets the pointer to an Importer object. + * The pointer is aliased: Pointer copy without cloning or taking ownership. + */ + void setImporter(Importer *importerAlias) { + importer = importerAlias; + } + + void parse(const UnicodeString &ruleString, + CollationSettings &outSettings, + UParseError *outParseError, + UErrorCode &errorCode); + + const char *getErrorReason() const { return errorReason; } + + /** + * Gets a script or reorder code from its string representation. + * @return the script/reorder code, or + * -1 if not recognized + */ + static int32_t getReorderCode(const char *word); + +private: + /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */ + static const int32_t STRENGTH_MASK = 0xf; + static const int32_t STARRED_FLAG = 0x10; + static const int32_t OFFSET_SHIFT = 8; + + void parse(const UnicodeString &ruleString, UErrorCode &errorCode); + void parseRuleChain(UErrorCode &errorCode); + int32_t parseResetAndPosition(UErrorCode &errorCode); + int32_t parseRelationOperator(UErrorCode &errorCode); + void parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode); + void parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode); + int32_t parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode); + int32_t parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode); + + /** + * Sets str to a contraction of U+FFFE and (U+2800 + Position). + * @return rule index after the special reset position + */ + int32_t parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode); + void parseSetting(UErrorCode &errorCode); + void parseReordering(const UnicodeString &raw, UErrorCode &errorCode); + static UColAttributeValue getOnOffValue(const UnicodeString &s); + + int32_t parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode); + int32_t readWords(int32_t i, UnicodeString &raw) const; + int32_t skipComment(int32_t i) const; + + void setParseError(const char *reason, UErrorCode &errorCode); + void setErrorContext(); + + /** + * ASCII [:P:] and [:S:]: + * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E] + */ + static UBool isSyntaxChar(UChar32 c); + int32_t skipWhiteSpace(int32_t i) const; + + const Normalizer2 &nfd, &nfc; + + const UnicodeString *rules; + const CollationData *const baseData; + CollationSettings *settings; + UParseError *parseError; + const char *errorReason; + + Sink *sink; + Importer *importer; + + int32_t ruleIndex; +}; + +U_NAMESPACE_END + +#endif // !UCONFIG_NO_COLLATION +#endif // __COLLATIONRULEPARSER_H__ |