diff options
Diffstat (limited to 'intl/icu/source/tools/toolutil/ppucd.h')
-rw-r--r-- | intl/icu/source/tools/toolutil/ppucd.h | 176 |
1 files changed, 176 insertions, 0 deletions
diff --git a/intl/icu/source/tools/toolutil/ppucd.h b/intl/icu/source/tools/toolutil/ppucd.h new file mode 100644 index 000000000..593bd2479 --- /dev/null +++ b/intl/icu/source/tools/toolutil/ppucd.h @@ -0,0 +1,176 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2011-2013, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: ppucd.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2011dec11 +* created by: Markus W. Scherer +*/ + +#ifndef __PPUCD_H__ +#define __PPUCD_H__ + +#include "unicode/utypes.h" +#include "unicode/uniset.h" +#include "unicode/unistr.h" + +#include <stdio.h> + +/** Additions to the uchar.h enum UProperty. */ +enum { + /** Name_Alias */ + PPUCD_NAME_ALIAS=UCHAR_STRING_LIMIT, + PPUCD_CONDITIONAL_CASE_MAPPINGS, + PPUCD_TURKIC_CASE_FOLDING +}; + +U_NAMESPACE_BEGIN + +class U_TOOLUTIL_API PropertyNames { +public: + virtual ~PropertyNames(); + virtual int32_t getPropertyEnum(const char *name) const; + virtual int32_t getPropertyValueEnum(int32_t property, const char *name) const; +}; + +struct U_TOOLUTIL_API UniProps { + UniProps(); + ~UniProps(); + + int32_t getIntProp(int32_t prop) const { return intProps[prop-UCHAR_INT_START]; } + + UChar32 start, end; + UBool binProps[UCHAR_BINARY_LIMIT]; + int32_t intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]; + UVersionInfo age; + UChar32 bmg, bpb; + UChar32 scf, slc, stc, suc; + int32_t digitValue; + const char *numericValue; + const char *name; + const char *nameAlias; + UnicodeString cf, lc, tc, uc; + UnicodeSet scx; +}; + +class U_TOOLUTIL_API PreparsedUCD { +public: + enum LineType { + /** No line, end of file. */ + NO_LINE, + /** Empty line. (Might contain a comment.) */ + EMPTY_LINE, + + /** ucd;6.1.0 */ + UNICODE_VERSION_LINE, + + /** property;Binary;Alpha;Alphabetic */ + PROPERTY_LINE, + /** binary;N;No;F;False */ + BINARY_LINE, + /** value;gc;Zs;Space_Separator */ + VALUE_LINE, + + /** defaults;0000..10FFFF;age=NA;bc=L;... */ + DEFAULTS_LINE, + /** block;0000..007F;age=1.1;blk=ASCII;ea=Na;... */ + BLOCK_LINE, + /** cp;0030;AHex;bc=EN;gc=Nd;na=DIGIT ZERO;... */ + CP_LINE, + + /** algnamesrange;4E00..9FCC;han;CJK UNIFIED IDEOGRAPH- */ + ALG_NAMES_RANGE_LINE, + + LINE_TYPE_COUNT + }; + + /** + * Constructor. + * Prepare this object for a new, empty package. + */ + PreparsedUCD(const char *filename, UErrorCode &errorCode); + + /** Destructor. */ + ~PreparsedUCD(); + + /** Sets (aliases) a non-standard PropertyNames implementation. Caller retains ownership. */ + void setPropertyNames(const PropertyNames *pn) { pnames=pn; } + + /** + * Reads a line from the preparsed UCD file. + * Splits the line by replacing each ';' with a NUL. + */ + LineType readLine(UErrorCode &errorCode); + + /** Returns the number of the line read by readLine(). */ + int32_t getLineNumber() const { return lineNumber; } + + /** Returns the line's next field, or NULL. */ + const char *nextField(); + + /** Returns the Unicode version when or after the UNICODE_VERSION_LINE has been read. */ + const UVersionInfo &getUnicodeVersion() const { return ucdVersion; } + + /** Returns TRUE if the current line has property values. */ + UBool lineHasPropertyValues() const { return DEFAULTS_LINE<=lineType && lineType<=CP_LINE; } + + /** + * Parses properties from the current line. + * Clears newValues and sets UProperty codes for property values mentioned + * on the current line (as opposed to being inherited). + * Returns a pointer to the filled-in UniProps, or NULL if something went wrong. + * The returned UniProps are usable until the next line of the same type is read. + */ + const UniProps *getProps(UnicodeSet &newValues, UErrorCode &errorCode); + + /** + * Returns the code point range for the current algnamesrange line. + * Calls & parses nextField(). + * Further nextField() calls will yield the range's type & prefix string. + * Returns U_SUCCESS(errorCode). + */ + UBool getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode); + +private: + UBool isLineBufferAvailable(int32_t i) { + return defaultLineIndex!=i && blockLineIndex!=i; + } + + /** Resets the field iterator and returns the line's first field (the line type field). */ + const char *firstField(); + + UBool parseProperty(UniProps &props, const char *field, UnicodeSet &newValues, + UErrorCode &errorCode); + UChar32 parseCodePoint(const char *s, UErrorCode &errorCode); + UBool parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode); + void parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode); + void parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode); + + static const int32_t kNumLineBuffers=3; + + PropertyNames *icuPnames; // owned + const PropertyNames *pnames; // aliased + FILE *file; + int32_t defaultLineIndex, blockLineIndex, lineIndex; + int32_t lineNumber; + LineType lineType; + char *fieldLimit; + char *lineLimit; + + UVersionInfo ucdVersion; + UniProps defaultProps, blockProps, cpProps; + // Multiple lines so that default and block properties can maintain pointers + // into their line buffers. + char lines[kNumLineBuffers][4096]; +}; + +U_NAMESPACE_END + +#endif // __PPUCD_H__ |