From 5f8de423f190bbb79a62f804151bc24824fa32d8 Mon Sep 17 00:00:00 2001 From: "Matt A. Tobin" Date: Fri, 2 Feb 2018 04:16:08 -0500 Subject: Add m-esr52 at 52.6.0 --- intl/icu/source/common/normalizer2impl.h | 790 +++++++++++++++++++++++++++++++ 1 file changed, 790 insertions(+) create mode 100644 intl/icu/source/common/normalizer2impl.h (limited to 'intl/icu/source/common/normalizer2impl.h') diff --git a/intl/icu/source/common/normalizer2impl.h b/intl/icu/source/common/normalizer2impl.h new file mode 100644 index 000000000..a6bf96797 --- /dev/null +++ b/intl/icu/source/common/normalizer2impl.h @@ -0,0 +1,790 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2009-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: normalizer2impl.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2009nov22 +* created by: Markus W. Scherer +*/ + +#ifndef __NORMALIZER2IMPL_H__ +#define __NORMALIZER2IMPL_H__ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_NORMALIZATION + +#include "unicode/normalizer2.h" +#include "unicode/unistr.h" +#include "unicode/unorm.h" +#include "unicode/utf16.h" +#include "mutex.h" +#include "uset_imp.h" +#include "utrie2.h" + +U_NAMESPACE_BEGIN + +struct CanonIterData; + +class U_COMMON_API Hangul { +public: + /* Korean Hangul and Jamo constants */ + enum { + JAMO_L_BASE=0x1100, /* "lead" jamo */ + JAMO_L_END=0x1112, + JAMO_V_BASE=0x1161, /* "vowel" jamo */ + JAMO_V_END=0x1175, + JAMO_T_BASE=0x11a7, /* "trail" jamo */ + JAMO_T_END=0x11c2, + + HANGUL_BASE=0xac00, + HANGUL_END=0xd7a3, + + JAMO_L_COUNT=19, + JAMO_V_COUNT=21, + JAMO_T_COUNT=28, + + JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT, + + HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT, + HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT + }; + + static inline UBool isHangul(UChar32 c) { + return HANGUL_BASE<=c && c=MIN_NORMAL_MAYBE_YES) { + return (uint8_t)norm16; + } + if(norm16=MIN_NORMAL_MAYBE_YES ? (uint8_t)norm16 : 0; + } + + /** + * Returns the FCD data for code point c. + * @param c A Unicode code point. + * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. + */ + uint16_t getFCD16(UChar32 c) const { + if(c<0) { + return 0; + } else if(c<0x180) { + return tccc180[c]; + } else if(c<=0xffff) { + if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } + } + return getFCD16FromNormData(c); + } + /** + * Returns the FCD data for the next code point (post-increment). + * Might skip only a lead surrogate rather than the whole surrogate pair if none of + * the supplementary code points associated with the lead surrogate have non-zero FCD data. + * @param s A valid pointer into a string. Requires s!=limit. + * @param limit The end of the string, or NULL. + * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. + */ + uint16_t nextFCD16(const UChar *&s, const UChar *limit) const { + UChar32 c=*s++; + if(c<0x180) { + return tccc180[c]; + } else if(!singleLeadMightHaveNonZeroFCD16(c)) { + return 0; + } + UChar c2; + if(U16_IS_LEAD(c) && s!=limit && U16_IS_TRAIL(c2=*s)) { + c=U16_GET_SUPPLEMENTARY(c, c2); + ++s; + } + return getFCD16FromNormData(c); + } + /** + * Returns the FCD data for the previous code point (pre-decrement). + * @param start The start of the string. + * @param s A valid pointer into a string. Requires start>8]; + if(bits==0) { return false; } + return (UBool)((bits>>((lead>>5)&7))&1); + } + /** Returns the FCD value from the regular normalization data. */ + uint16_t getFCD16FromNormData(UChar32 c) const; + + void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16, + CanonIterData &newData, UErrorCode &errorCode) const; + + /** + * Gets the decomposition for one code point. + * @param c code point + * @param buffer out-only buffer for algorithmic decompositions + * @param length out-only, takes the length of the decomposition, if any + * @return pointer to the decomposition, or NULL if none + */ + const UChar *getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const; + + /** + * Gets the raw decomposition for one code point. + * @param c code point + * @param buffer out-only buffer for algorithmic decompositions + * @param length out-only, takes the length of the decomposition, if any + * @return pointer to the decomposition, or NULL if none + */ + const UChar *getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const; + + UChar32 composePair(UChar32 a, UChar32 b) const; + + UBool isCanonSegmentStarter(UChar32 c) const; + UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const; + + enum { + MIN_CCC_LCCC_CP=0x300 + }; + + enum { + MIN_YES_YES_WITH_CC=0xff01, + JAMO_VT=0xff00, + MIN_NORMAL_MAYBE_YES=0xfe00, + JAMO_L=1, + MAX_DELTA=0x40 + }; + + enum { + // Byte offsets from the start of the data, after the generic header. + IX_NORM_TRIE_OFFSET, + IX_EXTRA_DATA_OFFSET, + IX_SMALL_FCD_OFFSET, + IX_RESERVED3_OFFSET, + IX_RESERVED4_OFFSET, + IX_RESERVED5_OFFSET, + IX_RESERVED6_OFFSET, + IX_TOTAL_SIZE, + + // Code point thresholds for quick check codes. + IX_MIN_DECOMP_NO_CP, + IX_MIN_COMP_NO_MAYBE_CP, + + // Norm16 value thresholds for quick check combinations and types of extra data. + IX_MIN_YES_NO, // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. + IX_MIN_NO_NO, + IX_LIMIT_NO_NO, + IX_MIN_MAYBE_YES, + + IX_MIN_YES_NO_MAPPINGS_ONLY, // Mappings only in [minYesNoMappingsOnly..minNoNo[. + + IX_RESERVED15, + IX_COUNT + }; + + enum { + MAPPING_HAS_CCC_LCCC_WORD=0x80, + MAPPING_HAS_RAW_MAPPING=0x40, + MAPPING_NO_COMP_BOUNDARY_AFTER=0x20, + MAPPING_LENGTH_MASK=0x1f + }; + + enum { + COMP_1_LAST_TUPLE=0x8000, + COMP_1_TRIPLE=1, + COMP_1_TRAIL_LIMIT=0x3400, + COMP_1_TRAIL_MASK=0x7ffe, + COMP_1_TRAIL_SHIFT=9, // 10-1 for the "triple" bit + COMP_2_TRAIL_SHIFT=6, + COMP_2_TRAIL_MASK=0xffc0 + }; + + // higher-level functionality ------------------------------------------ *** + + // NFD without an NFD Normalizer2 instance. + UnicodeString &decompose(const UnicodeString &src, UnicodeString &dest, + UErrorCode &errorCode) const; + /** + * Decomposes [src, limit[ and writes the result to dest. + * limit can be NULL if src is NUL-terminated. + * destLengthEstimate is the initial dest buffer capacity and can be -1. + */ + void decompose(const UChar *src, const UChar *limit, + UnicodeString &dest, int32_t destLengthEstimate, + UErrorCode &errorCode) const; + + const UChar *decompose(const UChar *src, const UChar *limit, + ReorderingBuffer *buffer, UErrorCode &errorCode) const; + void decomposeAndAppend(const UChar *src, const UChar *limit, + UBool doDecompose, + UnicodeString &safeMiddle, + ReorderingBuffer &buffer, + UErrorCode &errorCode) const; + UBool compose(const UChar *src, const UChar *limit, + UBool onlyContiguous, + UBool doCompose, + ReorderingBuffer &buffer, + UErrorCode &errorCode) const; + const UChar *composeQuickCheck(const UChar *src, const UChar *limit, + UBool onlyContiguous, + UNormalizationCheckResult *pQCResult) const; + void composeAndAppend(const UChar *src, const UChar *limit, + UBool doCompose, + UBool onlyContiguous, + UnicodeString &safeMiddle, + ReorderingBuffer &buffer, + UErrorCode &errorCode) const; + const UChar *makeFCD(const UChar *src, const UChar *limit, + ReorderingBuffer *buffer, UErrorCode &errorCode) const; + void makeFCDAndAppend(const UChar *src, const UChar *limit, + UBool doMakeFCD, + UnicodeString &safeMiddle, + ReorderingBuffer &buffer, + UErrorCode &errorCode) const; + + UBool hasDecompBoundary(UChar32 c, UBool before) const; + UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); } + + UBool hasCompBoundaryBefore(UChar32 c) const { + return c=minMaybeYes; } + static UBool isInert(uint16_t norm16) { return norm16==0; } + static UBool isJamoL(uint16_t norm16) { return norm16==1; } + static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; } + UBool isHangul(uint16_t norm16) const { return norm16==minYesNo; } + UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16=MIN_YES_YES_WITH_CC || norm16=limitNoNo; } + + // For use with isCompYes(). + // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. + // static uint8_t getCCFromYes(uint16_t norm16) { + // return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0; + // } + uint8_t getCCFromNoNo(uint16_t norm16) const { + const uint16_t *mapping=getMapping(norm16); + if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) { + return (uint8_t)*(mapping-1); + } else { + return 0; + } + } + // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() + uint8_t getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const; + + // Requires algorithmic-NoNo. + UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const { + return c+norm16-(minMaybeYes-MAX_DELTA-1); + } + + // Requires minYesNo