diff options
author | Matt A. Tobin <mattatobin@localhost.localdomain> | 2018-02-02 04:16:08 -0500 |
---|---|---|
committer | Matt A. Tobin <mattatobin@localhost.localdomain> | 2018-02-02 04:16:08 -0500 |
commit | 5f8de423f190bbb79a62f804151bc24824fa32d8 (patch) | |
tree | 10027f336435511475e392454359edea8e25895d /intl/icu/source/common/filterednormalizer2.cpp | |
parent | 49ee0794b5d912db1f95dce6eb52d781dc210db5 (diff) | |
download | UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.gz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.lz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.xz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.zip |
Add m-esr52 at 52.6.0
Diffstat (limited to 'intl/icu/source/common/filterednormalizer2.cpp')
-rw-r--r-- | intl/icu/source/common/filterednormalizer2.cpp | 290 |
1 files changed, 290 insertions, 0 deletions
diff --git a/intl/icu/source/common/filterednormalizer2.cpp b/intl/icu/source/common/filterednormalizer2.cpp new file mode 100644 index 000000000..fb6e831af --- /dev/null +++ b/intl/icu/source/common/filterednormalizer2.cpp @@ -0,0 +1,290 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2009-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: filterednormalizer2.cpp +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2009dec10 +* created by: Markus W. Scherer +*/ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_NORMALIZATION + +#include "unicode/normalizer2.h" +#include "unicode/uniset.h" +#include "unicode/unistr.h" +#include "unicode/unorm.h" +#include "cpputils.h" + +U_NAMESPACE_BEGIN + +FilteredNormalizer2::~FilteredNormalizer2() {} + +UnicodeString & +FilteredNormalizer2::normalize(const UnicodeString &src, + UnicodeString &dest, + UErrorCode &errorCode) const { + uprv_checkCanGetBuffer(src, errorCode); + if(U_FAILURE(errorCode)) { + dest.setToBogus(); + return dest; + } + if(&dest==&src) { + errorCode=U_ILLEGAL_ARGUMENT_ERROR; + return dest; + } + dest.remove(); + return normalize(src, dest, USET_SPAN_SIMPLE, errorCode); +} + +// Internal: No argument checking, and appends to dest. +// Pass as input spanCondition the one that is likely to yield a non-zero +// span length at the start of src. +// For set=[:age=3.2:], since almost all common characters were in Unicode 3.2, +// USET_SPAN_SIMPLE should be passed in for the start of src +// and USET_SPAN_NOT_CONTAINED should be passed in if we continue after +// an in-filter prefix. +UnicodeString & +FilteredNormalizer2::normalize(const UnicodeString &src, + UnicodeString &dest, + USetSpanCondition spanCondition, + UErrorCode &errorCode) const { + UnicodeString tempDest; // Don't throw away destination buffer between iterations. + for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) { + int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition); + int32_t spanLength=spanLimit-prevSpanLimit; + if(spanCondition==USET_SPAN_NOT_CONTAINED) { + if(spanLength!=0) { + dest.append(src, prevSpanLimit, spanLength); + } + spanCondition=USET_SPAN_SIMPLE; + } else { + if(spanLength!=0) { + // Not norm2.normalizeSecondAndAppend() because we do not want + // to modify the non-filter part of dest. + dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit), + tempDest, errorCode)); + if(U_FAILURE(errorCode)) { + break; + } + } + spanCondition=USET_SPAN_NOT_CONTAINED; + } + prevSpanLimit=spanLimit; + } + return dest; +} + +UnicodeString & +FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first, + const UnicodeString &second, + UErrorCode &errorCode) const { + return normalizeSecondAndAppend(first, second, TRUE, errorCode); +} + +UnicodeString & +FilteredNormalizer2::append(UnicodeString &first, + const UnicodeString &second, + UErrorCode &errorCode) const { + return normalizeSecondAndAppend(first, second, FALSE, errorCode); +} + +UnicodeString & +FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first, + const UnicodeString &second, + UBool doNormalize, + UErrorCode &errorCode) const { + uprv_checkCanGetBuffer(first, errorCode); + uprv_checkCanGetBuffer(second, errorCode); + if(U_FAILURE(errorCode)) { + return first; + } + if(&first==&second) { + errorCode=U_ILLEGAL_ARGUMENT_ERROR; + return first; + } + if(first.isEmpty()) { + if(doNormalize) { + return normalize(second, first, errorCode); + } else { + return first=second; + } + } + // merge the in-filter suffix of the first string with the in-filter prefix of the second + int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE); + if(prefixLimit!=0) { + UnicodeString prefix(second.tempSubString(0, prefixLimit)); + int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE); + if(suffixStart==0) { + if(doNormalize) { + norm2.normalizeSecondAndAppend(first, prefix, errorCode); + } else { + norm2.append(first, prefix, errorCode); + } + } else { + UnicodeString middle(first, suffixStart, INT32_MAX); + if(doNormalize) { + norm2.normalizeSecondAndAppend(middle, prefix, errorCode); + } else { + norm2.append(middle, prefix, errorCode); + } + first.replace(suffixStart, INT32_MAX, middle); + } + } + if(prefixLimit<second.length()) { + UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX)); + if(doNormalize) { + normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode); + } else { + first.append(rest); + } + } + return first; +} + +UBool +FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const { + return set.contains(c) && norm2.getDecomposition(c, decomposition); +} + +UBool +FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const { + return set.contains(c) && norm2.getRawDecomposition(c, decomposition); +} + +UChar32 +FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const { + return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL; +} + +uint8_t +FilteredNormalizer2::getCombiningClass(UChar32 c) const { + return set.contains(c) ? norm2.getCombiningClass(c) : 0; +} + +UBool +FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const { + uprv_checkCanGetBuffer(s, errorCode); + if(U_FAILURE(errorCode)) { + return FALSE; + } + USetSpanCondition spanCondition=USET_SPAN_SIMPLE; + for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { + int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); + if(spanCondition==USET_SPAN_NOT_CONTAINED) { + spanCondition=USET_SPAN_SIMPLE; + } else { + if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) || + U_FAILURE(errorCode) + ) { + return FALSE; + } + spanCondition=USET_SPAN_NOT_CONTAINED; + } + prevSpanLimit=spanLimit; + } + return TRUE; +} + +UNormalizationCheckResult +FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const { + uprv_checkCanGetBuffer(s, errorCode); + if(U_FAILURE(errorCode)) { + return UNORM_MAYBE; + } + UNormalizationCheckResult result=UNORM_YES; + USetSpanCondition spanCondition=USET_SPAN_SIMPLE; + for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { + int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); + if(spanCondition==USET_SPAN_NOT_CONTAINED) { + spanCondition=USET_SPAN_SIMPLE; + } else { + UNormalizationCheckResult qcResult= + norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode); + if(U_FAILURE(errorCode) || qcResult==UNORM_NO) { + return qcResult; + } else if(qcResult==UNORM_MAYBE) { + result=qcResult; + } + spanCondition=USET_SPAN_NOT_CONTAINED; + } + prevSpanLimit=spanLimit; + } + return result; +} + +int32_t +FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const { + uprv_checkCanGetBuffer(s, errorCode); + if(U_FAILURE(errorCode)) { + return 0; + } + USetSpanCondition spanCondition=USET_SPAN_SIMPLE; + for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { + int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); + if(spanCondition==USET_SPAN_NOT_CONTAINED) { + spanCondition=USET_SPAN_SIMPLE; + } else { + int32_t yesLimit= + prevSpanLimit+ + norm2.spanQuickCheckYes( + s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode); + if(U_FAILURE(errorCode) || yesLimit<spanLimit) { + return yesLimit; + } + spanCondition=USET_SPAN_NOT_CONTAINED; + } + prevSpanLimit=spanLimit; + } + return s.length(); +} + +UBool +FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const { + return !set.contains(c) || norm2.hasBoundaryBefore(c); +} + +UBool +FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const { + return !set.contains(c) || norm2.hasBoundaryAfter(c); +} + +UBool +FilteredNormalizer2::isInert(UChar32 c) const { + return !set.contains(c) || norm2.isInert(c); +} + +U_NAMESPACE_END + +// C API ------------------------------------------------------------------- *** + +U_NAMESPACE_USE + +U_CAPI UNormalizer2 * U_EXPORT2 +unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) { + if(U_FAILURE(*pErrorCode)) { + return NULL; + } + if(filterSet==NULL) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return NULL; + } + Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2, + *UnicodeSet::fromUSet(filterSet)); + if(fn2==NULL) { + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + } + return (UNormalizer2 *)fn2; +} + +#endif // !UCONFIG_NO_NORMALIZATION |