diff options
Diffstat (limited to 'intl/icu/source/i18n/unesctrn.cpp')
-rw-r--r-- | intl/icu/source/i18n/unesctrn.cpp | 293 |
1 files changed, 293 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/unesctrn.cpp b/intl/icu/source/i18n/unesctrn.cpp new file mode 100644 index 000000000..88a78d6e3 --- /dev/null +++ b/intl/icu/source/i18n/unesctrn.cpp @@ -0,0 +1,293 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* + ********************************************************************** + * Copyright (c) 2001-2011, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + * Date Name Description + * 11/19/2001 aliu Creation. + ********************************************************************** + */ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_TRANSLITERATION + +#include "unicode/uchar.h" +#include "unicode/utf16.h" +#include "unesctrn.h" +#include "util.h" + +#include "cmemory.h" + +U_NAMESPACE_BEGIN + +/** + * Special character marking the end of the spec[] array. + */ +static const UChar END = 0xFFFF; + +// Unicode: "U+10FFFF" hex, min=4, max=6 +static const UChar SPEC_Unicode[] = { + 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, + END +}; + +// Java: "\\uFFFF" hex, min=4, max=4 +static const UChar SPEC_Java[] = { + 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, + END +}; + +// C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8 +static const UChar SPEC_C[] = { + 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, + 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, + END +}; + +// XML: "" hex, min=1, max=6 +static const UChar SPEC_XML[] = { + 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, + END +}; + +// XML10: "" dec, min=1, max=7 (not really "Hex-Any") +static const UChar SPEC_XML10[] = { + 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, + END +}; + +// Perl: "\\x{263A}" hex, min=1, max=6 +static const UChar SPEC_Perl[] = { + 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, + END +}; + +// All: Java, C, Perl, XML, XML10, Unicode +static const UChar SPEC_Any[] = { + 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, // Unicode + 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, // Java + 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, // C (surrogates) + 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, // XML + 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, // XML10 + 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl + END +}; + +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator) + +static UChar* copySpec(const UChar* spec) { + int32_t len = 0; + while (spec[len] != END) { + ++len; + } + ++len; + UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar)); + // Check for memory allocation error. + if (result != NULL) { + uprv_memcpy(result, spec, (size_t)len*sizeof(result[0])); + } + return result; +} + +/** + * Factory methods. Ignore the context. + */ +static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) { + return new UnescapeTransliterator(ID, SPEC_Unicode); +} +static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) { + return new UnescapeTransliterator(ID, SPEC_Java); +} +static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) { + return new UnescapeTransliterator(ID, SPEC_C); +} +static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) { + return new UnescapeTransliterator(ID, SPEC_XML); +} +static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) { + return new UnescapeTransliterator(ID, SPEC_XML10); +} +static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) { + return new UnescapeTransliterator(ID, SPEC_Perl); +} +static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) { + return new UnescapeTransliterator(ID, SPEC_Any); +} + +/** + * Registers standard variants with the system. Called by + * Transliterator during initialization. + */ +void UnescapeTransliterator::registerIDs() { + Token t = integerToken(0); + + Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t); + + Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t); + + Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t); + + Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t); + + Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t); + + Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t); + + Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t); +} + +/** + * Constructor. Takes the encoded spec array. + */ +UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID, + const UChar *newSpec) : + Transliterator(newID, NULL) +{ + this->spec = copySpec(newSpec); +} + +/** + * Copy constructor. + */ +UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) : + Transliterator(o) { + this->spec = copySpec(o.spec); +} + +UnescapeTransliterator::~UnescapeTransliterator() { + uprv_free(spec); +} + +/** + * Transliterator API. + */ +Transliterator* UnescapeTransliterator::clone() const { + return new UnescapeTransliterator(*this); +} + +/** + * Implements {@link Transliterator#handleTransliterate}. + */ +void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos, + UBool isIncremental) const { + int32_t start = pos.start; + int32_t limit = pos.limit; + int32_t i, j, ipat; + + while (start < limit) { + // Loop over the forms in spec[]. Exit this loop when we + // match one of the specs. Exit the outer loop if a + // partial match is detected and isIncremental is true. + for (j=0, ipat=0; spec[ipat] != END; ++j) { + + // Read the header + int32_t prefixLen = spec[ipat++]; + int32_t suffixLen = spec[ipat++]; + int8_t radix = (int8_t) spec[ipat++]; + int32_t minDigits = spec[ipat++]; + int32_t maxDigits = spec[ipat++]; + + // s is a copy of start that is advanced over the + // characters as we parse them. + int32_t s = start; + UBool match = TRUE; + + for (i=0; i<prefixLen; ++i) { + if (s >= limit) { + if (i > 0) { + // We've already matched a character. This is + // a partial match, so we return if in + // incremental mode. In non-incremental mode, + // go to the next spec. + if (isIncremental) { + goto exit; + } + match = FALSE; + break; + } + } + UChar c = text.charAt(s++); + if (c != spec[ipat + i]) { + match = FALSE; + break; + } + } + + if (match) { + UChar32 u = 0; + int32_t digitCount = 0; + for (;;) { + if (s >= limit) { + // Check for partial match in incremental mode. + if (s > start && isIncremental) { + goto exit; + } + break; + } + UChar32 ch = text.char32At(s); + int32_t digit = u_digit(ch, radix); + if (digit < 0) { + break; + } + s += U16_LENGTH(ch); + u = (u * radix) + digit; + if (++digitCount == maxDigits) { + break; + } + } + + match = (digitCount >= minDigits); + + if (match) { + for (i=0; i<suffixLen; ++i) { + if (s >= limit) { + // Check for partial match in incremental mode. + if (s > start && isIncremental) { + goto exit; + } + match = FALSE; + break; + } + UChar c = text.charAt(s++); + if (c != spec[ipat + prefixLen + i]) { + match = FALSE; + break; + } + } + + if (match) { + // At this point, we have a match + UnicodeString str(u); + text.handleReplaceBetween(start, s, str); + limit -= s - start - str.length(); + // The following break statement leaves the + // loop that is traversing the forms in + // spec[]. We then parse the next input + // character. + break; + } + } + } + + ipat += prefixLen + suffixLen; + } + + if (start < limit) { + start += U16_LENGTH(text.char32At(start)); + } + } + + exit: + pos.contextLimit += limit - pos.limit; + pos.limit = limit; + pos.start = start; +} + +U_NAMESPACE_END + +#endif /* #if !UCONFIG_NO_TRANSLITERATION */ + +//eof |