1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
|
// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 2001-2011, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 07/03/01 aliu Creation.
**********************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_TRANSLITERATION
#include "unicode/normalizer2.h"
#include "unicode/utf16.h"
#include "cstring.h"
#include "nortrans.h"
U_NAMESPACE_BEGIN
// ICU boilerplate macro for this class's RTTI support (presumably defines
// getStaticClassID()/getDynamicClassID() -- confirm in unicode/uobject.h).
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
// Packs a C string pointer into a Transliterator::Token.
// pointerToken() is handed a non-const void*, so constness is cast away here;
// _create() below reads the pointer back as a const char*.
static inline Transliterator::Token cstrToken(const char *s) {
    void *opaque = const_cast<char *>(s);
    return Transliterator::pointerToken(opaque);
}
/**
 * System registration hook.
 *
 * Registers _create() as the factory for the six "Any-*" normalization IDs
 * and declares which normalization forms act as inverses of each other.
 */
void NormalizationTransliterator::registerIDs() {
    // Factory tokens. Layout: Normalizer2 data name, a NUL terminator, then
    // one byte that _create() reads back as the UNormalization2Mode.
    static const char tokenNFC[]  = "nfc\0\0";
    static const char tokenNFKC[] = "nfkc\0\0";
    static const char tokenNFD[]  = "nfc\0\1";
    static const char tokenNFKD[] = "nfkc\0\1";
    static const char tokenFCD[]  = "nfc\0\2";
    static const char tokenFCC[]  = "nfc\0\3";

    // Factories, one per transliterator ID.
    Transliterator::_registerFactory(
        UNICODE_STRING_SIMPLE("Any-NFC"), _create, cstrToken(tokenNFC));
    Transliterator::_registerFactory(
        UNICODE_STRING_SIMPLE("Any-NFKC"), _create, cstrToken(tokenNFKC));
    Transliterator::_registerFactory(
        UNICODE_STRING_SIMPLE("Any-NFD"), _create, cstrToken(tokenNFD));
    Transliterator::_registerFactory(
        UNICODE_STRING_SIMPLE("Any-NFKD"), _create, cstrToken(tokenNFKD));
    Transliterator::_registerFactory(
        UNICODE_STRING_SIMPLE("Any-FCD"), _create, cstrToken(tokenFCD));
    Transliterator::_registerFactory(
        UNICODE_STRING_SIMPLE("Any-FCC"), _create, cstrToken(tokenFCC));

    // Inverse relationships. The last argument is TRUE for the pairs that
    // are registered in both directions (NFC<->NFD, NFKC<->NFKD) and FALSE
    // for the one-way entries (FCC->NFD, FCD->FCD).
    Transliterator::_registerSpecialInverse(
        UNICODE_STRING_SIMPLE("NFC"), UNICODE_STRING_SIMPLE("NFD"), TRUE);
    Transliterator::_registerSpecialInverse(
        UNICODE_STRING_SIMPLE("NFKC"), UNICODE_STRING_SIMPLE("NFKD"), TRUE);
    Transliterator::_registerSpecialInverse(
        UNICODE_STRING_SIMPLE("FCC"), UNICODE_STRING_SIMPLE("NFD"), FALSE);
    Transliterator::_registerSpecialInverse(
        UNICODE_STRING_SIMPLE("FCD"), UNICODE_STRING_SIMPLE("FCD"), FALSE);
}
/**
 * Factory method registered by registerIDs().
 *
 * @param ID      the ID given to the new transliterator
 * @param context token whose pointer addresses a NUL-terminated Normalizer2
 *                data name immediately followed by one byte holding the
 *                UNormalization2Mode
 * @return a newly allocated NormalizationTransliterator, or NULL if the
 *         Normalizer2 instance could not be obtained
 */
Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
                                                     Token context) {
    const char *dataName = static_cast<const char *>(context.pointer);

    // Locate the terminating NUL of the name; the mode byte sits right after it.
    const char *terminator = uprv_strchr(dataName, 0);
    UNormalization2Mode mode = static_cast<UNormalization2Mode>(terminator[1]);

    UErrorCode status = U_ZERO_ERROR;
    const Normalizer2 *normalizer = Normalizer2::getInstance(NULL, dataName, mode, status);

    return U_SUCCESS(status) ? new NormalizationTransliterator(ID, *normalizer) : NULL;
}
/**
 * Constructs a transliterator that normalizes with the given Normalizer2.
 *
 * @param id    the ID forwarded to the Transliterator base class
 * @param norm2 the normalizer stored in fNorm2 and used by
 *              handleTransliterate()
 */
NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
                                                         const Normalizer2 &norm2)
        // The base class receives 0 as its second argument
        // (presumably "no filter" -- confirm against unicode/translit.h).
        : Transliterator(id, 0),
          fNorm2(norm2) {
}
/**
 * Destructor. Performs no explicit cleanup of its own.
 */
NormalizationTransliterator::~NormalizationTransliterator() {}
/**
 * Copy constructor.
 *
 * Copies the Transliterator base state and initializes fNorm2 from the
 * source object's fNorm2.
 */
NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o)
        : Transliterator(o),
          fNorm2(o.fNorm2) {
}
/**
 * Transliterator API.
 *
 * @return a new heap-allocated copy of this object, created through the
 *         copy constructor
 */
Transliterator* NormalizationTransliterator::clone(void) const {
    Transliterator *copy = new NormalizationTransliterator(*this);
    return copy;
}
/**
 * Implements {@link Transliterator#handleTransliterate}.
 *
 * Normalizes the text in [offsets.start, offsets.limit) one segment at a
 * time using fNorm2, replacing only those segments whose normalized form
 * differs from the input.
 *
 * @param text          the text that is modified in place
 * @param offsets       supplies start and limit on input; on return, start is
 *                      the index up to which the text was consumed, and limit
 *                      and contextLimit are shifted by the net length change
 * @param isIncremental if true, a final segment that ends exactly at the
 *                      limit is left untouched unless fNorm2 reports a
 *                      boundary after its last character
 */
void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
UBool isIncremental) const {
// start and limit of the input range
int32_t start = offsets.start;
int32_t limit = offsets.limit;
// Nothing to do for an empty range; offsets are left unchanged.
if(start >= limit) {
return;
}
/*
 * Normalize in chunks that are as short as possible, even in
 * bulk mode, so that styled text is minimally disrupted.
 * In incremental mode, a chunk that ends with offsets.limit
 * must not be normalized.
 *
 * If it was known that the input text is not styled, then
 * a bulk mode normalization could look like this:
UnicodeString input, normalized;
int32_t length = limit - start;
_Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
input.releaseBuffer(length);
UErrorCode status = U_ZERO_ERROR;
fNorm2.normalize(input, normalized, status);
text.handleReplaceBetween(start, limit, normalized);
int32_t delta = normalized.length() - length;
offsets.contextLimit += delta;
offsets.limit += delta;
offsets.start = limit + delta;
 */
UErrorCode errorCode = U_ZERO_ERROR;
UnicodeString segment;
UnicodeString normalized;
// c always holds the code point at index start when a segment begins.
UChar32 c = text.char32At(start);
do {
// prev marks the beginning of the segment being collected.
int32_t prev = start;
// Skip at least one character so we make progress.
// c holds the character at start.
segment.remove();
// Collect code points until the next one has a normalization boundary
// before it, or until the limit is reached. Note that c is reassigned
// inside the loop condition only when start < limit, so after the loop
// ends at the limit, c is still the last code point that was appended.
do {
segment.append(c);
start += U16_LENGTH(c);
} while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
// stop in incremental mode when we reach the input limit
// in case there are additional characters that could change the
// normalization result
// Rewind so that offsets.start points at the unprocessed segment.
start=prev;
break;
}
fNorm2.normalize(segment, normalized, errorCode);
// On failure the loop stops without reporting errorCode to the caller;
// start has already advanced past the segment that failed.
if(U_FAILURE(errorCode)) {
break;
}
if(segment != normalized) {
// replace the input chunk with its normalized form
text.handleReplaceBetween(prev, start, normalized);
// update all necessary indexes accordingly
// delta is the length change caused by the replacement; both the
// current position and the end of the range move by that amount.
int32_t delta = normalized.length() - (start - prev);
start += delta;
limit += delta;
}
} while(start < limit);
// Publish the results: contextLimit moves by the same net amount as limit
// (limit - offsets.limit is the total length change), then limit is stored.
offsets.start = start;
offsets.contextLimit += limit - offsets.limit;
offsets.limit = limit;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_TRANSLITERATION */
|