1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
|
// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 2001-2011, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 06/07/01 aliu Creation.
**********************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_TRANSLITERATION
#include "unicode/unifilt.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/utf16.h"
#include "cmemory.h"
#include "name2uni.h"
#include "patternprops.h"
#include "uprops.h"
#include "uinvchar.h"
#include "util.h"
U_NAMESPACE_BEGIN
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NameUnicodeTransliterator)

// Delimiter constants, spelled as UTF-16 code unit values so that the
// source does not depend on the compiler's execution character set.
static const UChar OPEN[] = {0x5C, 0x4E, 0x7E, 0x7B, 0x7E, 0}; // "\N~{~"
static const UChar OPEN_DELIM = 0x5C;  // '\\' first char of OPEN
static const UChar CLOSE_DELIM = 0x7D; // '}'
static const UChar SPACE = 0x20;       // ' '
U_CDECL_BEGIN
// USetAdder implementation.
// _set_add is the single-code-point "add" callback installed in the USetAdder
// that the constructor hands to uprv_getCharNameCharacters(); it forwards the
// code point c to uset_add() on the given set.
// NOTE(review): an earlier note here said uset.h is deliberately not used to
// reduce code dependencies, which does not match the uset_add() call below --
// confirm the intended dependency.
static void U_CALLCONV
_set_add(USet *set, UChar32 c) {
uset_add(set, c);
}
// These functions aren't used.
// (Range and string adders are kept only as commented-out reference code.)
/*static void U_CALLCONV
_set_addRange(USet *set, UChar32 start, UChar32 end) {
((UnicodeSet *)set)->add(start, end);
}
static void U_CALLCONV
_set_addString(USet *set, const UChar *str, int32_t length) {
((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));
}*/
U_CDECL_END
/**
 * Constructs a "Name-Any" transliterator with the given (adopted) filter
 * and fills the member set `legal` with the characters reported by
 * uprv_getCharNameCharacters().
 *
 * @param adoptedFilter filter passed through to the Transliterator base
 *                      class constructor.
 */
NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter)
    : Transliterator(UNICODE_STRING("Name-Any", 8), adoptedFilter) {
    // Only the single-code-point "add" callback is supplied; the remaining
    // callback slots are left empty.
    USetAdder adder = {
        reinterpret_cast<USet *>(&legal), // USet* == UnicodeSet*
        _set_add,
        NULL, // addRange: not needed
        NULL, // addString: not needed
        NULL, // remove: not needed
        NULL
    };

    // Collect the legal character set.
    uprv_getCharNameCharacters(&adder);
}
/**
 * Destructor. The body is empty: no explicit cleanup is performed here.
 */
NameUnicodeTransliterator::~NameUnicodeTransliterator() {}
/**
 * Copy constructor. Copy-constructs the Transliterator base from the
 * source object and copies its set of legal name characters.
 */
NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliterator& other)
    : Transliterator(other),
      legal(other.legal) {
}
/**
* Assignment operator.
*/
/*NameUnicodeTransliterator& NameUnicodeTransliterator::operator=(
const NameUnicodeTransliterator& o) {
Transliterator::operator=(o);
// not necessary: the legal sets should all be the same -- legal=o.legal;
return *this;
}*/
/**
 * Transliterator API.
 * Returns a newly allocated copy of this object, created with the copy
 * constructor.
 */
Transliterator* NameUnicodeTransliterator::clone(void) const {
    Transliterator* duplicate = new NameUnicodeTransliterator(*this);
    return duplicate;
}
/**
 * Implements {@link Transliterator#handleTransliterate}.
 *
 * Scans text from offsets.start up to offsets.limit. Wherever the open
 * pattern OPEN is matched (via ICU_Utility::parsePattern) and is followed
 * by a run of legal name characters and CLOSE_DELIM, the name is looked up
 * with u_charFromName(); on success the whole escape is replaced in place
 * by the resulting code point. Runs of pattern white space inside a name
 * are collapsed to a single SPACE; leading and trailing white space is
 * dropped.
 *
 * @param text          the text to modify in place.
 * @param offsets       on input, start/limit delimit the range to process.
 *                      On return, limit and contextLimit are adjusted by
 *                      the net change in length, and start is set to the
 *                      position reached (or, in incremental mode, to the
 *                      last open delimiter candidate if one is pending).
 * @param isIncremental if true, do not advance start past an open
 *                      delimiter candidate.
 */
void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                    UBool isIncremental) const {
    // The failure mode, here and below, is to behave like Any-Null,
    // if either there is no name data (max len == 0) or there is no
    // memory (malloc() => NULL).
    int32_t maxLen = uprv_getMaxCharNameLength();
    if (maxLen == 0) {
        offsets.start = offsets.limit;
        return;
    }

    // Accommodate the longest possible name
    ++maxLen; // allow for temporary trailing space
    char* cbuf = (char*) uprv_malloc(maxLen);
    if (cbuf == NULL) {
        offsets.start = offsets.limit;
        return;
    }

    UnicodeString openPat(TRUE, OPEN, -1);
    UnicodeString str, name;

    int32_t cursor = offsets.start;
    int32_t limit = offsets.limit;

    // Modes:
    // 0 - looking for open delimiter
    // 1 - after open delimiter
    int32_t mode = 0;
    int32_t openPos = -1; // open delim candidate pos

    UChar32 c;
    while (cursor < limit) {
        c = text.char32At(cursor);

        switch (mode) {
        case 0: // looking for open delimiter
            if (c == OPEN_DELIM) { // quick check first
                openPos = cursor;
                int32_t i =
                    ICU_Utility::parsePattern(openPat, text, cursor, limit);
                if (i >= 0 && i < limit) {
                    mode = 1;
                    name.truncate(0);
                    cursor = i;
                    continue; // *** reprocess char32At(cursor)
                }
            }
            break;

        case 1: // after open delimiter
            // Look for legal chars.  If \s+ is found, convert it
            // to a single space.  If closeDelimiter is found, exit
            // the loop.  If any other character is found, exit the
            // loop.  If the limit is reached, exit the loop.

            // Convert \s+ => SPACE.  This assumes there are no
            // runs of >1 space characters in names.
            if (PatternProps::isWhiteSpace(c)) {
                // Ignore leading whitespace
                if (name.length() > 0 &&
                    name.charAt(name.length()-1) != SPACE) {
                    name.append(SPACE);
                    // If we are too long then abort.  maxLen includes
                    // temporary trailing space, so use '>'.
                    if (name.length() > maxLen) {
                        mode = 0;
                    }
                }
                break;
            }

            if (c == CLOSE_DELIM) {
                int32_t len = name.length();

                // Delete trailing space, if any
                if (len > 0 &&
                    name.charAt(len-1) == SPACE) {
                    --len;
                }

                if (uprv_isInvariantUString(name.getBuffer(), len)) {
                    name.extract(0, len, cbuf, maxLen, US_INV);

                    UErrorCode status = U_ZERO_ERROR;
                    c = u_charFromName(U_EXTENDED_CHAR_NAME, cbuf, &status);
                    if (U_SUCCESS(status)) {
                        // Lookup succeeded

                        // assert(U16_LENGTH(CLOSE_DELIM) == 1);
                        cursor++; // advance over CLOSE_DELIM

                        str.truncate(0);
                        str.append(c);
                        text.handleReplaceBetween(openPos, cursor, str);

                        // Adjust indices for the change in the length of
                        // the string.  Do not assume that str.length() ==
                        // 1, in case of surrogates.
                        int32_t delta = cursor - openPos - str.length();
                        cursor -= delta;
                        limit -= delta;
                        // assert(cursor == openPos + str.length());
                    }
                }
                // If the lookup failed, we leave things as-is and
                // still switch to mode 0 and continue.
                mode = 0;
                openPos = -1; // close off candidate
                continue; // *** reprocess char32At(cursor)
            }

            // Check if c is a legal char.  We assume here that
            // legal.contains(OPEN_DELIM) is FALSE, so when we abort a
            // name, we don't have to go back to openPos+1.
            if (legal.contains(c)) {
                name.append(c);
                // If we go past the longest possible name then abort.
                // maxLen includes temporary trailing space, so use '>='.
                if (name.length() >= maxLen) {
                    mode = 0;
                }
            }

            // Invalid character
            else {
                // Abort the name and reprocess this character in mode 0.
                // The cursor is deliberately left where it is: backing up
                // by one code unit and then falling through to the shared
                // "cursor += U16_LENGTH(c)" only nets to zero for BMP
                // characters. For a supplementary character it would leave
                // the cursor on the trail surrogate and cause the code
                // unit following the pair to be skipped.
                mode = 0;
                continue; // *** reprocess char32At(cursor)
            }
            break;
        }

        cursor += U16_LENGTH(c);
    }

    offsets.contextLimit += limit - offsets.limit;
    offsets.limit = limit;
    // In incremental mode, only advance the cursor up to the last
    // open delimiter candidate.
    offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor;

    uprv_free(cbuf);
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_TRANSLITERATION */
|