diff options
Diffstat (limited to 'intl/icu/source/i18n/collationdatawriter.cpp')
-rw-r--r-- | intl/icu/source/i18n/collationdatawriter.cpp | 352 |
1 files changed, 352 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/collationdatawriter.cpp b/intl/icu/source/i18n/collationdatawriter.cpp new file mode 100644 index 000000000..596236bc6 --- /dev/null +++ b/intl/icu/source/i18n/collationdatawriter.cpp @@ -0,0 +1,352 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2013-2015, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* collationdatawriter.cpp +* +* created on: 2013aug06 +* created by: Markus W. Scherer +*/ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_COLLATION + +#include "unicode/tblcoll.h" +#include "unicode/udata.h" +#include "unicode/uniset.h" +#include "cmemory.h" +#include "collationdata.h" +#include "collationdatabuilder.h" +#include "collationdatareader.h" +#include "collationdatawriter.h" +#include "collationfastlatin.h" +#include "collationsettings.h" +#include "collationtailoring.h" +#include "uassert.h" +#include "ucmndata.h" + +U_NAMESPACE_BEGIN + +uint8_t * +RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const { + if(U_FAILURE(errorCode)) { return NULL; } + LocalMemory<uint8_t> buffer((uint8_t *)uprv_malloc(20000)); + if(buffer.isNull()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + length = cloneBinary(buffer.getAlias(), 20000, errorCode); + if(errorCode == U_BUFFER_OVERFLOW_ERROR) { + if(buffer.allocateInsteadAndCopy(length, 0) == NULL) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + errorCode = U_ZERO_ERROR; + length = cloneBinary(buffer.getAlias(), length, errorCode); + } + if(U_FAILURE(errorCode)) { return NULL; } + return buffer.orphan(); +} + +int32_t +RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const { + int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1]; + return CollationDataWriter::writeTailoring( + *tailoring, *settings, indexes, dest, capacity, + errorCode); +} + +static const UDataInfo dataInfo = { + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + U_SIZEOF_UCHAR, + 0, + + { 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol" + { 5, 0, 0, 0 }, // formatVersion + { 6, 3, 0, 0 } // dataVersion +}; + +int32_t +CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings, + const void *rootElements, int32_t rootElementsLength, + int32_t indexes[], uint8_t *dest, int32_t capacity, + UErrorCode &errorCode) { + return write(TRUE, NULL, + data, settings, + rootElements, rootElementsLength, + indexes, dest, capacity, errorCode); +} + +int32_t +CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings, + int32_t indexes[], uint8_t *dest, int32_t capacity, + UErrorCode &errorCode) { + return write(FALSE, t.version, + *t.data, settings, + NULL, 0, + indexes, dest, capacity, errorCode); +} + +int32_t +CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion, + const CollationData &data, const CollationSettings &settings, + const void *rootElements, int32_t rootElementsLength, + int32_t indexes[], uint8_t *dest, int32_t capacity, + UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return 0; } + if(capacity < 0 || (capacity > 0 && dest == NULL)) { + errorCode = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + // Figure out which data items to write before settling on + // the indexes length and writing offsets. + // For any data item, we need to write the start and limit offsets, + // so the indexes length must be at least index-of-start-offset + 2. + int32_t indexesLength; + UBool hasMappings; + UnicodeSet unsafeBackwardSet; + const CollationData *baseData = data.base; + + int32_t fastLatinVersion; + if(data.fastLatinTable != NULL) { + fastLatinVersion = (int32_t)CollationFastLatin::VERSION << 16; + } else { + fastLatinVersion = 0; + } + int32_t fastLatinTableLength = 0; + + if(isBase) { + // For the root collator, we write an even number of indexes + // so that we start with an 8-aligned offset. + indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1; + U_ASSERT(settings.reorderCodesLength == 0); + hasMappings = TRUE; + unsafeBackwardSet = *data.unsafeBackwardSet; + fastLatinTableLength = data.fastLatinTableLength; + } else if(baseData == NULL) { + hasMappings = FALSE; + if(settings.reorderCodesLength == 0) { + // only options + indexesLength = CollationDataReader::IX_OPTIONS + 1; // no limit offset here + } else { + // only options, reorder codes, and the reorder table + indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2; + } + } else { + hasMappings = TRUE; + // Tailored mappings, and what else? + // Check in ascending order of optional tailoring data items. + indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2; + if(data.contextsLength != 0) { + indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2; + } + unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet); + if(!unsafeBackwardSet.isEmpty()) { + indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2; + } + if(data.fastLatinTable != baseData->fastLatinTable) { + fastLatinTableLength = data.fastLatinTableLength; + indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2; + } + } + + UVector32 codesAndRanges(errorCode); + const int32_t *reorderCodes = settings.reorderCodes; + int32_t reorderCodesLength = settings.reorderCodesLength; + if(settings.hasReordering() && + CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) { + // Rebuild the full list of reorder ranges. + // The list in the settings is truncated for efficiency. + data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode); + // Write the codes, then the ranges. + for(int32_t i = 0; i < reorderCodesLength; ++i) { + codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode); + } + if(U_FAILURE(errorCode)) { return 0; } + reorderCodes = codesAndRanges.getBuffer(); + reorderCodesLength = codesAndRanges.size(); + } + + int32_t headerSize; + if(isBase) { + headerSize = 0; // udata_create() writes the header + } else { + DataHeader header; + header.dataHeader.magic1 = 0xda; + header.dataHeader.magic2 = 0x27; + uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo)); + uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo)); + headerSize = (int32_t)sizeof(header); + U_ASSERT((headerSize & 3) == 0); // multiple of 4 bytes + if(hasMappings && data.cesLength != 0) { + // Sum of the sizes of the data items which are + // not automatically multiples of 8 bytes and which are placed before the CEs. + int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4; + if((sum & 7) != 0) { + // We need to add padding somewhere so that the 64-bit CEs are 8-aligned. + // We add to the header size here. + // Alternatively, we could increment the indexesLength + // or add a few bytes to the reorderTable. + headerSize += 4; + } + } + header.dataHeader.headerSize = (uint16_t)headerSize; + if(headerSize <= capacity) { + uprv_memcpy(dest, &header, sizeof(header)); + // Write 00 bytes so that the padding is not mistaken for a copyright string. + uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header)); + dest += headerSize; + capacity -= headerSize; + } else { + dest = NULL; + capacity = 0; + } + } + + indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength; + U_ASSERT((settings.options & ~0xffff) == 0); + indexes[CollationDataReader::IX_OPTIONS] = + data.numericPrimary | fastLatinVersion | settings.options; + indexes[CollationDataReader::IX_RESERVED2] = 0; + indexes[CollationDataReader::IX_RESERVED3] = 0; + + // Byte offsets of data items all start from the start of the indexes. + // We add the headerSize at the very end. + int32_t totalSize = indexesLength * 4; + + if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) { + indexes[CollationDataReader::IX_JAMO_CE32S_START] = data.jamoCE32s - data.ce32s; + } else { + indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1; + } + + indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize; + totalSize += reorderCodesLength * 4; + + indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize; + if(settings.reorderTable != NULL) { + totalSize += 256; + } + + indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize; + if(hasMappings) { + UErrorCode errorCode2 = U_ZERO_ERROR; + int32_t length; + if(totalSize < capacity) { + length = utrie2_serialize(data.trie, dest + totalSize, + capacity - totalSize, &errorCode2); + } else { + length = utrie2_serialize(data.trie, NULL, 0, &errorCode2); + } + if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) { + errorCode = errorCode2; + return 0; + } + // The trie size should be a multiple of 8 bytes due to the way + // compactIndex2(UNewTrie2 *trie) currently works. + U_ASSERT((length & 7) == 0); + totalSize += length; + } + + indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize; + indexes[CollationDataReader::IX_CES_OFFSET] = totalSize; + if(hasMappings && data.cesLength != 0) { + U_ASSERT(((headerSize + totalSize) & 7) == 0); + totalSize += data.cesLength * 8; + } + + indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize; + indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize; + if(hasMappings) { + totalSize += data.ce32sLength * 4; + } + + indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize; + totalSize += rootElementsLength * 4; + + indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize; + if(hasMappings) { + totalSize += data.contextsLength * 2; + } + + indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize; + if(hasMappings && !unsafeBackwardSet.isEmpty()) { + UErrorCode errorCode2 = U_ZERO_ERROR; + int32_t length; + if(totalSize < capacity) { + uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize); + length = unsafeBackwardSet.serialize( + p, (capacity - totalSize) / 2, errorCode2); + } else { + length = unsafeBackwardSet.serialize(NULL, 0, errorCode2); + } + if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) { + errorCode = errorCode2; + return 0; + } + totalSize += length * 2; + } + + indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize; + totalSize += fastLatinTableLength * 2; + + UnicodeString scripts; + indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize; + if(isBase) { + scripts.append((UChar)data.numScripts); + scripts.append(reinterpret_cast<const UChar *>(data.scriptsIndex), data.numScripts + 16); + scripts.append(reinterpret_cast<const UChar *>(data.scriptStarts), data.scriptStartsLength); + totalSize += scripts.length() * 2; + } + + indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize; + if(isBase) { + totalSize += 256; + } + + indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize; + indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize; + + if(totalSize > capacity) { + errorCode = U_BUFFER_OVERFLOW_ERROR; + return headerSize + totalSize; + } + + uprv_memcpy(dest, indexes, indexesLength * 4); + copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest); + copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest); + // The trie has already been serialized into the dest buffer. + copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest); + copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest); + copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest); + copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest); + // The unsafeBackwardSet has already been serialized into the dest buffer. + copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest); + copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest); + copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest); + + return headerSize + totalSize; +} + +void +CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex, + const void *src, uint8_t *dest) { + int32_t start = indexes[startIndex]; + int32_t limit = indexes[startIndex + 1]; + if(start < limit) { + uprv_memcpy(dest + start, src, limit - start); + } +} + +U_NAMESPACE_END + +#endif // !UCONFIG_NO_COLLATION |