diff options
Diffstat (limited to 'intl/icu/source/common/dictionarydata.cpp')
-rw-r--r-- | intl/icu/source/common/dictionarydata.cpp | 242 |
1 files changed, 242 insertions, 0 deletions
diff --git a/intl/icu/source/common/dictionarydata.cpp b/intl/icu/source/common/dictionarydata.cpp new file mode 100644 index 000000000..00f66369e --- /dev/null +++ b/intl/icu/source/common/dictionarydata.cpp @@ -0,0 +1,242 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2014-2016, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* dictionarydata.h +* +* created on: 2012may31 +* created by: Markus W. Scherer & Maxime Serrano +*/ + +#include "dictionarydata.h" +#include "unicode/ucharstrie.h" +#include "unicode/bytestrie.h" +#include "unicode/udata.h" +#include "cmemory.h" + +#if !UCONFIG_NO_BREAK_ITERATION + +U_NAMESPACE_BEGIN + +const int32_t DictionaryData::TRIE_TYPE_BYTES = 0; +const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1; +const int32_t DictionaryData::TRIE_TYPE_MASK = 7; +const int32_t DictionaryData::TRIE_HAS_VALUES = 8; + +const int32_t DictionaryData::TRANSFORM_NONE = 0; +const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000; +const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000; +const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff; + +DictionaryMatcher::~DictionaryMatcher() { +} + +UCharsDictionaryMatcher::~UCharsDictionaryMatcher() { + udata_close(file); +} + +int32_t UCharsDictionaryMatcher::getType() const { + return DictionaryData::TRIE_TYPE_UCHARS; +} + +int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, + int32_t *lengths, int32_t *cpLengths, int32_t *values, + int32_t *prefix) const { + + UCharsTrie uct(characters); + int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text); + int32_t wordCount = 0; + int32_t codePointsMatched = 0; + + for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) { + UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c); + int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex; + codePointsMatched += 1; + if (USTRINGTRIE_HAS_VALUE(result)) { + if (wordCount < limit) { + if (values != NULL) { + values[wordCount] = uct.getValue(); + } + if (lengths != NULL) { + lengths[wordCount] = lengthMatched; + } + if (cpLengths != NULL) { + cpLengths[wordCount] = codePointsMatched; + } + ++wordCount; + } + if (result == USTRINGTRIE_FINAL_VALUE) { + break; + } + } + else if (result == USTRINGTRIE_NO_MATCH) { + break; + } + if (lengthMatched >= maxLength) { + break; + } + } + + if (prefix != NULL) { + *prefix = codePointsMatched; + } + return wordCount; +} + +BytesDictionaryMatcher::~BytesDictionaryMatcher() { + udata_close(file); +} + +UChar32 BytesDictionaryMatcher::transform(UChar32 c) const { + if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) { + if (c == 0x200D) { + return 0xFF; + } else if (c == 0x200C) { + return 0xFE; + } + int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK); + if (delta < 0 || 0xFD < delta) { + return U_SENTINEL; + } + return (UChar32)delta; + } + return c; +} + +int32_t BytesDictionaryMatcher::getType() const { + return DictionaryData::TRIE_TYPE_BYTES; +} + +int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, + int32_t *lengths, int32_t *cpLengths, int32_t *values, + int32_t *prefix) const { + BytesTrie bt(characters); + int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text); + int32_t wordCount = 0; + int32_t codePointsMatched = 0; + + for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) { + UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c)); + int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex; + codePointsMatched += 1; + if (USTRINGTRIE_HAS_VALUE(result)) { + if (wordCount < limit) { + if (values != NULL) { + values[wordCount] = bt.getValue(); + } + if (lengths != NULL) { + lengths[wordCount] = lengthMatched; + } + if (cpLengths != NULL) { + cpLengths[wordCount] = codePointsMatched; + } + ++wordCount; + } + if (result == USTRINGTRIE_FINAL_VALUE) { + break; + } + } + else if (result == USTRINGTRIE_NO_MATCH) { + break; + } + if (lengthMatched >= maxLength) { + break; + } + } + + if (prefix != NULL) { + *prefix = codePointsMatched; + } + return wordCount; +} + + +U_NAMESPACE_END + +U_NAMESPACE_USE + +U_CAPI int32_t U_EXPORT2 +udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, + void *outData, UErrorCode *pErrorCode) { + const UDataInfo *pInfo; + int32_t headerSize; + const uint8_t *inBytes; + uint8_t *outBytes; + const int32_t *inIndexes; + int32_t indexes[DictionaryData::IX_COUNT]; + int32_t i, offset, size; + + headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode); + if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0; + pInfo = (const UDataInfo *)((const char *)inData + 4); + if (!(pInfo->dataFormat[0] == 0x44 && + pInfo->dataFormat[1] == 0x69 && + pInfo->dataFormat[2] == 0x63 && + pInfo->dataFormat[3] == 0x74 && + pInfo->formatVersion[0] == 1)) { + udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]); + *pErrorCode = U_UNSUPPORTED_ERROR; + return 0; + } + + inBytes = (const uint8_t *)inData + headerSize; + outBytes = (uint8_t *)outData + headerSize; + + inIndexes = (const int32_t *)inBytes; + if (length >= 0) { + length -= headerSize; + if (length < (int32_t)(sizeof(indexes))) { + udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length); + *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + } + + for (i = 0; i < DictionaryData::IX_COUNT; i++) { + indexes[i] = udata_readInt32(ds, inIndexes[i]); + } + + size = indexes[DictionaryData::IX_TOTAL_SIZE]; + + if (length >= 0) { + if (length < size) { + udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length); + *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + if (inBytes != outBytes) { + uprv_memcpy(outBytes, inBytes, size); + } + + offset = 0; + ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode); + offset = (int32_t)sizeof(indexes); + int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; + int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET]; + + if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { + ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode); + } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) { + // nothing to do + } else { + udata_printError(ds, "udict_swap(): unknown trie type!\n"); + *pErrorCode = U_UNSUPPORTED_ERROR; + return 0; + } + + // these next two sections are empty in the current format, + // but may be used later. + offset = nextOffset; + nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET]; + offset = nextOffset; + nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE]; + offset = nextOffset; + } + return headerSize + size; +} +#endif |