diff options
Diffstat (limited to 'intl/icu/source/i18n/collationdatareader.h')
-rw-r--r-- | intl/icu/source/i18n/collationdatareader.h | 253 |
1 files changed, 253 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/collationdatareader.h b/intl/icu/source/i18n/collationdatareader.h new file mode 100644 index 000000000..ff8ec3d40 --- /dev/null +++ b/intl/icu/source/i18n/collationdatareader.h @@ -0,0 +1,253 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2013-2015, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* collationdatareader.h +* +* created on: 2013feb07 +* created by: Markus W. Scherer +*/ + +#ifndef __COLLATIONDATAREADER_H__ +#define __COLLATIONDATAREADER_H__ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_COLLATION + +#include "unicode/udata.h" + +struct UDataMemory; + +U_NAMESPACE_BEGIN + +struct CollationTailoring; + +/** + * Collation binary data reader. + */ +struct U_I18N_API CollationDataReader /* all static */ { + // The following constants are also copied into source/common/ucol_swp.cpp. + // Keep them in sync! + enum { + /** + * Number of int32_t indexes. + * + * Can be 2 if there are only options. + * Can be 7 or 8 if there are only options and a script reordering. + * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0. + */ + IX_INDEXES_LENGTH, // 0 + /** + * Bits 31..24: numericPrimary, for numeric collation + * 23..16: fast Latin format version (0 = no fast Latin table) + * 15.. 0: options bit set + */ + IX_OPTIONS, + IX_RESERVED2, + IX_RESERVED3, + + /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */ + IX_JAMO_CE32S_START, // 4 + + // Byte offsets from the start of the data, after the generic header. + // The indexes[] are at byte offset 0, other data follows. + // Each data item is aligned properly. + // The data items should be in descending order of unit size, + // to minimize the need for padding. 
+ // Each item's byte length is given by the difference between its offset and + // the next index/offset value. + /** Byte offset to int32_t reorderCodes[]. */ + IX_REORDER_CODES_OFFSET, + /** + * Byte offset to uint8_t reorderTable[]. + * Empty table if <256 bytes (padding only). + * Otherwise 256 bytes or more (with padding). + */ + IX_REORDER_TABLE_OFFSET, + /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */ + IX_TRIE_OFFSET, + + IX_RESERVED8_OFFSET, // 8 + /** Byte offset to int64_t ces[]. */ + IX_CES_OFFSET, + IX_RESERVED10_OFFSET, + /** Byte offset to uint32_t ce32s[]. */ + IX_CE32S_OFFSET, + + /** Byte offset to uint32_t rootElements[]. */ + IX_ROOT_ELEMENTS_OFFSET, // 12 + /** Byte offset to UChar *contexts[]. */ + IX_CONTEXTS_OFFSET, + /** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */ + IX_UNSAFE_BWD_OFFSET, + /** Byte offset to uint16_t fastLatinTable[]. */ + IX_FAST_LATIN_TABLE_OFFSET, + + /** Byte offset to uint16_t scripts[]. */ + IX_SCRIPTS_OFFSET, // 16 + /** + * Byte offset to UBool compressibleBytes[]. + * Empty table if <256 bytes (padding only). + * Otherwise 256 bytes or more (with padding). + */ + IX_COMPRESSIBLE_BYTES_OFFSET, + IX_RESERVED18_OFFSET, + IX_TOTAL_SIZE + }; + + static void read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength, + CollationTailoring &tailoring, UErrorCode &errorCode); + + static UBool U_CALLCONV + isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo); + +private: + CollationDataReader(); // no constructor +}; + +/* + * Format of collation data (ucadata.icu, binary data in coll/ *.res files). + * Format version 5. + * + * The root collation data is stored in the ucadata.icu file. + * Tailorings are stored inside .res resource bundle files, with a complete file header. + * + * Collation data begins with a standard ICU data file header + * (DataHeader, see ucmndata.h and unicode/udata.h). 
+ * The UDataInfo.dataVersion field contains the UCA and other version numbers, + * see the comments for CollationTailoring.version. + * + * After the header, the file contains the following parts. + * Constants are defined as enum values of the CollationDataReader class. + * See also the Collation class. + * + * int32_t indexes[indexesLength]; + * The indexes array has variable length. + * Some tailorings only need the length and the options, + * others only add reorderCodes and the reorderTable, + * some need to store mappings. + * Only as many indexes are stored as needed to read all of the data. + * + * Index 0: indexesLength + * Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS + * Index 2..3: Unused/reserved/0. + * Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo + * are stored in a short, contiguous part of the ce32s array. + * + * Indexes 5..19 are byte offsets in ascending order. + * Each byte offset marks the start of the next part in the data file, + * and the end of the previous one. + * When two consecutive byte offsets are the same (or too short), + * then the corresponding part is empty. + * Byte offsets are offsets from after the header, + * that is, from the beginning of the indexes[]. + * Each part starts at an offset with proper alignment for its data. + * If necessary, the previous part may include padding bytes to achieve this alignment. + * The last byte offset that is stored in the indexes indicates the total size of the data + * (starting with the indexes). + * + * int32_t reorderCodes[]; -- empty in root + * The list of script and reordering codes. + * + * Beginning with format version 5, this array may optionally + * have trailing entries with a full list of reorder ranges + * as described for CollationSettings::reorderRanges. + * + * Script or reorder codes are first and do not exceed 16-bit values. + * Range limits are stored in the upper 16 bits, and are never 0. 
+ * Split this array into reorder codes and ranges at the first entry + * with non-zero upper 16 bits. + * + * If the ranges are missing but needed for split-reordered primary lead bytes, + * then they are regenerated at load time. + * + * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes + * Primary-weight lead byte permutation table. + * Normally present when the reorderCodes are, but can be built at load time. + * + * Beginning with format version 5, a 0 entry at a non-zero index + * (which is otherwise an illegal value) + * means that the primary lead byte is "split" + * (there are different offsets for primaries that share that lead byte) + * and the reordering offset must be determined via the reorder ranges + * that are either stored as part of the reorderCodes array + * or regenerated at load time. + * + * UTrie2 trie; -- see utrie2_impl.h and utrie2.h + * The trie holds the main collation data. Each code point is mapped to a 32-bit value. + * It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set, + * in which case it is a special CE32 and contains a 4-bit tag and further data. + * See the Collation class for details. + * + * The trie has a value for each lead surrogate code unit with some bits encoding + * collective properties of the 1024 supplementary characters whose UTF-16 form starts with + * the lead surrogate. See Collation::LEAD_SURROGATE_TAG. + * + * int64_t ces[]; + * 64-bit CEs and expansions that cannot be stored in a more compact form. + * + * uint32_t ce32s[]; + * CE32s for expansions in compact form, and for characters whose trie values + * contain special data. + * + * uint32_t rootElements[]; -- empty in all tailorings + * Compact storage for all of the CEs that occur in the root collation. + * See the CollationRootElements class. + * + * UChar *contexts[]; + * Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings. 
+ * + * uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize() + * Serialized form of characters that are unsafe when iterating backwards, + * and at the end of an identical string prefix. + * Back up to a safe character. + * Lead surrogates are "unsafe" when any of their corresponding supplementary + * code points are unsafe. + * Does not include [:^lccc=0:][:^tccc=0:]. + * For each tailoring, the root unsafeBackwardSet is subtracted. + * (As a result, in many tailorings no set needs to be stored.) + * + * uint16_t fastLatinTable[]; + * Optional optimization for Latin text. + * See the CollationFastLatin class. + * + * uint16_t scripts[]; -- empty in all tailorings + * Format version 5: + * uint16_t numScripts; + * uint16_t scriptsIndex[numScripts+16]; + * uint16_t scriptStarts[]; + * See CollationData::numScripts etc. + * + * Format version 4: + * Table of the reordering groups with their first and last lead bytes, + * and their script and reordering codes. + * See CollationData::scripts. + * + * UBool compressibleBytes[]; -- empty in all tailorings + * Flag for getSortKey(), indicating primary weight lead bytes that are compressible. + * + * ----------------- + * Changes for formatVersion 5 (ICU 55) + * + * Reordering moves single scripts, not groups of scripts. + * Reorder ranges are optionally appended to the reorderCodes, + * and a 0 entry in the reorderTable indicates a split lead byte. + * The scripts data has a new format. + * + * The rootElements may contain secondary and tertiary weights below common=05. + * (Used for small Hiragana letters.) + * Where it occurs, there is also an explicit unit with common secondary & tertiary weights. + * There are no other data structure changes, but builder code needs to be able to handle such data. + * + * The collation element for the merge separator code point U+FFFE + * does not necessarily have special, unique secondary/tertiary weights any more. 
+ */ + +U_NAMESPACE_END + +#endif // !UCONFIG_NO_COLLATION +#endif // __COLLATIONDATAREADER_H__ |