diff options
Diffstat (limited to 'application/basilisk/components/translation/cld2/internal/cld2_dynamic_data.h')
-rw-r--r-- | application/basilisk/components/translation/cld2/internal/cld2_dynamic_data.h | 216 |
1 files changed, 216 insertions, 0 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/cld2_dynamic_data.h b/application/basilisk/components/translation/cld2/internal/cld2_dynamic_data.h new file mode 100644 index 000000000..693d35b38 --- /dev/null +++ b/application/basilisk/components/translation/cld2/internal/cld2_dynamic_data.h @@ -0,0 +1,216 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_ +#define CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_ + +#include "integral_types.h" +#include "cld2tablesummary.h" +#include "utf8statetable.h" +#include "scoreonescriptspan.h" + +/* + There are two primary parts to a CLD2 dynamic data file: + 1. A header, wherein trivial data, block lengths and block offsets are kept + 2. A data block, wherein the large binary blocks are kept + + By reading the header, an application can determine the offsets and lengths of + all the data blocks for all tables. Offsets in the header are expressed + relative to the first byte of the file, inclusive of the header itself; thus, + any offset whose value is less than the length of the header is invalid. + + Any offset whose value is zero indicates a field that is null in the + underlying CLD2 data; a real example of this is the fast_state field of the + UTF8PropObj, which may be null. + + The size of the header can be precalculated by calling calculateHeaderSize(), + which will indicate the exact size of the header for a data file that contains + a given number of CLD2TableSummary objects. + + Notes on endianness: + The data format is only suitable for little-endian machines. For big-endian + systems, a tedious transformation would need to be made first to reverse the + byte order of significant portions of the binary - not just the lengths, but + also some of the underlying table data. + + Note on 32/64 bit: + The data format is agnostic to 32/64 bit pointers. All the offsets within the + data blob itself are 32-bit values relative to the start of the file, and the + file should certainly never be gigabytes in size! + When the file is ultimately read by the loading code and mmap()'d, new + pointers are generated at whatever size the system uses, initialized to the + start of the mmap, and incremented by the 32-bit offset. This should be safe + regardless of 32- or 64-bit architectures. + + -------------------------------------------------------------------- + FIELD + -------------------------------------------------------------------- + DATA_FILE_MARKER (no null terminator) + total file size (sanity check, uint32) + -------------------------------------------------------------------- + UTF8PropObj: const uint32 state0 + UTF8PropObj: const uint32 state0_size + UTF8PropObj: const uint32 total_size + UTF8PropObj: const int max_expand + UTF8PropObj: const int entry_shift (coerced to 32 bits) + UTF8PropObj: const int bytes_per_entry (coerced to 32 bits) + UTF8PropObj: const uint32 losub + UTF8PropObj: const uint32 hiadd + offset of UTF8PropObj: const uint8* state_table + length of UTF8PropObj: const uint8* state_table + offset of UTF8PropObj: const RemapEntry* remap_base (4-byte struct) + length of UTF8PropObj: const RemapEntry* remap_base (4-byte struct) + offset of UTF8PropObj: const uint8* remap_string + length of UTF8PropObj: const uint8* remap_string + offset of UTF8PropObj: const uint8* fast_state + length of UTF8PropObj: const uint8* fast_state + -------------------------------------------------------------------- + start of const short kAvgDeltaOctaScore[] + length of const short kAvgDeltaOctaScore[] + -------------------------------------------------------------------- + number of CLD2TableSummary objects encoded (n) + [Table 1]: CLD2TableSummary: uint32 kCLDTableSizeOne + [Table 1]: CLD2TableSummary: uint32 kCLDTableSize + [Table 1]: CLD2TableSummary: uint32 kCLDTableKeyMask + [Table 1]: CLD2TableSummary: uint32 kCLDTableBuildDate + [Table 1]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable + [Table 1]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable + [Table 1]: offset of CLD2TableSummary: const uint32* kCLDTableInd + [Table 1]: length of CLD2TableSummary: const uint32* kCLDTableInd + [Table 1]: offset of CLD2TableSummary: const char* kRecognizedLangScripts + [Table 1]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1 + . + . + . + [Table n]: CLD2TableSummary: uint32 kCLDTableSizeOne + [Table n]: CLD2TableSummary: uint32 kCLDTableSize + [Table n]: CLD2TableSummary: uint32 kCLDTableKeyMask + [Table n]: CLD2TableSummary: uint32 kCLDTableBuildDate + [Table n]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable + [Table n]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable + [Table n]: offset of CLD2TableSummary: const uint32* kCLDTableInd + [Table n]: length of CLD2TableSummary: const uint32* kCLDTableInd + [Table n]: offset of CLD2TableSummary: const char* kRecognizedLangScripts + [Table n]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1 + -------------------------------------------------------------------- + + + Immediately after the header fields comes the data block. The data block has + the following content, in this order (note that padding is applied in order to + keep lookups word-aligned): + + UTF8PropObj: const uint8* state_table + UTF8PropObj: const RemapEntry* remap_base (4-byte struct) + UTF8PropObj: const uint8* remap_string + UTF8PropObj: const uint8* fast_state + const short kAvgDeltaOctaScore[] + [Table 1]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable + [Table 1]: CLD2TableSummary: const uint32* kCLDTableInd + [Table 1]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator) + . + . + . + [Table n]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable + [Table n]: CLD2TableSummary: const uint32* kCLDTableInd + [Table n]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator) + + + It is STRONGLY recommended that the chunks within the data block be kept + 128-bit aligned for efficiency reasons, although the code will work without + such alignment: the main lookup tables have randomly-accessed groups of four + 4-byte entries, and these must be 16-byte aligned to avoid the performance + cost of multiple cache misses per group. +*/ +namespace CLD2DynamicData { + +static const char* DATA_FILE_MARKER = "cld2_data_file00"; +static const int DATA_FILE_MARKER_LENGTH = 16; // Keep aligned to 128 bits + +// Nicer version of memcmp that shows the offset at which bytes differ +bool mem_compare(const void* data1, const void* data2, const int length); + +// Enable or disable debugging; 0 to disable, 1 to enable +void setDebug(int debug); + +// Lower-level structure for individual tables. There are n table headers in +// a given file header. +typedef struct { + CLD2::uint32 kCLDTableSizeOne; + CLD2::uint32 kCLDTableSize; + CLD2::uint32 kCLDTableKeyMask; + CLD2::uint32 kCLDTableBuildDate; + CLD2::uint32 startOf_kCLDTable; + CLD2::uint32 lengthOf_kCLDTable; + CLD2::uint32 startOf_kCLDTableInd; + CLD2::uint32 lengthOf_kCLDTableInd; + CLD2::uint32 startOf_kRecognizedLangScripts; + CLD2::uint32 lengthOf_kRecognizedLangScripts; +} TableHeader; + + +// Top-level structure for a CLD2 Data File Header. +// Contains all the primitive fields for the header as well as an array of +// headers for the individual tables. +typedef struct { + // Marker fields help recognize and verify the data file + char sanityString[DATA_FILE_MARKER_LENGTH]; + CLD2::uint32 totalFileSizeBytes; + + // UTF8 primitives + CLD2::uint32 utf8PropObj_state0; + CLD2::uint32 utf8PropObj_state0_size; + CLD2::uint32 utf8PropObj_total_size; + CLD2::uint32 utf8PropObj_max_expand; + CLD2::uint32 utf8PropObj_entry_shift; + CLD2::uint32 utf8PropObj_bytes_per_entry; + CLD2::uint32 utf8PropObj_losub; + CLD2::uint32 utf8PropObj_hiadd; + CLD2::uint32 startOf_utf8PropObj_state_table; + CLD2::uint32 lengthOf_utf8PropObj_state_table; + CLD2::uint32 startOf_utf8PropObj_remap_base; + CLD2::uint32 lengthOf_utf8PropObj_remap_base; + CLD2::uint32 startOf_utf8PropObj_remap_string; + CLD2::uint32 lengthOf_utf8PropObj_remap_string; + CLD2::uint32 startOf_utf8PropObj_fast_state; + CLD2::uint32 lengthOf_utf8PropObj_fast_state; + + // Average delta-octa-score bits + CLD2::uint32 startOf_kAvgDeltaOctaScore; + CLD2::uint32 lengthOf_kAvgDeltaOctaScore; + + // Table bits + CLD2::uint32 numTablesEncoded; + TableHeader* tableHeaders; +} FileHeader; + +// Calculate the exact size of a header that encodes the specified number of +// tables. This can be used to reserve space within the data file, +// calculate offsets, and so on. +CLD2::uint32 calculateHeaderSize(CLD2::uint32 numTables); + +// Dump a given header to stdout as a human-readable string. +void dumpHeader(FileHeader* header); + +// Verify that a given pair of scoring tables match precisely +// If there is a problem, returns an error message; otherwise, the empty string. +bool verify(const CLD2::ScoringTables* realData, const CLD2::ScoringTables* loadedData); + +// Return true iff the program is running in little-endian mode. +bool isLittleEndian(); + +// Return true iff the core size assumptions are ok on this platform. +bool coreAssumptionsOk(); + +} // End namespace CLD2DynamicData +#endif // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_ |