diff options
Diffstat (limited to 'application/basilisk/components/translation/cld2/internal/cldutil_shared.cc')
-rw-r--r-- | application/basilisk/components/translation/cld2/internal/cldutil_shared.cc | 437 |
1 files changed, 437 insertions, 0 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/cldutil_shared.cc b/application/basilisk/components/translation/cld2/internal/cldutil_shared.cc new file mode 100644 index 000000000..f111473af --- /dev/null +++ b/application/basilisk/components/translation/cld2/internal/cldutil_shared.cc @@ -0,0 +1,437 @@ +// Copyright 2013 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +// Author: dsites@google.com (Dick Sites) +// + +#include "cldutil_shared.h" +#include <string> + +#include "cld2tablesummary.h" +#include "integral_types.h" +#include "port.h" +#include "utf8statetable.h" + +namespace CLD2 { + +// Runtime routines for hashing, looking up, and scoring +// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams. +// Unigrams and bigrams are for CJK languages only, including simplified/ +// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and +// Zhuang Han characters. Surrounding spaces are not considered. +// Quadgrams and octagrams for for non-CJK and include two bits indicating +// preceding and trailing spaces (word boundaries). + + +// Indicator bits for leading/trailing space around quad/octagram +// NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of +// 1-, 2-, or 3-bytes each. +static const uint32 kPreSpaceIndicator = 0x00004444; +static const uint32 kPostSpaceIndicator = 0x44440000; + +// Little-endian masks for 0..24 bytes picked up as uint32's +static const uint32 kWordMask0[4] = { + 0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF +}; + +static const int kMinCJKUTF8CharBytes = 3; + +static const int kMinGramCount = 3; +static const int kMaxGramCount = 16; + +static const int UTFmax = 4; // Max number of bytes in a UTF-8 character + + +// Routines to access a hash table of <key:wordhash, value:probs> pairs +// Buckets have 4-byte wordhash for sizes < 32K buckets, but only +// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as +// bucket subscript. +// Probs is a packed: three languages plus a subscript for probability table +// Buckets have all the keys together, then all the values.Key array never +// crosses a cache-line boundary, so no-match case takes exactly one cache miss. +// Match case may sometimes take an additional cache miss on value access. +// +// Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64 +// byte buckets with single cache miss. +// Or 2-byte key and 6-byte value, allowing 5 languages instead of three. + + +//----------------------------------------------------------------------------// +// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores // +//----------------------------------------------------------------------------// + +// Design principles for these hash functions +// - Few operations +// - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in +// Latin script expect 1- and 2-byte mixtures. +// - Last byte of each character has about 5 bits of information +// - Spread good bits around so they can interact in at least two ways +// with other characters +// - Use add for additional mixing thorugh carries + +// CJK Three-byte bigram +// ....dddd..cccccc..bbbbbb....aaaa +// ..................ffffff..eeeeee +// make +// ....dddd..cccccc..bbbbbb....aaaa +// 000....dddd..cccccc..bbbbbb....a +// ..................ffffff..eeeeee +// ffffff..eeeeee000000000000000000 +// +// CJK Four-byte bigram +// ..dddddd..cccccc....bbbb....aaaa +// ..hhhhhh..gggggg....ffff....eeee +// make +// ..dddddd..cccccc....bbbb....aaaa +// 000..dddddd..cccccc....bbbb....a +// ..hhhhhh..gggggg....ffff....eeee +// ..ffff....eeee000000000000000000 + +// BIGRAM +// Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post +// OVERSHOOTS up to 3 bytes +// For runtime use of tables +// Does X86 unaligned loads +uint32 BiHashV2(const char* word_ptr, int bytecount) { + if (bytecount == 0) {return 0;} + const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr); + uint32 word0, word1; + if (bytecount <= 4) { + word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3]; + word0 = word0 ^ (word0 >> 3); + return word0; + } + // Else do 8 bytes + word0 = UNALIGNED_LOAD32(word_ptr32); + word0 = word0 ^ (word0 >> 3); + word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3]; + word1 = word1 ^ (word1 << 18); + return word0 + word1; +} + +// +// Ascii-7 One-byte chars +// ...ddddd...ccccc...bbbbb...aaaaa +// make +// ...ddddd...ccccc...bbbbb...aaaaa +// 000...ddddd...ccccc...bbbbb...aa +// +// Latin 1- and 2-byte chars +// ...ddddd...ccccc...bbbbb...aaaaa +// ...................fffff...eeeee +// make +// ...ddddd...ccccc...bbbbb...aaaaa +// 000...ddddd...ccccc...bbbbb...aa +// ...................fffff...eeeee +// ...............fffff...eeeee0000 +// +// Non-CJK Two-byte chars +// ...ddddd...........bbbbb........ +// ...hhhhh...........fffff........ +// make +// ...ddddd...........bbbbb........ +// 000...ddddd...........bbbbb..... +// ...hhhhh...........fffff........ +// hhhh...........fffff........0000 +// +// Non-CJK Three-byte chars +// ...........ccccc................ +// ...................fffff........ +// ...lllll...................iiiii +// make +// ...........ccccc................ +// 000...........ccccc............. +// ...................fffff........ +// ...............fffff........0000 +// ...lllll...................iiiii +// .lllll...................iiiii00 +// + +// QUADGRAM +// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add +// OVERSHOOTS up to 3 bytes +// For runtime use of tables +// Does X86 unaligned loads +uint32 QuadHashV2Mix(const char* word_ptr, int bytecount, uint32 prepost) { + const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr); + uint32 word0, word1, word2; + if (bytecount <= 4) { + word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3]; + word0 = word0 ^ (word0 >> 3); + return word0 ^ prepost; + } else if (bytecount <= 8) { + word0 = UNALIGNED_LOAD32(word_ptr32); + word0 = word0 ^ (word0 >> 3); + word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3]; + word1 = word1 ^ (word1 << 4); + return (word0 ^ prepost) + word1; + } + // else do 12 bytes + word0 = UNALIGNED_LOAD32(word_ptr32); + word0 = word0 ^ (word0 >> 3); + word1 = UNALIGNED_LOAD32(word_ptr32 + 1); + word1 = word1 ^ (word1 << 4); + word2 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3]; + word2 = word2 ^ (word2 << 2); + return (word0 ^ prepost) + word1 + word2; +} + + +// QUADGRAM wrapper with surrounding spaces +// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add +// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes +// For runtime use of tables +uint32 QuadHashV2(const char* word_ptr, int bytecount) { + if (bytecount == 0) {return 0;} + uint32 prepost = 0; + if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} + if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} + return QuadHashV2Mix(word_ptr, bytecount, prepost); +} + +// QUADGRAM wrapper with surrounding underscores (offline use) +// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add +// OVERSHOOTS up to 3 bytes +// For offline construction of tables +uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount) { + if (bytecount == 0) {return 0;} + const char* local_word_ptr = word_ptr; + int local_bytecount = bytecount; + uint32 prepost = 0; + if (local_word_ptr[0] == '_') { + prepost |= kPreSpaceIndicator; + ++local_word_ptr; + --local_bytecount; + } + if (local_word_ptr[local_bytecount - 1] == '_') { + prepost |= kPostSpaceIndicator; + --local_bytecount; + } + return QuadHashV2Mix(local_word_ptr, local_bytecount, prepost); +} + + +// OCTAGRAM +// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add +// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes +// +// The low 32 bits follow the pattern from above, tuned to different scripts +// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each +// For runtime use of tables V3 +// Does X86 unaligned loads +uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) { + const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr); + uint64 word0; + uint64 word1; + uint64 sum; + + if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} + if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} + switch ((bytecount - 1) >> 2) { + case 0: // 1..4 bytes + word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3]; + sum = word0; + word0 = word0 ^ (word0 >> 3); + break; + case 1: // 5..8 bytes + word0 = UNALIGNED_LOAD32(word_ptr32); + sum = word0; + word0 = word0 ^ (word0 >> 3); + word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3]; + sum += word1; + word1 = word1 ^ (word1 << 4); + word0 += word1; + break; + case 2: // 9..12 bytes + word0 = UNALIGNED_LOAD32(word_ptr32); + sum = word0; + word0 = word0 ^ (word0 >> 3); + word1 = UNALIGNED_LOAD32(word_ptr32 + 1); + sum += word1; + word1 = word1 ^ (word1 << 4); + word0 += word1; + word1 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3]; + sum += word1; + word1 = word1 ^ (word1 << 2); + word0 += word1; + break; + case 3: // 13..16 bytes + word0 =UNALIGNED_LOAD32(word_ptr32); + sum = word0; + word0 = word0 ^ (word0 >> 3); + word1 = UNALIGNED_LOAD32(word_ptr32 + 1); + sum += word1; + word1 = word1 ^ (word1 << 4); + word0 += word1; + word1 = UNALIGNED_LOAD32(word_ptr32 + 2); + sum += word1; + word1 = word1 ^ (word1 << 2); + word0 += word1; + word1 = UNALIGNED_LOAD32(word_ptr32 + 3) & kWordMask0[bytecount & 3]; + sum += word1; + word1 = word1 ^ (word1 >> 8); + word0 += word1; + break; + case 4: // 17..20 bytes + word0 = UNALIGNED_LOAD32(word_ptr32); + sum = word0; + word0 = word0 ^ (word0 >> 3); + word1 = UNALIGNED_LOAD32(word_ptr32 + 1); + sum += word1; + word1 = word1 ^ (word1 << 4); + word0 += word1; + word1 = UNALIGNED_LOAD32(word_ptr32 + 2); + sum += word1; + word1 = word1 ^ (word1 << 2); + word0 += word1; + word1 = UNALIGNED_LOAD32(word_ptr32 + 3); + sum += word1; + word1 = word1 ^ (word1 >> 8); + word0 += word1; + word1 = UNALIGNED_LOAD32(word_ptr32 + 4) & kWordMask0[bytecount & 3]; + sum += word1; + word1 = word1 ^ (word1 >> 4); + word0 += word1; + break; + default: // 21..24 bytes and higher (ignores beyond 24) + word0 = UNALIGNED_LOAD32(word_ptr32); + sum = word0; + word0 = word0 ^ (word0 >> 3); + word1 = UNALIGNED_LOAD32(word_ptr32 + 1); + sum += word1; + word1 = word1 ^ (word1 << 4); + word0 += word1; + word1 = UNALIGNED_LOAD32(word_ptr32 + 2); + sum += word1; + word1 = word1 ^ (word1 << 2); + word0 += word1; + word1 = UNALIGNED_LOAD32(word_ptr32 + 3); + sum += word1; + word1 = word1 ^ (word1 >> 8); + word0 += word1; + word1 = UNALIGNED_LOAD32(word_ptr32 + 4); + sum += word1; + word1 = word1 ^ (word1 >> 4); + word0 += word1; + word1 = UNALIGNED_LOAD32(word_ptr32 + 5) & kWordMask0[bytecount & 3]; + sum += word1; + word1 = word1 ^ (word1 >> 6); + word0 += word1; + break; + } + + sum += (sum >> 17); // extra 1-bit shift for bytes 2 & 3 + sum += (sum >> 9); // extra 1-bit shift for bytes 1 & 3 + sum = (sum & 0xff) << 32; + return (word0 ^ prepost) + sum; +} + +// OCTAGRAM wrapper with surrounding spaces +// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add +// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes +// +// The low 32 bits follow the pattern from above, tuned to different scripts +// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each +// For runtime use of tables V3 +uint64 OctaHash40(const char* word_ptr, int bytecount) { + if (bytecount == 0) {return 0;} + uint64 prepost = 0; + if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} + if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} + return OctaHash40Mix(word_ptr, bytecount, prepost); +} + + +// OCTAGRAM wrapper with surrounding underscores (offline use) +// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add +// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes +// +// The low 32 bits follow the pattern from above, tuned to different scripts +// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each +// For offline construction of tables +uint64 OctaHash40underscore(const char* word_ptr, int bytecount) { + if (bytecount == 0) {return 0;} + const char* local_word_ptr = word_ptr; + int local_bytecount = bytecount; + uint64 prepost = 0; + if (local_word_ptr[0] == '_') { + prepost |= kPreSpaceIndicator; + ++local_word_ptr; + --local_bytecount; + } + if (local_word_ptr[local_bytecount - 1] == '_') { + prepost |= kPostSpaceIndicator; + --local_bytecount; + } + return OctaHash40Mix(local_word_ptr, local_bytecount, prepost); +} + +// Hash a consecutive pair of tokens/words A B +// Old: hash is B - A, which gives too many false hits on one-char diffs +// Now: rotate(A,13) + B +uint64 PairHash(uint64 worda_hash, uint64 wordb_hash) { + return ((worda_hash >> 13) | (worda_hash << (64 - 13))) + wordb_hash; +} + + + + +//----------------------------------------------------------------------------// +// Finding groups of 1/2/4/8 letters // +//----------------------------------------------------------------------------// + +// src points to a letter. Find the byte length of a unigram starting there. +int UniLen(const char* src) { + const char* src_end = src; + src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; + return src_end - src; +} + +// src points to a letter. Find the byte length of a bigram starting there. +int BiLen(const char* src) { + const char* src_end = src; + src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; + src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; + return src_end - src; +} + +// src points to a letter. Find the byte length of a quadgram starting there. +int QuadLen(const char* src) { + const char* src_end = src; + src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; + src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; + src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; + src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; + return src_end - src; +} + +// src points to a letter. Find the byte length of an octagram starting there. +int OctaLen(const char* src) { + const char* src_end = src; + int charcount = 0; + while (src_end[0] != ' ') { + src_end += UTF8OneCharLen(src); + ++charcount; + if (charcount == 8) {break;} + } + return src_end - src; +} + +} // End namespace CLD2 + + + + + |