diff options
Diffstat (limited to 'application/basilisk/components/translation/cld2/internal/cldutil_shared.cc')
-rw-r--r-- | application/basilisk/components/translation/cld2/internal/cldutil_shared.cc | 437 |
1 files changed, 0 insertions, 437 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/cldutil_shared.cc b/application/basilisk/components/translation/cld2/internal/cldutil_shared.cc deleted file mode 100644 index f111473af..000000000 --- a/application/basilisk/components/translation/cld2/internal/cldutil_shared.cc +++ /dev/null @@ -1,437 +0,0 @@ -// Copyright 2013 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// -// Author: dsites@google.com (Dick Sites) -// - -#include "cldutil_shared.h" -#include <string> - -#include "cld2tablesummary.h" -#include "integral_types.h" -#include "port.h" -#include "utf8statetable.h" - -namespace CLD2 { - -// Runtime routines for hashing, looking up, and scoring -// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams. -// Unigrams and bigrams are for CJK languages only, including simplified/ -// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and -// Zhuang Han characters. Surrounding spaces are not considered. -// Quadgrams and octagrams for for non-CJK and include two bits indicating -// preceding and trailing spaces (word boundaries). - - -// Indicator bits for leading/trailing space around quad/octagram -// NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of -// 1-, 2-, or 3-bytes each. -static const uint32 kPreSpaceIndicator = 0x00004444; -static const uint32 kPostSpaceIndicator = 0x44440000; - -// Little-endian masks for 0..24 bytes picked up as uint32's -static const uint32 kWordMask0[4] = { - 0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF -}; - -static const int kMinCJKUTF8CharBytes = 3; - -static const int kMinGramCount = 3; -static const int kMaxGramCount = 16; - -static const int UTFmax = 4; // Max number of bytes in a UTF-8 character - - -// Routines to access a hash table of <key:wordhash, value:probs> pairs -// Buckets have 4-byte wordhash for sizes < 32K buckets, but only -// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as -// bucket subscript. -// Probs is a packed: three languages plus a subscript for probability table -// Buckets have all the keys together, then all the values.Key array never -// crosses a cache-line boundary, so no-match case takes exactly one cache miss. -// Match case may sometimes take an additional cache miss on value access. -// -// Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64 -// byte buckets with single cache miss. -// Or 2-byte key and 6-byte value, allowing 5 languages instead of three. - - -//----------------------------------------------------------------------------// -// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores // -//----------------------------------------------------------------------------// - -// Design principles for these hash functions -// - Few operations -// - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in -// Latin script expect 1- and 2-byte mixtures. -// - Last byte of each character has about 5 bits of information -// - Spread good bits around so they can interact in at least two ways -// with other characters -// - Use add for additional mixing thorugh carries - -// CJK Three-byte bigram -// ....dddd..cccccc..bbbbbb....aaaa -// ..................ffffff..eeeeee -// make -// ....dddd..cccccc..bbbbbb....aaaa -// 000....dddd..cccccc..bbbbbb....a -// ..................ffffff..eeeeee -// ffffff..eeeeee000000000000000000 -// -// CJK Four-byte bigram -// ..dddddd..cccccc....bbbb....aaaa -// ..hhhhhh..gggggg....ffff....eeee -// make -// ..dddddd..cccccc....bbbb....aaaa -// 000..dddddd..cccccc....bbbb....a -// ..hhhhhh..gggggg....ffff....eeee -// ..ffff....eeee000000000000000000 - -// BIGRAM -// Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post -// OVERSHOOTS up to 3 bytes -// For runtime use of tables -// Does X86 unaligned loads -uint32 BiHashV2(const char* word_ptr, int bytecount) { - if (bytecount == 0) {return 0;} - const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr); - uint32 word0, word1; - if (bytecount <= 4) { - word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3]; - word0 = word0 ^ (word0 >> 3); - return word0; - } - // Else do 8 bytes - word0 = UNALIGNED_LOAD32(word_ptr32); - word0 = word0 ^ (word0 >> 3); - word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3]; - word1 = word1 ^ (word1 << 18); - return word0 + word1; -} - -// -// Ascii-7 One-byte chars -// ...ddddd...ccccc...bbbbb...aaaaa -// make -// ...ddddd...ccccc...bbbbb...aaaaa -// 000...ddddd...ccccc...bbbbb...aa -// -// Latin 1- and 2-byte chars -// ...ddddd...ccccc...bbbbb...aaaaa -// ...................fffff...eeeee -// make -// ...ddddd...ccccc...bbbbb...aaaaa -// 000...ddddd...ccccc...bbbbb...aa -// ...................fffff...eeeee -// ...............fffff...eeeee0000 -// -// Non-CJK Two-byte chars -// ...ddddd...........bbbbb........ -// ...hhhhh...........fffff........ -// make -// ...ddddd...........bbbbb........ -// 000...ddddd...........bbbbb..... -// ...hhhhh...........fffff........ -// hhhh...........fffff........0000 -// -// Non-CJK Three-byte chars -// ...........ccccc................ -// ...................fffff........ -// ...lllll...................iiiii -// make -// ...........ccccc................ -// 000...........ccccc............. -// ...................fffff........ -// ...............fffff........0000 -// ...lllll...................iiiii -// .lllll...................iiiii00 -// - -// QUADGRAM -// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add -// OVERSHOOTS up to 3 bytes -// For runtime use of tables -// Does X86 unaligned loads -uint32 QuadHashV2Mix(const char* word_ptr, int bytecount, uint32 prepost) { - const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr); - uint32 word0, word1, word2; - if (bytecount <= 4) { - word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3]; - word0 = word0 ^ (word0 >> 3); - return word0 ^ prepost; - } else if (bytecount <= 8) { - word0 = UNALIGNED_LOAD32(word_ptr32); - word0 = word0 ^ (word0 >> 3); - word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3]; - word1 = word1 ^ (word1 << 4); - return (word0 ^ prepost) + word1; - } - // else do 12 bytes - word0 = UNALIGNED_LOAD32(word_ptr32); - word0 = word0 ^ (word0 >> 3); - word1 = UNALIGNED_LOAD32(word_ptr32 + 1); - word1 = word1 ^ (word1 << 4); - word2 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3]; - word2 = word2 ^ (word2 << 2); - return (word0 ^ prepost) + word1 + word2; -} - - -// QUADGRAM wrapper with surrounding spaces -// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add -// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes -// For runtime use of tables -uint32 QuadHashV2(const char* word_ptr, int bytecount) { - if (bytecount == 0) {return 0;} - uint32 prepost = 0; - if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} - if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} - return QuadHashV2Mix(word_ptr, bytecount, prepost); -} - -// QUADGRAM wrapper with surrounding underscores (offline use) -// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add -// OVERSHOOTS up to 3 bytes -// For offline construction of tables -uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount) { - if (bytecount == 0) {return 0;} - const char* local_word_ptr = word_ptr; - int local_bytecount = bytecount; - uint32 prepost = 0; - if (local_word_ptr[0] == '_') { - prepost |= kPreSpaceIndicator; - ++local_word_ptr; - --local_bytecount; - } - if (local_word_ptr[local_bytecount - 1] == '_') { - prepost |= kPostSpaceIndicator; - --local_bytecount; - } - return QuadHashV2Mix(local_word_ptr, local_bytecount, prepost); -} - - -// OCTAGRAM -// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add -// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes -// -// The low 32 bits follow the pattern from above, tuned to different scripts -// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each -// For runtime use of tables V3 -// Does X86 unaligned loads -uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) { - const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr); - uint64 word0; - uint64 word1; - uint64 sum; - - if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} - if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} - switch ((bytecount - 1) >> 2) { - case 0: // 1..4 bytes - word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3]; - sum = word0; - word0 = word0 ^ (word0 >> 3); - break; - case 1: // 5..8 bytes - word0 = UNALIGNED_LOAD32(word_ptr32); - sum = word0; - word0 = word0 ^ (word0 >> 3); - word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3]; - sum += word1; - word1 = word1 ^ (word1 << 4); - word0 += word1; - break; - case 2: // 9..12 bytes - word0 = UNALIGNED_LOAD32(word_ptr32); - sum = word0; - word0 = word0 ^ (word0 >> 3); - word1 = UNALIGNED_LOAD32(word_ptr32 + 1); - sum += word1; - word1 = word1 ^ (word1 << 4); - word0 += word1; - word1 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3]; - sum += word1; - word1 = word1 ^ (word1 << 2); - word0 += word1; - break; - case 3: // 13..16 bytes - word0 =UNALIGNED_LOAD32(word_ptr32); - sum = word0; - word0 = word0 ^ (word0 >> 3); - word1 = UNALIGNED_LOAD32(word_ptr32 + 1); - sum += word1; - word1 = word1 ^ (word1 << 4); - word0 += word1; - word1 = UNALIGNED_LOAD32(word_ptr32 + 2); - sum += word1; - word1 = word1 ^ (word1 << 2); - word0 += word1; - word1 = UNALIGNED_LOAD32(word_ptr32 + 3) & kWordMask0[bytecount & 3]; - sum += word1; - word1 = word1 ^ (word1 >> 8); - word0 += word1; - break; - case 4: // 17..20 bytes - word0 = UNALIGNED_LOAD32(word_ptr32); - sum = word0; - word0 = word0 ^ (word0 >> 3); - word1 = UNALIGNED_LOAD32(word_ptr32 + 1); - sum += word1; - word1 = word1 ^ (word1 << 4); - word0 += word1; - word1 = UNALIGNED_LOAD32(word_ptr32 + 2); - sum += word1; - word1 = word1 ^ (word1 << 2); - word0 += word1; - word1 = UNALIGNED_LOAD32(word_ptr32 + 3); - sum += word1; - word1 = word1 ^ (word1 >> 8); - word0 += word1; - word1 = UNALIGNED_LOAD32(word_ptr32 + 4) & kWordMask0[bytecount & 3]; - sum += word1; - word1 = word1 ^ (word1 >> 4); - word0 += word1; - break; - default: // 21..24 bytes and higher (ignores beyond 24) - word0 = UNALIGNED_LOAD32(word_ptr32); - sum = word0; - word0 = word0 ^ (word0 >> 3); - word1 = UNALIGNED_LOAD32(word_ptr32 + 1); - sum += word1; - word1 = word1 ^ (word1 << 4); - word0 += word1; - word1 = UNALIGNED_LOAD32(word_ptr32 + 2); - sum += word1; - word1 = word1 ^ (word1 << 2); - word0 += word1; - word1 = UNALIGNED_LOAD32(word_ptr32 + 3); - sum += word1; - word1 = word1 ^ (word1 >> 8); - word0 += word1; - word1 = UNALIGNED_LOAD32(word_ptr32 + 4); - sum += word1; - word1 = word1 ^ (word1 >> 4); - word0 += word1; - word1 = UNALIGNED_LOAD32(word_ptr32 + 5) & kWordMask0[bytecount & 3]; - sum += word1; - word1 = word1 ^ (word1 >> 6); - word0 += word1; - break; - } - - sum += (sum >> 17); // extra 1-bit shift for bytes 2 & 3 - sum += (sum >> 9); // extra 1-bit shift for bytes 1 & 3 - sum = (sum & 0xff) << 32; - return (word0 ^ prepost) + sum; -} - -// OCTAGRAM wrapper with surrounding spaces -// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add -// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes -// -// The low 32 bits follow the pattern from above, tuned to different scripts -// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each -// For runtime use of tables V3 -uint64 OctaHash40(const char* word_ptr, int bytecount) { - if (bytecount == 0) {return 0;} - uint64 prepost = 0; - if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} - if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} - return OctaHash40Mix(word_ptr, bytecount, prepost); -} - - -// OCTAGRAM wrapper with surrounding underscores (offline use) -// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add -// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes -// -// The low 32 bits follow the pattern from above, tuned to different scripts -// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each -// For offline construction of tables -uint64 OctaHash40underscore(const char* word_ptr, int bytecount) { - if (bytecount == 0) {return 0;} - const char* local_word_ptr = word_ptr; - int local_bytecount = bytecount; - uint64 prepost = 0; - if (local_word_ptr[0] == '_') { - prepost |= kPreSpaceIndicator; - ++local_word_ptr; - --local_bytecount; - } - if (local_word_ptr[local_bytecount - 1] == '_') { - prepost |= kPostSpaceIndicator; - --local_bytecount; - } - return OctaHash40Mix(local_word_ptr, local_bytecount, prepost); -} - -// Hash a consecutive pair of tokens/words A B -// Old: hash is B - A, which gives too many false hits on one-char diffs -// Now: rotate(A,13) + B -uint64 PairHash(uint64 worda_hash, uint64 wordb_hash) { - return ((worda_hash >> 13) | (worda_hash << (64 - 13))) + wordb_hash; -} - - - - -//----------------------------------------------------------------------------// -// Finding groups of 1/2/4/8 letters // -//----------------------------------------------------------------------------// - -// src points to a letter. Find the byte length of a unigram starting there. -int UniLen(const char* src) { - const char* src_end = src; - src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; - return src_end - src; -} - -// src points to a letter. Find the byte length of a bigram starting there. -int BiLen(const char* src) { - const char* src_end = src; - src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; - src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; - return src_end - src; -} - -// src points to a letter. Find the byte length of a quadgram starting there. -int QuadLen(const char* src) { - const char* src_end = src; - src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; - src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; - src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; - src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; - return src_end - src; -} - -// src points to a letter. Find the byte length of an octagram starting there. -int OctaLen(const char* src) { - const char* src_end = src; - int charcount = 0; - while (src_end[0] != ' ') { - src_end += UTF8OneCharLen(src); - ++charcount; - if (charcount == 8) {break;} - } - return src_end - src; -} - -} // End namespace CLD2 - - - - - |