diff options
Diffstat (limited to 'application/basilisk/components/translation/cld2/internal/scoreonescriptspan.cc')
-rw-r--r-- | application/basilisk/components/translation/cld2/internal/scoreonescriptspan.cc | 1334 |
1 files changed, 0 insertions, 1334 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/scoreonescriptspan.cc b/application/basilisk/components/translation/cld2/internal/scoreonescriptspan.cc deleted file mode 100644 index b2cebc02e..000000000 --- a/application/basilisk/components/translation/cld2/internal/scoreonescriptspan.cc +++ /dev/null @@ -1,1334 +0,0 @@ -// Copyright 2013 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// -// Author: dsites@google.com (Dick Sites) -// Updated 2014.01 for dual table lookup -// - -#include "scoreonescriptspan.h" - -#include "cldutil.h" -#include "debug.h" -#include "lang_script.h" - -#include <stdio.h> - -using namespace std; - -namespace CLD2 { - -static const int kUnreliablePercentThreshold = 75; - -void AddLangProb(uint32 langprob, Tote* chunk_tote) { - ProcessProbV2Tote(langprob, chunk_tote); -} - -void ZeroPSLang(uint32 langprob, Tote* chunk_tote) { - uint8 top1 = (langprob >> 8) & 0xff; - chunk_tote->SetScore(top1, 0); -} - -bool SameCloseSet(uint16 lang1, uint16 lang2) { - int lang1_close_set = LanguageCloseSet(static_cast<Language>(lang1)); - if (lang1_close_set == 0) {return false;} - int lang2_close_set = LanguageCloseSet(static_cast<Language>(lang2)); - return (lang1_close_set == lang2_close_set); -} - -bool SameCloseSet(Language lang1, Language lang2) { - int lang1_close_set = LanguageCloseSet(lang1); - if (lang1_close_set == 0) {return false;} - int lang2_close_set = LanguageCloseSet(lang2); - return (lang1_close_set == lang2_close_set); -} - - -// Needs expected score per 1KB in scoring context -void SetChunkSummary(ULScript ulscript, int first_linear_in_chunk, - int offset, int len, - const ScoringContext* scoringcontext, - const Tote* chunk_tote, - ChunkSummary* chunksummary) { - int key3[3]; - chunk_tote->CurrentTopThreeKeys(key3); - Language lang1 = FromPerScriptNumber(ulscript, key3[0]); - Language lang2 = FromPerScriptNumber(ulscript, key3[1]); - - int actual_score_per_kb = 0; - if (len > 0) { - actual_score_per_kb = (chunk_tote->GetScore(key3[0]) << 10) / len; - } - int expected_subscr = lang1 * 4 + LScript4(ulscript); - int expected_score_per_kb = - scoringcontext->scoringtables->kExpectedScore[expected_subscr]; - - chunksummary->offset = offset; - chunksummary->chunk_start = first_linear_in_chunk; - chunksummary->lang1 = lang1; - chunksummary->lang2 = lang2; - chunksummary->score1 = chunk_tote->GetScore(key3[0]); - chunksummary->score2 = chunk_tote->GetScore(key3[1]); - chunksummary->bytes = len; - chunksummary->grams = chunk_tote->GetScoreCount(); - chunksummary->ulscript = ulscript; - chunksummary->reliability_delta = ReliabilityDelta(chunksummary->score1, - chunksummary->score2, - chunksummary->grams); - // If lang1/lang2 in same close set, set delta reliability to 100% - if (SameCloseSet(lang1, lang2)) { - chunksummary->reliability_delta = 100; - } - chunksummary->reliability_score = - ReliabilityExpected(actual_score_per_kb, expected_score_per_kb); -} - -// Return true if just lang1 is there: lang2=0 and lang3=0 -bool IsSingleLang(uint32 langprob) { - // Probably a bug -- which end is lang1? But only used to call empty Boost1 - return ((langprob & 0x00ffff00) == 0); -} - -// Update scoring context distinct_boost for single language quad -void AddDistinctBoost1(uint32 langprob, ScoringContext* scoringcontext) { - // Probably keep this empty -- not a good enough signal -} - -// Update scoring context distinct_boost for distinct octagram -// Keep last 4 used. Since these are mostly (except at splices) in -// hitbuffer, we might be able to just use a subscript and splice -void AddDistinctBoost2(uint32 langprob, ScoringContext* scoringcontext) { -// this is called 0..n times per chunk with decoded hitbuffer->distinct... - LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn; - if (scoringcontext->ulscript != ULScript_Latin) { - distinct_boost = &scoringcontext->distinct_boost.othr; - } - int n = distinct_boost->n; - distinct_boost->langprob[n] = langprob; - distinct_boost->n = distinct_boost->wrap(n + 1); -} - -// For each chunk, add extra weight for language priors (from content-lang and -// meta lang=xx) and distinctive tokens -void ScoreBoosts(const ScoringContext* scoringcontext, Tote* chunk_tote) { - // Get boosts for current script - const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn; - const LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn; - const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn; - if (scoringcontext->ulscript != ULScript_Latin) { - langprior_boost = &scoringcontext->langprior_boost.othr; - langprior_whack = &scoringcontext->langprior_whack.othr; - distinct_boost = &scoringcontext->distinct_boost.othr; - } - - for (int k = 0; k < kMaxBoosts; ++k) { - uint32 langprob = langprior_boost->langprob[k]; - if (langprob > 0) {AddLangProb(langprob, chunk_tote);} - } - for (int k = 0; k < kMaxBoosts; ++k) { - uint32 langprob = distinct_boost->langprob[k]; - if (langprob > 0) {AddLangProb(langprob, chunk_tote);} - } - // boost has a packed set of per-script langs and probabilites - // whack has a packed set of per-script lang to be suppressed (zeroed) - // When a language in a close set is given as an explicit hint, others in - // that set will be whacked here. - for (int k = 0; k < kMaxBoosts; ++k) { - uint32 langprob = langprior_whack->langprob[k]; - if (langprob > 0) {ZeroPSLang(langprob, chunk_tote);} - } -} - - - -// At this point, The chunk is described by -// hitbuffer->base[cspan->chunk_base .. cspan->chunk_base + cspan->base_len) -// hitbuffer->delta[cspan->chunk_delta ... ) -// hitbuffer->distinct[cspan->chunk_distinct ... ) -// Scored text is in text[lo..hi) where -// lo is 0 or the min of first base/delta/distinct hitbuffer offset and -// hi is the min of next base/delta/distinct hitbuffer offset after -// base_len, etc. -void GetTextSpanOffsets(const ScoringHitBuffer* hitbuffer, - const ChunkSpan* cspan, int* lo, int* hi) { - // Front of this span - int lo_base = hitbuffer->base[cspan->chunk_base].offset; - int lo_delta = hitbuffer->delta[cspan->chunk_delta].offset; - int lo_distinct = hitbuffer->distinct[cspan->chunk_distinct].offset; - // Front of next span - int hi_base = hitbuffer->base[cspan->chunk_base + - cspan->base_len].offset; - int hi_delta = hitbuffer->delta[cspan->chunk_delta + - cspan->delta_len].offset; - int hi_distinct = hitbuffer->distinct[cspan->chunk_distinct + - cspan->distinct_len].offset; - - *lo = 0; -// if (cspan->chunk_base > 0) { -// *lo = minint(minint(lo_base, lo_delta), lo_distinct); -// } - *lo = minint(minint(lo_base, lo_delta), lo_distinct); - *hi = minint(minint(hi_base, hi_delta), hi_distinct); -} - - -int DiffScore(const CLD2TableSummary* obj, int indirect, - uint16 lang1, uint16 lang2) { - if (indirect < static_cast<int>(obj->kCLDTableSizeOne)) { - // Up to three languages at indirect - uint32 langprob = obj->kCLDTableInd[indirect]; - return GetLangScore(langprob, lang1) - GetLangScore(langprob, lang2); - } else { - // Up to six languages at start + 2 * (indirect - start) - indirect += (indirect - obj->kCLDTableSizeOne); - uint32 langprob = obj->kCLDTableInd[indirect]; - uint32 langprob2 = obj->kCLDTableInd[indirect + 1]; - return (GetLangScore(langprob, lang1) + GetLangScore(langprob2, lang1)) - - (GetLangScore(langprob, lang2) + GetLangScore(langprob2, lang2)); - } - -} - -// Score all the bases, deltas, distincts, boosts for one chunk into chunk_tote -// After last chunk there is always a hitbuffer entry with an offset just off -// the end of the text. -// Sets delta_len, and distinct_len -void ScoreOneChunk(const char* text, ULScript ulscript, - const ScoringHitBuffer* hitbuffer, - int chunk_i, - ScoringContext* scoringcontext, - ChunkSpan* cspan, Tote* chunk_tote, - ChunkSummary* chunksummary) { - int first_linear_in_chunk = hitbuffer->chunk_start[chunk_i]; - int first_linear_in_next_chunk = hitbuffer->chunk_start[chunk_i + 1]; - - chunk_tote->Reinit(); - cspan->delta_len = 0; - cspan->distinct_len = 0; - if (scoringcontext->flags_cld2_verbose) { - fprintf(scoringcontext->debug_file, "<br>ScoreOneChunk[%d..%d) ", - first_linear_in_chunk, first_linear_in_next_chunk); - } - - // 2013.02.05 linear design: just use base and base_len for the span - cspan->chunk_base = first_linear_in_chunk; - cspan->base_len = first_linear_in_next_chunk - first_linear_in_chunk; - for (int i = first_linear_in_chunk; i < first_linear_in_next_chunk; ++i) { - uint32 langprob = hitbuffer->linear[i].langprob; - AddLangProb(langprob, chunk_tote); - if (hitbuffer->linear[i].type <= QUADHIT) { - chunk_tote->AddScoreCount(); // Just count quads, not octas - } - if (hitbuffer->linear[i].type == DISTINCTHIT) { - AddDistinctBoost2(langprob, scoringcontext); - } - } - - // Score language prior boosts - // Score distinct word boost - ScoreBoosts(scoringcontext, chunk_tote); - - int lo = hitbuffer->linear[first_linear_in_chunk].offset; - int hi = hitbuffer->linear[first_linear_in_next_chunk].offset; - - // Chunk_tote: get top langs, scores, etc. and fill in chunk summary - SetChunkSummary(ulscript, first_linear_in_chunk, lo, hi - lo, - scoringcontext, chunk_tote, chunksummary); - - bool more_to_come = false; - bool score_cjk = false; - if (scoringcontext->flags_cld2_html) { - // Show one chunk in readable output - CLD2_Debug(text, lo, hi, more_to_come, score_cjk, hitbuffer, - scoringcontext, cspan, chunksummary); - } - - scoringcontext->prior_chunk_lang = static_cast<Language>(chunksummary->lang1); -} - - -// Score chunks of text described by hitbuffer, allowing each to be in a -// different language, and optionally adjusting the boundaries inbetween. -// Set last_cspan to the last chunkspan used -void ScoreAllHits(const char* text, ULScript ulscript, - bool more_to_come, bool score_cjk, - const ScoringHitBuffer* hitbuffer, - ScoringContext* scoringcontext, - SummaryBuffer* summarybuffer, ChunkSpan* last_cspan) { - ChunkSpan prior_cspan = {0, 0, 0, 0, 0, 0}; - ChunkSpan cspan = {0, 0, 0, 0, 0, 0}; - - for (int i = 0; i < hitbuffer->next_chunk_start; ++i) { - // Score one chunk - // Sets delta_len, and distinct_len - Tote chunk_tote; - ChunkSummary chunksummary; - ScoreOneChunk(text, ulscript, - hitbuffer, i, - scoringcontext, &cspan, &chunk_tote, &chunksummary); - - // Put result in summarybuffer - if (summarybuffer->n < kMaxSummaries) { - summarybuffer->chunksummary[summarybuffer->n] = chunksummary; - summarybuffer->n += 1; - } - - prior_cspan = cspan; - cspan.chunk_base += cspan.base_len; - cspan.chunk_delta += cspan.delta_len; - cspan.chunk_distinct += cspan.distinct_len; - } - - // Add one dummy off the end to hold first unused linear_in_chunk - int linear_off_end = hitbuffer->next_linear; - int offset_off_end = hitbuffer->linear[linear_off_end].offset; - ChunkSummary* cs = &summarybuffer->chunksummary[summarybuffer->n]; - memset(cs, 0, sizeof(ChunkSummary)); - cs->offset = offset_off_end; - cs->chunk_start = linear_off_end; - *last_cspan = prior_cspan; -} - - -void SummaryBufferToDocTote(const SummaryBuffer* summarybuffer, - bool more_to_come, DocTote* doc_tote) { - int cs_bytes_sum = 0; - for (int i = 0; i < summarybuffer->n; ++i) { - const ChunkSummary* cs = &summarybuffer->chunksummary[i]; - int reliability = minint(cs->reliability_delta, cs->reliability_score); - // doc_tote uses full languages - doc_tote->Add(cs->lang1, cs->bytes, cs->score1, reliability); - cs_bytes_sum += cs->bytes; - } -} - -// Turn on for debugging vectors -static const bool kShowLettersOriginal = false; - - -// If next chunk language matches last vector language, extend last element -// Otherwise add new element to vector -void ItemToVector(ScriptScanner* scanner, - ResultChunkVector* vec, Language new_lang, - int mapped_offset, int mapped_len) { - uint16 last_vec_lang = static_cast<uint16>(UNKNOWN_LANGUAGE); - int last_vec_subscr = vec->size() - 1; - if (last_vec_subscr >= 0) { - ResultChunk* priorrc = &(*vec)[last_vec_subscr]; - last_vec_lang = priorrc->lang1; - if (new_lang == last_vec_lang) { - // Extend prior. Current mapped_offset may be beyond prior end, so do - // the arithmetic to include any such gap - priorrc->bytes = minint((mapped_offset + mapped_len) - priorrc->offset, - kMaxResultChunkBytes); - if (kShowLettersOriginal) { - // Optionally print the new chunk original text - string temp2(&scanner->GetBufferStart()[priorrc->offset], - priorrc->bytes); - fprintf(stderr, "Item[%d..%d) '%s'<br>\n", - priorrc->offset, priorrc->offset + priorrc->bytes, - GetHtmlEscapedText(temp2).c_str()); - } - return; - } - } - // Add new vector element - ResultChunk rc; - rc.offset = mapped_offset; - rc.bytes = minint(mapped_len, kMaxResultChunkBytes); - rc.lang1 = static_cast<uint16>(new_lang); - vec->push_back(rc); - if (kShowLettersOriginal) { - // Optionally print the new chunk original text - string temp2(&scanner->GetBufferStart()[rc.offset], rc.bytes); - fprintf(stderr, "Item[%d..%d) '%s'<br>\n", - rc.offset, rc.offset + rc.bytes, - GetHtmlEscapedText(temp2).c_str()); - } -} - -uint16 PriorVecLang(const ResultChunkVector* vec) { - if (vec->empty()) {return static_cast<uint16>(UNKNOWN_LANGUAGE);} - return (*vec)[vec->size() - 1].lang1; -} - -uint16 NextChunkLang(const SummaryBuffer* summarybuffer, int i) { - if ((i + 1) >= summarybuffer->n) { - return static_cast<uint16>(UNKNOWN_LANGUAGE); - } - return summarybuffer->chunksummary[i + 1].lang1; -} - - - -// Add n elements of summarybuffer to resultchunk vector: -// Each element is letters-only text [offset..offset+bytes) -// This maps back to original[Back(offset)..Back(offset+bytes)) -// -// We go out of our way to minimize the variation in the ResultChunkVector, -// so that the caller has fewer but more meaningful spans in different -// lanaguges, for the likely purpose of translation or spell-check. -// -// The language of each chunk is lang1, but it might be unreliable for -// either of two reasons: its score is relatively too close to the score of -// lang2, or its score is too far away from the expected score of real text in -// the given language. Unreliable languages are mapped to Unknown. -// -void SummaryBufferToVector(ScriptScanner* scanner, const char* text, - const SummaryBuffer* summarybuffer, - bool more_to_come, ResultChunkVector* vec) { - if (vec == NULL) {return;} - - if (kShowLettersOriginal) { - fprintf(stderr, "map2original_ "); - scanner->map2original_.DumpWindow(); - fprintf(stderr, "<br>\n"); - fprintf(stderr, "map2uplow_ "); - scanner->map2uplow_.DumpWindow(); - fprintf(stderr, "<br>\n"); - } - - for (int i = 0; i < summarybuffer->n; ++i) { - const ChunkSummary* cs = &summarybuffer->chunksummary[i]; - int unmapped_offset = cs->offset; - int unmapped_len = cs->bytes; - - if (kShowLettersOriginal) { - // Optionally print the chunk lowercase letters/marks text - string temp(&text[unmapped_offset], unmapped_len); - fprintf(stderr, "Letters [%d..%d) '%s'<br>\n", - unmapped_offset, unmapped_offset + unmapped_len, - GetHtmlEscapedText(temp).c_str()); - } - - int mapped_offset = scanner->MapBack(unmapped_offset); - - // Trim back a little to prefer splicing original at word boundaries - if (mapped_offset > 0) { - // Size of prior vector entry, if any - int prior_size = 0; - if (!vec->empty()) { - ResultChunk* rc = &(*vec)[vec->size() - 1]; - prior_size = rc->bytes; - } - // Maximum back up size to leave at least 3 bytes in prior, - // and not entire buffer, and no more than 12 bytes total backup - int n_limit = minint(prior_size - 3, mapped_offset); - n_limit = minint(n_limit, 12); - - // Backscan over letters, stopping if prior byte is < 0x41 - // There is some possibility that we will backscan over a different script - const char* s = &scanner->GetBufferStart()[mapped_offset]; - const unsigned char* us = reinterpret_cast<const unsigned char*>(s); - int n = 0; - while ((n < n_limit) && (us[-n - 1] >= 0x41)) {++n;} - if (n >= n_limit) {n = 0;} // New boundary not found within range - - // Also back up exactly one leading punctuation character if '"#@ - if (n < n_limit) { - unsigned char c = us[-n - 1]; - if ((c == '\'') || (c == '"') || (c == '#') || (c == '@')) {++n;} - } - // Shrink the previous chunk slightly - if (n > 0) { - ResultChunk* rc = &(*vec)[vec->size() - 1]; - rc->bytes -= n; - mapped_offset -= n; - if (kShowLettersOriginal) { - fprintf(stderr, "Back up %d bytes<br>\n", n); - // Optionally print the prior chunk original text - string temp2(&scanner->GetBufferStart()[rc->offset], rc->bytes); - fprintf(stderr, "Prior [%d..%d) '%s'<br>\n", - rc->offset, rc->offset + rc->bytes, - GetHtmlEscapedText(temp2).c_str()); - } - } - } - - int mapped_len = - scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset; - - if (kShowLettersOriginal) { - // Optionally print the chunk original text - string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len); - fprintf(stderr, "Original[%d..%d) '%s'<br>\n", - mapped_offset, mapped_offset + mapped_len, - GetHtmlEscapedText(temp2).c_str()); - } - - Language new_lang = static_cast<Language>(cs->lang1); - bool reliability_delta_bad = - (cs->reliability_delta < kUnreliablePercentThreshold); - bool reliability_score_bad = - (cs->reliability_score < kUnreliablePercentThreshold); - - // If the top language matches last vector, ignore reliability_delta - uint16 prior_lang = PriorVecLang(vec); - if (prior_lang == cs->lang1) { - reliability_delta_bad = false; - } - // If the top language is in same close set as last vector, set up to merge - if (SameCloseSet(cs->lang1, prior_lang)) { - new_lang = static_cast<Language>(prior_lang); - reliability_delta_bad = false; - } - // If the top two languages are in the same close set and the last vector - // language is the second language, set up to merge - if (SameCloseSet(cs->lang1, cs->lang2) && - (prior_lang == cs->lang2)) { - new_lang = static_cast<Language>(prior_lang); - reliability_delta_bad = false; - } - // If unreliable and the last and next vector languages are both - // the second language, set up to merge - uint16 next_lang = NextChunkLang(summarybuffer, i); - if (reliability_delta_bad && - (prior_lang == cs->lang2) && (next_lang == cs->lang2)) { - new_lang = static_cast<Language>(prior_lang); - reliability_delta_bad = false; - } - - if (reliability_delta_bad || reliability_score_bad) { - new_lang = UNKNOWN_LANGUAGE; - } - ItemToVector(scanner, vec, new_lang, mapped_offset, mapped_len); - } -} - -// Add just one element to resultchunk vector: -// For RTypeNone or RTypeOne -void JustOneItemToVector(ScriptScanner* scanner, const char* text, - Language lang1, int unmapped_offset, int unmapped_len, - ResultChunkVector* vec) { - if (vec == NULL) {return;} - - if (kShowLettersOriginal) { - fprintf(stderr, "map2original_ "); - scanner->map2original_.DumpWindow(); - fprintf(stderr, "<br>\n"); - fprintf(stderr, "map2uplow_ "); - scanner->map2uplow_.DumpWindow(); - fprintf(stderr, "<br>\n"); - } - - if (kShowLettersOriginal) { - // Optionally print the chunk lowercase letters/marks text - string temp(&text[unmapped_offset], unmapped_len); - fprintf(stderr, "Letters1 [%d..%d) '%s'<br>\n", - unmapped_offset, unmapped_offset + unmapped_len, - GetHtmlEscapedText(temp).c_str()); - } - - int mapped_offset = scanner->MapBack(unmapped_offset); - int mapped_len = - scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset; - - if (kShowLettersOriginal) { - // Optionally print the chunk original text - string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len); - fprintf(stderr, "Original1[%d..%d) '%s'<br>\n", - mapped_offset, mapped_offset + mapped_len, - GetHtmlEscapedText(temp2).c_str()); - } - - ItemToVector(scanner, vec, lang1, mapped_offset, mapped_len); -} - - -// Debugging. Not thread safe. Defined in getonescriptspan -char* DisplayPiece(const char* next_byte_, int byte_length_); - -// If high bit is on, take out high bit and add 2B to make table2 entries easy -inline int PrintableIndirect(int x) { - if ((x & 0x80000000u) != 0) { - return (x & ~0x80000000u) + 2000000000; - } - return x; -} -void DumpHitBuffer(FILE* df, const char* text, - const ScoringHitBuffer* hitbuffer) { - fprintf(df, - "<br>DumpHitBuffer[%s, next_base/delta/distinct %d, %d, %d)<br>\n", - ULScriptCode(hitbuffer->ulscript), - hitbuffer->next_base, hitbuffer->next_delta, - hitbuffer->next_distinct); - for (int i = 0; i < hitbuffer->maxscoringhits; ++i) { - if (i < hitbuffer->next_base) { - fprintf(df, "Q[%d]%d,%d,%s ", - i, hitbuffer->base[i].offset, - PrintableIndirect(hitbuffer->base[i].indirect), - DisplayPiece(&text[hitbuffer->base[i].offset], 6)); - } - if (i < hitbuffer->next_delta) { - fprintf(df, "DL[%d]%d,%d,%s ", - i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect, - DisplayPiece(&text[hitbuffer->delta[i].offset], 12)); - } - if (i < hitbuffer->next_distinct) { - fprintf(df, "D[%d]%d,%d,%s ", - i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect, - DisplayPiece(&text[hitbuffer->distinct[i].offset], 12)); - } - if (i < hitbuffer->next_base) { - fprintf(df, "<br>\n"); - } - if (i > 50) {break;} - } - if (hitbuffer->next_base > 50) { - int i = hitbuffer->next_base; - fprintf(df, "Q[%d]%d,%d,%s ", - i, hitbuffer->base[i].offset, - PrintableIndirect(hitbuffer->base[i].indirect), - DisplayPiece(&text[hitbuffer->base[i].offset], 6)); - } - if (hitbuffer->next_delta > 50) { - int i = hitbuffer->next_delta; - fprintf(df, "DL[%d]%d,%d,%s ", - i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect, - DisplayPiece(&text[hitbuffer->delta[i].offset], 12)); - } - if (hitbuffer->next_distinct > 50) { - int i = hitbuffer->next_distinct; - fprintf(df, "D[%d]%d,%d,%s ", - i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect, - DisplayPiece(&text[hitbuffer->distinct[i].offset], 12)); - } - fprintf(df, "<br>\n"); -} - - -void DumpLinearBuffer(FILE* df, const char* text, - const ScoringHitBuffer* hitbuffer) { - fprintf(df, "<br>DumpLinearBuffer[%d)<br>\n", - hitbuffer->next_linear); - // Include the dummy entry off the end - for (int i = 0; i < hitbuffer->next_linear + 1; ++i) { - if ((50 < i) && (i < (hitbuffer->next_linear - 1))) {continue;} - fprintf(df, "[%d]%d,%c=%08x,%s<br>\n", - i, hitbuffer->linear[i].offset, - "UQLD"[hitbuffer->linear[i].type], - hitbuffer->linear[i].langprob, - DisplayPiece(&text[hitbuffer->linear[i].offset], 6)); - } - fprintf(df, "<br>\n"); - - fprintf(df, "DumpChunkStart[%d]<br>\n", hitbuffer->next_chunk_start); - for (int i = 0; i < hitbuffer->next_chunk_start + 1; ++i) { - fprintf(df, "[%d]%d\n", i, hitbuffer->chunk_start[i]); - } - fprintf(df, "<br>\n"); -} - -// Move this verbose debugging output to debug.cc eventually -void DumpChunkSummary(FILE* df, const ChunkSummary* cs) { - // Print chunksummary - fprintf(df, "%d lin[%d] %s.%d %s.%d %dB %d# %s %dRd %dRs<br>\n", - cs->offset, - cs->chunk_start, - LanguageCode(static_cast<Language>(cs->lang1)), - cs->score1, - LanguageCode(static_cast<Language>(cs->lang2)), - cs->score2, - cs->bytes, - cs->grams, - ULScriptCode(static_cast<ULScript>(cs->ulscript)), - cs->reliability_delta, - cs->reliability_score); -} - -void DumpSummaryBuffer(FILE* df, const SummaryBuffer* summarybuffer) { - fprintf(df, "<br>DumpSummaryBuffer[%d]<br>\n", summarybuffer->n); - fprintf(df, "[i] offset linear[chunk_start] lang.score1 lang.score2 " - "bytesB ngrams# script rel_delta rel_score<br>\n"); - for (int i = 0; i <= summarybuffer->n; ++i) { - fprintf(df, "[%d] ", i); - DumpChunkSummary(df, &summarybuffer->chunksummary[i]); - } - fprintf(df, "<br>\n"); -} - - - -// Within hitbufer->linear[] -// <-- prior chunk --><-- this chunk --> -// | | | -// linear0 linear1 linear2 -// lang0 lang1 -// The goal of sharpening is to move this_linear to better separate langs -int BetterBoundary(const char* text, - ScoringHitBuffer* hitbuffer, - ScoringContext* scoringcontext, - uint16 pslang0, uint16 pslang1, - int linear0, int linear1, int linear2) { - // Degenerate case, no change - if ((linear2 - linear0) <= 8) {return linear1;} - - // Each diff gives pslang0 score - pslang1 score - // Running diff has four entries + + + + followed by four entries - - - - - // so that this value is maximal at the sharpest boundary between pslang0 - // (positive diffs) and pslang1 (negative diffs) - int running_diff = 0; - int diff[8]; // Ring buffer of pslang0-pslang1 differences - // Initialize with first 8 diffs - for (int i = linear0; i < linear0 + 8; ++i) { - int j = i & 7; - uint32 langprob = hitbuffer->linear[i].langprob; - diff[j] = GetLangScore(langprob, pslang0) - - GetLangScore(langprob, pslang1); - if (i < linear0 + 4) { - // First four diffs pslang0 - pslang1 - running_diff += diff[j]; - } else { - // Second four diffs -(pslang0 - pslang1) - running_diff -= diff[j]; - } - } - - // Now scan for sharpest boundary. j is at left end of 8 entries - // To be a boundary, there must be both >0 and <0 entries in the window - int better_boundary_value = 0; - int better_boundary = linear1; - for (int i = linear0; i < linear2 - 8; ++i) { - int j = i & 7; - if (better_boundary_value < running_diff) { - bool has_plus = false; - bool has_minus = false; - for (int kk = 0; kk < 8; ++kk) { - if (diff[kk] > 0) {has_plus = true;} - if (diff[kk] < 0) {has_minus = true;} - } - if (has_plus && has_minus) { - better_boundary_value = running_diff; - better_boundary = i + 4; - } - } - // Shift right one entry - uint32 langprob = hitbuffer->linear[i + 8].langprob; - int newdiff = GetLangScore(langprob, pslang0) - - GetLangScore(langprob, pslang1); - int middiff = diff[(i + 4) & 7]; - int olddiff = diff[j]; - diff[j] = newdiff; - running_diff -= olddiff; // Remove left - running_diff += 2 * middiff; // Convert middle from - to + - running_diff -= newdiff; // Insert right - } - - if (scoringcontext->flags_cld2_verbose && (linear1 != better_boundary)) { - Language lang0 = FromPerScriptNumber(scoringcontext->ulscript, pslang0); - Language lang1 = FromPerScriptNumber(scoringcontext->ulscript, pslang1); - fprintf(scoringcontext->debug_file, " Better lin[%d=>%d] %s^^%s <br>\n", - linear1, better_boundary, - LanguageCode(lang0), LanguageCode(lang1)); - int lin0_off = hitbuffer->linear[linear0].offset; - int lin1_off = hitbuffer->linear[linear1].offset; - int lin2_off = hitbuffer->linear[linear2].offset; - int better_offm1 = hitbuffer->linear[better_boundary - 1].offset; - int better_off = hitbuffer->linear[better_boundary].offset; - int better_offp1 = hitbuffer->linear[better_boundary + 1].offset; - string old0(&text[lin0_off], lin1_off - lin0_off); - string old1(&text[lin1_off], lin2_off - lin1_off); - string new0(&text[lin0_off], better_offm1 - lin0_off); - string new0m1(&text[better_offm1], better_off - better_offm1); - string new1(&text[better_off], better_offp1 - better_off); - string new1p1(&text[better_offp1], lin2_off - better_offp1); - fprintf(scoringcontext->debug_file, "%s^^%s => <br>\n%s^%s^^%s^%s<br>\n", - GetHtmlEscapedText(old0).c_str(), - GetHtmlEscapedText(old1).c_str(), - GetHtmlEscapedText(new0).c_str(), - GetHtmlEscapedText(new0m1).c_str(), - GetHtmlEscapedText(new1).c_str(), - GetHtmlEscapedText(new1p1).c_str()); - // Slow picture of differences per linear entry - int d; - for (int i = linear0; i < linear2; ++i) { - if (i == better_boundary) { - fprintf(scoringcontext->debug_file, "^^ "); - } - uint32 langprob = hitbuffer->linear[i].langprob; - d = GetLangScore(langprob, pslang0) - GetLangScore(langprob, pslang1); - const char* s = "="; - //if (d > 2) {s = "\xc2\xaf";} // Macron - if (d > 2) {s = "#";} - else if (d > 0) {s = "+";} - else if (d < -2) {s = "_";} - else if (d < 0) {s = "-";} - fprintf(scoringcontext->debug_file, "%s ", s); - } - fprintf(scoringcontext->debug_file, " (scale: #+=-_)<br>\n"); - } - return better_boundary; -} - - -// For all but the first summary, if its top language differs from -// the previous chunk, refine the boundary -// Linearized version -void SharpenBoundaries(const char* text, - bool more_to_come, - ScoringHitBuffer* hitbuffer, - ScoringContext* scoringcontext, - SummaryBuffer* summarybuffer) { - - int prior_linear = summarybuffer->chunksummary[0].chunk_start; - uint16 prior_lang = summarybuffer->chunksummary[0].lang1; - - if (scoringcontext->flags_cld2_verbose) { - fprintf(scoringcontext->debug_file, "<br>SharpenBoundaries<br>\n"); - } - for (int i = 1; i < summarybuffer->n; ++i) { - ChunkSummary* cs = &summarybuffer->chunksummary[i]; - uint16 this_lang = cs->lang1; - if (this_lang == prior_lang) { - prior_linear = cs->chunk_start; - continue; - } - - int this_linear = cs->chunk_start; - int next_linear = summarybuffer->chunksummary[i + 1].chunk_start; - - // If this/prior in same close set, don't move boundary - if (SameCloseSet(prior_lang, this_lang)) { - prior_linear = this_linear; - prior_lang = this_lang; - continue; - } - - - // Within hitbuffer->linear[] - // <-- prior chunk --><-- this chunk --> - // | | | - // prior_linear this_linear next_linear - // prior_lang this_lang - // The goal of sharpening is to move this_linear to better separate langs - - uint8 pslang0 = PerScriptNumber(scoringcontext->ulscript, - static_cast<Language>(prior_lang)); - uint8 pslang1 = PerScriptNumber(scoringcontext->ulscript, - static_cast<Language>(this_lang)); - int better_linear = BetterBoundary(text, - hitbuffer, - scoringcontext, - pslang0, pslang1, - prior_linear, this_linear, next_linear); - - int old_offset = hitbuffer->linear[this_linear].offset; - int new_offset = hitbuffer->linear[better_linear].offset; - cs->chunk_start = better_linear; - cs->offset = new_offset; - // If this_linear moved right, make bytes smaller for this, larger for prior - // If this_linear moved left, make bytes larger for this, smaller for prior - cs->bytes -= (new_offset - old_offset); - summarybuffer->chunksummary[i - 1].bytes += (new_offset - old_offset); - - this_linear = better_linear; // Update so that next chunk doesn't intrude - - // Consider rescoring the two chunks - - // Update for next round (note: using pre-updated boundary) - prior_linear = this_linear; - prior_lang = this_lang; - } -} - -// Make a langprob that gives small weight to the default language for ulscript -uint32 DefaultLangProb(ULScript ulscript) { - Language default_lang = DefaultLanguage(ulscript); - return MakeLangProb(default_lang, 1); -} - -// Effectively, do a merge-sort based on text offsets -// Look up each indirect value in appropriate scoring table and keep -// just the resulting langprobs -void LinearizeAll(ScoringContext* scoringcontext, bool score_cjk, - ScoringHitBuffer* hitbuffer) { - const CLD2TableSummary* base_obj; // unigram or quadgram - const CLD2TableSummary* base_obj2; // quadgram dual table - const CLD2TableSummary* delta_obj; // bigram or octagram - const CLD2TableSummary* distinct_obj; // bigram or octagram - uint16 base_hit; - if (score_cjk) { - base_obj = scoringcontext->scoringtables->unigram_compat_obj; - base_obj2 = scoringcontext->scoringtables->unigram_compat_obj; - delta_obj = scoringcontext->scoringtables->deltabi_obj; - distinct_obj = scoringcontext->scoringtables->distinctbi_obj; - base_hit = UNIHIT; - } else { - base_obj = scoringcontext->scoringtables->quadgram_obj; - base_obj2 = scoringcontext->scoringtables->quadgram_obj2; - delta_obj = scoringcontext->scoringtables->deltaocta_obj; - distinct_obj = scoringcontext->scoringtables->distinctocta_obj; - base_hit = QUADHIT; - } - - int base_limit = hitbuffer->next_base; - int delta_limit = hitbuffer->next_delta; - int distinct_limit = hitbuffer->next_distinct; - int base_i = 0; - int delta_i = 0; - int distinct_i = 0; - int linear_i = 0; - - // Start with an initial base hit for the default language for this script - // Inserting this avoids edge effects with no hits at all - hitbuffer->linear[linear_i].offset = hitbuffer->lowest_offset; - hitbuffer->linear[linear_i].type = base_hit; - hitbuffer->linear[linear_i].langprob = - DefaultLangProb(scoringcontext->ulscript); - ++linear_i; - - while ((base_i < base_limit) || (delta_i < delta_limit) || - (distinct_i < distinct_limit)) { - int base_off = hitbuffer->base[base_i].offset; - int delta_off = hitbuffer->delta[delta_i].offset; - int distinct_off = hitbuffer->distinct[distinct_i].offset; - - // Do delta and distinct first, so that they are not lost at base_limit - if ((delta_i < delta_limit) && - (delta_off <= base_off) && (delta_off <= distinct_off)) { - // Add delta entry - int indirect = hitbuffer->delta[delta_i].indirect; - ++delta_i; - uint32 langprob = delta_obj->kCLDTableInd[indirect]; - if (langprob > 0) { - hitbuffer->linear[linear_i].offset = delta_off; - hitbuffer->linear[linear_i].type = DELTAHIT; - hitbuffer->linear[linear_i].langprob = langprob; - ++linear_i; - } - } - else if ((distinct_i < distinct_limit) && - (distinct_off <= base_off) && (distinct_off <= delta_off)) { - // Add distinct entry - int indirect = hitbuffer->distinct[distinct_i].indirect; - ++distinct_i; - uint32 langprob = distinct_obj->kCLDTableInd[indirect]; - if (langprob > 0) { - hitbuffer->linear[linear_i].offset = distinct_off; - hitbuffer->linear[linear_i].type = DISTINCTHIT; - hitbuffer->linear[linear_i].langprob = langprob; - ++linear_i; - } - } - else { - // Add one or two base entries - int indirect = hitbuffer->base[base_i].indirect; - // First, get right scoring table - const CLD2TableSummary* local_base_obj = base_obj; - if ((indirect & 0x80000000u) != 0) { - local_base_obj = base_obj2; - indirect &= ~0x80000000u; - } - ++base_i; - // One langprob in kQuadInd[0..SingleSize), - // two in kQuadInd[SingleSize..Size) - if (indirect < static_cast<int>(local_base_obj->kCLDTableSizeOne)) { - // Up to three languages at indirect - uint32 langprob = local_base_obj->kCLDTableInd[indirect]; - if (langprob > 0) { - hitbuffer->linear[linear_i].offset = base_off; - hitbuffer->linear[linear_i].type = base_hit; - hitbuffer->linear[linear_i].langprob = langprob; - ++linear_i; - } - } else { - // Up to six languages at start + 2 * (indirect - start) - indirect += (indirect - local_base_obj->kCLDTableSizeOne); - uint32 langprob = local_base_obj->kCLDTableInd[indirect]; - uint32 langprob2 = local_base_obj->kCLDTableInd[indirect + 1]; - if (langprob > 0) { - hitbuffer->linear[linear_i].offset = base_off; - hitbuffer->linear[linear_i].type = base_hit; - hitbuffer->linear[linear_i].langprob = langprob; - ++linear_i; - } - if (langprob2 > 0) { - hitbuffer->linear[linear_i].offset = base_off; - hitbuffer->linear[linear_i].type = base_hit; - hitbuffer->linear[linear_i].langprob = langprob2; - ++linear_i; - } - } - } - } - - // Update - hitbuffer->next_linear = linear_i; - - // Add a dummy entry off the end, just to capture final offset - hitbuffer->linear[linear_i].offset = - hitbuffer->base[hitbuffer->next_base].offset; - hitbuffer->linear[linear_i].langprob = 0; -} - -// Break linear array into chunks of ~20 quadgram hits or ~50 CJK unigram hits -void ChunkAll(int letter_offset, bool score_cjk, ScoringHitBuffer* hitbuffer) { - int chunksize; - uint16 base_hit; - if (score_cjk) { - chunksize = kChunksizeUnis; - base_hit = UNIHIT; - } else { - chunksize = kChunksizeQuads; - base_hit = QUADHIT; - } - - int linear_i = 0; - int linear_off_end = hitbuffer->next_linear; - int text_i = letter_offset; // Next unseen text offset - int next_chunk_start = 0; - int bases_left = hitbuffer->next_base; - while (bases_left > 0) { - // Linearize one chunk - int base_len = chunksize; // Default; may be changed below - if (bases_left < (chunksize + (chunksize >> 1))) { - // If within 1.5 chunks of the end, avoid runts by using it all - base_len = bases_left; - } else if (bases_left < (2 * chunksize)) { - // Avoid runts by splitting 1.5 to 2 chunks in half (about 3/4 each) - base_len = (bases_left + 1) >> 1; - } - - hitbuffer->chunk_start[next_chunk_start] = linear_i; - hitbuffer->chunk_offset[next_chunk_start] = text_i; - ++next_chunk_start; - - int base_count = 0; - while ((base_count < base_len) && (linear_i < linear_off_end)) { - if (hitbuffer->linear[linear_i].type == base_hit) {++base_count;} - ++linear_i; - } - text_i = hitbuffer->linear[linear_i].offset; // Next unseen text offset - bases_left -= base_len; - } - - // If no base hits at all, make a single dummy chunk - if (next_chunk_start == 0) { - hitbuffer->chunk_start[next_chunk_start] = 0; - hitbuffer->chunk_offset[next_chunk_start] = hitbuffer->linear[0].offset; - ++next_chunk_start; - } - - // Remember the linear array start of dummy entry - hitbuffer->next_chunk_start = next_chunk_start; - - // Add a dummy entry off the end, just to capture final linear subscr - hitbuffer->chunk_start[next_chunk_start] = hitbuffer->next_linear; - hitbuffer->chunk_offset[next_chunk_start] = text_i; -} - - -// Merge-sort the individual hit arrays, go indirect on the scoring subscripts, -// break linear array into chunks. -// -// Input: -// hitbuffer base, delta, distinct arrays -// Output: -// linear array -// chunk_start array -// -void LinearizeHitBuffer(int letter_offset, - ScoringContext* scoringcontext, - bool more_to_come, bool score_cjk, - ScoringHitBuffer* hitbuffer) { - LinearizeAll(scoringcontext, score_cjk, hitbuffer); - ChunkAll(letter_offset, score_cjk, hitbuffer); -} - - - -// The hitbuffer is in an awkward form -- three sets of base/delta/distinct -// scores, each with an indirect subscript to one of six scoring tables, some -// of which can yield two langprobs for six languages, others one langprob for -// three languages. The only correlation between base/delta/distinct is their -// offsets into the letters-only text buffer. -// -// SummaryBuffer needs to be built to linear, giving linear offset of start of -// each chunk -// -// So we first do all the langprob lookups and merge-sort by offset to make -// a single linear vector, building a side vector of chunk beginnings as we go. -// The sharpening is simply moving the beginnings, scoring is a simple linear -// sweep, etc. - -void ProcessHitBuffer(const LangSpan& scriptspan, - int letter_offset, - ScoringContext* scoringcontext, - DocTote* doc_tote, - ResultChunkVector* vec, - bool more_to_come, bool score_cjk, - ScoringHitBuffer* hitbuffer) { - if (scoringcontext->flags_cld2_verbose) { - fprintf(scoringcontext->debug_file, "Hitbuffer[) "); - DumpHitBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer); - } - - LinearizeHitBuffer(letter_offset, scoringcontext, more_to_come, score_cjk, - hitbuffer); - - if (scoringcontext->flags_cld2_verbose) { - fprintf(scoringcontext->debug_file, "Linear[) "); - DumpLinearBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer); - } - - SummaryBuffer summarybuffer; - summarybuffer.n = 0; - ChunkSpan last_cspan; - ScoreAllHits(scriptspan.text, scriptspan.ulscript, - more_to_come, score_cjk, hitbuffer, - scoringcontext, - &summarybuffer, &last_cspan); - - if (scoringcontext->flags_cld2_verbose) { - DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer); - } - - if (vec != NULL) { - // Sharpen boundaries of summarybuffer - // This is not a high-performance path - SharpenBoundaries(scriptspan.text, more_to_come, hitbuffer, scoringcontext, - &summarybuffer); - // Show after the sharpening - // CLD2_Debug2(scriptspan.text, more_to_come, score_cjk, - // hitbuffer, scoringcontext, &summarybuffer); - - if (scoringcontext->flags_cld2_verbose) { - DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer); - } - } - - SummaryBufferToDocTote(&summarybuffer, more_to_come, doc_tote); - SummaryBufferToVector(scoringcontext->scanner, scriptspan.text, - &summarybuffer, more_to_come, vec); -} - -void SpliceHitBuffer(ScoringHitBuffer* hitbuffer, int next_offset) { - // Splice hitbuffer and summarybuffer for next round. With big chunks and - // distinctive-word state carried across chunks, we might not need to do this. - hitbuffer->next_base = 0; - hitbuffer->next_delta = 0; - hitbuffer->next_distinct = 0; - hitbuffer->next_linear = 0; - hitbuffer->next_chunk_start = 0; - hitbuffer->lowest_offset = next_offset; -} - - -// Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating -// scoringcontext -void ScoreEntireScriptSpan(const LangSpan& scriptspan, - ScoringContext* scoringcontext, - DocTote* doc_tote, - ResultChunkVector* vec) { - int bytes = scriptspan.text_bytes; - // Artificially set score to 1024 per 1KB, or 1 per byte - int score = bytes; - int reliability = 100; - // doc_tote uses full languages - Language one_one_lang = DefaultLanguage(scriptspan.ulscript); - doc_tote->Add(one_one_lang, bytes, score, reliability); - - if (scoringcontext->flags_cld2_html) { - ChunkSummary chunksummary = { - 1, 0, - one_one_lang, UNKNOWN_LANGUAGE, score, 1, - bytes, 0, scriptspan.ulscript, reliability, reliability - }; - CLD2_Debug(scriptspan.text, 1, scriptspan.text_bytes, - false, false, NULL, - scoringcontext, NULL, &chunksummary); - } - - // First byte is always a space - JustOneItemToVector(scoringcontext->scanner, scriptspan.text, - one_one_lang, 1, bytes - 1, vec); - - scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE; -} - -// Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext -void ScoreCJKScriptSpan(const LangSpan& scriptspan, - ScoringContext* scoringcontext, - DocTote* doc_tote, - ResultChunkVector* vec) { - // Allocate three parallel arrays of scoring hits - ScoringHitBuffer* hitbuffer = new ScoringHitBuffer; - hitbuffer->init(); - hitbuffer->ulscript = scriptspan.ulscript; - - scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE; - scoringcontext->oldest_distinct_boost = 0; - - // Incoming scriptspan has a single leading space at scriptspan.text[0] - // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3] - - int letter_offset = 1; // Skip initial space - hitbuffer->lowest_offset = letter_offset; - int letter_limit = scriptspan.text_bytes; - while (letter_offset < letter_limit) { - if (scoringcontext->flags_cld2_verbose) { - fprintf(scoringcontext->debug_file, " ScoreCJKScriptSpan[%d,%d)<br>\n", - letter_offset, letter_limit); - } - // - // Fill up one hitbuffer, possibly splicing onto previous fragment - // - // NOTE: GetUniHits deals with close repeats - // NOTE: After last chunk there is always a hitbuffer entry with an offset - // just off the end of the text = next_offset. - int next_offset = GetUniHits(scriptspan.text, letter_offset, letter_limit, - scoringcontext, hitbuffer); - // NOTE: GetBiHitVectors deals with close repeats, - // does one hash and two lookups (delta and distinct) per word - GetBiHits(scriptspan.text, letter_offset, next_offset, - scoringcontext, hitbuffer); - - // - // Score one hitbuffer in chunks to summarybuffer - // - bool more_to_come = next_offset < letter_limit; - bool score_cjk = true; - ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec, - more_to_come, score_cjk, hitbuffer); - SpliceHitBuffer(hitbuffer, next_offset); - - letter_offset = next_offset; - } - - delete hitbuffer; - // Context across buffers is not connected yet - scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE; -} - - - -// Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext -// We have a scriptspan with all lowercase text in one script. Look up -// quadgrams and octagrams, saving the hits in three parallel vectors. -// Score from those vectors in chunks, toting each chunk to get a single -// language, and combining into the overall document score. The hit vectors -// in general are not big enough to handle and entire scriptspan, so -// repeat until the entire scriptspan is scored. -// Caller deals with minimizing numbr of runt scriptspans -// This routine deals with minimizing number of runt chunks. -// -// Returns updated scoringcontext -// Returns updated doc_tote -// If vec != NULL, appends to that vector of ResultChunk's -void ScoreQuadScriptSpan(const LangSpan& scriptspan, - ScoringContext* scoringcontext, - DocTote* doc_tote, - ResultChunkVector* vec) { - // Allocate three parallel arrays of scoring hits - ScoringHitBuffer* hitbuffer = new ScoringHitBuffer; - hitbuffer->init(); - hitbuffer->ulscript = scriptspan.ulscript; - - scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE; - scoringcontext->oldest_distinct_boost = 0; - - // Incoming scriptspan has a single leading space at scriptspan.text[0] - // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3] - - int letter_offset = 1; // Skip initial space - hitbuffer->lowest_offset = letter_offset; - int letter_limit = scriptspan.text_bytes; - while (letter_offset < letter_limit) { - // - // Fill up one hitbuffer, possibly splicing onto previous fragment - // - // NOTE: GetQuadHits deals with close repeats - // NOTE: After last chunk there is always a hitbuffer entry with an offset - // just off the end of the text = next_offset. - int next_offset = GetQuadHits(scriptspan.text, letter_offset, letter_limit, - scoringcontext, hitbuffer); - // If true, there is more text to process in this scriptspan - // NOTE: GetOctaHitVectors deals with close repeats, - // does one hash and two lookups (delta and distinct) per word - GetOctaHits(scriptspan.text, letter_offset, next_offset, - scoringcontext, hitbuffer); - - // - // Score one hitbuffer in chunks to summarybuffer - // - bool more_to_come = next_offset < letter_limit; - bool score_cjk = false; - ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec, - more_to_come, score_cjk, hitbuffer); - SpliceHitBuffer(hitbuffer, next_offset); - - letter_offset = next_offset; - } - - delete hitbuffer; -} - - -// Score one scriptspan into doc_tote and vec, updating scoringcontext -// Inputs: -// One scriptspan of perhaps 40-60KB, all same script lower-case letters -// and single ASCII spaces. First character is a space to allow simple -// begining-of-word detect. End of buffer has three spaces and NUL to -// allow easy scan-to-end-of-word. -// Scoring context of -// scoring tables -// flags -// running boosts -// Outputs: -// Updated doc_tote giving overall languages and byte counts -// Optional updated chunk vector giving offset, length, language -// -// Caller initializes flags, boosts, doc_tote and vec. -// Caller aggregates across multiple scriptspans -// Caller calculates final document result -// Caller deals with detecting and triggering suppression of repeated text. -// -// This top-level routine just chooses the recognition type and calls one of -// the next-level-down routines. -// -void ScoreOneScriptSpan(const LangSpan& scriptspan, - ScoringContext* scoringcontext, - DocTote* doc_tote, - ResultChunkVector* vec) { - if (scoringcontext->flags_cld2_verbose) { - fprintf(scoringcontext->debug_file, "<br>ScoreOneScriptSpan(%s,%d) ", - ULScriptCode(scriptspan.ulscript), scriptspan.text_bytes); - // Optionally print the chunk lowercase letters/marks text - string temp(&scriptspan.text[0], scriptspan.text_bytes); - fprintf(scoringcontext->debug_file, "'%s'", - GetHtmlEscapedText(temp).c_str()); - fprintf(scoringcontext->debug_file, "<br>\n"); - } - scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE; - scoringcontext->oldest_distinct_boost = 0; - ULScriptRType rtype = ULScriptRecognitionType(scriptspan.ulscript); - if (scoringcontext->flags_cld2_score_as_quads && (rtype != RTypeCJK)) { - rtype = RTypeMany; - } - switch (rtype) { - case RTypeNone: - case RTypeOne: - ScoreEntireScriptSpan(scriptspan, scoringcontext, doc_tote, vec); - break; - case RTypeCJK: - ScoreCJKScriptSpan(scriptspan, scoringcontext, doc_tote, vec); - break; - case RTypeMany: - ScoreQuadScriptSpan(scriptspan, scoringcontext, doc_tote, vec); - break; - } -} - -} // End namespace CLD2 - |