diff options
Diffstat (limited to 'application/basilisk/components/translation/cld2/internal/scoreonescriptspan.cc')
-rw-r--r-- | application/basilisk/components/translation/cld2/internal/scoreonescriptspan.cc | 1334 |
1 file changed, 1334 insertions, 0 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/scoreonescriptspan.cc b/application/basilisk/components/translation/cld2/internal/scoreonescriptspan.cc
new file mode 100644
index 000000000..b2cebc02e
--- /dev/null
+++ b/application/basilisk/components/translation/cld2/internal/scoreonescriptspan.cc
@@ -0,0 +1,1334 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//
+// Author: dsites@google.com (Dick Sites)
+// Updated 2014.01 for dual table lookup
+//
+
+#include "scoreonescriptspan.h"
+
+#include "cldutil.h"
+#include "debug.h"
+#include "lang_script.h"
+
+#include <stdio.h>
+
+using namespace std;
+
+namespace CLD2 {
+
+static const int kUnreliablePercentThreshold = 75;
+
+void AddLangProb(uint32 langprob, Tote* chunk_tote) {
+  ProcessProbV2Tote(langprob, chunk_tote);
+}
+
+void ZeroPSLang(uint32 langprob, Tote* chunk_tote) {
+  uint8 top1 = (langprob >> 8) & 0xff;
+  chunk_tote->SetScore(top1, 0);
+}
+
+bool SameCloseSet(uint16 lang1, uint16 lang2) {
+  int lang1_close_set = LanguageCloseSet(static_cast<Language>(lang1));
+  if (lang1_close_set == 0) {return false;}
+  int lang2_close_set = LanguageCloseSet(static_cast<Language>(lang2));
+  return (lang1_close_set == lang2_close_set);
+}
+
+bool SameCloseSet(Language lang1, Language lang2) {
+  int lang1_close_set = LanguageCloseSet(lang1);
+  if (lang1_close_set == 0) {return false;}
+  int lang2_close_set = LanguageCloseSet(lang2);
+  return (lang1_close_set == lang2_close_set);
+}
+
+
+// Needs expected score per 1KB in scoring context
+void SetChunkSummary(ULScript ulscript, int first_linear_in_chunk,
+                     int offset, int len,
+                     const ScoringContext* scoringcontext,
+                     const Tote* chunk_tote,
+                     ChunkSummary* chunksummary) {
+  int key3[3];
+  chunk_tote->CurrentTopThreeKeys(key3);
+  Language lang1 = FromPerScriptNumber(ulscript, key3[0]);
+  Language lang2 = FromPerScriptNumber(ulscript, key3[1]);
+
+  int actual_score_per_kb = 0;
+  if (len > 0) {
+    actual_score_per_kb = (chunk_tote->GetScore(key3[0]) << 10) / len;
+  }
+  int expected_subscr = lang1 * 4 + LScript4(ulscript);
+  int expected_score_per_kb =
+      scoringcontext->scoringtables->kExpectedScore[expected_subscr];
+
+  chunksummary->offset = offset;
+  chunksummary->chunk_start = first_linear_in_chunk;
+  chunksummary->lang1 = lang1;
+  chunksummary->lang2 = lang2;
+  chunksummary->score1 = chunk_tote->GetScore(key3[0]);
+  chunksummary->score2 = chunk_tote->GetScore(key3[1]);
+  chunksummary->bytes = len;
+  chunksummary->grams = chunk_tote->GetScoreCount();
+  chunksummary->ulscript = ulscript;
+  chunksummary->reliability_delta = ReliabilityDelta(chunksummary->score1,
+                                                     chunksummary->score2,
+                                                     chunksummary->grams);
+  // If lang1/lang2 in same close set, set delta reliability to 100%
+  if (SameCloseSet(lang1, lang2)) {
+    chunksummary->reliability_delta = 100;
+  }
+  chunksummary->reliability_score =
+      ReliabilityExpected(actual_score_per_kb, expected_score_per_kb);
+}
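+
+// Worked example for SetChunkSummary() above: a 250-byte chunk whose top
+// language scores 300 yields actual_score_per_kb = (300 << 10) / 250 = 1228,
+// which ReliabilityExpected() then compares against the kExpectedScore entry
+// for that language/script pair.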
+
+// Return true if just lang1 is there: lang2=0 and lang3=0
+bool IsSingleLang(uint32 langprob) {
+  // Probably a bug -- which end is lang1? But only used to call empty Boost1
+  return ((langprob & 0x00ffff00) == 0);
+}
+
+// Update scoring context distinct_boost for single language quad
+void AddDistinctBoost1(uint32 langprob, ScoringContext* scoringcontext) {
+  // Probably keep this empty -- not a good enough signal
+}
+
+// Update scoring context distinct_boost for distinct octagram
+// Keep last 4 used. Since these are mostly (except at splices) in
+// hitbuffer, we might be able to just use a subscript and splice
+void AddDistinctBoost2(uint32 langprob, ScoringContext* scoringcontext) {
+  // This is called 0..n times per chunk with decoded hitbuffer->distinct...
+  LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
+  if (scoringcontext->ulscript != ULScript_Latin) {
+    distinct_boost = &scoringcontext->distinct_boost.othr;
+  }
+  int n = distinct_boost->n;
+  distinct_boost->langprob[n] = langprob;
+  distinct_boost->n = distinct_boost->wrap(n + 1);
+}
+
+// For each chunk, add extra weight for language priors (from content-lang and
+// meta lang=xx) and distinctive tokens
+void ScoreBoosts(const ScoringContext* scoringcontext, Tote* chunk_tote) {
+  // Get boosts for current script
+  const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
+  const LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
+  const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
+  if (scoringcontext->ulscript != ULScript_Latin) {
+    langprior_boost = &scoringcontext->langprior_boost.othr;
+    langprior_whack = &scoringcontext->langprior_whack.othr;
+    distinct_boost = &scoringcontext->distinct_boost.othr;
+  }
+
+  for (int k = 0; k < kMaxBoosts; ++k) {
+    uint32 langprob = langprior_boost->langprob[k];
+    if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
+  }
+  for (int k = 0; k < kMaxBoosts; ++k) {
+    uint32 langprob = distinct_boost->langprob[k];
+    if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
+  }
+  // boost has a packed set of per-script langs and probabilities
+  // whack has a packed set of per-script langs to be suppressed (zeroed)
+  // When a language in a close set is given as an explicit hint, others in
+  // that set will be whacked here.
+  for (int k = 0; k < kMaxBoosts; ++k) {
+    uint32 langprob = langprior_whack->langprob[k];
+    if (langprob > 0) {ZeroPSLang(langprob, chunk_tote);}
+  }
+}
+
+
+
+// At this point, the chunk is described by
+//  hitbuffer->base[cspan->chunk_base .. cspan->chunk_base + cspan->base_len)
+//  hitbuffer->delta[cspan->chunk_delta ...)
+//  hitbuffer->distinct[cspan->chunk_distinct ...)
+// Scored text is in text[lo..hi) where
+//  lo is 0 or the min of the first base/delta/distinct hitbuffer offset and
+//  hi is the min of the next base/delta/distinct hitbuffer offset after
+//  base_len, etc.
+void GetTextSpanOffsets(const ScoringHitBuffer* hitbuffer,
+                        const ChunkSpan* cspan, int* lo, int* hi) {
+  // Front of this span
+  int lo_base = hitbuffer->base[cspan->chunk_base].offset;
+  int lo_delta = hitbuffer->delta[cspan->chunk_delta].offset;
+  int lo_distinct = hitbuffer->distinct[cspan->chunk_distinct].offset;
+  // Front of next span
+  int hi_base = hitbuffer->base[cspan->chunk_base +
+                                cspan->base_len].offset;
+  int hi_delta = hitbuffer->delta[cspan->chunk_delta +
+                                  cspan->delta_len].offset;
+  int hi_distinct = hitbuffer->distinct[cspan->chunk_distinct +
+                                        cspan->distinct_len].offset;
+
+  *lo = 0;
+//  if (cspan->chunk_base > 0) {
+//    *lo = minint(minint(lo_base, lo_delta), lo_distinct);
+//  }
+  *lo = minint(minint(lo_base, lo_delta), lo_distinct);
+  *hi = minint(minint(hi_base, hi_delta), hi_distinct);
+}
+
+
+int DiffScore(const CLD2TableSummary* obj, int indirect,
+              uint16 lang1, uint16 lang2) {
+  if (indirect < static_cast<int>(obj->kCLDTableSizeOne)) {
+    // Up to three languages at indirect
+    uint32 langprob = obj->kCLDTableInd[indirect];
+    return GetLangScore(langprob, lang1) - GetLangScore(langprob, lang2);
+  } else {
+    // Up to six languages at start + 2 * (indirect - start)
+    indirect += (indirect - obj->kCLDTableSizeOne);
+    uint32 langprob = obj->kCLDTableInd[indirect];
+    uint32 langprob2 = obj->kCLDTableInd[indirect + 1];
+    return (GetLangScore(langprob, lang1) + GetLangScore(langprob2, lang1)) -
+           (GetLangScore(langprob, lang2) + GetLangScore(langprob2, lang2));
+  }
+
+}
+
+// Score all the bases, deltas, distincts, boosts for one chunk into chunk_tote
+// After the last chunk there is always a hitbuffer entry with an offset just
+// off the end of the text.
+// Sets delta_len and distinct_len
+void ScoreOneChunk(const char* text, ULScript ulscript,
+                   const ScoringHitBuffer* hitbuffer,
+                   int chunk_i,
+                   ScoringContext* scoringcontext,
+                   ChunkSpan* cspan, Tote* chunk_tote,
+                   ChunkSummary* chunksummary) {
+  int first_linear_in_chunk = hitbuffer->chunk_start[chunk_i];
+  int first_linear_in_next_chunk = hitbuffer->chunk_start[chunk_i + 1];
+
+  chunk_tote->Reinit();
+  cspan->delta_len = 0;
+  cspan->distinct_len = 0;
+  if (scoringcontext->flags_cld2_verbose) {
+    fprintf(scoringcontext->debug_file, "<br>ScoreOneChunk[%d..%d) ",
+            first_linear_in_chunk, first_linear_in_next_chunk);
+  }
+
+  // 2013.02.05 linear design: just use base and base_len for the span
+  cspan->chunk_base = first_linear_in_chunk;
+  cspan->base_len = first_linear_in_next_chunk - first_linear_in_chunk;
+  for (int i = first_linear_in_chunk; i < first_linear_in_next_chunk; ++i) {
+    uint32 langprob = hitbuffer->linear[i].langprob;
+    AddLangProb(langprob, chunk_tote);
+    if (hitbuffer->linear[i].type <= QUADHIT) {
+      chunk_tote->AddScoreCount();      // Just count quads, not octas
+    }
+    if (hitbuffer->linear[i].type == DISTINCTHIT) {
+      AddDistinctBoost2(langprob, scoringcontext);
+    }
+  }
+
+  // Score language prior boosts
+  // Score distinct word boost
+  ScoreBoosts(scoringcontext, chunk_tote);
+
+  int lo = hitbuffer->linear[first_linear_in_chunk].offset;
+  int hi = hitbuffer->linear[first_linear_in_next_chunk].offset;
+
+  // Chunk_tote: get top langs, scores, etc. and fill in chunk summary
+  SetChunkSummary(ulscript, first_linear_in_chunk, lo, hi - lo,
+                  scoringcontext, chunk_tote, chunksummary);
+
+  bool more_to_come = false;
+  bool score_cjk = false;
+  if (scoringcontext->flags_cld2_html) {
+    // Show one chunk in readable output
+    CLD2_Debug(text, lo, hi, more_to_come, score_cjk, hitbuffer,
+               scoringcontext, cspan, chunksummary);
+  }
+
+  scoringcontext->prior_chunk_lang = static_cast<Language>(chunksummary->lang1);
+}
+
+
+// Score chunks of text described by hitbuffer, allowing each to be in a
+// different language, and optionally adjusting the boundaries in between.
+// Set last_cspan to the last chunkspan used
+void ScoreAllHits(const char* text, ULScript ulscript,
+                  bool more_to_come, bool score_cjk,
+                  const ScoringHitBuffer* hitbuffer,
+                  ScoringContext* scoringcontext,
+                  SummaryBuffer* summarybuffer, ChunkSpan* last_cspan) {
+  ChunkSpan prior_cspan = {0, 0, 0, 0, 0, 0};
+  ChunkSpan cspan = {0, 0, 0, 0, 0, 0};
+
+  for (int i = 0; i < hitbuffer->next_chunk_start; ++i) {
+    // Score one chunk
+    // Sets delta_len and distinct_len
+    Tote chunk_tote;
+    ChunkSummary chunksummary;
+    ScoreOneChunk(text, ulscript,
+                  hitbuffer, i,
+                  scoringcontext, &cspan, &chunk_tote, &chunksummary);
+
+    // Put result in summarybuffer
+    if (summarybuffer->n < kMaxSummaries) {
+      summarybuffer->chunksummary[summarybuffer->n] = chunksummary;
+      summarybuffer->n += 1;
+    }
+
+    prior_cspan = cspan;
+    cspan.chunk_base += cspan.base_len;
+    cspan.chunk_delta += cspan.delta_len;
+    cspan.chunk_distinct += cspan.distinct_len;
+  }
+
+  // Add one dummy off the end to hold first unused linear_in_chunk
+  int linear_off_end = hitbuffer->next_linear;
+  int offset_off_end = hitbuffer->linear[linear_off_end].offset;
+  ChunkSummary* cs = &summarybuffer->chunksummary[summarybuffer->n];
+  memset(cs, 0, sizeof(ChunkSummary));
+  cs->offset = offset_off_end;
+  cs->chunk_start = linear_off_end;
+  *last_cspan = prior_cspan;
+}
+
+
+void SummaryBufferToDocTote(const SummaryBuffer* summarybuffer,
+                            bool more_to_come, DocTote* doc_tote) {
+  int cs_bytes_sum = 0;
+  for (int i = 0; i < summarybuffer->n; ++i) {
+    const ChunkSummary* cs = &summarybuffer->chunksummary[i];
+    int reliability = minint(cs->reliability_delta, cs->reliability_score);
+    // doc_tote uses full languages
+    doc_tote->Add(cs->lang1, cs->bytes, cs->score1, reliability);
+    cs_bytes_sum += cs->bytes;
+  }
+}
+
+// Turn on for debugging vectors
+static const bool kShowLettersOriginal = false;
+
+
+// If next chunk language matches last vector language, extend last element
+// Otherwise add new element to vector
+void ItemToVector(ScriptScanner* scanner,
+                  ResultChunkVector* vec, Language new_lang,
+                  int mapped_offset, int mapped_len) {
+  uint16 last_vec_lang = static_cast<uint16>(UNKNOWN_LANGUAGE);
+  int last_vec_subscr = vec->size() - 1;
+  if (last_vec_subscr >= 0) {
+    ResultChunk* priorrc = &(*vec)[last_vec_subscr];
+    last_vec_lang = priorrc->lang1;
+    if (new_lang == last_vec_lang) {
+      // Extend prior. Current mapped_offset may be beyond prior end, so do
+      // the arithmetic to include any such gap
+      priorrc->bytes = minint((mapped_offset + mapped_len) - priorrc->offset,
+                              kMaxResultChunkBytes);
+      if (kShowLettersOriginal) {
+        // Optionally print the new chunk original text
+        string temp2(&scanner->GetBufferStart()[priorrc->offset],
+                     priorrc->bytes);
+        fprintf(stderr, "Item[%d..%d) '%s'<br>\n",
+                priorrc->offset, priorrc->offset + priorrc->bytes,
+                GetHtmlEscapedText(temp2).c_str());
+      }
+      return;
+    }
+  }
+  // Add new vector element
+  ResultChunk rc;
+  rc.offset = mapped_offset;
+  rc.bytes = minint(mapped_len, kMaxResultChunkBytes);
+  rc.lang1 = static_cast<uint16>(new_lang);
+  vec->push_back(rc);
+  if (kShowLettersOriginal) {
+    // Optionally print the new chunk original text
+    string temp2(&scanner->GetBufferStart()[rc.offset], rc.bytes);
+    fprintf(stderr, "Item[%d..%d) '%s'<br>\n",
+            rc.offset, rc.offset + rc.bytes,
+            GetHtmlEscapedText(temp2).c_str());
+  }
+}
+
+uint16 PriorVecLang(const ResultChunkVector* vec) {
+  if (vec->empty()) {return static_cast<uint16>(UNKNOWN_LANGUAGE);}
+  return (*vec)[vec->size() - 1].lang1;
+}
+
+uint16 NextChunkLang(const SummaryBuffer* summarybuffer, int i) {
+  if ((i + 1) >= summarybuffer->n) {
+    return static_cast<uint16>(UNKNOWN_LANGUAGE);
+  }
+  return summarybuffer->chunksummary[i + 1].lang1;
+}
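+
+// Worked example for ItemToVector() above: if the previous ResultChunk covers
+// original bytes [40..120) and the next chunk in the same language maps back
+// to [130..180), the previous entry is simply extended to
+// bytes = (130 + 50) - 40 = 140, absorbing the small gap instead of emitting
+// a separate element.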
+
+
+// Add n elements of summarybuffer to resultchunk vector:
+// Each element is letters-only text [offset..offset+bytes)
+// This maps back to original[Back(offset)..Back(offset+bytes))
+//
+// We go out of our way to minimize the variation in the ResultChunkVector,
+// so that the caller has fewer but more meaningful spans in different
+// languages, for the likely purpose of translation or spell-check.
+//
+// The language of each chunk is lang1, but it might be unreliable for
+// either of two reasons: its score is relatively too close to the score of
+// lang2, or its score is too far away from the expected score of real text in
+// the given language. Unreliable languages are mapped to Unknown.
+//
+void SummaryBufferToVector(ScriptScanner* scanner, const char* text,
+                           const SummaryBuffer* summarybuffer,
+                           bool more_to_come, ResultChunkVector* vec) {
+  if (vec == NULL) {return;}
+
+  if (kShowLettersOriginal) {
+    fprintf(stderr, "map2original_ ");
+    scanner->map2original_.DumpWindow();
+    fprintf(stderr, "<br>\n");
+    fprintf(stderr, "map2uplow_ ");
+    scanner->map2uplow_.DumpWindow();
+    fprintf(stderr, "<br>\n");
+  }
+
+  for (int i = 0; i < summarybuffer->n; ++i) {
+    const ChunkSummary* cs = &summarybuffer->chunksummary[i];
+    int unmapped_offset = cs->offset;
+    int unmapped_len = cs->bytes;
+
+    if (kShowLettersOriginal) {
+      // Optionally print the chunk lowercase letters/marks text
+      string temp(&text[unmapped_offset], unmapped_len);
+      fprintf(stderr, "Letters [%d..%d) '%s'<br>\n",
+              unmapped_offset, unmapped_offset + unmapped_len,
+              GetHtmlEscapedText(temp).c_str());
+    }
+
+    int mapped_offset = scanner->MapBack(unmapped_offset);
+
+    // Trim back a little to prefer splicing original at word boundaries
+    if (mapped_offset > 0) {
+      // Size of prior vector entry, if any
+      int prior_size = 0;
+      if (!vec->empty()) {
+        ResultChunk* rc = &(*vec)[vec->size() - 1];
+        prior_size = rc->bytes;
+      }
+      // Maximum back up size to leave at least 3 bytes in prior,
+      // and not entire buffer, and no more than 12 bytes total backup
+      int n_limit = minint(prior_size - 3, mapped_offset);
+      n_limit = minint(n_limit, 12);
+
+      // Backscan over letters, stopping if prior byte is < 0x41
+      // There is some possibility that we will backscan over a different script
+      const char* s = &scanner->GetBufferStart()[mapped_offset];
+      const unsigned char* us = reinterpret_cast<const unsigned char*>(s);
+      int n = 0;
+      while ((n < n_limit) && (us[-n - 1] >= 0x41)) {++n;}
+      if (n >= n_limit) {n = 0;}    // New boundary not found within range
+
+      // Also back up exactly one leading punctuation character if '"#@
+      if (n < n_limit) {
+        unsigned char c = us[-n - 1];
+        if ((c == '\'') || (c == '"') || (c == '#') || (c == '@')) {++n;}
+      }
+      // Shrink the previous chunk slightly
+      if (n > 0) {
+        ResultChunk* rc = &(*vec)[vec->size() - 1];
+        rc->bytes -= n;
+        mapped_offset -= n;
+        if (kShowLettersOriginal) {
+          fprintf(stderr, "Back up %d bytes<br>\n", n);
+          // Optionally print the prior chunk original text
+          string temp2(&scanner->GetBufferStart()[rc->offset], rc->bytes);
+          fprintf(stderr, "Prior [%d..%d) '%s'<br>\n",
+                  rc->offset, rc->offset + rc->bytes,
+                  GetHtmlEscapedText(temp2).c_str());
+        }
+      }
+    }
+
+    int mapped_len =
+        scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset;
+
+    if (kShowLettersOriginal) {
+      // Optionally print the chunk original text
+      string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len);
+      fprintf(stderr, "Original[%d..%d) '%s'<br>\n",
+              mapped_offset, mapped_offset + mapped_len,
+              GetHtmlEscapedText(temp2).c_str());
+    }
+
+    Language new_lang = static_cast<Language>(cs->lang1);
+    bool reliability_delta_bad =
+        (cs->reliability_delta < kUnreliablePercentThreshold);
+    bool reliability_score_bad =
+        (cs->reliability_score < kUnreliablePercentThreshold);
+
+    // If the top language matches last vector, ignore reliability_delta
+    uint16 prior_lang = PriorVecLang(vec);
+    if (prior_lang == cs->lang1) {
+      reliability_delta_bad = false;
+    }
+    // If the top language is in same close set as last vector, set up to merge
+    if (SameCloseSet(cs->lang1, prior_lang)) {
+      new_lang = static_cast<Language>(prior_lang);
+      reliability_delta_bad = false;
+    }
+    // If the top two languages are in the same close set and the last vector
+    // language is the second language, set up to merge
+    if (SameCloseSet(cs->lang1, cs->lang2) &&
+        (prior_lang == cs->lang2)) {
+      new_lang = static_cast<Language>(prior_lang);
+      reliability_delta_bad = false;
+    }
+    // If unreliable and the last and next vector languages are both
+    // the second language, set up to merge
+    uint16 next_lang = NextChunkLang(summarybuffer, i);
+    if (reliability_delta_bad &&
+        (prior_lang == cs->lang2) && (next_lang == cs->lang2)) {
+      new_lang = static_cast<Language>(prior_lang);
+      reliability_delta_bad = false;
+    }
+
+    if (reliability_delta_bad || reliability_score_bad) {
+      new_lang = UNKNOWN_LANGUAGE;
+    }
+    ItemToVector(scanner, vec, new_lang, mapped_offset, mapped_len);
+  }
+}
+
+// Add just one element to resultchunk vector:
+// For RTypeNone or RTypeOne
+void JustOneItemToVector(ScriptScanner* scanner, const char* text,
+                         Language lang1, int unmapped_offset, int unmapped_len,
+                         ResultChunkVector* vec) {
+  if (vec == NULL) {return;}
+
+  if (kShowLettersOriginal) {
+    fprintf(stderr, "map2original_ ");
+    scanner->map2original_.DumpWindow();
+    fprintf(stderr, "<br>\n");
+    fprintf(stderr, "map2uplow_ ");
+    scanner->map2uplow_.DumpWindow();
+    fprintf(stderr, "<br>\n");
+  }
+
+  if (kShowLettersOriginal) {
+    // Optionally print the chunk lowercase letters/marks text
+    string temp(&text[unmapped_offset], unmapped_len);
+    fprintf(stderr, "Letters1 [%d..%d) '%s'<br>\n",
+            unmapped_offset, unmapped_offset + unmapped_len,
+            GetHtmlEscapedText(temp).c_str());
+  }
+
+  int mapped_offset = scanner->MapBack(unmapped_offset);
+  int mapped_len =
+      scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset;
+
+  if (kShowLettersOriginal) {
+    // Optionally print the chunk original text
+    string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len);
+    fprintf(stderr, "Original1[%d..%d) '%s'<br>\n",
+            mapped_offset, mapped_offset + mapped_len,
+            GetHtmlEscapedText(temp2).c_str());
+  }
+
+  ItemToVector(scanner, vec, lang1, mapped_offset, mapped_len);
+}
+
+
+// Debugging. Not thread safe. Defined in getonescriptspan
+char* DisplayPiece(const char* next_byte_, int byte_length_);
+
+// If high bit is on, take out high bit and add 2B to make table2 entries easy
+inline int PrintableIndirect(int x) {
+  if ((x & 0x80000000u) != 0) {
+    return (x & ~0x80000000u) + 2000000000;
+  }
+  return x;
+}
+void DumpHitBuffer(FILE* df, const char* text,
+                   const ScoringHitBuffer* hitbuffer) {
+  fprintf(df,
+          "<br>DumpHitBuffer[%s, next_base/delta/distinct %d, %d, %d)<br>\n",
+          ULScriptCode(hitbuffer->ulscript),
+          hitbuffer->next_base, hitbuffer->next_delta,
+          hitbuffer->next_distinct);
+  for (int i = 0; i < hitbuffer->maxscoringhits; ++i) {
+    if (i < hitbuffer->next_base) {
+      fprintf(df, "Q[%d]%d,%d,%s ",
+              i, hitbuffer->base[i].offset,
+              PrintableIndirect(hitbuffer->base[i].indirect),
+              DisplayPiece(&text[hitbuffer->base[i].offset], 6));
+    }
+    if (i < hitbuffer->next_delta) {
+      fprintf(df, "DL[%d]%d,%d,%s ",
+              i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
+              DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
+    }
+    if (i < hitbuffer->next_distinct) {
+      fprintf(df, "D[%d]%d,%d,%s ",
+              i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect,
+              DisplayPiece(&text[hitbuffer->distinct[i].offset], 12));
+    }
+    if (i < hitbuffer->next_base) {
+      fprintf(df, "<br>\n");
+    }
+    if (i > 50) {break;}
+  }
+  if (hitbuffer->next_base > 50) {
+    int i = hitbuffer->next_base;
+    fprintf(df, "Q[%d]%d,%d,%s ",
+            i, hitbuffer->base[i].offset,
+            PrintableIndirect(hitbuffer->base[i].indirect),
+            DisplayPiece(&text[hitbuffer->base[i].offset], 6));
+  }
+  if (hitbuffer->next_delta > 50) {
+    int i = hitbuffer->next_delta;
+    fprintf(df, "DL[%d]%d,%d,%s ",
+            i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
+            DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
+  }
+  if (hitbuffer->next_distinct > 50) {
+    int i = hitbuffer->next_distinct;
+    fprintf(df, "D[%d]%d,%d,%s ",
+            i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect,
+            DisplayPiece(&text[hitbuffer->distinct[i].offset], 12));
+  }
+  fprintf(df, "<br>\n");
+}
+
+
+void DumpLinearBuffer(FILE* df, const char* text,
+                      const ScoringHitBuffer* hitbuffer) {
+  fprintf(df, "<br>DumpLinearBuffer[%d)<br>\n",
+          hitbuffer->next_linear);
+  // Include the dummy entry off the end
+  for (int i = 0; i < hitbuffer->next_linear + 1; ++i) {
+    if ((50 < i) && (i < (hitbuffer->next_linear - 1))) {continue;}
+    fprintf(df, "[%d]%d,%c=%08x,%s<br>\n",
+            i, hitbuffer->linear[i].offset,
+            "UQLD"[hitbuffer->linear[i].type],
+            hitbuffer->linear[i].langprob,
+            DisplayPiece(&text[hitbuffer->linear[i].offset], 6));
+  }
+  fprintf(df, "<br>\n");
+
+  fprintf(df, "DumpChunkStart[%d]<br>\n", hitbuffer->next_chunk_start);
+  for (int i = 0; i < hitbuffer->next_chunk_start + 1; ++i) {
+    fprintf(df, "[%d]%d\n", i, hitbuffer->chunk_start[i]);
+  }
+  fprintf(df, "<br>\n");
+}
+
+// Move this verbose debugging output to debug.cc eventually
+void DumpChunkSummary(FILE* df, const ChunkSummary* cs) {
+  // Print chunksummary
+  fprintf(df, "%d lin[%d] %s.%d %s.%d %dB %d# %s %dRd %dRs<br>\n",
+          cs->offset,
+          cs->chunk_start,
+          LanguageCode(static_cast<Language>(cs->lang1)),
+          cs->score1,
+          LanguageCode(static_cast<Language>(cs->lang2)),
+          cs->score2,
+          cs->bytes,
+          cs->grams,
+          ULScriptCode(static_cast<ULScript>(cs->ulscript)),
+          cs->reliability_delta,
+          cs->reliability_score);
+}
+
+void DumpSummaryBuffer(FILE* df, const SummaryBuffer* summarybuffer) {
+  fprintf(df, "<br>DumpSummaryBuffer[%d]<br>\n", summarybuffer->n);
+  fprintf(df, "[i] offset linear[chunk_start] lang.score1 lang.score2 "
+          "bytesB ngrams# script rel_delta rel_score<br>\n");
+  for (int i = 0; i <= summarybuffer->n; ++i) {
+    fprintf(df, "[%d] ", i);
+    DumpChunkSummary(df, &summarybuffer->chunksummary[i]);
+  }
+  fprintf(df, "<br>\n");
+}
+
+
+
+// Within hitbuffer->linear[]
+//   <-- prior chunk --><-- this chunk -->
+//   |                  |                 |
+//   linear0            linear1           linear2
+//   lang0              lang1
+// The goal of sharpening is to move this_linear to better separate langs
+int BetterBoundary(const char* text,
+                   ScoringHitBuffer* hitbuffer,
+                   ScoringContext* scoringcontext,
+                   uint16 pslang0, uint16 pslang1,
+                   int linear0, int linear1, int linear2) {
+  // Degenerate case, no change
+  if ((linear2 - linear0) <= 8) {return linear1;}
+
+  // Each diff gives pslang0 score - pslang1 score
+  // Running diff has four entries + + + + followed by four entries - - - -
+  // so that this value is maximal at the sharpest boundary between pslang0
+  // (positive diffs) and pslang1 (negative diffs)
+  int running_diff = 0;
+  int diff[8];    // Ring buffer of pslang0-pslang1 differences
+  // Initialize with first 8 diffs
+  for (int i = linear0; i < linear0 + 8; ++i) {
+    int j = i & 7;
+    uint32 langprob = hitbuffer->linear[i].langprob;
+    diff[j] = GetLangScore(langprob, pslang0) -
+              GetLangScore(langprob, pslang1);
+    if (i < linear0 + 4) {
+      // First four diffs pslang0 - pslang1
+      running_diff += diff[j];
+    } else {
+      // Second four diffs -(pslang0 - pslang1)
+      running_diff -= diff[j];
+    }
+  }
+
+  // Now scan for sharpest boundary. j is at left end of 8 entries
+  // To be a boundary, there must be both >0 and <0 entries in the window
+  int better_boundary_value = 0;
+  int better_boundary = linear1;
+  for (int i = linear0; i < linear2 - 8; ++i) {
+    int j = i & 7;
+    if (better_boundary_value < running_diff) {
+      bool has_plus = false;
+      bool has_minus = false;
+      for (int kk = 0; kk < 8; ++kk) {
+        if (diff[kk] > 0) {has_plus = true;}
+        if (diff[kk] < 0) {has_minus = true;}
+      }
+      if (has_plus && has_minus) {
+        better_boundary_value = running_diff;
+        better_boundary = i + 4;
+      }
+    }
+    // Shift right one entry
+    uint32 langprob = hitbuffer->linear[i + 8].langprob;
+    int newdiff = GetLangScore(langprob, pslang0) -
+                  GetLangScore(langprob, pslang1);
+    int middiff = diff[(i + 4) & 7];
+    int olddiff = diff[j];
+    diff[j] = newdiff;
+    running_diff -= olddiff;        // Remove left
+    running_diff += 2 * middiff;    // Convert middle from - to +
+    running_diff -= newdiff;        // Insert right
+  }
+
+  if (scoringcontext->flags_cld2_verbose && (linear1 != better_boundary)) {
+    Language lang0 = FromPerScriptNumber(scoringcontext->ulscript, pslang0);
+    Language lang1 = FromPerScriptNumber(scoringcontext->ulscript, pslang1);
+    fprintf(scoringcontext->debug_file, "  Better lin[%d=>%d] %s^^%s <br>\n",
+            linear1, better_boundary,
+            LanguageCode(lang0), LanguageCode(lang1));
+    int lin0_off = hitbuffer->linear[linear0].offset;
+    int lin1_off = hitbuffer->linear[linear1].offset;
+    int lin2_off = hitbuffer->linear[linear2].offset;
+    int better_offm1 = hitbuffer->linear[better_boundary - 1].offset;
+    int better_off = hitbuffer->linear[better_boundary].offset;
+    int better_offp1 = hitbuffer->linear[better_boundary + 1].offset;
+    string old0(&text[lin0_off], lin1_off - lin0_off);
+    string old1(&text[lin1_off], lin2_off - lin1_off);
+    string new0(&text[lin0_off], better_offm1 - lin0_off);
+    string new0m1(&text[better_offm1], better_off - better_offm1);
+    string new1(&text[better_off], better_offp1 - better_off);
+    string new1p1(&text[better_offp1], lin2_off - better_offp1);
+    fprintf(scoringcontext->debug_file, "%s^^%s => <br>\n%s^%s^^%s^%s<br>\n",
+            GetHtmlEscapedText(old0).c_str(),
+            GetHtmlEscapedText(old1).c_str(),
+            GetHtmlEscapedText(new0).c_str(),
+            GetHtmlEscapedText(new0m1).c_str(),
+            GetHtmlEscapedText(new1).c_str(),
+            GetHtmlEscapedText(new1p1).c_str());
+    // Slow picture of differences per linear entry
+    int d;
+    for (int i = linear0; i < linear2; ++i) {
+      if (i == better_boundary) {
+        fprintf(scoringcontext->debug_file, "^^ ");
+      }
+      uint32 langprob = hitbuffer->linear[i].langprob;
+      d = GetLangScore(langprob, pslang0) - GetLangScore(langprob, pslang1);
+      const char* s = "=";
+      //if (d > 2) {s = "\xc2\xaf";}    // Macron
+      if (d > 2) {s = "#";}
+      else if (d > 0) {s = "+";}
+      else if (d < -2) {s = "_";}
+      else if (d < 0) {s = "-";}
+      fprintf(scoringcontext->debug_file, "%s ", s);
+    }
+    fprintf(scoringcontext->debug_file, " (scale: #+=-_)<br>\n");
+  }
+  return better_boundary;
+}
+
+
+// For all but the first summary, if its top language differs from
+// the previous chunk, refine the boundary
+// Linearized version
+void SharpenBoundaries(const char* text,
+                       bool more_to_come,
+                       ScoringHitBuffer* hitbuffer,
+                       ScoringContext* scoringcontext,
+                       SummaryBuffer* summarybuffer) {
+
+  int prior_linear = summarybuffer->chunksummary[0].chunk_start;
+  uint16 prior_lang = summarybuffer->chunksummary[0].lang1;
+
+  if (scoringcontext->flags_cld2_verbose) {
+    fprintf(scoringcontext->debug_file, "<br>SharpenBoundaries<br>\n");
+  }
+  for (int i = 1; i < summarybuffer->n; ++i) {
+    ChunkSummary* cs = &summarybuffer->chunksummary[i];
+    uint16 this_lang = cs->lang1;
+    if (this_lang == prior_lang) {
+      prior_linear = cs->chunk_start;
+      continue;
+    }
+
+    int this_linear = cs->chunk_start;
+    int next_linear = summarybuffer->chunksummary[i + 1].chunk_start;
+
+    // If this/prior in same close set, don't move boundary
+    if (SameCloseSet(prior_lang, this_lang)) {
+      prior_linear = this_linear;
+      prior_lang = this_lang;
+      continue;
+    }
+
+
+    // Within hitbuffer->linear[]
+    //   <-- prior chunk --><-- this chunk -->
+    //   |                  |                 |
+    //   prior_linear       this_linear       next_linear
+    //   prior_lang         this_lang
+    // The goal of sharpening is to move this_linear to better separate langs
+
+    uint8 pslang0 = PerScriptNumber(scoringcontext->ulscript,
+                                    static_cast<Language>(prior_lang));
+    uint8 pslang1 = PerScriptNumber(scoringcontext->ulscript,
+                                    static_cast<Language>(this_lang));
+    int better_linear = BetterBoundary(text,
+                                       hitbuffer,
+                                       scoringcontext,
+                                       pslang0, pslang1,
+                                       prior_linear, this_linear, next_linear);
+
+    int old_offset = hitbuffer->linear[this_linear].offset;
+    int new_offset = hitbuffer->linear[better_linear].offset;
+    cs->chunk_start = better_linear;
+    cs->offset = new_offset;
+    // If this_linear moved right, make bytes smaller for this, larger for prior
+    // If this_linear moved left, make bytes larger for this, smaller for prior
+    cs->bytes -= (new_offset - old_offset);
+    summarybuffer->chunksummary[i - 1].bytes += (new_offset - old_offset);
+
+    this_linear = better_linear;    // Update so that next chunk doesn't intrude
+
+    // Consider rescoring the two chunks
+
+    // Update for next round (note: using pre-updated boundary)
+    prior_linear = this_linear;
+    prior_lang = this_lang;
+  }
+}
+
+// Make a langprob that gives small weight to the default language for ulscript
+uint32 DefaultLangProb(ULScript ulscript) {
+  Language default_lang = DefaultLanguage(ulscript);
+  return MakeLangProb(default_lang, 1);
+}
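+
+// Worked example for BetterBoundary() above: with a window of eight per-hit
+// differences [+3 +2 +4 +1 | -2 -3 -1 -4], running_diff = (3+2+4+1) -
+// (-2-3-1-4) = 20, the largest value this window can produce, so the sharpened
+// boundary is placed at the middle of the window (i + 4).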
+
+// Effectively, do a merge-sort based on text offsets
+// Look up each indirect value in appropriate scoring table and keep
+// just the resulting langprobs
+void LinearizeAll(ScoringContext* scoringcontext, bool score_cjk,
+                  ScoringHitBuffer* hitbuffer) {
+  const CLD2TableSummary* base_obj;       // unigram or quadgram
+  const CLD2TableSummary* base_obj2;      // quadgram dual table
+  const CLD2TableSummary* delta_obj;      // bigram or octagram
+  const CLD2TableSummary* distinct_obj;   // bigram or octagram
+  uint16 base_hit;
+  if (score_cjk) {
+    base_obj = scoringcontext->scoringtables->unigram_compat_obj;
+    base_obj2 = scoringcontext->scoringtables->unigram_compat_obj;
+    delta_obj = scoringcontext->scoringtables->deltabi_obj;
+    distinct_obj = scoringcontext->scoringtables->distinctbi_obj;
+    base_hit = UNIHIT;
+  } else {
+    base_obj = scoringcontext->scoringtables->quadgram_obj;
+    base_obj2 = scoringcontext->scoringtables->quadgram_obj2;
+    delta_obj = scoringcontext->scoringtables->deltaocta_obj;
+    distinct_obj = scoringcontext->scoringtables->distinctocta_obj;
+    base_hit = QUADHIT;
+  }
+
+  int base_limit = hitbuffer->next_base;
+  int delta_limit = hitbuffer->next_delta;
+  int distinct_limit = hitbuffer->next_distinct;
+  int base_i = 0;
+  int delta_i = 0;
+  int distinct_i = 0;
+  int linear_i = 0;
+
+  // Start with an initial base hit for the default language for this script
+  // Inserting this avoids edge effects with no hits at all
+  hitbuffer->linear[linear_i].offset = hitbuffer->lowest_offset;
+  hitbuffer->linear[linear_i].type = base_hit;
+  hitbuffer->linear[linear_i].langprob =
+      DefaultLangProb(scoringcontext->ulscript);
+  ++linear_i;
+
+  while ((base_i < base_limit) || (delta_i < delta_limit) ||
+         (distinct_i < distinct_limit)) {
+    int base_off = hitbuffer->base[base_i].offset;
+    int delta_off = hitbuffer->delta[delta_i].offset;
+    int distinct_off = hitbuffer->distinct[distinct_i].offset;
+
+    // Do delta and distinct first, so that they are not lost at base_limit
+    if ((delta_i < delta_limit) &&
+        (delta_off <= base_off) && (delta_off <= distinct_off)) {
+      // Add delta entry
+      int indirect = hitbuffer->delta[delta_i].indirect;
+      ++delta_i;
+      uint32 langprob = delta_obj->kCLDTableInd[indirect];
+      if (langprob > 0) {
+        hitbuffer->linear[linear_i].offset = delta_off;
+        hitbuffer->linear[linear_i].type = DELTAHIT;
+        hitbuffer->linear[linear_i].langprob = langprob;
+        ++linear_i;
+      }
+    }
+    else if ((distinct_i < distinct_limit) &&
+             (distinct_off <= base_off) && (distinct_off <= delta_off)) {
+      // Add distinct entry
+      int indirect = hitbuffer->distinct[distinct_i].indirect;
+      ++distinct_i;
+      uint32 langprob = distinct_obj->kCLDTableInd[indirect];
+      if (langprob > 0) {
+        hitbuffer->linear[linear_i].offset = distinct_off;
+        hitbuffer->linear[linear_i].type = DISTINCTHIT;
+        hitbuffer->linear[linear_i].langprob = langprob;
+        ++linear_i;
+      }
+    }
+    else {
+      // Add one or two base entries
+      int indirect = hitbuffer->base[base_i].indirect;
+      // First, get right scoring table
+      const CLD2TableSummary* local_base_obj = base_obj;
+      if ((indirect & 0x80000000u) != 0) {
+        local_base_obj = base_obj2;
+        indirect &= ~0x80000000u;
+      }
+      ++base_i;
+      // One langprob in kQuadInd[0..SingleSize),
+      // two in kQuadInd[SingleSize..Size)
+      if (indirect < static_cast<int>(local_base_obj->kCLDTableSizeOne)) {
+        // Up to three languages at indirect
+        uint32 langprob = local_base_obj->kCLDTableInd[indirect];
+        if (langprob > 0) {
+          hitbuffer->linear[linear_i].offset = base_off;
+          hitbuffer->linear[linear_i].type = base_hit;
+          hitbuffer->linear[linear_i].langprob = langprob;
+          ++linear_i;
+        }
+      } else {
+        // Up to six languages at start + 2 * (indirect - start)
+        indirect += (indirect - local_base_obj->kCLDTableSizeOne);
+        uint32 langprob = local_base_obj->kCLDTableInd[indirect];
+        uint32 langprob2 = local_base_obj->kCLDTableInd[indirect + 1];
+        if (langprob > 0) {
+          hitbuffer->linear[linear_i].offset = base_off;
+          hitbuffer->linear[linear_i].type = base_hit;
+          hitbuffer->linear[linear_i].langprob = langprob;
+          ++linear_i;
+        }
+        if (langprob2 > 0) {
+          hitbuffer->linear[linear_i].offset = base_off;
+          hitbuffer->linear[linear_i].type = base_hit;
+          hitbuffer->linear[linear_i].langprob = langprob2;
+          ++linear_i;
+        }
+      }
+    }
+  }
+
+  // Update
+  hitbuffer->next_linear = linear_i;
+
+  // Add a dummy entry off the end, just to capture final offset
+  hitbuffer->linear[linear_i].offset =
+      hitbuffer->base[hitbuffer->next_base].offset;
+  hitbuffer->linear[linear_i].langprob = 0;
+}
+
+// Break linear array into chunks of ~20 quadgram hits or ~50 CJK unigram hits
+void ChunkAll(int letter_offset, bool score_cjk, ScoringHitBuffer* hitbuffer) {
+  int chunksize;
+  uint16 base_hit;
+  if (score_cjk) {
+    chunksize = kChunksizeUnis;
+    base_hit = UNIHIT;
+  } else {
+    chunksize = kChunksizeQuads;
+    base_hit = QUADHIT;
+  }
+
+  int linear_i = 0;
+  int linear_off_end = hitbuffer->next_linear;
+  int text_i = letter_offset;               // Next unseen text offset
+  int next_chunk_start = 0;
+  int bases_left = hitbuffer->next_base;
+  while (bases_left > 0) {
+    // Linearize one chunk
+    int base_len = chunksize;               // Default; may be changed below
+    if (bases_left < (chunksize + (chunksize >> 1))) {
+      // If within 1.5 chunks of the end, avoid runts by using it all
+      base_len = bases_left;
+    } else if (bases_left < (2 * chunksize)) {
+      // Avoid runts by splitting 1.5 to 2 chunks in half (about 3/4 each)
+      base_len = (bases_left + 1) >> 1;
+    }
+
+    hitbuffer->chunk_start[next_chunk_start] = linear_i;
+    hitbuffer->chunk_offset[next_chunk_start] = text_i;
+    ++next_chunk_start;
+
+    int base_count = 0;
+    while ((base_count < base_len) && (linear_i < linear_off_end)) {
+      if (hitbuffer->linear[linear_i].type == base_hit) {++base_count;}
+      ++linear_i;
+    }
+    text_i = hitbuffer->linear[linear_i].offset;   // Next unseen text offset
+    bases_left -= base_len;
+  }
+
+  // If no base hits at all, make a single dummy chunk
+  if (next_chunk_start == 0) {
+    hitbuffer->chunk_start[next_chunk_start] = 0;
+    hitbuffer->chunk_offset[next_chunk_start] = hitbuffer->linear[0].offset;
+    ++next_chunk_start;
+  }
+
+  // Remember the linear array start of dummy entry
+  hitbuffer->next_chunk_start = next_chunk_start;
+
+  // Add a dummy entry off the end, just to capture final linear subscr
+  hitbuffer->chunk_start[next_chunk_start] = hitbuffer->next_linear;
+  hitbuffer->chunk_offset[next_chunk_start] = text_i;
+}
+
+
+// Merge-sort the individual hit arrays, go indirect on the scoring subscripts,
+// break linear array into chunks.
+//
+// Input:
+//  hitbuffer base, delta, distinct arrays
+// Output:
+//  linear array
+//  chunk_start array
+//
+void LinearizeHitBuffer(int letter_offset,
+                        ScoringContext* scoringcontext,
+                        bool more_to_come, bool score_cjk,
+                        ScoringHitBuffer* hitbuffer) {
+  LinearizeAll(scoringcontext, score_cjk, hitbuffer);
+  ChunkAll(letter_offset, score_cjk, hitbuffer);
+}
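+
+// Worked example for ChunkAll() above, assuming a chunksize of about 20 base
+// hits (kChunksizeQuads): with 27 base hits left (< 30) the final chunk takes
+// all 27; with 35 left (< 40) the remainder is split 18 + 17, so no chunk ends
+// up much smaller than the others.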
+
+
+
+// The hitbuffer is in an awkward form -- three sets of base/delta/distinct
+// scores, each with an indirect subscript to one of six scoring tables, some
+// of which can yield two langprobs for six languages, others one langprob for
+// three languages. The only correlation between base/delta/distinct is their
+// offsets into the letters-only text buffer.
+//
+// SummaryBuffer needs to be built to linear, giving linear offset of start of
+// each chunk
+//
+// So we first do all the langprob lookups and merge-sort by offset to make
+// a single linear vector, building a side vector of chunk beginnings as we go.
+// The sharpening is simply moving the beginnings, scoring is a simple linear
+// sweep, etc.
+
+void ProcessHitBuffer(const LangSpan& scriptspan,
+                      int letter_offset,
+                      ScoringContext* scoringcontext,
+                      DocTote* doc_tote,
+                      ResultChunkVector* vec,
+                      bool more_to_come, bool score_cjk,
+                      ScoringHitBuffer* hitbuffer) {
+  if (scoringcontext->flags_cld2_verbose) {
+    fprintf(scoringcontext->debug_file, "Hitbuffer[) ");
+    DumpHitBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer);
+  }
+
+  LinearizeHitBuffer(letter_offset, scoringcontext, more_to_come, score_cjk,
+                     hitbuffer);
+
+  if (scoringcontext->flags_cld2_verbose) {
+    fprintf(scoringcontext->debug_file, "Linear[) ");
+    DumpLinearBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer);
+  }
+
+  SummaryBuffer summarybuffer;
+  summarybuffer.n = 0;
+  ChunkSpan last_cspan;
+  ScoreAllHits(scriptspan.text, scriptspan.ulscript,
+               more_to_come, score_cjk, hitbuffer,
+               scoringcontext,
+               &summarybuffer, &last_cspan);
+
+  if (scoringcontext->flags_cld2_verbose) {
+    DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer);
+  }
+
+  if (vec != NULL) {
+    // Sharpen boundaries of summarybuffer
+    // This is not a high-performance path
+    SharpenBoundaries(scriptspan.text, more_to_come, hitbuffer, scoringcontext,
+                      &summarybuffer);
+    // Show after the sharpening
+    // CLD2_Debug2(scriptspan.text, more_to_come, score_cjk,
+    //             hitbuffer, scoringcontext, &summarybuffer);
+
+    if (scoringcontext->flags_cld2_verbose) {
+      DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer);
+    }
+  }
+
+  SummaryBufferToDocTote(&summarybuffer, more_to_come, doc_tote);
+  SummaryBufferToVector(scoringcontext->scanner, scriptspan.text,
+                        &summarybuffer, more_to_come, vec);
+}
+
+void SpliceHitBuffer(ScoringHitBuffer* hitbuffer, int next_offset) {
+  // Splice hitbuffer and summarybuffer for next round. With big chunks and
+  // distinctive-word state carried across chunks, we might not need to do this.
+  hitbuffer->next_base = 0;
+  hitbuffer->next_delta = 0;
+  hitbuffer->next_distinct = 0;
+  hitbuffer->next_linear = 0;
+  hitbuffer->next_chunk_start = 0;
+  hitbuffer->lowest_offset = next_offset;
+}
+
+
+// Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating
+// scoringcontext
+void ScoreEntireScriptSpan(const LangSpan& scriptspan,
+                           ScoringContext* scoringcontext,
+                           DocTote* doc_tote,
+                           ResultChunkVector* vec) {
+  int bytes = scriptspan.text_bytes;
+  // Artificially set score to 1024 per 1KB, or 1 per byte
+  int score = bytes;
+  int reliability = 100;
+  // doc_tote uses full languages
+  Language one_one_lang = DefaultLanguage(scriptspan.ulscript);
+  doc_tote->Add(one_one_lang, bytes, score, reliability);
+
+  if (scoringcontext->flags_cld2_html) {
+    ChunkSummary chunksummary = {
+      1, 0,
+      one_one_lang, UNKNOWN_LANGUAGE, score, 1,
+      bytes, 0, scriptspan.ulscript, reliability, reliability
+    };
+    CLD2_Debug(scriptspan.text, 1, scriptspan.text_bytes,
+               false, false, NULL,
+               scoringcontext, NULL, &chunksummary);
+  }
+
+  // First byte is always a space
+  JustOneItemToVector(scoringcontext->scanner, scriptspan.text,
+                      one_one_lang, 1, bytes - 1, vec);
+
+  scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
+}
+
+// Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext
+void ScoreCJKScriptSpan(const LangSpan& scriptspan,
+                        ScoringContext* scoringcontext,
+                        DocTote* doc_tote,
+                        ResultChunkVector* vec) {
+  // Allocate three parallel arrays of scoring hits
+  ScoringHitBuffer* hitbuffer = new ScoringHitBuffer;
+  hitbuffer->init();
+  hitbuffer->ulscript = scriptspan.ulscript;
+
+  scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
+  scoringcontext->oldest_distinct_boost = 0;
+
+  // Incoming scriptspan has a single leading space at scriptspan.text[0]
+  // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3]
+
+  int letter_offset = 1;        // Skip initial space
+  hitbuffer->lowest_offset = letter_offset;
+  int letter_limit = scriptspan.text_bytes;
+  while (letter_offset < letter_limit) {
+    if (scoringcontext->flags_cld2_verbose) {
+      fprintf(scoringcontext->debug_file, "  ScoreCJKScriptSpan[%d,%d)<br>\n",
+              letter_offset, letter_limit);
+    }
+    //
+    // Fill up one hitbuffer, possibly splicing onto previous fragment
+    //
+    // NOTE: GetUniHits deals with close repeats
+    // NOTE: After last chunk there is always a hitbuffer entry with an offset
+    // just off the end of the text = next_offset.
+    int next_offset = GetUniHits(scriptspan.text, letter_offset, letter_limit,
+                                 scoringcontext, hitbuffer);
+    // NOTE: GetBiHitVectors deals with close repeats,
+    // does one hash and two lookups (delta and distinct) per word
+    GetBiHits(scriptspan.text, letter_offset, next_offset,
+              scoringcontext, hitbuffer);
+
+    //
+    // Score one hitbuffer in chunks to summarybuffer
+    //
+    bool more_to_come = next_offset < letter_limit;
+    bool score_cjk = true;
+    ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec,
+                     more_to_come, score_cjk, hitbuffer);
+    SpliceHitBuffer(hitbuffer, next_offset);
+
+    letter_offset = next_offset;
+  }
+
+  delete hitbuffer;
+  // Context across buffers is not connected yet
+  scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
+}
+
+
+
+// Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext
+// We have a scriptspan with all lowercase text in one script. Look up
+// quadgrams and octagrams, saving the hits in three parallel vectors.
+// Score from those vectors in chunks, toting each chunk to get a single
+// language, and combining into the overall document score. The hit vectors
+// in general are not big enough to handle an entire scriptspan, so
+// repeat until the entire scriptspan is scored.
+// Caller deals with minimizing number of runt scriptspans
+// This routine deals with minimizing number of runt chunks.
+//
+// Returns updated scoringcontext
+// Returns updated doc_tote
+// If vec != NULL, appends to that vector of ResultChunk's
+void ScoreQuadScriptSpan(const LangSpan& scriptspan,
+                         ScoringContext* scoringcontext,
+                         DocTote* doc_tote,
+                         ResultChunkVector* vec) {
+  // Allocate three parallel arrays of scoring hits
+  ScoringHitBuffer* hitbuffer = new ScoringHitBuffer;
+  hitbuffer->init();
+  hitbuffer->ulscript = scriptspan.ulscript;
+
+  scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
+  scoringcontext->oldest_distinct_boost = 0;
+
+  // Incoming scriptspan has a single leading space at scriptspan.text[0]
+  // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3]
+
+  int letter_offset = 1;        // Skip initial space
+  hitbuffer->lowest_offset = letter_offset;
+  int letter_limit = scriptspan.text_bytes;
+  while (letter_offset < letter_limit) {
+    //
+    // Fill up one hitbuffer, possibly splicing onto previous fragment
+    //
+    // NOTE: GetQuadHits deals with close repeats
+    // NOTE: After last chunk there is always a hitbuffer entry with an offset
+    // just off the end of the text = next_offset.
+    int next_offset = GetQuadHits(scriptspan.text, letter_offset, letter_limit,
+                                  scoringcontext, hitbuffer);
+    // If true, there is more text to process in this scriptspan
+    // NOTE: GetOctaHitVectors deals with close repeats,
+    // does one hash and two lookups (delta and distinct) per word
+    GetOctaHits(scriptspan.text, letter_offset, next_offset,
+                scoringcontext, hitbuffer);
+
+    //
+    // Score one hitbuffer in chunks to summarybuffer
+    //
+    bool more_to_come = next_offset < letter_limit;
+    bool score_cjk = false;
+    ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec,
+                     more_to_come, score_cjk, hitbuffer);
+    SpliceHitBuffer(hitbuffer, next_offset);
+
+    letter_offset = next_offset;
+  }
+
+  delete hitbuffer;
+}
+
+
+// Score one scriptspan into doc_tote and vec, updating scoringcontext
+// Inputs:
+//  One scriptspan of perhaps 40-60KB, all same script lower-case letters
+//  and single ASCII spaces. First character is a space to allow simple
+//  beginning-of-word detect. End of buffer has three spaces and NUL to
+//  allow easy scan-to-end-of-word.
+//  Scoring context of
+//   scoring tables
+//   flags
+//   running boosts
+// Outputs:
+//  Updated doc_tote giving overall languages and byte counts
+//  Optional updated chunk vector giving offset, length, language
+//
+// Caller initializes flags, boosts, doc_tote and vec.
+// Caller aggregates across multiple scriptspans
+// Caller calculates final document result
+// Caller deals with detecting and triggering suppression of repeated text.
+//
+// This top-level routine just chooses the recognition type and calls one of
+// the next-level-down routines.
+//
+void ScoreOneScriptSpan(const LangSpan& scriptspan,
+                        ScoringContext* scoringcontext,
+                        DocTote* doc_tote,
+                        ResultChunkVector* vec) {
+  if (scoringcontext->flags_cld2_verbose) {
+    fprintf(scoringcontext->debug_file, "<br>ScoreOneScriptSpan(%s,%d) ",
+            ULScriptCode(scriptspan.ulscript), scriptspan.text_bytes);
+    // Optionally print the chunk lowercase letters/marks text
+    string temp(&scriptspan.text[0], scriptspan.text_bytes);
+    fprintf(scoringcontext->debug_file, "'%s'",
+            GetHtmlEscapedText(temp).c_str());
+    fprintf(scoringcontext->debug_file, "<br>\n");
+  }
+  scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
+  scoringcontext->oldest_distinct_boost = 0;
+  ULScriptRType rtype = ULScriptRecognitionType(scriptspan.ulscript);
+  if (scoringcontext->flags_cld2_score_as_quads && (rtype != RTypeCJK)) {
+    rtype = RTypeMany;
+  }
+  switch (rtype) {
+  case RTypeNone:
+  case RTypeOne:
+    ScoreEntireScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
+    break;
+  case RTypeCJK:
+    ScoreCJKScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
+    break;
+  case RTypeMany:
+    ScoreQuadScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
+    break;
+  }
+}
+
+}  // End namespace CLD2