diff options
Diffstat (limited to 'media/sphinxbase/src/libsphinxbase/lm/lm3g_templates.c')
-rw-r--r-- | media/sphinxbase/src/libsphinxbase/lm/lm3g_templates.c | 560 |
1 files changed, 560 insertions, 0 deletions
diff --git a/media/sphinxbase/src/libsphinxbase/lm/lm3g_templates.c b/media/sphinxbase/src/libsphinxbase/lm/lm3g_templates.c new file mode 100644 index 000000000..080cfa8e6 --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/lm/lm3g_templates.c @@ -0,0 +1,560 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2007 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ +/* + * \file lm3g_templates.c Core Sphinx 3-gram code used in + * DMP/DMP32/ARPA (for now) model code. + */ + +#include <assert.h> + +/* Locate a specific bigram within a bigram list */ +#define BINARY_SEARCH_THRESH 16 +static int32 +find_bg(bigram_t * bg, int32 n, int32 w) +{ + int32 i, b, e; + + /* Binary search until segment size < threshold */ + b = 0; + e = n; + while (e - b > BINARY_SEARCH_THRESH) { + i = (b + e) >> 1; + if (bg[i].wid < w) + b = i + 1; + else if (bg[i].wid > w) + e = i; + else + return i; + } + + /* Linear search within narrowed segment */ + for (i = b; (i < e) && (bg[i].wid != w); i++); + return ((i < e) ? i : -1); +} + +static int32 +lm3g_bg_score(NGRAM_MODEL_TYPE *model, + int32 lw1, int32 lw2, int32 *n_used) +{ + int32 i, n, b, score; + bigram_t *bg; + + if (lw1 < 0 || model->base.n < 2) { + *n_used = 1; + return model->lm3g.unigrams[lw2].prob1.l; + } + + b = FIRST_BG(model, lw1); + n = FIRST_BG(model, lw1 + 1) - b; + bg = model->lm3g.bigrams + b; + + if ((i = find_bg(bg, n, lw2)) >= 0) { + /* Access mode = bigram */ + *n_used = 2; + score = model->lm3g.prob2[bg[i].prob2].l; + } + else { + /* Access mode = unigram */ + *n_used = 1; + score = model->lm3g.unigrams[lw1].bo_wt1.l + model->lm3g.unigrams[lw2].prob1.l; + } + + return (score); +} + +static void +load_tginfo(NGRAM_MODEL_TYPE *model, int32 lw1, int32 lw2) +{ + int32 i, n, b, t; + bigram_t *bg; + tginfo_t *tginfo; + + /* First allocate space for tg information for bg lw1,lw2 */ + tginfo = (tginfo_t *) listelem_malloc(model->lm3g.le); + tginfo->w1 = lw1; + tginfo->tg = NULL; + tginfo->next = model->lm3g.tginfo[lw2]; + model->lm3g.tginfo[lw2] = tginfo; + + /* Locate bigram lw1,lw2 */ + b = model->lm3g.unigrams[lw1].bigrams; + n = model->lm3g.unigrams[lw1 + 1].bigrams - b; + bg = model->lm3g.bigrams + b; + + if ((n > 0) && ((i = find_bg(bg, n, lw2)) >= 0)) { + tginfo->bowt = model->lm3g.bo_wt2[bg[i].bo_wt2].l; + + /* Find t = Absolute first trigram index for bigram lw1,lw2 */ + b += i; /* b = Absolute index of bigram lw1,lw2 on disk */ + t = FIRST_TG(model, b); + + tginfo->tg = model->lm3g.trigrams + t; + + /* Find #tg for bigram w1,w2 */ + tginfo->n_tg = FIRST_TG(model, b + 1) - t; + } + else { /* No bigram w1,w2 */ + tginfo->bowt = 0; + tginfo->n_tg = 0; + } +} + +/* Similar to find_bg */ +static int32 +find_tg(trigram_t * tg, int32 n, uint32 w) +{ + int32 i, b, e; + + b = 0; + e = n; + while (e - b > BINARY_SEARCH_THRESH) { + i = (b + e) >> 1; + if (tg[i].wid < w) + b = i + 1; + else if (tg[i].wid > w) + e = i; + else + return i; + } + + for (i = b; (i < e) && (tg[i].wid != w); i++); + return ((i < e) ? i : -1); +} + +static int32 +lm3g_tg_score(NGRAM_MODEL_TYPE *model, int32 lw1, + int32 lw2, int32 lw3, int32 *n_used) +{ + ngram_model_t *base = &model->base; + int32 i, n, score; + trigram_t *tg; + tginfo_t *tginfo, *prev_tginfo; + + if ((base->n < 3) || (lw1 < 0) || (lw2 < 0)) + return (lm3g_bg_score(model, lw2, lw3, n_used)); + + prev_tginfo = NULL; + for (tginfo = model->lm3g.tginfo[lw2]; tginfo; tginfo = tginfo->next) { + if (tginfo->w1 == lw1) + break; + prev_tginfo = tginfo; + } + + if (!tginfo) { + load_tginfo(model, lw1, lw2); + tginfo = model->lm3g.tginfo[lw2]; + } + else if (prev_tginfo) { + prev_tginfo->next = tginfo->next; + tginfo->next = model->lm3g.tginfo[lw2]; + model->lm3g.tginfo[lw2] = tginfo; + } + + tginfo->used = 1; + + /* Trigrams for w1,w2 now pointed to by tginfo */ + n = tginfo->n_tg; + tg = tginfo->tg; + if ((i = find_tg(tg, n, lw3)) >= 0) { + /* Access mode = trigram */ + *n_used = 3; + score = model->lm3g.prob3[tg[i].prob3].l; + } + else { + score = tginfo->bowt + lm3g_bg_score(model, lw2, lw3, n_used); + } + + return (score); +} + +static int32 +lm3g_template_score(ngram_model_t *base, int32 wid, + int32 *history, int32 n_hist, + int32 *n_used) +{ + NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base; + switch (n_hist) { + case 0: + /* Access mode: unigram */ + *n_used = 1; + return model->lm3g.unigrams[wid].prob1.l; + case 1: + return lm3g_bg_score(model, history[0], wid, n_used); + case 2: + default: + /* Anything greater than 2 is the same as a trigram for now. */ + return lm3g_tg_score(model, history[1], history[0], wid, n_used); + } +} + +static int32 +lm3g_template_raw_score(ngram_model_t *base, int32 wid, + int32 *history, int32 n_hist, + int32 *n_used) +{ + NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base; + int32 score; + + switch (n_hist) { + case 0: + /* Access mode: unigram */ + *n_used = 1; + /* Undo insertion penalty. */ + score = model->lm3g.unigrams[wid].prob1.l - base->log_wip; + /* Undo language weight. */ + score = (int32)(score / base->lw); + /* Undo unigram interpolation */ + if (strcmp(base->word_str[wid], "<s>") != 0) { /* FIXME: configurable start_sym */ + /* This operation is numerically unstable, so try to avoid it + * as possible */ + if (base->log_uniform + base->log_uniform_weight > logmath_get_zero(base->lmath)) { + score = logmath_log(base->lmath, + logmath_exp(base->lmath, score) + - logmath_exp(base->lmath, + base->log_uniform + base->log_uniform_weight)); + } + } + return score; + case 1: + score = lm3g_bg_score(model, history[0], wid, n_used); + break; + case 2: + default: + /* Anything greater than 2 is the same as a trigram for now. */ + score = lm3g_tg_score(model, history[1], history[0], wid, n_used); + break; + } + /* FIXME (maybe): This doesn't undo unigram weighting in backoff cases. */ + return (int32)((score - base->log_wip) / base->lw); +} + +static int32 +lm3g_template_add_ug(ngram_model_t *base, + int32 wid, int32 lweight) +{ + NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base; + return lm3g_add_ug(base, &model->lm3g, wid, lweight); +} + +static void +lm3g_template_flush(ngram_model_t *base) +{ + NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base; + lm3g_tginfo_reset(base, &model->lm3g); +} + +typedef struct lm3g_iter_s { + ngram_iter_t base; + unigram_t *ug; + bigram_t *bg; + trigram_t *tg; +} lm3g_iter_t; + +static ngram_iter_t * +lm3g_template_iter(ngram_model_t *base, int32 wid, + int32 *history, int32 n_hist) +{ + NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base; + lm3g_iter_t *itor = (lm3g_iter_t *)ckd_calloc(1, sizeof(*itor)); + + ngram_iter_init((ngram_iter_t *)itor, base, n_hist, FALSE); + + if (n_hist == 0) { + /* Unigram is the easiest. */ + itor->ug = model->lm3g.unigrams + wid; + return (ngram_iter_t *)itor; + } + else if (n_hist == 1) { + int32 i, n, b; + /* Find the bigram, as in bg_score above (duplicate code...) */ + itor->ug = model->lm3g.unigrams + history[0]; + b = FIRST_BG(model, history[0]); + n = FIRST_BG(model, history[0] + 1) - b; + itor->bg = model->lm3g.bigrams + b; + /* If no such bigram exists then fail. */ + if ((i = find_bg(itor->bg, n, wid)) < 0) { + ngram_iter_free((ngram_iter_t *)itor); + return NULL; + } + itor->bg += i; + return (ngram_iter_t *)itor; + } + else if (n_hist == 2) { + int32 i, n; + tginfo_t *tginfo, *prev_tginfo; + /* Find the trigram, as in tg_score above (duplicate code...) */ + itor->ug = model->lm3g.unigrams + history[1]; + prev_tginfo = NULL; + for (tginfo = model->lm3g.tginfo[history[0]]; + tginfo; tginfo = tginfo->next) { + if (tginfo->w1 == history[1]) + break; + prev_tginfo = tginfo; + } + + if (!tginfo) { + load_tginfo(model, history[1], history[0]); + tginfo = model->lm3g.tginfo[history[0]]; + } + else if (prev_tginfo) { + prev_tginfo->next = tginfo->next; + tginfo->next = model->lm3g.tginfo[history[0]]; + model->lm3g.tginfo[history[0]] = tginfo; + } + + tginfo->used = 1; + + /* Trigrams for w1,w2 now pointed to by tginfo */ + n = tginfo->n_tg; + itor->tg = tginfo->tg; + if ((i = find_tg(itor->tg, n, wid)) >= 0) { + itor->tg += i; + /* Now advance the bigram pointer accordingly. FIXME: + * Note that we actually already found the relevant bigram + * in load_tginfo. */ + itor->bg = model->lm3g.bigrams; + while (FIRST_TG(model, (itor->bg - model->lm3g.bigrams + 1)) + <= (itor->tg - model->lm3g.trigrams)) + ++itor->bg; + return (ngram_iter_t *)itor; + } + else { + ngram_iter_free((ngram_iter_t *)itor); + return (ngram_iter_t *)NULL; + } + } + else { + /* Should not happen. */ + assert(n_hist == 0); /* Guaranteed to fail. */ + ngram_iter_free((ngram_iter_t *)itor); + return NULL; + } +} + +static ngram_iter_t * +lm3g_template_mgrams(ngram_model_t *base, int m) +{ + NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base; + lm3g_iter_t *itor = (lm3g_iter_t *)ckd_calloc(1, sizeof(*itor)); + ngram_iter_init((ngram_iter_t *)itor, base, m, FALSE); + + itor->ug = model->lm3g.unigrams; + itor->bg = model->lm3g.bigrams; + itor->tg = model->lm3g.trigrams; + + /* Advance bigram pointer to match first trigram. */ + if (m > 1 && base->n_counts[1] > 1) { + while (FIRST_TG(model, (itor->bg - model->lm3g.bigrams + 1)) + <= (itor->tg - model->lm3g.trigrams)) + ++itor->bg; + } + + /* Advance unigram pointer to match first bigram. */ + if (m > 0 && base->n_counts[0] > 1) { + while (itor->ug[1].bigrams <= (itor->bg - model->lm3g.bigrams)) + ++itor->ug; + } + + return (ngram_iter_t *)itor; +} + +static ngram_iter_t * +lm3g_template_successors(ngram_iter_t *bitor) +{ + NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)bitor->model; + lm3g_iter_t *from = (lm3g_iter_t *)bitor; + lm3g_iter_t *itor = (lm3g_iter_t *)ckd_calloc(1, sizeof(*itor)); + + itor->ug = from->ug; + switch (bitor->m) { + case 0: + /* Next itor bigrams is the same as this itor bigram or + itor bigrams is more than total count. This means no successors */ + if (((itor->ug + 1) - model->lm3g.unigrams < bitor->model->n_counts[0] && + itor->ug->bigrams == (itor->ug + 1)->bigrams) || + itor->ug->bigrams == bitor->model->n_counts[1]) + goto done; + + /* Start iterating from first bigram successor of from->ug. */ + itor->bg = model->lm3g.bigrams + itor->ug->bigrams; + break; + case 1: + itor->bg = from->bg; + + /* This indicates no successors */ + if (((itor->bg + 1) - model->lm3g.bigrams < bitor->model->n_counts[1] && + FIRST_TG (model, itor->bg - model->lm3g.bigrams) == + FIRST_TG (model, (itor->bg + 1) - model->lm3g.bigrams)) || + FIRST_TG (model, itor->bg - model->lm3g.bigrams) == bitor->model->n_counts[2]) + goto done; + + /* Start iterating from first trigram successor of from->bg. */ + itor->tg = (model->lm3g.trigrams + + FIRST_TG(model, (itor->bg - model->lm3g.bigrams))); +#if 0 + printf("%s %s => %d (%s)\n", + model->base.word_str[itor->ug - model->lm3g.unigrams], + model->base.word_str[itor->bg->wid], + FIRST_TG(model, (itor->bg - model->lm3g.bigrams)), + model->base.word_str[itor->tg->wid]); +#endif + break; + case 2: + default: + /* All invalid! */ + goto done; + } + + ngram_iter_init((ngram_iter_t *)itor, bitor->model, bitor->m + 1, TRUE); + return (ngram_iter_t *)itor; + done: + ckd_free(itor); + return NULL; +} + +static int32 const * +lm3g_template_iter_get(ngram_iter_t *base, + int32 *out_score, int32 *out_bowt) +{ + NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base->model; + lm3g_iter_t *itor = (lm3g_iter_t *)base; + + base->wids[0] = itor->ug - model->lm3g.unigrams; + if (itor->bg) base->wids[1] = itor->bg->wid; + if (itor->tg) base->wids[2] = itor->tg->wid; +#if 0 + printf("itor_get: %d %d %d\n", base->wids[0], base->wids[1], base->wids[2]); +#endif + + switch (base->m) { + case 0: + *out_score = itor->ug->prob1.l; + *out_bowt = itor->ug->bo_wt1.l; + break; + case 1: + *out_score = model->lm3g.prob2[itor->bg->prob2].l; + if (model->lm3g.bo_wt2) + *out_bowt = model->lm3g.bo_wt2[itor->bg->bo_wt2].l; + else + *out_bowt = 0; + break; + case 2: + *out_score = model->lm3g.prob3[itor->tg->prob3].l; + *out_bowt = 0; + break; + default: /* Should not happen. */ + return NULL; + } + return base->wids; +} + +static ngram_iter_t * +lm3g_template_iter_next(ngram_iter_t *base) +{ + NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base->model; + lm3g_iter_t *itor = (lm3g_iter_t *)base; + + switch (base->m) { + case 0: + ++itor->ug; + /* Check for end condition. */ + if (itor->ug - model->lm3g.unigrams >= base->model->n_counts[0]) + goto done; + break; + case 1: + ++itor->bg; + /* Check for end condition. */ + if (itor->bg - model->lm3g.bigrams >= base->model->n_counts[1]) + goto done; + /* Advance unigram pointer if necessary in order to get one + * that points to this bigram. */ + while (itor->bg - model->lm3g.bigrams >= itor->ug[1].bigrams) { + /* Stop if this is a successor iterator, since we don't + * want a new unigram. */ + if (base->successor) + goto done; + ++itor->ug; + if (itor->ug == model->lm3g.unigrams + base->model->n_counts[0]) { + E_ERROR("Bigram %d has no valid unigram parent\n", + itor->bg - model->lm3g.bigrams); + goto done; + } + } + break; + case 2: + ++itor->tg; + /* Check for end condition. */ + if (itor->tg - model->lm3g.trigrams >= base->model->n_counts[2]) + goto done; + /* Advance bigram pointer if necessary. */ + while (itor->tg - model->lm3g.trigrams >= + FIRST_TG(model, (itor->bg - model->lm3g.bigrams + 1))) { + if (base->successor) + goto done; + ++itor->bg; + if (itor->bg == model->lm3g.bigrams + base->model->n_counts[1]) { + E_ERROR("Trigram %d has no valid bigram parent\n", + itor->tg - model->lm3g.trigrams); + + goto done; + } + } + /* Advance unigram pointer if necessary. */ + while (itor->bg - model->lm3g.bigrams >= itor->ug[1].bigrams) { + ++itor->ug; + if (itor->ug == model->lm3g.unigrams + base->model->n_counts[0]) { + E_ERROR("Trigram %d has no valid unigram parent\n", + itor->tg - model->lm3g.trigrams); + goto done; + } + } + break; + default: /* Should not happen. */ + goto done; + } + + return (ngram_iter_t *)itor; +done: + ngram_iter_free(base); + return NULL; +} + +static void +lm3g_template_iter_free(ngram_iter_t *base) +{ + ckd_free(base); +} |