diff options
Diffstat (limited to 'media/sphinxbase/src/libsphinxbase/lm/lm3g_model.c')
-rw-r--r-- | media/sphinxbase/src/libsphinxbase/lm/lm3g_model.c | 258 |
1 files changed, 258 insertions, 0 deletions
diff --git a/media/sphinxbase/src/libsphinxbase/lm/lm3g_model.c b/media/sphinxbase/src/libsphinxbase/lm/lm3g_model.c new file mode 100644 index 000000000..e9943001e --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/lm/lm3g_model.c @@ -0,0 +1,258 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2007 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ +/* + * \file lm3g_model.c Core Sphinx 3-gram code used in + * DMP/DMP32/ARPA (for now) model code. + * + * Author: A cast of thousands, probably. + */ +#include <string.h> +#include <assert.h> +#include <limits.h> + +#include "sphinxbase/listelem_alloc.h" +#include "sphinxbase/ckd_alloc.h" +#include "sphinxbase/err.h" + +#include "lm3g_model.h" + +void +lm3g_tginfo_free(ngram_model_t *base, lm3g_model_t *lm3g) +{ + if (lm3g->tginfo == NULL) + return; + listelem_alloc_free(lm3g->le); + ckd_free(lm3g->tginfo); +} + +void +lm3g_tginfo_reset(ngram_model_t *base, lm3g_model_t *lm3g) +{ + if (lm3g->tginfo == NULL) + return; + listelem_alloc_free(lm3g->le); + memset(lm3g->tginfo, 0, base->n_counts[0] * sizeof(tginfo_t *)); + lm3g->le = listelem_alloc_init(sizeof(tginfo_t)); +} + +void +lm3g_apply_weights(ngram_model_t *base, + lm3g_model_t *lm3g, + float32 lw, float32 wip, float32 uw) +{ + int32 log_wip, log_uw, log_uniform_weight; + int i; + + /* Precalculate some log values we will like. */ + log_wip = logmath_log(base->lmath, wip); + log_uw = logmath_log(base->lmath, uw); + log_uniform_weight = logmath_log(base->lmath, 1.0 - uw); + + for (i = 0; i < base->n_counts[0]; ++i) { + int32 prob1, bo_wt, n_used; + + /* Backoff weights just get scaled by the lw. */ + bo_wt = (int32)(lm3g->unigrams[i].bo_wt1.l / base->lw); + /* Unscaling unigram probs is a bit more complicated, so punt + * it back to the general code. */ + prob1 = ngram_ng_prob(base, i, NULL, 0, &n_used); + /* Now compute the new scaled probabilities. */ + lm3g->unigrams[i].bo_wt1.l = (int32)(bo_wt * lw); + if (strcmp(base->word_str[i], "<s>") == 0) { /* FIXME: configurable start_sym */ + /* Apply language weight and WIP */ + lm3g->unigrams[i].prob1.l = (int32)(prob1 * lw) + log_wip; + } + else { + /* Interpolate unigram probability with uniform. */ + prob1 += log_uw; + prob1 = logmath_add(base->lmath, prob1, base->log_uniform + log_uniform_weight); + /* Apply language weight and WIP */ + lm3g->unigrams[i].prob1.l = (int32)(prob1 * lw) + log_wip; + } + } + + for (i = 0; i < lm3g->n_prob2; ++i) { + int32 prob2; + /* Can't just punt this back to general code since it is quantized. */ + prob2 = (int32)((lm3g->prob2[i].l - base->log_wip) / base->lw); + lm3g->prob2[i].l = (int32)(prob2 * lw) + log_wip; + } + + if (base->n > 2) { + for (i = 0; i < lm3g->n_bo_wt2; ++i) { + lm3g->bo_wt2[i].l = (int32)(lm3g->bo_wt2[i].l / base->lw * lw); + } + for (i = 0; i < lm3g->n_prob3; i++) { + int32 prob3; + /* Can't just punt this back to general code since it is quantized. */ + prob3 = (int32)((lm3g->prob3[i].l - base->log_wip) / base->lw); + lm3g->prob3[i].l = (int32)(prob3 * lw) + log_wip; + } + } + + /* Store updated values in the model. */ + base->log_wip = log_wip; + base->log_uw = log_uw; + base->log_uniform_weight = log_uniform_weight; + base->lw = lw; +} + +int32 +lm3g_add_ug(ngram_model_t *base, + lm3g_model_t *lm3g, int32 wid, int32 lweight) +{ + int32 score; + + /* This would be very bad if this happened! */ + assert(!NGRAM_IS_CLASSWID(wid)); + + /* Reallocate unigram array. */ + lm3g->unigrams = ckd_realloc(lm3g->unigrams, + sizeof(*lm3g->unigrams) * base->n_1g_alloc); + memset(lm3g->unigrams + base->n_counts[0], 0, + (base->n_1g_alloc - base->n_counts[0]) * sizeof(*lm3g->unigrams)); + /* Reallocate tginfo array. */ + lm3g->tginfo = ckd_realloc(lm3g->tginfo, + sizeof(*lm3g->tginfo) * base->n_1g_alloc); + memset(lm3g->tginfo + base->n_counts[0], 0, + (base->n_1g_alloc - base->n_counts[0]) * sizeof(*lm3g->tginfo)); + /* FIXME: we really ought to update base->log_uniform *and* + * renormalize all the other unigrams. This is really slow, so I + * will probably just provide a function to renormalize after + * adding unigrams, for anyone who really cares. */ + /* This could be simplified but then we couldn't do it in logmath */ + score = lweight + base->log_uniform + base->log_uw; + score = logmath_add(base->lmath, score, + base->log_uniform + base->log_uniform_weight); + lm3g->unigrams[wid].prob1.l = score; + /* This unigram by definition doesn't participate in any bigrams, + * so its backoff weight and bigram pointer are both undefined. */ + lm3g->unigrams[wid].bo_wt1.l = 0; + lm3g->unigrams[wid].bigrams = 0; + /* Finally, increase the unigram count */ + ++base->n_counts[0]; + /* FIXME: Note that this can actually be quite bogus due to the + * presence of class words. If wid falls outside the unigram + * count, increase it to compensate, at the cost of no longer + * really knowing how many unigrams we have :( */ + if (wid >= base->n_counts[0]) + base->n_counts[0] = wid + 1; + + return score; +} + +#define INITIAL_SORTED_ENTRIES MAX_UINT16 + +void +init_sorted_list(sorted_list_t * l) +{ + l->list = ckd_calloc(INITIAL_SORTED_ENTRIES, sizeof(sorted_entry_t)); + l->list[0].val.l = INT_MIN; + l->list[0].lower = 0; + l->list[0].higher = 0; + l->free = 1; + l->size = INITIAL_SORTED_ENTRIES; +} + +void +free_sorted_list(sorted_list_t * l) +{ + free(l->list); +} + +lmprob_t * +vals_in_sorted_list(sorted_list_t * l) +{ + lmprob_t *vals; + int32 i; + + vals = ckd_calloc(l->free, sizeof(lmprob_t)); + for (i = 0; i < l->free; i++) + vals[i] = l->list[i].val; + return (vals); +} + +int32 +sorted_id(sorted_list_t * l, int32 *val) +{ + int32 i = 0; + + for (;;) { + if (*val == l->list[i].val.l) + return (i); + if (*val < l->list[i].val.l) { + if (l->list[i].lower == 0) { + + if (l->free >= l->size) { + int newsize = l->size + INITIAL_SORTED_ENTRIES; + l->list = ckd_realloc(l->list, sizeof(sorted_entry_t) * newsize); + memset(l->list + l->size, + 0, INITIAL_SORTED_ENTRIES * sizeof(sorted_entry_t)); + l->size = newsize; + } + + l->list[i].lower = l->free; + (l->free)++; + i = l->list[i].lower; + l->list[i].val.l = *val; + return (i); + } + else + i = l->list[i].lower; + } + else { + if (l->list[i].higher == 0) { + + if (l->free >= l->size) { + int newsize = l->size + INITIAL_SORTED_ENTRIES; + l->list = ckd_realloc(l->list, sizeof(sorted_entry_t) * newsize); + memset(l->list + l->size, + 0, INITIAL_SORTED_ENTRIES * sizeof(sorted_entry_t)); + l->size = newsize; + } + + l->list[i].higher = l->free; + (l->free)++; + i = l->list[i].higher; + l->list[i].val.l = *val; + return (i); + } + else + i = l->list[i].higher; + } + } +} |