diff options
Diffstat (limited to 'media/sphinxbase/src/libsphinxbase/lm/ngram_model_set.c')
-rw-r--r-- | media/sphinxbase/src/libsphinxbase/lm/ngram_model_set.c | 870 |
1 files changed, 0 insertions, 870 deletions
diff --git a/media/sphinxbase/src/libsphinxbase/lm/ngram_model_set.c b/media/sphinxbase/src/libsphinxbase/lm/ngram_model_set.c deleted file mode 100644 index 50b7557ae..000000000 --- a/media/sphinxbase/src/libsphinxbase/lm/ngram_model_set.c +++ /dev/null @@ -1,870 +0,0 @@ -/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ -/* ==================================================================== - * Copyright (c) 2008 Carnegie Mellon University. All rights - * reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * This work was supported in part by funding from the Defense Advanced - * Research Projects Agency and the National Science Foundation of the - * United States of America, and the CMU Sphinx Speech Consortium. - * - * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND - * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY - * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * ==================================================================== - * - */ -/** - * @file ngram_model_set.c Set of language models. - * @author David Huggins-Daines <dhuggins@cs.cmu.edu> - */ - -#include <string.h> -#include <stdlib.h> - -#include "sphinxbase/err.h" -#include "sphinxbase/ckd_alloc.h" -#include "sphinxbase/strfuncs.h" -#include "sphinxbase/filename.h" - -#include "ngram_model_set.h" - -static ngram_funcs_t ngram_model_set_funcs; - -static int -my_compare(const void *a, const void *b) -{ - /* Make sure <UNK> floats to the beginning. */ - if (strcmp(*(char * const *)a, "<UNK>") == 0) - return -1; - else if (strcmp(*(char * const *)b, "<UNK>") == 0) - return 1; - else - return strcmp(*(char * const *)a, *(char * const *)b); -} - -static void -build_widmap(ngram_model_t *base, logmath_t *lmath, int32 n) -{ - ngram_model_set_t *set = (ngram_model_set_t *)base; - ngram_model_t **models = set->lms; - hash_table_t *vocab; - glist_t hlist; - gnode_t *gn; - int32 i; - - /* Construct a merged vocabulary and a set of word-ID mappings. */ - vocab = hash_table_new(models[0]->n_words, FALSE); - /* Create the set of merged words. */ - for (i = 0; i < set->n_models; ++i) { - int32 j; - for (j = 0; j < models[i]->n_words; ++j) { - /* Ignore collisions. */ - (void)hash_table_enter_int32(vocab, models[i]->word_str[j], j); - } - } - /* Create the array of words, then sort it. */ - if (hash_table_lookup(vocab, "<UNK>", NULL) != 0) - (void)hash_table_enter_int32(vocab, "<UNK>", 0); - /* Now we know the number of unigrams, initialize the base model. */ - ngram_model_init(base, &ngram_model_set_funcs, lmath, n, hash_table_inuse(vocab)); - base->writable = FALSE; /* We will reuse the pointers from the submodels. */ - i = 0; - hlist = hash_table_tolist(vocab, NULL); - for (gn = hlist; gn; gn = gnode_next(gn)) { - hash_entry_t *ent = gnode_ptr(gn); - base->word_str[i++] = (char *)ent->key; - } - glist_free(hlist); - qsort(base->word_str, base->n_words, sizeof(*base->word_str), my_compare); - - /* Now create the word ID mappings. */ - if (set->widmap) - ckd_free_2d((void **)set->widmap); - set->widmap = (int32 **) ckd_calloc_2d(base->n_words, set->n_models, - sizeof(**set->widmap)); - for (i = 0; i < base->n_words; ++i) { - int32 j; - /* Also create the master wid mapping. */ - (void)hash_table_enter_int32(base->wid, base->word_str[i], i); - /* printf("%s: %d => ", base->word_str[i], i); */ - for (j = 0; j < set->n_models; ++j) { - set->widmap[i][j] = ngram_wid(models[j], base->word_str[i]); - /* printf("%d ", set->widmap[i][j]); */ - } - /* printf("\n"); */ - } - hash_table_free(vocab); -} - -ngram_model_t * -ngram_model_set_init(cmd_ln_t *config, - ngram_model_t **models, - char **names, - const float32 *weights, - int32 n_models) -{ - ngram_model_set_t *model; - ngram_model_t *base; - logmath_t *lmath; - int32 i, n; - - if (n_models == 0) /* WTF */ - return NULL; - - /* Do consistency checking on the models. They must all use the - * same logbase and shift. */ - lmath = models[0]->lmath; - for (i = 1; i < n_models; ++i) { - if (logmath_get_base(models[i]->lmath) != logmath_get_base(lmath) - || logmath_get_shift(models[i]->lmath) != logmath_get_shift(lmath)) { - E_ERROR("Log-math parameters don't match, will not create LM set\n"); - return NULL; - } - } - - /* Allocate the combined model, initialize it. */ - model = ckd_calloc(1, sizeof(*model)); - base = &model->base; - model->n_models = n_models; - model->lms = ckd_calloc(n_models, sizeof(*model->lms)); - model->names = ckd_calloc(n_models, sizeof(*model->names)); - /* Initialize weights to a uniform distribution */ - model->lweights = ckd_calloc(n_models, sizeof(*model->lweights)); - { - int32 uniform = logmath_log(lmath, 1.0/n_models); - for (i = 0; i < n_models; ++i) - model->lweights[i] = uniform; - } - /* Default to interpolate if weights were given. */ - if (weights) - model->cur = -1; - - n = 0; - for (i = 0; i < n_models; ++i) { - model->lms[i] = ngram_model_retain(models[i]); - model->names[i] = ckd_salloc(names[i]); - if (weights) - model->lweights[i] = logmath_log(lmath, weights[i]); - /* N is the maximum of all merged models. */ - if (models[i]->n > n) - n = models[i]->n; - } - /* Allocate the history mapping table. */ - model->maphist = ckd_calloc(n - 1, sizeof(*model->maphist)); - - /* Now build the word-ID mapping and merged vocabulary. */ - build_widmap(base, lmath, n); - return base; -} - -ngram_model_t * -ngram_model_set_read(cmd_ln_t *config, - const char *lmctlfile, - logmath_t *lmath) -{ - FILE *ctlfp; - glist_t lms = NULL; - glist_t lmnames = NULL; - __BIGSTACKVARIABLE__ char str[1024]; - ngram_model_t *set = NULL; - hash_table_t *classes; - char *basedir, *c; - - /* Read all the class definition files to accumulate a mapping of - * classnames to definitions. */ - classes = hash_table_new(0, FALSE); - if ((ctlfp = fopen(lmctlfile, "r")) == NULL) { - E_ERROR_SYSTEM("Failed to open %s", lmctlfile); - return NULL; - } - - /* Try to find the base directory to append to relative paths in - * the lmctl file. */ - if ((c = strrchr(lmctlfile, '/')) || (c = strrchr(lmctlfile, '\\'))) { - /* Include the trailing slash. */ - basedir = ckd_calloc(c - lmctlfile + 2, 1); - memcpy(basedir, lmctlfile, c - lmctlfile + 1); - } - else { - basedir = NULL; - } - E_INFO("Reading LM control file '%s'\n", lmctlfile); - if (basedir) - E_INFO("Will prepend '%s' to unqualified paths\n", basedir); - - if (fscanf(ctlfp, "%1023s", str) == 1) { - if (strcmp(str, "{") == 0) { - /* Load LMclass files */ - while ((fscanf(ctlfp, "%1023s", str) == 1) - && (strcmp(str, "}") != 0)) { - char *deffile; - if (basedir && !path_is_absolute(str)) - deffile = string_join(basedir, str, NULL); - else - deffile = ckd_salloc(str); - E_INFO("Reading classdef from '%s'\n", deffile); - if (read_classdef_file(classes, deffile) < 0) { - ckd_free(deffile); - goto error_out; - } - ckd_free(deffile); - } - - if (strcmp(str, "}") != 0) { - E_ERROR("Unexpected EOF in %s\n", lmctlfile); - goto error_out; - } - - /* This might be the first LM name. */ - if (fscanf(ctlfp, "%1023s", str) != 1) - str[0] = '\0'; - } - } - else - str[0] = '\0'; - - /* Read in one LM at a time and add classes to them as necessary. */ - while (str[0] != '\0') { - char *lmfile; - ngram_model_t *lm; - - if (basedir && str[0] != '/' && str[0] != '\\') - lmfile = string_join(basedir, str, NULL); - else - lmfile = ckd_salloc(str); - E_INFO("Reading lm from '%s'\n", lmfile); - lm = ngram_model_read(config, lmfile, NGRAM_AUTO, lmath); - if (lm == NULL) { - ckd_free(lmfile); - goto error_out; - } - if (fscanf(ctlfp, "%1023s", str) != 1) { - E_ERROR("LMname missing after LMFileName '%s'\n", lmfile); - ckd_free(lmfile); - goto error_out; - } - ckd_free(lmfile); - lms = glist_add_ptr(lms, lm); - lmnames = glist_add_ptr(lmnames, ckd_salloc(str)); - - if (fscanf(ctlfp, "%1023s", str) == 1) { - if (strcmp(str, "{") == 0) { - /* LM uses classes; read their names */ - while ((fscanf(ctlfp, "%1023s", str) == 1) && - (strcmp(str, "}") != 0)) { - void *val; - classdef_t *classdef; - - if (hash_table_lookup(classes, str, &val) == -1) { - E_ERROR("Unknown class %s in control file\n", str); - goto error_out; - } - classdef = val; - if (ngram_model_add_class(lm, str, 1.0, - classdef->words, classdef->weights, - classdef->n_words) < 0) { - goto error_out; - } - E_INFO("Added class %s containing %d words\n", - str, classdef->n_words); - } - if (strcmp(str, "}") != 0) { - E_ERROR("Unexpected EOF in %s\n", lmctlfile); - goto error_out; - } - if (fscanf(ctlfp, "%1023s", str) != 1) - str[0] = '\0'; - } - } - else - str[0] = '\0'; - } - fclose(ctlfp); - - /* Now construct arrays out of lms and lmnames, and build an - * ngram_model_set. */ - lms = glist_reverse(lms); - lmnames = glist_reverse(lmnames); - { - int32 n_models; - ngram_model_t **lm_array; - char **name_array; - gnode_t *lm_node, *name_node; - int32 i; - - n_models = glist_count(lms); - lm_array = ckd_calloc(n_models, sizeof(*lm_array)); - name_array = ckd_calloc(n_models, sizeof(*name_array)); - lm_node = lms; - name_node = lmnames; - for (i = 0; i < n_models; ++i) { - lm_array[i] = gnode_ptr(lm_node); - name_array[i] = gnode_ptr(name_node); - lm_node = gnode_next(lm_node); - name_node = gnode_next(name_node); - } - set = ngram_model_set_init(config, lm_array, name_array, - NULL, n_models); - ckd_free(lm_array); - ckd_free(name_array); - } -error_out: - { - gnode_t *gn; - glist_t hlist; - - if (set == NULL) { - for (gn = lms; gn; gn = gnode_next(gn)) { - ngram_model_free(gnode_ptr(gn)); - } - } - glist_free(lms); - for (gn = lmnames; gn; gn = gnode_next(gn)) { - ckd_free(gnode_ptr(gn)); - } - glist_free(lmnames); - hlist = hash_table_tolist(classes, NULL); - for (gn = hlist; gn; gn = gnode_next(gn)) { - hash_entry_t *he = gnode_ptr(gn); - ckd_free((char *)he->key); - classdef_free(he->val); - } - glist_free(hlist); - hash_table_free(classes); - ckd_free(basedir); - } - return set; -} - -int32 -ngram_model_set_count(ngram_model_t *base) -{ - ngram_model_set_t *set = (ngram_model_set_t *)base; - return set->n_models; -} - -ngram_model_set_iter_t * -ngram_model_set_iter(ngram_model_t *base) -{ - ngram_model_set_t *set = (ngram_model_set_t *)base; - ngram_model_set_iter_t *itor; - - if (set == NULL || set->n_models == 0) - return NULL; - itor = ckd_calloc(1, sizeof(*itor)); - itor->set = set; - return itor; -} - -ngram_model_set_iter_t * -ngram_model_set_iter_next(ngram_model_set_iter_t *itor) -{ - if (++itor->cur == itor->set->n_models) { - ngram_model_set_iter_free(itor); - return NULL; - } - return itor; -} - -void -ngram_model_set_iter_free(ngram_model_set_iter_t *itor) -{ - ckd_free(itor); -} - -ngram_model_t * -ngram_model_set_iter_model(ngram_model_set_iter_t *itor, - char const **lmname) -{ - if (lmname) *lmname = itor->set->names[itor->cur]; - return itor->set->lms[itor->cur]; -} - -ngram_model_t * -ngram_model_set_lookup(ngram_model_t *base, - const char *name) -{ - ngram_model_set_t *set = (ngram_model_set_t *)base; - int32 i; - - if (name == NULL) { - if (set->cur == -1) - return NULL; - else - return set->lms[set->cur]; - } - - /* There probably won't be very many submodels. */ - for (i = 0; i < set->n_models; ++i) - if (0 == strcmp(set->names[i], name)) - break; - if (i == set->n_models) - return NULL; - return set->lms[i]; -} - -ngram_model_t * -ngram_model_set_select(ngram_model_t *base, - const char *name) -{ - ngram_model_set_t *set = (ngram_model_set_t *)base; - int32 i; - - /* There probably won't be very many submodels. */ - for (i = 0; i < set->n_models; ++i) - if (0 == strcmp(set->names[i], name)) - break; - if (i == set->n_models) - return NULL; - set->cur = i; - return set->lms[set->cur]; -} - -const char * -ngram_model_set_current(ngram_model_t *base) -{ - ngram_model_set_t *set = (ngram_model_set_t *)base; - - if (set->cur == -1) - return NULL; - else - return set->names[set->cur]; -} - -int32 -ngram_model_set_current_wid(ngram_model_t *base, - int32 set_wid) -{ - ngram_model_set_t *set = (ngram_model_set_t *)base; - - if (set->cur == -1 || set_wid >= base->n_words) - return NGRAM_INVALID_WID; - else - return set->widmap[set_wid][set->cur]; -} - -int32 -ngram_model_set_known_wid(ngram_model_t *base, - int32 set_wid) -{ - ngram_model_set_t *set = (ngram_model_set_t *)base; - - if (set_wid >= base->n_words) - return FALSE; - else if (set->cur == -1) { - int32 i; - for (i = 0; i < set->n_models; ++i) { - if (set->widmap[set_wid][i] != ngram_unknown_wid(set->lms[i])) - return TRUE; - } - return FALSE; - } - else - return (set->widmap[set_wid][set->cur] - != ngram_unknown_wid(set->lms[set->cur])); -} - -ngram_model_t * -ngram_model_set_interp(ngram_model_t *base, - const char **names, - const float32 *weights) -{ - ngram_model_set_t *set = (ngram_model_set_t *)base; - - /* If we have a set of weights here, then set them. */ - if (names && weights) { - int32 i, j; - - /* We hope there aren't many models. */ - for (i = 0; i < set->n_models; ++i) { - for (j = 0; j < set->n_models; ++j) - if (0 == strcmp(names[i], set->names[j])) - break; - if (j == set->n_models) { - E_ERROR("Unknown LM name %s\n", names[i]); - return NULL; - } - set->lweights[j] = logmath_log(base->lmath, weights[i]); - } - } - else if (weights) { - memcpy(set->lweights, weights, set->n_models * sizeof(*set->lweights)); - } - /* Otherwise just enable existing weights. */ - set->cur = -1; - return base; -} - -ngram_model_t * -ngram_model_set_add(ngram_model_t *base, - ngram_model_t *model, - const char *name, - float32 weight, - int reuse_widmap) - -{ - ngram_model_set_t *set = (ngram_model_set_t *)base; - float32 fprob; - int32 scale, i; - - /* Add it to the array of lms. */ - ++set->n_models; - set->lms = ckd_realloc(set->lms, set->n_models * sizeof(*set->lms)); - set->lms[set->n_models - 1] = model; - set->names = ckd_realloc(set->names, set->n_models * sizeof(*set->names)); - set->names[set->n_models - 1] = ckd_salloc(name); - /* Expand the history mapping table if necessary. */ - if (model->n > base->n) { - base->n = model->n; - set->maphist = ckd_realloc(set->maphist, - (model->n - 1) * sizeof(*set->maphist)); - } - - /* Renormalize the interpolation weights. */ - fprob = weight * 1.0 / set->n_models; - set->lweights = ckd_realloc(set->lweights, - set->n_models * sizeof(*set->lweights)); - set->lweights[set->n_models - 1] = logmath_log(base->lmath, fprob); - /* Now normalize everything else to fit it in. This is - * accomplished by simply scaling all the other probabilities - * by (1-fprob). */ - scale = logmath_log(base->lmath, 1.0 - fprob); - for (i = 0; i < set->n_models - 1; ++i) - set->lweights[i] += scale; - - /* Reuse the old word ID mapping if requested. */ - if (reuse_widmap) { - int32 **new_widmap; - - /* Tack another column onto the widmap array. */ - new_widmap = (int32 **)ckd_calloc_2d(base->n_words, set->n_models, - sizeof (**new_widmap)); - for (i = 0; i < base->n_words; ++i) { - /* Copy all the existing mappings. */ - memcpy(new_widmap[i], set->widmap[i], - (set->n_models - 1) * sizeof(**new_widmap)); - /* Create the new mapping. */ - new_widmap[i][set->n_models-1] = ngram_wid(model, base->word_str[i]); - } - ckd_free_2d((void **)set->widmap); - set->widmap = new_widmap; - } - else { - build_widmap(base, base->lmath, base->n); - } - return model; -} - -ngram_model_t * -ngram_model_set_remove(ngram_model_t *base, - const char *name, - int reuse_widmap) -{ - ngram_model_set_t *set = (ngram_model_set_t *)base; - ngram_model_t *submodel; - int32 lmidx, scale, n, i; - float32 fprob; - - for (lmidx = 0; lmidx < set->n_models; ++lmidx) - if (0 == strcmp(name, set->names[lmidx])) - break; - if (lmidx == set->n_models) - return NULL; - submodel = set->lms[lmidx]; - - /* Renormalize the interpolation weights by scaling them by - * 1/(1-fprob) */ - fprob = logmath_exp(base->lmath, set->lweights[lmidx]); - scale = logmath_log(base->lmath, 1.0 - fprob); - - /* Remove it from the array of lms, renormalize remaining weights, - * and recalcluate n. */ - --set->n_models; - n = 0; - ckd_free(set->names[lmidx]); - set->names[lmidx] = NULL; - for (i = 0; i < set->n_models; ++i) { - if (i >= lmidx) { - set->lms[i] = set->lms[i+1]; - set->names[i] = set->names[i+1]; - set->lweights[i] = set->lweights[i+1]; - } - set->lweights[i] -= scale; - if (set->lms[i]->n > n) - n = set->lms[i]->n; - } - /* There's no need to shrink these arrays. */ - set->lms[set->n_models] = NULL; - set->lweights[set->n_models] = base->log_zero; - /* No need to shrink maphist either. */ - - /* Reuse the existing word ID mapping if requested. */ - if (reuse_widmap) { - /* Just go through and shrink each row. */ - for (i = 0; i < base->n_words; ++i) { - memmove(set->widmap[i] + lmidx, set->widmap[i] + lmidx + 1, - (set->n_models - lmidx) * sizeof(**set->widmap)); - } - } - else { - build_widmap(base, base->lmath, n); - } - return submodel; -} - -void -ngram_model_set_map_words(ngram_model_t *base, - const char **words, - int32 n_words) -{ - ngram_model_set_t *set = (ngram_model_set_t *)base; - int32 i; - - /* Recreate the word mapping. */ - if (base->writable) { - for (i = 0; i < base->n_words; ++i) { - ckd_free(base->word_str[i]); - } - } - ckd_free(base->word_str); - ckd_free_2d((void **)set->widmap); - base->writable = TRUE; - base->n_words = base->n_1g_alloc = n_words; - base->word_str = ckd_calloc(n_words, sizeof(*base->word_str)); - set->widmap = (int32 **)ckd_calloc_2d(n_words, set->n_models, sizeof(**set->widmap)); - hash_table_empty(base->wid); - for (i = 0; i < n_words; ++i) { - int32 j; - base->word_str[i] = ckd_salloc(words[i]); - (void)hash_table_enter_int32(base->wid, base->word_str[i], i); - for (j = 0; j < set->n_models; ++j) { - set->widmap[i][j] = ngram_wid(set->lms[j], base->word_str[i]); - } - } -} - -static int -ngram_model_set_apply_weights(ngram_model_t *base, float32 lw, - float32 wip, float32 uw) -{ - ngram_model_set_t *set = (ngram_model_set_t *)base; - int32 i; - - /* Apply weights to each sub-model. */ - for (i = 0; i < set->n_models; ++i) - ngram_model_apply_weights(set->lms[i], lw, wip, uw); - return 0; -} - -static int32 -ngram_model_set_score(ngram_model_t *base, int32 wid, - int32 *history, int32 n_hist, - int32 *n_used) -{ - ngram_model_set_t *set = (ngram_model_set_t *)base; - int32 mapwid; - int32 score; - int32 i; - - /* Truncate the history. */ - if (n_hist > base->n - 1) - n_hist = base->n - 1; - - /* Interpolate if there is no current. */ - if (set->cur == -1) { - score = base->log_zero; - for (i = 0; i < set->n_models; ++i) { - int32 j; - /* Map word and history IDs for each model. */ - mapwid = set->widmap[wid][i]; - for (j = 0; j < n_hist; ++j) { - if (history[j] == NGRAM_INVALID_WID) - set->maphist[j] = NGRAM_INVALID_WID; - else - set->maphist[j] = set->widmap[history[j]][i]; - } - score = logmath_add(base->lmath, score, - set->lweights[i] + - ngram_ng_score(set->lms[i], - mapwid, set->maphist, n_hist, n_used)); - } - } - else { - int32 j; - /* Map word and history IDs (FIXME: do this in a function?) */ - mapwid = set->widmap[wid][set->cur]; - for (j = 0; j < n_hist; ++j) { - if (history[j] == NGRAM_INVALID_WID) - set->maphist[j] = NGRAM_INVALID_WID; - else - set->maphist[j] = set->widmap[history[j]][set->cur]; - } - score = ngram_ng_score(set->lms[set->cur], - mapwid, set->maphist, n_hist, n_used); - } - - return score; -} - -static int32 -ngram_model_set_raw_score(ngram_model_t *base, int32 wid, - int32 *history, int32 n_hist, - int32 *n_used) -{ - ngram_model_set_t *set = (ngram_model_set_t *)base; - int32 mapwid; - int32 score; - int32 i; - - /* Truncate the history. */ - if (n_hist > base->n - 1) - n_hist = base->n - 1; - - /* Interpolate if there is no current. */ - if (set->cur == -1) { - score = base->log_zero; - for (i = 0; i < set->n_models; ++i) { - int32 j; - /* Map word and history IDs for each model. */ - mapwid = set->widmap[wid][i]; - for (j = 0; j < n_hist; ++j) { - if (history[j] == NGRAM_INVALID_WID) - set->maphist[j] = NGRAM_INVALID_WID; - else - set->maphist[j] = set->widmap[history[j]][i]; - } - score = logmath_add(base->lmath, score, - set->lweights[i] + - ngram_ng_prob(set->lms[i], - mapwid, set->maphist, n_hist, n_used)); - } - } - else { - int32 j; - /* Map word and history IDs (FIXME: do this in a function?) */ - mapwid = set->widmap[wid][set->cur]; - for (j = 0; j < n_hist; ++j) { - if (history[j] == NGRAM_INVALID_WID) - set->maphist[j] = NGRAM_INVALID_WID; - else - set->maphist[j] = set->widmap[history[j]][set->cur]; - } - score = ngram_ng_prob(set->lms[set->cur], - mapwid, set->maphist, n_hist, n_used); - } - - return score; -} - -static int32 -ngram_model_set_add_ug(ngram_model_t *base, - int32 wid, int32 lweight) -{ - ngram_model_set_t *set = (ngram_model_set_t *)base; - int32 *newwid; - int32 i, prob; - - /* At this point the word has already been added to the master - model and we have a new word ID for it. Add it to active - submodels and track the word IDs. */ - newwid = ckd_calloc(set->n_models, sizeof(*newwid)); - prob = base->log_zero; - for (i = 0; i < set->n_models; ++i) { - int32 wprob, n_hist; - - /* Only add to active models. */ - if (set->cur == -1 || set->cur == i) { - /* Did this word already exist? */ - newwid[i] = ngram_wid(set->lms[i], base->word_str[wid]); - if (newwid[i] == NGRAM_INVALID_WID) { - /* Add it to the submodel. */ - newwid[i] = ngram_model_add_word(set->lms[i], base->word_str[wid], - logmath_exp(base->lmath, lweight)); - if (newwid[i] == NGRAM_INVALID_WID) { - ckd_free(newwid); - return base->log_zero; - } - } - /* Now get the unigram probability for the new word and either - * interpolate it or use it (if this is the current model). */ - wprob = ngram_ng_prob(set->lms[i], newwid[i], NULL, 0, &n_hist); - if (set->cur == i) - prob = wprob; - else if (set->cur == -1) - prob = logmath_add(base->lmath, prob, set->lweights[i] + wprob); - } - else { - newwid[i] = NGRAM_INVALID_WID; - } - } - /* Okay we have the word IDs for this in all the submodels. Now - do some complicated memory mangling to add this to the - widmap. */ - set->widmap = ckd_realloc(set->widmap, base->n_words * sizeof(*set->widmap)); - set->widmap[0] = ckd_realloc(set->widmap[0], - base->n_words - * set->n_models - * sizeof(**set->widmap)); - for (i = 0; i < base->n_words; ++i) - set->widmap[i] = set->widmap[0] + i * set->n_models; - memcpy(set->widmap[wid], newwid, set->n_models * sizeof(*newwid)); - ckd_free(newwid); - return prob; -} - -static void -ngram_model_set_free(ngram_model_t *base) -{ - ngram_model_set_t *set = (ngram_model_set_t *)base; - int32 i; - - for (i = 0; i < set->n_models; ++i) - ngram_model_free(set->lms[i]); - ckd_free(set->lms); - for (i = 0; i < set->n_models; ++i) - ckd_free(set->names[i]); - ckd_free(set->names); - ckd_free(set->lweights); - ckd_free(set->maphist); - ckd_free_2d((void **)set->widmap); -} - -static void -ngram_model_set_flush(ngram_model_t *base) -{ - ngram_model_set_t *set = (ngram_model_set_t *)base; - int32 i; - - for (i = 0; i < set->n_models; ++i) - ngram_model_flush(set->lms[i]); -} - -static ngram_funcs_t ngram_model_set_funcs = { - ngram_model_set_free, /* free */ - ngram_model_set_apply_weights, /* apply_weights */ - ngram_model_set_score, /* score */ - ngram_model_set_raw_score, /* raw_score */ - ngram_model_set_add_ug, /* add_ug */ - ngram_model_set_flush /* flush */ -}; |