Diffstat (limited to 'media/sphinxbase/src/libsphinxbase/lm/ngram_model.c')
-rw-r--r-- | media/sphinxbase/src/libsphinxbase/lm/ngram_model.c | 1129
1 file changed, 1129 insertions, 0 deletions
diff --git a/media/sphinxbase/src/libsphinxbase/lm/ngram_model.c b/media/sphinxbase/src/libsphinxbase/lm/ngram_model.c
new file mode 100644
index 000000000..02af4151b
--- /dev/null
+++ b/media/sphinxbase/src/libsphinxbase/lm/ngram_model.c
@@ -0,0 +1,1129 @@
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
 * Copyright (c) 1999-2007 Carnegie Mellon University. All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * This work was supported in part by funding from the Defense Advanced
 * Research Projects Agency and the National Science Foundation of the
 * United States of America, and the CMU Sphinx Speech Consortium.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ====================================================================
 *
 */
/*
 * \file ngram_model.c N-Gram language models.
 *
 * Author: David Huggins-Daines, much code taken from sphinx3/src/libs3decoder/liblm
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <string.h>
#include <assert.h>

#include "sphinxbase/ngram_model.h"
#include "sphinxbase/ckd_alloc.h"
#include "sphinxbase/filename.h"
#include "sphinxbase/pio.h"
#include "sphinxbase/err.h"
#include "sphinxbase/logmath.h"
#include "sphinxbase/strfuncs.h"
#include "sphinxbase/case.h"

#include "ngram_model_internal.h"

ngram_file_type_t
ngram_file_name_to_type(const char *file_name)
{
    const char *ext;

    ext = strrchr(file_name, '.');
    if (ext == NULL) {
        return NGRAM_INVALID;
    }
    if (0 == strcmp_nocase(ext, ".gz")) {
        while (--ext >= file_name) {
            if (*ext == '.') break;
        }
        if (ext < file_name) {
            return NGRAM_INVALID;
        }
    }
    else if (0 == strcmp_nocase(ext, ".bz2")) {
        while (--ext >= file_name) {
            if (*ext == '.') break;
        }
        if (ext < file_name) {
            return NGRAM_INVALID;
        }
    }
    /* We use strncmp because there might be a .gz on the end. */
    if (0 == strncmp_nocase(ext, ".ARPA", 5))
        return NGRAM_ARPA;
    if (0 == strncmp_nocase(ext, ".DMP", 4))
        return NGRAM_DMP;
    return NGRAM_INVALID;
}
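/*
 * For illustration, here is what the detection above yields for a few
 * hypothetical file names (a compressed suffix is skipped first, then
 * the real extension is matched):
 *
 *     ngram_file_name_to_type("wsj.arpa")    => NGRAM_ARPA
 *     ngram_file_name_to_type("wsj.DMP.gz")  => NGRAM_DMP
 *     ngram_file_name_to_type("wsj.lm")      => NGRAM_INVALID
 *     ngram_file_name_to_type("noext")       => NGRAM_INVALID
 */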
ngram_file_type_t
ngram_str_to_type(const char *str_name)
{
    if (0 == strcmp_nocase(str_name, "arpa"))
        return NGRAM_ARPA;
    if (0 == strcmp_nocase(str_name, "dmp"))
        return NGRAM_DMP;
    return NGRAM_INVALID;
}

char const *
ngram_type_to_str(int type)
{
    switch (type) {
    case NGRAM_ARPA:
        return "arpa";
    case NGRAM_DMP:
        return "dmp";
    default:
        return NULL;
    }
}

ngram_model_t *
ngram_model_read(cmd_ln_t *config,
                 const char *file_name,
                 ngram_file_type_t file_type,
                 logmath_t *lmath)
{
    ngram_model_t *model = NULL;

    switch (file_type) {
    case NGRAM_AUTO: {
        if ((model = ngram_model_arpa_read(config, file_name, lmath)) != NULL)
            break;
        if ((model = ngram_model_dmp_read(config, file_name, lmath)) != NULL)
            break;
        return NULL;
    }
    case NGRAM_ARPA:
        model = ngram_model_arpa_read(config, file_name, lmath);
        break;
    case NGRAM_DMP:
        model = ngram_model_dmp_read(config, file_name, lmath);
        break;
    default:
        E_ERROR("language model file type not supported\n");
        return NULL;
    }
    /* The explicit ARPA/DMP cases can still fail; don't try to apply
     * weights to a model that was never created. */
    if (model == NULL)
        return NULL;

    /* Now set weights based on config if present. */
    if (config) {
        float32 lw = 1.0;
        float32 wip = 1.0;
        float32 uw = 1.0;

        if (cmd_ln_exists_r(config, "-lw"))
            lw = cmd_ln_float32_r(config, "-lw");
        if (cmd_ln_exists_r(config, "-wip"))
            wip = cmd_ln_float32_r(config, "-wip");
        if (cmd_ln_exists_r(config, "-uw"))
            uw = cmd_ln_float32_r(config, "-uw");

        ngram_model_apply_weights(model, lw, wip, uw);
    }

    return model;
}

int
ngram_model_write(ngram_model_t *model, const char *file_name,
                  ngram_file_type_t file_type)
{
    switch (file_type) {
    case NGRAM_AUTO: {
        file_type = ngram_file_name_to_type(file_name);
        /* Default to ARPA (catches .lm and other things) */
        if (file_type == NGRAM_INVALID)
            file_type = NGRAM_ARPA;
        return ngram_model_write(model, file_name, file_type);
    }
    case NGRAM_ARPA:
        return ngram_model_arpa_write(model, file_name);
    case NGRAM_DMP:
        return ngram_model_dmp_write(model, file_name);
    default:
        E_ERROR("language model file type not supported\n");
        return -1;
    }
}

int32
ngram_model_init(ngram_model_t *base,
                 ngram_funcs_t *funcs,
                 logmath_t *lmath,
                 int32 n, int32 n_unigram)
{
    base->refcount = 1;
    base->funcs = funcs;
    base->n = n;
    /* If this was previously initialized... */
    if (base->n_counts == NULL)
        base->n_counts = ckd_calloc(3, sizeof(*base->n_counts));
    /* Don't reset weights if logmath object hasn't changed. */
    if (base->lmath != lmath) {
        /* Set default values for weights. */
        base->lw = 1.0;
        base->log_wip = 0; /* i.e. 1.0 */
        base->log_uw = 0;  /* i.e. 1.0 */
        base->log_uniform = logmath_log(lmath, 1.0 / n_unigram);
        base->log_uniform_weight = logmath_get_zero(lmath);
        base->log_zero = logmath_get_zero(lmath);
        base->lmath = lmath;
    }
    /* Allocate or reallocate space for word strings. */
    if (base->word_str) {
        /* Free all previous word strings if they were allocated. */
        if (base->writable) {
            int32 i;
            for (i = 0; i < base->n_words; ++i) {
                ckd_free(base->word_str[i]);
                base->word_str[i] = NULL;
            }
        }
        base->word_str = ckd_realloc(base->word_str, n_unigram * sizeof(char *));
    }
    else
        base->word_str = ckd_calloc(n_unigram, sizeof(char *));
    /* NOTE: They are no longer case-insensitive since we are allowing
     * other encodings for word strings. Beware. */
    if (base->wid)
        hash_table_empty(base->wid);
    else
        base->wid = hash_table_new(n_unigram, FALSE);
    base->n_counts[0] = base->n_1g_alloc = base->n_words = n_unigram;

    return 0;
}
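/*
 * Usage sketch for the reader above (the file name is hypothetical;
 * -lw/-wip/-uw are only consulted when a config object is supplied):
 *
 *     ngram_model_t *lm;
 *     lm = ngram_model_read(config, "en-70k.lm.gz", NGRAM_AUTO, lmath);
 *     if (lm == NULL)
 *         E_FATAL("Failed to read the language model\n");
 *     ...
 *     ngram_model_write(lm, "en-70k.dmp", NGRAM_DMP);
 */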
ngram_model_t *
ngram_model_retain(ngram_model_t *model)
{
    ++model->refcount;
    return model;
}

void
ngram_model_flush(ngram_model_t *model)
{
    if (model->funcs && model->funcs->flush)
        (*model->funcs->flush)(model);
}

int
ngram_model_free(ngram_model_t *model)
{
    int i;

    if (model == NULL)
        return 0;
    if (--model->refcount > 0)
        return model->refcount;
    if (model->funcs && model->funcs->free)
        (*model->funcs->free)(model);
    if (model->writable) {
        /* Free all words. */
        for (i = 0; i < model->n_words; ++i) {
            ckd_free(model->word_str[i]);
        }
    }
    else {
        /* Free all class words. */
        for (i = 0; i < model->n_classes; ++i) {
            ngram_class_t *lmclass;
            int32 j;

            lmclass = model->classes[i];
            for (j = 0; j < lmclass->n_words; ++j) {
                ckd_free(model->word_str[lmclass->start_wid + j]);
            }
            for (j = 0; j < lmclass->n_hash; ++j) {
                if (lmclass->nword_hash[j].wid != -1) {
                    ckd_free(model->word_str[lmclass->nword_hash[j].wid]);
                }
            }
        }
    }
    for (i = 0; i < model->n_classes; ++i) {
        ngram_class_free(model->classes[i]);
    }
    ckd_free(model->classes);
    hash_table_free(model->wid);
    ckd_free(model->word_str);
    ckd_free(model->n_counts);
    ckd_free(model);
    return 0;
}

int
ngram_model_casefold(ngram_model_t *model, int kase)
{
    int writable, i;
    hash_table_t *new_wid;

    /* Were word strings already allocated? */
    writable = model->writable;
    /* Either way, we are going to allocate some word strings. */
    model->writable = TRUE;

    /* And, don't forget, we need to rebuild the word to unigram ID
     * mapping. */
    new_wid = hash_table_new(model->n_words, FALSE);
    for (i = 0; i < model->n_words; ++i) {
        char *outstr;
        if (writable) {
            outstr = model->word_str[i];
        }
        else {
            outstr = ckd_salloc(model->word_str[i]);
        }
        /* Don't case-fold <tags> or [classes] */
        if (outstr[0] != '<' && outstr[0] != '[') {
            switch (kase) {
            case NGRAM_UPPER:
                ucase(outstr);
                break;
            case NGRAM_LOWER:
                lcase(outstr);
                break;
            default:
                ;
            }
        }
        model->word_str[i] = outstr;

        /* Now update the hash table. We might have terrible
         * collisions here, so warn about them. */
        if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
            E_WARN("Duplicate word in dictionary after conversion: %s\n",
                   model->word_str[i]);
        }
    }
    /* Swap out the hash table. */
    hash_table_free(model->wid);
    model->wid = new_wid;
    return 0;
}
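/*
 * A sketch of typical ngram_model_casefold() use: fold the vocabulary
 * to upper case right after loading so it matches an all-uppercase
 * dictionary. Words starting with '<' or '[' (markers such as <s>,
 * </s> and [class] tags) keep their spelling, as implemented above.
 *
 *     if (ngram_model_casefold(lm, NGRAM_UPPER) < 0)
 *         E_ERROR("Case folding failed\n");
 */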
int
ngram_model_apply_weights(ngram_model_t *model,
                          float32 lw, float32 wip, float32 uw)
{
    return (*model->funcs->apply_weights)(model, lw, wip, uw);
}

float32
ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip,
                        int32 *out_log_uw)
{
    if (out_log_wip) *out_log_wip = model->log_wip;
    if (out_log_uw) *out_log_uw = model->log_uw;
    return model->lw;
}

int32
ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history,
               int32 n_hist, int32 *n_used)
{
    int32 score, class_weight = 0;
    int i;

    /* Closed vocabulary, OOV word probability is zero */
    if (wid == NGRAM_INVALID_WID)
        return model->log_zero;

    /* "Declassify" wid and history */
    if (NGRAM_IS_CLASSWID(wid)) {
        ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];

        class_weight = ngram_class_prob(lmclass, wid);
        if (class_weight == 1) /* Meaning, not found in class. */
            return model->log_zero;
        wid = lmclass->tag_wid;
    }
    for (i = 0; i < n_hist; ++i) {
        if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
            history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
    }
    score = (*model->funcs->score)(model, wid, history, n_hist, n_used);

    /* Multiply by unigram in-class weight. */
    return score + class_weight;
}

int32
ngram_score(ngram_model_t *model, const char *word, ...)
{
    va_list history;
    const char *hword;
    int32 *histid;
    int32 n_hist;
    int32 n_used;
    int32 prob;

    /* First pass: count the history words. */
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL)
        ++n_hist;
    va_end(history);

    /* Second pass: map them to word IDs. */
    histid = ckd_calloc(n_hist, sizeof(*histid));
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL) {
        histid[n_hist] = ngram_wid(model, hword);
        ++n_hist;
    }
    va_end(history);

    prob = ngram_ng_score(model, ngram_wid(model, word),
                          histid, n_hist, &n_used);
    ckd_free(histid);
    return prob;
}

int32
ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used)
{
    int32 hist[2];
    hist[0] = w2;
    hist[1] = w1;
    return ngram_ng_score(model, w3, hist, 2, n_used);
}

int32
ngram_bg_score(ngram_model_t *model, int32 w2, int32 w1, int32 *n_used)
{
    return ngram_ng_score(model, w2, &w1, 1, n_used);
}

int32
ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history,
              int32 n_hist, int32 *n_used)
{
    int32 prob, class_weight = 0;
    int i;

    /* Closed vocabulary, OOV word probability is zero */
    if (wid == NGRAM_INVALID_WID)
        return model->log_zero;

    /* "Declassify" wid and history */
    if (NGRAM_IS_CLASSWID(wid)) {
        ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];

        class_weight = ngram_class_prob(lmclass, wid);
        if (class_weight == 1) /* Meaning, not found in class. */
            return model->log_zero;
        wid = lmclass->tag_wid;
    }
    for (i = 0; i < n_hist; ++i) {
        if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
            history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
    }
    prob = (*model->funcs->raw_score)(model, wid, history,
                                      n_hist, n_used);
    /* Multiply by unigram in-class weight. */
    return prob + class_weight;
}
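/*
 * Word order in the scoring interfaces above: the target word comes
 * first, followed by its history from most recent to oldest. In the
 * varargs form the history is NULL-terminated. Both calls below ask
 * for the trigram score of "day" following "a nice" (a sketch):
 *
 *     int32 n_used;
 *     int32 s1 = ngram_score(lm, "day", "nice", "a", NULL);
 *     int32 s2 = ngram_tg_score(lm, ngram_wid(lm, "day"),
 *                               ngram_wid(lm, "nice"),
 *                               ngram_wid(lm, "a"), &n_used);
 *
 * n_used reports how much of the history the backend actually used,
 * i.e. less than the full amount if it had to back off.
 */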
int32
ngram_probv(ngram_model_t *model, const char *word, ...)
{
    va_list history;
    const char *hword;
    int32 *histid;
    int32 n_hist;
    int32 n_used;
    int32 prob;

    /* First pass: count the history words. */
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL)
        ++n_hist;
    va_end(history);

    /* Second pass: map them to word IDs. */
    histid = ckd_calloc(n_hist, sizeof(*histid));
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL) {
        histid[n_hist] = ngram_wid(model, hword);
        ++n_hist;
    }
    va_end(history);

    prob = ngram_ng_prob(model, ngram_wid(model, word),
                         histid, n_hist, &n_used);
    ckd_free(histid);
    return prob;
}

int32
ngram_prob(ngram_model_t *model, const char *const *words, int32 n)
{
    int32 *ctx_id;
    int32 nused;
    int32 prob;
    int32 wid;
    int32 i;

    ctx_id = (int32 *)ckd_calloc(n - 1, sizeof(*ctx_id));
    for (i = 1; i < n; ++i)
        ctx_id[i - 1] = ngram_wid(model, words[i]);

    wid = ngram_wid(model, *words);
    prob = ngram_ng_prob(model, wid, ctx_id, n - 1, &nused);
    ckd_free(ctx_id);

    return prob;
}

int32
ngram_score_to_prob(ngram_model_t *base, int32 score)
{
    int32 prob;

    /* Undo insertion penalty. */
    prob = score - base->log_wip;
    /* Undo language weight. */
    prob = (int32)(prob / base->lw);

    return prob;
}

int32
ngram_unknown_wid(ngram_model_t *model)
{
    int32 val;

    /* FIXME: This could be memoized for speed if necessary. */
    /* Look up <UNK>, if not found return NGRAM_INVALID_WID. */
    if (hash_table_lookup_int32(model->wid, "<UNK>", &val) == -1)
        return NGRAM_INVALID_WID;
    else
        return val;
}

int32
ngram_zero(ngram_model_t *model)
{
    return model->log_zero;
}

int32
ngram_model_get_size(ngram_model_t *model)
{
    if (model != NULL)
        return model->n;
    return 0;
}

int32 const *
ngram_model_get_counts(ngram_model_t *model)
{
    if (model != NULL)
        return model->n_counts;
    return NULL;
}

void
ngram_iter_init(ngram_iter_t *itor, ngram_model_t *model,
                int m, int successor)
{
    itor->model = model;
    itor->wids = ckd_calloc(model->n, sizeof(*itor->wids));
    itor->m = m;
    itor->successor = successor;
}

ngram_iter_t *
ngram_model_mgrams(ngram_model_t *model, int m)
{
    ngram_iter_t *itor;
    /* The fact that m=n-1 is not exactly obvious. Prevent accidents. */
    if (m >= model->n)
        return NULL;
    if (model->funcs->mgrams == NULL)
        return NULL;
    itor = (*model->funcs->mgrams)(model, m);
    return itor;
}

ngram_iter_t *
ngram_iter(ngram_model_t *model, const char *word, ...)
{
    va_list history;
    const char *hword;
    int32 *histid;
    int32 n_hist;
    ngram_iter_t *itor;

    /* First pass: count the history words. */
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL)
        ++n_hist;
    va_end(history);

    /* Second pass: map them to word IDs. */
    histid = ckd_calloc(n_hist, sizeof(*histid));
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL) {
        histid[n_hist] = ngram_wid(model, hword);
        ++n_hist;
    }
    va_end(history);

    itor = ngram_ng_iter(model, ngram_wid(model, word), histid, n_hist);
    ckd_free(histid);
    return itor;
}

ngram_iter_t *
ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
{
    if (n_hist >= model->n)
        return NULL;
    if (model->funcs->iter == NULL)
        return NULL;
    return (*model->funcs->iter)(model, wid, history, n_hist);
}

ngram_iter_t *
ngram_iter_successors(ngram_iter_t *itor)
{
    /* Stop when we are at the highest order N-Gram. */
    if (itor->m == itor->model->n - 1)
        return NULL;
    return (*itor->model->funcs->successors)(itor);
}
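/*
 * Iterator usage sketch (this only works when the backend provides
 * the mgrams/iter hooks; variable names are hypothetical):
 *
 *     ngram_iter_t *ug;
 *     for (ug = ngram_model_mgrams(lm, 0); ug; ug = ngram_iter_next(ug)) {
 *         int32 score, bowt;
 *         int32 const *wids = ngram_iter_get(ug, &score, &bowt);
 *         ...
 *     }
 *
 * A loop abandoned early should call ngram_iter_free() on the current
 * iterator.
 */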
int32 const *
ngram_iter_get(ngram_iter_t *itor,
               int32 *out_score,
               int32 *out_bowt)
{
    return (*itor->model->funcs->iter_get)(itor, out_score, out_bowt);
}

ngram_iter_t *
ngram_iter_next(ngram_iter_t *itor)
{
    return (*itor->model->funcs->iter_next)(itor);
}

void
ngram_iter_free(ngram_iter_t *itor)
{
    ckd_free(itor->wids);
    (*itor->model->funcs->iter_free)(itor);
}

int32
ngram_wid(ngram_model_t *model, const char *word)
{
    int32 val;

    if (hash_table_lookup_int32(model->wid, word, &val) == -1)
        return ngram_unknown_wid(model);
    else
        return val;
}

const char *
ngram_word(ngram_model_t *model, int32 wid)
{
    /* Remove any class tag */
    wid = NGRAM_BASEWID(wid);
    if (wid >= model->n_words)
        return NULL;
    return model->word_str[wid];
}

/**
 * Add a word to the word string and ID mapping.
 */
int32
ngram_add_word_internal(ngram_model_t *model,
                        const char *word,
                        int32 classid)
{
    int32 wid;

    /* Check for hash collisions. */
    if (hash_table_lookup_int32(model->wid, word, &wid) == 0) {
        E_WARN("Omitting duplicate word '%s'\n", word);
        return wid;
    }

    /* Take the next available word ID */
    wid = model->n_words;
    if (classid >= 0) {
        wid = NGRAM_CLASSWID(wid, classid);
    }

    /* Reallocate word_str if necessary. */
    if (model->n_words >= model->n_1g_alloc) {
        model->n_1g_alloc += UG_ALLOC_STEP;
        model->word_str = ckd_realloc(model->word_str,
                                      sizeof(*model->word_str) * model->n_1g_alloc);
    }
    /* Add the word string in the appropriate manner. */
    /* Class words are always dynamically allocated. */
    model->word_str[model->n_words] = ckd_salloc(word);
    /* Now enter it into the hash table. */
    if (hash_table_enter_int32(model->wid, model->word_str[model->n_words], wid) != wid) {
        E_ERROR("Hash insertion failed for word %s => %p (should not happen)\n",
                model->word_str[model->n_words], (void *)(long)(wid));
    }
    /* Increment number of words. */
    ++model->n_words;
    return wid;
}

int32
ngram_model_add_word(ngram_model_t *model,
                     const char *word, float32 weight)
{
    int32 wid, prob = model->log_zero;

    /* Refuse to add words to a read-only (mmap()ed) model. */
    if (!model->writable) {
        E_WARN("Can't add word '%s' to read-only language model. "
               "Disable mmap with '-mmap no' to make it writable\n", word);
        return -1;
    }

    wid = ngram_add_word_internal(model, word, -1);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* Do what needs to be done to add the word to the unigram. */
    if (model->funcs && model->funcs->add_ug)
        prob = (*model->funcs->add_ug)(model, wid, logmath_log(model->lmath, weight));
    if (prob == 0)
        return -1;

    return wid;
}

ngram_class_t *
ngram_class_new(ngram_model_t *model, int32 tag_wid, int32 start_wid, glist_t classwords)
{
    ngram_class_t *lmclass;
    gnode_t *gn;
    float32 tprob;
    int i;

    lmclass = ckd_calloc(1, sizeof(*lmclass));
    lmclass->tag_wid = tag_wid;
    /* start_wid is the wid (minus class tag) of the first word in the list. */
    lmclass->start_wid = start_wid;
    lmclass->n_words = glist_count(classwords);
    lmclass->prob1 = ckd_calloc(lmclass->n_words, sizeof(*lmclass->prob1));
    lmclass->nword_hash = NULL;
    lmclass->n_hash = 0;
    tprob = 0.0;
    for (gn = classwords; gn; gn = gnode_next(gn)) {
        tprob += gnode_float32(gn);
    }
    if (tprob > 1.1 || tprob < 0.9) {
        E_INFO("Total class probability is %f, will normalize\n", tprob);
        for (gn = classwords; gn; gn = gnode_next(gn)) {
            gn->data.fl /= tprob;
        }
    }
    for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
        lmclass->prob1[i] = logmath_log(model->lmath, gnode_float32(gn));
    }

    return lmclass;
}
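/*
 * Sketch of adding a unigram at runtime with the functions above (the
 * word and weight are hypothetical; the model must be writable, i.e.
 * loaded without mmap):
 *
 *     int32 wid = ngram_model_add_word(lm, "sphinxtrain", 0.0001f);
 *     if (wid == NGRAM_INVALID_WID)
 *         E_ERROR("Could not add word\n");
 *
 * The weight is a plain probability; it is converted to log space via
 * logmath_log() before being handed to the backend's add_ug hook.
 */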
int32
ngram_class_add_word(ngram_class_t *lmclass, int32 wid, int32 lweight)
{
    int32 hash;

    if (lmclass->nword_hash == NULL) {
        /* Initialize everything in it to -1 */
        lmclass->nword_hash = ckd_malloc(NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        memset(lmclass->nword_hash, 0xff, NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        lmclass->n_hash = NGRAM_HASH_SIZE;
        lmclass->n_hash_inuse = 0;
    }
    /* Simplest possible hash function. This will work pretty well
     * when this function is called repeatedly with contiguous word
     * IDs, though... */
    hash = wid & (lmclass->n_hash - 1);
    if (lmclass->nword_hash[hash].wid == -1) {
        /* Good, no collision. */
        lmclass->nword_hash[hash].wid = wid;
        lmclass->nword_hash[hash].prob1 = lweight;
        ++lmclass->n_hash_inuse;
        return hash;
    }
    else {
        int32 next; /**< Next available bucket. */
        /* Collision... Find the end of the hash chain. */
        while (lmclass->nword_hash[hash].next != -1)
            hash = lmclass->nword_hash[hash].next;
        assert(hash != -1);
        /* Check whether there are any free buckets left. */
        if (lmclass->n_hash_inuse == lmclass->n_hash) {
            /* The table is full, so grow it. */
            lmclass->nword_hash = ckd_realloc(lmclass->nword_hash,
                                              lmclass->n_hash * 2 * sizeof(*lmclass->nword_hash));
            memset(lmclass->nword_hash + lmclass->n_hash,
                   0xff, lmclass->n_hash * sizeof(*lmclass->nword_hash));
            /* Just use the next allocated one (easy) */
            next = lmclass->n_hash;
            lmclass->n_hash *= 2;
        }
        else {
            /* Look for any available bucket. We hope this doesn't happen. */
            for (next = 0; next < lmclass->n_hash; ++next)
                if (lmclass->nword_hash[next].wid == -1)
                    break;
            /* This should absolutely not happen. */
            assert(next != lmclass->n_hash);
        }
        lmclass->nword_hash[next].wid = wid;
        lmclass->nword_hash[next].prob1 = lweight;
        lmclass->nword_hash[hash].next = next;
        ++lmclass->n_hash_inuse;
        return next;
    }
}

void
ngram_class_free(ngram_class_t *lmclass)
{
    ckd_free(lmclass->nword_hash);
    ckd_free(lmclass->prob1);
    ckd_free(lmclass);
}

int32
ngram_model_add_class_word(ngram_model_t *model,
                           const char *classname,
                           const char *word,
                           float32 weight)
{
    ngram_class_t *lmclass;
    int32 classid, tag_wid, wid, i, scale;
    float32 fprob;

    /* Find the class corresponding to classname. Linear search is
     * probably okay here since there won't be very many classes, and
     * this doesn't have to be fast. */
    tag_wid = ngram_wid(model, classname);
    if (tag_wid == NGRAM_INVALID_WID) {
        E_ERROR("No such word or class tag: %s\n", classname);
        return tag_wid;
    }
    for (classid = 0; classid < model->n_classes; ++classid) {
        if (model->classes[classid]->tag_wid == tag_wid)
            break;
    }
    /* No such class; it is not a good idea to create one implicitly. */
    if (classid == model->n_classes) {
        E_ERROR("Word %s is not a class tag (call ngram_model_add_class() first)\n", classname);
        return NGRAM_INVALID_WID;
    }
    lmclass = model->classes[classid];

    /* Add this word to the model's set of words. */
    wid = ngram_add_word_internal(model, word, classid);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* This is the fixed probability of the new word. */
    fprob = weight * 1.0f / (lmclass->n_words + lmclass->n_hash_inuse + 1);
    /* Now normalize everything else to fit it in. This is
     * accomplished by simply scaling all the other probabilities
     * by (1-fprob). */
    scale = logmath_log(model->lmath, 1.0 - fprob);
    for (i = 0; i < lmclass->n_words; ++i)
        lmclass->prob1[i] += scale;
    for (i = 0; i < lmclass->n_hash; ++i)
        if (lmclass->nword_hash[i].wid != -1)
            lmclass->nword_hash[i].prob1 += scale;

    /* Now add it to the class hash table. */
    return ngram_class_add_word(lmclass, wid, logmath_log(model->lmath, fprob));
}
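/*
 * Sketch: extend an existing class with one more word. As implemented
 * above, the new word gets probability weight/(n+1) within the class
 * and all other in-class probabilities are scaled by (1 - fprob) so
 * the class distribution still sums to one (the class name and word
 * are hypothetical):
 *
 *     ngram_model_add_class_word(lm, "[city]", "pittsburgh", 1.0f);
 */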
int32
ngram_model_add_class(ngram_model_t *model,
                      const char *classname,
                      float32 classweight,
                      char **words,
                      const float32 *weights,
                      int32 n_words)
{
    ngram_class_t *lmclass;
    glist_t classwords = NULL;
    int32 i, start_wid = -1;
    int32 classid, tag_wid;

    /* Check if classname already exists in the model. If not, add it. */
    if ((tag_wid = ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
        tag_wid = ngram_model_add_word(model, classname, classweight);
        if (tag_wid == NGRAM_INVALID_WID)
            return -1;
    }

    if (model->n_classes == 128) {
        E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
        return -1;
    }
    classid = model->n_classes;
    for (i = 0; i < n_words; ++i) {
        int32 wid;

        wid = ngram_add_word_internal(model, words[i], classid);
        if (wid == NGRAM_INVALID_WID)
            return -1;
        if (start_wid == -1)
            start_wid = NGRAM_BASEWID(wid);
        classwords = glist_add_float32(classwords, weights[i]);
    }
    classwords = glist_reverse(classwords);
    lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
    glist_free(classwords);
    if (lmclass == NULL)
        return -1;

    ++model->n_classes;
    if (model->classes == NULL)
        model->classes = ckd_calloc(1, sizeof(*model->classes));
    else
        model->classes = ckd_realloc(model->classes,
                                     model->n_classes * sizeof(*model->classes));
    model->classes[classid] = lmclass;
    return classid;
}

int32
ngram_class_prob(ngram_class_t *lmclass, int32 wid)
{
    int32 base_wid = NGRAM_BASEWID(wid);

    /* Note the >=: the contiguous block covers start_wid through
     * start_wid + n_words - 1; anything else lives in the hash table. */
    if (base_wid < lmclass->start_wid
        || base_wid >= lmclass->start_wid + lmclass->n_words) {
        int32 hash;

        /* Look it up in the hash table. */
        hash = wid & (lmclass->n_hash - 1);
        while (hash != -1 && lmclass->nword_hash[hash].wid != wid)
            hash = lmclass->nword_hash[hash].next;
        if (hash == -1)
            return 1;
        return lmclass->nword_hash[hash].prob1;
    }
    else {
        return lmclass->prob1[base_wid - lmclass->start_wid];
    }
}
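/*
 * Creating a class programmatically with the function above
 * (hypothetical data; the class tag itself is added as a unigram with
 * the given class weight if it is not already in the model):
 *
 *     const char *words[] = { "monday", "tuesday", "wednesday" };
 *     float32 weights[] = { 0.4f, 0.3f, 0.3f };
 *     int32 classid = ngram_model_add_class(lm, "[day]", 1.0f,
 *                                           (char **)words, weights, 3);
 */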
int32
read_classdef_file(hash_table_t *classes, const char *file_name)
{
    FILE *fp;
    int32 is_pipe;
    int inclass; /**< Are we currently reading a list of class words? */
    int32 rv = -1;
    gnode_t *gn;
    glist_t classwords = NULL;
    glist_t classprobs = NULL;
    char *classname = NULL;

    if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", file_name);
        return -1;
    }

    inclass = FALSE;
    while (!feof(fp)) {
        char line[512];
        char *wptr[2];
        int n_words;

        if (fgets(line, sizeof(line), fp) == NULL)
            break;

        n_words = str2words(line, wptr, 2);
        if (n_words <= 0)
            continue;

        if (inclass) {
            /* Look for an end of class marker. */
            if (n_words == 2 && 0 == strcmp(wptr[0], "END")) {
                classdef_t *classdef;
                gnode_t *word, *weight;
                int32 i;

                if (classname == NULL || 0 != strcmp(wptr[1], classname))
                    goto error_out;
                inclass = FALSE;

                /* Construct a class from the list of words collected. */
                classdef = ckd_calloc(1, sizeof(*classdef));
                classwords = glist_reverse(classwords);
                classprobs = glist_reverse(classprobs);
                classdef->n_words = glist_count(classwords);
                classdef->words = ckd_calloc(classdef->n_words,
                                             sizeof(*classdef->words));
                classdef->weights = ckd_calloc(classdef->n_words,
                                               sizeof(*classdef->weights));
                word = classwords;
                weight = classprobs;
                for (i = 0; i < classdef->n_words; ++i) {
                    classdef->words[i] = gnode_ptr(word);
                    classdef->weights[i] = gnode_float32(weight);
                    word = gnode_next(word);
                    weight = gnode_next(weight);
                }

                /* Add this class to the hash table. */
                if (hash_table_enter(classes, classname, classdef) != classdef) {
                    classdef_free(classdef);
                    goto error_out;
                }

                /* Reset everything. */
                glist_free(classwords);
                glist_free(classprobs);
                classwords = NULL;
                classprobs = NULL;
                classname = NULL;
            }
            else {
                float32 fprob;

                if (n_words == 2)
                    fprob = (float32)atof_c(wptr[1]);
                else
                    fprob = 1.0f;
                /* Add it to the list of words for this class. */
                classwords = glist_add_ptr(classwords, ckd_salloc(wptr[0]));
                classprobs = glist_add_float32(classprobs, fprob);
            }
        }
        else {
            /* Start a new LM class if the LMCLASS marker is seen */
            if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) {
                inclass = TRUE;
                classname = ckd_salloc(wptr[1]);
            }
            /* Otherwise, just ignore whatever junk we got */
        }
    }
    rv = 0; /* Success. */

error_out:
    /* Free all the stuff we might have allocated. */
    fclose_comp(fp, is_pipe);
    for (gn = classwords; gn; gn = gnode_next(gn))
        ckd_free(gnode_ptr(gn));
    glist_free(classwords);
    glist_free(classprobs);
    ckd_free(classname);

    return rv;
}

void
classdef_free(classdef_t *classdef)
{
    int32 i;
    for (i = 0; i < classdef->n_words; ++i)
        ckd_free(classdef->words[i]);
    ckd_free(classdef->words);
    ckd_free(classdef->weights);
    ckd_free(classdef);
}
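/*
 * The class definition format parsed above looks like this
 * (hypothetical contents; the per-word weight is optional and
 * defaults to 1.0):
 *
 *     LMCLASS [day]
 *     monday    0.4
 *     tuesday   0.3
 *     wednesday
 *     END [day]
 */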
int32
ngram_model_read_classdef(ngram_model_t *model,
                          const char *file_name)
{
    hash_table_t *classes;
    glist_t hl = NULL;
    gnode_t *gn;
    int32 rv = -1;

    classes = hash_table_new(0, FALSE);
    if (read_classdef_file(classes, file_name) < 0) {
        hash_table_free(classes);
        return -1;
    }

    /* Create a new class in the language model for each classdef. */
    hl = hash_table_tolist(classes, NULL);
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        classdef_t *classdef = he->val;

        if (ngram_model_add_class(model, he->key, 1.0,
                                  classdef->words,
                                  classdef->weights,
                                  classdef->n_words) < 0)
            goto error_out;
    }
    rv = 0;

error_out:
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        ckd_free((char *)he->key);
        classdef_free(he->val);
    }
    glist_free(hl);
    hash_table_free(classes);
    return rv;
}
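/*
 * End-to-end sketch tying the pieces in this file together (file
 * names are hypothetical and error handling is elided):
 *
 *     logmath_t *lmath = logmath_init(1.0001, 0, FALSE);
 *     ngram_model_t *lm = ngram_model_read(NULL, "turtle.lm",
 *                                          NGRAM_AUTO, lmath);
 *     ngram_model_read_classdef(lm, "turtle.lmclass");
 *     E_INFO("Score: %d\n", ngram_score(lm, "day", "nice", "a", NULL));
 *     ngram_model_free(lm);
 *     logmath_free(lmath);
 */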