author     Matt A. Tobin <mattatobin@localhost.localdomain>  2018-02-02 04:16:08 -0500
committer  Matt A. Tobin <mattatobin@localhost.localdomain>  2018-02-02 04:16:08 -0500
commit     5f8de423f190bbb79a62f804151bc24824fa32d8 (patch)
tree       10027f336435511475e392454359edea8e25895d  /media/sphinxbase/src/libsphinxbase/lm/ngram_model.c
parent     49ee0794b5d912db1f95dce6eb52d781dc210db5 (diff)
Add m-esr52 at 52.6.0
Diffstat (limited to 'media/sphinxbase/src/libsphinxbase/lm/ngram_model.c')
-rw-r--r--  media/sphinxbase/src/libsphinxbase/lm/ngram_model.c | 1129
1 file changed, 1129 insertions(+), 0 deletions(-)
diff --git a/media/sphinxbase/src/libsphinxbase/lm/ngram_model.c b/media/sphinxbase/src/libsphinxbase/lm/ngram_model.c
new file mode 100644
index 000000000..02af4151b
--- /dev/null
+++ b/media/sphinxbase/src/libsphinxbase/lm/ngram_model.c
@@ -0,0 +1,1129 @@
+/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
+/* ====================================================================
+ * Copyright (c) 1999-2007 Carnegie Mellon University. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * This work was supported in part by funding from the Defense Advanced
+ * Research Projects Agency and the National Science Foundation of the
+ * United States of America, and the CMU Sphinx Speech Consortium.
+ *
+ * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
+ * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
+ * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ====================================================================
+ *
+ */
+/*
+ * \file ngram_model.c N-Gram language models.
+ *
+ * Author: David Huggins-Daines, much code taken from sphinx3/src/libs3decoder/liblm
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <string.h>
+#include <assert.h>
+
+#include "sphinxbase/ngram_model.h"
+#include "sphinxbase/ckd_alloc.h"
+#include "sphinxbase/filename.h"
+#include "sphinxbase/pio.h"
+#include "sphinxbase/err.h"
+#include "sphinxbase/logmath.h"
+#include "sphinxbase/strfuncs.h"
+#include "sphinxbase/case.h"
+
+#include "ngram_model_internal.h"
+
+ngram_file_type_t
+ngram_file_name_to_type(const char *file_name)
+{
+ const char *ext;
+
+ ext = strrchr(file_name, '.');
+ if (ext == NULL) {
+ return NGRAM_INVALID;
+ }
+ if (0 == strcmp_nocase(ext, ".gz")) {
+ while (--ext >= file_name) {
+ if (*ext == '.') break;
+ }
+ if (ext < file_name) {
+ return NGRAM_INVALID;
+ }
+ }
+ else if (0 == strcmp_nocase(ext, ".bz2")) {
+ while (--ext >= file_name) {
+ if (*ext == '.') break;
+ }
+ if (ext < file_name) {
+ return NGRAM_INVALID;
+ }
+ }
+ /* We use strncmp because there might be a .gz on the end. */
+ if (0 == strncmp_nocase(ext, ".ARPA", 5))
+ return NGRAM_ARPA;
+ if (0 == strncmp_nocase(ext, ".DMP", 4))
+ return NGRAM_DMP;
+ return NGRAM_INVALID;
+ }
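+/*
+ * Illustrative sketch added by the editor (not part of the upstream file):
+ * how the extension probing above behaves for a few hypothetical file names.
+ * The backward walk for ".gz"/".bz2" means compressed models keep the type
+ * of the extension underneath. Excluded from compilation.
+ */
+#if 0
+static void
+example_file_name_to_type(void)
+{
+    /* ".arpa" and ".dmp" (case-insensitively) map directly to their types. */
+    assert(ngram_file_name_to_type("wsj.arpa") == NGRAM_ARPA);
+    assert(ngram_file_name_to_type("wsj.DMP") == NGRAM_DMP);
+    /* For ".gz" the preceding extension decides the type. */
+    assert(ngram_file_name_to_type("wsj.arpa.gz") == NGRAM_ARPA);
+    /* No recognized extension at all. */
+    assert(ngram_file_name_to_type("wsj.lm") == NGRAM_INVALID);
+}
+#endif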
+
+ngram_file_type_t
+ngram_str_to_type(const char *str_name)
+{
+ if (0 == strcmp_nocase(str_name, "arpa"))
+ return NGRAM_ARPA;
+ if (0 == strcmp_nocase(str_name, "dmp"))
+ return NGRAM_DMP;
+ return NGRAM_INVALID;
+}
+
+char const *
+ngram_type_to_str(int type)
+{
+ switch (type) {
+ case NGRAM_ARPA:
+ return "arpa";
+ case NGRAM_DMP:
+ return "dmp";
+ default:
+ return NULL;
+ }
+}
+
+
+ ngram_model_t *
+ ngram_model_read(cmd_ln_t *config,
+ const char *file_name,
+ ngram_file_type_t file_type,
+ logmath_t *lmath)
+ {
+ ngram_model_t *model = NULL;
+
+ switch (file_type) {
+ case NGRAM_AUTO: {
+ if ((model = ngram_model_arpa_read(config, file_name, lmath)) != NULL)
+ break;
+ if ((model = ngram_model_dmp_read(config, file_name, lmath)) != NULL)
+ break;
+ return NULL;
+ }
+ case NGRAM_ARPA:
+ model = ngram_model_arpa_read(config, file_name, lmath);
+ break;
+ case NGRAM_DMP:
+ model = ngram_model_dmp_read(config, file_name, lmath);
+ break;
+ default:
+ E_ERROR("language model file type not supported\n");
+ return NULL;
+ }
+
+ /* Now set weights based on config if present. */
+ if (config) {
+ float32 lw = 1.0;
+ float32 wip = 1.0;
+ float32 uw = 1.0;
+
+ if (cmd_ln_exists_r(config, "-lw"))
+ lw = cmd_ln_float32_r(config, "-lw");
+ if (cmd_ln_exists_r(config, "-wip"))
+ wip = cmd_ln_float32_r(config, "-wip");
+ if (cmd_ln_exists_r(config, "-uw"))
+ uw = cmd_ln_float32_r(config, "-uw");
+
+ ngram_model_apply_weights(model, lw, wip, uw);
+ }
+
+ return model;
+ }
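+/*
+ * Illustrative sketch added by the editor (not part of the upstream file):
+ * minimal use of ngram_model_read() with automatic type detection. The file
+ * name and weight values are hypothetical; passing a NULL config skips the
+ * weight setup above, so weights are applied explicitly instead.
+ */
+#if 0
+static ngram_model_t *
+example_load_model(logmath_t *lmath)
+{
+    ngram_model_t *lm;
+
+    /* NGRAM_AUTO tries the ARPA reader first, then falls back to DMP. */
+    lm = ngram_model_read(NULL, "example.lm.gz", NGRAM_AUTO, lmath);
+    if (lm == NULL)
+        return NULL;
+    /* Equivalent of passing -lw 9.5 -wip 0.5 -uw 1.0 in a config. */
+    ngram_model_apply_weights(lm, 9.5, 0.5, 1.0);
+    return lm;
+}
+#endif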
+
+ int
+ ngram_model_write(ngram_model_t *model, const char *file_name,
+ ngram_file_type_t file_type)
+ {
+ switch (file_type) {
+ case NGRAM_AUTO: {
+ file_type = ngram_file_name_to_type(file_name);
+ /* Default to ARPA (catches .lm and other things) */
+ if (file_type == NGRAM_INVALID)
+ file_type = NGRAM_ARPA;
+ return ngram_model_write(model, file_name, file_type);
+ }
+ case NGRAM_ARPA:
+ return ngram_model_arpa_write(model, file_name);
+ case NGRAM_DMP:
+ return ngram_model_dmp_write(model, file_name);
+ default:
+ E_ERROR("language model file type not supported\n");
+ return -1;
+ }
+ E_ERROR("language model file type not supported\n");
+ return -1;
+ }
+
+ int32
+ ngram_model_init(ngram_model_t *base,
+ ngram_funcs_t *funcs,
+ logmath_t *lmath,
+ int32 n, int32 n_unigram)
+ {
+ base->refcount = 1;
+ base->funcs = funcs;
+ base->n = n;
+ /* Allocate the n-gram count array unless a previous initialization already did. */
+ if (base->n_counts == NULL)
+ base->n_counts = ckd_calloc(3, sizeof(*base->n_counts));
+ /* Don't reset weights if logmath object hasn't changed. */
+ if (base->lmath != lmath) {
+ /* Set default values for weights. */
+ base->lw = 1.0;
+ base->log_wip = 0; /* i.e. 1.0 */
+ base->log_uw = 0; /* i.e. 1.0 */
+ base->log_uniform = logmath_log(lmath, 1.0 / n_unigram);
+ base->log_uniform_weight = logmath_get_zero(lmath);
+ base->log_zero = logmath_get_zero(lmath);
+ base->lmath = lmath;
+ }
+ /* Allocate or reallocate space for word strings. */
+ if (base->word_str) {
+ /* Free all previous word strings if they were allocated. */
+ if (base->writable) {
+ int32 i;
+ for (i = 0; i < base->n_words; ++i) {
+ ckd_free(base->word_str[i]);
+ base->word_str[i] = NULL;
+ }
+ }
+ base->word_str = ckd_realloc(base->word_str, n_unigram * sizeof(char *));
+ }
+ else
+ base->word_str = ckd_calloc(n_unigram, sizeof(char *));
+ /* NOTE: Word lookups are no longer case-insensitive, since we allow
+ * other encodings for word strings. Beware. */
+ if (base->wid)
+ hash_table_empty(base->wid);
+ else
+ base->wid = hash_table_new(n_unigram, FALSE);
+ base->n_counts[0] = base->n_1g_alloc = base->n_words = n_unigram;
+
+ return 0;
+}
+
+ngram_model_t *
+ngram_model_retain(ngram_model_t *model)
+{
+ ++model->refcount;
+ return model;
+}
+
+
+void
+ngram_model_flush(ngram_model_t *model)
+{
+ if (model->funcs && model->funcs->flush)
+ (*model->funcs->flush)(model);
+}
+
+int
+ngram_model_free(ngram_model_t *model)
+{
+ int i;
+
+ if (model == NULL)
+ return 0;
+ if (--model->refcount > 0)
+ return model->refcount;
+ if (model->funcs && model->funcs->free)
+ (*model->funcs->free)(model);
+ if (model->writable) {
+ /* Free all words. */
+ for (i = 0; i < model->n_words; ++i) {
+ ckd_free(model->word_str[i]);
+ }
+ }
+ else {
+ /* Free all class words. */
+ for (i = 0; i < model->n_classes; ++i) {
+ ngram_class_t *lmclass;
+ int32 j;
+
+ lmclass = model->classes[i];
+ for (j = 0; j < lmclass->n_words; ++j) {
+ ckd_free(model->word_str[lmclass->start_wid + j]);
+ }
+ for (j = 0; j < lmclass->n_hash; ++j) {
+ if (lmclass->nword_hash[j].wid != -1) {
+ ckd_free(model->word_str[lmclass->nword_hash[j].wid]);
+ }
+ }
+ }
+ }
+ for (i = 0; i < model->n_classes; ++i) {
+ ngram_class_free(model->classes[i]);
+ }
+ ckd_free(model->classes);
+ hash_table_free(model->wid);
+ ckd_free(model->word_str);
+ ckd_free(model->n_counts);
+ ckd_free(model);
+ return 0;
+}
+
+int
+ngram_model_casefold(ngram_model_t *model, int kase)
+{
+ int writable, i;
+ hash_table_t *new_wid;
+
+ /* Were word strings already allocated? */
+ writable = model->writable;
+ /* Either way, we are going to allocate some word strings. */
+ model->writable = TRUE;
+
+ /* And, don't forget, we need to rebuild the word to unigram ID
+ * mapping. */
+ new_wid = hash_table_new(model->n_words, FALSE);
+ for (i = 0; i < model->n_words; ++i) {
+ char *outstr;
+ if (writable) {
+ outstr = model->word_str[i];
+ }
+ else {
+ outstr = ckd_salloc(model->word_str[i]);
+ }
+ /* Don't case-fold <tags> or [classes] */
+ if (outstr[0] == '<' || outstr[0] == '[') {
+ }
+ else {
+ switch (kase) {
+ case NGRAM_UPPER:
+ ucase(outstr);
+ break;
+ case NGRAM_LOWER:
+ lcase(outstr);
+ break;
+ default:
+ ;
+ }
+ }
+ model->word_str[i] = outstr;
+
+ /* Now update the hash table. We might have terrible
+ * collisions here, so warn about them. */
+ if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
+ E_WARN("Duplicate word in dictionary after conversion: %s\n",
+ model->word_str[i]);
+ }
+ }
+ /* Swap out the hash table. */
+ hash_table_free(model->wid);
+ model->wid = new_wid;
+ return 0;
+}
+
+int
+ngram_model_apply_weights(ngram_model_t *model,
+ float32 lw, float32 wip, float32 uw)
+{
+ return (*model->funcs->apply_weights)(model, lw, wip, uw);
+}
+
+float32
+ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip,
+ int32 *out_log_uw)
+{
+ if (out_log_wip) *out_log_wip = model->log_wip;
+ if (out_log_uw) *out_log_uw = model->log_uw;
+ return model->lw;
+}
+
+
+int32
+ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history,
+ int32 n_hist, int32 *n_used)
+{
+ int32 score, class_weight = 0;
+ int i;
+
+ /* Closed vocabulary, OOV word probability is zero */
+ if (wid == NGRAM_INVALID_WID)
+ return model->log_zero;
+
+ /* "Declassify" wid and history */
+ if (NGRAM_IS_CLASSWID(wid)) {
+ ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];
+
+ class_weight = ngram_class_prob(lmclass, wid);
+ if (class_weight == 1) /* Meaning, not found in class. */
+ return model->log_zero;
+ wid = lmclass->tag_wid;
+ }
+ for (i = 0; i < n_hist; ++i) {
+ if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
+ history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
+ }
+ score = (*model->funcs->score)(model, wid, history, n_hist, n_used);
+
+ /* Multiply by unigram in-class weight. */
+ return score + class_weight;
+}
+
+int32
+ngram_score(ngram_model_t *model, const char *word, ...)
+{
+ va_list history;
+ const char *hword;
+ int32 *histid;
+ int32 n_hist;
+ int32 n_used;
+ int32 prob;
+
+ va_start(history, word);
+ n_hist = 0;
+ while ((hword = va_arg(history, const char *)) != NULL)
+ ++n_hist;
+ va_end(history);
+
+ histid = ckd_calloc(n_hist, sizeof(*histid));
+ va_start(history, word);
+ n_hist = 0;
+ while ((hword = va_arg(history, const char *)) != NULL) {
+ histid[n_hist] = ngram_wid(model, hword);
+ ++n_hist;
+ }
+ va_end(history);
+
+ prob = ngram_ng_score(model, ngram_wid(model, word),
+ histid, n_hist, &n_used);
+ ckd_free(histid);
+ return prob;
+}
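+/*
+ * Illustrative sketch added by the editor (not part of the upstream file):
+ * the varargs history is NULL-terminated and given most-recent-first, so
+ * the trigram score of "three" after "one two" is queried as below. The
+ * word strings are hypothetical.
+ */
+#if 0
+static int32
+example_trigram_score(ngram_model_t *lm)
+{
+    /* "two" is the immediately preceding word, "one" the one before it. */
+    return ngram_score(lm, "three", "two", "one", NULL);
+}
+#endif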
+
+int32
+ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used)
+{
+ int32 hist[2];
+ hist[0] = w2;
+ hist[1] = w1;
+ return ngram_ng_score(model, w3, hist, 2, n_used);
+}
+
+int32
+ngram_bg_score(ngram_model_t *model, int32 w2, int32 w1, int32 *n_used)
+{
+ return ngram_ng_score(model, w2, &w1, 1, n_used);
+}
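+/*
+ * Editor's note (not part of the upstream file): the fixed-arity wrappers
+ * above follow the same most-recent-first convention. A sketch for the word
+ * sequence "one two three" (hypothetical vocabulary):
+ */
+#if 0
+static int32
+example_trigram_score_by_wid(ngram_model_t *lm)
+{
+    int32 n_used;
+    int32 w1 = ngram_wid(lm, "one");
+    int32 w2 = ngram_wid(lm, "two");
+    int32 w3 = ngram_wid(lm, "three");
+    /* Arguments are (model, w3, w2, w1, ...): newest history word first. */
+    return ngram_tg_score(lm, w3, w2, w1, &n_used);
+}
+#endif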
+
+int32
+ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history,
+ int32 n_hist, int32 *n_used)
+{
+ int32 prob, class_weight = 0;
+ int i;
+
+ /* Closed vocabulary, OOV word probability is zero */
+ if (wid == NGRAM_INVALID_WID)
+ return model->log_zero;
+
+ /* "Declassify" wid and history */
+ if (NGRAM_IS_CLASSWID(wid)) {
+ ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];
+
+ class_weight = ngram_class_prob(lmclass, wid);
+ if (class_weight == 1) /* Meaning, not found in class. */
+ return class_weight;
+ wid = lmclass->tag_wid;
+ }
+ for (i = 0; i < n_hist; ++i) {
+ if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
+ history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
+ }
+ prob = (*model->funcs->raw_score)(model, wid, history,
+ n_hist, n_used);
+ /* Multiply by unigram in-class weight. */
+ return prob + class_weight;
+}
+
+int32
+ngram_probv(ngram_model_t *model, const char *word, ...)
+{
+ va_list history;
+ const char *hword;
+ int32 *histid;
+ int32 n_hist;
+ int32 n_used;
+ int32 prob;
+
+ va_start(history, word);
+ n_hist = 0;
+ while ((hword = va_arg(history, const char *)) != NULL)
+ ++n_hist;
+ va_end(history);
+
+ histid = ckd_calloc(n_hist, sizeof(*histid));
+ va_start(history, word);
+ n_hist = 0;
+ while ((hword = va_arg(history, const char *)) != NULL) {
+ histid[n_hist] = ngram_wid(model, hword);
+ ++n_hist;
+ }
+ va_end(history);
+
+ prob = ngram_ng_prob(model, ngram_wid(model, word),
+ histid, n_hist, &n_used);
+ ckd_free(histid);
+ return prob;
+}
+
+int32
+ngram_prob(ngram_model_t *model, const char *const *words, int32 n)
+{
+ int32 *ctx_id;
+ int32 nused;
+ int32 prob;
+ int32 wid;
+ uint32 i;
+
+ ctx_id = (int32 *)ckd_calloc(n - 1, sizeof(*ctx_id));
+ for (i = 1; i < n; ++i)
+ ctx_id[i - 1] = ngram_wid(model, words[i]);
+
+ wid = ngram_wid(model, *words);
+ prob = ngram_ng_prob(model, wid, ctx_id, n - 1, &nused);
+ ckd_free(ctx_id);
+
+ return prob;
+}
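+/*
+ * Illustrative sketch added by the editor (not part of the upstream file):
+ * ngram_prob() takes an array whose first element is the predicted word and
+ * whose remaining elements are the history, again most recent first.
+ */
+#if 0
+static int32
+example_array_prob(ngram_model_t *lm)
+{
+    /* Raw probability of "three" after "one two", without lw/wip scaling. */
+    const char *const words[] = { "three", "two", "one" };
+    return ngram_prob(lm, words, 3);
+}
+#endif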
+
+int32
+ngram_score_to_prob(ngram_model_t *base, int32 score)
+{
+ int32 prob;
+
+ /* Undo insertion penalty. */
+ prob = score - base->log_wip;
+ /* Undo language weight. */
+ prob = (int32)(prob / base->lw);
+
+ return prob;
+}
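+/*
+ * Editor's note (not part of the upstream file): ngram_score() values are
+ * assumed to be scaled as score = lw * logprob + log_wip, so the function
+ * above undoes the insertion penalty and then the language weight.
+ */
+#if 0
+static int32
+example_unscaled_bigram(ngram_model_t *lm)
+{
+    int32 n_used;
+    int32 score = ngram_bg_score(lm, ngram_wid(lm, "two"),
+                                 ngram_wid(lm, "one"), &n_used);
+    /* Roughly the value ngram_ng_prob() would have returned directly. */
+    return ngram_score_to_prob(lm, score);
+}
+#endif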
+
+int32
+ngram_unknown_wid(ngram_model_t *model)
+{
+ int32 val;
+
+ /* FIXME: This could be memoized for speed if necessary. */
+ /* Look up <UNK>, if not found return NGRAM_INVALID_WID. */
+ if (hash_table_lookup_int32(model->wid, "<UNK>", &val) == -1)
+ return NGRAM_INVALID_WID;
+ else
+ return val;
+}
+
+int32
+ngram_zero(ngram_model_t *model)
+{
+ return model->log_zero;
+}
+
+int32
+ngram_model_get_size(ngram_model_t *model)
+{
+ if (model != NULL)
+ return model->n;
+ return 0;
+}
+
+int32 const *
+ngram_model_get_counts(ngram_model_t *model)
+{
+ if (model != NULL)
+ return model->n_counts;
+ return NULL;
+}
+
+void
+ngram_iter_init(ngram_iter_t *itor, ngram_model_t *model,
+ int m, int successor)
+{
+ itor->model = model;
+ itor->wids = ckd_calloc(model->n, sizeof(*itor->wids));
+ itor->m = m;
+ itor->successor = successor;
+}
+
+ngram_iter_t *
+ngram_model_mgrams(ngram_model_t *model, int m)
+{
+ ngram_iter_t *itor;
+ /* The maximum valid m is n-1, which is easy to get wrong, so reject anything larger. */
+ if (m >= model->n)
+ return NULL;
+ if (model->funcs->mgrams == NULL)
+ return NULL;
+ itor = (*model->funcs->mgrams)(model, m);
+ return itor;
+}
+
+ngram_iter_t *
+ngram_iter(ngram_model_t *model, const char *word, ...)
+{
+ va_list history;
+ const char *hword;
+ int32 *histid;
+ int32 n_hist;
+ ngram_iter_t *itor;
+
+ va_start(history, word);
+ n_hist = 0;
+ while ((hword = va_arg(history, const char *)) != NULL)
+ ++n_hist;
+ va_end(history);
+
+ histid = ckd_calloc(n_hist, sizeof(*histid));
+ va_start(history, word);
+ n_hist = 0;
+ while ((hword = va_arg(history, const char *)) != NULL) {
+ histid[n_hist] = ngram_wid(model, hword);
+ ++n_hist;
+ }
+ va_end(history);
+
+ itor = ngram_ng_iter(model, ngram_wid(model, word), histid, n_hist);
+ ckd_free(histid);
+ return itor;
+}
+
+ngram_iter_t *
+ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
+{
+ if (n_hist >= model->n)
+ return NULL;
+ if (model->funcs->iter == NULL)
+ return NULL;
+ return (*model->funcs->iter)(model, wid, history, n_hist);
+}
+
+ngram_iter_t *
+ngram_iter_successors(ngram_iter_t *itor)
+{
+ /* Stop when we are at the highest order N-Gram. */
+ if (itor->m == itor->model->n - 1)
+ return NULL;
+ return (*itor->model->funcs->successors)(itor);
+}
+
+int32 const *
+ngram_iter_get(ngram_iter_t *itor,
+ int32 *out_score,
+ int32 *out_bowt)
+{
+ return (*itor->model->funcs->iter_get)(itor, out_score, out_bowt);
+}
+
+ngram_iter_t *
+ngram_iter_next(ngram_iter_t *itor)
+{
+ return (*itor->model->funcs->iter_next)(itor);
+}
+
+void
+ngram_iter_free(ngram_iter_t *itor)
+{
+ ckd_free(itor->wids);
+ (*itor->model->funcs->iter_free)(itor);
+}
+
+int32
+ngram_wid(ngram_model_t *model, const char *word)
+{
+ int32 val;
+
+ if (hash_table_lookup_int32(model->wid, word, &val) == -1)
+ return ngram_unknown_wid(model);
+ else
+ return val;
+}
+
+const char *
+ngram_word(ngram_model_t *model, int32 wid)
+{
+ /* Remove any class tag */
+ wid = NGRAM_BASEWID(wid);
+ if (wid >= model->n_words)
+ return NULL;
+ return model->word_str[wid];
+}
+
+/**
+ * Add a word to the word string and ID mapping.
+ */
+int32
+ngram_add_word_internal(ngram_model_t *model,
+ const char *word,
+ int32 classid)
+{
+
+ /* Check for hash collisions. */
+ int32 wid;
+ if (hash_table_lookup_int32(model->wid, word, &wid) == 0) {
+ E_WARN("Omit duplicate word '%s'\n", word);
+ return wid;
+ }
+
+ /* Take the next available word ID */
+ wid = model->n_words;
+ if (classid >= 0) {
+ wid = NGRAM_CLASSWID(wid, classid);
+ }
+
+ /* Reallocate word_str if necessary. */
+ if (model->n_words >= model->n_1g_alloc) {
+ model->n_1g_alloc += UG_ALLOC_STEP;
+ model->word_str = ckd_realloc(model->word_str,
+ sizeof(*model->word_str) * model->n_1g_alloc);
+ }
+ /* Add the word string in the appropriate manner. */
+ /* Class words are always dynamically allocated. */
+ model->word_str[model->n_words] = ckd_salloc(word);
+ /* Now enter it into the hash table. */
+ if (hash_table_enter_int32(model->wid, model->word_str[model->n_words], wid) != wid) {
+ E_ERROR("Hash insertion failed for word %s => %p (should not happen)\n",
+ model->word_str[model->n_words], (void *)(long)(wid));
+ }
+ /* Increment number of words. */
+ ++model->n_words;
+ return wid;
+}
+
+int32
+ngram_model_add_word(ngram_model_t *model,
+ const char *word, float32 weight)
+{
+ int32 wid, prob = model->log_zero;
+
+ /* Words can only be added to a writable (non-mmap'ed) model. */
+ if (!model->writable) {
+ E_WARN("Can't add word '%s' to read-only language model. "
+ "Disable mmap with '-mmap no' to make it writable\n", word);
+ return -1;
+ }
+
+ wid = ngram_add_word_internal(model, word, -1);
+ if (wid == NGRAM_INVALID_WID)
+ return wid;
+
+ /* Do what needs to be done to add the word to the unigram. */
+ if (model->funcs && model->funcs->add_ug)
+ prob = (*model->funcs->add_ug)(model, wid, logmath_log(model->lmath, weight));
+ if (prob == 0)
+ return -1;
+
+ return wid;
+}
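+/*
+ * Illustrative sketch added by the editor (not part of the upstream file):
+ * adding a unigram at runtime. The model must be writable (e.g. loaded with
+ * mmap disabled); the weight is a linear probability, converted to the log
+ * domain above before being handed to the backend. Word and weight are
+ * hypothetical.
+ */
+#if 0
+static void
+example_add_word(ngram_model_t *lm)
+{
+    int32 wid = ngram_model_add_word(lm, "newword", 0.0001f);
+    if (wid == -1)
+        E_ERROR("Failed to add word (model read-only or backend refused)\n");
+}
+#endif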
+
+ngram_class_t *
+ngram_class_new(ngram_model_t *model, int32 tag_wid, int32 start_wid, glist_t classwords)
+{
+ ngram_class_t *lmclass;
+ gnode_t *gn;
+ float32 tprob;
+ int i;
+
+ lmclass = ckd_calloc(1, sizeof(*lmclass));
+ lmclass->tag_wid = tag_wid;
+ /* start_wid is the wid (with the class tag removed) of the first word in the list. */
+ lmclass->start_wid = start_wid;
+ lmclass->n_words = glist_count(classwords);
+ lmclass->prob1 = ckd_calloc(lmclass->n_words, sizeof(*lmclass->prob1));
+ lmclass->nword_hash = NULL;
+ lmclass->n_hash = 0;
+ tprob = 0.0;
+ for (gn = classwords; gn; gn = gnode_next(gn)) {
+ tprob += gnode_float32(gn);
+ }
+ if (tprob > 1.1 || tprob < 0.9) {
+ E_INFO("Total class probability is %f, will normalize\n", tprob);
+ for (gn = classwords; gn; gn = gnode_next(gn)) {
+ gn->data.fl /= tprob;
+ }
+ }
+ for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
+ lmclass->prob1[i] = logmath_log(model->lmath, gnode_float32(gn));
+ }
+
+ return lmclass;
+}
+
+int32
+ngram_class_add_word(ngram_class_t *lmclass, int32 wid, int32 lweight)
+{
+ int32 hash;
+
+ if (lmclass->nword_hash == NULL) {
+ /* Initialize everything in it to -1 */
+ lmclass->nword_hash = ckd_malloc(NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
+ memset(lmclass->nword_hash, 0xff, NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
+ lmclass->n_hash = NGRAM_HASH_SIZE;
+ lmclass->n_hash_inuse = 0;
+ }
+ /* Stupidest possible hash function. This will work pretty well
+ * when this function is called repeatedly with contiguous word
+ * IDs, though... */
+ hash = wid & (lmclass->n_hash - 1);
+ if (lmclass->nword_hash[hash].wid == -1) {
+ /* Good, no collision. */
+ lmclass->nword_hash[hash].wid = wid;
+ lmclass->nword_hash[hash].prob1 = lweight;
+ ++lmclass->n_hash_inuse;
+ return hash;
+ }
+ else {
+ int32 next; /**< Next available bucket. */
+ /* Collision... Find the end of the hash chain. */
+ while (lmclass->nword_hash[hash].next != -1)
+ hash = lmclass->nword_hash[hash].next;
+ assert(hash != -1);
+ /* Are there any free buckets left? */
+ if (lmclass->n_hash_inuse == lmclass->n_hash) {
+ /* No free buckets, so grow the hash table. */
+ lmclass->nword_hash = ckd_realloc(lmclass->nword_hash,
+ lmclass->n_hash * 2 * sizeof(*lmclass->nword_hash));
+ memset(lmclass->nword_hash + lmclass->n_hash,
+ 0xff, lmclass->n_hash * sizeof(*lmclass->nword_hash));
+ /* Just use the next allocated one (easy) */
+ next = lmclass->n_hash;
+ lmclass->n_hash *= 2;
+ }
+ else {
+ /* Look for any available bucket. We hope this doesn't happen. */
+ for (next = 0; next < lmclass->n_hash; ++next)
+ if (lmclass->nword_hash[next].wid == -1)
+ break;
+ /* This should absolutely not happen. */
+ assert(next != lmclass->n_hash);
+ }
+ lmclass->nword_hash[next].wid = wid;
+ lmclass->nword_hash[next].prob1 = lweight;
+ lmclass->nword_hash[hash].next = next;
+ ++lmclass->n_hash_inuse;
+ return next;
+ }
+}
+
+void
+ngram_class_free(ngram_class_t *lmclass)
+{
+ ckd_free(lmclass->nword_hash);
+ ckd_free(lmclass->prob1);
+ ckd_free(lmclass);
+}
+
+int32
+ngram_model_add_class_word(ngram_model_t *model,
+ const char *classname,
+ const char *word,
+ float32 weight)
+{
+ ngram_class_t *lmclass;
+ int32 classid, tag_wid, wid, i, scale;
+ float32 fprob;
+
+ /* Find the class corresponding to classname. Linear search
+ * probably okay here since there won't be very many classes, and
+ * this doesn't have to be fast. */
+ tag_wid = ngram_wid(model, classname);
+ if (tag_wid == NGRAM_INVALID_WID) {
+ E_ERROR("No such word or class tag: %s\n", classname);
+ return tag_wid;
+ }
+ for (classid = 0; classid < model->n_classes; ++classid) {
+ if (model->classes[classid]->tag_wid == tag_wid)
+ break;
+ }
+ /* No such class exists; creating one implicitly here would mask errors, so fail. */
+ if (classid == model->n_classes) {
+ E_ERROR("Word %s is not a class tag (call ngram_model_add_class() first)\n", classname);
+ return NGRAM_INVALID_WID;
+ }
+ lmclass = model->classes[classid];
+
+ /* Add this word to the model's set of words. */
+ wid = ngram_add_word_internal(model, word, classid);
+ if (wid == NGRAM_INVALID_WID)
+ return wid;
+
+ /* This is the fixed probability of the new word. */
+ fprob = weight * 1.0f / (lmclass->n_words + lmclass->n_hash_inuse + 1);
+ /* Now normalize everything else to fit it in. This is
+ * accomplished by simply scaling all the other probabilities
+ * by (1-fprob). */
+ scale = logmath_log(model->lmath, 1.0 - fprob);
+ for (i = 0; i < lmclass->n_words; ++i)
+ lmclass->prob1[i] += scale;
+ for (i = 0; i < lmclass->n_hash; ++i)
+ if (lmclass->nword_hash[i].wid != -1)
+ lmclass->nword_hash[i].prob1 += scale;
+
+ /* Now add it to the class hash table. */
+ return ngram_class_add_word(lmclass, wid, logmath_log(model->lmath, fprob));
+}
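+/*
+ * Editor's note (not part of the upstream file): a small worked example of
+ * the renormalization above. If a class currently holds 4 words and a new
+ * word is added with weight 1.0, then fprob = 1/5, and every existing
+ * in-class log probability is shifted by log(1 - 1/5) = log(4/5), so the
+ * class distribution still sums to roughly one. A hypothetical call,
+ * assuming a "[day]" class was created earlier with ngram_model_add_class():
+ */
+#if 0
+static int32
+example_extend_class(ngram_model_t *lm)
+{
+    return ngram_model_add_class_word(lm, "[day]", "friday", 1.0f);
+}
+#endif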
+
+int32
+ngram_model_add_class(ngram_model_t *model,
+ const char *classname,
+ float32 classweight,
+ char **words,
+ const float32 *weights,
+ int32 n_words)
+{
+ ngram_class_t *lmclass;
+ glist_t classwords = NULL;
+ int32 i, start_wid = -1;
+ int32 classid, tag_wid;
+
+ /* Check if classname already exists in model. If not, add it.*/
+ if ((tag_wid = ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
+ tag_wid = ngram_model_add_word(model, classname, classweight);
+ if (tag_wid == NGRAM_INVALID_WID)
+ return -1;
+ }
+
+ if (model->n_classes == 128) {
+ E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
+ return -1;
+ }
+ classid = model->n_classes;
+ for (i = 0; i < n_words; ++i) {
+ int32 wid;
+
+ wid = ngram_add_word_internal(model, words[i], classid);
+ if (wid == NGRAM_INVALID_WID)
+ return -1;
+ if (start_wid == -1)
+ start_wid = NGRAM_BASEWID(wid);
+ classwords = glist_add_float32(classwords, weights[i]);
+ }
+ classwords = glist_reverse(classwords);
+ lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
+ glist_free(classwords);
+ if (lmclass == NULL)
+ return -1;
+
+ ++model->n_classes;
+ if (model->classes == NULL)
+ model->classes = ckd_calloc(1, sizeof(*model->classes));
+ else
+ model->classes = ckd_realloc(model->classes,
+ model->n_classes * sizeof(*model->classes));
+ model->classes[classid] = lmclass;
+ return classid;
+}
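+/*
+ * Illustrative sketch added by the editor (not part of the upstream file):
+ * defining a small in-model class. The class tag, member words and weights
+ * are hypothetical; the model must be writable if the tag word does not
+ * already exist, and weights are renormalized by ngram_class_new() if they
+ * do not sum to roughly one.
+ */
+#if 0
+static int32
+example_add_day_class(ngram_model_t *lm)
+{
+    char *words[] = { "monday", "tuesday", "wednesday" };
+    const float32 weights[] = { 0.4f, 0.3f, 0.3f };
+
+    /* The [day] unigram is added with weight 0.1 if it is not already present. */
+    return ngram_model_add_class(lm, "[day]", 0.1f, words, weights, 3);
+}
+#endif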
+
+int32
+ngram_class_prob(ngram_class_t *lmclass, int32 wid)
+{
+ int32 base_wid = NGRAM_BASEWID(wid);
+
+ if (base_wid < lmclass->start_wid
+ || base_wid > lmclass->start_wid + lmclass->n_words) {
+ int32 hash;
+
+ /* Look it up in the hash table. */
+ hash = wid & (lmclass->n_hash - 1);
+ while (hash != -1 && lmclass->nword_hash[hash].wid != wid)
+ hash = lmclass->nword_hash[hash].next;
+ if (hash == -1)
+ return 1;
+ return lmclass->nword_hash[hash].prob1;
+ }
+ else {
+ return lmclass->prob1[base_wid - lmclass->start_wid];
+ }
+}
+
+int32
+read_classdef_file(hash_table_t *classes, const char *file_name)
+{
+ FILE *fp;
+ int32 is_pipe;
+ int inclass; /**< Are we currently reading a list of class words? */
+ int32 rv = -1;
+ gnode_t *gn;
+ glist_t classwords = NULL;
+ glist_t classprobs = NULL;
+ char *classname = NULL;
+
+ if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
+ E_ERROR("File %s not found\n", file_name);
+ return -1;
+ }
+
+ inclass = FALSE;
+ while (!feof(fp)) {
+ char line[512];
+ char *wptr[2];
+ int n_words;
+
+ if (fgets(line, sizeof(line), fp) == NULL)
+ break;
+
+ n_words = str2words(line, wptr, 2);
+ if (n_words <= 0)
+ continue;
+
+ if (inclass) {
+ /* Look for an end of class marker. */
+ if (n_words == 2 && 0 == strcmp(wptr[0], "END")) {
+ classdef_t *classdef;
+ gnode_t *word, *weight;
+ int32 i;
+
+ if (classname == NULL || 0 != strcmp(wptr[1], classname))
+ goto error_out;
+ inclass = FALSE;
+
+ /* Construct a class from the list of words collected. */
+ classdef = ckd_calloc(1, sizeof(*classdef));
+ classwords = glist_reverse(classwords);
+ classprobs = glist_reverse(classprobs);
+ classdef->n_words = glist_count(classwords);
+ classdef->words = ckd_calloc(classdef->n_words,
+ sizeof(*classdef->words));
+ classdef->weights = ckd_calloc(classdef->n_words,
+ sizeof(*classdef->weights));
+ word = classwords;
+ weight = classprobs;
+ for (i = 0; i < classdef->n_words; ++i) {
+ classdef->words[i] = gnode_ptr(word);
+ classdef->weights[i] = gnode_float32(weight);
+ word = gnode_next(word);
+ weight = gnode_next(weight);
+ }
+
+ /* Add this class to the hash table. */
+ if (hash_table_enter(classes, classname, classdef) != classdef) {
+ classdef_free(classdef);
+ goto error_out;
+ }
+
+ /* Reset everything. */
+ glist_free(classwords);
+ glist_free(classprobs);
+ classwords = NULL;
+ classprobs = NULL;
+ classname = NULL;
+ }
+ else {
+ float32 fprob;
+
+ if (n_words == 2)
+ fprob = (float32)atof_c(wptr[1]);
+ else
+ fprob = 1.0f;
+ /* Add it to the list of words for this class. */
+ classwords = glist_add_ptr(classwords, ckd_salloc(wptr[0]));
+ classprobs = glist_add_float32(classprobs, fprob);
+ }
+ }
+ else {
+ /* Start a new LM class if the LMCLASS marker is seen */
+ if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) {
+ if (inclass)
+ goto error_out;
+ inclass = TRUE;
+ classname = ckd_salloc(wptr[1]);
+ }
+ /* Otherwise, just ignore whatever junk we got */
+ }
+ }
+ rv = 0; /* Success. */
+
+error_out:
+ /* Free all the stuff we might have allocated. */
+ fclose_comp(fp, is_pipe);
+ for (gn = classwords; gn; gn = gnode_next(gn))
+ ckd_free(gnode_ptr(gn));
+ glist_free(classwords);
+ glist_free(classprobs);
+ ckd_free(classname);
+
+ return rv;
+}
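+/*
+ * Editor's note (not part of the upstream file): a hypothetical class
+ * definition file in the format parsed above. Each class starts with
+ * "LMCLASS <name>" and ends with "END <name>"; each member line is a word,
+ * optionally followed by its in-class probability (default 1.0):
+ *
+ *   LMCLASS [day]
+ *   monday 0.4
+ *   tuesday 0.3
+ *   wednesday 0.3
+ *   END [day]
+ */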
+
+void
+classdef_free(classdef_t *classdef)
+{
+ int32 i;
+ for (i = 0; i < classdef->n_words; ++i)
+ ckd_free(classdef->words[i]);
+ ckd_free(classdef->words);
+ ckd_free(classdef->weights);
+ ckd_free(classdef);
+}
+
+
+int32
+ngram_model_read_classdef(ngram_model_t *model,
+ const char *file_name)
+{
+ hash_table_t *classes;
+ glist_t hl = NULL;
+ gnode_t *gn;
+ int32 rv = -1;
+
+ classes = hash_table_new(0, FALSE);
+ if (read_classdef_file(classes, file_name) < 0) {
+ hash_table_free(classes);
+ return -1;
+ }
+
+ /* Create a new class in the language model for each classdef. */
+ hl = hash_table_tolist(classes, NULL);
+ for (gn = hl; gn; gn = gnode_next(gn)) {
+ hash_entry_t *he = gnode_ptr(gn);
+ classdef_t *classdef = he->val;
+
+ if (ngram_model_add_class(model, he->key, 1.0,
+ classdef->words,
+ classdef->weights,
+ classdef->n_words) < 0)
+ goto error_out;
+ }
+ rv = 0;
+
+error_out:
+ for (gn = hl; gn; gn = gnode_next(gn)) {
+ hash_entry_t *he = gnode_ptr(gn);
+ ckd_free((char *)he->key);
+ classdef_free(he->val);
+ }
+ glist_free(hl);
+ hash_table_free(classes);
+ return rv;
+}
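+/*
+ * Illustrative sketch added by the editor (not part of the upstream file):
+ * tying the pieces together -- load a model, attach class definitions from a
+ * hypothetical classdef file, query a score, and release the model.
+ */
+#if 0
+static void
+example_end_to_end(logmath_t *lmath)
+{
+    ngram_model_t *lm;
+
+    lm = ngram_model_read(NULL, "example.lm", NGRAM_ARPA, lmath);
+    if (lm == NULL)
+        return;
+    if (ngram_model_read_classdef(lm, "example.classdef") < 0)
+        E_WARN("No class definitions loaded\n");
+    E_INFO("score = %d\n", ngram_score(lm, "three", "two", "one", NULL));
+    ngram_model_free(lm);
+}
+#endif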