Add m-esr52 at 52.6.0

author: Matt A. Tobin <mattatobin@localhost.localdomain> 2018-02-02 04:16:08 -0500
committer: Matt A. Tobin <mattatobin@localhost.localdomain> 2018-02-02 04:16:08 -0500
commit: 5f8de423f190bbb79a62f804151bc24824fa32d8 (patch)
tree: 10027f336435511475e392454359edea8e25895d /media/sphinxbase/src/libsphinxbase/lm/ngram_model_arpa.c
parent: 49ee0794b5d912db1f95dce6eb52d781dc210db5 (diff)
download: UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.gz
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.lz
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.xz
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.zip
1 files changed, 660 insertions, 0 deletions
diff --git a/media/sphinxbase/src/libsphinxbase/lm/ngram_model_arpa.c b/media/sphinxbase/src/libsphinxbase/lm/ngram_model_arpa.c
new file mode 100644
index 000000000..a4b72cb00
--- /dev/null
+++ b/media/sphinxbase/src/libsphinxbase/lm/ngram_model_arpa.c
@@ -0,0 +1,660 @@
+/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
+/* ====================================================================
+ * Copyright (c) 1999-2007 Carnegie Mellon University.  All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * This work was supported in part by funding from the Defense Advanced 
+ * Research Projects Agency and the National Science Foundation of the 
+ * United States of America, and the CMU Sphinx Speech Consortium.
+ *
+ * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
+ * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
+ * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ====================================================================
+ *
+ */
+/*
+ * \file ngram_model_arpa.c ARPA format language models
+ *
+ * Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
+ */
+
+#include "sphinxbase/ckd_alloc.h"
+#include <string.h>
+#include <limits.h>
+#include <assert.h>
+
+#include "sphinxbase/err.h"
+#include "sphinxbase/pio.h"
+#include "sphinxbase/listelem_alloc.h"
+#include "sphinxbase/strfuncs.h"
+
+#include "ngram_model_arpa.h"
+
+static ngram_funcs_t ngram_model_arpa_funcs;
+
+#define TSEG_BASE(m,b)		((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ])
+#define FIRST_BG(m,u)		((m)->lm3g.unigrams[u].bigrams)
+#define FIRST_TG(m,b)		(TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams))
+
+/*
+ * Read and return #unigrams, #bigrams, #trigrams as stated in input file.
+ */
+static int
+ReadNgramCounts(lineiter_t **li, int32 * n_ug, int32 * n_bg, int32 * n_tg)
+{
+    int32 ngram, ngram_cnt;
+
+    /* skip file until past the '\data\' marker */
+    while (*li) {
+        string_trim((*li)->buf, STRING_BOTH);
+        if (strcmp((*li)->buf, "\\data\\") == 0)
+            break;
+        *li = lineiter_next(*li);
+    }
+    if (*li == NULL || strcmp((*li)->buf, "\\data\\") != 0) {
+        E_INFO("No \\data\\ mark in LM file\n");
+        return -1;
+    }
+
+    *n_ug = *n_bg = *n_tg = 0;
+    while ((*li = lineiter_next(*li))) {
+        if (sscanf((*li)->buf, "ngram %d=%d", &ngram, &ngram_cnt) != 2)
+            break;
+        switch (ngram) {
+        case 1:
+            *n_ug = ngram_cnt;
+            break;
+        case 2:
+            *n_bg = ngram_cnt;
+            break;
+        case 3:
+            *n_tg = ngram_cnt;
+            break;
+        default:
+            E_ERROR("Unknown ngram (%d)\n", ngram);
+            return -1;
+        }
+    }
+    if (*li == NULL) {
+        E_ERROR("EOF while reading ngram counts\n");
+        return -1;
+    }
+
+    /* Position iterator to the unigrams header '\1-grams:\' */
+    while ((*li = lineiter_next(*li))) {
+        string_trim((*li)->buf, STRING_BOTH);
+        if (strcmp((*li)->buf, "\\1-grams:") == 0)
+            break;
+    }
+    if (*li == NULL) {
+        E_ERROR_SYSTEM("Failed to read \\1-grams: mark");
+        return -1;
+    }
+
+    if ((*n_ug <= 0) || (*n_bg < 0) || (*n_tg < 0)) {
+        E_ERROR("Bad or missing ngram count\n");
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Read in the unigrams from given file into the LM structure model.
+ * On entry to this procedure, the iterator is positioned to the
+ * header line '\1-grams:'.
+ */
+static int
+ReadUnigrams(lineiter_t **li, ngram_model_arpa_t * model)
+{
+    ngram_model_t *base = &model->base;
+    int32 wcnt;
+    float p1;
+
+    E_INFO("Reading unigrams\n");
+
+    wcnt = 0;
+    while ((*li = lineiter_next(*li))) {
+        char *wptr[3], *name;
+        float32 bo_wt = 0.0f;
+        int n;
+
+        string_trim((*li)->buf, STRING_BOTH);
+        if (strcmp((*li)->buf, "\\2-grams:") == 0
+            || strcmp((*li)->buf, "\\end\\") == 0)
+            break;
+
+        if ((n = str2words((*li)->buf, wptr, 3)) < 2) {
+            if ((*li)->buf[0] != '\0')
+                E_WARN("Format error; unigram ignored: %s\n", (*li)->buf);
+            continue;
+        }
+        else {
+            p1 = (float)atof_c(wptr[0]);
+            name = wptr[1];
+            if (n == 3)
+                bo_wt = (float)atof_c(wptr[2]);
+        }
+
+        if (wcnt >= base->n_counts[0]) {
+            E_ERROR("Too many unigrams\n");
+            return -1;
+        }
+
+        /* Associate name with word id */
+        base->word_str[wcnt] = ckd_salloc(name);
+        if ((hash_table_enter(base->wid, base->word_str[wcnt], (void *)(long)wcnt))
+            != (void *)(long)wcnt) {
+                E_WARN("Duplicate word in dictionary: %s\n", base->word_str[wcnt]);
+        }
+        model->lm3g.unigrams[wcnt].prob1.l = logmath_log10_to_log(base->lmath, p1);
+        model->lm3g.unigrams[wcnt].bo_wt1.l = logmath_log10_to_log(base->lmath, bo_wt);
+        wcnt++;
+    }
+
+    if (base->n_counts[0] != wcnt) {
+        E_WARN("lm_t.ucount(%d) != #unigrams read(%d)\n",
+               base->n_counts[0], wcnt);
+        base->n_counts[0] = wcnt;
+        base->n_words = wcnt;
+    }
+    return 0;
+}
+
+/*
+ * Read bigrams from given file into given model structure.
+ */
+static int
+ReadBigrams(lineiter_t **li, ngram_model_arpa_t * model)
+{
+    ngram_model_t *base = &model->base;
+    int32 w1, w2, prev_w1, bgcount;
+    bigram_t *bgptr;
+
+    E_INFO("Reading bigrams\n");
+
+    bgcount = 0;
+    bgptr = model->lm3g.bigrams;
+    prev_w1 = -1;
+
+    while ((*li = lineiter_next(*li))) {
+        float32 p, bo_wt = 0.0f;
+        int32 p2, bo_wt2;
+        char *wptr[4], *word1, *word2;
+        int n;
+
+        string_trim((*li)->buf, STRING_BOTH);
+        wptr[3] = NULL;
+        if ((n = str2words((*li)->buf, wptr, 4)) < 3) {
+            if ((*li)->buf[0] != '\0')
+                break;
+            continue;
+        }
+        else {
+            p = (float32)atof_c(wptr[0]);
+            word1 = wptr[1];
+            word2 = wptr[2];
+            if (wptr[3])
+                bo_wt = (float32)atof_c(wptr[3]);
+        }
+
+        if ((w1 = ngram_wid(base, word1)) == NGRAM_INVALID_WID) {
+            E_ERROR("Unknown word: %s, skipping bigram (%s %s)\n",
+                    word1, word1, word2);
+            continue;
+        }
+        if ((w2 = ngram_wid(base, word2)) == NGRAM_INVALID_WID) {
+            E_ERROR("Unknown word: %s, skipping bigram (%s %s)\n",
+                    word2, word1, word2);
+            continue;
+        }
+
+        /* FIXME: Should use logmath_t quantization here. */
+        /* HACK!! to quantize probs to 4 decimal digits */
+        p = (float32)((int32)(p * 10000)) / 10000;
+        bo_wt = (float32)((int32)(bo_wt * 10000)) / 10000;
+
+        p2 = logmath_log10_to_log(base->lmath, p);
+        bo_wt2 = logmath_log10_to_log(base->lmath, bo_wt);
+
+        if (bgcount >= base->n_counts[1]) {
+            E_ERROR("Too many bigrams\n");
+            return -1;
+        }
+
+        bgptr->wid = w2;
+        bgptr->prob2 = sorted_id(&model->sorted_prob2, &p2);
+        if (base->n_counts[2] > 0)
+            bgptr->bo_wt2 = sorted_id(&model->sorted_bo_wt2, &bo_wt2);
+
+        if (w1 != prev_w1) {
+            if (w1 < prev_w1) {
+                E_ERROR("Bigram %s %s not in unigram order word id: %d prev word id: %d\n", word1, word2, w1, prev_w1);
+                return -1;
+            }
+
+            for (prev_w1++; prev_w1 <= w1; prev_w1++)
+                model->lm3g.unigrams[prev_w1].bigrams = bgcount;
+            prev_w1 = w1;
+        }
+        bgcount++;
+        bgptr++;
+
+        if ((bgcount & 0x0000ffff) == 0) {
+            E_INFOCONT(".");
+        }
+    }
+    if (*li == NULL || ((strcmp((*li)->buf, "\\end\\") != 0)
+                        && (strcmp((*li)->buf, "\\3-grams:") != 0))) {
+        E_ERROR("Bad bigram: %s\n", (*li)->buf);
+        return -1;
+    }
+
+    for (prev_w1++; prev_w1 <= base->n_counts[0]; prev_w1++)
+        model->lm3g.unigrams[prev_w1].bigrams = bgcount;
+
+    return 0;
+}
+
+/*
+ * Very similar to ReadBigrams.
+ */
+static int
+ReadTrigrams(lineiter_t **li, ngram_model_arpa_t * model)
+{
+    ngram_model_t *base = &model->base;
+    int32 i, w1, w2, w3, prev_w1, prev_w2, tgcount, prev_bg, bg, endbg;
+    int32 seg, prev_seg, prev_seg_lastbg;
+    trigram_t *tgptr;
+    bigram_t *bgptr;
+
+    E_INFO("Reading trigrams\n");
+
+    tgcount = 0;
+    tgptr = model->lm3g.trigrams;
+    prev_w1 = -1;
+    prev_w2 = -1;
+    prev_bg = -1;
+    prev_seg = -1;
+
+    while ((*li = lineiter_next(*li))) {
+        float32 p;
+        int32 p3;
+        char *wptr[4], *word1, *word2, *word3;
+
+        string_trim((*li)->buf, STRING_BOTH);
+        if (str2words((*li)->buf, wptr, 4) != 4) {
+            if ((*li)->buf[0] != '\0')
+                break;
+            continue;
+        }
+        else {
+            p = (float32)atof_c(wptr[0]);
+            word1 = wptr[1];
+            word2 = wptr[2];
+            word3 = wptr[3];
+        }
+
+        if ((w1 = ngram_wid(base, word1)) == NGRAM_INVALID_WID) {
+            E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
+                    word1, word1, word2, word3);
+            continue;
+        }
+        if ((w2 = ngram_wid(base, word2)) == NGRAM_INVALID_WID) {
+            E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
+                    word2, word1, word2, word3);
+            continue;
+        }
+        if ((w3 = ngram_wid(base, word3)) == NGRAM_INVALID_WID) {
+            E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
+                    word3, word1, word2, word3);
+            continue;
+        }
+
+        /* FIXME: Should use logmath_t quantization here. */
+        /* HACK!! to quantize probs to 4 decimal digits */
+        p = (float32)((int32)(p * 10000)) / 10000;
+        p3 = logmath_log10_to_log(base->lmath, p);
+
+        if (tgcount >= base->n_counts[2]) {
+            E_ERROR("Too many trigrams\n");
+            return -1;
+        }
+
+        tgptr->wid = w3;
+        tgptr->prob3 = sorted_id(&model->sorted_prob3, &p3);
+
+        if ((w1 != prev_w1) || (w2 != prev_w2)) {
+            /* Trigram for a new bigram; update tg info for all previous bigrams */
+            if ((w1 < prev_w1) || ((w1 == prev_w1) && (w2 < prev_w2))) {
+                E_ERROR("Trigrams not in bigram order\n");
+                return -1;
+            }
+
+            bg = (w1 !=
+                  prev_w1) ? model->lm3g.unigrams[w1].bigrams : prev_bg + 1;
+            endbg = model->lm3g.unigrams[w1 + 1].bigrams;
+            bgptr = model->lm3g.bigrams + bg;
+            for (; (bg < endbg) && (bgptr->wid != w2); bg++, bgptr++);
+            if (bg >= endbg) {
+                E_ERROR("Missing bigram for trigram: %s", (*li)->buf);
+                return -1;
+            }
+
+            /* bg = bigram entry index for <w1,w2>.  Update tseg_base */
+            seg = bg >> LOG_BG_SEG_SZ;
+            for (i = prev_seg + 1; i <= seg; i++)
+                model->lm3g.tseg_base[i] = tgcount;
+
+            /* Update trigrams pointers for all bigrams until bg */
+            if (prev_seg < seg) {
+                int32 tgoff = 0;
+
+                if (prev_seg >= 0) {
+                    tgoff = tgcount - model->lm3g.tseg_base[prev_seg];
+                    if (tgoff > 65535) {
+                        E_ERROR("Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
+                        return -1;
+                    }
+                }
+
+                prev_seg_lastbg = ((prev_seg + 1) << LOG_BG_SEG_SZ) - 1;
+                bgptr = model->lm3g.bigrams + prev_bg;
+                for (++prev_bg, ++bgptr; prev_bg <= prev_seg_lastbg;
+                     prev_bg++, bgptr++)
+                    bgptr->trigrams = tgoff;
+
+                for (; prev_bg <= bg; prev_bg++, bgptr++)
+                    bgptr->trigrams = 0;
+            }
+            else {
+                int32 tgoff;
+
+                tgoff = tgcount - model->lm3g.tseg_base[prev_seg];
+                if (tgoff > 65535) {
+                    E_ERROR("Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
+                    return -1;
+                }
+
+                bgptr = model->lm3g.bigrams + prev_bg;
+                for (++prev_bg, ++bgptr; prev_bg <= bg; prev_bg++, bgptr++)
+                    bgptr->trigrams = tgoff;
+            }
+
+            prev_w1 = w1;
+            prev_w2 = w2;
+            prev_bg = bg;
+            prev_seg = seg;
+        }
+
+        tgcount++;
+        tgptr++;
+
+        if ((tgcount & 0x0000ffff) == 0) {
+            E_INFOCONT(".");
+        }
+    }
+    if (*li == NULL || strcmp((*li)->buf, "\\end\\") != 0) {
+        E_ERROR("Bad trigram: %s\n", (*li)->buf);
+        return -1;
+    }
+
+    for (prev_bg++; prev_bg <= base->n_counts[1]; prev_bg++) {
+        if ((prev_bg & (BG_SEG_SZ - 1)) == 0)
+            model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ] = tgcount;
+        if ((tgcount - model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ]) > 65535) {
+            E_ERROR("Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
+            return -1;
+        }
+        model->lm3g.bigrams[prev_bg].trigrams =
+            tgcount - model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ];
+    }
+    return 0;
+}
+
+static unigram_t *
+new_unigram_table(int32 n_ug)
+{
+    unigram_t *table;
+    int32 i;
+
+    table = ckd_calloc(n_ug, sizeof(unigram_t));
+    for (i = 0; i < n_ug; i++) {
+        table[i].prob1.l = INT_MIN;
+        table[i].bo_wt1.l = INT_MIN;
+    }
+    return table;
+}
+
+ngram_model_t *
+ngram_model_arpa_read(cmd_ln_t *config,
+		      const char *file_name,
+		      logmath_t *lmath)
+{
+    lineiter_t *li;
+    FILE *fp;
+    int32 is_pipe;
+    int32 n_unigram;
+    int32 n_bigram;
+    int32 n_trigram;
+    int32 n;
+    ngram_model_arpa_t *model;
+    ngram_model_t *base;
+
+    if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
+        E_ERROR("File %s not found\n", file_name);
+        return NULL;
+    }
+    li = lineiter_start(fp);
+ 
+    /* Read #unigrams, #bigrams, #trigrams from file */
+    if (ReadNgramCounts(&li, &n_unigram, &n_bigram, &n_trigram) == -1) {
+        lineiter_free(li);
+        fclose_comp(fp, is_pipe);
+        return NULL;
+    }
+    E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
+
+    /* Allocate space for LM, including initial OOVs and placeholders; initialize it */
+    model = ckd_calloc(1, sizeof(*model));
+    base = &model->base;
+    if (n_trigram > 0)
+        n = 3;
+    else if (n_bigram > 0)
+        n = 2;
+    else
+        n = 1;
+    /* Initialize base model. */
+    ngram_model_init(base, &ngram_model_arpa_funcs, lmath, n, n_unigram);
+    base->n_counts[0] = n_unigram;
+    base->n_counts[1] = n_bigram;
+    base->n_counts[2] = n_trigram;
+    base->writable = TRUE;
+
+    /*
+     * Allocate one extra unigram and bigram entry: sentinels to terminate
+     * followers (bigrams and trigrams, respectively) of previous entry.
+     */
+    model->lm3g.unigrams = new_unigram_table(n_unigram + 1);
+    model->lm3g.bigrams =
+        ckd_calloc(n_bigram + 1, sizeof(bigram_t));
+    if (n_trigram > 0)
+        model->lm3g.trigrams =
+            ckd_calloc(n_trigram, sizeof(trigram_t));
+
+    if (n_trigram > 0) {
+        model->lm3g.tseg_base =
+            ckd_calloc((n_bigram + 1) / BG_SEG_SZ + 1,
+                       sizeof(int32));
+    }
+    if (ReadUnigrams(&li, model) == -1) {
+        fclose_comp(fp, is_pipe);
+        ngram_model_free(base);
+        return NULL;
+    }
+    E_INFO("%8d = #unigrams created\n", base->n_counts[0]);
+
+    if (base->n_counts[2] > 0)
+        init_sorted_list(&model->sorted_bo_wt2);
+
+    if (base->n_counts[1] > 0) {
+        init_sorted_list(&model->sorted_prob2);
+
+        if (ReadBigrams(&li, model) == -1) {
+            fclose_comp(fp, is_pipe);
+            ngram_model_free(base);
+            return NULL;
+        }
+
+        base->n_counts[1] = FIRST_BG(model, base->n_counts[0]);
+        model->lm3g.n_prob2 = model->sorted_prob2.free;
+        model->lm3g.prob2 = vals_in_sorted_list(&model->sorted_prob2);
+        free_sorted_list(&model->sorted_prob2);
+        E_INFO("%8d = #bigrams created\n", base->n_counts[1]);
+        E_INFO("%8d = #prob2 entries\n", model->lm3g.n_prob2);
+    }
+
+    if (base->n_counts[2] > 0) {
+        /* Create trigram bo-wts array */
+        model->lm3g.n_bo_wt2 = model->sorted_bo_wt2.free;
+        model->lm3g.bo_wt2 = vals_in_sorted_list(&model->sorted_bo_wt2);
+        free_sorted_list(&model->sorted_bo_wt2);
+        E_INFO("%8d = #bo_wt2 entries\n", model->lm3g.n_bo_wt2);
+
+        init_sorted_list(&model->sorted_prob3);
+
+        if (ReadTrigrams(&li, model) == -1) {
+            fclose_comp(fp, is_pipe);
+            ngram_model_free(base);
+            return NULL;
+        }
+
+        base->n_counts[2] = FIRST_TG(model, base->n_counts[1]);
+        model->lm3g.n_prob3 = model->sorted_prob3.free;
+        model->lm3g.prob3 = vals_in_sorted_list(&model->sorted_prob3);
+        E_INFO("%8d = #trigrams created\n", base->n_counts[2]);
+        E_INFO("%8d = #prob3 entries\n", model->lm3g.n_prob3);
+
+        free_sorted_list(&model->sorted_prob3);
+
+        /* Initialize tginfo */
+        model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *));
+        model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
+    }
+
+    lineiter_free(li);
+    fclose_comp(fp, is_pipe);
+    return base;
+}
+
+int
+ngram_model_arpa_write(ngram_model_t *model,
+		       const char *file_name)
+{
+    ngram_iter_t *itor;
+    FILE *fh;
+    int i;
+
+    if ((fh = fopen(file_name, "w")) == NULL) {
+        E_ERROR_SYSTEM("Failed to open %s for writing", file_name);
+        return -1;
+    }
+    fprintf(fh, "This is an ARPA-format language model file, generated by CMU Sphinx\n");
+
+    /* The ARPA format doesn't require any extra information that
+     * N-Gram iterators can't give us, so this is very
+     * straightforward compared with DMP writing. */
+
+    /* Write N-gram counts. */
+    fprintf(fh, "\\data\\\n");
+    for (i = 0; i < model->n; ++i) {
+        fprintf(fh, "ngram %d=%d\n", i+1, model->n_counts[i]);
+    }
+
+    /* Write N-grams */
+    for (i = 0; i < model->n; ++i) {
+        fprintf(fh, "\n\\%d-grams:\n", i + 1);
+        for (itor = ngram_model_mgrams(model, i); itor; itor = ngram_iter_next(itor)) {
+            int32 const *wids;
+            int32 score, bowt;
+            int j;
+
+            wids = ngram_iter_get(itor, &score, &bowt);
+            fprintf(fh, "%.4f ", logmath_log_to_log10(model->lmath, score));
+            for (j = 0; j <= i; ++j) {
+                assert(wids[j] < model->n_counts[0]);
+                fprintf(fh, "%s ", model->word_str[wids[j]]);
+            }
+            if (i < model->n-1)
+                fprintf(fh, "%.4f", logmath_log_to_log10(model->lmath, bowt));
+            fprintf(fh, "\n");
+        }
+    }
+    fprintf(fh, "\n\\end\\\n");
+    return fclose(fh);
+}
+
+static int
+ngram_model_arpa_apply_weights(ngram_model_t *base, float32 lw,
+                              float32 wip, float32 uw)
+{
+    ngram_model_arpa_t *model = (ngram_model_arpa_t *)base;
+    lm3g_apply_weights(base, &model->lm3g, lw, wip, uw);
+    return 0;
+}
+
+/* Lousy "templating" for things that are largely the same in DMP and
+ * ARPA models, except for the bigram and trigram types and some
+ * names. */
+#define NGRAM_MODEL_TYPE ngram_model_arpa_t
+#include "lm3g_templates.c"
+
+static void
+ngram_model_arpa_free(ngram_model_t *base)
+{
+    ngram_model_arpa_t *model = (ngram_model_arpa_t *)base;
+    ckd_free(model->lm3g.unigrams);
+    ckd_free(model->lm3g.bigrams);
+    ckd_free(model->lm3g.trigrams);
+    ckd_free(model->lm3g.prob2);
+    ckd_free(model->lm3g.bo_wt2);
+    ckd_free(model->lm3g.prob3);
+    lm3g_tginfo_free(base, &model->lm3g);
+    ckd_free(model->lm3g.tseg_base);
+}
+
+static ngram_funcs_t ngram_model_arpa_funcs = {
+    ngram_model_arpa_free,          /* free */
+    ngram_model_arpa_apply_weights, /* apply_weights */
+    lm3g_template_score,            /* score */
+    lm3g_template_raw_score,        /* raw_score */
+    lm3g_template_add_ug,           /* add_ug */
+    lm3g_template_flush,            /* flush */
+    lm3g_template_iter,             /* iter */
+    lm3g_template_mgrams,           /* mgrams */
+    lm3g_template_successors,       /* successors */
+    lm3g_template_iter_get,         /* iter_get */
+    lm3g_template_iter_next,        /* iter_next */
+    lm3g_template_iter_free         /* iter_free */
+};
author	Matt A. Tobin <mattatobin@localhost.localdomain>	2018-02-02 04:16:08 -0500
committer	Matt A. Tobin <mattatobin@localhost.localdomain>	2018-02-02 04:16:08 -0500
commit	5f8de423f190bbb79a62f804151bc24824fa32d8 (patch)
tree	10027f336435511475e392454359edea8e25895d /media/sphinxbase/src/libsphinxbase/lm/ngram_model_arpa.c
parent	49ee0794b5d912db1f95dce6eb52d781dc210db5 (diff)
download	UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.gz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.lz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.xz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.zip