summaryrefslogtreecommitdiffstats
path: root/media/sphinxbase/src/libsphinxbase/lm/ngram_model_arpa.c
diff options
context:
space:
mode:
authorMatt A. Tobin <mattatobin@localhost.localdomain>2018-02-02 04:16:08 -0500
committerMatt A. Tobin <mattatobin@localhost.localdomain>2018-02-02 04:16:08 -0500
commit5f8de423f190bbb79a62f804151bc24824fa32d8 (patch)
tree10027f336435511475e392454359edea8e25895d /media/sphinxbase/src/libsphinxbase/lm/ngram_model_arpa.c
parent49ee0794b5d912db1f95dce6eb52d781dc210db5 (diff)
downloadUXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.gz
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.lz
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.xz
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.zip
Add m-esr52 at 52.6.0
Diffstat (limited to 'media/sphinxbase/src/libsphinxbase/lm/ngram_model_arpa.c')
-rw-r--r--media/sphinxbase/src/libsphinxbase/lm/ngram_model_arpa.c660
1 files changed, 660 insertions, 0 deletions
diff --git a/media/sphinxbase/src/libsphinxbase/lm/ngram_model_arpa.c b/media/sphinxbase/src/libsphinxbase/lm/ngram_model_arpa.c
new file mode 100644
index 000000000..a4b72cb00
--- /dev/null
+++ b/media/sphinxbase/src/libsphinxbase/lm/ngram_model_arpa.c
@@ -0,0 +1,660 @@
+/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
+/* ====================================================================
+ * Copyright (c) 1999-2007 Carnegie Mellon University. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * This work was supported in part by funding from the Defense Advanced
+ * Research Projects Agency and the National Science Foundation of the
+ * United States of America, and the CMU Sphinx Speech Consortium.
+ *
+ * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
+ * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
+ * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ====================================================================
+ *
+ */
+/*
+ * \file ngram_model_arpa.c ARPA format language models
+ *
+ * Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
+ */
+
+#include "sphinxbase/ckd_alloc.h"
+#include <string.h>
+#include <limits.h>
+#include <assert.h>
+
+#include "sphinxbase/err.h"
+#include "sphinxbase/pio.h"
+#include "sphinxbase/listelem_alloc.h"
+#include "sphinxbase/strfuncs.h"
+
+#include "ngram_model_arpa.h"
+
+static ngram_funcs_t ngram_model_arpa_funcs;
+
+#define TSEG_BASE(m,b) ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ])
+#define FIRST_BG(m,u) ((m)->lm3g.unigrams[u].bigrams)
+#define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams))
+
+/*
+ * Read and return #unigrams, #bigrams, #trigrams as stated in input file.
+ */
+static int
+ReadNgramCounts(lineiter_t **li, int32 * n_ug, int32 * n_bg, int32 * n_tg)
+{
+ int32 ngram, ngram_cnt;
+
+ /* skip file until past the '\data\' marker */
+ while (*li) {
+ string_trim((*li)->buf, STRING_BOTH);
+ if (strcmp((*li)->buf, "\\data\\") == 0)
+ break;
+ *li = lineiter_next(*li);
+ }
+ if (*li == NULL || strcmp((*li)->buf, "\\data\\") != 0) {
+ E_INFO("No \\data\\ mark in LM file\n");
+ return -1;
+ }
+
+ *n_ug = *n_bg = *n_tg = 0;
+ while ((*li = lineiter_next(*li))) {
+ if (sscanf((*li)->buf, "ngram %d=%d", &ngram, &ngram_cnt) != 2)
+ break;
+ switch (ngram) {
+ case 1:
+ *n_ug = ngram_cnt;
+ break;
+ case 2:
+ *n_bg = ngram_cnt;
+ break;
+ case 3:
+ *n_tg = ngram_cnt;
+ break;
+ default:
+ E_ERROR("Unknown ngram (%d)\n", ngram);
+ return -1;
+ }
+ }
+ if (*li == NULL) {
+ E_ERROR("EOF while reading ngram counts\n");
+ return -1;
+ }
+
+ /* Position iterator to the unigrams header '\1-grams:\' */
+ while ((*li = lineiter_next(*li))) {
+ string_trim((*li)->buf, STRING_BOTH);
+ if (strcmp((*li)->buf, "\\1-grams:") == 0)
+ break;
+ }
+ if (*li == NULL) {
+ E_ERROR_SYSTEM("Failed to read \\1-grams: mark");
+ return -1;
+ }
+
+ if ((*n_ug <= 0) || (*n_bg < 0) || (*n_tg < 0)) {
+ E_ERROR("Bad or missing ngram count\n");
+ return -1;
+ }
+ return 0;
+}
+
+/*
+ * Read in the unigrams from given file into the LM structure model.
+ * On entry to this procedure, the iterator is positioned to the
+ * header line '\1-grams:'.
+ */
+static int
+ReadUnigrams(lineiter_t **li, ngram_model_arpa_t * model)
+{
+ ngram_model_t *base = &model->base;
+ int32 wcnt;
+ float p1;
+
+ E_INFO("Reading unigrams\n");
+
+ wcnt = 0;
+ while ((*li = lineiter_next(*li))) {
+ char *wptr[3], *name;
+ float32 bo_wt = 0.0f;
+ int n;
+
+ string_trim((*li)->buf, STRING_BOTH);
+ if (strcmp((*li)->buf, "\\2-grams:") == 0
+ || strcmp((*li)->buf, "\\end\\") == 0)
+ break;
+
+ if ((n = str2words((*li)->buf, wptr, 3)) < 2) {
+ if ((*li)->buf[0] != '\0')
+ E_WARN("Format error; unigram ignored: %s\n", (*li)->buf);
+ continue;
+ }
+ else {
+ p1 = (float)atof_c(wptr[0]);
+ name = wptr[1];
+ if (n == 3)
+ bo_wt = (float)atof_c(wptr[2]);
+ }
+
+ if (wcnt >= base->n_counts[0]) {
+ E_ERROR("Too many unigrams\n");
+ return -1;
+ }
+
+ /* Associate name with word id */
+ base->word_str[wcnt] = ckd_salloc(name);
+ if ((hash_table_enter(base->wid, base->word_str[wcnt], (void *)(long)wcnt))
+ != (void *)(long)wcnt) {
+ E_WARN("Duplicate word in dictionary: %s\n", base->word_str[wcnt]);
+ }
+ model->lm3g.unigrams[wcnt].prob1.l = logmath_log10_to_log(base->lmath, p1);
+ model->lm3g.unigrams[wcnt].bo_wt1.l = logmath_log10_to_log(base->lmath, bo_wt);
+ wcnt++;
+ }
+
+ if (base->n_counts[0] != wcnt) {
+ E_WARN("lm_t.ucount(%d) != #unigrams read(%d)\n",
+ base->n_counts[0], wcnt);
+ base->n_counts[0] = wcnt;
+ base->n_words = wcnt;
+ }
+ return 0;
+}
+
+/*
+ * Read bigrams from given file into given model structure.
+ */
+static int
+ReadBigrams(lineiter_t **li, ngram_model_arpa_t * model)
+{
+ ngram_model_t *base = &model->base;
+ int32 w1, w2, prev_w1, bgcount;
+ bigram_t *bgptr;
+
+ E_INFO("Reading bigrams\n");
+
+ bgcount = 0;
+ bgptr = model->lm3g.bigrams;
+ prev_w1 = -1;
+
+ while ((*li = lineiter_next(*li))) {
+ float32 p, bo_wt = 0.0f;
+ int32 p2, bo_wt2;
+ char *wptr[4], *word1, *word2;
+ int n;
+
+ string_trim((*li)->buf, STRING_BOTH);
+ wptr[3] = NULL;
+ if ((n = str2words((*li)->buf, wptr, 4)) < 3) {
+ if ((*li)->buf[0] != '\0')
+ break;
+ continue;
+ }
+ else {
+ p = (float32)atof_c(wptr[0]);
+ word1 = wptr[1];
+ word2 = wptr[2];
+ if (wptr[3])
+ bo_wt = (float32)atof_c(wptr[3]);
+ }
+
+ if ((w1 = ngram_wid(base, word1)) == NGRAM_INVALID_WID) {
+ E_ERROR("Unknown word: %s, skipping bigram (%s %s)\n",
+ word1, word1, word2);
+ continue;
+ }
+ if ((w2 = ngram_wid(base, word2)) == NGRAM_INVALID_WID) {
+ E_ERROR("Unknown word: %s, skipping bigram (%s %s)\n",
+ word2, word1, word2);
+ continue;
+ }
+
+ /* FIXME: Should use logmath_t quantization here. */
+ /* HACK!! to quantize probs to 4 decimal digits */
+ p = (float32)((int32)(p * 10000)) / 10000;
+ bo_wt = (float32)((int32)(bo_wt * 10000)) / 10000;
+
+ p2 = logmath_log10_to_log(base->lmath, p);
+ bo_wt2 = logmath_log10_to_log(base->lmath, bo_wt);
+
+ if (bgcount >= base->n_counts[1]) {
+ E_ERROR("Too many bigrams\n");
+ return -1;
+ }
+
+ bgptr->wid = w2;
+ bgptr->prob2 = sorted_id(&model->sorted_prob2, &p2);
+ if (base->n_counts[2] > 0)
+ bgptr->bo_wt2 = sorted_id(&model->sorted_bo_wt2, &bo_wt2);
+
+ if (w1 != prev_w1) {
+ if (w1 < prev_w1) {
+ E_ERROR("Bigram %s %s not in unigram order word id: %d prev word id: %d\n", word1, word2, w1, prev_w1);
+ return -1;
+ }
+
+ for (prev_w1++; prev_w1 <= w1; prev_w1++)
+ model->lm3g.unigrams[prev_w1].bigrams = bgcount;
+ prev_w1 = w1;
+ }
+ bgcount++;
+ bgptr++;
+
+ if ((bgcount & 0x0000ffff) == 0) {
+ E_INFOCONT(".");
+ }
+ }
+ if (*li == NULL || ((strcmp((*li)->buf, "\\end\\") != 0)
+ && (strcmp((*li)->buf, "\\3-grams:") != 0))) {
+ E_ERROR("Bad bigram: %s\n", (*li)->buf);
+ return -1;
+ }
+
+ for (prev_w1++; prev_w1 <= base->n_counts[0]; prev_w1++)
+ model->lm3g.unigrams[prev_w1].bigrams = bgcount;
+
+ return 0;
+}
+
+/*
+ * Very similar to ReadBigrams.
+ */
+static int
+ReadTrigrams(lineiter_t **li, ngram_model_arpa_t * model)
+{
+ ngram_model_t *base = &model->base;
+ int32 i, w1, w2, w3, prev_w1, prev_w2, tgcount, prev_bg, bg, endbg;
+ int32 seg, prev_seg, prev_seg_lastbg;
+ trigram_t *tgptr;
+ bigram_t *bgptr;
+
+ E_INFO("Reading trigrams\n");
+
+ tgcount = 0;
+ tgptr = model->lm3g.trigrams;
+ prev_w1 = -1;
+ prev_w2 = -1;
+ prev_bg = -1;
+ prev_seg = -1;
+
+ while ((*li = lineiter_next(*li))) {
+ float32 p;
+ int32 p3;
+ char *wptr[4], *word1, *word2, *word3;
+
+ string_trim((*li)->buf, STRING_BOTH);
+ if (str2words((*li)->buf, wptr, 4) != 4) {
+ if ((*li)->buf[0] != '\0')
+ break;
+ continue;
+ }
+ else {
+ p = (float32)atof_c(wptr[0]);
+ word1 = wptr[1];
+ word2 = wptr[2];
+ word3 = wptr[3];
+ }
+
+ if ((w1 = ngram_wid(base, word1)) == NGRAM_INVALID_WID) {
+ E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
+ word1, word1, word2, word3);
+ continue;
+ }
+ if ((w2 = ngram_wid(base, word2)) == NGRAM_INVALID_WID) {
+ E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
+ word2, word1, word2, word3);
+ continue;
+ }
+ if ((w3 = ngram_wid(base, word3)) == NGRAM_INVALID_WID) {
+ E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
+ word3, word1, word2, word3);
+ continue;
+ }
+
+ /* FIXME: Should use logmath_t quantization here. */
+ /* HACK!! to quantize probs to 4 decimal digits */
+ p = (float32)((int32)(p * 10000)) / 10000;
+ p3 = logmath_log10_to_log(base->lmath, p);
+
+ if (tgcount >= base->n_counts[2]) {
+ E_ERROR("Too many trigrams\n");
+ return -1;
+ }
+
+ tgptr->wid = w3;
+ tgptr->prob3 = sorted_id(&model->sorted_prob3, &p3);
+
+ if ((w1 != prev_w1) || (w2 != prev_w2)) {
+ /* Trigram for a new bigram; update tg info for all previous bigrams */
+ if ((w1 < prev_w1) || ((w1 == prev_w1) && (w2 < prev_w2))) {
+ E_ERROR("Trigrams not in bigram order\n");
+ return -1;
+ }
+
+ bg = (w1 !=
+ prev_w1) ? model->lm3g.unigrams[w1].bigrams : prev_bg + 1;
+ endbg = model->lm3g.unigrams[w1 + 1].bigrams;
+ bgptr = model->lm3g.bigrams + bg;
+ for (; (bg < endbg) && (bgptr->wid != w2); bg++, bgptr++);
+ if (bg >= endbg) {
+ E_ERROR("Missing bigram for trigram: %s", (*li)->buf);
+ return -1;
+ }
+
+ /* bg = bigram entry index for <w1,w2>. Update tseg_base */
+ seg = bg >> LOG_BG_SEG_SZ;
+ for (i = prev_seg + 1; i <= seg; i++)
+ model->lm3g.tseg_base[i] = tgcount;
+
+ /* Update trigrams pointers for all bigrams until bg */
+ if (prev_seg < seg) {
+ int32 tgoff = 0;
+
+ if (prev_seg >= 0) {
+ tgoff = tgcount - model->lm3g.tseg_base[prev_seg];
+ if (tgoff > 65535) {
+ E_ERROR("Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
+ return -1;
+ }
+ }
+
+ prev_seg_lastbg = ((prev_seg + 1) << LOG_BG_SEG_SZ) - 1;
+ bgptr = model->lm3g.bigrams + prev_bg;
+ for (++prev_bg, ++bgptr; prev_bg <= prev_seg_lastbg;
+ prev_bg++, bgptr++)
+ bgptr->trigrams = tgoff;
+
+ for (; prev_bg <= bg; prev_bg++, bgptr++)
+ bgptr->trigrams = 0;
+ }
+ else {
+ int32 tgoff;
+
+ tgoff = tgcount - model->lm3g.tseg_base[prev_seg];
+ if (tgoff > 65535) {
+ E_ERROR("Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
+ return -1;
+ }
+
+ bgptr = model->lm3g.bigrams + prev_bg;
+ for (++prev_bg, ++bgptr; prev_bg <= bg; prev_bg++, bgptr++)
+ bgptr->trigrams = tgoff;
+ }
+
+ prev_w1 = w1;
+ prev_w2 = w2;
+ prev_bg = bg;
+ prev_seg = seg;
+ }
+
+ tgcount++;
+ tgptr++;
+
+ if ((tgcount & 0x0000ffff) == 0) {
+ E_INFOCONT(".");
+ }
+ }
+ if (*li == NULL || strcmp((*li)->buf, "\\end\\") != 0) {
+ E_ERROR("Bad trigram: %s\n", (*li)->buf);
+ return -1;
+ }
+
+ for (prev_bg++; prev_bg <= base->n_counts[1]; prev_bg++) {
+ if ((prev_bg & (BG_SEG_SZ - 1)) == 0)
+ model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ] = tgcount;
+ if ((tgcount - model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ]) > 65535) {
+ E_ERROR("Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
+ return -1;
+ }
+ model->lm3g.bigrams[prev_bg].trigrams =
+ tgcount - model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ];
+ }
+ return 0;
+}
+
+static unigram_t *
+new_unigram_table(int32 n_ug)
+{
+ unigram_t *table;
+ int32 i;
+
+ table = ckd_calloc(n_ug, sizeof(unigram_t));
+ for (i = 0; i < n_ug; i++) {
+ table[i].prob1.l = INT_MIN;
+ table[i].bo_wt1.l = INT_MIN;
+ }
+ return table;
+}
+
+ngram_model_t *
+ngram_model_arpa_read(cmd_ln_t *config,
+ const char *file_name,
+ logmath_t *lmath)
+{
+ lineiter_t *li;
+ FILE *fp;
+ int32 is_pipe;
+ int32 n_unigram;
+ int32 n_bigram;
+ int32 n_trigram;
+ int32 n;
+ ngram_model_arpa_t *model;
+ ngram_model_t *base;
+
+ if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
+ E_ERROR("File %s not found\n", file_name);
+ return NULL;
+ }
+ li = lineiter_start(fp);
+
+ /* Read #unigrams, #bigrams, #trigrams from file */
+ if (ReadNgramCounts(&li, &n_unigram, &n_bigram, &n_trigram) == -1) {
+ lineiter_free(li);
+ fclose_comp(fp, is_pipe);
+ return NULL;
+ }
+ E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
+
+ /* Allocate space for LM, including initial OOVs and placeholders; initialize it */
+ model = ckd_calloc(1, sizeof(*model));
+ base = &model->base;
+ if (n_trigram > 0)
+ n = 3;
+ else if (n_bigram > 0)
+ n = 2;
+ else
+ n = 1;
+ /* Initialize base model. */
+ ngram_model_init(base, &ngram_model_arpa_funcs, lmath, n, n_unigram);
+ base->n_counts[0] = n_unigram;
+ base->n_counts[1] = n_bigram;
+ base->n_counts[2] = n_trigram;
+ base->writable = TRUE;
+
+ /*
+ * Allocate one extra unigram and bigram entry: sentinels to terminate
+ * followers (bigrams and trigrams, respectively) of previous entry.
+ */
+ model->lm3g.unigrams = new_unigram_table(n_unigram + 1);
+ model->lm3g.bigrams =
+ ckd_calloc(n_bigram + 1, sizeof(bigram_t));
+ if (n_trigram > 0)
+ model->lm3g.trigrams =
+ ckd_calloc(n_trigram, sizeof(trigram_t));
+
+ if (n_trigram > 0) {
+ model->lm3g.tseg_base =
+ ckd_calloc((n_bigram + 1) / BG_SEG_SZ + 1,
+ sizeof(int32));
+ }
+ if (ReadUnigrams(&li, model) == -1) {
+ fclose_comp(fp, is_pipe);
+ ngram_model_free(base);
+ return NULL;
+ }
+ E_INFO("%8d = #unigrams created\n", base->n_counts[0]);
+
+ if (base->n_counts[2] > 0)
+ init_sorted_list(&model->sorted_bo_wt2);
+
+ if (base->n_counts[1] > 0) {
+ init_sorted_list(&model->sorted_prob2);
+
+ if (ReadBigrams(&li, model) == -1) {
+ fclose_comp(fp, is_pipe);
+ ngram_model_free(base);
+ return NULL;
+ }
+
+ base->n_counts[1] = FIRST_BG(model, base->n_counts[0]);
+ model->lm3g.n_prob2 = model->sorted_prob2.free;
+ model->lm3g.prob2 = vals_in_sorted_list(&model->sorted_prob2);
+ free_sorted_list(&model->sorted_prob2);
+ E_INFO("%8d = #bigrams created\n", base->n_counts[1]);
+ E_INFO("%8d = #prob2 entries\n", model->lm3g.n_prob2);
+ }
+
+ if (base->n_counts[2] > 0) {
+ /* Create trigram bo-wts array */
+ model->lm3g.n_bo_wt2 = model->sorted_bo_wt2.free;
+ model->lm3g.bo_wt2 = vals_in_sorted_list(&model->sorted_bo_wt2);
+ free_sorted_list(&model->sorted_bo_wt2);
+ E_INFO("%8d = #bo_wt2 entries\n", model->lm3g.n_bo_wt2);
+
+ init_sorted_list(&model->sorted_prob3);
+
+ if (ReadTrigrams(&li, model) == -1) {
+ fclose_comp(fp, is_pipe);
+ ngram_model_free(base);
+ return NULL;
+ }
+
+ base->n_counts[2] = FIRST_TG(model, base->n_counts[1]);
+ model->lm3g.n_prob3 = model->sorted_prob3.free;
+ model->lm3g.prob3 = vals_in_sorted_list(&model->sorted_prob3);
+ E_INFO("%8d = #trigrams created\n", base->n_counts[2]);
+ E_INFO("%8d = #prob3 entries\n", model->lm3g.n_prob3);
+
+ free_sorted_list(&model->sorted_prob3);
+
+ /* Initialize tginfo */
+ model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *));
+ model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
+ }
+
+ lineiter_free(li);
+ fclose_comp(fp, is_pipe);
+ return base;
+}
+
+int
+ngram_model_arpa_write(ngram_model_t *model,
+ const char *file_name)
+{
+ ngram_iter_t *itor;
+ FILE *fh;
+ int i;
+
+ if ((fh = fopen(file_name, "w")) == NULL) {
+ E_ERROR_SYSTEM("Failed to open %s for writing", file_name);
+ return -1;
+ }
+ fprintf(fh, "This is an ARPA-format language model file, generated by CMU Sphinx\n");
+
+ /* The ARPA format doesn't require any extra information that
+ * N-Gram iterators can't give us, so this is very
+ * straightforward compared with DMP writing. */
+
+ /* Write N-gram counts. */
+ fprintf(fh, "\\data\\\n");
+ for (i = 0; i < model->n; ++i) {
+ fprintf(fh, "ngram %d=%d\n", i+1, model->n_counts[i]);
+ }
+
+ /* Write N-grams */
+ for (i = 0; i < model->n; ++i) {
+ fprintf(fh, "\n\\%d-grams:\n", i + 1);
+ for (itor = ngram_model_mgrams(model, i); itor; itor = ngram_iter_next(itor)) {
+ int32 const *wids;
+ int32 score, bowt;
+ int j;
+
+ wids = ngram_iter_get(itor, &score, &bowt);
+ fprintf(fh, "%.4f ", logmath_log_to_log10(model->lmath, score));
+ for (j = 0; j <= i; ++j) {
+ assert(wids[j] < model->n_counts[0]);
+ fprintf(fh, "%s ", model->word_str[wids[j]]);
+ }
+ if (i < model->n-1)
+ fprintf(fh, "%.4f", logmath_log_to_log10(model->lmath, bowt));
+ fprintf(fh, "\n");
+ }
+ }
+ fprintf(fh, "\n\\end\\\n");
+ return fclose(fh);
+}
+
+static int
+ngram_model_arpa_apply_weights(ngram_model_t *base, float32 lw,
+ float32 wip, float32 uw)
+{
+ ngram_model_arpa_t *model = (ngram_model_arpa_t *)base;
+ lm3g_apply_weights(base, &model->lm3g, lw, wip, uw);
+ return 0;
+}
+
+/* Lousy "templating" for things that are largely the same in DMP and
+ * ARPA models, except for the bigram and trigram types and some
+ * names. */
+#define NGRAM_MODEL_TYPE ngram_model_arpa_t
+#include "lm3g_templates.c"
+
+static void
+ngram_model_arpa_free(ngram_model_t *base)
+{
+ ngram_model_arpa_t *model = (ngram_model_arpa_t *)base;
+ ckd_free(model->lm3g.unigrams);
+ ckd_free(model->lm3g.bigrams);
+ ckd_free(model->lm3g.trigrams);
+ ckd_free(model->lm3g.prob2);
+ ckd_free(model->lm3g.bo_wt2);
+ ckd_free(model->lm3g.prob3);
+ lm3g_tginfo_free(base, &model->lm3g);
+ ckd_free(model->lm3g.tseg_base);
+}
+
+static ngram_funcs_t ngram_model_arpa_funcs = {
+ ngram_model_arpa_free, /* free */
+ ngram_model_arpa_apply_weights, /* apply_weights */
+ lm3g_template_score, /* score */
+ lm3g_template_raw_score, /* raw_score */
+ lm3g_template_add_ug, /* add_ug */
+ lm3g_template_flush, /* flush */
+ lm3g_template_iter, /* iter */
+ lm3g_template_mgrams, /* mgrams */
+ lm3g_template_successors, /* successors */
+ lm3g_template_iter_get, /* iter_get */
+ lm3g_template_iter_next, /* iter_next */
+ lm3g_template_iter_free /* iter_free */
+};