diff options
Diffstat (limited to 'media/sphinxbase/src/libsphinxbase/lm/ngram_model_dmp.c')
-rw-r--r-- | media/sphinxbase/src/libsphinxbase/lm/ngram_model_dmp.c | 969 |
1 files changed, 969 insertions, 0 deletions
diff --git a/media/sphinxbase/src/libsphinxbase/lm/ngram_model_dmp.c b/media/sphinxbase/src/libsphinxbase/lm/ngram_model_dmp.c new file mode 100644 index 000000000..c6a2d8b85 --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/lm/ngram_model_dmp.c @@ -0,0 +1,969 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2007 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ +/* + * \file ngram_model_dmp.c DMP format language models + * + * Author: David Huggins-Daines <dhuggins@cs.cmu.edu> + */ + +#include <assert.h> +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <limits.h> + +#include "sphinxbase/ckd_alloc.h" +#include "sphinxbase/pio.h" +#include "sphinxbase/err.h" +#include "sphinxbase/byteorder.h" +#include "sphinxbase/listelem_alloc.h" + +#include "ngram_model_dmp.h" + +static const char darpa_hdr[] = "Darpa Trigram LM"; +static ngram_funcs_t ngram_model_dmp_funcs; + +#define TSEG_BASE(m,b) ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ]) +#define FIRST_BG(m,u) ((m)->lm3g.unigrams[u].bigrams) +#define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams)) + +static unigram_t * +new_unigram_table(int32 n_ug) +{ + unigram_t *table; + int32 i; + + table = ckd_calloc(n_ug, sizeof(unigram_t)); + for (i = 0; i < n_ug; i++) { + table[i].prob1.f = -99.0; + table[i].bo_wt1.f = -99.0; + } + return table; +} + +ngram_model_t * +ngram_model_dmp_read(cmd_ln_t *config, + const char *file_name, + logmath_t *lmath) +{ + ngram_model_t *base; + ngram_model_dmp_t *model; + FILE *fp; + int do_mmap, do_swap; + int32 is_pipe; + int32 i, j, k, vn, n, ts; + int32 n_unigram; + int32 n_bigram; + int32 n_trigram; + char str[1024]; + unigram_t *ugptr; + bigram_t *bgptr; + trigram_t *tgptr; + char *tmp_word_str; + char *map_base = NULL; + size_t offset = 0; + + base = NULL; + do_mmap = FALSE; + if (config) + do_mmap = cmd_ln_boolean_r(config, "-mmap"); + + if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) { + E_ERROR("Dump file %s not found\n", file_name); + goto error_out; + } + + if (is_pipe && do_mmap) { + E_WARN("Dump file is compressed, will not use memory-mapped I/O\n"); + do_mmap = 0; + } + + do_swap = FALSE; + if (fread(&k, sizeof(k), 1, fp) != 1) + goto error_out; + if (k != strlen(darpa_hdr)+1) { + SWAP_INT32(&k); + if (k != strlen(darpa_hdr)+1) { + E_ERROR("Wrong magic header size number %x: %s is not a dump file\n", k, file_name); + goto error_out; + } + do_swap = 1; + } + if (fread(str, 1, k, fp) != (size_t) k) { + E_ERROR("Cannot read header\n"); + goto error_out; + } + if (strncmp(str, darpa_hdr, k) != 0) { + E_ERROR("Wrong header %s: %s is not a dump file\n", darpa_hdr); + goto error_out; + } + + if (do_mmap) { + if (do_swap) { + E_INFO + ("Byteswapping required, will not use memory-mapped I/O for LM file\n"); + do_mmap = 0; + } + else { + E_INFO("Will use memory-mapped I/O for LM file\n"); +#ifdef __ADSPBLACKFIN__ /* This is true for both VisualDSP++ and uClinux. */ + E_FATAL("memory mapping is not supported at the moment."); +#else +#endif + } + } + + if (fread(&k, sizeof(k), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&k); + if (fread(str, 1, k, fp) != (size_t) k) { + E_ERROR("Cannot read LM filename in header\n"); + goto error_out; + } + + /* read version#, if present (must be <= 0) */ + if (fread(&vn, sizeof(vn), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&vn); + if (vn <= 0) { + /* read and don't compare timestamps (we don't care) */ + if (fread(&ts, sizeof(ts), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&ts); + + /* read and skip format description */ + for (;;) { + if (fread(&k, sizeof(k), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&k); + if (k == 0) + break; + if (fread(str, 1, k, fp) != (size_t) k) { + E_ERROR("Failed to read word\n"); + goto error_out; + } + } + /* read model->ucount */ + if (fread(&n_unigram, sizeof(n_unigram), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&n_unigram); + } + else { + n_unigram = vn; + } + + /* read model->bcount, tcount */ + if (fread(&n_bigram, sizeof(n_bigram), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&n_bigram); + if (fread(&n_trigram, sizeof(n_trigram), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&n_trigram); + E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram); + + /* Allocate space for LM, including initial OOVs and placeholders; initialize it */ + model = ckd_calloc(1, sizeof(*model)); + base = &model->base; + if (n_trigram > 0) + n = 3; + else if (n_bigram > 0) + n = 2; + else + n = 1; + ngram_model_init(base, &ngram_model_dmp_funcs, lmath, n, n_unigram); + base->n_counts[0] = n_unigram; + base->n_counts[1] = n_bigram; + base->n_counts[2] = n_trigram; + + /* read unigrams (always in memory, as they contain dictionary + * mappings that can't be precomputed, and also could have OOVs added) */ + model->lm3g.unigrams = new_unigram_table(n_unigram + 1); + ugptr = model->lm3g.unigrams; + for (i = 0; i <= n_unigram; ++i) { + /* Skip over the mapping ID, we don't care about it. */ + if (fread(ugptr, sizeof(int32), 1, fp) != 1) { + E_ERROR("Failed to read maping id %d\n", i); + goto error_out; + } + /* Read the actual unigram structure. */ + if (fread(ugptr, sizeof(unigram_t), 1, fp) != 1) { + E_ERROR("Failed to read unigrams data\n"); + ngram_model_free(base); + fclose_comp(fp, is_pipe); + return NULL; + } + /* Byte swap if necessary. */ + if (do_swap) { + SWAP_INT32(&ugptr->prob1.l); + SWAP_INT32(&ugptr->bo_wt1.l); + SWAP_INT32(&ugptr->bigrams); + } + /* Convert values to log. */ + ugptr->prob1.l = logmath_log10_to_log(lmath, ugptr->prob1.f); + ugptr->bo_wt1.l = logmath_log10_to_log(lmath, ugptr->bo_wt1.f); + E_DEBUG(2, ("ug %d: prob %d bo %d bigrams %d\n", + i, ugptr->prob1.l, ugptr->bo_wt1.l, ugptr->bigrams)); + ++ugptr; + } + E_INFO("%8d = LM.unigrams(+trailer) read\n", n_unigram); + + /* Now mmap() the file and read in the rest of the (read-only) stuff. */ + if (do_mmap) { + offset = ftell(fp); + + /* Check for improper word alignment. */ + if (offset & 0x3) { + E_WARN("-mmap specified, but trigram index is not word-aligned. Will not memory-map.\n"); + do_mmap = FALSE; + } + else { + model->dump_mmap = mmio_file_read(file_name); + if (model->dump_mmap == NULL) { + do_mmap = FALSE; + } + else { + map_base = mmio_file_ptr(model->dump_mmap); + } + } + } + + if (n_bigram > 0) { + /* read bigrams */ + if (do_mmap) { + model->lm3g.bigrams = (bigram_t *) (map_base + offset); + offset += (n_bigram + 1) * sizeof(bigram_t); + } + else { + model->lm3g.bigrams = + ckd_calloc(n_bigram + 1, sizeof(bigram_t)); + if (fread(model->lm3g.bigrams, sizeof(bigram_t), n_bigram + 1, fp) + != (size_t) n_bigram + 1) { + E_ERROR("Failed to read bigrams data\n"); + goto error_out; + } + if (do_swap) { + for (i = 0, bgptr = model->lm3g.bigrams; i <= n_bigram; + i++, bgptr++) { + SWAP_INT16(&bgptr->wid); + SWAP_INT16(&bgptr->prob2); + SWAP_INT16(&bgptr->bo_wt2); + SWAP_INT16(&bgptr->trigrams); + } + } + } + E_INFO("%8d = LM.bigrams(+trailer) read\n", n_bigram); + } + + /* read trigrams */ + if (n_trigram > 0) { + if (do_mmap) { + model->lm3g.trigrams = (trigram_t *) (map_base + offset); + offset += n_trigram * sizeof(trigram_t); + } + else { + model->lm3g.trigrams = + ckd_calloc(n_trigram, sizeof(trigram_t)); + if (fread + (model->lm3g.trigrams, sizeof(trigram_t), n_trigram, fp) + != (size_t) n_trigram) { + E_ERROR("Failed to read trigrams data\n"); + goto error_out; + } + if (do_swap) { + for (i = 0, tgptr = model->lm3g.trigrams; i < n_trigram; + i++, tgptr++) { + SWAP_INT16(&tgptr->wid); + SWAP_INT16(&tgptr->prob3); + } + } + } + E_INFO("%8d = LM.trigrams read\n", n_trigram); + /* Initialize tginfo */ + model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *)); + model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t)); + } + + if (n_bigram > 0) { + /* read n_prob2 and prob2 array (in memory) */ + if (do_mmap) + fseek(fp, offset, SEEK_SET); + if (fread(&k, sizeof(k), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&k); + model->lm3g.n_prob2 = k; + model->lm3g.prob2 = ckd_calloc(k, sizeof(*model->lm3g.prob2)); + if (fread(model->lm3g.prob2, sizeof(*model->lm3g.prob2), k, fp) != (size_t) k) { + E_ERROR("fread(prob2) failed\n"); + goto error_out; + } + for (i = 0; i < k; i++) { + if (do_swap) + SWAP_INT32(&model->lm3g.prob2[i].l); + /* Convert values to log. */ + model->lm3g.prob2[i].l = logmath_log10_to_log(lmath, model->lm3g.prob2[i].f); + } + E_INFO("%8d = LM.prob2 entries read\n", k); + } + + /* read n_bo_wt2 and bo_wt2 array (in memory) */ + if (base->n > 2) { + if (fread(&k, sizeof(k), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&k); + model->lm3g.n_bo_wt2 = k; + model->lm3g.bo_wt2 = ckd_calloc(k, sizeof(*model->lm3g.bo_wt2)); + if (fread(model->lm3g.bo_wt2, sizeof(*model->lm3g.bo_wt2), k, fp) != (size_t) k) { + E_ERROR("Failed to read backoff weights\n"); + goto error_out; + } + for (i = 0; i < k; i++) { + if (do_swap) + SWAP_INT32(&model->lm3g.bo_wt2[i].l); + /* Convert values to log. */ + model->lm3g.bo_wt2[i].l = logmath_log10_to_log(lmath, model->lm3g.bo_wt2[i].f); + } + E_INFO("%8d = LM.bo_wt2 entries read\n", k); + } + + /* read n_prob3 and prob3 array (in memory) */ + if (base->n > 2) { + if (fread(&k, sizeof(k), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&k); + model->lm3g.n_prob3 = k; + model->lm3g.prob3 = ckd_calloc(k, sizeof(*model->lm3g.prob3)); + if (fread(model->lm3g.prob3, sizeof(*model->lm3g.prob3), k, fp) != (size_t) k) { + E_ERROR("Failed to read trigram probability\n"); + goto error_out; + } + for (i = 0; i < k; i++) { + if (do_swap) + SWAP_INT32(&model->lm3g.prob3[i].l); + /* Convert values to log. */ + model->lm3g.prob3[i].l = logmath_log10_to_log(lmath, model->lm3g.prob3[i].f); + } + E_INFO("%8d = LM.prob3 entries read\n", k); + } + + /* read tseg_base size and tseg_base */ + if (do_mmap) + offset = ftell(fp); + if (n_trigram > 0) { + if (do_mmap) { + memcpy(&k, map_base + offset, sizeof(k)); + offset += sizeof(int32); + model->lm3g.tseg_base = (int32 *) (map_base + offset); + offset += k * sizeof(int32); + } + else { + k = (n_bigram + 1) / BG_SEG_SZ + 1; + if (fread(&k, sizeof(k), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&k); + model->lm3g.tseg_base = ckd_calloc(k, sizeof(int32)); + if (fread(model->lm3g.tseg_base, sizeof(int32), k, fp) != + (size_t) k) { + E_ERROR("Failed to read trigram index\n"); + goto error_out; + } + if (do_swap) + for (i = 0; i < k; i++) + SWAP_INT32(&model->lm3g.tseg_base[i]); + } + E_INFO("%8d = LM.tseg_base entries read\n", k); + } + + /* read ascii word strings */ + if (do_mmap) { + memcpy(&k, map_base + offset, sizeof(k)); + offset += sizeof(int32); + tmp_word_str = (char *) (map_base + offset); + offset += k; + } + else { + base->writable = TRUE; + if (fread(&k, sizeof(k), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&k); + tmp_word_str = ckd_calloc(k, 1); + if (fread(tmp_word_str, 1, k, fp) != (size_t) k) { + E_ERROR("Failed to read words\n"); + goto error_out; + } + } + + /* First make sure string just read contains n_counts[0] words (PARANOIA!!) */ + for (i = 0, j = 0; i < k; i++) + if (tmp_word_str[i] == '\0') + j++; + if (j != n_unigram) { + E_ERROR("Error reading word strings (%d doesn't match n_unigrams %d)\n", + j, n_unigram); + goto error_out; + } + + /* Break up string just read into words */ + if (do_mmap) { + j = 0; + for (i = 0; i < n_unigram; i++) { + base->word_str[i] = tmp_word_str + j; + if (hash_table_enter(base->wid, base->word_str[i], + (void *)(long)i) != (void *)(long)i) { + E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]); + } + j += strlen(base->word_str[i]) + 1; + } + } + else { + j = 0; + for (i = 0; i < n_unigram; i++) { + base->word_str[i] = ckd_salloc(tmp_word_str + j); + if (hash_table_enter(base->wid, base->word_str[i], + (void *)(long)i) != (void *)(long)i) { + E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]); + } + j += strlen(base->word_str[i]) + 1; + } + free(tmp_word_str); + } + E_INFO("%8d = ascii word strings read\n", i); + + fclose_comp(fp, is_pipe); + return base; + +error_out: + if (fp) + fclose_comp(fp, is_pipe); + ngram_model_free(base); + return NULL; +} + +ngram_model_dmp_t * +ngram_model_dmp_build(ngram_model_t *base) +{ + ngram_model_dmp_t *model; + ngram_model_t *newbase; + ngram_iter_t *itor; + sorted_list_t sorted_prob2; + sorted_list_t sorted_bo_wt2; + sorted_list_t sorted_prob3; + bigram_t *bgptr; + trigram_t *tgptr; + int i, bgcount, tgcount, seg; + + if (base->funcs == &ngram_model_dmp_funcs) { + E_INFO("Using existing DMP model.\n"); + return (ngram_model_dmp_t *)ngram_model_retain(base); + } + + /* Initialize new base model structure with params from base. */ + E_INFO("Building DMP model...\n"); + model = ckd_calloc(1, sizeof(*model)); + newbase = &model->base; + ngram_model_init(newbase, &ngram_model_dmp_funcs, + logmath_retain(base->lmath), + base->n, base->n_counts[0]); + /* Copy N-gram counts over. */ + memcpy(newbase->n_counts, base->n_counts, + base->n * sizeof(*base->n_counts)); + /* Make sure word strings are freed. */ + newbase->writable = TRUE; + /* Initialize unigram table and string table. */ + model->lm3g.unigrams = new_unigram_table(newbase->n_counts[0] + 1); + for (itor = ngram_model_mgrams(base, 0); itor; + itor = ngram_iter_next(itor)) { + int32 prob1, bo_wt1; + int32 const *wids; + + /* Can't guarantee they will go in unigram order, so just to + * be correct, we do this... */ + wids = ngram_iter_get(itor, &prob1, &bo_wt1); + model->lm3g.unigrams[wids[0]].prob1.l = prob1; + model->lm3g.unigrams[wids[0]].bo_wt1.l = bo_wt1; + newbase->word_str[wids[0]] = ckd_salloc(ngram_word(base, wids[0])); + if ((hash_table_enter_int32(newbase->wid, + newbase->word_str[wids[0]], wids[0])) + != wids[0]) { + E_WARN("Duplicate word in dictionary: %s\n", newbase->word_str[wids[0]]); + } + } + E_INFO("%8d = #unigrams created\n", newbase->n_counts[0]); + + if (newbase->n < 2) + return model; + + /* Construct quantized probability table for bigrams and + * (optionally) trigrams. Hesitate to use the "sorted list" thing + * since it isn't so useful, but it's there already. */ + init_sorted_list(&sorted_prob2); + if (newbase->n > 2) { + init_sorted_list(&sorted_bo_wt2); + init_sorted_list(&sorted_prob3); + } + /* Construct bigram and trigram arrays. */ + bgptr = model->lm3g.bigrams = ckd_calloc(newbase->n_counts[1] + 1, sizeof(bigram_t)); + if (newbase->n > 2) { + tgptr = model->lm3g.trigrams = ckd_calloc(newbase->n_counts[2], sizeof(trigram_t)); + model->lm3g.tseg_base = + ckd_calloc((newbase->n_counts[1] + 1) / BG_SEG_SZ + 1, sizeof(int32)); + } + else + tgptr = NULL; + /* Since bigrams and trigrams have to be contiguous with others + * with the same N-1-gram, we traverse them in depth-first order + * to build the bigram and trigram arrays. */ + for (i = 0; i < newbase->n_counts[0]; ++i) { + ngram_iter_t *uitor; + bgcount = bgptr - model->lm3g.bigrams; + /* First bigram index (same as next if no bigrams...) */ + model->lm3g.unigrams[i].bigrams = bgcount; + E_DEBUG(2, ("unigram %d: %s => bigram %d\n", i, newbase->word_str[i], bgcount)); + /* All bigrams corresponding to unigram i */ + uitor = ngram_ng_iter(base, i, NULL, 0); + for (itor = ngram_iter_successors(uitor); + itor; ++bgptr, itor = ngram_iter_next(itor)) { + int32 prob2, bo_wt2; + int32 const *wids; + ngram_iter_t *titor; + + wids = ngram_iter_get(itor, &prob2, &bo_wt2); + + assert (bgptr - model->lm3g.bigrams < newbase->n_counts[1]); + + bgptr->wid = wids[1]; + bgptr->prob2 = sorted_id(&sorted_prob2, &prob2); + if (newbase->n > 2) { + tgcount = (tgptr - model->lm3g.trigrams); + bgcount = (bgptr - model->lm3g.bigrams); + + /* Backoff weight (only if there are trigrams...) */ + bgptr->bo_wt2 = sorted_id(&sorted_bo_wt2, &bo_wt2); + + /* Find bigram segment for this bigram (this isn't + * used unless there are trigrams) */ + seg = bgcount >> LOG_BG_SEG_SZ; + /* If we just crossed a bigram segment boundary, then + * point tseg_base for the new segment to the current + * trigram pointer. */ + if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ) + model->lm3g.tseg_base[seg] = tgcount; + /* Now calculate the trigram offset. */ + bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg]; + E_DEBUG(2, ("bigram %d %s %s => trigram %d:%d\n", + bgcount, + newbase->word_str[wids[0]], + newbase->word_str[wids[1]], + seg, bgptr->trigrams)); + + /* And fill in successors' trigram info. */ + for (titor = ngram_iter_successors(itor); + titor; ++tgptr, titor = ngram_iter_next(titor)) { + int32 prob3, dummy; + + assert(tgptr - model->lm3g.trigrams < newbase->n_counts[2]); + wids = ngram_iter_get(titor, &prob3, &dummy); + tgptr->wid = wids[2]; + tgptr->prob3 = sorted_id(&sorted_prob3, &prob3); + E_DEBUG(2, ("trigram %d %s %s %s => prob %d\n", + tgcount, + newbase->word_str[wids[0]], + newbase->word_str[wids[1]], + newbase->word_str[wids[2]], + tgptr->prob3)); + } + } + } + ngram_iter_free(uitor); + } + /* Add sentinal unigram and bigram records. */ + bgcount = bgptr - model->lm3g.bigrams; + tgcount = tgptr - model->lm3g.trigrams; + seg = bgcount >> LOG_BG_SEG_SZ; + if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ) + model->lm3g.tseg_base[seg] = tgcount; + model->lm3g.unigrams[i].bigrams = bgcount; + if (newbase->n > 2) + bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg]; + + /* Now create probability tables. */ + model->lm3g.n_prob2 = sorted_prob2.free; + model->lm3g.prob2 = vals_in_sorted_list(&sorted_prob2); + E_INFO("%8d = #bigrams created\n", newbase->n_counts[1]); + E_INFO("%8d = #prob2 entries\n", model->lm3g.n_prob2); + free_sorted_list(&sorted_prob2); + if (newbase->n > 2) { + /* Create trigram bo-wts array. */ + model->lm3g.n_bo_wt2 = sorted_bo_wt2.free; + model->lm3g.bo_wt2 = vals_in_sorted_list(&sorted_bo_wt2); + free_sorted_list(&sorted_bo_wt2); + E_INFO("%8d = #bo_wt2 entries\n", model->lm3g.n_bo_wt2); + /* Create trigram probability table. */ + model->lm3g.n_prob3 = sorted_prob3.free; + model->lm3g.prob3 = vals_in_sorted_list(&sorted_prob3); + E_INFO("%8d = #trigrams created\n", newbase->n_counts[2]); + E_INFO("%8d = #prob3 entries\n", model->lm3g.n_prob3); + free_sorted_list(&sorted_prob3); + /* Initialize tginfo */ + model->lm3g.tginfo = ckd_calloc(newbase->n_counts[0], sizeof(tginfo_t *)); + model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t)); + } + + return model; +} + +static void +fwrite_int32(FILE *fh, int32 val) +{ + fwrite(&val, 4, 1, fh); +} + +static void +fwrite_ug(FILE *fh, unigram_t *ug, logmath_t *lmath) +{ + int32 bogus = -1; + float32 log10val; + + /* Bogus dictionary mapping field. */ + fwrite(&bogus, 4, 1, fh); + /* Convert values to log10. */ + log10val = logmath_log_to_log10(lmath, ug->prob1.l); + fwrite(&log10val, 4, 1, fh); + log10val = logmath_log_to_log10(lmath, ug->bo_wt1.l); + fwrite(&log10val, 4, 1, fh); + fwrite_int32(fh, ug->bigrams); +} + +static void +fwrite_bg(FILE *fh, bigram_t *bg) +{ + fwrite(bg, sizeof(*bg), 1, fh); +} + +static void +fwrite_tg(FILE *fh, trigram_t *tg) +{ + fwrite(tg, sizeof(*tg), 1, fh); +} + +/** Please look at the definition of + */ +static char const *fmtdesc[] = { + "BEGIN FILE FORMAT DESCRIPTION", + "Header string length (int32) and string (including trailing 0)", + "Original LM filename string-length (int32) and filename (including trailing 0)", + "(int32) version number (present iff value <= 0)", + "(int32) original LM file modification timestamp (iff version# present)", + "(int32) string-length and string (including trailing 0) (iff version# present)", + "... previous entry continued any number of times (iff version# present)", + "(int32) 0 (terminating sequence of strings) (iff version# present)", + "(int32) log_bg_seg_sz (present iff different from default value of LOG2_BG_SEG_SZ)", + "(int32) lm_t.ucount (must be > 0)", + "(int32) lm_t.bcount", + "(int32) lm_t.tcount", + "lm_t.ucount+1 unigrams (including sentinel)", + "lm_t.bcount+1 bigrams (including sentinel 64 bits (bg_t) each if version=-1/-2, 128 bits (bg32_t) each if version=-3", + "lm_t.tcount trigrams (present iff lm_t.tcount > 0 32 bits (tg_t) each if version=-1/-2, 64 bits (tg32_t) each if version=-3)", + "(int32) lm_t.n_prob2", + "(int32) lm_t.prob2[]", + "(int32) lm_t.n_bo_wt2 (present iff lm_t.tcount > 0)", + "(int32) lm_t.bo_wt2[] (present iff lm_t.tcount > 0)", + "(int32) lm_t.n_prob3 (present iff lm_t.tcount > 0)", + "(int32) lm_t.prob3[] (present iff lm_t.tcount > 0)", + "(int32) (lm_t.bcount+1)/BG_SEG_SZ+1 (present iff lm_t.tcount > 0)", + "(int32) lm_t.tseg_base[] (present iff lm_t.tcount > 0)", + "(int32) Sum(all word string-lengths, including trailing 0 for each)", + "All word strings (including trailing 0 for each)", + "END FILE FORMAT DESCRIPTION", + NULL, +}; + +static void +ngram_model_dmp_write_header(FILE * fh) +{ + int32 k; + k = strlen(darpa_hdr) + 1; + fwrite_int32(fh, k); + fwrite(darpa_hdr, 1, k, fh); +} + +static void +ngram_model_dmp_write_lm_filename(FILE * fh, const char *lmfile) +{ + int32 k; + + k = strlen(lmfile) + 1; + fwrite_int32(fh, k); + fwrite(lmfile, 1, k, fh); +} + +#define LMDMP_VERSION_TG_16BIT -1 /**< VERSION 1 is the simplest DMP file which + is trigram or lower which used 16 bits in + bigram and trigram.*/ + +static void +ngram_model_dmp_write_version(FILE * fh, int32 mtime) +{ + fwrite_int32(fh, LMDMP_VERSION_TG_16BIT); /* version # */ + fwrite_int32(fh, mtime); +} + +static void +ngram_model_dmp_write_ngram_counts(FILE * fh, ngram_model_t *model) +{ + fwrite_int32(fh, model->n_counts[0]); + fwrite_int32(fh, model->n_counts[1]); + fwrite_int32(fh, model->n_counts[2]); +} + +static void +ngram_model_dmp_write_fmtdesc(FILE * fh) +{ + int32 i, k; + long pos; + + /* Write file format description into header */ + for (i = 0; fmtdesc[i] != NULL; i++) { + k = strlen(fmtdesc[i]) + 1; + fwrite_int32(fh, k); + fwrite(fmtdesc[i], 1, k, fh); + } + /* Pad it out in order to achieve 32-bit alignment */ + pos = ftell(fh); + k = pos & 3; + if (k) { + fwrite_int32(fh, 4-k); + fwrite("!!!!", 1, 4-k, fh); + } + fwrite_int32(fh, 0); +} + +static void +ngram_model_dmp_write_unigram(FILE *fh, ngram_model_t *model) +{ + ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; + int32 i; + + for (i = 0; i <= model->n_counts[0]; i++) { + fwrite_ug(fh, &(lm->lm3g.unigrams[i]), model->lmath); + } +} + + +static void +ngram_model_dmp_write_bigram(FILE *fh, ngram_model_t *model) +{ + ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; + int32 i; + + for (i = 0; i <= model->n_counts[1]; i++) { + fwrite_bg(fh, &(lm->lm3g.bigrams[i])); + } + +} + +static void +ngram_model_dmp_write_trigram(FILE *fh, ngram_model_t *model) +{ + ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; + int32 i; + + for (i = 0; i < model->n_counts[2]; i++) { + fwrite_tg(fh, &(lm->lm3g.trigrams[i])); + } +} + +static void +ngram_model_dmp_write_bgprob(FILE *fh, ngram_model_t *model) +{ + ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; + int32 i; + + fwrite_int32(fh, lm->lm3g.n_prob2); + for (i = 0; i < lm->lm3g.n_prob2; i++) { + float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.prob2[i].l); + fwrite(&log10val, 4, 1, fh); + } +} + +static void +ngram_model_dmp_write_tgbowt(FILE *fh, ngram_model_t *model) +{ + ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; + int32 i; + + fwrite_int32(fh, lm->lm3g.n_bo_wt2); + for (i = 0; i < lm->lm3g.n_bo_wt2; i++) { + float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.bo_wt2[i].l); + fwrite(&log10val, 4, 1, fh); + } +} + +static void +ngram_model_dmp_write_tgprob(FILE *fh, ngram_model_t *model) +{ + ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; + int32 i; + + fwrite_int32(fh, lm->lm3g.n_prob3); + for (i = 0; i < lm->lm3g.n_prob3; i++) { + float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.prob3[i].l); + fwrite(&log10val, 4, 1, fh); + } +} + +static void +ngram_model_dmp_write_tg_segbase(FILE *fh, ngram_model_t *model) +{ + ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; + int32 i, k; + + k = (model->n_counts[1] + 1) / BG_SEG_SZ + 1; + fwrite_int32(fh, k); + for (i = 0; i < k; i++) + fwrite_int32(fh, lm->lm3g.tseg_base[i]); +} + +static void +ngram_model_dmp_write_wordstr(FILE *fh, ngram_model_t *model) +{ + int32 i, k; + + k = 0; + for (i = 0; i < model->n_counts[0]; i++) + k += strlen(model->word_str[i]) + 1; + fwrite_int32(fh, k); + for (i = 0; i < model->n_counts[0]; i++) + fwrite(model->word_str[i], 1, + strlen(model->word_str[i]) + 1, fh); +} + +int +ngram_model_dmp_write(ngram_model_t *base, + const char *file_name) +{ + ngram_model_dmp_t *model; + ngram_model_t *newbase; + FILE *fh; + + /* First, construct a DMP model from the base model. */ + model = ngram_model_dmp_build(base); + newbase = &model->base; + + /* Now write it, confident in the knowledge that it's the right + * kind of language model internally. */ + if ((fh = fopen(file_name, "wb")) == NULL) { + E_ERROR("Cannot create file %s\n", file_name); + return -1; + } + ngram_model_dmp_write_header(fh); + ngram_model_dmp_write_lm_filename(fh, file_name); + ngram_model_dmp_write_version(fh, 0); + ngram_model_dmp_write_fmtdesc(fh); + ngram_model_dmp_write_ngram_counts(fh, newbase); + ngram_model_dmp_write_unigram(fh, newbase); + if (newbase->n > 1) { + ngram_model_dmp_write_bigram(fh, newbase); + if (newbase->n > 2) { + ngram_model_dmp_write_trigram(fh, newbase); + } + ngram_model_dmp_write_bgprob(fh, newbase); + if (newbase->n > 2) { + ngram_model_dmp_write_tgbowt(fh, newbase); + ngram_model_dmp_write_tgprob(fh, newbase); + ngram_model_dmp_write_tg_segbase(fh, newbase); + } + } + ngram_model_dmp_write_wordstr(fh, newbase); + ngram_model_free(newbase); + + return fclose(fh); +} + +static int +ngram_model_dmp_apply_weights(ngram_model_t *base, float32 lw, + float32 wip, float32 uw) +{ + ngram_model_dmp_t *model = (ngram_model_dmp_t *)base; + lm3g_apply_weights(base, &model->lm3g, lw, wip, uw); + return 0; +} + +/* Lousy "templating" for things that are largely the same in DMP and + * ARPA models, except for the bigram and trigram types and some + * names. */ +#define NGRAM_MODEL_TYPE ngram_model_dmp_t +#include "lm3g_templates.c" + +static void +ngram_model_dmp_free(ngram_model_t *base) +{ + ngram_model_dmp_t *model = (ngram_model_dmp_t *)base; + + ckd_free(model->lm3g.unigrams); + ckd_free(model->lm3g.prob2); + if (model->dump_mmap) { + mmio_file_unmap(model->dump_mmap); + } + else { + ckd_free(model->lm3g.bigrams); + if (base->n > 2) { + ckd_free(model->lm3g.trigrams); + ckd_free(model->lm3g.tseg_base); + } + } + if (base->n > 2) { + ckd_free(model->lm3g.bo_wt2); + ckd_free(model->lm3g.prob3); + } + + lm3g_tginfo_free(base, &model->lm3g); +} + +static ngram_funcs_t ngram_model_dmp_funcs = { + ngram_model_dmp_free, /* free */ + ngram_model_dmp_apply_weights, /* apply_weights */ + lm3g_template_score, /* score */ + lm3g_template_raw_score, /* raw_score */ + lm3g_template_add_ug, /* add_ug */ + lm3g_template_flush, /* flush */ + lm3g_template_iter, /* iter */ + lm3g_template_mgrams, /* mgrams */ + lm3g_template_successors, /* successors */ + lm3g_template_iter_get, /* iter_get */ + lm3g_template_iter_next, /* iter_next */ + lm3g_template_iter_free /* iter_free */ +}; |