diff options
Diffstat (limited to 'media/sphinxbase/src/libsphinxbase/lm')
18 files changed, 11621 insertions, 0 deletions
diff --git a/media/sphinxbase/src/libsphinxbase/lm/fsg_model.c b/media/sphinxbase/src/libsphinxbase/lm/fsg_model.c new file mode 100644 index 000000000..374897754 --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/lm/fsg_model.c @@ -0,0 +1,944 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2004 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ + +/* System headers. 
*/ +#ifdef _WIN32_WCE +/*MC in a debug build it's implicitly included by assert.h + but you need this in a release build */ +#include <windows.h> +#else +#include <time.h> +#endif /* _WIN32_WCE */ +#include <stdio.h> +#include <string.h> +#include <assert.h> + +/* SphinxBase headers. */ +#include "sphinxbase/err.h" +#include "sphinxbase/pio.h" +#include "sphinxbase/ckd_alloc.h" +#include "sphinxbase/prim_type.h" +#include "sphinxbase/strfuncs.h" +#include "sphinxbase/hash_table.h" +#include "sphinxbase/fsg_model.h" + +/** + * Adjacency list (opaque) for a state in an FSG. + * + * Actually we use hash tables so that random access is a bit faster. + * Plus it allows us to make the lookup code a bit less ugly. + */ + +struct trans_list_s { + hash_table_t *null_trans; /* Null transitions keyed by state. */ + hash_table_t *trans; /* Lists of non-null transitions keyed by state. */ +}; + +/** + * Implementation of arc iterator. + */ +struct fsg_arciter_s { + hash_iter_t *itor, *null_itor; + gnode_t *gn; +}; + +#define FSG_MODEL_BEGIN_DECL "FSG_BEGIN" +#define FSG_MODEL_END_DECL "FSG_END" +#define FSG_MODEL_N_DECL "N" +#define FSG_MODEL_NUM_STATES_DECL "NUM_STATES" +#define FSG_MODEL_S_DECL "S" +#define FSG_MODEL_START_STATE_DECL "START_STATE" +#define FSG_MODEL_F_DECL "F" +#define FSG_MODEL_FINAL_STATE_DECL "FINAL_STATE" +#define FSG_MODEL_T_DECL "T" +#define FSG_MODEL_TRANSITION_DECL "TRANSITION" +#define FSG_MODEL_COMMENT_CHAR '#' + + +static int32 +nextline_str2words(FILE * fp, int32 * lineno, + char **lineptr, char ***wordptr) +{ + for (;;) { + size_t len; + int32 n; + + ckd_free(*lineptr); + if ((*lineptr = fread_line(fp, &len)) == NULL) + return -1; + + (*lineno)++; + + if ((*lineptr)[0] == FSG_MODEL_COMMENT_CHAR) + continue; /* Skip comment lines */ + + n = str2words(*lineptr, NULL, 0); + if (n == 0) + continue; /* Skip blank lines */ + + /* Abuse of realloc(), but this doesn't have to be fast. 
*/ + if (*wordptr == NULL) + *wordptr = ckd_calloc(n, sizeof(**wordptr)); + else + *wordptr = ckd_realloc(*wordptr, n * sizeof(**wordptr)); + return str2words(*lineptr, *wordptr, n); + } +} + +void +fsg_model_trans_add(fsg_model_t * fsg, + int32 from, int32 to, int32 logp, int32 wid) +{ + fsg_link_t *link; + glist_t gl; + gnode_t *gn; + + if (fsg->trans[from].trans == NULL) + fsg->trans[from].trans = hash_table_new(5, HASH_CASE_YES); + + /* Check for duplicate link (i.e., link already exists with label=wid) */ + for (gn = gl = fsg_model_trans(fsg, from, to); gn; gn = gnode_next(gn)) { + link = (fsg_link_t *) gnode_ptr(gn); + if (link->wid == wid) { + if (link->logs2prob < logp) + link->logs2prob = logp; + return; + } + } + + /* Create transition object */ + link = listelem_malloc(fsg->link_alloc); + link->from_state = from; + link->to_state = to; + link->logs2prob = logp; + link->wid = wid; + + /* Add it to the list of transitions and update the hash table */ + gl = glist_add_ptr(gl, (void *) link); + hash_table_replace_bkey(fsg->trans[from].trans, + (char const *) &link->to_state, + sizeof(link->to_state), gl); +} + +int32 +fsg_model_tag_trans_add(fsg_model_t * fsg, int32 from, int32 to, + int32 logp, int32 wid) +{ + fsg_link_t *link, *link2; + + /* Check for transition probability */ + if (logp > 0) { + E_FATAL("Null transition prob must be <= 1.0 (state %d -> %d)\n", + from, to); + } + + /* Self-loop null transitions (with prob <= 1.0) are redundant */ + if (from == to) + return -1; + + if (fsg->trans[from].null_trans == NULL) + fsg->trans[from].null_trans = hash_table_new(5, HASH_CASE_YES); + + /* Check for a duplicate link; if found, keep the higher prob */ + link = fsg_model_null_trans(fsg, from, to); + if (link) { + if (link->logs2prob < logp) { + link->logs2prob = logp; + return 0; + } + else + return -1; + } + + /* Create null transition object */ + link = listelem_malloc(fsg->link_alloc); + link->from_state = from; + link->to_state = to; + link->logs2prob 
= logp; + link->wid = -1; + + link2 = (fsg_link_t *) + hash_table_enter_bkey(fsg->trans[from].null_trans, + (char const *) &link->to_state, + sizeof(link->to_state), link); + assert(link == link2); + + return 1; +} + +int32 +fsg_model_null_trans_add(fsg_model_t * fsg, int32 from, int32 to, + int32 logp) +{ + return fsg_model_tag_trans_add(fsg, from, to, logp, -1); +} + +glist_t +fsg_model_null_trans_closure(fsg_model_t * fsg, glist_t nulls) +{ + gnode_t *gn1; + int updated; + fsg_link_t *tl1, *tl2; + int32 k, n; + + E_INFO("Computing transitive closure for null transitions\n"); + + /* If our caller didn't give us a list of null-transitions, + make such a list. Just loop through all the FSG states, + and all the null-transitions in that state (which are kept in + their own hash table). */ + if (nulls == NULL) { + int i; + for (i = 0; i < fsg->n_state; ++i) { + hash_iter_t *itor; + hash_table_t *null_trans = fsg->trans[i].null_trans; + if (null_trans == NULL) + continue; + for (itor = hash_table_iter(null_trans); + itor != NULL; + itor = hash_table_iter_next(itor)) { + nulls = glist_add_ptr(nulls, hash_entry_val(itor->ent)); + } + } + } + + /* + * Probably not the most efficient closure implementation, in general, but + * probably reasonably efficient for a sparse null transition matrix. 
+ */ + n = 0; + do { + updated = FALSE; + + for (gn1 = nulls; gn1; gn1 = gnode_next(gn1)) { + hash_iter_t *itor; + + tl1 = (fsg_link_t *) gnode_ptr(gn1); + assert(tl1->wid < 0); + + if (fsg->trans[tl1->to_state].null_trans == NULL) + continue; + + for (itor = hash_table_iter(fsg->trans[tl1->to_state].null_trans); + itor; itor = hash_table_iter_next(itor)) { + + tl2 = (fsg_link_t *) hash_entry_val(itor->ent); + + k = fsg_model_null_trans_add(fsg, + tl1->from_state, + tl2->to_state, + tl1->logs2prob + + tl2->logs2prob); + if (k >= 0) { + updated = TRUE; + if (k > 0) { + nulls = glist_add_ptr(nulls, (void *) + fsg_model_null_trans + (fsg, tl1->from_state, + tl2->to_state)); + n++; + } + } + } + } + } while (updated); + + E_INFO("%d null transitions added\n", n); + + return nulls; +} + +glist_t +fsg_model_trans(fsg_model_t * fsg, int32 i, int32 j) +{ + void *val; + + if (fsg->trans[i].trans == NULL) + return NULL; + if (hash_table_lookup_bkey(fsg->trans[i].trans, (char const *) &j, + sizeof(j), &val) < 0) + return NULL; + return (glist_t) val; +} + +fsg_link_t * +fsg_model_null_trans(fsg_model_t * fsg, int32 i, int32 j) +{ + void *val; + + if (fsg->trans[i].null_trans == NULL) + return NULL; + if (hash_table_lookup_bkey(fsg->trans[i].null_trans, (char const *) &j, + sizeof(j), &val) < 0) + return NULL; + return (fsg_link_t *) val; +} + +fsg_arciter_t * +fsg_model_arcs(fsg_model_t * fsg, int32 i) +{ + fsg_arciter_t *itor; + + if (fsg->trans[i].trans == NULL && fsg->trans[i].null_trans == NULL) + return NULL; + itor = ckd_calloc(1, sizeof(*itor)); + if (fsg->trans[i].null_trans) + itor->null_itor = hash_table_iter(fsg->trans[i].null_trans); + if (fsg->trans[i].trans) + itor->itor = hash_table_iter(fsg->trans[i].trans); + if (itor->itor != NULL) + itor->gn = hash_entry_val(itor->itor->ent); + return itor; +} + +fsg_link_t * +fsg_arciter_get(fsg_arciter_t * itor) +{ + /* Iterate over non-null arcs first. 
*/ + if (itor->gn) + return (fsg_link_t *) gnode_ptr(itor->gn); + else if (itor->null_itor) + return (fsg_link_t *) hash_entry_val(itor->null_itor->ent); + else + return NULL; +} + /* Step the iterator: the rest of the current link list, then the list of the next hash entry, then the null-transition entries. When nothing is left the iterator is freed and NULL is returned. */ +fsg_arciter_t * +fsg_arciter_next(fsg_arciter_t * itor) +{ + /* Iterate over non-null arcs first. */ + if (itor->gn) { + itor->gn = gnode_next(itor->gn); + /* Move to the next destination arc. */ + if (itor->gn == NULL) { + itor->itor = hash_table_iter_next(itor->itor); + if (itor->itor != NULL) + itor->gn = hash_entry_val(itor->itor->ent); + else if (itor->null_itor == NULL) + goto stop_iteration; + } + } + else { /* No current non-null link: advance through the null transitions. */ + if (itor->null_itor == NULL) + goto stop_iteration; + itor->null_itor = hash_table_iter_next(itor->null_itor); + if (itor->null_itor == NULL) + goto stop_iteration; + } + return itor; + stop_iteration: + fsg_arciter_free(itor); + return NULL; + +} + /* Release both hash iterators held by itor and then itor itself; a NULL itor is ignored. */ +void +fsg_arciter_free(fsg_arciter_t * itor) +{ + if (itor == NULL) + return; + hash_table_iter_free(itor->null_itor); + hash_table_iter_free(itor->itor); + ckd_free(itor); +} + /* Linear search of fsg->vocab for word. Returns its index, or -1 when the word is not present; the vocabulary is never modified here. */ +int +fsg_model_word_id(fsg_model_t * fsg, char const *word) +{ + int wid; + + /* Search for an existing word matching this. */ + for (wid = 0; wid < fsg->n_word; ++wid) { + if (0 == strcmp(fsg->vocab[wid], word)) + break; + } + /* If not found, report -1; the vocab is left unchanged. */ + if (wid == fsg->n_word) + return -1; + return wid; +} + /* Return the id of word, appending it to the vocabulary first when fsg_model_word_id() does not find it. */ +int +fsg_model_word_add(fsg_model_t * fsg, char const *word) +{ + int wid, old_size; + + /* Search for an existing word matching this. */ + wid = fsg_model_word_id(fsg, word); + /* If not found, add this to the vocab. 
*/ + if (wid == -1) { + wid = fsg->n_word; + if (fsg->n_word == fsg->n_word_alloc) { + old_size = fsg->n_word_alloc; + fsg->n_word_alloc += 10; + fsg->vocab = ckd_realloc(fsg->vocab, + fsg->n_word_alloc * + sizeof(*fsg->vocab)); + if (fsg->silwords) + fsg->silwords = + bitvec_realloc(fsg->silwords, old_size, fsg->n_word_alloc); + if (fsg->altwords) + fsg->altwords = + bitvec_realloc(fsg->altwords, old_size, fsg->n_word_alloc); + } + ++fsg->n_word; + fsg->vocab[wid] = ckd_salloc(word); + } + return wid; +} + +int +fsg_model_add_silence(fsg_model_t * fsg, char const *silword, + int state, float32 silprob) +{ + int32 logsilp; + int n_trans, silwid, src; + + E_INFO("Adding silence transitions for %s to FSG\n", silword); + + silwid = fsg_model_word_add(fsg, silword); + logsilp = (int32) (logmath_log(fsg->lmath, silprob) * fsg->lw); + if (fsg->silwords == NULL) + fsg->silwords = bitvec_alloc(fsg->n_word_alloc); + bitvec_set(fsg->silwords, silwid); + + n_trans = 0; + if (state == -1) { + for (src = 0; src < fsg->n_state; src++) { + fsg_model_trans_add(fsg, src, src, logsilp, silwid); + ++n_trans; + } + } + else { + fsg_model_trans_add(fsg, state, state, logsilp, silwid); + ++n_trans; + } + + E_INFO("Added %d silence word transitions\n", n_trans); + return n_trans; +} + +int +fsg_model_add_alt(fsg_model_t * fsg, char const *baseword, + char const *altword) +{ + int i, basewid, altwid; + int ntrans; + + /* FIXME: This will get slow, eventually... 
*/ + /* Use the shared vocabulary lookup rather than a second copy of the + * same linear search; it returns -1 when baseword is unknown. */ + basewid = fsg_model_word_id(fsg, baseword); + if (basewid == -1) { + E_ERROR("Base word %s not present in FSG vocabulary!\n", baseword); + return -1; + } + altwid = fsg_model_word_add(fsg, altword); + if (fsg->altwords == NULL) + fsg->altwords = bitvec_alloc(fsg->n_word_alloc); + bitvec_set(fsg->altwords, altwid); + if (fsg_model_is_filler(fsg, basewid)) { + if (fsg->silwords == NULL) + fsg->silwords = bitvec_alloc(fsg->n_word_alloc); + bitvec_set(fsg->silwords, altwid); + } + + E_DEBUG(2, ("Adding alternate word transitions (%s,%s) to FSG\n", + baseword, altword)); + + /* Look for all transitions involving baseword and duplicate them. */ + /* FIXME: This will also get slow, eventually... */ + ntrans = 0; + for (i = 0; i < fsg->n_state; ++i) { + hash_iter_t *itor; + if (fsg->trans[i].trans == NULL) + continue; + for (itor = hash_table_iter(fsg->trans[i].trans); itor; + itor = hash_table_iter_next(itor)) { + glist_t trans; + gnode_t *gn; + + trans = hash_entry_val(itor->ent); + for (gn = trans; gn; gn = gnode_next(gn)) { + fsg_link_t *fl = gnode_ptr(gn); + if (fl->wid == basewid) { + fsg_link_t *link; + + /* Create transition object */ + link = listelem_malloc(fsg->link_alloc); + link->from_state = fl->from_state; + link->to_state = fl->to_state; + link->logs2prob = fl->logs2prob; /* FIXME!!!??? */ + link->wid = altwid; + + trans = glist_add_ptr(trans, (void *) link); + ++ntrans; + } + } + hash_entry_val(itor->ent) = trans; /* store back the list returned by glist_add_ptr() */ + } + } + + E_DEBUG(2, ("Added %d alternate word transitions\n", ntrans)); + return ntrans; +} + + /* Allocate an FSG with a reference count of 1 and its own link allocator; name is copied when it is non-NULL. */ +fsg_model_t * +fsg_model_init(char const *name, logmath_t * lmath, float32 lw, + int32 n_state) +{ + fsg_model_t *fsg; + + /* Allocate basic stuff. */ + fsg = ckd_calloc(1, sizeof(*fsg)); + fsg->refcount = 1; + fsg->link_alloc = listelem_alloc_init(sizeof(fsg_link_t)); + fsg->lmath = lmath; + fsg->name = name ? 
ckd_salloc(name) : NULL; + fsg->n_state = n_state; + fsg->lw = lw; + + fsg->trans = ckd_calloc(fsg->n_state, sizeof(*fsg->trans)); + + return fsg; +} + +fsg_model_t * +fsg_model_read(FILE * fp, logmath_t * lmath, float32 lw) +{ + fsg_model_t *fsg; + hash_table_t *vocab; + hash_iter_t *itor; + int32 lastwid; + char **wordptr; + char *lineptr; + char *fsgname; + int32 lineno; + int32 n, i, j; + int n_state, n_trans, n_null_trans; + glist_t nulls; + float32 p; + + lineno = 0; + vocab = hash_table_new(32, FALSE); + wordptr = NULL; + lineptr = NULL; + nulls = NULL; + fsgname = NULL; + fsg = NULL; + + /* Scan upto FSG_BEGIN header */ + for (;;) { + n = nextline_str2words(fp, &lineno, &lineptr, &wordptr); + if (n < 0) { + E_ERROR("%s declaration missing\n", FSG_MODEL_BEGIN_DECL); + goto parse_error; + } + + if ((strcmp(wordptr[0], FSG_MODEL_BEGIN_DECL) == 0)) { + if (n > 2) { + E_ERROR("Line[%d]: malformed FSG_BEGIN declaration\n", + lineno); + goto parse_error; + } + break; + } + } + /* Save FSG name, or it will get clobbered below :(. + * If name is missing, try the default. + */ + if (n == 2) { + fsgname = ckd_salloc(wordptr[1]); + } + else { + E_WARN("FSG name is missing\n"); + fsgname = ckd_salloc("unknown"); + } + + /* Read #states */ + n = nextline_str2words(fp, &lineno, &lineptr, &wordptr); + if ((n != 2) + || ((strcmp(wordptr[0], FSG_MODEL_N_DECL) != 0) + && (strcmp(wordptr[0], FSG_MODEL_NUM_STATES_DECL) != 0)) + || (sscanf(wordptr[1], "%d", &n_state) != 1) + || (n_state <= 0)) { + E_ERROR + ("Line[%d]: #states declaration line missing or malformed\n", + lineno); + goto parse_error; + } + + /* Now create the FSG. 
*/ + fsg = fsg_model_init(fsgname, lmath, lw, n_state); + ckd_free(fsgname); + fsgname = NULL; + + /* Read start state */ + n = nextline_str2words(fp, &lineno, &lineptr, &wordptr); + if ((n != 2) + || ((strcmp(wordptr[0], FSG_MODEL_S_DECL) != 0) + && (strcmp(wordptr[0], FSG_MODEL_START_STATE_DECL) != 0)) + || (sscanf(wordptr[1], "%d", &(fsg->start_state)) != 1) + || (fsg->start_state < 0) + || (fsg->start_state >= fsg->n_state)) { + E_ERROR + ("Line[%d]: start state declaration line missing or malformed\n", + lineno); + goto parse_error; + } + + /* Read final state */ + n = nextline_str2words(fp, &lineno, &lineptr, &wordptr); + if ((n != 2) + || ((strcmp(wordptr[0], FSG_MODEL_F_DECL) != 0) + && (strcmp(wordptr[0], FSG_MODEL_FINAL_STATE_DECL) != 0)) + || (sscanf(wordptr[1], "%d", &(fsg->final_state)) != 1) + || (fsg->final_state < 0) + || (fsg->final_state >= fsg->n_state)) { + E_ERROR + ("Line[%d]: final state declaration line missing or malformed\n", + lineno); + goto parse_error; + } + + /* Read transitions */ + lastwid = 0; + n_trans = n_null_trans = 0; + for (;;) { + int32 wid, tprob; + + n = nextline_str2words(fp, &lineno, &lineptr, &wordptr); + if (n <= 0) { + E_ERROR("Line[%d]: transition or FSG_END statement expected\n", + lineno); + goto parse_error; + } + + if ((strcmp(wordptr[0], FSG_MODEL_END_DECL) == 0)) { + break; + } + + if ((strcmp(wordptr[0], FSG_MODEL_T_DECL) == 0) + || (strcmp(wordptr[0], FSG_MODEL_TRANSITION_DECL) == 0)) { + + + if (((n != 4) && (n != 5)) + || (sscanf(wordptr[1], "%d", &i) != 1) + || (sscanf(wordptr[2], "%d", &j) != 1) + || (i < 0) || (i >= fsg->n_state) + || (j < 0) || (j >= fsg->n_state)) { + E_ERROR + ("Line[%d]: transition spec malformed; Expecting: from-state to-state trans-prob [word]\n", + lineno); + goto parse_error; + } + + p = atof_c(wordptr[3]); + if ((p <= 0.0) || (p > 1.0)) { + E_ERROR + ("Line[%d]: transition spec malformed; Expecting float as transition probability\n", + lineno); + goto parse_error; + } + } + else 
{ + E_ERROR("Line[%d]: transition or FSG_END statement expected\n", + lineno); + goto parse_error; + } + + tprob = (int32) (logmath_log(lmath, p) * fsg->lw); + /* Add word to "dictionary". */ + if (n > 4) { + if (hash_table_lookup_int32(vocab, wordptr[4], &wid) < 0) { + (void) hash_table_enter_int32(vocab, + ckd_salloc(wordptr[4]), + lastwid); + wid = lastwid; + ++lastwid; + } + fsg_model_trans_add(fsg, i, j, tprob, wid); + ++n_trans; + } + else { + if (fsg_model_null_trans_add(fsg, i, j, tprob) == 1) { + ++n_null_trans; + nulls = + glist_add_ptr(nulls, fsg_model_null_trans(fsg, i, j)); + } + } + } + + E_INFO("FSG: %d states, %d unique words, %d transitions (%d null)\n", + fsg->n_state, hash_table_inuse(vocab), n_trans, n_null_trans); + + + /* Now create a string table from the "dictionary" */ + fsg->n_word = hash_table_inuse(vocab); + fsg->n_word_alloc = fsg->n_word + 10; /* Pad it a bit. */ + fsg->vocab = ckd_calloc(fsg->n_word_alloc, sizeof(*fsg->vocab)); + for (itor = hash_table_iter(vocab); itor; + itor = hash_table_iter_next(itor)) { + char const *word = hash_entry_key(itor->ent); + int32 wid = (int32) (long) hash_entry_val(itor->ent); + fsg->vocab[wid] = (char *) word; + } + hash_table_free(vocab); + + /* Do transitive closure on null transitions */ + nulls = fsg_model_null_trans_closure(fsg, nulls); + glist_free(nulls); + + ckd_free(lineptr); + ckd_free(wordptr); + + return fsg; + + parse_error: + for (itor = hash_table_iter(vocab); itor; + itor = hash_table_iter_next(itor)) + ckd_free((char *) hash_entry_key(itor->ent)); + glist_free(nulls); + hash_table_free(vocab); + ckd_free(fsgname); + ckd_free(lineptr); + ckd_free(wordptr); + fsg_model_free(fsg); + return NULL; +} + + +fsg_model_t * +fsg_model_readfile(const char *file, logmath_t * lmath, float32 lw) +{ + FILE *fp; + fsg_model_t *fsg; + + if ((fp = fopen(file, "r")) == NULL) { + E_ERROR_SYSTEM("Failed to open FSG file '%s' for reading", file); + return NULL; + } + fsg = fsg_model_read(fp, lmath, lw); 
+ fclose(fp); + return fsg; +} + /* Take one more reference on fsg and hand the same pointer back. */ +fsg_model_t * +fsg_model_retain(fsg_model_t * fsg) +{ + ++fsg->refcount; + return fsg; +} + /* Free the adjacency tables of state i: each link list stored in the non-null table, then both hash tables. */ +static void +trans_list_free(fsg_model_t * fsg, int32 i) +{ + hash_iter_t *itor; + + /* FIXME (maybe): FSG links will all get freed when we call + * listelem_alloc_free() so don't bother freeing them explicitly + * here. */ + if (fsg->trans[i].trans) { + for (itor = hash_table_iter(fsg->trans[i].trans); + itor; itor = hash_table_iter_next(itor)) { + glist_t gl = (glist_t) hash_entry_val(itor->ent); + glist_free(gl); + } + } + hash_table_free(fsg->trans[i].trans); + hash_table_free(fsg->trans[i].null_trans); +} + /* Drop one reference. While references remain, return the new count; otherwise free the vocabulary, transition tables, link allocator, bit vectors and name, and return 0. A NULL fsg is accepted and yields 0. */ +int +fsg_model_free(fsg_model_t * fsg) +{ + int i; + + if (fsg == NULL) + return 0; + + if (--fsg->refcount > 0) + return fsg->refcount; + + for (i = 0; i < fsg->n_word; ++i) + ckd_free(fsg->vocab[i]); + for (i = 0; i < fsg->n_state; ++i) + trans_list_free(fsg, i); + ckd_free(fsg->trans); + ckd_free(fsg->vocab); + listelem_alloc_free(fsg->link_alloc); + bitvec_free(fsg->silwords); + bitvec_free(fsg->altwords); + ckd_free(fsg->name); + ckd_free(fsg); + return 0; +} + + /* Write fsg to fp as text: the FSG_BEGIN line, the state declarations, one TRANSITION line per arc, then FSG_END; fp is flushed at the end. */ +void +fsg_model_write(fsg_model_t * fsg, FILE * fp) +{ + int32 i; + + fprintf(fp, "%s %s\n", FSG_MODEL_BEGIN_DECL, + fsg->name ? fsg->name : ""); + fprintf(fp, "%s %d\n", FSG_MODEL_NUM_STATES_DECL, fsg->n_state); + fprintf(fp, "%s %d\n", FSG_MODEL_START_STATE_DECL, fsg->start_state); + fprintf(fp, "%s %d\n", FSG_MODEL_FINAL_STATE_DECL, fsg->final_state); + + for (i = 0; i < fsg->n_state; i++) { + fsg_arciter_t *itor; + + for (itor = fsg_model_arcs(fsg, i); itor; + itor = fsg_arciter_next(itor)) { + fsg_link_t *tl = fsg_arciter_get(itor); + + fprintf(fp, "%s %d %d %f %s\n", FSG_MODEL_TRANSITION_DECL, + tl->from_state, tl->to_state, + logmath_exp(fsg->lmath, + (int32) (tl->logs2prob / fsg->lw)), + (tl->wid < 0) ? 
"" : fsg_model_word_str(fsg, tl->wid)); + } + } + + fprintf(fp, "%s\n", FSG_MODEL_END_DECL); + + fflush(fp); +} + /* Write fsg to the named file in FSG text form. If the file cannot be opened for writing, log the system error and return without writing anything. */ +void +fsg_model_writefile(fsg_model_t * fsg, char const *file) +{ + FILE *fp; + + assert(fsg); + + E_INFO("Writing FSG file '%s'\n", file); + + if ((fp = fopen(file, "w")) == NULL) { + E_ERROR_SYSTEM("Failed to open FSG file '%s' for writing", file); + return; + } + + fsg_model_write(fsg, fp); + + fclose(fp); +} + /* Print every arc leaving state i on its own line: from-state, to-state, word (or <eps> for a null arc), and the negated result of logmath_log_to_ln() on the arc score divided by lw. */ +static void +fsg_model_write_fsm_trans(fsg_model_t * fsg, int i, FILE * fp) +{ + fsg_arciter_t *itor; + + for (itor = fsg_model_arcs(fsg, i); itor; + itor = fsg_arciter_next(itor)) { + fsg_link_t *tl = fsg_arciter_get(itor); + fprintf(fp, "%d %d %s %f\n", + tl->from_state, tl->to_state, + (tl->wid < 0) ? "<eps>" : fsg_model_word_str(fsg, tl->wid), + -logmath_log_to_ln(fsg->lmath, tl->logs2prob / fsg->lw)); + } +} + /* Write all arcs of fsg to fp, the start state's arcs first and then every other state, followed by a line for the final state; fp is flushed at the end. */ +void +fsg_model_write_fsm(fsg_model_t * fsg, FILE * fp) +{ + int i; + + /* Write transitions from initial state first. */ + fsg_model_write_fsm_trans(fsg, fsg_model_start_state(fsg), fp); + + /* Other states. */ + for (i = 0; i < fsg->n_state; i++) { + if (i == fsg_model_start_state(fsg)) + continue; + fsg_model_write_fsm_trans(fsg, i, fp); + } + + /* Final state. 
*/ + fprintf(fp, "%d 0\n", fsg_model_final_state(fsg)); + + fflush(fp); +} + +void +fsg_model_writefile_fsm(fsg_model_t * fsg, char const *file) +{ + FILE *fp; + + assert(fsg); + + E_INFO("Writing FSM file '%s'\n", file); + + if ((fp = fopen(file, "w")) == NULL) { + E_ERROR_SYSTEM("Failed to open fsm file '%s' for writing", file); + return; + } + + fsg_model_write_fsm(fsg, fp); + + fclose(fp); +} + +void +fsg_model_write_symtab(fsg_model_t * fsg, FILE * file) +{ + int i; + + fprintf(file, "<eps> 0\n"); + for (i = 0; i < fsg_model_n_word(fsg); ++i) { + fprintf(file, "%s %d\n", fsg_model_word_str(fsg, i), i + 1); + } + fflush(file); +} + +void +fsg_model_writefile_symtab(fsg_model_t * fsg, char const *file) +{ + FILE *fp; + + assert(fsg); + + E_INFO("Writing FSM symbol table '%s'\n", file); + + if ((fp = fopen(file, "w")) == NULL) { + E_ERROR("Failed to open symbol table '%s' for writing", file); + return; + } + + fsg_model_write_symtab(fsg, fp); + + fclose(fp); +} diff --git a/media/sphinxbase/src/libsphinxbase/lm/jsgf.c b/media/sphinxbase/src/libsphinxbase/lm/jsgf.c new file mode 100644 index 000000000..90e161c62 --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/lm/jsgf.c @@ -0,0 +1,943 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 2007 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ + +#include <string.h> +#include <assert.h> + +#include "sphinxbase/ckd_alloc.h" +#include "sphinxbase/strfuncs.h" +#include "sphinxbase/hash_table.h" +#include "sphinxbase/filename.h" +#include "sphinxbase/err.h" +#include "sphinxbase/jsgf.h" + +#include "jsgf_internal.h" +#include "jsgf_parser.h" +#include "jsgf_scanner.h" + +extern int yyparse (void* scanner, jsgf_t* jsgf); + +/** + * \file jsgf.c + * + * This file implements the data structures for parsing JSGF grammars + * into Sphinx finite-state grammars. 
+ **/ + +static int expand_rule(jsgf_t *grammar, jsgf_rule_t *rule, int rule_entry, int rule_exit); + +jsgf_atom_t * +jsgf_atom_new(char *name, float weight) +{ + jsgf_atom_t *atom; + + atom = ckd_calloc(1, sizeof(*atom)); + atom->name = ckd_salloc(name); + atom->weight = weight; + return atom; +} + +int +jsgf_atom_free(jsgf_atom_t *atom) +{ + if (atom == NULL) + return 0; + ckd_free(atom->name); + ckd_free(atom); + return 0; +} + +jsgf_t * +jsgf_grammar_new(jsgf_t *parent) +{ + jsgf_t *grammar; + + grammar = ckd_calloc(1, sizeof(*grammar)); + /* If this is an imported/subgrammar, then we will share a global + * namespace with the parent grammar. */ + if (parent) { + grammar->rules = parent->rules; + grammar->imports = parent->imports; + grammar->searchpath = parent->searchpath; + grammar->parent = parent; + } + else { + grammar->rules = hash_table_new(64, 0); + grammar->imports = hash_table_new(16, 0); + } + + return grammar; +} + +void +jsgf_grammar_free(jsgf_t *jsgf) +{ + /* FIXME: Probably should just use refcounting instead. 
*/ + if (jsgf->parent == NULL) { + hash_iter_t *itor; + gnode_t *gn; + + for (itor = hash_table_iter(jsgf->rules); itor; + itor = hash_table_iter_next(itor)) { + ckd_free((char *)itor->ent->key); + jsgf_rule_free((jsgf_rule_t *)itor->ent->val); + } + hash_table_free(jsgf->rules); + for (itor = hash_table_iter(jsgf->imports); itor; + itor = hash_table_iter_next(itor)) { + ckd_free((char *)itor->ent->key); + jsgf_grammar_free((jsgf_t *)itor->ent->val); + } + hash_table_free(jsgf->imports); + for (gn = jsgf->searchpath; gn; gn = gnode_next(gn)) + ckd_free(gnode_ptr(gn)); + glist_free(jsgf->searchpath); + for (gn = jsgf->links; gn; gn = gnode_next(gn)) + ckd_free(gnode_ptr(gn)); + glist_free(jsgf->links); + } + ckd_free(jsgf->name); + ckd_free(jsgf->version); + ckd_free(jsgf->charset); + ckd_free(jsgf->locale); + ckd_free(jsgf); +} + +static void +jsgf_rhs_free(jsgf_rhs_t *rhs) +{ + gnode_t *gn; + + if (rhs == NULL) + return; + + jsgf_rhs_free(rhs->alt); + for (gn = rhs->atoms; gn; gn = gnode_next(gn)) + jsgf_atom_free(gnode_ptr(gn)); + glist_free(rhs->atoms); + ckd_free(rhs); +} + +jsgf_atom_t * +jsgf_kleene_new(jsgf_t *jsgf, jsgf_atom_t *atom, int plus) +{ + jsgf_rule_t *rule; + jsgf_atom_t *rule_atom; + jsgf_rhs_t *rhs; + + /* Generate an "internal" rule of the form (<NULL> | <name> <g0006>) */ + /* Or if plus is true, (<name> | <name> <g0006>) */ + rhs = ckd_calloc(1, sizeof(*rhs)); + if (plus) + rhs->atoms = glist_add_ptr(NULL, jsgf_atom_new(atom->name, 1.0)); + else + rhs->atoms = glist_add_ptr(NULL, jsgf_atom_new("<NULL>", 1.0)); + rule = jsgf_define_rule(jsgf, NULL, rhs, 0); + rule_atom = jsgf_atom_new(rule->name, 1.0); + rhs = ckd_calloc(1, sizeof(*rhs)); + rhs->atoms = glist_add_ptr(NULL, rule_atom); + rhs->atoms = glist_add_ptr(rhs->atoms, atom); + rule->rhs->alt = rhs; + + return jsgf_atom_new(rule->name, 1.0); +} + +jsgf_rule_t * +jsgf_optional_new(jsgf_t *jsgf, jsgf_rhs_t *exp) +{ + jsgf_rhs_t *rhs = ckd_calloc(1, sizeof(*rhs)); + jsgf_atom_t *atom = 
jsgf_atom_new("<NULL>", 1.0); + rhs->alt = exp; + rhs->atoms = glist_add_ptr(NULL, atom); + return jsgf_define_rule(jsgf, NULL, rhs, 0); +} + +void +jsgf_add_link(jsgf_t *grammar, jsgf_atom_t *atom, int from, int to) +{ + jsgf_link_t *link; + + link = ckd_calloc(1, sizeof(*link)); + link->from = from; + link->to = to; + link->atom = atom; + grammar->links = glist_add_ptr(grammar->links, link); +} + +static char * +extract_grammar_name(char *rule_name) +{ + char* dot_pos; + char* grammar_name = ckd_salloc(rule_name + 1); + if ((dot_pos = strrchr(grammar_name + 1, '.')) == NULL) { + ckd_free(grammar_name); + return NULL; + } + *dot_pos='\0'; + return grammar_name; +} + +char const * +jsgf_grammar_name(jsgf_t *jsgf) +{ + return jsgf->name; +} + +static char * +jsgf_fullname(jsgf_t *jsgf, const char *name) +{ + char *fullname; + + /* Check if it is already qualified */ + if (strchr(name + 1, '.')) + return ckd_salloc(name); + + /* Skip leading < in name */ + fullname = ckd_malloc(strlen(jsgf->name) + strlen(name) + 4); + sprintf(fullname, "<%s.%s", jsgf->name, name + 1); + return fullname; +} + +static char * +jsgf_fullname_from_rule(jsgf_rule_t *rule, const char *name) +{ + char *fullname, *grammar_name; + + /* Check if it is already qualified */ + if (strchr(name + 1, '.')) + return ckd_salloc(name); + + /* Skip leading < in name */ + if ((grammar_name = extract_grammar_name(rule->name)) == NULL) + return ckd_salloc(name); + fullname = ckd_malloc(strlen(grammar_name) + strlen(name) + 4); + sprintf(fullname, "<%s.%s", grammar_name, name + 1); + ckd_free(grammar_name); + + return fullname; +} + +/* Extract as rulename everything after the secondlast dot, if existent. + * Because everything before the secondlast dot is the path-specification. 
*/ +static char * +importname2rulename(char *importname) +{ + char *rulename = ckd_salloc(importname); + char *last_dotpos; + char *secondlast_dotpos; + + if ((last_dotpos = strrchr(rulename+1, '.')) != NULL) { + *last_dotpos='\0'; + if ((secondlast_dotpos = strrchr(rulename+1, '.')) != NULL) { + *last_dotpos='.'; + *secondlast_dotpos='<'; + secondlast_dotpos = ckd_salloc(secondlast_dotpos); + ckd_free(rulename); + return secondlast_dotpos; + } + else { + *last_dotpos='.'; + return rulename; + } + } + else { + return rulename; + } +} + +#define NO_NODE -1 +#define RECURSIVE_NODE -2 + +/** + * + * Expand a right-hand-side of a rule (i.e. a single alternate). + * + * @returns the FSG state at the end of this rule, NO_NODE if there's an + * error, and RECURSIVE_NODE if the right-hand-side ended in right-recursion (i.e. + * a link to an earlier FSG state). + */ +static int +expand_rhs(jsgf_t *grammar, jsgf_rule_t *rule, jsgf_rhs_t *rhs, + int rule_entry, int rule_exit) +{ + gnode_t *gn; + int lastnode; + + /* Last node expanded in this sequence. */ + lastnode = rule_entry; + + /* Iterate over atoms in rhs and generate links/nodes */ + for (gn = rhs->atoms; gn; gn = gnode_next(gn)) { + jsgf_atom_t *atom = gnode_ptr(gn); + + if (jsgf_atom_is_rule(atom)) { + jsgf_rule_t *subrule; + char *fullname; + gnode_t *subnode; + jsgf_rule_stack_t *rule_stack_entry = NULL; + + /* Special case for <NULL> and <VOID> pseudo-rules + If this is the only atom in the rhs, and it's the + first rhs in the rule, then emit a null transition, + creating an exit state if needed. 
*/ + if (0 == strcmp(atom->name, "<NULL>")) { + if (gn == rhs->atoms && gnode_next(gn) == NULL) { + if (rule_exit == NO_NODE) { + jsgf_add_link(grammar, atom, + lastnode, grammar->nstate); + rule_exit = lastnode = grammar->nstate; + ++grammar->nstate; + } else { + jsgf_add_link(grammar, atom, + lastnode, rule_exit); + } + } + continue; + } + else if (0 == strcmp(atom->name, "<VOID>")) { + /* Make this entire RHS unspeakable */ + return NO_NODE; + } + + fullname = jsgf_fullname_from_rule(rule, atom->name); + if (hash_table_lookup(grammar->rules, fullname, (void**)&subrule) == -1) { + E_ERROR("Undefined rule in RHS: %s\n", fullname); + ckd_free(fullname); + return NO_NODE; + } + ckd_free(fullname); + + /* Look for this subrule in the stack of expanded rules */ + for (subnode = grammar->rulestack; subnode; subnode = gnode_next(subnode)) { + rule_stack_entry = (jsgf_rule_stack_t *)gnode_ptr(subnode); + if (rule_stack_entry->rule == subrule) + break; + } + + if (subnode != NULL) { + /* Allow right-recursion only. */ + if (gnode_next(gn) != NULL) { + E_ERROR("Only right-recursion is permitted (in %s.%s)\n", + grammar->name, rule->name); + return NO_NODE; + } + /* Add a link back to the beginning of this rule instance */ + E_INFO("Right recursion %s %d => %d\n", atom->name, lastnode, rule_stack_entry->entry); + jsgf_add_link(grammar, atom, lastnode, rule_stack_entry->entry); + + /* Let our caller know that this rhs didn't reach an + end state. */ + lastnode = RECURSIVE_NODE; + } + else { + /* If this is the last atom in this rhs, link its + expansion to the parent rule's exit state. + Otherwise, create a new exit state for it. 
*/ + int subruleexit = NO_NODE; + if (gnode_next(gn) == NULL && rule_exit >= 0) + subruleexit = rule_exit; + + /* Expand the subrule */ + lastnode = expand_rule(grammar, subrule, lastnode, subruleexit); + + if (lastnode == NO_NODE) + return NO_NODE; + } + } + else { + /* An exit-state is created if this isn't the last atom + in the rhs, or if the containing rule doesn't have an + exit state yet. + Otherwise, the rhs's exit state becomes the containing + rule's exit state. */ + int exitstate; + if (gnode_next(gn) == NULL && rule_exit >= 0) { + exitstate = rule_exit; + } else { + exitstate = grammar->nstate; + ++grammar->nstate; + } + + /* Add a link for this token */ + jsgf_add_link(grammar, atom, + lastnode, exitstate); + lastnode = exitstate; + } + } + + return lastnode; +} + +static int +expand_rule(jsgf_t *grammar, jsgf_rule_t *rule, int rule_entry, + int rule_exit) +{ + jsgf_rule_stack_t* rule_stack_entry; + jsgf_rhs_t *rhs; + + /* Push this rule onto the stack */ + rule_stack_entry = (jsgf_rule_stack_t*)ckd_calloc(1, sizeof (jsgf_rule_stack_t)); + rule_stack_entry->rule = rule; + rule_stack_entry->entry = rule_entry; + grammar->rulestack = glist_add_ptr(grammar->rulestack, + rule_stack_entry); + + for (rhs = rule->rhs; rhs; rhs = rhs->alt) { + int lastnode; + + lastnode = expand_rhs(grammar, rule, rhs, + rule_entry, rule_exit); + + if (lastnode == NO_NODE) { + return NO_NODE; + } else if (lastnode == RECURSIVE_NODE) { + /* The rhs ended with right-recursion, i.e. a transition to + an earlier state. Nothing needs to happen at this level. */ + ; + } else if (rule_exit == NO_NODE) { + /* If this rule doesn't have an exit state yet, use the exit + state of its first right-hand-side. + All other right-hand-sides will use this exit state. */ + assert (lastnode >= 0); + rule_exit = lastnode; + } + } + + /* If no exit-state was created, use the entry-state. 
*/ + if (rule_exit == NO_NODE) { + rule_exit = rule_entry; + } + + /* Pop this rule from the rule stack */ + ckd_free(gnode_ptr(grammar->rulestack)); + grammar->rulestack = gnode_free(grammar->rulestack, NULL); + + return rule_exit; +} + +jsgf_rule_iter_t * +jsgf_rule_iter(jsgf_t *grammar) +{ + return hash_table_iter(grammar->rules); +} + +jsgf_rule_t * +jsgf_get_rule(jsgf_t *grammar, char const *name) +{ + void *val; + char *fullname; + + fullname = string_join("<", name, ">", NULL); + if (hash_table_lookup(grammar->rules, fullname, &val) < 0) { + ckd_free(fullname); + return NULL; + } + ckd_free(fullname); + return (jsgf_rule_t *)val; +} + +jsgf_rule_t * +jsgf_get_public_rule(jsgf_t *grammar) +{ + jsgf_rule_iter_t *itor; + jsgf_rule_t *public_rule = NULL; + + for (itor = jsgf_rule_iter(grammar); itor; + itor = jsgf_rule_iter_next(itor)) { + jsgf_rule_t *rule = jsgf_rule_iter_rule(itor); + if (jsgf_rule_public(rule)) { + const char *rule_name = jsgf_rule_name(rule); + char *dot_pos; + if ((dot_pos = strrchr(rule_name + 1, '.')) == NULL) { + public_rule = rule; + jsgf_rule_iter_free(itor); + break; + } + if (0 == strncmp(rule_name + 1, jsgf_grammar_name(grammar), dot_pos - rule_name - 1)) { + public_rule = rule; + jsgf_rule_iter_free(itor); + break; + } + } + } + return public_rule; +} + +char const * +jsgf_rule_name(jsgf_rule_t *rule) +{ + return rule->name; +} + +int +jsgf_rule_public(jsgf_rule_t *rule) +{ + return rule->is_public; +} + +static fsg_model_t * +jsgf_build_fsg_internal(jsgf_t *grammar, jsgf_rule_t *rule, + logmath_t *lmath, float32 lw, int do_closure) +{ + fsg_model_t *fsg; + glist_t nulls; + gnode_t *gn; + int rule_entry, rule_exit; + + /* Clear previous links */ + for (gn = grammar->links; gn; gn = gnode_next(gn)) { + ckd_free(gnode_ptr(gn)); + } + glist_free(grammar->links); + grammar->links = NULL; + grammar->nstate = 0; + + /* Create the top-level entry state, and expand the + top-level rule. 
*/ + rule_entry = grammar->nstate++; + rule_exit = expand_rule(grammar, rule, rule_entry, NO_NODE); + + /* If no exit-state was created, create one. */ + if (rule_exit == NO_NODE) { + rule_exit = grammar->nstate++; + jsgf_add_link(grammar, NULL, rule_entry, rule_exit); + } + + fsg = fsg_model_init(rule->name, lmath, lw, grammar->nstate); + fsg->start_state = rule_entry; + fsg->final_state = rule_exit; + grammar->links = glist_reverse(grammar->links); + for (gn = grammar->links; gn; gn = gnode_next(gn)) { + jsgf_link_t *link = gnode_ptr(gn); + + if (link->atom) { + if (jsgf_atom_is_rule(link->atom)) { + fsg_model_null_trans_add(fsg, link->from, link->to, + logmath_log(lmath, link->atom->weight)); + } + else { + int wid = fsg_model_word_add(fsg, link->atom->name); + fsg_model_trans_add(fsg, link->from, link->to, + logmath_log(lmath, link->atom->weight), wid); + } + } + else { + fsg_model_null_trans_add(fsg, link->from, link->to, 0); + } + } + if (do_closure) { + nulls = fsg_model_null_trans_closure(fsg, NULL); + glist_free(nulls); + } + + return fsg; +} + +fsg_model_t * +jsgf_build_fsg(jsgf_t *grammar, jsgf_rule_t *rule, + logmath_t *lmath, float32 lw) +{ + return jsgf_build_fsg_internal(grammar, rule, lmath, lw, TRUE); +} + +fsg_model_t * +jsgf_build_fsg_raw(jsgf_t *grammar, jsgf_rule_t *rule, + logmath_t *lmath, float32 lw) +{ + return jsgf_build_fsg_internal(grammar, rule, lmath, lw, FALSE); +} + +fsg_model_t * +jsgf_read_file(const char *file, logmath_t * lmath, float32 lw) +{ + fsg_model_t *fsg; + jsgf_rule_t *rule; + jsgf_t *jsgf; + jsgf_rule_iter_t *itor; + + if ((jsgf = jsgf_parse_file(file, NULL)) == NULL) { + E_ERROR("Error parsing file: %s\n", file); + return NULL; + } + + rule = NULL; + for (itor = jsgf_rule_iter(jsgf); itor; + itor = jsgf_rule_iter_next(itor)) { + rule = jsgf_rule_iter_rule(itor); + if (jsgf_rule_public(rule)) { + jsgf_rule_iter_free(itor); + break; + } + } + if (rule == NULL) { + E_ERROR("No public rules found in %s\n", file); + return 
NULL; + } + fsg = jsgf_build_fsg(jsgf, rule, lmath, lw); + jsgf_grammar_free(jsgf); + return fsg; +} + +fsg_model_t * +jsgf_read_string(const char *string, logmath_t * lmath, float32 lw) +{ + fsg_model_t *fsg; + jsgf_rule_t *rule; + jsgf_t *jsgf; + jsgf_rule_iter_t *itor; + + if ((jsgf = jsgf_parse_string(string, NULL)) == NULL) { + E_ERROR("Error parsing input string\n"); + return NULL; + } + + rule = NULL; + for (itor = jsgf_rule_iter(jsgf); itor; + itor = jsgf_rule_iter_next(itor)) { + rule = jsgf_rule_iter_rule(itor); + if (jsgf_rule_public(rule)) { + jsgf_rule_iter_free(itor); + break; + } + } + if (rule == NULL) { + jsgf_grammar_free(jsgf); + E_ERROR("No public rules found in input string\n"); + return NULL; + } + fsg = jsgf_build_fsg(jsgf, rule, lmath, lw); + jsgf_grammar_free(jsgf); + return fsg; +} + + +int +jsgf_write_fsg(jsgf_t *grammar, jsgf_rule_t *rule, FILE *outfh) +{ + fsg_model_t *fsg; + logmath_t *lmath = logmath_init(1.0001, 0, 0); + + if ((fsg = jsgf_build_fsg_raw(grammar, rule, lmath, 1.0)) == NULL) + goto error_out; + + fsg_model_write(fsg, outfh); + logmath_free(lmath); + return 0; + +error_out: + logmath_free(lmath); + return -1; +} + +jsgf_rule_t * +jsgf_define_rule(jsgf_t *jsgf, char *name, jsgf_rhs_t *rhs, int is_public) +{ + jsgf_rule_t *rule; + void *val; + + if (name == NULL) { + name = ckd_malloc(strlen(jsgf->name) + 16); + sprintf(name, "<%s.g%05d>", jsgf->name, hash_table_inuse(jsgf->rules)); + } + else { + char *newname; + + newname = jsgf_fullname(jsgf, name); + name = newname; + } + + rule = ckd_calloc(1, sizeof(*rule)); + rule->refcnt = 1; + rule->name = ckd_salloc(name); + rule->rhs = rhs; + rule->is_public = is_public; + + E_INFO("Defined rule: %s%s\n", + rule->is_public ? 
"PUBLIC " : "", + rule->name); + val = hash_table_enter(jsgf->rules, name, rule); + if (val != (void *)rule) { + E_WARN("Multiply defined symbol: %s\n", name); + } + return rule; +} + +jsgf_rule_t * +jsgf_rule_retain(jsgf_rule_t *rule) +{ + ++rule->refcnt; + return rule; +} + +int +jsgf_rule_free(jsgf_rule_t *rule) +{ + if (rule == NULL) + return 0; + if (--rule->refcnt > 0) + return rule->refcnt; + jsgf_rhs_free(rule->rhs); + ckd_free(rule->name); + ckd_free(rule); + return 0; +} + + +/* FIXME: This should go in libsphinxutil */ +static char * +path_list_search(glist_t paths, char *path) +{ + gnode_t *gn; + + for (gn = paths; gn; gn = gnode_next(gn)) { + char *fullpath; + FILE *tmp; + + fullpath = string_join(gnode_ptr(gn), "/", path, NULL); + tmp = fopen(fullpath, "r"); + if (tmp != NULL) { + fclose(tmp); + return fullpath; + } + else { + ckd_free(fullpath); + } + } + return NULL; +} + +jsgf_rule_t * +jsgf_import_rule(jsgf_t *jsgf, char *name) +{ + char *c, *path, *newpath; + size_t namelen, packlen; + void *val; + jsgf_t *imp; + int import_all; + + /* Trim the leading and trailing <> */ + namelen = strlen(name); + path = ckd_malloc(namelen - 2 + 6); /* room for a trailing .gram */ + strcpy(path, name + 1); + /* Split off the first part of the name */ + c = strrchr(path, '.'); + if (c == NULL) { + E_ERROR("Imported rule is not qualified: %s\n", name); + ckd_free(path); + return NULL; + } + packlen = c - path; + *c = '\0'; + + /* Look for import foo.* */ + import_all = (strlen(name) > 2 && 0 == strcmp(name + namelen - 3, ".*>")); + + /* Construct a filename. 
*/ + for (c = path; *c; ++c) + if (*c == '.') *c = '/'; + strcat(path, ".gram"); + newpath = path_list_search(jsgf->searchpath, path); + if (newpath == NULL) { + E_ERROR("Failed to find grammar %s\n", path); + ckd_free(path); + return NULL; + } + ckd_free(path); + + path = newpath; + E_INFO("Importing %s from %s to %s\n", name, path, jsgf->name); + + /* FIXME: Also, we need to make sure that path is fully qualified + * here, by adding any prefixes from jsgf->name to it. */ + /* See if we have parsed it already */ + if (hash_table_lookup(jsgf->imports, path, &val) == 0) { + E_INFO("Already imported %s\n", path); + imp = val; + ckd_free(path); + } + else { + /* If not, parse it. */ + imp = jsgf_parse_file(path, jsgf); + val = hash_table_enter(jsgf->imports, path, imp); + if (val != (void *)imp) { + E_WARN("Multiply imported file: %s\n", path); + } + } + if (imp != NULL) { + hash_iter_t *itor; + /* Look for public rules matching rulename. */ + for (itor = hash_table_iter(imp->rules); itor; + itor = hash_table_iter_next(itor)) { + hash_entry_t *he = itor->ent; + jsgf_rule_t *rule = hash_entry_val(he); + int rule_matches; + char *rule_name = importname2rulename(name); + + if (import_all) { + /* Match package name (symbol table is shared) */ + rule_matches = !strncmp(rule_name, rule->name, packlen + 1); + } + else { + /* Exact match */ + rule_matches = !strcmp(rule_name, rule->name); + } + ckd_free(rule_name); + if (rule->is_public && rule_matches) { + void *val; + char *newname; + + /* Link this rule into the current namespace. 
*/ + c = strrchr(rule->name, '.'); + assert(c != NULL); + newname = jsgf_fullname(jsgf, c); + + E_INFO("Imported %s\n", newname); + val = hash_table_enter(jsgf->rules, newname, + jsgf_rule_retain(rule)); + if (val != (void *)rule) { + E_WARN("Multiply defined symbol: %s\n", newname); + } + if (!import_all) { + hash_table_iter_free(itor); + return rule; + } + } + } + } + + return NULL; +} + +static void +jsgf_set_search_path(jsgf_t *jsgf, const char *filename) +{ + char *jsgf_path; + +#if !defined(_WIN32_WCE) + if ((jsgf_path = getenv("JSGF_PATH")) != NULL) { + char *word, *c; + /* FIXME: This should be a function in libsphinxbase. */ + word = jsgf_path = ckd_salloc(jsgf_path); + while ((c = strchr(word, ':'))) { + *c = '\0'; + jsgf->searchpath = glist_add_ptr(jsgf->searchpath, word); + word = c + 1; + } + jsgf->searchpath = glist_add_ptr(jsgf->searchpath, word); + jsgf->searchpath = glist_reverse(jsgf->searchpath); + return; + } +#endif + + if (!filename) { + jsgf->searchpath = glist_add_ptr(jsgf->searchpath, ckd_salloc(".")); + return; + } + + jsgf_path = ckd_salloc(filename); + path2dirname(filename, jsgf_path); + jsgf->searchpath = glist_add_ptr(jsgf->searchpath, jsgf_path); +} + +jsgf_t * +jsgf_parse_file(const char *filename, jsgf_t *parent) +{ + yyscan_t yyscanner; + jsgf_t *jsgf; + int yyrv; + FILE *in = NULL; + + yylex_init(&yyscanner); + if (filename == NULL) { + yyset_in(stdin, yyscanner); + } + else { + in = fopen(filename, "r"); + if (in == NULL) { + E_ERROR_SYSTEM("Failed to open %s for parsing", filename); + return NULL; + } + yyset_in(in, yyscanner); + } + + jsgf = jsgf_grammar_new(parent); + + if (!parent) + jsgf_set_search_path(jsgf, filename); + + yyrv = yyparse(yyscanner, jsgf); + if (yyrv != 0) { + E_ERROR("Failed to parse JSGF grammar from '%s'\n", filename ? 
filename : "(stdin)"); + jsgf_grammar_free(jsgf); + yylex_destroy(yyscanner); + return NULL; + } + if (in) + fclose(in); + yylex_destroy(yyscanner); + + return jsgf; +} + +jsgf_t * +jsgf_parse_string(const char *string, jsgf_t * parent) +{ + yyscan_t yyscanner; + jsgf_t *jsgf; + int yyrv; + YY_BUFFER_STATE buf; + + yylex_init(&yyscanner); + buf = yy_scan_string(string, yyscanner); + + jsgf = jsgf_grammar_new(parent); + if (!parent) + jsgf_set_search_path(jsgf, NULL); + + yyrv = yyparse(yyscanner, jsgf); + if (yyrv != 0) { + E_ERROR("Failed to parse JSGF grammar from input string\n"); + jsgf_grammar_free(jsgf); + yy_delete_buffer(buf, yyscanner); + yylex_destroy(yyscanner); + return NULL; + } + yy_delete_buffer(buf, yyscanner); + yylex_destroy(yyscanner); + + return jsgf; +} diff --git a/media/sphinxbase/src/libsphinxbase/lm/jsgf_internal.h b/media/sphinxbase/src/libsphinxbase/lm/jsgf_internal.h new file mode 100644 index 000000000..a5cbc9833 --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/lm/jsgf_internal.h @@ -0,0 +1,140 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 2007 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. 
+ * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ + +#ifndef __JSGF_INTERNAL_H__ +#define __JSGF_INTERNAL_H__ + +/** + * @file jsgf_internal.h Internal definitions for JSGF grammar compiler + */ + +#include <stdio.h> + +#include <sphinxbase/hash_table.h> +#include <sphinxbase/glist.h> +#include <sphinxbase/fsg_model.h> +#include <sphinxbase/logmath.h> +#include <sphinxbase/strfuncs.h> +#include <sphinxbase/jsgf.h> + + +/* Flex uses strdup which is missing on WinCE */ +#if defined(_WIN32) || defined(_WIN32_WCE) +#define strdup _strdup +#endif + +#ifdef __cplusplus +extern "C" { +#endif +#if 0 +/* Fool Emacs. */ +} +#endif + +#define YY_NO_INPUT /* Silence a compiler warning. */ + +typedef struct jsgf_rhs_s jsgf_rhs_t; +typedef struct jsgf_atom_s jsgf_atom_t; +typedef struct jsgf_link_s jsgf_link_t; +typedef struct jsgf_rule_stack_s jsgf_rule_stack_t; + +/* A grammar: header fields, rule and import tables, import search path, and scratch state used while converting a rule to an FSG. */ +struct jsgf_s { + char *version; /**< JSGF version (from header) */ + char *charset; /**< JSGF charset (default UTF-8) */ + char *locale; /**< JSGF locale (default C) */ + char *name; /**< Grammar name */ + + hash_table_t *rules; /**< Defined or imported rules in this grammar. 
*/ + hash_table_t *imports; /**< Pointers to imported grammars. */ + jsgf_t *parent; /**< Parent grammar (if this is an imported one) */ + glist_t searchpath; /**< List of directories to search for grammars. */ + + /* Scratch variables for FSG conversion. */ + int nstate; /**< Number of generated states. */ + glist_t links; /**< Generated FSG links. */ + glist_t rulestack; /**< Stack of currently expanded rules. */ +}; + +/* A type to keep track of the stack of rules currently being expanded. */ +struct jsgf_rule_stack_s { + jsgf_rule_t *rule; /**< The rule being expanded */ + int entry; /**< The entry-state for this expansion */ +}; + +/* A reference-counted rule: its name, visibility and expansion. */ +struct jsgf_rule_s { + int refcnt; /**< Reference count. */ + char *name; /**< Rule name (NULL for an alternation/grouping) */ + int is_public; /**< Is this rule marked 'public'? */ + jsgf_rhs_t *rhs; /**< Expansion */ +}; + +/* One alternate of a rule expansion, chained to the next alternate. */ +struct jsgf_rhs_s { + glist_t atoms; /**< Sequence of items */ + jsgf_rhs_t *alt; /**< Linked list of alternates */ +}; + +/* One item of an expansion: a token or a rule reference. */ +struct jsgf_atom_s { + char *name; /**< Rule or token name */ + glist_t tags; /**< Tags, if any (glist_t of char *) */ + float weight; /**< Weight (default 1) */ +}; + +/* One generated FSG transition between two states. */ +struct jsgf_link_s { + jsgf_atom_t *atom; /**< Name, tags, weight */ + int from; /**< From state */ + int to; /**< To state */ +}; + +/* Non-zero when the atom's name starts with '<', i.e. it is written as a rule reference rather than a token. */ +#define jsgf_atom_is_rule(atom) ((atom)->name[0] == '<') + +void jsgf_add_link(jsgf_t *grammar, jsgf_atom_t *atom, int from, int to); +jsgf_atom_t *jsgf_atom_new(char *name, float weight); +jsgf_atom_t *jsgf_kleene_new(jsgf_t *jsgf, jsgf_atom_t *atom, int plus); +jsgf_rule_t *jsgf_optional_new(jsgf_t *jsgf, jsgf_rhs_t *exp); +jsgf_rule_t *jsgf_define_rule(jsgf_t *jsgf, char *name, jsgf_rhs_t *rhs, int is_public); +jsgf_rule_t *jsgf_import_rule(jsgf_t *jsgf, char *name); + +int jsgf_atom_free(jsgf_atom_t *atom); +int jsgf_rule_free(jsgf_rule_t *rule); +jsgf_rule_t *jsgf_rule_retain(jsgf_rule_t *rule); + +#ifdef __cplusplus +} +#endif + + +#endif /* __JSGF_INTERNAL_H__ */ diff --git 
a/media/sphinxbase/src/libsphinxbase/lm/jsgf_parser.c b/media/sphinxbase/src/libsphinxbase/lm/jsgf_parser.c new file mode 100644 index 000000000..20acbb9d9 --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/lm/jsgf_parser.c @@ -0,0 +1,1799 @@ + +/* A Bison parser, made by GNU Bison 2.4.1. */ + +/* Skeleton implementation for Bison's Yacc-like parsers in C + + Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004, 2005, 2006 + Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* As a special exception, you may create a larger work that contains + part or all of the Bison parser skeleton and distribute that work + under terms of your choice, so long as that work isn't itself a + parser generator using the skeleton or a modified version thereof + as a parser skeleton. Alternatively, if you modify or redistribute + the parser skeleton itself, you may (at your option) remove this + special exception, which will cause the skeleton and the resulting + Bison output files to be licensed under the GNU General Public + License without this special exception. + + This special exception was added by the Free Software Foundation in + version 2.2 of Bison. */ + +/* C LALR(1) parser skeleton written by Richard Stallman, by + simplifying the original so-called "semantic" parser. 
*/ + +/* All symbols defined below should begin with yy or YY, to avoid + infringing on user name space. This should be done even for local + variables, as they might otherwise be expanded by user macros. + There are some unavoidable exceptions within include files to + define necessary library symbols; they are noted "INFRINGES ON + USER NAME SPACE" below. */ + +/* Identify Bison output. */ +#define YYBISON 1 + +/* Bison version. */ +#define YYBISON_VERSION "2.4.1" + +/* Skeleton name. */ +#define YYSKELETON_NAME "yacc.c" + +/* Pure parsers. */ +#define YYPURE 1 + +/* Push parsers. */ +#define YYPUSH 0 + +/* Pull parsers. */ +#define YYPULL 1 + +/* Using locations. */ +#define YYLSP_NEEDED 0 + + + +/* Copy the first part of user declarations. */ + +/* Line 189 of yacc.c */ +#line 37 "jsgf_parser.y" + +#define YYERROR_VERBOSE + +#include <stdio.h> +#include <string.h> + +#include <sphinxbase/hash_table.h> +#include <sphinxbase/ckd_alloc.h> +#include <sphinxbase/err.h> + +#include "jsgf_internal.h" +#include "jsgf_parser.h" +#include "jsgf_scanner.h" + +/* Suppress warnings from generated code */ +#if defined _MSC_VER +#pragma warning(disable: 4273) +#endif + +void yyerror(yyscan_t lex, jsgf_t *jsgf, const char *s); + + + +/* Line 189 of yacc.c */ +#line 97 "jsgf_parser.c" + +/* Enabling traces. */ +#ifndef YYDEBUG +# define YYDEBUG 0 +#endif + +/* Enabling verbose error messages. */ +#ifdef YYERROR_VERBOSE +# undef YYERROR_VERBOSE +# define YYERROR_VERBOSE 1 +#else +# define YYERROR_VERBOSE 0 +#endif + +/* Enabling the token table. */ +#ifndef YYTOKEN_TABLE +# define YYTOKEN_TABLE 0 +#endif + + +/* Tokens. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + /* Put the tokens into the symbol table, so that GDB and other debuggers + know about them. */ + enum yytokentype { + HEADER = 258, + GRAMMAR = 259, + IMPORT = 260, + PUBLIC = 261, + TOKEN = 262, + RULENAME = 263, + TAG = 264, + WEIGHT = 265 + }; +#endif +/* Tokens. 
*/ +#define HEADER 258 +#define GRAMMAR 259 +#define IMPORT 260 +#define PUBLIC 261 +#define TOKEN 262 +#define RULENAME 263 +#define TAG 264 +#define WEIGHT 265 + + + + +#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED +typedef union YYSTYPE +{ + +/* Line 214 of yacc.c */ +#line 65 "jsgf_parser.y" + + char *name; + float weight; + jsgf_rule_t *rule; + jsgf_rhs_t *rhs; + jsgf_atom_t *atom; + + + +/* Line 214 of yacc.c */ +#line 163 "jsgf_parser.c" +} YYSTYPE; +# define YYSTYPE_IS_TRIVIAL 1 +# define yystype YYSTYPE /* obsolescent; will be withdrawn */ +# define YYSTYPE_IS_DECLARED 1 +#endif + + +/* Copy the second part of user declarations. */ + + +/* Line 264 of yacc.c */ +#line 175 "jsgf_parser.c" + +#ifdef short +# undef short +#endif + +#ifdef YYTYPE_UINT8 +typedef YYTYPE_UINT8 yytype_uint8; +#else +typedef unsigned char yytype_uint8; +#endif + +#ifdef YYTYPE_INT8 +typedef YYTYPE_INT8 yytype_int8; +#elif (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +typedef signed char yytype_int8; +#else +typedef short int yytype_int8; +#endif + +#ifdef YYTYPE_UINT16 +typedef YYTYPE_UINT16 yytype_uint16; +#else +typedef unsigned short int yytype_uint16; +#endif + +#ifdef YYTYPE_INT16 +typedef YYTYPE_INT16 yytype_int16; +#else +typedef short int yytype_int16; +#endif + +#ifndef YYSIZE_T +# ifdef __SIZE_TYPE__ +# define YYSIZE_T __SIZE_TYPE__ +# elif defined size_t +# define YYSIZE_T size_t +# elif ! 
defined YYSIZE_T && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +# include <stddef.h> /* INFRINGES ON USER NAME SPACE */ +# define YYSIZE_T size_t +# else +# define YYSIZE_T unsigned int +# endif +#endif + +#define YYSIZE_MAXIMUM ((YYSIZE_T) -1) + +#ifndef YY_ +# if YYENABLE_NLS +# if ENABLE_NLS +# include <libintl.h> /* INFRINGES ON USER NAME SPACE */ +# define YY_(msgid) dgettext ("bison-runtime", msgid) +# endif +# endif +# ifndef YY_ +# define YY_(msgid) msgid +# endif +#endif + +/* Suppress unused-variable warnings by "using" E. */ +#if ! defined lint || defined __GNUC__ +# define YYUSE(e) ((void) (e)) +#else +# define YYUSE(e) /* empty */ +#endif + +/* Identity function, used to suppress warnings about constant conditions. */ +#ifndef lint +# define YYID(n) (n) +#else +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static int +YYID (int yyi) +#else +static int +YYID (yyi) + int yyi; +#endif +{ + return yyi; +} +#endif + +#if ! defined yyoverflow || YYERROR_VERBOSE + +/* The parser invokes alloca or malloc; define the necessary symbols. */ + +# ifdef YYSTACK_USE_ALLOCA +# if YYSTACK_USE_ALLOCA +# ifdef __GNUC__ +# define YYSTACK_ALLOC __builtin_alloca +# elif defined __BUILTIN_VA_ARG_INCR +# include <alloca.h> /* INFRINGES ON USER NAME SPACE */ +# elif defined _AIX +# define YYSTACK_ALLOC __alloca +# elif defined _MSC_VER +# include <malloc.h> /* INFRINGES ON USER NAME SPACE */ +# define alloca _alloca +# else +# define YYSTACK_ALLOC alloca +# if ! defined _ALLOCA_H && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +# include <stdlib.h> /* INFRINGES ON USER NAME SPACE */ +# ifndef _STDLIB_H +# define _STDLIB_H 1 +# endif +# endif +# endif +# endif +# endif + +# ifdef YYSTACK_ALLOC + /* Pacify GCC's `empty if-body' warning. 
*/ +# define YYSTACK_FREE(Ptr) do { /* empty */; } while (YYID (0)) +# ifndef YYSTACK_ALLOC_MAXIMUM + /* The OS might guarantee only one guard page at the bottom of the stack, + and a page size can be as small as 4096 bytes. So we cannot safely + invoke alloca (N) if N exceeds 4096. Use a slightly smaller number + to allow for a few compiler-allocated temporary stack slots. */ +# define YYSTACK_ALLOC_MAXIMUM 4032 /* reasonable circa 2006 */ +# endif +# else +# define YYSTACK_ALLOC YYMALLOC +# define YYSTACK_FREE YYFREE +# ifndef YYSTACK_ALLOC_MAXIMUM +# define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM +# endif +# if (defined __cplusplus && ! defined _STDLIB_H \ + && ! ((defined YYMALLOC || defined malloc) \ + && (defined YYFREE || defined free))) +# include <stdlib.h> /* INFRINGES ON USER NAME SPACE */ +# ifndef _STDLIB_H +# define _STDLIB_H 1 +# endif +# endif +# ifndef YYMALLOC +# define YYMALLOC malloc +# if ! defined malloc && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +void *malloc (YYSIZE_T); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# ifndef YYFREE +# define YYFREE free +# if ! defined free && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +void free (void *); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# endif +#endif /* ! defined yyoverflow || YYERROR_VERBOSE */ + + +#if (! defined yyoverflow \ + && (! defined __cplusplus \ + || (defined YYSTYPE_IS_TRIVIAL && YYSTYPE_IS_TRIVIAL))) + +/* A type that is properly aligned for any stack member. */ +union yyalloc +{ + yytype_int16 yyss_alloc; + YYSTYPE yyvs_alloc; +}; + +/* The size of the maximum gap between one aligned stack and the next. */ +# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1) + +/* The size of an array large to enough to hold all stacks, each with + N elements. 
*/ +# define YYSTACK_BYTES(N) \ + ((N) * (sizeof (yytype_int16) + sizeof (YYSTYPE)) \ + + YYSTACK_GAP_MAXIMUM) + +/* Copy COUNT objects from FROM to TO. The source and destination do + not overlap. */ +# ifndef YYCOPY +# if defined __GNUC__ && 1 < __GNUC__ +# define YYCOPY(To, From, Count) \ + __builtin_memcpy (To, From, (Count) * sizeof (*(From))) +# else +# define YYCOPY(To, From, Count) \ + do \ + { \ + YYSIZE_T yyi; \ + for (yyi = 0; yyi < (Count); yyi++) \ + (To)[yyi] = (From)[yyi]; \ + } \ + while (YYID (0)) +# endif +# endif + +/* Relocate STACK from its old location to the new one. The + local variables YYSIZE and YYSTACKSIZE give the old and new number of + elements in the stack, and YYPTR gives the new location of the + stack. Advance YYPTR to a properly aligned location for the next + stack. */ +# define YYSTACK_RELOCATE(Stack_alloc, Stack) \ + do \ + { \ + YYSIZE_T yynewbytes; \ + YYCOPY (&yyptr->Stack_alloc, Stack, yysize); \ + Stack = &yyptr->Stack_alloc; \ + yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \ + yyptr += yynewbytes / sizeof (*yyptr); \ + } \ + while (YYID (0)) + +#endif + +/* YYFINAL -- State number of the termination state. */ +#define YYFINAL 7 +/* YYLAST -- Last index in YYTABLE. */ +#define YYLAST 54 + +/* YYNTOKENS -- Number of terminals. */ +#define YYNTOKENS 20 +/* YYNNTS -- Number of nonterminals. */ +#define YYNNTS 16 +/* YYNRULES -- Number of rules. */ +#define YYNRULES 33 +/* YYNRULES -- Number of states. */ +#define YYNSTATES 58 + +/* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX. */ +#define YYUNDEFTOK 2 +#define YYMAXUTOK 265 + +#define YYTRANSLATE(YYX) \ + ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK) + +/* YYTRANSLATE[YYLEX] -- Bison symbol number corresponding to YYLEX. 
*/ +static const yytype_uint8 yytranslate[] = +{ + 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 14, 15, 18, 19, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 11, + 2, 12, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 16, 2, 17, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 13, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10 +}; + +#if YYDEBUG +/* YYPRHS[YYN] -- Index of the first RHS symbol of rule number YYN in + YYRHS. */ +static const yytype_uint8 yyprhs[] = +{ + 0, 0, 3, 5, 8, 12, 15, 18, 22, 27, + 33, 37, 39, 42, 46, 48, 51, 56, 62, 64, + 68, 70, 73, 75, 78, 80, 83, 87, 91, 93, + 95, 97, 99, 102 +}; + +/* YYRHS -- A `-1'-separated list of the rules' RHS. */ +static const yytype_int8 yyrhs[] = +{ + 21, 0, -1, 22, -1, 22, 27, -1, 22, 25, + 27, -1, 23, 24, -1, 3, 11, -1, 3, 7, + 11, -1, 3, 7, 7, 11, -1, 3, 7, 7, + 7, 11, -1, 4, 7, 11, -1, 26, -1, 25, + 26, -1, 5, 8, 11, -1, 28, -1, 27, 28, + -1, 8, 12, 29, 11, -1, 6, 8, 12, 29, + 11, -1, 30, -1, 29, 13, 30, -1, 31, -1, + 30, 31, -1, 32, -1, 31, 9, -1, 35, -1, + 10, 35, -1, 14, 29, 15, -1, 16, 29, 17, + -1, 7, -1, 8, -1, 33, -1, 34, -1, 35, + 18, -1, 35, 19, -1 +}; + +/* YYRLINE[YYN] -- source line where rule number YYN was defined. 
*/ +static const yytype_uint8 yyrline[] = +{ + 0, 82, 82, 83, 84, 87, 90, 91, 92, 93, + 97, 100, 101, 104, 107, 108, 111, 112, 115, 116, + 121, 123, 127, 128, 132, 133, 136, 139, 142, 143, + 144, 145, 146, 147 +}; +#endif + +#if YYDEBUG || YYERROR_VERBOSE || YYTOKEN_TABLE +/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM. + First, the terminals, then, starting at YYNTOKENS, nonterminals. */ +static const char *const yytname[] = +{ + "$end", "error", "$undefined", "HEADER", "GRAMMAR", "IMPORT", "PUBLIC", + "TOKEN", "RULENAME", "TAG", "WEIGHT", "';'", "'='", "'|'", "'('", "')'", + "'['", "']'", "'*'", "'+'", "$accept", "grammar", "header", + "jsgf_header", "grammar_header", "import_header", "import_statement", + "rule_list", "rule", "alternate_list", "rule_expansion", + "tagged_rule_item", "rule_item", "rule_group", "rule_optional", + "rule_atom", 0 +}; +#endif + +# ifdef YYPRINT +/* YYTOKNUM[YYLEX-NUM] -- Internal token number corresponding to + token YYLEX-NUM. */ +static const yytype_uint16 yytoknum[] = +{ + 0, 256, 257, 258, 259, 260, 261, 262, 263, 264, + 265, 59, 61, 124, 40, 41, 91, 93, 42, 43 +}; +# endif + +/* YYR1[YYN] -- Symbol number of symbol that rule YYN derives. */ +static const yytype_uint8 yyr1[] = +{ + 0, 20, 21, 21, 21, 22, 23, 23, 23, 23, + 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, + 30, 30, 31, 31, 32, 32, 33, 34, 35, 35, + 35, 35, 35, 35 +}; + +/* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN. */ +static const yytype_uint8 yyr2[] = +{ + 0, 2, 1, 2, 3, 2, 2, 3, 4, 5, + 3, 1, 2, 3, 1, 2, 4, 5, 1, 3, + 1, 2, 1, 2, 1, 2, 3, 3, 1, 1, + 1, 1, 2, 2 +}; + +/* YYDEFACT[STATE-NAME] -- Default rule to reduce with in state + STATE-NUM when YYTABLE doesn't specify something else to do. Zero + means the default is an error. 
*/ +static const yytype_uint8 yydefact[] = +{ + 0, 0, 0, 2, 0, 0, 6, 1, 0, 0, + 0, 0, 11, 3, 14, 0, 5, 0, 7, 0, + 0, 0, 12, 4, 15, 0, 0, 8, 13, 0, + 28, 29, 0, 0, 0, 0, 18, 20, 22, 30, + 31, 24, 10, 9, 0, 25, 0, 0, 16, 0, + 21, 23, 32, 33, 17, 26, 27, 19 +}; + +/* YYDEFGOTO[NTERM-NUM]. */ +static const yytype_int8 yydefgoto[] = +{ + -1, 2, 3, 4, 16, 11, 12, 13, 14, 35, + 36, 37, 38, 39, 40, 41 +}; + +/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing + STATE-NUM. */ +#define YYPACT_NINF -37 +static const yytype_int8 yypact[] = +{ + -1, -2, 36, 22, 35, 8, -37, -37, 32, 33, + 30, 22, -37, 17, -37, 37, -37, 13, -37, 34, + 31, -4, -37, 17, -37, 38, 39, -37, -37, -4, + -37, -37, 0, -4, -4, 18, -4, 42, -37, -37, + -37, 19, -37, -37, 21, 19, 20, 9, -37, -4, + 42, -37, -37, -37, -37, -37, -37, -4 +}; + +/* YYPGOTO[NTERM-NUM]. */ +static const yytype_int8 yypgoto[] = +{ + -37, -37, -37, -37, -37, -37, 41, 43, -12, -16, + -3, -36, -37, -37, -37, 15 +}; + +/* YYTABLE[YYPACT[STATE-NUM]]. What to do in state STATE-NUM. If + positive, shift that token. If negative, reduce the rule which + number is the opposite. If zero, do what YYDEFACT says. + If YYTABLE_NINF, syntax error. */ +#define YYTABLE_NINF -1 +static const yytype_uint8 yytable[] = +{ + 50, 24, 1, 30, 31, 5, 32, 30, 31, 6, + 33, 24, 34, 44, 33, 17, 34, 46, 47, 18, + 26, 50, 49, 9, 27, 10, 56, 8, 9, 48, + 10, 49, 54, 49, 49, 55, 7, 52, 53, 15, + 19, 20, 21, 29, 25, 28, 57, 45, 0, 42, + 43, 51, 22, 0, 23 +}; + +static const yytype_int8 yycheck[] = +{ + 36, 13, 3, 7, 8, 7, 10, 7, 8, 11, + 14, 23, 16, 29, 14, 7, 16, 33, 34, 11, + 7, 57, 13, 6, 11, 8, 17, 5, 6, 11, + 8, 13, 11, 13, 13, 15, 0, 18, 19, 4, + 8, 8, 12, 12, 7, 11, 49, 32, -1, 11, + 11, 9, 11, -1, 11 +}; + +/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing + symbol of state STATE-NUM. 
*/ +static const yytype_uint8 yystos[] = +{ + 0, 3, 21, 22, 23, 7, 11, 0, 5, 6, + 8, 25, 26, 27, 28, 4, 24, 7, 11, 8, + 8, 12, 26, 27, 28, 7, 7, 11, 11, 12, + 7, 8, 10, 14, 16, 29, 30, 31, 32, 33, + 34, 35, 11, 11, 29, 35, 29, 29, 11, 13, + 31, 9, 18, 19, 11, 15, 17, 30 +}; + +#define yyerrok (yyerrstatus = 0) +#define yyclearin (yychar = YYEMPTY) +#define YYEMPTY (-2) +#define YYEOF 0 + +#define YYACCEPT goto yyacceptlab +#define YYABORT goto yyabortlab +#define YYERROR goto yyerrorlab + + +/* Like YYERROR except do call yyerror. This remains here temporarily + to ease the transition to the new meaning of YYERROR, for GCC. + Once GCC version 2 has supplanted version 1, this can go. */ + +#define YYFAIL goto yyerrlab + +#define YYRECOVERING() (!!yyerrstatus) + +#define YYBACKUP(Token, Value) \ +do \ + if (yychar == YYEMPTY && yylen == 1) \ + { \ + yychar = (Token); \ + yylval = (Value); \ + yytoken = YYTRANSLATE (yychar); \ + YYPOPSTACK (1); \ + goto yybackup; \ + } \ + else \ + { \ + yyerror (yyscanner, jsgf, YY_("syntax error: cannot back up")); \ + YYERROR; \ + } \ +while (YYID (0)) + + +#define YYTERROR 1 +#define YYERRCODE 256 + + +/* YYLLOC_DEFAULT -- Set CURRENT to span from RHS[1] to RHS[N]. + If N is 0, then set CURRENT to the empty location which ends + the previous symbol: RHS[0] (always defined). */ + +#define YYRHSLOC(Rhs, K) ((Rhs)[K]) +#ifndef YYLLOC_DEFAULT +# define YYLLOC_DEFAULT(Current, Rhs, N) \ + do \ + if (YYID (N)) \ + { \ + (Current).first_line = YYRHSLOC (Rhs, 1).first_line; \ + (Current).first_column = YYRHSLOC (Rhs, 1).first_column; \ + (Current).last_line = YYRHSLOC (Rhs, N).last_line; \ + (Current).last_column = YYRHSLOC (Rhs, N).last_column; \ + } \ + else \ + { \ + (Current).first_line = (Current).last_line = \ + YYRHSLOC (Rhs, 0).last_line; \ + (Current).first_column = (Current).last_column = \ + YYRHSLOC (Rhs, 0).last_column; \ + } \ + while (YYID (0)) +#endif + + +/* YY_LOCATION_PRINT -- Print the location on the stream. 
+ This macro was not mandated originally: define only if we know + we won't break user code: when these are the locations we know. */ + +#ifndef YY_LOCATION_PRINT +# if YYLTYPE_IS_TRIVIAL +# define YY_LOCATION_PRINT(File, Loc) \ + fprintf (File, "%d.%d-%d.%d", \ + (Loc).first_line, (Loc).first_column, \ + (Loc).last_line, (Loc).last_column) +# else +# define YY_LOCATION_PRINT(File, Loc) ((void) 0) +# endif +#endif + + +/* YYLEX -- calling `yylex' with the right arguments. */ + +#ifdef YYLEX_PARAM +# define YYLEX yylex (&yylval, YYLEX_PARAM) +#else +# define YYLEX yylex (&yylval, yyscanner) +#endif + +/* Enable debugging if requested. */ +#if YYDEBUG + +# ifndef YYFPRINTF +# include <stdio.h> /* INFRINGES ON USER NAME SPACE */ +# define YYFPRINTF fprintf +# endif + +# define YYDPRINTF(Args) \ +do { \ + if (yydebug) \ + YYFPRINTF Args; \ +} while (YYID (0)) + +# define YY_SYMBOL_PRINT(Title, Type, Value, Location) \ +do { \ + if (yydebug) \ + { \ + YYFPRINTF (stderr, "%s ", Title); \ + yy_symbol_print (stderr, \ + Type, Value, yyscanner, jsgf); \ + YYFPRINTF (stderr, "\n"); \ + } \ +} while (YYID (0)) + + +/*--------------------------------. +| Print this symbol on YYOUTPUT. | +`--------------------------------*/ + +/*ARGSUSED*/ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_symbol_value_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep, void* yyscanner, jsgf_t *jsgf) +#else +static void +yy_symbol_value_print (yyoutput, yytype, yyvaluep, yyscanner, jsgf) + FILE *yyoutput; + int yytype; + YYSTYPE const * const yyvaluep; + void* yyscanner; + jsgf_t *jsgf; +#endif +{ + if (!yyvaluep) + return; + YYUSE (yyscanner); + YYUSE (jsgf); +# ifdef YYPRINT + if (yytype < YYNTOKENS) + YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep); +# else + YYUSE (yyoutput); +# endif + switch (yytype) + { + default: + break; + } +} + + +/*--------------------------------. +| Print this symbol on YYOUTPUT. 
| +`--------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_symbol_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep, void* yyscanner, jsgf_t *jsgf) +#else +static void +yy_symbol_print (yyoutput, yytype, yyvaluep, yyscanner, jsgf) + FILE *yyoutput; + int yytype; + YYSTYPE const * const yyvaluep; + void* yyscanner; + jsgf_t *jsgf; +#endif +{ + if (yytype < YYNTOKENS) + YYFPRINTF (yyoutput, "token %s (", yytname[yytype]); + else + YYFPRINTF (yyoutput, "nterm %s (", yytname[yytype]); + + yy_symbol_value_print (yyoutput, yytype, yyvaluep, yyscanner, jsgf); + YYFPRINTF (yyoutput, ")"); +} + +/*------------------------------------------------------------------. +| yy_stack_print -- Print the state stack from its BOTTOM up to its | +| TOP (included). | +`------------------------------------------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_stack_print (yytype_int16 *yybottom, yytype_int16 *yytop) +#else +static void +yy_stack_print (yybottom, yytop) + yytype_int16 *yybottom; + yytype_int16 *yytop; +#endif +{ + YYFPRINTF (stderr, "Stack now"); + for (; yybottom <= yytop; yybottom++) + { + int yybot = *yybottom; + YYFPRINTF (stderr, " %d", yybot); + } + YYFPRINTF (stderr, "\n"); +} + +# define YY_STACK_PRINT(Bottom, Top) \ +do { \ + if (yydebug) \ + yy_stack_print ((Bottom), (Top)); \ +} while (YYID (0)) + + +/*------------------------------------------------. +| Report that the YYRULE is going to be reduced. 
| +`------------------------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_reduce_print (YYSTYPE *yyvsp, int yyrule, void* yyscanner, jsgf_t *jsgf) +#else +static void +yy_reduce_print (yyvsp, yyrule, yyscanner, jsgf) + YYSTYPE *yyvsp; + int yyrule; + void* yyscanner; + jsgf_t *jsgf; +#endif +{ + int yynrhs = yyr2[yyrule]; + int yyi; + unsigned long int yylno = yyrline[yyrule]; + YYFPRINTF (stderr, "Reducing stack by rule %d (line %lu):\n", + yyrule - 1, yylno); + /* The symbols being reduced. */ + for (yyi = 0; yyi < yynrhs; yyi++) + { + YYFPRINTF (stderr, " $%d = ", yyi + 1); + yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi], + &(yyvsp[(yyi + 1) - (yynrhs)]) + , yyscanner, jsgf); + YYFPRINTF (stderr, "\n"); + } +} + +# define YY_REDUCE_PRINT(Rule) \ +do { \ + if (yydebug) \ + yy_reduce_print (yyvsp, Rule, yyscanner, jsgf); \ +} while (YYID (0)) + +/* Nonzero means print parse trace. It is left uninitialized so that + multiple parsers can coexist. */ +int yydebug; +#else /* !YYDEBUG */ +# define YYDPRINTF(Args) +# define YY_SYMBOL_PRINT(Title, Type, Value, Location) +# define YY_STACK_PRINT(Bottom, Top) +# define YY_REDUCE_PRINT(Rule) +#endif /* !YYDEBUG */ + + +/* YYINITDEPTH -- initial size of the parser's stacks. */ +#ifndef YYINITDEPTH +# define YYINITDEPTH 200 +#endif + +/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only + if the built-in stack extension method is used). + + Do not make this value too large; the results are undefined if + YYSTACK_ALLOC_MAXIMUM < YYSTACK_BYTES (YYMAXDEPTH) + evaluated with infinite-precision integer arithmetic. */ + +#ifndef YYMAXDEPTH +# define YYMAXDEPTH 10000 +#endif + + + +#if YYERROR_VERBOSE + +# ifndef yystrlen +# if defined __GLIBC__ && defined _STRING_H +# define yystrlen strlen +# else +/* Return the length of YYSTR. 
*/ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static YYSIZE_T +yystrlen (const char *yystr) +#else +static YYSIZE_T +yystrlen (yystr) + const char *yystr; +#endif +{ + YYSIZE_T yylen; + for (yylen = 0; yystr[yylen]; yylen++) + continue; + return yylen; +} +# endif +# endif + +# ifndef yystpcpy +# if defined __GLIBC__ && defined _STRING_H && defined _GNU_SOURCE +# define yystpcpy stpcpy +# else +/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in + YYDEST. */ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static char * +yystpcpy (char *yydest, const char *yysrc) +#else +static char * +yystpcpy (yydest, yysrc) + char *yydest; + const char *yysrc; +#endif +{ + char *yyd = yydest; + const char *yys = yysrc; + + while ((*yyd++ = *yys++) != '\0') + continue; + + return yyd - 1; +} +# endif +# endif + +# ifndef yytnamerr +/* Copy to YYRES the contents of YYSTR after stripping away unnecessary + quotes and backslashes, so that it's suitable for yyerror. The + heuristic is that double-quoting is unnecessary unless the string + contains an apostrophe, a comma, or backslash (other than + backslash-backslash). YYSTR is taken from yytname. If YYRES is + null, do not copy; instead, return the length of what the result + would have been. */ +static YYSIZE_T +yytnamerr (char *yyres, const char *yystr) +{ + if (*yystr == '"') + { + YYSIZE_T yyn = 0; + char const *yyp = yystr; + + for (;;) + switch (*++yyp) + { + case '\'': + case ',': + goto do_not_strip_quotes; + + case '\\': + if (*++yyp != '\\') + goto do_not_strip_quotes; + /* Fall through. */ + default: + if (yyres) + yyres[yyn] = *yyp; + yyn++; + break; + + case '"': + if (yyres) + yyres[yyn] = '\0'; + return yyn; + } + do_not_strip_quotes: ; + } + + if (! 
yyres) + return yystrlen (yystr); + + return yystpcpy (yyres, yystr) - yyres; +} +# endif + +/* Copy into YYRESULT an error message about the unexpected token + YYCHAR while in state YYSTATE. Return the number of bytes copied, + including the terminating null byte. If YYRESULT is null, do not + copy anything; just return the number of bytes that would be + copied. As a special case, return 0 if an ordinary "syntax error" + message will do. Return YYSIZE_MAXIMUM if overflow occurs during + size calculation. */ +static YYSIZE_T +yysyntax_error (char *yyresult, int yystate, int yychar) +{ + int yyn = yypact[yystate]; + + if (! (YYPACT_NINF < yyn && yyn <= YYLAST)) + return 0; + else + { + int yytype = YYTRANSLATE (yychar); + YYSIZE_T yysize0 = yytnamerr (0, yytname[yytype]); + YYSIZE_T yysize = yysize0; + YYSIZE_T yysize1; + int yysize_overflow = 0; + enum { YYERROR_VERBOSE_ARGS_MAXIMUM = 5 }; + char const *yyarg[YYERROR_VERBOSE_ARGS_MAXIMUM]; + int yyx; + +# if 0 + /* This is so xgettext sees the translatable formats that are + constructed on the fly. */ + YY_("syntax error, unexpected %s"); + YY_("syntax error, unexpected %s, expecting %s"); + YY_("syntax error, unexpected %s, expecting %s or %s"); + YY_("syntax error, unexpected %s, expecting %s or %s or %s"); + YY_("syntax error, unexpected %s, expecting %s or %s or %s or %s"); +# endif + char *yyfmt; + char const *yyf; + static char const yyunexpected[] = "syntax error, unexpected %s"; + static char const yyexpecting[] = ", expecting %s"; + static char const yyor[] = " or %s"; + char yyformat[sizeof yyunexpected + + sizeof yyexpecting - 1 + + ((YYERROR_VERBOSE_ARGS_MAXIMUM - 2) + * (sizeof yyor - 1))]; + char const *yyprefix = yyexpecting; + + /* Start YYX at -YYN if negative to avoid negative indexes in + YYCHECK. */ + int yyxbegin = yyn < 0 ? -yyn : 0; + + /* Stay within bounds of both yycheck and yytname. */ + int yychecklim = YYLAST - yyn + 1; + int yyxend = yychecklim < YYNTOKENS ? 
yychecklim : YYNTOKENS; + int yycount = 1; + + yyarg[0] = yytname[yytype]; + yyfmt = yystpcpy (yyformat, yyunexpected); + + for (yyx = yyxbegin; yyx < yyxend; ++yyx) + if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR) + { + if (yycount == YYERROR_VERBOSE_ARGS_MAXIMUM) + { + yycount = 1; + yysize = yysize0; + yyformat[sizeof yyunexpected - 1] = '\0'; + break; + } + yyarg[yycount++] = yytname[yyx]; + yysize1 = yysize + yytnamerr (0, yytname[yyx]); + yysize_overflow |= (yysize1 < yysize); + yysize = yysize1; + yyfmt = yystpcpy (yyfmt, yyprefix); + yyprefix = yyor; + } + + yyf = YY_(yyformat); + yysize1 = yysize + yystrlen (yyf); + yysize_overflow |= (yysize1 < yysize); + yysize = yysize1; + + if (yysize_overflow) + return YYSIZE_MAXIMUM; + + if (yyresult) + { + /* Avoid sprintf, as that infringes on the user's name space. + Don't have undefined behavior even if the translation + produced a string with the wrong number of "%s"s. */ + char *yyp = yyresult; + int yyi = 0; + while ((*yyp = *yyf) != '\0') + { + if (*yyp == '%' && yyf[1] == 's' && yyi < yycount) + { + yyp += yytnamerr (yyp, yyarg[yyi++]); + yyf += 2; + } + else + { + yyp++; + yyf++; + } + } + } + return yysize; + } +} +#endif /* YYERROR_VERBOSE */ + + +/*-----------------------------------------------. +| Release the memory associated to this symbol. 
| +`-----------------------------------------------*/ + +/*ARGSUSED*/ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep, void* yyscanner, jsgf_t *jsgf) +#else +static void +yydestruct (yymsg, yytype, yyvaluep, yyscanner, jsgf) + const char *yymsg; + int yytype; + YYSTYPE *yyvaluep; + void* yyscanner; + jsgf_t *jsgf; +#endif +{ + YYUSE (yyvaluep); + YYUSE (yyscanner); + YYUSE (jsgf); + + if (!yymsg) + yymsg = "Deleting"; + YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp); + + switch (yytype) + { + + default: + break; + } +} + +/* Prevent warnings from -Wmissing-prototypes. */ +#ifdef YYPARSE_PARAM +#if defined __STDC__ || defined __cplusplus +int yyparse (void *YYPARSE_PARAM); +#else +int yyparse (); +#endif +#else /* ! YYPARSE_PARAM */ +#if defined __STDC__ || defined __cplusplus +int yyparse (void* yyscanner, jsgf_t *jsgf); +#else +int yyparse (); +#endif +#endif /* ! YYPARSE_PARAM */ + + + + + +/*-------------------------. +| yyparse or yypush_parse. | +`-------------------------*/ + +#ifdef YYPARSE_PARAM +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +int +yyparse (void *YYPARSE_PARAM) +#else +int +yyparse (YYPARSE_PARAM) + void *YYPARSE_PARAM; +#endif +#else /* ! YYPARSE_PARAM */ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +int +yyparse (void* yyscanner, jsgf_t *jsgf) +#else +int +yyparse (yyscanner, jsgf) + void* yyscanner; + jsgf_t *jsgf; +#endif +#endif +{ +/* The lookahead symbol. */ +int yychar; + +/* The semantic value of the lookahead symbol. */ +YYSTYPE yylval; + + /* Number of syntax errors so far. */ + int yynerrs; + + int yystate; + /* Number of tokens to shift before error messages enabled. */ + int yyerrstatus; + + /* The stacks and their tools: + `yyss': related to states. + `yyvs': related to semantic values. 
+ + Refer to the stacks thru separate pointers, to allow yyoverflow + to reallocate them elsewhere. */ + + /* The state stack. */ + yytype_int16 yyssa[YYINITDEPTH]; + yytype_int16 *yyss; + yytype_int16 *yyssp; + + /* The semantic value stack. */ + YYSTYPE yyvsa[YYINITDEPTH]; + YYSTYPE *yyvs; + YYSTYPE *yyvsp; + + YYSIZE_T yystacksize; + + int yyn; + int yyresult; + /* Lookahead token as an internal (translated) token number. */ + int yytoken; + /* The variables used to return semantic value and location from the + action routines. */ + YYSTYPE yyval; + +#if YYERROR_VERBOSE + /* Buffer for error messages, and its allocated size. */ + char yymsgbuf[128]; + char *yymsg = yymsgbuf; + YYSIZE_T yymsg_alloc = sizeof yymsgbuf; +#endif + +#define YYPOPSTACK(N) (yyvsp -= (N), yyssp -= (N)) + + /* The number of symbols on the RHS of the reduced rule. + Keep to zero when no symbol should be popped. */ + int yylen = 0; + + yytoken = 0; + yyss = yyssa; + yyvs = yyvsa; + yystacksize = YYINITDEPTH; + + YYDPRINTF ((stderr, "Starting parse\n")); + + yystate = 0; + yyerrstatus = 0; + yynerrs = 0; + yychar = YYEMPTY; /* Cause a token to be read. */ + + /* Initialize stack pointers. + Waste one element of value and location stack + so that they stay on the same level as the state stack. + The wasted elements are never initialized. */ + yyssp = yyss; + yyvsp = yyvs; + + goto yysetstate; + +/*------------------------------------------------------------. +| yynewstate -- Push a new state, which is found in yystate. | +`------------------------------------------------------------*/ + yynewstate: + /* In all cases, when you get here, the value and location stacks + have just been pushed. So pushing a state here evens the stacks. */ + yyssp++; + + yysetstate: + *yyssp = yystate; + + if (yyss + yystacksize - 1 <= yyssp) + { + /* Get the current used size of the three stacks, in elements. 
*/ + YYSIZE_T yysize = yyssp - yyss + 1; + +#ifdef yyoverflow + { + /* Give user a chance to reallocate the stack. Use copies of + these so that the &'s don't force the real ones into + memory. */ + YYSTYPE *yyvs1 = yyvs; + yytype_int16 *yyss1 = yyss; + + /* Each stack pointer address is followed by the size of the + data in use in that stack, in bytes. This used to be a + conditional around just the two extra args, but that might + be undefined if yyoverflow is a macro. */ + yyoverflow (YY_("memory exhausted"), + &yyss1, yysize * sizeof (*yyssp), + &yyvs1, yysize * sizeof (*yyvsp), + &yystacksize); + + yyss = yyss1; + yyvs = yyvs1; + } +#else /* no yyoverflow */ +# ifndef YYSTACK_RELOCATE + goto yyexhaustedlab; +# else + /* Extend the stack our own way. */ + if (YYMAXDEPTH <= yystacksize) + goto yyexhaustedlab; + yystacksize *= 2; + if (YYMAXDEPTH < yystacksize) + yystacksize = YYMAXDEPTH; + + { + yytype_int16 *yyss1 = yyss; + union yyalloc *yyptr = + (union yyalloc *) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize)); + if (! yyptr) + goto yyexhaustedlab; + YYSTACK_RELOCATE (yyss_alloc, yyss); + YYSTACK_RELOCATE (yyvs_alloc, yyvs); +# undef YYSTACK_RELOCATE + if (yyss1 != yyssa) + YYSTACK_FREE (yyss1); + } +# endif +#endif /* no yyoverflow */ + + yyssp = yyss + yysize - 1; + yyvsp = yyvs + yysize - 1; + + YYDPRINTF ((stderr, "Stack size increased to %lu\n", + (unsigned long int) yystacksize)); + + if (yyss + yystacksize - 1 <= yyssp) + YYABORT; + } + + YYDPRINTF ((stderr, "Entering state %d\n", yystate)); + + if (yystate == YYFINAL) + YYACCEPT; + + goto yybackup; + +/*-----------. +| yybackup. | +`-----------*/ +yybackup: + + /* Do appropriate processing given the current state. Read a + lookahead token if we need one and don't already have one. */ + + /* First try to decide what to do without reference to lookahead token. */ + yyn = yypact[yystate]; + if (yyn == YYPACT_NINF) + goto yydefault; + + /* Not known => get a lookahead token if don't already have one. 
*/ + + /* YYCHAR is either YYEMPTY or YYEOF or a valid lookahead symbol. */ + if (yychar == YYEMPTY) + { + YYDPRINTF ((stderr, "Reading a token: ")); + yychar = YYLEX; + } + + if (yychar <= YYEOF) + { + yychar = yytoken = YYEOF; + YYDPRINTF ((stderr, "Now at end of input.\n")); + } + else + { + yytoken = YYTRANSLATE (yychar); + YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc); + } + + /* If the proper action on seeing token YYTOKEN is to reduce or to + detect an error, take that action. */ + yyn += yytoken; + if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken) + goto yydefault; + yyn = yytable[yyn]; + if (yyn <= 0) + { + if (yyn == 0 || yyn == YYTABLE_NINF) + goto yyerrlab; + yyn = -yyn; + goto yyreduce; + } + + /* Count tokens shifted since error; after three, turn off error + status. */ + if (yyerrstatus) + yyerrstatus--; + + /* Shift the lookahead token. */ + YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc); + + /* Discard the shifted token. */ + yychar = YYEMPTY; + + yystate = yyn; + *++yyvsp = yylval; + + goto yynewstate; + + +/*-----------------------------------------------------------. +| yydefault -- do the default action for the current state. | +`-----------------------------------------------------------*/ +yydefault: + yyn = yydefact[yystate]; + if (yyn == 0) + goto yyerrlab; + goto yyreduce; + + +/*-----------------------------. +| yyreduce -- Do a reduction. | +`-----------------------------*/ +yyreduce: + /* yyn is the number of a rule to reduce with. */ + yylen = yyr2[yyn]; + + /* If YYLEN is nonzero, implement the default value of the action: + `$$ = $1'. + + Otherwise, the following line sets YYVAL to garbage. + This behavior is undocumented and Bison + users should not rely upon it. Assigning to YYVAL + unconditionally makes the parser a bit smaller, and it avoids a + GCC warning that YYVAL may be used uninitialized. 
*/ + yyval = yyvsp[1-yylen]; + + + YY_REDUCE_PRINT (yyn); + switch (yyn) + { + case 5: + +/* Line 1455 of yacc.c */ +#line 87 "jsgf_parser.y" + { jsgf->name = (yyvsp[(2) - (2)].name); } + break; + + case 7: + +/* Line 1455 of yacc.c */ +#line 91 "jsgf_parser.y" + { jsgf->version = (yyvsp[(2) - (3)].name); } + break; + + case 8: + +/* Line 1455 of yacc.c */ +#line 92 "jsgf_parser.y" + { jsgf->version = (yyvsp[(2) - (4)].name); jsgf->charset = (yyvsp[(3) - (4)].name); } + break; + + case 9: + +/* Line 1455 of yacc.c */ +#line 93 "jsgf_parser.y" + { jsgf->version = (yyvsp[(2) - (5)].name); jsgf->charset = (yyvsp[(3) - (5)].name); + jsgf->locale = (yyvsp[(4) - (5)].name); } + break; + + case 10: + +/* Line 1455 of yacc.c */ +#line 97 "jsgf_parser.y" + { (yyval.name) = (yyvsp[(2) - (3)].name); } + break; + + case 13: + +/* Line 1455 of yacc.c */ +#line 104 "jsgf_parser.y" + { jsgf_import_rule(jsgf, (yyvsp[(2) - (3)].name)); ckd_free((yyvsp[(2) - (3)].name)); } + break; + + case 16: + +/* Line 1455 of yacc.c */ +#line 111 "jsgf_parser.y" + { jsgf_define_rule(jsgf, (yyvsp[(1) - (4)].name), (yyvsp[(3) - (4)].rhs), 0); ckd_free((yyvsp[(1) - (4)].name)); } + break; + + case 17: + +/* Line 1455 of yacc.c */ +#line 112 "jsgf_parser.y" + { jsgf_define_rule(jsgf, (yyvsp[(2) - (5)].name), (yyvsp[(4) - (5)].rhs), 1); ckd_free((yyvsp[(2) - (5)].name)); } + break; + + case 18: + +/* Line 1455 of yacc.c */ +#line 115 "jsgf_parser.y" + { (yyval.rhs) = (yyvsp[(1) - (1)].rhs); (yyval.rhs)->atoms = glist_reverse((yyval.rhs)->atoms); } + break; + + case 19: + +/* Line 1455 of yacc.c */ +#line 116 "jsgf_parser.y" + { (yyval.rhs) = (yyvsp[(3) - (3)].rhs); + (yyval.rhs)->atoms = glist_reverse((yyval.rhs)->atoms); + (yyval.rhs)->alt = (yyvsp[(1) - (3)].rhs); } + break; + + case 20: + +/* Line 1455 of yacc.c */ +#line 121 "jsgf_parser.y" + { (yyval.rhs) = ckd_calloc(1, sizeof(*(yyval.rhs))); + (yyval.rhs)->atoms = glist_add_ptr((yyval.rhs)->atoms, (yyvsp[(1) - (1)].atom)); } + break; + + case 
21: + +/* Line 1455 of yacc.c */ +#line 123 "jsgf_parser.y" + { (yyval.rhs) = (yyvsp[(1) - (2)].rhs); + (yyval.rhs)->atoms = glist_add_ptr((yyval.rhs)->atoms, (yyvsp[(2) - (2)].atom)); } + break; + + case 23: + +/* Line 1455 of yacc.c */ +#line 128 "jsgf_parser.y" + { (yyval.atom) = (yyvsp[(1) - (2)].atom); + (yyval.atom)->tags = glist_add_ptr((yyval.atom)->tags, (yyvsp[(2) - (2)].name)); } + break; + + case 25: + +/* Line 1455 of yacc.c */ +#line 133 "jsgf_parser.y" + { (yyval.atom) = (yyvsp[(2) - (2)].atom); (yyval.atom)->weight = (yyvsp[(1) - (2)].weight); } + break; + + case 26: + +/* Line 1455 of yacc.c */ +#line 136 "jsgf_parser.y" + { (yyval.rule) = jsgf_define_rule(jsgf, NULL, (yyvsp[(2) - (3)].rhs), 0); } + break; + + case 27: + +/* Line 1455 of yacc.c */ +#line 139 "jsgf_parser.y" + { (yyval.rule) = jsgf_optional_new(jsgf, (yyvsp[(2) - (3)].rhs)); } + break; + + case 28: + +/* Line 1455 of yacc.c */ +#line 142 "jsgf_parser.y" + { (yyval.atom) = jsgf_atom_new((yyvsp[(1) - (1)].name), 1.0); ckd_free((yyvsp[(1) - (1)].name)); } + break; + + case 29: + +/* Line 1455 of yacc.c */ +#line 143 "jsgf_parser.y" + { (yyval.atom) = jsgf_atom_new((yyvsp[(1) - (1)].name), 1.0); ckd_free((yyvsp[(1) - (1)].name)); } + break; + + case 30: + +/* Line 1455 of yacc.c */ +#line 144 "jsgf_parser.y" + { (yyval.atom) = jsgf_atom_new((yyvsp[(1) - (1)].rule)->name, 1.0); } + break; + + case 31: + +/* Line 1455 of yacc.c */ +#line 145 "jsgf_parser.y" + { (yyval.atom) = jsgf_atom_new((yyvsp[(1) - (1)].rule)->name, 1.0); } + break; + + case 32: + +/* Line 1455 of yacc.c */ +#line 146 "jsgf_parser.y" + { (yyval.atom) = jsgf_kleene_new(jsgf, (yyvsp[(1) - (2)].atom), 0); } + break; + + case 33: + +/* Line 1455 of yacc.c */ +#line 147 "jsgf_parser.y" + { (yyval.atom) = jsgf_kleene_new(jsgf, (yyvsp[(1) - (2)].atom), 1); } + break; + + + +/* Line 1455 of yacc.c */ +#line 1580 "jsgf_parser.c" + default: break; + } + YY_SYMBOL_PRINT ("-> $$ =", yyr1[yyn], &yyval, &yyloc); + + YYPOPSTACK 
(yylen); + yylen = 0; + YY_STACK_PRINT (yyss, yyssp); + + *++yyvsp = yyval; + + /* Now `shift' the result of the reduction. Determine what state + that goes to, based on the state we popped back to and the rule + number reduced by. */ + + yyn = yyr1[yyn]; + + yystate = yypgoto[yyn - YYNTOKENS] + *yyssp; + if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp) + yystate = yytable[yystate]; + else + yystate = yydefgoto[yyn - YYNTOKENS]; + + goto yynewstate; + + +/*------------------------------------. +| yyerrlab -- here on detecting error | +`------------------------------------*/ +yyerrlab: + /* If not already recovering from an error, report this error. */ + if (!yyerrstatus) + { + ++yynerrs; +#if ! YYERROR_VERBOSE + yyerror (yyscanner, jsgf, YY_("syntax error")); +#else + { + YYSIZE_T yysize = yysyntax_error (0, yystate, yychar); + if (yymsg_alloc < yysize && yymsg_alloc < YYSTACK_ALLOC_MAXIMUM) + { + YYSIZE_T yyalloc = 2 * yysize; + if (! (yysize <= yyalloc && yyalloc <= YYSTACK_ALLOC_MAXIMUM)) + yyalloc = YYSTACK_ALLOC_MAXIMUM; + if (yymsg != yymsgbuf) + YYSTACK_FREE (yymsg); + yymsg = (char *) YYSTACK_ALLOC (yyalloc); + if (yymsg) + yymsg_alloc = yyalloc; + else + { + yymsg = yymsgbuf; + yymsg_alloc = sizeof yymsgbuf; + } + } + + if (0 < yysize && yysize <= yymsg_alloc) + { + (void) yysyntax_error (yymsg, yystate, yychar); + yyerror (yyscanner, jsgf, yymsg); + } + else + { + yyerror (yyscanner, jsgf, YY_("syntax error")); + if (yysize != 0) + goto yyexhaustedlab; + } + } +#endif + } + + + + if (yyerrstatus == 3) + { + /* If just tried and failed to reuse lookahead token after an + error, discard it. */ + + if (yychar <= YYEOF) + { + /* Return failure if at end of input. */ + if (yychar == YYEOF) + YYABORT; + } + else + { + yydestruct ("Error: discarding", + yytoken, &yylval, yyscanner, jsgf); + yychar = YYEMPTY; + } + } + + /* Else will try to reuse lookahead token after shifting the error + token. 
*/ + goto yyerrlab1; + + +/*---------------------------------------------------. +| yyerrorlab -- error raised explicitly by YYERROR. | +`---------------------------------------------------*/ +yyerrorlab: + + /* Pacify compilers like GCC when the user code never invokes + YYERROR and the label yyerrorlab therefore never appears in user + code. */ + if (/*CONSTCOND*/ 0) + goto yyerrorlab; + + /* Do not reclaim the symbols of the rule whose action triggered + this YYERROR. */ + YYPOPSTACK (yylen); + yylen = 0; + YY_STACK_PRINT (yyss, yyssp); + yystate = *yyssp; + goto yyerrlab1; + + +/*-------------------------------------------------------------. +| yyerrlab1 -- common code for both syntax error and YYERROR. | +`-------------------------------------------------------------*/ +yyerrlab1: + yyerrstatus = 3; /* Each real token shifted decrements this. */ + + for (;;) + { + yyn = yypact[yystate]; + if (yyn != YYPACT_NINF) + { + yyn += YYTERROR; + if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR) + { + yyn = yytable[yyn]; + if (0 < yyn) + break; + } + } + + /* Pop the current state because it cannot handle the error token. */ + if (yyssp == yyss) + YYABORT; + + + yydestruct ("Error: popping", + yystos[yystate], yyvsp, yyscanner, jsgf); + YYPOPSTACK (1); + yystate = *yyssp; + YY_STACK_PRINT (yyss, yyssp); + } + + *++yyvsp = yylval; + + + /* Shift the error token. */ + YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp); + + yystate = yyn; + goto yynewstate; + + +/*-------------------------------------. +| yyacceptlab -- YYACCEPT comes here. | +`-------------------------------------*/ +yyacceptlab: + yyresult = 0; + goto yyreturn; + +/*-----------------------------------. +| yyabortlab -- YYABORT comes here. | +`-----------------------------------*/ +yyabortlab: + yyresult = 1; + goto yyreturn; + +#if !defined(yyoverflow) || YYERROR_VERBOSE +/*-------------------------------------------------. +| yyexhaustedlab -- memory exhaustion comes here. 
| +`-------------------------------------------------*/ +yyexhaustedlab: + yyerror (yyscanner, jsgf, YY_("memory exhausted")); + yyresult = 2; + /* Fall through. */ +#endif + +yyreturn: + if (yychar != YYEMPTY) + yydestruct ("Cleanup: discarding lookahead", + yytoken, &yylval, yyscanner, jsgf); + /* Do not reclaim the symbols of the rule which action triggered + this YYABORT or YYACCEPT. */ + YYPOPSTACK (yylen); + YY_STACK_PRINT (yyss, yyssp); + while (yyssp != yyss) + { + yydestruct ("Cleanup: popping", + yystos[*yyssp], yyvsp, yyscanner, jsgf); + YYPOPSTACK (1); + } +#ifndef yyoverflow + if (yyss != yyssa) + YYSTACK_FREE (yyss); +#endif +#if YYERROR_VERBOSE + if (yymsg != yymsgbuf) + YYSTACK_FREE (yymsg); +#endif + /* Make sure YYID is used. */ + return YYID (yyresult); +} + + + +/* Line 1675 of yacc.c */ +#line 150 "jsgf_parser.y" + + +void +yyerror(yyscan_t lex, jsgf_t *jsgf, const char *s) +{ + E_ERROR("%s at line %d current token '%s'\n", s, yyget_lineno(lex), yyget_text(lex)); +} + diff --git a/media/sphinxbase/src/libsphinxbase/lm/jsgf_parser.h b/media/sphinxbase/src/libsphinxbase/lm/jsgf_parser.h new file mode 100644 index 000000000..95f68e329 --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/lm/jsgf_parser.h @@ -0,0 +1,90 @@ + +/* A Bison parser, made by GNU Bison 2.4.1. */ + +/* Skeleton interface for Bison's Yacc-like parsers in C + + Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004, 2005, 2006 + Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* As a special exception, you may create a larger work that contains + part or all of the Bison parser skeleton and distribute that work + under terms of your choice, so long as that work isn't itself a + parser generator using the skeleton or a modified version thereof + as a parser skeleton. Alternatively, if you modify or redistribute + the parser skeleton itself, you may (at your option) remove this + special exception, which will cause the skeleton and the resulting + Bison output files to be licensed under the GNU General Public + License without this special exception. + + This special exception was added by the Free Software Foundation in + version 2.2 of Bison. */ + + +/* Tokens. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + /* Put the tokens into the symbol table, so that GDB and other debuggers + know about them. */ + enum yytokentype { + HEADER = 258, + GRAMMAR = 259, + IMPORT = 260, + PUBLIC = 261, + TOKEN = 262, + RULENAME = 263, + TAG = 264, + WEIGHT = 265 + }; +#endif +/* Tokens. */ +#define HEADER 258 +#define GRAMMAR 259 +#define IMPORT 260 +#define PUBLIC 261 +#define TOKEN 262 +#define RULENAME 263 +#define TAG 264 +#define WEIGHT 265 + + + + +#if ! defined YYSTYPE && ! 
defined YYSTYPE_IS_DECLARED +typedef union YYSTYPE +{ + +/* Line 1676 of yacc.c */ +#line 65 "jsgf_parser.y" + + char *name; + float weight; + jsgf_rule_t *rule; + jsgf_rhs_t *rhs; + jsgf_atom_t *atom; + + + +/* Line 1676 of yacc.c */ +#line 82 "jsgf_parser.h" +} YYSTYPE; +# define YYSTYPE_IS_TRIVIAL 1 +# define yystype YYSTYPE /* obsolescent; will be withdrawn */ +# define YYSTYPE_IS_DECLARED 1 +#endif + + + + diff --git a/media/sphinxbase/src/libsphinxbase/lm/jsgf_scanner.c b/media/sphinxbase/src/libsphinxbase/lm/jsgf_scanner.c new file mode 100644 index 000000000..5d41d2a6b --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/lm/jsgf_scanner.c @@ -0,0 +1,2199 @@ +#line 2 "jsgf_scanner.c" + +#line 4 "jsgf_scanner.c" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 5 +#define YY_FLEX_SUBMINOR_VERSION 37 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <stdlib.h> + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. 
+ */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include <inttypes.h> +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; + +/* Limits of integral types. */ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#endif /* ! C99 */ + +#endif /* ! FLEXINT_H */ + +#ifdef __cplusplus + +/* The "const" storage-class-modifier is valid. */ +#define YY_USE_CONST + +#else /* ! __cplusplus */ + +/* C99 requires __STDC__ to be defined as 1. */ +#if defined (__STDC__) + +#define YY_USE_CONST + +#endif /* defined (__STDC__) */ +#endif /* ! __cplusplus */ + +#ifdef YY_USE_CONST +#define yyconst const +#else +#define yyconst +#endif + +/* Returned upon end-of-file. */ +#define YY_NULL 0 + +/* Promotes a possibly negative, possibly signed char to an unsigned + * integer for use as an array index. If the signed char is negative, + * we want to instead treat it as an 8-bit unsigned char, hence the + * double cast. + */ +#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c) + +/* An opaque pointer. 
*/ +#ifndef YY_TYPEDEF_YY_SCANNER_T +#define YY_TYPEDEF_YY_SCANNER_T +typedef void* yyscan_t; +#endif + +/* For convenience, these vars (plus the bison vars far below) + are macros in the reentrant scanner. */ +#define yyin yyg->yyin_r +#define yyout yyg->yyout_r +#define yyextra yyg->yyextra_r +#define yyleng yyg->yyleng_r +#define yytext yyg->yytext_r +#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno) +#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column) +#define yy_flex_debug yyg->yy_flex_debug_r + +/* Enter a start condition. This macro really ought to take a parameter, + * but we do it the disgusting crufty way forced on us by the ()-less + * definition of BEGIN. + */ +#define BEGIN yyg->yy_start = 1 + 2 * + +/* Translate the current start state into a value that can be later handed + * to BEGIN to return to the state. The YYSTATE alias is for lex + * compatibility. + */ +#define YY_START ((yyg->yy_start - 1) / 2) +#define YYSTATE YY_START + +/* Action number for EOF rule of a given start state. */ +#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1) + +/* Special action meaning "start processing a new file". */ +#define YY_NEW_FILE yyrestart(yyin ,yyscanner ) + +#define YY_END_OF_BUFFER_CHAR 0 + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#define YY_BUF_SIZE 16384 +#endif + +/* The state buf must be large enough to hold one state per character in the main buffer. + */ +#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type)) + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +#define EOB_ACT_CONTINUE_SCAN 0 +#define EOB_ACT_END_OF_FILE 1 +#define EOB_ACT_LAST_MATCH 2 + + /* Note: We specifically omit the test for yy_rule_can_match_eol because it requires + * access to the local variable yy_act. 
Since yyless() is a macro, it would break + * existing scanners that call yyless() from OUTSIDE yylex. + * One obvious solution it to make yy_act a global. I tried that, and saw + * a 5% performance hit in a non-yylineno scanner, because yy_act is + * normally declared as a register variable-- so it is not worth it. + */ + #define YY_LESS_LINENO(n) \ + do { \ + int yyl;\ + for ( yyl = n; yyl < yyleng; ++yyl )\ + if ( yytext[yyl] == '\n' )\ + --yylineno;\ + }while(0) + +/* Return all but the first "n" matched characters back to the input stream. */ +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + *yy_cp = yyg->yy_hold_char; \ + YY_RESTORE_YY_MORE_OFFSET \ + yyg->yy_c_buf_p = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \ + YY_DO_BEFORE_ACTION; /* set up yytext again */ \ + } \ + while ( 0 ) + +#define unput(c) yyunput( c, yyg->yytext_ptr , yyscanner ) + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + yy_size_t yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + yy_size_t yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. 
+ */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + +#define YY_BUFFER_NEW 0 +#define YY_BUFFER_NORMAL 1 + /* When an EOF's been seen but there's still some text to process + * then we mark the buffer as YY_EOF_PENDING, to indicate that we + * shouldn't try reading from the input source any more. We might + * still have a bunch of tokens to match, though, because of + * possible backing-up. + * + * When we actually see the EOF, we change the status to "new" + * (via yyrestart()), so that the user can continue scanning by + * just pointing yyin at a new input file. + */ +#define YY_BUFFER_EOF_PENDING 2 + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +/* We provide macros for accessing buffer states in case in the + * future we want to put the buffer states in a more general + * "scanner state". + * + * Returns the top of the stack, or NULL. + */ +#define YY_CURRENT_BUFFER ( yyg->yy_buffer_stack \ + ? yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] \ + : NULL) + +/* Same as previous macro, but useful when we know that the buffer stack is not + * NULL or when we need an lvalue. For internal use only. 
+ */ +#define YY_CURRENT_BUFFER_LVALUE yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] + +void yyrestart (FILE *input_file ,yyscan_t yyscanner ); +void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner ); +YY_BUFFER_STATE yy_create_buffer (FILE *file,int size ,yyscan_t yyscanner ); +void yy_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner ); +void yy_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner ); +void yypush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner ); +void yypop_buffer_state (yyscan_t yyscanner ); + +static void yyensure_buffer_stack (yyscan_t yyscanner ); +static void yy_load_buffer_state (yyscan_t yyscanner ); +static void yy_init_buffer (YY_BUFFER_STATE b,FILE *file ,yyscan_t yyscanner ); + +#define YY_FLUSH_BUFFER yy_flush_buffer(YY_CURRENT_BUFFER ,yyscanner) + +YY_BUFFER_STATE yy_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_string (yyconst char *yy_str ,yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_bytes (yyconst char *bytes,yy_size_t len ,yyscan_t yyscanner ); + +void *yyalloc (yy_size_t ,yyscan_t yyscanner ); +void *yyrealloc (void *,yy_size_t ,yyscan_t yyscanner ); +void yyfree (void * ,yyscan_t yyscanner ); + +#define yy_new_buffer yy_create_buffer + +#define yy_set_interactive(is_interactive) \ + { \ + if ( ! YY_CURRENT_BUFFER ){ \ + yyensure_buffer_stack (yyscanner); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \ + } + +#define yy_set_bol(at_bol) \ + { \ + if ( ! 
YY_CURRENT_BUFFER ){\ + yyensure_buffer_stack (yyscanner); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \ + } + +#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol) + +/* Begin user sect3 */ + +#define yywrap(yyscanner) 1 +#define YY_SKIP_YYWRAP + +typedef unsigned char YY_CHAR; + +typedef int yy_state_type; + +#define yytext_ptr yytext_r + +static yy_state_type yy_get_previous_state (yyscan_t yyscanner ); +static yy_state_type yy_try_NUL_trans (yy_state_type current_state ,yyscan_t yyscanner); +static int yy_get_next_buffer (yyscan_t yyscanner ); +static void yy_fatal_error (yyconst char msg[] ,yyscan_t yyscanner ); + +/* Done after the current pattern has been matched and before the + * corresponding action - sets up yytext. + */ +#define YY_DO_BEFORE_ACTION \ + yyg->yytext_ptr = yy_bp; \ + yyleng = (size_t) (yy_cp - yy_bp); \ + yyg->yy_hold_char = *yy_cp; \ + *yy_cp = '\0'; \ + yyg->yy_c_buf_p = yy_cp; + +#define YY_NUM_RULES 22 +#define YY_END_OF_BUFFER 23 +/* This struct is not used in this scanner, + but its presence is necessary. 
*/ +struct yy_trans_info + { + flex_int32_t yy_verify; + flex_int32_t yy_nxt; + }; +static yyconst flex_int16_t yy_accept[98] = + { 0, + 0, 0, 0, 0, 0, 0, 0, 0, 23, 22, + 1, 22, 22, 22, 22, 22, 22, 22, 5, 1, + 5, 17, 1, 17, 21, 21, 18, 21, 21, 9, + 1, 9, 0, 3, 0, 0, 0, 0, 0, 0, + 4, 17, 17, 0, 17, 17, 7, 0, 20, 0, + 0, 0, 0, 0, 16, 8, 0, 0, 2, 14, + 0, 0, 0, 0, 19, 0, 17, 0, 17, 17, + 0, 0, 6, 20, 0, 15, 0, 0, 16, 0, + 0, 0, 0, 0, 19, 0, 0, 0, 10, 0, + 0, 0, 0, 12, 13, 11, 0 + + } ; + +static yyconst flex_int32_t yy_ec[256] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 4, 5, 1, 1, 1, 1, 6, + 6, 7, 6, 1, 8, 9, 10, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 1, 12, 13, + 6, 14, 1, 1, 1, 1, 1, 1, 1, 15, + 16, 1, 1, 17, 1, 1, 1, 1, 1, 1, + 1, 1, 18, 1, 1, 1, 1, 1, 1, 1, + 6, 19, 6, 1, 1, 1, 20, 21, 22, 1, + + 23, 1, 24, 1, 25, 1, 1, 26, 27, 1, + 28, 29, 1, 30, 1, 31, 32, 1, 1, 1, + 1, 1, 33, 6, 34, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 35, 1, 1, 1, + 36, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 37, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1 + } ; + +static yyconst flex_int32_t yy_meta[38] = + { 0, + 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, + 1, 2, 3, 3, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 2, 2, 1, 1, 1 + } ; + +static yyconst flex_int16_t yy_base[113] = + { 0, + 0, 36, 4, 12, 72, 105, 14, 20, 135, 312, + 312, 117, 2, 0, 103, 105, 99, 95, 312, 312, + 119, 0, 312, 138, 312, 21, 312, 0, 1, 312, + 312, 118, 109, 312, 123, 111, 104, 94, 101, 85, + 312, 0, 171, 14, 0, 204, 312, 109, 113, 41, + 106, 96, 21, 23, 312, 312, 88, 98, 312, 312, + 73, 71, 70, 89, 312, 44, 0, 39, 0, 237, + 43, 90, 312, 312, 57, 312, 37, 69, 43, 77, + 
64, 57, 58, 64, 76, 94, 79, 59, 312, 39, + 14, 14, 4, 312, 312, 312, 312, 271, 274, 277, + + 280, 283, 0, 285, 288, 290, 293, 296, 299, 302, + 305, 308 + } ; + +static yyconst flex_int16_t yy_def[113] = + { 0, + 98, 98, 99, 99, 100, 100, 101, 101, 97, 97, + 97, 97, 97, 102, 97, 97, 97, 97, 97, 97, + 97, 103, 97, 104, 97, 97, 97, 105, 106, 97, + 97, 97, 97, 97, 107, 102, 97, 97, 97, 97, + 97, 103, 104, 108, 103, 109, 97, 97, 110, 97, + 97, 105, 106, 111, 97, 97, 97, 107, 97, 97, + 97, 97, 97, 97, 97, 112, 43, 108, 43, 109, + 97, 110, 97, 97, 97, 97, 106, 111, 106, 97, + 97, 97, 97, 97, 108, 112, 97, 97, 97, 97, + 97, 97, 97, 97, 97, 97, 0, 97, 97, 97, + + 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, + 97, 97 + } ; + +static yyconst flex_int16_t yy_nxt[350] = + { 0, + 42, 11, 11, 97, 12, 20, 11, 97, 34, 13, + 21, 35, 14, 20, 11, 31, 11, 65, 21, 54, + 32, 31, 11, 15, 16, 53, 32, 47, 17, 48, + 49, 50, 66, 96, 55, 95, 18, 11, 11, 54, + 12, 78, 65, 51, 94, 13, 44, 85, 14, 48, + 74, 50, 74, 87, 55, 54, 79, 66, 93, 15, + 16, 54, 86, 51, 17, 51, 74, 88, 74, 88, + 55, 53, 18, 23, 11, 24, 55, 25, 25, 65, + 33, 26, 92, 27, 28, 25, 91, 78, 74, 87, + 90, 89, 73, 84, 66, 83, 44, 85, 82, 81, + + 59, 51, 79, 80, 29, 25, 23, 11, 24, 76, + 25, 25, 86, 75, 26, 73, 27, 28, 25, 71, + 64, 63, 62, 61, 60, 59, 57, 56, 41, 40, + 39, 38, 37, 33, 97, 97, 97, 29, 25, 44, + 44, 45, 97, 44, 44, 97, 97, 44, 97, 44, + 44, 44, 97, 97, 97, 97, 46, 97, 97, 97, + 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, + 44, 44, 44, 44, 45, 97, 44, 44, 97, 97, + 44, 97, 44, 44, 44, 97, 97, 97, 97, 46, + 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, + + 97, 97, 97, 44, 44, 68, 44, 69, 97, 68, + 68, 97, 97, 68, 97, 68, 68, 68, 97, 97, + 97, 97, 70, 97, 97, 97, 97, 97, 97, 97, + 97, 97, 97, 97, 97, 97, 68, 68, 68, 44, + 69, 97, 68, 68, 97, 97, 68, 97, 68, 68, + 68, 97, 97, 97, 97, 70, 97, 97, 97, 97, + 97, 97, 97, 97, 97, 97, 97, 97, 97, 68, + 68, 10, 10, 10, 19, 19, 19, 22, 22, 22, + 30, 30, 30, 36, 36, 43, 43, 43, 52, 52, + 53, 
53, 53, 58, 58, 58, 44, 44, 44, 67, + + 67, 67, 72, 72, 72, 77, 77, 77, 68, 68, + 68, 9, 97, 97, 97, 97, 97, 97, 97, 97, + 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, + 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, + 97, 97, 97, 97, 97, 97, 97, 97, 97 + } ; + +static yyconst flex_int16_t yy_chk[350] = + { 0, + 103, 1, 1, 0, 1, 3, 3, 0, 13, 1, + 3, 13, 1, 4, 4, 7, 7, 44, 4, 29, + 7, 8, 8, 1, 1, 54, 8, 26, 1, 26, + 26, 26, 44, 93, 29, 92, 1, 2, 2, 53, + 2, 54, 68, 26, 91, 2, 66, 66, 2, 50, + 50, 50, 71, 71, 53, 77, 54, 68, 90, 2, + 2, 79, 66, 50, 2, 71, 75, 75, 88, 88, + 77, 78, 2, 5, 5, 5, 79, 5, 5, 85, + 84, 5, 83, 5, 5, 5, 82, 78, 87, 87, + 81, 80, 72, 64, 85, 63, 86, 86, 62, 61, + + 58, 87, 78, 57, 5, 5, 6, 6, 6, 52, + 6, 6, 86, 51, 6, 49, 6, 6, 6, 48, + 40, 39, 38, 37, 36, 35, 33, 32, 21, 18, + 17, 16, 15, 12, 9, 0, 0, 6, 6, 24, + 24, 24, 0, 24, 24, 0, 0, 24, 0, 24, + 24, 24, 0, 0, 0, 0, 24, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 24, 24, 43, 43, 43, 0, 43, 43, 0, 0, + 43, 0, 43, 43, 43, 0, 0, 0, 0, 43, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 43, 43, 46, 46, 46, 0, 46, + 46, 0, 0, 46, 0, 46, 46, 46, 0, 0, + 0, 0, 46, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 46, 46, 70, 70, + 70, 0, 70, 70, 0, 0, 70, 0, 70, 70, + 70, 0, 0, 0, 0, 70, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 70, + 70, 98, 98, 98, 99, 99, 99, 100, 100, 100, + 101, 101, 101, 102, 102, 104, 104, 104, 105, 105, + 106, 106, 106, 107, 107, 107, 108, 108, 108, 109, + + 109, 109, 110, 110, 110, 111, 111, 111, 112, 112, + 112, 97, 97, 97, 97, 97, 97, 97, 97, 97, + 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, + 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, + 97, 97, 97, 97, 97, 97, 97, 97, 97 + } ; + +/* Table of booleans, true if rule could match eol. */ +static yyconst flex_int32_t yy_rule_can_match_eol[23] = + { 0, +1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, + 0, 0, 0, }; + +/* The intent behind this definition is that it'll catch + * any uses of REJECT which flex missed. 
+ */ +#define REJECT reject_used_but_not_detected +#define yymore() yymore_used_but_not_detected +#define YY_MORE_ADJ 0 +#define YY_RESTORE_YY_MORE_OFFSET +#line 1 "_jsgf_scanner.l" +/* -*- mode: text -*- */ +/* ==================================================================== + * Copyright (c) 2007 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * ==================================================================== + * + */ +/* YOU MUST USE FLEX 2.5.37 OR NEWER TO PROCESS THIS FILE!!! */ +#line 39 "_jsgf_scanner.l" + +#include "jsgf_internal.h" +#include "jsgf_parser.h" + +#define YY_NO_UNISTD_H 1 + + + +#line 609 "jsgf_scanner.c" + +#define INITIAL 0 +#define COMMENT 1 +#define DECL 2 +#define DECLCOMMENT 3 + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. + */ +#include <unistd.h> +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +/* Holds the entire state of the reentrant scanner. */ +struct yyguts_t + { + + /* User-defined. Not touched by flex. */ + YY_EXTRA_TYPE yyextra_r; + + /* The rest are the same as the globals declared in the non-reentrant scanner. */ + FILE *yyin_r, *yyout_r; + size_t yy_buffer_stack_top; /**< index of top of stack. */ + size_t yy_buffer_stack_max; /**< capacity of stack. */ + YY_BUFFER_STATE * yy_buffer_stack; /**< Stack as an array. */ + char yy_hold_char; + yy_size_t yy_n_chars; + yy_size_t yyleng_r; + char *yy_c_buf_p; + int yy_init; + int yy_start; + int yy_did_buffer_switch_on_eof; + int yy_start_stack_ptr; + int yy_start_stack_depth; + int *yy_start_stack; + yy_state_type yy_last_accepting_state; + char* yy_last_accepting_cpos; + + int yylineno_r; + int yy_flex_debug_r; + + char *yytext_r; + int yy_more_flag; + int yy_more_len; + + YYSTYPE * yylval_r; + + }; /* end struct yyguts_t */ + +static int yy_init_globals (yyscan_t yyscanner ); + + /* This must go here because YYSTYPE and YYLTYPE are included + * from bison output in section 1.*/ + # define yylval yyg->yylval_r + +int yylex_init (yyscan_t* scanner); + +int yylex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner); + +/* Accessor methods to globals. 
+ These are made visible to non-reentrant scanners for convenience. */ + +int yylex_destroy (yyscan_t yyscanner ); + +int yyget_debug (yyscan_t yyscanner ); + +void yyset_debug (int debug_flag ,yyscan_t yyscanner ); + +YY_EXTRA_TYPE yyget_extra (yyscan_t yyscanner ); + +void yyset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner ); + +FILE *yyget_in (yyscan_t yyscanner ); + +void yyset_in (FILE * in_str ,yyscan_t yyscanner ); + +FILE *yyget_out (yyscan_t yyscanner ); + +void yyset_out (FILE * out_str ,yyscan_t yyscanner ); + +yy_size_t yyget_leng (yyscan_t yyscanner ); + +char *yyget_text (yyscan_t yyscanner ); + +int yyget_lineno (yyscan_t yyscanner ); + +void yyset_lineno (int line_number ,yyscan_t yyscanner ); + +int yyget_column (yyscan_t yyscanner ); + +void yyset_column (int column_no ,yyscan_t yyscanner ); + +YYSTYPE * yyget_lval (yyscan_t yyscanner ); + +void yyset_lval (YYSTYPE * yylval_param ,yyscan_t yyscanner ); + +/* Macros after this point can all be overridden by user definitions in + * section 1. + */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int yywrap (yyscan_t yyscanner ); +#else +extern int yywrap (yyscan_t yyscanner ); +#endif +#endif + +#ifndef yytext_ptr +static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner); +#endif + +#ifndef YY_NO_INPUT + +#ifdef __cplusplus +static int yyinput (yyscan_t yyscanner ); +#else +static int input (yyscan_t yyscanner ); +#endif + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#define YY_READ_BUF_SIZE 8192 +#endif + +/* Copy whatever the last rule matched to the standard output. */ +#ifndef ECHO +/* This used to be an fputs(), but since the string might contain NUL's, + * we now use fwrite(). + */ +#define ECHO do { if (fwrite( yytext, yyleng, 1, yyout )) {} } while (0) +#endif + +/* Gets input and stuffs it into "buf". 
number of characters read, or YY_NULL, + * is returned in "result". + */ +#ifndef YY_INPUT +#define YY_INPUT(buf,result,max_size) \ + if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \ + { \ + int c = '*'; \ + size_t n; \ + for ( n = 0; n < max_size && \ + (c = getc( yyin )) != EOF && c != '\n'; ++n ) \ + buf[n] = (char) c; \ + if ( c == '\n' ) \ + buf[n++] = (char) c; \ + if ( c == EOF && ferror( yyin ) ) \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + result = n; \ + } \ + else \ + { \ + errno=0; \ + while ( (result = fread(buf, 1, max_size, yyin))==0 && ferror(yyin)) \ + { \ + if( errno != EINTR) \ + { \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + break; \ + } \ + errno=0; \ + clearerr(yyin); \ + } \ + }\ +\ + +#endif + +/* No semi-colon after return; correct usage is to write "yyterminate();" - + * we don't want an extra ';' after the "return" because that will cause + * some compilers to complain about unreachable statements. + */ +#ifndef yyterminate +#define yyterminate() return YY_NULL +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Report a fatal error. */ +#ifndef YY_FATAL_ERROR +#define YY_FATAL_ERROR(msg) yy_fatal_error( msg , yyscanner) +#endif + +/* end tables serialization structures and prototypes */ + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. + */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int yylex \ + (YYSTYPE * yylval_param ,yyscan_t yyscanner); + +#define YY_DECL int yylex \ + (YYSTYPE * yylval_param , yyscan_t yyscanner) +#endif /* !YY_DECL */ + +/* Code executed at the beginning of each rule, after yytext and yyleng + * have been set up. + */ +#ifndef YY_USER_ACTION +#define YY_USER_ACTION +#endif + +/* Code executed at the end of each rule. 
*/ +#ifndef YY_BREAK +#define YY_BREAK break; +#endif + +#define YY_RULE_SETUP \ + YY_USER_ACTION + +/** The main scanner function which does all the work. + */ +YY_DECL +{ + register yy_state_type yy_current_state; + register char *yy_cp, *yy_bp; + register int yy_act; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + +#line 59 "_jsgf_scanner.l" + + +#line 850 "jsgf_scanner.c" + + yylval = yylval_param; + + if ( !yyg->yy_init ) + { + yyg->yy_init = 1; + +#ifdef YY_USER_INIT + YY_USER_INIT; +#endif + + if ( ! yyg->yy_start ) + yyg->yy_start = 1; /* first start state */ + + if ( ! yyin ) + yyin = stdin; + + if ( ! yyout ) + yyout = stdout; + + if ( ! YY_CURRENT_BUFFER ) { + yyensure_buffer_stack (yyscanner); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); + } + + yy_load_buffer_state(yyscanner ); + } + + while ( 1 ) /* loops until end-of-file is reached */ + { + yy_cp = yyg->yy_c_buf_p; + + /* Support of yytext. */ + *yy_cp = yyg->yy_hold_char; + + /* yy_bp points to the position in yy_ch_buf of the start of + * the current run. 
+ */ + yy_bp = yy_cp; + + yy_current_state = yyg->yy_start; +yy_match: + do + { + register YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)]; + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 98 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + ++yy_cp; + } + while ( yy_current_state != 97 ); + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + +yy_find_action: + yy_act = yy_accept[yy_current_state]; + + YY_DO_BEFORE_ACTION; + + if ( yy_act != YY_END_OF_BUFFER && yy_rule_can_match_eol[yy_act] ) + { + int yyl; + for ( yyl = 0; yyl < yyleng; ++yyl ) + if ( yytext[yyl] == '\n' ) + + do{ yylineno++; + yycolumn=0; + }while(0) +; + } + +do_action: /* This label is used only to access EOF actions. 
*/ + + switch ( yy_act ) + { /* beginning of action switch */ + case 0: /* must back up */ + /* undo the effects of YY_DO_BEFORE_ACTION */ + *yy_cp = yyg->yy_hold_char; + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + goto yy_find_action; + +case 1: +/* rule 1 can match eol */ +YY_RULE_SETUP +#line 61 "_jsgf_scanner.l" +; /* ignore whitespace */ + YY_BREAK +case 2: +/* rule 2 can match eol */ +YY_RULE_SETUP +#line 62 "_jsgf_scanner.l" +; /* single-line comments */ + YY_BREAK +case 3: +YY_RULE_SETUP +#line 63 "_jsgf_scanner.l" +{ BEGIN(COMMENT); } /* C-style comments */ + YY_BREAK +case 4: +YY_RULE_SETUP +#line 64 "_jsgf_scanner.l" +{ BEGIN(INITIAL); } + YY_BREAK +case 5: +YY_RULE_SETUP +#line 65 "_jsgf_scanner.l" +; /* Ignore stuff in comment mode */ + YY_BREAK +case 6: +/* rule 6 can match eol */ +YY_RULE_SETUP +#line 67 "_jsgf_scanner.l" +; /* single-line comments inside decl */ + YY_BREAK +case 7: +YY_RULE_SETUP +#line 68 "_jsgf_scanner.l" +{ BEGIN(DECLCOMMENT); } /* C-style comments inside decl */ + YY_BREAK +case 8: +YY_RULE_SETUP +#line 69 "_jsgf_scanner.l" +{ BEGIN(DECL); } + YY_BREAK +case 9: +YY_RULE_SETUP +#line 70 "_jsgf_scanner.l" +; /* Ignore stuff in comment mode */ + YY_BREAK +case 10: +YY_RULE_SETUP +#line 72 "_jsgf_scanner.l" +{BEGIN(DECL); return HEADER;} + YY_BREAK +case 11: +YY_RULE_SETUP +#line 73 "_jsgf_scanner.l" +{BEGIN(DECL); return GRAMMAR;} + YY_BREAK +case 12: +YY_RULE_SETUP +#line 74 "_jsgf_scanner.l" +{BEGIN(DECL); return IMPORT;} + YY_BREAK +case 13: +YY_RULE_SETUP +#line 75 "_jsgf_scanner.l" +{BEGIN(DECL); return PUBLIC;} + YY_BREAK +case 14: +/* rule 14 can match eol */ +YY_RULE_SETUP +#line 77 "_jsgf_scanner.l" +{ BEGIN(DECL); yylval->name = strdup(yytext); return RULENAME; } + YY_BREAK +case 15: +/* rule 15 can match eol */ +YY_RULE_SETUP +#line 78 "_jsgf_scanner.l" +{ yylval->name = strdup(yytext); return RULENAME; } + YY_BREAK +case 16: +/* rule 16 can match eol */ +YY_RULE_SETUP +#line 
80 "_jsgf_scanner.l" +{ yylval->name = strdup(yytext); return TAG; } + YY_BREAK +case 17: +YY_RULE_SETUP +#line 81 "_jsgf_scanner.l" +{ yylval->name = strdup(yytext); return TOKEN; } + YY_BREAK +case 18: +YY_RULE_SETUP +#line 82 "_jsgf_scanner.l" +{ BEGIN(INITIAL); return yytext[0]; } + YY_BREAK +case 19: +/* rule 19 can match eol */ +YY_RULE_SETUP +#line 83 "_jsgf_scanner.l" +{ yylval->name = strdup(yytext); return TOKEN; } + YY_BREAK +case 20: +YY_RULE_SETUP +#line 84 "_jsgf_scanner.l" +{ yylval->weight = atof_c(yytext+1); return WEIGHT; } + YY_BREAK +case 21: +YY_RULE_SETUP +#line 85 "_jsgf_scanner.l" +return yytext[0]; /* Single-character tokens */ + YY_BREAK +case 22: +YY_RULE_SETUP +#line 87 "_jsgf_scanner.l" +ECHO; + YY_BREAK +#line 1060 "jsgf_scanner.c" +case YY_STATE_EOF(INITIAL): +case YY_STATE_EOF(COMMENT): +case YY_STATE_EOF(DECL): +case YY_STATE_EOF(DECLCOMMENT): + yyterminate(); + + case YY_END_OF_BUFFER: + { + /* Amount of text matched not including the EOB char. */ + int yy_amount_of_matched_text = (int) (yy_cp - yyg->yytext_ptr) - 1; + + /* Undo the effects of YY_DO_BEFORE_ACTION. */ + *yy_cp = yyg->yy_hold_char; + YY_RESTORE_YY_MORE_OFFSET + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW ) + { + /* We're scanning a new file or input source. It's + * possible that this happened because the user + * just pointed yyin at a new source and called + * yylex(). If so, then we have to assure + * consistency between YY_CURRENT_BUFFER and our + * globals. Here is the right place to do so, because + * this is the first action (other than possibly a + * back-up) that will match for the new input source. 
+ */ + yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL; + } + + /* Note that here we test for yy_c_buf_p "<=" to the position + * of the first EOB in the buffer, since yy_c_buf_p will + * already have been incremented past the NUL character + * (since all states make transitions on EOB to the + * end-of-buffer state). Contrast this with the test + * in input(). + */ + if ( yyg->yy_c_buf_p <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] ) + { /* This was really a NUL. */ + yy_state_type yy_next_state; + + yyg->yy_c_buf_p = yyg->yytext_ptr + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( yyscanner ); + + /* Okay, we're now positioned to make the NUL + * transition. We couldn't have + * yy_get_previous_state() go ahead and do it + * for us because it doesn't know how to deal + * with the possibility of jamming (and we don't + * want to build jamming into it because then it + * will run more slowly). + */ + + yy_next_state = yy_try_NUL_trans( yy_current_state , yyscanner); + + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + + if ( yy_next_state ) + { + /* Consume the NUL. */ + yy_cp = ++yyg->yy_c_buf_p; + yy_current_state = yy_next_state; + goto yy_match; + } + + else + { + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + goto yy_find_action; + } + } + + else switch ( yy_get_next_buffer( yyscanner ) ) + { + case EOB_ACT_END_OF_FILE: + { + yyg->yy_did_buffer_switch_on_eof = 0; + + if ( yywrap(yyscanner ) ) + { + /* Note: because we've taken care in + * yy_get_next_buffer() to have set up + * yytext, we can now set up + * yy_c_buf_p so that if some total + * hoser (like flex itself) wants to + * call the scanner after we return the + * YY_NULL, it'll still work - another + * YY_NULL will get returned. 
+ */ + yyg->yy_c_buf_p = yyg->yytext_ptr + YY_MORE_ADJ; + + yy_act = YY_STATE_EOF(YY_START); + goto do_action; + } + + else + { + if ( ! yyg->yy_did_buffer_switch_on_eof ) + YY_NEW_FILE; + } + break; + } + + case EOB_ACT_CONTINUE_SCAN: + yyg->yy_c_buf_p = + yyg->yytext_ptr + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( yyscanner ); + + yy_cp = yyg->yy_c_buf_p; + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + goto yy_match; + + case EOB_ACT_LAST_MATCH: + yyg->yy_c_buf_p = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars]; + + yy_current_state = yy_get_previous_state( yyscanner ); + + yy_cp = yyg->yy_c_buf_p; + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + goto yy_find_action; + } + break; + } + + default: + YY_FATAL_ERROR( + "fatal flex scanner internal error--no action found" ); + } /* end of action switch */ + } /* end of scanning one token */ +} /* end of yylex */ + +/* yy_get_next_buffer - try to read in a new buffer + * + * Returns a code representing an action: + * EOB_ACT_LAST_MATCH - + * EOB_ACT_CONTINUE_SCAN - continue scanning from current position + * EOB_ACT_END_OF_FILE - end of file + */ +static int yy_get_next_buffer (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf; + register char *source = yyg->yytext_ptr; + register int number_to_move, i; + int ret_val; + + if ( yyg->yy_c_buf_p > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] ) + YY_FATAL_ERROR( + "fatal flex scanner internal error--end of buffer missed" ); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 ) + { /* Don't try to fill the buffer, so this is an EOF. */ + if ( yyg->yy_c_buf_p - yyg->yytext_ptr - YY_MORE_ADJ == 1 ) + { + /* We matched a single character, the EOB, so + * treat this as a final EOF. + */ + return EOB_ACT_END_OF_FILE; + } + + else + { + /* We matched some text prior to the EOB, first + * process it. 
+ */ + return EOB_ACT_LAST_MATCH; + } + } + + /* Try to read more data. */ + + /* First move last chars to start of buffer. */ + number_to_move = (int) (yyg->yy_c_buf_p - yyg->yytext_ptr) - 1; + + for ( i = 0; i < number_to_move; ++i ) + *(dest++) = *(source++); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING ) + /* don't do the read, it's not guaranteed to return an EOF, + * just force an EOF + */ + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars = 0; + + else + { + int num_to_read = + YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1; + + while ( num_to_read <= 0 ) + { /* Not enough room in the buffer - grow it. */ + + /* just a shorter name for the current buffer */ + YY_BUFFER_STATE b = YY_CURRENT_BUFFER_LVALUE; + + int yy_c_buf_p_offset = + (int) (yyg->yy_c_buf_p - b->yy_ch_buf); + + if ( b->yy_is_our_buffer ) + { + yy_size_t new_size = b->yy_buf_size * 2; + + if ( new_size <= 0 ) + b->yy_buf_size += b->yy_buf_size / 8; + else + b->yy_buf_size *= 2; + + b->yy_ch_buf = (char *) + /* Include room in for 2 EOB chars. */ + yyrealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2 ,yyscanner ); + } + else + /* Can't grow it, we don't own it. */ + b->yy_ch_buf = 0; + + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( + "fatal error - scanner input buffer overflow" ); + + yyg->yy_c_buf_p = &b->yy_ch_buf[yy_c_buf_p_offset]; + + num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size - + number_to_move - 1; + + } + + if ( num_to_read > YY_READ_BUF_SIZE ) + num_to_read = YY_READ_BUF_SIZE; + + /* Read in more data. 
*/ + YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]), + yyg->yy_n_chars, num_to_read ); + + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + if ( yyg->yy_n_chars == 0 ) + { + if ( number_to_move == YY_MORE_ADJ ) + { + ret_val = EOB_ACT_END_OF_FILE; + yyrestart(yyin ,yyscanner); + } + + else + { + ret_val = EOB_ACT_LAST_MATCH; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = + YY_BUFFER_EOF_PENDING; + } + } + + else + ret_val = EOB_ACT_CONTINUE_SCAN; + + if ((yy_size_t) (yyg->yy_n_chars + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) { + /* Extend the array by 50%, plus the number we really need. */ + yy_size_t new_size = yyg->yy_n_chars + number_to_move + (yyg->yy_n_chars >> 1); + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) yyrealloc((void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size ,yyscanner ); + if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" ); + } + + yyg->yy_n_chars += number_to_move; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] = YY_END_OF_BUFFER_CHAR; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] = YY_END_OF_BUFFER_CHAR; + + yyg->yytext_ptr = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0]; + + return ret_val; +} + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + + static yy_state_type yy_get_previous_state (yyscan_t yyscanner) +{ + register yy_state_type yy_current_state; + register char *yy_cp; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + yy_current_state = yyg->yy_start; + + for ( yy_cp = yyg->yytext_ptr + YY_MORE_ADJ; yy_cp < yyg->yy_c_buf_p; ++yy_cp ) + { + register YY_CHAR yy_c = (*yy_cp ? 
yy_ec[YY_SC_TO_UI(*yy_cp)] : 1); + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 98 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + } + + return yy_current_state; +} + +/* yy_try_NUL_trans - try to make a transition on the NUL character + * + * synopsis + * next_state = yy_try_NUL_trans( current_state ); + */ + static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state , yyscan_t yyscanner) +{ + register int yy_is_jam; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* This var may be unused depending upon options. */ + register char *yy_cp = yyg->yy_c_buf_p; + + register YY_CHAR yy_c = 1; + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 98 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + yy_is_jam = (yy_current_state == 97); + + (void)yyg; + return yy_is_jam ? 0 : yy_current_state; +} + +#ifndef YY_NO_INPUT +#ifdef __cplusplus + static int yyinput (yyscan_t yyscanner) +#else + static int input (yyscan_t yyscanner) +#endif + +{ + int c; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + *yyg->yy_c_buf_p = yyg->yy_hold_char; + + if ( *yyg->yy_c_buf_p == YY_END_OF_BUFFER_CHAR ) + { + /* yy_c_buf_p now points to the character we want to return. + * If this occurs *before* the EOB characters, then it's a + * valid NUL; if not, then we've hit the end of the buffer. 
+ */ + if ( yyg->yy_c_buf_p < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] ) + /* This was really a NUL. */ + *yyg->yy_c_buf_p = '\0'; + + else + { /* need more input */ + yy_size_t offset = yyg->yy_c_buf_p - yyg->yytext_ptr; + ++yyg->yy_c_buf_p; + + switch ( yy_get_next_buffer( yyscanner ) ) + { + case EOB_ACT_LAST_MATCH: + /* This happens because yy_g_n_b() + * sees that we've accumulated a + * token and flags that we need to + * try matching the token before + * proceeding. But for input(), + * there's no matching to consider. + * So convert the EOB_ACT_LAST_MATCH + * to EOB_ACT_END_OF_FILE. + */ + + /* Reset buffer status. */ + yyrestart(yyin ,yyscanner); + + /*FALLTHROUGH*/ + + case EOB_ACT_END_OF_FILE: + { + if ( yywrap(yyscanner ) ) + return EOF; + + if ( ! yyg->yy_did_buffer_switch_on_eof ) + YY_NEW_FILE; +#ifdef __cplusplus + return yyinput(yyscanner); +#else + return input(yyscanner); +#endif + } + + case EOB_ACT_CONTINUE_SCAN: + yyg->yy_c_buf_p = yyg->yytext_ptr + offset; + break; + } + } + } + + c = *(unsigned char *) yyg->yy_c_buf_p; /* cast for 8-bit char's */ + *yyg->yy_c_buf_p = '\0'; /* preserve yytext */ + yyg->yy_hold_char = *++yyg->yy_c_buf_p; + + if ( c == '\n' ) + + do{ yylineno++; + yycolumn=0; + }while(0) +; + + return c; +} +#endif /* ifndef YY_NO_INPUT */ + +/** Immediately switch to a different input stream. + * @param input_file A readable stream. + * @param yyscanner The scanner object. + * @note This function does not reset the start condition to @c INITIAL . + */ + void yyrestart (FILE * input_file , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if ( ! YY_CURRENT_BUFFER ){ + yyensure_buffer_stack (yyscanner); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); + } + + yy_init_buffer(YY_CURRENT_BUFFER,input_file ,yyscanner); + yy_load_buffer_state(yyscanner ); +} + +/** Switch to a different input buffer. + * @param new_buffer The new input buffer. 
+ * @param yyscanner The scanner object. + */ + void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* TODO. We should be able to replace this entire function body + * with + * yypop_buffer_state(); + * yypush_buffer_state(new_buffer); + */ + yyensure_buffer_stack (yyscanner); + if ( YY_CURRENT_BUFFER == new_buffer ) + return; + + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *yyg->yy_c_buf_p = yyg->yy_hold_char; + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p; + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + YY_CURRENT_BUFFER_LVALUE = new_buffer; + yy_load_buffer_state(yyscanner ); + + /* We don't actually know whether we did this switch during + * EOF (yywrap()) processing, but the only time this flag + * is looked at is after yywrap() is called, so it's safe + * to go ahead and always set it. + */ + yyg->yy_did_buffer_switch_on_eof = 1; +} + +static void yy_load_buffer_state (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + yyg->yytext_ptr = yyg->yy_c_buf_p = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos; + yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file; + yyg->yy_hold_char = *yyg->yy_c_buf_p; +} + +/** Allocate and initialize an input buffer state. + * @param file A readable stream. + * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE. + * @param yyscanner The scanner object. + * @return the allocated buffer state. + */ + YY_BUFFER_STATE yy_create_buffer (FILE * file, int size , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + + b = (YY_BUFFER_STATE) yyalloc(sizeof( struct yy_buffer_state ) ,yyscanner ); + if ( ! 
b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_buf_size = size; + + /* yy_ch_buf has to be 2 characters longer than the size given because + * we need to put in 2 end-of-buffer characters. + */ + b->yy_ch_buf = (char *) yyalloc(b->yy_buf_size + 2 ,yyscanner ); + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_is_our_buffer = 1; + + yy_init_buffer(b,file ,yyscanner); + + return b; +} + +/** Destroy the buffer. + * @param b a buffer created with yy_create_buffer() + * @param yyscanner The scanner object. + */ + void yy_delete_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if ( ! b ) + return; + + if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */ + YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; + + if ( b->yy_is_our_buffer ) + yyfree((void *) b->yy_ch_buf ,yyscanner ); + + yyfree((void *) b ,yyscanner ); +} + +/* Initializes or reinitializes a buffer. + * This function is sometimes called more than once on the same buffer, + * such as during a yyrestart() or at EOF. + */ + static void yy_init_buffer (YY_BUFFER_STATE b, FILE * file , yyscan_t yyscanner) + +{ + int oerrno = errno; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + yy_flush_buffer(b ,yyscanner); + + b->yy_input_file = file; + b->yy_fill_buffer = 1; + + /* If b is the current buffer, then yy_init_buffer was _probably_ + * called from yyrestart() or through yy_get_next_buffer. + * In that case, we don't want to reset the lineno or column. + */ + if (b != YY_CURRENT_BUFFER){ + b->yy_bs_lineno = 1; + b->yy_bs_column = 0; + } + + b->yy_is_interactive = 0; + + errno = oerrno; +} + +/** Discard all buffered characters. On the next scan, YY_INPUT will be called. + * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER. + * @param yyscanner The scanner object. 
+ */ + void yy_flush_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if ( ! b ) + return; + + b->yy_n_chars = 0; + + /* We always need two end-of-buffer characters. The first causes + * a transition to the end-of-buffer state. The second causes + * a jam in that state. + */ + b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR; + b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR; + + b->yy_buf_pos = &b->yy_ch_buf[0]; + + b->yy_at_bol = 1; + b->yy_buffer_status = YY_BUFFER_NEW; + + if ( b == YY_CURRENT_BUFFER ) + yy_load_buffer_state(yyscanner ); +} + +/** Pushes the new state onto the stack. The new state becomes + * the current state. This function will allocate the stack + * if necessary. + * @param new_buffer The new state. + * @param yyscanner The scanner object. + */ +void yypush_buffer_state (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if (new_buffer == NULL) + return; + + yyensure_buffer_stack(yyscanner); + + /* This block is copied from yy_switch_to_buffer. */ + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *yyg->yy_c_buf_p = yyg->yy_hold_char; + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p; + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + /* Only push if top exists. Otherwise, replace top. */ + if (YY_CURRENT_BUFFER) + yyg->yy_buffer_stack_top++; + YY_CURRENT_BUFFER_LVALUE = new_buffer; + + /* copied from yy_switch_to_buffer. */ + yy_load_buffer_state(yyscanner ); + yyg->yy_did_buffer_switch_on_eof = 1; +} + +/** Removes and deletes the top of the stack, if present. + * The next element becomes the new top. + * @param yyscanner The scanner object. 
+ */ +void yypop_buffer_state (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if (!YY_CURRENT_BUFFER) + return; + + yy_delete_buffer(YY_CURRENT_BUFFER ,yyscanner); + YY_CURRENT_BUFFER_LVALUE = NULL; + if (yyg->yy_buffer_stack_top > 0) + --yyg->yy_buffer_stack_top; + + if (YY_CURRENT_BUFFER) { + yy_load_buffer_state(yyscanner ); + yyg->yy_did_buffer_switch_on_eof = 1; + } +} + +/* Allocates the stack if it does not exist. + * Guarantees space for at least one push. + */ +static void yyensure_buffer_stack (yyscan_t yyscanner) +{ + yy_size_t num_to_alloc; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if (!yyg->yy_buffer_stack) { + + /* First allocation is just for 2 elements, since we don't know if this + * scanner will even need a stack. We use 2 instead of 1 to avoid an + * immediate realloc on the next call. + */ + num_to_alloc = 1; + yyg->yy_buffer_stack = (struct yy_buffer_state**)yyalloc + (num_to_alloc * sizeof(struct yy_buffer_state*) + , yyscanner); + if ( ! yyg->yy_buffer_stack ) + YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" ); + + memset(yyg->yy_buffer_stack, 0, num_to_alloc * sizeof(struct yy_buffer_state*)); + + yyg->yy_buffer_stack_max = num_to_alloc; + yyg->yy_buffer_stack_top = 0; + return; + } + + if (yyg->yy_buffer_stack_top >= (yyg->yy_buffer_stack_max) - 1){ + + /* Increase the buffer to prepare for a possible push. */ + int grow_size = 8 /* arbitrary grow size */; + + num_to_alloc = yyg->yy_buffer_stack_max + grow_size; + yyg->yy_buffer_stack = (struct yy_buffer_state**)yyrealloc + (yyg->yy_buffer_stack, + num_to_alloc * sizeof(struct yy_buffer_state*) + , yyscanner); + if ( ! 
yyg->yy_buffer_stack ) + YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" ); + + /* zero only the new slots.*/ + memset(yyg->yy_buffer_stack + yyg->yy_buffer_stack_max, 0, grow_size * sizeof(struct yy_buffer_state*)); + yyg->yy_buffer_stack_max = num_to_alloc; + } +} + +/** Setup the input buffer state to scan directly from a user-specified character buffer. + * @param base the character buffer + * @param size the size in bytes of the character buffer + * @param yyscanner The scanner object. + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE yy_scan_buffer (char * base, yy_size_t size , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + + if ( size < 2 || + base[size-2] != YY_END_OF_BUFFER_CHAR || + base[size-1] != YY_END_OF_BUFFER_CHAR ) + /* They forgot to leave room for the EOB's. */ + return 0; + + b = (YY_BUFFER_STATE) yyalloc(sizeof( struct yy_buffer_state ) ,yyscanner ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_scan_buffer()" ); + + b->yy_buf_size = size - 2; /* "- 2" to take care of EOB's */ + b->yy_buf_pos = b->yy_ch_buf = base; + b->yy_is_our_buffer = 0; + b->yy_input_file = 0; + b->yy_n_chars = b->yy_buf_size; + b->yy_is_interactive = 0; + b->yy_at_bol = 1; + b->yy_fill_buffer = 0; + b->yy_buffer_status = YY_BUFFER_NEW; + + yy_switch_to_buffer(b ,yyscanner ); + + return b; +} + +/** Setup the input buffer state to scan a string. The next call to yylex() will + * scan from a @e copy of @a str. + * @param yystr a NUL-terminated string to scan + * @param yyscanner The scanner object. + * @return the newly allocated buffer state object. + * @note If you want to scan bytes that may contain NUL values, then use + * yy_scan_bytes() instead. + */ +YY_BUFFER_STATE yy_scan_string (yyconst char * yystr , yyscan_t yyscanner) +{ + + return yy_scan_bytes(yystr,strlen(yystr) ,yyscanner); +} + +/** Setup the input buffer state to scan the given bytes. 
The next call to yylex() will + * scan from a @e copy of @a bytes. + * @param yybytes the byte buffer to scan + * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes. + * @param yyscanner The scanner object. + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE yy_scan_bytes (yyconst char * yybytes, yy_size_t _yybytes_len , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + char *buf; + yy_size_t n; + int i; + + /* Get memory for full buffer, including space for trailing EOB's. */ + n = _yybytes_len + 2; + buf = (char *) yyalloc(n ,yyscanner ); + if ( ! buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_scan_bytes()" ); + + for ( i = 0; i < _yybytes_len; ++i ) + buf[i] = yybytes[i]; + + buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR; + + b = yy_scan_buffer(buf,n ,yyscanner); + if ( ! b ) + YY_FATAL_ERROR( "bad buffer in yy_scan_bytes()" ); + + /* It's okay to grow etc. this buffer, and we should throw it + * away when we're done. + */ + b->yy_is_our_buffer = 1; + + return b; +} + +#ifndef YY_EXIT_FAILURE +#define YY_EXIT_FAILURE 2 +#endif + +static void yy_fatal_error (yyconst char* msg , yyscan_t yyscanner) +{ + (void) fprintf( stderr, "%s\n", msg ); + exit( YY_EXIT_FAILURE ); +} + +/* Redefine yyless() so it works in section 3 code. */ + +#undef yyless +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + yytext[yyleng] = yyg->yy_hold_char; \ + yyg->yy_c_buf_p = yytext + yyless_macro_arg; \ + yyg->yy_hold_char = *yyg->yy_c_buf_p; \ + *yyg->yy_c_buf_p = '\0'; \ + yyleng = yyless_macro_arg; \ + } \ + while ( 0 ) + +/* Accessor methods (get/set functions) to struct members. */ + +/** Get the user-defined data for this scanner. + * @param yyscanner The scanner object. 
+ */ +YY_EXTRA_TYPE yyget_extra (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyextra; +} + +/** Get the current line number. + * @param yyscanner The scanner object. + */ +int yyget_lineno (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if (! YY_CURRENT_BUFFER) + return 0; + + return yylineno; +} + +/** Get the current column number. + * @param yyscanner The scanner object. + */ +int yyget_column (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if (! YY_CURRENT_BUFFER) + return 0; + + return yycolumn; +} + +/** Get the input stream. + * @param yyscanner The scanner object. + */ +FILE *yyget_in (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyin; +} + +/** Get the output stream. + * @param yyscanner The scanner object. + */ +FILE *yyget_out (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyout; +} + +/** Get the length of the current token. + * @param yyscanner The scanner object. + */ +yy_size_t yyget_leng (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyleng; +} + +/** Get the current token. + * @param yyscanner The scanner object. + */ + +char *yyget_text (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yytext; +} + +/** Set the user-defined data. This data is never touched by the scanner. + * @param user_defined The data to be associated with this scanner. + * @param yyscanner The scanner object. + */ +void yyset_extra (YY_EXTRA_TYPE user_defined , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyextra = user_defined ; +} + +/** Set the current line number. + * @param line_number + * @param yyscanner The scanner object. 
+ */ +void yyset_lineno (int line_number , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* lineno is only valid if an input buffer exists. */ + if (! YY_CURRENT_BUFFER ) + YY_FATAL_ERROR( "yyset_lineno called with no buffer" ); + + yylineno = line_number; +} + +/** Set the current column. + * @param line_number + * @param yyscanner The scanner object. + */ +void yyset_column (int column_no , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* column is only valid if an input buffer exists. */ + if (! YY_CURRENT_BUFFER ) + YY_FATAL_ERROR( "yyset_column called with no buffer" ); + + yycolumn = column_no; +} + +/** Set the input stream. This does not discard the current + * input buffer. + * @param in_str A readable stream. + * @param yyscanner The scanner object. + * @see yy_switch_to_buffer + */ +void yyset_in (FILE * in_str , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyin = in_str ; +} + +void yyset_out (FILE * out_str , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyout = out_str ; +} + +int yyget_debug (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yy_flex_debug; +} + +void yyset_debug (int bdebug , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yy_flex_debug = bdebug ; +} + +/* Accessor methods for yylval and yylloc */ + +YYSTYPE * yyget_lval (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yylval; +} + +void yyset_lval (YYSTYPE * yylval_param , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yylval = yylval_param; +} + +/* User-visible API */ + +/* yylex_init is special because it creates the scanner itself, so it is + * the ONLY reentrant function that doesn't take the scanner as the last argument. 
+ * That's why we explicitly handle the declaration, instead of using our macros. + */ + +int yylex_init(yyscan_t* ptr_yy_globals) + +{ + if (ptr_yy_globals == NULL){ + errno = EINVAL; + return 1; + } + + *ptr_yy_globals = (yyscan_t) yyalloc ( sizeof( struct yyguts_t ), NULL ); + + if (*ptr_yy_globals == NULL){ + errno = ENOMEM; + return 1; + } + + /* By setting to 0xAA, we expose bugs in yy_init_globals. Leave at 0x00 for releases. */ + memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t)); + + return yy_init_globals ( *ptr_yy_globals ); +} + +/* yylex_init_extra has the same functionality as yylex_init, but follows the + * convention of taking the scanner as the last argument. Note however, that + * this is a *pointer* to a scanner, as it will be allocated by this call (and + * is the reason, too, why this function also must handle its own declaration). + * The user defined value in the first argument will be available to yyalloc in + * the yyextra field. + */ + +int yylex_init_extra(YY_EXTRA_TYPE yy_user_defined,yyscan_t* ptr_yy_globals ) + +{ + struct yyguts_t dummy_yyguts; + + yyset_extra (yy_user_defined, &dummy_yyguts); + + if (ptr_yy_globals == NULL){ + errno = EINVAL; + return 1; + } + + *ptr_yy_globals = (yyscan_t) yyalloc ( sizeof( struct yyguts_t ), &dummy_yyguts ); + + if (*ptr_yy_globals == NULL){ + errno = ENOMEM; + return 1; + } + + /* By setting to 0xAA, we expose bugs in + yy_init_globals. Leave at 0x00 for releases. */ + memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t)); + + yyset_extra (yy_user_defined, *ptr_yy_globals); + + return yy_init_globals ( *ptr_yy_globals ); +} + +static int yy_init_globals (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + /* Initialization is the same as for the non-reentrant scanner. + * This function is called from yylex_destroy(), so don't allocate here. 
+ */ + + yyg->yy_buffer_stack = 0; + yyg->yy_buffer_stack_top = 0; + yyg->yy_buffer_stack_max = 0; + yyg->yy_c_buf_p = (char *) 0; + yyg->yy_init = 0; + yyg->yy_start = 0; + + yyg->yy_start_stack_ptr = 0; + yyg->yy_start_stack_depth = 0; + yyg->yy_start_stack = NULL; + +/* Defined in main.c */ +#ifdef YY_STDINIT + yyin = stdin; + yyout = stdout; +#else + yyin = (FILE *) 0; + yyout = (FILE *) 0; +#endif + + /* For future reference: Set errno on error, since we are called by + * yylex_init() + */ + return 0; +} + +/* yylex_destroy is for both reentrant and non-reentrant scanners. */ +int yylex_destroy (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* Pop the buffer stack, destroying each element. */ + while(YY_CURRENT_BUFFER){ + yy_delete_buffer(YY_CURRENT_BUFFER ,yyscanner ); + YY_CURRENT_BUFFER_LVALUE = NULL; + yypop_buffer_state(yyscanner); + } + + /* Destroy the stack itself. */ + yyfree(yyg->yy_buffer_stack ,yyscanner); + yyg->yy_buffer_stack = NULL; + + /* Destroy the start condition stack. */ + yyfree(yyg->yy_start_stack ,yyscanner ); + yyg->yy_start_stack = NULL; + + /* Reset the globals. This is important in a non-reentrant scanner so the next time + * yylex() is called, initialization will occur. */ + yy_init_globals( yyscanner); + + /* Destroy the main struct (reentrant only). */ + yyfree ( yyscanner , yyscanner ); + yyscanner = NULL; + return 0; +} + +/* + * Internal utility routines. 
+ */ + +#ifndef yytext_ptr +static void yy_flex_strncpy (char* s1, yyconst char * s2, int n , yyscan_t yyscanner) +{ + register int i; + for ( i = 0; i < n; ++i ) + s1[i] = s2[i]; +} +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * s , yyscan_t yyscanner) +{ + register int n; + for ( n = 0; s[n]; ++n ) + ; + + return n; +} +#endif + +void *yyalloc (yy_size_t size , yyscan_t yyscanner) +{ + return (void *) malloc( size ); +} + +void *yyrealloc (void * ptr, yy_size_t size , yyscan_t yyscanner) +{ + /* The cast to (char *) in the following accommodates both + * implementations that use char* generic pointers, and those + * that use void* generic pointers. It works with the latter + * because both ANSI C and C++ allow castless assignment from + * any pointer type to void*, and deal with argument conversions + * as though doing an assignment. + */ + return (void *) realloc( (char *) ptr, size ); +} + +void yyfree (void * ptr , yyscan_t yyscanner) +{ + free( (char *) ptr ); /* see yyrealloc() for (char *) cast */ +} + +#define YYTABLES_NAME "yytables" + +#line 87 "_jsgf_scanner.l" + + + diff --git a/media/sphinxbase/src/libsphinxbase/lm/jsgf_scanner.h b/media/sphinxbase/src/libsphinxbase/lm/jsgf_scanner.h new file mode 100644 index 000000000..72abefb88 --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/lm/jsgf_scanner.h @@ -0,0 +1,352 @@ +#ifndef yyHEADER_H +#define yyHEADER_H 1 +#define yyIN_HEADER 1 + +#line 6 "jsgf_scanner.h" + +#line 8 "jsgf_scanner.h" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 5 +#define YY_FLEX_SUBMINOR_VERSION 37 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <stdlib.h> + +/* end standard C headers. 
*/ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include <inttypes.h> +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; + +/* Limits of integral types. */ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#endif /* ! C99 */ + +#endif /* ! FLEXINT_H */ + +#ifdef __cplusplus + +/* The "const" storage-class-modifier is valid. */ +#define YY_USE_CONST + +#else /* ! __cplusplus */ + +/* C99 requires __STDC__ to be defined as 1. */ +#if defined (__STDC__) + +#define YY_USE_CONST + +#endif /* defined (__STDC__) */ +#endif /* ! __cplusplus */ + +#ifdef YY_USE_CONST +#define yyconst const +#else +#define yyconst +#endif + +/* An opaque pointer. 
*/ +#ifndef YY_TYPEDEF_YY_SCANNER_T +#define YY_TYPEDEF_YY_SCANNER_T +typedef void* yyscan_t; +#endif + +/* For convenience, these vars (plus the bison vars far below) + are macros in the reentrant scanner. */ +#define yyin yyg->yyin_r +#define yyout yyg->yyout_r +#define yyextra yyg->yyextra_r +#define yyleng yyg->yyleng_r +#define yytext yyg->yytext_r +#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno) +#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column) +#define yy_flex_debug yyg->yy_flex_debug_r + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#define YY_BUF_SIZE 16384 +#endif + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + yy_size_t yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + yy_size_t yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. 
*/ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +void yyrestart (FILE *input_file ,yyscan_t yyscanner ); +void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner ); +YY_BUFFER_STATE yy_create_buffer (FILE *file,int size ,yyscan_t yyscanner ); +void yy_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner ); +void yy_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner ); +void yypush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner ); +void yypop_buffer_state (yyscan_t yyscanner ); + +YY_BUFFER_STATE yy_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_string (yyconst char *yy_str ,yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_bytes (yyconst char *bytes,yy_size_t len ,yyscan_t yyscanner ); + +void *yyalloc (yy_size_t ,yyscan_t yyscanner ); +void *yyrealloc (void *,yy_size_t ,yyscan_t yyscanner ); +void yyfree (void * ,yyscan_t yyscanner ); + +/* Begin user sect3 */ + +#define yywrap(yyscanner) 1 +#define YY_SKIP_YYWRAP + +#define yytext_ptr yytext_r + +#ifdef YY_HEADER_EXPORT_START_CONDITIONS +#define INITIAL 0 +#define COMMENT 1 +#define DECL 2 +#define DECLCOMMENT 3 + +#endif + + +#ifdef HAVE_UNISTD_H +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. + */ +#include <unistd.h> +#endif +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +int yylex_init (yyscan_t* scanner); + +int yylex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner); + +/* Accessor methods to globals. + These are made visible to non-reentrant scanners for convenience. 
*/ + +int yylex_destroy (yyscan_t yyscanner ); + +int yyget_debug (yyscan_t yyscanner ); + +void yyset_debug (int debug_flag ,yyscan_t yyscanner ); + +YY_EXTRA_TYPE yyget_extra (yyscan_t yyscanner ); + +void yyset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner ); + +FILE *yyget_in (yyscan_t yyscanner ); + +void yyset_in (FILE * in_str ,yyscan_t yyscanner ); + +FILE *yyget_out (yyscan_t yyscanner ); + +void yyset_out (FILE * out_str ,yyscan_t yyscanner ); + +yy_size_t yyget_leng (yyscan_t yyscanner ); + +char *yyget_text (yyscan_t yyscanner ); + +int yyget_lineno (yyscan_t yyscanner ); + +void yyset_lineno (int line_number ,yyscan_t yyscanner ); + +int yyget_column (yyscan_t yyscanner ); + +void yyset_column (int column_no ,yyscan_t yyscanner ); + +YYSTYPE * yyget_lval (yyscan_t yyscanner ); + +void yyset_lval (YYSTYPE * yylval_param ,yyscan_t yyscanner ); + +/* Macros after this point can all be overridden by user definitions in + * section 1. + */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int yywrap (yyscan_t yyscanner ); +#else +extern int yywrap (yyscan_t yyscanner ); +#endif +#endif + +#ifndef yytext_ptr +static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner); +#endif + +#ifndef YY_NO_INPUT + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#define YY_READ_BUF_SIZE 8192 +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. 
+ */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int yylex \ + (YYSTYPE * yylval_param ,yyscan_t yyscanner); + +#define YY_DECL int yylex \ + (YYSTYPE * yylval_param , yyscan_t yyscanner) +#endif /* !YY_DECL */ + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + +#undef YY_NEW_FILE +#undef YY_FLUSH_BUFFER +#undef yy_set_bol +#undef yy_new_buffer +#undef yy_set_interactive +#undef YY_DO_BEFORE_ACTION + +#ifdef YY_DECL_IS_OURS +#undef YY_DECL_IS_OURS +#undef YY_DECL +#endif + +#line 87 "_jsgf_scanner.l" + + +#line 348 "jsgf_scanner.h" +#undef yyIN_HEADER +#endif /* yyHEADER_H */ diff --git a/media/sphinxbase/src/libsphinxbase/lm/lm3g_model.c b/media/sphinxbase/src/libsphinxbase/lm/lm3g_model.c new file mode 100644 index 000000000..e9943001e --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/lm/lm3g_model.c @@ -0,0 +1,258 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2007 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. 
+ * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ +/* + * \file lm3g_model.c Core Sphinx 3-gram code used in + * DMP/DMP32/ARPA (for now) model code. + * + * Author: A cast of thousands, probably. + */ +#include <string.h> +#include <assert.h> +#include <limits.h> + +#include "sphinxbase/listelem_alloc.h" +#include "sphinxbase/ckd_alloc.h" +#include "sphinxbase/err.h" + +#include "lm3g_model.h" + +void +lm3g_tginfo_free(ngram_model_t *base, lm3g_model_t *lm3g) +{ + if (lm3g->tginfo == NULL) + return; + listelem_alloc_free(lm3g->le); + ckd_free(lm3g->tginfo); +} + +void +lm3g_tginfo_reset(ngram_model_t *base, lm3g_model_t *lm3g) +{ + if (lm3g->tginfo == NULL) + return; + listelem_alloc_free(lm3g->le); + memset(lm3g->tginfo, 0, base->n_counts[0] * sizeof(tginfo_t *)); + lm3g->le = listelem_alloc_init(sizeof(tginfo_t)); +} + +void +lm3g_apply_weights(ngram_model_t *base, + lm3g_model_t *lm3g, + float32 lw, float32 wip, float32 uw) +{ + int32 log_wip, log_uw, log_uniform_weight; + int i; + + /* Precalculate some log values we will like. 
+ */ + log_wip = logmath_log(base->lmath, wip); + log_uw = logmath_log(base->lmath, uw); + /* The uniform distribution gets the complement of the unigram weight. */ log_uniform_weight = logmath_log(base->lmath, 1.0 - uw); + + for (i = 0; i < base->n_counts[0]; ++i) { + int32 prob1, bo_wt, n_used; + + /* Backoff weights just get scaled by the lw: divide out the currently applied base->lw here; the new lw is multiplied back in below. */ + bo_wt = (int32)(lm3g->unigrams[i].bo_wt1.l / base->lw); + /* Unscaling unigram probs is a bit more complicated, so punt + * it back to the general code. */ + prob1 = ngram_ng_prob(base, i, NULL, 0, &n_used); + /* Now compute the new scaled probabilities. */ + lm3g->unigrams[i].bo_wt1.l = (int32)(bo_wt * lw); + if (strcmp(base->word_str[i], "<s>") == 0) { /* FIXME: configurable start_sym */ + /* Apply language weight and WIP; the start symbol is not interpolated with the uniform distribution. */ + lm3g->unigrams[i].prob1.l = (int32)(prob1 * lw) + log_wip; + } + else { + /* Interpolate unigram probability with uniform. */ + prob1 += log_uw; + prob1 = logmath_add(base->lmath, prob1, base->log_uniform + log_uniform_weight); + /* Apply language weight and WIP */ + lm3g->unigrams[i].prob1.l = (int32)(prob1 * lw) + log_wip; + } + } + + /* Bigram probability table: strip the old WIP and language weight from each entry, then apply the new ones. */ for (i = 0; i < lm3g->n_prob2; ++i) { + int32 prob2; + /* Can't just punt this back to general code since it is quantized. */ + prob2 = (int32)((lm3g->prob2[i].l - base->log_wip) / base->lw); + lm3g->prob2[i].l = (int32)(prob2 * lw) + log_wip; + } + + /* Bigram backoff weights and trigram probabilities are only rescaled when the model order is above 2. */ if (base->n > 2) { + for (i = 0; i < lm3g->n_bo_wt2; ++i) { + lm3g->bo_wt2[i].l = (int32)(lm3g->bo_wt2[i].l / base->lw * lw); + } + for (i = 0; i < lm3g->n_prob3; i++) { + int32 prob3; + /* Can't just punt this back to general code since it is quantized. */ + prob3 = (int32)((lm3g->prob3[i].l - base->log_wip) / base->lw); + lm3g->prob3[i].l = (int32)(prob3 * lw) + log_wip; + } + } + + /* Store updated values in the model. 
+ */ + base->log_wip = log_wip; + base->log_uw = log_uw; + base->log_uniform_weight = log_uniform_weight; + base->lw = lw; +} + +/* Add a unigram entry for word id wid. The unigram and tginfo arrays are both resized to base->n_1g_alloc entries and the slots from base->n_counts[0] onward are zeroed. The new entry stores lweight interpolated with the uniform distribution as its score, and base->n_counts[0] is incremented. Returns the score that was stored. NOTE(review): nothing here checks that wid is below base->n_1g_alloc; presumably the caller grows n_1g_alloc first - confirm. */ int32 +lm3g_add_ug(ngram_model_t *base, + lm3g_model_t *lm3g, int32 wid, int32 lweight) +{ + int32 score; + + /* This would be very bad if this happened! */ + assert(!NGRAM_IS_CLASSWID(wid)); + + /* Reallocate unigram array. */ + lm3g->unigrams = ckd_realloc(lm3g->unigrams, + sizeof(*lm3g->unigrams) * base->n_1g_alloc); + memset(lm3g->unigrams + base->n_counts[0], 0, + (base->n_1g_alloc - base->n_counts[0]) * sizeof(*lm3g->unigrams)); + /* Reallocate tginfo array. */ + lm3g->tginfo = ckd_realloc(lm3g->tginfo, + sizeof(*lm3g->tginfo) * base->n_1g_alloc); + memset(lm3g->tginfo + base->n_counts[0], 0, + (base->n_1g_alloc - base->n_counts[0]) * sizeof(*lm3g->tginfo)); + /* FIXME: we really ought to update base->log_uniform *and* + * renormalize all the other unigrams. This is really slow, so I + * will probably just provide a function to renormalize after + * adding unigrams, for anyone who really cares. */ + /* This could be simplified but then we couldn't do it in logmath. Note that neither base->lw nor base->log_wip is applied to the score in this function. */ + score = lweight + base->log_uniform + base->log_uw; + score = logmath_add(base->lmath, score, + base->log_uniform + base->log_uniform_weight); + lm3g->unigrams[wid].prob1.l = score; + /* This unigram by definition doesn't participate in any bigrams, + * so its backoff weight and bigram pointer are both undefined. */ + lm3g->unigrams[wid].bo_wt1.l = 0; + lm3g->unigrams[wid].bigrams = 0; + /* Finally, increase the unigram count */ + ++base->n_counts[0]; + /* FIXME: Note that this can actually be quite bogus due to the + * presence of class words. 
If wid falls outside the unigram + * count, increase it to compensate, at the cost of no longer + * really knowing how many unigrams we have :( */ + if (wid >= base->n_counts[0]) + base->n_counts[0] = wid + 1; + + return score; +} + +/* Number of entries allocated initially, and added on each growth step. */ +#define INITIAL_SORTED_ENTRIES MAX_UINT16 + +/* + * Allocate the entry array and set up entry 0 as the tree root with + * value INT_MIN and no children. l->free (index of the next unused + * entry) starts at 1. + */ +void +init_sorted_list(sorted_list_t * l) +{ + l->list = ckd_calloc(INITIAL_SORTED_ENTRIES, sizeof(sorted_entry_t)); + l->list[0].val.l = INT_MIN; + l->list[0].lower = 0; + l->list[0].higher = 0; + l->free = 1; + l->size = INITIAL_SORTED_ENTRIES; +} + +/* + * Release the entry array. The array comes from ckd_calloc and + * ckd_realloc, so it is returned through the matching ckd_free + * rather than bare free(). The sorted_list_t itself is not freed. + */ +void +free_sorted_list(sorted_list_t * l) +{ + ckd_free(l->list); +} + +/* + * Return a newly allocated array holding the values of entries + * 0 .. l->free - 1 in index (insertion) order. The array comes from + * ckd_calloc and ownership passes to the caller. + */ +lmprob_t * +vals_in_sorted_list(sorted_list_t * l) +{ + lmprob_t *vals; + int32 i; + + vals = ckd_calloc(l->free, sizeof(lmprob_t)); + for (i = 0; i < l->free; i++) + vals[i] = l->list[i].val; + return (vals); +} + +/* + * Return the index of the entry whose value equals *val, inserting a + * new entry when none exists. The tree is walked from the root + * (index 0); a child index of 0 means "no child". When the array is + * full it grows by INITIAL_SORTED_ENTRIES zeroed entries before the + * new node is linked in. + */ +int32 +sorted_id(sorted_list_t * l, int32 *val) +{ + int32 i = 0; + + for (;;) { + if (*val == l->list[i].val.l) + return (i); + if (*val < l->list[i].val.l) { + if (l->list[i].lower == 0) { + + if (l->free >= l->size) { + int newsize = l->size + INITIAL_SORTED_ENTRIES; + l->list = ckd_realloc(l->list, sizeof(sorted_entry_t) * newsize); + memset(l->list + l->size, + 0, INITIAL_SORTED_ENTRIES * sizeof(sorted_entry_t)); + l->size = newsize; + } + + l->list[i].lower = l->free; + (l->free)++; + i = l->list[i].lower; + l->list[i].val.l = *val; + return (i); + } + else + i = l->list[i].lower; + } + else { + if (l->list[i].higher == 0) { + + if (l->free >= l->size) { + int newsize = l->size + INITIAL_SORTED_ENTRIES; + l->list = ckd_realloc(l->list, sizeof(sorted_entry_t) * newsize); + memset(l->list + l->size, + 0, INITIAL_SORTED_ENTRIES * sizeof(sorted_entry_t)); + l->size = newsize; + } + + l->list[i].higher = l->free; + (l->free)++; + i = l->list[i].higher; + l->list[i].val.l = *val; + return (i); + } + else + i = l->list[i].higher; + } + } +} diff --git a/media/sphinxbase/src/libsphinxbase/lm/lm3g_model.h b/media/sphinxbase/src/libsphinxbase/lm/lm3g_model.h new file 
mode 100644 index 000000000..698ed81f5 --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/lm/lm3g_model.h @@ -0,0 +1,177 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2007 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * ==================================================================== + * + */ +/* + * \file lm3g_model.h Core Sphinx 3-gram code used in + * DMP/DMP32/ARPA (for now) model code. + * + * Author: A cast of thousands, probably. + */ + +#ifndef __NGRAM_MODEL_LM3G_H__ +#define __NGRAM_MODEL_LM3G_H__ + +#include "sphinxbase/listelem_alloc.h" + +#include "ngram_model_internal.h" + +/** + * Type used to store language model probabilities + */ +typedef union { + float32 f; + int32 l; +} lmprob_t; + +/** + * Bigram probs and bo-wts, and trigram probs are kept in separate + * tables rather than within the bigram_t and trigram_t structures. + * These tables hold unique prob and bo-wt values. The following tree + * structure is used to construct these tables of unique values. + * Whenever a new value is read from the LM file, the sorted tree + * structure is searched to see if the value already exists, and + * inserted if not found. + */ +typedef struct sorted_entry_s { + lmprob_t val; /**< value being kept in this node */ + uint32 lower; /**< index of another entry. All descendants down + this path have their val < this node's val. + 0 => no son exists (0 is root index) */ + uint32 higher; /**< index of another entry. All descendants down + this path have their val > this node's val + 0 => no son exists (0 is root index) */ +} sorted_entry_t; + +/** + * The sorted list. list is a (64K long) array. The first entry is the + * root of the tree and is created during initialization. + */ +typedef struct { + sorted_entry_t *list; + int32 free; /**< first free element in list */ + int32 size; +} sorted_list_t; + +/** + * Unigram structure (common among all lm3g implementations) + */ +typedef struct unigram_s { + lmprob_t prob1; /**< Unigram probability. */ + lmprob_t bo_wt1; /**< Unigram backoff weight. 
*/ + int32 bigrams; /**< Index of 1st entry in lm_t.bigrams[] */ +} unigram_t; + +/** + * Bigram structure (might be implemented differently) + */ +typedef struct bigram_s bigram_t; +/** + * Trigram structure (might be implemented differently) + */ +typedef struct trigram_s trigram_t; + + +/* + * To conserve space, bigram info is kept in many tables. Since the number + * of distinct values << #bigrams, these table indices can be 16-bit values. + * prob2 and bo_wt2 are such indices, but keeping trigram index is less easy. + * It is supposed to be the index of the first trigram entry for each bigram. + * But such an index cannot be represented in 16-bits, hence the following + * segmentation scheme: Partition bigrams into segments of BG_SEG_SZ + * consecutive entries, such that #trigrams in each segment <= 2**16 (the + * corresponding trigram segment). The bigram_t.trigrams value is then a + * 16-bit relative index within the trigram segment. A separate table-- + * lm_t.tseg_base--has the index of the 1st trigram for each bigram segment. + */ +#define BG_SEG_SZ 512 /* chosen so that #trigram/segment <= 2**16 */ +#define LOG_BG_SEG_SZ 9 + +/** + * Trigram information cache. + * + * The following trigram information cache eliminates most traversals of 1g->2g->3g + * tree to locate trigrams for a given bigram (lw1,lw2). The organization is optimized + * for locality of access (to the same lw1), given lw2. + */ +typedef struct tginfo_s { + int32 w1; /**< lw1 component of bigram lw1,lw2. All bigrams with + same lw2 linked together (see lm_t.tginfo). */ + int32 n_tg; /**< number tg for parent bigram lw1,lw2 */ + int32 bowt; /**< tg bowt for lw1,lw2 */ + int32 used; /**< whether used since last lm_reset */ + trigram_t *tg; /**< Trigrams for lw1,lw2 */ + struct tginfo_s *next; /**< Next lw1 with same parent lw2; NULL if none. */ +} tginfo_t; + +/** + * Common internal structure for Sphinx 3-gram models. 
+ */ +typedef struct lm3g_model_s { + unigram_t *unigrams; /**< Unigram table */ + bigram_t *bigrams; /**< Bigram table */ + trigram_t *trigrams; /**< Trigram table */ + lmprob_t *prob2; /**< Table of actual bigram probs */ + int32 n_prob2; /**< prob2 size */ + lmprob_t *bo_wt2; /**< Table of actual bigram backoff weights */ + int32 n_bo_wt2; /**< bo_wt2 size */ + lmprob_t *prob3; /**< Table of actual trigram probs */ + int32 n_prob3; /**< prob3 size */ + int32 *tseg_base; /**< tseg_base[i>>LOG_BG_SEG_SZ] = index of 1st + trigram for bigram segment (i>>LOG_BG_SEG_SZ) */ + tginfo_t **tginfo; /**< tginfo[lw2] is head of linked list of trigram information for + some cached subset of bigrams (*,lw2). */ + listelem_alloc_t *le; /**< List element allocator for tginfo. */ +} lm3g_model_t; + +/* Trigram info cache management, weight rescaling and unigram insertion; implemented in lm3g_model.c. */ void lm3g_tginfo_free(ngram_model_t *base, lm3g_model_t *lm3g); +void lm3g_tginfo_reset(ngram_model_t *base, lm3g_model_t *lm3g); +void lm3g_apply_weights(ngram_model_t *base, + lm3g_model_t *lm3g, + float32 lw, float32 wip, float32 uw); +int32 lm3g_add_ug(ngram_model_t *base, + lm3g_model_t *lm3g, int32 wid, int32 lweight); + + +/** + * Initialize sorted list with the 0-th (root) entry holding the smallest + * possible value, which may be needed to replace spurious values in the + * Darpa LM file. NOTE(review): this comment used to name MIN_PROB_F, but + * the implementation in lm3g_model.c stores INT_MIN in val.l; the old name + * is presumably historical. + */ +void init_sorted_list(sorted_list_t *l); +void free_sorted_list(sorted_list_t *l); +lmprob_t *vals_in_sorted_list(sorted_list_t *l); +int32 sorted_id(sorted_list_t * l, int32 *val); + +#endif /* __NGRAM_MODEL_LM3G_H__ */ diff --git a/media/sphinxbase/src/libsphinxbase/lm/lm3g_templates.c b/media/sphinxbase/src/libsphinxbase/lm/lm3g_templates.c new file mode 100644 index 000000000..080cfa8e6 --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/lm/lm3g_templates.c @@ -0,0 +1,560 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2007 Carnegie Mellon University. All rights + * reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ +/* + * \file lm3g_templates.c Core Sphinx 3-gram code used in + * DMP/DMP32/ARPA (for now) model code. 
+ */ + +#include <assert.h> + +/* Locate a specific bigram within a bigram list: returns the index in bg[0..n) of the entry whose wid equals w, or -1 when there is none. The bisection step relies on the entries being ordered by increasing wid. */ +#define BINARY_SEARCH_THRESH 16 +static int32 +find_bg(bigram_t * bg, int32 n, int32 w) +{ + int32 i, b, e; + + /* Binary search until the remaining segment [b, e) holds at most BINARY_SEARCH_THRESH entries */ + b = 0; + e = n; + while (e - b > BINARY_SEARCH_THRESH) { + i = (b + e) >> 1; + if (bg[i].wid < w) + b = i + 1; + else if (bg[i].wid > w) + e = i; + else + return i; + } + + /* Linear search within narrowed segment */ + for (i = b; (i < e) && (bg[i].wid != w); i++); + return ((i < e) ? i : -1); +} + +/* Score the bigram (lw1, lw2). When lw1 is negative or the model order is below 2 the unigram score of lw2 is returned. Otherwise the result is the stored bigram probability, or, when the bigram is absent, the backoff weight of lw1 plus the unigram score of lw2. *n_used receives the n-gram order actually used (1 or 2). */ static int32 +lm3g_bg_score(NGRAM_MODEL_TYPE *model, + int32 lw1, int32 lw2, int32 *n_used) +{ + int32 i, n, b, score; + bigram_t *bg; + + if (lw1 < 0 || model->base.n < 2) { + *n_used = 1; + return model->lm3g.unigrams[lw2].prob1.l; + } + + /* Bigrams of lw1 occupy the index range [FIRST_BG(lw1), FIRST_BG(lw1 + 1)). */ b = FIRST_BG(model, lw1); + n = FIRST_BG(model, lw1 + 1) - b; + bg = model->lm3g.bigrams + b; + + if ((i = find_bg(bg, n, lw2)) >= 0) { + /* Access mode = bigram */ + *n_used = 2; + score = model->lm3g.prob2[bg[i].prob2].l; + } + else { + /* Access mode = unigram */ + *n_used = 1; + score = model->lm3g.unigrams[lw1].bo_wt1.l + model->lm3g.unigrams[lw2].prob1.l; + } + + return (score); +} + +/* Create a cache entry for the bigram (lw1, lw2) and push it at the head of the tginfo[lw2] list. */ static void +load_tginfo(NGRAM_MODEL_TYPE *model, int32 lw1, int32 lw2) +{ + int32 i, n, b, t; + bigram_t *bg; + tginfo_t *tginfo; + + /* First allocate space for tg information for bg lw1,lw2 */ + tginfo = (tginfo_t *) listelem_malloc(model->lm3g.le); + tginfo->w1 = lw1; + tginfo->tg = NULL; + tginfo->next = model->lm3g.tginfo[lw2]; + model->lm3g.tginfo[lw2] = tginfo; + + /* Locate bigram lw1,lw2 */ + b = model->lm3g.unigrams[lw1].bigrams; + n = model->lm3g.unigrams[lw1 + 1].bigrams - b; + bg = model->lm3g.bigrams + b; + + if ((n > 0) && ((i = find_bg(bg, n, lw2)) >= 0)) { + tginfo->bowt = model->lm3g.bo_wt2[bg[i].bo_wt2].l; + + /* Find t = Absolute first trigram index for bigram lw1,lw2 */ + b += i; /* b = Absolute index of bigram lw1,lw2 on disk */ + t = FIRST_TG(model, b); + + tginfo->tg = 
model->lm3g.trigrams + t; + + /* Find #tg for bigram w1,w2 */ + tginfo->n_tg = FIRST_TG(model, b + 1) - t; + } + else { /* No bigram w1,w2: the entry keeps tg == NULL with zero trigrams and a zero backoff weight */ + tginfo->bowt = 0; + tginfo->n_tg = 0; + } +} + +/* Similar to find_bg: returns the index in tg[0..n) of the trigram whose wid equals w, or -1 when there is none. */ +static int32 +find_tg(trigram_t * tg, int32 n, uint32 w) +{ + int32 i, b, e; + + b = 0; + e = n; + while (e - b > BINARY_SEARCH_THRESH) { + i = (b + e) >> 1; + if (tg[i].wid < w) + b = i + 1; + else if (tg[i].wid > w) + e = i; + else + return i; + } + + for (i = b; (i < e) && (tg[i].wid != w); i++); + return ((i < e) ? i : -1); +} + +/* Score the trigram (lw1, lw2, lw3). Delegates to lm3g_bg_score(lw2, lw3) when the model order is below 3 or either history word is negative. Otherwise the cached tginfo entry for (lw1, lw2) is used, and the result is the stored trigram probability or, when the trigram is absent, the cached backoff weight plus the bigram score. */ static int32 +lm3g_tg_score(NGRAM_MODEL_TYPE *model, int32 lw1, + int32 lw2, int32 lw3, int32 *n_used) +{ + ngram_model_t *base = &model->base; + int32 i, n, score; + trigram_t *tg; + tginfo_t *tginfo, *prev_tginfo; + + if ((base->n < 3) || (lw1 < 0) || (lw2 < 0)) + return (lm3g_bg_score(model, lw2, lw3, n_used)); + + /* Search the tginfo[lw2] list for an entry whose first word is lw1, remembering its predecessor. */ prev_tginfo = NULL; + for (tginfo = model->lm3g.tginfo[lw2]; tginfo; tginfo = tginfo->next) { + if (tginfo->w1 == lw1) + break; + prev_tginfo = tginfo; + } + + if (!tginfo) { + /* Cache miss: load_tginfo pushes the new entry at the head of the list. */ load_tginfo(model, lw1, lw2); + tginfo = model->lm3g.tginfo[lw2]; + } + else if (prev_tginfo) { + /* Cache hit that is not already first: unlink it and move it to the head of the list. */ prev_tginfo->next = tginfo->next; + tginfo->next = model->lm3g.tginfo[lw2]; + model->lm3g.tginfo[lw2] = tginfo; + } + + tginfo->used = 1; + + /* Trigrams for w1,w2 now pointed to by tginfo */ + n = tginfo->n_tg; + tg = tginfo->tg; + if ((i = find_tg(tg, n, lw3)) >= 0) { + /* Access mode = trigram */ + *n_used = 3; + score = model->lm3g.prob3[tg[i].prob3].l; + } + else { + score = tginfo->bowt + lm3g_bg_score(model, lw2, lw3, n_used); + } + + return (score); +} + +/* Score wid given its history, dispatching on the history length. history[0] is passed to lm3g_bg_score as the word immediately preceding wid. */ static int32 +lm3g_template_score(ngram_model_t *base, int32 wid, + int32 *history, int32 n_hist, + int32 *n_used) +{ + NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base; + switch (n_hist) { + case 0: + /* Access mode: unigram */ + *n_used = 1; + return model->lm3g.unigrams[wid].prob1.l; + case 1: + return lm3g_bg_score(model, history[0], wid, n_used); + case 2: + default: + /* 
Anything greater than 2 is the same as a trigram for now. */ + return lm3g_tg_score(model, history[1], history[0], wid, n_used); + } +} + +/* Like lm3g_template_score, but with the word insertion penalty and language weight removed from the result. For an empty history the interpolation with the uniform distribution is undone as well, except for the word "<s>". */ static int32 +lm3g_template_raw_score(ngram_model_t *base, int32 wid, + int32 *history, int32 n_hist, + int32 *n_used) +{ + NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base; + int32 score; + + switch (n_hist) { + case 0: + /* Access mode: unigram */ + *n_used = 1; + /* Undo insertion penalty. */ + score = model->lm3g.unigrams[wid].prob1.l - base->log_wip; + /* Undo language weight. */ + score = (int32)(score / base->lw); + /* Undo unigram interpolation */ + if (strcmp(base->word_str[wid], "<s>") != 0) { /* FIXME: configurable start_sym */ + /* This operation is numerically unstable, so try to avoid it + * as possible: it is skipped when the uniform term is not above the logmath zero value */ + if (base->log_uniform + base->log_uniform_weight > logmath_get_zero(base->lmath)) { + score = logmath_log(base->lmath, + logmath_exp(base->lmath, score) + - logmath_exp(base->lmath, + base->log_uniform + base->log_uniform_weight)); + } + } + /* The unigram case is fully unscaled above, so it returns here and skips the common tail after the switch. */ return score; + case 1: + score = lm3g_bg_score(model, history[0], wid, n_used); + break; + case 2: + default: + /* Anything greater than 2 is the same as a trigram for now. */ + score = lm3g_tg_score(model, history[1], history[0], wid, n_used); + break; + } + /* FIXME (maybe): This doesn't undo unigram weighting in backoff cases. 
+ */ + return (int32)((score - base->log_wip) / base->lw); +} + +/* Adapter: casts base to the concrete model type and forwards to lm3g_add_ug with its embedded lm3g structure. */ static int32 +lm3g_template_add_ug(ngram_model_t *base, + int32 wid, int32 lweight) +{ + NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base; + return lm3g_add_ug(base, &model->lm3g, wid, lweight); +} + +/* Adapter: empties the trigram info cache through lm3g_tginfo_reset. */ static void +lm3g_template_flush(ngram_model_t *base) +{ + NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base; + lm3g_tginfo_reset(base, &model->lm3g); +} + +/* Iterator state: the generic iterator header, followed by pointers to the unigram, bigram and trigram entries the iterator currently refers to. */ typedef struct lm3g_iter_s { + ngram_iter_t base; + unigram_t *ug; + bigram_t *bg; + trigram_t *tg; +} lm3g_iter_t; + +/* Build an iterator positioned on the n-gram formed by wid preceded by n_hist history words (history[0] being the nearest). Returns NULL when the requested bigram or trigram is not in the model; the partially built iterator is released with ngram_iter_free first. */ static ngram_iter_t * +lm3g_template_iter(ngram_model_t *base, int32 wid, + int32 *history, int32 n_hist) +{ + NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base; + lm3g_iter_t *itor = (lm3g_iter_t *)ckd_calloc(1, sizeof(*itor)); + + ngram_iter_init((ngram_iter_t *)itor, base, n_hist, FALSE); + + if (n_hist == 0) { + /* Unigram is the easiest. */ + itor->ug = model->lm3g.unigrams + wid; + return (ngram_iter_t *)itor; + } + else if (n_hist == 1) { + int32 i, n, b; + /* Find the bigram, as in bg_score above (duplicate code...) */ + itor->ug = model->lm3g.unigrams + history[0]; + b = FIRST_BG(model, history[0]); + n = FIRST_BG(model, history[0] + 1) - b; + itor->bg = model->lm3g.bigrams + b; + /* If no such bigram exists then fail. */ + if ((i = find_bg(itor->bg, n, wid)) < 0) { + ngram_iter_free((ngram_iter_t *)itor); + return NULL; + } + itor->bg += i; + return (ngram_iter_t *)itor; + } + else if (n_hist == 2) { + int32 i, n; + tginfo_t *tginfo, *prev_tginfo; + /* Find the trigram, as in tg_score above (duplicate code...) 
*/ + itor->ug = model->lm3g.unigrams + history[1]; + prev_tginfo = NULL; + for (tginfo = model->lm3g.tginfo[history[0]]; + tginfo; tginfo = tginfo->next) { + if (tginfo->w1 == history[1]) + break; + prev_tginfo = tginfo; + } + + if (!tginfo) { + load_tginfo(model, history[1], history[0]); + tginfo = model->lm3g.tginfo[history[0]]; + } + else if (prev_tginfo) { + prev_tginfo->next = tginfo->next; + tginfo->next = model->lm3g.tginfo[history[0]]; + model->lm3g.tginfo[history[0]] = tginfo; + } + + tginfo->used = 1; + + /* Trigrams for w1,w2 now pointed to by tginfo */ + n = tginfo->n_tg; + itor->tg = tginfo->tg; + if ((i = find_tg(itor->tg, n, wid)) >= 0) { + itor->tg += i; + /* Now advance the bigram pointer accordingly. FIXME: + * Note that we actually already found the relevant bigram + * in load_tginfo. */ + itor->bg = model->lm3g.bigrams; + while (FIRST_TG(model, (itor->bg - model->lm3g.bigrams + 1)) + <= (itor->tg - model->lm3g.trigrams)) + ++itor->bg; + return (ngram_iter_t *)itor; + } + else { + ngram_iter_free((ngram_iter_t *)itor); + return (ngram_iter_t *)NULL; + } + } + else { + /* Should not happen. */ + assert(n_hist == 0); /* Guaranteed to fail. */ + ngram_iter_free((ngram_iter_t *)itor); + return NULL; + } +} + +static ngram_iter_t * +lm3g_template_mgrams(ngram_model_t *base, int m) +{ + NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base; + lm3g_iter_t *itor = (lm3g_iter_t *)ckd_calloc(1, sizeof(*itor)); + ngram_iter_init((ngram_iter_t *)itor, base, m, FALSE); + + itor->ug = model->lm3g.unigrams; + itor->bg = model->lm3g.bigrams; + itor->tg = model->lm3g.trigrams; + + /* Advance bigram pointer to match first trigram. */ + if (m > 1 && base->n_counts[1] > 1) { + while (FIRST_TG(model, (itor->bg - model->lm3g.bigrams + 1)) + <= (itor->tg - model->lm3g.trigrams)) + ++itor->bg; + } + + /* Advance unigram pointer to match first bigram. 
*/ + if (m > 0 && base->n_counts[0] > 1) { + while (itor->ug[1].bigrams <= (itor->bg - model->lm3g.bigrams)) + ++itor->ug; + } + + return (ngram_iter_t *)itor; +} + +static ngram_iter_t * +lm3g_template_successors(ngram_iter_t *bitor) +{ + NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)bitor->model; + lm3g_iter_t *from = (lm3g_iter_t *)bitor; + lm3g_iter_t *itor = (lm3g_iter_t *)ckd_calloc(1, sizeof(*itor)); + + itor->ug = from->ug; + switch (bitor->m) { + case 0: + /* Next itor bigrams is the same as this itor bigram or + itor bigrams is more than total count. This means no successors */ + if (((itor->ug + 1) - model->lm3g.unigrams < bitor->model->n_counts[0] && + itor->ug->bigrams == (itor->ug + 1)->bigrams) || + itor->ug->bigrams == bitor->model->n_counts[1]) + goto done; + + /* Start iterating from first bigram successor of from->ug. */ + itor->bg = model->lm3g.bigrams + itor->ug->bigrams; + break; + case 1: + itor->bg = from->bg; + + /* This indicates no successors */ + if (((itor->bg + 1) - model->lm3g.bigrams < bitor->model->n_counts[1] && + FIRST_TG (model, itor->bg - model->lm3g.bigrams) == + FIRST_TG (model, (itor->bg + 1) - model->lm3g.bigrams)) || + FIRST_TG (model, itor->bg - model->lm3g.bigrams) == bitor->model->n_counts[2]) + goto done; + + /* Start iterating from first trigram successor of from->bg. */ + itor->tg = (model->lm3g.trigrams + + FIRST_TG(model, (itor->bg - model->lm3g.bigrams))); +#if 0 + printf("%s %s => %d (%s)\n", + model->base.word_str[itor->ug - model->lm3g.unigrams], + model->base.word_str[itor->bg->wid], + FIRST_TG(model, (itor->bg - model->lm3g.bigrams)), + model->base.word_str[itor->tg->wid]); +#endif + break; + case 2: + default: + /* All invalid! 
*/ + goto done; + } + + ngram_iter_init((ngram_iter_t *)itor, bitor->model, bitor->m + 1, TRUE); + return (ngram_iter_t *)itor; + done: + ckd_free(itor); + return NULL; +} + +static int32 const * +lm3g_template_iter_get(ngram_iter_t *base, + int32 *out_score, int32 *out_bowt) +{ + NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base->model; + lm3g_iter_t *itor = (lm3g_iter_t *)base; + + base->wids[0] = itor->ug - model->lm3g.unigrams; + if (itor->bg) base->wids[1] = itor->bg->wid; + if (itor->tg) base->wids[2] = itor->tg->wid; +#if 0 + printf("itor_get: %d %d %d\n", base->wids[0], base->wids[1], base->wids[2]); +#endif + + switch (base->m) { + case 0: + *out_score = itor->ug->prob1.l; + *out_bowt = itor->ug->bo_wt1.l; + break; + case 1: + *out_score = model->lm3g.prob2[itor->bg->prob2].l; + if (model->lm3g.bo_wt2) + *out_bowt = model->lm3g.bo_wt2[itor->bg->bo_wt2].l; + else + *out_bowt = 0; + break; + case 2: + *out_score = model->lm3g.prob3[itor->tg->prob3].l; + *out_bowt = 0; + break; + default: /* Should not happen. */ + return NULL; + } + return base->wids; +} + +static ngram_iter_t * +lm3g_template_iter_next(ngram_iter_t *base) +{ + NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base->model; + lm3g_iter_t *itor = (lm3g_iter_t *)base; + + switch (base->m) { + case 0: + ++itor->ug; + /* Check for end condition. */ + if (itor->ug - model->lm3g.unigrams >= base->model->n_counts[0]) + goto done; + break; + case 1: + ++itor->bg; + /* Check for end condition. */ + if (itor->bg - model->lm3g.bigrams >= base->model->n_counts[1]) + goto done; + /* Advance unigram pointer if necessary in order to get one + * that points to this bigram. */ + while (itor->bg - model->lm3g.bigrams >= itor->ug[1].bigrams) { + /* Stop if this is a successor iterator, since we don't + * want a new unigram. 
*/ + if (base->successor) + goto done; + ++itor->ug; + if (itor->ug == model->lm3g.unigrams + base->model->n_counts[0]) { + E_ERROR("Bigram %d has no valid unigram parent\n", + itor->bg - model->lm3g.bigrams); + goto done; + } + } + break; + case 2: + ++itor->tg; + /* Check for end condition. */ + if (itor->tg - model->lm3g.trigrams >= base->model->n_counts[2]) + goto done; + /* Advance bigram pointer if necessary. */ + while (itor->tg - model->lm3g.trigrams >= + FIRST_TG(model, (itor->bg - model->lm3g.bigrams + 1))) { + if (base->successor) + goto done; + ++itor->bg; + if (itor->bg == model->lm3g.bigrams + base->model->n_counts[1]) { + E_ERROR("Trigram %d has no valid bigram parent\n", + itor->tg - model->lm3g.trigrams); + + goto done; + } + } + /* Advance unigram pointer if necessary. */ + while (itor->bg - model->lm3g.bigrams >= itor->ug[1].bigrams) { + ++itor->ug; + if (itor->ug == model->lm3g.unigrams + base->model->n_counts[0]) { + E_ERROR("Trigram %d has no valid unigram parent\n", + itor->tg - model->lm3g.trigrams); + goto done; + } + } + break; + default: /* Should not happen. */ + goto done; + } + + return (ngram_iter_t *)itor; +done: + ngram_iter_free(base); + return NULL; +} + +static void +lm3g_template_iter_free(ngram_iter_t *base) +{ + ckd_free(base); +} diff --git a/media/sphinxbase/src/libsphinxbase/lm/ngram_model.c b/media/sphinxbase/src/libsphinxbase/lm/ngram_model.c new file mode 100644 index 000000000..02af4151b --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/lm/ngram_model.c @@ -0,0 +1,1129 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2007 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ +/* + * \file ngram_model.c N-Gram language models. 
+ * + * Author: David Huggins-Daines, much code taken from sphinx3/src/libs3decoder/liblm + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <string.h> +#include <assert.h> + +#include "sphinxbase/ngram_model.h" +#include "sphinxbase/ckd_alloc.h" +#include "sphinxbase/filename.h" +#include "sphinxbase/pio.h" +#include "sphinxbase/err.h" +#include "sphinxbase/logmath.h" +#include "sphinxbase/strfuncs.h" +#include "sphinxbase/case.h" + +#include "ngram_model_internal.h" + +ngram_file_type_t +ngram_file_name_to_type(const char *file_name) +{ + const char *ext; + + ext = strrchr(file_name, '.'); + if (ext == NULL) { + return NGRAM_INVALID; + } + if (0 == strcmp_nocase(ext, ".gz")) { + while (--ext >= file_name) { + if (*ext == '.') break; + } + if (ext < file_name) { + return NGRAM_INVALID; + } + } + else if (0 == strcmp_nocase(ext, ".bz2")) { + while (--ext >= file_name) { + if (*ext == '.') break; + } + if (ext < file_name) { + return NGRAM_INVALID; + } + } + /* We use strncmp because there might be a .gz on the end. 
*/ + if (0 == strncmp_nocase(ext, ".ARPA", 5)) + return NGRAM_ARPA; + if (0 == strncmp_nocase(ext, ".DMP", 4)) + return NGRAM_DMP; + return NGRAM_INVALID; + } + +ngram_file_type_t +ngram_str_to_type(const char *str_name) +{ + if (0 == strcmp_nocase(str_name, "arpa")) + return NGRAM_ARPA; + if (0 == strcmp_nocase(str_name, "dmp")) + return NGRAM_DMP; + return NGRAM_INVALID; +} + +char const * +ngram_type_to_str(int type) +{ + switch (type) { + case NGRAM_ARPA: + return "arpa"; + case NGRAM_DMP: + return "dmp"; + default: + return NULL; + } +} + + + ngram_model_t * + ngram_model_read(cmd_ln_t *config, + const char *file_name, + ngram_file_type_t file_type, + logmath_t *lmath) + { + ngram_model_t *model = NULL; + + switch (file_type) { + case NGRAM_AUTO: { + if ((model = ngram_model_arpa_read(config, file_name, lmath)) != NULL) + break; + if ((model = ngram_model_dmp_read(config, file_name, lmath)) != NULL) + break; + return NULL; + } + case NGRAM_ARPA: + model = ngram_model_arpa_read(config, file_name, lmath); + break; + case NGRAM_DMP: + model = ngram_model_dmp_read(config, file_name, lmath); + break; + default: + E_ERROR("language model file type not supported\n"); + return NULL; + } + + /* Now set weights based on config if present. 
*/ + if (config) { + float32 lw = 1.0; + float32 wip = 1.0; + float32 uw = 1.0; + + if (cmd_ln_exists_r(config, "-lw")) + lw = cmd_ln_float32_r(config, "-lw"); + if (cmd_ln_exists_r(config, "-wip")) + wip = cmd_ln_float32_r(config, "-wip"); + if (cmd_ln_exists_r(config, "-uw")) + uw = cmd_ln_float32_r(config, "-uw"); + + ngram_model_apply_weights(model, lw, wip, uw); + } + + return model; + } + + int + ngram_model_write(ngram_model_t *model, const char *file_name, + ngram_file_type_t file_type) + { + switch (file_type) { + case NGRAM_AUTO: { + file_type = ngram_file_name_to_type(file_name); + /* Default to ARPA (catches .lm and other things) */ + if (file_type == NGRAM_INVALID) + file_type = NGRAM_ARPA; + return ngram_model_write(model, file_name, file_type); + } + case NGRAM_ARPA: + return ngram_model_arpa_write(model, file_name); + case NGRAM_DMP: + return ngram_model_dmp_write(model, file_name); + default: + E_ERROR("language model file type not supported\n"); + return -1; + } + E_ERROR("language model file type not supported\n"); + return -1; + } + + int32 + ngram_model_init(ngram_model_t *base, + ngram_funcs_t *funcs, + logmath_t *lmath, + int32 n, int32 n_unigram) + { + base->refcount = 1; + base->funcs = funcs; + base->n = n; + /* If this was previously initialized... */ + if (base->n_counts == NULL) + base->n_counts = ckd_calloc(3, sizeof(*base->n_counts)); + /* Don't reset weights if logmath object hasn't changed. */ + if (base->lmath != lmath) { + /* Set default values for weights. */ + base->lw = 1.0; + base->log_wip = 0; /* i.e. 1.0 */ + base->log_uw = 0; /* i.e. 1.0 */ + base->log_uniform = logmath_log(lmath, 1.0 / n_unigram); + base->log_uniform_weight = logmath_get_zero(lmath); + base->log_zero = logmath_get_zero(lmath); + base->lmath = lmath; + } + /* Allocate or reallocate space for word strings. */ + if (base->word_str) { + /* Free all previous word strings if they were allocated. 
*/ + if (base->writable) { + int32 i; + for (i = 0; i < base->n_words; ++i) { + ckd_free(base->word_str[i]); + base->word_str[i] = NULL; + } + } + base->word_str = ckd_realloc(base->word_str, n_unigram * sizeof(char *)); + } + else + base->word_str = ckd_calloc(n_unigram, sizeof(char *)); + /* NOTE: They are no longer case-insensitive since we are allowing + * other encodings for word strings. Beware. */ + if (base->wid) + hash_table_empty(base->wid); + else + base->wid = hash_table_new(n_unigram, FALSE); + base->n_counts[0] = base->n_1g_alloc = base->n_words = n_unigram; + + return 0; +} + +ngram_model_t * +ngram_model_retain(ngram_model_t *model) +{ + ++model->refcount; + return model; +} + + +void +ngram_model_flush(ngram_model_t *model) +{ + if (model->funcs && model->funcs->flush) + (*model->funcs->flush)(model); +} + +int +ngram_model_free(ngram_model_t *model) +{ + int i; + + if (model == NULL) + return 0; + if (--model->refcount > 0) + return model->refcount; + if (model->funcs && model->funcs->free) + (*model->funcs->free)(model); + if (model->writable) { + /* Free all words. */ + for (i = 0; i < model->n_words; ++i) { + ckd_free(model->word_str[i]); + } + } + else { + /* Free all class words. */ + for (i = 0; i < model->n_classes; ++i) { + ngram_class_t *lmclass; + int32 j; + + lmclass = model->classes[i]; + for (j = 0; j < lmclass->n_words; ++j) { + ckd_free(model->word_str[lmclass->start_wid + j]); + } + for (j = 0; j < lmclass->n_hash; ++j) { + if (lmclass->nword_hash[j].wid != -1) { + ckd_free(model->word_str[lmclass->nword_hash[j].wid]); + } + } + } + } + for (i = 0; i < model->n_classes; ++i) { + ngram_class_free(model->classes[i]); + } + ckd_free(model->classes); + hash_table_free(model->wid); + ckd_free(model->word_str); + ckd_free(model->n_counts); + ckd_free(model); + return 0; +} + +int +ngram_model_casefold(ngram_model_t *model, int kase) +{ + int writable, i; + hash_table_t *new_wid; + + /* Were word strings already allocated? 
*/ + writable = model->writable; + /* Either way, we are going to allocate some word strings. */ + model->writable = TRUE; + + /* And, don't forget, we need to rebuild the word to unigram ID + * mapping. */ + new_wid = hash_table_new(model->n_words, FALSE); + for (i = 0; i < model->n_words; ++i) { + char *outstr; + if (writable) { + outstr = model->word_str[i]; + } + else { + outstr = ckd_salloc(model->word_str[i]); + } + /* Don't case-fold <tags> or [classes] */ + if (outstr[0] == '<' || outstr[0] == '[') { + } + else { + switch (kase) { + case NGRAM_UPPER: + ucase(outstr); + break; + case NGRAM_LOWER: + lcase(outstr); + break; + default: + ; + } + } + model->word_str[i] = outstr; + + /* Now update the hash table. We might have terrible + * collisions here, so warn about them. */ + if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) { + E_WARN("Duplicate word in dictionary after conversion: %s\n", + model->word_str[i]); + } + } + /* Swap out the hash table. */ + hash_table_free(model->wid); + model->wid = new_wid; + return 0; +} + +int +ngram_model_apply_weights(ngram_model_t *model, + float32 lw, float32 wip, float32 uw) +{ + return (*model->funcs->apply_weights)(model, lw, wip, uw); +} + +float32 +ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip, + int32 *out_log_uw) +{ + if (out_log_wip) *out_log_wip = model->log_wip; + if (out_log_uw) *out_log_uw = model->log_uw; + return model->lw; +} + + +int32 +ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history, + int32 n_hist, int32 *n_used) +{ + int32 score, class_weight = 0; + int i; + + /* Closed vocabulary, OOV word probability is zero */ + if (wid == NGRAM_INVALID_WID) + return model->log_zero; + + /* "Declassify" wid and history */ + if (NGRAM_IS_CLASSWID(wid)) { + ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)]; + + class_weight = ngram_class_prob(lmclass, wid); + if (class_weight == 1) /* Meaning, not found in class. 
*/ + return model->log_zero; + wid = lmclass->tag_wid; + } + for (i = 0; i < n_hist; ++i) { + if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i])) + history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid; + } + score = (*model->funcs->score)(model, wid, history, n_hist, n_used); + + /* Multiply by unigram in-class weight. */ + return score + class_weight; +} + +int32 +ngram_score(ngram_model_t *model, const char *word, ...) +{ + va_list history; + const char *hword; + int32 *histid; + int32 n_hist; + int32 n_used; + int32 prob; + + va_start(history, word); + n_hist = 0; + while ((hword = va_arg(history, const char *)) != NULL) + ++n_hist; + va_end(history); + + histid = ckd_calloc(n_hist, sizeof(*histid)); + va_start(history, word); + n_hist = 0; + while ((hword = va_arg(history, const char *)) != NULL) { + histid[n_hist] = ngram_wid(model, hword); + ++n_hist; + } + va_end(history); + + prob = ngram_ng_score(model, ngram_wid(model, word), + histid, n_hist, &n_used); + ckd_free(histid); + return prob; +} + +int32 +ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used) +{ + int32 hist[2]; + hist[0] = w2; + hist[1] = w1; + return ngram_ng_score(model, w3, hist, 2, n_used); +} + +int32 +ngram_bg_score(ngram_model_t *model, int32 w2, int32 w1, int32 *n_used) +{ + return ngram_ng_score(model, w2, &w1, 1, n_used); +} + +int32 +ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history, + int32 n_hist, int32 *n_used) +{ + int32 prob, class_weight = 0; + int i; + + /* Closed vocabulary, OOV word probability is zero */ + if (wid == NGRAM_INVALID_WID) + return model->log_zero; + + /* "Declassify" wid and history */ + if (NGRAM_IS_CLASSWID(wid)) { + ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)]; + + class_weight = ngram_class_prob(lmclass, wid); + if (class_weight == 1) /* Meaning, not found in class. 
*/ + return class_weight; + wid = lmclass->tag_wid; + } + for (i = 0; i < n_hist; ++i) { + if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i])) + history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid; + } + prob = (*model->funcs->raw_score)(model, wid, history, + n_hist, n_used); + /* Multiply by unigram in-class weight. */ + return prob + class_weight; +} + +int32 +ngram_probv(ngram_model_t *model, const char *word, ...) +{ + va_list history; + const char *hword; + int32 *histid; + int32 n_hist; + int32 n_used; + int32 prob; + + va_start(history, word); + n_hist = 0; + while ((hword = va_arg(history, const char *)) != NULL) + ++n_hist; + va_end(history); + + histid = ckd_calloc(n_hist, sizeof(*histid)); + va_start(history, word); + n_hist = 0; + while ((hword = va_arg(history, const char *)) != NULL) { + histid[n_hist] = ngram_wid(model, hword); + ++n_hist; + } + va_end(history); + + prob = ngram_ng_prob(model, ngram_wid(model, word), + histid, n_hist, &n_used); + ckd_free(histid); + return prob; +} + +int32 +ngram_prob(ngram_model_t *model, const char *const *words, int32 n) +{ + int32 *ctx_id; + int32 nused; + int32 prob; + int32 wid; + uint32 i; + + ctx_id = (int32 *)ckd_calloc(n - 1, sizeof(*ctx_id)); + for (i = 1; i < n; ++i) + ctx_id[i - 1] = ngram_wid(model, words[i]); + + wid = ngram_wid(model, *words); + prob = ngram_ng_prob(model, wid, ctx_id, n - 1, &nused); + ckd_free(ctx_id); + + return prob; +} + +int32 +ngram_score_to_prob(ngram_model_t *base, int32 score) +{ + int32 prob; + + /* Undo insertion penalty. */ + prob = score - base->log_wip; + /* Undo language weight. */ + prob = (int32)(prob / base->lw); + + return prob; +} + +int32 +ngram_unknown_wid(ngram_model_t *model) +{ + int32 val; + + /* FIXME: This could be memoized for speed if necessary. */ + /* Look up <UNK>, if not found return NGRAM_INVALID_WID. 
*/ + if (hash_table_lookup_int32(model->wid, "<UNK>", &val) == -1) + return NGRAM_INVALID_WID; + else + return val; +} + +int32 +ngram_zero(ngram_model_t *model) +{ + return model->log_zero; +} + +int32 +ngram_model_get_size(ngram_model_t *model) +{ + if (model != NULL) + return model->n; + return 0; +} + +int32 const * +ngram_model_get_counts(ngram_model_t *model) +{ + if (model != NULL) + return model->n_counts; + return NULL; +} + +void +ngram_iter_init(ngram_iter_t *itor, ngram_model_t *model, + int m, int successor) +{ + itor->model = model; + itor->wids = ckd_calloc(model->n, sizeof(*itor->wids)); + itor->m = m; + itor->successor = successor; +} + +ngram_iter_t * +ngram_model_mgrams(ngram_model_t *model, int m) +{ + ngram_iter_t *itor; + /* The fact that m=n-1 is not exactly obvious. Prevent accidents. */ + if (m >= model->n) + return NULL; + if (model->funcs->mgrams == NULL) + return NULL; + itor = (*model->funcs->mgrams)(model, m); + return itor; +} + +ngram_iter_t * +ngram_iter(ngram_model_t *model, const char *word, ...) +{ + va_list history; + const char *hword; + int32 *histid; + int32 n_hist; + ngram_iter_t *itor; + + va_start(history, word); + n_hist = 0; + while ((hword = va_arg(history, const char *)) != NULL) + ++n_hist; + va_end(history); + + histid = ckd_calloc(n_hist, sizeof(*histid)); + va_start(history, word); + n_hist = 0; + while ((hword = va_arg(history, const char *)) != NULL) { + histid[n_hist] = ngram_wid(model, hword); + ++n_hist; + } + va_end(history); + + itor = ngram_ng_iter(model, ngram_wid(model, word), histid, n_hist); + ckd_free(histid); + return itor; +} + +ngram_iter_t * +ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist) +{ + if (n_hist >= model->n) + return NULL; + if (model->funcs->iter == NULL) + return NULL; + return (*model->funcs->iter)(model, wid, history, n_hist); +} + +ngram_iter_t * +ngram_iter_successors(ngram_iter_t *itor) +{ + /* Stop when we are at the highest order N-Gram. 
*/ + if (itor->m == itor->model->n - 1) + return NULL; + return (*itor->model->funcs->successors)(itor); +} + +int32 const * +ngram_iter_get(ngram_iter_t *itor, + int32 *out_score, + int32 *out_bowt) +{ + return (*itor->model->funcs->iter_get)(itor, out_score, out_bowt); +} + +ngram_iter_t * +ngram_iter_next(ngram_iter_t *itor) +{ + return (*itor->model->funcs->iter_next)(itor); +} + +void +ngram_iter_free(ngram_iter_t *itor) +{ + ckd_free(itor->wids); + (*itor->model->funcs->iter_free)(itor); +} + +int32 +ngram_wid(ngram_model_t *model, const char *word) +{ + int32 val; + + if (hash_table_lookup_int32(model->wid, word, &val) == -1) + return ngram_unknown_wid(model); + else + return val; +} + +const char * +ngram_word(ngram_model_t *model, int32 wid) +{ + /* Remove any class tag */ + wid = NGRAM_BASEWID(wid); + if (wid >= model->n_words) + return NULL; + return model->word_str[wid]; +} + +/** + * Add a word to the word string and ID mapping. + */ +int32 +ngram_add_word_internal(ngram_model_t *model, + const char *word, + int32 classid) +{ + + /* Check for hash collisions. */ + int32 wid; + if (hash_table_lookup_int32(model->wid, word, &wid) == 0) { + E_WARN("Omit duplicate word '%s'\n", word); + return wid; + } + + /* Take the next available word ID */ + wid = model->n_words; + if (classid >= 0) { + wid = NGRAM_CLASSWID(wid, classid); + } + + /* Reallocate word_str if necessary. */ + if (model->n_words >= model->n_1g_alloc) { + model->n_1g_alloc += UG_ALLOC_STEP; + model->word_str = ckd_realloc(model->word_str, + sizeof(*model->word_str) * model->n_1g_alloc); + } + /* Add the word string in the appropriate manner. */ + /* Class words are always dynamically allocated. */ + model->word_str[model->n_words] = ckd_salloc(word); + /* Now enter it into the hash table. 
*/ + if (hash_table_enter_int32(model->wid, model->word_str[model->n_words], wid) != wid) { + E_ERROR("Hash insertion failed for word %s => %p (should not happen)\n", + model->word_str[model->n_words], (void *)(long)(wid)); + } + /* Increment number of words. */ + ++model->n_words; + return wid; +} + +int32 +ngram_model_add_word(ngram_model_t *model, + const char *word, float32 weight) +{ + int32 wid, prob = model->log_zero; + + /* If we add word to unwritable model, we need to make it writable */ + if (!model->writable) { + E_WARN("Can't add word '%s' to read-only language model. " + "Disable mmap with '-mmap no' to make it writable\n", word); + return -1; + } + + wid = ngram_add_word_internal(model, word, -1); + if (wid == NGRAM_INVALID_WID) + return wid; + + /* Do what needs to be done to add the word to the unigram. */ + if (model->funcs && model->funcs->add_ug) + prob = (*model->funcs->add_ug)(model, wid, logmath_log(model->lmath, weight)); + if (prob == 0) + return -1; + + return wid; +} + +ngram_class_t * +ngram_class_new(ngram_model_t *model, int32 tag_wid, int32 start_wid, glist_t classwords) +{ + ngram_class_t *lmclass; + gnode_t *gn; + float32 tprob; + int i; + + lmclass = ckd_calloc(1, sizeof(*lmclass)); + lmclass->tag_wid = tag_wid; + /* wid_base is the wid (minus class tag) of the first word in the list. 
*/ + lmclass->start_wid = start_wid; + lmclass->n_words = glist_count(classwords); + lmclass->prob1 = ckd_calloc(lmclass->n_words, sizeof(*lmclass->prob1)); + lmclass->nword_hash = NULL; + lmclass->n_hash = 0; + tprob = 0.0; + for (gn = classwords; gn; gn = gnode_next(gn)) { + tprob += gnode_float32(gn); + } + if (tprob > 1.1 || tprob < 0.9) { + E_INFO("Total class probability is %f, will normalize\n", tprob); + for (gn = classwords; gn; gn = gnode_next(gn)) { + gn->data.fl /= tprob; + } + } + for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) { + lmclass->prob1[i] = logmath_log(model->lmath, gnode_float32(gn)); + } + + return lmclass; +} + +int32 +ngram_class_add_word(ngram_class_t *lmclass, int32 wid, int32 lweight) +{ + int32 hash; + + if (lmclass->nword_hash == NULL) { + /* Initialize everything in it to -1 */ + lmclass->nword_hash = ckd_malloc(NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash)); + memset(lmclass->nword_hash, 0xff, NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash)); + lmclass->n_hash = NGRAM_HASH_SIZE; + lmclass->n_hash_inuse = 0; + } + /* Stupidest possible hash function. This will work pretty well + * when this function is called repeatedly with contiguous word + * IDs, though... */ + hash = wid & (lmclass->n_hash - 1); + if (lmclass->nword_hash[hash].wid == -1) { + /* Good, no collision. */ + lmclass->nword_hash[hash].wid = wid; + lmclass->nword_hash[hash].prob1 = lweight; + ++lmclass->n_hash_inuse; + return hash; + } + else { + int32 next; /**< Next available bucket. */ + /* Collision... Find the end of the hash chain. */ + while (lmclass->nword_hash[hash].next != -1) + hash = lmclass->nword_hash[hash].next; + assert(hash != -1); + /* Does we has any more bukkit? */ + if (lmclass->n_hash_inuse == lmclass->n_hash) { + /* Oh noes! Ok, we makes more. 
*/ + lmclass->nword_hash = ckd_realloc(lmclass->nword_hash, + lmclass->n_hash * 2 * sizeof(*lmclass->nword_hash)); + memset(lmclass->nword_hash + lmclass->n_hash, + 0xff, lmclass->n_hash * sizeof(*lmclass->nword_hash)); + /* Just use the next allocated one (easy) */ + next = lmclass->n_hash; + lmclass->n_hash *= 2; + } + else { + /* Look for any available bucket. We hope this doesn't happen. */ + for (next = 0; next < lmclass->n_hash; ++next) + if (lmclass->nword_hash[next].wid == -1) + break; + /* This should absolutely not happen. */ + assert(next != lmclass->n_hash); + } + lmclass->nword_hash[next].wid = wid; + lmclass->nword_hash[next].prob1 = lweight; + lmclass->nword_hash[hash].next = next; + ++lmclass->n_hash_inuse; + return next; + } +} + +void +ngram_class_free(ngram_class_t *lmclass) +{ + ckd_free(lmclass->nword_hash); + ckd_free(lmclass->prob1); + ckd_free(lmclass); +} + +int32 +ngram_model_add_class_word(ngram_model_t *model, + const char *classname, + const char *word, + float32 weight) +{ + ngram_class_t *lmclass; + int32 classid, tag_wid, wid, i, scale; + float32 fprob; + + /* Find the class corresponding to classname. Linear search + * probably okay here since there won't be very many classes, and + * this doesn't have to be fast. */ + tag_wid = ngram_wid(model, classname); + if (tag_wid == NGRAM_INVALID_WID) { + E_ERROR("No such word or class tag: %s\n", classname); + return tag_wid; + } + for (classid = 0; classid < model->n_classes; ++classid) { + if (model->classes[classid]->tag_wid == tag_wid) + break; + } + /* Hmm, no such class. It's probably not a good idea to create one. */ + if (classid == model->n_classes) { + E_ERROR("Word %s is not a class tag (call ngram_model_add_class() first)\n", classname); + return NGRAM_INVALID_WID; + } + lmclass = model->classes[classid]; + + /* Add this word to the model's set of words. 
*/ + wid = ngram_add_word_internal(model, word, classid); + if (wid == NGRAM_INVALID_WID) + return wid; + + /* This is the fixed probability of the new word. */ + fprob = weight * 1.0f / (lmclass->n_words + lmclass->n_hash_inuse + 1); + /* Now normalize everything else to fit it in. This is + * accomplished by simply scaling all the other probabilities + * by (1-fprob). */ + scale = logmath_log(model->lmath, 1.0 - fprob); + for (i = 0; i < lmclass->n_words; ++i) + lmclass->prob1[i] += scale; + for (i = 0; i < lmclass->n_hash; ++i) + if (lmclass->nword_hash[i].wid != -1) + lmclass->nword_hash[i].prob1 += scale; + + /* Now add it to the class hash table. */ + return ngram_class_add_word(lmclass, wid, logmath_log(model->lmath, fprob)); +} + +int32 +ngram_model_add_class(ngram_model_t *model, + const char *classname, + float32 classweight, + char **words, + const float32 *weights, + int32 n_words) +{ + ngram_class_t *lmclass; + glist_t classwords = NULL; + int32 i, start_wid = -1; + int32 classid, tag_wid; + + /* Check if classname already exists in model. 
If not, add it.*/ + if ((tag_wid = ngram_wid(model, classname)) == ngram_unknown_wid(model)) { + tag_wid = ngram_model_add_word(model, classname, classweight); + if (tag_wid == NGRAM_INVALID_WID) + return -1; + } + + if (model->n_classes == 128) { + E_ERROR("Number of classes cannot exceed 128 (sorry)\n"); + return -1; + } + classid = model->n_classes; + for (i = 0; i < n_words; ++i) { + int32 wid; + + wid = ngram_add_word_internal(model, words[i], classid); + if (wid == NGRAM_INVALID_WID) + return -1; + if (start_wid == -1) + start_wid = NGRAM_BASEWID(wid); + classwords = glist_add_float32(classwords, weights[i]); + } + classwords = glist_reverse(classwords); + lmclass = ngram_class_new(model, tag_wid, start_wid, classwords); + glist_free(classwords); + if (lmclass == NULL) + return -1; + + ++model->n_classes; + if (model->classes == NULL) + model->classes = ckd_calloc(1, sizeof(*model->classes)); + else + model->classes = ckd_realloc(model->classes, + model->n_classes * sizeof(*model->classes)); + model->classes[classid] = lmclass; + return classid; +} + +int32 +ngram_class_prob(ngram_class_t *lmclass, int32 wid) +{ + int32 base_wid = NGRAM_BASEWID(wid); + + if (base_wid < lmclass->start_wid + || base_wid > lmclass->start_wid + lmclass->n_words) { + int32 hash; + + /* Look it up in the hash table. */ + hash = wid & (lmclass->n_hash - 1); + while (hash != -1 && lmclass->nword_hash[hash].wid != wid) + hash = lmclass->nword_hash[hash].next; + if (hash == -1) + return 1; + return lmclass->nword_hash[hash].prob1; + } + else { + return lmclass->prob1[base_wid - lmclass->start_wid]; + } +} + +int32 +read_classdef_file(hash_table_t *classes, const char *file_name) +{ + FILE *fp; + int32 is_pipe; + int inclass; /**< Are we currently reading a list of class words? 
*/ + int32 rv = -1; + gnode_t *gn; + glist_t classwords = NULL; + glist_t classprobs = NULL; + char *classname = NULL; + + if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) { + E_ERROR("File %s not found\n", file_name); + return -1; + } + + inclass = FALSE; + while (!feof(fp)) { + char line[512]; + char *wptr[2]; + int n_words; + + if (fgets(line, sizeof(line), fp) == NULL) + break; + + n_words = str2words(line, wptr, 2); + if (n_words <= 0) + continue; + + if (inclass) { + /* Look for an end of class marker. */ + if (n_words == 2 && 0 == strcmp(wptr[0], "END")) { + classdef_t *classdef; + gnode_t *word, *weight; + int32 i; + + if (classname == NULL || 0 != strcmp(wptr[1], classname)) + goto error_out; + inclass = FALSE; + + /* Construct a class from the list of words collected. */ + classdef = ckd_calloc(1, sizeof(*classdef)); + classwords = glist_reverse(classwords); + classprobs = glist_reverse(classprobs); + classdef->n_words = glist_count(classwords); + classdef->words = ckd_calloc(classdef->n_words, + sizeof(*classdef->words)); + classdef->weights = ckd_calloc(classdef->n_words, + sizeof(*classdef->weights)); + word = classwords; + weight = classprobs; + for (i = 0; i < classdef->n_words; ++i) { + classdef->words[i] = gnode_ptr(word); + classdef->weights[i] = gnode_float32(weight); + word = gnode_next(word); + weight = gnode_next(weight); + } + + /* Add this class to the hash table. */ + if (hash_table_enter(classes, classname, classdef) != classdef) { + classdef_free(classdef); + goto error_out; + } + + /* Reset everything. */ + glist_free(classwords); + glist_free(classprobs); + classwords = NULL; + classprobs = NULL; + classname = NULL; + } + else { + float32 fprob; + + if (n_words == 2) + fprob = (float32)atof_c(wptr[1]); + else + fprob = 1.0f; + /* Add it to the list of words for this class. 
*/ + classwords = glist_add_ptr(classwords, ckd_salloc(wptr[0])); + classprobs = glist_add_float32(classprobs, fprob); + } + } + else { + /* Start a new LM class if the LMCLASS marker is seen */ + if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) { + if (inclass) + goto error_out; + inclass = TRUE; + classname = ckd_salloc(wptr[1]); + } + /* Otherwise, just ignore whatever junk we got */ + } + } + rv = 0; /* Success. */ + +error_out: + /* Free all the stuff we might have allocated. */ + fclose_comp(fp, is_pipe); + for (gn = classwords; gn; gn = gnode_next(gn)) + ckd_free(gnode_ptr(gn)); + glist_free(classwords); + glist_free(classprobs); + ckd_free(classname); + + return rv; +} + +void +classdef_free(classdef_t *classdef) +{ + int32 i; + for (i = 0; i < classdef->n_words; ++i) + ckd_free(classdef->words[i]); + ckd_free(classdef->words); + ckd_free(classdef->weights); + ckd_free(classdef); +} + + +int32 +ngram_model_read_classdef(ngram_model_t *model, + const char *file_name) +{ + hash_table_t *classes; + glist_t hl = NULL; + gnode_t *gn; + int32 rv = -1; + + classes = hash_table_new(0, FALSE); + if (read_classdef_file(classes, file_name) < 0) { + hash_table_free(classes); + return -1; + } + + /* Create a new class in the language model for each classdef. 
*/ + hl = hash_table_tolist(classes, NULL); + for (gn = hl; gn; gn = gnode_next(gn)) { + hash_entry_t *he = gnode_ptr(gn); + classdef_t *classdef = he->val; + + if (ngram_model_add_class(model, he->key, 1.0, + classdef->words, + classdef->weights, + classdef->n_words) < 0) + goto error_out; + } + rv = 0; + +error_out: + for (gn = hl; gn; gn = gnode_next(gn)) { + hash_entry_t *he = gnode_ptr(gn); + ckd_free((char *)he->key); + classdef_free(he->val); + } + glist_free(hl); + hash_table_free(classes); + return rv; +} diff --git a/media/sphinxbase/src/libsphinxbase/lm/ngram_model_arpa.c b/media/sphinxbase/src/libsphinxbase/lm/ngram_model_arpa.c new file mode 100644 index 000000000..a4b72cb00 --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/lm/ngram_model_arpa.c @@ -0,0 +1,660 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2007 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ +/* + * \file ngram_model_arpa.c ARPA format language models + * + * Author: David Huggins-Daines <dhuggins@cs.cmu.edu> + */ + +#include "sphinxbase/ckd_alloc.h" +#include <string.h> +#include <limits.h> +#include <assert.h> + +#include "sphinxbase/err.h" +#include "sphinxbase/pio.h" +#include "sphinxbase/listelem_alloc.h" +#include "sphinxbase/strfuncs.h" + +#include "ngram_model_arpa.h" + +static ngram_funcs_t ngram_model_arpa_funcs; + +#define TSEG_BASE(m,b) ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ]) +#define FIRST_BG(m,u) ((m)->lm3g.unigrams[u].bigrams) +#define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams)) + +/* + * Read and return #unigrams, #bigrams, #trigrams as stated in input file. 
+ */
/*
 * Scans forward to the '\data\' marker, collects the "ngram N=count"
 * lines that follow it, and leaves the iterator on the '\1-grams:'
 * header.  Returns 0 on success and -1 on any format error or EOF.
 */
static int
ReadNgramCounts(lineiter_t **li, int32 * n_ug, int32 * n_bg, int32 * n_tg)
{
    int32 order, count;

    /* Skip everything in front of the '\data\' marker.  The loop only
     * ends early on a match, so a NULL iterator means "not found". */
    for (; *li != NULL; *li = lineiter_next(*li)) {
        string_trim((*li)->buf, STRING_BOTH);
        if (0 == strcmp((*li)->buf, "\\data\\"))
            break;
    }
    if (*li == NULL) {
        E_INFO("No \\data\\ mark in LM file\n");
        return -1;
    }

    *n_ug = 0;
    *n_bg = 0;
    *n_tg = 0;

    /* Consume "ngram N=count" lines until one fails to parse. */
    for (;;) {
        int32 *slot;

        *li = lineiter_next(*li);
        if (*li == NULL)
            break;
        if (sscanf((*li)->buf, "ngram %d=%d", &order, &count) != 2)
            break;

        if (order == 1)
            slot = n_ug;
        else if (order == 2)
            slot = n_bg;
        else if (order == 3)
            slot = n_tg;
        else {
            E_ERROR("Unknown ngram (%d)\n", order);
            return -1;
        }
        *slot = count;
    }
    if (*li == NULL) {
        E_ERROR("EOF while reading ngram counts\n");
        return -1;
    }

    /* Position iterator to the unigrams header '\1-grams:\' */
    for (*li = lineiter_next(*li); *li != NULL; *li = lineiter_next(*li)) {
        string_trim((*li)->buf, STRING_BOTH);
        if (0 == strcmp((*li)->buf, "\\1-grams:"))
            break;
    }
    if (*li == NULL) {
        E_ERROR_SYSTEM("Failed to read \\1-grams: mark");
        return -1;
    }

    /* At least one unigram is required; higher orders may be absent. */
    if (*n_ug <= 0 || *n_bg < 0 || *n_tg < 0) {
        E_ERROR("Bad or missing ngram count\n");
        return -1;
    }
    return 0;
}

/*
 * Read in the unigrams from given file into the LM structure model.
 * On entry to this procedure, the iterator is positioned to the
 * header line '\1-grams:'.
+ */
/*
 * Each unigram line has the form "log10prob word [log10backoff]".
 * Reading stops at the '\2-grams:' or '\end\' marker (or EOF).  Words
 * receive consecutive IDs in file order.  Returns 0 on success, -1 if
 * the file holds more unigrams than the header announced.
 */
static int
ReadUnigrams(lineiter_t **li, ngram_model_arpa_t * model)
{
    ngram_model_t *base = &model->base;
    int32 n_read = 0;

    E_INFO("Reading unigrams\n");

    for (*li = lineiter_next(*li); *li != NULL; *li = lineiter_next(*li)) {
        char *fields[3];
        char *text = (*li)->buf;
        float logprob;
        float32 backoff = 0.0f;
        int n_fields;
        void *id_as_ptr;

        string_trim(text, STRING_BOTH);
        if (0 == strcmp(text, "\\2-grams:") || 0 == strcmp(text, "\\end\\"))
            break;

        n_fields = str2words(text, fields, 3);
        if (n_fields < 2) {
            /* Blank lines are skipped silently, anything else is noisy. */
            if (text[0] != '\0')
                E_WARN("Format error; unigram ignored: %s\n", text);
            continue;
        }
        logprob = (float)atof_c(fields[0]);
        if (n_fields == 3)
            backoff = (float)atof_c(fields[2]);

        if (n_read >= base->n_counts[0]) {
            E_ERROR("Too many unigrams\n");
            return -1;
        }

        /* Associate name with word id */
        base->word_str[n_read] = ckd_salloc(fields[1]);
        id_as_ptr = (void *)(long)n_read;
        if (hash_table_enter(base->wid, base->word_str[n_read], id_as_ptr)
            != id_as_ptr) {
            E_WARN("Duplicate word in dictionary: %s\n", base->word_str[n_read]);
        }
        model->lm3g.unigrams[n_read].prob1.l =
            logmath_log10_to_log(base->lmath, logprob);
        model->lm3g.unigrams[n_read].bo_wt1.l =
            logmath_log10_to_log(base->lmath, backoff);
        ++n_read;
    }

    /* Trust what was actually read over what the header claimed. */
    if (n_read != base->n_counts[0]) {
        E_WARN("lm_t.ucount(%d) != #unigrams read(%d)\n",
               base->n_counts[0], n_read);
        base->n_counts[0] = n_read;
        base->n_words = n_read;
    }
    return 0;
}

/*
 * Read bigrams from given file into given model structure.
+ */
/*
 * Each bigram line has the form "log10prob word1 word2 [log10backoff]".
 * Bigrams must be grouped by word1 in unigram-ID order; while reading,
 * every unigram's `bigrams` field is set to the index of its first
 * bigram.  Reading stops at the first non-blank line that is not a
 * bigram, which must be '\3-grams:' or '\end\'.
 * Returns 0 on success, -1 on error.
 */
static int
ReadBigrams(lineiter_t **li, ngram_model_arpa_t * model)
{
    ngram_model_t *base = &model->base;
    int32 w1, w2, prev_w1, bgcount;
    bigram_t *bgptr;

    E_INFO("Reading bigrams\n");

    bgcount = 0;
    bgptr = model->lm3g.bigrams;
    prev_w1 = -1;

    while ((*li = lineiter_next(*li))) {
        float32 p, bo_wt = 0.0f;
        int32 p2, bo_wt2;
        char *wptr[4], *word1, *word2;
        int n;

        string_trim((*li)->buf, STRING_BOTH);
        /* Pre-clear the optional field so its presence can be tested. */
        wptr[3] = NULL;
        if ((n = str2words((*li)->buf, wptr, 4)) < 3) {
            if ((*li)->buf[0] != '\0')
                break;          /* Non-blank, not a bigram: section ends. */
            continue;
        }
        else {
            p = (float32)atof_c(wptr[0]);
            word1 = wptr[1];
            word2 = wptr[2];
            if (wptr[3])
                bo_wt = (float32)atof_c(wptr[3]);
        }

        if ((w1 = ngram_wid(base, word1)) == NGRAM_INVALID_WID) {
            E_ERROR("Unknown word: %s, skipping bigram (%s %s)\n",
                    word1, word1, word2);
            continue;
        }
        if ((w2 = ngram_wid(base, word2)) == NGRAM_INVALID_WID) {
            E_ERROR("Unknown word: %s, skipping bigram (%s %s)\n",
                    word2, word1, word2);
            continue;
        }

        /* FIXME: Should use logmath_t quantization here. */
        /* HACK!! to quantize probs to 4 decimal digits */
        p = (float32)((int32)(p * 10000)) / 10000;
        bo_wt = (float32)((int32)(bo_wt * 10000)) / 10000;

        p2 = logmath_log10_to_log(base->lmath, p);
        bo_wt2 = logmath_log10_to_log(base->lmath, bo_wt);

        if (bgcount >= base->n_counts[1]) {
            E_ERROR("Too many bigrams\n");
            return -1;
        }

        bgptr->wid = w2;
        bgptr->prob2 = sorted_id(&model->sorted_prob2, &p2);
        /* Backoff weights are only stored when trigrams exist. */
        if (base->n_counts[2] > 0)
            bgptr->bo_wt2 = sorted_id(&model->sorted_bo_wt2, &bo_wt2);

        if (w1 != prev_w1) {
            if (w1 < prev_w1) {
                E_ERROR("Bigram %s %s not in unigram order word id: %d prev word id: %d\n", word1, word2, w1, prev_w1);
                return -1;
            }

            /* Every unigram skipped since the previous w1 has no
             * bigrams: point it (and w1 itself) at the current slot. */
            for (prev_w1++; prev_w1 <= w1; prev_w1++)
                model->lm3g.unigrams[prev_w1].bigrams = bgcount;
            prev_w1 = w1;
        }
        bgcount++;
        bgptr++;

        if ((bgcount & 0x0000ffff) == 0) {
            E_INFOCONT(".");
        }
    }

    /* The loop also ends when the file runs out, so there may be no
     * current line to inspect or to quote in a message. */
    if (*li == NULL) {
        E_ERROR("Unexpected end of file while reading bigrams\n");
        return -1;
    }
    if ((strcmp((*li)->buf, "\\end\\") != 0)
        && (strcmp((*li)->buf, "\\3-grams:") != 0)) {
        E_ERROR("Bad bigram: %s\n", (*li)->buf);
        return -1;
    }

    /* Fill in the remaining unigrams, including the sentinel entry. */
    for (prev_w1++; prev_w1 <= base->n_counts[0]; prev_w1++)
        model->lm3g.unigrams[prev_w1].bigrams = bgcount;

    return 0;
}

/*
 * Very similar to ReadBigrams.
+ */ +static int +ReadTrigrams(lineiter_t **li, ngram_model_arpa_t * model) +{ + ngram_model_t *base = &model->base; + int32 i, w1, w2, w3, prev_w1, prev_w2, tgcount, prev_bg, bg, endbg; + int32 seg, prev_seg, prev_seg_lastbg; + trigram_t *tgptr; + bigram_t *bgptr; + + E_INFO("Reading trigrams\n"); + + tgcount = 0; + tgptr = model->lm3g.trigrams; + prev_w1 = -1; + prev_w2 = -1; + prev_bg = -1; + prev_seg = -1; + + while ((*li = lineiter_next(*li))) { + float32 p; + int32 p3; + char *wptr[4], *word1, *word2, *word3; + + string_trim((*li)->buf, STRING_BOTH); + if (str2words((*li)->buf, wptr, 4) != 4) { + if ((*li)->buf[0] != '\0') + break; + continue; + } + else { + p = (float32)atof_c(wptr[0]); + word1 = wptr[1]; + word2 = wptr[2]; + word3 = wptr[3]; + } + + if ((w1 = ngram_wid(base, word1)) == NGRAM_INVALID_WID) { + E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n", + word1, word1, word2, word3); + continue; + } + if ((w2 = ngram_wid(base, word2)) == NGRAM_INVALID_WID) { + E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n", + word2, word1, word2, word3); + continue; + } + if ((w3 = ngram_wid(base, word3)) == NGRAM_INVALID_WID) { + E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n", + word3, word1, word2, word3); + continue; + } + + /* FIXME: Should use logmath_t quantization here. */ + /* HACK!! to quantize probs to 4 decimal digits */ + p = (float32)((int32)(p * 10000)) / 10000; + p3 = logmath_log10_to_log(base->lmath, p); + + if (tgcount >= base->n_counts[2]) { + E_ERROR("Too many trigrams\n"); + return -1; + } + + tgptr->wid = w3; + tgptr->prob3 = sorted_id(&model->sorted_prob3, &p3); + + if ((w1 != prev_w1) || (w2 != prev_w2)) { + /* Trigram for a new bigram; update tg info for all previous bigrams */ + if ((w1 < prev_w1) || ((w1 == prev_w1) && (w2 < prev_w2))) { + E_ERROR("Trigrams not in bigram order\n"); + return -1; + } + + bg = (w1 != + prev_w1) ? 
model->lm3g.unigrams[w1].bigrams : prev_bg + 1; + endbg = model->lm3g.unigrams[w1 + 1].bigrams; + bgptr = model->lm3g.bigrams + bg; + for (; (bg < endbg) && (bgptr->wid != w2); bg++, bgptr++); + if (bg >= endbg) { + E_ERROR("Missing bigram for trigram: %s", (*li)->buf); + return -1; + } + + /* bg = bigram entry index for <w1,w2>. Update tseg_base */ + seg = bg >> LOG_BG_SEG_SZ; + for (i = prev_seg + 1; i <= seg; i++) + model->lm3g.tseg_base[i] = tgcount; + + /* Update trigrams pointers for all bigrams until bg */ + if (prev_seg < seg) { + int32 tgoff = 0; + + if (prev_seg >= 0) { + tgoff = tgcount - model->lm3g.tseg_base[prev_seg]; + if (tgoff > 65535) { + E_ERROR("Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n"); + return -1; + } + } + + prev_seg_lastbg = ((prev_seg + 1) << LOG_BG_SEG_SZ) - 1; + bgptr = model->lm3g.bigrams + prev_bg; + for (++prev_bg, ++bgptr; prev_bg <= prev_seg_lastbg; + prev_bg++, bgptr++) + bgptr->trigrams = tgoff; + + for (; prev_bg <= bg; prev_bg++, bgptr++) + bgptr->trigrams = 0; + } + else { + int32 tgoff; + + tgoff = tgcount - model->lm3g.tseg_base[prev_seg]; + if (tgoff > 65535) { + E_ERROR("Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n"); + return -1; + } + + bgptr = model->lm3g.bigrams + prev_bg; + for (++prev_bg, ++bgptr; prev_bg <= bg; prev_bg++, bgptr++) + bgptr->trigrams = tgoff; + } + + prev_w1 = w1; + prev_w2 = w2; + prev_bg = bg; + prev_seg = seg; + } + + tgcount++; + tgptr++; + + if ((tgcount & 0x0000ffff) == 0) { + E_INFOCONT("."); + } + } + if (*li == NULL || strcmp((*li)->buf, "\\end\\") != 0) { + E_ERROR("Bad trigram: %s\n", (*li)->buf); + return -1; + } + + for (prev_bg++; prev_bg <= base->n_counts[1]; prev_bg++) { + if ((prev_bg & (BG_SEG_SZ - 1)) == 0) + model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ] = tgcount; + if ((tgcount - model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ]) > 
65535) { + E_ERROR("Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n"); + return -1; + } + model->lm3g.bigrams[prev_bg].trigrams = + tgcount - model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ]; + } + return 0; +} + +static unigram_t * +new_unigram_table(int32 n_ug) +{ + unigram_t *table; + int32 i; + + table = ckd_calloc(n_ug, sizeof(unigram_t)); + for (i = 0; i < n_ug; i++) { + table[i].prob1.l = INT_MIN; + table[i].bo_wt1.l = INT_MIN; + } + return table; +} + +ngram_model_t * +ngram_model_arpa_read(cmd_ln_t *config, + const char *file_name, + logmath_t *lmath) +{ + lineiter_t *li; + FILE *fp; + int32 is_pipe; + int32 n_unigram; + int32 n_bigram; + int32 n_trigram; + int32 n; + ngram_model_arpa_t *model; + ngram_model_t *base; + + if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) { + E_ERROR("File %s not found\n", file_name); + return NULL; + } + li = lineiter_start(fp); + + /* Read #unigrams, #bigrams, #trigrams from file */ + if (ReadNgramCounts(&li, &n_unigram, &n_bigram, &n_trigram) == -1) { + lineiter_free(li); + fclose_comp(fp, is_pipe); + return NULL; + } + E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram); + + /* Allocate space for LM, including initial OOVs and placeholders; initialize it */ + model = ckd_calloc(1, sizeof(*model)); + base = &model->base; + if (n_trigram > 0) + n = 3; + else if (n_bigram > 0) + n = 2; + else + n = 1; + /* Initialize base model. */ + ngram_model_init(base, &ngram_model_arpa_funcs, lmath, n, n_unigram); + base->n_counts[0] = n_unigram; + base->n_counts[1] = n_bigram; + base->n_counts[2] = n_trigram; + base->writable = TRUE; + + /* + * Allocate one extra unigram and bigram entry: sentinels to terminate + * followers (bigrams and trigrams, respectively) of previous entry. 
+ */ + model->lm3g.unigrams = new_unigram_table(n_unigram + 1); + model->lm3g.bigrams = + ckd_calloc(n_bigram + 1, sizeof(bigram_t)); + if (n_trigram > 0) + model->lm3g.trigrams = + ckd_calloc(n_trigram, sizeof(trigram_t)); + + if (n_trigram > 0) { + model->lm3g.tseg_base = + ckd_calloc((n_bigram + 1) / BG_SEG_SZ + 1, + sizeof(int32)); + } + if (ReadUnigrams(&li, model) == -1) { + fclose_comp(fp, is_pipe); + ngram_model_free(base); + return NULL; + } + E_INFO("%8d = #unigrams created\n", base->n_counts[0]); + + if (base->n_counts[2] > 0) + init_sorted_list(&model->sorted_bo_wt2); + + if (base->n_counts[1] > 0) { + init_sorted_list(&model->sorted_prob2); + + if (ReadBigrams(&li, model) == -1) { + fclose_comp(fp, is_pipe); + ngram_model_free(base); + return NULL; + } + + base->n_counts[1] = FIRST_BG(model, base->n_counts[0]); + model->lm3g.n_prob2 = model->sorted_prob2.free; + model->lm3g.prob2 = vals_in_sorted_list(&model->sorted_prob2); + free_sorted_list(&model->sorted_prob2); + E_INFO("%8d = #bigrams created\n", base->n_counts[1]); + E_INFO("%8d = #prob2 entries\n", model->lm3g.n_prob2); + } + + if (base->n_counts[2] > 0) { + /* Create trigram bo-wts array */ + model->lm3g.n_bo_wt2 = model->sorted_bo_wt2.free; + model->lm3g.bo_wt2 = vals_in_sorted_list(&model->sorted_bo_wt2); + free_sorted_list(&model->sorted_bo_wt2); + E_INFO("%8d = #bo_wt2 entries\n", model->lm3g.n_bo_wt2); + + init_sorted_list(&model->sorted_prob3); + + if (ReadTrigrams(&li, model) == -1) { + fclose_comp(fp, is_pipe); + ngram_model_free(base); + return NULL; + } + + base->n_counts[2] = FIRST_TG(model, base->n_counts[1]); + model->lm3g.n_prob3 = model->sorted_prob3.free; + model->lm3g.prob3 = vals_in_sorted_list(&model->sorted_prob3); + E_INFO("%8d = #trigrams created\n", base->n_counts[2]); + E_INFO("%8d = #prob3 entries\n", model->lm3g.n_prob3); + + free_sorted_list(&model->sorted_prob3); + + /* Initialize tginfo */ + model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *)); + 
model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t)); + } + + lineiter_free(li); + fclose_comp(fp, is_pipe); + return base; +} + +int +ngram_model_arpa_write(ngram_model_t *model, + const char *file_name) +{ + ngram_iter_t *itor; + FILE *fh; + int i; + + if ((fh = fopen(file_name, "w")) == NULL) { + E_ERROR_SYSTEM("Failed to open %s for writing", file_name); + return -1; + } + fprintf(fh, "This is an ARPA-format language model file, generated by CMU Sphinx\n"); + + /* The ARPA format doesn't require any extra information that + * N-Gram iterators can't give us, so this is very + * straightforward compared with DMP writing. */ + + /* Write N-gram counts. */ + fprintf(fh, "\\data\\\n"); + for (i = 0; i < model->n; ++i) { + fprintf(fh, "ngram %d=%d\n", i+1, model->n_counts[i]); + } + + /* Write N-grams */ + for (i = 0; i < model->n; ++i) { + fprintf(fh, "\n\\%d-grams:\n", i + 1); + for (itor = ngram_model_mgrams(model, i); itor; itor = ngram_iter_next(itor)) { + int32 const *wids; + int32 score, bowt; + int j; + + wids = ngram_iter_get(itor, &score, &bowt); + fprintf(fh, "%.4f ", logmath_log_to_log10(model->lmath, score)); + for (j = 0; j <= i; ++j) { + assert(wids[j] < model->n_counts[0]); + fprintf(fh, "%s ", model->word_str[wids[j]]); + } + if (i < model->n-1) + fprintf(fh, "%.4f", logmath_log_to_log10(model->lmath, bowt)); + fprintf(fh, "\n"); + } + } + fprintf(fh, "\n\\end\\\n"); + return fclose(fh); +} + +static int +ngram_model_arpa_apply_weights(ngram_model_t *base, float32 lw, + float32 wip, float32 uw) +{ + ngram_model_arpa_t *model = (ngram_model_arpa_t *)base; + lm3g_apply_weights(base, &model->lm3g, lw, wip, uw); + return 0; +} + +/* Lousy "templating" for things that are largely the same in DMP and + * ARPA models, except for the bigram and trigram types and some + * names. 
*/ +#define NGRAM_MODEL_TYPE ngram_model_arpa_t +#include "lm3g_templates.c" + +static void +ngram_model_arpa_free(ngram_model_t *base) +{ + ngram_model_arpa_t *model = (ngram_model_arpa_t *)base; + ckd_free(model->lm3g.unigrams); + ckd_free(model->lm3g.bigrams); + ckd_free(model->lm3g.trigrams); + ckd_free(model->lm3g.prob2); + ckd_free(model->lm3g.bo_wt2); + ckd_free(model->lm3g.prob3); + lm3g_tginfo_free(base, &model->lm3g); + ckd_free(model->lm3g.tseg_base); +} + +static ngram_funcs_t ngram_model_arpa_funcs = { + ngram_model_arpa_free, /* free */ + ngram_model_arpa_apply_weights, /* apply_weights */ + lm3g_template_score, /* score */ + lm3g_template_raw_score, /* raw_score */ + lm3g_template_add_ug, /* add_ug */ + lm3g_template_flush, /* flush */ + lm3g_template_iter, /* iter */ + lm3g_template_mgrams, /* mgrams */ + lm3g_template_successors, /* successors */ + lm3g_template_iter_get, /* iter_get */ + lm3g_template_iter_next, /* iter_next */ + lm3g_template_iter_free /* iter_free */ +}; diff --git a/media/sphinxbase/src/libsphinxbase/lm/ngram_model_arpa.h b/media/sphinxbase/src/libsphinxbase/lm/ngram_model_arpa.h new file mode 100644 index 000000000..2fd9e427d --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/lm/ngram_model_arpa.h @@ -0,0 +1,86 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2007 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ +/* + * \file ngram_model_arpa.h ARPABO text format for N-Gram models + * + * Author: David Huggins-Daines <dhuggins@cs.cmu.edu> + */ + +#ifndef __NGRAM_MODEL_ARPA_H__ +#define __NGRAM_MODEL_ARPA_H__ + +#include "ngram_model_internal.h" +#include "lm3g_model.h" + +/** + * Bigram structure. + */ +struct bigram_s { + uint32 wid; /**< Index of unigram entry for this. (NOT dictionary id.) */ + uint16 prob2; /**< Index into array of actual bigram probs */ + uint16 bo_wt2; /**< Index into array of actual bigram backoff wts */ + uint16 trigrams; /**< Index of 1st entry in lm_t.trigrams[], + RELATIVE TO its segment base (see above) */ +}; + +/** + * Trigram structure. 
+ * + * As with bigrams, trigram prob info kept in a separate table for conserving + * memory space. + */ +struct trigram_s { + uint32 wid; /**< Index of unigram entry for this. (NOT dictionary id.) */ + uint16 prob3; /**< Index into array of actual trigram probs */ +}; + + +/** + * Subclass of ngram_model for ARPA file reading. + */ +typedef struct ngram_model_arpa_s { + ngram_model_t base; /**< Base ngram_model_t structure */ + lm3g_model_t lm3g; /**< Shared lm3g structure */ + + /* Arrays of unique bigram probs and bo-wts, and trigram probs + * (these are temporary, actually) */ + sorted_list_t sorted_prob2; + sorted_list_t sorted_bo_wt2; + sorted_list_t sorted_prob3; +} ngram_model_arpa_t; + +#endif /* __NGRAM_MODEL_ARPA_H__ */ diff --git a/media/sphinxbase/src/libsphinxbase/lm/ngram_model_dmp.c b/media/sphinxbase/src/libsphinxbase/lm/ngram_model_dmp.c new file mode 100644 index 000000000..c6a2d8b85 --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/lm/ngram_model_dmp.c @@ -0,0 +1,969 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2007 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. 
+ * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ +/* + * \file ngram_model_dmp.c DMP format language models + * + * Author: David Huggins-Daines <dhuggins@cs.cmu.edu> + */ + +#include <assert.h> +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <limits.h> + +#include "sphinxbase/ckd_alloc.h" +#include "sphinxbase/pio.h" +#include "sphinxbase/err.h" +#include "sphinxbase/byteorder.h" +#include "sphinxbase/listelem_alloc.h" + +#include "ngram_model_dmp.h" + +static const char darpa_hdr[] = "Darpa Trigram LM"; +static ngram_funcs_t ngram_model_dmp_funcs; + +#define TSEG_BASE(m,b) ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ]) +#define FIRST_BG(m,u) ((m)->lm3g.unigrams[u].bigrams) +#define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams)) + +static unigram_t * +new_unigram_table(int32 n_ug) +{ + unigram_t *table; + int32 i; + + table = ckd_calloc(n_ug, sizeof(unigram_t)); + for (i = 0; i < n_ug; i++) { + table[i].prob1.f = -99.0; + table[i].bo_wt1.f = -99.0; + } + return table; +} + +ngram_model_t * +ngram_model_dmp_read(cmd_ln_t *config, + const char *file_name, + logmath_t *lmath) +{ + ngram_model_t *base; + 
ngram_model_dmp_t *model; + FILE *fp; + int do_mmap, do_swap; + int32 is_pipe; + int32 i, j, k, vn, n, ts; + int32 n_unigram; + int32 n_bigram; + int32 n_trigram; + char str[1024]; + unigram_t *ugptr; + bigram_t *bgptr; + trigram_t *tgptr; + char *tmp_word_str; + char *map_base = NULL; + size_t offset = 0; + + base = NULL; + do_mmap = FALSE; + if (config) + do_mmap = cmd_ln_boolean_r(config, "-mmap"); + + if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) { + E_ERROR("Dump file %s not found\n", file_name); + goto error_out; + } + + if (is_pipe && do_mmap) { + E_WARN("Dump file is compressed, will not use memory-mapped I/O\n"); + do_mmap = 0; + } + + do_swap = FALSE; + if (fread(&k, sizeof(k), 1, fp) != 1) + goto error_out; + if (k != strlen(darpa_hdr)+1) { + SWAP_INT32(&k); + if (k != strlen(darpa_hdr)+1) { + E_ERROR("Wrong magic header size number %x: %s is not a dump file\n", k, file_name); + goto error_out; + } + do_swap = 1; + } + if (fread(str, 1, k, fp) != (size_t) k) { + E_ERROR("Cannot read header\n"); + goto error_out; + } + if (strncmp(str, darpa_hdr, k) != 0) { + E_ERROR("Wrong header %s: %s is not a dump file\n", darpa_hdr); + goto error_out; + } + + if (do_mmap) { + if (do_swap) { + E_INFO + ("Byteswapping required, will not use memory-mapped I/O for LM file\n"); + do_mmap = 0; + } + else { + E_INFO("Will use memory-mapped I/O for LM file\n"); +#ifdef __ADSPBLACKFIN__ /* This is true for both VisualDSP++ and uClinux. 
*/ + E_FATAL("memory mapping is not supported at the moment."); +#else +#endif + } + } + + if (fread(&k, sizeof(k), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&k); + if (fread(str, 1, k, fp) != (size_t) k) { + E_ERROR("Cannot read LM filename in header\n"); + goto error_out; + } + + /* read version#, if present (must be <= 0) */ + if (fread(&vn, sizeof(vn), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&vn); + if (vn <= 0) { + /* read and don't compare timestamps (we don't care) */ + if (fread(&ts, sizeof(ts), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&ts); + + /* read and skip format description */ + for (;;) { + if (fread(&k, sizeof(k), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&k); + if (k == 0) + break; + if (fread(str, 1, k, fp) != (size_t) k) { + E_ERROR("Failed to read word\n"); + goto error_out; + } + } + /* read model->ucount */ + if (fread(&n_unigram, sizeof(n_unigram), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&n_unigram); + } + else { + n_unigram = vn; + } + + /* read model->bcount, tcount */ + if (fread(&n_bigram, sizeof(n_bigram), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&n_bigram); + if (fread(&n_trigram, sizeof(n_trigram), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&n_trigram); + E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram); + + /* Allocate space for LM, including initial OOVs and placeholders; initialize it */ + model = ckd_calloc(1, sizeof(*model)); + base = &model->base; + if (n_trigram > 0) + n = 3; + else if (n_bigram > 0) + n = 2; + else + n = 1; + ngram_model_init(base, &ngram_model_dmp_funcs, lmath, n, n_unigram); + base->n_counts[0] = n_unigram; + base->n_counts[1] = n_bigram; + base->n_counts[2] = n_trigram; + + /* read unigrams (always in memory, as they contain dictionary + * mappings that can't be precomputed, and also could have OOVs added) */ + model->lm3g.unigrams = new_unigram_table(n_unigram + 1); + ugptr = 
model->lm3g.unigrams; + for (i = 0; i <= n_unigram; ++i) { + /* Skip over the mapping ID, we don't care about it. */ + if (fread(ugptr, sizeof(int32), 1, fp) != 1) { + E_ERROR("Failed to read maping id %d\n", i); + goto error_out; + } + /* Read the actual unigram structure. */ + if (fread(ugptr, sizeof(unigram_t), 1, fp) != 1) { + E_ERROR("Failed to read unigrams data\n"); + ngram_model_free(base); + fclose_comp(fp, is_pipe); + return NULL; + } + /* Byte swap if necessary. */ + if (do_swap) { + SWAP_INT32(&ugptr->prob1.l); + SWAP_INT32(&ugptr->bo_wt1.l); + SWAP_INT32(&ugptr->bigrams); + } + /* Convert values to log. */ + ugptr->prob1.l = logmath_log10_to_log(lmath, ugptr->prob1.f); + ugptr->bo_wt1.l = logmath_log10_to_log(lmath, ugptr->bo_wt1.f); + E_DEBUG(2, ("ug %d: prob %d bo %d bigrams %d\n", + i, ugptr->prob1.l, ugptr->bo_wt1.l, ugptr->bigrams)); + ++ugptr; + } + E_INFO("%8d = LM.unigrams(+trailer) read\n", n_unigram); + + /* Now mmap() the file and read in the rest of the (read-only) stuff. */ + if (do_mmap) { + offset = ftell(fp); + + /* Check for improper word alignment. */ + if (offset & 0x3) { + E_WARN("-mmap specified, but trigram index is not word-aligned. 
Will not memory-map.\n"); + do_mmap = FALSE; + } + else { + model->dump_mmap = mmio_file_read(file_name); + if (model->dump_mmap == NULL) { + do_mmap = FALSE; + } + else { + map_base = mmio_file_ptr(model->dump_mmap); + } + } + } + + if (n_bigram > 0) { + /* read bigrams */ + if (do_mmap) { + model->lm3g.bigrams = (bigram_t *) (map_base + offset); + offset += (n_bigram + 1) * sizeof(bigram_t); + } + else { + model->lm3g.bigrams = + ckd_calloc(n_bigram + 1, sizeof(bigram_t)); + if (fread(model->lm3g.bigrams, sizeof(bigram_t), n_bigram + 1, fp) + != (size_t) n_bigram + 1) { + E_ERROR("Failed to read bigrams data\n"); + goto error_out; + } + if (do_swap) { + for (i = 0, bgptr = model->lm3g.bigrams; i <= n_bigram; + i++, bgptr++) { + SWAP_INT16(&bgptr->wid); + SWAP_INT16(&bgptr->prob2); + SWAP_INT16(&bgptr->bo_wt2); + SWAP_INT16(&bgptr->trigrams); + } + } + } + E_INFO("%8d = LM.bigrams(+trailer) read\n", n_bigram); + } + + /* read trigrams */ + if (n_trigram > 0) { + if (do_mmap) { + model->lm3g.trigrams = (trigram_t *) (map_base + offset); + offset += n_trigram * sizeof(trigram_t); + } + else { + model->lm3g.trigrams = + ckd_calloc(n_trigram, sizeof(trigram_t)); + if (fread + (model->lm3g.trigrams, sizeof(trigram_t), n_trigram, fp) + != (size_t) n_trigram) { + E_ERROR("Failed to read trigrams data\n"); + goto error_out; + } + if (do_swap) { + for (i = 0, tgptr = model->lm3g.trigrams; i < n_trigram; + i++, tgptr++) { + SWAP_INT16(&tgptr->wid); + SWAP_INT16(&tgptr->prob3); + } + } + } + E_INFO("%8d = LM.trigrams read\n", n_trigram); + /* Initialize tginfo */ + model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *)); + model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t)); + } + + if (n_bigram > 0) { + /* read n_prob2 and prob2 array (in memory) */ + if (do_mmap) + fseek(fp, offset, SEEK_SET); + if (fread(&k, sizeof(k), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&k); + model->lm3g.n_prob2 = k; + model->lm3g.prob2 = ckd_calloc(k, 
sizeof(*model->lm3g.prob2)); + if (fread(model->lm3g.prob2, sizeof(*model->lm3g.prob2), k, fp) != (size_t) k) { + E_ERROR("fread(prob2) failed\n"); + goto error_out; + } + for (i = 0; i < k; i++) { + if (do_swap) + SWAP_INT32(&model->lm3g.prob2[i].l); + /* Convert values to log. */ + model->lm3g.prob2[i].l = logmath_log10_to_log(lmath, model->lm3g.prob2[i].f); + } + E_INFO("%8d = LM.prob2 entries read\n", k); + } + + /* read n_bo_wt2 and bo_wt2 array (in memory) */ + if (base->n > 2) { + if (fread(&k, sizeof(k), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&k); + model->lm3g.n_bo_wt2 = k; + model->lm3g.bo_wt2 = ckd_calloc(k, sizeof(*model->lm3g.bo_wt2)); + if (fread(model->lm3g.bo_wt2, sizeof(*model->lm3g.bo_wt2), k, fp) != (size_t) k) { + E_ERROR("Failed to read backoff weights\n"); + goto error_out; + } + for (i = 0; i < k; i++) { + if (do_swap) + SWAP_INT32(&model->lm3g.bo_wt2[i].l); + /* Convert values to log. */ + model->lm3g.bo_wt2[i].l = logmath_log10_to_log(lmath, model->lm3g.bo_wt2[i].f); + } + E_INFO("%8d = LM.bo_wt2 entries read\n", k); + } + + /* read n_prob3 and prob3 array (in memory) */ + if (base->n > 2) { + if (fread(&k, sizeof(k), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&k); + model->lm3g.n_prob3 = k; + model->lm3g.prob3 = ckd_calloc(k, sizeof(*model->lm3g.prob3)); + if (fread(model->lm3g.prob3, sizeof(*model->lm3g.prob3), k, fp) != (size_t) k) { + E_ERROR("Failed to read trigram probability\n"); + goto error_out; + } + for (i = 0; i < k; i++) { + if (do_swap) + SWAP_INT32(&model->lm3g.prob3[i].l); + /* Convert values to log. 
*/ + model->lm3g.prob3[i].l = logmath_log10_to_log(lmath, model->lm3g.prob3[i].f); + } + E_INFO("%8d = LM.prob3 entries read\n", k); + } + + /* read tseg_base size and tseg_base */ + if (do_mmap) + offset = ftell(fp); + if (n_trigram > 0) { + if (do_mmap) { + memcpy(&k, map_base + offset, sizeof(k)); + offset += sizeof(int32); + model->lm3g.tseg_base = (int32 *) (map_base + offset); + offset += k * sizeof(int32); + } + else { + k = (n_bigram + 1) / BG_SEG_SZ + 1; + if (fread(&k, sizeof(k), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&k); + model->lm3g.tseg_base = ckd_calloc(k, sizeof(int32)); + if (fread(model->lm3g.tseg_base, sizeof(int32), k, fp) != + (size_t) k) { + E_ERROR("Failed to read trigram index\n"); + goto error_out; + } + if (do_swap) + for (i = 0; i < k; i++) + SWAP_INT32(&model->lm3g.tseg_base[i]); + } + E_INFO("%8d = LM.tseg_base entries read\n", k); + } + + /* read ascii word strings */ + if (do_mmap) { + memcpy(&k, map_base + offset, sizeof(k)); + offset += sizeof(int32); + tmp_word_str = (char *) (map_base + offset); + offset += k; + } + else { + base->writable = TRUE; + if (fread(&k, sizeof(k), 1, fp) != 1) + goto error_out; + if (do_swap) SWAP_INT32(&k); + tmp_word_str = ckd_calloc(k, 1); + if (fread(tmp_word_str, 1, k, fp) != (size_t) k) { + E_ERROR("Failed to read words\n"); + goto error_out; + } + } + + /* First make sure string just read contains n_counts[0] words (PARANOIA!!) 
*/ + for (i = 0, j = 0; i < k; i++) + if (tmp_word_str[i] == '\0') + j++; + if (j != n_unigram) { + E_ERROR("Error reading word strings (%d doesn't match n_unigrams %d)\n", + j, n_unigram); + goto error_out; + } + + /* Break up string just read into words */ + if (do_mmap) { + j = 0; + for (i = 0; i < n_unigram; i++) { + base->word_str[i] = tmp_word_str + j; + if (hash_table_enter(base->wid, base->word_str[i], + (void *)(long)i) != (void *)(long)i) { + E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]); + } + j += strlen(base->word_str[i]) + 1; + } + } + else { + j = 0; + for (i = 0; i < n_unigram; i++) { + base->word_str[i] = ckd_salloc(tmp_word_str + j); + if (hash_table_enter(base->wid, base->word_str[i], + (void *)(long)i) != (void *)(long)i) { + E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]); + } + j += strlen(base->word_str[i]) + 1; + } + free(tmp_word_str); + } + E_INFO("%8d = ascii word strings read\n", i); + + fclose_comp(fp, is_pipe); + return base; + +error_out: + if (fp) + fclose_comp(fp, is_pipe); + ngram_model_free(base); + return NULL; +} + +ngram_model_dmp_t * +ngram_model_dmp_build(ngram_model_t *base) +{ + ngram_model_dmp_t *model; + ngram_model_t *newbase; + ngram_iter_t *itor; + sorted_list_t sorted_prob2; + sorted_list_t sorted_bo_wt2; + sorted_list_t sorted_prob3; + bigram_t *bgptr; + trigram_t *tgptr; + int i, bgcount, tgcount, seg; + + if (base->funcs == &ngram_model_dmp_funcs) { + E_INFO("Using existing DMP model.\n"); + return (ngram_model_dmp_t *)ngram_model_retain(base); + } + + /* Initialize new base model structure with params from base. */ + E_INFO("Building DMP model...\n"); + model = ckd_calloc(1, sizeof(*model)); + newbase = &model->base; + ngram_model_init(newbase, &ngram_model_dmp_funcs, + logmath_retain(base->lmath), + base->n, base->n_counts[0]); + /* Copy N-gram counts over. */ + memcpy(newbase->n_counts, base->n_counts, + base->n * sizeof(*base->n_counts)); + /* Make sure word strings are freed. 
*/ + newbase->writable = TRUE; + /* Initialize unigram table and string table. */ + model->lm3g.unigrams = new_unigram_table(newbase->n_counts[0] + 1); + for (itor = ngram_model_mgrams(base, 0); itor; + itor = ngram_iter_next(itor)) { + int32 prob1, bo_wt1; + int32 const *wids; + + /* Can't guarantee they will go in unigram order, so just to + * be correct, we do this... */ + wids = ngram_iter_get(itor, &prob1, &bo_wt1); + model->lm3g.unigrams[wids[0]].prob1.l = prob1; + model->lm3g.unigrams[wids[0]].bo_wt1.l = bo_wt1; + newbase->word_str[wids[0]] = ckd_salloc(ngram_word(base, wids[0])); + if ((hash_table_enter_int32(newbase->wid, + newbase->word_str[wids[0]], wids[0])) + != wids[0]) { + E_WARN("Duplicate word in dictionary: %s\n", newbase->word_str[wids[0]]); + } + } + E_INFO("%8d = #unigrams created\n", newbase->n_counts[0]); + + if (newbase->n < 2) + return model; + + /* Construct quantized probability table for bigrams and + * (optionally) trigrams. Hesitate to use the "sorted list" thing + * since it isn't so useful, but it's there already. */ + init_sorted_list(&sorted_prob2); + if (newbase->n > 2) { + init_sorted_list(&sorted_bo_wt2); + init_sorted_list(&sorted_prob3); + } + /* Construct bigram and trigram arrays. */ + bgptr = model->lm3g.bigrams = ckd_calloc(newbase->n_counts[1] + 1, sizeof(bigram_t)); + if (newbase->n > 2) { + tgptr = model->lm3g.trigrams = ckd_calloc(newbase->n_counts[2], sizeof(trigram_t)); + model->lm3g.tseg_base = + ckd_calloc((newbase->n_counts[1] + 1) / BG_SEG_SZ + 1, sizeof(int32)); + } + else + tgptr = NULL; + /* Since bigrams and trigrams have to be contiguous with others + * with the same N-1-gram, we traverse them in depth-first order + * to build the bigram and trigram arrays. */ + for (i = 0; i < newbase->n_counts[0]; ++i) { + ngram_iter_t *uitor; + bgcount = bgptr - model->lm3g.bigrams; + /* First bigram index (same as next if no bigrams...) 
*/ + model->lm3g.unigrams[i].bigrams = bgcount; + E_DEBUG(2, ("unigram %d: %s => bigram %d\n", i, newbase->word_str[i], bgcount)); + /* All bigrams corresponding to unigram i */ + uitor = ngram_ng_iter(base, i, NULL, 0); + for (itor = ngram_iter_successors(uitor); + itor; ++bgptr, itor = ngram_iter_next(itor)) { + int32 prob2, bo_wt2; + int32 const *wids; + ngram_iter_t *titor; + + wids = ngram_iter_get(itor, &prob2, &bo_wt2); + + assert (bgptr - model->lm3g.bigrams < newbase->n_counts[1]); + + bgptr->wid = wids[1]; + bgptr->prob2 = sorted_id(&sorted_prob2, &prob2); + if (newbase->n > 2) { + tgcount = (tgptr - model->lm3g.trigrams); + bgcount = (bgptr - model->lm3g.bigrams); + + /* Backoff weight (only if there are trigrams...) */ + bgptr->bo_wt2 = sorted_id(&sorted_bo_wt2, &bo_wt2); + + /* Find bigram segment for this bigram (this isn't + * used unless there are trigrams) */ + seg = bgcount >> LOG_BG_SEG_SZ; + /* If we just crossed a bigram segment boundary, then + * point tseg_base for the new segment to the current + * trigram pointer. */ + if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ) + model->lm3g.tseg_base[seg] = tgcount; + /* Now calculate the trigram offset. */ + bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg]; + E_DEBUG(2, ("bigram %d %s %s => trigram %d:%d\n", + bgcount, + newbase->word_str[wids[0]], + newbase->word_str[wids[1]], + seg, bgptr->trigrams)); + + /* And fill in successors' trigram info. 
*/ + for (titor = ngram_iter_successors(itor); + titor; ++tgptr, titor = ngram_iter_next(titor)) { + int32 prob3, dummy; + + assert(tgptr - model->lm3g.trigrams < newbase->n_counts[2]); + wids = ngram_iter_get(titor, &prob3, &dummy); + tgptr->wid = wids[2]; + tgptr->prob3 = sorted_id(&sorted_prob3, &prob3); + E_DEBUG(2, ("trigram %d %s %s %s => prob %d\n", + tgcount, + newbase->word_str[wids[0]], + newbase->word_str[wids[1]], + newbase->word_str[wids[2]], + tgptr->prob3)); + } + } + } + ngram_iter_free(uitor); + } + /* Add sentinal unigram and bigram records. */ + bgcount = bgptr - model->lm3g.bigrams; + tgcount = tgptr - model->lm3g.trigrams; + seg = bgcount >> LOG_BG_SEG_SZ; + if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ) + model->lm3g.tseg_base[seg] = tgcount; + model->lm3g.unigrams[i].bigrams = bgcount; + if (newbase->n > 2) + bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg]; + + /* Now create probability tables. */ + model->lm3g.n_prob2 = sorted_prob2.free; + model->lm3g.prob2 = vals_in_sorted_list(&sorted_prob2); + E_INFO("%8d = #bigrams created\n", newbase->n_counts[1]); + E_INFO("%8d = #prob2 entries\n", model->lm3g.n_prob2); + free_sorted_list(&sorted_prob2); + if (newbase->n > 2) { + /* Create trigram bo-wts array. */ + model->lm3g.n_bo_wt2 = sorted_bo_wt2.free; + model->lm3g.bo_wt2 = vals_in_sorted_list(&sorted_bo_wt2); + free_sorted_list(&sorted_bo_wt2); + E_INFO("%8d = #bo_wt2 entries\n", model->lm3g.n_bo_wt2); + /* Create trigram probability table. 
*/ + model->lm3g.n_prob3 = sorted_prob3.free; + model->lm3g.prob3 = vals_in_sorted_list(&sorted_prob3); + E_INFO("%8d = #trigrams created\n", newbase->n_counts[2]); + E_INFO("%8d = #prob3 entries\n", model->lm3g.n_prob3); + free_sorted_list(&sorted_prob3); + /* Initialize tginfo */ + model->lm3g.tginfo = ckd_calloc(newbase->n_counts[0], sizeof(tginfo_t *)); + model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t)); + } + + return model; +} + +static void +fwrite_int32(FILE *fh, int32 val) +{ + fwrite(&val, 4, 1, fh); +} + +static void +fwrite_ug(FILE *fh, unigram_t *ug, logmath_t *lmath) +{ + int32 bogus = -1; + float32 log10val; + + /* Bogus dictionary mapping field. */ + fwrite(&bogus, 4, 1, fh); + /* Convert values to log10. */ + log10val = logmath_log_to_log10(lmath, ug->prob1.l); + fwrite(&log10val, 4, 1, fh); + log10val = logmath_log_to_log10(lmath, ug->bo_wt1.l); + fwrite(&log10val, 4, 1, fh); + fwrite_int32(fh, ug->bigrams); +} + +static void +fwrite_bg(FILE *fh, bigram_t *bg) +{ + fwrite(bg, sizeof(*bg), 1, fh); +} + +static void +fwrite_tg(FILE *fh, trigram_t *tg) +{ + fwrite(tg, sizeof(*tg), 1, fh); +} + +/** Please look at the definition of + */ +static char const *fmtdesc[] = { + "BEGIN FILE FORMAT DESCRIPTION", + "Header string length (int32) and string (including trailing 0)", + "Original LM filename string-length (int32) and filename (including trailing 0)", + "(int32) version number (present iff value <= 0)", + "(int32) original LM file modification timestamp (iff version# present)", + "(int32) string-length and string (including trailing 0) (iff version# present)", + "... 
previous entry continued any number of times (iff version# present)", + "(int32) 0 (terminating sequence of strings) (iff version# present)", + "(int32) log_bg_seg_sz (present iff different from default value of LOG2_BG_SEG_SZ)", + "(int32) lm_t.ucount (must be > 0)", + "(int32) lm_t.bcount", + "(int32) lm_t.tcount", + "lm_t.ucount+1 unigrams (including sentinel)", + "lm_t.bcount+1 bigrams (including sentinel 64 bits (bg_t) each if version=-1/-2, 128 bits (bg32_t) each if version=-3", + "lm_t.tcount trigrams (present iff lm_t.tcount > 0 32 bits (tg_t) each if version=-1/-2, 64 bits (tg32_t) each if version=-3)", + "(int32) lm_t.n_prob2", + "(int32) lm_t.prob2[]", + "(int32) lm_t.n_bo_wt2 (present iff lm_t.tcount > 0)", + "(int32) lm_t.bo_wt2[] (present iff lm_t.tcount > 0)", + "(int32) lm_t.n_prob3 (present iff lm_t.tcount > 0)", + "(int32) lm_t.prob3[] (present iff lm_t.tcount > 0)", + "(int32) (lm_t.bcount+1)/BG_SEG_SZ+1 (present iff lm_t.tcount > 0)", + "(int32) lm_t.tseg_base[] (present iff lm_t.tcount > 0)", + "(int32) Sum(all word string-lengths, including trailing 0 for each)", + "All word strings (including trailing 0 for each)", + "END FILE FORMAT DESCRIPTION", + NULL, +}; + +static void +ngram_model_dmp_write_header(FILE * fh) +{ + int32 k; + k = strlen(darpa_hdr) + 1; + fwrite_int32(fh, k); + fwrite(darpa_hdr, 1, k, fh); +} + +static void +ngram_model_dmp_write_lm_filename(FILE * fh, const char *lmfile) +{ + int32 k; + + k = strlen(lmfile) + 1; + fwrite_int32(fh, k); + fwrite(lmfile, 1, k, fh); +} + +#define LMDMP_VERSION_TG_16BIT -1 /**< VERSION 1 is the simplest DMP file which + is trigram or lower which used 16 bits in + bigram and trigram.*/ + +static void +ngram_model_dmp_write_version(FILE * fh, int32 mtime) +{ + fwrite_int32(fh, LMDMP_VERSION_TG_16BIT); /* version # */ + fwrite_int32(fh, mtime); +} + +static void +ngram_model_dmp_write_ngram_counts(FILE * fh, ngram_model_t *model) +{ + fwrite_int32(fh, model->n_counts[0]); + fwrite_int32(fh, 
model->n_counts[1]); + fwrite_int32(fh, model->n_counts[2]); +} + +static void +ngram_model_dmp_write_fmtdesc(FILE * fh) +{ + int32 i, k; + long pos; + + /* Write file format description into header */ + for (i = 0; fmtdesc[i] != NULL; i++) { + k = strlen(fmtdesc[i]) + 1; + fwrite_int32(fh, k); + fwrite(fmtdesc[i], 1, k, fh); + } + /* Pad it out in order to achieve 32-bit alignment */ + pos = ftell(fh); + k = pos & 3; + if (k) { + fwrite_int32(fh, 4-k); + fwrite("!!!!", 1, 4-k, fh); + } + fwrite_int32(fh, 0); +} + +static void +ngram_model_dmp_write_unigram(FILE *fh, ngram_model_t *model) +{ + ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; + int32 i; + + for (i = 0; i <= model->n_counts[0]; i++) { + fwrite_ug(fh, &(lm->lm3g.unigrams[i]), model->lmath); + } +} + + +static void +ngram_model_dmp_write_bigram(FILE *fh, ngram_model_t *model) +{ + ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; + int32 i; + + for (i = 0; i <= model->n_counts[1]; i++) { + fwrite_bg(fh, &(lm->lm3g.bigrams[i])); + } + +} + +static void +ngram_model_dmp_write_trigram(FILE *fh, ngram_model_t *model) +{ + ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; + int32 i; + + for (i = 0; i < model->n_counts[2]; i++) { + fwrite_tg(fh, &(lm->lm3g.trigrams[i])); + } +} + +static void +ngram_model_dmp_write_bgprob(FILE *fh, ngram_model_t *model) +{ + ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; + int32 i; + + fwrite_int32(fh, lm->lm3g.n_prob2); + for (i = 0; i < lm->lm3g.n_prob2; i++) { + float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.prob2[i].l); + fwrite(&log10val, 4, 1, fh); + } +} + +static void +ngram_model_dmp_write_tgbowt(FILE *fh, ngram_model_t *model) +{ + ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; + int32 i; + + fwrite_int32(fh, lm->lm3g.n_bo_wt2); + for (i = 0; i < lm->lm3g.n_bo_wt2; i++) { + float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.bo_wt2[i].l); + fwrite(&log10val, 4, 1, fh); + } +} + +static void 
+ngram_model_dmp_write_tgprob(FILE *fh, ngram_model_t *model) +{ + ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; + int32 i; + + fwrite_int32(fh, lm->lm3g.n_prob3); + for (i = 0; i < lm->lm3g.n_prob3; i++) { + float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.prob3[i].l); + fwrite(&log10val, 4, 1, fh); + } +} + +static void +ngram_model_dmp_write_tg_segbase(FILE *fh, ngram_model_t *model) +{ + ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; + int32 i, k; + + k = (model->n_counts[1] + 1) / BG_SEG_SZ + 1; + fwrite_int32(fh, k); + for (i = 0; i < k; i++) + fwrite_int32(fh, lm->lm3g.tseg_base[i]); +} + +static void +ngram_model_dmp_write_wordstr(FILE *fh, ngram_model_t *model) +{ + int32 i, k; + + k = 0; + for (i = 0; i < model->n_counts[0]; i++) + k += strlen(model->word_str[i]) + 1; + fwrite_int32(fh, k); + for (i = 0; i < model->n_counts[0]; i++) + fwrite(model->word_str[i], 1, + strlen(model->word_str[i]) + 1, fh); +} + +int +ngram_model_dmp_write(ngram_model_t *base, + const char *file_name) +{ + ngram_model_dmp_t *model; + ngram_model_t *newbase; + FILE *fh; + + /* First, construct a DMP model from the base model. */ + model = ngram_model_dmp_build(base); + newbase = &model->base; + + /* Now write it, confident in the knowledge that it's the right + * kind of language model internally. 
*/ + if ((fh = fopen(file_name, "wb")) == NULL) { + E_ERROR("Cannot create file %s\n", file_name); + return -1; + } + ngram_model_dmp_write_header(fh); + ngram_model_dmp_write_lm_filename(fh, file_name); + ngram_model_dmp_write_version(fh, 0); + ngram_model_dmp_write_fmtdesc(fh); + ngram_model_dmp_write_ngram_counts(fh, newbase); + ngram_model_dmp_write_unigram(fh, newbase); + if (newbase->n > 1) { + ngram_model_dmp_write_bigram(fh, newbase); + if (newbase->n > 2) { + ngram_model_dmp_write_trigram(fh, newbase); + } + ngram_model_dmp_write_bgprob(fh, newbase); + if (newbase->n > 2) { + ngram_model_dmp_write_tgbowt(fh, newbase); + ngram_model_dmp_write_tgprob(fh, newbase); + ngram_model_dmp_write_tg_segbase(fh, newbase); + } + } + ngram_model_dmp_write_wordstr(fh, newbase); + ngram_model_free(newbase); + + return fclose(fh); +} + +static int +ngram_model_dmp_apply_weights(ngram_model_t *base, float32 lw, + float32 wip, float32 uw) +{ + ngram_model_dmp_t *model = (ngram_model_dmp_t *)base; + lm3g_apply_weights(base, &model->lm3g, lw, wip, uw); + return 0; +} + +/* Lousy "templating" for things that are largely the same in DMP and + * ARPA models, except for the bigram and trigram types and some + * names. 
*/ +#define NGRAM_MODEL_TYPE ngram_model_dmp_t +#include "lm3g_templates.c" + +static void +ngram_model_dmp_free(ngram_model_t *base) +{ + ngram_model_dmp_t *model = (ngram_model_dmp_t *)base; + + ckd_free(model->lm3g.unigrams); + ckd_free(model->lm3g.prob2); + if (model->dump_mmap) { + mmio_file_unmap(model->dump_mmap); + } + else { + ckd_free(model->lm3g.bigrams); + if (base->n > 2) { + ckd_free(model->lm3g.trigrams); + ckd_free(model->lm3g.tseg_base); + } + } + if (base->n > 2) { + ckd_free(model->lm3g.bo_wt2); + ckd_free(model->lm3g.prob3); + } + + lm3g_tginfo_free(base, &model->lm3g); +} + +static ngram_funcs_t ngram_model_dmp_funcs = { + ngram_model_dmp_free, /* free */ + ngram_model_dmp_apply_weights, /* apply_weights */ + lm3g_template_score, /* score */ + lm3g_template_raw_score, /* raw_score */ + lm3g_template_add_ug, /* add_ug */ + lm3g_template_flush, /* flush */ + lm3g_template_iter, /* iter */ + lm3g_template_mgrams, /* mgrams */ + lm3g_template_successors, /* successors */ + lm3g_template_iter_get, /* iter_get */ + lm3g_template_iter_next, /* iter_next */ + lm3g_template_iter_free /* iter_free */ +}; diff --git a/media/sphinxbase/src/libsphinxbase/lm/ngram_model_dmp.h b/media/sphinxbase/src/libsphinxbase/lm/ngram_model_dmp.h new file mode 100644 index 000000000..a3b141ad1 --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/lm/ngram_model_dmp.h @@ -0,0 +1,92 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2007 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ +/* + * \file ngram_model_dmp.h DMP format for N-Gram models + * + * Author: David Huggins-Daines <dhuggins@cs.cmu.edu> + */ + +#ifndef __NGRAM_MODEL_DMP_H__ +#define __NGRAM_MODEL_DMP_H__ + +#include "sphinxbase/mmio.h" + +#include "ngram_model_internal.h" +#include "lm3g_model.h" + +/** + * On-disk representation of bigrams. + * + * Records of this type are read from and written to dump files as raw + * bytes, so the order and width of the fields must not change. + */ +struct bigram_s { + uint16 wid; /**< Index of unigram entry for this. (NOT dictionary id.) 
*/ + uint16 prob2; /**< Index into array of actual bigram probs */ + uint16 bo_wt2; /**< Index into array of actual bigram backoff wts */ + uint16 trigrams; /**< Index of 1st entry in lm_t.trigrams[], + RELATIVE TO its segment base (see lm3g_model.h) */ +}; + +/** + * On-disk representation of trigrams. + * + * As with bigrams, trigram prob info kept in a separate table for conserving + * memory space. + */ +struct trigram_s { + uint16 wid; /**< Index of unigram entry for this. (NOT dictionary id.) */ + uint16 prob3; /**< Index into array of actual trigram probs */ +}; + +/** + * Subclass of ngram_model for DMP-format models, whether read from a + * dump file or built by ngram_model_dmp_build(). + */ +typedef struct ngram_model_dmp_s { + ngram_model_t base; /**< Base ngram_model_t structure */ + lm3g_model_t lm3g; /**< Common lm3g_model_t structure */ + mmio_file_t *dump_mmap; /**< mmap() of dump file (or NULL if none) */ +} ngram_model_dmp_t; + +/** + * Construct a DMP format model from a generic base model. + * + * Note: If base is already a DMP format model, this just calls + * ngram_model_retain(), and any changes will also be made in the base + * model. + * + * @param base Model to convert. + * @return The DMP format model (for a DMP base, the result of + * ngram_model_retain(base)). + */ +ngram_model_dmp_t *ngram_model_dmp_build(ngram_model_t *base); + + +#endif /* __NGRAM_MODEL_DMP_H__ */ diff --git a/media/sphinxbase/src/libsphinxbase/lm/ngram_model_internal.h b/media/sphinxbase/src/libsphinxbase/lm/ngram_model_internal.h new file mode 100644 index 000000000..dcc7b5ae3 --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/lm/ngram_model_internal.h @@ -0,0 +1,282 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2007 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ +/* + * \file ngram_model_internal.h Internal structures for N-Gram models + * + * Author: David Huggins-Daines <dhuggins@cs.cmu.edu> + */ + +#ifndef __NGRAM_MODEL_INTERNAL_H__ +#define __NGRAM_MODEL_INTERNAL_H__ + +#include "sphinxbase/ngram_model.h" +#include "sphinxbase/hash_table.h" + +/** + * Common implementation of ngram_model_t. + * + * The details of bigram, trigram, and higher-order N-gram storage, if any, can + * vary somewhat depending on the file format in use. 
+ */ +struct ngram_model_s { + int refcount; /**< Reference count */ + int32 *n_counts; /**< Counts for 1, 2, 3, ... grams */ + int32 n_1g_alloc; /**< Number of allocated word strings (for new word addition) */ + int32 n_words; /**< Number of actual word strings (NOT the same as the + number of unigrams, due to class words). */ + uint8 n; /**< This is an n-gram model (1, 2, 3, ...). */ + uint8 n_classes; /**< Number of classes (maximum 128) */ + uint8 writable; /**< Are word strings writable? */ + uint8 flags; /**< Any other flags we might care about + (FIXME: Merge this and writable) */ + logmath_t *lmath; /**< Log-math object */ + float32 lw; /**< Language model scaling factor */ + int32 log_wip; /**< Log of word insertion penalty */ + int32 log_uw; /**< Log of unigram weight */ + int32 log_uniform; /**< Log of uniform (0-gram) probability */ + int32 log_uniform_weight; /**< Log of uniform weight (i.e. 1 - unigram weight) */ + int32 log_zero; /**< Zero probability, cached here for quick lookup */ + char **word_str; /**< Unigram names */ + hash_table_t *wid; /**< Mapping of unigram names to word IDs. */ + int32 *tmp_wids; /**< Temporary array of word IDs for ngram_model_get_ngram() */ + struct ngram_class_s **classes; /**< Word class definitions. */ + struct ngram_funcs_s *funcs; /**< Implementation-specific methods. */ +}; + +/** + * Implementation of ngram_class_t. + */ +struct ngram_class_s { + int32 tag_wid; /**< Base word ID for this class tag */ + int32 start_wid; /**< Starting base word ID for this class' words */ + int32 n_words; /**< Number of base words for this class */ + int32 *prob1; /**< Probability table for base words */ + /** + * Custom hash table for additional words. 
+ */ + struct ngram_hash_s { + int32 wid; /**< Word ID of this bucket */ + int32 prob1; /**< Probability for this word */ + int32 next; /**< Index of next bucket (or -1 for no collision) */ + } *nword_hash; + int32 n_hash; /**< Number of buckets in nword_hash (power of 2) */ + int32 n_hash_inuse; /**< Number of words in nword_hash */ +}; + +#define NGRAM_HASH_SIZE 128 + +/** Low 24 bits of a word ID. */ +#define NGRAM_BASEWID(wid) ((wid)&0xffffff) +/** Bits 24 through 30 of a word ID. */ +#define NGRAM_CLASSID(wid) (((wid)>>24) & 0x7f) +/** Combine wid with classid shifted to bit 24 and set bit 31. */ +#define NGRAM_CLASSWID(wid,classid) (((classid)<<24) | 0x80000000 | (wid)) +/** Non-zero when bit 31 of a word ID is set. */ +#define NGRAM_IS_CLASSWID(wid) ((wid)&0x80000000) + +#define UG_ALLOC_STEP 10 + +/** Implementation-specific functions for operating on ngram_model_t objects */ +typedef struct ngram_funcs_s { + /** + * Implementation-specific function for freeing an ngram_model_t. + */ + void (*free)(ngram_model_t *model); + /** + * Implementation-specific function for applying language model weights. + */ + int (*apply_weights)(ngram_model_t *model, + float32 lw, + float32 wip, + float32 uw); + /** + * Implementation-specific function for querying language model score. + */ + int32 (*score)(ngram_model_t *model, + int32 wid, + int32 *history, + int32 n_hist, + int32 *n_used); + /** + * Implementation-specific function for querying raw language + * model probability. + */ + int32 (*raw_score)(ngram_model_t *model, + int32 wid, + int32 *history, + int32 n_hist, + int32 *n_used); + /** + * Implementation-specific function for adding unigrams. + * + * This function updates the internal structures of a language + * model to add the given unigram with the given weight (defined + * as a log-factor applied to the uniform distribution). This + * includes reallocating or otherwise resizing the set of unigrams. + * + * @return The language model score (not raw log-probability) of + * the new word, or 0 for failure. 
+ */ + int32 (*add_ug)(ngram_model_t *model, + int32 wid, int32 lweight); + /** + * Implementation-specific function for purging N-Gram cache + */ + void (*flush)(ngram_model_t *model); + + /** + * Implementation-specific function for creating an iterator from + * a word ID and its history. + * NOTE(review): presumably positioned at that N-gram; confirm. + */ + ngram_iter_t * (*iter)(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist); + + /** + * Implementation-specific function for creating an iterator over + * M-grams selected by m. + * NOTE(review): how m maps to N-gram order is not stated here; confirm. + */ + ngram_iter_t * (*mgrams)(ngram_model_t *model, int32 m); + + /** + * Implementation-specific function for creating an iterator over + * the successors of an existing iterator. + */ + ngram_iter_t * (*successors)(ngram_iter_t *itor); + + /** + * Implementation-specific function for reading an iterator's + * current entry: returns word IDs and stores values through + * out_score and out_bowt. + */ + int32 const * (*iter_get)(ngram_iter_t *itor, + int32 *out_score, + int32 *out_bowt); + + /** + * Implementation-specific function for advancing an iterator. + * NOTE(review): presumably returns NULL at the end; confirm. + */ + ngram_iter_t * (*iter_next)(ngram_iter_t *itor); + + /** + * Implementation-specific function for freeing an iterator. + */ + void (*iter_free)(ngram_iter_t *itor); +} ngram_funcs_t; + +/** + * Base iterator structure for N-grams. + */ +struct ngram_iter_s { + ngram_model_t *model; /**< Model this iterator refers to (NOTE(review): inferred from the field name; confirm). */ + int32 *wids; /**< Scratch space for word IDs. */ + int16 m; /**< Order of history. */ + int16 successor; /**< Is this a successor iterator? */ +}; + +/** + * One class definition from a classdef file. 
+ */ +typedef struct classdef_s { + char **words; /**< Words (NOTE(review): presumably n_words entries; confirm). */ + float32 *weights; /**< Weights (NOTE(review): presumably parallel to words; confirm). */ + int32 n_words; /**< Number of words. */ +} classdef_t; + +/** + * Initialize the base ngram_model_t structure. + */ +int32 +ngram_model_init(ngram_model_t *model, + ngram_funcs_t *funcs, + logmath_t *lmath, + int32 n, int32 n_unigram); + +/** + * Read an N-Gram model from an ARPABO text file. + */ +ngram_model_t *ngram_model_arpa_read(cmd_ln_t *config, + const char *file_name, + logmath_t *lmath); +/** + * Read an N-Gram model from a Sphinx .DMP binary file. + */ +ngram_model_t *ngram_model_dmp_read(cmd_ln_t *config, + const char *file_name, + logmath_t *lmath); +/** + * Read an N-Gram model from a Sphinx .DMP32 binary file. 
+ */ +ngram_model_t *ngram_model_dmp32_read(cmd_ln_t *config, + const char *file_name, + logmath_t *lmath); + +/** + * Write an N-Gram model to an ARPABO text file. + */ +int ngram_model_arpa_write(ngram_model_t *model, + const char *file_name); +/** + * Write an N-Gram model to a Sphinx .DMP binary file. + * + * @return 0 on success, non-zero on failure. + */ +int ngram_model_dmp_write(ngram_model_t *model, + const char *file_name); + +/** + * Read a class definition (classdef) file. + */ +int32 read_classdef_file(hash_table_t *classes, const char *classdef_file); + +/** + * Free a class definition. + */ +void classdef_free(classdef_t *classdef); + +/** + * Allocate and initialize an N-Gram class. + */ +ngram_class_t *ngram_class_new(ngram_model_t *model, int32 tag_wid, + int32 start_wid, glist_t classwords); + +/** + * Deallocate an N-Gram class. + */ +void ngram_class_free(ngram_class_t *lmclass); + +/** + * Get the in-class log probability for a word in an N-Gram class. + * + * @return This probability, or 1 if word not found. + */ +int32 ngram_class_prob(ngram_class_t *lmclass, int32 wid); + +/** + * Initialize base M-Gram iterator structure. + */ +void ngram_iter_init(ngram_iter_t *itor, ngram_model_t *model, + int m, int successor); + +#endif /* __NGRAM_MODEL_INTERNAL_H__ */ diff --git a/media/sphinxbase/src/libsphinxbase/lm/ngram_model_set.c b/media/sphinxbase/src/libsphinxbase/lm/ngram_model_set.c new file mode 100644 index 000000000..50b7557ae --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/lm/ngram_model_set.c @@ -0,0 +1,870 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 2008 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ +/** + * @file ngram_model_set.c Set of language models. + * @author David Huggins-Daines <dhuggins@cs.cmu.edu> + */ + +#include <string.h> +#include <stdlib.h> + +#include "sphinxbase/err.h" +#include "sphinxbase/ckd_alloc.h" +#include "sphinxbase/strfuncs.h" +#include "sphinxbase/filename.h" + +#include "ngram_model_set.h" + +static ngram_funcs_t ngram_model_set_funcs; + +static int +my_compare(const void *a, const void *b) +{ + /* Make sure <UNK> floats to the beginning. 
*/ + if (strcmp(*(char * const *)a, "<UNK>") == 0) + return -1; + else if (strcmp(*(char * const *)b, "<UNK>") == 0) + return 1; + else + return strcmp(*(char * const *)a, *(char * const *)b); +} + +static void +build_widmap(ngram_model_t *base, logmath_t *lmath, int32 n) +{ + ngram_model_set_t *set = (ngram_model_set_t *)base; + ngram_model_t **models = set->lms; + hash_table_t *vocab; + glist_t hlist; + gnode_t *gn; + int32 i; + + /* Construct a merged vocabulary and a set of word-ID mappings. */ + vocab = hash_table_new(models[0]->n_words, FALSE); + /* Create the set of merged words. */ + for (i = 0; i < set->n_models; ++i) { + int32 j; + for (j = 0; j < models[i]->n_words; ++j) { + /* Ignore collisions. */ + (void)hash_table_enter_int32(vocab, models[i]->word_str[j], j); + } + } + /* Create the array of words, then sort it. */ + if (hash_table_lookup(vocab, "<UNK>", NULL) != 0) + (void)hash_table_enter_int32(vocab, "<UNK>", 0); + /* Now we know the number of unigrams, initialize the base model. */ + ngram_model_init(base, &ngram_model_set_funcs, lmath, n, hash_table_inuse(vocab)); + base->writable = FALSE; /* We will reuse the pointers from the submodels. */ + i = 0; + hlist = hash_table_tolist(vocab, NULL); + for (gn = hlist; gn; gn = gnode_next(gn)) { + hash_entry_t *ent = gnode_ptr(gn); + base->word_str[i++] = (char *)ent->key; + } + glist_free(hlist); + qsort(base->word_str, base->n_words, sizeof(*base->word_str), my_compare); + + /* Now create the word ID mappings. */ + if (set->widmap) + ckd_free_2d((void **)set->widmap); + set->widmap = (int32 **) ckd_calloc_2d(base->n_words, set->n_models, + sizeof(**set->widmap)); + for (i = 0; i < base->n_words; ++i) { + int32 j; + /* Also create the master wid mapping. 
*/ + (void)hash_table_enter_int32(base->wid, base->word_str[i], i); + /* printf("%s: %d => ", base->word_str[i], i); */ + for (j = 0; j < set->n_models; ++j) { + set->widmap[i][j] = ngram_wid(models[j], base->word_str[i]); + /* printf("%d ", set->widmap[i][j]); */ + } + /* printf("\n"); */ + } + hash_table_free(vocab); +} + +ngram_model_t * +ngram_model_set_init(cmd_ln_t *config, + ngram_model_t **models, + char **names, + const float32 *weights, + int32 n_models) +{ + ngram_model_set_t *model; + ngram_model_t *base; + logmath_t *lmath; + int32 i, n; + + if (n_models == 0) /* WTF */ + return NULL; + + /* Do consistency checking on the models. They must all use the + * same logbase and shift. */ + lmath = models[0]->lmath; + for (i = 1; i < n_models; ++i) { + if (logmath_get_base(models[i]->lmath) != logmath_get_base(lmath) + || logmath_get_shift(models[i]->lmath) != logmath_get_shift(lmath)) { + E_ERROR("Log-math parameters don't match, will not create LM set\n"); + return NULL; + } + } + + /* Allocate the combined model, initialize it. */ + model = ckd_calloc(1, sizeof(*model)); + base = &model->base; + model->n_models = n_models; + model->lms = ckd_calloc(n_models, sizeof(*model->lms)); + model->names = ckd_calloc(n_models, sizeof(*model->names)); + /* Initialize weights to a uniform distribution */ + model->lweights = ckd_calloc(n_models, sizeof(*model->lweights)); + { + int32 uniform = logmath_log(lmath, 1.0/n_models); + for (i = 0; i < n_models; ++i) + model->lweights[i] = uniform; + } + /* Default to interpolate if weights were given. */ + if (weights) + model->cur = -1; + + n = 0; + for (i = 0; i < n_models; ++i) { + model->lms[i] = ngram_model_retain(models[i]); + model->names[i] = ckd_salloc(names[i]); + if (weights) + model->lweights[i] = logmath_log(lmath, weights[i]); + /* N is the maximum of all merged models. */ + if (models[i]->n > n) + n = models[i]->n; + } + /* Allocate the history mapping table. 
*/ + model->maphist = ckd_calloc(n - 1, sizeof(*model->maphist)); + + /* Now build the word-ID mapping and merged vocabulary. */ + build_widmap(base, lmath, n); + return base; +} + +ngram_model_t * +ngram_model_set_read(cmd_ln_t *config, + const char *lmctlfile, + logmath_t *lmath) +{ + FILE *ctlfp; + glist_t lms = NULL; + glist_t lmnames = NULL; + __BIGSTACKVARIABLE__ char str[1024]; + ngram_model_t *set = NULL; + hash_table_t *classes; + char *basedir, *c; + + /* Read all the class definition files to accumulate a mapping of + * classnames to definitions. */ + classes = hash_table_new(0, FALSE); + if ((ctlfp = fopen(lmctlfile, "r")) == NULL) { + E_ERROR_SYSTEM("Failed to open %s", lmctlfile); + return NULL; + } + + /* Try to find the base directory to append to relative paths in + * the lmctl file. */ + if ((c = strrchr(lmctlfile, '/')) || (c = strrchr(lmctlfile, '\\'))) { + /* Include the trailing slash. */ + basedir = ckd_calloc(c - lmctlfile + 2, 1); + memcpy(basedir, lmctlfile, c - lmctlfile + 1); + } + else { + basedir = NULL; + } + E_INFO("Reading LM control file '%s'\n", lmctlfile); + if (basedir) + E_INFO("Will prepend '%s' to unqualified paths\n", basedir); + + if (fscanf(ctlfp, "%1023s", str) == 1) { + if (strcmp(str, "{") == 0) { + /* Load LMclass files */ + while ((fscanf(ctlfp, "%1023s", str) == 1) + && (strcmp(str, "}") != 0)) { + char *deffile; + if (basedir && !path_is_absolute(str)) + deffile = string_join(basedir, str, NULL); + else + deffile = ckd_salloc(str); + E_INFO("Reading classdef from '%s'\n", deffile); + if (read_classdef_file(classes, deffile) < 0) { + ckd_free(deffile); + goto error_out; + } + ckd_free(deffile); + } + + if (strcmp(str, "}") != 0) { + E_ERROR("Unexpected EOF in %s\n", lmctlfile); + goto error_out; + } + + /* This might be the first LM name. */ + if (fscanf(ctlfp, "%1023s", str) != 1) + str[0] = '\0'; + } + } + else + str[0] = '\0'; + + /* Read in one LM at a time and add classes to them as necessary. 
*/ + while (str[0] != '\0') { + char *lmfile; + ngram_model_t *lm; + + if (basedir && str[0] != '/' && str[0] != '\\') + lmfile = string_join(basedir, str, NULL); + else + lmfile = ckd_salloc(str); + E_INFO("Reading lm from '%s'\n", lmfile); + lm = ngram_model_read(config, lmfile, NGRAM_AUTO, lmath); + if (lm == NULL) { + ckd_free(lmfile); + goto error_out; + } + if (fscanf(ctlfp, "%1023s", str) != 1) { + E_ERROR("LMname missing after LMFileName '%s'\n", lmfile); + ckd_free(lmfile); + goto error_out; + } + ckd_free(lmfile); + lms = glist_add_ptr(lms, lm); + lmnames = glist_add_ptr(lmnames, ckd_salloc(str)); + + if (fscanf(ctlfp, "%1023s", str) == 1) { + if (strcmp(str, "{") == 0) { + /* LM uses classes; read their names */ + while ((fscanf(ctlfp, "%1023s", str) == 1) && + (strcmp(str, "}") != 0)) { + void *val; + classdef_t *classdef; + + if (hash_table_lookup(classes, str, &val) == -1) { + E_ERROR("Unknown class %s in control file\n", str); + goto error_out; + } + classdef = val; + if (ngram_model_add_class(lm, str, 1.0, + classdef->words, classdef->weights, + classdef->n_words) < 0) { + goto error_out; + } + E_INFO("Added class %s containing %d words\n", + str, classdef->n_words); + } + if (strcmp(str, "}") != 0) { + E_ERROR("Unexpected EOF in %s\n", lmctlfile); + goto error_out; + } + if (fscanf(ctlfp, "%1023s", str) != 1) + str[0] = '\0'; + } + } + else + str[0] = '\0'; + } + fclose(ctlfp); + + /* Now construct arrays out of lms and lmnames, and build an + * ngram_model_set. 
*/ + lms = glist_reverse(lms); + lmnames = glist_reverse(lmnames); + { + int32 n_models; + ngram_model_t **lm_array; + char **name_array; + gnode_t *lm_node, *name_node; + int32 i; + + n_models = glist_count(lms); + lm_array = ckd_calloc(n_models, sizeof(*lm_array)); + name_array = ckd_calloc(n_models, sizeof(*name_array)); + lm_node = lms; + name_node = lmnames; + for (i = 0; i < n_models; ++i) { + lm_array[i] = gnode_ptr(lm_node); + name_array[i] = gnode_ptr(name_node); + lm_node = gnode_next(lm_node); + name_node = gnode_next(name_node); + } + set = ngram_model_set_init(config, lm_array, name_array, + NULL, n_models); + ckd_free(lm_array); + ckd_free(name_array); + } +error_out: + { + gnode_t *gn; + glist_t hlist; + + if (set == NULL) { + for (gn = lms; gn; gn = gnode_next(gn)) { + ngram_model_free(gnode_ptr(gn)); + } + } + glist_free(lms); + for (gn = lmnames; gn; gn = gnode_next(gn)) { + ckd_free(gnode_ptr(gn)); + } + glist_free(lmnames); + hlist = hash_table_tolist(classes, NULL); + for (gn = hlist; gn; gn = gnode_next(gn)) { + hash_entry_t *he = gnode_ptr(gn); + ckd_free((char *)he->key); + classdef_free(he->val); + } + glist_free(hlist); + hash_table_free(classes); + ckd_free(basedir); + } + return set; +} + +int32 +ngram_model_set_count(ngram_model_t *base) +{ + ngram_model_set_t *set = (ngram_model_set_t *)base; + return set->n_models; +} + +ngram_model_set_iter_t * +ngram_model_set_iter(ngram_model_t *base) +{ + ngram_model_set_t *set = (ngram_model_set_t *)base; + ngram_model_set_iter_t *itor; + + if (set == NULL || set->n_models == 0) + return NULL; + itor = ckd_calloc(1, sizeof(*itor)); + itor->set = set; + return itor; +} + +ngram_model_set_iter_t * +ngram_model_set_iter_next(ngram_model_set_iter_t *itor) +{ + if (++itor->cur == itor->set->n_models) { + ngram_model_set_iter_free(itor); + return NULL; + } + return itor; +} + +void +ngram_model_set_iter_free(ngram_model_set_iter_t *itor) +{ + ckd_free(itor); +} + +ngram_model_t * 
+ngram_model_set_iter_model(ngram_model_set_iter_t *itor, + char const **lmname) +{ + if (lmname) *lmname = itor->set->names[itor->cur]; + return itor->set->lms[itor->cur]; +} + +ngram_model_t * +ngram_model_set_lookup(ngram_model_t *base, + const char *name) +{ + ngram_model_set_t *set = (ngram_model_set_t *)base; + int32 i; + + if (name == NULL) { + if (set->cur == -1) + return NULL; + else + return set->lms[set->cur]; + } + + /* There probably won't be very many submodels. */ + for (i = 0; i < set->n_models; ++i) + if (0 == strcmp(set->names[i], name)) + break; + if (i == set->n_models) + return NULL; + return set->lms[i]; +} + +ngram_model_t * +ngram_model_set_select(ngram_model_t *base, + const char *name) +{ + ngram_model_set_t *set = (ngram_model_set_t *)base; + int32 i; + + /* There probably won't be very many submodels. */ + for (i = 0; i < set->n_models; ++i) + if (0 == strcmp(set->names[i], name)) + break; + if (i == set->n_models) + return NULL; + set->cur = i; + return set->lms[set->cur]; +} + +const char * +ngram_model_set_current(ngram_model_t *base) +{ + ngram_model_set_t *set = (ngram_model_set_t *)base; + + if (set->cur == -1) + return NULL; + else + return set->names[set->cur]; +} + +int32 +ngram_model_set_current_wid(ngram_model_t *base, + int32 set_wid) +{ + ngram_model_set_t *set = (ngram_model_set_t *)base; + + if (set->cur == -1 || set_wid >= base->n_words) + return NGRAM_INVALID_WID; + else + return set->widmap[set_wid][set->cur]; +} + +int32 +ngram_model_set_known_wid(ngram_model_t *base, + int32 set_wid) +{ + ngram_model_set_t *set = (ngram_model_set_t *)base; + + if (set_wid >= base->n_words) + return FALSE; + else if (set->cur == -1) { + int32 i; + for (i = 0; i < set->n_models; ++i) { + if (set->widmap[set_wid][i] != ngram_unknown_wid(set->lms[i])) + return TRUE; + } + return FALSE; + } + else + return (set->widmap[set_wid][set->cur] + != ngram_unknown_wid(set->lms[set->cur])); +} + +ngram_model_t * 
+ngram_model_set_interp(ngram_model_t *base, + const char **names, + const float32 *weights) +{ + ngram_model_set_t *set = (ngram_model_set_t *)base; + + /* If we have a set of weights here, then set them. */ + if (names && weights) { + int32 i, j; + + /* We hope there aren't many models. */ + for (i = 0; i < set->n_models; ++i) { + for (j = 0; j < set->n_models; ++j) + if (0 == strcmp(names[i], set->names[j])) + break; + if (j == set->n_models) { + E_ERROR("Unknown LM name %s\n", names[i]); + return NULL; + } + set->lweights[j] = logmath_log(base->lmath, weights[i]); + } + } + else if (weights) { + memcpy(set->lweights, weights, set->n_models * sizeof(*set->lweights)); + } + /* Otherwise just enable existing weights. */ + set->cur = -1; + return base; +} + +ngram_model_t * +ngram_model_set_add(ngram_model_t *base, + ngram_model_t *model, + const char *name, + float32 weight, + int reuse_widmap) + +{ + ngram_model_set_t *set = (ngram_model_set_t *)base; + float32 fprob; + int32 scale, i; + + /* Add it to the array of lms. */ + ++set->n_models; + set->lms = ckd_realloc(set->lms, set->n_models * sizeof(*set->lms)); + set->lms[set->n_models - 1] = model; + set->names = ckd_realloc(set->names, set->n_models * sizeof(*set->names)); + set->names[set->n_models - 1] = ckd_salloc(name); + /* Expand the history mapping table if necessary. */ + if (model->n > base->n) { + base->n = model->n; + set->maphist = ckd_realloc(set->maphist, + (model->n - 1) * sizeof(*set->maphist)); + } + + /* Renormalize the interpolation weights. */ + fprob = weight * 1.0 / set->n_models; + set->lweights = ckd_realloc(set->lweights, + set->n_models * sizeof(*set->lweights)); + set->lweights[set->n_models - 1] = logmath_log(base->lmath, fprob); + /* Now normalize everything else to fit it in. This is + * accomplished by simply scaling all the other probabilities + * by (1-fprob). 
*/ + scale = logmath_log(base->lmath, 1.0 - fprob); + for (i = 0; i < set->n_models - 1; ++i) + set->lweights[i] += scale; + + /* Reuse the old word ID mapping if requested. */ + if (reuse_widmap) { + int32 **new_widmap; + + /* Tack another column onto the widmap array. */ + new_widmap = (int32 **)ckd_calloc_2d(base->n_words, set->n_models, + sizeof (**new_widmap)); + for (i = 0; i < base->n_words; ++i) { + /* Copy all the existing mappings. */ + memcpy(new_widmap[i], set->widmap[i], + (set->n_models - 1) * sizeof(**new_widmap)); + /* Create the new mapping. */ + new_widmap[i][set->n_models-1] = ngram_wid(model, base->word_str[i]); + } + ckd_free_2d((void **)set->widmap); + set->widmap = new_widmap; + } + else { + build_widmap(base, base->lmath, base->n); + } + return model; +} + +ngram_model_t * +ngram_model_set_remove(ngram_model_t *base, + const char *name, + int reuse_widmap) +{ + ngram_model_set_t *set = (ngram_model_set_t *)base; + ngram_model_t *submodel; + int32 lmidx, scale, n, i; + float32 fprob; + + for (lmidx = 0; lmidx < set->n_models; ++lmidx) + if (0 == strcmp(name, set->names[lmidx])) + break; + if (lmidx == set->n_models) + return NULL; + submodel = set->lms[lmidx]; + + /* Renormalize the interpolation weights by scaling them by + * 1/(1-fprob) */ + fprob = logmath_exp(base->lmath, set->lweights[lmidx]); + scale = logmath_log(base->lmath, 1.0 - fprob); + + /* Remove it from the array of lms, renormalize remaining weights, + * and recalcluate n. */ + --set->n_models; + n = 0; + ckd_free(set->names[lmidx]); + set->names[lmidx] = NULL; + for (i = 0; i < set->n_models; ++i) { + if (i >= lmidx) { + set->lms[i] = set->lms[i+1]; + set->names[i] = set->names[i+1]; + set->lweights[i] = set->lweights[i+1]; + } + set->lweights[i] -= scale; + if (set->lms[i]->n > n) + n = set->lms[i]->n; + } + /* There's no need to shrink these arrays. */ + set->lms[set->n_models] = NULL; + set->lweights[set->n_models] = base->log_zero; + /* No need to shrink maphist either. 
*/ + + /* Reuse the existing word ID mapping if requested. */ + if (reuse_widmap) { + /* Just go through and shrink each row. */ + for (i = 0; i < base->n_words; ++i) { + memmove(set->widmap[i] + lmidx, set->widmap[i] + lmidx + 1, + (set->n_models - lmidx) * sizeof(**set->widmap)); + } + } + else { + build_widmap(base, base->lmath, n); + } + return submodel; +} + +void +ngram_model_set_map_words(ngram_model_t *base, + const char **words, + int32 n_words) +{ + ngram_model_set_t *set = (ngram_model_set_t *)base; + int32 i; + + /* Recreate the word mapping. */ + if (base->writable) { + for (i = 0; i < base->n_words; ++i) { + ckd_free(base->word_str[i]); + } + } + ckd_free(base->word_str); + ckd_free_2d((void **)set->widmap); + base->writable = TRUE; + base->n_words = base->n_1g_alloc = n_words; + base->word_str = ckd_calloc(n_words, sizeof(*base->word_str)); + set->widmap = (int32 **)ckd_calloc_2d(n_words, set->n_models, sizeof(**set->widmap)); + hash_table_empty(base->wid); + for (i = 0; i < n_words; ++i) { + int32 j; + base->word_str[i] = ckd_salloc(words[i]); + (void)hash_table_enter_int32(base->wid, base->word_str[i], i); + for (j = 0; j < set->n_models; ++j) { + set->widmap[i][j] = ngram_wid(set->lms[j], base->word_str[i]); + } + } +} + +static int +ngram_model_set_apply_weights(ngram_model_t *base, float32 lw, + float32 wip, float32 uw) +{ + ngram_model_set_t *set = (ngram_model_set_t *)base; + int32 i; + + /* Apply weights to each sub-model. */ + for (i = 0; i < set->n_models; ++i) + ngram_model_apply_weights(set->lms[i], lw, wip, uw); + return 0; +} + +static int32 +ngram_model_set_score(ngram_model_t *base, int32 wid, + int32 *history, int32 n_hist, + int32 *n_used) +{ + ngram_model_set_t *set = (ngram_model_set_t *)base; + int32 mapwid; + int32 score; + int32 i; + + /* Truncate the history. */ + if (n_hist > base->n - 1) + n_hist = base->n - 1; + + /* Interpolate if there is no current. 
*/ + if (set->cur == -1) { + score = base->log_zero; + for (i = 0; i < set->n_models; ++i) { + int32 j; + /* Map word and history IDs for each model. */ + mapwid = set->widmap[wid][i]; + for (j = 0; j < n_hist; ++j) { + if (history[j] == NGRAM_INVALID_WID) + set->maphist[j] = NGRAM_INVALID_WID; + else + set->maphist[j] = set->widmap[history[j]][i]; + } + score = logmath_add(base->lmath, score, + set->lweights[i] + + ngram_ng_score(set->lms[i], + mapwid, set->maphist, n_hist, n_used)); + } + } + else { + int32 j; + /* Map word and history IDs (FIXME: do this in a function?) */ + mapwid = set->widmap[wid][set->cur]; + for (j = 0; j < n_hist; ++j) { + if (history[j] == NGRAM_INVALID_WID) + set->maphist[j] = NGRAM_INVALID_WID; + else + set->maphist[j] = set->widmap[history[j]][set->cur]; + } + score = ngram_ng_score(set->lms[set->cur], + mapwid, set->maphist, n_hist, n_used); + } + + return score; +} + +static int32 +ngram_model_set_raw_score(ngram_model_t *base, int32 wid, + int32 *history, int32 n_hist, + int32 *n_used) +{ + ngram_model_set_t *set = (ngram_model_set_t *)base; + int32 mapwid; + int32 score; + int32 i; + + /* Truncate the history. */ + if (n_hist > base->n - 1) + n_hist = base->n - 1; + + /* Interpolate if there is no current. */ + if (set->cur == -1) { + score = base->log_zero; + for (i = 0; i < set->n_models; ++i) { + int32 j; + /* Map word and history IDs for each model. */ + mapwid = set->widmap[wid][i]; + for (j = 0; j < n_hist; ++j) { + if (history[j] == NGRAM_INVALID_WID) + set->maphist[j] = NGRAM_INVALID_WID; + else + set->maphist[j] = set->widmap[history[j]][i]; + } + score = logmath_add(base->lmath, score, + set->lweights[i] + + ngram_ng_prob(set->lms[i], + mapwid, set->maphist, n_hist, n_used)); + } + } + else { + int32 j; + /* Map word and history IDs (FIXME: do this in a function?) 
*/ + mapwid = set->widmap[wid][set->cur]; + for (j = 0; j < n_hist; ++j) { + if (history[j] == NGRAM_INVALID_WID) + set->maphist[j] = NGRAM_INVALID_WID; + else + set->maphist[j] = set->widmap[history[j]][set->cur]; + } + score = ngram_ng_prob(set->lms[set->cur], + mapwid, set->maphist, n_hist, n_used); + } + + return score; +} + +static int32 +ngram_model_set_add_ug(ngram_model_t *base, + int32 wid, int32 lweight) +{ + ngram_model_set_t *set = (ngram_model_set_t *)base; + int32 *newwid; + int32 i, prob; + + /* At this point the word has already been added to the master + model and we have a new word ID for it. Add it to active + submodels and track the word IDs. */ + newwid = ckd_calloc(set->n_models, sizeof(*newwid)); + prob = base->log_zero; + for (i = 0; i < set->n_models; ++i) { + int32 wprob, n_hist; + + /* Only add to active models. */ + if (set->cur == -1 || set->cur == i) { + /* Did this word already exist? */ + newwid[i] = ngram_wid(set->lms[i], base->word_str[wid]); + if (newwid[i] == NGRAM_INVALID_WID) { + /* Add it to the submodel. */ + newwid[i] = ngram_model_add_word(set->lms[i], base->word_str[wid], + logmath_exp(base->lmath, lweight)); + if (newwid[i] == NGRAM_INVALID_WID) { + ckd_free(newwid); + return base->log_zero; + } + } + /* Now get the unigram probability for the new word and either + * interpolate it or use it (if this is the current model). */ + wprob = ngram_ng_prob(set->lms[i], newwid[i], NULL, 0, &n_hist); + if (set->cur == i) + prob = wprob; + else if (set->cur == -1) + prob = logmath_add(base->lmath, prob, set->lweights[i] + wprob); + } + else { + newwid[i] = NGRAM_INVALID_WID; + } + } + /* Okay we have the word IDs for this in all the submodels. Now + do some complicated memory mangling to add this to the + widmap. 
*/ + set->widmap = ckd_realloc(set->widmap, base->n_words * sizeof(*set->widmap)); + set->widmap[0] = ckd_realloc(set->widmap[0], + base->n_words + * set->n_models + * sizeof(**set->widmap)); + for (i = 0; i < base->n_words; ++i) + set->widmap[i] = set->widmap[0] + i * set->n_models; + memcpy(set->widmap[wid], newwid, set->n_models * sizeof(*newwid)); + ckd_free(newwid); + return prob; +} + +static void +ngram_model_set_free(ngram_model_t *base) +{ + ngram_model_set_t *set = (ngram_model_set_t *)base; + int32 i; + + for (i = 0; i < set->n_models; ++i) + ngram_model_free(set->lms[i]); + ckd_free(set->lms); + for (i = 0; i < set->n_models; ++i) + ckd_free(set->names[i]); + ckd_free(set->names); + ckd_free(set->lweights); + ckd_free(set->maphist); + ckd_free_2d((void **)set->widmap); +} + +static void +ngram_model_set_flush(ngram_model_t *base) +{ + ngram_model_set_t *set = (ngram_model_set_t *)base; + int32 i; + + for (i = 0; i < set->n_models; ++i) + ngram_model_flush(set->lms[i]); +} + +static ngram_funcs_t ngram_model_set_funcs = { + ngram_model_set_free, /* free */ + ngram_model_set_apply_weights, /* apply_weights */ + ngram_model_set_score, /* score */ + ngram_model_set_raw_score, /* raw_score */ + ngram_model_set_add_ug, /* add_ug */ + ngram_model_set_flush /* flush */ +}; diff --git a/media/sphinxbase/src/libsphinxbase/lm/ngram_model_set.h b/media/sphinxbase/src/libsphinxbase/lm/ngram_model_set.h new file mode 100644 index 000000000..5fbc7e5a4 --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/lm/ngram_model_set.h @@ -0,0 +1,71 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 1999-2007 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ +/** + * @file ngram_model_set.h Set of language models. + * @author David Huggins-Daines <dhuggins@cs.cmu.edu> + */ + +#ifndef __NGRAM_MODEL_SET_H__ +#define __NGRAM_MODEL_SET_H__ + +#include "ngram_model_internal.h" +#include "lm3g_model.h" + +/** + * Subclass of ngram_model for grouping language models. + */ +typedef struct ngram_model_set_s { + ngram_model_t base; /**< Base ngram_model_t structure. */ + + int32 n_models; /**< Number of models in this set. */ + int32 cur; /**< Currently selected model, or -1 for none. 
*/ + ngram_model_t **lms; /**< Language models in this set. */ + char **names; /**< Names for language models. */ + int32 *lweights; /**< Log interpolation weights. */ + int32 **widmap; /**< Word ID mapping for submodels. */ + int32 *maphist; /**< Word ID mapping for N-Gram history. */ +} ngram_model_set_t; + +/** + * Iterator over a model set. + */ +struct ngram_model_set_iter_s { + ngram_model_set_t *set; + int32 cur; +}; + +#endif /* __NGRAM_MODEL_SET_H__ */ |