diff options
Diffstat (limited to 'media/pocketsphinx/src/ngram_search_fwdtree.c')
-rw-r--r-- | media/pocketsphinx/src/ngram_search_fwdtree.c | 1573 |
1 files changed, 1573 insertions, 0 deletions
diff --git a/media/pocketsphinx/src/ngram_search_fwdtree.c b/media/pocketsphinx/src/ngram_search_fwdtree.c new file mode 100644 index 000000000..6665161f8 --- /dev/null +++ b/media/pocketsphinx/src/ngram_search_fwdtree.c @@ -0,0 +1,1573 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 2008 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ + +/** + * @file ngram_search_fwdtree.c Lexicon tree search. + */ + +/* System headers. */ +#include <string.h> +#include <assert.h> + +/* SphinxBase headers. */ +#include <sphinxbase/ckd_alloc.h> +#include <sphinxbase/listelem_alloc.h> +#include <sphinxbase/err.h> + +/* Local headers. */ +#include "ngram_search_fwdtree.h" +#include "phone_loop_search.h" + +/* Turn this on to dump channels for debugging */ +#define __CHAN_DUMP__ 0 +#if __CHAN_DUMP__ +#define chan_v_eval(chan) hmm_dump_vit_eval(&(chan)->hmm, stderr) +#else +#define chan_v_eval(chan) hmm_vit_eval(&(chan)->hmm) +#endif + +/* + * Allocate that part of the search channel tree structure that is independent of the + * LM in use. + */ +static void +init_search_tree(ngram_search_t *ngs) +{ + int32 w, ndiph, i, n_words, n_ci; + dict_t *dict = ps_search_dict(ngs); + bitvec_t *dimap; + + n_words = ps_search_n_words(ngs); + ngs->homophone_set = ckd_calloc(n_words, sizeof(*ngs->homophone_set)); + + /* Find #single phone words, and #unique first diphones (#root channels) in dict. */ + ndiph = 0; + ngs->n_1ph_words = 0; + n_ci = bin_mdef_n_ciphone(ps_search_acmod(ngs)->mdef); + /* Allocate a bitvector with flags for each possible diphone. */ + dimap = bitvec_alloc(n_ci * n_ci); + for (w = 0; w < n_words; w++) { + if (!dict_real_word(dict, w)) + continue; + if (dict_is_single_phone(dict, w)) + ++ngs->n_1ph_words; + else { + int ph0, ph1; + ph0 = dict_first_phone(dict, w); + ph1 = dict_second_phone(dict, w); + /* Increment ndiph the first time we see a diphone. */ + if (bitvec_is_clear(dimap, ph0 * n_ci + ph1)) { + bitvec_set(dimap, ph0 * n_ci + ph1); + ++ndiph; + } + } + } + E_INFO("%d unique initial diphones\n", ndiph); + bitvec_free(dimap); + + /* Add remaining dict words (</s>, <s>, <sil>, noise words) to single-phone words */ + ngs->n_1ph_words += dict_num_fillers(dict) + 2; + ngs->n_root_chan_alloc = ndiph + 1; + /* Verify that these are all *actually* single-phone words, + * otherwise really bad things will happen to us. */ + for (w = 0; w < n_words; ++w) { + if (dict_real_word(dict, w)) + continue; + if (!dict_is_single_phone(dict, w)) { + E_WARN("Filler word %d = %s has more than one phone, ignoring it.\n", + w, dict_wordstr(dict, w)); + --ngs->n_1ph_words; + } + } + + /* Allocate and initialize root channels */ + ngs->root_chan = + ckd_calloc(ngs->n_root_chan_alloc, sizeof(*ngs->root_chan)); + for (i = 0; i < ngs->n_root_chan_alloc; i++) { + hmm_init(ngs->hmmctx, &ngs->root_chan[i].hmm, TRUE, -1, -1); + ngs->root_chan[i].penult_phn_wid = -1; + ngs->root_chan[i].next = NULL; + } + + /* Permanently allocate and initialize channels for single-phone + * words (1/word). */ + ngs->rhmm_1ph = ckd_calloc(ngs->n_1ph_words, sizeof(*ngs->rhmm_1ph)); + i = 0; + for (w = 0; w < n_words; w++) { + if (!dict_is_single_phone(dict, w)) + continue; + /* Use SIL as right context for these. */ + ngs->rhmm_1ph[i].ci2phone = bin_mdef_silphone(ps_search_acmod(ngs)->mdef); + ngs->rhmm_1ph[i].ciphone = dict_first_phone(dict, w); + hmm_init(ngs->hmmctx, &ngs->rhmm_1ph[i].hmm, TRUE, + bin_mdef_pid2ssid(ps_search_acmod(ngs)->mdef, ngs->rhmm_1ph[i].ciphone), + bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, ngs->rhmm_1ph[i].ciphone)); + ngs->rhmm_1ph[i].next = NULL; + + ngs->word_chan[w] = (chan_t *) &(ngs->rhmm_1ph[i]); + i++; + } + + ngs->single_phone_wid = ckd_calloc(ngs->n_1ph_words, + sizeof(*ngs->single_phone_wid)); + E_INFO("%d root, %d non-root channels, %d single-phone words\n", + ngs->n_root_chan, ngs->n_nonroot_chan, ngs->n_1ph_words); +} + +/* + * One-time initialization of internal channels in HMM tree. + */ +static void +init_nonroot_chan(ngram_search_t *ngs, chan_t * hmm, int32 ph, int32 ci, int32 tmatid) +{ + hmm->next = NULL; + hmm->alt = NULL; + hmm->info.penult_phn_wid = -1; + hmm->ciphone = ci; + hmm_init(ngs->hmmctx, &hmm->hmm, FALSE, ph, tmatid); +} + +/* + * Allocate and initialize search channel-tree structure. + * At this point, all the root-channels have been allocated and partly initialized + * (as per init_search_tree()), and channels for all the single-phone words have been + * allocated and initialized. None of the interior channels of search-trees have + * been allocated. + * This routine may be called on every utterance, after reinit_search_tree() clears + * the search tree created for the previous utterance. Meant for reconfiguring the + * search tree to suit the currently active LM. + */ +static void +create_search_tree(ngram_search_t *ngs) +{ + chan_t *hmm; + root_chan_t *rhmm; + int32 w, i, j, p, ph, tmatid; + int32 n_words; + dict_t *dict = ps_search_dict(ngs); + dict2pid_t *d2p = ps_search_dict2pid(ngs); + + n_words = ps_search_n_words(ngs); + + E_INFO("Creating search tree\n"); + + for (w = 0; w < n_words; w++) + ngs->homophone_set[w] = -1; + + E_INFO("before: %d root, %d non-root channels, %d single-phone words\n", + ngs->n_root_chan, ngs->n_nonroot_chan, ngs->n_1ph_words); + + ngs->n_1ph_LMwords = 0; + ngs->n_root_chan = 0; + ngs->n_nonroot_chan = 0; + + for (w = 0; w < n_words; w++) { + int ciphone, ci2phone; + + /* Ignore dictionary words not in LM */ + if (!ngram_model_set_known_wid(ngs->lmset, dict_basewid(dict, w))) + continue; + + /* Handle single-phone words individually; not in channel tree */ + if (dict_is_single_phone(dict, w)) { + E_DEBUG(1,("single_phone_wid[%d] = %s\n", + ngs->n_1ph_LMwords, dict_wordstr(dict, w))); + ngs->single_phone_wid[ngs->n_1ph_LMwords++] = w; + continue; + } + + /* Find a root channel matching the initial diphone, or + * allocate one if not found. */ + ciphone = dict_first_phone(dict, w); + ci2phone = dict_second_phone(dict, w); + for (i = 0; i < ngs->n_root_chan; ++i) { + if (ngs->root_chan[i].ciphone == ciphone + && ngs->root_chan[i].ci2phone == ci2phone) + break; + } + if (i == ngs->n_root_chan) { + rhmm = &(ngs->root_chan[ngs->n_root_chan]); + rhmm->hmm.tmatid = bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, ciphone); + /* Begin with CI phone? Not sure this makes a difference... */ + hmm_mpx_ssid(&rhmm->hmm, 0) = + bin_mdef_pid2ssid(ps_search_acmod(ngs)->mdef, ciphone); + rhmm->ciphone = ciphone; + rhmm->ci2phone = ci2phone; + ngs->n_root_chan++; + } + else + rhmm = &(ngs->root_chan[i]); + + E_DEBUG(3,("word %s rhmm %d\n", dict_wordstr(dict, w), rhmm - ngs->root_chan)); + /* Now, rhmm = root channel for w. Go on to remaining phones */ + if (dict_pronlen(dict, w) == 2) { + /* Next phone is the last; not kept in tree; add w to penult_phn_wid set */ + if ((j = rhmm->penult_phn_wid) < 0) + rhmm->penult_phn_wid = w; + else { + for (; ngs->homophone_set[j] >= 0; j = ngs->homophone_set[j]); + ngs->homophone_set[j] = w; + } + } + else { + /* Add remaining phones, except the last, to tree */ + ph = dict2pid_internal(d2p, w, 1); + tmatid = bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, dict_pron(dict, w, 1)); + hmm = rhmm->next; + if (hmm == NULL) { + rhmm->next = hmm = listelem_malloc(ngs->chan_alloc); + init_nonroot_chan(ngs, hmm, ph, dict_pron(dict, w, 1), tmatid); + ngs->n_nonroot_chan++; + } + else { + chan_t *prev_hmm = NULL; + + for (; hmm && (hmm_nonmpx_ssid(&hmm->hmm) != ph); hmm = hmm->alt) + prev_hmm = hmm; + if (!hmm) { /* thanks, rkm! */ + prev_hmm->alt = hmm = listelem_malloc(ngs->chan_alloc); + init_nonroot_chan(ngs, hmm, ph, dict_pron(dict, w, 1), tmatid); + ngs->n_nonroot_chan++; + } + } + E_DEBUG(3,("phone %s = %d\n", + bin_mdef_ciphone_str(ps_search_acmod(ngs)->mdef, + dict_second_phone(dict, w)), ph)); + for (p = 2; p < dict_pronlen(dict, w) - 1; p++) { + ph = dict2pid_internal(d2p, w, p); + tmatid = bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, dict_pron(dict, w, p)); + if (!hmm->next) { + hmm->next = listelem_malloc(ngs->chan_alloc); + hmm = hmm->next; + init_nonroot_chan(ngs, hmm, ph, dict_pron(dict, w, p), tmatid); + ngs->n_nonroot_chan++; + } + else { + chan_t *prev_hmm = NULL; + + for (hmm = hmm->next; hmm && (hmm_nonmpx_ssid(&hmm->hmm) != ph); + hmm = hmm->alt) + prev_hmm = hmm; + if (!hmm) { /* thanks, rkm! */ + prev_hmm->alt = hmm = listelem_malloc(ngs->chan_alloc); + init_nonroot_chan(ngs, hmm, ph, dict_pron(dict, w, p), tmatid); + ngs->n_nonroot_chan++; + } + } + E_DEBUG(3,("phone %s = %d\n", + bin_mdef_ciphone_str(ps_search_acmod(ngs)->mdef, + dict_pron(dict, w, p)), ph)); + } + + /* All but last phone of w in tree; add w to hmm->info.penult_phn_wid set */ + if ((j = hmm->info.penult_phn_wid) < 0) + hmm->info.penult_phn_wid = w; + else { + for (; ngs->homophone_set[j] >= 0; j = ngs->homophone_set[j]); + ngs->homophone_set[j] = w; + } + } + } + + ngs->n_1ph_words = ngs->n_1ph_LMwords; + + /* Add filler words to the array of 1ph words. */ + for (w = 0; w < n_words; ++w) { + /* Skip anything that doesn't actually have a single phone. */ + if (!dict_is_single_phone(dict, w)) + continue; + /* Also skip "real words" and things that are in the LM. */ + if (dict_real_word(dict, w)) + continue; + if (ngram_model_set_known_wid(ngs->lmset, dict_basewid(dict, w))) + continue; + E_DEBUG(1,("single_phone_wid[%d] = %s\n", + ngs->n_1ph_words, dict_wordstr(dict, w))); + ngs->single_phone_wid[ngs->n_1ph_words++] = w; + } + + if (ngs->n_nonroot_chan >= ngs->max_nonroot_chan) { + /* Give some room for channels for new words added dynamically at run time */ + ngs->max_nonroot_chan = ngs->n_nonroot_chan + 128; + E_INFO("after: max nonroot chan increased to %d\n", ngs->max_nonroot_chan); + + /* Free old active channel list array if any and allocate new one */ + if (ngs->active_chan_list) + ckd_free_2d(ngs->active_chan_list); + ngs->active_chan_list = ckd_calloc_2d(2, ngs->max_nonroot_chan, + sizeof(**ngs->active_chan_list)); + } + + if (!ngs->n_root_chan) + E_ERROR("No word from the language model has pronunciation in the dictionary\n"); + + E_INFO("after: %d root, %d non-root channels, %d single-phone words\n", + ngs->n_root_chan, ngs->n_nonroot_chan, ngs->n_1ph_words); +} + +static void +reinit_search_subtree(ngram_search_t *ngs, chan_t * hmm) +{ + chan_t *child, *sibling; + + /* First free all children under hmm */ + for (child = hmm->next; child; child = sibling) { + sibling = child->alt; + reinit_search_subtree(ngs, child); + } + + /* Now free hmm */ + hmm_deinit(&hmm->hmm); + listelem_free(ngs->chan_alloc, hmm); +} + +/* + * Delete search tree by freeing all interior channels within search tree and + * restoring root channel state to the init state (i.e., just after init_search_tree()). + */ +static void +reinit_search_tree(ngram_search_t *ngs) +{ + int32 i; + chan_t *hmm, *sibling; + + for (i = 0; i < ngs->n_root_chan; i++) { + hmm = ngs->root_chan[i].next; + + while (hmm) { + sibling = hmm->alt; + reinit_search_subtree(ngs, hmm); + hmm = sibling; + } + + ngs->root_chan[i].penult_phn_wid = -1; + ngs->root_chan[i].next = NULL; + } + ngs->n_nonroot_chan = 0; +} + +void +ngram_fwdtree_init(ngram_search_t *ngs) +{ + /* Allocate bestbp_rc, lastphn_cand, last_ltrans */ + ngs->bestbp_rc = ckd_calloc(bin_mdef_n_ciphone(ps_search_acmod(ngs)->mdef), + sizeof(*ngs->bestbp_rc)); + ngs->lastphn_cand = ckd_calloc(ps_search_n_words(ngs), + sizeof(*ngs->lastphn_cand)); + init_search_tree(ngs); + create_search_tree(ngs); +} + +static void +deinit_search_tree(ngram_search_t *ngs) +{ + int i, w, n_words; + + n_words = ps_search_n_words(ngs); + for (i = 0; i < ngs->n_root_chan_alloc; i++) { + hmm_deinit(&ngs->root_chan[i].hmm); + } + if (ngs->rhmm_1ph) { + for (i = w = 0; w < n_words; ++w) { + if (!dict_is_single_phone(ps_search_dict(ngs), w)) + continue; + hmm_deinit(&ngs->rhmm_1ph[i].hmm); + ++i; + } + ckd_free(ngs->rhmm_1ph); + ngs->rhmm_1ph = NULL; + } + ngs->n_root_chan = 0; + ngs->n_root_chan_alloc = 0; + ckd_free(ngs->root_chan); + ngs->root_chan = NULL; + ckd_free(ngs->single_phone_wid); + ngs->single_phone_wid = NULL; + ckd_free(ngs->homophone_set); + ngs->homophone_set = NULL; +} + +void +ngram_fwdtree_deinit(ngram_search_t *ngs) +{ + double n_speech = (double)ngs->n_tot_frame + / cmd_ln_int32_r(ps_search_config(ngs), "-frate"); + + E_INFO("TOTAL fwdtree %.2f CPU %.3f xRT\n", + ngs->fwdtree_perf.t_tot_cpu, + ngs->fwdtree_perf.t_tot_cpu / n_speech); + E_INFO("TOTAL fwdtree %.2f wall %.3f xRT\n", + ngs->fwdtree_perf.t_tot_elapsed, + ngs->fwdtree_perf.t_tot_elapsed / n_speech); + + /* Reset non-root channels. */ + reinit_search_tree(ngs); + /* Free the search tree. */ + deinit_search_tree(ngs); + /* Free other stuff. */ + ngs->max_nonroot_chan = 0; + ckd_free_2d(ngs->active_chan_list); + ngs->active_chan_list = NULL; + ckd_free(ngs->cand_sf); + ngs->cand_sf = NULL; + ckd_free(ngs->bestbp_rc); + ngs->bestbp_rc = NULL; + ckd_free(ngs->lastphn_cand); + ngs->lastphn_cand = NULL; +} + +int +ngram_fwdtree_reinit(ngram_search_t *ngs) +{ + /* Reset non-root channels. */ + reinit_search_tree(ngs); + /* Free the search tree. */ + deinit_search_tree(ngs); + /* Reallocate things that depend on the number of words. */ + ckd_free(ngs->lastphn_cand); + ngs->lastphn_cand = ckd_calloc(ps_search_n_words(ngs), + sizeof(*ngs->lastphn_cand)); + ckd_free(ngs->word_chan); + ngs->word_chan = ckd_calloc(ps_search_n_words(ngs), + sizeof(*ngs->word_chan)); + /* Rebuild the search tree. */ + init_search_tree(ngs); + create_search_tree(ngs); + return 0; +} + +void +ngram_fwdtree_start(ngram_search_t *ngs) +{ + ps_search_t *base = (ps_search_t *)ngs; + int32 i, w, n_words; + root_chan_t *rhmm; + + n_words = ps_search_n_words(ngs); + + /* Reset utterance statistics. */ + memset(&ngs->st, 0, sizeof(ngs->st)); + ptmr_reset(&ngs->fwdtree_perf); + ptmr_start(&ngs->fwdtree_perf); + + /* Reset backpointer table. */ + ngs->bpidx = 0; + ngs->bss_head = 0; + + /* Reset word lattice. */ + for (i = 0; i < n_words; ++i) + ngs->word_lat_idx[i] = NO_BP; + + /* Reset active HMM and word lists. */ + ngs->n_active_chan[0] = ngs->n_active_chan[1] = 0; + ngs->n_active_word[0] = ngs->n_active_word[1] = 0; + + /* Reset scores. */ + ngs->best_score = 0; + ngs->renormalized = 0; + + /* Reset other stuff. */ + for (i = 0; i < n_words; i++) + ngs->last_ltrans[i].sf = -1; + ngs->n_frame = 0; + + /* Clear the hypothesis string. */ + ckd_free(base->hyp_str); + base->hyp_str = NULL; + + /* Reset the permanently allocated single-phone words, since they + * may have junk left over in them from FWDFLAT. */ + for (i = 0; i < ngs->n_1ph_words; i++) { + w = ngs->single_phone_wid[i]; + rhmm = (root_chan_t *) ngs->word_chan[w]; + hmm_clear(&rhmm->hmm); + } + + /* Start search with <s>; word_chan[<s>] is permanently allocated */ + rhmm = (root_chan_t *) ngs->word_chan[dict_startwid(ps_search_dict(ngs))]; + hmm_clear(&rhmm->hmm); + hmm_enter(&rhmm->hmm, 0, NO_BP, 0); +} + +/* + * Mark the active senones for all senones belonging to channels that are active in the + * current frame. + */ +static void +compute_sen_active(ngram_search_t *ngs, int frame_idx) +{ + root_chan_t *rhmm; + chan_t *hmm, **acl; + int32 i, w, *awl; + + acmod_clear_active(ps_search_acmod(ngs)); + + /* Flag active senones for root channels */ + for (i = ngs->n_root_chan, rhmm = ngs->root_chan; i > 0; --i, rhmm++) { + if (hmm_frame(&rhmm->hmm) == frame_idx) + acmod_activate_hmm(ps_search_acmod(ngs), &rhmm->hmm); + } + + /* Flag active senones for nonroot channels in HMM tree */ + i = ngs->n_active_chan[frame_idx & 0x1]; + acl = ngs->active_chan_list[frame_idx & 0x1]; + for (hmm = *(acl++); i > 0; --i, hmm = *(acl++)) { + acmod_activate_hmm(ps_search_acmod(ngs), &hmm->hmm); + } + + /* Flag active senones for individual word channels */ + i = ngs->n_active_word[frame_idx & 0x1]; + awl = ngs->active_word_list[frame_idx & 0x1]; + for (w = *(awl++); i > 0; --i, w = *(awl++)) { + for (hmm = ngs->word_chan[w]; hmm; hmm = hmm->next) { + acmod_activate_hmm(ps_search_acmod(ngs), &hmm->hmm); + } + } + for (i = 0; i < ngs->n_1ph_words; i++) { + w = ngs->single_phone_wid[i]; + rhmm = (root_chan_t *) ngs->word_chan[w]; + + if (hmm_frame(&rhmm->hmm) == frame_idx) + acmod_activate_hmm(ps_search_acmod(ngs), &rhmm->hmm); + } +} + +static void +renormalize_scores(ngram_search_t *ngs, int frame_idx, int32 norm) +{ + root_chan_t *rhmm; + chan_t *hmm, **acl; + int32 i, w, *awl; + + /* Renormalize root channels */ + for (i = ngs->n_root_chan, rhmm = ngs->root_chan; i > 0; --i, rhmm++) { + if (hmm_frame(&rhmm->hmm) == frame_idx) { + hmm_normalize(&rhmm->hmm, norm); + } + } + + /* Renormalize nonroot channels in HMM tree */ + i = ngs->n_active_chan[frame_idx & 0x1]; + acl = ngs->active_chan_list[frame_idx & 0x1]; + for (hmm = *(acl++); i > 0; --i, hmm = *(acl++)) { + hmm_normalize(&hmm->hmm, norm); + } + + /* Renormalize individual word channels */ + i = ngs->n_active_word[frame_idx & 0x1]; + awl = ngs->active_word_list[frame_idx & 0x1]; + for (w = *(awl++); i > 0; --i, w = *(awl++)) { + for (hmm = ngs->word_chan[w]; hmm; hmm = hmm->next) { + hmm_normalize(&hmm->hmm, norm); + } + } + for (i = 0; i < ngs->n_1ph_words; i++) { + w = ngs->single_phone_wid[i]; + rhmm = (root_chan_t *) ngs->word_chan[w]; + if (hmm_frame(&rhmm->hmm) == frame_idx) { + hmm_normalize(&rhmm->hmm, norm); + } + } + + ngs->renormalized = TRUE; +} + +static int32 +eval_root_chan(ngram_search_t *ngs, int frame_idx) +{ + root_chan_t *rhmm; + int32 i, bestscore; + + bestscore = WORST_SCORE; + for (i = ngs->n_root_chan, rhmm = ngs->root_chan; i > 0; --i, rhmm++) { + if (hmm_frame(&rhmm->hmm) == frame_idx) { + int32 score = chan_v_eval(rhmm); + if (score BETTER_THAN bestscore) + bestscore = score; + ++ngs->st.n_root_chan_eval; + } + } + return (bestscore); +} + +static int32 +eval_nonroot_chan(ngram_search_t *ngs, int frame_idx) +{ + chan_t *hmm, **acl; + int32 i, bestscore; + + i = ngs->n_active_chan[frame_idx & 0x1]; + acl = ngs->active_chan_list[frame_idx & 0x1]; + bestscore = WORST_SCORE; + ngs->st.n_nonroot_chan_eval += i; + + for (hmm = *(acl++); i > 0; --i, hmm = *(acl++)) { + int32 score = chan_v_eval(hmm); + assert(hmm_frame(&hmm->hmm) == frame_idx); + if (score BETTER_THAN bestscore) + bestscore = score; + } + + return bestscore; +} + +static int32 +eval_word_chan(ngram_search_t *ngs, int frame_idx) +{ + root_chan_t *rhmm; + chan_t *hmm; + int32 i, w, bestscore, *awl, j, k; + + k = 0; + bestscore = WORST_SCORE; + awl = ngs->active_word_list[frame_idx & 0x1]; + + i = ngs->n_active_word[frame_idx & 0x1]; + for (w = *(awl++); i > 0; --i, w = *(awl++)) { + assert(bitvec_is_set(ngs->word_active, w)); + bitvec_clear(ngs->word_active, w); + assert(ngs->word_chan[w] != NULL); + + for (hmm = ngs->word_chan[w]; hmm; hmm = hmm->next) { + int32 score; + + assert(hmm_frame(&hmm->hmm) == frame_idx); + score = chan_v_eval(hmm); + /*printf("eval word chan %d score %d\n", w, score); */ + + if (score BETTER_THAN bestscore) + bestscore = score; + + k++; + } + } + + /* Similarly for statically allocated single-phone words */ + j = 0; + for (i = 0; i < ngs->n_1ph_words; i++) { + int32 score; + + w = ngs->single_phone_wid[i]; + rhmm = (root_chan_t *) ngs->word_chan[w]; + if (hmm_frame(&rhmm->hmm) < frame_idx) + continue; + + score = chan_v_eval(rhmm); + /* printf("eval 1ph word chan %d score %d\n", w, score); */ + if (score BETTER_THAN bestscore && w != ps_search_finish_wid(ngs)) + bestscore = score; + + j++; + } + + ngs->st.n_last_chan_eval += k + j; + ngs->st.n_nonroot_chan_eval += k + j; + ngs->st.n_word_lastchan_eval += + ngs->n_active_word[frame_idx & 0x1] + j; + + return bestscore; +} + +static int32 +evaluate_channels(ngram_search_t *ngs, int16 const *senone_scores, int frame_idx) +{ + int32 bs; + + hmm_context_set_senscore(ngs->hmmctx, senone_scores); + ngs->best_score = eval_root_chan(ngs, frame_idx); + if ((bs = eval_nonroot_chan(ngs, frame_idx)) BETTER_THAN ngs->best_score) + ngs->best_score = bs; + if ((bs = eval_word_chan(ngs, frame_idx)) BETTER_THAN ngs->best_score) + ngs->best_score = bs; + ngs->last_phone_best_score = bs; + + return ngs->best_score; +} + +/* + * Prune currently active root channels for next frame. Also, perform exit + * transitions out of them and activate successors. + * score[] of pruned root chan set to WORST_SCORE elsewhere. + */ +static void +prune_root_chan(ngram_search_t *ngs, int frame_idx) +{ + root_chan_t *rhmm; + chan_t *hmm; + int32 i, nf, w; + int32 thresh, newphone_thresh, lastphn_thresh, newphone_score; + chan_t **nacl; /* next active list */ + lastphn_cand_t *candp; + phone_loop_search_t *pls; + + nf = frame_idx + 1; + thresh = ngs->best_score + ngs->dynamic_beam; + newphone_thresh = ngs->best_score + ngs->pbeam; + lastphn_thresh = ngs->best_score + ngs->lpbeam; + nacl = ngs->active_chan_list[nf & 0x1]; + pls = (phone_loop_search_t *)ps_search_lookahead(ngs); + + for (i = 0, rhmm = ngs->root_chan; i < ngs->n_root_chan; i++, rhmm++) { + E_DEBUG(3,("Root channel %d frame %d score %d thresh %d\n", + i, hmm_frame(&rhmm->hmm), hmm_bestscore(&rhmm->hmm), thresh)); + /* First check if this channel was active in current frame */ + if (hmm_frame(&rhmm->hmm) < frame_idx) + continue; + + if (hmm_bestscore(&rhmm->hmm) BETTER_THAN thresh) { + hmm_frame(&rhmm->hmm) = nf; /* rhmm will be active in next frame */ + E_DEBUG(3,("Preserving root channel %d score %d\n", i, hmm_bestscore(&rhmm->hmm))); + /* transitions out of this root channel */ + /* transition to all next-level channels in the HMM tree */ + newphone_score = hmm_out_score(&rhmm->hmm) + ngs->pip; + if (pls != NULL || newphone_score BETTER_THAN newphone_thresh) { + for (hmm = rhmm->next; hmm; hmm = hmm->alt) { + int32 pl_newphone_score = newphone_score + + phone_loop_search_score(pls, hmm->ciphone); + if (pl_newphone_score BETTER_THAN newphone_thresh) { + if ((hmm_frame(&hmm->hmm) < frame_idx) + || (newphone_score BETTER_THAN hmm_in_score(&hmm->hmm))) { + hmm_enter(&hmm->hmm, newphone_score, + hmm_out_history(&rhmm->hmm), nf); + *(nacl++) = hmm; + } + } + } + } + + /* + * Transition to last phone of all words for which this is the + * penultimate phone (the last phones may need multiple right contexts). + * Remember to remove the temporary newword_penalty. + */ + if (pls != NULL || newphone_score BETTER_THAN lastphn_thresh) { + for (w = rhmm->penult_phn_wid; w >= 0; + w = ngs->homophone_set[w]) { + int32 pl_newphone_score = newphone_score + + phone_loop_search_score + (pls, dict_last_phone(ps_search_dict(ngs),w)); + E_DEBUG(3,("word %s newphone_score %d\n", dict_wordstr(ps_search_dict(ngs), w), newphone_score)); + if (pl_newphone_score BETTER_THAN lastphn_thresh) { + candp = ngs->lastphn_cand + ngs->n_lastphn_cand; + ngs->n_lastphn_cand++; + candp->wid = w; + candp->score = + newphone_score - ngs->nwpen; + candp->bp = hmm_out_history(&rhmm->hmm); + } + } + } + } + } + ngs->n_active_chan[nf & 0x1] = (int)(nacl - ngs->active_chan_list[nf & 0x1]); +} + +/* + * Prune currently active nonroot channels in HMM tree for next frame. Also, perform + * exit transitions out of such channels and activate successors. + */ +static void +prune_nonroot_chan(ngram_search_t *ngs, int frame_idx) +{ + chan_t *hmm, *nexthmm; + int32 nf, w, i; + int32 thresh, newphone_thresh, lastphn_thresh, newphone_score; + chan_t **acl, **nacl; /* active list, next active list */ + lastphn_cand_t *candp; + phone_loop_search_t *pls; + + nf = frame_idx + 1; + + thresh = ngs->best_score + ngs->dynamic_beam; + newphone_thresh = ngs->best_score + ngs->pbeam; + lastphn_thresh = ngs->best_score + ngs->lpbeam; + pls = (phone_loop_search_t *)ps_search_lookahead(ngs); + + acl = ngs->active_chan_list[frame_idx & 0x1]; /* currently active HMMs in tree */ + nacl = ngs->active_chan_list[nf & 0x1] + ngs->n_active_chan[nf & 0x1]; + + for (i = ngs->n_active_chan[frame_idx & 0x1], hmm = *(acl++); i > 0; + --i, hmm = *(acl++)) { + assert(hmm_frame(&hmm->hmm) >= frame_idx); + + if (hmm_bestscore(&hmm->hmm) BETTER_THAN thresh) { + /* retain this channel in next frame */ + if (hmm_frame(&hmm->hmm) != nf) { + hmm_frame(&hmm->hmm) = nf; + *(nacl++) = hmm; + } + + /* transition to all next-level channel in the HMM tree */ + newphone_score = hmm_out_score(&hmm->hmm) + ngs->pip; + if (pls != NULL || newphone_score BETTER_THAN newphone_thresh) { + for (nexthmm = hmm->next; nexthmm; nexthmm = nexthmm->alt) { + int32 pl_newphone_score = newphone_score + + phone_loop_search_score(pls, nexthmm->ciphone); + if ((pl_newphone_score BETTER_THAN newphone_thresh) + && ((hmm_frame(&nexthmm->hmm) < frame_idx) + || (newphone_score + BETTER_THAN hmm_in_score(&nexthmm->hmm)))) { + if (hmm_frame(&nexthmm->hmm) != nf) { + /* Keep this HMM on the active list */ + *(nacl++) = nexthmm; + } + hmm_enter(&nexthmm->hmm, newphone_score, + hmm_out_history(&hmm->hmm), nf); + } + } + } + + /* + * Transition to last phone of all words for which this is the + * penultimate phone (the last phones may need multiple right contexts). + * Remember to remove the temporary newword_penalty. + */ + if (pls != NULL || newphone_score BETTER_THAN lastphn_thresh) { + for (w = hmm->info.penult_phn_wid; w >= 0; + w = ngs->homophone_set[w]) { + int32 pl_newphone_score = newphone_score + + phone_loop_search_score + (pls, dict_last_phone(ps_search_dict(ngs),w)); + if (pl_newphone_score BETTER_THAN lastphn_thresh) { + candp = ngs->lastphn_cand + ngs->n_lastphn_cand; + ngs->n_lastphn_cand++; + candp->wid = w; + candp->score = + newphone_score - ngs->nwpen; + candp->bp = hmm_out_history(&hmm->hmm); + } + } + } + } + else if (hmm_frame(&hmm->hmm) != nf) { + hmm_clear(&hmm->hmm); + } + } + ngs->n_active_chan[nf & 0x1] = (int)(nacl - ngs->active_chan_list[nf & 0x1]); +} + +/* + * Execute the transition into the last phone for all candidates words emerging from + * the HMM tree. Attach LM scores to such transitions. + * (Executed after pruning root and non-root, but before pruning word-chan.) + */ +static void +last_phone_transition(ngram_search_t *ngs, int frame_idx) +{ + int32 i, j, k, nf, bp, bpend, w; + lastphn_cand_t *candp; + int32 *nawl; + int32 thresh; + int32 bestscore, dscr; + chan_t *hmm; + bptbl_t *bpe; + int32 n_cand_sf = 0; + + nf = frame_idx + 1; + nawl = ngs->active_word_list[nf & 0x1]; + ngs->st.n_lastphn_cand_utt += ngs->n_lastphn_cand; + + /* For each candidate word (entering its last phone) */ + /* If best LM score and bp for candidate known use it, else sort cands by startfrm */ + for (i = 0, candp = ngs->lastphn_cand; i < ngs->n_lastphn_cand; i++, candp++) { + int32 start_score; + + /* This can happen if recognition fails. */ + if (candp->bp == -1) + continue; + /* Backpointer entry for it. */ + bpe = &(ngs->bp_table[candp->bp]); + + /* Subtract starting score for candidate, leave it with only word score */ + start_score = ngram_search_exit_score + (ngs, bpe, dict_first_phone(ps_search_dict(ngs), candp->wid)); + assert(start_score BETTER_THAN WORST_SCORE); + candp->score -= start_score; + + /* + * If this candidate not occurred in an earlier frame, prepare for finding + * best transition score into last phone; sort by start frame. + */ + /* i.e. if we don't have an entry in last_ltrans for this + * <word,sf>, then create one */ + if (ngs->last_ltrans[candp->wid].sf != bpe->frame + 1) { + /* Look for an entry in cand_sf matching the backpointer + * for this candidate. */ + for (j = 0; j < n_cand_sf; j++) { + if (ngs->cand_sf[j].bp_ef == bpe->frame) + break; + } + /* Oh, we found one, so chain onto it. */ + if (j < n_cand_sf) + candp->next = ngs->cand_sf[j].cand; + else { + /* Nope, let's make a new one, allocating cand_sf if necessary. */ + if (n_cand_sf >= ngs->cand_sf_alloc) { + if (ngs->cand_sf_alloc == 0) { + ngs->cand_sf = + ckd_calloc(CAND_SF_ALLOCSIZE, + sizeof(*ngs->cand_sf)); + ngs->cand_sf_alloc = CAND_SF_ALLOCSIZE; + } + else { + ngs->cand_sf_alloc += CAND_SF_ALLOCSIZE; + ngs->cand_sf = ckd_realloc(ngs->cand_sf, + ngs->cand_sf_alloc + * sizeof(*ngs->cand_sf)); + E_INFO("cand_sf[] increased to %d entries\n", + ngs->cand_sf_alloc); + } + } + + /* Use the newly created cand_sf. */ + j = n_cand_sf++; + candp->next = -1; /* End of the chain. */ + ngs->cand_sf[j].bp_ef = bpe->frame; + } + /* Update it to point to this candidate. */ + ngs->cand_sf[j].cand = i; + + ngs->last_ltrans[candp->wid].dscr = WORST_SCORE; + ngs->last_ltrans[candp->wid].sf = bpe->frame + 1; + } + } + + /* Compute best LM score and bp for new cands entered in the sorted lists above */ + for (i = 0; i < n_cand_sf; i++) { + /* For the i-th unique end frame... */ + bp = ngs->bp_table_idx[ngs->cand_sf[i].bp_ef]; + bpend = ngs->bp_table_idx[ngs->cand_sf[i].bp_ef + 1]; + for (bpe = &(ngs->bp_table[bp]); bp < bpend; bp++, bpe++) { + if (!bpe->valid) + continue; + /* For each candidate at the start frame find bp->cand transition-score */ + for (j = ngs->cand_sf[i].cand; j >= 0; j = candp->next) { + int32 n_used; + candp = &(ngs->lastphn_cand[j]); + dscr = + ngram_search_exit_score + (ngs, bpe, dict_first_phone(ps_search_dict(ngs), candp->wid)); + if (dscr BETTER_THAN WORST_SCORE) { + assert(!dict_filler_word(ps_search_dict(ngs), candp->wid)); + dscr += ngram_tg_score(ngs->lmset, + dict_basewid(ps_search_dict(ngs), candp->wid), + bpe->real_wid, + bpe->prev_real_wid, + &n_used)>>SENSCR_SHIFT; + } + + if (dscr BETTER_THAN ngs->last_ltrans[candp->wid].dscr) { + ngs->last_ltrans[candp->wid].dscr = dscr; + ngs->last_ltrans[candp->wid].bp = bp; + } + } + } + } + + /* Update best transitions for all candidates; also update best lastphone score */ + bestscore = ngs->last_phone_best_score; + for (i = 0, candp = ngs->lastphn_cand; i < ngs->n_lastphn_cand; i++, candp++) { + candp->score += ngs->last_ltrans[candp->wid].dscr; + candp->bp = ngs->last_ltrans[candp->wid].bp; + + if (candp->score BETTER_THAN bestscore) + bestscore = candp->score; + } + ngs->last_phone_best_score = bestscore; + + /* At this pt, we know the best entry score (with LM component) for all candidates */ + thresh = bestscore + ngs->lponlybeam; + for (i = ngs->n_lastphn_cand, candp = ngs->lastphn_cand; i > 0; --i, candp++) { + if (candp->score BETTER_THAN thresh) { + w = candp->wid; + + ngram_search_alloc_all_rc(ngs, w); + + k = 0; + for (hmm = ngs->word_chan[w]; hmm; hmm = hmm->next) { + if ((hmm_frame(&hmm->hmm) < frame_idx) + || (candp->score BETTER_THAN hmm_in_score(&hmm->hmm))) { + assert(hmm_frame(&hmm->hmm) != nf); + hmm_enter(&hmm->hmm, + candp->score, candp->bp, nf); + k++; + } + } + if (k > 0) { + assert(bitvec_is_clear(ngs->word_active, w)); + assert(!dict_is_single_phone(ps_search_dict(ngs), w)); + *(nawl++) = w; + bitvec_set(ngs->word_active, w); + } + } + } + ngs->n_active_word[nf & 0x1] = (int)(nawl - ngs->active_word_list[nf & 0x1]); +} + +/* + * Prune currently active word channels for next frame. Also, perform exit + * transitions out of such channels and active successors. + */ +static void +prune_word_chan(ngram_search_t *ngs, int frame_idx) +{ + root_chan_t *rhmm; + chan_t *hmm, *thmm; + chan_t **phmmp; /* previous HMM-pointer */ + int32 nf, w, i, k; + int32 newword_thresh, lastphn_thresh; + int32 *awl, *nawl; + + nf = frame_idx + 1; + newword_thresh = ngs->last_phone_best_score + ngs->wbeam; + lastphn_thresh = ngs->last_phone_best_score + ngs->lponlybeam; + + awl = ngs->active_word_list[frame_idx & 0x1]; + nawl = ngs->active_word_list[nf & 0x1] + ngs->n_active_word[nf & 0x1]; + + /* Dynamically allocated last channels of multi-phone words */ + for (i = ngs->n_active_word[frame_idx & 0x1], w = *(awl++); i > 0; + --i, w = *(awl++)) { + k = 0; + phmmp = &(ngs->word_chan[w]); + for (hmm = ngs->word_chan[w]; hmm; hmm = thmm) { + assert(hmm_frame(&hmm->hmm) >= frame_idx); + + thmm = hmm->next; + if (hmm_bestscore(&hmm->hmm) BETTER_THAN lastphn_thresh) { + /* retain this channel in next frame */ + hmm_frame(&hmm->hmm) = nf; + k++; + phmmp = &(hmm->next); + + /* Could if ((! skip_alt_frm) || (frame_idx & 0x1)) the following */ + if (hmm_out_score(&hmm->hmm) BETTER_THAN newword_thresh) { + /* can exit channel and recognize word */ + ngram_search_save_bp(ngs, frame_idx, w, + hmm_out_score(&hmm->hmm), + hmm_out_history(&hmm->hmm), + hmm->info.rc_id); + } + } + else if (hmm_frame(&hmm->hmm) == nf) { + phmmp = &(hmm->next); + } + else { + hmm_deinit(&hmm->hmm); + listelem_free(ngs->chan_alloc, hmm); + *phmmp = thmm; + } + } + if ((k > 0) && (bitvec_is_clear(ngs->word_active, w))) { + assert(!dict_is_single_phone(ps_search_dict(ngs), w)); + *(nawl++) = w; + bitvec_set(ngs->word_active, w); + } + } + ngs->n_active_word[nf & 0x1] = (int)(nawl - ngs->active_word_list[nf & 0x1]); + + /* + * Prune permanently allocated single-phone channels. + * NOTES: score[] of pruned channels set to WORST_SCORE elsewhere. + */ + for (i = 0; i < ngs->n_1ph_words; i++) { + w = ngs->single_phone_wid[i]; + rhmm = (root_chan_t *) ngs->word_chan[w]; + E_DEBUG(3,("Single phone word %s frame %d score %d thresh %d outscore %d nwthresh %d\n", + dict_wordstr(ps_search_dict(ngs),w), + hmm_frame(&rhmm->hmm), hmm_bestscore(&rhmm->hmm), + lastphn_thresh, hmm_out_score(&rhmm->hmm), newword_thresh)); + if (hmm_frame(&rhmm->hmm) < frame_idx) + continue; + if (hmm_bestscore(&rhmm->hmm) BETTER_THAN lastphn_thresh) { + hmm_frame(&rhmm->hmm) = nf; + + /* Could if ((! skip_alt_frm) || (frame_idx & 0x1)) the following */ + if (hmm_out_score(&rhmm->hmm) BETTER_THAN newword_thresh) { + E_DEBUG(4,("Exiting single phone word %s with %d > %d, %d\n", + dict_wordstr(ps_search_dict(ngs),w), + hmm_out_score(&rhmm->hmm), + lastphn_thresh, newword_thresh)); + ngram_search_save_bp(ngs, frame_idx, w, + hmm_out_score(&rhmm->hmm), + hmm_out_history(&rhmm->hmm), 0); + } + } + } +} + +static void +prune_channels(ngram_search_t *ngs, int frame_idx) +{ + /* Clear last phone candidate list. */ + ngs->n_lastphn_cand = 0; + /* Set the dynamic beam based on maxhmmpf here. */ + ngs->dynamic_beam = ngs->beam; + if (ngs->maxhmmpf != -1 + && ngs->st.n_root_chan_eval + ngs->st.n_nonroot_chan_eval > ngs->maxhmmpf) { + /* Build a histogram to approximately prune them. */ + int32 bins[256], bw, nhmms, i; + root_chan_t *rhmm; + chan_t **acl, *hmm; + + /* Bins go from zero (best score) to edge of beam. */ + bw = -ngs->beam / 256; + memset(bins, 0, sizeof(bins)); + /* For each active root channel. */ + for (i = 0, rhmm = ngs->root_chan; i < ngs->n_root_chan; i++, rhmm++) { + int32 b; + + /* Put it in a bin according to its bestscore. */ + b = (ngs->best_score - hmm_bestscore(&rhmm->hmm)) / bw; + if (b >= 256) + b = 255; + ++bins[b]; + } + /* For each active non-root channel. */ + acl = ngs->active_chan_list[frame_idx & 0x1]; /* currently active HMMs in tree */ + for (i = ngs->n_active_chan[frame_idx & 0x1], hmm = *(acl++); + i > 0; --i, hmm = *(acl++)) { + int32 b; + + /* Put it in a bin according to its bestscore. */ + b = (ngs->best_score - hmm_bestscore(&hmm->hmm)) / bw; + if (b >= 256) + b = 255; + ++bins[b]; + } + /* Walk down the bins to find the new beam. */ + for (i = nhmms = 0; i < 256; ++i) { + nhmms += bins[i]; + if (nhmms > ngs->maxhmmpf) + break; + } + ngs->dynamic_beam = -(i * bw); + } + + prune_root_chan(ngs, frame_idx); + prune_nonroot_chan(ngs, frame_idx); + last_phone_transition(ngs, frame_idx); + prune_word_chan(ngs, frame_idx); +} + +/* + * Limit the number of word exits in each frame to maxwpf. And also limit the number of filler + * words to 1. + */ +static void +bptable_maxwpf(ngram_search_t *ngs, int frame_idx) +{ + int32 bp, n; + int32 bestscr, worstscr; + bptbl_t *bpe, *bestbpe, *worstbpe; + + /* Don't prune if no pruing. */ + if (ngs->maxwpf == -1 || ngs->maxwpf == ps_search_n_words(ngs)) + return; + + /* Allow only one filler word exit (the best) per frame */ + bestscr = (int32) 0x80000000; + bestbpe = NULL; + n = 0; + for (bp = ngs->bp_table_idx[frame_idx]; bp < ngs->bpidx; bp++) { + bpe = &(ngs->bp_table[bp]); + if (dict_filler_word(ps_search_dict(ngs), bpe->wid)) { + if (bpe->score BETTER_THAN bestscr) { + bestscr = bpe->score; + bestbpe = bpe; + } + bpe->valid = FALSE; + n++; /* No. of filler words */ + } + } + /* Restore bestbpe to valid state */ + if (bestbpe != NULL) { + bestbpe->valid = TRUE; + --n; + } + + /* Allow up to maxwpf best entries to survive; mark the remaining with valid = 0 */ + n = (ngs->bpidx + - ngs->bp_table_idx[frame_idx]) - n; /* No. of entries after limiting fillers */ + for (; n > ngs->maxwpf; --n) { + /* Find worst BPTable entry */ + worstscr = (int32) 0x7fffffff; + worstbpe = NULL; + for (bp = ngs->bp_table_idx[frame_idx]; (bp < ngs->bpidx); bp++) { + bpe = &(ngs->bp_table[bp]); + if (bpe->valid && (bpe->score WORSE_THAN worstscr)) { + worstscr = bpe->score; + worstbpe = bpe; + } + } + /* FIXME: Don't panic! */ + if (worstbpe == NULL) + E_FATAL("PANIC: No worst BPtable entry remaining\n"); + worstbpe->valid = FALSE; + } +} + +static void +word_transition(ngram_search_t *ngs, int frame_idx) +{ + int32 i, k, bp, w, nf; + int32 rc; + int32 thresh, newscore, pl_newscore; + bptbl_t *bpe; + root_chan_t *rhmm; + struct bestbp_rc_s *bestbp_rc_ptr; + phone_loop_search_t *pls; + dict_t *dict = ps_search_dict(ngs); + dict2pid_t *d2p = ps_search_dict2pid(ngs); + + /* + * Transition to start of new word instances (HMM tree roots); but only if words + * other than </s> finished here. + * But, first, find the best starting score for each possible right context phone. + */ + for (i = bin_mdef_n_ciphone(ps_search_acmod(ngs)->mdef) - 1; i >= 0; --i) + ngs->bestbp_rc[i].score = WORST_SCORE; + k = 0; + pls = (phone_loop_search_t *)ps_search_lookahead(ngs); + /* Ugh, this is complicated. Scan all word exits for this frame + * (they have already been created by prune_word_chan()). */ + for (bp = ngs->bp_table_idx[frame_idx]; bp < ngs->bpidx; bp++) { + bpe = &(ngs->bp_table[bp]); + ngs->word_lat_idx[bpe->wid] = NO_BP; + + if (bpe->wid == ps_search_finish_wid(ngs)) + continue; + k++; + + /* DICT2PID */ + /* Array of HMM scores corresponding to all the possible right + * context expansions of the final phone. It's likely that a + * lot of these are going to be missing, actually. */ + if (bpe->last2_phone == -1) { /* implies s_idx == -1 */ + /* No right context expansion. */ + for (rc = 0; rc < bin_mdef_n_ciphone(ps_search_acmod(ngs)->mdef); ++rc) { + if (bpe->score BETTER_THAN ngs->bestbp_rc[rc].score) { + E_DEBUG(4,("bestbp_rc[0] = %d lc %d\n", + bpe->score, bpe->last_phone)); + ngs->bestbp_rc[rc].score = bpe->score; + ngs->bestbp_rc[rc].path = bp; + ngs->bestbp_rc[rc].lc = bpe->last_phone; + } + } + } + else { + xwdssid_t *rssid = dict2pid_rssid(d2p, bpe->last_phone, bpe->last2_phone); + int32 *rcss = &(ngs->bscore_stack[bpe->s_idx]); + for (rc = 0; rc < bin_mdef_n_ciphone(ps_search_acmod(ngs)->mdef); ++rc) { + if (rcss[rssid->cimap[rc]] BETTER_THAN ngs->bestbp_rc[rc].score) { + E_DEBUG(4,("bestbp_rc[%d] = %d lc %d\n", + rc, rcss[rssid->cimap[rc]], bpe->last_phone)); + ngs->bestbp_rc[rc].score = rcss[rssid->cimap[rc]]; + ngs->bestbp_rc[rc].path = bp; + ngs->bestbp_rc[rc].lc = bpe->last_phone; + } + } + } + } + if (k == 0) + return; + + nf = frame_idx + 1; + thresh = ngs->best_score + ngs->dynamic_beam; + /* + * Hypothesize successors to words finished in this frame. + * Main dictionary, multi-phone words transition to HMM-trees roots. + */ + for (i = ngs->n_root_chan, rhmm = ngs->root_chan; i > 0; --i, rhmm++) { + bestbp_rc_ptr = &(ngs->bestbp_rc[rhmm->ciphone]); + + newscore = bestbp_rc_ptr->score + ngs->nwpen + ngs->pip; + pl_newscore = newscore + + phone_loop_search_score(pls, rhmm->ciphone); + if (pl_newscore BETTER_THAN thresh) { + if ((hmm_frame(&rhmm->hmm) < frame_idx) + || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) { + hmm_enter(&rhmm->hmm, newscore, + bestbp_rc_ptr->path, nf); + /* DICT2PID: Another place where mpx ssids are entered. */ + /* Look up the ssid to use when entering this mpx triphone. */ + hmm_mpx_ssid(&rhmm->hmm, 0) = + dict2pid_ldiph_lc(d2p, rhmm->ciphone, rhmm->ci2phone, bestbp_rc_ptr->lc); + assert(hmm_mpx_ssid(&rhmm->hmm, 0) != BAD_SSID); + } + } + } + + /* + * Single phone words; no right context for these. Cannot use bestbp_rc as + * LM scores have to be included. First find best transition to these words. + */ + for (i = 0; i < ngs->n_1ph_LMwords; i++) { + w = ngs->single_phone_wid[i]; + ngs->last_ltrans[w].dscr = (int32) 0x80000000; + } + for (bp = ngs->bp_table_idx[frame_idx]; bp < ngs->bpidx; bp++) { + bpe = &(ngs->bp_table[bp]); + if (!bpe->valid) + continue; + + for (i = 0; i < ngs->n_1ph_LMwords; i++) { + int32 n_used; + w = ngs->single_phone_wid[i]; + newscore = ngram_search_exit_score + (ngs, bpe, dict_first_phone(dict, w)); + E_DEBUG(4, ("initial newscore for %s: %d\n", + dict_wordstr(dict, w), newscore)); + if (newscore != WORST_SCORE) + newscore += ngram_tg_score(ngs->lmset, + dict_basewid(dict, w), + bpe->real_wid, + bpe->prev_real_wid, + &n_used)>>SENSCR_SHIFT; + + /* FIXME: Not sure how WORST_SCORE could be better, but it + * apparently happens. */ + if (newscore BETTER_THAN ngs->last_ltrans[w].dscr) { + ngs->last_ltrans[w].dscr = newscore; + ngs->last_ltrans[w].bp = bp; + } + } + } + + /* Now transition to in-LM single phone words */ + for (i = 0; i < ngs->n_1ph_LMwords; i++) { + w = ngs->single_phone_wid[i]; + /* Never transition into the start word (for one thing, it is + a non-event in the language model.) */ + if (w == dict_startwid(ps_search_dict(ngs))) + continue; + rhmm = (root_chan_t *) ngs->word_chan[w]; + newscore = ngs->last_ltrans[w].dscr + ngs->pip; + pl_newscore = newscore + phone_loop_search_score(pls, rhmm->ciphone); + if (pl_newscore BETTER_THAN thresh) { + bpe = ngs->bp_table + ngs->last_ltrans[w].bp; + if ((hmm_frame(&rhmm->hmm) < frame_idx) + || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) { + hmm_enter(&rhmm->hmm, + newscore, ngs->last_ltrans[w].bp, nf); + /* DICT2PID: another place where mpx ssids are entered. */ + /* Look up the ssid to use when entering this mpx triphone. */ + hmm_mpx_ssid(&rhmm->hmm, 0) = + dict2pid_ldiph_lc(d2p, rhmm->ciphone, rhmm->ci2phone, + dict_last_phone(dict, bpe->wid)); + assert(hmm_mpx_ssid(&rhmm->hmm, 0) != BAD_SSID); + } + } + } + + /* Remaining words: <sil>, noise words. No mpx for these! */ + w = ps_search_silence_wid(ngs); + rhmm = (root_chan_t *) ngs->word_chan[w]; + bestbp_rc_ptr = &(ngs->bestbp_rc[ps_search_acmod(ngs)->mdef->sil]); + newscore = bestbp_rc_ptr->score + ngs->silpen + ngs->pip; + pl_newscore = newscore + + phone_loop_search_score(pls, rhmm->ciphone); + if (pl_newscore BETTER_THAN thresh) { + if ((hmm_frame(&rhmm->hmm) < frame_idx) + || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) { + hmm_enter(&rhmm->hmm, + newscore, bestbp_rc_ptr->path, nf); + } + } + for (w = dict_filler_start(dict); w <= dict_filler_end(dict); w++) { + if (w == ps_search_silence_wid(ngs)) + continue; + /* Never transition into the start word (for one thing, it is + a non-event in the language model.) */ + if (w == dict_startwid(ps_search_dict(ngs))) + continue; + rhmm = (root_chan_t *) ngs->word_chan[w]; + /* If this was not actually a single-phone word, rhmm will be NULL. */ + if (rhmm == NULL) + continue; + newscore = bestbp_rc_ptr->score + ngs->fillpen + ngs->pip; + pl_newscore = newscore + + phone_loop_search_score(pls, rhmm->ciphone); + if (pl_newscore BETTER_THAN thresh) { + if ((hmm_frame(&rhmm->hmm) < frame_idx) + || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) { + hmm_enter(&rhmm->hmm, + newscore, bestbp_rc_ptr->path, nf); + } + } + } +} + +static void +deactivate_channels(ngram_search_t *ngs, int frame_idx) +{ + root_chan_t *rhmm; + int i; + + /* Clear score[] of pruned root channels */ + for (i = ngs->n_root_chan, rhmm = ngs->root_chan; i > 0; --i, rhmm++) { + if (hmm_frame(&rhmm->hmm) == frame_idx) { + hmm_clear(&rhmm->hmm); + } + } + /* Clear score[] of pruned single-phone channels */ + for (i = 0; i < ngs->n_1ph_words; i++) { + int32 w = ngs->single_phone_wid[i]; + rhmm = (root_chan_t *) ngs->word_chan[w]; + if (hmm_frame(&rhmm->hmm) == frame_idx) { + hmm_clear(&rhmm->hmm); + } + } +} + +int +ngram_fwdtree_search(ngram_search_t *ngs, int frame_idx) +{ + int16 const *senscr; + + /* Activate our HMMs for the current frame if need be. */ + if (!ps_search_acmod(ngs)->compallsen) + compute_sen_active(ngs, frame_idx); + + /* Compute GMM scores for the current frame. */ + if ((senscr = acmod_score(ps_search_acmod(ngs), &frame_idx)) == NULL) + return 0; + ngs->st.n_senone_active_utt += ps_search_acmod(ngs)->n_senone_active; + + /* Mark backpointer table for current frame. */ + ngram_search_mark_bptable(ngs, frame_idx); + + /* If the best score is equal to or worse than WORST_SCORE, + * recognition has failed, don't bother to keep trying. */ + if (ngs->best_score == WORST_SCORE || ngs->best_score WORSE_THAN WORST_SCORE) + return 0; + /* Renormalize if necessary */ + if (ngs->best_score + (2 * ngs->beam) WORSE_THAN WORST_SCORE) { + E_INFO("Renormalizing Scores at frame %d, best score %d\n", + frame_idx, ngs->best_score); + renormalize_scores(ngs, frame_idx, ngs->best_score); + } + + /* Evaluate HMMs */ + evaluate_channels(ngs, senscr, frame_idx); + /* Prune HMMs and do phone transitions. */ + prune_channels(ngs, frame_idx); + /* Do absolute pruning on word exits. */ + bptable_maxwpf(ngs, frame_idx); + /* Do word transitions. */ + word_transition(ngs, frame_idx); + /* Deactivate pruned HMMs. */ + deactivate_channels(ngs, frame_idx); + + ++ngs->n_frame; + /* Return the number of frames processed. */ + return 1; +} + +void +ngram_fwdtree_finish(ngram_search_t *ngs) +{ + int32 i, w, cf, *awl; + root_chan_t *rhmm; + chan_t *hmm, **acl; + + /* This is the number of frames processed. */ + cf = ps_search_acmod(ngs)->output_frame; + /* Add a mark in the backpointer table for one past the final frame. */ + ngram_search_mark_bptable(ngs, cf); + + /* Deactivate channels lined up for the next frame */ + /* First, root channels of HMM tree */ + for (i = ngs->n_root_chan, rhmm = ngs->root_chan; i > 0; --i, rhmm++) { + hmm_clear(&rhmm->hmm); + } + + /* nonroot channels of HMM tree */ + i = ngs->n_active_chan[cf & 0x1]; + acl = ngs->active_chan_list[cf & 0x1]; + for (hmm = *(acl++); i > 0; --i, hmm = *(acl++)) { + hmm_clear(&hmm->hmm); + } + + /* word channels */ + i = ngs->n_active_word[cf & 0x1]; + awl = ngs->active_word_list[cf & 0x1]; + for (w = *(awl++); i > 0; --i, w = *(awl++)) { + /* Don't accidentally free single-phone words! */ + if (dict_is_single_phone(ps_search_dict(ngs), w)) + continue; + bitvec_clear(ngs->word_active, w); + if (ngs->word_chan[w] == NULL) + continue; + ngram_search_free_all_rc(ngs, w); + } + + /* + * The previous search code did a postprocessing of the + * backpointer table here, but we will postpone this until it is + * absolutely necessary, i.e. when generating a word graph. + * Likewise we don't actually have to decide what the exit word is + * until somebody requests a backtrace. + */ + + ptmr_stop(&ngs->fwdtree_perf); + /* Print out some statistics. */ + if (cf > 0) { + double n_speech = (double)(cf + 1) + / cmd_ln_int32_r(ps_search_config(ngs), "-frate"); + E_INFO("%8d words recognized (%d/fr)\n", + ngs->bpidx, (ngs->bpidx + (cf >> 1)) / (cf + 1)); + E_INFO("%8d senones evaluated (%d/fr)\n", ngs->st.n_senone_active_utt, + (ngs->st.n_senone_active_utt + (cf >> 1)) / (cf + 1)); + E_INFO("%8d channels searched (%d/fr), %d 1st, %d last\n", + ngs->st.n_root_chan_eval + ngs->st.n_nonroot_chan_eval, + (ngs->st.n_root_chan_eval + ngs->st.n_nonroot_chan_eval) / (cf + 1), + ngs->st.n_root_chan_eval, ngs->st.n_last_chan_eval); + E_INFO("%8d words for which last channels evaluated (%d/fr)\n", + ngs->st.n_word_lastchan_eval, + ngs->st.n_word_lastchan_eval / (cf + 1)); + E_INFO("%8d candidate words for entering last phone (%d/fr)\n", + ngs->st.n_lastphn_cand_utt, ngs->st.n_lastphn_cand_utt / (cf + 1)); + E_INFO("fwdtree %.2f CPU %.3f xRT\n", + ngs->fwdtree_perf.t_cpu, + ngs->fwdtree_perf.t_cpu / n_speech); + E_INFO("fwdtree %.2f wall %.3f xRT\n", + ngs->fwdtree_perf.t_elapsed, + ngs->fwdtree_perf.t_elapsed / n_speech); + } + /* dump_bptable(ngs); */ +} |