/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ /* ==================================================================== * Copyright (c) 2008 Carnegie Mellon University. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * This work was supported in part by funding from the Defense Advanced * Research Projects Agency and the National Science Foundation of the * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * */ /** * @file ngram_search_fwdflat.c Flat lexicon search. */ /* System headers. */ #include #include /* SphinxBase headers. */ #include #include #include /* Local headers. */ #include "ngram_search.h" #include "ps_lattice_internal.h" /* Turn this on to dump channels for debugging */ #define __CHAN_DUMP__ 0 #if __CHAN_DUMP__ #define chan_v_eval(chan) hmm_dump_vit_eval(&(chan)->hmm, stderr) #else #define chan_v_eval(chan) hmm_vit_eval(&(chan)->hmm) #endif static void ngram_fwdflat_expand_all(ngram_search_t *ngs) { int n_words, i; /* For all "real words" (not fillers or /) in the dictionary, * * 1) Add the ones which are in the LM to the fwdflat wordlist * 2) And to the expansion list (since we are expanding all) */ ngs->n_expand_words = 0; n_words = ps_search_n_words(ngs); bitvec_clear_all(ngs->expand_word_flag, ps_search_n_words(ngs)); for (i = 0; i < n_words; ++i) { if (!ngram_model_set_known_wid(ngs->lmset, dict_basewid(ps_search_dict(ngs),i))) continue; ngs->fwdflat_wordlist[ngs->n_expand_words] = i; ngs->expand_word_list[ngs->n_expand_words] = i; bitvec_set(ngs->expand_word_flag, i); ngs->n_expand_words++; } E_INFO("Utterance vocabulary contains %d words\n", ngs->n_expand_words); ngs->expand_word_list[ngs->n_expand_words] = -1; ngs->fwdflat_wordlist[ngs->n_expand_words] = -1; } static void ngram_fwdflat_allocate_1ph(ngram_search_t *ngs) { dict_t *dict = ps_search_dict(ngs); int n_words = ps_search_n_words(ngs); int i, w; /* Allocate single-phone words, since they won't have * been allocated for us by fwdtree initialization. */ ngs->n_1ph_words = 0; for (w = 0; w < n_words; w++) { if (dict_is_single_phone(dict, w)) ++ngs->n_1ph_words; } ngs->single_phone_wid = ckd_calloc(ngs->n_1ph_words, sizeof(*ngs->single_phone_wid)); ngs->rhmm_1ph = ckd_calloc(ngs->n_1ph_words, sizeof(*ngs->rhmm_1ph)); i = 0; for (w = 0; w < n_words; w++) { if (!dict_is_single_phone(dict, w)) continue; /* DICT2PID location */ ngs->rhmm_1ph[i].ciphone = dict_first_phone(dict, w); ngs->rhmm_1ph[i].ci2phone = bin_mdef_silphone(ps_search_acmod(ngs)->mdef); hmm_init(ngs->hmmctx, &ngs->rhmm_1ph[i].hmm, TRUE, /* ssid */ bin_mdef_pid2ssid(ps_search_acmod(ngs)->mdef, ngs->rhmm_1ph[i].ciphone), /* tmatid */ bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, ngs->rhmm_1ph[i].ciphone)); ngs->rhmm_1ph[i].next = NULL; ngs->word_chan[w] = (chan_t *) &(ngs->rhmm_1ph[i]); ngs->single_phone_wid[i] = w; i++; } } static void ngram_fwdflat_free_1ph(ngram_search_t *ngs) { int i, w; int n_words = ps_search_n_words(ngs); for (i = w = 0; w < n_words; ++w) { if (!dict_is_single_phone(ps_search_dict(ngs), w)) continue; hmm_deinit(&ngs->rhmm_1ph[i].hmm); ++i; } ckd_free(ngs->rhmm_1ph); ngs->rhmm_1ph = NULL; ckd_free(ngs->single_phone_wid); } void ngram_fwdflat_init(ngram_search_t *ngs) { int n_words; n_words = ps_search_n_words(ngs); ngs->fwdflat_wordlist = ckd_calloc(n_words + 1, sizeof(*ngs->fwdflat_wordlist)); ngs->expand_word_flag = bitvec_alloc(n_words); ngs->expand_word_list = ckd_calloc(n_words + 1, sizeof(*ngs->expand_word_list)); ngs->frm_wordlist = ckd_calloc(ngs->n_frame_alloc, sizeof(*ngs->frm_wordlist)); ngs->min_ef_width = cmd_ln_int32_r(ps_search_config(ngs), "-fwdflatefwid"); ngs->max_sf_win = cmd_ln_int32_r(ps_search_config(ngs), "-fwdflatsfwin"); E_INFO("fwdflat: min_ef_width = %d, max_sf_win = %d\n", ngs->min_ef_width, ngs->max_sf_win); /* No tree-search; pre-build the expansion list, including all LM words. */ if (!ngs->fwdtree) { /* Build full expansion list from LM words. */ ngram_fwdflat_expand_all(ngs); /* Allocate single phone words. */ ngram_fwdflat_allocate_1ph(ngs); } } void ngram_fwdflat_deinit(ngram_search_t *ngs) { double n_speech = (double)ngs->n_tot_frame / cmd_ln_int32_r(ps_search_config(ngs), "-frate"); E_INFO("TOTAL fwdflat %.2f CPU %.3f xRT\n", ngs->fwdflat_perf.t_tot_cpu, ngs->fwdflat_perf.t_tot_cpu / n_speech); E_INFO("TOTAL fwdflat %.2f wall %.3f xRT\n", ngs->fwdflat_perf.t_tot_elapsed, ngs->fwdflat_perf.t_tot_elapsed / n_speech); /* Free single-phone words if we allocated them. */ if (!ngs->fwdtree) { ngram_fwdflat_free_1ph(ngs); } ckd_free(ngs->fwdflat_wordlist); bitvec_free(ngs->expand_word_flag); ckd_free(ngs->expand_word_list); ckd_free(ngs->frm_wordlist); } int ngram_fwdflat_reinit(ngram_search_t *ngs) { /* Reallocate things that depend on the number of words. */ int n_words; ckd_free(ngs->fwdflat_wordlist); ckd_free(ngs->expand_word_list); bitvec_free(ngs->expand_word_flag); n_words = ps_search_n_words(ngs); ngs->fwdflat_wordlist = ckd_calloc(n_words + 1, sizeof(*ngs->fwdflat_wordlist)); ngs->expand_word_flag = bitvec_alloc(n_words); ngs->expand_word_list = ckd_calloc(n_words + 1, sizeof(*ngs->expand_word_list)); /* No tree-search; take care of the expansion list and single phone words. */ if (!ngs->fwdtree) { /* Free single-phone words. */ ngram_fwdflat_free_1ph(ngs); /* Reallocate word_chan. */ ckd_free(ngs->word_chan); ngs->word_chan = ckd_calloc(dict_size(ps_search_dict(ngs)), sizeof(*ngs->word_chan)); /* Rebuild full expansion list from LM words. */ ngram_fwdflat_expand_all(ngs); /* Allocate single phone words. */ ngram_fwdflat_allocate_1ph(ngs); } /* Otherwise there is nothing to do since the wordlist is * generated anew every utterance. */ return 0; } /** * Find all active words in backpointer table and sort by frame. */ static void build_fwdflat_wordlist(ngram_search_t *ngs) { int32 i, f, sf, ef, wid, nwd; bptbl_t *bp; ps_latnode_t *node, *prevnode, *nextnode; /* No tree-search, use statically allocated wordlist. */ if (!ngs->fwdtree) return; memset(ngs->frm_wordlist, 0, ngs->n_frame_alloc * sizeof(*ngs->frm_wordlist)); /* Scan the backpointer table for all active words and record * their exit frames. */ for (i = 0, bp = ngs->bp_table; i < ngs->bpidx; i++, bp++) { sf = (bp->bp < 0) ? 0 : ngs->bp_table[bp->bp].frame + 1; ef = bp->frame; wid = bp->wid; /* Anything that can be transitioned to in the LM can go in * the word list. */ if (!ngram_model_set_known_wid(ngs->lmset, dict_basewid(ps_search_dict(ngs), wid))) continue; /* Look for it in the wordlist. */ for (node = ngs->frm_wordlist[sf]; node && (node->wid != wid); node = node->next); /* Update last end frame. */ if (node) node->lef = ef; else { /* New node; link to head of list */ node = listelem_malloc(ngs->latnode_alloc); node->wid = wid; node->fef = node->lef = ef; node->next = ngs->frm_wordlist[sf]; ngs->frm_wordlist[sf] = node; } } /* Eliminate "unlikely" words, for which there are too few end points */ for (f = 0; f < ngs->n_frame; f++) { prevnode = NULL; for (node = ngs->frm_wordlist[f]; node; node = nextnode) { nextnode = node->next; /* Word has too few endpoints */ if ((node->lef - node->fef < ngs->min_ef_width) || /* Word is and doesn't actually end in last frame */ ((node->wid == ps_search_finish_wid(ngs)) && (node->lef < ngs->n_frame - 1))) { if (!prevnode) ngs->frm_wordlist[f] = nextnode; else prevnode->next = nextnode; listelem_free(ngs->latnode_alloc, node); } else prevnode = node; } } /* Form overall wordlist for 2nd pass */ nwd = 0; bitvec_clear_all(ngs->word_active, ps_search_n_words(ngs)); for (f = 0; f < ngs->n_frame; f++) { for (node = ngs->frm_wordlist[f]; node; node = node->next) { if (!bitvec_is_set(ngs->word_active, node->wid)) { bitvec_set(ngs->word_active, node->wid); ngs->fwdflat_wordlist[nwd++] = node->wid; } } } ngs->fwdflat_wordlist[nwd] = -1; E_INFO("Utterance vocabulary contains %d words\n", nwd); } /** * Build HMM network for one utterance of fwdflat search. */ static void build_fwdflat_chan(ngram_search_t *ngs) { int32 i, wid, p; root_chan_t *rhmm; chan_t *hmm, *prevhmm; dict_t *dict; dict2pid_t *d2p; dict = ps_search_dict(ngs); d2p = ps_search_dict2pid(ngs); /* Build word HMMs for each word in the lattice. */ for (i = 0; ngs->fwdflat_wordlist[i] >= 0; i++) { wid = ngs->fwdflat_wordlist[i]; /* Single-phone words are permanently allocated */ if (dict_is_single_phone(dict, wid)) continue; assert(ngs->word_chan[wid] == NULL); /* Multiplex root HMM for first phone (one root per word, flat * lexicon). diphone is irrelevant here, for the time being, * at least. */ rhmm = listelem_malloc(ngs->root_chan_alloc); rhmm->ci2phone = dict_second_phone(dict, wid); rhmm->ciphone = dict_first_phone(dict, wid); rhmm->next = NULL; hmm_init(ngs->hmmctx, &rhmm->hmm, TRUE, bin_mdef_pid2ssid(ps_search_acmod(ngs)->mdef, rhmm->ciphone), bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, rhmm->ciphone)); /* HMMs for word-internal phones */ prevhmm = NULL; for (p = 1; p < dict_pronlen(dict, wid) - 1; p++) { hmm = listelem_malloc(ngs->chan_alloc); hmm->ciphone = dict_pron(dict, wid, p); hmm->info.rc_id = (p == dict_pronlen(dict, wid) - 1) ? 0 : -1; hmm->next = NULL; hmm_init(ngs->hmmctx, &hmm->hmm, FALSE, dict2pid_internal(d2p,wid,p), bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, hmm->ciphone)); if (prevhmm) prevhmm->next = hmm; else rhmm->next = hmm; prevhmm = hmm; } /* Right-context phones */ ngram_search_alloc_all_rc(ngs, wid); /* Link in just allocated right-context phones */ if (prevhmm) prevhmm->next = ngs->word_chan[wid]; else rhmm->next = ngs->word_chan[wid]; ngs->word_chan[wid] = (chan_t *) rhmm; } } void ngram_fwdflat_start(ngram_search_t *ngs) { root_chan_t *rhmm; int i; ptmr_reset(&ngs->fwdflat_perf); ptmr_start(&ngs->fwdflat_perf); build_fwdflat_wordlist(ngs); build_fwdflat_chan(ngs); ngs->bpidx = 0; ngs->bss_head = 0; for (i = 0; i < ps_search_n_words(ngs); i++) ngs->word_lat_idx[i] = NO_BP; /* Reset the permanently allocated single-phone words, since they * may have junk left over in them from previous searches. */ for (i = 0; i < ngs->n_1ph_words; i++) { int32 w = ngs->single_phone_wid[i]; rhmm = (root_chan_t *) ngs->word_chan[w]; hmm_clear(&rhmm->hmm); } /* Start search with ; word_chan[] is permanently allocated */ rhmm = (root_chan_t *) ngs->word_chan[ps_search_start_wid(ngs)]; hmm_enter(&rhmm->hmm, 0, NO_BP, 0); ngs->active_word_list[0][0] = ps_search_start_wid(ngs); ngs->n_active_word[0] = 1; ngs->best_score = 0; ngs->renormalized = FALSE; for (i = 0; i < ps_search_n_words(ngs); i++) ngs->last_ltrans[i].sf = -1; if (!ngs->fwdtree) ngs->n_frame = 0; ngs->st.n_fwdflat_chan = 0; ngs->st.n_fwdflat_words = 0; ngs->st.n_fwdflat_word_transition = 0; ngs->st.n_senone_active_utt = 0; } static void compute_fwdflat_sen_active(ngram_search_t *ngs, int frame_idx) { int32 i, nw, w; int32 *awl; root_chan_t *rhmm; chan_t *hmm; acmod_clear_active(ps_search_acmod(ngs)); nw = ngs->n_active_word[frame_idx & 0x1]; awl = ngs->active_word_list[frame_idx & 0x1]; for (i = 0; i < nw; i++) { w = *(awl++); rhmm = (root_chan_t *)ngs->word_chan[w]; if (hmm_frame(&rhmm->hmm) == frame_idx) { acmod_activate_hmm(ps_search_acmod(ngs), &rhmm->hmm); } for (hmm = rhmm->next; hmm; hmm = hmm->next) { if (hmm_frame(&hmm->hmm) == frame_idx) { acmod_activate_hmm(ps_search_acmod(ngs), &hmm->hmm); } } } } static void fwdflat_eval_chan(ngram_search_t *ngs, int frame_idx) { int32 i, w, nw, bestscore; int32 *awl; root_chan_t *rhmm; chan_t *hmm; nw = ngs->n_active_word[frame_idx & 0x1]; awl = ngs->active_word_list[frame_idx & 0x1]; bestscore = WORST_SCORE; ngs->st.n_fwdflat_words += nw; /* Scan all active words. */ for (i = 0; i < nw; i++) { w = *(awl++); rhmm = (root_chan_t *) ngs->word_chan[w]; if (hmm_frame(&rhmm->hmm) == frame_idx) { int32 score = chan_v_eval(rhmm); if ((score BETTER_THAN bestscore) && (w != ps_search_finish_wid(ngs))) bestscore = score; ngs->st.n_fwdflat_chan++; } for (hmm = rhmm->next; hmm; hmm = hmm->next) { if (hmm_frame(&hmm->hmm) == frame_idx) { int32 score = chan_v_eval(hmm); if (score BETTER_THAN bestscore) bestscore = score; ngs->st.n_fwdflat_chan++; } } } ngs->best_score = bestscore; } static void fwdflat_prune_chan(ngram_search_t *ngs, int frame_idx) { int32 i, nw, cf, nf, w, pip, newscore, thresh, wordthresh; int32 *awl; root_chan_t *rhmm; chan_t *hmm, *nexthmm; cf = frame_idx; nf = cf + 1; nw = ngs->n_active_word[cf & 0x1]; awl = ngs->active_word_list[cf & 0x1]; bitvec_clear_all(ngs->word_active, ps_search_n_words(ngs)); thresh = ngs->best_score + ngs->fwdflatbeam; wordthresh = ngs->best_score + ngs->fwdflatwbeam; pip = ngs->pip; E_DEBUG(3,("frame %d thresh %d wordthresh %d\n", frame_idx, thresh, wordthresh)); /* Scan all active words. */ for (i = 0; i < nw; i++) { w = *(awl++); rhmm = (root_chan_t *) ngs->word_chan[w]; /* Propagate active root channels */ if (hmm_frame(&rhmm->hmm) == cf && hmm_bestscore(&rhmm->hmm) BETTER_THAN thresh) { hmm_frame(&rhmm->hmm) = nf; bitvec_set(ngs->word_active, w); /* Transitions out of root channel */ newscore = hmm_out_score(&rhmm->hmm); if (rhmm->next) { assert(!dict_is_single_phone(ps_search_dict(ngs), w)); newscore += pip; if (newscore BETTER_THAN thresh) { hmm = rhmm->next; /* Enter all right context phones */ if (hmm->info.rc_id >= 0) { for (; hmm; hmm = hmm->next) { if ((hmm_frame(&hmm->hmm) < cf) || (newscore BETTER_THAN hmm_in_score(&hmm->hmm))) { hmm_enter(&hmm->hmm, newscore, hmm_out_history(&rhmm->hmm), nf); } } } /* Just a normal word internal phone */ else { if ((hmm_frame(&hmm->hmm) < cf) || (newscore BETTER_THAN hmm_in_score(&hmm->hmm))) { hmm_enter(&hmm->hmm, newscore, hmm_out_history(&rhmm->hmm), nf); } } } } else { assert(dict_is_single_phone(ps_search_dict(ngs), w)); /* Word exit for single-phone words (where did their * whmms come from?) (either from * ngram_search_fwdtree, or from * ngram_fwdflat_allocate_1ph(), that's where) */ if (newscore BETTER_THAN wordthresh) { ngram_search_save_bp(ngs, cf, w, newscore, hmm_out_history(&rhmm->hmm), 0); } } } /* Transitions out of non-root channels. */ for (hmm = rhmm->next; hmm; hmm = hmm->next) { if (hmm_frame(&hmm->hmm) >= cf) { /* Propagate forward HMMs inside the beam. */ if (hmm_bestscore(&hmm->hmm) BETTER_THAN thresh) { hmm_frame(&hmm->hmm) = nf; bitvec_set(ngs->word_active, w); newscore = hmm_out_score(&hmm->hmm); /* Word-internal phones */ if (hmm->info.rc_id < 0) { newscore += pip; if (newscore BETTER_THAN thresh) { nexthmm = hmm->next; /* Enter all right-context phones. */ if (nexthmm->info.rc_id >= 0) { for (; nexthmm; nexthmm = nexthmm->next) { if ((hmm_frame(&nexthmm->hmm) < cf) || (newscore BETTER_THAN hmm_in_score(&nexthmm->hmm))) { hmm_enter(&nexthmm->hmm, newscore, hmm_out_history(&hmm->hmm), nf); } } } /* Enter single word-internal phone. */ else { if ((hmm_frame(&nexthmm->hmm) < cf) || (newscore BETTER_THAN hmm_in_score(&nexthmm->hmm))) { hmm_enter(&nexthmm->hmm, newscore, hmm_out_history(&hmm->hmm), nf); } } } } /* Right-context phones - apply word beam and exit. */ else { if (newscore BETTER_THAN wordthresh) { ngram_search_save_bp(ngs, cf, w, newscore, hmm_out_history(&hmm->hmm), hmm->info.rc_id); } } } /* Zero out inactive HMMs. */ else if (hmm_frame(&hmm->hmm) != nf) { hmm_clear_scores(&hmm->hmm); } } } } } static void get_expand_wordlist(ngram_search_t *ngs, int32 frm, int32 win) { int32 f, sf, ef; ps_latnode_t *node; if (!ngs->fwdtree) { ngs->st.n_fwdflat_word_transition += ngs->n_expand_words; return; } sf = frm - win; if (sf < 0) sf = 0; ef = frm + win; if (ef > ngs->n_frame) ef = ngs->n_frame; bitvec_clear_all(ngs->expand_word_flag, ps_search_n_words(ngs)); ngs->n_expand_words = 0; for (f = sf; f < ef; f++) { for (node = ngs->frm_wordlist[f]; node; node = node->next) { if (!bitvec_is_set(ngs->expand_word_flag, node->wid)) { ngs->expand_word_list[ngs->n_expand_words++] = node->wid; bitvec_set(ngs->expand_word_flag, node->wid); } } } ngs->expand_word_list[ngs->n_expand_words] = -1; ngs->st.n_fwdflat_word_transition += ngs->n_expand_words; } static void fwdflat_word_transition(ngram_search_t *ngs, int frame_idx) { int32 cf, nf, b, thresh, pip, i, nw, w, newscore; int32 best_silrc_score = 0, best_silrc_bp = 0; /* FIXME: good defaults? */ bptbl_t *bp; int32 *rcss; root_chan_t *rhmm; int32 *awl; float32 lwf; dict_t *dict = ps_search_dict(ngs); dict2pid_t *d2p = ps_search_dict2pid(ngs); cf = frame_idx; nf = cf + 1; thresh = ngs->best_score + ngs->fwdflatbeam; pip = ngs->pip; best_silrc_score = WORST_SCORE; lwf = ngs->fwdflat_fwdtree_lw_ratio; /* Search for all words starting within a window of this frame. * These are the successors for words exiting now. */ get_expand_wordlist(ngs, cf, ngs->max_sf_win); /* Scan words exited in current frame */ for (b = ngs->bp_table_idx[cf]; b < ngs->bpidx; b++) { xwdssid_t *rssid; int32 silscore; bp = ngs->bp_table + b; ngs->word_lat_idx[bp->wid] = NO_BP; if (bp->wid == ps_search_finish_wid(ngs)) continue; /* DICT2PID location */ /* Get the mapping from right context phone ID to index in the * right context table and the bscore_stack. */ rcss = ngs->bscore_stack + bp->s_idx; if (bp->last2_phone == -1) rssid = NULL; else rssid = dict2pid_rssid(d2p, bp->last_phone, bp->last2_phone); /* Transition to all successor words. */ for (i = 0; ngs->expand_word_list[i] >= 0; i++) { int32 n_used; w = ngs->expand_word_list[i]; /* Get the exit score we recorded in save_bwd_ptr(), or * something approximating it. */ if (rssid) newscore = rcss[rssid->cimap[dict_first_phone(dict, w)]]; else newscore = bp->score; if (newscore == WORST_SCORE) continue; /* FIXME: Floating point... */ newscore += lwf * (ngram_tg_score(ngs->lmset, dict_basewid(dict, w), bp->real_wid, bp->prev_real_wid, &n_used) >> SENSCR_SHIFT); newscore += pip; /* Enter the next word */ if (newscore BETTER_THAN thresh) { rhmm = (root_chan_t *) ngs->word_chan[w]; if ((hmm_frame(&rhmm->hmm) < cf) || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) { hmm_enter(&rhmm->hmm, newscore, b, nf); /* DICT2PID: This is where mpx ssids get introduced. */ /* Look up the ssid to use when entering this mpx triphone. */ hmm_mpx_ssid(&rhmm->hmm, 0) = dict2pid_ldiph_lc(d2p, rhmm->ciphone, rhmm->ci2phone, dict_last_phone(dict, bp->wid)); assert(IS_S3SSID(hmm_mpx_ssid(&rhmm->hmm, 0))); E_DEBUG(6,("ssid %d(%d,%d) = %d\n", rhmm->ciphone, dict_last_phone(dict, bp->wid), rhmm->ci2phone, hmm_mpx_ssid(&rhmm->hmm, 0))); bitvec_set(ngs->word_active, w); } } } /* Get the best exit into silence. */ if (rssid) silscore = rcss[rssid->cimap[ps_search_acmod(ngs)->mdef->sil]]; else silscore = bp->score; if (silscore BETTER_THAN best_silrc_score) { best_silrc_score = silscore; best_silrc_bp = b; } } /* Transition to */ newscore = best_silrc_score + ngs->silpen + pip; if ((newscore BETTER_THAN thresh) && (newscore BETTER_THAN WORST_SCORE)) { w = ps_search_silence_wid(ngs); rhmm = (root_chan_t *) ngs->word_chan[w]; if ((hmm_frame(&rhmm->hmm) < cf) || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) { hmm_enter(&rhmm->hmm, newscore, best_silrc_bp, nf); bitvec_set(ngs->word_active, w); } } /* Transition to noise words */ newscore = best_silrc_score + ngs->fillpen + pip; if ((newscore BETTER_THAN thresh) && (newscore BETTER_THAN WORST_SCORE)) { for (w = ps_search_silence_wid(ngs) + 1; w < ps_search_n_words(ngs); w++) { rhmm = (root_chan_t *) ngs->word_chan[w]; /* Noise words that aren't a single phone will have NULL here. */ if (rhmm == NULL) continue; if ((hmm_frame(&rhmm->hmm) < cf) || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) { hmm_enter(&rhmm->hmm, newscore, best_silrc_bp, nf); bitvec_set(ngs->word_active, w); } } } /* Reset initial channels of words that have become inactive even after word trans. */ nw = ngs->n_active_word[cf & 0x1]; awl = ngs->active_word_list[cf & 0x1]; for (i = 0; i < nw; i++) { w = *(awl++); rhmm = (root_chan_t *) ngs->word_chan[w]; if (hmm_frame(&rhmm->hmm) == cf) { hmm_clear_scores(&rhmm->hmm); } } } static void fwdflat_renormalize_scores(ngram_search_t *ngs, int frame_idx, int32 norm) { root_chan_t *rhmm; chan_t *hmm; int32 i, nw, cf, w, *awl; cf = frame_idx; /* Renormalize individual word channels */ nw = ngs->n_active_word[cf & 0x1]; awl = ngs->active_word_list[cf & 0x1]; for (i = 0; i < nw; i++) { w = *(awl++); rhmm = (root_chan_t *) ngs->word_chan[w]; if (hmm_frame(&rhmm->hmm) == cf) { hmm_normalize(&rhmm->hmm, norm); } for (hmm = rhmm->next; hmm; hmm = hmm->next) { if (hmm_frame(&hmm->hmm) == cf) { hmm_normalize(&hmm->hmm, norm); } } } ngs->renormalized = TRUE; } int ngram_fwdflat_search(ngram_search_t *ngs, int frame_idx) { int16 const *senscr; int32 nf, i, j; int32 *nawl; /* Activate our HMMs for the current frame if need be. */ if (!ps_search_acmod(ngs)->compallsen) compute_fwdflat_sen_active(ngs, frame_idx); /* Compute GMM scores for the current frame. */ senscr = acmod_score(ps_search_acmod(ngs), &frame_idx); ngs->st.n_senone_active_utt += ps_search_acmod(ngs)->n_senone_active; /* Mark backpointer table for current frame. */ ngram_search_mark_bptable(ngs, frame_idx); /* If the best score is equal to or worse than WORST_SCORE, * recognition has failed, don't bother to keep trying. */ if (ngs->best_score == WORST_SCORE || ngs->best_score WORSE_THAN WORST_SCORE) return 0; /* Renormalize if necessary */ if (ngs->best_score + (2 * ngs->beam) WORSE_THAN WORST_SCORE) { E_INFO("Renormalizing Scores at frame %d, best score %d\n", frame_idx, ngs->best_score); fwdflat_renormalize_scores(ngs, frame_idx, ngs->best_score); } ngs->best_score = WORST_SCORE; hmm_context_set_senscore(ngs->hmmctx, senscr); /* Evaluate HMMs */ fwdflat_eval_chan(ngs, frame_idx); /* Prune HMMs and do phone transitions. */ fwdflat_prune_chan(ngs, frame_idx); /* Do word transitions. */ fwdflat_word_transition(ngs, frame_idx); /* Create next active word list, skip fillers */ nf = frame_idx + 1; nawl = ngs->active_word_list[nf & 0x1]; for (i = 0, j = 0; ngs->fwdflat_wordlist[i] >= 0; i++) { int32 wid = ngs->fwdflat_wordlist[i]; if (bitvec_is_set(ngs->word_active, wid) && wid < ps_search_start_wid(ngs)) { *(nawl++) = wid; j++; } } /* Add fillers */ for (i = ps_search_start_wid(ngs); i < ps_search_n_words(ngs); i++) { if (bitvec_is_set(ngs->word_active, i)) { *(nawl++) = i; j++; } } if (!ngs->fwdtree) ++ngs->n_frame; ngs->n_active_word[nf & 0x1] = j; /* Return the number of frames processed. */ return 1; } /** * Destroy wordlist from the current utterance. */ static void destroy_fwdflat_wordlist(ngram_search_t *ngs) { ps_latnode_t *node, *tnode; int32 f; if (!ngs->fwdtree) return; for (f = 0; f < ngs->n_frame; f++) { for (node = ngs->frm_wordlist[f]; node; node = tnode) { tnode = node->next; listelem_free(ngs->latnode_alloc, node); } } } /** * Free HMM network for one utterance of fwdflat search. */ static void destroy_fwdflat_chan(ngram_search_t *ngs) { int32 i, wid; for (i = 0; ngs->fwdflat_wordlist[i] >= 0; i++) { root_chan_t *rhmm; chan_t *thmm; wid = ngs->fwdflat_wordlist[i]; if (dict_is_single_phone(ps_search_dict(ngs),wid)) continue; assert(ngs->word_chan[wid] != NULL); /* The first HMM in ngs->word_chan[wid] was allocated with * ngs->root_chan_alloc, but this will attempt to free it * using ngs->chan_alloc, which will not work. Therefore we * free it manually and move the list forward before handing * it off. */ rhmm = (root_chan_t *)ngs->word_chan[wid]; thmm = rhmm->next; listelem_free(ngs->root_chan_alloc, rhmm); ngs->word_chan[wid] = thmm; ngram_search_free_all_rc(ngs, wid); } } void ngram_fwdflat_finish(ngram_search_t *ngs) { int32 cf; destroy_fwdflat_chan(ngs); destroy_fwdflat_wordlist(ngs); bitvec_clear_all(ngs->word_active, ps_search_n_words(ngs)); /* This is the number of frames processed. */ cf = ps_search_acmod(ngs)->output_frame; /* Add a mark in the backpointer table for one past the final frame. */ ngram_search_mark_bptable(ngs, cf); ptmr_stop(&ngs->fwdflat_perf); /* Print out some statistics. */ if (cf > 0) { double n_speech = (double)(cf + 1) / cmd_ln_int32_r(ps_search_config(ngs), "-frate"); E_INFO("%8d words recognized (%d/fr)\n", ngs->bpidx, (ngs->bpidx + (cf >> 1)) / (cf + 1)); E_INFO("%8d senones evaluated (%d/fr)\n", ngs->st.n_senone_active_utt, (ngs->st.n_senone_active_utt + (cf >> 1)) / (cf + 1)); E_INFO("%8d channels searched (%d/fr)\n", ngs->st.n_fwdflat_chan, ngs->st.n_fwdflat_chan / (cf + 1)); E_INFO("%8d words searched (%d/fr)\n", ngs->st.n_fwdflat_words, ngs->st.n_fwdflat_words / (cf + 1)); E_INFO("%8d word transitions (%d/fr)\n", ngs->st.n_fwdflat_word_transition, ngs->st.n_fwdflat_word_transition / (cf + 1)); E_INFO("fwdflat %.2f CPU %.3f xRT\n", ngs->fwdflat_perf.t_cpu, ngs->fwdflat_perf.t_cpu / n_speech); E_INFO("fwdflat %.2f wall %.3f xRT\n", ngs->fwdflat_perf.t_elapsed, ngs->fwdflat_perf.t_elapsed / n_speech); } }