diff options
Diffstat (limited to 'media/pocketsphinx/src/ngram_search.h')
-rw-r--r-- | media/pocketsphinx/src/ngram_search.h | 433 |
1 files changed, 433 insertions, 0 deletions
diff --git a/media/pocketsphinx/src/ngram_search.h b/media/pocketsphinx/src/ngram_search.h new file mode 100644 index 000000000..a9a2f4c25 --- /dev/null +++ b/media/pocketsphinx/src/ngram_search.h @@ -0,0 +1,433 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 2008 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ + +/** + * @file ngram_search.h N-Gram based multi-pass search ("FBS") + */ + +#ifndef __NGRAM_SEARCH_H__ +#define __NGRAM_SEARCH_H__ + +/* SphinxBase headers. */ +#include <sphinxbase/cmd_ln.h> +#include <sphinxbase/logmath.h> +#include <sphinxbase/ngram_model.h> +#include <sphinxbase/listelem_alloc.h> +#include <sphinxbase/err.h> + +/* Local headers. */ +#include "pocketsphinx_internal.h" +#include "hmm.h" + +/** + * Lexical tree node data type. + * + * Not the first HMM for words, which multiplex HMMs based on + * different left contexts. This structure is used both in the + * dynamic HMM tree structure and in the per-word last-phone right + * context fanout. + */ +typedef struct chan_s { + hmm_t hmm; /**< Basic HMM structure. This *must* be first in + the structure because chan_t and root_chan_t are + sometimes used interchangeably */ + struct chan_s *next; /**< first descendant of this channel; or, in the + case of the last phone of a word, the next + alternative right context channel */ + struct chan_s *alt; /**< sibling; i.e., next descendant of parent HMM */ + + int32 ciphone; /**< ciphone for this node */ + union { + int32 penult_phn_wid; /**< list of words whose last phone follows this one; + this field indicates the first of the list; the + rest must be built up in a separate array. Used + only within HMM tree. -1 if none */ + int32 rc_id; /**< right-context id for last phone of words */ + } info; +} chan_t; + +/** + * Lexical tree node data type for the first phone (root) of each dynamic HMM tree + * structure. + * + * Each state may have a different parent static HMM. Most fields are + * similar to those in chan_t. + */ +typedef struct root_chan_s { + hmm_t hmm; /**< Basic HMM structure. This *must* be first in + the structure because chan_t and root_chan_t are + sometimes used interchangeably. */ + chan_t *next; /**< first descendant of this channel */ + + int32 penult_phn_wid; + int32 this_phn_wid; /**< list of words consisting of this single phone; + actually the first of the list, like penult_phn_wid; + -1 if none */ + int16 ciphone; /**< first ciphone of this node; all words rooted at this + node begin with this ciphone */ + int16 ci2phone; /**< second ciphone of this node; one root HMM for each + unique right context */ +} root_chan_t; + +/** + * Back pointer table (forward pass lattice; actually a tree) + */ +typedef struct bptbl_s { + frame_idx_t frame; /**< start or end frame */ + uint8 valid; /**< For absolute pruning */ + uint8 refcnt; /**< Reference count (number of successors) */ + int32 wid; /**< Word index */ + int32 bp; /**< Back Pointer */ + int32 score; /**< Score (best among all right contexts) */ + int32 s_idx; /**< Start of BScoreStack for various right contexts*/ + int32 real_wid; /**< wid of this or latest predecessor real word */ + int32 prev_real_wid; /**< wid of second-last real word */ + int16 last_phone; /**< last phone of this word */ + int16 last2_phone; /**< next-to-last phone of this word */ +} bptbl_t; + +/** + * Segmentation "iterator" for backpointer table results. + */ +typedef struct bptbl_seg_s { + ps_seg_t base; /**< Base structure. */ + int32 *bpidx; /**< Sequence of backpointer IDs. */ + int16 n_bpidx; /**< Number of backpointer IDs. */ + int16 cur; /**< Current position in bpidx. */ +} bptbl_seg_t; + +/* + * Candidates words for entering their last phones. Cleared and rebuilt in each + * frame. + * NOTE: candidates can only be multi-phone, real dictionary words. + */ +typedef struct lastphn_cand_s { + int32 wid; + int32 score; + int32 bp; + int32 next; /* next candidate starting at the same frame */ +} lastphn_cand_t; + +/* + * Since the same instance of a word (i.e., <word,start-frame>) reaches its last + * phone several times, we can compute its best BP and LM transition score info + * just the first time and cache it for future occurrences. Structure for such + * a cache. + */ +typedef struct { + int32 sf; /* Start frame */ + int32 dscr; /* Delta-score upon entering last phone */ + int32 bp; /* Best BP */ +} last_ltrans_t; + +#define CAND_SF_ALLOCSIZE 32 +typedef struct { + int32 bp_ef; + int32 cand; +} cand_sf_t; + +/* + * Structure for reorganizing the BP table entries in the current frame according + * to distinct right context ci-phones. Each entry contains the best BP entry for + * a given right context. Each successor word will pick up the correct entry based + * on its first ci-phone. + */ +typedef struct bestbp_rc_s { + int32 score; + int32 path; /* BP table index corresponding to this entry */ + int32 lc; /* right most ci-phone of above BP entry word */ +} bestbp_rc_t; + +#define NO_BP -1 + +/** + * Various statistics for profiling. + */ +typedef struct ngram_search_stats_s { + int32 n_phone_eval; + int32 n_root_chan_eval; + int32 n_nonroot_chan_eval; + int32 n_last_chan_eval; + int32 n_word_lastchan_eval; + int32 n_lastphn_cand_utt; + int32 n_fwdflat_chan; + int32 n_fwdflat_words; + int32 n_fwdflat_word_transition; + int32 n_senone_active_utt; +} ngram_search_stats_t; + + +/** + * N-Gram search module structure. + */ +struct ngram_search_s { + ps_search_t base; + ngram_model_t *lmset; /**< Set of language models. */ + hmm_context_t *hmmctx; /**< HMM context. */ + + /* Flags to quickly indicate which passes are enabled. */ + uint8 fwdtree; + uint8 fwdflat; + uint8 bestpath; + + /* State of procesing. */ + uint8 done; + + /* Allocators */ + listelem_alloc_t *chan_alloc; /**< For chan_t */ + listelem_alloc_t *root_chan_alloc; /**< For root_chan_t */ + listelem_alloc_t *latnode_alloc; /**< For latnode_t */ + + /** + * Search structure of HMM instances. + * + * The word triphone sequences (HMM instances) are transformed + * into tree structures, one tree per unique left triphone in the + * entire dictionary (actually diphone, since its left context + * varies dyamically during the search process). The entire set + * of trees of channels is allocated once and for all during + * initialization (since dynamic management of active CHANs is + * time consuming), with one exception: the last phones of words, + * that need multiple right context modelling, are not maintained + * in this static structure since there are too many of them and + * few are active at any time. Instead they are maintained as + * linked lists of CHANs, one list per word, and each CHAN in this + * set is allocated only on demand and freed if inactive. + */ + root_chan_t *root_chan; /**< Roots of search tree. */ + int32 n_root_chan_alloc; /**< Number of root_chan allocated */ + int32 n_root_chan; /**< Number of valid root_chan */ + int32 n_nonroot_chan; /**< Number of valid non-root channels */ + int32 max_nonroot_chan; /**< Maximum possible number of non-root channels */ + root_chan_t *rhmm_1ph; /**< Root HMMs for single-phone words */ + + /** + * Channels associated with a given word (only used for right + * contexts, single-phone words in fwdtree search, and word HMMs + * in fwdflat search). WARNING: For single-phone words and + * fwdflat search, this actually contains pointers to root_chan_t, + * which are allocated using root_chan_alloc. This is a + * suboptimal state of affairs. + */ + chan_t **word_chan; + bitvec_t *word_active; /**< array of active flags for all words. */ + + /** + * Each node in the HMM tree structure may point to a set of words + * whose last phone would follow that node in the tree structure + * (but is not included in the tree structure for reasons + * explained above). The channel node points to one word in this + * set of words. The remaining words are linked through + * homophone_set[]. + * + * Single-phone words are not represented in the HMM tree; they + * are kept in word_chan. + * + * Specifically, homophone_set[w] = wid of next word in the same + * set as w. + */ + int32 *homophone_set; + int32 *single_phone_wid; /**< list of single-phone word ids */ + int32 n_1ph_words; /**< Number single phone words in dict (total) */ + int32 n_1ph_LMwords; /**< Number single phone dict words also in LM; + these come first in single_phone_wid */ + /** + * Array of active channels for current and next frame. + * + * In any frame, only some HMM tree nodes are active. + * active_chan_list[f mod 2] = list of nonroot channels in the HMM + * tree active in frame f. + */ + chan_t ***active_chan_list; + int32 n_active_chan[2]; /**< Number entries in active_chan_list */ + /** + * Array of active multi-phone words for current and next frame. + * + * Similarly to active_chan_list, active_word_list[f mod 2] = list + * of word ids for which active channels exist in word_chan in + * frame f. + * + * Statically allocated single-phone words are always active and + * should not appear in this list. + */ + int32 **active_word_list; + int32 n_active_word[2]; /**< Number entries in active_word_list */ + + /* + * FIXME: Document all of these bits. + */ + lastphn_cand_t *lastphn_cand; + int32 n_lastphn_cand; + last_ltrans_t *last_ltrans; /* one per word */ + int32 cand_sf_alloc; + cand_sf_t *cand_sf; + bestbp_rc_t *bestbp_rc; + + bptbl_t *bp_table; /* Forward pass lattice */ + int32 bpidx; /* First free BPTable entry */ + int32 bp_table_size; + int32 *bscore_stack; /* Score stack for all possible right contexts */ + int32 bss_head; /* First free BScoreStack entry */ + int32 bscore_stack_size; + + int32 n_frame_alloc; /**< Number of frames allocated in bp_table_idx and friends. */ + int32 n_frame; /**< Number of frames actually present. */ + int32 *bp_table_idx; /* First BPTable entry for each frame */ + int32 *word_lat_idx; /* BPTable index for any word in current frame; + cleared before each frame */ + + /* + * Flat lexicon (2nd pass) search stuff. + */ + ps_latnode_t **frm_wordlist; /**< List of active words in each frame. */ + int32 *fwdflat_wordlist; /**< List of active word IDs for utterance. */ + bitvec_t *expand_word_flag; + int32 *expand_word_list; + int32 n_expand_words; + int32 min_ef_width; + int32 max_sf_win; + float32 fwdflat_fwdtree_lw_ratio; + + int32 best_score; /**< Best Viterbi path score. */ + int32 last_phone_best_score; /**< Best Viterbi path score for last phone. */ + int32 renormalized; + + /* + * DAG (3rd pass) search stuff. + */ + float32 bestpath_fwdtree_lw_ratio; + float32 ascale; /**< Acoustic score scale for posterior probabilities. */ + + ngram_search_stats_t st; /**< Various statistics for profiling. */ + ptmr_t fwdtree_perf; + ptmr_t fwdflat_perf; + ptmr_t bestpath_perf; + int32 n_tot_frame; + + /* A collection of beam widths. */ + int32 beam; + int32 dynamic_beam; + int32 pbeam; + int32 wbeam; + int32 lpbeam; + int32 lponlybeam; + int32 fwdflatbeam; + int32 fwdflatwbeam; + int32 fillpen; + int32 silpen; + int32 wip; + int32 nwpen; + int32 pip; + int32 maxwpf; + int32 maxhmmpf; +}; +typedef struct ngram_search_s ngram_search_t; + +/** + * Initialize the N-Gram search module. + */ +ps_search_t *ngram_search_init(ngram_model_t *lm, + cmd_ln_t *config, + acmod_t *acmod, + dict_t *dict, + dict2pid_t *d2p); + +/** + * Finalize the N-Gram search module. + */ +void ngram_search_free(ps_search_t *ngs); + +/** + * Record the current frame's index in the backpointer table. + * + * @return the current backpointer index. + */ +int ngram_search_mark_bptable(ngram_search_t *ngs, int frame_idx); + +/** + * Enter a word in the backpointer table. + */ +void ngram_search_save_bp(ngram_search_t *ngs, int frame_idx, int32 w, + int32 score, int32 path, int32 rc); + +/** + * Allocate last phone channels for all possible right contexts for word w. + */ +void ngram_search_alloc_all_rc(ngram_search_t *ngs, int32 w); + +/** + * Allocate last phone channels for all possible right contexts for word w. + */ +void ngram_search_free_all_rc(ngram_search_t *ngs, int32 w); + +/** + * Find the best word exit for the current frame in the backpointer table. + * + * @return the backpointer index of the best word exit. + */ +int ngram_search_find_exit(ngram_search_t *ngs, int frame_idx, int32 *out_best_score, int32 *out_is_final); + +/** + * Backtrace from a given backpointer index to obtain a word hypothesis. + * + * @return a <strong>read-only</strong> string with the best hypothesis. + */ +char const *ngram_search_bp_hyp(ngram_search_t *ngs, int bpidx); + +/** + * Compute language and acoustic scores for backpointer table entries. + */ +void ngram_compute_seg_scores(ngram_search_t *ngs, float32 lwf); + +/** + * Construct a word lattice from the current hypothesis. + */ +ps_lattice_t *ngram_search_lattice(ps_search_t *search); + +/** + * Get the exit score for a backpointer entry with a given right context. + */ +int32 ngram_search_exit_score(ngram_search_t *ngs, bptbl_t *pbe, int rcphone); + +/** + * Sets the global language model. + * + * Sets the language model to use if nothing was passed in configuration + */ +void ngram_search_set_lm(ngram_model_t *lm); + +#endif /* __NGRAM_SEARCH_H__ */ |