diff options
Diffstat (limited to 'media/pocketsphinx/src/acmod.h')
-rw-r--r-- | media/pocketsphinx/src/acmod.h | 466 |
1 files changed, 0 insertions, 466 deletions
diff --git a/media/pocketsphinx/src/acmod.h b/media/pocketsphinx/src/acmod.h deleted file mode 100644 index f4d5761c2..000000000 --- a/media/pocketsphinx/src/acmod.h +++ /dev/null @@ -1,466 +0,0 @@ -/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ -/* ==================================================================== - * Copyright (c) 2008 Carnegie Mellon University. All rights - * reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * This work was supported in part by funding from the Defense Advanced - * Research Projects Agency and the National Science Foundation of the - * United States of America, and the CMU Sphinx Speech Consortium. - * - * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND - * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY - * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * ==================================================================== - * - */ - -/** - * @file acmod.h Acoustic model structures for PocketSphinx. - * @author David Huggins-Daines <dhuggins@cs.cmu.edu> - */ - -#ifndef __ACMOD_H__ -#define __ACMOD_H__ - -/* System headers. */ -#include <stdio.h> - -/* SphinxBase headers. */ -#include <sphinxbase/cmd_ln.h> -#include <sphinxbase/logmath.h> -#include <sphinxbase/fe.h> -#include <sphinxbase/feat.h> -#include <sphinxbase/bitvec.h> -#include <sphinxbase/err.h> -#include <sphinxbase/prim_type.h> - -/* Local headers. */ -#include "ps_mllr.h" -#include "bin_mdef.h" -#include "tmat.h" -#include "hmm.h" - -/** - * States in utterance processing. - */ -typedef enum acmod_state_e { - ACMOD_IDLE, /**< Not in an utterance. */ - ACMOD_STARTED, /**< Utterance started, no data yet. */ - ACMOD_PROCESSING, /**< Utterance in progress. */ - ACMOD_ENDED /**< Utterance ended, still buffering. */ -} acmod_state_t; - -/** - * Dummy senone score value for unintentionally active states. - */ -#define SENSCR_DUMMY 0x7fff - -/** - * Feature space linear transform structure. - */ -struct ps_mllr_s { - int refcnt; /**< Reference count. */ - int n_class; /**< Number of MLLR classes. */ - int n_feat; /**< Number of feature streams. */ - int *veclen; /**< Length of input vectors for each stream. */ - float32 ****A; /**< Rotation part of mean transformations. */ - float32 ***b; /**< Bias part of mean transformations. */ - float32 ***h; /**< Diagonal transformation of variances. */ - int32 *cb2mllr; /**< Mapping from codebooks to transformations. */ -}; - -/** - * Acoustic model parameter structure. - */ -typedef struct ps_mgau_s ps_mgau_t; - -typedef struct ps_mgaufuncs_s { - char const *name; - - int (*frame_eval)(ps_mgau_t *mgau, - int16 *senscr, - uint8 *senone_active, - int32 n_senone_active, - mfcc_t ** feat, - int32 frame, - int32 compallsen); - int (*transform)(ps_mgau_t *mgau, - ps_mllr_t *mllr); - void (*free)(ps_mgau_t *mgau); -} ps_mgaufuncs_t; - -struct ps_mgau_s { - ps_mgaufuncs_t *vt; /**< vtable of mgau functions. */ - int frame_idx; /**< frame counter. */ -}; - -#define ps_mgau_base(mg) ((ps_mgau_t *)(mg)) -#define ps_mgau_frame_eval(mg,senscr,senone_active,n_senone_active,feat,frame,compallsen) \ - (*ps_mgau_base(mg)->vt->frame_eval) \ - (mg, senscr, senone_active, n_senone_active, feat, frame, compallsen) -#define ps_mgau_transform(mg, mllr) \ - (*ps_mgau_base(mg)->vt->transform)(mg, mllr) -#define ps_mgau_free(mg) \ - (*ps_mgau_base(mg)->vt->free)(mg) - -/** - * Acoustic model structure. - * - * This object encapsulates all stages of acoustic processing, from - * raw audio input to acoustic score output. The reason for grouping - * all of these modules together is that they all have to "agree" in - * their parameterizations, and the configuration of the acoustic and - * dynamic feature computation is completely dependent on the - * parameters used to build the original acoustic model (which should - * by now always be specified in a feat.params file). - * - * Because there is not a one-to-one correspondence from blocks of - * input audio or frames of input features to frames of acoustic - * scores (due to dynamic feature calculation), results may not be - * immediately available after input, and the output results will not - * correspond to the last piece of data input. - * - * TODO: In addition, this structure serves the purpose of queueing - * frames of features (and potentially also scores in the future) for - * asynchronous passes of recognition operating in parallel. - */ -struct acmod_s { - /* Global objects, not retained. */ - cmd_ln_t *config; /**< Configuration. */ - logmath_t *lmath; /**< Log-math computation. */ - glist_t strings; /**< Temporary acoustic model filenames. */ - - /* Feature computation: */ - fe_t *fe; /**< Acoustic feature computation. */ - feat_t *fcb; /**< Dynamic feature computation. */ - - /* Model parameters: */ - bin_mdef_t *mdef; /**< Model definition. */ - tmat_t *tmat; /**< Transition matrices. */ - ps_mgau_t *mgau; /**< Model parameters. */ - ps_mllr_t *mllr; /**< Speaker transformation. */ - - /* Senone scoring: */ - int16 *senone_scores; /**< GMM scores for current frame. */ - bitvec_t *senone_active_vec; /**< Active GMMs in current frame. */ - uint8 *senone_active; /**< Array of deltas to active GMMs. */ - int senscr_frame; /**< Frame index for senone_scores. */ - int n_senone_active; /**< Number of active GMMs. */ - int log_zero; /**< Zero log-probability value. */ - - /* Utterance processing: */ - mfcc_t **mfc_buf; /**< Temporary buffer of acoustic features. */ - mfcc_t ***feat_buf; /**< Temporary buffer of dynamic features. */ - FILE *rawfh; /**< File for writing raw audio data. */ - FILE *mfcfh; /**< File for writing acoustic feature data. */ - FILE *senfh; /**< File for writing senone score data. */ - FILE *insenfh; /**< Input senone score file. */ - long *framepos; /**< File positions of recent frames in senone file. */ - - /* Rawdata collected during decoding */ - int16 *rawdata; - int32 rawdata_size; - int32 rawdata_pos; - - /* A whole bunch of flags and counters: */ - uint8 state; /**< State of utterance processing. */ - uint8 compallsen; /**< Compute all senones? */ - uint8 grow_feat; /**< Whether to grow feat_buf. */ - uint8 insen_swap; /**< Whether to swap input senone score. */ - - frame_idx_t utt_start_frame; /**< Index of the utterance start in the stream, all timings are relative to that. */ - - frame_idx_t output_frame; /**< Index of next frame of dynamic features. */ - frame_idx_t n_mfc_alloc; /**< Number of frames allocated in mfc_buf */ - frame_idx_t n_mfc_frame; /**< Number of frames active in mfc_buf */ - frame_idx_t mfc_outidx; /**< Start of active frames in mfc_buf */ - frame_idx_t n_feat_alloc; /**< Number of frames allocated in feat_buf */ - frame_idx_t n_feat_frame; /**< Number of frames active in feat_buf */ - frame_idx_t feat_outidx; /**< Start of active frames in feat_buf */ -}; -typedef struct acmod_s acmod_t; - -/** - * Initialize an acoustic model. - * - * @param config a command-line object containing parameters. This - * pointer is not retained by this object. - * @param lmath global log-math parameters. - * @param fe a previously-initialized acoustic feature module to use, - * or NULL to create one automatically. If this is supplied - * and its parameters do not match those in the acoustic - * model, this function will fail. This pointer is not retained. - * @param fe a previously-initialized dynamic feature module to use, - * or NULL to create one automatically. If this is supplied - * and its parameters do not match those in the acoustic - * model, this function will fail. This pointer is not retained. - * @return a newly initialized acmod_t, or NULL on failure. - */ -acmod_t *acmod_init(cmd_ln_t *config, logmath_t *lmath, fe_t *fe, feat_t *fcb); - -/** - * Adapt acoustic model using a linear transform. - * - * @param mllr The new transform to use, or NULL to update the existing - * transform. The decoder retains ownership of this pointer, - * so you should not attempt to free it manually. Use - * ps_mllr_retain() if you wish to reuse it - * elsewhere. - * @return The updated transform object for this decoder, or - * NULL on failure. - */ -ps_mllr_t *acmod_update_mllr(acmod_t *acmod, ps_mllr_t *mllr); - -/** - * Start logging senone scores to a filehandle. - * - * @param acmod Acoustic model object. - * @param logfh Filehandle to log to. - * @return 0 for success, <0 on error. - */ -int acmod_set_senfh(acmod_t *acmod, FILE *senfh); - -/** - * Start logging MFCCs to a filehandle. - * - * @param acmod Acoustic model object. - * @param logfh Filehandle to log to. - * @return 0 for success, <0 on error. - */ -int acmod_set_mfcfh(acmod_t *acmod, FILE *logfh); - -/** - * Start logging raw audio to a filehandle. - * - * @param acmod Acoustic model object. - * @param logfh Filehandle to log to. - * @return 0 for success, <0 on error. - */ -int acmod_set_rawfh(acmod_t *acmod, FILE *logfh); - -/** - * Finalize an acoustic model. - */ -void acmod_free(acmod_t *acmod); - -/** - * Mark the start of an utterance. - */ -int acmod_start_utt(acmod_t *acmod); - -/** - * Mark the end of an utterance. - */ -int acmod_end_utt(acmod_t *acmod); - -/** - * Rewind the current utterance, allowing it to be rescored. - * - * After calling this function, the internal frame index is reset, and - * acmod_score() will return scores starting at the first frame of the - * current utterance. Currently, acmod_set_grow() must have been - * called to enable growing the feature buffer in order for this to - * work. In the future, senone scores may be cached instead. - * - * @return 0 for success, <0 for failure (if the utterance can't be - * rewound due to no feature or score data available) - */ -int acmod_rewind(acmod_t *acmod); - -/** - * Advance the frame index. - * - * This function moves to the next frame of input data. Subsequent - * calls to acmod_score() will return scores for that frame, until the - * next call to acmod_advance(). - * - * @return New frame index. - */ -int acmod_advance(acmod_t *acmod); - -/** - * Set memory allocation policy for utterance processing. - * - * @param grow_feat If non-zero, the internal dynamic feature buffer - * will expand as necessary to encompass any amount of data fed to the - * model. - * @return previous allocation policy. - */ -int acmod_set_grow(acmod_t *acmod, int grow_feat); - -/** - * TODO: Set queue length for utterance processing. - * - * This function allows multiple concurrent passes of search to - * operate on different parts of the utterance. - */ - -/** - * Feed raw audio data to the acoustic model for scoring. - * - * @param inout_raw In: Pointer to buffer of raw samples - * Out: Pointer to next sample to be read - * @param inout_n_samps In: Number of samples available - * Out: Number of samples remaining - * @param full_utt If non-zero, this block represents a full - * utterance and should be processed as such. - * @return Number of frames of data processed. - */ -int acmod_process_raw(acmod_t *acmod, - int16 const **inout_raw, - size_t *inout_n_samps, - int full_utt); - -/** - * Feed acoustic feature data into the acoustic model for scoring. - * - * @param inout_cep In: Pointer to buffer of features - * Out: Pointer to next frame to be read - * @param inout_n_frames In: Number of frames available - * Out: Number of frames remaining - * @param full_utt If non-zero, this block represents a full - * utterance and should be processed as such. - * @return Number of frames of data processed. - */ -int acmod_process_cep(acmod_t *acmod, - mfcc_t ***inout_cep, - int *inout_n_frames, - int full_utt); - -/** - * Feed dynamic feature data into the acoustic model for scoring. - * - * Unlike acmod_process_raw() and acmod_process_cep(), this function - * accepts a single frame at a time. This is because there is no need - * to do buffering when using dynamic features as input. However, if - * the dynamic feature buffer is full, this function will fail, so you - * should either always check the return value, or always pair a call - * to it with a call to acmod_score(). - * - * @param feat Pointer to one frame of dynamic features. - * @return Number of frames processed (either 0 or 1). - */ -int acmod_process_feat(acmod_t *acmod, - mfcc_t **feat); - -/** - * Set up a senone score dump file for input. - * - * @param insenfh File handle of dump file - * @return 0 for success, <0 for failure - */ -int acmod_set_insenfh(acmod_t *acmod, FILE *insenfh); - -/** - * Read one frame of scores from senone score dump file. - * - * @return Number of frames read or <0 on error. - */ -int acmod_read_scores(acmod_t *acmod); - -/** - * Get a frame of dynamic feature data. - * - * @param inout_frame_idx Input: frame index to get, or NULL - * to obtain features for the most recent frame. - * Output: frame index corresponding to this - * set of features. - * @return Feature array, or NULL if requested frame is not available. - */ -mfcc_t **acmod_get_frame(acmod_t *acmod, int *inout_frame_idx); - -/** - * Score one frame of data. - * - * @param inout_frame_idx Input: frame index to score, or NULL - * to obtain scores for the most recent frame. - * Output: frame index corresponding to this - * set of scores. - * @return Array of senone scores for this frame, or NULL if no frame - * is available for scoring (such as if a frame index is - * requested that is not yet or no longer available). The - * data pointed to persists only until the next call to - * acmod_score() or acmod_advance(). - */ -int16 const *acmod_score(acmod_t *acmod, - int *inout_frame_idx); - -/** - * Write senone dump file header. - */ -int acmod_write_senfh_header(acmod_t *acmod, FILE *logfh); - -/** - * Write a frame of senone scores to a dump file. - */ -int acmod_write_scores(acmod_t *acmod, int n_active, uint8 const *active, - int16 const *senscr, FILE *senfh); - - -/** - * Get best score and senone index for current frame. - */ -int acmod_best_score(acmod_t *acmod, int *out_best_senid); - -/** - * Clear set of active senones. - */ -void acmod_clear_active(acmod_t *acmod); - -/** - * Activate senones associated with an HMM. - */ -void acmod_activate_hmm(acmod_t *acmod, hmm_t *hmm); - -/** - * Activate a single senone. - */ -#define acmod_activate_sen(acmod, sen) bitvec_set((acmod)->senone_active_vec, sen) - -/** - * Build active list from - */ -int32 acmod_flags2list(acmod_t *acmod); - -/** - * Get the offset of the utterance start of the current stream, helpful for stream-wide timing. - */ -int32 acmod_stream_offset(acmod_t *acmod); - -/** - * Reset the current stream - */ -void acmod_start_stream(acmod_t *acmod); - -/** - * Sets the limit of the raw audio data to store - */ -void acmod_set_rawdata_size(acmod_t *acmod, int32 size); - -/** - * Retrieves the raw data collected during utterance decoding - */ -void acmod_get_rawdata(acmod_t *acmod, int16 **buffer, int32 *size); - -#endif /* __ACMOD_H__ */ |