/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ /* ==================================================================== * Copyright (c) 1999-2004 Carnegie Mellon University. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * This work was supported in part by funding from the Defense Advanced * Research Projects Agency and the National Science Foundation of the * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * */ /* * feat.h -- Cepstral features computation. */ #ifndef _S3_FEAT_H_ #define _S3_FEAT_H_ #include /* Win32/WinCE DLL gunk */ #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif #if 0 /* Fool Emacs. */ } #endif /** \file feat.h * \brief compute the dynamic coefficients from the cepstral vector. */ #define LIVEBUFBLOCKSIZE 256 /** Blocks of 256 vectors allocated for livemode decoder */ #define S3_MAX_FRAMES 15000 /* RAH, I believe this is still too large, but better than before */ #define cepstral_to_feature_command_line_macro() \ { "-feat", \ ARG_STRING, \ "1s_c_d_dd", \ "Feature stream type, depends on the acoustic model" }, \ { "-ceplen", \ ARG_INT32, \ "13", \ "Number of components in the input feature vector" }, \ { "-cmn", \ ARG_STRING, \ "current", \ "Cepstral mean normalization scheme ('current', 'prior', or 'none')" }, \ { "-cmninit", \ ARG_STRING, \ "8.0", \ "Initial values (comma-separated) for cepstral mean when 'prior' is used" }, \ { "-varnorm", \ ARG_BOOLEAN, \ "no", \ "Variance normalize each utterance (only if CMN == current)" }, \ { "-agc", \ ARG_STRING, \ "none", \ "Automatic gain control for c0 ('max', 'emax', 'noise', or 'none')" }, \ { "-agcthresh", \ ARG_FLOAT32, \ "2.0", \ "Initial threshold for automatic gain control" }, \ { "-lda", \ ARG_STRING, \ NULL, \ "File containing transformation matrix to be applied to features (single-stream features only)" }, \ { "-ldadim", \ ARG_INT32, \ "0", \ "Dimensionality of output of feature transformation (0 to use entire matrix)" }, \ {"-svspec", \ ARG_STRING, \ NULL, \ "Subvector specification (e.g., 24,0-11/25,12-23/26-38 or 0-12/13-25/26-38)"} /** * \struct feat_t * \brief Structure for describing a speech feature type * Structure for describing a speech feature type (no. of streams and stream widths), * as well as the computation for converting the input speech (e.g., Sphinx-II format * MFC cepstra) into this type of feature vectors. */ typedef struct feat_s { int refcount; /**< Reference count. */ char *name; /**< Printable name for this feature type */ int32 cepsize; /**< Size of input speech vector (typically, a cepstrum vector) */ int32 n_stream; /**< Number of feature streams; e.g., 4 in Sphinx-II */ uint32 *stream_len; /**< Vector length of each feature stream */ int32 window_size; /**< Number of extra frames around given input frame needed to compute corresponding output feature (so total = window_size*2 + 1) */ int32 n_sv; /**< Number of subvectors */ uint32 *sv_len; /**< Vector length of each subvector */ int32 **subvecs; /**< Subvector specification (or NULL for none) */ mfcc_t *sv_buf; /**< Temporary copy buffer for subvector projection */ int32 sv_dim; /**< Total dimensionality of subvector (length of sv_buf) */ cmn_type_t cmn; /**< Type of CMN to be performed on each utterance */ int32 varnorm; /**< Whether variance normalization is to be performed on each utt; Irrelevant if no CMN is performed */ agc_type_t agc; /**< Type of AGC to be performed on each utterance */ /** * Feature computation function. * @param fcb the feat_t describing this feature type * @param input pointer into the input cepstra * @param feat a 2-d array of output features (n_stream x stream_len) * @return 0 if successful, -ve otherwise. * * Function for converting window of input speech vector * (input[-window_size..window_size]) to output feature vector * (feat[stream][]). If NULL, no conversion available, the * speech input must be feature vector itself. **/ void (*compute_feat)(struct feat_s *fcb, mfcc_t **input, mfcc_t **feat); cmn_t *cmn_struct; /**< Structure that stores the temporary variables for cepstral means normalization*/ agc_t *agc_struct; /**< Structure that stores the temporary variables for acoustic gain control*/ mfcc_t **cepbuf; /**< Circular buffer of MFCC frames for live feature computation. */ mfcc_t **tmpcepbuf; /**< Array of pointers into cepbuf to handle border cases. */ int32 bufpos; /**< Write index in cepbuf. */ int32 curpos; /**< Read index in cepbuf. */ mfcc_t ***lda; /**< Array of linear transformations (for LDA, MLLT, or whatever) */ uint32 n_lda; /**< Number of linear transformations in lda. */ uint32 out_dim; /**< Output dimensionality */ } feat_t; /** * Name of feature type. */ #define feat_name(f) ((f)->name) /** * Input dimensionality of feature. */ #define feat_cepsize(f) ((f)->cepsize) /** * Size of dynamic feature window. */ #define feat_window_size(f) ((f)->window_size) /** * Number of feature streams. * * @deprecated Do not use this, use feat_dimension1() instead. */ #define feat_n_stream(f) ((f)->n_stream) /** * Length of feature stream i. * * @deprecated Do not use this, use feat_dimension2() instead. */ #define feat_stream_len(f,i) ((f)->stream_len[i]) /** * Number of streams or subvectors in feature output. */ #define feat_dimension1(f) ((f)->n_sv ? (f)->n_sv : f->n_stream) /** * Dimensionality of stream/subvector i in feature output. */ #define feat_dimension2(f,i) ((f)->lda ? (f)->out_dim : ((f)->sv_len ? (f)->sv_len[i] : f->stream_len[i])) /** * Total dimensionality of feature output. */ #define feat_dimension(f) ((f)->out_dim) /** * Array with stream/subvector lengths */ #define feat_stream_lengths(f) ((f)->lda ? (&(f)->out_dim) : (f)->sv_len ? (f)->sv_len : f->stream_len) /** * Parse subvector specification string. * * Format of specification: * \li '/' separated list of subvectors * \li each subvector is a ',' separated list of subranges * \li each subrange is a single \verbatim \endverbatim or * \verbatim - \endverbatim (inclusive), where * \verbatim \endverbatim is a feature vector dimension * specifier. * * E.g., "24,0-11/25,12-23/26,27-38" has: * \li 3 subvectors * \li the 1st subvector has feature dims: 24, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, and 11. * \li etc. * * @param str subvector specification string. * @return allocated 2-D array of subvector specs (free with * subvecs_free()). If there are N subvectors specified, subvec[N] = * NULL; and each subvec[0]..subvec[N-1] is -1 terminated vector of * feature dims. */ SPHINXBASE_EXPORT int32 **parse_subvecs(char const *str); /** * Free array of subvector specs. */ SPHINXBASE_EXPORT void subvecs_free(int32 **subvecs); /** * Allocate an array to hold several frames worth of feature vectors. The returned value * is the mfcc_t ***data array, organized as follows: * * - data[0][0] = frame 0 stream 0 vector, data[0][1] = frame 0 stream 1 vector, ... * - data[1][0] = frame 1 stream 0 vector, data[0][1] = frame 1 stream 1 vector, ... * - data[2][0] = frame 2 stream 0 vector, data[0][1] = frame 2 stream 1 vector, ... * - ... * * NOTE: For I/O convenience, the entire data area is allocated as one contiguous block. * @return pointer to the allocated space if successful, NULL if any error. */ SPHINXBASE_EXPORT mfcc_t ***feat_array_alloc(feat_t *fcb, /**< In: Descriptor from feat_init(), used to obtain number of streams and stream sizes */ int32 nfr /**< In: Number of frames for which to allocate */ ); /** * Realloate the array of features. Requires us to know the old size */ SPHINXBASE_EXPORT mfcc_t ***feat_array_realloc(feat_t *fcb, /**< In: Descriptor from feat_init(), used to obtain number of streams and stream sizes */ mfcc_t ***old_feat, /**< Feature array. Freed */ int32 ofr, /**< In: Previous number of frames */ int32 nfr /**< In: Number of frames for which to allocate */ ); /** * Free a buffer allocated with feat_array_alloc() */ SPHINXBASE_EXPORT void feat_array_free(mfcc_t ***feat); /** * Initialize feature module to use the selected type of feature stream. * One-time only initialization at the beginning of the program. Input type * is a string defining the kind of input->feature conversion desired: * * - "s2_4x": s2mfc->Sphinx-II 4-feature stream, * - "1s_c_d_dd": s2mfc->Sphinx 3.x single feature stream, * - "s3_1x39": s2mfc->Sphinx 3.0 single feature stream, * - "n1,n2,n3,...": Explicit feature vector layout spec. with comma-separated * feature stream lengths. In this case, the input data is already in the * feature format and there is no conversion necessary. * * @return (feat_t *) descriptor if successful, NULL if error. Caller * must not directly modify the contents of the returned value. */ SPHINXBASE_EXPORT feat_t *feat_init(char const *type,/**< In: Type of feature stream */ cmn_type_t cmn, /**< In: Type of cepstram mean normalization to be done before feature computation; can be CMN_NONE (for none) */ int32 varnorm, /**< In: (boolean) Whether variance normalization done on each utt; only applicable if CMN also done */ agc_type_t agc, /**< In: Type of automatic gain control to be done before feature computation */ int32 breport, /**< In: Whether to show a report for feat_t */ int32 cepsize /**< Number of components in the input vector (or 0 for the default for this feature type, which is usually 13) */ ); /** * Add an LDA transformation to the feature module from a file. * @return 0 for success or -1 if reading the LDA file failed. **/ SPHINXBASE_EXPORT int32 feat_read_lda(feat_t *feat, /**< In: Descriptor from feat_init() */ const char *ldafile, /**< In: File to read the LDA matrix from. */ int32 dim /**< In: Dimensionality of LDA output. */ ); /** * Transform a block of features using the feature module's LDA transform. **/ SPHINXBASE_EXPORT void feat_lda_transform(feat_t *fcb, /**< In: Descriptor from feat_init() */ mfcc_t ***inout_feat, /**< Feature block to transform. */ uint32 nfr /**< In: Number of frames in inout_feat. */ ); /** * Add a subvector specification to the feature module. * * The subvector splitting will be performed after dynamic feature * computation, CMN, AGC, and any LDA transformation. The number of * streams in the dynamic feature type must be one, as with LDA. * * After adding a subvector specification, the output of feature * computation will be split into multiple subvectors, and * feat_array_alloc() will allocate pointers accordingly. The number * of streams will remain the * * @param fcb the feature descriptor. * @param subvecs subvector specification. This pointer is retained * by the feat_t and should not be freed manually. * @return 0 for success or -1 if the subvector specification was * invalid. */ SPHINXBASE_EXPORT int feat_set_subvecs(feat_t *fcb, int32 **subvecs); /** * Print the given block of feature vectors to the given FILE. */ SPHINXBASE_EXPORT void feat_print(feat_t *fcb, /**< In: Descriptor from feat_init() */ mfcc_t ***feat, /**< In: Feature data to be printed */ int32 nfr, /**< In: Number of frames of feature data above */ FILE *fp /**< In: Output file pointer */ ); /** * Read a specified MFC file (or given segment within it), perform * CMN/AGC as indicated by fcb, and compute feature * vectors. Feature vectors are computed for the entire segment * specified, by including additional surrounding or padding frames to * accommodate the feature windows. * * @return Number of frames of feature vectors computed if successful; * -1 if any error. If feat is NULL, then no actual * computation will be done, and the number of frames which must be * allocated will be returned. * * A note on how the file path is constructed: If the control file * already specifies extension or absolute path, then these are not * applied. The default extension is defined by the application. */ SPHINXBASE_EXPORT int32 feat_s2mfc2feat(feat_t *fcb, /**< In: Descriptor from feat_init() */ const char *file, /**< In: File to be read */ const char *dir, /**< In: Directory prefix for file, if needed; can be NULL */ const char *cepext,/**< In: Extension of the cepstrum file.It cannot be NULL */ int32 sf, int32 ef, /* Start/End frames within file to be read. Use 0,-1 to process entire file */ mfcc_t ***feat, /**< Out: Computed feature vectors; caller must allocate this space */ int32 maxfr /**< In: Available space (number of frames) in above feat array; it must be sufficient to hold the result. Pass -1 for no limit. */ ); /** * Feature computation routine for live mode decoder. * * This function computes features for blocks of incoming data. It * retains an internal buffer for computing deltas, which means that * the number of output frames will not necessarily equal the number * of input frames. * * It is very important to realize that the number of * output frames can be greater than the number of * input frames, specifically when endutt is true. It is * guaranteed to never exceed *inout_ncep + * feat_window_size(fcb). You MUST have * allocated at least that many frames in ofeat, or you * will experience a buffer overflow. * * If beginutt and endutt are both true, CMN_CURRENT and AGC_MAX will * be done. Otherwise only CMN_PRIOR and AGC_EMAX will be done. * * If beginutt is false, endutt is true, and the number of input * frames exceeds the input size, then end-of-utterance processing * won't actually be done. This condition can easily be checked, * because *inout_ncep will equal the return value on * exit, and will also be smaller than the value of * *inout_ncep on entry. * * @return The number of output frames actually computed. **/ SPHINXBASE_EXPORT int32 feat_s2mfc2feat_live(feat_t *fcb, /**< In: Descriptor from feat_init() */ mfcc_t **uttcep, /**< In: Incoming cepstral buffer */ int32 *inout_ncep,/**< In: Size of incoming buffer. Out: Number of incoming frames consumed. */ int32 beginutt, /**< In: Begining of utterance flag */ int32 endutt, /**< In: End of utterance flag */ mfcc_t ***ofeat /**< In: Output feature buffer. See VERY IMPORTANT note about the size of this buffer above. */ ); /** * Update the normalization stats, possibly in the end of utterance * */ SPHINXBASE_EXPORT void feat_update_stats(feat_t *fcb); /** * Retain ownership of feat_t. * * @return pointer to retained feat_t. */ SPHINXBASE_EXPORT feat_t *feat_retain(feat_t *f); /** * Release resource associated with feat_t * * @return new reference count (0 if freed) */ SPHINXBASE_EXPORT int feat_free(feat_t *f /**< In: feat_t */ ); /** * Report the feat_t data structure */ SPHINXBASE_EXPORT void feat_report(feat_t *f /**< In: feat_t */ ); #ifdef __cplusplus } #endif #endif