/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ /* ==================================================================== * Copyright (c) 2008 Carnegie Mellon University. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * This work was supported in part by funding from the Defense Advanced * Research Projects Agency and the National Science Foundation of the * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * */ /** * @file acmod.c Acoustic model structures for PocketSphinx. * @author David Huggins-Daines */ /* System headers. */ #include #include #include /* SphinxBase headers. */ #include #include #include #include #include #include #include /* Local headers. */ #include "cmdln_macro.h" #include "acmod.h" #include "s2_semi_mgau.h" #include "ptm_mgau.h" #include "ms_mgau.h" /* Feature and front-end parameters that may be in feat.params */ static const arg_t feat_defn[] = { waveform_to_cepstral_command_line_macro(), cepstral_to_feature_command_line_macro(), CMDLN_EMPTY_OPTION }; #ifndef WORDS_BIGENDIAN #define WORDS_BIGENDIAN 1 #endif static int32 acmod_process_mfcbuf(acmod_t *acmod); static int acmod_init_am(acmod_t *acmod) { char const *mdeffn, *tmatfn, *mllrfn, *hmmdir; /* Read model definition. */ if ((mdeffn = cmd_ln_str_r(acmod->config, "-mdef")) == NULL) { if ((hmmdir = cmd_ln_str_r(acmod->config, "-hmm")) == NULL) E_ERROR("Acoustic model definition is not specified either " "with -mdef option or with -hmm\n"); else E_ERROR("Folder '%s' does not contain acoustic model " "definition 'mdef'\n", hmmdir); return -1; } if ((acmod->mdef = bin_mdef_read(acmod->config, mdeffn)) == NULL) { E_ERROR("Failed to read acoustic model definition from %s\n", mdeffn); return -1; } /* Read transition matrices. */ if ((tmatfn = cmd_ln_str_r(acmod->config, "-tmat")) == NULL) { E_ERROR("No tmat file specified\n"); return -1; } acmod->tmat = tmat_init(tmatfn, acmod->lmath, cmd_ln_float32_r(acmod->config, "-tmatfloor"), TRUE); /* Read the acoustic models. */ if ((cmd_ln_str_r(acmod->config, "-mean") == NULL) || (cmd_ln_str_r(acmod->config, "-var") == NULL) || (cmd_ln_str_r(acmod->config, "-tmat") == NULL)) { E_ERROR("No mean/var/tmat files specified\n"); return -1; } if (cmd_ln_str_r(acmod->config, "-senmgau")) { E_INFO("Using general multi-stream GMM computation\n"); acmod->mgau = ms_mgau_init(acmod, acmod->lmath, acmod->mdef); if (acmod->mgau == NULL) return -1; } else { E_INFO("Attempting to use PTM computation module\n"); if ((acmod->mgau = ptm_mgau_init(acmod, acmod->mdef)) == NULL) { E_INFO("Attempting to use semi-continuous computation module\n"); if ((acmod->mgau = s2_semi_mgau_init(acmod)) == NULL) { E_INFO("Falling back to general multi-stream GMM computation\n"); acmod->mgau = ms_mgau_init(acmod, acmod->lmath, acmod->mdef); if (acmod->mgau == NULL) return -1; } } } /* If there is an MLLR transform, apply it. */ if ((mllrfn = cmd_ln_str_r(acmod->config, "-mllr"))) { ps_mllr_t *mllr = ps_mllr_read(mllrfn); if (mllr == NULL) return -1; acmod_update_mllr(acmod, mllr); } return 0; } static int acmod_init_feat(acmod_t *acmod) { acmod->fcb = feat_init(cmd_ln_str_r(acmod->config, "-feat"), cmn_type_from_str(cmd_ln_str_r(acmod->config,"-cmn")), cmd_ln_boolean_r(acmod->config, "-varnorm"), agc_type_from_str(cmd_ln_str_r(acmod->config, "-agc")), 1, cmd_ln_int32_r(acmod->config, "-ceplen")); if (acmod->fcb == NULL) return -1; if (cmd_ln_str_r(acmod->config, "-lda")) { E_INFO("Reading linear feature transformation from %s\n", cmd_ln_str_r(acmod->config, "-lda")); if (feat_read_lda(acmod->fcb, cmd_ln_str_r(acmod->config, "-lda"), cmd_ln_int32_r(acmod->config, "-ldadim")) < 0) return -1; } if (cmd_ln_str_r(acmod->config, "-svspec")) { int32 **subvecs; E_INFO("Using subvector specification %s\n", cmd_ln_str_r(acmod->config, "-svspec")); if ((subvecs = parse_subvecs(cmd_ln_str_r(acmod->config, "-svspec"))) == NULL) return -1; if ((feat_set_subvecs(acmod->fcb, subvecs)) < 0) return -1; } if (cmd_ln_exists_r(acmod->config, "-agcthresh") && 0 != strcmp(cmd_ln_str_r(acmod->config, "-agc"), "none")) { agc_set_threshold(acmod->fcb->agc_struct, cmd_ln_float32_r(acmod->config, "-agcthresh")); } if (acmod->fcb->cmn_struct && cmd_ln_exists_r(acmod->config, "-cmninit")) { char *c, *cc, *vallist; int32 nvals; vallist = ckd_salloc(cmd_ln_str_r(acmod->config, "-cmninit")); c = vallist; nvals = 0; while (nvals < acmod->fcb->cmn_struct->veclen && (cc = strchr(c, ',')) != NULL) { *cc = '\0'; acmod->fcb->cmn_struct->cmn_mean[nvals] = FLOAT2MFCC(atof_c(c)); c = cc + 1; ++nvals; } if (nvals < acmod->fcb->cmn_struct->veclen && *c != '\0') { acmod->fcb->cmn_struct->cmn_mean[nvals] = FLOAT2MFCC(atof_c(c)); } ckd_free(vallist); } return 0; } int acmod_fe_mismatch(acmod_t *acmod, fe_t *fe) { /* Output vector dimension needs to be the same. */ if (cmd_ln_int32_r(acmod->config, "-ceplen") != fe_get_output_size(fe)) { E_ERROR("Configured feature length %d doesn't match feature " "extraction output size %d\n", cmd_ln_int32_r(acmod->config, "-ceplen"), fe_get_output_size(fe)); return TRUE; } /* Feature parameters need to be the same. */ /* ... */ return FALSE; } int acmod_feat_mismatch(acmod_t *acmod, feat_t *fcb) { /* Feature type needs to be the same. */ if (0 != strcmp(cmd_ln_str_r(acmod->config, "-feat"), feat_name(fcb))) return TRUE; /* Input vector dimension needs to be the same. */ if (cmd_ln_int32_r(acmod->config, "-ceplen") != feat_cepsize(fcb)) return TRUE; /* FIXME: Need to check LDA and stuff too. */ return FALSE; } acmod_t * acmod_init(cmd_ln_t *config, logmath_t *lmath, fe_t *fe, feat_t *fcb) { acmod_t *acmod; char const *featparams; acmod = ckd_calloc(1, sizeof(*acmod)); acmod->config = cmd_ln_retain(config); acmod->lmath = lmath; acmod->state = ACMOD_IDLE; /* Look for feat.params in acoustic model dir. */ if ((featparams = cmd_ln_str_r(acmod->config, "-featparams"))) { if (NULL != cmd_ln_parse_file_r(acmod->config, feat_defn, featparams, FALSE)) E_INFO("Parsed model-specific feature parameters from %s\n", featparams); } /* Initialize feature computation. */ if (fe) { if (acmod_fe_mismatch(acmod, fe)) goto error_out; fe_retain(fe); acmod->fe = fe; } else { /* Initialize a new front end. */ acmod->fe = fe_init_auto_r(config); if (acmod->fe == NULL) goto error_out; if (acmod_fe_mismatch(acmod, acmod->fe)) goto error_out; } if (fcb) { if (acmod_feat_mismatch(acmod, fcb)) goto error_out; feat_retain(fcb); acmod->fcb = fcb; } else { /* Initialize a new fcb. */ if (acmod_init_feat(acmod) < 0) goto error_out; } /* Load acoustic model parameters. */ if (acmod_init_am(acmod) < 0) goto error_out; /* The MFCC buffer needs to be at least as large as the dynamic * feature window. */ acmod->n_mfc_alloc = acmod->fcb->window_size * 2 + 1; acmod->mfc_buf = (mfcc_t **) ckd_calloc_2d(acmod->n_mfc_alloc, acmod->fcb->cepsize, sizeof(**acmod->mfc_buf)); /* Feature buffer has to be at least as large as MFCC buffer. */ acmod->n_feat_alloc = acmod->n_mfc_alloc + cmd_ln_int32_r(config, "-pl_window"); acmod->feat_buf = feat_array_alloc(acmod->fcb, acmod->n_feat_alloc); acmod->framepos = ckd_calloc(acmod->n_feat_alloc, sizeof(*acmod->framepos)); acmod->utt_start_frame = 0; /* Senone computation stuff. */ acmod->senone_scores = ckd_calloc(bin_mdef_n_sen(acmod->mdef), sizeof(*acmod->senone_scores)); acmod->senone_active_vec = bitvec_alloc(bin_mdef_n_sen(acmod->mdef)); acmod->senone_active = ckd_calloc(bin_mdef_n_sen(acmod->mdef), sizeof(*acmod->senone_active)); acmod->log_zero = logmath_get_zero(acmod->lmath); acmod->compallsen = cmd_ln_boolean_r(config, "-compallsen"); return acmod; error_out: acmod_free(acmod); return NULL; } void acmod_free(acmod_t *acmod) { if (acmod == NULL) return; feat_free(acmod->fcb); fe_free(acmod->fe); cmd_ln_free_r(acmod->config); if (acmod->mfc_buf) ckd_free_2d((void **)acmod->mfc_buf); if (acmod->feat_buf) feat_array_free(acmod->feat_buf); if (acmod->mfcfh) fclose(acmod->mfcfh); if (acmod->rawfh) fclose(acmod->rawfh); if (acmod->senfh) fclose(acmod->senfh); ckd_free(acmod->framepos); ckd_free(acmod->senone_scores); ckd_free(acmod->senone_active_vec); ckd_free(acmod->senone_active); ckd_free(acmod->rawdata); if (acmod->mdef) bin_mdef_free(acmod->mdef); if (acmod->tmat) tmat_free(acmod->tmat); if (acmod->mgau) ps_mgau_free(acmod->mgau); if (acmod->mllr) ps_mllr_free(acmod->mllr); ckd_free(acmod); } ps_mllr_t * acmod_update_mllr(acmod_t *acmod, ps_mllr_t *mllr) { if (acmod->mllr) ps_mllr_free(acmod->mllr); acmod->mllr = mllr; ps_mgau_transform(acmod->mgau, mllr); return mllr; } int acmod_write_senfh_header(acmod_t *acmod, FILE *logfh) { char nsenstr[64], logbasestr[64]; sprintf(nsenstr, "%d", bin_mdef_n_sen(acmod->mdef)); sprintf(logbasestr, "%f", logmath_get_base(acmod->lmath)); return bio_writehdr(logfh, "version", "0.1", "mdef_file", cmd_ln_str_r(acmod->config, "-mdef"), "n_sen", nsenstr, "logbase", logbasestr, NULL); } int acmod_set_senfh(acmod_t *acmod, FILE *logfh) { if (acmod->senfh) fclose(acmod->senfh); acmod->senfh = logfh; if (logfh == NULL) return 0; return acmod_write_senfh_header(acmod, logfh); } int acmod_set_mfcfh(acmod_t *acmod, FILE *logfh) { int rv = 0; if (acmod->mfcfh) fclose(acmod->mfcfh); acmod->mfcfh = logfh; fwrite(&rv, 4, 1, acmod->mfcfh); return rv; } int acmod_set_rawfh(acmod_t *acmod, FILE *logfh) { if (acmod->rawfh) fclose(acmod->rawfh); acmod->rawfh = logfh; return 0; } void acmod_grow_feat_buf(acmod_t *acmod, int nfr) { if (nfr > MAX_N_FRAMES) E_FATAL("Decoder can not process more than %d frames at once, " "requested %d\n", MAX_N_FRAMES, nfr); acmod->feat_buf = feat_array_realloc(acmod->fcb, acmod->feat_buf, acmod->n_feat_alloc, nfr); acmod->framepos = ckd_realloc(acmod->framepos, nfr * sizeof(*acmod->framepos)); acmod->n_feat_alloc = nfr; } int acmod_set_grow(acmod_t *acmod, int grow_feat) { int tmp = acmod->grow_feat; acmod->grow_feat = grow_feat; /* Expand feat_buf to a reasonable size to start with. */ if (grow_feat && acmod->n_feat_alloc < 128) acmod_grow_feat_buf(acmod, 128); return tmp; } int acmod_start_utt(acmod_t *acmod) { fe_start_utt(acmod->fe); acmod->state = ACMOD_STARTED; acmod->n_mfc_frame = 0; acmod->n_feat_frame = 0; acmod->mfc_outidx = 0; acmod->feat_outidx = 0; acmod->output_frame = 0; acmod->senscr_frame = -1; acmod->n_senone_active = 0; acmod->mgau->frame_idx = 0; acmod->rawdata_pos = 0; return 0; } int acmod_end_utt(acmod_t *acmod) { int32 nfr = 0; acmod->state = ACMOD_ENDED; if (acmod->n_mfc_frame < acmod->n_mfc_alloc) { int inptr; /* Where to start writing them (circular buffer) */ inptr = (acmod->mfc_outidx + acmod->n_mfc_frame) % acmod->n_mfc_alloc; /* nfr is always either zero or one. */ fe_end_utt(acmod->fe, acmod->mfc_buf[inptr], &nfr); acmod->n_mfc_frame += nfr; /* Process whatever's left, and any leadout or update stats if needed. */ if (nfr) nfr = acmod_process_mfcbuf(acmod); else feat_update_stats(acmod->fcb); } if (acmod->mfcfh) { long outlen; int32 rv; outlen = (ftell(acmod->mfcfh) - 4) / 4; if (!WORDS_BIGENDIAN) SWAP_INT32(&outlen); /* Try to seek and write */ if ((rv = fseek(acmod->mfcfh, 0, SEEK_SET)) == 0) { fwrite(&outlen, 4, 1, acmod->mfcfh); } fclose(acmod->mfcfh); acmod->mfcfh = NULL; } if (acmod->rawfh) { fclose(acmod->rawfh); acmod->rawfh = NULL; } if (acmod->senfh) { fclose(acmod->senfh); acmod->senfh = NULL; } return nfr; } static int acmod_log_mfc(acmod_t *acmod, mfcc_t **cep, int n_frames) { int i, n; int32 *ptr = (int32 *)cep[0]; n = n_frames * feat_cepsize(acmod->fcb); /* Swap bytes. */ if (!WORDS_BIGENDIAN) { for (i = 0; i < (n * sizeof(mfcc_t)); ++i) { SWAP_INT32(ptr + i); } } /* Write features. */ if (fwrite(cep[0], sizeof(mfcc_t), n, acmod->mfcfh) != n) { E_ERROR_SYSTEM("Failed to write %d values to log file", n); } /* Swap them back. */ if (!WORDS_BIGENDIAN) { for (i = 0; i < (n * sizeof(mfcc_t)); ++i) { SWAP_INT32(ptr + i); } } return 0; } static int acmod_process_full_cep(acmod_t *acmod, mfcc_t ***inout_cep, int *inout_n_frames) { int32 nfr; /* Write to log file. */ if (acmod->mfcfh) acmod_log_mfc(acmod, *inout_cep, *inout_n_frames); /* Resize feat_buf to fit. */ if (acmod->n_feat_alloc < *inout_n_frames) { if (*inout_n_frames > MAX_N_FRAMES) E_FATAL("Batch processing can not process more than %d frames " "at once, requested %d\n", MAX_N_FRAMES, *inout_n_frames); feat_array_free(acmod->feat_buf); acmod->feat_buf = feat_array_alloc(acmod->fcb, *inout_n_frames); acmod->n_feat_alloc = *inout_n_frames; acmod->n_feat_frame = 0; acmod->feat_outidx = 0; } /* Make dynamic features. */ nfr = feat_s2mfc2feat_live(acmod->fcb, *inout_cep, inout_n_frames, TRUE, TRUE, acmod->feat_buf); acmod->n_feat_frame = nfr; assert(acmod->n_feat_frame <= acmod->n_feat_alloc); *inout_cep += *inout_n_frames; *inout_n_frames = 0; return nfr; } static int acmod_process_full_raw(acmod_t *acmod, int16 const **inout_raw, size_t *inout_n_samps) { int32 nfr, ntail; mfcc_t **cepptr; /* Write to logging file if any. */ if (*inout_n_samps + acmod->rawdata_pos < acmod->rawdata_size) { memcpy(acmod->rawdata + acmod->rawdata_pos, *inout_raw, *inout_n_samps * sizeof(int16)); acmod->rawdata_pos += *inout_n_samps; } if (acmod->rawfh) fwrite(*inout_raw, sizeof(int16), *inout_n_samps, acmod->rawfh); /* Resize mfc_buf to fit. */ if (fe_process_frames(acmod->fe, NULL, inout_n_samps, NULL, &nfr, NULL) < 0) return -1; if (acmod->n_mfc_alloc < nfr + 1) { ckd_free_2d(acmod->mfc_buf); acmod->mfc_buf = ckd_calloc_2d(nfr + 1, fe_get_output_size(acmod->fe), sizeof(**acmod->mfc_buf)); acmod->n_mfc_alloc = nfr + 1; } acmod->n_mfc_frame = 0; acmod->mfc_outidx = 0; fe_start_utt(acmod->fe); if (fe_process_frames(acmod->fe, inout_raw, inout_n_samps, acmod->mfc_buf, &nfr, NULL) < 0) return -1; fe_end_utt(acmod->fe, acmod->mfc_buf[nfr], &ntail); nfr += ntail; cepptr = acmod->mfc_buf; nfr = acmod_process_full_cep(acmod, &cepptr, &nfr); acmod->n_mfc_frame = 0; return nfr; } /** * Process MFCCs that are in the internal buffer into features. */ static int32 acmod_process_mfcbuf(acmod_t *acmod) { mfcc_t **mfcptr; int32 ncep; ncep = acmod->n_mfc_frame; /* Also do this in two parts because of the circular mfc_buf. */ if (acmod->mfc_outidx + ncep > acmod->n_mfc_alloc) { int32 ncep1 = acmod->n_mfc_alloc - acmod->mfc_outidx; int saved_state = acmod->state; /* Make sure we don't end the utterance here. */ if (acmod->state == ACMOD_ENDED) acmod->state = ACMOD_PROCESSING; mfcptr = acmod->mfc_buf + acmod->mfc_outidx; ncep1 = acmod_process_cep(acmod, &mfcptr, &ncep1, FALSE); /* It's possible that not all available frames were filled. */ ncep -= ncep1; acmod->n_mfc_frame -= ncep1; acmod->mfc_outidx += ncep1; acmod->mfc_outidx %= acmod->n_mfc_alloc; /* Restore original state (could this really be the end) */ acmod->state = saved_state; } mfcptr = acmod->mfc_buf + acmod->mfc_outidx; ncep = acmod_process_cep(acmod, &mfcptr, &ncep, FALSE); acmod->n_mfc_frame -= ncep; acmod->mfc_outidx += ncep; acmod->mfc_outidx %= acmod->n_mfc_alloc; return ncep; } int acmod_process_raw(acmod_t *acmod, int16 const **inout_raw, size_t *inout_n_samps, int full_utt) { int32 ncep; int32 out_frameidx; int16 const *prev_audio_inptr; /* If this is a full utterance, process it all at once. */ if (full_utt) return acmod_process_full_raw(acmod, inout_raw, inout_n_samps); /* Append MFCCs to the end of any that are previously in there * (in practice, there will probably be none) */ if (inout_n_samps && *inout_n_samps) { int inptr; int32 processed_samples; prev_audio_inptr = *inout_raw; /* Total number of frames available. */ ncep = acmod->n_mfc_alloc - acmod->n_mfc_frame; /* Where to start writing them (circular buffer) */ inptr = (acmod->mfc_outidx + acmod->n_mfc_frame) % acmod->n_mfc_alloc; /* Write them in two (or more) parts if there is wraparound. */ while (inptr + ncep > acmod->n_mfc_alloc) { int32 ncep1 = acmod->n_mfc_alloc - inptr; if (fe_process_frames(acmod->fe, inout_raw, inout_n_samps, acmod->mfc_buf + inptr, &ncep1, &out_frameidx) < 0) return -1; if (out_frameidx > 0) acmod->utt_start_frame = out_frameidx; processed_samples = *inout_raw - prev_audio_inptr; if (processed_samples + acmod->rawdata_pos < acmod->rawdata_size) { memcpy(acmod->rawdata + acmod->rawdata_pos, prev_audio_inptr, processed_samples * sizeof(int16)); acmod->rawdata_pos += processed_samples; } /* Write to logging file if any. */ if (acmod->rawfh) { fwrite(prev_audio_inptr, sizeof(int16), processed_samples, acmod->rawfh); } prev_audio_inptr = *inout_raw; /* ncep1 now contains the number of frames actually * processed. This is a good thing, but it means we * actually still might have some room left at the end of * the buffer, hence the while loop. Unfortunately it * also means that in the case where we are really * actually done, we need to get out totally, hence the * goto. */ acmod->n_mfc_frame += ncep1; ncep -= ncep1; inptr += ncep1; inptr %= acmod->n_mfc_alloc; if (ncep1 == 0) goto alldone; } assert(inptr + ncep <= acmod->n_mfc_alloc); if (fe_process_frames(acmod->fe, inout_raw, inout_n_samps, acmod->mfc_buf + inptr, &ncep, &out_frameidx) < 0) return -1; if (out_frameidx > 0) acmod->utt_start_frame = out_frameidx; processed_samples = *inout_raw - prev_audio_inptr; if (processed_samples + acmod->rawdata_pos < acmod->rawdata_size) { memcpy(acmod->rawdata + acmod->rawdata_pos, prev_audio_inptr, processed_samples * sizeof(int16)); acmod->rawdata_pos += processed_samples; } if (acmod->rawfh) { fwrite(prev_audio_inptr, sizeof(int16), processed_samples, acmod->rawfh); } prev_audio_inptr = *inout_raw; acmod->n_mfc_frame += ncep; alldone: ; } /* Hand things off to acmod_process_cep. */ return acmod_process_mfcbuf(acmod); } int acmod_process_cep(acmod_t *acmod, mfcc_t ***inout_cep, int *inout_n_frames, int full_utt) { int32 nfeat, ncep, inptr; int orig_n_frames; /* If this is a full utterance, process it all at once. */ if (full_utt) return acmod_process_full_cep(acmod, inout_cep, inout_n_frames); /* Write to log file. */ if (acmod->mfcfh) acmod_log_mfc(acmod, *inout_cep, *inout_n_frames); /* Maximum number of frames we're going to generate. */ orig_n_frames = ncep = nfeat = *inout_n_frames; /* FIXME: This behaviour isn't guaranteed... */ if (acmod->state == ACMOD_ENDED) nfeat += feat_window_size(acmod->fcb); else if (acmod->state == ACMOD_STARTED) nfeat -= feat_window_size(acmod->fcb); /* Clamp number of features to fit available space. */ if (nfeat > acmod->n_feat_alloc - acmod->n_feat_frame) { /* Grow it as needed - we have to grow it at the end of an * utterance because we can't return a short read there. */ if (acmod->grow_feat || acmod->state == ACMOD_ENDED) acmod_grow_feat_buf(acmod, acmod->n_feat_alloc + nfeat); else ncep -= (nfeat - (acmod->n_feat_alloc - acmod->n_feat_frame)); } /* Where to start writing in the feature buffer. */ if (acmod->grow_feat) { /* Grow to avoid wraparound if grow_feat == TRUE. */ inptr = acmod->feat_outidx + acmod->n_feat_frame; while (inptr + nfeat >= acmod->n_feat_alloc) acmod_grow_feat_buf(acmod, acmod->n_feat_alloc * 2); } else { inptr = (acmod->feat_outidx + acmod->n_feat_frame) % acmod->n_feat_alloc; } /* FIXME: we can't split the last frame drop properly to be on the bounary, * so just return */ if (inptr + nfeat > acmod->n_feat_alloc && acmod->state == ACMOD_ENDED) { *inout_n_frames -= ncep; *inout_cep += ncep; return 0; } /* Write them in two parts if there is wraparound. */ if (inptr + nfeat > acmod->n_feat_alloc) { int32 ncep1 = acmod->n_feat_alloc - inptr; /* Make sure we don't end the utterance here. */ nfeat = feat_s2mfc2feat_live(acmod->fcb, *inout_cep, &ncep1, (acmod->state == ACMOD_STARTED), FALSE, acmod->feat_buf + inptr); if (nfeat < 0) return -1; /* Move the output feature pointer forward. */ acmod->n_feat_frame += nfeat; assert(acmod->n_feat_frame <= acmod->n_feat_alloc); inptr += nfeat; inptr %= acmod->n_feat_alloc; /* Move the input feature pointers forward. */ *inout_n_frames -= ncep1; *inout_cep += ncep1; ncep -= ncep1; } nfeat = feat_s2mfc2feat_live(acmod->fcb, *inout_cep, &ncep, (acmod->state == ACMOD_STARTED), (acmod->state == ACMOD_ENDED), acmod->feat_buf + inptr); if (nfeat < 0) return -1; acmod->n_feat_frame += nfeat; assert(acmod->n_feat_frame <= acmod->n_feat_alloc); /* Move the input feature pointers forward. */ *inout_n_frames -= ncep; *inout_cep += ncep; if (acmod->state == ACMOD_STARTED) acmod->state = ACMOD_PROCESSING; return orig_n_frames - *inout_n_frames; } int acmod_process_feat(acmod_t *acmod, mfcc_t **feat) { int i, inptr; if (acmod->n_feat_frame == acmod->n_feat_alloc) { if (acmod->grow_feat) acmod_grow_feat_buf(acmod, acmod->n_feat_alloc * 2); else return 0; } if (acmod->grow_feat) { /* Grow to avoid wraparound if grow_feat == TRUE. */ inptr = acmod->feat_outidx + acmod->n_feat_frame; while (inptr + 1 >= acmod->n_feat_alloc) acmod_grow_feat_buf(acmod, acmod->n_feat_alloc * 2); } else { inptr = (acmod->feat_outidx + acmod->n_feat_frame) % acmod->n_feat_alloc; } for (i = 0; i < feat_dimension1(acmod->fcb); ++i) memcpy(acmod->feat_buf[inptr][i], feat[i], feat_dimension2(acmod->fcb, i) * sizeof(**feat)); ++acmod->n_feat_frame; assert(acmod->n_feat_frame <= acmod->n_feat_alloc); return 1; } static int acmod_read_senfh_header(acmod_t *acmod) { char **name, **val; int32 swap; int i; if (bio_readhdr(acmod->insenfh, &name, &val, &swap) < 0) goto error_out; for (i = 0; name[i] != NULL; ++i) { if (!strcmp(name[i], "n_sen")) { if (atoi(val[i]) != bin_mdef_n_sen(acmod->mdef)) { E_ERROR("Number of senones in senone file (%d) does not " "match mdef (%d)\n", atoi(val[i]), bin_mdef_n_sen(acmod->mdef)); goto error_out; } } if (!strcmp(name[i], "logbase")) { if (fabs(atof_c(val[i]) - logmath_get_base(acmod->lmath)) > 0.001) { E_ERROR("Logbase in senone file (%f) does not match acmod " "(%f)\n", atof_c(val[i]), logmath_get_base(acmod->lmath)); goto error_out; } } } acmod->insen_swap = swap; bio_hdrarg_free(name, val); return 0; error_out: bio_hdrarg_free(name, val); return -1; } int acmod_set_insenfh(acmod_t *acmod, FILE *senfh) { acmod->insenfh = senfh; if (senfh == NULL) { acmod->n_feat_frame = 0; acmod->compallsen = cmd_ln_boolean_r(acmod->config, "-compallsen"); return 0; } acmod->compallsen = TRUE; return acmod_read_senfh_header(acmod); } int acmod_rewind(acmod_t *acmod) { /* If the feature buffer is circular, this is not possible. */ if (acmod->output_frame > acmod->n_feat_alloc) { E_ERROR("Circular feature buffer cannot be rewound (output frame %d, " "alloc %d)\n", acmod->output_frame, acmod->n_feat_alloc); return -1; } /* Frames consumed + frames available */ acmod->n_feat_frame = acmod->output_frame + acmod->n_feat_frame; /* Reset output pointers. */ acmod->feat_outidx = 0; acmod->output_frame = 0; acmod->senscr_frame = -1; acmod->mgau->frame_idx = 0; return 0; } int acmod_advance(acmod_t *acmod) { /* Advance the output pointers. */ if (++acmod->feat_outidx == acmod->n_feat_alloc) acmod->feat_outidx = 0; --acmod->n_feat_frame; ++acmod->mgau->frame_idx; return ++acmod->output_frame; } int acmod_write_scores(acmod_t *acmod, int n_active, uint8 const *active, int16 const *senscr, FILE *senfh) { int16 n_active2; /* Uncompressed frame format: * * (2 bytes) n_active: Number of active senones * If all senones active: * (n_active * 2 bytes) scores of active senones * * Otherwise: * (2 bytes) n_active: Number of active senones * (n_active bytes) deltas to active senones * (n_active * 2 bytes) scores of active senones */ n_active2 = n_active; if (fwrite(&n_active2, 2, 1, senfh) != 1) goto error_out; if (n_active == bin_mdef_n_sen(acmod->mdef)) { if (fwrite(senscr, 2, n_active, senfh) != n_active) goto error_out; } else { int i, n; if (fwrite(active, 1, n_active, senfh) != n_active) goto error_out; for (i = n = 0; i < n_active; ++i) { n += active[i]; if (fwrite(senscr + n, 2, 1, senfh) != 1) goto error_out; } } return 0; error_out: E_ERROR_SYSTEM("Failed to write frame to senone file"); return -1; } /** * Internal version, used for reading previous frames in acmod_score() */ static int acmod_read_scores_internal(acmod_t *acmod) { FILE *senfh = acmod->insenfh; int16 n_active; size_t rv; if (acmod->n_feat_frame == acmod->n_feat_alloc) { if (acmod->grow_feat) acmod_grow_feat_buf(acmod, acmod->n_feat_alloc * 2); else return 0; } if (senfh == NULL) return -1; if ((rv = fread(&n_active, 2, 1, senfh)) != 1) goto error_out; acmod->n_senone_active = n_active; if (acmod->n_senone_active == bin_mdef_n_sen(acmod->mdef)) { if ((rv = fread(acmod->senone_scores, 2, acmod->n_senone_active, senfh)) != acmod->n_senone_active) goto error_out; } else { int i, n; if ((rv = fread(acmod->senone_active, 1, acmod->n_senone_active, senfh)) != acmod->n_senone_active) goto error_out; for (i = 0, n = 0; i < acmod->n_senone_active; ++i) { int j, sen = n + acmod->senone_active[i]; for (j = n + 1; j < sen; ++j) acmod->senone_scores[j] = SENSCR_DUMMY; if ((rv = fread(acmod->senone_scores + sen, 2, 1, senfh)) != 1) goto error_out; n = sen; } n++; while (n < bin_mdef_n_sen(acmod->mdef)) acmod->senone_scores[n++] = SENSCR_DUMMY; } return 1; error_out: if (ferror(senfh)) { E_ERROR_SYSTEM("Failed to read frame from senone file"); return -1; } return 0; } int acmod_read_scores(acmod_t *acmod) { int inptr, rv; if (acmod->grow_feat) { /* Grow to avoid wraparound if grow_feat == TRUE. */ inptr = acmod->feat_outidx + acmod->n_feat_frame; /* Has to be +1, otherwise, next time acmod_advance() is * called, this will wrap around. */ while (inptr + 1 >= acmod->n_feat_alloc) acmod_grow_feat_buf(acmod, acmod->n_feat_alloc * 2); } else { inptr = (acmod->feat_outidx + acmod->n_feat_frame) % acmod->n_feat_alloc; } if ((rv = acmod_read_scores_internal(acmod)) != 1) return rv; /* Set acmod->senscr_frame appropriately so that these scores get reused below in acmod_score(). */ acmod->senscr_frame = acmod->output_frame + acmod->n_feat_frame; E_DEBUG(1,("Frame %d has %d active states\n", acmod->senscr_frame, acmod->n_senone_active)); /* Increment the "feature frame counter" and record the file * position for the relevant frame in the (possibly circular) * buffer. */ ++acmod->n_feat_frame; acmod->framepos[inptr] = ftell(acmod->insenfh); return 1; } static int calc_frame_idx(acmod_t *acmod, int *inout_frame_idx) { int frame_idx; /* Calculate the absolute frame index to be scored. */ if (inout_frame_idx == NULL) frame_idx = acmod->output_frame; else if (*inout_frame_idx < 0) frame_idx = acmod->output_frame + 1 + *inout_frame_idx; else frame_idx = *inout_frame_idx; return frame_idx; } static int calc_feat_idx(acmod_t *acmod, int frame_idx) { int n_backfr, feat_idx; n_backfr = acmod->n_feat_alloc - acmod->n_feat_frame; if (frame_idx < 0 || acmod->output_frame - frame_idx > n_backfr) { E_ERROR("Frame %d outside queue of %d frames, %d alloc (%d > %d), " "cannot score\n", frame_idx, acmod->n_feat_frame, acmod->n_feat_alloc, acmod->output_frame - frame_idx, n_backfr); return -1; } /* Get the index in feat_buf/framepos of the frame to be scored. */ feat_idx = (acmod->feat_outidx + frame_idx - acmod->output_frame) % acmod->n_feat_alloc; if (feat_idx < 0) feat_idx += acmod->n_feat_alloc; return feat_idx; } mfcc_t ** acmod_get_frame(acmod_t *acmod, int *inout_frame_idx) { int frame_idx, feat_idx; /* Calculate the absolute frame index requested. */ frame_idx = calc_frame_idx(acmod, inout_frame_idx); /* Calculate position of requested frame in circular buffer. */ if ((feat_idx = calc_feat_idx(acmod, frame_idx)) < 0) return NULL; if (inout_frame_idx) *inout_frame_idx = frame_idx; return acmod->feat_buf[feat_idx]; } int16 const * acmod_score(acmod_t *acmod, int *inout_frame_idx) { int frame_idx, feat_idx; /* Calculate the absolute frame index to be scored. */ frame_idx = calc_frame_idx(acmod, inout_frame_idx); /* If all senones are being computed, or we are using a senone file, then we can reuse existing scores. */ if ((acmod->compallsen || acmod->insenfh) && frame_idx == acmod->senscr_frame) { if (inout_frame_idx) *inout_frame_idx = frame_idx; return acmod->senone_scores; } /* Calculate position of requested frame in circular buffer. */ if ((feat_idx = calc_feat_idx(acmod, frame_idx)) < 0) return NULL; /* * If there is an input senone file locate the appropriate frame and read * it. */ if (acmod->insenfh) { fseek(acmod->insenfh, acmod->framepos[feat_idx], SEEK_SET); if (acmod_read_scores_internal(acmod) < 0) return NULL; } else { /* Build active senone list. */ acmod_flags2list(acmod); /* Generate scores for the next available frame */ ps_mgau_frame_eval(acmod->mgau, acmod->senone_scores, acmod->senone_active, acmod->n_senone_active, acmod->feat_buf[feat_idx], frame_idx, acmod->compallsen); } if (inout_frame_idx) *inout_frame_idx = frame_idx; acmod->senscr_frame = frame_idx; /* Dump scores to the senone dump file if one exists. */ if (acmod->senfh) { if (acmod_write_scores(acmod, acmod->n_senone_active, acmod->senone_active, acmod->senone_scores, acmod->senfh) < 0) return NULL; E_DEBUG(1,("Frame %d has %d active states\n", frame_idx, acmod->n_senone_active)); } return acmod->senone_scores; } int acmod_best_score(acmod_t *acmod, int *out_best_senid) { int i, best; best = SENSCR_DUMMY; if (acmod->compallsen) { for (i = 0; i < bin_mdef_n_sen(acmod->mdef); ++i) { if (acmod->senone_scores[i] < best) { best = acmod->senone_scores[i]; *out_best_senid = i; } } } else { int16 *senscr; senscr = acmod->senone_scores; for (i = 0; i < acmod->n_senone_active; ++i) { senscr += acmod->senone_active[i]; if (*senscr < best) { best = *senscr; *out_best_senid = i; } } } return best; } void acmod_clear_active(acmod_t *acmod) { if (acmod->compallsen) return; bitvec_clear_all(acmod->senone_active_vec, bin_mdef_n_sen(acmod->mdef)); acmod->n_senone_active = 0; } #define MPX_BITVEC_SET(a,h,i) \ if (hmm_mpx_ssid(h,i) != BAD_SSID) \ bitvec_set((a)->senone_active_vec, hmm_mpx_senid(h,i)) #define NONMPX_BITVEC_SET(a,h,i) \ bitvec_set((a)->senone_active_vec, \ hmm_nonmpx_senid(h,i)) void acmod_activate_hmm(acmod_t *acmod, hmm_t *hmm) { int i; if (acmod->compallsen) return; if (hmm_is_mpx(hmm)) { switch (hmm_n_emit_state(hmm)) { case 5: MPX_BITVEC_SET(acmod, hmm, 4); MPX_BITVEC_SET(acmod, hmm, 3); case 3: MPX_BITVEC_SET(acmod, hmm, 2); MPX_BITVEC_SET(acmod, hmm, 1); MPX_BITVEC_SET(acmod, hmm, 0); break; default: for (i = 0; i < hmm_n_emit_state(hmm); ++i) { MPX_BITVEC_SET(acmod, hmm, i); } } } else { switch (hmm_n_emit_state(hmm)) { case 5: NONMPX_BITVEC_SET(acmod, hmm, 4); NONMPX_BITVEC_SET(acmod, hmm, 3); case 3: NONMPX_BITVEC_SET(acmod, hmm, 2); NONMPX_BITVEC_SET(acmod, hmm, 1); NONMPX_BITVEC_SET(acmod, hmm, 0); break; default: for (i = 0; i < hmm_n_emit_state(hmm); ++i) { NONMPX_BITVEC_SET(acmod, hmm, i); } } } } int32 acmod_flags2list(acmod_t *acmod) { int32 w, l, n, b, total_dists, total_words, extra_bits; bitvec_t *flagptr; total_dists = bin_mdef_n_sen(acmod->mdef); if (acmod->compallsen) { acmod->n_senone_active = total_dists; return total_dists; } total_words = total_dists / BITVEC_BITS; extra_bits = total_dists % BITVEC_BITS; w = n = l = 0; for (flagptr = acmod->senone_active_vec; w < total_words; ++w, ++flagptr) { if (*flagptr == 0) continue; for (b = 0; b < BITVEC_BITS; ++b) { if (*flagptr & (1UL << b)) { int32 sen = w * BITVEC_BITS + b; int32 delta = sen - l; /* Handle excessive deltas "lossily" by adding a few extra senones to bridge the gap. */ while (delta > 255) { acmod->senone_active[n++] = 255; delta -= 255; } acmod->senone_active[n++] = delta; l = sen; } } } for (b = 0; b < extra_bits; ++b) { if (*flagptr & (1UL << b)) { int32 sen = w * BITVEC_BITS + b; int32 delta = sen - l; /* Handle excessive deltas "lossily" by adding a few extra senones to bridge the gap. */ while (delta > 255) { acmod->senone_active[n++] = 255; delta -= 255; } acmod->senone_active[n++] = delta; l = sen; } } acmod->n_senone_active = n; E_DEBUG(1, ("acmod_flags2list: %d active in frame %d\n", acmod->n_senone_active, acmod->output_frame)); return n; } int32 acmod_stream_offset(acmod_t *acmod) { return acmod->utt_start_frame; } void acmod_start_stream(acmod_t *acmod) { fe_start_stream(acmod->fe); acmod->utt_start_frame = 0; } void acmod_set_rawdata_size(acmod_t *acmod, int32 size) { assert(size >= 0); acmod->rawdata_size = size; if (acmod->rawdata_size > 0) { ckd_free(acmod->rawdata); acmod->rawdata = ckd_calloc(size, sizeof(int16)); } } void acmod_get_rawdata(acmod_t *acmod, int16 **buffer, int32 *size) { if (buffer) { *buffer = acmod->rawdata; } if (size) { *size = acmod->rawdata_pos; } }