diff options
Diffstat (limited to 'media/pocketsphinx/src/state_align_search.c')
-rw-r--r-- | media/pocketsphinx/src/state_align_search.c | 310 |
1 files changed, 310 insertions, 0 deletions
diff --git a/media/pocketsphinx/src/state_align_search.c b/media/pocketsphinx/src/state_align_search.c new file mode 100644 index 000000000..33d8d70b5 --- /dev/null +++ b/media/pocketsphinx/src/state_align_search.c @@ -0,0 +1,310 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 2010 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ + +/** + * @file state_align_search.c State (and phone and word) alignment search. + */ + +#include "state_align_search.h" + +static int +state_align_search_start(ps_search_t *search) +{ + state_align_search_t *sas = (state_align_search_t *)search; + + /* Activate the initial state. */ + hmm_enter(sas->hmms, 0, 0, 0); + + return 0; +} + +static void +renormalize_hmms(state_align_search_t *sas, int frame_idx, int32 norm) +{ + int i; + for (i = 0; i < sas->n_phones; ++i) + hmm_normalize(sas->hmms + i, norm); +} + +static int32 +evaluate_hmms(state_align_search_t *sas, int16 const *senscr, int frame_idx) +{ + int32 bs = WORST_SCORE; + int i; + + hmm_context_set_senscore(sas->hmmctx, senscr); + + for (i = 0; i < sas->n_phones; ++i) { + hmm_t *hmm = sas->hmms + i; + int32 score; + + if (hmm_frame(hmm) < frame_idx) + continue; + score = hmm_vit_eval(hmm); + if (score BETTER_THAN bs) { + bs = score; + } + } + return bs; +} + +static void +prune_hmms(state_align_search_t *sas, int frame_idx) +{ + int nf = frame_idx + 1; + int i; + + /* Check all phones to see if they remain active in the next frame. */ + for (i = 0; i < sas->n_phones; ++i) { + hmm_t *hmm = sas->hmms + i; + if (hmm_frame(hmm) < frame_idx) + continue; + hmm_frame(hmm) = nf; + } +} + +static void +phone_transition(state_align_search_t *sas, int frame_idx) +{ + int nf = frame_idx + 1; + int i; + + for (i = 0; i < sas->n_phones - 1; ++i) { + hmm_t *hmm, *nhmm; + int32 newphone_score; + + hmm = sas->hmms + i; + if (hmm_frame(hmm) != nf) + continue; + + newphone_score = hmm_out_score(hmm); + /* Transition into next phone using the usual Viterbi rule. */ + nhmm = hmm + 1; + if (hmm_frame(nhmm) < frame_idx + || newphone_score BETTER_THAN hmm_in_score(nhmm)) { + hmm_enter(nhmm, newphone_score, hmm_out_history(hmm), nf); + } + } +} + +#define TOKEN_STEP 20 +static void +extend_tokenstack(state_align_search_t *sas, int frame_idx) +{ + if (frame_idx >= sas->n_fr_alloc) { + sas->n_fr_alloc = frame_idx + TOKEN_STEP + 1; + sas->tokens = ckd_realloc(sas->tokens, + sas->n_emit_state * sas->n_fr_alloc + * sizeof(*sas->tokens)); + } + memset(sas->tokens + frame_idx * sas->n_emit_state, 0xff, + sas->n_emit_state * sizeof(*sas->tokens)); +} + +static void +record_transitions(state_align_search_t *sas, int frame_idx) +{ + uint16 *tokens; + int i; + + /* Push another frame of tokens on the stack. */ + extend_tokenstack(sas, frame_idx); + tokens = sas->tokens + frame_idx * sas->n_emit_state; + + /* Scan all active HMMs */ + for (i = 0; i < sas->n_phones; ++i) { + hmm_t *hmm = sas->hmms + i; + int j; + + if (hmm_frame(hmm) < frame_idx) + continue; + for (j = 0; j < sas->hmmctx->n_emit_state; ++j) { + int state_idx = i * sas->hmmctx->n_emit_state + j; + /* Record their backpointers on the token stack. */ + tokens[state_idx] = hmm_history(hmm, j); + /* Update backpointer fields with state index. */ + hmm_history(hmm, j) = state_idx; + } + } +} + +static int +state_align_search_step(ps_search_t *search, int frame_idx) +{ + state_align_search_t *sas = (state_align_search_t *)search; + acmod_t *acmod = ps_search_acmod(search); + int16 const *senscr; + int i; + + /* Calculate senone scores. */ + for (i = 0; i < sas->n_phones; ++i) + acmod_activate_hmm(acmod, sas->hmms + i); + senscr = acmod_score(acmod, &frame_idx); + + /* Renormalize here if needed. */ + /* FIXME: Make sure to (unit-)test this!!! */ + if ((sas->best_score - 0x300000) WORSE_THAN WORST_SCORE) { + E_INFO("Renormalizing Scores at frame %d, best score %d\n", + frame_idx, sas->best_score); + renormalize_hmms(sas, frame_idx, sas->best_score); + } + + /* Viterbi step. */ + sas->best_score = evaluate_hmms(sas, senscr, frame_idx); + prune_hmms(sas, frame_idx); + + /* Transition out of non-emitting states. */ + phone_transition(sas, frame_idx); + + /* Generate new tokens from best path results. */ + record_transitions(sas, frame_idx); + + /* Update frame counter */ + sas->frame = frame_idx; + + return 0; +} + +static int +state_align_search_finish(ps_search_t *search) +{ + state_align_search_t *sas = (state_align_search_t *)search; + hmm_t *final_phone = sas->hmms + sas->n_phones - 1; + ps_alignment_iter_t *itor; + ps_alignment_entry_t *ent; + int next_state, next_start, state, frame; + + /* Best state exiting the last frame. */ + next_state = state = hmm_out_history(final_phone); + if (state == 0xffff) { + E_ERROR("Failed to reach final state in alignment\n"); + return -1; + } + itor = ps_alignment_states(sas->al); + next_start = sas->frame + 1; + for (frame = sas->frame - 1; frame >= 0; --frame) { + state = sas->tokens[frame * sas->n_emit_state + state]; + /* State boundary, update alignment entry for next state. */ + if (state != next_state) { + itor = ps_alignment_iter_goto(itor, next_state); + assert(itor != NULL); + ent = ps_alignment_iter_get(itor); + ent->start = frame + 1; + ent->duration = next_start - ent->start; + E_DEBUG(1,("state %d start %d end %d\n", next_state, + ent->start, next_start)); + next_state = state; + next_start = frame + 1; + } + } + /* Update alignment entry for initial state. */ + itor = ps_alignment_iter_goto(itor, 0); + assert(itor != NULL); + ent = ps_alignment_iter_get(itor); + ent->start = 0; + ent->duration = next_start; + E_DEBUG(1,("state %d start %d end %d\n", 0, + ent->start, next_start)); + ps_alignment_iter_free(itor); + ps_alignment_propagate(sas->al); + + return 0; +} + +static int +state_align_search_reinit(ps_search_t *search, dict_t *dict, dict2pid_t *d2p) +{ + /* This does nothing. */ + return 0; +} + +static void +state_align_search_free(ps_search_t *search) +{ + state_align_search_t *sas = (state_align_search_t *)search; + ps_search_deinit(search); + ckd_free(sas->hmms); + ckd_free(sas->tokens); + hmm_context_free(sas->hmmctx); + ckd_free(sas); +} + +static ps_searchfuncs_t state_align_search_funcs = { + /* name: */ "state_align", + /* start: */ state_align_search_start, + /* step: */ state_align_search_step, + /* finish: */ state_align_search_finish, + /* reinit: */ state_align_search_reinit, + /* free: */ state_align_search_free, + /* lattice: */ NULL, + /* hyp: */ NULL, + /* prob: */ NULL, + /* seg_iter: */ NULL, +}; + +ps_search_t * +state_align_search_init(cmd_ln_t *config, + acmod_t *acmod, + ps_alignment_t *al) +{ + state_align_search_t *sas; + ps_alignment_iter_t *itor; + hmm_t *hmm; + + sas = ckd_calloc(1, sizeof(*sas)); + ps_search_init(ps_search_base(sas), &state_align_search_funcs, + config, acmod, al->d2p->dict, al->d2p); + sas->hmmctx = hmm_context_init(bin_mdef_n_emit_state(acmod->mdef), + acmod->tmat->tp, NULL, acmod->mdef->sseq); + if (sas->hmmctx == NULL) { + ckd_free(sas); + return NULL; + } + sas->al = al; + + /* Generate HMM vector from phone level of alignment. */ + sas->n_phones = ps_alignment_n_phones(al); + sas->n_emit_state = ps_alignment_n_states(al); + sas->hmms = ckd_calloc(sas->n_phones, sizeof(*sas->hmms)); + for (hmm = sas->hmms, itor = ps_alignment_phones(al); itor; + ++hmm, itor = ps_alignment_iter_next(itor)) { + ps_alignment_entry_t *ent = ps_alignment_iter_get(itor); + hmm_init(sas->hmmctx, hmm, FALSE, + ent->id.pid.ssid, ent->id.pid.tmatid); + } + return ps_search_base(sas); +} |