summaryrefslogtreecommitdiffstats
path: root/media/pocketsphinx/src/state_align_search.c
diff options
context:
space:
mode:
Diffstat (limited to 'media/pocketsphinx/src/state_align_search.c')
-rw-r--r--media/pocketsphinx/src/state_align_search.c310
1 files changed, 310 insertions, 0 deletions
diff --git a/media/pocketsphinx/src/state_align_search.c b/media/pocketsphinx/src/state_align_search.c
new file mode 100644
index 000000000..33d8d70b5
--- /dev/null
+++ b/media/pocketsphinx/src/state_align_search.c
@@ -0,0 +1,310 @@
+/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
+/* ====================================================================
+ * Copyright (c) 2010 Carnegie Mellon University. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * This work was supported in part by funding from the Defense Advanced
+ * Research Projects Agency and the National Science Foundation of the
+ * United States of America, and the CMU Sphinx Speech Consortium.
+ *
+ * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
+ * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
+ * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ====================================================================
+ *
+ */
+
+/**
+ * @file state_align_search.c State (and phone and word) alignment search.
+ */
+
+#include "state_align_search.h"
+
+static int
+state_align_search_start(ps_search_t *search)
+{
+ state_align_search_t *sas = (state_align_search_t *)search;
+
+ /* Activate the initial state. */
+ hmm_enter(sas->hmms, 0, 0, 0);
+
+ return 0;
+}
+
+static void
+renormalize_hmms(state_align_search_t *sas, int frame_idx, int32 norm)
+{
+ int i;
+ for (i = 0; i < sas->n_phones; ++i)
+ hmm_normalize(sas->hmms + i, norm);
+}
+
+static int32
+evaluate_hmms(state_align_search_t *sas, int16 const *senscr, int frame_idx)
+{
+ int32 bs = WORST_SCORE;
+ int i;
+
+ hmm_context_set_senscore(sas->hmmctx, senscr);
+
+ for (i = 0; i < sas->n_phones; ++i) {
+ hmm_t *hmm = sas->hmms + i;
+ int32 score;
+
+ if (hmm_frame(hmm) < frame_idx)
+ continue;
+ score = hmm_vit_eval(hmm);
+ if (score BETTER_THAN bs) {
+ bs = score;
+ }
+ }
+ return bs;
+}
+
+static void
+prune_hmms(state_align_search_t *sas, int frame_idx)
+{
+ int nf = frame_idx + 1;
+ int i;
+
+ /* Check all phones to see if they remain active in the next frame. */
+ for (i = 0; i < sas->n_phones; ++i) {
+ hmm_t *hmm = sas->hmms + i;
+ if (hmm_frame(hmm) < frame_idx)
+ continue;
+ hmm_frame(hmm) = nf;
+ }
+}
+
+static void
+phone_transition(state_align_search_t *sas, int frame_idx)
+{
+ int nf = frame_idx + 1;
+ int i;
+
+ for (i = 0; i < sas->n_phones - 1; ++i) {
+ hmm_t *hmm, *nhmm;
+ int32 newphone_score;
+
+ hmm = sas->hmms + i;
+ if (hmm_frame(hmm) != nf)
+ continue;
+
+ newphone_score = hmm_out_score(hmm);
+ /* Transition into next phone using the usual Viterbi rule. */
+ nhmm = hmm + 1;
+ if (hmm_frame(nhmm) < frame_idx
+ || newphone_score BETTER_THAN hmm_in_score(nhmm)) {
+ hmm_enter(nhmm, newphone_score, hmm_out_history(hmm), nf);
+ }
+ }
+}
+
+#define TOKEN_STEP 20
+static void
+extend_tokenstack(state_align_search_t *sas, int frame_idx)
+{
+ if (frame_idx >= sas->n_fr_alloc) {
+ sas->n_fr_alloc = frame_idx + TOKEN_STEP + 1;
+ sas->tokens = ckd_realloc(sas->tokens,
+ sas->n_emit_state * sas->n_fr_alloc
+ * sizeof(*sas->tokens));
+ }
+ memset(sas->tokens + frame_idx * sas->n_emit_state, 0xff,
+ sas->n_emit_state * sizeof(*sas->tokens));
+}
+
+static void
+record_transitions(state_align_search_t *sas, int frame_idx)
+{
+ uint16 *tokens;
+ int i;
+
+ /* Push another frame of tokens on the stack. */
+ extend_tokenstack(sas, frame_idx);
+ tokens = sas->tokens + frame_idx * sas->n_emit_state;
+
+ /* Scan all active HMMs */
+ for (i = 0; i < sas->n_phones; ++i) {
+ hmm_t *hmm = sas->hmms + i;
+ int j;
+
+ if (hmm_frame(hmm) < frame_idx)
+ continue;
+ for (j = 0; j < sas->hmmctx->n_emit_state; ++j) {
+ int state_idx = i * sas->hmmctx->n_emit_state + j;
+ /* Record their backpointers on the token stack. */
+ tokens[state_idx] = hmm_history(hmm, j);
+ /* Update backpointer fields with state index. */
+ hmm_history(hmm, j) = state_idx;
+ }
+ }
+}
+
+static int
+state_align_search_step(ps_search_t *search, int frame_idx)
+{
+ state_align_search_t *sas = (state_align_search_t *)search;
+ acmod_t *acmod = ps_search_acmod(search);
+ int16 const *senscr;
+ int i;
+
+ /* Calculate senone scores. */
+ for (i = 0; i < sas->n_phones; ++i)
+ acmod_activate_hmm(acmod, sas->hmms + i);
+ senscr = acmod_score(acmod, &frame_idx);
+
+ /* Renormalize here if needed. */
+ /* FIXME: Make sure to (unit-)test this!!! */
+ if ((sas->best_score - 0x300000) WORSE_THAN WORST_SCORE) {
+ E_INFO("Renormalizing Scores at frame %d, best score %d\n",
+ frame_idx, sas->best_score);
+ renormalize_hmms(sas, frame_idx, sas->best_score);
+ }
+
+ /* Viterbi step. */
+ sas->best_score = evaluate_hmms(sas, senscr, frame_idx);
+ prune_hmms(sas, frame_idx);
+
+ /* Transition out of non-emitting states. */
+ phone_transition(sas, frame_idx);
+
+ /* Generate new tokens from best path results. */
+ record_transitions(sas, frame_idx);
+
+ /* Update frame counter */
+ sas->frame = frame_idx;
+
+ return 0;
+}
+
+static int
+state_align_search_finish(ps_search_t *search)
+{
+ state_align_search_t *sas = (state_align_search_t *)search;
+ hmm_t *final_phone = sas->hmms + sas->n_phones - 1;
+ ps_alignment_iter_t *itor;
+ ps_alignment_entry_t *ent;
+ int next_state, next_start, state, frame;
+
+ /* Best state exiting the last frame. */
+ next_state = state = hmm_out_history(final_phone);
+ if (state == 0xffff) {
+ E_ERROR("Failed to reach final state in alignment\n");
+ return -1;
+ }
+ itor = ps_alignment_states(sas->al);
+ next_start = sas->frame + 1;
+ for (frame = sas->frame - 1; frame >= 0; --frame) {
+ state = sas->tokens[frame * sas->n_emit_state + state];
+ /* State boundary, update alignment entry for next state. */
+ if (state != next_state) {
+ itor = ps_alignment_iter_goto(itor, next_state);
+ assert(itor != NULL);
+ ent = ps_alignment_iter_get(itor);
+ ent->start = frame + 1;
+ ent->duration = next_start - ent->start;
+ E_DEBUG(1,("state %d start %d end %d\n", next_state,
+ ent->start, next_start));
+ next_state = state;
+ next_start = frame + 1;
+ }
+ }
+ /* Update alignment entry for initial state. */
+ itor = ps_alignment_iter_goto(itor, 0);
+ assert(itor != NULL);
+ ent = ps_alignment_iter_get(itor);
+ ent->start = 0;
+ ent->duration = next_start;
+ E_DEBUG(1,("state %d start %d end %d\n", 0,
+ ent->start, next_start));
+ ps_alignment_iter_free(itor);
+ ps_alignment_propagate(sas->al);
+
+ return 0;
+}
+
+static int
+state_align_search_reinit(ps_search_t *search, dict_t *dict, dict2pid_t *d2p)
+{
+ /* This does nothing. */
+ return 0;
+}
+
+static void
+state_align_search_free(ps_search_t *search)
+{
+ state_align_search_t *sas = (state_align_search_t *)search;
+ ps_search_deinit(search);
+ ckd_free(sas->hmms);
+ ckd_free(sas->tokens);
+ hmm_context_free(sas->hmmctx);
+ ckd_free(sas);
+}
+
+static ps_searchfuncs_t state_align_search_funcs = {
+ /* name: */ "state_align",
+ /* start: */ state_align_search_start,
+ /* step: */ state_align_search_step,
+ /* finish: */ state_align_search_finish,
+ /* reinit: */ state_align_search_reinit,
+ /* free: */ state_align_search_free,
+ /* lattice: */ NULL,
+ /* hyp: */ NULL,
+ /* prob: */ NULL,
+ /* seg_iter: */ NULL,
+};
+
+ps_search_t *
+state_align_search_init(cmd_ln_t *config,
+ acmod_t *acmod,
+ ps_alignment_t *al)
+{
+ state_align_search_t *sas;
+ ps_alignment_iter_t *itor;
+ hmm_t *hmm;
+
+ sas = ckd_calloc(1, sizeof(*sas));
+ ps_search_init(ps_search_base(sas), &state_align_search_funcs,
+ config, acmod, al->d2p->dict, al->d2p);
+ sas->hmmctx = hmm_context_init(bin_mdef_n_emit_state(acmod->mdef),
+ acmod->tmat->tp, NULL, acmod->mdef->sseq);
+ if (sas->hmmctx == NULL) {
+ ckd_free(sas);
+ return NULL;
+ }
+ sas->al = al;
+
+ /* Generate HMM vector from phone level of alignment. */
+ sas->n_phones = ps_alignment_n_phones(al);
+ sas->n_emit_state = ps_alignment_n_states(al);
+ sas->hmms = ckd_calloc(sas->n_phones, sizeof(*sas->hmms));
+ for (hmm = sas->hmms, itor = ps_alignment_phones(al); itor;
+ ++hmm, itor = ps_alignment_iter_next(itor)) {
+ ps_alignment_entry_t *ent = ps_alignment_iter_get(itor);
+ hmm_init(sas->hmmctx, hmm, FALSE,
+ ent->id.pid.ssid, ent->id.pid.tmatid);
+ }
+ return ps_search_base(sas);
+}