summary refs log tree commit diff stats
path: root/media/sphinxbase/src/libsphinxbase/fe/fe_noise.c
diff options
context:
space:
mode:
author Moonchild <moonchild@palemoon.org> 2020-05-20 10:19:04 +0000
committer Moonchild <moonchild@palemoon.org> 2020-05-20 14:04:17 +0000
commit 99c2e698d2a3c56649e42d8d2133706cd8c9501e (patch)
tree 85be449d772eb57860f0f386efb4bc1e790fd498 /media/sphinxbase/src/libsphinxbase/fe/fe_noise.c
parent 15ac4021b06d549e47c9e2efc9364a9eb96bfe82 (diff)
download UXP-99c2e698d2a3c56649e42d8d2133706cd8c9501e.tar
UXP-99c2e698d2a3c56649e42d8d2133706cd8c9501e.tar.gz
UXP-99c2e698d2a3c56649e42d8d2133706cd8c9501e.tar.lz
UXP-99c2e698d2a3c56649e42d8d2133706cd8c9501e.tar.xz
UXP-99c2e698d2a3c56649e42d8d2133706cd8c9501e.zip
Issue #1538 - remove speech recognition engine
This removes speech recognition, pocketsphinx, training models and the speech automated test interface. This also re-establishes proper use of MOZ_WEBSPEECH to work for the speech API (synthesis part only) that was a broken mess before, with some synth parts being always built, some parts being built only with it enabled and recognition parts being dependent on it. I'm pretty sure it'd be totally busted if you'd ever have tried building without MOZ_WEBSPEECH before. Tested that synthesis still works as intended. This resolves #1538.
Diffstat (limited to 'media/sphinxbase/src/libsphinxbase/fe/fe_noise.c')
-rw-r--r-- media/sphinxbase/src/libsphinxbase/fe/fe_noise.c 425
1 files changed, 0 insertions, 425 deletions
diff --git a/media/sphinxbase/src/libsphinxbase/fe/fe_noise.c b/media/sphinxbase/src/libsphinxbase/fe/fe_noise.c
deleted file mode 100644
index 4fb6d21a9..000000000
--- a/media/sphinxbase/src/libsphinxbase/fe/fe_noise.c
+++ /dev/null
@@ -1,425 +0,0 @@
-/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
-/* ====================================================================
- * Copyright (c) 2013 Carnegie Mellon University. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * This work was supported in part by funding from the Defense Advanced
- * Research Projects Agency and the National Science Foundation of the
- * United States of America, and the CMU Sphinx Speech Consortium.
- *
- * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
- * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
- * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * ====================================================================
- *
- */
-
-/* This noise removal algorithm is inspired by the following papers
- * Computationally Efficient Speech Enchancement by Spectral Minina Tracking
- * by G. Doblinger
- *
- * Power-Normalized Cepstral Coefficients (PNCC) for Robust Speech Recognition
- * by C. Kim.
- *
- * For the recent research and state of art see papers about IMRCA and
- * A Minimum-Mean-Square-Error Noise Reduction Algorithm On Mel-Frequency
- * Cepstra For Robust Speech Recognition by Dong Yu and others
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include <math.h>
-
-#include "sphinxbase/prim_type.h"
-#include "sphinxbase/ckd_alloc.h"
-#include "sphinxbase/strfuncs.h"
-#include "sphinxbase/err.h"
-
-#include "fe_noise.h"
-#include "fe_internal.h"
-
-/* Noise supression constants */
-#define SMOOTH_WINDOW 4
-#define LAMBDA_POWER 0.7
-#define LAMBDA_A 0.995
-#define LAMBDA_B 0.5
-#define LAMBDA_T 0.85
-#define MU_T 0.2
-#define MAX_GAIN 20
-
-struct noise_stats_s {
- /* Smoothed power */
- powspec_t *power;
- /* Noise estimate */
- powspec_t *noise;
- /* Signal floor estimate */
- powspec_t *floor;
- /* Peak for temporal masking */
- powspec_t *peak;
-
- /* Initialize it next time */
- uint8 undefined;
- /* Number of items to process */
- uint32 num_filters;
-
- /* Precomputed constants */
- powspec_t lambda_power;
- powspec_t comp_lambda_power;
- powspec_t lambda_a;
- powspec_t comp_lambda_a;
- powspec_t lambda_b;
- powspec_t comp_lambda_b;
- powspec_t lambda_t;
- powspec_t mu_t;
- powspec_t max_gain;
- powspec_t inv_max_gain;
-
- powspec_t smooth_scaling[2 * SMOOTH_WINDOW + 3];
-};
-
-static void
-fe_lower_envelope(noise_stats_t *noise_stats, powspec_t * buf, powspec_t * floor_buf, int32 num_filt)
-{
- int i;
-
- for (i = 0; i < num_filt; i++) {
-#ifndef FIXED_POINT
- if (buf[i] >= floor_buf[i]) {
- floor_buf[i] =
- noise_stats->lambda_a * floor_buf[i] + noise_stats->comp_lambda_a * buf[i];
- }
- else {
- floor_buf[i] =
- noise_stats->lambda_b * floor_buf[i] + noise_stats->comp_lambda_b * buf[i];
- }
-#else
- if (buf[i] >= floor_buf[i]) {
- floor_buf[i] = fe_log_add(noise_stats->lambda_a + floor_buf[i],
- noise_stats->comp_lambda_a + buf[i]);
- }
- else {
- floor_buf[i] = fe_log_add(noise_stats->lambda_b + floor_buf[i],
- noise_stats->comp_lambda_b + buf[i]);
- }
-#endif
- }
-}
-
-/* temporal masking */
-static void
-fe_temp_masking(noise_stats_t *noise_stats, powspec_t * buf, powspec_t * peak, int32 num_filt)
-{
- powspec_t cur_in;
- int i;
-
- for (i = 0; i < num_filt; i++) {
- cur_in = buf[i];
-
-#ifndef FIXED_POINT
- peak[i] *= noise_stats->lambda_t;
- if (buf[i] < noise_stats->lambda_t * peak[i])
- buf[i] = peak[i] * noise_stats->mu_t;
-#else
- peak[i] += noise_stats->lambda_t;
- if (buf[i] < noise_stats->lambda_t + peak[i])
- buf[i] = peak[i] + noise_stats->mu_t;
-#endif
-
- if (cur_in > peak[i])
- peak[i] = cur_in;
- }
-}
-
-/* spectral weight smoothing */
-static void
-fe_weight_smooth(noise_stats_t *noise_stats, powspec_t * buf, powspec_t * coefs, int32 num_filt)
-{
- int i, j;
- int l1, l2;
- powspec_t coef;
-
- for (i = 0; i < num_filt; i++) {
- l1 = ((i - SMOOTH_WINDOW) > 0) ? (i - SMOOTH_WINDOW) : 0;
- l2 = ((i + SMOOTH_WINDOW) <
- (num_filt - 1)) ? (i + SMOOTH_WINDOW) : (num_filt - 1);
-
-#ifndef FIXED_POINT
- coef = 0;
- for (j = l1; j <= l2; j++) {
- coef += coefs[j];
- }
- buf[i] = buf[i] * (coef / (l2 - l1 + 1));
-#else
- coef = MIN_FIXLOG;
- for (j = l1; j <= l2; j++) {
- coef = fe_log_add(coef, coefs[j]);
- }
- buf[i] = buf[i] + coef - noise_stats->smooth_scaling[l2 - l1 + 1];
-#endif
-
- }
-}
-
-noise_stats_t *
-fe_init_noisestats(int num_filters)
-{
- int i;
- noise_stats_t *noise_stats;
-
- noise_stats = (noise_stats_t *) ckd_calloc(1, sizeof(noise_stats_t));
-
- noise_stats->power =
- (powspec_t *) ckd_calloc(num_filters, sizeof(powspec_t));
- noise_stats->noise =
- (powspec_t *) ckd_calloc(num_filters, sizeof(powspec_t));
- noise_stats->floor =
- (powspec_t *) ckd_calloc(num_filters, sizeof(powspec_t));
- noise_stats->peak =
- (powspec_t *) ckd_calloc(num_filters, sizeof(powspec_t));
-
- noise_stats->undefined = TRUE;
- noise_stats->num_filters = num_filters;
-
-#ifndef FIXED_POINT
- noise_stats->lambda_power = LAMBDA_POWER;
- noise_stats->comp_lambda_power = 1 - LAMBDA_POWER;
- noise_stats->lambda_a = LAMBDA_A;
- noise_stats->comp_lambda_a = 1 - LAMBDA_A;
- noise_stats->lambda_b = LAMBDA_B;
- noise_stats->comp_lambda_b = 1 - LAMBDA_B;
- noise_stats->lambda_t = LAMBDA_T;
- noise_stats->mu_t = MU_T;
- noise_stats->max_gain = MAX_GAIN;
- noise_stats->inv_max_gain = 1.0 / MAX_GAIN;
-
- for (i = 1; i < 2 * SMOOTH_WINDOW + 1; i++) {
- noise_stats->smooth_scaling[i] = 1.0 / i;
- }
-#else
- noise_stats->lambda_power = FLOAT2FIX(log(LAMBDA_POWER));
- noise_stats->comp_lambda_power = FLOAT2FIX(log(1 - LAMBDA_POWER));
- noise_stats->lambda_a = FLOAT2FIX(log(LAMBDA_A));
- noise_stats->comp_lambda_a = FLOAT2FIX(log(1 - LAMBDA_A));
- noise_stats->lambda_b = FLOAT2FIX(log(LAMBDA_B));
- noise_stats->comp_lambda_b = FLOAT2FIX(log(1 - LAMBDA_B));
- noise_stats->lambda_t = FLOAT2FIX(log(LAMBDA_T));
- noise_stats->mu_t = FLOAT2FIX(log(MU_T));
- noise_stats->max_gain = FLOAT2FIX(log(MAX_GAIN));
- noise_stats->inv_max_gain = FLOAT2FIX(log(1.0 / MAX_GAIN));
-
- for (i = 1; i < 2 * SMOOTH_WINDOW + 3; i++) {
- noise_stats->smooth_scaling[i] = FLOAT2FIX(log(i));
- }
-#endif
-
- return noise_stats;
-}
-
-void
-fe_reset_noisestats(noise_stats_t * noise_stats)
-{
- if (noise_stats)
- noise_stats->undefined = TRUE;
-}
-
-void
-fe_free_noisestats(noise_stats_t * noise_stats)
-{
- ckd_free(noise_stats->power);
- ckd_free(noise_stats->noise);
- ckd_free(noise_stats->floor);
- ckd_free(noise_stats->peak);
- ckd_free(noise_stats);
-}
-
-/**
- * For fixed point we are doing the computation in a fixlog domain,
- * so we have to add many processing cases.
- */
-void
-fe_track_snr(fe_t * fe, int32 *in_speech)
-{
- powspec_t *signal;
- powspec_t *gain;
- noise_stats_t *noise_stats;
- powspec_t *mfspec;
- int32 i, num_filts;
- powspec_t lrt, snr, max_signal, log_signal;
-
- if (!(fe->remove_noise || fe->remove_silence)) {
- *in_speech = TRUE;
- return;
- }
-
- noise_stats = fe->noise_stats;
- mfspec = fe->mfspec;
- num_filts = noise_stats->num_filters;
-
- signal = (powspec_t *) ckd_calloc(num_filts, sizeof(powspec_t));
-
- if (noise_stats->undefined) {
- for (i = 0; i < num_filts; i++) {
- noise_stats->power[i] = mfspec[i];
- noise_stats->noise[i] = mfspec[i];
-#ifndef FIXED_POINT
- noise_stats->floor[i] = mfspec[i] / noise_stats->max_gain;
- noise_stats->peak[i] = 0.0;
-#else
- noise_stats->floor[i] = mfspec[i] - noise_stats->max_gain;
- noise_stats->peak[i] = MIN_FIXLOG;
-#endif
- }
- noise_stats->undefined = FALSE;
- }
-
- /* Calculate smoothed power */
- for (i = 0; i < num_filts; i++) {
-#ifndef FIXED_POINT
- noise_stats->power[i] =
- noise_stats->lambda_power * noise_stats->power[i] + noise_stats->comp_lambda_power * mfspec[i];
-#else
- noise_stats->power[i] = fe_log_add(noise_stats->lambda_power + noise_stats->power[i],
- noise_stats->comp_lambda_power + mfspec[i]);
-#endif
- }
-
- /* Noise estimation and vad decision */
- fe_lower_envelope(noise_stats, noise_stats->power, noise_stats->noise, num_filts);
-
- lrt = FLOAT2FIX(0.0f);
- max_signal = FLOAT2FIX(0.0f);
- for (i = 0; i < num_filts; i++) {
-#ifndef FIXED_POINT
- signal[i] = noise_stats->power[i] - noise_stats->noise[i];
- if (signal[i] < 1.0)
- signal[i] = 1.0;
- snr = log(noise_stats->power[i] / noise_stats->noise[i]);
- log_signal = log(signal[i]);
-#else
- signal[i] = fe_log_sub(noise_stats->power[i], noise_stats->noise[i]);
- snr = noise_stats->power[i] - noise_stats->noise[i];
- log_signal = signal[i];
-#endif
- if (snr > lrt) {
- lrt = snr;
- if (log_signal > max_signal) {
- max_signal = log_signal;
- }
- }
- }
-
-#ifndef FIXED_POINT
- if (fe->remove_silence && (lrt < fe->vad_threshold || max_signal < fe->vad_threshold)) {
-#else
- if (fe->remove_silence && (lrt < FLOAT2FIX(fe->vad_threshold) || max_signal < FLOAT2FIX(fe->vad_threshold))) {
-#endif
- *in_speech = FALSE;
- } else {
- *in_speech = TRUE;
- }
-
- fe_lower_envelope(noise_stats, signal, noise_stats->floor, num_filts);
-
- fe_temp_masking(noise_stats, signal, noise_stats->peak, num_filts);
-
- if (!fe->remove_noise) {
- //no need for further calculations if noise cancellation disabled
- ckd_free(signal);
- return;
- }
-
- for (i = 0; i < num_filts; i++) {
- if (signal[i] < noise_stats->floor[i])
- signal[i] = noise_stats->floor[i];
- }
-
- gain = (powspec_t *) ckd_calloc(num_filts, sizeof(powspec_t));
-#ifndef FIXED_POINT
- for (i = 0; i < num_filts; i++) {
- if (signal[i] < noise_stats->max_gain * noise_stats->power[i])
- gain[i] = signal[i] / noise_stats->power[i];
- else
- gain[i] = noise_stats->max_gain;
- if (gain[i] < noise_stats->inv_max_gain)
- gain[i] = noise_stats->inv_max_gain;
- }
-#else
- for (i = 0; i < num_filts; i++) {
- gain[i] = signal[i] - noise_stats->power[i];
- if (gain[i] > noise_stats->max_gain)
- gain[i] = noise_stats->max_gain;
- if (gain[i] < noise_stats->inv_max_gain)
- gain[i] = noise_stats->inv_max_gain;
- }
-#endif
-
- /* Weight smoothing and time frequency normalization */
- fe_weight_smooth(noise_stats, mfspec, gain, num_filts);
-
- ckd_free(gain);
- ckd_free(signal);
-}
-
-void
-fe_vad_hangover(fe_t * fe, mfcc_t * fea, int32 is_speech)
-{
- /* track vad state and deal with cepstrum prespeech buffer */
- fe->vad_data->state_changed = 0;
- if (is_speech) {
- fe->vad_data->postspch_num = 0;
- if (!fe->vad_data->global_state) {
- fe->vad_data->prespch_num++;
- fe_prespch_write_cep(fe->vad_data->prespch_buf, fea);
- /* check for transition sil->speech */
- if (fe->vad_data->prespch_num >= fe->prespch_len) {
- fe->vad_data->prespch_num = 0;
- fe->vad_data->global_state = 1;
- /* transition silence->speech occurred */
- fe->vad_data->state_changed = 1;
- }
- }
- } else {
- fe->vad_data->prespch_num = 0;
- fe_prespch_reset_cep(fe->vad_data->prespch_buf);
- if (fe->vad_data->global_state) {
- fe->vad_data->postspch_num++;
- /* check for transition speech->sil */
- if (fe->vad_data->postspch_num >= fe->postspch_len) {
- fe->vad_data->postspch_num = 0;
- fe->vad_data->global_state = 0;
- /* transition speech->silence occurred */
- fe->vad_data->state_changed = 1;
- }
- }
- }
-
- if (fe->vad_data->store_pcm) {
- if (is_speech || fe->vad_data->global_state)
- fe_prespch_write_pcm(fe->vad_data->prespch_buf, fe->spch);
- if (!is_speech && !fe->vad_data->global_state)
- fe_prespch_reset_pcm(fe->vad_data->prespch_buf);
- }
-}