diff options
Diffstat (limited to 'media/sphinxbase/src/libsphinxbase/fe/fe_noise.c')
-rw-r--r-- | media/sphinxbase/src/libsphinxbase/fe/fe_noise.c | 425 |
1 files changed, 425 insertions, 0 deletions
diff --git a/media/sphinxbase/src/libsphinxbase/fe/fe_noise.c b/media/sphinxbase/src/libsphinxbase/fe/fe_noise.c new file mode 100644 index 000000000..4fb6d21a9 --- /dev/null +++ b/media/sphinxbase/src/libsphinxbase/fe/fe_noise.c @@ -0,0 +1,425 @@ +/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* ==================================================================== + * Copyright (c) 2013 Carnegie Mellon University. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * This work was supported in part by funding from the Defense Advanced + * Research Projects Agency and the National Science Foundation of the + * United States of America, and the CMU Sphinx Speech Consortium. + * + * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ + +/* This noise removal algorithm is inspired by the following papers + * Computationally Efficient Speech Enchancement by Spectral Minina Tracking + * by G. Doblinger + * + * Power-Normalized Cepstral Coefficients (PNCC) for Robust Speech Recognition + * by C. Kim. + * + * For the recent research and state of art see papers about IMRCA and + * A Minimum-Mean-Square-Error Noise Reduction Algorithm On Mel-Frequency + * Cepstra For Robust Speech Recognition by Dong Yu and others + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <math.h> + +#include "sphinxbase/prim_type.h" +#include "sphinxbase/ckd_alloc.h" +#include "sphinxbase/strfuncs.h" +#include "sphinxbase/err.h" + +#include "fe_noise.h" +#include "fe_internal.h" + +/* Noise supression constants */ +#define SMOOTH_WINDOW 4 +#define LAMBDA_POWER 0.7 +#define LAMBDA_A 0.995 +#define LAMBDA_B 0.5 +#define LAMBDA_T 0.85 +#define MU_T 0.2 +#define MAX_GAIN 20 + +struct noise_stats_s { + /* Smoothed power */ + powspec_t *power; + /* Noise estimate */ + powspec_t *noise; + /* Signal floor estimate */ + powspec_t *floor; + /* Peak for temporal masking */ + powspec_t *peak; + + /* Initialize it next time */ + uint8 undefined; + /* Number of items to process */ + uint32 num_filters; + + /* Precomputed constants */ + powspec_t lambda_power; + powspec_t comp_lambda_power; + powspec_t lambda_a; + powspec_t comp_lambda_a; + powspec_t lambda_b; + powspec_t comp_lambda_b; + powspec_t lambda_t; + powspec_t mu_t; + powspec_t max_gain; + powspec_t inv_max_gain; + + powspec_t smooth_scaling[2 * SMOOTH_WINDOW + 3]; +}; + +static void +fe_lower_envelope(noise_stats_t *noise_stats, powspec_t * buf, powspec_t * floor_buf, int32 num_filt) +{ + int i; + + for (i = 0; i < num_filt; i++) { +#ifndef FIXED_POINT + if (buf[i] >= floor_buf[i]) { + floor_buf[i] = + noise_stats->lambda_a * floor_buf[i] + noise_stats->comp_lambda_a * buf[i]; + } + else { + floor_buf[i] = + noise_stats->lambda_b * floor_buf[i] + noise_stats->comp_lambda_b * buf[i]; + } +#else + if (buf[i] >= floor_buf[i]) { + floor_buf[i] = fe_log_add(noise_stats->lambda_a + floor_buf[i], + noise_stats->comp_lambda_a + buf[i]); + } + else { + floor_buf[i] = fe_log_add(noise_stats->lambda_b + floor_buf[i], + noise_stats->comp_lambda_b + buf[i]); + } +#endif + } +} + +/* temporal masking */ +static void +fe_temp_masking(noise_stats_t *noise_stats, powspec_t * buf, powspec_t * peak, int32 num_filt) +{ + powspec_t cur_in; + int i; + + for (i = 0; i < num_filt; i++) { + cur_in = buf[i]; + +#ifndef FIXED_POINT + peak[i] *= noise_stats->lambda_t; + if (buf[i] < noise_stats->lambda_t * peak[i]) + buf[i] = peak[i] * noise_stats->mu_t; +#else + peak[i] += noise_stats->lambda_t; + if (buf[i] < noise_stats->lambda_t + peak[i]) + buf[i] = peak[i] + noise_stats->mu_t; +#endif + + if (cur_in > peak[i]) + peak[i] = cur_in; + } +} + +/* spectral weight smoothing */ +static void +fe_weight_smooth(noise_stats_t *noise_stats, powspec_t * buf, powspec_t * coefs, int32 num_filt) +{ + int i, j; + int l1, l2; + powspec_t coef; + + for (i = 0; i < num_filt; i++) { + l1 = ((i - SMOOTH_WINDOW) > 0) ? (i - SMOOTH_WINDOW) : 0; + l2 = ((i + SMOOTH_WINDOW) < + (num_filt - 1)) ? (i + SMOOTH_WINDOW) : (num_filt - 1); + +#ifndef FIXED_POINT + coef = 0; + for (j = l1; j <= l2; j++) { + coef += coefs[j]; + } + buf[i] = buf[i] * (coef / (l2 - l1 + 1)); +#else + coef = MIN_FIXLOG; + for (j = l1; j <= l2; j++) { + coef = fe_log_add(coef, coefs[j]); + } + buf[i] = buf[i] + coef - noise_stats->smooth_scaling[l2 - l1 + 1]; +#endif + + } +} + +noise_stats_t * +fe_init_noisestats(int num_filters) +{ + int i; + noise_stats_t *noise_stats; + + noise_stats = (noise_stats_t *) ckd_calloc(1, sizeof(noise_stats_t)); + + noise_stats->power = + (powspec_t *) ckd_calloc(num_filters, sizeof(powspec_t)); + noise_stats->noise = + (powspec_t *) ckd_calloc(num_filters, sizeof(powspec_t)); + noise_stats->floor = + (powspec_t *) ckd_calloc(num_filters, sizeof(powspec_t)); + noise_stats->peak = + (powspec_t *) ckd_calloc(num_filters, sizeof(powspec_t)); + + noise_stats->undefined = TRUE; + noise_stats->num_filters = num_filters; + +#ifndef FIXED_POINT + noise_stats->lambda_power = LAMBDA_POWER; + noise_stats->comp_lambda_power = 1 - LAMBDA_POWER; + noise_stats->lambda_a = LAMBDA_A; + noise_stats->comp_lambda_a = 1 - LAMBDA_A; + noise_stats->lambda_b = LAMBDA_B; + noise_stats->comp_lambda_b = 1 - LAMBDA_B; + noise_stats->lambda_t = LAMBDA_T; + noise_stats->mu_t = MU_T; + noise_stats->max_gain = MAX_GAIN; + noise_stats->inv_max_gain = 1.0 / MAX_GAIN; + + for (i = 1; i < 2 * SMOOTH_WINDOW + 1; i++) { + noise_stats->smooth_scaling[i] = 1.0 / i; + } +#else + noise_stats->lambda_power = FLOAT2FIX(log(LAMBDA_POWER)); + noise_stats->comp_lambda_power = FLOAT2FIX(log(1 - LAMBDA_POWER)); + noise_stats->lambda_a = FLOAT2FIX(log(LAMBDA_A)); + noise_stats->comp_lambda_a = FLOAT2FIX(log(1 - LAMBDA_A)); + noise_stats->lambda_b = FLOAT2FIX(log(LAMBDA_B)); + noise_stats->comp_lambda_b = FLOAT2FIX(log(1 - LAMBDA_B)); + noise_stats->lambda_t = FLOAT2FIX(log(LAMBDA_T)); + noise_stats->mu_t = FLOAT2FIX(log(MU_T)); + noise_stats->max_gain = FLOAT2FIX(log(MAX_GAIN)); + noise_stats->inv_max_gain = FLOAT2FIX(log(1.0 / MAX_GAIN)); + + for (i = 1; i < 2 * SMOOTH_WINDOW + 3; i++) { + noise_stats->smooth_scaling[i] = FLOAT2FIX(log(i)); + } +#endif + + return noise_stats; +} + +void +fe_reset_noisestats(noise_stats_t * noise_stats) +{ + if (noise_stats) + noise_stats->undefined = TRUE; +} + +void +fe_free_noisestats(noise_stats_t * noise_stats) +{ + ckd_free(noise_stats->power); + ckd_free(noise_stats->noise); + ckd_free(noise_stats->floor); + ckd_free(noise_stats->peak); + ckd_free(noise_stats); +} + +/** + * For fixed point we are doing the computation in a fixlog domain, + * so we have to add many processing cases. + */ +void +fe_track_snr(fe_t * fe, int32 *in_speech) +{ + powspec_t *signal; + powspec_t *gain; + noise_stats_t *noise_stats; + powspec_t *mfspec; + int32 i, num_filts; + powspec_t lrt, snr, max_signal, log_signal; + + if (!(fe->remove_noise || fe->remove_silence)) { + *in_speech = TRUE; + return; + } + + noise_stats = fe->noise_stats; + mfspec = fe->mfspec; + num_filts = noise_stats->num_filters; + + signal = (powspec_t *) ckd_calloc(num_filts, sizeof(powspec_t)); + + if (noise_stats->undefined) { + for (i = 0; i < num_filts; i++) { + noise_stats->power[i] = mfspec[i]; + noise_stats->noise[i] = mfspec[i]; +#ifndef FIXED_POINT + noise_stats->floor[i] = mfspec[i] / noise_stats->max_gain; + noise_stats->peak[i] = 0.0; +#else + noise_stats->floor[i] = mfspec[i] - noise_stats->max_gain; + noise_stats->peak[i] = MIN_FIXLOG; +#endif + } + noise_stats->undefined = FALSE; + } + + /* Calculate smoothed power */ + for (i = 0; i < num_filts; i++) { +#ifndef FIXED_POINT + noise_stats->power[i] = + noise_stats->lambda_power * noise_stats->power[i] + noise_stats->comp_lambda_power * mfspec[i]; +#else + noise_stats->power[i] = fe_log_add(noise_stats->lambda_power + noise_stats->power[i], + noise_stats->comp_lambda_power + mfspec[i]); +#endif + } + + /* Noise estimation and vad decision */ + fe_lower_envelope(noise_stats, noise_stats->power, noise_stats->noise, num_filts); + + lrt = FLOAT2FIX(0.0f); + max_signal = FLOAT2FIX(0.0f); + for (i = 0; i < num_filts; i++) { +#ifndef FIXED_POINT + signal[i] = noise_stats->power[i] - noise_stats->noise[i]; + if (signal[i] < 1.0) + signal[i] = 1.0; + snr = log(noise_stats->power[i] / noise_stats->noise[i]); + log_signal = log(signal[i]); +#else + signal[i] = fe_log_sub(noise_stats->power[i], noise_stats->noise[i]); + snr = noise_stats->power[i] - noise_stats->noise[i]; + log_signal = signal[i]; +#endif + if (snr > lrt) { + lrt = snr; + if (log_signal > max_signal) { + max_signal = log_signal; + } + } + } + +#ifndef FIXED_POINT + if (fe->remove_silence && (lrt < fe->vad_threshold || max_signal < fe->vad_threshold)) { +#else + if (fe->remove_silence && (lrt < FLOAT2FIX(fe->vad_threshold) || max_signal < FLOAT2FIX(fe->vad_threshold))) { +#endif + *in_speech = FALSE; + } else { + *in_speech = TRUE; + } + + fe_lower_envelope(noise_stats, signal, noise_stats->floor, num_filts); + + fe_temp_masking(noise_stats, signal, noise_stats->peak, num_filts); + + if (!fe->remove_noise) { + //no need for further calculations if noise cancellation disabled + ckd_free(signal); + return; + } + + for (i = 0; i < num_filts; i++) { + if (signal[i] < noise_stats->floor[i]) + signal[i] = noise_stats->floor[i]; + } + + gain = (powspec_t *) ckd_calloc(num_filts, sizeof(powspec_t)); +#ifndef FIXED_POINT + for (i = 0; i < num_filts; i++) { + if (signal[i] < noise_stats->max_gain * noise_stats->power[i]) + gain[i] = signal[i] / noise_stats->power[i]; + else + gain[i] = noise_stats->max_gain; + if (gain[i] < noise_stats->inv_max_gain) + gain[i] = noise_stats->inv_max_gain; + } +#else + for (i = 0; i < num_filts; i++) { + gain[i] = signal[i] - noise_stats->power[i]; + if (gain[i] > noise_stats->max_gain) + gain[i] = noise_stats->max_gain; + if (gain[i] < noise_stats->inv_max_gain) + gain[i] = noise_stats->inv_max_gain; + } +#endif + + /* Weight smoothing and time frequency normalization */ + fe_weight_smooth(noise_stats, mfspec, gain, num_filts); + + ckd_free(gain); + ckd_free(signal); +} + +void +fe_vad_hangover(fe_t * fe, mfcc_t * fea, int32 is_speech) +{ + /* track vad state and deal with cepstrum prespeech buffer */ + fe->vad_data->state_changed = 0; + if (is_speech) { + fe->vad_data->postspch_num = 0; + if (!fe->vad_data->global_state) { + fe->vad_data->prespch_num++; + fe_prespch_write_cep(fe->vad_data->prespch_buf, fea); + /* check for transition sil->speech */ + if (fe->vad_data->prespch_num >= fe->prespch_len) { + fe->vad_data->prespch_num = 0; + fe->vad_data->global_state = 1; + /* transition silence->speech occurred */ + fe->vad_data->state_changed = 1; + } + } + } else { + fe->vad_data->prespch_num = 0; + fe_prespch_reset_cep(fe->vad_data->prespch_buf); + if (fe->vad_data->global_state) { + fe->vad_data->postspch_num++; + /* check for transition speech->sil */ + if (fe->vad_data->postspch_num >= fe->postspch_len) { + fe->vad_data->postspch_num = 0; + fe->vad_data->global_state = 0; + /* transition speech->silence occurred */ + fe->vad_data->state_changed = 1; + } + } + } + + if (fe->vad_data->store_pcm) { + if (is_speech || fe->vad_data->global_state) + fe_prespch_write_pcm(fe->vad_data->prespch_buf, fe->spch); + if (!is_speech && !fe->vad_data->global_state) + fe_prespch_reset_pcm(fe->vad_data->prespch_buf); + } +} |