diff options
Diffstat (limited to 'dom/media/webspeech/recognition/energy_endpointer.cc')
-rw-r--r-- | dom/media/webspeech/recognition/energy_endpointer.cc | 393 |
1 files changed, 0 insertions, 393 deletions
diff --git a/dom/media/webspeech/recognition/energy_endpointer.cc b/dom/media/webspeech/recognition/energy_endpointer.cc deleted file mode 100644 index 9b1a81589..000000000 --- a/dom/media/webspeech/recognition/energy_endpointer.cc +++ /dev/null @@ -1,393 +0,0 @@ -// Copyright (c) 2013 The Chromium Authors. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include "energy_endpointer.h" - -#include <math.h> - -namespace { - -// Returns the RMS (quadratic mean) of the input signal. -float RMS(const int16_t* samples, int num_samples) { - int64_t ssq_int64_t = 0; - int64_t sum_int64_t = 0; - for (int i = 0; i < num_samples; ++i) { - sum_int64_t += samples[i]; - ssq_int64_t += samples[i] * samples[i]; - } - // now convert to floats. - double sum = static_cast<double>(sum_int64_t); - sum /= num_samples; - double ssq = static_cast<double>(ssq_int64_t); - return static_cast<float>(sqrt((ssq / num_samples) - (sum * sum))); -} - -int64_t Secs2Usecs(float seconds) { - return static_cast<int64_t>(0.5 + (1.0e6 * seconds)); -} - -float GetDecibel(float value) { - if (value > 1.0e-100) - return 20 * log10(value); - return -2000.0; -} - -} // namespace - -namespace mozilla { - -// Stores threshold-crossing histories for making decisions about the speech -// state. -class EnergyEndpointer::HistoryRing { - public: - HistoryRing() : insertion_index_(0) {} - - // Resets the ring to |size| elements each with state |initial_state| - void SetRing(int size, bool initial_state); - - // Inserts a new entry into the ring and drops the oldest entry. - void Insert(int64_t time_us, bool decision); - - // Returns the time in microseconds of the most recently added entry. - int64_t EndTime() const; - - // Returns the sum of all intervals during which 'decision' is true within - // the time in seconds specified by 'duration'. The returned interval is - // in seconds. - float RingSum(float duration_sec); - - private: - struct DecisionPoint { - int64_t time_us; - bool decision; - }; - - std::vector<DecisionPoint> decision_points_; - int insertion_index_; // Index at which the next item gets added/inserted. - - HistoryRing(const HistoryRing&); - void operator=(const HistoryRing&); -}; - -void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) { - insertion_index_ = 0; - decision_points_.clear(); - DecisionPoint init = { -1, initial_state }; - decision_points_.resize(size, init); -} - -void EnergyEndpointer::HistoryRing::Insert(int64_t time_us, bool decision) { - decision_points_[insertion_index_].time_us = time_us; - decision_points_[insertion_index_].decision = decision; - insertion_index_ = (insertion_index_ + 1) % decision_points_.size(); -} - -int64_t EnergyEndpointer::HistoryRing::EndTime() const { - int ind = insertion_index_ - 1; - if (ind < 0) - ind = decision_points_.size() - 1; - return decision_points_[ind].time_us; -} - -float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) { - if (!decision_points_.size()) - return 0.0; - - int64_t sum_us = 0; - int ind = insertion_index_ - 1; - if (ind < 0) - ind = decision_points_.size() - 1; - int64_t end_us = decision_points_[ind].time_us; - bool is_on = decision_points_[ind].decision; - int64_t start_us = end_us - static_cast<int64_t>(0.5 + (1.0e6 * duration_sec)); - if (start_us < 0) - start_us = 0; - size_t n_summed = 1; // n points ==> (n-1) intervals - while ((decision_points_[ind].time_us > start_us) && - (n_summed < decision_points_.size())) { - --ind; - if (ind < 0) - ind = decision_points_.size() - 1; - if (is_on) - sum_us += end_us - decision_points_[ind].time_us; - is_on = decision_points_[ind].decision; - end_us = decision_points_[ind].time_us; - n_summed++; - } - - return 1.0e-6f * sum_us; // Returns total time that was super threshold. -} - -EnergyEndpointer::EnergyEndpointer() - : status_(EP_PRE_SPEECH), - offset_confirm_dur_sec_(0), - endpointer_time_us_(0), - fast_update_frames_(0), - frame_counter_(0), - max_window_dur_(4.0), - sample_rate_(0), - history_(new HistoryRing()), - decision_threshold_(0), - estimating_environment_(false), - noise_level_(0), - rms_adapt_(0), - start_lag_(0), - end_lag_(0), - user_input_start_time_us_(0) { -} - -EnergyEndpointer::~EnergyEndpointer() { -} - -int EnergyEndpointer::TimeToFrame(float time) const { - return static_cast<int32_t>(0.5 + (time / params_.frame_period())); -} - -void EnergyEndpointer::Restart(bool reset_threshold) { - status_ = EP_PRE_SPEECH; - user_input_start_time_us_ = 0; - - if (reset_threshold) { - decision_threshold_ = params_.decision_threshold(); - rms_adapt_ = decision_threshold_; - noise_level_ = params_.decision_threshold() / 2.0f; - frame_counter_ = 0; // Used for rapid initial update of levels. - } - - // Set up the memories to hold the history windows. - history_->SetRing(TimeToFrame(max_window_dur_), false); - - // Flag that indicates that current input should be used for - // estimating the environment. The user has not yet started input - // by e.g. pressed the push-to-talk button. By default, this is - // false for backward compatibility. - estimating_environment_ = false; -} - -void EnergyEndpointer::Init(const EnergyEndpointerParams& params) { - params_ = params; - - // Find the longest history interval to be used, and make the ring - // large enough to accommodate that number of frames. NOTE: This - // depends upon ep_frame_period being set correctly in the factory - // that did this instantiation. - max_window_dur_ = params_.onset_window(); - if (params_.speech_on_window() > max_window_dur_) - max_window_dur_ = params_.speech_on_window(); - if (params_.offset_window() > max_window_dur_) - max_window_dur_ = params_.offset_window(); - Restart(true); - - offset_confirm_dur_sec_ = params_.offset_window() - - params_.offset_confirm_dur(); - if (offset_confirm_dur_sec_ < 0.0) - offset_confirm_dur_sec_ = 0.0; - - user_input_start_time_us_ = 0; - - // Flag that indicates that current input should be used for - // estimating the environment. The user has not yet started input - // by e.g. pressed the push-to-talk button. By default, this is - // false for backward compatibility. - estimating_environment_ = false; - // The initial value of the noise and speech levels is inconsequential. - // The level of the first frame will overwrite these values. - noise_level_ = params_.decision_threshold() / 2.0f; - fast_update_frames_ = - static_cast<int64_t>(params_.fast_update_dur() / params_.frame_period()); - - frame_counter_ = 0; // Used for rapid initial update of levels. - - sample_rate_ = params_.sample_rate(); - start_lag_ = static_cast<int>(sample_rate_ / - params_.max_fundamental_frequency()); - end_lag_ = static_cast<int>(sample_rate_ / - params_.min_fundamental_frequency()); -} - -void EnergyEndpointer::StartSession() { - Restart(true); -} - -void EnergyEndpointer::EndSession() { - status_ = EP_POST_SPEECH; -} - -void EnergyEndpointer::SetEnvironmentEstimationMode() { - Restart(true); - estimating_environment_ = true; -} - -void EnergyEndpointer::SetUserInputMode() { - estimating_environment_ = false; - user_input_start_time_us_ = endpointer_time_us_; -} - -void EnergyEndpointer::ProcessAudioFrame(int64_t time_us, - const int16_t* samples, - int num_samples, - float* rms_out) { - endpointer_time_us_ = time_us; - float rms = RMS(samples, num_samples); - - // Check that this is user input audio vs. pre-input adaptation audio. - // Input audio starts when the user indicates start of input, by e.g. - // pressing push-to-talk. Audio recieved prior to that is used to update - // noise and speech level estimates. - if (!estimating_environment_) { - bool decision = false; - if ((endpointer_time_us_ - user_input_start_time_us_) < - Secs2Usecs(params_.contamination_rejection_period())) { - decision = false; - //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("decision: forced to false, time: %d", endpointer_time_us_)); - } else { - decision = (rms > decision_threshold_); - } - - history_->Insert(endpointer_time_us_, decision); - - switch (status_) { - case EP_PRE_SPEECH: - if (history_->RingSum(params_.onset_window()) > - params_.onset_detect_dur()) { - status_ = EP_POSSIBLE_ONSET; - } - break; - - case EP_POSSIBLE_ONSET: { - float tsum = history_->RingSum(params_.onset_window()); - if (tsum > params_.onset_confirm_dur()) { - status_ = EP_SPEECH_PRESENT; - } else { // If signal is not maintained, drop back to pre-speech. - if (tsum <= params_.onset_detect_dur()) - status_ = EP_PRE_SPEECH; - } - break; - } - - case EP_SPEECH_PRESENT: { - // To induce hysteresis in the state residency, we allow a - // smaller residency time in the on_ring, than was required to - // enter the SPEECH_PERSENT state. - float on_time = history_->RingSum(params_.speech_on_window()); - if (on_time < params_.on_maintain_dur()) - status_ = EP_POSSIBLE_OFFSET; - break; - } - - case EP_POSSIBLE_OFFSET: - if (history_->RingSum(params_.offset_window()) <= - offset_confirm_dur_sec_) { - // Note that this offset time may be beyond the end - // of the input buffer in a real-time system. It will be up - // to the RecognizerSession to decide what to do. - status_ = EP_PRE_SPEECH; // Automatically reset for next utterance. - } else { // If speech picks up again we allow return to SPEECH_PRESENT. - if (history_->RingSum(params_.speech_on_window()) >= - params_.on_maintain_dur()) - status_ = EP_SPEECH_PRESENT; - } - break; - - default: - break; - } - - // If this is a quiet, non-speech region, slowly adapt the detection - // threshold to be about 6dB above the average RMS. - if ((!decision) && (status_ == EP_PRE_SPEECH)) { - decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms); - rms_adapt_ = decision_threshold_; - } else { - // If this is in a speech region, adapt the decision threshold to - // be about 10dB below the average RMS. If the noise level is high, - // the threshold is pushed up. - // Adaptation up to a higher level is 5 times faster than decay to - // a lower level. - if ((status_ == EP_SPEECH_PRESENT) && decision) { - if (rms_adapt_ > rms) { - rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms); - } else { - rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms); - } - float target_threshold = 0.3f * rms_adapt_ + noise_level_; - decision_threshold_ = (.90f * decision_threshold_) + - (0.10f * target_threshold); - } - } - - // Set a floor - if (decision_threshold_ < params_.min_decision_threshold()) - decision_threshold_ = params_.min_decision_threshold(); - } - - // Update speech and noise levels. - UpdateLevels(rms); - ++frame_counter_; - - if (rms_out) - *rms_out = GetDecibel(rms); -} - -float EnergyEndpointer::GetNoiseLevelDb() const { - return GetDecibel(noise_level_); -} - -void EnergyEndpointer::UpdateLevels(float rms) { - // Update quickly initially. We assume this is noise and that - // speech is 6dB above the noise. - if (frame_counter_ < fast_update_frames_) { - // Alpha increases from 0 to (k-1)/k where k is the number of time - // steps in the initial adaptation period. - float alpha = static_cast<float>(frame_counter_) / - static_cast<float>(fast_update_frames_); - noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms); - //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("FAST UPDATE, frame_counter_ %d, fast_update_frames_ %d", frame_counter_, fast_update_frames_)); - } else { - // Update Noise level. The noise level adapts quickly downward, but - // slowly upward. The noise_level_ parameter is not currently used - // for threshold adaptation. It is used for UI feedback. - if (noise_level_ < rms) - noise_level_ = (0.999f * noise_level_) + (0.001f * rms); - else - noise_level_ = (0.95f * noise_level_) + (0.05f * rms); - } - if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) { - decision_threshold_ = noise_level_ * 2; // 6dB above noise level. - // Set a floor - if (decision_threshold_ < params_.min_decision_threshold()) - decision_threshold_ = params_.min_decision_threshold(); - } -} - -EpStatus EnergyEndpointer::Status(int64_t* status_time) const { - *status_time = history_->EndTime(); - return status_; -} - -} // namespace mozilla |