author     Moonchild <moonchild@palemoon.org>  2020-05-20 10:19:04 +0000
committer  Moonchild <moonchild@palemoon.org>  2020-05-20 14:04:17 +0000
commit     99c2e698d2a3c56649e42d8d2133706cd8c9501e (patch)
tree       85be449d772eb57860f0f386efb4bc1e790fd498 /dom/media/webspeech/recognition/SpeechRecognition.cpp
parent     15ac4021b06d549e47c9e2efc9364a9eb96bfe82 (diff)
Issue #1538 - remove speech recognition engine
This removes speech recognition, pocketsphinx, the training models and the automated speech test interface. It also re-establishes proper use of MOZ_WEBSPEECH for the speech API (synthesis part only), which was a broken mess before: some synthesis parts were always built, some were built only with the flag enabled, and the recognition parts depended on it. I'm pretty sure it would have been totally busted if you had ever tried building without MOZ_WEBSPEECH before. Tested that synthesis still works as intended. This resolves #1538.
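The flag cleanup described above amounts to putting every synthesis code path behind a single MOZ_WEBSPEECH conditional rather than the previous mix of always-built and flag-dependent pieces. A minimal sketch of that guard pattern, purely illustrative (the real changes live in the moz.build files and headers touched elsewhere in this commit):

#ifdef MOZ_WEBSPEECH
// Synthesis-only code: compiled and registered only when the flag is set.
#include "mozilla/dom/SpeechSynthesis.h"
#endif
// With the recognition engine removed, there is no recognition code left to guard.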
Diffstat (limited to 'dom/media/webspeech/recognition/SpeechRecognition.cpp')
-rw-r--r--  dom/media/webspeech/recognition/SpeechRecognition.cpp | 1088
1 file changed, 0 insertions(+), 1088 deletions(-)
diff --git a/dom/media/webspeech/recognition/SpeechRecognition.cpp b/dom/media/webspeech/recognition/SpeechRecognition.cpp
deleted file mode 100644
index cd57f03ca..000000000
--- a/dom/media/webspeech/recognition/SpeechRecognition.cpp
+++ /dev/null
@@ -1,1088 +0,0 @@
-/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
-/* vim:set ts=2 sw=2 sts=2 et cindent: */
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-#include "SpeechRecognition.h"
-
-#include "nsCOMPtr.h"
-#include "nsCycleCollectionParticipant.h"
-
-#include "mozilla/dom/BindingUtils.h"
-#include "mozilla/dom/Element.h"
-#include "mozilla/dom/SpeechRecognitionBinding.h"
-#include "mozilla/dom/MediaStreamTrackBinding.h"
-#include "mozilla/dom/MediaStreamError.h"
-#include "mozilla/MediaManager.h"
-#include "mozilla/Preferences.h"
-#include "MediaPrefs.h"
-#include "mozilla/Services.h"
-
-#include "AudioSegment.h"
-#include "endpointer.h"
-
-#include "SpeechGrammar.h"
-#include "mozilla/dom/SpeechRecognitionEvent.h"
-#include "nsContentUtils.h"
-#include "nsIDocument.h"
-#include "nsIObserverService.h"
-#include "nsIPermissionManager.h"
-#include "nsIPrincipal.h"
-#include "nsPIDOMWindow.h"
-#include "nsServiceManagerUtils.h"
-#include "nsQueryObject.h"
-
-#include <algorithm>
-
-// Undo the windows.h damage
-#if defined(XP_WIN) && defined(GetMessage)
-#undef GetMessage
-#endif
-
-namespace mozilla {
-namespace dom {
-
-#define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default"
-#define DEFAULT_RECOGNITION_SERVICE_PREFIX "pocketsphinx-"
-#define DEFAULT_RECOGNITION_SERVICE "pocketsphinx-en-US"
-
-#define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length"
-#define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH "media.webspeech.long_silence_length"
-#define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH "media.webspeech.long_speech_length"
-
-static const uint32_t kSAMPLE_RATE = 16000;
-static const uint32_t kSPEECH_DETECTION_TIMEOUT_MS = 10000;
-
-// Number of samples corresponding to 300ms of audio, to send to the endpointer
-// while it's in environment estimation mode.
-// kSAMPLE_RATE samples = 1s, kESTIMATION_SAMPLES samples = 300ms
-static const uint32_t kESTIMATION_SAMPLES = 300 * kSAMPLE_RATE / 1000;
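As a quick check of that constant using the values above: at kSAMPLE_RATE = 16000 samples per second,

    kESTIMATION_SAMPLES = 300 * 16000 / 1000 = 4800

so the endpointer is fed roughly 4800 samples (300 ms of audio) in environment-estimation mode before WaitForEstimation switches it to user-input mode.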
-
-LogModule*
-GetSpeechRecognitionLog()
-{
- static LazyLogModule sLog("SpeechRecognition");
- return sLog;
-}
-#define SR_LOG(...) MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__))
-
-already_AddRefed<nsISpeechRecognitionService>
-GetSpeechRecognitionService(const nsAString& aLang)
-{
- nsAutoCString speechRecognitionServiceCID;
-
- nsAdoptingCString prefValue =
- Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE);
- nsAutoCString speechRecognitionService;
-
- if (!aLang.IsEmpty()) {
- speechRecognitionService =
- NS_LITERAL_CSTRING(DEFAULT_RECOGNITION_SERVICE_PREFIX) +
- NS_ConvertUTF16toUTF8(aLang);
- } else if (!prefValue.IsEmpty()) {
- speechRecognitionService = prefValue;
- } else {
- speechRecognitionService = DEFAULT_RECOGNITION_SERVICE;
- }
-
- if (MediaPrefs::WebSpeechFakeRecognitionService()) {
- speechRecognitionServiceCID =
- NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "fake";
- } else {
- speechRecognitionServiceCID =
- NS_LITERAL_CSTRING(NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX) +
- speechRecognitionService;
- }
-
- nsresult rv;
- nsCOMPtr<nsISpeechRecognitionService> recognitionService;
- recognitionService = do_GetService(speechRecognitionServiceCID.get(), &rv);
- return recognitionService.forget();
-}
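Concretely, the fallback chain above resolves the service name as follows (example values; the pref value shown is hypothetical):

    aLang = u"en-US", pref unset   ->  "pocketsphinx-en-US"
    aLang empty, pref = "foo-fr"   ->  "foo-fr"
    aLang empty, pref unset        ->  DEFAULT_RECOGNITION_SERVICE ("pocketsphinx-en-US")

The resulting name is appended to NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX to build the contract ID, unless the fake-recognition-service test pref is set, in which case the "fake" service is used instead.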
-
-NS_IMPL_CYCLE_COLLECTION_INHERITED(SpeechRecognition, DOMEventTargetHelper, mDOMStream, mSpeechGrammarList)
-
-NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION_INHERITED(SpeechRecognition)
- NS_INTERFACE_MAP_ENTRY(nsIObserver)
-NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper)
-
-NS_IMPL_ADDREF_INHERITED(SpeechRecognition, DOMEventTargetHelper)
-NS_IMPL_RELEASE_INHERITED(SpeechRecognition, DOMEventTargetHelper)
-
-SpeechRecognition::SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow)
- : DOMEventTargetHelper(aOwnerWindow)
- , mEndpointer(kSAMPLE_RATE)
- , mAudioSamplesPerChunk(mEndpointer.FrameSize())
- , mSpeechDetectionTimer(do_CreateInstance(NS_TIMER_CONTRACTID))
- , mSpeechGrammarList(new SpeechGrammarList(GetParentObject()))
- , mInterimResults(false)
- , mMaxAlternatives(1)
-{
- SR_LOG("created SpeechRecognition");
-
- if (MediaPrefs::WebSpeechTestEnabled()) {
- nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
- obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false);
- obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false);
- }
-
- mEndpointer.set_speech_input_complete_silence_length(
- Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 1250000));
- mEndpointer.set_long_speech_input_complete_silence_length(
- Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 2500000));
- mEndpointer.set_long_speech_length(
-    Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH, 3 * 1000000));
- Reset();
-}
-
-bool
-SpeechRecognition::StateBetween(FSMState begin, FSMState end)
-{
- return mCurrentState >= begin && mCurrentState <= end;
-}
-
-void
-SpeechRecognition::SetState(FSMState state)
-{
- mCurrentState = state;
- SR_LOG("Transitioned to state %s", GetName(mCurrentState));
- return;
-}
-
-JSObject*
-SpeechRecognition::WrapObject(JSContext* aCx, JS::Handle<JSObject*> aGivenProto)
-{
- return SpeechRecognitionBinding::Wrap(aCx, this, aGivenProto);
-}
-
-bool
-SpeechRecognition::IsAuthorized(JSContext* aCx, JSObject* aGlobal)
-{
- nsCOMPtr<nsIPrincipal> principal = nsContentUtils::ObjectPrincipal(aGlobal);
-
- nsresult rv;
- nsCOMPtr<nsIPermissionManager> mgr = do_GetService(NS_PERMISSIONMANAGER_CONTRACTID, &rv);
- if (NS_WARN_IF(NS_FAILED(rv))) {
- return false;
- }
-
- uint32_t speechRecognition = nsIPermissionManager::UNKNOWN_ACTION;
- rv = mgr->TestExactPermissionFromPrincipal(principal, "speech-recognition", &speechRecognition);
- if (NS_WARN_IF(NS_FAILED(rv))) {
- return false;
- }
-
- bool hasPermission = (speechRecognition == nsIPermissionManager::ALLOW_ACTION);
-
- return (hasPermission || MediaPrefs::WebSpeechRecognitionForceEnabled()
- || MediaPrefs::WebSpeechTestEnabled())
- && MediaPrefs::WebSpeechRecognitionEnabled();
-}
-
-already_AddRefed<SpeechRecognition>
-SpeechRecognition::Constructor(const GlobalObject& aGlobal,
- ErrorResult& aRv)
-{
- nsCOMPtr<nsPIDOMWindowInner> win = do_QueryInterface(aGlobal.GetAsSupports());
- if (!win) {
- aRv.Throw(NS_ERROR_FAILURE);
- return nullptr;
- }
-
- MOZ_ASSERT(win->IsInnerWindow());
- RefPtr<SpeechRecognition> object = new SpeechRecognition(win);
- return object.forget();
-}
-
-nsISupports*
-SpeechRecognition::GetParentObject() const
-{
- return GetOwner();
-}
-
-void
-SpeechRecognition::ProcessEvent(SpeechEvent* aEvent)
-{
- SR_LOG("Processing %s, current state is %s",
- GetName(aEvent),
- GetName(mCurrentState));
-
- if (mAborted && aEvent->mType != EVENT_ABORT) {
- // ignore all events while aborting
- return;
- }
-
- Transition(aEvent);
-}
-
-void
-SpeechRecognition::Transition(SpeechEvent* aEvent)
-{
- switch (mCurrentState) {
- case STATE_IDLE:
- switch (aEvent->mType) {
- case EVENT_START:
- // TODO: may want to time out if we wait too long
- // for user to approve
- WaitForAudioData(aEvent);
- break;
- case EVENT_STOP:
- case EVENT_ABORT:
- case EVENT_AUDIO_DATA:
- case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
- case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
- DoNothing(aEvent);
- break;
- case EVENT_AUDIO_ERROR:
- case EVENT_RECOGNITIONSERVICE_ERROR:
- AbortError(aEvent);
- break;
- case EVENT_COUNT:
- MOZ_CRASH("Invalid event EVENT_COUNT");
- }
- break;
- case STATE_STARTING:
- switch (aEvent->mType) {
- case EVENT_AUDIO_DATA:
- StartedAudioCapture(aEvent);
- break;
- case EVENT_AUDIO_ERROR:
- case EVENT_RECOGNITIONSERVICE_ERROR:
- AbortError(aEvent);
- break;
- case EVENT_ABORT:
- AbortSilently(aEvent);
- break;
- case EVENT_STOP:
- Reset();
- break;
- case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
- case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
- DoNothing(aEvent);
- break;
- case EVENT_START:
- SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));
- MOZ_CRASH();
- case EVENT_COUNT:
- MOZ_CRASH("Invalid event EVENT_COUNT");
- }
- break;
- case STATE_ESTIMATING:
- switch (aEvent->mType) {
- case EVENT_AUDIO_DATA:
- WaitForEstimation(aEvent);
- break;
- case EVENT_STOP:
- StopRecordingAndRecognize(aEvent);
- break;
- case EVENT_ABORT:
- AbortSilently(aEvent);
- break;
- case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
- case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
- case EVENT_RECOGNITIONSERVICE_ERROR:
- DoNothing(aEvent);
- break;
- case EVENT_AUDIO_ERROR:
- AbortError(aEvent);
- break;
- case EVENT_START:
- SR_LOG("STATE_ESTIMATING: Unhandled event %d", aEvent->mType);
- MOZ_CRASH();
- case EVENT_COUNT:
- MOZ_CRASH("Invalid event EVENT_COUNT");
- }
- break;
- case STATE_WAITING_FOR_SPEECH:
- switch (aEvent->mType) {
- case EVENT_AUDIO_DATA:
- DetectSpeech(aEvent);
- break;
- case EVENT_STOP:
- StopRecordingAndRecognize(aEvent);
- break;
- case EVENT_ABORT:
- AbortSilently(aEvent);
- break;
- case EVENT_AUDIO_ERROR:
- AbortError(aEvent);
- break;
- case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
- case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
- case EVENT_RECOGNITIONSERVICE_ERROR:
- DoNothing(aEvent);
- break;
- case EVENT_START:
- SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));
- MOZ_CRASH();
- case EVENT_COUNT:
- MOZ_CRASH("Invalid event EVENT_COUNT");
- }
- break;
- case STATE_RECOGNIZING:
- switch (aEvent->mType) {
- case EVENT_AUDIO_DATA:
- WaitForSpeechEnd(aEvent);
- break;
- case EVENT_STOP:
- StopRecordingAndRecognize(aEvent);
- break;
- case EVENT_AUDIO_ERROR:
- case EVENT_RECOGNITIONSERVICE_ERROR:
- AbortError(aEvent);
- break;
- case EVENT_ABORT:
- AbortSilently(aEvent);
- break;
- case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
- case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
- DoNothing(aEvent);
- break;
- case EVENT_START:
- SR_LOG("STATE_RECOGNIZING: Unhandled aEvent %s", GetName(aEvent));
- MOZ_CRASH();
- case EVENT_COUNT:
- MOZ_CRASH("Invalid event EVENT_COUNT");
- }
- break;
- case STATE_WAITING_FOR_RESULT:
- switch (aEvent->mType) {
- case EVENT_STOP:
- DoNothing(aEvent);
- break;
- case EVENT_AUDIO_ERROR:
- case EVENT_RECOGNITIONSERVICE_ERROR:
- AbortError(aEvent);
- break;
- case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
- NotifyFinalResult(aEvent);
- break;
- case EVENT_AUDIO_DATA:
- DoNothing(aEvent);
- break;
- case EVENT_ABORT:
- AbortSilently(aEvent);
- break;
- case EVENT_START:
- case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
- SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled aEvent %s", GetName(aEvent));
- MOZ_CRASH();
- case EVENT_COUNT:
- MOZ_CRASH("Invalid event EVENT_COUNT");
- }
- break;
- case STATE_COUNT:
- MOZ_CRASH("Invalid state STATE_COUNT");
- }
-
- return;
-}
-
-/*
- * Handle a segment of recorded audio data.
- * Returns the number of samples that were processed.
- */
-uint32_t
-SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate)
-{
- AudioSegment::ChunkIterator iterator(*aSegment);
- uint32_t samples = 0;
- while (!iterator.IsEnded()) {
- float out;
- mEndpointer.ProcessAudio(*iterator, &out);
- samples += iterator->GetDuration();
- iterator.Next();
- }
-
- mRecognitionService->ProcessAudioSegment(aSegment, aTrackRate);
- return samples;
-}
-
-/****************************************************************************
- * FSM Transition functions
- *
- * If a transition function may cause a DOM event to be fired,
- * it may also be re-entered, since the event handler may cause the
- * event loop to spin and new SpeechEvents to be processed.
- *
- * Rules:
- * 1) These methods should call SetState as soon as possible.
- * 2) If these methods dispatch DOM events, or call methods that dispatch
- * DOM events, that should be done as late as possible.
- * 3) If anything must happen after dispatching a DOM event, make sure
- * the state is still what the method expected it to be.
- ****************************************************************************/
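To illustrate those three rules, a hypothetical transition handler (the name and the exact state/event used here are made up for the example) would be structured like this:

void
SpeechRecognition::ExampleTransition(SpeechEvent* aEvent)
{
  // Rule 1: commit to the new state as soon as possible.
  SetState(STATE_RECOGNIZING);

  // ... internal work that cannot run script goes here ...

  // Rule 2: dispatch DOM events as late as possible.
  DispatchTrustedEvent(NS_LITERAL_STRING("speechstart"));

  // Rule 3: the event handler may have spun the event loop and moved the
  // FSM elsewhere, so re-check the state before doing anything further.
  if (mCurrentState == STATE_RECOGNIZING) {
    // safe to continue
  }
}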
-
-void
-SpeechRecognition::Reset()
-{
- SetState(STATE_IDLE);
- mRecognitionService = nullptr;
- mEstimationSamples = 0;
- mBufferedSamples = 0;
- mSpeechDetectionTimer->Cancel();
- mAborted = false;
-}
-
-void
-SpeechRecognition::ResetAndEnd()
-{
- Reset();
- DispatchTrustedEvent(NS_LITERAL_STRING("end"));
-}
-
-void
-SpeechRecognition::WaitForAudioData(SpeechEvent* aEvent)
-{
- SetState(STATE_STARTING);
-}
-
-void
-SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent)
-{
- SetState(STATE_ESTIMATING);
-
- mEndpointer.SetEnvironmentEstimationMode();
- mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
-
- DispatchTrustedEvent(NS_LITERAL_STRING("audiostart"));
- if (mCurrentState == STATE_ESTIMATING) {
- DispatchTrustedEvent(NS_LITERAL_STRING("start"));
- }
-}
-
-void
-SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent)
-{
- SetState(STATE_WAITING_FOR_RESULT);
-
- MOZ_ASSERT(mRecognitionService, "Service deleted before recording done");
- mRecognitionService->SoundEnd();
-
- StopRecording();
-}
-
-void
-SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent)
-{
- SetState(STATE_ESTIMATING);
-
- mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
- if (mEstimationSamples > kESTIMATION_SAMPLES) {
- mEndpointer.SetUserInputMode();
- SetState(STATE_WAITING_FOR_SPEECH);
- }
-}
-
-void
-SpeechRecognition::DetectSpeech(SpeechEvent* aEvent)
-{
- SetState(STATE_WAITING_FOR_SPEECH);
-
- ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
- if (mEndpointer.DidStartReceivingSpeech()) {
- mSpeechDetectionTimer->Cancel();
- SetState(STATE_RECOGNIZING);
- DispatchTrustedEvent(NS_LITERAL_STRING("speechstart"));
- }
-}
-
-void
-SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent)
-{
- SetState(STATE_RECOGNIZING);
-
- ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
- if (mEndpointer.speech_input_complete()) {
- DispatchTrustedEvent(NS_LITERAL_STRING("speechend"));
-
- if (mCurrentState == STATE_RECOGNIZING) {
- // FIXME: StopRecordingAndRecognize should only be called for single-shot
- // services; for continuous recognition we should just inform the service.
- StopRecordingAndRecognize(aEvent);
- }
- }
-}
-
-void
-SpeechRecognition::NotifyFinalResult(SpeechEvent* aEvent)
-{
- ResetAndEnd();
-
- RootedDictionary<SpeechRecognitionEventInit> init(RootingCx());
- init.mBubbles = true;
- init.mCancelable = false;
- // init.mResultIndex = 0;
- init.mResults = aEvent->mRecognitionResultList;
- init.mInterpretation = JS::NullValue();
- // init.mEmma = nullptr;
-
- RefPtr<SpeechRecognitionEvent> event =
- SpeechRecognitionEvent::Constructor(this, NS_LITERAL_STRING("result"), init);
- event->SetTrusted(true);
-
- bool defaultActionEnabled;
- this->DispatchEvent(event, &defaultActionEnabled);
-}
-
-void
-SpeechRecognition::DoNothing(SpeechEvent* aEvent)
-{
-}
-
-void
-SpeechRecognition::AbortSilently(SpeechEvent* aEvent)
-{
- if (mRecognitionService) {
- mRecognitionService->Abort();
- }
-
- if (mDOMStream) {
- StopRecording();
- }
-
- ResetAndEnd();
-}
-
-void
-SpeechRecognition::AbortError(SpeechEvent* aEvent)
-{
- AbortSilently(aEvent);
- NotifyError(aEvent);
-}
-
-void
-SpeechRecognition::NotifyError(SpeechEvent* aEvent)
-{
- aEvent->mError->SetTrusted(true);
-
- bool defaultActionEnabled;
- this->DispatchEvent(aEvent->mError, &defaultActionEnabled);
-
- return;
-}
-
-/**************************************
- * Event triggers and other functions *
- **************************************/
-NS_IMETHODIMP
-SpeechRecognition::StartRecording(DOMMediaStream* aDOMStream)
-{
- // hold a reference so that the underlying stream
- // doesn't get Destroy()'ed
- mDOMStream = aDOMStream;
-
- if (NS_WARN_IF(!mDOMStream->GetPlaybackStream())) {
- return NS_ERROR_UNEXPECTED;
- }
- mSpeechListener = new SpeechStreamListener(this);
- mDOMStream->GetPlaybackStream()->AddListener(mSpeechListener);
-
- mEndpointer.StartSession();
-
- return mSpeechDetectionTimer->Init(this, kSPEECH_DETECTION_TIMEOUT_MS,
- nsITimer::TYPE_ONE_SHOT);
-}
-
-NS_IMETHODIMP
-SpeechRecognition::StopRecording()
-{
- // we only really need to remove the listener explicitly when testing,
- // as our JS code still holds a reference to mDOMStream and only assigning
- // it to nullptr isn't guaranteed to free the stream and the listener.
- mDOMStream->GetPlaybackStream()->RemoveListener(mSpeechListener);
- mSpeechListener = nullptr;
- mDOMStream = nullptr;
-
- mEndpointer.EndSession();
- DispatchTrustedEvent(NS_LITERAL_STRING("audioend"));
-
- return NS_OK;
-}
-
-NS_IMETHODIMP
-SpeechRecognition::Observe(nsISupports* aSubject, const char* aTopic,
- const char16_t* aData)
-{
- MOZ_ASSERT(NS_IsMainThread(), "Observer invoked off the main thread");
-
- if (!strcmp(aTopic, NS_TIMER_CALLBACK_TOPIC) &&
- StateBetween(STATE_IDLE, STATE_WAITING_FOR_SPEECH)) {
-
- DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
- SpeechRecognitionErrorCode::No_speech,
- NS_LITERAL_STRING("No speech detected (timeout)"));
- } else if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) {
- nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
- obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC);
- obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC);
- } else if (MediaPrefs::WebSpeechFakeFSMEvents() &&
- !strcmp(aTopic, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC)) {
- ProcessTestEventRequest(aSubject, nsDependentString(aData));
- }
-
- return NS_OK;
-}
-
-void
-SpeechRecognition::ProcessTestEventRequest(nsISupports* aSubject, const nsAString& aEventName)
-{
- if (aEventName.EqualsLiteral("EVENT_ABORT")) {
- Abort();
- } else if (aEventName.EqualsLiteral("EVENT_AUDIO_ERROR")) {
- DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
- SpeechRecognitionErrorCode::Audio_capture, // TODO different codes?
- NS_LITERAL_STRING("AUDIO_ERROR test event"));
- } else {
- NS_ASSERTION(MediaPrefs::WebSpeechFakeRecognitionService(),
- "Got request for fake recognition service event, but "
- TEST_PREFERENCE_FAKE_RECOGNITION_SERVICE " is unset");
-
- // let the fake recognition service handle the request
- }
-
- return;
-}
-
-already_AddRefed<SpeechGrammarList>
-SpeechRecognition::Grammars() const
-{
- RefPtr<SpeechGrammarList> speechGrammarList = mSpeechGrammarList;
- return speechGrammarList.forget();
-}
-
-void
-SpeechRecognition::SetGrammars(SpeechGrammarList& aArg)
-{
- mSpeechGrammarList = &aArg;
-}
-
-void
-SpeechRecognition::GetLang(nsString& aRetVal) const
-{
- aRetVal = mLang;
-}
-
-void
-SpeechRecognition::SetLang(const nsAString& aArg)
-{
- mLang = aArg;
-}
-
-bool
-SpeechRecognition::GetContinuous(ErrorResult& aRv) const
-{
- aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
- return false;
-}
-
-void
-SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv)
-{
- aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
- return;
-}
-
-bool
-SpeechRecognition::InterimResults() const
-{
- return mInterimResults;
-}
-
-void
-SpeechRecognition::SetInterimResults(bool aArg)
-{
- mInterimResults = aArg;
- return;
-}
-
-uint32_t
-SpeechRecognition::MaxAlternatives() const
-{
- return mMaxAlternatives;
-}
-
-void
-SpeechRecognition::SetMaxAlternatives(uint32_t aArg)
-{
- mMaxAlternatives = aArg;
- return;
-}
-
-void
-SpeechRecognition::GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const
-{
- aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
- return;
-}
-
-void
-SpeechRecognition::SetServiceURI(const nsAString& aArg, ErrorResult& aRv)
-{
- aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
- return;
-}
-
-void
-SpeechRecognition::Start(const Optional<NonNull<DOMMediaStream>>& aStream, ErrorResult& aRv)
-{
- if (mCurrentState != STATE_IDLE) {
- aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
- return;
- }
-
- if (!SetRecognitionService(aRv)) {
- return;
- }
-
- if (!ValidateAndSetGrammarList(aRv)) {
- return;
- }
-
- nsresult rv;
- rv = mRecognitionService->Initialize(this);
- if (NS_WARN_IF(NS_FAILED(rv))) {
- return;
- }
-
- MediaStreamConstraints constraints;
- constraints.mAudio.SetAsBoolean() = true;
-
- if (aStream.WasPassed()) {
- StartRecording(&aStream.Value());
- } else {
- AutoNoJSAPI();
- MediaManager* manager = MediaManager::Get();
- manager->GetUserMedia(GetOwner(),
- constraints,
- new GetUserMediaSuccessCallback(this),
- new GetUserMediaErrorCallback(this));
- }
-
- RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_START);
- NS_DispatchToMainThread(event);
-}
-
-bool
-SpeechRecognition::SetRecognitionService(ErrorResult& aRv)
-{
- // See: https://dvcs.w3.org/hg/speech-api/raw-file/tip/webspeechapi.html#dfn-lang
- if (!mLang.IsEmpty()) {
- mRecognitionService = GetSpeechRecognitionService(mLang);
-
- if (!mRecognitionService) {
- aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
- return false;
- }
-
- return true;
- }
-
- nsCOMPtr<nsPIDOMWindowInner> window = GetOwner();
- if(!window) {
- aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
- return false;
- }
- nsCOMPtr<nsIDocument> document = window->GetExtantDoc();
- if(!document) {
- aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
- return false;
- }
- nsCOMPtr<Element> element = document->GetRootElement();
- if(!element) {
- aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
- return false;
- }
-
- nsAutoString lang;
- element->GetLang(lang);
- mRecognitionService = GetSpeechRecognitionService(lang);
-
- if (!mRecognitionService) {
- aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
- return false;
- }
-
- return true;
-}
-
-bool
-SpeechRecognition::ValidateAndSetGrammarList(ErrorResult& aRv)
-{
- if (!mSpeechGrammarList) {
- aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
- return false;
- }
-
- uint32_t grammarListLength = mSpeechGrammarList->Length();
- if (0 == grammarListLength) {
- aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
- return false;
- }
-
- for (uint32_t count = 0; count < grammarListLength; ++count) {
- RefPtr<SpeechGrammar> speechGrammar = mSpeechGrammarList->Item(count, aRv);
- if (aRv.Failed()) {
- return false;
- }
- if (NS_FAILED(mRecognitionService->ValidateAndSetGrammarList(speechGrammar.get(), nullptr))) {
- aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
- return false;
- }
- }
-
- return true;
-}
-
-void
-SpeechRecognition::Stop()
-{
- RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_STOP);
- NS_DispatchToMainThread(event);
-}
-
-void
-SpeechRecognition::Abort()
-{
- if (mAborted) {
- return;
- }
-
- mAborted = true;
- RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_ABORT);
- NS_DispatchToMainThread(event);
-}
-
-void
-SpeechRecognition::DispatchError(EventType aErrorType,
- SpeechRecognitionErrorCode aErrorCode,
- const nsAString& aMessage)
-{
- MOZ_ASSERT(NS_IsMainThread());
- MOZ_ASSERT(aErrorType == EVENT_RECOGNITIONSERVICE_ERROR ||
- aErrorType == EVENT_AUDIO_ERROR, "Invalid error type!");
-
- RefPtr<SpeechRecognitionError> srError =
- new SpeechRecognitionError(nullptr, nullptr, nullptr);
-
- srError->InitSpeechRecognitionError(NS_LITERAL_STRING("error"), true, false,
- aErrorCode, aMessage);
-
- RefPtr<SpeechEvent> event = new SpeechEvent(this, aErrorType);
- event->mError = srError;
- NS_DispatchToMainThread(event);
-}
-
-/*
- * Buffer up to aSampleCount audio samples into mAudioSamplesBuffer, stopping
- * once mAudioSamplesPerChunk samples are buffered.
- * Updates mBufferedSamples and returns the number of samples that were buffered.
- */
-uint32_t
-SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples,
- uint32_t aSampleCount)
-{
- MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk);
- MOZ_ASSERT(mAudioSamplesBuffer.get());
-
- int16_t* samplesBuffer = static_cast<int16_t*>(mAudioSamplesBuffer->Data());
- size_t samplesToCopy = std::min(aSampleCount,
- mAudioSamplesPerChunk - mBufferedSamples);
-
- memcpy(samplesBuffer + mBufferedSamples, aSamples,
- samplesToCopy * sizeof(int16_t));
-
- mBufferedSamples += samplesToCopy;
- return samplesToCopy;
-}
-
-/*
- * Split a samples buffer of a given size into chunks of equal size
- * (mAudioSamplesPerChunk). The chunks are stored in the array received
- * as an argument.
- * Returns the offset of the end of the last chunk that was created.
- */
-uint32_t
-SpeechRecognition::SplitSamplesBuffer(const int16_t* aSamplesBuffer,
- uint32_t aSampleCount,
- nsTArray<RefPtr<SharedBuffer>>& aResult)
-{
- uint32_t chunkStart = 0;
-
- while (chunkStart + mAudioSamplesPerChunk <= aSampleCount) {
- RefPtr<SharedBuffer> chunk =
- SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));
-
- memcpy(chunk->Data(), aSamplesBuffer + chunkStart,
- mAudioSamplesPerChunk * sizeof(int16_t));
-
- aResult.AppendElement(chunk.forget());
- chunkStart += mAudioSamplesPerChunk;
- }
-
- return chunkStart;
-}
-
-AudioSegment*
-SpeechRecognition::CreateAudioSegment(nsTArray<RefPtr<SharedBuffer>>& aChunks)
-{
- AudioSegment* segment = new AudioSegment();
- for (uint32_t i = 0; i < aChunks.Length(); ++i) {
- RefPtr<SharedBuffer> buffer = aChunks[i];
- const int16_t* chunkData = static_cast<const int16_t*>(buffer->Data());
-
- AutoTArray<const int16_t*, 1> channels;
- channels.AppendElement(chunkData);
- segment->AppendFrames(buffer.forget(), channels, mAudioSamplesPerChunk,
- PRINCIPAL_HANDLE_NONE);
- }
-
- return segment;
-}
-
-void
-SpeechRecognition::FeedAudioData(already_AddRefed<SharedBuffer> aSamples,
- uint32_t aDuration,
- MediaStreamListener* aProvider, TrackRate aTrackRate)
-{
- NS_ASSERTION(!NS_IsMainThread(),
- "FeedAudioData should not be called in the main thread");
-
- // Endpointer expects to receive samples in chunks whose size is a
- // multiple of its frame size.
- // Since we can't assume we will receive the frames in appropriately sized
- // chunks, we must buffer and split them into chunks of mAudioSamplesPerChunk
- // (a multiple of the Endpointer's frame size) before feeding the Endpointer.
-
- // ensure aSamples is deleted
- RefPtr<SharedBuffer> refSamples = aSamples;
-
- uint32_t samplesIndex = 0;
- const int16_t* samples = static_cast<int16_t*>(refSamples->Data());
- AutoTArray<RefPtr<SharedBuffer>, 5> chunksToSend;
-
- // fill up our buffer and make a chunk out of it, if possible
- if (mBufferedSamples > 0) {
- samplesIndex += FillSamplesBuffer(samples, aDuration);
-
- if (mBufferedSamples == mAudioSamplesPerChunk) {
- chunksToSend.AppendElement(mAudioSamplesBuffer.forget());
- mBufferedSamples = 0;
- }
- }
-
- // create sample chunks of correct size
- if (samplesIndex < aDuration) {
- samplesIndex += SplitSamplesBuffer(samples + samplesIndex,
- aDuration - samplesIndex,
- chunksToSend);
- }
-
- // buffer remaining samples
- if (samplesIndex < aDuration) {
- mBufferedSamples = 0;
- mAudioSamplesBuffer =
- SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));
-
- FillSamplesBuffer(samples + samplesIndex, aDuration - samplesIndex);
- }
-
- AudioSegment* segment = CreateAudioSegment(chunksToSend);
- RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_AUDIO_DATA);
- event->mAudioSegment = segment;
- event->mProvider = aProvider;
- event->mTrackRate = aTrackRate;
- NS_DispatchToMainThread(event);
-
- return;
-}
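To make the buffering arithmetic concrete, assume for illustration that mAudioSamplesPerChunk is 480 and that 100 samples are already buffered when FeedAudioData receives aDuration = 1500 samples:

    FillSamplesBuffer copies 380 samples  -> completes one chunk of 480, samplesIndex = 380
    SplitSamplesBuffer gets 1120 samples  -> produces two chunks of 480, returns 960, samplesIndex = 1340
    160 samples remain                    -> copied into a fresh mAudioSamplesBuffer for the next call

The three complete chunks are wrapped into an AudioSegment and shipped to the main thread as an EVENT_AUDIO_DATA SpeechEvent.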
-
-const char*
-SpeechRecognition::GetName(FSMState aId)
-{
- static const char* names[] = {
- "STATE_IDLE",
- "STATE_STARTING",
- "STATE_ESTIMATING",
- "STATE_WAITING_FOR_SPEECH",
- "STATE_RECOGNIZING",
- "STATE_WAITING_FOR_RESULT",
- };
-
- MOZ_ASSERT(aId < STATE_COUNT);
- MOZ_ASSERT(ArrayLength(names) == STATE_COUNT);
- return names[aId];
-}
-
-const char*
-SpeechRecognition::GetName(SpeechEvent* aEvent)
-{
- static const char* names[] = {
- "EVENT_START",
- "EVENT_STOP",
- "EVENT_ABORT",
- "EVENT_AUDIO_DATA",
- "EVENT_AUDIO_ERROR",
- "EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT",
- "EVENT_RECOGNITIONSERVICE_FINAL_RESULT",
- "EVENT_RECOGNITIONSERVICE_ERROR"
- };
-
- MOZ_ASSERT(aEvent->mType < EVENT_COUNT);
- MOZ_ASSERT(ArrayLength(names) == EVENT_COUNT);
- return names[aEvent->mType];
-}
-
-SpeechEvent::~SpeechEvent()
-{
- delete mAudioSegment;
-}
-
-NS_IMETHODIMP
-SpeechEvent::Run()
-{
- mRecognition->ProcessEvent(this);
- return NS_OK;
-}
-
-NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaSuccessCallback, nsIDOMGetUserMediaSuccessCallback)
-
-NS_IMETHODIMP
-SpeechRecognition::GetUserMediaSuccessCallback::OnSuccess(nsISupports* aStream)
-{
- RefPtr<DOMMediaStream> stream = do_QueryObject(aStream);
- if (!stream) {
- return NS_ERROR_NO_INTERFACE;
- }
- mRecognition->StartRecording(stream);
- return NS_OK;
-}
-
-NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaErrorCallback, nsIDOMGetUserMediaErrorCallback)
-
-NS_IMETHODIMP
-SpeechRecognition::GetUserMediaErrorCallback::OnError(nsISupports* aError)
-{
- RefPtr<MediaStreamError> error = do_QueryObject(aError);
- if (!error) {
- return NS_OK;
- }
- SpeechRecognitionErrorCode errorCode;
-
- nsAutoString name;
- error->GetName(name);
- if (name.EqualsLiteral("PERMISSION_DENIED")) {
- errorCode = SpeechRecognitionErrorCode::Not_allowed;
- } else {
- errorCode = SpeechRecognitionErrorCode::Audio_capture;
- }
-
- nsAutoString message;
- error->GetMessage(message);
- mRecognition->DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, errorCode,
- message);
- return NS_OK;
-}
-
-} // namespace dom
-} // namespace mozilla