Diffstat (limited to 'dom/media/webspeech/recognition/SpeechRecognition.cpp')
-rw-r--r--  dom/media/webspeech/recognition/SpeechRecognition.cpp | 1087
 1 file changed, 1087 insertions, 0 deletions
diff --git a/dom/media/webspeech/recognition/SpeechRecognition.cpp b/dom/media/webspeech/recognition/SpeechRecognition.cpp
new file mode 100644
index 000000000..48b1177b2
--- /dev/null
+++ b/dom/media/webspeech/recognition/SpeechRecognition.cpp
@@ -0,0 +1,1087 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "SpeechRecognition.h"
+
+#include "nsCOMPtr.h"
+#include "nsCycleCollectionParticipant.h"
+
+#include "mozilla/dom/BindingUtils.h"
+#include "mozilla/dom/Element.h"
+#include "mozilla/dom/SpeechRecognitionBinding.h"
+#include "mozilla/dom/MediaStreamTrackBinding.h"
+#include "mozilla/dom/MediaStreamError.h"
+#include "mozilla/MediaManager.h"
+#include "mozilla/Preferences.h"
+#include "MediaPrefs.h"
+#include "mozilla/Services.h"
+
+#include "AudioSegment.h"
+#include "endpointer.h"
+
+#include "mozilla/dom/SpeechRecognitionEvent.h"
+#include "nsContentUtils.h"
+#include "nsIDocument.h"
+#include "nsIObserverService.h"
+#include "nsIPermissionManager.h"
+#include "nsIPrincipal.h"
+#include "nsPIDOMWindow.h"
+#include "nsServiceManagerUtils.h"
+#include "nsQueryObject.h"
+
+#include <algorithm>
+
+// Undo the windows.h damage
+#if defined(XP_WIN) && defined(GetMessage)
+#undef GetMessage
+#endif
+
+namespace mozilla {
+namespace dom {
+
+#define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default"
+#define DEFAULT_RECOGNITION_SERVICE_PREFIX "pocketsphinx-"
+#define DEFAULT_RECOGNITION_SERVICE "pocketsphinx-en-US"
+
+#define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length"
+#define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH "media.webspeech.long_silence_length"
+#define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH "media.webspeech.long_speech_length"
+
+static const uint32_t kSAMPLE_RATE = 16000;
+static const uint32_t kSPEECH_DETECTION_TIMEOUT_MS = 10000;
+
+// number of frames corresponding to 300ms of audio to send to the endpointer
+// while it's in environment estimation mode
+// kSAMPLE_RATE frames = 1s, kESTIMATION_SAMPLES frames = 300ms
+static const uint32_t kESTIMATION_SAMPLES = 300 * kSAMPLE_RATE / 1000;
+
+LogModule*
+GetSpeechRecognitionLog()
+{
+  static LazyLogModule sLog("SpeechRecognition");
+  return sLog;
+}
+#define SR_LOG(...) \
+  MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__))
+
+already_AddRefed<nsISpeechRecognitionService>
+GetSpeechRecognitionService(const nsAString& aLang)
+{
+  nsAutoCString speechRecognitionServiceCID;
+
+  nsAdoptingCString prefValue =
+    Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE);
+  nsAutoCString speechRecognitionService;
+
+  if (!aLang.IsEmpty()) {
+    speechRecognitionService =
+      NS_LITERAL_CSTRING(DEFAULT_RECOGNITION_SERVICE_PREFIX) +
+      NS_ConvertUTF16toUTF8(aLang);
+  } else if (!prefValue.IsEmpty()) {
+    speechRecognitionService = prefValue;
+  } else {
+    speechRecognitionService = DEFAULT_RECOGNITION_SERVICE;
+  }
+
+  if (MediaPrefs::WebSpeechFakeRecognitionService()) {
+    speechRecognitionServiceCID =
+      NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "fake";
+  } else {
+    speechRecognitionServiceCID =
+      NS_LITERAL_CSTRING(NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX) +
+      speechRecognitionService;
+  }
+
+  nsresult rv;
+  nsCOMPtr<nsISpeechRecognitionService> recognitionService;
+  recognitionService = do_GetService(speechRecognitionServiceCID.get(), &rv);
+  return recognitionService.forget();
+}
+
+NS_IMPL_CYCLE_COLLECTION_INHERITED(SpeechRecognition, DOMEventTargetHelper, mDOMStream, mSpeechGrammarList)
+
+NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION_INHERITED(SpeechRecognition)
+  NS_INTERFACE_MAP_ENTRY(nsIObserver)
+NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper)
+
+NS_IMPL_ADDREF_INHERITED(SpeechRecognition, DOMEventTargetHelper)
+NS_IMPL_RELEASE_INHERITED(SpeechRecognition, DOMEventTargetHelper)
+
+SpeechRecognition::SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow)
+  : DOMEventTargetHelper(aOwnerWindow)
+  , mEndpointer(kSAMPLE_RATE)
+  , mAudioSamplesPerChunk(mEndpointer.FrameSize())
+  , mSpeechDetectionTimer(do_CreateInstance(NS_TIMER_CONTRACTID))
+  , mSpeechGrammarList(new SpeechGrammarList(GetParentObject()))
+  , mInterimResults(false)
+  , mMaxAlternatives(1)
+{
+  SR_LOG("created SpeechRecognition");
+
+  if (MediaPrefs::WebSpeechTestEnabled()) {
+    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
+    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false);
+    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false);
+  }
+
+  mEndpointer.set_speech_input_complete_silence_length(
+    Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 1250000));
+  mEndpointer.set_long_speech_input_complete_silence_length(
+    Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 2500000));
+  mEndpointer.set_long_speech_length(
+    Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH, 3 * 1000000));
+  Reset();
+}
+
+bool
+SpeechRecognition::StateBetween(FSMState begin, FSMState end)
+{
+  return mCurrentState >= begin && mCurrentState <= end;
+}
+
+void
+SpeechRecognition::SetState(FSMState state)
+{
+  mCurrentState = state;
+  SR_LOG("Transitioned to state %s", GetName(mCurrentState));
+  return;
+}
+
+JSObject*
+SpeechRecognition::WrapObject(JSContext* aCx, JS::Handle<JSObject*> aGivenProto)
+{
+  return SpeechRecognitionBinding::Wrap(aCx, this, aGivenProto);
+}
+
+bool
+SpeechRecognition::IsAuthorized(JSContext* aCx, JSObject* aGlobal)
+{
+  nsCOMPtr<nsIPrincipal> principal = nsContentUtils::ObjectPrincipal(aGlobal);
+
+  nsresult rv;
+  nsCOMPtr<nsIPermissionManager> mgr = do_GetService(NS_PERMISSIONMANAGER_CONTRACTID, &rv);
+  if (NS_WARN_IF(NS_FAILED(rv))) {
+    return false;
+  }
+
+  uint32_t speechRecognition = nsIPermissionManager::UNKNOWN_ACTION;
+  rv = mgr->TestExactPermissionFromPrincipal(principal,
"speech-recognition", &speechRecognition); + if (NS_WARN_IF(NS_FAILED(rv))) { + return false; + } + + bool hasPermission = (speechRecognition == nsIPermissionManager::ALLOW_ACTION); + + return (hasPermission || MediaPrefs::WebSpeechRecognitionForceEnabled() + || MediaPrefs::WebSpeechTestEnabled()) + && MediaPrefs::WebSpeechRecognitionEnabled(); +} + +already_AddRefed<SpeechRecognition> +SpeechRecognition::Constructor(const GlobalObject& aGlobal, + ErrorResult& aRv) +{ + nsCOMPtr<nsPIDOMWindowInner> win = do_QueryInterface(aGlobal.GetAsSupports()); + if (!win) { + aRv.Throw(NS_ERROR_FAILURE); + } + + MOZ_ASSERT(win->IsInnerWindow()); + RefPtr<SpeechRecognition> object = new SpeechRecognition(win); + return object.forget(); +} + +nsISupports* +SpeechRecognition::GetParentObject() const +{ + return GetOwner(); +} + +void +SpeechRecognition::ProcessEvent(SpeechEvent* aEvent) +{ + SR_LOG("Processing %s, current state is %s", + GetName(aEvent), + GetName(mCurrentState)); + + if (mAborted && aEvent->mType != EVENT_ABORT) { + // ignore all events while aborting + return; + } + + Transition(aEvent); +} + +void +SpeechRecognition::Transition(SpeechEvent* aEvent) +{ + switch (mCurrentState) { + case STATE_IDLE: + switch (aEvent->mType) { + case EVENT_START: + // TODO: may want to time out if we wait too long + // for user to approve + WaitForAudioData(aEvent); + break; + case EVENT_STOP: + case EVENT_ABORT: + case EVENT_AUDIO_DATA: + case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: + case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: + DoNothing(aEvent); + break; + case EVENT_AUDIO_ERROR: + case EVENT_RECOGNITIONSERVICE_ERROR: + AbortError(aEvent); + break; + case EVENT_COUNT: + MOZ_CRASH("Invalid event EVENT_COUNT"); + } + break; + case STATE_STARTING: + switch (aEvent->mType) { + case EVENT_AUDIO_DATA: + StartedAudioCapture(aEvent); + break; + case EVENT_AUDIO_ERROR: + case EVENT_RECOGNITIONSERVICE_ERROR: + AbortError(aEvent); + break; + case EVENT_ABORT: + AbortSilently(aEvent); + break; + case EVENT_STOP: + Reset(); + break; + case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: + case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: + DoNothing(aEvent); + break; + case EVENT_START: + SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent)); + MOZ_CRASH(); + case EVENT_COUNT: + MOZ_CRASH("Invalid event EVENT_COUNT"); + } + break; + case STATE_ESTIMATING: + switch (aEvent->mType) { + case EVENT_AUDIO_DATA: + WaitForEstimation(aEvent); + break; + case EVENT_STOP: + StopRecordingAndRecognize(aEvent); + break; + case EVENT_ABORT: + AbortSilently(aEvent); + break; + case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: + case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: + case EVENT_RECOGNITIONSERVICE_ERROR: + DoNothing(aEvent); + break; + case EVENT_AUDIO_ERROR: + AbortError(aEvent); + break; + case EVENT_START: + SR_LOG("STATE_ESTIMATING: Unhandled event %d", aEvent->mType); + MOZ_CRASH(); + case EVENT_COUNT: + MOZ_CRASH("Invalid event EVENT_COUNT"); + } + break; + case STATE_WAITING_FOR_SPEECH: + switch (aEvent->mType) { + case EVENT_AUDIO_DATA: + DetectSpeech(aEvent); + break; + case EVENT_STOP: + StopRecordingAndRecognize(aEvent); + break; + case EVENT_ABORT: + AbortSilently(aEvent); + break; + case EVENT_AUDIO_ERROR: + AbortError(aEvent); + break; + case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: + case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: + case EVENT_RECOGNITIONSERVICE_ERROR: + DoNothing(aEvent); + break; + case EVENT_START: + SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent)); + MOZ_CRASH(); + 
+        case EVENT_COUNT:
+          MOZ_CRASH("Invalid event EVENT_COUNT");
+      }
+      break;
+    case STATE_RECOGNIZING:
+      switch (aEvent->mType) {
+        case EVENT_AUDIO_DATA:
+          WaitForSpeechEnd(aEvent);
+          break;
+        case EVENT_STOP:
+          StopRecordingAndRecognize(aEvent);
+          break;
+        case EVENT_AUDIO_ERROR:
+        case EVENT_RECOGNITIONSERVICE_ERROR:
+          AbortError(aEvent);
+          break;
+        case EVENT_ABORT:
+          AbortSilently(aEvent);
+          break;
+        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
+        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
+          DoNothing(aEvent);
+          break;
+        case EVENT_START:
+          SR_LOG("STATE_RECOGNIZING: Unhandled aEvent %s", GetName(aEvent));
+          MOZ_CRASH();
+        case EVENT_COUNT:
+          MOZ_CRASH("Invalid event EVENT_COUNT");
+      }
+      break;
+    case STATE_WAITING_FOR_RESULT:
+      switch (aEvent->mType) {
+        case EVENT_STOP:
+          DoNothing(aEvent);
+          break;
+        case EVENT_AUDIO_ERROR:
+        case EVENT_RECOGNITIONSERVICE_ERROR:
+          AbortError(aEvent);
+          break;
+        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
+          NotifyFinalResult(aEvent);
+          break;
+        case EVENT_AUDIO_DATA:
+          DoNothing(aEvent);
+          break;
+        case EVENT_ABORT:
+          AbortSilently(aEvent);
+          break;
+        case EVENT_START:
+        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
+          SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled aEvent %s", GetName(aEvent));
+          MOZ_CRASH();
+        case EVENT_COUNT:
+          MOZ_CRASH("Invalid event EVENT_COUNT");
+      }
+      break;
+    case STATE_COUNT:
+      MOZ_CRASH("Invalid state STATE_COUNT");
+  }
+
+  return;
+}
+
+/*
+ * Handle a segment of recorded audio data.
+ * Returns the number of samples that were processed.
+ */
+uint32_t
+SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate)
+{
+  AudioSegment::ChunkIterator iterator(*aSegment);
+  uint32_t samples = 0;
+  while (!iterator.IsEnded()) {
+    float out;
+    mEndpointer.ProcessAudio(*iterator, &out);
+    samples += iterator->GetDuration();
+    iterator.Next();
+  }
+
+  mRecognitionService->ProcessAudioSegment(aSegment, aTrackRate);
+  return samples;
+}
+
+/****************************************************************************
+ * FSM Transition functions
+ *
+ * If a transition function may cause a DOM event to be fired,
+ * it may also be re-entered, since the event handler may cause the
+ * event loop to spin and new SpeechEvents to be processed.
+ *
+ * Rules:
+ * 1) These methods should call SetState as soon as possible.
+ * 2) If these methods dispatch DOM events, or call methods that dispatch
+ *    DOM events, that should be done as late as possible.
+ * 3) If anything must happen after dispatching a DOM event, make sure
+ *    the state is still what the method expected it to be.
+ ****************************************************************************/
+
+void
+SpeechRecognition::Reset()
+{
+  SetState(STATE_IDLE);
+  mRecognitionService = nullptr;
+  mEstimationSamples = 0;
+  mBufferedSamples = 0;
+  mSpeechDetectionTimer->Cancel();
+  mAborted = false;
+}
+
+void
+SpeechRecognition::ResetAndEnd()
+{
+  Reset();
+  DispatchTrustedEvent(NS_LITERAL_STRING("end"));
+}
+
+void
+SpeechRecognition::WaitForAudioData(SpeechEvent* aEvent)
+{
+  SetState(STATE_STARTING);
+}
+
+void
+SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent)
+{
+  SetState(STATE_ESTIMATING);
+
+  mEndpointer.SetEnvironmentEstimationMode();
+  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
+
+  DispatchTrustedEvent(NS_LITERAL_STRING("audiostart"));
+  if (mCurrentState == STATE_ESTIMATING) {
+    DispatchTrustedEvent(NS_LITERAL_STRING("start"));
+  }
+}
+
+void
+SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent)
+{
+  SetState(STATE_WAITING_FOR_RESULT);
+
+  MOZ_ASSERT(mRecognitionService, "Service deleted before recording done");
+  mRecognitionService->SoundEnd();
+
+  StopRecording();
+}
+
+void
+SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent)
+{
+  SetState(STATE_ESTIMATING);
+
+  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
+  if (mEstimationSamples > kESTIMATION_SAMPLES) {
+    mEndpointer.SetUserInputMode();
+    SetState(STATE_WAITING_FOR_SPEECH);
+  }
+}
+
+void
+SpeechRecognition::DetectSpeech(SpeechEvent* aEvent)
+{
+  SetState(STATE_WAITING_FOR_SPEECH);
+
+  ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
+  if (mEndpointer.DidStartReceivingSpeech()) {
+    mSpeechDetectionTimer->Cancel();
+    SetState(STATE_RECOGNIZING);
+    DispatchTrustedEvent(NS_LITERAL_STRING("speechstart"));
+  }
+}
+
+void
+SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent)
+{
+  SetState(STATE_RECOGNIZING);
+
+  ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
+  if (mEndpointer.speech_input_complete()) {
+    DispatchTrustedEvent(NS_LITERAL_STRING("speechend"));
+
+    if (mCurrentState == STATE_RECOGNIZING) {
+      // FIXME: StopRecordingAndRecognize should only be called for single-shot
+      // services; for continuous ones we should just inform the service
+      StopRecordingAndRecognize(aEvent);
+    }
+  }
+}
+
+void
+SpeechRecognition::NotifyFinalResult(SpeechEvent* aEvent)
+{
+  ResetAndEnd();
+
+  RootedDictionary<SpeechRecognitionEventInit> init(RootingCx());
+  init.mBubbles = true;
+  init.mCancelable = false;
+  // init.mResultIndex = 0;
+  init.mResults = aEvent->mRecognitionResultList;
+  init.mInterpretation = JS::NullValue();
+  // init.mEmma = nullptr;
+
+  RefPtr<SpeechRecognitionEvent> event =
+    SpeechRecognitionEvent::Constructor(this, NS_LITERAL_STRING("result"), init);
+  event->SetTrusted(true);
+
+  bool defaultActionEnabled;
+  this->DispatchEvent(event, &defaultActionEnabled);
+}
+
+void
+SpeechRecognition::DoNothing(SpeechEvent* aEvent)
+{
+}
+
+void
+SpeechRecognition::AbortSilently(SpeechEvent* aEvent)
+{
+  if (mRecognitionService) {
+    mRecognitionService->Abort();
+  }
+
+  if (mDOMStream) {
+    StopRecording();
+  }
+
+  ResetAndEnd();
+}
+
+void
+SpeechRecognition::AbortError(SpeechEvent* aEvent)
+{
+  AbortSilently(aEvent);
+  NotifyError(aEvent);
+}
+
+void
+SpeechRecognition::NotifyError(SpeechEvent* aEvent)
+{
+  aEvent->mError->SetTrusted(true);
+
+  bool defaultActionEnabled;
+  this->DispatchEvent(aEvent->mError, &defaultActionEnabled);
+
+  return;
+}
+
+/**************************************
+ * Event triggers and other functions *
+ **************************************/
+NS_IMETHODIMP
+SpeechRecognition::StartRecording(DOMMediaStream* aDOMStream)
+{
+  // hold a reference so that the underlying stream
+  // doesn't get Destroy()'ed
+  mDOMStream = aDOMStream;
+
+  if (NS_WARN_IF(!mDOMStream->GetPlaybackStream())) {
+    return NS_ERROR_UNEXPECTED;
+  }
+  mSpeechListener = new SpeechStreamListener(this);
+  mDOMStream->GetPlaybackStream()->AddListener(mSpeechListener);
+
+  mEndpointer.StartSession();
+
+  return mSpeechDetectionTimer->Init(this, kSPEECH_DETECTION_TIMEOUT_MS,
+                                     nsITimer::TYPE_ONE_SHOT);
+}
+
+NS_IMETHODIMP
+SpeechRecognition::StopRecording()
+{
+  // we only really need to remove the listener explicitly when testing,
+  // as our JS code still holds a reference to mDOMStream and only assigning
+  // it to nullptr isn't guaranteed to free the stream and the listener.
+  mDOMStream->GetPlaybackStream()->RemoveListener(mSpeechListener);
+  mSpeechListener = nullptr;
+  mDOMStream = nullptr;
+
+  mEndpointer.EndSession();
+  DispatchTrustedEvent(NS_LITERAL_STRING("audioend"));
+
+  return NS_OK;
+}
+
+NS_IMETHODIMP
+SpeechRecognition::Observe(nsISupports* aSubject, const char* aTopic,
+                           const char16_t* aData)
+{
+  MOZ_ASSERT(NS_IsMainThread(), "Observer invoked off the main thread");
+
+  if (!strcmp(aTopic, NS_TIMER_CALLBACK_TOPIC) &&
+      StateBetween(STATE_IDLE, STATE_WAITING_FOR_SPEECH)) {
+
+    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
+                  SpeechRecognitionErrorCode::No_speech,
+                  NS_LITERAL_STRING("No speech detected (timeout)"));
+  } else if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) {
+    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
+    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC);
+    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC);
+  } else if (MediaPrefs::WebSpeechFakeFSMEvents() &&
+             !strcmp(aTopic, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC)) {
+    ProcessTestEventRequest(aSubject, nsDependentString(aData));
+  }
+
+  return NS_OK;
+}
+
+void
+SpeechRecognition::ProcessTestEventRequest(nsISupports* aSubject, const nsAString& aEventName)
+{
+  if (aEventName.EqualsLiteral("EVENT_ABORT")) {
+    Abort();
+  } else if (aEventName.EqualsLiteral("EVENT_AUDIO_ERROR")) {
+    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
+                  SpeechRecognitionErrorCode::Audio_capture, // TODO different codes?
+ NS_LITERAL_STRING("AUDIO_ERROR test event")); + } else { + NS_ASSERTION(MediaPrefs::WebSpeechFakeRecognitionService(), + "Got request for fake recognition service event, but " + TEST_PREFERENCE_FAKE_RECOGNITION_SERVICE " is unset"); + + // let the fake recognition service handle the request + } + + return; +} + +already_AddRefed<SpeechGrammarList> +SpeechRecognition::Grammars() const +{ + RefPtr<SpeechGrammarList> speechGrammarList = mSpeechGrammarList; + return speechGrammarList.forget(); +} + +void +SpeechRecognition::SetGrammars(SpeechGrammarList& aArg) +{ + mSpeechGrammarList = &aArg; +} + +void +SpeechRecognition::GetLang(nsString& aRetVal) const +{ + aRetVal = mLang; +} + +void +SpeechRecognition::SetLang(const nsAString& aArg) +{ + mLang = aArg; +} + +bool +SpeechRecognition::GetContinuous(ErrorResult& aRv) const +{ + aRv.Throw(NS_ERROR_NOT_IMPLEMENTED); + return false; +} + +void +SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv) +{ + aRv.Throw(NS_ERROR_NOT_IMPLEMENTED); + return; +} + +bool +SpeechRecognition::InterimResults() const +{ + return mInterimResults; +} + +void +SpeechRecognition::SetInterimResults(bool aArg) +{ + mInterimResults = aArg; + return; +} + +uint32_t +SpeechRecognition::MaxAlternatives() const +{ + return mMaxAlternatives; +} + +void +SpeechRecognition::SetMaxAlternatives(uint32_t aArg) +{ + mMaxAlternatives = aArg; + return; +} + +void +SpeechRecognition::GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const +{ + aRv.Throw(NS_ERROR_NOT_IMPLEMENTED); + return; +} + +void +SpeechRecognition::SetServiceURI(const nsAString& aArg, ErrorResult& aRv) +{ + aRv.Throw(NS_ERROR_NOT_IMPLEMENTED); + return; +} + +void +SpeechRecognition::Start(const Optional<NonNull<DOMMediaStream>>& aStream, ErrorResult& aRv) +{ + if (mCurrentState != STATE_IDLE) { + aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); + return; + } + + if (!SetRecognitionService(aRv)) { + return; + } + + if (!ValidateAndSetGrammarList(aRv)) { + return; + } + + nsresult rv; + rv = mRecognitionService->Initialize(this); + if (NS_WARN_IF(NS_FAILED(rv))) { + return; + } + + MediaStreamConstraints constraints; + constraints.mAudio.SetAsBoolean() = true; + + if (aStream.WasPassed()) { + StartRecording(&aStream.Value()); + } else { + AutoNoJSAPI(); + MediaManager* manager = MediaManager::Get(); + manager->GetUserMedia(GetOwner(), + constraints, + new GetUserMediaSuccessCallback(this), + new GetUserMediaErrorCallback(this)); + } + + RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_START); + NS_DispatchToMainThread(event); +} + +bool +SpeechRecognition::SetRecognitionService(ErrorResult& aRv) +{ + // See: https://dvcs.w3.org/hg/speech-api/raw-file/tip/webspeechapi.html#dfn-lang + if (!mLang.IsEmpty()) { + mRecognitionService = GetSpeechRecognitionService(mLang); + + if (!mRecognitionService) { + aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); + return false; + } + + return true; + } + + nsCOMPtr<nsPIDOMWindowInner> window = GetOwner(); + if(!window) { + aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); + return false; + } + nsCOMPtr<nsIDocument> document = window->GetExtantDoc(); + if(!document) { + aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); + return false; + } + nsCOMPtr<Element> element = document->GetRootElement(); + if(!element) { + aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); + return false; + } + + nsAutoString lang; + element->GetLang(lang); + mRecognitionService = GetSpeechRecognitionService(lang); + + if (!mRecognitionService) { + aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); + return false; + } + + 
+  return true;
+}
+
+bool
+SpeechRecognition::ValidateAndSetGrammarList(ErrorResult& aRv)
+{
+  if (!mSpeechGrammarList) {
+    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
+    return false;
+  }
+
+  uint32_t grammarListLength = mSpeechGrammarList->Length();
+  if (0 == grammarListLength) {
+    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
+    return false;
+  }
+
+  for (uint32_t count = 0; count < grammarListLength; ++count) {
+    RefPtr<SpeechGrammar> speechGrammar = mSpeechGrammarList->Item(count, aRv);
+    if (aRv.Failed()) {
+      return false;
+    }
+    if (NS_FAILED(mRecognitionService->ValidateAndSetGrammarList(speechGrammar.get(), nullptr))) {
+      aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+void
+SpeechRecognition::Stop()
+{
+  RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_STOP);
+  NS_DispatchToMainThread(event);
+}
+
+void
+SpeechRecognition::Abort()
+{
+  if (mAborted) {
+    return;
+  }
+
+  mAborted = true;
+  RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_ABORT);
+  NS_DispatchToMainThread(event);
+}
+
+void
+SpeechRecognition::DispatchError(EventType aErrorType,
+                                 SpeechRecognitionErrorCode aErrorCode,
+                                 const nsAString& aMessage)
+{
+  MOZ_ASSERT(NS_IsMainThread());
+  MOZ_ASSERT(aErrorType == EVENT_RECOGNITIONSERVICE_ERROR ||
+             aErrorType == EVENT_AUDIO_ERROR, "Invalid error type!");
+
+  RefPtr<SpeechRecognitionError> srError =
+    new SpeechRecognitionError(nullptr, nullptr, nullptr);
+
+  srError->InitSpeechRecognitionError(NS_LITERAL_STRING("error"), true, false,
+                                      aErrorCode, aMessage);
+
+  RefPtr<SpeechEvent> event = new SpeechEvent(this, aErrorType);
+  event->mError = srError;
+  NS_DispatchToMainThread(event);
+}
+
+/*
+ * Buffer audio samples into mAudioSamplesBuffer until it holds a full chunk
+ * of mAudioSamplesPerChunk samples.
+ * Updates mBufferedSamples and returns the number of samples that were buffered.
+ */
+uint32_t
+SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples,
+                                     uint32_t aSampleCount)
+{
+  MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk);
+  MOZ_ASSERT(mAudioSamplesBuffer.get());
+
+  int16_t* samplesBuffer = static_cast<int16_t*>(mAudioSamplesBuffer->Data());
+  size_t samplesToCopy = std::min(aSampleCount,
+                                  mAudioSamplesPerChunk - mBufferedSamples);
+
+  memcpy(samplesBuffer + mBufferedSamples, aSamples,
+         samplesToCopy * sizeof(int16_t));
+
+  mBufferedSamples += samplesToCopy;
+  return samplesToCopy;
+}
+
+/*
+ * Split a buffer of samples into chunks of equal size. The chunks are
+ * stored in the array received as argument.
+ * Returns the offset of the end of the last chunk that was created.
+ */
+uint32_t
+SpeechRecognition::SplitSamplesBuffer(const int16_t* aSamplesBuffer,
+                                      uint32_t aSampleCount,
+                                      nsTArray<RefPtr<SharedBuffer>>& aResult)
+{
+  uint32_t chunkStart = 0;
+
+  while (chunkStart + mAudioSamplesPerChunk <= aSampleCount) {
+    RefPtr<SharedBuffer> chunk =
+      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));
+
+    memcpy(chunk->Data(), aSamplesBuffer + chunkStart,
+           mAudioSamplesPerChunk * sizeof(int16_t));
+
+    aResult.AppendElement(chunk.forget());
+    chunkStart += mAudioSamplesPerChunk;
+  }
+
+  return chunkStart;
+}
+
+AudioSegment*
+SpeechRecognition::CreateAudioSegment(nsTArray<RefPtr<SharedBuffer>>& aChunks)
+{
+  AudioSegment* segment = new AudioSegment();
+  for (uint32_t i = 0; i < aChunks.Length(); ++i) {
+    RefPtr<SharedBuffer> buffer = aChunks[i];
+    const int16_t* chunkData = static_cast<const int16_t*>(buffer->Data());
+
+    AutoTArray<const int16_t*, 1> channels;
+    channels.AppendElement(chunkData);
+    segment->AppendFrames(buffer.forget(), channels, mAudioSamplesPerChunk,
+                          PRINCIPAL_HANDLE_NONE);
+  }
+
+  return segment;
+}
+
+void
+SpeechRecognition::FeedAudioData(already_AddRefed<SharedBuffer> aSamples,
+                                 uint32_t aDuration,
+                                 MediaStreamListener* aProvider, TrackRate aTrackRate)
+{
+  NS_ASSERTION(!NS_IsMainThread(),
+               "FeedAudioData should not be called in the main thread");
+
+  // Endpointer expects to receive samples in chunks whose size is a
+  // multiple of its frame size.
+  // Since we can't assume we will receive the frames in appropriate-sized
+  // chunks, we must buffer and split them in chunks of mAudioSamplesPerChunk
+  // (a multiple of Endpointer's frame size) before feeding to Endpointer.
+
+  // ensure aSamples is deleted
+  RefPtr<SharedBuffer> refSamples = aSamples;
+
+  uint32_t samplesIndex = 0;
+  const int16_t* samples = static_cast<int16_t*>(refSamples->Data());
+  AutoTArray<RefPtr<SharedBuffer>, 5> chunksToSend;
+
+  // fill up our buffer and make a chunk out of it, if possible
+  if (mBufferedSamples > 0) {
+    samplesIndex += FillSamplesBuffer(samples, aDuration);
+
+    if (mBufferedSamples == mAudioSamplesPerChunk) {
+      chunksToSend.AppendElement(mAudioSamplesBuffer.forget());
+      mBufferedSamples = 0;
+    }
+  }
+
+  // create sample chunks of correct size
+  if (samplesIndex < aDuration) {
+    samplesIndex += SplitSamplesBuffer(samples + samplesIndex,
+                                       aDuration - samplesIndex,
+                                       chunksToSend);
+  }
+
+  // buffer remaining samples
+  if (samplesIndex < aDuration) {
+    mBufferedSamples = 0;
+    mAudioSamplesBuffer =
+      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));
+
+    FillSamplesBuffer(samples + samplesIndex, aDuration - samplesIndex);
+  }
+
+  AudioSegment* segment = CreateAudioSegment(chunksToSend);
+  RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_AUDIO_DATA);
+  event->mAudioSegment = segment;
+  event->mProvider = aProvider;
+  event->mTrackRate = aTrackRate;
+  NS_DispatchToMainThread(event);
+
+  return;
+}
+
+const char*
+SpeechRecognition::GetName(FSMState aId)
+{
+  static const char* names[] = {
+    "STATE_IDLE",
+    "STATE_STARTING",
+    "STATE_ESTIMATING",
+    "STATE_WAITING_FOR_SPEECH",
+    "STATE_RECOGNIZING",
+    "STATE_WAITING_FOR_RESULT",
+  };
+
+  MOZ_ASSERT(aId < STATE_COUNT);
+  MOZ_ASSERT(ArrayLength(names) == STATE_COUNT);
+  return names[aId];
+}
+
+const char*
+SpeechRecognition::GetName(SpeechEvent* aEvent)
+{
+  static const char* names[] = {
+    "EVENT_START",
+    "EVENT_STOP",
+    "EVENT_ABORT",
+    "EVENT_AUDIO_DATA",
+    "EVENT_AUDIO_ERROR",
+    "EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT",
"EVENT_RECOGNITIONSERVICE_FINAL_RESULT", + "EVENT_RECOGNITIONSERVICE_ERROR" + }; + + MOZ_ASSERT(aEvent->mType < EVENT_COUNT); + MOZ_ASSERT(ArrayLength(names) == EVENT_COUNT); + return names[aEvent->mType]; +} + +SpeechEvent::~SpeechEvent() +{ + delete mAudioSegment; +} + +NS_IMETHODIMP +SpeechEvent::Run() +{ + mRecognition->ProcessEvent(this); + return NS_OK; +} + +NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaSuccessCallback, nsIDOMGetUserMediaSuccessCallback) + +NS_IMETHODIMP +SpeechRecognition::GetUserMediaSuccessCallback::OnSuccess(nsISupports* aStream) +{ + RefPtr<DOMMediaStream> stream = do_QueryObject(aStream); + if (!stream) { + return NS_ERROR_NO_INTERFACE; + } + mRecognition->StartRecording(stream); + return NS_OK; +} + +NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaErrorCallback, nsIDOMGetUserMediaErrorCallback) + +NS_IMETHODIMP +SpeechRecognition::GetUserMediaErrorCallback::OnError(nsISupports* aError) +{ + RefPtr<MediaStreamError> error = do_QueryObject(aError); + if (!error) { + return NS_OK; + } + SpeechRecognitionErrorCode errorCode; + + nsAutoString name; + error->GetName(name); + if (name.EqualsLiteral("PERMISSION_DENIED")) { + errorCode = SpeechRecognitionErrorCode::Not_allowed; + } else { + errorCode = SpeechRecognitionErrorCode::Audio_capture; + } + + nsAutoString message; + error->GetMessage(message); + mRecognition->DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, errorCode, + message); + return NS_OK; +} + +} // namespace dom +} // namespace mozilla |