/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef mozilla_dom_SpeechRecognition_h
#define mozilla_dom_SpeechRecognition_h

#include "mozilla/Attributes.h"
#include "mozilla/DOMEventTargetHelper.h"
#include "nsCOMPtr.h"
#include "nsString.h"
#include "nsWrapperCache.h"
#include "nsTArray.h"
#include "js/TypeDecls.h"

#include "nsIDOMNavigatorUserMedia.h"
#include "nsITimer.h"
#include "MediaEngine.h"
#include "MediaStreamGraph.h"
#include "AudioSegment.h"
#include "mozilla/WeakPtr.h"

#include "SpeechGrammarList.h"
#include "SpeechRecognitionResultList.h"
#include "SpeechStreamListener.h"
#include "nsISpeechRecognitionService.h"
#include "endpointer.h"

#include "mozilla/dom/SpeechRecognitionError.h"

namespace mozilla {

namespace dom {

#define SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC "SpeechRecognitionTest:RequestEvent"
#define SPEECH_RECOGNITION_TEST_END_TOPIC "SpeechRecognitionTest:End"

class GlobalObject;
class SpeechEvent;

LogModule* GetSpeechRecognitionLog();
#define SR_LOG(...) MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__))

class SpeechRecognition final : public DOMEventTargetHelper,
                                public nsIObserver,
                                public SupportsWeakPtr<SpeechRecognition>
{
public:
  MOZ_DECLARE_WEAKREFERENCE_TYPENAME(SpeechRecognition)

  explicit SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow);

  NS_DECL_ISUPPORTS_INHERITED
  NS_DECL_CYCLE_COLLECTION_CLASS_INHERITED(SpeechRecognition, DOMEventTargetHelper)

  NS_DECL_NSIOBSERVER

  nsISupports* GetParentObject() const;

  JSObject* WrapObject(JSContext* aCx, JS::Handle<JSObject*> aGivenProto) override;

  static bool IsAuthorized(JSContext* aCx, JSObject* aGlobal);

  static already_AddRefed<SpeechRecognition>
  Constructor(const GlobalObject& aGlobal, ErrorResult& aRv);

  already_AddRefed<SpeechGrammarList> Grammars() const;
  void SetGrammars(mozilla::dom::SpeechGrammarList& aArg);

  void GetLang(nsString& aRetVal) const;
  void SetLang(const nsAString& aArg);

  bool GetContinuous(ErrorResult& aRv) const;
  void SetContinuous(bool aArg, ErrorResult& aRv);

  bool InterimResults() const;
  void SetInterimResults(bool aArg);

  uint32_t MaxAlternatives() const;
  void SetMaxAlternatives(uint32_t aArg);

  void GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const;
  void SetServiceURI(const nsAString& aArg, ErrorResult& aRv);

  void Start(const Optional<NonNull<DOMMediaStream>>& aStream, ErrorResult& aRv);

  void Stop();

  void Abort();

  IMPL_EVENT_HANDLER(audiostart)
  IMPL_EVENT_HANDLER(soundstart)
  IMPL_EVENT_HANDLER(speechstart)
  IMPL_EVENT_HANDLER(speechend)
  IMPL_EVENT_HANDLER(soundend)
  IMPL_EVENT_HANDLER(audioend)
  IMPL_EVENT_HANDLER(result)
  IMPL_EVENT_HANDLER(nomatch)
  IMPL_EVENT_HANDLER(error)
  IMPL_EVENT_HANDLER(start)
  IMPL_EVENT_HANDLER(end)

  enum EventType {
    EVENT_START,
    EVENT_STOP,
    EVENT_ABORT,
    EVENT_AUDIO_DATA,
    EVENT_AUDIO_ERROR,
    EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT,
    EVENT_RECOGNITIONSERVICE_FINAL_RESULT,
    EVENT_RECOGNITIONSERVICE_ERROR,
    EVENT_COUNT
  };

  void DispatchError(EventType aErrorType, SpeechRecognitionErrorCode aErrorCode,
                     const nsAString& aMessage);
  uint32_t FillSamplesBuffer(const int16_t* aSamples, uint32_t aSampleCount);
  uint32_t SplitSamplesBuffer(const int16_t* aSamplesBuffer, uint32_t aSampleCount,
                              nsTArray<RefPtr<SharedBuffer>>& aResult);
  AudioSegment* CreateAudioSegment(nsTArray<RefPtr<SharedBuffer>>& aChunks);
  void FeedAudioData(already_AddRefed<SharedBuffer> aSamples, uint32_t aDuration,
                     MediaStreamListener* aProvider, TrackRate aTrackRate);
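
  // The buffer/segment helpers above package captured PCM samples into
  // AudioSegments for the recognition service; FeedAudioData is called by the
  // capture-side provider (the SpeechStreamListener, see SpeechEvent below),
  // which is kept alive until the resulting AUDIO_DATA event is processed.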

  friend class SpeechEvent;
private:
  virtual ~SpeechRecognition() {};

  enum FSMState {
    STATE_IDLE,
    STATE_STARTING,
    STATE_ESTIMATING,
    STATE_WAITING_FOR_SPEECH,
    STATE_RECOGNIZING,
    STATE_WAITING_FOR_RESULT,
    STATE_COUNT
  };

  void SetState(FSMState state);
  bool StateBetween(FSMState begin, FSMState end);

  bool SetRecognitionService(ErrorResult& aRv);
  bool ValidateAndSetGrammarList(ErrorResult& aRv);

  class GetUserMediaSuccessCallback : public nsIDOMGetUserMediaSuccessCallback
  {
  public:
    NS_DECL_ISUPPORTS
    NS_DECL_NSIDOMGETUSERMEDIASUCCESSCALLBACK

    explicit GetUserMediaSuccessCallback(SpeechRecognition* aRecognition)
      : mRecognition(aRecognition)
    {}

  private:
    virtual ~GetUserMediaSuccessCallback() {}

    RefPtr<SpeechRecognition> mRecognition;
  };

  class GetUserMediaErrorCallback : public nsIDOMGetUserMediaErrorCallback
  {
  public:
    NS_DECL_ISUPPORTS
    NS_DECL_NSIDOMGETUSERMEDIAERRORCALLBACK

    explicit GetUserMediaErrorCallback(SpeechRecognition* aRecognition)
      : mRecognition(aRecognition)
    {}

  private:
    virtual ~GetUserMediaErrorCallback() {}

    RefPtr<SpeechRecognition> mRecognition;
  };

  NS_IMETHOD StartRecording(DOMMediaStream* aDOMStream);
  NS_IMETHOD StopRecording();

  uint32_t ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate);
  void NotifyError(SpeechEvent* aEvent);

  void ProcessEvent(SpeechEvent* aEvent);
  void Transition(SpeechEvent* aEvent);

  void Reset();
  void ResetAndEnd();
  void WaitForAudioData(SpeechEvent* aEvent);
  void StartedAudioCapture(SpeechEvent* aEvent);
  void StopRecordingAndRecognize(SpeechEvent* aEvent);
  void WaitForEstimation(SpeechEvent* aEvent);
  void DetectSpeech(SpeechEvent* aEvent);
  void WaitForSpeechEnd(SpeechEvent* aEvent);
  void NotifyFinalResult(SpeechEvent* aEvent);
  void DoNothing(SpeechEvent* aEvent);
  void AbortSilently(SpeechEvent* aEvent);
  void AbortError(SpeechEvent* aEvent);

  RefPtr<DOMMediaStream> mDOMStream;
  RefPtr<SpeechStreamListener> mSpeechListener;
  nsCOMPtr<nsISpeechRecognitionService> mRecognitionService;

  FSMState mCurrentState;

  Endpointer mEndpointer;
  uint32_t mEstimationSamples;

  uint32_t mAudioSamplesPerChunk;

  // buffer holds one chunk of mAudioSamplesPerChunk
  // samples before feeding it to mEndpointer
  RefPtr<SharedBuffer> mAudioSamplesBuffer;
  uint32_t mBufferedSamples;

  nsCOMPtr<nsITimer> mSpeechDetectionTimer;
  bool mAborted;

  nsString mLang;

  RefPtr<SpeechGrammarList> mSpeechGrammarList;

  // WebSpeechAPI (http://bit.ly/1gIl7DC) states:
  //
  // 1. Default value MUST be false
  // 2. If true, interim results SHOULD be returned
  // 3. If false, interim results MUST NOT be returned
  //
  // Pocketsphinx does not return interim results; so, defaulting
  // mInterimResults to false, then ignoring its subsequent value
  // is a conforming implementation.
  bool mInterimResults;

  // WebSpeechAPI (http://bit.ly/1JAiqeo) states:
  //
  // 1. Default value is 1
  // 2. Subsequent value is the "maximum number of SpeechRecognitionAlternatives per result"
  //
  // Pocketsphinx can return at most a single SpeechRecognitionAlternative per
  // SpeechRecognitionResult. So defaulting mMaxAlternatives to 1, ignoring
  // mMaxAlternatives for all non-zero values, and returning no
  // SpeechRecognitionAlternative per result when it is 0 is a conforming
  // implementation.
  uint32_t mMaxAlternatives;
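
  // For example, per the conformance note above: with maxAlternatives set to
  // 0 a result carries no SpeechRecognitionAlternative at all, while any
  // non-zero value still yields at most one alternative from Pocketsphinx.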

  void ProcessTestEventRequest(nsISupports* aSubject, const nsAString& aEventName);

  const char* GetName(FSMState aId);
  const char* GetName(SpeechEvent* aId);
};

class SpeechEvent : public Runnable
{
public:
  SpeechEvent(SpeechRecognition* aRecognition, SpeechRecognition::EventType aType)
  : mAudioSegment(0)
  , mRecognitionResultList(nullptr)
  , mError(nullptr)
  , mRecognition(aRecognition)
  , mType(aType)
  , mTrackRate(0)
  {
  }

  ~SpeechEvent();

  NS_IMETHOD Run() override;
  AudioSegment* mAudioSegment;
  RefPtr<SpeechRecognitionResultList> mRecognitionResultList; // TODO: make this a session being passed which also has index and stuff
  RefPtr<SpeechRecognitionError> mError;

  friend class SpeechRecognition;
private:
  SpeechRecognition* mRecognition;

  // for AUDIO_DATA events, keep a reference to the provider
  // of the data (i.e., the SpeechStreamListener) to ensure it
  // is kept alive (and keeps SpeechRecognition alive) until this
  // event gets processed.
  RefPtr<SpeechStreamListener> mProvider;
  SpeechRecognition::EventType mType;
  TrackRate mTrackRate;
};

} // namespace dom

inline nsISupports*
ToSupports(dom::SpeechRecognition* aRec)
{
  return ToSupports(static_cast<DOMEventTargetHelper*>(aRec));
}

} // namespace mozilla

#endif
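
// A minimal, standalone sketch of the chunking that the mAudioSamplesBuffer
// comment above describes: incoming samples are accumulated and handed on in
// fixed-size chunks of mAudioSamplesPerChunk samples. Illustrative only; it
// assumes plain int16_t PCM, and SplitIntoChunks is a hypothetical name, not
// a Gecko API.
//
//   #include <algorithm>
//   #include <cstddef>
//   #include <cstdint>
//   #include <vector>
//
//   static std::vector<std::vector<int16_t>>
//   SplitIntoChunks(const int16_t* aSamples, size_t aCount, size_t aChunkSize)
//   {
//     std::vector<std::vector<int16_t>> chunks;
//     for (size_t i = 0; i < aCount; i += aChunkSize) {
//       // The final chunk may be shorter than aChunkSize.
//       size_t len = std::min(aChunkSize, aCount - i);
//       chunks.emplace_back(aSamples + i, aSamples + i + len);
//     }
//     return chunks;
//   }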