Diffstat (limited to 'dom/media/webspeech/recognition/SpeechRecognition.h')
-rw-r--r-- | dom/media/webspeech/recognition/SpeechRecognition.h | 296 |
1 file changed, 296 insertions, 0 deletions
diff --git a/dom/media/webspeech/recognition/SpeechRecognition.h b/dom/media/webspeech/recognition/SpeechRecognition.h
new file mode 100644
index 000000000..3f1ab7977
--- /dev/null
+++ b/dom/media/webspeech/recognition/SpeechRecognition.h
@@ -0,0 +1,296 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef mozilla_dom_SpeechRecognition_h
+#define mozilla_dom_SpeechRecognition_h
+
+#include "mozilla/Attributes.h"
+#include "mozilla/DOMEventTargetHelper.h"
+#include "nsCOMPtr.h"
+#include "nsString.h"
+#include "nsWrapperCache.h"
+#include "nsTArray.h"
+#include "js/TypeDecls.h"
+
+#include "nsIDOMNavigatorUserMedia.h"
+#include "nsITimer.h"
+#include "MediaEngine.h"
+#include "MediaStreamGraph.h"
+#include "AudioSegment.h"
+#include "mozilla/WeakPtr.h"
+
+#include "SpeechGrammarList.h"
+#include "SpeechRecognitionResultList.h"
+#include "SpeechStreamListener.h"
+#include "nsISpeechRecognitionService.h"
+#include "endpointer.h"
+
+#include "mozilla/dom/SpeechRecognitionError.h"
+
+namespace mozilla {
+
+namespace dom {
+
+#define SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC "SpeechRecognitionTest:RequestEvent"
+#define SPEECH_RECOGNITION_TEST_END_TOPIC "SpeechRecognitionTest:End"
+
+class GlobalObject;
+class SpeechEvent;
+
+LogModule* GetSpeechRecognitionLog();
+#define SR_LOG(...) MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__))
+
+class SpeechRecognition final : public DOMEventTargetHelper,
+                                public nsIObserver,
+                                public SupportsWeakPtr<SpeechRecognition>
+{
+public:
+  MOZ_DECLARE_WEAKREFERENCE_TYPENAME(SpeechRecognition)
+  explicit SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow);
+
+  NS_DECL_ISUPPORTS_INHERITED
+  NS_DECL_CYCLE_COLLECTION_CLASS_INHERITED(SpeechRecognition, DOMEventTargetHelper)
+
+  NS_DECL_NSIOBSERVER
+
+  nsISupports* GetParentObject() const;
+
+  JSObject* WrapObject(JSContext* aCx, JS::Handle<JSObject*> aGivenProto) override;
+
+  static bool IsAuthorized(JSContext* aCx, JSObject* aGlobal);
+
+  static already_AddRefed<SpeechRecognition>
+  Constructor(const GlobalObject& aGlobal, ErrorResult& aRv);
+
+  already_AddRefed<SpeechGrammarList> Grammars() const;
+
+  void SetGrammars(mozilla::dom::SpeechGrammarList& aArg);
+
+  void GetLang(nsString& aRetVal) const;
+
+  void SetLang(const nsAString& aArg);
+
+  bool GetContinuous(ErrorResult& aRv) const;
+
+  void SetContinuous(bool aArg, ErrorResult& aRv);
+
+  bool InterimResults() const;
+
+  void SetInterimResults(bool aArg);
+
+  uint32_t MaxAlternatives() const;
+
+  void SetMaxAlternatives(uint32_t aArg);
+
+  void GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const;
+
+  void SetServiceURI(const nsAString& aArg, ErrorResult& aRv);
+
+  void Start(const Optional<NonNull<DOMMediaStream>>& aStream, ErrorResult& aRv);
+
+  void Stop();
+
+  void Abort();
+
+  IMPL_EVENT_HANDLER(audiostart)
+  IMPL_EVENT_HANDLER(soundstart)
+  IMPL_EVENT_HANDLER(speechstart)
+  IMPL_EVENT_HANDLER(speechend)
+  IMPL_EVENT_HANDLER(soundend)
+  IMPL_EVENT_HANDLER(audioend)
+  IMPL_EVENT_HANDLER(result)
+  IMPL_EVENT_HANDLER(nomatch)
+  IMPL_EVENT_HANDLER(error)
+  IMPL_EVENT_HANDLER(start)
+  IMPL_EVENT_HANDLER(end)
+
+  enum EventType {
+    EVENT_START,
+    EVENT_STOP,
+    EVENT_ABORT,
+    EVENT_AUDIO_DATA,
+    EVENT_AUDIO_ERROR,
+    EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT,
+    EVENT_RECOGNITIONSERVICE_FINAL_RESULT,
+    EVENT_RECOGNITIONSERVICE_ERROR,
+    EVENT_COUNT
+  };
+
+  void DispatchError(EventType aErrorType, SpeechRecognitionErrorCode aErrorCode, const nsAString& aMessage);
+  uint32_t FillSamplesBuffer(const int16_t* aSamples, uint32_t aSampleCount);
+  uint32_t SplitSamplesBuffer(const int16_t* aSamplesBuffer, uint32_t aSampleCount, nsTArray<RefPtr<SharedBuffer>>& aResult);
+  AudioSegment* CreateAudioSegment(nsTArray<RefPtr<SharedBuffer>>& aChunks);
+  void FeedAudioData(already_AddRefed<SharedBuffer> aSamples, uint32_t aDuration, MediaStreamListener* aProvider, TrackRate aTrackRate);
+
+  friend class SpeechEvent;
+private:
+  virtual ~SpeechRecognition() {}
+
+  enum FSMState {
+    STATE_IDLE,
+    STATE_STARTING,
+    STATE_ESTIMATING,
+    STATE_WAITING_FOR_SPEECH,
+    STATE_RECOGNIZING,
+    STATE_WAITING_FOR_RESULT,
+    STATE_COUNT
+  };
+
+  void SetState(FSMState state);
+  bool StateBetween(FSMState begin, FSMState end);
+
+  bool SetRecognitionService(ErrorResult& aRv);
+  bool ValidateAndSetGrammarList(ErrorResult& aRv);
+
+  class GetUserMediaSuccessCallback : public nsIDOMGetUserMediaSuccessCallback
+  {
+  public:
+    NS_DECL_ISUPPORTS
+    NS_DECL_NSIDOMGETUSERMEDIASUCCESSCALLBACK
+
+    explicit GetUserMediaSuccessCallback(SpeechRecognition* aRecognition)
+      : mRecognition(aRecognition)
+    {}
+
+  private:
+    virtual ~GetUserMediaSuccessCallback() {}
+
+    RefPtr<SpeechRecognition> mRecognition;
+  };
+
+  class GetUserMediaErrorCallback : public nsIDOMGetUserMediaErrorCallback
+  {
+  public:
+    NS_DECL_ISUPPORTS
+    NS_DECL_NSIDOMGETUSERMEDIAERRORCALLBACK
+
+    explicit GetUserMediaErrorCallback(SpeechRecognition* aRecognition)
+      : mRecognition(aRecognition)
+    {}
+
+  private:
+    virtual ~GetUserMediaErrorCallback() {}
+
+    RefPtr<SpeechRecognition> mRecognition;
+  };
+
+  NS_IMETHOD StartRecording(DOMMediaStream* aDOMStream);
+  NS_IMETHOD StopRecording();
+
+  uint32_t ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate);
+  void NotifyError(SpeechEvent* aEvent);
+
+  void ProcessEvent(SpeechEvent* aEvent);
+  void Transition(SpeechEvent* aEvent);
+
+  void Reset();
+  void ResetAndEnd();
+  void WaitForAudioData(SpeechEvent* aEvent);
+  void StartedAudioCapture(SpeechEvent* aEvent);
+  void StopRecordingAndRecognize(SpeechEvent* aEvent);
+  void WaitForEstimation(SpeechEvent* aEvent);
+  void DetectSpeech(SpeechEvent* aEvent);
+  void WaitForSpeechEnd(SpeechEvent* aEvent);
+  void NotifyFinalResult(SpeechEvent* aEvent);
+  void DoNothing(SpeechEvent* aEvent);
+  void AbortSilently(SpeechEvent* aEvent);
+  void AbortError(SpeechEvent* aEvent);
+
+  RefPtr<DOMMediaStream> mDOMStream;
+  RefPtr<SpeechStreamListener> mSpeechListener;
+  nsCOMPtr<nsISpeechRecognitionService> mRecognitionService;
+
+  FSMState mCurrentState;
+
+  Endpointer mEndpointer;
+  uint32_t mEstimationSamples;
+
+  uint32_t mAudioSamplesPerChunk;
+
+  // Holds one chunk of up to mAudioSamplesPerChunk samples
+  // before it is fed to mEndpointer.
+  RefPtr<SharedBuffer> mAudioSamplesBuffer;
+  uint32_t mBufferedSamples;
+
+  nsCOMPtr<nsITimer> mSpeechDetectionTimer;
+  bool mAborted;
+
+  nsString mLang;
+
+  RefPtr<SpeechGrammarList> mSpeechGrammarList;
+
+  // WebSpeechAPI (http://bit.ly/1gIl7DC) states:
+  //
+  // 1. Default value MUST be false
+  // 2. If true, interim results SHOULD be returned
+  // 3. If false, interim results MUST NOT be returned
+  //
+  // Pocketsphinx does not return interim results, so defaulting
+  // mInterimResults to false and ignoring its subsequent value
+  // is a conforming implementation.
+  bool mInterimResults;
+
+  // WebSpeechAPI (http://bit.ly/1JAiqeo) states:
+  //
+  // 1. Default value is 1
+  // 2. Subsequent value is the "maximum number of SpeechRecognitionAlternatives per result"
+  //
+  // Pocketsphinx can return at most one SpeechRecognitionAlternative per
+  // SpeechRecognitionResult, so a conforming implementation defaults
+  // mMaxAlternatives to 1, ignores any other non-zero value, and returns
+  // no SpeechRecognitionAlternative per result when the value is 0.
+  uint32_t mMaxAlternatives;
+
+  void ProcessTestEventRequest(nsISupports* aSubject, const nsAString& aEventName);
+
+  const char* GetName(FSMState aId);
+  const char* GetName(SpeechEvent* aId);
+};
+
+class SpeechEvent : public Runnable
+{
+public:
+  SpeechEvent(SpeechRecognition* aRecognition, SpeechRecognition::EventType aType)
+  : mAudioSegment(nullptr)
+  , mRecognitionResultList(nullptr)
+  , mError(nullptr)
+  , mRecognition(aRecognition)
+  , mType(aType)
+  , mTrackRate(0)
+  {
+  }
+
+  ~SpeechEvent();
+
+  NS_IMETHOD Run() override;
+  AudioSegment* mAudioSegment;
+  RefPtr<SpeechRecognitionResultList> mRecognitionResultList; // TODO: pass a session object that also carries the result index
+  RefPtr<SpeechRecognitionError> mError;
+
+  friend class SpeechRecognition;
+private:
+  SpeechRecognition* mRecognition;
+
+  // For AUDIO_DATA events, keep a reference to the provider
+  // of the data (i.e., the SpeechStreamListener) to ensure it
+  // is kept alive (and keeps SpeechRecognition alive) until this
+  // event gets processed.
+  RefPtr<MediaStreamListener> mProvider;
+  SpeechRecognition::EventType mType;
+  TrackRate mTrackRate;
+};
+
+} // namespace dom
+
+inline nsISupports*
+ToSupports(dom::SpeechRecognition* aRec)
+{
+  return ToSupports(static_cast<DOMEventTargetHelper*>(aRec));
+}
+
+} // namespace mozilla
+
+#endif
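Editor's note: the FSMState and EventType enums above imply that SpeechRecognition::Transition dispatches on the pair (current state, event type) to pick a handler such as WaitForAudioData, DetectSpeech, or AbortError. Below is a minimal standalone C++ sketch of that table-driven dispatch pattern. The enum values mirror the header, but Recognizer, gTable, and the specific transitions wired up in main are illustrative assumptions, not the actual transitions implemented in SpeechRecognition.cpp.

#include <cstdio>

// Mirrors the header's private FSMState enum.
enum FSMState {
  STATE_IDLE, STATE_STARTING, STATE_ESTIMATING, STATE_WAITING_FOR_SPEECH,
  STATE_RECOGNIZING, STATE_WAITING_FOR_RESULT, STATE_COUNT
};

// Mirrors the header's public EventType enum.
enum EventType {
  EVENT_START, EVENT_STOP, EVENT_ABORT, EVENT_AUDIO_DATA, EVENT_AUDIO_ERROR,
  EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT,
  EVENT_RECOGNITIONSERVICE_FINAL_RESULT, EVENT_RECOGNITIONSERVICE_ERROR,
  EVENT_COUNT
};

struct Recognizer;
using Handler = void (*)(Recognizer&);

struct Recognizer {
  FSMState mCurrentState = STATE_IDLE;
};

// Illustrative free-function handlers standing in for the header's
// member methods WaitForAudioData, AbortError, DoNothing, etc.
void WaitForAudioData(Recognizer& r) { r.mCurrentState = STATE_STARTING; }
void AbortError(Recognizer& r)       { r.mCurrentState = STATE_IDLE; }
void DoNothing(Recognizer&)          {}

// One handler per (state, event) cell; unfilled cells fall back to DoNothing.
Handler gTable[STATE_COUNT][EVENT_COUNT] = {};

void Transition(Recognizer& r, EventType aType) {
  Handler h = gTable[r.mCurrentState][aType];
  (h ? h : DoNothing)(r);
}

int main() {
  // Wire up two example transitions (assumptions, for illustration only).
  gTable[STATE_IDLE][EVENT_START] = WaitForAudioData;
  gTable[STATE_STARTING][EVENT_AUDIO_ERROR] = AbortError;

  Recognizer r;
  Transition(r, EVENT_START);
  std::printf("state after EVENT_START: %d\n", r.mCurrentState); // prints 1
}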
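Similarly, FillSamplesBuffer, SplitSamplesBuffer, and the mAudioSamplesBuffer/mBufferedSamples pair suggest that incoming int16_t audio is accumulated until a whole chunk of mAudioSamplesPerChunk samples is available, and only whole chunks are passed on to mEndpointer. The sketch below shows that accumulation logic under stated assumptions: std::vector stands in for Gecko's SharedBuffer, ChunkBuffer and Feed are invented names, and the 160-sample chunk size (10 ms at an assumed 16 kHz rate) is only a plausible example.

#include <cstdint>
#include <cstdio>
#include <vector>

using Chunk = std::vector<int16_t>;

struct ChunkBuffer {
  size_t mSamplesPerChunk;  // plays the role of mAudioSamplesPerChunk
  Chunk mPending;           // plays the role of mAudioSamplesBuffer + mBufferedSamples

  // Append samples and emit every completed fixed-size chunk into aOut.
  // Returns the number of whole chunks produced, loosely analogous to what
  // FillSamplesBuffer and SplitSamplesBuffer accomplish together.
  size_t Feed(const int16_t* aSamples, size_t aCount, std::vector<Chunk>& aOut) {
    size_t produced = 0;
    mPending.insert(mPending.end(), aSamples, aSamples + aCount);
    while (mPending.size() >= mSamplesPerChunk) {
      aOut.emplace_back(mPending.begin(), mPending.begin() + mSamplesPerChunk);
      mPending.erase(mPending.begin(), mPending.begin() + mSamplesPerChunk);
      ++produced;
    }
    return produced;
  }
};

int main() {
  ChunkBuffer buf{160, {}};      // 160 samples = 10 ms at 16 kHz (assumed)
  std::vector<Chunk> chunks;
  int16_t block[100] = {};
  buf.Feed(block, 100, chunks);  // 100 samples buffered, no chunk yet
  buf.Feed(block, 100, chunks);  // 200 buffered -> one 160-sample chunk, 40 left
  std::printf("chunks=%zu pending=%zu\n", chunks.size(), buf.mPending.size());
}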