Diffstat (limited to 'dom/media/webspeech/recognition/SpeechRecognition.h')
-rw-r--r--  dom/media/webspeech/recognition/SpeechRecognition.h  296
1 file changed, 296 insertions(+), 0 deletions(-)
diff --git a/dom/media/webspeech/recognition/SpeechRecognition.h b/dom/media/webspeech/recognition/SpeechRecognition.h
new file mode 100644
index 000000000..3f1ab7977
--- /dev/null
+++ b/dom/media/webspeech/recognition/SpeechRecognition.h
@@ -0,0 +1,296 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef mozilla_dom_SpeechRecognition_h
+#define mozilla_dom_SpeechRecognition_h
+
+#include "mozilla/Attributes.h"
+#include "mozilla/DOMEventTargetHelper.h"
+#include "nsCOMPtr.h"
+#include "nsString.h"
+#include "nsWrapperCache.h"
+#include "nsTArray.h"
+#include "js/TypeDecls.h"
+
+#include "nsIDOMNavigatorUserMedia.h"
+#include "nsITimer.h"
+#include "MediaEngine.h"
+#include "MediaStreamGraph.h"
+#include "AudioSegment.h"
+#include "mozilla/WeakPtr.h"
+
+#include "SpeechGrammarList.h"
+#include "SpeechRecognitionResultList.h"
+#include "SpeechStreamListener.h"
+#include "nsISpeechRecognitionService.h"
+#include "endpointer.h"
+
+#include "mozilla/dom/SpeechRecognitionError.h"
+
+namespace mozilla {
+
+namespace dom {
+
+#define SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC "SpeechRecognitionTest:RequestEvent"
+#define SPEECH_RECOGNITION_TEST_END_TOPIC "SpeechRecognitionTest:End"
+
+class GlobalObject;
+class SpeechEvent;
+
+LogModule* GetSpeechRecognitionLog();
+#define SR_LOG(...) MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__))
+
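+// Implementation of the Web Speech API SpeechRecognition interface.
+// Microphone audio captured through getUserMedia is pushed, via
+// SpeechStreamListener, into a small finite state machine (FSMState
+// below) that drives an nsISpeechRecognitionService backend.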
+class SpeechRecognition final : public DOMEventTargetHelper,
+ public nsIObserver,
+ public SupportsWeakPtr<SpeechRecognition>
+{
+public:
+ MOZ_DECLARE_WEAKREFERENCE_TYPENAME(SpeechRecognition)
+ explicit SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow);
+
+ NS_DECL_ISUPPORTS_INHERITED
+ NS_DECL_CYCLE_COLLECTION_CLASS_INHERITED(SpeechRecognition, DOMEventTargetHelper)
+
+ NS_DECL_NSIOBSERVER
+
+ nsISupports* GetParentObject() const;
+
+ JSObject* WrapObject(JSContext* aCx, JS::Handle<JSObject*> aGivenProto) override;
+
+ static bool IsAuthorized(JSContext* aCx, JSObject* aGlobal);
+
+ static already_AddRefed<SpeechRecognition>
+ Constructor(const GlobalObject& aGlobal, ErrorResult& aRv);
+
+ already_AddRefed<SpeechGrammarList> Grammars() const;
+
+ void SetGrammars(mozilla::dom::SpeechGrammarList& aArg);
+
+ void GetLang(nsString& aRetVal) const;
+
+ void SetLang(const nsAString& aArg);
+
+ bool GetContinuous(ErrorResult& aRv) const;
+
+ void SetContinuous(bool aArg, ErrorResult& aRv);
+
+ bool InterimResults() const;
+
+ void SetInterimResults(bool aArg);
+
+ uint32_t MaxAlternatives() const;
+
+ void SetMaxAlternatives(uint32_t aArg);
+
+ void GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const;
+
+ void SetServiceURI(const nsAString& aArg, ErrorResult& aRv);
+
+ void Start(const Optional<NonNull<DOMMediaStream>>& aStream, ErrorResult& aRv);
+
+ void Stop();
+
+ void Abort();
+
+ IMPL_EVENT_HANDLER(audiostart)
+ IMPL_EVENT_HANDLER(soundstart)
+ IMPL_EVENT_HANDLER(speechstart)
+ IMPL_EVENT_HANDLER(speechend)
+ IMPL_EVENT_HANDLER(soundend)
+ IMPL_EVENT_HANDLER(audioend)
+ IMPL_EVENT_HANDLER(result)
+ IMPL_EVENT_HANDLER(nomatch)
+ IMPL_EVENT_HANDLER(error)
+ IMPL_EVENT_HANDLER(start)
+ IMPL_EVENT_HANDLER(end)
+
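+ // Internal FSM events. These are queued as SpeechEvent runnables and
+ // consumed by ProcessEvent()/Transition() below.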
+ enum EventType {
+ EVENT_START,
+ EVENT_STOP,
+ EVENT_ABORT,
+ EVENT_AUDIO_DATA,
+ EVENT_AUDIO_ERROR,
+ EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT,
+ EVENT_RECOGNITIONSERVICE_FINAL_RESULT,
+ EVENT_RECOGNITIONSERVICE_ERROR,
+ EVENT_COUNT
+ };
+
+ void DispatchError(EventType aErrorType, SpeechRecognitionErrorCode aErrorCode, const nsAString& aMessage);
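+ // Audio buffering helpers: FillSamplesBuffer accumulates incoming
+ // samples into mAudioSamplesBuffer until a chunk of
+ // mAudioSamplesPerChunk samples is complete, SplitSamplesBuffer
+ // slices a larger buffer into such chunks, and FeedAudioData forwards
+ // chunked audio to the FSM as EVENT_AUDIO_DATA events.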
+ uint32_t FillSamplesBuffer(const int16_t* aSamples, uint32_t aSampleCount);
+ uint32_t SplitSamplesBuffer(const int16_t* aSamplesBuffer, uint32_t aSampleCount, nsTArray<RefPtr<SharedBuffer>>& aResult);
+ AudioSegment* CreateAudioSegment(nsTArray<RefPtr<SharedBuffer>>& aChunks);
+ void FeedAudioData(already_AddRefed<SharedBuffer> aSamples, uint32_t aDuration, MediaStreamListener* aProvider, TrackRate aTrackRate);
+
+ friend class SpeechEvent;
+private:
+ virtual ~SpeechRecognition() {}
+
+ enum FSMState {
+ STATE_IDLE,
+ STATE_STARTING,
+ STATE_ESTIMATING,
+ STATE_WAITING_FOR_SPEECH,
+ STATE_RECOGNIZING,
+ STATE_WAITING_FOR_RESULT,
+ STATE_COUNT
+ };
+
+ void SetState(FSMState state);
+ bool StateBetween(FSMState begin, FSMState end);
+
+ bool SetRecognitionService(ErrorResult& aRv);
+ bool ValidateAndSetGrammarList(ErrorResult& aRv);
+
+ class GetUserMediaSuccessCallback : public nsIDOMGetUserMediaSuccessCallback
+ {
+ public:
+ NS_DECL_ISUPPORTS
+ NS_DECL_NSIDOMGETUSERMEDIASUCCESSCALLBACK
+
+ explicit GetUserMediaSuccessCallback(SpeechRecognition* aRecognition)
+ : mRecognition(aRecognition)
+ {}
+
+ private:
+ virtual ~GetUserMediaSuccessCallback() {}
+
+ RefPtr<SpeechRecognition> mRecognition;
+ };
+
+ class GetUserMediaErrorCallback : public nsIDOMGetUserMediaErrorCallback
+ {
+ public:
+ NS_DECL_ISUPPORTS
+ NS_DECL_NSIDOMGETUSERMEDIAERRORCALLBACK
+
+ explicit GetUserMediaErrorCallback(SpeechRecognition* aRecognition)
+ : mRecognition(aRecognition)
+ {}
+
+ private:
+ virtual ~GetUserMediaErrorCallback() {}
+
+ RefPtr<SpeechRecognition> mRecognition;
+ };
+
+ NS_IMETHOD StartRecording(DOMMediaStream* aDOMStream);
+ NS_IMETHOD StopRecording();
+
+ uint32_t ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate);
+ void NotifyError(SpeechEvent* aEvent);
+
+ void ProcessEvent(SpeechEvent* aEvent);
+ void Transition(SpeechEvent* aEvent);
+
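+ // FSM transition actions, selected by Transition() from the current
+ // state and the incoming event type.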
+ void Reset();
+ void ResetAndEnd();
+ void WaitForAudioData(SpeechEvent* aEvent);
+ void StartedAudioCapture(SpeechEvent* aEvent);
+ void StopRecordingAndRecognize(SpeechEvent* aEvent);
+ void WaitForEstimation(SpeechEvent* aEvent);
+ void DetectSpeech(SpeechEvent* aEvent);
+ void WaitForSpeechEnd(SpeechEvent* aEvent);
+ void NotifyFinalResult(SpeechEvent* aEvent);
+ void DoNothing(SpeechEvent* aEvent);
+ void AbortSilently(SpeechEvent* aEvent);
+ void AbortError(SpeechEvent* aEvent);
+
+ RefPtr<DOMMediaStream> mDOMStream;
+ RefPtr<SpeechStreamListener> mSpeechListener;
+ nsCOMPtr<nsISpeechRecognitionService> mRecognitionService;
+
+ FSMState mCurrentState;
+
+ Endpointer mEndpointer;
+ uint32_t mEstimationSamples;
+
+ uint32_t mAudioSamplesPerChunk;
+
+ // buffer holds one chunk of mAudioSamplesPerChunk
+ // samples before feeding it to mEndpointer
+ RefPtr<SharedBuffer> mAudioSamplesBuffer;
+ uint32_t mBufferedSamples;
+
+ nsCOMPtr<nsITimer> mSpeechDetectionTimer;
+ bool mAborted;
+
+ nsString mLang;
+
+ RefPtr<SpeechGrammarList> mSpeechGrammarList;
+
+ // The Web Speech API (http://bit.ly/1gIl7DC) states:
+ //
+ // 1. Default value MUST be false
+ // 2. If true, interim results SHOULD be returned
+ // 3. If false, interim results MUST NOT be returned
+ //
+ // Pocketsphinx does not return interim results, so defaulting
+ // mInterimResults to false and ignoring any value subsequently
+ // set is a conforming implementation.
+ bool mInterimResults;
+
+ // The Web Speech API (http://bit.ly/1JAiqeo) states:
+ //
+ // 1. Default value is 1
+ // 2. Subsequent value is the "maximum number of
+ //    SpeechRecognitionAlternatives per result"
+ //
+ // Pocketsphinx can return at most one SpeechRecognitionAlternative
+ // per SpeechRecognitionResult. Defaulting mMaxAlternatives to 1,
+ // ignoring any non-zero value, and returning no alternatives when
+ // it is set to 0 is therefore a conforming implementation.
+ uint32_t mMaxAlternatives;
+
+ void ProcessTestEventRequest(nsISupports* aSubject, const nsAString& aEventName);
+
+ const char* GetName(FSMState aId);
+ const char* GetName(SpeechEvent* aId);
+};
+
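+// A runnable that carries one EventType, together with any payload
+// (an audio segment, a result list, or an error), to
+// SpeechRecognition::ProcessEvent().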
+class SpeechEvent : public Runnable
+{
+public:
+ SpeechEvent(SpeechRecognition* aRecognition, SpeechRecognition::EventType aType)
+ : mAudioSegment(nullptr)
+ , mRecognitionResultList(nullptr)
+ , mError(nullptr)
+ , mRecognition(aRecognition)
+ , mType(aType)
+ , mTrackRate(0)
+ {
+ }
+
+ ~SpeechEvent();
+
+ NS_IMETHOD Run() override;
+ AudioSegment* mAudioSegment;
+ RefPtr<SpeechRecognitionResultList> mRecognitionResultList; // TODO: replace with a session object that also carries the result index and related state
+ RefPtr<SpeechRecognitionError> mError;
+
+ friend class SpeechRecognition;
+private:
+ SpeechRecognition* mRecognition;
+
+ // for AUDIO_DATA events, keep a reference to the provider
+ // of the data (i.e., the SpeechStreamListener) to ensure it
+ // is kept alive (and keeps SpeechRecognition alive) until this
+ // event gets processed.
+ RefPtr<MediaStreamListener> mProvider;
+ SpeechRecognition::EventType mType;
+ TrackRate mTrackRate;
+};
+
+} // namespace dom
+
+inline nsISupports*
+ToSupports(dom::SpeechRecognition* aRec)
+{
+ return ToSupports(static_cast<DOMEventTargetHelper*>(aRec));
+}
+
+} // namespace mozilla
+
+#endif // mozilla_dom_SpeechRecognition_h
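
For orientation, here is a minimal, self-contained sketch of the accumulation step that the mAudioSamplesBuffer/mBufferedSamples comments above describe: incoming int16 samples are buffered until a full chunk of mAudioSamplesPerChunk samples is available, and the number of samples consumed is returned, matching the shape of FillSamplesBuffer's signature. This is an illustration under those assumptions, not the code in SpeechRecognition.cpp; FillChunk and its parameter names are hypothetical.

// Sketch only: a standalone analogue of FillSamplesBuffer's buffering
// step. FillChunk and its parameters are hypothetical names; the real
// method writes into a SharedBuffer and hands full chunks to the
// endpointer.
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <vector>

// Copies as many samples as still fit into the current chunk and
// returns the number of samples consumed from aSamples.
static uint32_t
FillChunk(std::vector<int16_t>& aChunk, uint32_t aChunkSize,
          uint32_t& aBufferedSamples,
          const int16_t* aSamples, uint32_t aSampleCount)
{
  aChunk.resize(aChunkSize);
  uint32_t toCopy = std::min(aChunkSize - aBufferedSamples, aSampleCount);
  std::memcpy(aChunk.data() + aBufferedSamples, aSamples,
              toCopy * sizeof(int16_t));
  aBufferedSamples += toCopy;
  // The caller flushes the chunk (and resets aBufferedSamples) once
  // aBufferedSamples == aChunkSize.
  return toCopy;
}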