// Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_MANAGER_IMPL_H_
#define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_MANAGER_IMPL_H_
#include <memory>
#include <optional>
#include "base/containers/flat_map.h"
#include "base/memory/raw_ptr.h"
#include "base/memory/weak_ptr.h"
#include "content/common/content_export.h"
#include "content/public/browser/browser_thread.h"
#include "content/public/browser/speech_recognition_event_listener.h"
#include "content/public/browser/speech_recognition_manager.h"
#include "content/public/browser/speech_recognition_session_config.h"
#include "content/public/browser/speech_recognition_session_context.h"
#include "media/mojo/mojom/speech_recognition.mojom.h"
#include "media/mojo/mojom/speech_recognition_error.mojom.h"
#include "mojo/public/cpp/bindings/pending_receiver.h"
#include "mojo/public/cpp/bindings/remote.h"
#include "third_party/blink/public/mojom/mediastream/media_stream.mojom-forward.h"
namespace media {
class AudioSystem;
}
namespace content {
class BrowserMainLoop;
class MediaStreamManager;
class MediaStreamUIProxy;
class SpeechRecognitionManagerDelegate;
class SpeechRecognizer;
// This is the manager for speech recognition. It is a single instance in
// the browser process and can serve several requests. Each recognition request
// corresponds to a session, initiated via |CreateSession|.
//
// At any given moment, the manager has at most a single session using the
// microphone, identified by |microphone_session_id_|. This is the session that
// is capturing audio, waiting for user permission, etc. There may also be
// other, non-primary sessions living in parallel that are waiting for results
// but are not recording audio.
//
// The SpeechRecognitionManager has the following responsibilities:
// - Handles requests received from various render frames and makes sure only
// one of them accesses the audio device at any given time.
// - Handles the instantiation of NetworkSpeechRecognitionEngineImpl objects
// when requested by SpeechRecognitionSessions.
// - Relays recognition results/status/error events of each session to the
// corresponding listener (demuxing on the basis of their session_id).
// - Also relays recognition results/status/error events of every session to
// the catch-all snoop listener (optionally) provided by the delegate.
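//
// A minimal usage sketch (illustrative only; how |config| is populated and how
// the caller obtains it are simplified assumptions, not the exact production
// call sites):
//
//   SpeechRecognitionManagerImpl* manager =
//       SpeechRecognitionManagerImpl::GetInstance();
//   if (manager) {
//     SpeechRecognitionSessionConfig config;
//     // ... populate |config| (language, event listener, frame ids, ...).
//     int session_id = manager->CreateSession(config);
//     manager->StartSession(session_id);
//     // ... later, to cancel the request:
//     manager->AbortSession(session_id);
//   }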
class CONTENT_EXPORT SpeechRecognitionManagerImpl
: public SpeechRecognitionManager,
public SpeechRecognitionEventListener {
public:
// Returns the current SpeechRecognitionManagerImpl or NULL if the call is
// issued before it has been created or after it has been destroyed (by
// BrowserMainLoop).
static SpeechRecognitionManagerImpl* GetInstance();
static bool IsOnDeviceSpeechRecognitionInstalled(
const SpeechRecognitionSessionConfig& config);
// SpeechRecognitionManager implementation.
int CreateSession(const SpeechRecognitionSessionConfig& config) override;
int CreateSession(
const SpeechRecognitionSessionConfig& config,
mojo::PendingReceiver<media::mojom::SpeechRecognitionSession>
session_receiver,
mojo::PendingRemote<media::mojom::SpeechRecognitionSessionClient>
client_remote,
std::optional<SpeechRecognitionAudioForwarderConfig>
audio_forwarder_config) override;
void StartSession(int session_id) override;
void AbortSession(int session_id) override;
void AbortAllSessionsForRenderFrame(int render_process_id,
int render_frame_id) override;
void StopAudioCaptureForSession(int session_id) override;
void UpdateRecognitionContextForSession(
int session_id,
const media::SpeechRecognitionRecognitionContext& recognition_context)
override;
const SpeechRecognitionSessionConfig& GetSessionConfig(
int session_id) override;
SpeechRecognitionSessionContext GetSessionContext(int session_id) override;
bool UseOnDeviceSpeechRecognition(
const SpeechRecognitionSessionConfig& config) override;
// SpeechRecognitionEventListener methods.
void OnRecognitionStart(int session_id) override;
void OnAudioStart(int session_id) override;
void OnSoundStart(int session_id) override;
void OnSoundEnd(int session_id) override;
void OnAudioEnd(int session_id) override;
void OnRecognitionEnd(int session_id) override;
void OnRecognitionResults(
int session_id,
const std::vector<media::mojom::WebSpeechRecognitionResultPtr>& result)
override;
void OnRecognitionError(
int session_id,
const media::mojom::SpeechRecognitionError& error) override;
void OnAudioLevelsChange(int session_id,
float volume,
float noise_volume) override;
SpeechRecognitionManagerDelegate* delegate() const { return delegate_.get(); }
protected:
// Only BrowserMainLoop and tests are allowed to instantiate this class.
friend class BrowserMainLoop;
friend class SpeechRecognitionManagerImplTest;
// Needed for deletion on the IO thread.
friend std::default_delete<SpeechRecognitionManagerImpl>;
friend class base::DeleteHelper<content::SpeechRecognitionManagerImpl>;
SpeechRecognitionManagerImpl(media::AudioSystem* audio_system,
MediaStreamManager* media_stream_manager);
~SpeechRecognitionManagerImpl() override;
private:
// Data types for the internal Finite State Machine (FSM).
enum FSMState {
SESSION_STATE_IDLE = 0,
SESSION_STATE_CAPTURING_AUDIO,
SESSION_STATE_WAITING_FOR_RESULT,
SESSION_STATE_MAX_VALUE = SESSION_STATE_WAITING_FOR_RESULT
};
enum FSMEvent {
EVENT_ABORT = 0,
EVENT_START,
EVENT_UPDATE_RECOGNITION_CONTEXT,
EVENT_STOP_CAPTURE,
EVENT_AUDIO_ENDED,
EVENT_RECOGNITION_ENDED,
EVENT_MAX_VALUE = EVENT_RECOGNITION_ENDED
};
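// Illustrative flow for a typical microphone-backed session (a sketch only;
// the authoritative transitions are whatever ExecuteTransitionAndGetNextState
// implements):
//
//   SESSION_STATE_IDLE            --EVENT_START-->            CAPTURING_AUDIO
//   SESSION_STATE_CAPTURING_AUDIO --EVENT_STOP_CAPTURE or
//                                   EVENT_AUDIO_ENDED-->      WAITING_FOR_RESULT
//   SESSION_STATE_WAITING_FOR_RESULT --EVENT_RECOGNITION_ENDED--> (deleted)
//
// EVENT_ABORT can be dispatched in any state and aborts the session.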
struct Session {
Session();
~Session();
int id;
bool abort_requested;
SpeechRecognitionSessionConfig config;
SpeechRecognitionSessionContext context;
scoped_refptr<SpeechRecognizer> recognizer;
std::unique_ptr<MediaStreamUIProxy> ui;
bool use_microphone;
media::SpeechRecognitionRecognitionContext recognition_context;
};
void AbortSessionImpl(int session_id);
// Callback issued by the SpeechRecognitionManagerDelegate to asynchronously
// report the result of the CheckRecognitionIsAllowed call.
void RecognitionAllowedCallback(int session_id,
bool ask_user,
bool is_allowed);
// Callback to receive the result of a media request. |stream_devices_set|
// contains the devices approved for the request; it is empty if the user
// denies the request.
void MediaRequestPermissionCallback(
int session_id,
const blink::mojom::StreamDevicesSet& stream_devices_set,
std::unique_ptr<MediaStreamUIProxy> stream_ui);
// Entry point for pushing any external event into the session handling FSM.
void DispatchEvent(int session_id, FSMEvent event);
// Defines the behavior of the session handling FSM, selecting the appropriate
// transition according to the session, its current state and the event.
void ExecuteTransitionAndGetNextState(Session* session,
FSMState session_state,
FSMEvent event);
// Retrieves the state of the session, querying the recognizer directly.
FSMState GetSessionState(int session_id) const;
// The methods below handle transitions of the session handling FSM.
void SessionStart(const Session& session);
void SessionUpdateRecognitionContext(const Session& session);
void SessionAbort(const Session& session);
void SessionStopAudioCapture(const Session& session);
void ResetCapturingSessionId(const Session& session);
void SessionDelete(Session* session);
void NotFeasible(const Session& session, FSMEvent event);
bool SessionExists(int session_id) const;
Session* GetSession(int session_id) const;
SpeechRecognitionEventListener* GetListener(int session_id) const;
SpeechRecognitionEventListener* GetDelegateListener() const;
int GetNextSessionID();
static int next_requester_id_;
raw_ptr<media::AudioSystem> audio_system_;
raw_ptr<MediaStreamManager> media_stream_manager_;
base::flat_map<int, std::unique_ptr<Session>> sessions_;
int microphone_session_id_ = kSessionIDInvalid;
int last_session_id_ = kSessionIDInvalid;
bool is_dispatching_event_ = false;
std::unique_ptr<SpeechRecognitionManagerDelegate> delegate_;
const int requester_id_;
mojo::Remote<media::mojom::SpeechRecognitionContext>
speech_recognition_context_;
// Used for posting asynchronous tasks (on the IO thread) without worrying
// about this class being destroyed in the meantime (due to browser shutdown),
// since tasks pending on a destroyed WeakPtr are automatically discarded.
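//
// For instance, a task posted from within this class back to the IO thread
// could be bound as follows (an illustrative sketch, not an actual call site):
//
//   GetIOThreadTaskRunner({})->PostTask(
//       FROM_HERE,
//       base::BindOnce(&SpeechRecognitionManagerImpl::DispatchEvent,
//                      weak_factory_.GetWeakPtr(), session_id, EVENT_ABORT));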
base::WeakPtrFactory<SpeechRecognitionManagerImpl> weak_factory_{this};
};
} // namespace content
#endif // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_MANAGER_IMPL_H_