File: system_live_caption_service.h

package info (click to toggle)
chromium 139.0.7258.127-1
links: PTS, VCS
area: main
in suites:
size: 6,122,068 kB
sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (181 lines) | stat: -rw-r--r-- 6,778 bytes
parent folder | download | duplicates (5)
// Copyright 2022 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CHROME_BROWSER_ASH_ACCESSIBILITY_LIVE_CAPTION_SYSTEM_LIVE_CAPTION_SERVICE_H_
#define CHROME_BROWSER_ASH_ACCESSIBILITY_LIVE_CAPTION_SYSTEM_LIVE_CAPTION_SERVICE_H_

#include <memory>

#include "ash/accessibility/caption_bubble_context_ash.h"
#include "base/memory/raw_ptr.h"
#include "base/memory/weak_ptr.h"
#include "base/scoped_observation.h"
#include "base/timer/timer.h"
#include "chrome/browser/profiles/profile_keyed_service_factory.h"
#include "chrome/browser/speech/speech_recognition_recognizer_client_impl.h"
#include "chrome/browser/speech/speech_recognizer_delegate.h"
#include "chromeos/ash/components/audio/cras_audio_handler.h"
#include "components/keyed_service/core/keyed_service.h"
#include "components/live_caption/live_translate_controller.h"
#include "components/live_caption/translation_util.h"
#include "components/soda/constants.h"
#include "components/soda/soda_installer.h"
#include "media/mojo/mojom/speech_recognition.mojom-shared.h"
#include "media/mojo/mojom/speech_recognition.mojom.h"
#include "mojo/public/cpp/bindings/receiver.h"

class Profile;
class SpeechRecognitionRecognizerClientImpl;

namespace captions {
class LiveCaptionController;
}  // namespace captions

namespace media {
class AudioSystem;
}  // namespace media

namespace ash {

// Responsible for running the live captioning model on audio from non-web (e.g.
// Android, linux) sources. Internally uses the "audio stream" speech
// recognition API on (what will eventually be) a "loopback" audio stream.
//
// This class doesn't track preferences, package installation or audio status at
// all; it is told to start/stop by the classes that actually do so.
//
// For the moment, this is prototype logic only: it processes the input device
// stream (c.f. a not-yet-existing "non-web only" loopback) and processes the
// stream even when no audio is being produced.
//
// TODO(b/253114860): Until these issues are addressed, this class can't be used
//                    in production.
class SystemLiveCaptionService
    : public KeyedService,
      public SpeechRecognizerDelegate,
      public media::mojom::SpeechRecognitionBrowserObserver,
      public CrasAudioHandler::AudioObserver {
 public:
  enum class AudioSource {
    kLoopback,
    kUserMicrophone,
  };

  explicit SystemLiveCaptionService(
      Profile* profile,
      AudioSource source = AudioSource::kLoopback);
  ~SystemLiveCaptionService() override;

  SystemLiveCaptionService(const SystemLiveCaptionService&) = delete;
  SystemLiveCaptionService& operator=(const SystemLiveCaptionService&) = delete;

  // KeyedService overrides:
  void Shutdown() override;

  // SpeechRecognizerDelegate overrides:
  void OnSpeechResult(const std::u16string& text,
                      bool is_final,
                      const std::optional<media::SpeechRecognitionResult>&
                          full_result) override;
  void OnSpeechSoundLevelChanged(int16_t level) override;
  void OnSpeechRecognitionStateChanged(
      SpeechRecognizerStatus new_state) override;
  void OnSpeechRecognitionStopped() override;
  void OnLanguageIdentificationEvent(
      media::mojom::LanguageIdentificationEventPtr event) override;

  // media::mojom::SpeechRecognitionBrowserObserver overrides:
  void SpeechRecognitionAvailabilityChanged(
      bool is_speech_recognition_available) override;
  void SpeechRecognitionLanguageChanged(const std::string& language) override;
  void SpeechRecognitionMaskOffensiveWordsChanged(
      bool mask_offensive_words) override;

  void set_audio_system_factory_for_testing(
      base::RepeatingCallback<std::unique_ptr<media::AudioSystem>()>
          create_audio_system_for_testing) {
    create_audio_system_for_testing_ =
        std::move(create_audio_system_for_testing);
  }

  void set_num_non_chrome_output_streams_for_testing(
      uint32_t num_output_streams) {
    num_output_streams_for_testing_ = num_output_streams;
  }

  // CrasAudioHandler::AudioObserver overrides
  void OnNonChromeOutputStarted() override;

  void OnNonChromeOutputStopped() override;

 protected:
  virtual media::mojom::RecognizerClientType GetRecognizerClientType();

 private:
  void OnTranslationCallback(const std::string& cached_translation,
                             const std::string& original_transcription,
                             const std::string& source_language,
                             const std::string& target_language,
                             bool is_final,
                             const ::captions::TranslateEvent& result);

  void AttemptDispatch(const std::string& text, bool is_final);

  // Binds to the correct observer list based on `source_`
  void BindToBrowserInterface();
  // Gets language code based on the preference this keyed_service
  // is listening to.
  virtual std::string GetPrimaryLanguageCode() const;
  // The source language code of the audio stream.
  std::string source_language_;
  SpeechRecognizerStatus current_recognizer_status_ =
      SpeechRecognizerStatus::SPEECH_RECOGNIZER_OFF;
  bool output_running_ = false;

  std::unique_ptr<base::OneShotTimer> stop_countdown_timer_;

  // Stops and destructs audio stream recognizing client.
  void StopRecognizing();

  void CreateClient();
  void StopTimeoutFinished();

  void OpenCaptionSettings();

  // wrapper around CrasAudioHandler's NumberOfNonChromeOutputStreams.  If
  // we inject a value for the number of non chrome output streams this method
  // will instead return that value.
  uint32_t GetNumberOfNonChromeOutputStreams();

  ::captions::TranslationCache translation_cache_;

  const raw_ptr<Profile> profile_;
  raw_ptr<::captions::LiveCaptionController> controller_;
  ash::captions::CaptionBubbleContextAsh context_;

  std::unique_ptr<SpeechRecognitionRecognizerClientImpl> client_;

  // Which audio source this service is listening to.
  const AudioSource source_;

  // The number of characters sent to the translation service.
  int characters_translated_ = 0;

  // If set during a test this number will be used to determine the
  // number of non chrome output streams.
  std::optional<uint32_t> num_output_streams_for_testing_;

  mojo::Receiver<media::mojom::SpeechRecognitionBrowserObserver>
      browser_observer_receiver_{this};

  // Used to inject a fake audio system into our client in tests.
  base::RepeatingCallback<std::unique_ptr<media::AudioSystem>()>
      create_audio_system_for_testing_;

  base::WeakPtrFactory<SystemLiveCaptionService> weak_ptr_factory_{this};
};

}  // namespace ash

#endif  // CHROME_BROWSER_ASH_ACCESSIBILITY_LIVE_CAPTION_SYSTEM_LIVE_CAPTION_SERVICE_H_