File: tts_linux.cc

package info (click to toggle)
chromium 138.0.7204.183-1~deb12u1
links: PTS, VCS
area: main
in suites: bookworm-proposed-updates
size: 6,080,960 kB
sloc: cpp: 34,937,079; ansic: 7,176,967; javascript: 4,110,704; python: 1,419,954; asm: 946,768; xml: 739,971; pascal: 187,324; sh: 89,623; perl: 88,663; objc: 79,944; sql: 50,304; cs: 41,786; fortran: 24,137; makefile: 21,811; php: 13,980; tcl: 13,166; yacc: 8,925; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (588 lines) | stat: -rw-r--r-- 18,973 bytes
parent folder | download | duplicates (4)
// Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <math.h>
#include <stddef.h>

#include <algorithm>
#include <map>
#include <memory>

#include "base/command_line.h"
#include "base/compiler_specific.h"
#include "base/debug/leak_annotations.h"
#include "base/functional/bind.h"
#include "base/memory/raw_ptr.h"
#include "base/no_destructor.h"
#include "base/synchronization/lock.h"
#include "base/task/sequenced_task_runner.h"
#include "base/task/task_runner.h"
#include "base/task/task_traits.h"
#include "base/task/thread_pool.h"
#include "base/threading/sequence_bound.h"
#include "content/browser/speech/tts_platform_impl.h"
#include "content/public/browser/browser_task_traits.h"
#include "content/public/browser/browser_thread.h"
#include "content/public/browser/tts_controller.h"
#include "content/public/common/content_switches.h"
#include "library_loaders/libspeechd.h"

namespace content {

namespace {

struct SPDChromeVoice {
  std::string name;
  std::string module;
  std::string language;
};

using PlatformVoices = std::map<std::string, SPDChromeVoice>;

constexpr int kInvalidUtteranceId = -1;
constexpr int kInvalidMessageUid = -1;

}  // namespace

class TtsPlatformImplBackgroundWorker {
 public:
  TtsPlatformImplBackgroundWorker() = default;
  TtsPlatformImplBackgroundWorker(const TtsPlatformImplBackgroundWorker&) =
      delete;
  TtsPlatformImplBackgroundWorker& operator=(
      const TtsPlatformImplBackgroundWorker&) = delete;
  ~TtsPlatformImplBackgroundWorker() = default;

  void Initialize();

  void ProcessSpeech(int utterance_id,
                     const std::string& parsed_utterance,
                     const std::string& lang,
                     float rate,
                     float pitch,
                     SPDChromeVoice voice,
                     base::OnceCallback<void(bool)> on_speak_finished);

  void Pause();
  void Resume();
  void StopSpeaking();
  void Shutdown();

 private:
  bool InitializeSpeechd();
  void InitializeVoices(PlatformVoices*);
  void OpenConnection();
  void CloseConnection();

  void OnSpeechEvent(int msg_id, SPDNotificationType type);

  // Send an TTS event notification to the TTS controller.
  void SendTtsEvent(int utterance_id,
                    TtsEventType event_type,
                    int char_index,
                    int length = -1);

  static void NotificationCallback(size_t msg_id,
                                   size_t client_id,
                                   SPDNotificationType type);

  static void IndexMarkCallback(size_t msg_id,
                                size_t client_id,
                                SPDNotificationType state,
                                char* index_mark);

  LibSpeechdLoader libspeechd_loader_;
  raw_ptr<SPDConnection> conn_ = nullptr;
  int msg_uid_ = kInvalidMessageUid;

  // These apply to the current utterance only that is currently being
  // processed.
  int utterance_id_ = kInvalidUtteranceId;
  size_t utterance_length_ = 0;
  size_t utterance_char_position_ = 0;
};

class TtsPlatformImplLinux : public TtsPlatformImpl {
 public:
  TtsPlatformImplLinux(const TtsPlatformImplLinux&) = delete;
  TtsPlatformImplLinux& operator=(const TtsPlatformImplLinux&) = delete;

  bool PlatformImplSupported() override;
  bool PlatformImplInitialized() override;
  void Speak(int utterance_id,
             const std::string& utterance,
             const std::string& lang,
             const VoiceData& voice,
             const UtteranceContinuousParameters& params,
             base::OnceCallback<void(bool)> on_speak_finished) override;
  bool StopSpeaking() override;
  void Pause() override;
  void Resume() override;
  bool IsSpeaking() override;
  void GetVoices(std::vector<VoiceData>* out_voices) override;
  void Shutdown() override;

  void OnInitialized(bool success, PlatformVoices voices);
  void OnSpeakScheduled(base::OnceCallback<void(bool)> on_speak_finished,
                        bool success);
  void OnSpeakFinished(int utterance_id);

  base::SequenceBound<TtsPlatformImplBackgroundWorker>* worker() {
    return &worker_;
  }

  // Get the single instance of this class.
  static TtsPlatformImplLinux* GetInstance();

 private:
  friend base::NoDestructor<TtsPlatformImplLinux>;
  TtsPlatformImplLinux();

  void ProcessSpeech(int utterance_id,
                     const std::string& lang,
                     const VoiceData& voice,
                     const UtteranceContinuousParameters& params,
                     base::OnceCallback<void(bool)> on_speak_finished,
                     const std::string& parsed_utterance);

  // Holds the platform state.
  bool is_supported_ = false;
  bool is_initialized_ = false;
  bool is_speaking_ = false;
  bool paused_ = false;

  // The current utterance being spoke.
  int utterance_id_ = kInvalidUtteranceId;

  // Map a string composed of a voicename and module to the voicename. Used to
  // uniquely identify a voice across all available modules.
  PlatformVoices voices_;

  // Hold the state and the code of the background implementation.
  base::SequenceBound<TtsPlatformImplBackgroundWorker> worker_;
};

//
// TtsPlatformImplBackgroundWorker
//

void TtsPlatformImplBackgroundWorker::Initialize() {
  PlatformVoices voices;
  if (InitializeSpeechd()) {
    OpenConnection();
    InitializeVoices(&voices);
  }

  bool success = (conn_ != nullptr);
  GetUIThreadTaskRunner({})->PostTask(
      FROM_HERE,
      base::BindOnce(&TtsPlatformImplLinux::OnInitialized,
                     base::Unretained(TtsPlatformImplLinux::GetInstance()),
                     success, std::move(voices)));
}

void TtsPlatformImplBackgroundWorker::ProcessSpeech(
    int utterance_id,
    const std::string& parsed_utterance,
    const std::string& lang,
    float rate,
    float pitch,
    SPDChromeVoice voice,
    base::OnceCallback<void(bool)> on_speak_finished) {
  if (!conn_) {
    GetUIThreadTaskRunner({})->PostTask(
        FROM_HERE, base::BindOnce(std::move(on_speak_finished), false));
    return;
  }

  libspeechd_loader_.spd_set_output_module(conn_, voice.module.c_str());
  libspeechd_loader_.spd_set_synthesis_voice(conn_, voice.name.c_str());

  // Map our multiplicative range to Speech Dispatcher's linear range.
  // .334 = -100.
  // 3 = 100.
  libspeechd_loader_.spd_set_voice_rate(conn_, 100 * log10(rate) / log10(3));
  libspeechd_loader_.spd_set_voice_pitch(conn_, 100 * log10(pitch) / log10(3));

  // Support languages other than the default
  if (!lang.empty())
    libspeechd_loader_.spd_set_language(conn_, lang.c_str());

  utterance_id_ = utterance_id;
  utterance_char_position_ = 0;
  utterance_length_ = parsed_utterance.size();

  // spd_say(...) returns msg_uid on success, -1 otherwise. Each call to spd_say
  // returns a different msg_uid.
  msg_uid_ =
      libspeechd_loader_.spd_say(conn_, SPD_TEXT, parsed_utterance.c_str());

  bool success = (msg_uid_ != kInvalidMessageUid);
  GetUIThreadTaskRunner({})->PostTask(
      FROM_HERE, base::BindOnce(std::move(on_speak_finished), success));
}

void TtsPlatformImplBackgroundWorker::Pause() {
  if (conn_ && msg_uid_ != kInvalidMessageUid)
    libspeechd_loader_.spd_pause(conn_);
}

void TtsPlatformImplBackgroundWorker::Resume() {
  if (conn_ && msg_uid_ != kInvalidMessageUid)
    libspeechd_loader_.spd_resume(conn_);
}

void TtsPlatformImplBackgroundWorker::StopSpeaking() {
  if (conn_ && msg_uid_ != kInvalidMessageUid) {
    int result = libspeechd_loader_.spd_stop(conn_);
    if (result == -1) {
      CloseConnection();
      OpenConnection();
    }
    msg_uid_ = kInvalidMessageUid;
    utterance_id_ = kInvalidUtteranceId;
  }
}

void TtsPlatformImplBackgroundWorker::Shutdown() {
  CloseConnection();
}

bool TtsPlatformImplBackgroundWorker::InitializeSpeechd() {
  return libspeechd_loader_.Load("libspeechd.so.2");
}

void TtsPlatformImplBackgroundWorker::InitializeVoices(PlatformVoices* voices) {
  if (!conn_)
    return;

  char** modules = libspeechd_loader_.spd_list_modules(conn_);
  if (!modules)
    return;
  for (int i = 0; UNSAFE_TODO(modules[i]); i++) {
    char* module = UNSAFE_TODO(modules[i]);
    libspeechd_loader_.spd_set_output_module(conn_, module);
    SPDVoice** spd_voices = libspeechd_loader_.spd_list_synthesis_voices(conn_);
    if (!spd_voices) {
      free(module);
      continue;
    }
    for (int j = 0; UNSAFE_TODO(spd_voices[j]); j++) {
      SPDVoice* spd_voice = UNSAFE_TODO(spd_voices[j]);
      SPDChromeVoice spd_data;
      spd_data.name = spd_voice->name;
      spd_data.module = module;
      spd_data.language = spd_voice->language;
      std::string key;
      key.append(spd_data.name);
      key.append(" ");
      key.append(spd_data.module);
      voices->insert(std::pair<std::string, SPDChromeVoice>(key, spd_data));
      free(UNSAFE_TODO(spd_voices[j]));
    }
    free(UNSAFE_TODO(modules[i]));
  }
}

void TtsPlatformImplBackgroundWorker::OpenConnection() {
  {
    // spd_open has memory leaks which are hard to suppress.
    // http://crbug.com/317360
    ANNOTATE_SCOPED_MEMORY_LEAK;
    conn_ = libspeechd_loader_.spd_open("chrome", "extension_api", nullptr,
                                        SPD_MODE_THREADED);
  }
  if (!conn_)
    return;

  // Register callbacks for all events.
  conn_->callback_begin = conn_->callback_end = conn_->callback_cancel =
      conn_->callback_pause = conn_->callback_resume =
          &TtsPlatformImplBackgroundWorker::NotificationCallback;

  conn_->callback_im = &TtsPlatformImplBackgroundWorker::IndexMarkCallback;

  libspeechd_loader_.spd_set_notification_on(conn_, SPD_BEGIN);
  libspeechd_loader_.spd_set_notification_on(conn_, SPD_END);
  libspeechd_loader_.spd_set_notification_on(conn_, SPD_CANCEL);
  libspeechd_loader_.spd_set_notification_on(conn_, SPD_PAUSE);
  libspeechd_loader_.spd_set_notification_on(conn_, SPD_RESUME);
}

void TtsPlatformImplBackgroundWorker::CloseConnection() {
  if (conn_) {
    libspeechd_loader_.spd_close(conn_);
    conn_ = nullptr;
  }
}

void TtsPlatformImplBackgroundWorker::OnSpeechEvent(int msg_id,
                                                    SPDNotificationType type) {
  if (!conn_ || msg_id != msg_uid_)
    return;

  switch (type) {
    case SPD_EVENT_BEGIN:
      utterance_char_position_ = 0;
      SendTtsEvent(utterance_id_, TTS_EVENT_START, utterance_char_position_,
                   -1);
      break;
    case SPD_EVENT_RESUME:
      SendTtsEvent(utterance_id_, TTS_EVENT_RESUME, utterance_char_position_,
                   -1);
      break;
    case SPD_EVENT_END:
      GetUIThreadTaskRunner({})->PostTask(
          FROM_HERE,
          base::BindOnce(&TtsPlatformImplLinux::OnSpeakFinished,
                         base::Unretained(TtsPlatformImplLinux::GetInstance()),
                         utterance_id_));

      utterance_char_position_ = utterance_length_;
      SendTtsEvent(utterance_id_, TTS_EVENT_END, utterance_char_position_, 0);
      break;
    case SPD_EVENT_PAUSE:
      SendTtsEvent(utterance_id_, TTS_EVENT_PAUSE, utterance_char_position_,
                   -1);
      break;
    case SPD_EVENT_CANCEL:
      SendTtsEvent(utterance_id_, TTS_EVENT_CANCELLED, utterance_char_position_,
                   -1);
      break;
    case SPD_EVENT_INDEX_MARK:
      // TODO: Can we get length from linux? If so, update
      // utterance_char_position_.
      SendTtsEvent(utterance_id_, TTS_EVENT_MARKER, utterance_char_position_,
                   -1);
      break;
  }
}

void TtsPlatformImplBackgroundWorker::SendTtsEvent(int utterance_id,
                                                   TtsEventType event_type,
                                                   int char_index,
                                                   int length) {
  GetUIThreadTaskRunner({})->PostTask(
      FROM_HERE, base::BindOnce(&TtsController::OnTtsEvent,
                                base::Unretained(TtsController::GetInstance()),
                                utterance_id, event_type, char_index, length,
                                std::string()));
}

// static
void TtsPlatformImplBackgroundWorker::NotificationCallback(
    size_t msg_id,
    size_t client_id,
    SPDNotificationType type) {
  TtsPlatformImplLinux::GetInstance()
      ->worker()
      ->AsyncCall(&TtsPlatformImplBackgroundWorker::OnSpeechEvent)
      .WithArgs(msg_id, type);
}

// static
void TtsPlatformImplBackgroundWorker::IndexMarkCallback(
    size_t msg_id,
    size_t client_id,
    SPDNotificationType type,
    char* index_mark) {
  // TODO(dtseng): index_mark appears to specify an index type supplied by a
  // client. Need to explore how this is used before hooking it up with existing
  // word, sentence events.
  TtsPlatformImplLinux::GetInstance()
      ->worker()
      ->AsyncCall(&TtsPlatformImplBackgroundWorker::OnSpeechEvent)
      .WithArgs(msg_id, type);
}

//
// TtsPlatformImplLinux
//

TtsPlatformImplLinux::TtsPlatformImplLinux()
    : worker_(base::ThreadPool::CreateSequencedTaskRunner({base::MayBlock()})) {
  DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI));
  const base::CommandLine& command_line =
      *base::CommandLine::ForCurrentProcess();
  if (!command_line.HasSwitch(switches::kEnableSpeechDispatcher))
    return;

  // The TTS platform is supported. The Tts platform initialisation will happen
  // on a worker thread and it will become initialized.
  is_supported_ = true;
  worker_.AsyncCall(&TtsPlatformImplBackgroundWorker::Initialize);
}

bool TtsPlatformImplLinux::PlatformImplSupported() {
  return is_supported_;
}

bool TtsPlatformImplLinux::PlatformImplInitialized() {
  return is_initialized_;
}

void TtsPlatformImplLinux::Speak(
    int utterance_id,
    const std::string& utterance,
    const std::string& lang,
    const VoiceData& voice,
    const UtteranceContinuousParameters& params,
    base::OnceCallback<void(bool)> on_speak_finished) {
  DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI));
  DCHECK(PlatformImplInitialized());

  if (paused_ || is_speaking_) {
    std::move(on_speak_finished).Run(false);
    return;
  }

  // Flag that a utterance is getting emitted. The |is_speaking_| flag will be
  // set back to false when the utterance will be fully spoken, stopped or if
  // the voice synthetizer was not able to emit it.
  is_speaking_ = true;
  utterance_id_ = utterance_id;

  // Parse SSML and process speech.
  TtsController::GetInstance()->StripSSML(
      utterance,
      base::BindOnce(&TtsPlatformImplLinux::ProcessSpeech,
                     base::Unretained(this), utterance_id, lang, voice, params,
                     base::BindOnce(&TtsPlatformImplLinux::OnSpeakScheduled,
                                    base::Unretained(this),
                                    std::move(on_speak_finished))));
}

bool TtsPlatformImplLinux::StopSpeaking() {
  DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI));
  DCHECK(PlatformImplInitialized());

  worker_.AsyncCall(&TtsPlatformImplBackgroundWorker::StopSpeaking);
  paused_ = false;

  is_speaking_ = false;
  utterance_id_ = kInvalidUtteranceId;

  return true;
}

void TtsPlatformImplLinux::Pause() {
  DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI));
  DCHECK(PlatformImplInitialized());

  if (paused_ || !is_speaking_)
    return;

  worker_.AsyncCall(&TtsPlatformImplBackgroundWorker::Pause);
  paused_ = true;
}

void TtsPlatformImplLinux::Resume() {
  DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI));
  DCHECK(PlatformImplInitialized());

  if (!paused_ || !is_speaking_)
    return;

  worker_.AsyncCall(&TtsPlatformImplBackgroundWorker::Resume);
  paused_ = false;
}

bool TtsPlatformImplLinux::IsSpeaking() {
  return is_speaking_;
}

void TtsPlatformImplLinux::GetVoices(std::vector<VoiceData>* out_voices) {
  DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI));
  DCHECK(PlatformImplInitialized());

  for (auto it = voices_.begin(); it != voices_.end(); ++it) {
    out_voices->push_back(VoiceData());
    VoiceData& voice = out_voices->back();
    voice.native = true;
    voice.name = it->first;
    voice.lang = it->second.language;
    voice.events.insert(TTS_EVENT_START);
    voice.events.insert(TTS_EVENT_END);
    voice.events.insert(TTS_EVENT_CANCELLED);
    voice.events.insert(TTS_EVENT_MARKER);
    voice.events.insert(TTS_EVENT_PAUSE);
    voice.events.insert(TTS_EVENT_RESUME);
  }
}

void TtsPlatformImplLinux::Shutdown() {
  worker_.AsyncCall(&TtsPlatformImplBackgroundWorker::Shutdown);
}

void TtsPlatformImplLinux::OnInitialized(bool success, PlatformVoices voices) {
  DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI));
  if (success)
    voices_ = std::move(voices);
  is_initialized_ = true;
  TtsController::GetInstance()->VoicesChanged();
}

void TtsPlatformImplLinux::OnSpeakScheduled(
    base::OnceCallback<void(bool)> on_speak_finished,
    bool success) {
  DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI));
  DCHECK(is_speaking_);

  // If the utterance was not able to be emitted, stop the speaking. There
  // won't be any asynchronous TTS event to confirm the end of the speech.
  if (!success) {
    is_speaking_ = false;
    utterance_id_ = kInvalidUtteranceId;
  }

  // Pass the results to our caller.
  std::move(on_speak_finished).Run(success);
}

void TtsPlatformImplLinux::OnSpeakFinished(int utterance_id) {
  DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI));
  if (utterance_id != utterance_id_)
    return;

  DCHECK(is_speaking_);
  DCHECK_NE(utterance_id_, kInvalidUtteranceId);
  is_speaking_ = false;
  utterance_id_ = kInvalidUtteranceId;
}

void TtsPlatformImplLinux::ProcessSpeech(
    int utterance_id,
    const std::string& lang,
    const VoiceData& voice,
    const UtteranceContinuousParameters& params,
    base::OnceCallback<void(bool)> on_speak_finished,
    const std::string& parsed_utterance) {
  DCHECK(BrowserThread::CurrentlyOn(content::BrowserThread::UI));

  // Speech dispatcher's speech params are around 3x at either limit.
  float rate = std::clamp(static_cast<float>(params.rate), 0.334f, 3.0f);
  float pitch = std::clamp(static_cast<float>(params.pitch), 0.334f, 3.0f);

  SPDChromeVoice matched_voice;
  auto it = voices_.find(voice.name);
  if (it != voices_.end())
    matched_voice = it->second;

  worker_.AsyncCall(&TtsPlatformImplBackgroundWorker::ProcessSpeech)
      .WithArgs(utterance_id, parsed_utterance, lang, rate, pitch,
                matched_voice, std::move(on_speak_finished));
}

// static
TtsPlatformImplLinux* TtsPlatformImplLinux::GetInstance() {
  static base::NoDestructor<TtsPlatformImplLinux> tts_platform;
  return tts_platform.get();
}

// static
TtsPlatformImpl* TtsPlatformImpl::GetInstance() {
  return TtsPlatformImplLinux::GetInstance();
}

}  // namespace content