File: tts_extension.js

package info (click to toggle)
chromium 138.0.7204.183-1~deb12u1
links: PTS, VCS
area: main
in suites: bookworm-proposed-updates
size: 6,080,960 kB
sloc: cpp: 34,937,079; ansic: 7,176,967; javascript: 4,110,704; python: 1,419,954; asm: 946,768; xml: 739,971; pascal: 187,324; sh: 89,623; perl: 88,663; objc: 79,944; sql: 50,304; cs: 41,786; fortran: 24,137; makefile: 21,811; php: 13,980; tcl: 13,166; yacc: 8,925; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (296 lines) | stat: -rw-r--r-- 10,084 bytes
parent folder | download | duplicates (6)
// Copyright 2013 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

/**
 * @fileoverview
 * This is a component extension that implements a text-to-speech (TTS)
 * engine powered by Google's speech synthesis API.
 *
 * This is a service worker, so it's not loaded when the API isn't being used,
 * and doesn't waste resources. When a web page or web app makes a speech
 * request and the parameters match one of the voices in this extension's
 * manifest, it makes a request to Google's API using Chrome's private key
 * and plays the resulting speech using HTML5 audio.
 */

/**
 * The main class for this extension. Adds listeners to
 * chrome.ttsEngine.onSpeak, onStop, onPause and onResume and implements
 * them using Google's speech synthesis API.
 */
class TtsExtension {
  /**
   * Initializes data structures, reads the voices declared in the manifest
   * and adds the ttsEngine and runtime message listeners.
   */
  constructor() {
    /**
     * The url prefix of the speech server, including static query
     * parameters that don't change.
     * @type {string}
     * @const
     * @private
     */
    this.SPEECH_SERVER_URL_ =
        'https://www.google.com/speech-api/v2/synthesize?' +
        'enc=mpeg&client=chromium';

    /**
     * The maximum length of an utterance, in UTF-16 code units. Both Chrome's
     * tts extension api and the web speech api specify 32k as the maximum
     * limit for an utterance.
     * @type {number}
     * @const
     * @private
     */
    this.MAX_UTTERANCE_LENGTH_ = 32768;

    /**
     * A mapping from language and gender to voice name, hardcoded for now
     * until the speech synthesis server capabilities response provides this.
     * The key of this map is of the form '<lang>-<gender>'.
     * @type {Object<string>}
     * @private
     */
    this.LANG_AND_GENDER_TO_VOICE_NAME_ = {
      'en-gb-male': 'rjs',
      'en-gb-female': 'fis',
    };

    /**
     * The arguments passed to the onSpeak event handler for the utterance
     * that's currently being spoken. Should be null when no object is
     * pending.
     *
     * @type {?{utterance: string, options: Object, callback: Function}}
     * @private
     */
    this.currentUtterance_ = null;

    /**
     * A mapping from voice name to language and gender, derived from the
     * manifest file.  This is used in case the speech synthesis request
     * specifies a voice name but doesn't specify a language code or gender.
     * @type {Object<{lang: string, gender: string}>}
     * @private
     */
    this.voiceNameToLangAndGender_ = {};

    /** True when the offscreen document is in the progress of loading. */
    this.loading_ = true;

    /** Tracks the timeout callback which closes the offscreen document. */
    this.closeTimeoutId_ = null;

    /**
     * The in-flight promise of an offscreen document creation, or null when
     * no creation is in progress. Used so that overlapping requests share a
     * single creation attempt.
     * @type {?Promise<void>}
     * @private
     */
    this.creatingOffscreenDocument_ = null;

    // Get voices from manifest.
    for (const voice of chrome.runtime.getManifest().tts_engine.voices) {
      this.voiceNameToLangAndGender_[voice.voice_name] = {
        lang: voice.lang,
        gender: voice.gender,
      };
    }

    // Install event listeners for the ttsEngine API.
    chrome.ttsEngine.onSpeak.addListener(
        (utterance, options, callback) =>
            this.onSpeak_(utterance, options, callback));
    chrome.ttsEngine.onStop.addListener(() => this.onStop_());
    chrome.ttsEngine.onPause.addListener(() => this.onPause_());
    chrome.ttsEngine.onResume.addListener(() => this.onResume_());

    // Install the listener for messages sent through chrome.runtime.
    chrome.runtime.onMessage.addListener(message => this.onMessage_(message));
  }

  /**
   * Handler for chrome.runtime.onMessage. Reacts to the 'loaded',
   * 'onCanPlayThrough' and 'onEnded' commands; any other command is ignored.
   * @param {{command: string}} message The received message.
   * @private
   */
  onMessage_(message) {
    switch (message.command) {
      case 'loaded':
        // Speak an utterance spoken during load.
        if (this.loading_) {
          this.loading_ = false;
          if (this.currentUtterance_) {
            this.onSpeak_(
                this.currentUtterance_.utterance,
                this.currentUtterance_.options,
                this.currentUtterance_.callback);
          }
        }
        break;
      case 'onCanPlayThrough':
        chrome.metricsPrivate.recordEnumerationValue(
            'TextToSpeech.ExtensionNetworkSpeechSynthesis.Playback',
            /*can play through*/ 0, /*enum size*/ 2);
        if (this.currentUtterance_) {
          this.currentUtterance_.callback({'type': 'start', 'charIndex': 0});
        }
        break;
      case 'onEnded':
        chrome.metricsPrivate.recordEnumerationValue(
            'TextToSpeech.ExtensionNetworkSpeechSynthesis.Playback',
            /*ended*/ 1, /*enum size */ 2);
        this.onStop_(/* onEnded = */ true);
        break;
    }
  }

  /**
   * Handler for the chrome.ttsEngine.onSpeak interface.
   * Gets Chrome's Google API key and then uses it to generate a request
   * url for the requested speech utterance. Sends that url with a 'setUrl'
   * message. If the offscreen document is still loading, the utterance is
   * only recorded and spoken once the 'loaded' message arrives.
   * @param {string} utterance The text to be spoken.
   * @param {Object} options Options to control the speech, as defined
   *     in the Chrome ttsEngine extension API.
   * @param {Function} callback The function that receives the tts events
   *     ('start', 'end') of this utterance.
   * @private
   */
  onSpeak_(utterance, options, callback) {
    // Ignore the utterance if it is empty. Continue such processing causes no
    // speech and fails all subsequent calls to process additional utterances.
    if (utterance.length === 0) {
      callback({'type': 'end', 'charIndex': 0});
      this.currentUtterance_ = null;
      return;
    }

    const text = this.truncateUtterance_(utterance);
    const current = {utterance: text, options, callback};
    this.currentUtterance_ = current;

    if (this.loading_) {
      // Nothing awaits this promise, so report a failure here instead of
      // leaving an unhandled rejection.
      this.maybeCreateOffscreenDocument_().catch(error => {
        console.error('Failed to create the offscreen document:', error);
      });
      return;
    }

    // We're now committed to speaking something. Bump our timeout to keep the
    // offscreen doc open.
    clearTimeout(this.closeTimeoutId_);

    chrome.runtime.sendMessage({
      command: 'setCurrentUtterance',
      currentUtterance: current,
    });

    let lang = options.lang;
    let gender = options.gender;
    if (options.voiceName) {
      // A voice name that is not listed in the manifest falls back to the
      // language and gender given in the options instead of throwing.
      const voice = this.voiceNameToLangAndGender_[options.voiceName];
      if (voice) {
        lang = voice.lang;
        gender = voice.gender;
      }
    }

    if (!lang) {
      lang = navigator.language;
    }
    const normalizedLang = lang.toLowerCase();

    // Look up the specific voice name for this language and gender.
    // If it's not in the map, it doesn't matter - the language will
    // be used directly. This is only used for languages where more
    // than one gender is actually available.
    const voiceName =
        this.LANG_AND_GENDER_TO_VOICE_NAME_[`${normalizedLang}-${gender}`];

    chrome.systemPrivate.getApiKey(apiKey => {
      // The key arrives asynchronously. Drop the request if the utterance
      // was stopped or replaced in the meantime, so that a stale utterance
      // is not loaded after the fact.
      if (this.currentUtterance_ !== current) {
        return;
      }

      // This asks the receiver of the message to load the audio. Once it
      // reports 'onCanPlayThrough', onMessage_ sends the 'start' event to
      // the ttsEngine callback.
      chrome.runtime.sendMessage({
        command: 'setUrl',
        url: this.buildSpeechUrl_(
            apiKey, text, normalizedLang, voiceName, options),
      });
    });
  }

  /**
   * Limits an utterance to MAX_UTTERANCE_LENGTH_ code units without leaving
   * half of a surrogate pair at the end, since encodeURIComponent throws a
   * URIError on a lone surrogate.
   * @param {string} utterance The text to be spoken.
   * @return {string} The utterance, truncated if it was too long.
   * @private
   */
  truncateUtterance_(utterance) {
    if (utterance.length <= this.MAX_UTTERANCE_LENGTH_) {
      return utterance;
    }

    let end = this.MAX_UTTERANCE_LENGTH_;
    const lastCodeUnit = utterance.charCodeAt(end - 1);
    const isHighSurrogate = lastCodeUnit >= 0xD800 && lastCodeUnit <= 0xDBFF;
    if (isHighSurrogate) {
      end -= 1;
    }
    return utterance.slice(0, end);
  }

  /**
   * Builds the speech server request url for an utterance.
   * @param {string} apiKey The Google API key.
   * @param {string} utterance The text to be spoken.
   * @param {string} lang The lower-cased language code.
   * @param {string|undefined} voiceName The server voice name, if any.
   * @param {Object} options The ttsEngine speech options; only rate and
   *     pitch are read here.
   * @return {string} The complete request url.
   * @private
   */
  buildSpeechUrl_(apiKey, utterance, lang, voiceName, options) {
    let url = this.SPEECH_SERVER_URL_;
    url += `&key=${apiKey}`;
    url += `&text=${encodeURIComponent(utterance)}`;
    // The language can come from the speech request, so escape it to keep
    // it from adding query parameters of its own.
    url += `&lang=${encodeURIComponent(lang)}`;

    if (voiceName) {
      url += `&name=${voiceName}`;
    }

    if (options.rate) {
      // Input rate is between 0.1 and 10.0 with a default of 1.0.
      // Output speed is between 0.0 and 1.0 with a default of 0.5.
      url += `&speed=${options.rate / 2.0}`;
    }

    // A pitch of 0 is a valid value, so test for a number rather than for
    // truthiness.
    if (Number.isFinite(options.pitch)) {
      // Input pitch is between 0.0 and 2.0 with a default of 1.0.
      // Output pitch is between 0.0 and 1.0 with a default of 0.5.
      url += `&pitch=${options.pitch / 2.0}`;
    }

    return url;
  }

  /**
   * Handler for the chrome.ttsEngine.onStop interface.
   * Called either when the ttsEngine API requests us to stop, or when
   * we reach the end of the audio stream. Sends a 'pause' message to
   * silence the audio (unless it ended on its own), and sends a callback
   * to the ttsEngine API to let it know that we've completed. Note that
   * the ttsEngine API manages callback messages and will automatically
   * replace the 'end' event with a more specific callback like
   * 'interrupted' when sending it to the TTS client.
   * @param {boolean|undefined} onEnded True when the audio reached its end.
   * @private
   */
  onStop_(onEnded) {
    if (this.currentUtterance_) {
      if (onEnded !== true) {
        chrome.runtime.sendMessage({command: 'pause'});
      }
      this.currentUtterance_.callback({
        'type': 'end',
        'charIndex': this.currentUtterance_.utterance.length,
      });
      this.currentUtterance_ = null;
    }
    this.maybeCloseOffscreenDocument_();
  }

  /**
   * Handler for the chrome.ttsEngine.onPause interface.
   * Pauses audio if we're in the middle of an utterance.
   * @private
   */
  onPause_() {
    if (this.currentUtterance_) {
      chrome.runtime.sendMessage({command: 'pause'});
    }
  }

  /**
   * Handler for the chrome.ttsEngine.onResume interface.
   * Resumes audio if we're in the middle of an utterance.
   * @private
   */
  onResume_() {
    if (this.currentUtterance_) {
      chrome.runtime.sendMessage({command: 'play'});
    }
  }

  /**
   * Creates the offscreen document unless it already exists. Calls made
   * while a creation is in progress share that creation instead of starting
   * a second one.
   * @return {!Promise<void>} Settles when the creation attempt has finished;
   *     rejects if the creation failed.
   * @private
   */
  async maybeCreateOffscreenDocument_() {
    if (this.creatingOffscreenDocument_) {
      return this.creatingOffscreenDocument_;
    }

    this.creatingOffscreenDocument_ = this.createOffscreenDocumentIfMissing_();
    try {
      await this.creatingOffscreenDocument_;
    } finally {
      this.creatingOffscreenDocument_ = null;
    }
  }

  /**
   * Looks for an existing offscreen document of this extension and creates
   * one if there is none.
   * @return {!Promise<void>}
   * @private
   */
  async createOffscreenDocumentIfMissing_() {
    const offscreenUrl = chrome.runtime.getURL('audio.html');
    const existingContexts = await chrome.runtime.getContexts(
        {contextTypes: ['OFFSCREEN_DOCUMENT'], documentUrls: [offscreenUrl]});

    // NOTE(review): when a document already exists while loading_ is still
    // true, nothing in this file clears loading_ - presumably the document
    // sends 'loaded' again; confirm against audio.html.
    if (existingContexts.length > 0) {
      return;
    }

    await chrome.offscreen.createDocument({
      url: offscreenUrl,
      // We use USER_MEDIA here instead of AUDIO_PLAYBACK because we need to
      // support pause/resume and need the offscreen document to exist without
      // audio playback. Once finished, we manually close the offscreen
      // document.
      reasons: ['USER_MEDIA'],
      justification: 'Use the audio element',
    });
  }

  /**
   * Schedules the offscreen document to be closed in 30 seconds, replacing
   * any previously scheduled close.
   * @private
   */
  maybeCloseOffscreenDocument_() {
    // Clear any existing timeouts.
    clearTimeout(this.closeTimeoutId_);
    this.closeTimeoutId_ = setTimeout(async () => {
      this.closeTimeoutId_ = null;
      try {
        await chrome.offscreen.closeDocument();
        this.loading_ = true;
      } catch (error) {
        // closeDocument rejects e.g. when there is no document to close,
        // which happens when a stop is requested before anything was spoken.
        console.warn('Failed to close the offscreen document:', error);
      }
    }, 30 * 1000);
  }
}

// Instantiate the extension when this script is evaluated; the constructor
// registers the ttsEngine and runtime message listeners.
new TtsExtension();