File: speech_level_estimator.cc

package info (click to toggle)
chromium 138.0.7204.183-1
links: PTS, VCS
area: main
in suites: trixie
size: 6,071,908 kB
sloc: cpp: 34,937,088; ansic: 7,176,967; javascript: 4,110,704; python: 1,419,953; asm: 946,768; xml: 739,971; pascal: 187,324; sh: 89,623; perl: 88,663; objc: 79,944; sql: 50,304; cs: 41,786; fortran: 24,137; makefile: 21,806; php: 13,980; tcl: 13,166; yacc: 8,925; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (174 lines) | stat: -rw-r--r-- 7,064 bytes
parent folder | download | duplicates (6)
/*
 *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "modules/audio_processing/agc2/speech_level_estimator.h"

#include "modules/audio_processing/agc2/agc2_common.h"
#include "modules/audio_processing/logging/apm_data_dumper.h"
#include "rtc_base/checks.h"
#include "rtc_base/logging.h"
#include "rtc_base/numerics/safe_minmax.h"

namespace webrtc {
namespace {

float ClampLevelEstimateDbfs(float level_estimate_dbfs) {
  return SafeClamp<float>(level_estimate_dbfs, -90.0f, 30.0f);
}

// Returns the initial speech level estimate needed to apply the initial gain.
float GetInitialSpeechLevelEstimateDbfs(
    const AudioProcessing::Config::GainController2::AdaptiveDigital& config) {
  return ClampLevelEstimateDbfs(-kSaturationProtectorInitialHeadroomDb -
                                config.initial_gain_db - config.headroom_db);
}

}  // namespace

bool SpeechLevelEstimator::LevelEstimatorState::operator==(
    const SpeechLevelEstimator::LevelEstimatorState& b) const {
  return time_to_confidence_ms == b.time_to_confidence_ms &&
         level_dbfs.numerator == b.level_dbfs.numerator &&
         level_dbfs.denominator == b.level_dbfs.denominator;
}

float SpeechLevelEstimator::LevelEstimatorState::Ratio::GetRatio() const {
  RTC_DCHECK_NE(denominator, 0.f);
  return numerator / denominator;
}

SpeechLevelEstimator::SpeechLevelEstimator(
    ApmDataDumper* apm_data_dumper,
    const AudioProcessing::Config::GainController2::AdaptiveDigital& config,
    int adjacent_speech_frames_threshold)
    : apm_data_dumper_(apm_data_dumper),
      initial_speech_level_dbfs_(GetInitialSpeechLevelEstimateDbfs(config)),
      adjacent_speech_frames_threshold_(adjacent_speech_frames_threshold),
      level_dbfs_(initial_speech_level_dbfs_),
      // TODO(bugs.webrtc.org/7494): Remove init below when AGC2 input volume
      // controller temporal dependency removed.
      is_confident_(false) {
  RTC_DCHECK(apm_data_dumper_);
  RTC_DCHECK_GE(adjacent_speech_frames_threshold_, 1);
  Reset();
}

void SpeechLevelEstimator::Update(float rms_dbfs,
                                  float peak_dbfs,
                                  float speech_probability) {
  RTC_DCHECK_GT(rms_dbfs, -150.0f);
  RTC_DCHECK_LT(rms_dbfs, 50.0f);
  RTC_DCHECK_GT(peak_dbfs, -150.0f);
  RTC_DCHECK_LT(peak_dbfs, 50.0f);
  RTC_DCHECK_GE(speech_probability, 0.0f);
  RTC_DCHECK_LE(speech_probability, 1.0f);
  if (speech_probability < kVadConfidenceThreshold) {
    // Not a speech frame.
    if (adjacent_speech_frames_threshold_ > 1) {
      // When two or more adjacent speech frames are required in order to update
      // the state, we need to decide whether to discard or confirm the updates
      // based on the speech sequence length.
      if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
        // First non-speech frame after a long enough sequence of speech frames.
        // Update the reliable state.
        reliable_state_ = preliminary_state_;
      } else if (num_adjacent_speech_frames_ > 0) {
        // First non-speech frame after a too short sequence of speech frames.
        // Reset to the last reliable state.
        preliminary_state_ = reliable_state_;
      }
    }
    num_adjacent_speech_frames_ = 0;
  } else {
    // Speech frame observed.
    num_adjacent_speech_frames_++;

    // Update preliminary level estimate.
    RTC_DCHECK_GE(preliminary_state_.time_to_confidence_ms, 0);
    const bool buffer_is_full = preliminary_state_.time_to_confidence_ms == 0;
    if (!buffer_is_full) {
      preliminary_state_.time_to_confidence_ms -= kFrameDurationMs;
    }
    // Weighted average of levels with speech probability as weight.
    RTC_DCHECK_GT(speech_probability, 0.0f);
    const float leak_factor = buffer_is_full ? kLevelEstimatorLeakFactor : 1.0f;
    preliminary_state_.level_dbfs.numerator =
        preliminary_state_.level_dbfs.numerator * leak_factor +
        rms_dbfs * speech_probability;
    preliminary_state_.level_dbfs.denominator =
        preliminary_state_.level_dbfs.denominator * leak_factor +
        speech_probability;

    const float level_dbfs = preliminary_state_.level_dbfs.GetRatio();

    if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
      // `preliminary_state_` is now reliable. Update the last level estimation.
      level_dbfs_ = ClampLevelEstimateDbfs(level_dbfs);
    }
  }
  UpdateIsConfident();
  DumpDebugData();
}

void SpeechLevelEstimator::UpdateIsConfident() {
  if (adjacent_speech_frames_threshold_ == 1) {
    // Ignore `reliable_state_` when a single frame is enough to update the
    // level estimate (because it is not used).
    is_confident_ = preliminary_state_.time_to_confidence_ms == 0;
    return;
  }
  // Once confident, it remains confident.
  RTC_DCHECK(reliable_state_.time_to_confidence_ms != 0 ||
             preliminary_state_.time_to_confidence_ms == 0);
  // During the first long enough speech sequence, `reliable_state_` must be
  // ignored since `preliminary_state_` is used.
  is_confident_ =
      reliable_state_.time_to_confidence_ms == 0 ||
      (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_ &&
       preliminary_state_.time_to_confidence_ms == 0);
}

void SpeechLevelEstimator::Reset() {
  ResetLevelEstimatorState(preliminary_state_);
  ResetLevelEstimatorState(reliable_state_);
  level_dbfs_ = initial_speech_level_dbfs_;
  num_adjacent_speech_frames_ = 0;
}

void SpeechLevelEstimator::ResetLevelEstimatorState(
    LevelEstimatorState& state) const {
  state.time_to_confidence_ms = kLevelEstimatorTimeToConfidenceMs;
  state.level_dbfs.numerator = initial_speech_level_dbfs_;
  state.level_dbfs.denominator = 1.0f;
}

void SpeechLevelEstimator::DumpDebugData() const {
  if (!apm_data_dumper_)
    return;
  apm_data_dumper_->DumpRaw("agc2_speech_level_dbfs", level_dbfs_);
  apm_data_dumper_->DumpRaw("agc2_speech_level_is_confident", is_confident_);
  apm_data_dumper_->DumpRaw(
      "agc2_adaptive_level_estimator_num_adjacent_speech_frames",
      num_adjacent_speech_frames_);
  apm_data_dumper_->DumpRaw(
      "agc2_adaptive_level_estimator_preliminary_level_estimate_num",
      preliminary_state_.level_dbfs.numerator);
  apm_data_dumper_->DumpRaw(
      "agc2_adaptive_level_estimator_preliminary_level_estimate_den",
      preliminary_state_.level_dbfs.denominator);
  apm_data_dumper_->DumpRaw(
      "agc2_adaptive_level_estimator_preliminary_time_to_confidence_ms",
      preliminary_state_.time_to_confidence_ms);
  apm_data_dumper_->DumpRaw(
      "agc2_adaptive_level_estimator_reliable_time_to_confidence_ms",
      reliable_state_.time_to_confidence_ms);
}

}  // namespace webrtc