/*
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_
#include <complex>
#include <memory>
#include <vector>
#include "webrtc/base/swap_queue.h"
#include "webrtc/common_audio/channel_buffer.h"
#include "webrtc/common_audio/lapped_transform.h"
#include "webrtc/modules/audio_processing/audio_buffer.h"
#include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"
#include "webrtc/modules/audio_processing/render_queue_item_verifier.h"
#include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"
namespace webrtc {
// Speech intelligibility enhancement module. Reads render and capture
// audio streams and modifies the render stream with a set of gains per
// frequency bin to enhance speech against the noise background.
// Details of the model and algorithm can be found in the original paper:
// http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788
class IntelligibilityEnhancer : public LappedTransform::Callback {
public:
// Constructs the enhancer. The definition lives outside this header.
// NOTE(review): |sample_rate_hz| and |num_render_channels| presumably
// initialize the const members of the same name declared below, and
// |num_noise_bins| presumably sets |num_noise_bins_| -- confirm against the
// .cc file. The role of |num_bands| is not visible from this header.
IntelligibilityEnhancer(int sample_rate_hz,
size_t num_render_channels,
size_t num_bands,
size_t num_noise_bins);
~IntelligibilityEnhancer() override;
// Sets the capture noise magnitude spectrum estimate.
// |noise| is taken by value, so the caller's vector is copied (or moved)
// into the call.
// NOTE(review): the meaning of |gain| is not documented here -- presumably a
// scale factor applied to |noise|; confirm against the implementation.
void SetCaptureNoiseEstimate(std::vector<float> noise, float gain);
// Reads chunk of speech in time domain and updates with modified signal.
// |audio| is modified in place.
void ProcessRenderAudio(AudioBuffer* audio);
// NOTE(review): presumably reports |is_active_|, i.e. whether the
// SNR-based activation currently has the effect enabled -- confirm.
bool active() const;
protected:
// All in frequency domain, receives input |in_block|, applies
// intelligibility enhancement, and writes result to |out_block|.
// Overrides the LappedTransform::Callback interface.
void ProcessAudioBlock(const std::complex<float>* const* in_block,
size_t in_channels,
size_t frames,
size_t out_channels,
std::complex<float>* const* out_block) override;
private:
// Unit tests that are granted access to the private members below.
FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestRenderUpdate);
FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestErbCreation);
FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestSolveForGains);
FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest,
TestNoiseGainHasExpectedResult);
FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest,
TestAllBandsHaveSameDelay);
// Updates the SNR estimation and enables or disables this component using a
// hysteresis.
void SnrBasedEffectActivation();
// Bisection search for optimal |lambda|.
void SolveForLambda(float power_target);
// Transforms freq gains to ERB gains.
void UpdateErbGains();
// Returns number of ERB filters.
static size_t GetBankSize(int sample_rate, size_t erb_resolution);
// Initializes ERB filterbank.
// Returns the bank as a vector of per-filter weight vectors.
// NOTE(review): inner vectors presumably have |num_freqs| entries -- confirm.
std::vector<std::vector<float>> CreateErbBank(size_t num_freqs);
// Analytically solves quadratic for optimal gains given |lambda|.
// Negative gains are set to 0. Stores the results in |sols|.
// NOTE(review): |sols| must point to caller-owned storage; its required
// length is not stated here (presumably |bank_size_|) -- confirm.
void SolveForGainsGivenLambda(float lambda, size_t start_freq, float* sols);
// Returns true if the audio is speech.
bool IsSpeech(const float* audio);
// Delays the high bands to compensate for the processing delay in the low
// band.
void DelayHighBands(AudioBuffer* audio);
// NOTE(review): presumably the capacity of |noise_estimation_queue_| --
// confirm against the constructor.
static const size_t kMaxNumNoiseEstimatesToBuffer = 5;
// Non-static data members are initialized in the order they are declared
// below, regardless of the order used in the constructor's initializer
// list; keep that in mind before reordering them.
const size_t freqs_; // Num frequencies in frequency domain.
const size_t num_noise_bins_;
const size_t chunk_length_; // Chunk size in samples.
const size_t bank_size_; // Num ERB filters.
const int sample_rate_hz_;
const size_t num_render_channels_;
// Power estimators for the render ("clear") spectrum, which is complex
// valued, and for the real-valued capture noise spectrum.
intelligibility::PowerEstimator<std::complex<float>> clear_power_estimator_;
intelligibility::PowerEstimator<float> noise_power_estimator_;
// NOTE(review): presumably the estimated powers after projection through the
// filter banks below -- confirm.
std::vector<float> filtered_clear_pow_;
std::vector<float> filtered_noise_pow_;
std::vector<float> center_freqs_;
// Filter banks, each stored as a vector of per-filter weight vectors.
std::vector<std::vector<float>> capture_filter_bank_;
std::vector<std::vector<float>> render_filter_bank_;
size_t start_freq_;
std::vector<float> gains_eq_; // Pre-filter modified gains.
intelligibility::GainApplier gain_applier_;
// Owned LappedTransform. This class derives from LappedTransform::Callback.
// NOTE(review): presumably the transform calls back into
// ProcessAudioBlock() -- confirm.
std::unique_ptr<LappedTransform> render_mangler_;
VoiceActivityDetector vad_;
// 16-bit integer sample buffer.
// NOTE(review): presumably scratch space for feeding |vad_| -- confirm.
std::vector<int16_t> audio_s16_;
size_t chunks_since_voice_;
bool is_speech_;
// State for SnrBasedEffectActivation(): the SNR estimate and the resulting
// on/off decision.
float snr_;
bool is_active_;
// Chunk counters.
// NOTE(review): presumably total chunks processed and chunks processed
// while active -- confirm.
unsigned long int num_chunks_;
unsigned long int num_active_chunks_;
// Noise estimates are exchanged through a SwapQueue.
// NOTE(review): presumably SetCaptureNoiseEstimate() and
// ProcessRenderAudio() run on different threads and
// |noise_estimation_buffer_| is the local element swapped with the queue --
// confirm.
std::vector<float> noise_estimation_buffer_;
SwapQueue<std::vector<float>, RenderQueueItemVerifier<float>>
noise_estimation_queue_;
// Delay buffers used by DelayHighBands().
// NOTE(review): presumably one buffer per high band -- confirm.
std::vector<std::unique_ptr<intelligibility::DelayBuffer>>
high_bands_buffers_;
};
} // namespace webrtc
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_