// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/speech/endpointer/endpointer.h"
#include <stdint.h>
#include "base/memory/raw_ptr.h"
#include "base/types/fixed_array.h"
#include "components/speech/audio_buffer.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace {
const int kFrameRate = 50; // 20 ms long frames for AMR encoding.
} // namespace
namespace speech {
// Interface implemented by the test fixtures below: each one feeds a single
// frame of audio into the endpointer implementation under test and returns
// the resulting endpointer status.
class FrameProcessor {
 public:
  // A polymorphic base must have a virtual destructor so deleting a derived
  // object through a FrameProcessor* is well-defined.
  virtual ~FrameProcessor() = default;

  // Process a single frame of test audio samples.
  // |time| is the frame timestamp in microseconds, |samples| points to
  // |frame_size| 16-bit PCM samples.
  virtual EpStatus ProcessFrame(int64_t time,
                                int16_t* samples,
                                int frame_size) = 0;
};
// Drives |processor| with 150 frames of white noise at |sample_rate| and
// verifies the endpointer state at three checkpoints. Frames 50-99 carry a
// much larger gain, so the endpointer should report speech only there.
void RunEndpointerEventsTest(FrameProcessor* processor, int sample_rate) {
  const int frame_size = sample_rate / kFrameRate;
  base::FixedArray<int16_t> samples(frame_size);
  const int kNumFrames = 150;

  // Seed the generator so the noise sequence (and thus the expectations
  // below) is fully deterministic.
  srand(1);

  int64_t time = 0;
  for (int frame = 0; frame < kNumFrames; ++frame) {
    // High-energy region: frames 50-99 should be detected as speech.
    const float gain = (frame >= 50 && frame < 100) ? 2000.0f : 1.0f;

    // Fill the frame with uniform noise in roughly [-gain/2, gain/2].
    for (int i = 0; i < frame_size; ++i) {
      const float noise = static_cast<float>(rand() - (RAND_MAX / 2)) /
                          static_cast<float>(RAND_MAX);
      samples[i] = static_cast<int16_t>(gain * noise);
    }

    const EpStatus status =
        processor->ProcessFrame(time, samples.data(), frame_size);
    time += static_cast<int64_t>(frame_size * (1e6 / sample_rate));

    // Spot-check the endpointer state in each of the three regions.
    switch (frame) {
      case 20:
        EXPECT_EQ(EP_PRE_SPEECH, status);
        break;
      case 70:
        EXPECT_EQ(EP_SPEECH_PRESENT, status);
        break;
      case 120:
        EXPECT_EQ(EP_PRE_SPEECH, status);
        break;
      default:
        break;
    }
  }
}
// This test instantiates and initializes a stand-alone endpointer module.
// The test creates FrameData objects with random noise and sends them
// to the endpointer module. The energy of the first 50 frames is low,
// followed by 50 high energy frames, and another 50 low energy frames.
// We test that the correct start and end frames were detected.
// Adapter that routes test frames into a raw EnergyEndpointer and reads
// back its status after each frame.
class EnergyEndpointerFrameProcessor : public FrameProcessor {
 public:
  explicit EnergyEndpointerFrameProcessor(EnergyEndpointer* endpointer)
      : endpointer_(endpointer) {}

  EpStatus ProcessFrame(int64_t time,
                        int16_t* samples,
                        int frame_size) override {
    // Status() wants somewhere to write its timestamp; the value itself is
    // not needed by the test.
    int64_t status_time;
    endpointer_->ProcessAudioFrame(time, samples, frame_size, nullptr);
    return endpointer_->Status(&status_time);
  }

 private:
  raw_ptr<EnergyEndpointer> endpointer_;
};
TEST(EndpointerTest, TestEnergyEndpointerEvents) {
  // 8 k samples per second, matching the narrow band AMR codec.
  const int sample_rate = 8000;

  // Configure the endpointer for a 20 ms analysis window with a 20 ms step,
  // i.e. one frame per kFrameRate-th of a second.
  EnergyEndpointerParams config;
  config.set_frame_period(1.0f / static_cast<float>(kFrameRate));
  config.set_frame_duration(1.0f / static_cast<float>(kFrameRate));
  config.set_endpoint_margin(0.2f);
  config.set_onset_window(0.15f);
  config.set_speech_on_window(0.4f);
  config.set_offset_window(0.15f);
  config.set_onset_detect_dur(0.09f);
  config.set_onset_confirm_dur(0.075f);
  config.set_on_maintain_dur(0.10f);
  config.set_offset_confirm_dur(0.12f);
  config.set_decision_threshold(100.0f);

  EnergyEndpointer endpointer;
  endpointer.Init(config);

  // Run the shared noise-driven event test over one session.
  endpointer.StartSession();
  EnergyEndpointerFrameProcessor frame_processor(&endpointer);
  RunEndpointerEventsTest(&frame_processor, sample_rate);
  endpointer.EndSession();
}
// Test endpointer wrapper class.
// Adapter that wraps each test frame in an AudioChunk and feeds it to the
// Endpointer wrapper class, returning the endpointer status afterwards.
class EndpointerFrameProcessor : public FrameProcessor {
 public:
  explicit EndpointerFrameProcessor(Endpointer* endpointer)
      : endpointer_(endpointer) {}

  EpStatus ProcessFrame(int64_t time,
                        int16_t* samples,
                        int frame_size) override {
    // AudioChunk takes a raw byte buffer plus the sample width; spell the
    // sizes out with sizeof(int16_t) instead of a magic "2".
    scoped_refptr<AudioChunk> frame(
        new AudioChunk(reinterpret_cast<uint8_t*>(samples),
                       frame_size * sizeof(int16_t), sizeof(int16_t)));
    endpointer_->ProcessAudio(*frame.get(), nullptr);
    // Status() requires an output timestamp; the test ignores it.
    int64_t ep_time;
    return endpointer_->Status(&ep_time);
  }

 private:
  raw_ptr<Endpointer> endpointer_;
};
TEST(EndpointerTest, TestEmbeddedEndpointerEvents) {
const int sample_rate = 8000; // 8 k samples per second for AMR encoding.
Endpointer endpointer(sample_rate);
const int64_t kMillisecondsPerMicrosecond = 1000;
const int64_t short_timeout = 300 * kMillisecondsPerMicrosecond;
endpointer.set_speech_input_possibly_complete_silence_length(short_timeout);
const int64_t long_timeout = 500 * kMillisecondsPerMicrosecond;
endpointer.set_speech_input_complete_silence_length(long_timeout);
endpointer.StartSession();
EndpointerFrameProcessor frame_processor(&endpointer);
RunEndpointerEventsTest(&frame_processor, sample_rate);
endpointer.EndSession();
}
TEST(EndpointerTest, HighSampleRate) {
const int sample_rate = 48000;
Endpointer endpointer(sample_rate);
const int64_t kMillisecondsPerMicrosecond = 1000;
const int64_t short_timeout = 300 * kMillisecondsPerMicrosecond;
endpointer.set_speech_input_possibly_complete_silence_length(short_timeout);
const int64_t long_timeout = 500 * kMillisecondsPerMicrosecond;
endpointer.set_speech_input_complete_silence_length(long_timeout);
endpointer.StartSession();
EndpointerFrameProcessor frame_processor(&endpointer);
RunEndpointerEventsTest(&frame_processor, sample_rate);
endpointer.EndSession();
}
} // namespace speech