File: endpointer_unittest.cc

// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/speech/endpointer/endpointer.h"

#include <stdint.h>

#include "base/memory/raw_ptr.h"
#include "base/types/fixed_array.h"
#include "components/speech/audio_buffer.h"
#include "testing/gtest/include/gtest/gtest.h"

namespace {
const int kFrameRate = 50;     // 20 ms long frames for AMR encoding.
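// At the 8 kHz rate used by the AMR tests this yields 160-sample frames; at
// the 48 kHz rate used by the high sample rate test, 960-sample frames.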
}  // namespace

namespace speech {

class FrameProcessor {
 public:
  // Process a single frame of test audio samples.
  virtual EpStatus ProcessFrame(int64_t time,
                                int16_t* samples,
                                int frame_size) = 0;
};

void RunEndpointerEventsTest(FrameProcessor* processor, int sample_rate) {
  int frame_size = sample_rate / kFrameRate;
  base::FixedArray<int16_t> samples(frame_size);

  // We will create a white noise signal of 150 frames. The frames from 50 to
  // 100 will have more power, and the endpointer should fire on those frames.
  const int kNumFrames = 150;

  // Create a random sequence of samples.
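  // A fixed seed keeps the generated signal identical from run to run.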
  srand(1);
  float gain = 0.0;
  int64_t time = 0;
  for (int frame_count = 0; frame_count < kNumFrames; ++frame_count) {
    // The frames from 50 to 100 will have more power, and the endpointer
    // should detect those frames as speech.
    if ((frame_count >= 50) && (frame_count < 100)) {
      gain = 2000.0;
    } else {
      gain = 1.0;
    }
    // Create random samples.
    for (int i = 0; i < frame_size; ++i) {
      float randNum = static_cast<float>(rand() - (RAND_MAX / 2)) /
                      static_cast<float>(RAND_MAX);
      samples[i] = static_cast<int16_t>(gain * randNum);
    }

    EpStatus ep_status =
        processor->ProcessFrame(time, samples.data(), frame_size);
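    // Advance the timestamp by one frame duration, expressed in microseconds
    // (20 ms per frame).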
    time += static_cast<int64_t>(frame_size * (1e6 / sample_rate));

    // Check the endpointer state at representative frames: frame 20 lies in
    // the leading low-energy region, frame 70 is inside the high-energy span,
    // and frame 120 comes after the speech segment has ended.
    if (20 == frame_count) {
      EXPECT_EQ(EP_PRE_SPEECH, ep_status);
    }
    if (70 == frame_count) {
      EXPECT_EQ(EP_SPEECH_PRESENT, ep_status);
    }
    if (120 == frame_count) {
      EXPECT_EQ(EP_PRE_SPEECH, ep_status);
    }
  }
}

// This test instantiates and initializes a stand-alone endpointer module.
// The test creates frames of random noise and sends them to the endpointer
// module. The energy of the first 50 frames is low, followed by 50 high
// energy frames, and another 50 low energy frames. We test that the correct
// start and end frames were detected.
class EnergyEndpointerFrameProcessor : public FrameProcessor {
 public:
  explicit EnergyEndpointerFrameProcessor(EnergyEndpointer* endpointer)
      : endpointer_(endpointer) {}

  EpStatus ProcessFrame(int64_t time,
                        int16_t* samples,
                        int frame_size) override {
    endpointer_->ProcessAudioFrame(time, samples, frame_size, nullptr);
    int64_t ep_time;
    return endpointer_->Status(&ep_time);
  }

 private:
  raw_ptr<EnergyEndpointer> endpointer_;
};

TEST(EndpointerTest, TestEnergyEndpointerEvents) {
  const int sample_rate = 8000;  // 8 k samples per second for AMR encoding.

  // Initialize the endpointer and configure it. The parameters specify a
  // 20 ms window and a 20 ms step size, which corresponds to the
  // narrowband AMR codec.
  EnergyEndpointerParams ep_config;
  ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate));
  ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate));
  ep_config.set_endpoint_margin(0.2f);
  ep_config.set_onset_window(0.15f);
  ep_config.set_speech_on_window(0.4f);
  ep_config.set_offset_window(0.15f);
  ep_config.set_onset_detect_dur(0.09f);
  ep_config.set_onset_confirm_dur(0.075f);
  ep_config.set_on_maintain_dur(0.10f);
  ep_config.set_offset_confirm_dur(0.12f);
  ep_config.set_decision_threshold(100.0f);
  EnergyEndpointer endpointer;
  endpointer.Init(ep_config);

  endpointer.StartSession();

  EnergyEndpointerFrameProcessor frame_processor(&endpointer);
  RunEndpointerEventsTest(&frame_processor, sample_rate);

  endpointer.EndSession();
}

// Frame processor that exercises the higher-level Endpointer wrapper class.
class EndpointerFrameProcessor : public FrameProcessor {
 public:
  explicit EndpointerFrameProcessor(Endpointer* endpointer)
      : endpointer_(endpointer) {}

  EpStatus ProcessFrame(int64_t time,
                        int16_t* samples,
                        int frame_size) override {
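    // Wrap the raw samples in an AudioChunk; the byte count is
    // frame_size * sizeof(int16_t), and the final argument is the number of
    // bytes per sample.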
    scoped_refptr<AudioChunk> frame(
        new AudioChunk(reinterpret_cast<uint8_t*>(samples), frame_size * 2, 2));
    endpointer_->ProcessAudio(*frame.get(), nullptr);
    int64_t ep_time;
    return endpointer_->Status(&ep_time);
  }

 private:
  raw_ptr<Endpointer> endpointer_;
};

TEST(EndpointerTest, TestEmbeddedEndpointerEvents) {
  const int sample_rate = 8000;  // 8 k samples per second for AMR encoding.

  Endpointer endpointer(sample_rate);
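  // Silence timeouts are specified in microseconds: 300 ms of silence marks
  // the input as possibly complete, 500 ms as complete.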
  const int64_t kMicrosecondsPerMillisecond = 1000;
  const int64_t short_timeout = 300 * kMicrosecondsPerMillisecond;
  endpointer.set_speech_input_possibly_complete_silence_length(short_timeout);
  const int64_t long_timeout = 500 * kMicrosecondsPerMillisecond;
  endpointer.set_speech_input_complete_silence_length(long_timeout);
  endpointer.StartSession();

  EndpointerFrameProcessor frame_processor(&endpointer);
  RunEndpointerEventsTest(&frame_processor, sample_rate);

  endpointer.EndSession();
}

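// Runs the same scenario as TestEmbeddedEndpointerEvents, but at a 48 kHz
// sample rate.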
TEST(EndpointerTest, HighSampleRate) {
  const int sample_rate = 48000;

  Endpointer endpointer(sample_rate);
  const int64_t kMicrosecondsPerMillisecond = 1000;
  const int64_t short_timeout = 300 * kMicrosecondsPerMillisecond;
  endpointer.set_speech_input_possibly_complete_silence_length(short_timeout);
  const int64_t long_timeout = 500 * kMicrosecondsPerMillisecond;
  endpointer.set_speech_input_complete_silence_length(long_timeout);
  endpointer.StartSession();

  EndpointerFrameProcessor frame_processor(&endpointer);
  RunEndpointerEventsTest(&frame_processor, sample_rate);

  endpointer.EndSession();
}

}  // namespace speech