1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
|
/* Example of live PocketSphinx speech recognition with Win32 audio input.
*
* MIT license (c) 2022, see LICENSE for more information.
*
* Author: David Huggins-Daines <dhdaines@gmail.com>
*/
/**
* @example live_win32.c
* @brief Speech recognition with live audio input and endpointing.
*
* This file shows how to use PocketSphinx with microphone input using
* the Win32 Waveform Audio API (the only one of many terrible audio
* APIs on Windows that isn't made even more terrible by requiring you
* to use C++ in an unmanaged environment).
*
* To build it, you should be able to find a "live_win32" target in
* your favorite IDE after running CMake - in Visual Studio Code, look
* in the "CMake" tab.
*
* Microphones on Windows tend to be miscalibrated with the recording
* level set much too high by default, so the endpointer may give a
* lot of false positives at first. Programs like Audacity seem to
* work around this somehow, but I don't really know how they do it.
*/
#include <windows.h>
#include <mmsystem.h>
#include <pocketsphinx.h>
#include <signal.h>
/* Shutdown request flag, set by catch_sig() and polled by the main
 * capture loop.  It is written from a signal handler, so it is declared
 * volatile sig_atomic_t: that is the only object type the C standard
 * guarantees can be safely stored to from a handler, and volatile keeps
 * the compiler from caching the value across loop iterations. */
static volatile sig_atomic_t global_done = 0;

/**
 * Signal handler that asks the main loop to stop.
 *
 * @param signum Signal number being delivered (unused).
 */
static void
catch_sig(int signum)
{
    (void)signum;
    global_done = 1;
}
/**
 * Evaluate a waveIn* call once and report any failure through E_FATAL.
 *
 * A non-zero result is treated as an error: its text is looked up with
 * waveInGetErrorTextA() and logged together with the numeric code.
 *
 * - The argument is parenthesized so any expression can be passed.
 * - The result is kept as MMRESULT (unsigned), which matches "%08x".
 * - The ANSI lookup variant is used explicitly because the buffer is a
 *   char array printed with "%s", regardless of whether UNICODE is set.
 * - Locals carry a trailing underscore to avoid shadowing names that
 *   appear in expr.
 */
#define CHECK(expr)                                                     \
    do {                                                                \
        MMRESULT check_err_ = (expr);                                   \
        if (check_err_ != 0) {                                          \
            char check_errbuf_[MAXERRORLENGTH];                         \
            waveInGetErrorTextA(check_err_, check_errbuf_,              \
                                sizeof(check_errbuf_));                 \
            E_FATAL("error %08x: %s\n", check_err_, check_errbuf_);     \
        }                                                               \
    } while (0)
int main(int argc, char *argv[])
{
ps_decoder_t *decoder;
ps_config_t *config;
ps_endpointer_t *ep;
size_t frame_size;
HWAVEIN wavein;
WAVEFORMATEX wavefmt;
HANDLE event;
/* A large but somewhat arbitrary number of buffers. */
#define NBUF 100 /* 100 * 0.03 = 3 seconds */
WAVEHDR hdrs[NBUF];
int i;
(void)argc; (void)argv;
/* Initialize decoder and endpointer */
config = ps_config_init(NULL);
ps_default_search_args(config);
if ((decoder = ps_init(config)) == NULL)
E_FATAL("PocketSphinx decoder init failed\n");
if ((ep = ps_endpointer_init(0, 0.0, 0,
ps_config_int(config, "samprate"),
0)) == NULL)
E_FATAL("PocketSphinx endpointer init failed\n");
/* Frame size in samples (not bytes) */
frame_size = ps_endpointer_frame_size(ep);
/* Tell Windows what format we want (NOTE: may not be available...) */
wavefmt.wFormatTag = WAVE_FORMAT_PCM;
wavefmt.nChannels = 1;
wavefmt.nSamplesPerSec = ps_endpointer_sample_rate(ep);
wavefmt.wBitsPerSample = 16;
wavefmt.nBlockAlign = 2;
wavefmt.nAvgBytesPerSec = wavefmt.nSamplesPerSec * wavefmt.nBlockAlign;
wavefmt.cbSize = 0;
/* Create an event to tell us when a new buffer is ready. */
event = CreateEvent(NULL, TRUE, FALSE, "buffer_ready");
/* Open the recording device. */
CHECK(waveInOpen(&wavein, WAVE_MAPPER, &wavefmt,
(DWORD_PTR)event, 0, CALLBACK_EVENT));
/* Create buffers. */
memset(hdrs, 0, sizeof(hdrs));
for (i = 0; i < NBUF; ++i) {
hdrs[i].lpData = malloc(frame_size * 2);
hdrs[i].dwBufferLength = (DWORD)frame_size * 2;
CHECK(waveInPrepareHeader(wavein, &hdrs[i], sizeof(hdrs[i])));
CHECK(waveInAddBuffer(wavein, &hdrs[i], sizeof(hdrs[i])));
}
/* Start recording. */
CHECK(waveInStart(wavein));
i = 0;
if (signal(SIGINT, catch_sig) == SIG_ERR)
E_FATAL_SYSTEM("Failed to set SIGINT handler");
while (!global_done) {
const int16 *speech;
WaitForSingleObject(event, INFINITE);
/* Get as many buffers as we can. */
while (hdrs[i].dwFlags & WHDR_DONE) {
int prev_in_speech = ps_endpointer_in_speech(ep);
int16 *frame = (int16 *)hdrs[i].lpData;
/* Process them one by one. */
speech = ps_endpointer_process(ep, frame);
CHECK(waveInUnprepareHeader(wavein, &hdrs[i], sizeof(hdrs[i])));
CHECK(waveInPrepareHeader(wavein, &hdrs[i], sizeof(hdrs[i])));
CHECK(waveInAddBuffer(wavein, &hdrs[i], sizeof(hdrs[i])));
if (++i == NBUF)
i = 0;
if (speech != NULL) {
const char *hyp;
if (!prev_in_speech) {
fprintf(stderr, "Speech start at %.2f\n",
ps_endpointer_speech_start(ep));
fflush(stderr); /* For broken MSYS2 terminal */
ps_start_utt(decoder);
}
if (ps_process_raw(decoder, speech, frame_size, FALSE, FALSE) < 0)
E_FATAL("ps_process_raw() failed\n");
if ((hyp = ps_get_hyp(decoder, NULL)) != NULL) {
fprintf(stderr, "PARTIAL RESULT: %s\n", hyp);
fflush(stderr);
}
if (!ps_endpointer_in_speech(ep)) {
fprintf(stderr, "Speech end at %.2f\n",
ps_endpointer_speech_end(ep));
fflush(stderr);
ps_end_utt(decoder);
if ((hyp = ps_get_hyp(decoder, NULL)) != NULL) {
printf("%s\n", hyp);
fflush(stdout);
}
}
}
}
/* Wait for another buffer. */
ResetEvent(event);
}
/* Stop recording, cancel all buffers, and free them. */
CHECK(waveInStop(wavein));
CHECK(waveInReset(wavein));
for (i = 0; i < NBUF; ++i) {
if (hdrs[i].dwFlags & WHDR_PREPARED)
CHECK(waveInUnprepareHeader(wavein, &hdrs[i],
sizeof(hdrs[i])));
free(hdrs[i].lpData);
}
CloseHandle(event);
ps_endpointer_free(ep);
ps_free(decoder);
ps_config_free(config);
return 0;
}
|