1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
|
#include "captions-mssapi.hpp"
/* Logging helper: forwards to blog() with "[Captions] " prepended to the
 * format. `format` must be a string literal because it is concatenated
 * with the prefix at compile time. */
#define do_log(type, format, ...) blog(type, "[Captions] " format, \
##__VA_ARGS__)
/* Shorthands for do_log() at LOG_ERROR and LOG_DEBUG level. */
#define error(format, ...) do_log(LOG_ERROR, format, ##__VA_ARGS__)
#define debug(format, ...) do_log(LOG_DEBUG, format, ##__VA_ARGS__)
/*
 * Sets up the SAPI recognition pipeline for the requested language and
 * launches the worker thread that runs main_thread().
 *
 * callback - passed through to the captions_handler base class.
 * lang     - locale name handed to LocaleNameToLCID (presumably something
 *            like "en-US"; assumed to be plain ASCII - confirm with caller).
 *
 * Throws CAPTIONS_ERROR_GENERIC_FAIL on any setup failure, after the cause
 * has been logged at LOG_WARNING level.
 */
mssapi_captions::mssapi_captions(
		captions_cb callback,
		const std::string &lang) try
	: captions_handler(callback, AUDIO_FORMAT_16BIT, 16000)
{
	HRESULT hr;

	/* Element-wise char -> wchar_t widening, same as a manual cast loop */
	const std::wstring wlang(lang.begin(), lang.end());

	/* LocaleNameToLCID returns 0 when the name cannot be resolved; stop
	 * here rather than searching for a recognizer with "language=0" */
	LCID lang_id = LocaleNameToLCID(wlang.c_str(), 0);
	if (!lang_id)
		throw "LocaleNameToLCID failed";

	/* _snwprintf does not write a terminator on truncation; the buffer is
	 * zero-filled and at most 31 characters are written, so the last
	 * element always terminates the string */
	wchar_t lang_str[32] = {};
	_snwprintf(lang_str, 31, L"language=%x", (int)lang_id);

	/* Auto-reset, initially unsignaled event used to ask the thread to quit */
	stop = CreateEvent(nullptr, false, false, nullptr);
	if (!stop.Valid())
		throw "Failed to create event";

	hr = SpFindBestToken(SPCAT_RECOGNIZERS, lang_str, nullptr, &token);
	if (FAILED(hr))
		throw HRError("SpFindBestToken failed", hr);

	hr = CoCreateInstance(CLSID_SpInprocRecognizer, nullptr, CLSCTX_ALL,
			__uuidof(ISpRecognizer), (void**)&recognizer);
	if (FAILED(hr))
		throw HRError("CoCreateInstance for recognizer failed", hr);

	hr = recognizer->SetRecognizer(token);
	if (FAILED(hr))
		throw HRError("SetRecognizer failed", hr);

	/* Keep the recognizer inactive until main_thread() switches it on */
	hr = recognizer->SetRecoState(SPRST_INACTIVE);
	if (FAILED(hr))
		throw HRError("SetRecoState(SPRST_INACTIVE) failed", hr);

	hr = recognizer->CreateRecoContext(&context);
	if (FAILED(hr))
		throw HRError("CreateRecoContext failed", hr);

	/* Only recognition results and end-of-stream are handled by the thread */
	ULONGLONG interest = SPFEI(SPEI_RECOGNITION) |
	                     SPFEI(SPEI_END_SR_STREAM);
	hr = context->SetInterest(interest, interest);
	if (FAILED(hr))
		throw HRError("SetInterest failed", hr);

	hr = context->SetNotifyWin32Event();
	if (FAILED(hr))
		throw HRError("SetNotifyWin32Event", hr);

	notify = context->GetNotifyEventHandle();
	if (notify == INVALID_HANDLE_VALUE)
		throw HRError("GetNotifyEventHandle failed", E_NOINTERFACE);

	size_t sample_rate = audio_output_get_sample_rate(obs_get_audio());
	audio = new CaptionStream((DWORD)sample_rate, this);
	/* NOTE(review): presumably drops the creation reference so that
	 * `audio` holds the only one - confirm against CaptionStream */
	audio->Release();

	hr = recognizer->SetInput(audio, false);
	if (FAILED(hr))
		throw HRError("SetInput failed", hr);

	hr = context->CreateGrammar(1, &grammar);
	if (FAILED(hr))
		throw HRError("CreateGrammar failed", hr);

	hr = grammar->LoadDictation(nullptr, SPLO_STATIC);
	if (FAILED(hr))
		throw HRError("LoadDictation failed", hr);

	/* Started last so the thread only ever sees fully initialized members */
	try {
		t = std::thread([this] () {main_thread();});
	} catch (...) {
		throw "Failed to create thread";
	}

} catch (const char *err) {
	blog(LOG_WARNING, "%s: %s", __FUNCTION__, err);
	throw CAPTIONS_ERROR_GENERIC_FAIL;

} catch (const HRError &err) {
	blog(LOG_WARNING, "%s: %s (%lX)", __FUNCTION__, err.str, err.hr);
	throw CAPTIONS_ERROR_GENERIC_FAIL;
}
/* Signals the stop event and waits for the worker thread to finish.
 * Does nothing when the thread is not joinable. */
mssapi_captions::~mssapi_captions()
{
	if (!t.joinable())
		return;

	SetEvent(stop);
	t.join();
}
void mssapi_captions::main_thread()
try {
HRESULT hr;
os_set_thread_name(__FUNCTION__);
hr = grammar->SetDictationState(SPRS_ACTIVE);
if (FAILED(hr))
throw HRError("SetDictationState failed", hr);
hr = recognizer->SetRecoState(SPRST_ACTIVE);
if (FAILED(hr))
throw HRError("SetRecoState(SPRST_ACTIVE) failed", hr);
HANDLE events[] = {notify, stop};
started = true;
for (;;) {
DWORD ret = WaitForMultipleObjects(2, events, false, INFINITE);
if (ret != WAIT_OBJECT_0)
break;
CSpEvent event;
bool exit = false;
while (event.GetFrom(context) == S_OK) {
if (event.eEventId == SPEI_RECOGNITION) {
ISpRecoResult *result = event.RecoResult();
CoTaskMemPtr<wchar_t> text;
hr = result->GetText((ULONG)-1, (ULONG)-1,
true, &text, nullptr);
if (FAILED(hr))
continue;
char text_utf8[512];
os_wcs_to_utf8(text, 0, text_utf8, 512);
callback(text_utf8);
blog(LOG_DEBUG, "\"%s\"", text_utf8);
} else if (event.eEventId == SPEI_END_SR_STREAM) {
exit = true;
break;
}
}
if (exit)
break;
}
audio->Stop();
} catch (HRError err) {
blog(LOG_WARNING, "%s failed: %s (%lX)", __FUNCTION__, err.str, err.hr);
}
/* Forwards `frames` frames of PCM `data` to the caption audio stream.
 * Input is dropped while `started` is false. */
void mssapi_captions::pcm_data(const void *data, size_t frames)
{
	if (!started)
		return;

	audio->PushAudio(data, frames);
}
/* Handler description for the Microsoft Speech API backend: the first
 * callable returns the string "Microsoft Speech-to-Text", the second
 * allocates a new mssapi_captions for the given callback and language. */
captions_handler_info mssapi_info = {
	[] () -> std::string {
		return "Microsoft Speech-to-Text";
	},
	[] (captions_cb cb, const std::string &lang) -> captions_handler * {
		captions_handler *created = new mssapi_captions(cb, lang);
		return created;
	}
};
|