File: captions-mssapi.cpp

package info (click to toggle)
obs-studio 22.0.3%2Bdfsg1-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 23,052 kB
  • sloc: ansic: 134,708; cpp: 49,169; objc: 1,036; makefile: 829; sh: 410; python: 360
file content (179 lines) | stat: -rw-r--r-- 4,359 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#include "captions-mssapi.hpp"

#define do_log(type, format, ...) blog(type, "[Captions] " format, \
		##__VA_ARGS__)

#define error(format, ...) do_log(LOG_ERROR, format, ##__VA_ARGS__)
#define debug(format, ...) do_log(LOG_DEBUG, format, ##__VA_ARGS__)

mssapi_captions::mssapi_captions(
		captions_cb callback,
		const std::string &lang) try
	: captions_handler(callback, AUDIO_FORMAT_16BIT, 16000)
{
	HRESULT hr;

	std::wstring wlang;
	wlang.resize(lang.size());

	for (size_t i = 0; i < lang.size(); i++)
		wlang[i] = (wchar_t)lang[i];

	LCID lang_id = LocaleNameToLCID(wlang.c_str(), 0);

	wchar_t lang_str[32];
	_snwprintf(lang_str, 31, L"language=%x", (int)lang_id);

	stop = CreateEvent(nullptr, false, false, nullptr);
	if (!stop.Valid())
		throw "Failed to create event";

	hr = SpFindBestToken(SPCAT_RECOGNIZERS, lang_str, nullptr, &token);
	if (FAILED(hr))
		throw HRError("SpFindBestToken failed", hr);

	hr = CoCreateInstance(CLSID_SpInprocRecognizer, nullptr, CLSCTX_ALL,
			__uuidof(ISpRecognizer), (void**)&recognizer);
	if (FAILED(hr))
		throw HRError("CoCreateInstance for recognizer failed", hr);

	hr = recognizer->SetRecognizer(token);
	if (FAILED(hr))
		throw HRError("SetRecognizer failed", hr);

	hr = recognizer->SetRecoState(SPRST_INACTIVE);
	if (FAILED(hr))
		throw HRError("SetRecoState(SPRST_INACTIVE) failed", hr);

	hr = recognizer->CreateRecoContext(&context);
	if (FAILED(hr))
		throw HRError("CreateRecoContext failed", hr);

	ULONGLONG interest = SPFEI(SPEI_RECOGNITION) |
	                     SPFEI(SPEI_END_SR_STREAM);
	hr = context->SetInterest(interest, interest);
	if (FAILED(hr))
		throw HRError("SetInterest failed", hr);

	hr = context->SetNotifyWin32Event();
	if (FAILED(hr))
		throw HRError("SetNotifyWin32Event", hr);

	notify = context->GetNotifyEventHandle();
	if (notify == INVALID_HANDLE_VALUE)
		throw HRError("GetNotifyEventHandle failed", E_NOINTERFACE);

	size_t sample_rate = audio_output_get_sample_rate(obs_get_audio());
	audio = new CaptionStream((DWORD)sample_rate, this);
	audio->Release();

	hr = recognizer->SetInput(audio, false);
	if (FAILED(hr))
		throw HRError("SetInput failed", hr);

	hr = context->CreateGrammar(1, &grammar);
	if (FAILED(hr))
		throw HRError("CreateGrammar failed", hr);

	hr = grammar->LoadDictation(nullptr, SPLO_STATIC);
	if (FAILED(hr))
		throw HRError("LoadDictation failed", hr);

	try {
		t = std::thread([this] () {main_thread();});
	} catch (...) {
		throw "Failed to create thread";
	}

} catch (const char *err) {
	blog(LOG_WARNING, "%s: %s", __FUNCTION__, err);
	throw CAPTIONS_ERROR_GENERIC_FAIL;

} catch (HRError err) {
	blog(LOG_WARNING, "%s: %s (%lX)", __FUNCTION__, err.str, err.hr);
	throw CAPTIONS_ERROR_GENERIC_FAIL;
}

mssapi_captions::~mssapi_captions()
{
	if (t.joinable()) {
		SetEvent(stop);
		t.join();
	}
}

void mssapi_captions::main_thread()
try {
	HRESULT hr;

	os_set_thread_name(__FUNCTION__);

	hr = grammar->SetDictationState(SPRS_ACTIVE);
	if (FAILED(hr))
		throw HRError("SetDictationState failed", hr);

	hr = recognizer->SetRecoState(SPRST_ACTIVE);
	if (FAILED(hr))
		throw HRError("SetRecoState(SPRST_ACTIVE) failed", hr);

	HANDLE events[] = {notify, stop};

	started = true;

	for (;;) {
		DWORD ret = WaitForMultipleObjects(2, events, false, INFINITE);
		if (ret != WAIT_OBJECT_0)
			break;

		CSpEvent event;
		bool exit = false;

		while (event.GetFrom(context) == S_OK) {
			if (event.eEventId == SPEI_RECOGNITION) {
				ISpRecoResult *result = event.RecoResult();

				CoTaskMemPtr<wchar_t> text;
				hr = result->GetText((ULONG)-1, (ULONG)-1,
						true, &text, nullptr);
				if (FAILED(hr))
					continue;

				char text_utf8[512];
				os_wcs_to_utf8(text, 0, text_utf8, 512);

				callback(text_utf8);

				blog(LOG_DEBUG, "\"%s\"", text_utf8);

			} else if (event.eEventId == SPEI_END_SR_STREAM) {
				exit = true;
				break;
			}
		}

		if (exit)
			break;
	}

	audio->Stop();

} catch (HRError err) {
	blog(LOG_WARNING, "%s failed: %s (%lX)", __FUNCTION__, err.str, err.hr);
}

void mssapi_captions::pcm_data(const void *data, size_t frames)
{
	if (started)
		audio->PushAudio(data, frames);
}

captions_handler_info mssapi_info = {
	[] () -> std::string
	{
		return "Microsoft Speech-to-Text";
	},
	[] (captions_cb cb, const std::string &lang) -> captions_handler *
	{
		return new mssapi_captions(cb, lang);
	}
};