File: utf8-combined.c

package info (click to toggle)
tmux 3.6a-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 4,508 kB
  • sloc: ansic: 72,681; sh: 1,813; yacc: 1,483; awk: 339; makefile: 235; perl: 41
file content (289 lines) | stat: -rw-r--r-- 7,511 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
/* $OpenBSD$ */

/*
 * Copyright (c) 2023 Nicholas Marriott <nicholas.marriott@gmail.com>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>

#include <stdlib.h>
#include <string.h>
#include <wchar.h>

#include "tmux.h"

/*
 * Fine-grained classification of a Hangul Jamo character. The range given
 * beside each value is the span of code points that belongs to it.
 */
enum hanguljamo_subclass {
	/* Anything that is not a Hangul Jamo. */
	HANGULJAMO_SUBCLASS_NOT_HANGULJAMO = 0,
	/* U+1100 - U+1112 */
	HANGULJAMO_SUBCLASS_CHOSEONG,
	/* U+1113 - U+115E */
	HANGULJAMO_SUBCLASS_OLD_CHOSEONG,
	/* U+115F */
	HANGULJAMO_SUBCLASS_CHOSEONG_FILLER,
	/* U+1160 */
	HANGULJAMO_SUBCLASS_JUNGSEONG_FILLER,
	/* U+1161 - U+1175 */
	HANGULJAMO_SUBCLASS_JUNGSEONG,
	/* U+1176 - U+11A7 */
	HANGULJAMO_SUBCLASS_OLD_JUNGSEONG,
	/* U+11A8 - U+11C2 */
	HANGULJAMO_SUBCLASS_JONGSEONG,
	/* U+11C3 - U+11FF */
	HANGULJAMO_SUBCLASS_OLD_JONGSEONG,
	/* U+A960 - U+A97C */
	HANGULJAMO_SUBCLASS_EXTENDED_OLD_CHOSEONG,
	/* U+D7B0 - U+D7C6 */
	HANGULJAMO_SUBCLASS_EXTENDED_OLD_JUNGSEONG,
	/* U+D7CB - U+D7FB */
	HANGULJAMO_SUBCLASS_EXTENDED_OLD_JONGSEONG
};

/*
 * Coarse classification of a Hangul Jamo character; every subclass maps onto
 * exactly one of these.
 */
enum hanguljamo_class {
	HANGULJAMO_CLASS_NOT_HANGULJAMO = 0,	/* not a Hangul Jamo */
	HANGULJAMO_CLASS_CHOSEONG,		/* any choseong subclass */
	HANGULJAMO_CLASS_JUNGSEONG,		/* any jungseong subclass */
	HANGULJAMO_CLASS_JONGSEONG		/* any jongseong subclass */
};

/*
 * Do the last three bytes of this character encode a zero width joiner
 * (U+200D)? Returns 1 if so, 0 otherwise (including when there are fewer than
 * three bytes).
 */
int
utf8_has_zwj(const struct utf8_data *ud)
{
	static const char	zwj[] = "\342\200\215";

	return (ud->size >= 3 &&
	    memcmp(ud->data + ud->size - 3, zwj, 3) == 0);
}

/*
 * Is this character exactly a zero width joiner (U+200D)? Returns 1 if the
 * data is the three byte encoding of U+200D and nothing else, 0 otherwise.
 */
int
utf8_is_zwj(const struct utf8_data *ud)
{
	static const char	zwj[] = "\342\200\215";

	return (ud->size == 3 && memcmp(ud->data, zwj, 3) == 0);
}

/*
 * Is this character exactly the variation selector U+FE0F? Returns 1 if the
 * data is the three byte encoding of U+FE0F and nothing else, 0 otherwise.
 */
int
utf8_is_vs(const struct utf8_data *ud)
{
	static const char	vs[] = "\357\270\217";

	return (ud->size == 3 && memcmp(ud->data, vs, 3) == 0);
}

/*
 * Is this character exactly the Hangul filler U+3164? Returns 1 if the data
 * is the three byte encoding of U+3164 and nothing else, 0 otherwise.
 */
int
utf8_is_hangul_filler(const struct utf8_data *ud)
{
	static const char	filler[] = "\343\205\244";

	return (ud->size == 3 && memcmp(ud->data, filler, 3) == 0);
}

/*
 * Should these two characters combine?
 *
 * "with" is taken to be the earlier character and "add" the one that follows
 * it. Returns 1 if the pair should combine and 0 if not; 0 is also returned if
 * utf8_towc() does not report UTF8_DONE for either character.
 *
 * Two kinds of pair are recognised:
 *
 * - two regional indicators (U+1F1E6 to U+1F1FF), in either order;
 * - an emoji from the list below followed by a skin tone modifier (U+1F3FB to
 *   U+1F3FF). The modifier comes after the emoji it changes, so the emoji must
 *   be "with" and the modifier must be "add".
 */
int
utf8_should_combine(const struct utf8_data *with, const struct utf8_data *add)
{
	wchar_t	w, a;

	if (utf8_towc(with, &w) != UTF8_DONE)
		return (0);
	if (utf8_towc(add, &a) != UTF8_DONE)
		return (0);

	/* Regional indicators. */
	if ((a >= 0x1F1E6 && a <= 0x1F1FF) && (w >= 0x1F1E6 && w <= 0x1F1FF))
		return (1);

	/*
	 * Emoji skin tone modifiers: the existing character must be one of the
	 * emoji that accept a modifier and the added one must be the modifier.
	 */
	switch (w) {
	case 0x1F44B:
	case 0x1F44C:
	case 0x1F44D:
	case 0x1F44E:
	case 0x1F44F:
	case 0x1F450:
	case 0x1F466:
	case 0x1F467:
	case 0x1F468:
	case 0x1F469:
	case 0x1F46E:
	case 0x1F470:
	case 0x1F471:
	case 0x1F472:
	case 0x1F473:
	case 0x1F474:
	case 0x1F475:
	case 0x1F476:
	case 0x1F477:
	case 0x1F478:
	case 0x1F47C:
	case 0x1F481:
	case 0x1F482:
	case 0x1F485:
	case 0x1F486:
	case 0x1F487:
	case 0x1F4AA:
	case 0x1F575:
	case 0x1F57A:
	case 0x1F590:
	case 0x1F595:
	case 0x1F596:
	case 0x1F645:
	case 0x1F646:
	case 0x1F647:
	case 0x1F64B:
	case 0x1F64C:
	case 0x1F64D:
	case 0x1F64E:
	case 0x1F64F:
	case 0x1F6B4:
	case 0x1F6B5:
	case 0x1F6B6:
	case 0x1F926:
	case 0x1F937:
	case 0x1F938:
	case 0x1F939:
	case 0x1F93D:
	case 0x1F93E:
	case 0x1F9B5:
	case 0x1F9B6:
	case 0x1F9B8:
	case 0x1F9B9:
	case 0x1F9CD:
	case 0x1F9CE:
	case 0x1F9CF:
	case 0x1F9D1:
	case 0x1F9D2:
	case 0x1F9D3:
	case 0x1F9D4:
	case 0x1F9D5:
	case 0x1F9D6:
	case 0x1F9D7:
	case 0x1F9D8:
	case 0x1F9D9:
	case 0x1F9DA:
	case 0x1F9DB:
	case 0x1F9DC:
	case 0x1F9DD:
	case 0x1F9DE:
	case 0x1F9DF:
		if (a >= 0x1F3FB && a <= 0x1F3FF)
			return (1);
		break;
	}
	return (0);
}

static enum hanguljamo_subclass
hanguljamo_get_subclass(const u_char *s)
{
	switch (s[0]) {
	case 0xE1:
		switch (s[1]) {
		case 0x84:
			if (s[2] >= 0x80 && s[2] <= 0x92)
				return (HANGULJAMO_SUBCLASS_CHOSEONG);
			if (s[2] >= 0x93 && s[2] <= 0xBF)
				return (HANGULJAMO_SUBCLASS_OLD_CHOSEONG);
			break;
		case 0x85:
			if (s[2] == 0x9F)
				return (HANGULJAMO_SUBCLASS_CHOSEONG_FILLER);
			if (s[2] == 0xA0)
				return (HANGULJAMO_SUBCLASS_JUNGSEONG_FILLER);
			if (s[2] >= 0x80 && s[2] <= 0x9E)
				return (HANGULJAMO_SUBCLASS_OLD_CHOSEONG);
			if (s[2] >= 0xA1 && s[2] <= 0xB5)
				return (HANGULJAMO_SUBCLASS_JUNGSEONG);
			if (s[2] >= 0xB6 && s[2] <= 0xBF)
				return (HANGULJAMO_SUBCLASS_OLD_JUNGSEONG);
			break;
		case 0x86:
			if (s[2] >= 0x80 && s[2] <= 0xA7)
				return (HANGULJAMO_SUBCLASS_OLD_JUNGSEONG);
			if (s[2] >= 0xA8 && s[2] <= 0xBF)
				return (HANGULJAMO_SUBCLASS_JONGSEONG);
			break;
		case 0x87:
			if (s[2] >= 0x80 && s[2] <= 0x82)
				return (HANGULJAMO_SUBCLASS_JONGSEONG);
			if (s[2] >= 0x83 && s[2] <= 0xBF)
				return (HANGULJAMO_SUBCLASS_OLD_JONGSEONG);
			break;
		}
		break;
	case 0xEA:
		if (s[1] == 0xA5 && s[2] >= 0xA0 && s[2] <= 0xBC)
			return (HANGULJAMO_SUBCLASS_EXTENDED_OLD_CHOSEONG);
		break;
	case 0xED:
		if (s[1] == 0x9E && s[2] >= 0xB0 && s[2] <= 0xBF)
			return (HANGULJAMO_SUBCLASS_EXTENDED_OLD_JUNGSEONG);
		if (s[1] != 0x9F)
			break;
		if (s[2] >= 0x80 && s[2] <= 0x86)
			return (HANGULJAMO_SUBCLASS_EXTENDED_OLD_JUNGSEONG);
		if (s[2] >= 0x8B && s[2] <= 0xBB)
			return (HANGULJAMO_SUBCLASS_EXTENDED_OLD_JONGSEONG);
		break;
	}
	return (HANGULJAMO_SUBCLASS_NOT_HANGULJAMO);
}

static enum hanguljamo_class
hanguljamo_get_class(const u_char *s)
{
	switch (hanguljamo_get_subclass(s)) {
	case HANGULJAMO_SUBCLASS_CHOSEONG:
	case HANGULJAMO_SUBCLASS_CHOSEONG_FILLER:
	case HANGULJAMO_SUBCLASS_OLD_CHOSEONG:
	case HANGULJAMO_SUBCLASS_EXTENDED_OLD_CHOSEONG:
		return (HANGULJAMO_CLASS_CHOSEONG);
	case HANGULJAMO_SUBCLASS_JUNGSEONG:
	case HANGULJAMO_SUBCLASS_JUNGSEONG_FILLER:
	case HANGULJAMO_SUBCLASS_OLD_JUNGSEONG:
	case HANGULJAMO_SUBCLASS_EXTENDED_OLD_JUNGSEONG:
		return (HANGULJAMO_CLASS_JUNGSEONG);
	case HANGULJAMO_SUBCLASS_JONGSEONG:
	case HANGULJAMO_SUBCLASS_OLD_JONGSEONG:
	case HANGULJAMO_SUBCLASS_EXTENDED_OLD_JONGSEONG:
		return (HANGULJAMO_CLASS_JONGSEONG);
	case HANGULJAMO_SUBCLASS_NOT_HANGULJAMO:
		return (HANGULJAMO_CLASS_NOT_HANGULJAMO);
	}
	return (HANGULJAMO_CLASS_NOT_HANGULJAMO);
}

enum hanguljamo_state
hanguljamo_check_state(const struct utf8_data *p_ud, const struct utf8_data *ud)
{
	const u_char	*s;

	if (ud->size != 3)
		return (HANGULJAMO_STATE_NOT_HANGULJAMO);

	switch (hanguljamo_get_class(ud->data)) {
	case HANGULJAMO_CLASS_CHOSEONG:
		return (HANGULJAMO_STATE_CHOSEONG);
	case HANGULJAMO_CLASS_JUNGSEONG:
		if (p_ud->size < 3)
			return (HANGULJAMO_STATE_NOT_COMPOSABLE);
		s = p_ud->data + p_ud->size - 3;
		if (hanguljamo_get_class(s) == HANGULJAMO_CLASS_CHOSEONG)
			return (HANGULJAMO_STATE_COMPOSABLE);
		return (HANGULJAMO_STATE_NOT_COMPOSABLE);
	case HANGULJAMO_CLASS_JONGSEONG:
		if (p_ud->size < 3)
			return (HANGULJAMO_STATE_NOT_COMPOSABLE);
		s = p_ud->data + p_ud->size - 3;
		if (hanguljamo_get_class(s) == HANGULJAMO_CLASS_JUNGSEONG)
			return (HANGULJAMO_STATE_COMPOSABLE);
		return (HANGULJAMO_STATE_NOT_COMPOSABLE);
	case HANGULJAMO_CLASS_NOT_HANGULJAMO:
		return (HANGULJAMO_STATE_NOT_HANGULJAMO);
	}
	return (HANGULJAMO_STATE_NOT_HANGULJAMO);
}