File: unicode.cc

package info (click to toggle)
simutrans 120.4.1-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 19,024 kB
  • sloc: cpp: 144,315; ansic: 2,782; makefile: 903; sh: 622
file content (263 lines) | stat: -rw-r--r-- 6,160 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
#include "unicode.h"
#include "simtypes.h"

utf32 const UNICODE_NUL = 0;

// A byte below 0x80 (bit pattern 0xxxxxxx) is plain ASCII and forms a complete 1 byte sequence.
static inline int is_1byte_seq(utf8 c)
{
	return c < 0x80;
}
// Lead byte of a 2 byte sequence: 110xxxxx 10yyyyyy carries the letter value 00000xxx xxyyyyyy.
static inline int is_2byte_seq(utf8 c)
{
	int const marker_bits = c & 0xE0; // keep only the top three bits
	return marker_bits == 0xC0;
}
// Lead byte of a 3 byte sequence: 1110xxxx 10yyyyyy 10zzzzzz carries the letter value xxxxyyyy yyzzzzzz.
static inline int is_3byte_seq(utf8 c)
{
	int const marker_bits = c & 0xF0; // keep only the top four bits
	return marker_bits == 0xE0;
}
// Continuation (tail) bytes of a multi byte sequence always have the bit pattern 10xxxxxx.
static inline int is_cont_char(utf8 c)
{
	int const marker_bits = c & 0xC0; // keep only the top two bits
	return marker_bits == 0x80;
}

// Creates a decoder whose read position is the first byte of str.
utf8_decoder_t::utf8_decoder_t(utf8 const *const str) :
	utf8str(str)
{
}

// Decodes the code point found at buff and advances buff past the bytes
// that the two argument overload reported as consumed.
utf32 utf8_decoder_t::decode(utf8 const *&buff)
{
	utf8 const *const start = buff;
	size_t consumed = 0;
	utf32 const result = decode(start, consumed);
	buff = start + consumed;
	return result;
}

utf32 utf8_decoder_t::decode(utf8 const *const buff, size_t &len) {
	// Decodes one code point starting at buff[0], following the well-formed
	// byte ranges of RFC 3629. On return len holds the number of bytes that
	// belong to the returned code point (always at least 1).

	utf8 const lead = buff[0];
	size_t expected = 0; // total length of the sequence; 0 marks "not a valid sequence"
	utf32 cp = 0;

	// Classify the lead byte. 0x80..0xC1 and 0xF5..0xFF match no branch and stay invalid.
	if(  lead <= 0x7F  ) {
		// ASCII character.
		cp = lead;
		expected = 1;
	}
	else if(  0xC2 <= lead  &&  lead <= 0xDF  ) {
		// 2 byte character.
		cp = lead & 0x1F;
		expected = 2;
	}
	else if(  0xE0 <= lead  &&  lead <= 0xEF  ) {
		// 3 byte character. The second byte is restricted for two lead bytes;
		// it is only read when the lead byte is one of those two.
		bool const second_too_low  = lead == 0xE0  &&  buff[1] < 0xA0;
		bool const second_too_high = lead == 0xED  &&  buff[1] > 0x9F;
		if(  !second_too_low  &&  !second_too_high  ) {
			cp = lead & 0xF;
			expected = 3;
		}
	}
	else if(  0xF0 <= lead  &&  lead <= 0xF4  ) {
		// 4 byte character, again with a restricted second byte for the
		// first and the last permitted lead byte.
		bool const second_too_low  = lead == 0xF0  &&  buff[1] < 0x90;
		bool const second_too_high = lead == 0xF4  &&  buff[1] > 0x8F;
		if(  !second_too_low  &&  !second_too_high  ) {
			cp = lead & 0x7;
			expected = 4;
		}
	}

	// Append six payload bits per tail byte; stop at the first byte that is not a tail byte.
	for(  size_t idx = 1;  idx < expected;  idx++  ) {
		utf8 const tail = buff[idx];
		if(  tail < 0x80  ||  tail > 0xBF  ) {
			// Invalid tail.
			expected = 0;
			break;
		}
		cp = (cp << 6) | (tail & 0x3F);
	}

	if(  expected == 0  ) {
		// Invalid sequence: consume only the lead byte and report its own value (ISO-8859-1).
		len = 1;
		return lead;
	}

	len = expected;
	return cp;
}

// True while the byte at the current read position is not the terminating NUL.
bool utf8_decoder_t::has_next() const
{
	return *utf8str != UNICODE_NUL;
}

// Returns the code point at the read position and advances past it;
// at the end of the string UNICODE_NUL is returned and the position is left alone.
utf32 utf8_decoder_t::next()
{
	if(  !has_next()  ) {
		return UNICODE_NUL;
	}
	return decode(utf8str);
}

// Returns the pointer to the byte the decoder will read next.
utf8 const *utf8_decoder_t::get_position()
{
	utf8 const *const current = utf8str;
	return current;
}


size_t utf8_get_next_char(const utf8* text, size_t pos)
{
	// Move right by one character: step over the byte at pos, then over every
	// continuation byte (10xxxxxx, see is_cont_char()) that follows it.
	// This also works when pos points into the middle of a sequence.
	// The byte at pos itself is skipped without being inspected, so the caller
	// must not pass the position of the string terminator.
	size_t next = pos + 1;
	while(  is_cont_char(text[next])  ) {
		next++;
	}
	return next;
}


/**
 * Finds the start of the character to the left of pos.
 *
 * @param text UTF-8 encoded bytes; only indices below pos are read.
 * @param pos  byte index to start from.
 * @return byte index of the first byte of the previous character, or 0 when
 *         pos is already at (or before) the start of the text.
 */
sint32 utf8_get_prev_char(const utf8* text, sint32 pos)
{
	// Nothing lies to the left of the first byte. Without this guard the
	// function would hand back a negative index.
	if(  pos <= 0  ) {
		return 0;
	}

	// go left one character
	// the bytes in a sequence have always the format 10xxxxxx, thus we use is_cont_char()
	do {
		pos--;
	} while(  pos > 0  &&  is_cont_char(text[pos])  );
	return pos;
}


// Writes the UTF-8 encoding of c to out and returns the number of bytes written (1, 2 or 3).
// No terminator is appended.
int utf16_to_utf8(utf16 c, utf8* out)
{
	if(  c < 0x80  ) {
		// 0xxxxxxx
		out[0] = (utf8)c;
		return 1;
	}

	if(  c < 0x800  ) {
		// 110xxxxx 10yyyyyy
		out[0] = (utf8)(0xC0 | (c >> 6));
		out[1] = (utf8)(0x80 | (c & 0x3F));
		return 2;
	}

	// 1110xxxx 10yyyyyy 10zzzzzz
	// All remaining values are written as a 3 byte sequence, since 4 byte sequences are not supported here.
	out[0] = (utf8)(0xE0 | (c >> 12));
	out[1] = (utf8)(0x80 | ((c >> 6) & 0x3F));
	out[2] = (utf8)(0x80 | (c & 0x3F));
	return 3;
}



// Lookup table from Latin-2 (ISO-8859-2) characters 0xA0..0xFF to their Unicode code points.
// The index is the Latin-2 character code minus 0xA0. The table is read only.
static utf16 const latin2_to_unicode_lookup[96] =
{
	0x00A0, // char 0xA0
	0x0104, // char 0xA1
	0x02D8, // char 0xA2
	0x0141, // char 0xA3
	0x00A4, // char 0xA4
	0x013D, // char 0xA5
	0x015A, // char 0xA6
	0x00A7, // char 0xA7
	0x00A8, // char 0xA8
	0x0160, // char 0xA9
	0x015E, // char 0xAA
	0x0164, // char 0xAB
	0x0179, // char 0xAC
	0x00AD, // char 0xAD
	0x017D, // char 0xAE
	0x017B, // char 0xAF
	0x00B0, // char 0xB0
	0x0105, // char 0xB1
	0x02DB, // char 0xB2
	0x0142, // char 0xB3
	0x00B4, // char 0xB4
	0x013E, // char 0xB5
	0x015B, // char 0xB6
	0x02C7, // char 0xB7
	0x00B8, // char 0xB8
	0x0161, // char 0xB9
	0x015F, // char 0xBA
	0x0165, // char 0xBB
	0x017A, // char 0xBC
	0x02DD, // char 0xBD
	0x017E, // char 0xBE
	0x017C, // char 0xBF
	0x0154, // char 0xC0
	0x00C1, // char 0xC1
	0x00C2, // char 0xC2
	0x0102, // char 0xC3
	0x00C4, // char 0xC4
	0x0139, // char 0xC5
	0x0106, // char 0xC6
	0x00C7, // char 0xC7
	0x010C, // char 0xC8
	0x00C9, // char 0xC9
	0x0118, // char 0xCA
	0x00CB, // char 0xCB
	0x011A, // char 0xCC
	0x00CD, // char 0xCD
	0x00CE, // char 0xCE
	0x010E, // char 0xCF
	0x0110, // char 0xD0
	0x0143, // char 0xD1
	0x0147, // char 0xD2
	0x00D3, // char 0xD3
	0x00D4, // char 0xD4
	0x0150, // char 0xD5
	0x00D6, // char 0xD6
	0x00D7, // char 0xD7
	0x0158, // char 0xD8
	0x016E, // char 0xD9
	0x00DA, // char 0xDA
	0x0170, // char 0xDB
	0x00DC, // char 0xDC
	0x00DD, // char 0xDD
	0x0162, // char 0xDE
	0x00DF, // char 0xDF
	0x0155, // char 0xE0
	0x00E1, // char 0xE1
	0x00E2, // char 0xE2
	0x0103, // char 0xE3
	0x00E4, // char 0xE4
	0x013A, // char 0xE5
	0x0107, // char 0xE6
	0x00E7, // char 0xE7
	0x010D, // char 0xE8
	0x00E9, // char 0xE9
	0x0119, // char 0xEA
	0x00EB, // char 0xEB
	0x011B, // char 0xEC
	0x00ED, // char 0xED
	0x00EE, // char 0xEE
	0x010F, // char 0xEF
	0x0111, // char 0xF0
	0x0144, // char 0xF1
	0x0148, // char 0xF2
	0x00F3, // char 0xF3
	0x00F4, // char 0xF4
	0x0151, // char 0xF5
	0x00F6, // char 0xF6
	0x00F7, // char 0xF7
	0x0159, // char 0xF8
	0x016F, // char 0xF9
	0x00FA, // char 0xFA
	0x0171, // char 0xFB
	0x00FC, // char 0xFC
	0x00FD, // char 0xFD
	0x0163, // char 0xFE
	0x02D9 // char 0xFF
};



// Reverse lookup: searches the Latin-2 table for chr and returns the matching
// Latin-2 character code (0xA0..0xFF), or 0 when the table has no entry for chr.
uint8 unicode_to_latin2( utf16 chr )
{
	for(  unsigned idx = 0;  idx < 96;  idx++  ) {
		if(  chr == latin2_to_unicode_lookup[idx]  ) {
			// table index 0 corresponds to character 0xA0
			return (uint8)(0xA0 + idx);
		}
	}
	return 0;
}



// Maps a Latin-2 character to Unicode: values below 0xA0 are passed through
// unchanged, everything from 0xA0 upwards is taken from the lookup table.
utf16 latin2_to_unicode( uint8 chr )
{
	if(  chr < 0xA0  ) {
		return chr;
	}
	return latin2_to_unicode_lookup[chr - 0xA0];
}