File: Utf8.cpp

package info (click to toggle)
endless-sky 0.10.16-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 414,608 kB
  • sloc: cpp: 73,435; python: 893; xml: 666; sh: 271; makefile: 28
file content (170 lines) | stat: -rw-r--r-- 3,812 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
/* Utf8.cpp
Copyright (c) 2017, 2018 by Flavio J. Saraiva

Endless Sky is free software: you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later version.

Endless Sky is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <https://www.gnu.org/licenses/>.
*/

#include "Utf8.h"

#if defined(_WIN32)
#define STRICT
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif

using namespace std;

namespace {
	// Code point U+FEFF, the value treated as the byte order mark.
	constexpr char32_t BOM = U'\uFEFF';
}



namespace Utf8 {
#if defined(_WIN32)
	// Converts a UTF-8 encoded string to a wide (UTF-16) string. Windows only.
	// When isPath is set and the input ends in '/' or '\\', that final
	// separator is left out of the converted text.
	// An empty input yields an empty result.
	wstring ToUTF16(const string &input, bool isPath)
	{
		if(input.empty())
			return wstring();

		// Drop one trailing path separator from the converted range if requested.
		const char last = input.back();
		const bool dropTrailingSlash = isPath && (last == '/' || last == '\\');
		const int byteCount = input.length() - dropTrailingSlash;

		// The first call only measures the output; the second call fills it.
		const int wideCount = MultiByteToWideChar(CP_UTF8, 0, &input[0], byteCount, nullptr, 0);
		wstring converted(wideCount, L'\0');
		MultiByteToWideChar(CP_UTF8, 0, &input[0], byteCount, &converted[0], wideCount);

		return converted;
	}



	// Converts a null-terminated wide (UTF-16) string to UTF-8. Windows only.
	// Returns an empty string when str is null, when it is empty, or when the
	// conversion fails.
	string ToUTF8(const wchar_t *str)
	{
		string result;
		if(!str || !*str)
			return result;

		const auto page = CP_UTF8;
		// With an input length of -1 the API also converts the terminating null,
		// so the reported size includes it. A return value of 0 means failure.
		const int size = WideCharToMultiByte(page, 0, str, -1, nullptr, 0, nullptr, nullptr);
		if(size <= 1)
			return result;

		// Give the second call a buffer large enough for the text plus its
		// terminator; a buffer that is one byte short makes the call fail.
		result.resize(size);
		const int written = WideCharToMultiByte(page, 0, str, -1, &result[0], size, nullptr, nullptr);
		if(written <= 0)
		{
			result.clear();
			return result;
		}

		// Remove the terminating null that was converted along with the text.
		result.resize(written - 1);
		return result;
	}
#endif



	// Reports whether the given code point equals the byte order mark (BOM).
	bool IsBOM(char32_t c)
	{
		const bool isMark = (BOM == c);
		return isMark;
	}



	// Returns the index of the first byte after pos that begins a code point,
	// or the string length if no such byte follows. Returns string::npos when
	// pos is already at or beyond the end of the string.
	size_t NextCodePoint(const string &str, size_t pos)
	{
		const size_t length = str.length();
		if(pos >= length)
			return string::npos;

		// Step past the current byte, then skip continuation bytes (10xxxxxx).
		size_t next = pos + 1;
		while(next < length && (str[next] & 0xc0) == 0x80)
			++next;
		return next;
	}



	// Walks backwards from pos to the first byte of the UTF-8 code point that
	// contains it, and returns that index. Stops at index 0 without inspecting it.
	size_t CodePointStart(const string &str, size_t pos)
	{
		// Continuation bytes have the form 10xxxxxx; any other byte
		// (0xxxxxxx or 11xxxxxx) begins a code point.
		for( ; pos > 0; --pos)
			if((str[pos] & 0xc0) != 0x80)
				break;
		return pos;
	}



	// Determines how many bytes the UTF-8 code point starting at str occupies.
	// Returns 0 for a null pointer or a NUL byte, 1 to 4 for a well-formed
	// sequence, and -1 when the lead byte or any expected continuation byte
	// is malformed.
	int CodePointBytes(const char *str)
	{
		if(!str)
			return 0;

		const unsigned char lead = static_cast<unsigned char>(*str);
		// End of string - 00000000
		if(lead == 0)
			return 0;
		// Single byte - 0xxxxxxx
		if(lead < 0x80)
			return 1;

		// Work out the sequence length announced by the lead byte.
		int expected = -1;
		if((lead & 0xe0) == 0xc0)
			expected = 2; // 110xxxxx
		else if((lead & 0xf0) == 0xe0)
			expected = 3; // 1110xxxx
		else if((lead & 0xf8) == 0xf0)
			expected = 4; // 11110xxx
		else
			return -1; // 10xxxxxx (stray continuation) or 11111xxx (not unicode)

		// Every following byte must be a continuation byte (10xxxxxx). The scan
		// stops at the first mismatch, so it never reads past a NUL terminator.
		for(int i = 1; i < expected; ++i)
			if((str[i] & 0xc0) != 0x80)
				return -1;

		return expected;
	}



	// Decodes the UTF-8 code point that starts at pos and advances pos past it.
	// If pos is at or beyond the end of the string, pos becomes string::npos
	// and 0 is returned. A malformed sequence consumes one byte and yields
	// 0xFFFFFFFF; a NUL byte consumes one byte and yields 0.
	char32_t DecodeCodePoint(const string &str, size_t &pos)
	{
		if(pos >= str.length())
		{
			pos = string::npos;
			return 0;
		}

		const int length = CodePointBytes(str.c_str() + pos);
		// 0 marks a NUL byte and -1 a malformed sequence. Skip a single byte and
		// return the value as is; -1 converts to 0xFFFFFFFF.
		if(length <= 0)
		{
			++pos;
			return length;
		}

		// A single byte carries 7 payload bits; the lead byte of a longer
		// sequence carries (7 - length) payload bits.
		const int leadMask = (length == 1) ? 0x7f : (1 << (7 - length)) - 1;
		const size_t end = pos + length;
		char32_t codePoint = (str[pos] & leadMask);

		// Each continuation byte contributes its low 6 bits.
		for(++pos; pos < end; ++pos)
			codePoint = (codePoint << 6) + (str[pos] & 0x3f);
		return codePoint;
	}
}