File: Utf8.cpp

package info (click to toggle)
jazz2-native 3.5.0-3
  • links: PTS, VCS
  • area: contrib
  • in suites: forky, sid
  • size: 16,912 kB
  • sloc: cpp: 172,557; xml: 113; python: 36; makefile: 5; sh: 2
file content (180 lines) | stat: -rw-r--r-- 6,816 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#include "Utf8.h"
#include "CommonWindows.h"
#include "Asserts.h"

namespace Death { namespace Utf8 {
//###==##====#=====--==~--~=~- --- -- -  -  -   -

	// Lookup table with one entry for every possible byte value. Going by the name and the values,
	// an entry is the length in bytes of the UTF-8 sequence introduced by that byte when it is used
	// as a lead byte: 1 for 0x00 - 0x7F, 2 for 0xC2 - 0xDF, 3 for 0xE0 - 0xEF and 4 for 0xF0 - 0xF4.
	// All remaining bytes (0x80 - 0xBF, 0xC0, 0xC1 and 0xF5 - 0xFF) are listed as 1 as well --
	// presumably so that a byte which cannot start a sequence is stepped over on its own.
	const Containers::StaticArray<256, std::uint8_t> BytesOfLead = {{
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00 - 0x0F
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10 - 0x1F
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20 - 0x2F
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30 - 0x3F
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40 - 0x4F
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50 - 0x5F
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60 - 0x6F
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 - 0x7F
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80 - 0x8F
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90 - 0x9F
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xA0 - 0xAF
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xB0 - 0xBF
		1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0 - 0xCF
		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0 - 0xDF
		3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0 - 0xEF
		4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xF0 - 0xFF
	}};

	// Decodes the UTF-8 sequence that starts at byte offset `cursor` of `text`.
	//
	// Returns the decoded code point together with the offset just past the sequence. If the data
	// at `cursor` is not a well-formed sequence, returns U'\xffffffff' together with `cursor + 1`,
	// so the caller can step over the offending byte. A sequence is rejected when:
	//  - the first byte is not a valid sequence start,
	//  - the sequence would run past the end of `text`,
	//  - one of the following bytes is not a continuation byte (10xxxxxx),
	//  - the value is encoded in more bytes than necessary (overlong form),
	//  - the value is not below 0x110000, the same limit FromCodePoint() enforces.
	// Surrogate values (0xD800 - 0xDFFF) are deliberately still accepted, as FromCodePoint()
	// encodes them and the two functions should stay inverse to each other.
	//
	// `cursor` has to be less than `text.size()`, which is checked by a debug assertion.
	Containers::Pair<char32_t, std::size_t> NextChar(const Containers::ArrayView<const char> text, std::size_t cursor)
	{
		DEATH_DEBUG_ASSERT(cursor < text.size(), ("Expected cursor to be less than {} but got {}", text.size(), cursor), {});

		// Go through unsigned char, so a byte >= 0x80 is never sign-extended on platforms where
		// plain char is signed
		const std::uint32_t character = static_cast<unsigned char>(text[cursor]);
		std::size_t end = cursor;
		std::uint32_t mask;
		// Smallest value that genuinely needs a sequence of the detected size; anything lower
		// is an overlong encoding
		char32_t minimum;

		// Sequence size
		if (character < 0x80) {
			end += 1;
			mask = 0x7f;
			minimum = 0x00;
		} else if ((character & 0xe0) == 0xc0) {
			end += 2;
			mask = 0x1f;
			minimum = 0x80;
		} else if ((character & 0xf0) == 0xe0) {
			end += 3;
			mask = 0x0f;
			minimum = 0x800;
		} else if ((character & 0xf8) == 0xf0) {
			end += 4;
			mask = 0x07;
			minimum = 0x10000;
		} else {
			// Wrong sequence start
			return {U'\xffffffff', cursor + 1};
		}

		// Unexpected end
		if (text.size() < end) return {U'\xffffffff', cursor + 1};

		char32_t result = (character & mask);
		for (std::size_t i = cursor + 1; i != end; i++) {
			// Garbage in the sequence
			if ((text[i] & 0xc0) != 0x80)
				return {U'\xffffffff', cursor + 1};

			result <<= 6;
			result |= (text[i] & 0x3f);
		}

		// Overlong encoding, or a value outside of the range FromCodePoint() is able to produce
		if (result < minimum || result >= U'\x00110000')
			return {U'\xffffffff', cursor + 1};

		return {result, end};
	}

	// Decodes the UTF-8 sequence that ends right before byte offset `cursor` of `text`.
	//
	// Returns the decoded code point together with the offset at which its sequence starts. If no
	// valid sequence ends exactly at `cursor`, returns U'\xffffffff' together with `cursor - 1`.
	// `cursor` has to be greater than 0 and at most `text.size()`, which is checked by a debug
	// assertion.
	Containers::Pair<char32_t, std::size_t> PrevChar(const Containers::ArrayView<const char> text, std::size_t cursor)
	{
		DEATH_DEBUG_ASSERT(cursor > 0 && cursor <= text.size(), ("Expected cursor to be greater than 0 and less than or equal to {} but got {}", text.size(), cursor), {});

		// Never look back further than four bytes (the longest possible sequence), nor past the
		// beginning of the text
		std::size_t limit = 4;
		if (cursor < limit) {
			limit = cursor;
		}

		// Walk backwards across continuation bytes until something else shows up or the limit is hit
		std::size_t back = 1;
		for (; back != limit; ++back) {
			if ((text[cursor - back] & 0xc0) != 0x80) {
				break;
			}
		}

		// NextChar() does the decoding and validation. The result only counts if the decoded
		// sequence ends precisely at the cursor.
		const std::size_t start = cursor - back;
		const Containers::Pair<char32_t, std::size_t> decoded = NextChar(text, start);
		if (decoded.first() != U'\xffffffff' && decoded.second() == cursor) {
			return {decoded.first(), start};
		}

		return {U'\xffffffff', cursor - 1};
	}
	
	// Encodes `character` as UTF-8 into `result` and returns the number of bytes written (1 to 4).
	// Returns 0 and leaves `result` untouched if `character` is 0x110000 or greater.
	std::size_t FromCodePoint(char32_t character, Containers::StaticArrayView<4, char> result)
	{
		// Pick the sequence length and the marker bits of the lead byte based on the value range
		std::size_t length;
		char32_t leadMarker;
		if (character < U'\x00000080') {
			length = 1;
			leadMarker = 0x00;
		} else if (character < U'\x00000800') {
			length = 2;
			leadMarker = 0xc0;
		} else if (character < U'\x00010000') {
			length = 3;
			leadMarker = 0xe0;
		} else if (character < U'\x00110000') {
			length = 4;
			leadMarker = 0xf0;
		} else {
			// Value outside of UTF-32 range
			return 0;
		}

		// Emit the continuation bytes back to front, six payload bits at a time
		char32_t remaining = character;
		for (std::size_t i = length - 1; i != 0; --i) {
			result[i] = static_cast<char>(0x80 | (remaining & 0x3f));
			remaining >>= 6;
		}

		// Whatever is left is guaranteed by the range checks above to fit into the payload bits
		// of the lead byte
		result[0] = static_cast<char>(leadMarker | remaining);
		return length;
	}

#if defined(DEATH_TARGET_WINDOWS)

	// Converts UTF-8 text to a newly allocated UTF-16 array.
	//
	// `sourceSize` is the size of `source` in bytes, or -1 if `source` is null-terminated. The
	// returned array reports its size without the null terminator, but the storage behind it is
	// always followed by one, so `data()` can be passed on as a C string. If `sourceSize` is 0 or
	// the conversion fails, the result is an empty (but still null-terminated) array.
	Containers::Array<wchar_t> ToUtf16(const char* source, std::int32_t sourceSize)
	{
		// MBtoWC can't be called with a zero size, and it returns 0 on failure. Both cases end up
		// with `required` being 0, which keeps the subtraction below from wrapping around.
		std::int32_t required = 0;
		if (sourceSize != 0) {
			required = ::MultiByteToWideChar(CP_UTF8, 0, source, sourceSize, nullptr, 0);
			if (required < 0) required = 0;
		}

		// For null-terminated input MBtoWC counts the trailing \0 into the size, which we have to cut
		std::size_t lengthNeeded = 0;
		if (required > 0) {
			lengthNeeded = static_cast<std::size_t>(required) - (sourceSize == -1 ? 1 : 0);
		}

		// Create the array with a sentinel null terminator. If size is zero, this is just a single null
		// terminator. We can't just `return {}`, because the output array is guaranteed to be a pointer
		// to a null-terminated string.
		Containers::Array<wchar_t> result{Containers::NoInit, lengthNeeded + 1};
		result[lengthNeeded] = L'\0';

		// Hand over exactly the size reported by the query above. The allocation always has room for it:
		// for null-terminated input it equals lengthNeeded + 1, otherwise it equals lengthNeeded.
		if (required > 0) ::MultiByteToWideChar(CP_UTF8, 0, source, sourceSize, result.data(), required);

		// Return the size without the null terminator
		return Containers::Array<wchar_t>(result.release(), lengthNeeded);
	}

	// Converts UTF-8 text to UTF-16 into a caller-provided buffer.
	//
	// `destinationSize` is the capacity of `destination` in wide characters and `sourceSize` is
	// the size of `source` in bytes, or -1 if `source` is null-terminated. Returns the value
	// reported by MultiByteToWideChar(), minus the null terminator for null-terminated input.
	// The output gets null-terminated whenever there is room left for the terminator.
	std::int32_t ToUtf16(wchar_t* destination, std::int32_t destinationSize, const char* source, std::int32_t sourceSize)
	{
		// Nothing to convert
		if (sourceSize == 0) {
			return 0;
		}

		const std::int32_t converted = ::MultiByteToWideChar(CP_UTF8, 0, source, sourceSize, destination, destinationSize);

		// With null-terminated input the terminator is included in the count, leave it out
		const bool includesTerminator = (sourceSize == -1 && converted > 0);
		const std::int32_t length = (includesTerminator ? converted - 1 : converted);

		// Terminate the output if the buffer can take it
		if (length < destinationSize) {
			destination[length] = L'\0';
		}
		return length;
	}

	// Converts UTF-16 text to a newly allocated UTF-8 string.
	//
	// `sourceSize` is the size of `source` in wide characters, or -1 if `source` is
	// null-terminated. Returns an empty string if `sourceSize` is 0 or if the conversion fails.
	Containers::String FromUtf16(const wchar_t* source, std::int32_t sourceSize)
	{
		if (sourceSize == 0) return {};

		// WCtoMB returns 0 on failure. Bail out in that case, otherwise cutting the terminator
		// below would wrap around to an enormous allocation size.
		const std::int32_t required = ::WideCharToMultiByte(CP_UTF8, 0, source, sourceSize, nullptr, 0, nullptr, nullptr);
		if (required <= 0) return {};

		// WCtoMB counts the trailing \0 into the size, which we have to cut. Containers::String takes
		// care of allocating extra for the null terminator so we don't need to do that explicitly.
		Containers::String result{Containers::NoInit, static_cast<std::size_t>(required - (sourceSize == -1 ? 1 : 0))};
		::WideCharToMultiByte(CP_UTF8, 0, source, sourceSize, result.data(), static_cast<std::int32_t>(result.size()), nullptr, nullptr);
		return result;
	}

	// Converts UTF-16 text to UTF-8 into a caller-provided buffer.
	//
	// `destinationSize` is the capacity of `destination` in bytes and `sourceSize` is the size of
	// `source` in wide characters, or -1 if `source` is null-terminated. Returns the value
	// reported by WideCharToMultiByte(), minus the null terminator for null-terminated input.
	// The output gets null-terminated whenever there is room left for the terminator.
	std::int32_t FromUtf16(char* destination, std::int32_t destinationSize, const wchar_t* source, std::int32_t sourceSize)
	{
		// Nothing to convert
		if (sourceSize == 0) {
			return 0;
		}

		const std::int32_t converted = ::WideCharToMultiByte(CP_UTF8, 0, source, sourceSize, destination, destinationSize, nullptr, nullptr);

		// With null-terminated input the terminator is included in the count, leave it out
		const bool includesTerminator = (sourceSize == -1 && converted > 0);
		const std::int32_t length = (includesTerminator ? converted - 1 : converted);

		// Terminate the output if the buffer can take it
		if (length < destinationSize) {
			destination[length] = '\0';
		}
		return length;
	}

#endif

}}