File: unicode.h

package info (click to toggle)
freespace2 24.2.0%2Brepack-1
  • links: PTS, VCS
  • area: non-free
  • in suites: forky, sid
  • size: 43,716 kB
  • sloc: cpp: 595,001; ansic: 21,741; python: 1,174; sh: 457; makefile: 248; xml: 181
file content (171 lines) | stat: -rw-r--r-- 5,263 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#pragma once

#include "globalincs/pstypes.h"

#include "mod_table/mod_table.h"

#include <iterator>
#include <cinttypes>

#include <utf8.h>

#if !HAVE_CHAR32_T
// Older compilers don't have char32_t as a built-in type, so we define it for those as the smallest unsigned
// integer type that holds at least 32 bits.
// Note: an undefined HAVE_CHAR32_T evaluates to 0 inside #if, so this fallback is also active when the macro was
// never defined (presumably it is supplied by the build configuration -- TODO confirm).
typedef std::uint_least32_t char32_t;
#endif

/**
 * @brief Produces a @c char32_t character constant from a character literal.
 *
 * When HAVE_UNICODE_CHAR_LITERAL is set, the argument is turned into a U'' literal by token pasting, so it must be
 * a plain character literal such as 'a'. Otherwise this is a compatibility fallback for compilers which don't support
 * U'' char literals and the argument is simply converted to @c char32_t.
 *
 * @warning The fallback can only handle standard ASCII character values since older compilers still use standard
 * character literals
 */
#if !HAVE_UNICODE_CHAR_LITERAL
// Fully parenthesized so the conversion covers the whole argument and the expansion behaves as a single operand
// (e.g. UNICODE_CHAR('a' + 1) or sizeof UNICODE_CHAR('a')).
#define UNICODE_CHAR(c) (static_cast<char32_t>(c))
#else
#define UNICODE_CHAR(c) U##c
#endif

namespace unicode {

/**
 * @brief The type used to hold a single Unicode code point (an alias for @c char32_t)
 */
using codepoint_t = char32_t;

/**
 * @brief Iterator over a range of bytes whose dereferenced value is a code point
 *
 * The iterator keeps three byte pointers: its current position and the two bounds of the range it belongs to.
 * Only declarations are present in this header; the member functions are defined elsewhere.
 *
 * @note The constructor takes the range start before the range end, while the data members are declared with the
 * end before the start.
 */
class text_iterator {
	const char* current_byte{nullptr};
	const char* range_end_byte{nullptr};
	const char* range_start_byte{nullptr};

	// Presumably checks that both iterators share the same range bounds -- confirm against the definition
	bool is_from_same_range(const text_iterator& other) const;

 public:
	/// The type produced by dereferencing the iterator
	using value_type = codepoint_t;

	/**
	 * @param current_byte The byte position this iterator refers to
	 * @param range_start_byte The first byte of the range
	 * @param range_end_byte The end of the range; defaults to @c nullptr (presumably meaning a null-terminated
	 * string, as for codepoint_range -- confirm against the definition)
	 */
	explicit text_iterator(const char* current_byte, const char* range_start_byte, const char* range_end_byte = nullptr);

	/// Raw byte pointer for this iterator (presumably the current position -- confirm against the definition)
	const char* pos() const;

	// NOTE(review): dereferencing is declared non-const, so it cannot be used through a const text_iterator
	value_type operator*();

	// Stepping forwards and backwards
	text_iterator& operator++();
	text_iterator& operator--();

	// Offset copies; the unit of diff is not visible from this header
	text_iterator operator+(ptrdiff_t diff) const;
	text_iterator operator-(ptrdiff_t diff) const;

	// Equality
	bool operator==(const text_iterator& rhs) const;
	bool operator!=(const text_iterator& rhs) const;

	// Ordering
	bool operator<(const text_iterator& rhs) const;
	bool operator>(const text_iterator& rhs) const;
	bool operator<=(const text_iterator& rhs) const;
	bool operator>=(const text_iterator& rhs) const;
};

/**
 * @brief A view over an encoded string which exposes its code points through text_iterator
 *
 * @note Provides begin() and end(), so it can be used in range based for loops
 */
class codepoint_range {
	const char* start{nullptr};
	const char* end_ptr{nullptr};

 public:
	/**
	 * @brief Builds a range over an UTF-8 encoded string
	 * @param start Pointer to the first byte of the encoded string
	 * @param end Pointer to the end of the encoded string. If this is @c nullptr (the default) then @c start is
	 * assumed to be null-terminated
	 */
	explicit codepoint_range(const char* start, const char* end = nullptr);

	/**
	 * @brief Gets the iterator for the first position of the range
	 * @return An iterator located at the start of the range
	 */
	text_iterator begin();

	/**
	 * @brief Gets the iterator which marks the end of the range
	 * @return An iterator located at the end of the range
	 */
	text_iterator end();
};

/**
 * @brief Computes the byte size the given codepoint would have if it were encoded using the standard encoding
 *
 * The standard encoding is UTF-8 in Unicode mode and ASCII otherwise.
 *
 * @note Only the declaration lives in this header; the implementation is defined elsewhere.
 *
 * @param cp The codepoint to determine the size for.
 * @return The number of bytes required for encoding this code point.
 */
size_t encoded_size(codepoint_t cp);

/**
 * @brief Writes the encoded form of a code point to an output position.
 *
 * @c buffer may be a pointer into a character buffer or any output iterator. When Unicode_text_mode is set the code
 * point is written as UTF-8 via utf8::append. Otherwise exactly one @c char is written, which is the code point
 * converted (truncated) to @c char.
 *
 * If encoding raises a std::exception, it is reported through Error() and the unmodified @c buffer is returned.
 *
 * @tparam octet_iterator The type of the output sequence.
 * @param cp The codepoint to encode
 * @param buffer The position to write the encoded data to.
 * @return The iterator positioned after the last byte that was written.
 */
template<typename octet_iterator>
octet_iterator encode(codepoint_t cp, octet_iterator buffer) {
	if (!Unicode_text_mode) {
		// Legacy mode: one code point always maps to exactly one char
		*buffer++ = static_cast<char>(cp);
		return buffer;
	}

	try {
		return utf8::append(cp, buffer);
	} catch (const std::exception& err) {
		Error(LOCATION, "Exception while encoding Unicode code point %" PRIu32 ": %s", static_cast<uint32_t>(cp), err.what());
	}

	// Only reached when encoding failed and Error() returned
	return buffer;
}

/**
 * @brief Counts the number of code points in the specified byte sequence
 *
 * @note This respects the Unicode mod setting and should be used where the text contains Unicode characters.
 *
 * @tparam octet_iterator The type of the byte sequence.
 * @param start An iterator located at the start of the sequence
 * @param end An iterator which signals the end of the sequence
 * @return The number of codepoints found between @c start and @c end
 */
template<typename octet_iterator>
size_t num_codepoints(octet_iterator start, octet_iterator end) {
	if (Unicode_text_mode) {
		try {
			return static_cast<size_t>(utf8::distance(start, end));
		} catch(const std::exception& e) {
			Error(LOCATION, "Exception while counting Unicode code points: %s", e.what());
			return 0;
		}
	} else {
		return static_cast<size_t>(std::distance(start, end));
	}
}

/**
 * @brief Moves an iterator forward by @c n code points within the range that ends at @c end
 *
 * When Unicode_text_mode is set this forwards to utf8::advance. Unlike encode() and num_codepoints(), exceptions
 * raised by that call are not caught here and propagate to the caller.
 *
 * Otherwise every element is one code point and the iterator is moved by @c n elements, clamped to @c end. The
 * clamping is done on the remaining distance so that no iterator beyond @c end is ever formed (computing
 * <tt>start + n</tt> past the end of the underlying array is undefined behavior, even if it is only compared).
 *
 * @tparam octet_iterator The type of the byte sequence. The legacy path requires a random access iterator.
 * @param start The iterator to advance; updated in place
 * @param n The number of code points to skip
 * @param end The end of the sequence
 */
template<typename octet_iterator>
void advance(octet_iterator& start, size_t n, octet_iterator end) {
	if (Unicode_text_mode) {
		utf8::advance(start, n, end);
	} else {
		typedef typename std::iterator_traits<octet_iterator>::difference_type diff_t;

		const diff_t remaining = std::distance(start, end);
		if (remaining <= 0 || n >= static_cast<size_t>(remaining)) {
			// Not enough elements left (or start is already at/after end): stop at end, as before
			start = end;
		} else {
			start += static_cast<diff_t>(n);
		}
	}
}

/**
 * @brief Identifies a text encoding for the conversion helpers declared below
 */
enum class Encoding {
	Encoding_current,  //!< Presumably the encoding the engine currently uses -- confirm against the implementation
	Encoding_utf8,     //!< UTF-8
	Encoding_iso8859_1 //!< ISO 8859-1
};

// The following helpers are only declared here; their implementations are defined elsewhere.

// Presumably reports whether the first len bytes of str are all ASCII -- confirm against the implementation.
bool string_is_ascii_only(const char* str, size_t len);
// Returns a C string for the given encoding (presumably its name as expected by the conversion backend -- TODO confirm).
const char* get_encoding_string(Encoding encoding);
// Converts src from encoding_src to encoding_dest, which defaults to Encoding::Encoding_current.
// Presumably the result is stored in buffer and the return value signals success -- confirm against the implementation.
bool convert_encoding(SCP_string& buffer, const char* src, Encoding encoding_src, Encoding encoding_dest = Encoding::Encoding_current);
}