File: Utf.h

package info (click to toggle)
storm-lang 0.7.5-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 52,028 kB
  • sloc: ansic: 261,471; cpp: 140,432; sh: 14,891; perl: 9,846; python: 2,525; lisp: 2,504; asm: 860; makefile: 678; pascal: 70; java: 52; xml: 37; awk: 12
file content (161 lines) | stat: -rw-r--r-- 3,896 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#pragma once

namespace storm {

	// Codepoint for the 'replacement character' (U+FFFD), which is used whenever encoding or
	// decoding of text fails. The helpers in this file return it in place of data they cannot
	// represent.
	static const nat16 replacementChar = 0xFFFD;

	/**
	 * Utf16 helpers.
	 */
	namespace utf16 {

		// Is 'ch' a leading (first) code unit of a surrogate pair?
		static inline bool leading(nat16 ch) {
			// Leading surrogates all carry the bit pattern 110110 in their six topmost bits.
			const unsigned topSix = ch & 0xFC00u;
			return topSix == 0xD800u;
		}

		// Is 'ch' a trailing (second) code unit of a surrogate pair?
		static inline bool trailing(nat16 ch) {
			// Trailing surrogates all carry the bit pattern 110111 in their six topmost bits.
			const unsigned topSix = ch & 0xFC00u;
			return topSix == 0xDC00u;
		}

		// Combine the code units of a surrogate pair into the codepoint they encode. Only the low
		// 10 bits of each unit are used; no check is made that 'lead' and 'trail' really are
		// surrogates.
		static inline nat assemble(nat16 lead, nat16 trail) {
			const nat upper = nat(lead & 0x3FF) << nat(10);
			const nat lower = nat(trail & 0x3FF);
			// Surrogate pairs encode the codepoint relative to 0x10000.
			return (upper | lower) + 0x10000;
		}

		// Is 'cp' a valid codepoint? It must be below 0x110000 and must not lie in the surrogate
		// range 0xD800-0xDFFF.
		static inline bool valid(nat cp) {
			const bool inRange = cp < 0x110000;
			const bool surrogate = cp >= 0xD800 && cp <= 0xDFFF;
			return inRange && !surrogate;
		}

		// Does 'cp' need to be split into two code units? True for everything above 0xFFFF.
		static inline bool split(nat cp) {
			return cp > 0xFFFF;
		}

		// Get the leading code unit for 'cp'. Returns 0 if 'cp' should not be split, or if it is
		// not a valid codepoint.
		static inline nat16 splitLeading(nat cp) {
			if (split(cp) && valid(cp)) {
				// The leading unit stores the upper 10 bits of the offset from 0x10000.
				const nat upper = (cp - 0x10000) >> 10;
				return nat16(0xD800 + (upper & 0x3FF));
			}

			return 0;
		}

		// Get the trailing code unit for 'cp'. If 'cp' does not need to be split, the codepoint
		// itself is returned. Invalid codepoints yield 'replacementChar'.
		static inline nat16 splitTrailing(nat cp) {
			if (!valid(cp))
				return replacementChar;

			if (!split(cp))
				return nat16(cp);

			// The trailing unit stores the lower 10 bits of the offset from 0x10000.
			const nat offset = cp - 0x10000;
			return nat16(0xDC00 + (offset & 0x3FF));
		}

	}

	/**
	 * Utf8 helpers.
	 */
	namespace utf8 {

		// Is 'c' a continuation byte, i.e. of the form 10xxxxxx?
		static inline bool isCont(byte c) {
			const unsigned topTwo = c & 0xC0u;
			return topTwo == 0x80u;
		}

		// Extract the payload (the low 6 bits) from a continuation byte.
		static inline byte contData(byte c) {
			return byte(c & 0x3F);
		}

		// Append the payload of the continuation byte 'cont' to the partially decoded codepoint
		// 'prev'. Returns the updated codepoint.
		static inline nat addCont(nat prev, byte cont) {
			// Make room for the 6 new bits, then merge them in.
			const nat shifted = prev << 6;
			return shifted | contData(cont);
		}

		// Decode the first byte of an encoded codepoint. Returns the payload bits stored in that
		// byte, and sets 'remaining' to the number of continuation bytes that shall follow (each
		// to be merged using 'addCont'). Lead bytes announcing up to 5 continuation bytes are
		// accepted. If 'c' cannot start a codepoint, 'remaining' is set to 0 and
		// 'replacementChar' is returned.
		static inline nat firstData(byte c, nat &remaining) {
			// Single byte (0xxxxxxx): the byte is the entire codepoint.
			if ((c & 0x80) == 0) {
				remaining = 0;
				return nat(c);
			}

			// Multi-byte lead bytes: 'extra' continuation bytes are announced by (extra + 1)
			// one-bits followed by a zero-bit. The bits below that prefix are payload.
			for (nat extra = 1; extra <= 5; extra++) {
				const unsigned prefixMask = (0xFF00u >> (extra + 2)) & 0xFFu;
				const unsigned prefixTag = (prefixMask << 1) & 0xFFu;
				if ((c & prefixMask) == prefixTag) {
					remaining = extra;
					return nat(c & (0xFFu >> (extra + 2)));
				}
			}

			// Either a stray continuation byte (10xxxxxx) or one of 0xFE/0xFF: error!
			remaining = 0;
			return replacementChar;
		}

		// Largest number of bytes required for a single codepoint in UTF-8, including the null
		// terminator. This is the buffer size that 'encode' expects: it stores the terminator
		// in the last slot (index 'maxBytes - 1') and fills in multi-byte sequences backwards
		// from there.
		static const nat maxBytes = 9;

		// Encode the codepoint 'cp' in UTF-8, followed by a null terminator. 'buffer' is assumed
		// to hold at least 'maxBytes' entries. If 'count' is non-null, it receives the number of
		// bytes in the encoding (not counting the terminator). Returns a pointer to the first
		// byte of the encoding. Note that multi-byte encodings are placed at the end of 'buffer',
		// so the returned pointer is only equal to 'buffer' for single-byte codepoints. 'cp' is
		// encoded as given; no validity check is made here.
		static inline byte *encode(nat cp, byte *buffer, nat *count) {
			// Fast path: 1 byte codepoints are stored at the start of the buffer.
			if (cp < 0x80) {
				buffer[0] = byte(cp);
				buffer[1] = 0;
				if (count)
					*count = 1;
				return buffer;
			}

			// Multiple bytes: build the sequence backwards, starting with the terminator.
			byte *cursor = buffer + maxBytes - 1;
			*cursor = 0;

			// Number of payload bits that fit in the first byte for the current length.
			nat firstByteBits = 6;
			nat used = 0;
			for (;;) {
				// Emit the least significant 6 bits as a continuation byte.
				cursor -= 1;
				*cursor = byte(0x80 | (cp & 0x3F));
				cp = cp >> 6;
				used += 1;
				firstByteBits -= 1;

				// Stop as soon as what is left of 'cp' fits in the first byte.
				if (cp < (Nat(1) << firstByteBits))
					break;
			}

			// Emit the first byte: a run of one-bits indicating the length, then the remaining
			// bits of 'cp'.
			cursor -= 1;
			*cursor = byte((0xFF << (firstByteBits + 1)) | cp);
			used += 1;

			if (count)
				*count = used;
			return cursor;
		}

	}

}