File: unicode.h

package info (click to toggle)
simutrans 121.0-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 18,716 kB
  • sloc: cpp: 143,173; ansic: 2,783; makefile: 872; sh: 508
file content (72 lines) | stat: -rw-r--r-- 2,171 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#ifndef UNICODE_H
#define UNICODE_H

#include <stddef.h>
#include "simtypes.h"

// Unicode type large enough to hold every single possible Unicode code point.
typedef uint32 utf32;

typedef unsigned char  utf8;
typedef unsigned short utf16;

extern utf32 const UNICODE_NUL;

/**
 * UTF-8 string decoder that can be used to iterate through all code points.
 */
class utf8_decoder_t
{
private:
	// Pointer to UTF-8 formated C string.
	utf8 const *utf8str;
public:
	// Constructs a UTF-8 decoder for the given C string.
	utf8_decoder_t(utf8 const *str);

	/** 
	 * Decodes a Unicode code point from the byte sequence pointed to by buff.
	 * On return buff has been advanced to point at the beginning of the next Unicode code point.
	 * Does not respect NUL terminator character, care should be taken to detect the emmited UNICODE_NUL when decoding C strings to avoid buffer over run errors.
	 * Invalid Unicode sequences are intepreted using ISO-8859-1 and advance buff 1 byte.
	 */
	static utf32 decode(utf8 const *&buff);

	/** 
	 * Decodes a Unicode code point from the byte sequence pointed to by buff.
	 * On return len contains the length of the Unicode character in bytes.
	 * Does not respect NUL terminator character, care should be taken to detect the emmited UNICODE_NUL when decoding C strings to avoid buffer over run errors.
	 * Invalid Unicode sequences are intepreted using ISO-8859-1 with a len of 1.
	 */
	static utf32 decode(utf8 const *const buff, size_t &len);

	/**
	 * Returns true if there are more code points left to decode.
	 * Returns false if at end of string.
	 */
	bool has_next() const;

	/**
	 * Returns the next Unicode code point value in the string.
	 * Returns UNICODE_NUL if has_next returns false.
	 */
	utf32 next();

	/**
	 * Returns the current position of the decoder.
	 * This is a pointer to the next character.
	 */
	utf8 const *get_position();
};

size_t utf8_get_next_char(const utf8 *text, size_t pos);
sint32 utf8_get_prev_char(const utf8 *text, sint32 pos);

int	utf16_to_utf8(utf16 unicode, utf8 *out);

// returns latin2 or 0 for error
uint8 unicode_to_latin2( utf16 chr );
utf16 latin2_to_unicode( uint8 chr );


#endif