File: unicode_to_utf8.c

package info (click to toggle)
jfsutils 1.1.15-7
  • links: PTS
  • area: main
  • in suites: forky, sid
  • size: 3,076 kB
  • sloc: ansic: 35,080; sh: 1,048; makefile: 81
file content (142 lines) | stat: -rw-r--r-- 2,708 bytes parent folder | download | duplicates (10)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
/*  The code in this module was taken from:
 *
 * linux/fs/nls.c
 *
 * Native language support--charsets and unicode translations.
 * By Gordon Chaffee 1996, 1997
 *
 */
#include <config.h>
#include "unicode_to_utf8.h"

/*
 * Sample implementation from Unicode home page.
 * http://www.stonehand.com/unicode/standard/fss-utf.html
 */
/*
 * Describes one UTF-8 sequence length.
 *
 *   cmask, cval - a lead byte b introduces a sequence of this length when
 *                 (b & cmask) == cval
 *   shift       - right shift applied to a code value to obtain the payload
 *                 bits stored in the lead byte (6 bits per trailing byte)
 *   lmask       - mask of the payload bits, i.e. the largest code value a
 *                 sequence of this length can carry
 *   lval        - smallest code value accepted for this length when decoding
 */
struct utf8_table {
	int cmask, cval;
	int shift;
	long lmask, lval;
};

/*
 * Rows are ordered from the shortest to the longest sequence.  The entry
 * whose cmask is zero terminates the table.
 */
static struct utf8_table utf8_table[] = {
	/* 1 byte sequence */
	{ .cmask = 0x80, .cval = 0x00, .shift = 0 * 6, .lmask = 0x7F,       .lval = 0 },
	/* 2 byte sequence */
	{ .cmask = 0xE0, .cval = 0xC0, .shift = 1 * 6, .lmask = 0x7FF,      .lval = 0x80 },
	/* 3 byte sequence */
	{ .cmask = 0xF0, .cval = 0xE0, .shift = 2 * 6, .lmask = 0xFFFF,     .lval = 0x800 },
	/* 4 byte sequence */
	{ .cmask = 0xF8, .cval = 0xF0, .shift = 3 * 6, .lmask = 0x1FFFFF,   .lval = 0x10000 },
	/* 5 byte sequence */
	{ .cmask = 0xFC, .cval = 0xF8, .shift = 4 * 6, .lmask = 0x3FFFFFF,  .lval = 0x200000 },
	/* 6 byte sequence */
	{ .cmask = 0xFE, .cval = 0xFC, .shift = 5 * 6, .lmask = 0x7FFFFFFF, .lval = 0x4000000 },
	/* end of table */
	{ .cmask = 0 }
};

/*
 * Encode one 16-bit Unicode value as a UTF-8 byte sequence.
 *
 *   s      - destination for the encoded bytes
 *   wc     - value to encode
 *   maxlen - number of bytes that may be stored at s
 *
 * Returns the number of bytes stored, 0 when s is a null pointer, or -1
 * when no sequence short enough for maxlen can hold wc (nothing is stored
 * in that case).
 *
 * NOTE: the search only stops when maxlen reaches exactly zero, so a
 * negative maxlen does not limit the output; callers presumably pass a
 * non-negative value.
 */
int Unicode_Character_to_UTF8_Character(uint8_t * s, uint16_t wc, int maxlen)
{
	const long code = wc;
	struct utf8_table *entry = utf8_table;
	int len = 0;

	if (!s)
		return 0;

	/* Try each sequence length in turn; every step costs one byte of budget. */
	while (entry->cmask && maxlen) {
		len++;
		if (code <= entry->lmask) {
			uint8_t *out = s;
			int shift = entry->shift;

			/* Lead byte: length marker plus the topmost payload bits. */
			*out++ = entry->cval | (code >> shift);

			/* Trailing bytes: 10xxxxxx, six payload bits each. */
			for (shift -= 6; shift >= 0; shift -= 6)
				*out++ = 0x80 | ((code >> shift) & 0x3F);

			return len;
		}
		entry++;
		maxlen--;
	}
	return -1;
}

/*
 * Convert a zero-terminated string of 16-bit Unicode values to UTF-8.
 *
 *   s      - destination buffer
 *   pwcs   - source string, terminated by a zero value
 *   maxlen - byte budget for the destination
 *
 * Conversion stops at the source terminator or once the budget is used up.
 * A character that cannot be encoded within the remaining budget is dropped
 * and charged one byte of budget.  No terminating zero byte is stored.
 *
 * Returns the number of bytes stored at s.
 */
int Unicode_String_to_UTF8_String(uint8_t * s, const uint16_t * pwcs, int maxlen)
{
	uint8_t *out = s;
	const uint16_t *in;

	for (in = pwcs; *in && maxlen > 0; in++) {
		int written;

		/* 7-bit values map to a single identical byte. */
		if (*in <= 0x7f) {
			*out++ = (uint8_t) *in;
			maxlen--;
			continue;
		}

		written = Unicode_Character_to_UTF8_Character(out, *in, maxlen);
		if (written == -1) {
			/* Ignore character and move on */
			maxlen--;
			continue;
		}

		out += written;
		maxlen -= written;
	}
	return (out - s);
}

/*
 * Decode the UTF-8 sequence that starts at s into one 16-bit Unicode value.
 *
 *   p      - receives the decoded value on success; untouched on failure
 *   s      - first byte of the sequence
 *   maxLen - number of bytes that may be read starting at s
 *
 * Returns the number of bytes consumed, or -1 when
 *   - the lead byte does not start any sequence,
 *   - a trailing byte is not of the form 10xxxxxx,
 *   - the sequence would extend past maxLen bytes,
 *   - the value is encoded with more bytes than necessary (overlong), or
 *   - the value is larger than 0xFFFF and therefore cannot be stored in *p.
 */
int UTF8_Character_To_Unicode_Character(uint16_t * p, const uint8_t * s, int maxLen)
{
	/*
	 * Unsigned so that the repeated "<< 6" below stays well defined even
	 * where long is only 32 bits wide and a 5/6 byte sequence is scanned.
	 */
	unsigned long l;
	int c0, c, nc;
	struct utf8_table *t;

	nc = 0;
	c0 = *s;
	l = c0;
	for (t = utf8_table; t->cmask; t++) {
		nc++;
		if ((c0 & t->cmask) == t->cval) {
			/* Strip the length marker bits of the lead byte. */
			l &= (unsigned long) t->lmask;
			/* Overlong encoding: a shorter sequence would have sufficed. */
			if (l < (unsigned long) t->lval)
				return -1;
			/*
			 * The result is a single 16-bit unit; storing a larger
			 * value would silently truncate it to a different
			 * character, so report it as undecodable instead.
			 */
			if (l > 0xFFFF)
				return -1;
			*p = (uint16_t) l;
			return nc;
		}
		/* The sequence is longer: make sure another byte may be read. */
		if (maxLen <= nc)
			return -1;
		s++;
		/* A valid trailing byte 10xxxxxx becomes 00xxxxxx after the XOR. */
		c = (*s ^ 0x80) & 0xFF;
		if (c & 0xC0)
			return -1;
		l = (l << 6) | (unsigned long) c;
	}
	return -1;
}

/*
 * Convert a zero-terminated UTF-8 string to 16-bit Unicode values.
 *
 *   pwcs   - destination buffer
 *   s      - source string, terminated by a zero byte
 *   maxLen - maximum number of source bytes to consume
 *
 * Conversion stops at the source terminator or after maxLen source bytes.
 * A byte that does not start a decodable sequence is skipped.  Every
 * consumed byte yields at most one output value, so the destination never
 * receives more than maxLen values.  No terminating zero value is stored.
 *
 * Returns the number of 16-bit values stored at pwcs.
 */
int UTF8_String_To_Unicode_String(uint16_t * pwcs, const uint8_t * s, int maxLen)
{
	uint16_t *op = pwcs;
	const uint8_t *ip = s;
	int size;

	while (*ip && maxLen > 0) {
		if (*ip & 0x80) {
			size = UTF8_Character_To_Unicode_Character(op, ip, maxLen);
			if (size <= 0) {
				/* Ignore character and move on */
				ip++;
				maxLen--;
			} else {
				/*
				 * Exactly one value was stored no matter how
				 * many source bytes the sequence occupied.
				 */
				op++;
				ip += size;
				maxLen -= size;
			}
		} else {
			/* Plain ASCII: one byte in, one value out. */
			*op++ = *ip++;
			maxLen--;
		}
	}
	return (op - pwcs);
}