File: unicode.c

package info (click to toggle)
cconv 0.6.2-1.3
  • links: PTS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,332 kB
  • sloc: cpp: 13,691; sh: 8,473; ansic: 758; makefile: 60
file content (122 lines) | stat: -rw-r--r-- 3,371 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
/*
 * Copyright (C) 2008, 2009
 * Free Software Foundation, Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * \author Yang Jianyu <xiaoyjy@hotmail.com>
 */

#include "unicode.h"

/*
 * Report how many bytes the UTF-8 sequence starting at w occupies, judged
 * only by the value of its first byte.
 *
 * No validation is performed: w[0] is simply compared against a list of
 * ascending upper bounds. In particular every byte in 0x80-0xDF yields 2,
 * which includes continuation bytes (0x80-0xBF) and the overlong lead
 * bytes 0xC0/0xC1. Only w[0] is read.
 *
 * \param w  pointer to the first byte of the sequence
 * \return   1..6 for a first byte below 0xFE, -1 for 0xFE and 0xFF
 */
int utf8_char_width(const unsigned char* w)
{
	/* upper_bound[i] is the exclusive limit for a width of i + 1 bytes. */
	static const unsigned char upper_bound[6] = {
		0x80, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe
	};
	const unsigned char lead = w[0];
	int width;

	for(width = 1; width <= 6; width++) {
		if(lead < upper_bound[width - 1]) {
			return width;
		}
	}

	return -1;
}

/*
 * Convert the UTF-8 sequence starting at w into an integer code point.
 *
 * Note the naming: despite "encode", this function reads UTF-8 bytes and
 * produces a code point.
 *
 * Accepted lead bytes and sequence lengths:
 *   00-7F  1 byte   (US-ASCII)
 *   C2-DF  2 bytes
 *   E0-EF  3 bytes
 *   F0-F7  4 bytes  (F5-F7 exceed U+10FFFF and are restricted by RFC 3629,
 *                    but are still accepted here)
 *   F8-FB  5 bytes  (restricted by RFC 3629, still accepted here)
 *   FC-FD  6 bytes  (restricted by RFC 3629, still accepted here)
 * Rejected lead bytes: 80-BF (continuation bytes), C0-C1 (overlong 2-byte
 * forms), FE-FF.
 *
 * Every trailing byte must have the form 10xxxxxx. Trailing bytes are
 * examined in order and scanning stops at the first one that does not
 * match, so a sequence cut short by a NUL terminator is rejected without
 * reading beyond that terminator. Overlong forms other than C0/C1 and
 * surrogate code points are not rejected.
 *
 * \param w  pointer to the first byte of the sequence
 * \param c  receives the code point on success, 0 on failure
 * \return   number of bytes consumed (1..6), or -1 if the sequence is
 *           malformed
 */
int utf8_char_encode(const unsigned char* w, int* c)
{
	const unsigned char lead = w[0];
	unsigned int value;
	int length;
	int i;

	/* 00-7F: single byte, the code point is the byte itself. */
	if(lead < 0x80) {
		*c = (int)lead;
		return 1;
	}

	/* Derive the sequence length and the payload bits of the lead byte. */
	if(lead < 0xc2) {
		length = 0;             /* 80-BF continuation, C0-C1 overlong */
		value  = 0;
	} else if(lead < 0xe0) {
		length = 2;
		value  = lead & 0x1f;
	} else if(lead < 0xf0) {
		length = 3;
		value  = lead & 0x0f;
	} else if(lead < 0xf8) {
		length = 4;
		value  = lead & 0x07;
	} else if(lead < 0xfc) {
		length = 5;
		value  = lead & 0x03;
	} else if(lead < 0xfe) {
		length = 6;
		value  = lead & 0x01;
	} else {
		length = 0;             /* FE-FF never appear in UTF-8 */
		value  = 0;
	}

	if(length == 0) {
		*c = 0;
		return -1;
	}

	/*
	 * Fold in six payload bits per trailing byte. Accumulating in an
	 * unsigned variable avoids shifting negative values; the widest
	 * result (6 bytes) carries 31 bits.
	 */
	for(i = 1; i < length; i++) {
		if((w[i] & 0xc0) != 0x80) {
			*c = 0;
			return -1;
		}
		value = (value << 6) | (unsigned int)(w[i] & 0x3f);
	}

	*c = (int)value;
	return length;
}

/*
 * Write the UTF-8 byte sequence for the code point `unicode` into utf.
 *
 * Note the naming: despite "decode", this function takes a code point and
 * produces UTF-8 bytes.
 *
 * Sequence length by value:
 *   0x00      .. 0x7F        1 byte
 *   0x80      .. 0x7FF       2 bytes
 *   0x800     .. 0xFFFF      3 bytes
 *   0x10000   .. 0x1FFFFF    4 bytes
 *   0x200000  .. 0x3FFFFFF   5 bytes
 *   0x4000000 .. 0x7FFFFFFF  6 bytes
 * The 5- and 6-byte forms, and 4-byte forms above U+10FFFF, are outside
 * RFC 3629 but are still produced. Surrogate code points are not rejected.
 *
 * Negative values are not code points and are rejected without touching
 * utf.
 *
 * \param unicode  code point to convert
 * \param utf      output buffer; must have room for the returned number of
 *                 bytes (at most 6). No terminating NUL is written.
 * \return         number of bytes written (1..6), or -1 if unicode is
 *                 negative
 */
int utf8_char_decode(int unicode, unsigned char* utf)
{
	/* Marker bits of the first byte, indexed by sequence length. */
	static const unsigned char lead_prefix[7] = {
		0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
	};
	int count;
	int i;

	if(unicode < 0)
		return -1;
	else if(unicode < 0x80)
		count = 1;
	else if(unicode < 0x800)
		count = 2;
	else if(unicode < 0x10000)
		count = 3;
	else if(unicode < 0x200000)
		count = 4;
	else if(unicode < 0x4000000)
		count = 5;
	else
		count = 6;

	/* Emit trailing bytes last-to-first, six payload bits each. */
	for(i = count - 1; i > 0; i--) {
		utf[i] = (unsigned char)(0x80 | (unicode & 0x3f));
		unicode >>= 6;
	}

	/* The remaining high bits share the first byte with the length marker. */
	utf[0] = (unsigned char)(lead_prefix[count] | unicode);

	return count;
}