File: utf8.c

package info (click to toggle)
referencer 1.2.2-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 4,028 kB
  • ctags: 2,265
  • sloc: ansic: 32,973; cpp: 12,149; python: 1,314; xml: 1,258; sh: 1,154; makefile: 252
file content (129 lines) | stat: -rw-r--r-- 3,401 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
/*
 * utf8.c
 *
 * Copyright (c) Chris Putnam 2004-5
 *
 * Source code released under the GPL
 *
 */
#include <stdio.h>
#include "utf8.h"

/* UTF-8 encryption

U-00000000 - U-0000007F:  0xxxxxxx 
U-00000080 - U-000007FF:  110xxxxx 10xxxxxx 
U-00000800 - U-0000FFFF:  1110xxxx 10xxxxxx 10xxxxxx 
U-00010000 - U-001FFFFF:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
U-00200000 - U-03FFFFFF:  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 
U-04000000 - U-7FFFFFFF:  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

*/

static void
utf8_build( unsigned int value, unsigned char out[6], int in_pos, int out_pos )
{
	unsigned int in_mask, out_mask;
	int byte = 0;
	while ( in_pos < 32 ) {
		in_mask = 1 << ( 31 - in_pos );
		out_mask = 1 << ( 7 - out_pos );
		if ( value & in_mask ) out[byte] |= out_mask;
		in_pos++;
		out_pos++;
		if ( out_pos > 7 ) {
			out_pos=2;
			byte++;
		}
	}
}

/* int utf8( in, out[6] );
 *
 *  in is character code 0x0 -> 0x7FFFFFFF
 *  int is number of characters for output
 *
 */
int
utf8_encode( unsigned int value, unsigned char out[6] )
{
	int i;
	for ( i=1; i<6; ++i ) out[i] = 0x80;  /* 10xxxxxx */
	if ( value < 0x80 ) {
		out[0] = 0x0;   /* 0xxxxxxx */
		utf8_build( value, out, 25, 1 );
		return 1;
	} else if ( value < 0x800 ) {
		out[0] = 0xC0;  /* 110xxxxx */
		utf8_build( value, out, 21, 3 );
		return 2;
	} else if ( value < 0x10000 ) {
		out[0] = 0xE0;  /* 1110xxxx */
		utf8_build( value, out, 16, 4 );
		return 3;
	} else if ( value < 0x200000 ) {
		out[0] = 0xF0;  /* 11110xxx */
		utf8_build( value, out, 11, 5 );
		return 4;
	} else if ( value < 0x4000000 ) {
		out[0] = 0xF8;  /* 111110xx */
		utf8_build( value, out,  6, 6 );
		return 5;
	} else if ( value < (unsigned int ) 0x80000000 ) {
		out[0] = 0xFC;  /* 1111110x */
		utf8_build( value, out,  1, 7 );
		return 6;
	} else {
		/* error, above 2^31 bits encodable by UTF-8 */
		return 0;
	}
}

unsigned int
utf8_decode( char *s, unsigned int *pi )
{
	unsigned int c;
	int i = *pi;
	/* one digit utf-8 */
	if ((s[i] & 128)== 0 ) {
		c = (unsigned int) s[i];
		i += 1;
	} else if ((s[i] & 224)== 192 ) { /* 110xxxxx & 111xxxxx == 110xxxxx */
		c = (( (unsigned int) s[i] & 31 ) << 6) +
			( (unsigned int) s[i+1] & 63 );
		i += 2;
	} else if ((s[i] & 240)== 224 ) { /* 1110xxxx & 1111xxxx == 1110xxxx */
		c = ( ( (unsigned int) s[i] & 15 ) << 12 ) + 
			( ( (unsigned int) s[i+1] & 63 ) << 6 ) +
		 	( (unsigned int) s[i+2] & 63 );
		i += 3;
	} else if ((s[i] & 248)== 240 ) { /* 11110xxx & 11111xxx == 11110xxx */
		c =  ( ( (unsigned int) s[i] & 7 ) << 18 ) +
			( ( (unsigned int) s[i+1] & 63 ) << 12 ) +
			( ( (unsigned int) s[i+2] & 63 ) << 6 ) +
			( (unsigned int) s[i+3] & 63 );
		i+= 4;
	} else if ((s[i] & 252)== 248 ) { /* 111110xx & 111111xx == 111110xx */
		c = ( ( (unsigned int) s[i] & 3 ) << 24 ) +
			( ( (unsigned int) s[i+1] & 63 ) << 18 ) +
			( ( (unsigned int) s[i+2] & 63 ) << 12 ) +
			( ( (unsigned int) s[i+3] & 63 ) << 6 ) +
			( (unsigned int) s[i+4] & 63 );
		i += 5;
	} else if ((s[i] & 254)== 252 ) { /* 1111110x & 1111111x == 1111110x */
		c = ( ( (unsigned int) s[i] & 1 ) << 30 ) + 
			( ( (unsigned int) s[i+1] & 63 ) << 24 ) +
			( ( (unsigned int) s[i+2] & 63 ) << 18 ) +
			( ( (unsigned int) s[i+3] & 63 ) << 12 ) +
			( ( (unsigned int) s[i+4] & 63 ) << 6 ) +
			( (unsigned int) s[i+5] & 63 );
		i += 6;
	} else {
		c = '?';
		i++;
	}
	*pi = i;
	return c;
}