File: unicode.c

package info (click to toggle)
unrtf 0.21.5-3
  • links: PTS, VCS
  • area: main
  • in suites: jessie-kfreebsd
  • size: 4,940 kB
  • sloc: ansic: 5,022; sh: 4,279; makefile: 112
file content (145 lines) | stat: -rw-r--r-- 4,372 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
/*----------------------------------------------------------------------
 * Module name:    unicode
 * Author name:    Arkadiusz Firus
 * Create date:    09 Nov 08
 * Purpose:        unicode translations
 *----------------------------------------------------------------------
 * Changes:
 * 04 Jan 10, daved@physiol.usyd.edu.au: null terminate strings in
 *		unicode_to_string
 * 21 Aug 10, daved@physiol.usyd.edu.au: test feof() rather than EOF for
 *              AIX support
 * 07 Oct 11, jf@dockes.org: major changes including change of
 *	get_unicode_char to get_unicode_utf8
 *--------------------------------------------------------------------*/
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>

#include "malloc.h"

/*========================================================================
 * Name		get_unicode
 * Purpose:	Translates string like U221E or 221E to int value
 * Args:	Unicode character.
 * Returns:	Unicode number.
 *=======================================================================*/
int
get_unicode(char *string)
{
    unsigned long uc;
    if (string[0] == 'U' || string[0] == 'u')
        string++;
    uc = strtoul(string, 0, 16);
    return uc;
}

/*========================================================================
 * Name		unicode_to_utf8
 * Purpose:	Translates unicode number to UTF-8 string
 * Args:	Unicode number.
 * Returns:	malloced UTF-8 string
 *=======================================================================*/
char *
unicode_to_utf8(unsigned int uc)
{
        unsigned char *string;
	if (uc < 0x7f)
	{
		string = my_malloc(2 * sizeof(char));
		string[0] = (unsigned char) uc;
		string[1] = '\0';
	}
	else if (uc < 0x7ff)
	{
		string = my_malloc(3 * sizeof(char));
		string[0] = (unsigned char) 192 + (uc / 64);
		string[1] = (unsigned char) 128 + (uc % 64);
		string[2] = '\0';
	}
	else if (uc < 0xffff)
	{
		string = my_malloc(4 * sizeof(char));
		string[0] = (unsigned char) 224 + (uc / (64 * 64));
		string[1] = (unsigned char) 128 + ((uc / 64) % 64);
		string[2] = (unsigned char) 128 + (uc % 64);
		string[3] = '\0';
	}
	else if (uc < 0x1FFFFF)
	{
		string = my_malloc(5 * sizeof(char));
		string[0] = (unsigned char) 240 + (uc / (64 * 64 * 64));
		string[1] = (unsigned char) 128 + ((uc / (64 * 64)) % 64);
		string[2] = (unsigned char) 128 + ((uc / 64) % 64);
		string[3] = (unsigned char) 128 + (uc % 64);
		string[4] = '\0';
	}
	else if (uc < 0x3FFFFFF)
	{
		string = my_malloc(6 * sizeof(char));
		string[0] = (unsigned char) 248 + (uc / (64 * 64 * 64 * 64));
		string[1] = (unsigned char) 128 + ((uc / (64 * 64 * 64)) % 64);
		string[2] = (unsigned char) 128 + ((uc / (64 * 64)) % 64);
		string[3] = (unsigned char) 128 + ((uc / 64) % 64);
		string[4] = (unsigned char) 128 + (uc % 64);
		string[5] = '\0';
	}
	else if (uc < 0x7FFFFFFF)
	{
		string = my_malloc(7 * sizeof(char));
		string[0] = (unsigned char) 252 + (uc / (64 * 64 * 64 * 64 * 64));
		string[1] = (unsigned char) 128 + ((uc / (64 * 64 * 64 * 64)) % 64);
		string[2] = (unsigned char) 128 + ((uc / (64 * 64 * 64)) % 64);
		string[3] = (unsigned char) 128 + ((uc / (64 * 64)) % 64);
		string[4] = (unsigned char) 128 + ((uc / 64) % 64);
		string[5] = (unsigned char) 128 + (uc % 64);
		string[6] = '\0';
	}

	return string;
}

/*========================================================================
 * Name		get_unicode_int
 * Purpose:	Reads unicode character (in format <UN...N> and translates
		it to printable unicode character.
 * Caution:	This function should be executed after char '<'  was read.
		It reads until char '>' was found or EOL or EOF.
 * Args:	File to read from.
 * Returns:	Unicode character encoded in UTF-8
 *=======================================================================*/

int
get_unicode_int(FILE *file)
{
	int allocated = 5, len = 0, uc;
	char c, *unicode_number = my_malloc(allocated * sizeof(char));

	c = fgetc(file);

	while (c != '>' && c != '\n' && !feof(file) && !ferror(file))
	{
		unicode_number[len] = c;
		c = fgetc(file);
		len++;

		if (len == allocated)
		{
			allocated *= 2;
			unicode_number = my_realloc(unicode_number, allocated / 2, allocated);
		}
	}

	if (c != '>')
		ungetc(c, file);

	unicode_number[len] = '\0';
	return get_unicode(unicode_number);
}

char *
get_unicode_utf8(FILE *file)
{
    int uc = get_unicode_int(file);
    return unicode_to_utf8(uc);
}