File: utf8conv.c

package info (click to toggle)
tdom 0.7.8-5
  • links: PTS
  • area: main
  • in suites: etch, etch-m68k
  • size: 5,404 kB
  • ctags: 3,312
  • sloc: ansic: 38,842; xml: 18,244; tcl: 3,704; sh: 2,994; makefile: 58; cpp: 22
file content (206 lines) | stat: -rw-r--r-- 5,946 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
/*---------------------------------------------------------------------------
|   Copyright (C) 1999  Jochen C. Loewer (loewerj@hotmail.com)
+----------------------------------------------------------------------------
|
|   $Header: /usr/local/pubcvs/tdom/generic/utf8conv.c,v 1.1.1.1 2002/02/22 01:05:35 rolf Exp $
|
|
|   Functions which (try to) convert UTF-8 encoded Unicode strings back
|   to some 8-bit encodings such as ISO-8859-*, ...
|
|
|   The contents of this file are subject to the Mozilla Public License
|   Version 1.1 (the "License"); you may not use this file except in
|   compliance with the License. You may obtain a copy of the License at
|   http://www.mozilla.org/MPL/
|
|   Software distributed under the License is distributed on an "AS IS"
|   basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|   License for the specific language governing rights and limitations
|   under the License.
|
|   The Original Code is tDOM.
|
|   The Initial Developer of the Original Code is Jochen Loewer
|   Portions created by Jochen Loewer are Copyright (C) 1998, 1999
|   Jochen Loewer. All Rights Reserved.
|
|   Contributor(s):
|
|
|   $Log: utf8conv.c,v $
|   Revision 1.1.1.1  2002/02/22 01:05:35  rolf
|   tDOM0.7test with Jochens first set of patches
|
|
|
|   written by Jochen Loewer
|   November, 1999
|
\--------------------------------------------------------------------------*/



/*---------------------------------------------------------------------------
|   Includes
|
\--------------------------------------------------------------------------*/
#include <tcl.h>
#include <stdlib.h>
#include <string.h>
#include <utf8conv.h>

/*---------------------------------------------------------------------------
|   Defines
|
\--------------------------------------------------------------------------*/
/* Debug trace hook: expands to nothing, so DBG(...) statements are
 * compiled out. */
#define DBG(x)

/* Values of the 'type' field of an encoding rule as used by
 * tdom_Utf8to8Bit below:
 *   ENC_END       terminates a rule list.
 *   ENC_MAP       the code point is translated through the rule's map table.
 *   ENC_IDENTITY  not tested explicitly in this file; any matching rule that
 *                 is not ENC_MAP emits the low 8 bits of the code point
 *                 (presumably the rule tables in encodings.inc use this
 *                 value for such ranges). */
#define ENC_END       0
#define ENC_IDENTITY  1
#define ENC_MAP       2

/* Case-insensitive string comparison: stricmp under MSVC, strcasecmp
 * everywhere else. */
#if defined(_MSC_VER)
# define STRCASECMP(a,b)  stricmp (a,b)
#else
# define STRCASECMP(a,b)  strcasecmp (a,b)
#endif


/*---------------------------------------------------------------------------
|   Static Globals
|
\--------------------------------------------------------------------------*/
#include "encodings.inc"



/*---------------------------------------------------------------------------
|   tdom_GetEncoding  -  Looks up the encoding table for the given encoding
|                        name. The name is compared case-insensitively
|                        against the entries of TDOM_UnicodeTo8bitEncodings;
|                        the list is walked until an entry with a NULL name
|                        is reached.
|
|                        Returns a pointer to the matching table entry, or
|                        NULL if nothing was found or if name is NULL.
|
\--------------------------------------------------------------------------*/
TEncoding * 
tdom_GetEncoding (
    char  * name
)
{
    TEncoding *encoding;

    if (name == NULL) {
        /* passing NULL to the string compare would be undefined behaviour */
        return NULL;
    }

    for (encoding = TDOM_UnicodeTo8bitEncodings;
         encoding && encoding->name;
         encoding++) {
        DBG(fprintf(stderr, "encoding=%p encoding->name='%s' name='%s'\n",
                             (void*)encoding, encoding->name, name);)
        if (STRCASECMP(encoding->name,name)==0) {
            return encoding;
        }
    }
    return NULL;
}


/*---------------------------------------------------------------------------
|   tdom_GetEncodingName  -  Reverse lookup: returns the name of the given
|                            encoding if it is one of the entries of
|                            TDOM_UnicodeTo8bitEncodings (compared by
|                            address), otherwise NULL.
|
\--------------------------------------------------------------------------*/
char *
tdom_GetEncodingName (TEncoding *encoding) 
{
    TEncoding *entry;

    /* the list ends at the first entry without a name */
    for (entry = TDOM_UnicodeTo8bitEncodings; entry && entry->name; entry++) {
        if (entry != encoding) {
            continue;
        }
        return (char*) entry->name;
    }
    return NULL;
}
    

/*---------------------------------------------------------------------------
|   utf8DecodeChar  -  Decodes the UTF-8 sequence starting at in (which
|                      must be < end) and stores the resulting code point
|                      in *unicode. Never reads at or beyond end.
|
|                      A lead byte that is not followed by the required
|                      number of continuation bytes (10xxxxxx) inside the
|                      buffer, a stray continuation byte and the bytes
|                      0xF8..0xFF are passed through as a single character
|                      with the value of that byte. Overlong forms are not
|                      rejected.
|
|                      Returns the number of input bytes consumed (1..4).
|
\--------------------------------------------------------------------------*/
static int
utf8DecodeChar (
    const unsigned char * in,
    const unsigned char * end,
    int                 * unicode
)
{
    int    byte  = in[0];
    size_t avail = (size_t)(end - in);

    if (byte >= 0xC0 && byte < 0xE0) {
        /* 110xxxxx 10xxxxxx */
        if (avail >= 2 && (in[1] & 0xC0) == 0x80) {
            *unicode = ((byte & 0x1F) << 6) | (in[1] & 0x3F);
            return 2;
        }
    } else if (byte >= 0xE0 && byte < 0xF0) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        if (avail >= 3 && (in[1] & 0xC0) == 0x80
                       && (in[2] & 0xC0) == 0x80) {
            *unicode =  ((byte  & 0x0F) << 12)
                      | ((in[1] & 0x3F) << 6 )
                      | ((in[2] & 0x3F)      );
            return 3;
        }
    } else if (byte >= 0xF0 && byte < 0xF8) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        if (avail >= 4 && (in[1] & 0xC0) == 0x80
                       && (in[2] & 0xC0) == 0x80
                       && (in[3] & 0xC0) == 0x80) {
            *unicode =  ((byte  & 0x07) << 18)
                      | ((in[1] & 0x3F) << 12)
                      | ((in[2] & 0x3F) << 6 )
                      | ((in[3] & 0x3F)      );
            return 4;
        }
    }

    /* plain 7-bit character, or a byte that does not start a complete
       sequence: take the byte value itself */
    *unicode = byte;
    return 1;
}


/*---------------------------------------------------------------------------
|   tdom_Utf8to8Bit  -  Converts a UTF-8 encoded string with byte length
|                       *len to an 8-bit encoding using the specified
|                       encoding.
|
|       encoding     rule table to use; if NULL nothing is converted and
|                    the string is left as UTF-8.
|       utf8_string  buffer holding the UTF-8 bytes. The conversion is done
|                    IN PLACE: the buffer is written through this pointer
|                    although it is const-qualified, so it must refer to
|                    writable memory.
|       len          in: number of bytes in utf8_string;
|                    out: number of 8-bit characters written.
|
|   Every decoded character produces exactly one output byte: the mapped
|   byte for an ENC_MAP rule, the low 8 bits of the code point for any
|   other matching rule, or encoding->fallback_char if no rule covers the
|   code point. If the result is shorter than the input a terminating
|   '\0' is stored behind it.
|
\--------------------------------------------------------------------------*/
void 
tdom_Utf8to8Bit (
    TEncoding  * encoding,
    const char * utf8_string,
    int        * len
)
{
    unsigned char  *in, *end, *out;
    TEncodingRule  *rule;
    int             unicode;
        
        
    if (encoding == NULL) {
       /* don't convert; keep UTF-8 */
       return;
    }
    if (utf8_string == NULL || len == NULL) {
       /* nothing to convert */
       return;
    }
         
    in  = (unsigned char*) utf8_string;
    out = (unsigned char*) utf8_string;
    end = in + *len;
    
    /* each round consumes at least one input byte and writes exactly one
       output byte, so out never overtakes in */
    while (in < end) {

        /* extract unicode character from (multiple) UTF-8 bytes */
        in += utf8DecodeChar (in, end, &unicode);

        /* find the first rule whose code range covers the character */
        for (rule = encoding->rules; rule && rule->type != ENC_END; rule++) {
            if (   (unicode >= rule->start_code) 
                && (unicode < (rule->start_code + rule->len)) ) {
                break;
            }
        }

        /* convert unicode character to 8bit representation */
        if (rule == NULL || rule->type == ENC_END) {
            /* no rule found, use fallback */
            *out++ = encoding->fallback_char & 0x0FF;
        } else if (rule->type == ENC_MAP) {
            *out++ = rule->map[unicode - rule->start_code];
        } else {
            *out++ = unicode & 0xFF;
        }
    }
    if (out < end) {
        *out = '\0';
    }
    *len = (int)( (char*)out - utf8_string);
}