File: utf.c

package info (click to toggle)
golf 601.4.41-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 2,824 kB
  • sloc: ansic: 20,020; sh: 1,171; makefile: 292
file content (359 lines) | stat: -rw-r--r-- 13,805 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
// SPDX-License-Identifier: Apache-2.0
// Copyright 2018-2025 Gliim LLC. 
// Licensed under Apache License v2. See LICENSE file.
// On the web http://golf-lang.com/ - this file is part of Golf framework.

// 
// UTF-related module
//

//
// Based on UTF standard: https://www.rfc-editor.org/rfc/rfc3629
//

#include "golf.h"

//
// get original unicode from two surrogates. Return that value.
//
gg_num32 gg_make_from_utf_surrogate (gg_num32 u0, gg_num32 u1)
{
    GG_TRACE("");
    return (u0 << 10) + u1 + (0x10000 - (0xD800 << 10) - 0xDC00);
}

//
// Encode binary UTF string 'r' to unicode value 'u'. Return number of bytes of UTF picked up (1,2,3 or 4) or -1 if error
// 'e' is error text
//
gg_num gg_encode_utf (char *r, gg_num32 *u, char **e)
{
    GG_TRACE("");
    *e = GG_EMPTY_STRING;
    if ((r[0] & 128) == 0) 
    {
        // single byte, ascii
        *u = (gg_num32) r[0];
        return 1;
    }
    // These 3 else if must be in this order, because checking for 192 is true for both 224 and 240
    // so if 192 were first, 224 and 240 would never happen
    else if ((r[0] & 240) == 240)
    {
        // four byte
        *u = (r[0] & 7) << 18;
        if ((r[1] & 128) == 128) *u += ((r[1] & 63)<<12);
        else { *e = gg_strdup(GG_UTF_ERR_SECOND_BYTE); return -1;}
        if ((r[2] & 128) == 128) *u += ((r[2] & 63)<<6);
        else { *e = gg_strdup(GG_UTF_ERR_THIRD_BYTE); return -1;}
        if ((r[3] & 128) == 128) *u += (r[3] & 63);
        else { *e = gg_strdup(GG_UTF_ERR_FOURTH_BYTE); return -1;}
        return 4;
    }
    else if ((r[0] & 224) == 224)
    {
        // three byte
        *u = (r[0] & 15) << 12;
        if ((r[1] & 128) == 128) *u += ((r[1] & 63)<<6);
        else { *e = gg_strdup(GG_UTF_ERR_SECOND_BYTE); return -1;}
        if ((r[2] & 128) == 128) *u += (r[2] & 63);
        else { *e = gg_strdup(GG_UTF_ERR_THIRD_BYTE); return -1;}
        return 3;
    }
    else if ((r[0] & 192) == 192)
    {
        // two byte
        *u = (r[0] & 31) << 6;
        if ((r[1] & 128) == 128) *u += (r[1] & 63);
        else { *e = gg_strdup(GG_UTF_ERR_SECOND_BYTE); return -1;}
        return 2;
    }
    else
    {
        *e = gg_strdup(GG_UTF_INVALID); return -1;
    }
}

//
// Break unicode u into surrogate u0 and u1. 
// 
//
void gg_get_utf_surrogate (gg_num32 u, gg_num32 *u0, gg_num32 *u1)
{
    GG_TRACE("");
    *u0 = (0xD800 - (0x10000 >> 10)) + (u>>10);
    *u1 = 0xDC00 + (u&0x3FF);
}

//
// Decode 32 bit Unicode to 1,2,3 or 4 UTF string
// 'u' is unicode number of a character, range 0 to 10FFFF.
// 'r' is the result string of UTF (1,2,3 or 4 bytes), r must point to at least 5 bytes of memory (4+null)
// 'e' is error message. 
// returns -1 on error, number of bytes (excluding 0) if okay
//
gg_num gg_decode_utf (gg_num32 u, unsigned char *r, char **e)
{
    GG_TRACE("");
    *e = GG_EMPTY_STRING;
    if (u <= 0x7F) { r[0] = (unsigned char)u; return 1; } // single byte (ASCII)
    else if (u >= 0x80 && u <= 0x7FF) 
    { 
        // 192 = b11000000
        r[0] = 192+(u>>6); // >>6 to get the top 5 (out of 11) bits
        r[1] = 128+(u&63); // 63 is 111111 to extract lower 6 bits
        return 2; 
    }
    else if (u >= 0x800 && u <= 0xFFFF) 
    {
        if (u == 0xFEFF) {*e = gg_strdup(GG_UTF_ERR_ILLEGAL_CHARACTER_FEFF); return -1;}
        // 224 = 11100000
        r[0] = 224+(u>>12); // get top 4 (out of 16) bits
        r[1] = 128+((u>>6)&63); // get middle 6 from 4+6+6 group of bits
        r[2] = 128+(u&63); // get lower 6 bits
        return 3;
    }
    else if (u >= 0x10000 && u <= 0x10FFFF) 
    {
        // 240 = 11110000
        r[0] = 240+(u>>18); // get top 3 (out of 21) bits
        r[1] = 128+((u>>12)&63); // get 6 after 3 from 3+6+6+6 group of bits
        r[2] = 128+((u>>6)&63); // get 6 after second 6 in 3+6+6+6 group of bits
        r[3] = 128+(u&63); // get lower 6 bits
        return 4;
    }
    else { *e = gg_strdup(GG_UTF_ERR_OUT_OF_RANGE); return -1; }
    return -1; // just to satisfy compiler
}


//
// Get value of string representing hex in 4 digits at 'v'. If error, err will be filled and returns 0.
// Return value of hex.
//
gg_num32 gg_get_hex(char *v, char **err)
{
    GG_TRACE("");
    gg_num k;
    gg_num r = 0;
    for (k = 0; k < 4; k++)
    {
        if (*v >= '0' && *v <= '9') r += (*v - '0')*gg_topower(16,3-k);
            else if (*v >= 'a' && *v <= 'f') r += (*v - 'a' + 10)*gg_topower(16,3-k);
            else if (*v >= 'A' && *v <= 'F') r += (*v - 'A' + 10)*gg_topower(16,3-k);
            else { *err = GG_ERR_UTF_BADUTF; return 0;} // not a hex value
        v++;
    }
    return r;
}

//
// Obtain the string from text "val", which starts with a first byte after double quote. 
// This string can be name or a string value in text. Sets errm.
// quoted is 1 if string is double quoted (as is with text)
// returns pointer to first byte after the string or NULL on error
// If 'enc' is 0, do not enccode strings at all.
// alloced is true if val is allocated memory
//
char *gg_text_to_utf (char *val, char quoted, char **o_errm, char enc, bool alloced)
{
    GG_TRACE("");
    *o_errm = GG_EMPTY_STRING;
    gg_num i = 1; // if quoted, start right after first char (which is a quote)
    if (quoted == 0) i = 0; // if not quoted, start from the beginning of string

    gg_num pull = 0; // when interpreting escaped value, current tally of how many byte to copy current byte
                  // all val[] assignments subtract "pull" when being assigned

    if (enc == 0)
    {
        // this is no-encode
        while (val[i] != 0) // do not go past the end of string
        {
            if (val[i] == '\\' && val[i+1] != 0) i+=2; // if there is an escape, get passed both chars
            else if (val[i] == '"')  // found unescaped quote
            {
                // stop with unescaped quote, if our string is quoted, otherwise a quote is nothing special
                // and just continue until null char
                if (quoted == 1) break; else continue;
                i++;
            } else i++;
        }
    }
    else
    {
        // this is "encode", i.e. convert strings to binary format
        //
        // look for closing quote of end of string, or zero character. A quote inside is escaped and would be processed, so it would
        // not be caught here. Depending on whether quoted is 0 or 1, either one may be valid ending
        while (val[i] != '"' && val[i] != 0) 
        {
            // process escaped char
            if (val[i] == '\\')
            {
                i++; // move on to byte after the escape
                switch (val[i])
                {
                    // one byte escaped. we put in one bytes instead of two, so pull increases by 1
                    case '"': val[i-1-pull] = val[i]; pull++; break;
                    case '\\': val[i-1-pull] = val[i]; pull++; break;
                    case '/': val[i-1-pull] = val[i]; pull++; break;
                    case 'b': val[i-1-pull] = '\b'; pull++; break;
                    case 'f': val[i-1-pull] = '\f'; pull++; break;
                    case 'n': val[i-1-pull] = '\n'; pull++; break;
                    case 'r': val[i-1-pull] = '\r'; pull++; break;
                    case 't': val[i-1-pull] = '\t'; pull++; break;
                    // 5 bytes escaped for UTF-8 encoding
                    case 'u': 
                    {
                        gg_num c = i;
                        gg_num totjv = 6;
                        c++;
                        gg_num r = gg_get_hex (val + c, o_errm);
                        if ((*o_errm)[0] != 0) return NULL;
                        gg_num32 rtot;
                        if (r >= 0xD800 && r <= 0xDFFF)  
                        { 
                            totjv += 6;
                            c+=4; // get passed XXXX\u
                            if (val[c] != '\\' || val[c+1] != 'u')
                            {
                                GG_ERR0;
                                *o_errm = gg_strdup(GG_ERR_UTF_SURROGATE); return NULL;
                            }
                            gg_num r1 = gg_get_hex (val + c + 2 , o_errm); // c+2 to skip \u
                            if ((*o_errm)[0] != 0) return NULL;
                            rtot = gg_make_from_utf_surrogate (r, r1);
                        } else rtot = r;
                        // turn unicode to binary
                        gg_num32 bytes = gg_decode_utf (rtot, (unsigned char*) (val+i-1-pull), o_errm);
                        if ((*o_errm)[0] != 0) return NULL;
                        pull += totjv - bytes; // binary is less space than \uXXXX 
                        i += totjv - 1 - 1; // 1 for getting passed \ and one for i++ further down
                        
                        break;
                    }
                    default: { GG_ERR0; *o_errm = gg_strdup(GG_ERR_UTF_BADESCAPE); return NULL;} // unknown escape sequence
                }
                i++; // to account for \, ", /, b, f, u etc.
            }
            else
            {
                // normal character, if no escapes just proceed to the end
                if (pull !=0) val[i-pull] = val[i];
                i++;
            }
        }
    }
    if (pull != 0) 
    {
        val[i-pull] = 0; // if pulled whole string back, finish it with 0
        if (alloced) gg_mem_set_len (gg_mem_get_id(val), i-pull+1); // set it before gg_puts because it needs length to output below (if so)
    }
    if (val[i] == 0 && quoted == 1) { GG_ERR0; *o_errm = gg_strdup(GG_ERR_UTF_NOQUOTE); return NULL;} // must have double quote at the end if quoted set
    return val+i; // return where the end is
}



//
// Convert binary utf val of len 'len' into text/unicode text 'resptr' (which is allocated here), 
// any error in err (also set here)
// If error, return -1, otherwise length of res
// len can be -1 in which case it's computed
//
gg_num gg_utf_to_text (char *val, gg_num len, char **resptr, char **err)
{
    GG_TRACE("");

    *err = GG_EMPTY_STRING;
    gg_num id = gg_mem_get_id(val);
    if (len == -1) len = gg_mem_get_len(id);
    else if (len > gg_mem_get_len(id)) gg_report_error ("Memory used is of length [%ld] but only [%ld] allocated", len, gg_mem_get_len(id));


    // allocate 3x memory, worst case scenario 2-byte utf to 6 byte unicode like \uXXXX
    // or 4-byte utf to 2x 6 byte surrogate unicodes. For others it's only 2x (such as \t)
    char *res = (char*)gg_malloc(3*len + 1);
    res[0] = 0; // by default it's empty in case of error
    if (resptr != NULL) *resptr = res;

    // note use of sprintf/like here is okay, as we have guaranteed enough space to write into (see above gg_malloc)
    gg_num i;
    gg_num r = 0;
    for (i = 0; i < len; i++)
    {
        if ((val[i] & 192) == 192)
        {
            // this is the beginning of utf sequence of bytes (2,3 or 4 bytes)
            // create \uXXXX (and possibly a surrogate pair)
            gg_num32 u;
            gg_num bytes = gg_encode_utf (val+i, &u, err);
            if (bytes == -1) return -1; else i+=(bytes-1); // since for loop will do i++
            if (u >= 0x10000) { // this means we need a surrogate pair
                gg_num32 u0;
                gg_num32 u1;
                gg_get_utf_surrogate (u, &u0, &u1);
                //
                //Use direct memory manipulation instead of sprintf which is many times slower
                //
                //sprintf (res+r, "\\u%04x", u0);
                res[r] = '\\';
                res[r+1] = 'u';
                GG_HEX_FROM_INT16(res+r+2,u0);
                r+=6; // no need to set res[r] to 0, since we continue below
                //sprintf (res+r, "\\u%04x", u1);
                res[r] = '\\';
                res[r+1] = 'u';
                GG_HEX_FROM_INT16(res+r+2,u1);
                r+=6;
                res[r] = 0;
            }
            else
            {
                //
                //Use direct memory manipulation instead of sprintf which is many times slower
                //
                //sprintf (res+r, "\\u%04x", u);
                res[r] = '\\';
                res[r+1] = 'u';
                GG_HEX_FROM_INT16(res+r+2,u);
                r+=6;
                res[r] = 0;
            }
            continue;
        }
        else
        {
            switch (val[i])
            {
                // one byte escaped. we put in one bytes instead of two, so pull increases by 1
                case '"': memcpy (res+r, "\\\"", 2); r+=2; break;
                case '\\': memcpy (res+r, "\\\\", 2); r+=2; break;
                //
                // solidus is not encoded but for decoding it's recognized - this is common implementation
                //
                //case '/':  memcpy (res+r, "\\/", 2); r+=2;break;
                case '\b': memcpy (res+r, "\\b", 2); r+=2; break;
                case '\f': memcpy (res+r, "\\f", 2); r+=2; break;
                case '\n': memcpy (res+r, "\\n", 2); r+=2; break;
                case '\r': memcpy (res+r, "\\r", 2); r+=2; break;
                case '\t': memcpy (res+r, "\\t", 2); r+=2; break;
                default: { res[r++] = val[i]; } // just copy over
            }
        }
    }
    res[r] = 0;
    gg_mem_set_len (gg_mem_get_id(res), r+1); // set it before gg_puts because it needs length to output below (if so)
    // if "to" is not used in utf-text, output string
    if (resptr == NULL) 
    {
        gg_num wres = gg_puts (GG_NOENC, res, r, true);
        if (wres < 0) GG_TRACE ("Error in writing direct, error [%s]", strerror(errno)); else GG_TRACE("Wrote direct [%ld] bytes", wres);
        gg_free_int (res); // free output string
    }
    return r;
}