File: utf8.c

package info (click to toggle)
mle 1.7.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,108 kB
  • sloc: ansic: 13,335; sh: 728; php: 228; makefile: 83
file content (99 lines) | stat: -rw-r--r-- 2,312 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
// Adapted from https://github.com/termbox/termbox/blob/a0e450500b3f07ddd172ac64e48a59129a8878fb/src/utf8.c

#include <stdint.h>

#include "mlbuf.h"

/*
 * Sequence length, in bytes, looked up by the value of the first byte.
 * Each row below covers 16 consecutive byte values starting at the offset
 * shown in the row comment.
 *
 *   0x00-0xBF -> 1   (includes 0x80-0xBF, which are not UTF-8 lead bytes)
 *   0xC0-0xDF -> 2
 *   0xE0-0xEF -> 3
 *   0xF0-0xF7 -> 4
 *   0xF8-0xFB -> 5
 *   0xFC-0xFD -> 6
 *   0xFE-0xFF -> 1
 */
static const unsigned char utf8_length[256] = {
    /* 0x00 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0x10 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0x20 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0x30 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0x40 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0x50 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0x60 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0x70 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0x80 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0x90 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0xA0 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0xB0 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0xC0 */ 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
    /* 0xD0 */ 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
    /* 0xE0 */ 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,
    /* 0xF0 */ 4,4,4,4, 4,4,4,4, 5,5,5,5, 6,6,1,1
};

/*
 * Bit mask applied to the first byte of a sequence to extract its payload
 * bits. Indexed by (sequence length - 1), so entry 0 is for 1-byte
 * sequences and entry 5 is for 6-byte sequences.
 */
static const unsigned char utf8_mask[6] = {
    [0] = 0x7f, /* 1 byte:  low 7 bits */
    [1] = 0x1f, /* 2 bytes: low 5 bits */
    [2] = 0x0f, /* 3 bytes: low 4 bits */
    [3] = 0x07, /* 4 bytes: low 3 bits */
    [4] = 0x03, /* 5 bytes: low 2 bits */
    [5] = 0x01  /* 6 bytes: low 1 bit  */
};

/*
 * Return the sequence length (1..6) that the utf8_length table assigns to
 * the byte `c`. The byte is reinterpreted as unsigned so that values
 * >= 0x80 index the table correctly even where plain char is signed.
 */
int utf8_char_length(char c) {
    const unsigned char lead = (unsigned char)c;

    return utf8_length[lead];
}

/*
 * Decode one character starting at `c` into `*out`.
 *
 * out:  receives the decoded value.
 * c:    first byte of the sequence; it is read unconditionally.
 * stop: optional exclusive upper bound for reads past the first byte;
 *       pass NULL to disable the bound.
 *
 * Returns -1 (leaving `*out` untouched) when the first byte is 0.
 * Otherwise returns the number of bytes consumed. This is the length
 * implied by the first byte, or fewer when `stop` was reached first, in
 * which case `*out` holds only the bits accumulated so far.
 *
 * Trailing bytes are not validated: their low 6 bits are merged in as-is.
 */
int utf8_char_to_unicode(uint32_t *out, const char *c, const char *stop) {
    unsigned char nbytes;
    uint32_t codepoint;
    int pos;

    if (*c == 0) {
        return -1;
    }

    nbytes = utf8_char_length(*c);
    /* Payload bits of the first byte; the mask depends on sequence length. */
    codepoint = c[0] & utf8_mask[nbytes - 1];

    pos = 1;
    while (pos < nbytes) {
        if (stop && c + pos >= stop) {
            /* Input ended early: report only the bytes actually read. */
            nbytes = (unsigned char)pos;
            break;
        }
        codepoint = (codepoint << 6) | (c[pos] & 0x3f);
        pos++;
    }

    *out = codepoint;
    return (int)nbytes;
}

/*
 * Encode the value `c` as a multi-byte sequence written to `out`.
 *
 * out: destination buffer; up to 6 bytes are written and no terminating
 *      NUL is appended.
 * c:   value to encode.
 *
 * Returns the number of bytes written (1..6), chosen by the magnitude
 * of `c`.
 */
int utf8_unicode_to_char(char *out, uint32_t c) {
    int nbytes;
    int lead;
    int pos;

    /* Pick the sequence length and the marker bits of the first byte. */
    if (c >= 0x4000000) {
        nbytes = 6;
        lead = 0xfc;
    } else if (c >= 0x200000) {
        nbytes = 5;
        lead = 0xf8;
    } else if (c >= 0x10000) {
        nbytes = 4;
        lead = 0xf0;
    } else if (c >= 0x800) {
        nbytes = 3;
        lead = 0xe0;
    } else if (c >= 0x80) {
        nbytes = 2;
        lead = 0xc0;
    } else {
        nbytes = 1;
        lead = 0;
    }

    /* Fill trailing bytes from last to first, 6 payload bits each. */
    pos = nbytes;
    while (--pos > 0) {
        out[pos] = (char)((c & 0x3f) | 0x80);
        c >>= 6;
    }
    /* Whatever bits remain go into the first byte alongside the marker. */
    out[0] = (char)(c | (uint32_t)lead);

    return nbytes;
}

/*
 * Count the characters in the `len`-byte buffer starting at `data`.
 *
 * Each step looks at the byte at the current position, advances by the
 * sequence length utf8_char_length() reports for it, and counts one
 * character. A final sequence whose reported length runs past `len` is
 * still counted as one character. Returns 0 when `len` is 0.
 *
 * Progress is tracked as a byte offset rather than a moving pointer so
 * that a truncated final sequence never forms a pointer beyond
 * one-past-the-end of the buffer, which would be undefined behaviour.
 */
size_t utf8_str_length(char *data, size_t len) {
    size_t offset = 0;
    size_t count = 0;

    while (offset < len) {
        offset += (size_t)utf8_char_length(data[offset]);
        count += 1;
    }
    return count;
}