File: foundation_utf8.c

package info (click to toggle)
haskell-foundation 0.0.30-5
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 928 kB
  • sloc: haskell: 9,124; ansic: 570; makefile: 6
file content (85 lines) | stat: -rw-r--r-- 2,292 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "foundation_prim.h"

#if 0
/* Bit 7 of every byte in a 64-bit word: set on every non-ASCII UTF-8 byte. */
static const uint64_t utf8_mask_80 = UINT64_C(0x8080808080808080);
/* Bit 6 of every byte in a 64-bit word. */
static const uint64_t utf8_mask_40 = UINT64_C(0x4040404040404040);

/*
 * Machine word used for the word-at-a-time scan.
 * NOTE(review): the masks above are 64 bits wide while `unsigned long` is only
 * 32 bits on ILP32/LLP64 targets -- confirm every user copes with that.
 */
typedef unsigned long pu;

/* Number of set bits in an `unsigned long`. */
#define POPCOUNT(x) __builtin_popcountl(x)

/* True when pointer `p` sits on a sizeof(pu) boundary (despite the "8"). */
#define ALIGNED8(p) ((((uintptr_t) (p)) % sizeof(pu)) == 0)

/**
 * Count the UTF-8 code points stored in bytes [start_offset, end_offset) of p8.
 *
 * Every byte that is not a continuation byte (0b10xxxxxx) starts a code point,
 * so the result is the number of non-continuation bytes in the range. The
 * input is not validated.
 *
 * @param p8            base pointer of the byte buffer
 * @param start_offset  offset of the first byte to inspect
 * @param end_offset    offset one past the last byte to inspect
 * @return number of code points; 0 when the range is empty or inverted
 */
FsCountOf foundation_utf8_length(uint8_t *p8, const FsOffset start_offset, const FsOffset end_offset)
{
    FsCountOf n = 0;

    /* Empty or inverted range: nothing to count (and keeps `end - p8` below non-negative). */
    if (end_offset <= start_offset) {
        return n;
    }

    const uint8_t *end = p8 + end_offset;
    p8 += start_offset;

    /* Byte-wise prefix up to an 8-byte boundary, so the bulk loop below reads
     * from aligned addresses. This only matters for speed: the word load goes
     * through memcpy and is valid at any alignment. */
    while (p8 < end && ((uintptr_t) p8 & (sizeof(uint64_t) - 1)) != 0) {
        if ((*p8++ & 0xc0) != 0x80) { n++; }
    }

    /* Bulk: 8 bytes per iteration. The word is always exactly 64 bits so that
     * it matches utf8_mask_80 regardless of how wide `unsigned long` is.
     * Comparing the remaining length avoids forming a pointer past `end`. */
    while ((size_t) (end - p8) >= sizeof(uint64_t)) {
        uint64_t word;
        memcpy(&word, p8, sizeof word); /* aliasing-safe load */
        p8 += sizeof word;

        const uint64_t high = word & utf8_mask_80;

        /* No byte has bit 7 set: eight ASCII characters. */
        if (high == 0) {
            n += sizeof word;
            continue;
        }

        /* ASCII bytes are those without bit 7. */
        const int nb_ascii = (int) sizeof word - __builtin_popcountll(high);
        /* Lead bytes have bits 7 and 6 set: shift the bit-7 flags onto bit 6
         * and keep the ones where bit 6 is set as well. */
        const int nb_lead = __builtin_popcountll(word & (high >> 1));
        n += nb_ascii + nb_lead;
    }

    /* Byte-wise tail (fewer than 8 bytes left). */
    while (p8 < end) {
        if ((*p8++ & 0xc0) != 0x80) { n++; }
    }

    return n;
}

/* True when byte `x` is a UTF-8 continuation byte (0b10xxxxxx). */
#define IS_CONT(x) (((x) & 0xc0) == 0x80)

/**
 * Check the structure of the UTF-8 sequences in bytes [offset, end) of c.
 *
 * Only the lead-byte / continuation-byte layout is checked; overlong
 * encodings, surrogates and values above U+10FFFF are not rejected.
 *
 * @param c       base pointer of the byte buffer
 * @param offset  index of the first byte to check
 * @param end     index one past the last byte to check
 * @return 0 if every sequence is structurally valid,
 *         1 on an invalid byte (stray continuation, lead byte announcing more
 *           than 4 bytes, or a missing continuation byte),
 *         2 if the last sequence is cut off by `end`
 */
int foundation_utf8_validate(const uint8_t *c, size_t offset, size_t end)
{
    while (offset < end) {
        const uint8_t lead = c[offset];
        size_t nb_cont; /* continuation bytes announced by the lead byte */

        /* ASCII */
        if (lead < 0x80) {
            offset++;
            continue;
        }

        if (lead < 0xC0) {
            /* continuation byte where a lead byte is expected */
            return 1;
        } else if (lead < 0xE0) {
            nb_cont = 1; /* 2-byte sequence */
        } else if (lead < 0xF0) {
            nb_cont = 2; /* 3-byte sequence */
        } else if (lead < 0xF8) {
            nb_cont = 3; /* 4-byte sequence */
        } else {
            /* 0xF8..0xFF: 5/6-byte forms and 0xFE/0xFF are never valid */
            return 1;
        }

        /* offset < end here, so the subtraction cannot wrap. */
        if (end - offset <= nb_cont) {
            return 2;
        }

        for (size_t i = 1; i <= nb_cont; i++) {
            if (!IS_CONT(c[offset + i])) {
                return 1;
            }
        }
        offset += nb_cont + 1;
    }
    return 0;
}
#endif