File: test_utf.cpp

package info (click to toggle)
c4core 0.2.7-1
  • links: PTS
  • area: main
  • in suites: forky, sid
  • size: 5,184 kB
  • sloc: cpp: 35,521; python: 2,786; javascript: 414; makefile: 6
file content (150 lines) | stat: -rw-r--r-- 4,455 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#include "c4/test.hpp"
#ifndef C4CORE_SINGLE_HEADER
#include "c4/std/string.hpp"
#include "c4/std/vector.hpp"
#include "c4/format.hpp"
#include "c4/utf.hpp"
#endif

#include "c4/libtest/supprwarn_push.hpp"

#include <cstring>

namespace c4 {

/** One entry of the UTF test table used by the tests in this file. */
struct utft
{
    /// Textual code point. The decode test checks that it begins with "U+"
    /// and decodes what follows after stripping leading zeros.
    csubstr code_point;
    /// Expected output of decode_code_point() for this entry. May be empty,
    /// in which case the decode test skips the comparison.
    csubstr character;
    /// Numeric value associated with the character. In this file it is only
    /// printed in failure diagnostics.
    /// NOTE(review): exact meaning is defined by utfchars.inc -- confirm there.
    uint32_t character_val;
    /// Textual counterpart of character_val (hexadecimal, judging by the
    /// name). In this file it is only printed in failure diagnostics.
    csubstr character_val_hex;
};
// Table of test entries iterated by the decode test below. The initializers
// are pulled in from utfchars.inc (not shown here); since they are pasted
// inside this array initializer, each one is expected to initialize a utft
// in field-declaration order.
constexpr const utft utf_chars[] = {
#include "./utfchars.inc"
};

/** Decode every code point listed in utf_chars and compare the result
 * with the expected character stored in the same table entry. */
TEST_CASE("utf.decode_code_point")
{
    char decoded_buf[64];
    constexpr size_t num_chars = sizeof(utf_chars) / sizeof(utf_chars[0]);
    // The index is advanced by the loop header, not inside the body.
    // doctest's INFO() captures its arguments by reference and formats them
    // lazily (only when a check fails), so incrementing the counter right
    // after INFO() would make failure messages report the index of the
    // *next* entry. (Assumes INFO comes from doctest via c4/test.hpp; the
    // change is harmless if the macro happens to be eager.)
    for(size_t i = 0; i < num_chars; ++i)
    {
        // bind by reference: no need to copy the entry on every iteration
        const utft &uc = utf_chars[i];
        INFO("utfchars[", i, "]: codepoint=", uc.code_point, ' ',
             "character=", uc.character.empty() ? csubstr{} : uc.character, ' ',
             "val=", uc.character_val_hex, '(', uc.character_val, ')');
        // drop the two-character "U+" prefix and any leading zeros
        csubstr cpstr = uc.code_point.sub(2).triml('0');
        if(cpstr.empty())
            continue; // nothing left to decode once the zeros are stripped
        csubstr decoded = decode_code_point(decoded_buf, cpstr);
        CHECK_UNARY(uc.code_point.begins_with("U+"));
        if(uc.character.empty())
            continue; // the table has no expected character for this entry
        CHECK_EQ(decoded.len, uc.character.len);
        CHECK_EQ(decoded, uc.character);
    }
}

/** Check the BOM helpers on a buffer that starts with a byte-order mark.
 *
 * @param input buffer expected to begin with @p bom
 * @param bom   the byte-order mark that prefixes @p input
 *
 * The same expectations are verified through a read-only view and through
 * a writeable view of @p input.
 */
void test_with_bom(substr input, csubstr bom)
{
    CHECK_UNARY(input.begins_with(bom));
    // number of characters that must remain once the BOM is skipped
    const size_t rest_len = input.len - bom.len;
    // read-only view
    {
        csubstr ro = input;
        CHECK_EQ(first_non_bom(ro), bom.len);
        CHECK_EQ(get_bom(ro).len, bom.len);
        CHECK_EQ(get_bom(ro), bom);
        // both results must point into the inspected buffer
        CHECK_UNARY(get_bom(ro).is_sub(ro));
        CHECK_UNARY(skip_bom(ro).is_sub(ro));
        CHECK_EQ(skip_bom(ro).len, rest_len);
    }
    // writeable view
    {
        substr rw = input;
        CHECK_EQ(first_non_bom(rw), bom.len);
        CHECK_EQ(get_bom(rw), bom);
        CHECK_EQ(get_bom(rw).len, bom.len);
        // both results must point into the inspected buffer
        CHECK_UNARY(get_bom(rw).is_sub(rw));
        CHECK_UNARY(skip_bom(rw).is_sub(rw));
        CHECK_EQ(skip_bom(rw).len, rest_len);
    }
}

/** Check the BOM helpers on a buffer that does NOT start with @p bom.
 *
 * @param input buffer expected not to begin with @p bom
 * @param bom   the byte-order mark that must be absent from the start
 *
 * With no mark present, the first non-BOM position must be 0, the reported
 * BOM must be empty, and skipping the BOM must keep the full length. This is
 * verified through a read-only view and through a writeable view.
 */
void test_without_bom(substr input, csubstr bom)
{
    CHECK_UNARY(!input.begins_with(bom));
    // read-only view
    {
        csubstr ro = input;
        CHECK_EQ(first_non_bom(ro), 0);
        CHECK_EQ(get_bom(ro).len, 0);
        CHECK_EQ(skip_bom(ro).len, ro.len);
    }
    // writeable view
    {
        substr rw = input;
        CHECK_EQ(first_non_bom(rw), 0);
        CHECK_EQ(get_bom(rw).len, 0);
        CHECK_EQ(skip_bom(rw).len, rw.len);
    }
}


/** Run the BOM helpers against every listed byte-order mark, combined with
 * payloads of several lengths, both with and without the mark prepended. */
TEST_CASE("utf.bom")
{
    // payloads of increasing length, starting with the empty string
    const csubstr payloads[] = {
        csubstr(""),
        csubstr("1"),
        csubstr("12"),
        csubstr("123"),
        csubstr("1234"),
        csubstr("12345"),
    };
    // https://en.wikipedia.org/wiki/Byte_order_mark#Byte-order_marks_by_encoding
    struct bom_entry {
        csubstr encoding;    // human-readable encoding name, for diagnostics
        csubstr mark;        // the byte-order mark itself
        size_t expected_len; // number of bytes the mark must span
    };
    const bom_entry entries[] = {
        bom_entry{csubstr("UTF-8"      ), csubstr("\xEF\xBB\xBF"), 3},
        bom_entry{csubstr("UTF-16 (BE)"), csubstr("\xFE\xFF"), 2},
        bom_entry{csubstr("UTF-16 (LE)"), csubstr("\xFF\xFE"), 2},
        bom_entry{csubstr("UTF-32 (BE)"), csubstr("\x00\x00\xFE\xFF"), 4},
        bom_entry{csubstr("UTF-32 (LE)"), csubstr("\xFF\xFE\x00\x00"), 4},
        bom_entry{csubstr("UTF-7"      ), csubstr("\x2B\x2F\x76"), 3},
        bom_entry{csubstr("UTF-1"      ), csubstr("\xF7\x64\x4C"), 3},
        bom_entry{csubstr("UTF-EBCDIC" ), csubstr("\xDD\x73\x66\x73"), 4},
        bom_entry{csubstr("SCSU"       ), csubstr("\x0E\xFE\xFF"), 3},
        bom_entry{csubstr("BOCU-1"     ), csubstr("\xFB\xEE\x28"), 3},
        bom_entry{csubstr("GB18030"    ), csubstr("\x84\x31\x95\x33"), 4},
    };
    // scratch buffer reused for every (mark, payload) combination
    std::string scratch;
    for(const bom_entry &entry : entries)
    {
        csubstr encoding = entry.encoding;
        csubstr mark = entry.mark;
        INFO("name=[", encoding.len, "]~~~", encoding, "~~~");
        INFO("bom=[", mark.len, "]~~~", mark, "~~~");
        // guards against a mark literal being cut short (e.g. at a '\0' byte)
        CHECK_EQ(entry.expected_len, mark.len);
        for(const csubstr &payload : payloads)
        {
            INFO("str=[", payload.len, "]~~~", payload, "~~~");
            // payload alone: no mark must be detected
            {
                scratch.assign(payload.str, payload.len);
                CHECK_EQ(scratch.size(), payload.len);
                test_without_bom(to_substr(scratch), mark);
            }
            // mark followed by payload: the mark must be detected
            {
                scratch.assign(mark.str, mark.len);
                CHECK_EQ(scratch.size(), mark.len);
                scratch.append(payload.str, payload.len);
                CHECK_EQ(scratch.size(), mark.len + payload.len);
                INFO("buf=[", scratch.size(), "]~~~", scratch, "~~~");
                test_with_bom(to_substr(scratch), mark);
            }
        }
    }

}

} // namespace c4