// File: utf8_validate.cpp
//
// Package info:
//   hyperscan 5.4.2-4
//   links: PTS, VCS
//   area: main
//   in suites: forky, sid
//   size: 12,304 kB
//   sloc: cpp: 143,324; ansic: 41,041; python: 621; sh: 32; makefile: 12
// File content: 122 lines, 4,070 bytes, mode -rw-r--r-- (3 duplicates)
// (Line-number gutter 1-122 from the original source listing omitted.)
/*
 * Copyright (c) 2015-2022, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#include "parser/utf8_validate.h"

#include "ue2common.h"
#include "util/string_util.h"

#include "gtest/gtest.h"

using namespace testing;
using namespace ue2;

// One table-driven test case for the UTF-8 validator: an input byte string
// paired with the verdict the test expects for it. Kept as a plain aggregate
// so that cases can be written as brace-initialised {"bytes", expected} rows.
struct ValidUtf8TestInfo {
    // Raw bytes to validate; passed to isValidUtf8() as (c_str(), size()).
    std::string str;
    // Expected return value of isValidUtf8() for str.
    bool is_valid;
};

// Printer hook for gtest: writes a test case to *os in the form
// ("<str>", <is_valid>), where <str> is the result of printable(t.str)
// (presumably an escaped, human-readable rendering -- see util/string_util.h)
// and <is_valid> is streamed as a bool.
static
void PrintTo(const ValidUtf8TestInfo &t, ::std::ostream *os) {
    ::std::ostream &out = *os;
    out << "(\"";
    out << printable(t.str);
    out << "\", ";
    out << t.is_valid;
    out << ")";
}

// Table of (input bytes, expected validity) cases fed to the parameterised
// ValidUtf8Test. Expectations follow RFC 3629: well-formed sequences are
// valid; stray/forbidden bytes, overlong encodings, broken continuation bytes
// and encoded UTF-16 surrogates are not.
//
// NOTE: every string here is built from a C string literal, so none of the
// cases can contain an embedded NUL byte.
static ValidUtf8TestInfo valid_utf8_tests[] = {
    // Trivial ASCII.
    {"foobar", true},
    {"0", true},
    {"\x7e", true},
    {"hatstand\tteakettle\tbadgerbrush\n", true},

    // Some valid UTF-8: Movie titles!
    {"À bout de souffle", true},
    {"拳銃は俺のパスポート", true},
    {"大醉俠", true},
    {"龙门客栈", true},
    {"공동경비구역", true},
    {"জলসাঘর", true},

    // Valid one-byte cases.
    {"\x7f", true}, // \x7f is valid

    // These bytes should never appear in a UTF-8 stream: 0xc0 and 0xc1 could
    // only start overlong two-byte forms, and 0xf5..0xff would encode values
    // beyond U+10FFFF or are not lead bytes at all. Each byte is listed
    // exactly once.
    {"\xc0", false},
    {"\xc1", false},
    {"\xf5", false},
    {"\xf6", false},
    {"\xf7", false},
    {"\xf8", false},
    {"\xf9", false},
    {"\xfa", false},
    {"\xfb", false},
    {"\xfc", false},
    {"\xfd", false},
    {"\xfe", false},
    {"\xff", false},

    // Examples from RFC-3629 section 7.
    {"\x41\xe2\x89\xa2\xce\x91\x2e", true},
    {"\xed\x95\x9c\xea\xb5\xad\xec\x96\xb4", true},
    {"\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e", true},
    {"\xef\xbb\xbf\xf0\xa3\x8e\xb4", true},

    // Examples from RFC-3629 section 10. (security concerns)
    {"/../", true},
    {"\x2f\xc0\xae\x2e\x2f", false}, // overlong

    // Overlong encodings: 'a' (U+0061) padded out to more bytes than the
    // shortest form needs. Every byte after the lead is a well-formed
    // continuation byte, so only an overlong check can reject these.
    {"\xc1\xa1", false}, // 'a' as two bytes
    {"\xe0\x81\xa1", false}, // 'a' as three bytes
    {"\xf0\x80\x81\xa1", false}, // 'a' as four bytes

    // Overlong-style lead bytes whose final byte (0xc1) is not a
    // continuation byte.
    {"\xc0\xc1", false},
    {"\xe0\x80\xc1", false},
    {"\xf0\x80\x80\xc1", false},

    // Invalid continuation bytes.
    {"\xd1\xf1", false},
    {"\xef\xbf\xf1", false},
    {"\xef\xf1\xbf", false},
    {"\xf1\xf1\xf1\xf1", false},
    {"\xf1\xbf\xbf\xf1", false},
    {"\xf1\xbf\xf1\xbf", false},
    {"\xf1\xf1\xbf\xbf", false},

    // UTF-16 surrogates
    {"\xed\xa0\x80", false}, // U+D800
    {"\xed\xb2\x80", false}, // U+DC80
};

class ValidUtf8Test : public TestWithParam<ValidUtf8TestInfo> {};

// Instantiate the ValidUtf8Test fixture over every entry of valid_utf8_tests,
// using "ValidUtf8" as the instantiation name prefix.
// NOTE(review): newer GoogleTest releases spell this macro
// INSTANTIATE_TEST_SUITE_P; confirm the gtest version in use before renaming.
INSTANTIATE_TEST_CASE_P(ValidUtf8, ValidUtf8Test, ValuesIn(valid_utf8_tests));

// For the current table entry, run isValidUtf8() over the entry's bytes
// (pointer + explicit length) and require the result to equal the expected
// verdict. The scoped trace attaches the printable form of the input to any
// failure reported from this test body.
TEST_P(ValidUtf8Test, check) {
    const ValidUtf8TestInfo &info = GetParam();

    testing::Message trace;
    trace << "String is: " << printable(info.str);
    SCOPED_TRACE(trace);

    ASSERT_EQ(info.is_valid, isValidUtf8(info.str.c_str(), info.str.size()));
}