File: utf8_validity_test.cc

package info (click to toggle)
protobuf 3.25.7-1
  • links: PTS
  • area: main
  • in suites: experimental
  • size: 46,004 kB
  • sloc: cpp: 204,412; java: 88,198; ansic: 81,264; objc: 58,434; cs: 27,303; python: 22,841; php: 11,408; ruby: 8,637; pascal: 3,333; xml: 2,333; sh: 1,331; makefile: 538; lisp: 86; awk: 17
file content (76 lines) | stat: -rw-r--r-- 3,536 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#include "utf8_validity.h"

#include "gtest/gtest.h"
#include "absl/strings/string_view.h"

namespace utf8_range {

TEST(Utf8Validity, SpanStructurallyValid) {
  // Table-driven check of the value SpanStructurallyValid returns for a set
  // of byte strings. The expected values below are consistent with "number of
  // leading bytes that form valid UTF-8" -- confirm the exact contract against
  // utf8_validity.h.
  struct SpanCase {
    int want;                // expected return value
    absl::string_view data;  // bytes handed to SpanStructurallyValid
    const char* what;        // short label shown if the expectation fails
  };

  const SpanCase kCases[] = {
      // Simple good strings: all four bytes are accepted.
      {4, "abcd", "ascii"},
      // Explicit length so the embedded NUL byte is part of the input.
      {4, absl::string_view("a\0cd", 4), "embedded NUL"},
      {4, "ab\xc2\x81", "2-byte"},
      {4, "a\xe2\x81\x81", "3-byte"},
      {4, "\xf2\x81\x81\x81", "4-byte"},

      // Simple bad strings: the expected value is the count of bytes that
      // precede the offending sequence.
      {3, "abc\x80", "bad char"},
      {3, "abc\xc2", "trunc 2"},
      {2, "ab\xe2\x81", "trunc 3"},
      {1, "a\xf2\x81\x81", "trunc 4"},
      {2, "ab\xc0\x81", "not 1"},
      {1, "a\xe0\x81\x81", "not 2"},
      {0, "\xf0\x81\x81\x81", "not 3"},
      {0, "\xf4\xbf\xbf\xbf", "big"},

      // Surrogate range endpoints.
      {0, "\xED\xA0\x80", "surrogate min U+D800"},
      {0, "\xED\xBF\xBF", "surrogate max U+DFFF"},

      // Non-shortest forms: every one of these must yield 0.
      {0, "\xc0\x80", "non-shortest c0 80"},
      {0, "\xc1\xbf", "non-shortest c1 bf"},
      {0, "\xe0\x80\x80", "non-shortest e0 80 80"},
      {0, "\xe0\x9f\xbf", "non-shortest e0 9f bf"},
      {0, "\xf0\x80\x80\x80", "non-shortest f0 80 80 80"},
      {0, "\xf0\x83\xbf\xbf", "non-shortest f0 83 bf bf"},

      // Regression input: left unchecked, this string caused GWS to crash in
      // 7/2006 (invalid sequence 0xc7 0xc8 0xcd 0xcb).
      {0, "\xc7\xc8\xcd\xcb", "GWS crash 7/2006"},
  };

  for (const SpanCase& entry : kCases) {
    SCOPED_TRACE(entry.what);  // identify the failing row in the test log
    EXPECT_EQ(entry.want, SpanStructurallyValid(entry.data));
  }
}

TEST(Utf8Validity, IsStructurallyValid) {
  // Checks IsStructurallyValid against two tables of byte strings: inputs it
  // must accept and inputs it must reject.
  struct Sample {
    absl::string_view data;  // bytes handed to IsStructurallyValid
    const char* what;        // short label shown if the expectation fails
  };

  // Simple good strings.
  const Sample kAccepted[] = {
      {"abcd", "ascii"},
      // Explicit length so the embedded NUL byte is part of the input.
      {absl::string_view("a\0cd", 4), "embedded NUL"},
      {"ab\xc2\x81", "2-byte"},
      {"a\xe2\x81\x81", "3-byte"},
      {"\xf2\x81\x81\x81", "4-byte"},
  };

  const Sample kRejected[] = {
      // Simple bad strings.
      {"abc\x80", "bad char"},
      {"abc\xc2", "trunc 2"},
      {"ab\xe2\x81", "trunc 3"},
      {"a\xf2\x81\x81", "trunc 4"},
      {"ab\xc0\x81", "not 1"},
      {"a\xe0\x81\x81", "not 2"},
      {"\xf0\x81\x81\x81", "not 3"},
      {"\xf4\xbf\xbf\xbf", "big"},

      // Surrogate range endpoints.
      {"\xED\xA0\x80", "surrogate min U+D800"},
      {"\xED\xBF\xBF", "surrogate max U+DFFF"},

      // Non-shortest forms: every one of these must be rejected.
      {"\xc0\x80", "non-shortest c0 80"},
      {"\xc1\xbf", "non-shortest c1 bf"},
      {"\xe0\x80\x80", "non-shortest e0 80 80"},
      {"\xe0\x9f\xbf", "non-shortest e0 9f bf"},
      {"\xf0\x80\x80\x80", "non-shortest f0 80 80 80"},
      {"\xf0\x83\xbf\xbf", "non-shortest f0 83 bf bf"},

      // Regression input: left unchecked, this string caused GWS to crash in
      // 7/2006 (invalid sequence 0xc7 0xc8 0xcd 0xcb).
      {"\xc7\xc8\xcd\xcb", "GWS crash 7/2006"},
  };

  for (const Sample& good : kAccepted) {
    SCOPED_TRACE(good.what);  // identify the failing row in the test log
    EXPECT_TRUE(IsStructurallyValid(good.data));
  }

  for (const Sample& bad : kRejected) {
    SCOPED_TRACE(bad.what);  // identify the failing row in the test log
    EXPECT_FALSE(IsStructurallyValid(bad.data));
  }
}

}  // namespace utf8_range