File: unicode_casefold_test.cpp

package info (click to toggle)
boost1.90 1.90.0-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 593,120 kB
  • sloc: cpp: 4,190,908; xml: 196,648; python: 34,618; ansic: 23,145; asm: 5,468; sh: 3,774; makefile: 1,161; perl: 1,020; sql: 728; ruby: 676; yacc: 478; java: 77; lisp: 24; csh: 6
file content (204 lines) | stat: -rw-r--r-- 6,046 bytes parent folder | download | duplicates (11)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
/*
 *
 * Copyright (c) 2021 John Maddock
 * Copyright (c) 2021 Daniel Kruegler
 *
 * Use, modification and distribution are subject to the 
 * Boost Software License, Version 1.0. (See accompanying file 
 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 *
 */
 
 /*
  *   LOCATION:    see http://www.boost.org for most recent version.
  *   FILE         unicode_casefold_test.cpp
  *   VERSION      see <boost/version.hpp>
  *   DESCRIPTION: Simple test suite for Unicode case folding.
  */

#include <boost/regex/config.hpp>
#include <boost/detail/lightweight_main.hpp>
#include "../test_macros.hpp"

#if defined(BOOST_HAS_ICU)

#include <boost/regex/icu.hpp>

#include <utility>

#include <unicode/uversion.h>
#include <unicode/uchar.h>

typedef std::pair<int, int> unicode_verinfo;

// Returns the Unicode version reported by ICU's
// u_getUnicodeVersion() as a (major, minor) pair. Some spot 
// test cases are only valid from a specific Unicode version 
// onwards and compare against this value.
unicode_verinfo get_unicode_version()
{
  // Zero-initialise so that every component has a defined value
  // before ICU fills the array in.
  UVersionInfo icu_unicode_version = {};
  u_getUnicodeVersion(icu_unicode_version);
  // Only the first two components (major, minor) are of interest.
  return unicode_verinfo(icu_unicode_version[0], icu_unicode_version[1]);
}

void latin_1_checks()
{
  typedef boost::icu_regex_traits traits_type;
  traits_type traits;

  // Test range [U+0000, U+0041): Identity fold
  for (traits_type::char_type c = 0x0; c < 0x41; ++c)
  {
    traits_type::char_type nc = traits.translate_nocase(c);
    BOOST_CHECK_EQUAL(nc, c);
  }

  // Test ASCII upper case letters [A, Z]: Each character folds 
  // to its lowercase variant:
  for (traits_type::char_type c = 0x41; c <= 0x5A; ++c)
  {
    traits_type::char_type nc = traits.translate_nocase(c);
    const int shift = 0x61 - 0x41;
    BOOST_CHECK_EQUAL(nc, c + shift);
    BOOST_CHECK_EQUAL(nc, traits.tolower(c));
  }

  // Test range (U+005A, U+00B5): Identity fold
  for (traits_type::char_type c = 0x5A + 1; c < 0xB5; ++c)
  {
    traits_type::char_type nc = traits.translate_nocase(c);
    BOOST_CHECK_EQUAL(nc, c);
  }

  // U+00B5 maps to its decomposition GREEK SMALL LETTER MU 
  // (U+03BC):
  {
    traits_type::char_type c = 0xB5;
    traits_type::char_type nc = traits.translate_nocase(c);
    BOOST_CHECK_EQUAL(nc, 0x03BC);
  }

  // Test range (U+00B5, U+00BF]: Identity fold
  for (traits_type::char_type c = 0xB5 + 1; c <= 0xBF; ++c)
  {
    traits_type::char_type nc = traits.translate_nocase(c);
    BOOST_CHECK_EQUAL(nc, c);
  }

  // Test range [U+00C0, U+00D6]: Each character folds 
  // to its lowercase variant:
  for (traits_type::char_type c = 0xC0; c <= 0xD6; ++c)
  {
    traits_type::char_type nc = traits.translate_nocase(c);
    traits_type::char_type lc = traits.tolower(c);
    BOOST_CHECK_EQUAL(nc, lc);
    BOOST_CHECK_NE(nc, c);
  }

  // U+00D7: Identity fold
  {
    traits_type::char_type c = 0xD7;
    traits_type::char_type nc = traits.translate_nocase(c);
    BOOST_CHECK_EQUAL(nc, c);
  }

  // Test range [U+00D8, U+00DE]: Each character folds 
  // to its lowercase variant:
  for (traits_type::char_type c = 0xD8; c <= 0xDE; ++c)
  {
    traits_type::char_type nc = traits.translate_nocase(c);
    traits_type::char_type lc = traits.tolower(c);
    BOOST_CHECK_EQUAL(nc, lc);
    BOOST_CHECK_NE(nc, c);
  }

  // Test range [U+00DF, U+00BF]: Identity fold
  // Note that case folding of U+00DF (LATIN SMALL 
  // LETTER SHARP S) does not fold to U+1E9E (LATIN 
  // CAPITAL LETTER SHARP S) due to case folding 
  // stability contract
  for (traits_type::char_type c = 0xDF; c <= 0xFF; ++c)
  {
    traits_type::char_type nc = traits.translate_nocase(c);
    BOOST_CHECK_EQUAL(nc, c);
  }
}

void spot_checks()
{
  // test specific values ripped straight out of the Unicode standard
  // to verify that our case folding is the same as theirs:
  typedef boost::icu_regex_traits traits_type;
  traits_type traits;

  const unicode_verinfo unicode_version = get_unicode_version();

  // 'LATIN CAPITAL LETTER SHARP S' folds to
  // 'LATIN SMALL LETTER SHARP S'
  if (unicode_version >= unicode_verinfo(5, 1))
  {
    traits_type::char_type c = 0x1E9E;
    traits_type::char_type nc = traits.translate_nocase(c);
    traits_type::char_type lc = traits.tolower(c);
    BOOST_CHECK_EQUAL(nc, lc);
    BOOST_CHECK_EQUAL(nc, 0xDF);
  }

  // Capital sigma (U+03A3) is the uppercase form of both the regular (U+03C2) 
  // and final (U+03C3) lowercase sigma. All these characters exists since
  // Unicode 1.1.0.
  {
    traits_type::char_type c = 0x03A3;
    traits_type::char_type nc = traits.translate_nocase(c);
    traits_type::char_type lc = traits.tolower(c);
    BOOST_CHECK_EQUAL(nc, lc);
    BOOST_CHECK_EQUAL(nc, 0x03C3);
    c = 0x03C2;
    nc = traits.translate_nocase(c);
    BOOST_CHECK_EQUAL(nc, 0x03C3);
    c = 0x03C3;
    nc = traits.translate_nocase(c);
    BOOST_CHECK_EQUAL(nc, c);
  }

  // In Turkish languages the lowercase letter 'i' (U+0069) maps to an 
  // uppercase dotted I (U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE), 
  // while the uppercase letter 'I' (U+0049) maps to the dotless lowercase 
  // i (U+0131). The Unicode simple default mapping folds U+0130 to itself, 
  // but folds U+0049 to U+0069.
  {
    traits_type::char_type c = 0x0130;
    traits_type::char_type nc = traits.translate_nocase(c);
    BOOST_CHECK_EQUAL(nc, c);
    c = 0x0049;
    nc = traits.translate_nocase(c);
    traits_type::char_type lc = traits.tolower(c);
    BOOST_CHECK_EQUAL(nc, lc);
    BOOST_CHECK_EQUAL(nc, 0x0069);
  }

  // Cherokee small letters were added with Unicode 8.0,
  // but the upper case letters existed before, therefore
  // the small letters case fold to upper case letters.
  if (unicode_version >= unicode_verinfo(8, 0))
  {
    traits_type::char_type c = 0x13F8;
    traits_type::char_type nc = traits.translate_nocase(c);
    traits_type::char_type uc = traits.toupper(c);
    BOOST_CHECK_EQUAL(nc, uc);
    BOOST_CHECK_EQUAL(nc, 0x13F0);
  }

}

#endif

// Test driver: runs all case folding checks when BOOST_HAS_ICU 
// is defined (otherwise no checks are run) and returns the 
// value of boost::report_errors().
int cpp_main( int, char* [] ) 
{
#ifdef BOOST_HAS_ICU
  latin_1_checks();
  spot_checks();
#endif
  const int result = boost::report_errors();
  return result;
}