File: DataValidationTest.java

package info (click to toggle)
openjdk-21 21.0.8%2B9-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 823,976 kB
  • sloc: java: 5,613,338; xml: 1,643,607; cpp: 1,296,296; ansic: 420,291; asm: 404,850; objc: 20,994; sh: 15,271; javascript: 11,245; python: 6,895; makefile: 2,362; perl: 357; awk: 351; sed: 172; jsp: 24; csh: 3
file content (447 lines) | stat: -rw-r--r-- 15,835 bytes parent folder | download | duplicates (11)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
/*
 * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
/*
 * test
 * bug  4221795
 * summary Confirm *.icu data using ICU4J Normalizer
 */

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.BitSet;
import java.util.StringTokenizer;

import com.ibm.icu.text.Normalizer;
import com.ibm.icu.impl.NormalizerImpl;

/**
 * This is not a test program but a data validation utility.
 * Two data files for Normalizer, unorm.icu and uprops.icu under
 * sun/text/resources, are generated using generators in ICU4C 3.2 on a
 * BIG-ENDIAN machine. Before using them with java.text.Normalizer and
 * sun.text.Normalizer, you may want to validate these data files.
 * You can test the data files using the Normalizer in ICU4J 3.2. Download
 * ICU4J 3.2 and run this program with -cp <ICU4J 3.2>.
 */
public class DataValidationTest {

    //
    // Options to be used with com.ibm.icu.text.Normalizer
    //

    /*
     * Default Unicode 3.2.0 normalization.
     *
     *   - With Corrigendum 4 fix
     *     (Different from Mustang's Normalizer.)
     *   - With Public Review Issue #29 fix
     *     (Different from Mustang's Normalizer.)
     */
    private static final int UNICODE_3_2_0 = Normalizer.UNICODE_3_2;

    /*
     * *Incomplete* Unicode 3.2.0 normalization for IDNA/StringPrep.
     *
     *   - With Corrigendum 4 fix
     *   - Without Public Review Issue #29 fix
     *
     * ICU4J's Normalizer itself doesn't support normalization for Unicode 3.2.0
     * without Corrigendum 4 fix, which is necessary for IDNA/StringPrep. It is
     * done in StringPrep. Therefore, we don't test that normalization in this
     * test program. We merely test normalization for Unicode 3.2.0 without
     * Public Review Issue #29 fix with this test program.
     */
    private static final int UNICODE_3_2_0_BEFORE_PRI_29 =
                                 Normalizer.UNICODE_3_2 |
                                 NormalizerImpl.BEFORE_PRI_29;

    /*
     * Default normalization.
     *
     *   - Unicode 4.0.1
     *     (Different from Mustang's Normalizer.)
     *   - With Corrigendum 4 fix
     *   - With Public Review Issue #29 fix
     *     (Different from Mustang's Normalizer.)
     *
     * Because Public Review Issue #29 is fixed in Unicode 4.1.0, I think that
     * ICU4J 3.2 should not support it. But it actually supports PRI #29 fix
     * as default....
     */
    private static final int UNICODE_LATEST = 0x00;

    /*
     * Normalization without Public Review Issue #29 fix.
     *
     *   - Unicode 4.0.1
     *   - Without Corrigendum 4 fix
     *   - Without Public Review Issue #29 fix
     */
    static final int UNICODE_LATEST_BEFORE_PRI_29 =
                         NormalizerImpl.BEFORE_PRI_29;

    //
    // Conformance test datafiles
    //

    /*
     * Conformance test datafile for normalization for Unicode 3.2.0 with
     * Corrigendum 4 corrections. This is NOT an original Conformance test
     * data. Some inconvenient test cases are commented out.
     * About corrigendum 4, please refer
     *   http://www.unicode.org/versions/corrigendum4.html
     *
     * ICU4J 3.2's Normalizer itself doesn't support normalization for Unicode
     * 3.2.0 without Corrigendum 4 corrections. StringPrep helps it. So, we
     * don't test the normalization with this test program.
     */
    static final String DATA_3_2_0 = "NormalizationTest-3.2.0.Corrigendum4.txt";

    /*
     * Conformance test datafile for the latest Unicode which is supported
     * by J2SE.
     */
    static final String DATA_LATEST = "NormalizationTest-Latest.txt";

   /*
    * Decoder used to read the (UTF-8) conformance datafiles
    */
    static final CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder();

   /*
    * Set of code points listed in Part1 of the datafile currently under
    * test; used to pick up the characters which are NOT listed in Part1.
    */
    static BitSet charList = new BitSet(Character.MAX_CODE_POINT+1);

   /*
    * Shortcuts
    */
    static final Normalizer.Mode NFC  = com.ibm.icu.text.Normalizer.NFC;
    static final Normalizer.Mode NFD  = com.ibm.icu.text.Normalizer.NFD;
    static final Normalizer.Mode NFKC = com.ibm.icu.text.Normalizer.NFKC;
    static final Normalizer.Mode NFKD = com.ibm.icu.text.Normalizer.NFKD;
    static final Normalizer.Mode[] modes = {NFC, NFD, NFKC, NFKD};


    /*
     * Runs every supported (datafile, option) combination. Mismatches are
     * reported on System.err; only unreadable or malformed datafiles make
     * this method throw.
     */
    public static void main(String[] args) throws Exception {
        test(DATA_3_2_0, UNICODE_3_2_0);
        test(DATA_3_2_0, UNICODE_3_2_0_BEFORE_PRI_29);
        test(DATA_LATEST, UNICODE_LATEST);
        // This test started failing since ICU4J 3.6.
//      test(DATA_LATEST, UNICODE_LATEST_BEFORE_PRI_29);

        /* Unconformity test */
//      test(DATA_3_2_0, UNICODE_LATEST);
//      test(DATA_LATEST, UNICODE_3_2_0);
    }

    /*
     * Reads one conformance datafile line by line and checks normalize()
     * and isNormalized() for all four modes against every data line.
     * For UNICODE_LATEST, the characters which are not listed in Part1 are
     * checked afterwards as well.
     *
     *   filename    conformance datafile to read
     *   unicodeVer  option bits passed to com.ibm.icu.text.Normalizer
     */
    private static void test(String filename, int unicodeVer) throws Exception {

        // charList describes Part1 of the file being read now; bits left
        // over from a previously tested file must not hide characters from
        // testRemainingChars().
        charList.clear();

        // try-with-resources closes both streams even if a line is malformed.
        try (FileInputStream fis = new FileInputStream(filename);
             BufferedReader in =
                 new BufferedReader(new InputStreamReader(fis, decoder))) {

            boolean unicode32 =
                (unicodeVer & Normalizer.UNICODE_3_2) != 0;
            // BEFORE_PRI_29 selects the behavior *before* the PRI #29 fix,
            // i.e. the flag being set means "without" the fix.
            boolean beforePri29 =
                (unicodeVer & NormalizerImpl.BEFORE_PRI_29) != 0;

            System.out.println("\nStart testing with " + filename +
                " for options: " +
                (unicode32 ? "Unicode 3.2.0" : "the latest Unicode") + ", " +
                (beforePri29 ? "without" : "with") + " PRI #29 fix");

            int lineNo = 0;
            String text;
            String[] columns = new String[6];   // index 0 is unused
            boolean part1test = false;

            while ((text = in.readLine()) != null) {
                lineNo++;

                // Blank lines carry no data; charAt(0) would throw on "".
                if (text.trim().isEmpty()) {
                    continue;
                }

                char c = text.charAt(0);
                if (c == '#') {
                    continue;
                } else if (c == '@') {
                    if (text.startsWith("@Part")) {
                        System.out.println("# Testing data in " + text);

                        // Only Part1 lines feed charList.
                        part1test = text.startsWith("@Part1 ");

                        continue;
                    }
                }

                prepareColumns(columns, text, filename, lineNo, part1test);

                testNFC(columns, unicodeVer, filename, lineNo);
                testNFD(columns, unicodeVer, filename, lineNo);
                testNFKC(columns, unicodeVer, filename, lineNo);
                testNFKD(columns, unicodeVer, filename, lineNo);
            }
        }

        if (unicodeVer == UNICODE_LATEST) {
            System.out.println("# Testing characters which are not listed in Part1");
            testRemainingChars(filename, unicodeVer);
        }
    }

    /*
     * Test for NFC
     *
     *   c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3)
     *   c4 ==  NFC(c4) ==  NFC(c5)
     */
    private static void testNFC(String[] c, int unicodeVer,
                                String file, int line) throws Exception {
        test(2, c, 1, 3, NFC, unicodeVer, file, line);
        test(4, c, 4, 5, NFC, unicodeVer, file, line);
    }

    /*
     * Test for NFD
     *
     *   c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3)
     *   c5 ==  NFD(c4) ==  NFD(c5)
     */
    private static void testNFD(String[] c, int unicodeVer,
                                String file, int line) throws Exception {
        test(3, c, 1, 3, NFD, unicodeVer, file, line);
        test(5, c, 4, 5, NFD, unicodeVer, file, line);
    }

    /*
     * Test for NFKC
     *
     *   c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
     */
    private static void testNFKC(String[] c, int unicodeVer,
                                 String file, int line) throws Exception {
        test(4, c, 1, 5, NFKC, unicodeVer, file, line);
    }

    /*
     * Test for NFKD
     *
     *   c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
     */
    private static void testNFKD(String[] c, int unicodeVer,
                                 String file, int line) throws Exception {
        test(5, c, 1, 5, NFKD, unicodeVer, file, line);
    }

    /*
     * Test for characters which aren't listed in Part1
     *
     *   X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)
     *
     * Errors are reported with line number -1 because these characters do
     * not come from a line of the datafile.
     */
    private static void testRemainingChars(String file,
                                           int unicodeVer) throws Exception {
        for (int cp = Character.MIN_CODE_POINT;
             cp <= Character.MAX_CODE_POINT;
             cp++) {
            if (charList.get(cp)) {
                continue;
            }

            String from = String.valueOf(Character.toChars(cp));

            for (int j = 0; j < modes.length; j++) {
                Normalizer.Mode mode = modes[j];

                String to = Normalizer.normalize(from, mode, unicodeVer);
                if (!from.equals(to)) {
                    error(mode, from, from, to, file, -1);
//              } else {
//                  okay(mode, from, from, to, file, -1);
                }

                if (!Normalizer.isNormalized(from, mode, unicodeVer)) {
                    error(mode, from, file, -1);
//              } else {
//                  okay(mode, from, file, -1);
                }
            }
        }
    }

    /*
     * Test normalize() and isNormalized()
     *
     *   col          index of the column holding the expected result
     *   c            columns of the current data line (1-based)
     *   first, last  inclusive range of columns used as input
     */
    private static void test(int col, String[] c,
                             int first, int last,
                             Normalizer.Mode mode, int unicodeVer,
                             String file, int line) throws Exception {
        String expected = c[col];

        for (int i = first; i <= last; i++) {
            String input = c[i];

            String got = Normalizer.normalize(input, mode, unicodeVer);
            if (!expected.equals(got)) {
                error(mode, input, expected, got, file, line);
//          } else {
//              okay(mode, input, expected, got, file, line);
            }

            /*
             * If the original String equals its normalized String, it means
             * that the original String is normalized. Thus, isNormalized()
             * should return true. And, vice versa!
             */
            boolean shouldBeNormalized = expected.equals(input);
            if (Normalizer.isNormalized(input, mode, unicodeVer)
                    != shouldBeNormalized) {
                error(mode, input, file, line);
//          } else {
//              okay(mode, input, file, line);
            }
        }
    }

    /*
     * Generate an array of String from a line of conformance datafile.
     *
     * The trailing "# comment" is dropped, the rest is split on ';' and the
     * first five fields (space-separated hex code points) are decoded into
     * col[1]..col[5]. When part1test is true, the first code point of
     * col[1] is recorded in charList.
     *
     * Throws RuntimeException if the line has fewer than five fields.
     */
    private static void prepareColumns(String[] col, String text,
                                       String file, int line,
                                       boolean part1test) throws Exception {
        int index = text.indexOf('#');
        if (index != -1) {
            text = text.substring(0, index);
        }

        StringTokenizer st = new StringTokenizer(text, ";");
        int tokenCount = st.countTokens();
        if (tokenCount < 5) {
             throw new RuntimeException("# of tokens in datafile should be at least 5, but got: " + tokenCount + " at line " + line + " in " + file);
        }

        StringBuilder sb = new StringBuilder();
        for (int i = 1; i <= 5; i++) {
            StringTokenizer tst = new StringTokenizer(st.nextToken(), " ");

            while (tst.hasMoreTokens()) {
                int code = Integer.parseInt(tst.nextToken(), 16);
                sb.append(Character.toChars(code));
            }

            col[i] = sb.toString();
            sb.setLength(0);
        }

        if (part1test) {
            charList.set(col[1].codePointAt(0));
        }
    }

    /*
     * Show an error message when normalize() didn't return the expected value.
     * (An exception is sometimes convenient. Therefore, it is commented out
     * for the moment.)
     */
    private static void error(Normalizer.Mode mode,
                              String from, String to, String got,
                              String file, int line) throws Exception {
        System.err.println("\t" + toString(mode) + ": normalize(" +
            toHexString(from) + ") doesn't equal <" + toHexString(to) +
            "> at line " + line + " in " + file + ". Got <" +
            toHexString(got) + ">.");
//      throw new RuntimeException("Normalization(" + toString(mode) + ") failed");
    }

    /*
     * Show an error message when isNormalized() didn't return the expected
     * value.
     * (An exception is sometimes convenient. Therefore, it is commented out
     * for the moment.)
     */
    private static void error(Normalizer.Mode mode, String orig,
                              String file, int line) throws Exception {
        System.err.println("\t" + toString(mode) + ": isNormalized(" +
            toHexString(orig) + ") returned the wrong value at line " + line +
            " in " + file + ".");
//      throw new RuntimeException("Normalization(" + toString(mode) +") failed");
    }

    /*
     * (For debugging)
     * Shows a message when normalize() returned the expected value.
     */
    private static void okay(Normalizer.Mode mode,
                             String from, String to, String got,
                             String file, int line) {
        System.out.println("\t" + toString(mode) + ": normalize(" +
            toHexString(from) + ") equals <" + toHexString(to) +
            "> at line " + line + " in " + file + ". Got <" +
            toHexString(got) + ">.");
    }

    /*
     * (For debugging)
     * Shows a message when isNormalized() returned the expected value.
     */
    private static void okay(Normalizer.Mode mode, String orig,
                             String file, int line) {
        System.out.println("\t" + toString(mode) + ": isNormalized(" +
            toHexString(orig) + ") returned the correct value at line " +
            line + " in " + file + ".");
    }

    /*
     * Returns a space-delimited hex String, one entry per UTF-16 code unit
     * of s, with a leading and a trailing space.
     */
    private static String toHexString(String s) {
        StringBuilder sb = new StringBuilder(" ");

        for (int i = 0; i < s.length(); i++) {
            sb.append(Integer.toHexString(s.charAt(i)));
            sb.append(' ');
        }

        return sb.toString();
    }

   /*
    * Returns the name of Normalizer.Mode, or "unknown" for any mode other
    * than NFC, NFD, NFKC and NFKD.
    */
    private static String toString(Normalizer.Mode mode) {
        if (mode == Normalizer.NFC) {
            return "NFC";
        } else if (mode == Normalizer.NFD) {
            return "NFD";
        } else if (mode == Normalizer.NFKC) {
            return "NFKC";
        } else if (mode == Normalizer.NFKD) {
            return "NFKD";
        }

        return "unknown";
    }
}