File: cpt.rs

package info (click to toggle)
rustc 1.85.0%2Bdfsg3-1
  • links: PTS, VCS
  • area: main
  • in suites: experimental, sid, trixie
  • size: 893,396 kB
  • sloc: xml: 158,127; python: 35,830; javascript: 19,497; cpp: 19,002; sh: 17,245; ansic: 13,127; asm: 4,376; makefile: 1,051; perl: 29; lisp: 29; ruby: 19; sql: 11
file content (492 lines) | stat: -rw-r--r-- 16,638 bytes parent folder | download | duplicates (11)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use icu_collections::codepointtrie::planes::get_planes_trie;
use icu_collections::codepointtrie::Error;
use icu_collections::codepointtrie::*;
use zerovec::ZeroVec;

/// Checks that the trie deserialized from `planes.toml` and the trie returned
/// by `get_planes_trie()` both agree with the code point ranges listed in the
/// same TOML file.
#[test]
fn planes_trie_deserialize_check_test() {
    // Reference trie produced in code by the `planes` module.
    let exp_planes_trie = get_planes_trie();

    // Parse the serialized property dump bundled with the tests.
    let planes_enum_prop: UnicodeEnumeratedProperty =
        ::toml::from_str(include_str!("data/cpt/planes.toml")).unwrap();

    let serialized = planes_enum_prop.code_point_trie.trie_struct;

    // The trie type is stored as a raw integer and must map to a known variant.
    let trie_type = match TrieType::try_from(serialized.trie_type_enum_val) {
        Ok(parsed) => parsed,
        Err(_) => panic!(
            "Could not parse trie_type serialized enum value in test data file: {}",
            serialized.name
        ),
    };

    let trie_header = CodePointTrieHeader {
        high_start: serialized.high_start,
        shifted12_high_start: serialized.shifted12_high_start,
        index3_null_offset: serialized.index3_null_offset,
        data_null_offset: serialized.data_null_offset,
        null_value: serialized.null_value,
        trie_type,
    };

    // This test requires the 8-bit data array to be present in the file.
    let data = ZeroVec::from_slice_or_alloc(serialized.data_8.as_deref().unwrap());
    let index = ZeroVec::from_slice_or_alloc(&serialized.index);
    let act_planes_trie: CodePointTrie<u8> =
        CodePointTrie::try_new(trie_header, index, data).unwrap();

    // Flatten each serialized range into an (exclusive limit, value) pair,
    // which is the layout `check_trie` consumes. The second tuple element is
    // the inclusive range end, hence the `+ 1`.
    let check_ranges: Vec<u32> = planes_enum_prop
        .code_point_map
        .data
        .ranges
        .into_iter()
        .flat_map(|(_first, last, value)| [last + 1, value])
        .collect();

    check_trie(&act_planes_trie, &check_ranges);
    check_trie(&exp_planes_trie, &check_ranges);
}

/// Feeds the `free-blocks.16.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn free_blocks_16() {
    let fixture: &str = include_str!("data/cpt/free-blocks.16.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `free-blocks.32.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn free_blocks_32() {
    let fixture: &str = include_str!("data/cpt/free-blocks.32.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `free-blocks.8.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn free_blocks_8() {
    let fixture: &str = include_str!("data/cpt/free-blocks.8.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `free-blocks.small16.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn free_blocks_small16() {
    let fixture: &str = include_str!("data/cpt/free-blocks.small16.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `grow-data.16.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn grow_data_16() {
    let fixture: &str = include_str!("data/cpt/grow-data.16.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `grow-data.32.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn grow_data_32() {
    let fixture: &str = include_str!("data/cpt/grow-data.32.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `grow-data.8.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn grow_data_8() {
    let fixture: &str = include_str!("data/cpt/grow-data.8.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `grow-data.small16.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn grow_data_small16() {
    let fixture: &str = include_str!("data/cpt/grow-data.small16.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `set1.16.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn set1_16() {
    let fixture: &str = include_str!("data/cpt/set1.16.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `set1.32.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn set1_32() {
    let fixture: &str = include_str!("data/cpt/set1.32.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `set1.8.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn set1_8() {
    let fixture: &str = include_str!("data/cpt/set1.8.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `set1.small16.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn set1_small16() {
    let fixture: &str = include_str!("data/cpt/set1.small16.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `set2-overlap.16.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn set2_overlap_16() {
    let fixture: &str = include_str!("data/cpt/set2-overlap.16.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `set2-overlap.32.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn set2_overlap_32() {
    let fixture: &str = include_str!("data/cpt/set2-overlap.32.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `set2-overlap.small16.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn set2_overlap_small16() {
    let fixture: &str = include_str!("data/cpt/set2-overlap.small16.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `set3-initial-9.16.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn set3_initial_9_16() {
    let fixture: &str = include_str!("data/cpt/set3-initial-9.16.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `set3-initial-9.32.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn set3_initial_9_32() {
    let fixture: &str = include_str!("data/cpt/set3-initial-9.32.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `set3-initial-9.8.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn set3_initial_9_8() {
    let fixture: &str = include_str!("data/cpt/set3-initial-9.8.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `set3-initial-9.small16.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn set3_initial_9_small16() {
    let fixture: &str = include_str!("data/cpt/set3-initial-9.small16.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `set-empty.16.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn set_empty_16() {
    let fixture: &str = include_str!("data/cpt/set-empty.16.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `set-empty.32.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn set_empty_32() {
    let fixture: &str = include_str!("data/cpt/set-empty.32.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `set-empty.8.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn set_empty_8() {
    let fixture: &str = include_str!("data/cpt/set-empty.8.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `set-empty.small16.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn set_empty_small16() {
    let fixture: &str = include_str!("data/cpt/set-empty.small16.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `set-single-value.16.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn set_single_value_16() {
    let fixture: &str = include_str!("data/cpt/set-single-value.16.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `set-single-value.32.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn set_single_value_32() {
    let fixture: &str = include_str!("data/cpt/set-single-value.32.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `set-single-value.8.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn set_single_value_8() {
    let fixture: &str = include_str!("data/cpt/set-single-value.8.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `set-single-value.small16.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn set_single_value_small16() {
    let fixture: &str = include_str!("data/cpt/set-single-value.small16.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `short-all-same.16.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn short_all_same_16() {
    let fixture: &str = include_str!("data/cpt/short-all-same.16.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `short-all-same.8.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn short_all_same_8() {
    let fixture: &str = include_str!("data/cpt/short-all-same.8.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `short-all-same.small16.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn short_all_same_small16() {
    let fixture: &str = include_str!("data/cpt/short-all-same.small16.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `small0-in-fast.16.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn small0_in_fast_16() {
    let fixture: &str = include_str!("data/cpt/small0-in-fast.16.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `small0-in-fast.32.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn small0_in_fast_32() {
    let fixture: &str = include_str!("data/cpt/small0-in-fast.32.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `small0-in-fast.8.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn small0_in_fast_8() {
    let fixture: &str = include_str!("data/cpt/small0-in-fast.8.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// Feeds the `small0-in-fast.small16.toml` test data file to
/// `run_deserialize_test_from_test_data`.
#[test]
fn small0_in_fast_small16() {
    let fixture: &str = include_str!("data/cpt/small0-in-fast.small16.toml");
    run_deserialize_test_from_test_data(fixture);
}

/// The width of the elements in the data array of a [`CodePointTrie`].
/// See [`UCPTrieValueWidth`](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ucptrie_8h.html) in ICU4C.
///
/// The explicit discriminants are the numeric codes that the `valueWidth`
/// field of the serialized test data is compared against (via `as u8`), so
/// they must not be renumbered.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum ValueWidthEnum {
    /// Data array elements are 16 bits wide.
    Bits16 = 0,
    /// Data array elements are 32 bits wide.
    Bits32 = 1,
    /// Data array elements are 8 bits wide.
    Bits8 = 2,
}

/// Verifies single code point lookups on a `CodePointTrie`.
///
/// `check_ranges` is a flat list of `(limit, value)` pairs. Starting at code
/// point 0, every code point below each successive `limit` (exclusive) is
/// looked up with `get32` and must convert to that pair's `value`.
///
/// # Panics
///
/// Panics if `check_ranges` has an odd length or if any lookup disagrees with
/// the expected value.
pub fn check_trie<T: TrieValue + Into<u32>>(trie: &CodePointTrie<T>, check_ranges: &[u32]) {
    assert_eq!(
        0,
        check_ranges.len() % 2,
        "check_ranges must have an even number of 32-bit values in (limit,value) pairs"
    );

    // First code point that has not been verified yet.
    let mut next_cp: u32 = 0;
    for pair in check_ranges.chunks(2) {
        let (limit, expected) = (pair[0], pair[1]);
        // Empty when `limit` does not lie beyond what was already checked.
        for cp in next_cp..limit {
            assert_eq!(expected, trie.get32(cp).into(), "trie_get({})", cp,);
        }
        // Never move backwards, even if the limits are not increasing.
        next_cp = next_cp.max(limit);
    }
}

/// Verifies range iteration on a `CodePointTrie`.
///
/// Walks `trie.iter_ranges()` in lockstep with the `(limit, value)` pairs in
/// `check_ranges` and requires each produced range to start where the previous
/// one ended, end just before the expected limit, and carry the expected value.
/// Pairs whose limit is zero are ignored.
///
/// # Panics
///
/// Panics if `check_ranges` has an odd length, if the iterator yields fewer or
/// more ranges than expected, or if any range differs from its expected pair.
pub fn test_check_ranges_get_ranges<T: TrieValue + Into<u32>>(
    trie: &CodePointTrie<T>,
    check_ranges: &[u32],
) {
    assert_eq!(
        0,
        check_ranges.len() % 2,
        "check_ranges must have an even number of 32-bit values in (limit,value) pairs"
    );

    let mut actual_ranges = trie.iter_ranges();
    let mut expected_start: u32 = 0;

    // A leading pair with limit zero describes an empty range; since the
    // expected start already begins at 0 there is nothing to compare it with.
    let expected_pairs = check_ranges
        .chunks(2)
        .map(|pair| (pair[0], pair[1]))
        .filter(|&(limit, _)| limit != 0);

    for (expected_limit, expected_value) in expected_pairs {
        let actual = match actual_ranges.next() {
            Some(found) => found,
            None => panic!("CodePointTrie iter_ranges() produces fewer ranges than the check_ranges field in testdata has"),
        };
        let actual_start = actual.range.start();
        let actual_end = actual.range.end();
        let actual_value: u32 = actual.value.into();

        assert_eq!(expected_start, *actual_start);
        // The iterator reports an inclusive end; check ranges use an exclusive limit.
        assert_eq!(expected_limit, *actual_end + 1);
        assert_eq!(expected_value, actual_value);

        expected_start = expected_limit;
    }

    assert!(actual_ranges.next().is_none(), "CodePointTrie iter_ranges() produces more ranges than the check_ranges field in testdata has");
}

/// Runs both `CodePointTrie` verification helpers on one trie: per-code-point
/// lookups first (`check_trie`), then range iteration
/// (`test_check_ranges_get_ranges`), using the same `check_ranges`.
pub fn run_trie_tests<T>(trie: &CodePointTrie<T>, check_ranges: &[u32])
where
    T: TrieValue + Into<u32>,
{
    check_trie::<T>(trie, check_ranges);
    test_check_ranges_get_ranges::<T>(trie, check_ranges);
}

// The following structs might be useful later for de-/serialization of the
// main `CodePointTrie` struct in the corresponding data provider.

/// Top-level shape of a serialized enumerated-property TOML file, such as the
/// `data/cpt/planes.toml` file parsed by `planes_trie_deserialize_check_test`.
#[cfg_attr(any(feature = "serde", test), derive(serde::Deserialize))]
pub struct UnicodeEnumeratedProperty {
    /// Range-based listing of the property's values.
    pub code_point_map: EnumPropCodePointMap,
    /// The same property serialized as a code point trie.
    pub code_point_trie: EnumPropSerializedCPT,
}

/// Wrapper for the `code_point_map` table of a serialized property file; the
/// actual content lives one level down in `data`.
#[cfg_attr(any(feature = "serde", test), derive(serde::Deserialize))]
pub struct EnumPropCodePointMap {
    /// Names and value ranges of the property.
    pub data: EnumPropCodePointMapData,
}

/// Names and value ranges of a serialized enumerated property.
#[cfg_attr(any(feature = "serde", test), derive(serde::Deserialize))]
pub struct EnumPropCodePointMapData {
    /// Long form of the property name, as written in the data file.
    pub long_name: String,
    /// Short form of the property name, as written in the data file.
    pub name: String,
    /// One tuple per range. The planes test treats element 1 as the inclusive
    /// end of the range and element 2 as the value for that range; element 0
    /// is presumably the range start (not read by the tests in this file).
    pub ranges: Vec<(u32, u32, u32)>,
}

/// Wrapper for the `code_point_trie` table of a serialized property file.
#[allow(clippy::upper_case_acronyms)]
#[cfg_attr(any(feature = "serde", test), derive(serde::Deserialize))]
pub struct EnumPropSerializedCPT {
    /// The serialized trie. The key is named `struct` in the data file, which
    /// is a Rust keyword, hence the rename.
    #[cfg_attr(any(feature = "serde", test), serde(rename = "struct"))]
    pub trie_struct: EnumPropSerializedCPTStruct,
}

// These structs support the test data dumped as TOML files from ICU.
// Because the properties CodePointMap data will also be dumped from ICU
// using similar functions, some of these structs may be useful to refactor
// into main code at a later point.

/// Serialized form of a code point trie as it appears under the `struct` key
/// of the TOML data files. The tests copy these fields into a
/// `CodePointTrieHeader` plus index and data vectors to build a `CodePointTrie`.
#[allow(clippy::upper_case_acronyms)]
#[cfg_attr(any(feature = "serde", test), derive(serde::Deserialize))]
pub struct EnumPropSerializedCPTStruct {
    /// Not read from the data file (`serde(skip)`).
    #[cfg_attr(any(feature = "serde", test), serde(skip))]
    pub long_name: String,
    /// Name of the trie; used in this file's log and panic messages.
    pub name: String,
    /// The trie's index array.
    pub index: Vec<u16>,
    /// Data array when the values are 8 bits wide; otherwise absent.
    pub data_8: Option<Vec<u8>>,
    /// Data array when the values are 16 bits wide; otherwise absent.
    pub data_16: Option<Vec<u16>>,
    /// Data array when the values are 32 bits wide; otherwise absent.
    pub data_32: Option<Vec<u32>>,
    /// Not read from the data file (`serde(skip)`).
    #[cfg_attr(any(feature = "serde", test), serde(skip))]
    pub index_length: u32,
    /// Not read from the data file (`serde(skip)`).
    #[cfg_attr(any(feature = "serde", test), serde(skip))]
    pub data_length: u32,
    /// Copied into `CodePointTrieHeader::high_start`.
    #[cfg_attr(any(feature = "serde", test), serde(rename = "highStart"))]
    pub high_start: u32,
    /// Copied into `CodePointTrieHeader::shifted12_high_start`.
    #[cfg_attr(any(feature = "serde", test), serde(rename = "shifted12HighStart"))]
    pub shifted12_high_start: u16,
    /// Raw integer code of the trie type; converted with `TrieType::try_from`.
    #[cfg_attr(any(feature = "serde", test), serde(rename = "type"))]
    pub trie_type_enum_val: u8,
    /// Raw integer code of the value width; compared against `ValueWidthEnum`.
    #[cfg_attr(any(feature = "serde", test), serde(rename = "valueWidth"))]
    pub value_width_enum_val: u8,
    /// Copied into `CodePointTrieHeader::index3_null_offset`.
    #[cfg_attr(any(feature = "serde", test), serde(rename = "index3NullOffset"))]
    pub index3_null_offset: u16,
    /// Copied into `CodePointTrieHeader::data_null_offset`.
    #[cfg_attr(any(feature = "serde", test), serde(rename = "dataNullOffset"))]
    pub data_null_offset: u32,
    /// Copied into `CodePointTrieHeader::null_value`.
    #[cfg_attr(any(feature = "serde", test), serde(rename = "nullValue"))]
    pub null_value: u32,
}

// Given a .toml file dumped from ICU4C test data for UCPTrie, run the test
// data file deserialization into the test file struct, convert and construct
// the `CodePointTrie`, and test the constructed struct against the test file's
// "check ranges" (inversion map ranges) using `check_trie` to verify the
// validity of the `CodePointTrie`'s behavior for all code points.
#[allow(dead_code)]
pub fn run_deserialize_test_from_test_data(test_file: &str) {
    // The following structs are specific to the TOML format files for dumped ICU
    // test data.

    #[derive(serde::Deserialize)]
    pub struct TestFile {
        code_point_trie: TestCodePointTrie,
    }

    #[derive(serde::Deserialize)]
    pub struct TestCodePointTrie {
        // The trie_struct field for test data files is dumped from the same source
        // (ICU4C) using the same function (usrc_writeUCPTrie) as property data
        // for the provider, so we can reuse the same struct here.
        #[serde(rename(deserialize = "struct"))]
        trie_struct: EnumPropSerializedCPTStruct,
        #[serde(rename(deserialize = "testdata"))]
        test_data: TestData,
    }

    #[derive(serde::Deserialize)]
    pub struct TestData {
        #[serde(rename(deserialize = "checkRanges"))]
        check_ranges: Vec<u32>,
    }

    let test_file = ::toml::from_str::<TestFile>(test_file).unwrap();

    let test_struct = test_file.code_point_trie.trie_struct;

    println!(
        "Running CodePointTrie reader logic test on test data file: {}",
        test_struct.name
    );

    let trie_type_enum = match TrieType::try_from(test_struct.trie_type_enum_val) {
        Ok(enum_val) => enum_val,
        _ => {
            panic!(
                "Could not parse trie_type serialized enum value in test data file: {}",
                test_struct.name
            );
        }
    };

    let trie_header = CodePointTrieHeader {
        high_start: test_struct.high_start,
        shifted12_high_start: test_struct.shifted12_high_start,
        index3_null_offset: test_struct.index3_null_offset,
        data_null_offset: test_struct.data_null_offset,
        null_value: test_struct.null_value,
        trie_type: trie_type_enum,
    };

    let index = ZeroVec::from_slice_or_alloc(&test_struct.index);

    match (test_struct.data_8, test_struct.data_16, test_struct.data_32) {
        (Some(data_8), _, _) => {
            let data = ZeroVec::from_slice_or_alloc(&data_8);
            let trie_result: Result<CodePointTrie<u8>, Error> =
                CodePointTrie::try_new(trie_header, index, data);
            assert!(trie_result.is_ok(), "Could not construct trie");
            assert_eq!(
                test_struct.value_width_enum_val,
                ValueWidthEnum::Bits8 as u8
            );
            run_trie_tests(
                &trie_result.unwrap(),
                &test_file.code_point_trie.test_data.check_ranges,
            );
        }

        (_, Some(data_16), _) => {
            let data = ZeroVec::from_slice_or_alloc(&data_16);
            let trie_result: Result<CodePointTrie<u16>, Error> =
                CodePointTrie::try_new(trie_header, index, data);
            assert!(trie_result.is_ok(), "Could not construct trie");
            assert_eq!(
                test_struct.value_width_enum_val,
                ValueWidthEnum::Bits16 as u8
            );
            run_trie_tests(
                &trie_result.unwrap(),
                &test_file.code_point_trie.test_data.check_ranges,
            );
        }

        (_, _, Some(data_32)) => {
            let data = ZeroVec::from_slice_or_alloc(&data_32);
            let trie_result: Result<CodePointTrie<u32>, Error> =
                CodePointTrie::try_new(trie_header, index, data);
            assert!(trie_result.is_ok(), "Could not construct trie");
            assert_eq!(
                test_struct.value_width_enum_val,
                ValueWidthEnum::Bits32 as u8
            );
            run_trie_tests(
                &trie_result.unwrap(),
                &test_file.code_point_trie.test_data.check_ranges,
            );
        }

        (_, _, _) => {
            panic!("Could not match test trie data to a known value width or trie type");
        }
    };
}