File: Generate-Collation-Data.p6

package info (click to toggle)
moarvm 2020.12%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 18,652 kB
  • sloc: ansic: 268,178; perl: 8,186; python: 1,316; makefile: 768; sh: 287
file content (318 lines) | stat: -rwxr-xr-x 12,980 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
#!/usr/bin/env perl6
use lib <lib tools/lib>;
use Collation-Gram;
use ArrayCompose;
my $my_debug = False;
# Set this to only generate a partial run, for testing purposes
my Int $less-than;
my $out-file = "src/strings/unicode_uca.c";
class p6node {
    has Int $.cp;
    has @!collation_elements;
    has $!last;
    has %.next is rw;
    method next-cps                           { %!next.keys.map(*.Int).sort  }
    method has-collation                      { @!collation_elements.Bool    }
    method get-collation                      { @!collation_elements }
    method set-collation (Positional:D $list) {
        @!collation_elements = |$list;
    }
    method set-cp (Int:D $cp) { $!cp = $cp }
}
sub p6node-find-node (Int:D $cp, p6node $p6node is rw --> p6node) is rw {
    die unless $p6node.next{$cp}.VAR.^name eq 'Scalar';
    die "can't find the node for $cp " unless $p6node.next{$cp}.isa(p6node);
    return-rw $p6node.next{$cp} orelse die "Can't find node";
}
sub p6node-create-or-find-node (Int:D $cp, p6node:D $p6node is rw) is rw {
    my $hash := $p6node.next;
    #say "p6node-create-or-find-node called for cp $cp";
    if $hash{$cp}:exists {
        return-rw $p6node.next{$cp};
    }
    else {
        my $obj = p6node.new(cp => $cp, last => $hash);
        $obj.set-cp($cp);
        $hash{$cp} = $obj;
        return-rw $hash{$cp};
    }

}
sub print-var ($var) { $var.gist }
my Str $Unicode-Version;
my @implicit-weights;
my $max-cp = 0;
sub int-bitwidth (Int:D $int) {
    $int.base(2).chars + 1;
}
sub uint-bitwidth (Int:D $int) {
    $int.base(2).chars;
}
my Int:D $codepoint_sequence_no_max = 0;
sub parse-test-data (p6node:D $main-p6node) {
    my $data = "UNIDATA/UCA/allkeys.txt".IO;
    my $line-no;
    for $data.lines -> $line {
        $line-no++;
        last if $less-than and 10_000 < $less-than;
        #say $line-no;
        next if $line eq '' or $line.starts-with('#');
        if $line.starts-with('@version') {
            $Unicode-Version = $line.subst('@version ', '');
            next;
        }
        if $line.starts-with('@implicitweights') {
            @implicit-weights.push: $line.subst('@implicitweights ', '');
            next;
        }
        my $var = Collation-Gram.new.parse($line, :actions(Collation-Gram::Action.new)).made;
        die $line unless $var;
        # skip them if it's not a sequence (only one codepoint), AND there
        # is only one collation element. These are picked up into the main MVM
        # UCD database
        next if $var<codepoints>.elems == 1 && $var<array>.elems == 1;
        my $node = $main-p6node;
        say $line, "\n", $var<codepoints> if $my_debug;
        $codepoint_sequence_no_max = $var<codepoints>.elems
            if $codepoint_sequence_no_max < $var<codepoints>.elems;
        for $var<codepoints>.list -> $cp {
            $max-cp = $cp if $max-cp < $cp;
            $node = p6node-create-or-find-node($cp, $node);
        }
        $node.set-collation($var<array>);
    }
    say "Done with parse-test-data";
}

class sub_node {
    has Int $.codepoint;
    has Int $.sub_node_elems      is rw ;
    has Int $.sub_node_link       is rw;
    has Int $.collation_key_elems is rw = 0;
    has Int $.collation_key_link  is rw = 0;
    has Int $.element             is rw;
    method build {
        $!codepoint,
        $!sub_node_elems,
        # To save space, set to zero if it's -1 (-1 means there's no link)
        # we can determine there is no link by checking the collation_key_elems
        # or sub_node_elems
        # so we don't need to set these to -1
        ($!sub_node_link == -1 ?? 0 !! $!sub_node_link),
        $!collation_key_elems,
        ($!collation_key_link == -1 ?? 0 !! $!collation_key_link),
    }
    method Str {
        "\{{$.codepoint.fmt("0x%X")}, $!sub_node_elems, $!sub_node_link, $!collation_key_elems, $!collation_key_link\}"
    }
}
#| Adds the initial codepoint nodes to @main-node
sub add-main-node-to-c-data (p6node:D $p6node is rw, @main-node) is rw {
    for $p6node.next.keys.map(*.Int).sort -> $cp {
        my $thing := sub_node.new(codepoint => $cp, element => @main-node.elems);
        @main-node.push: $thing;
    }
    @main-node.elems;
}

#say Dump @main-node;
#| Follows the codepoints already in @main-node and adds sub_nodes based on that
sub sub_node-flesh-out-tree-from-main-node-elems
(p6node:D $main-p6node is rw, @main-node, @collation-elements) {
    for ^@main-node -> $i {
        #say "Processing $sub_node.codepoint()";
        sub_node-add-to-c-data-from-sub_node(@main-node[$i],
            p6node-find-node(@main-node[$i].codepoint, $main-p6node),
            @main-node, @collation-elements);
    }
}
sub sub_node-add-to-c-data-from-sub_node
(sub_node:D $sub_node is rw, p6node:D $p6node is rw, @main-node, @collation-elements --> sub_node:D) is rw {
    die unless $sub_node.codepoint == $p6node.cp;
    if $p6node.has-collation {
        my $temp := sub_node-add-collation-elems-from-p6node($sub_node, $p6node, @collation-elements);
        die "\$temp !=== \$sub_node" unless $temp === $sub_node;
    }
    #if !$sub_node.sub_node_elems {
    $sub_node.sub_node_elems = $p6node.next.elems;
    #}
    #die "\$sub_node.sub_node_elems !== \$p6node.next.elems" unless $sub_node.sub_node_elems == $p6node.next.elems;
    my Int ($last-link, $first-link) = -1 xx 2;
    for $p6node.next-cps -> $cp {
        $last-link = sub_node-add-sub_node($cp, @main-node);
        sub_node-add-to-c-data-from-sub_node(@main-node[$last-link], p6node-find-node($cp, $p6node), @main-node, @collation-elements);
        $first-link = $last-link if $first-link == -1;
    }
    $sub_node.sub_node_link = $first-link;

    #say Dump $sub_node;
    $sub_node;
}
sub sub_node-add-sub_node (Int:D $cp, @main-node --> Int:D) {
    my $node := sub_node.new(codepoint => $cp, element => @main-node.elems);
    die "!\$node.element.defined || !\$node.codepoint.defined"
        unless $node.element.defined && $node.codepoint.defined;
    @main-node.push: $node;
    return @main-node.elems - 1;
}
my Int:D $max-collation-elems = 0;
my Int:D $max-primary   = 0;
my Int:D $max-secondary = 0;
my Int:D $max-tertiary  = 0;
my Int:D $max-special   = 0;
sub sub_node-add-collation-elems-from-p6node (sub_node:D $sub_node is rw, p6node:D $p6node is rw, @collation-elements --> sub_node:D) is rw {
    die "!\$p6node.has-collation" unless $p6node.has-collation;
    my Int:D $before-elems = @collation-elements.elems;
    for $p6node.get-collation <-> $element {
        $max-primary   = $element[0] if $max-primary   < $element[0];
        $max-secondary = $element[1] if $max-secondary < $element[1];
        $max-tertiary  = $element[2] if $max-tertiary  < $element[2];
        $max-special   = $element[3] if $max-special   < $element[3];
        @collation-elements.push: $element;
    }
    $max-collation-elems = $p6node.get-collation.elems if $max-collation-elems < $p6node.get-collation.elems;
    my Int:D $after-elems = @collation-elements.elems;
    $sub_node.collation_key_link  = $before-elems;
    $sub_node.collation_key_elems = $after-elems - $before-elems;
    $sub_node;
}
my @main-node;
my @collation-elements;
my $main-p6node = p6node.new;
sub debug-out-nodes {
    use JSON::Fast;
    spurt 'out_nodes', to-json(@main-node.map(*.build));
}
sub process-block (Str:D $text) {
    if $text ~~ / ^ \s* $<fullnam>=( $<start>=(<:AHex>+) ['..' $<end>=(<:AHex>+)]? \s* ';' \s* $<name>=(.*) ) \s* $ / {
        #.say;
        my Int:D $start = $<fullnam><start>.Str.parse-base(16);
        my Int:D $end = $<fullnam><end> ?? $<fullnam><end>.Str.parse-base(16) !! $start;
        my Str:D $name = $<fullnam><name>.Str;
        my Str:D $fullname = $<fullnam>.Str;
        return $start, $end, $name, $fullname;
    }
    else {
        die;
    }

}
sub get-block-data (Str:D $file, @looking, $funcname) {
    die unless "UNIDATA/$file".IO.f;
    my $myfile = slurp "UNIDATA/$file";
    my @out = "/* Data from $file */", "MVM_STATIC_INLINE MVMuint32 $funcname " ~ '(MVMCodepoint cp) {';
    @out.push: 'return'.indent: 4;
    my Int:D $num = 0;
    for $myfile.lines {
        next if /^ \s* '#' /;
        next if /^ \s* $/;
        if ($file eq 'PropList.txt') {
            my @split = .split(/[\s+|';']/, :skip-empty);
            #say @split.perl;
            next unless @split[1].trim eq @looking.any;
        }
        if ($file eq 'Blocks.txt') {
            my $found = False;
            for @looking -> $looking {
                $found = True if m/^\s* <:AHex>+ '..' <:AHex>+ \s* ';' \s* $looking \s* $/;
            }
            next unless $found;
        }
        #100000..10FFFF; Supplementary Private Use Area-B
        #(0x3400 <= cp && cp <= 0x4DB5) /*  3400..4DB5  d*/
        my ($start, $end, $name, $fullname) = process-block $_;

        my Str:D $or = $num++ ?? '||' !! '  ';
        #say $num;
        my Str:D $conditional = $start == $end
            ?? "0x%-22X == cp".sprintf($start)
            !! "0x%-5X <= cp && cp <= 0x%-5X".sprintf: $start, $end;
        @out.push: "%s (%s) /* %4X..%-4X %-34s */".sprintf($or, $conditional, $start, $end, $name).indent: 4;
        #say "start $start end $end name: “$name”";
    }
    @out.push: ';'.indent: 4;
    @out.push: '}';
    @out.join("\n") ~ "\n";

}
parse-test-data($main-p6node);
my $main-node-elems = add-main-node-to-c-data($main-p6node, @main-node);
sub_node-flesh-out-tree-from-main-node-elems($main-p6node, @main-node, @collation-elements);
say now - INIT now;
sub format-collation-Str ($a) {
    my Str $out;
    for $a -> $item is copy {
        my $thing = $item.pop;
        my Str:D $thing-str = $thing ?? '*' !! '.';
        $out ~= "[%s%.4X.%.4X.%.4X]".sprintf($thing-str, |$item);
    }
    $out;
}
my @composed-arrays = "/* This file generated from tools/Generate-Collation-Data.p6 */";
sub make-struct (@names, @types, @collation-list-for-packing, $struct-name) {
    use lib 'lib';
    use BitfieldPacking;
    my @order = compute-packing(@collation-list-for-packing);
    my @out-str = "struct $struct-name \{";
    for @order -> $pair {
        @out-str.push: ([~] @types[$pair.key], " ", @names[$pair.key], " :", $pair.value, ";").indent(4);
    }
    @out-str.push: '};';
    @out-str.join("\n"), @order;
}
my @collation-list-for-packing =
    0 => uint-bitwidth($max-primary),
    1 => uint-bitwidth($max-secondary),
    2 => uint-bitwidth($max-tertiary),
    3 => uint-bitwidth($max-special);
my @collation_key_names =
    'primary', 'secondary', 'tertiary', 'special';
my ($collation_key_struct, $collation_key_order) = make-struct(
    @collation_key_names,
    ("MVMuint32" xx 4),
    @collation-list-for-packing,
    'collation_key');
@composed-arrays.push: $collation_key_struct;
my @names2 = <codepoint sub_node_elems sub_node_link
              collation_key_elems collation_key_link>;
my @sub_node-list-for-packing2 =
    0 => uint-bitwidth($max-cp),
    1 => uint-bitwidth(@main-node.elems - 1),
    2 => uint-bitwidth(@main-node.elems - 1),
    3 => uint-bitwidth($max-collation-elems),
    4 => uint-bitwidth(@collation-elements.elems - 1);
my ($sub_node_struct, $order2) = make-struct(
    @names2,
    ('MVMuint32' xx 5),
    @sub_node-list-for-packing2,
    'sub_node');
@composed-arrays.push: $sub_node_struct;
@composed-arrays.push: "typedef struct sub_node sub_node;";
sub transform-array (@array, @order) {
    @array.map(-> $item {
        my @out;
        for ^$item.elems -> $i {
            @out[$i] = $item[@order[$i].key];
        }
        @out;
    });
}
@composed-arrays.push: "#define main_nodes_elems @main-node.elems()";
@composed-arrays.push: "#define starter_main_nodes_elems $main-node-elems";
@composed-arrays.push: "#define codepoint_sequence_no_max $codepoint_sequence_no_max";
@composed-arrays.push: "#define special_collation_keys_elems @collation-elements.elems()";
@composed-arrays.push: get-block-data("PropList.txt", ("Unified_Ideograph",), "is_unified_ideograph");
@composed-arrays.push: get-block-data("Blocks.txt", ("Nushu",), "is_Assigned_Block_Nushu");
@composed-arrays.push: get-block-data("Blocks.txt", ("Tangut","Tangut Components"), "is_Block_Tangut");
@composed-arrays.push: get-block-data("Blocks.txt", ("CJK Unified Ideographs","CJK Compatibility Ideographs"), "is_Block_CJK_Unified_Ideographs_OR_CJK_Compatibility_Ideographs");
@composed-arrays.push: compose-array('sub_node', 'main_nodes', transform-array(@main-node».build, $order2));
@composed-arrays.push: compose-array( 'struct collation_key', 'special_collation_keys', transform-array(@collation-elements, $collation_key_order));
spurt $out-file, @composed-arrays.join("\n");
print qq:to/END/;
Done writing $out-file.
{'=' x 70}
MAKE SURE TO RUN `perl6 CollationTestGen.p6 --folder=MoarVM/UNIDATA/UCA/CollationTest --test-only --should-test ` to ensure there are ~74 failures only!

Then paste those failures (if they're ok into CollationTestGen.p6 in the \%todo variable.
{'=' x 70}
END