File: generate_encoding_codetables.p6

package info (click to toggle)
moarvm 2020.12%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 18,652 kB
  • sloc: ansic: 268,178; perl: 8,186; python: 1,316; makefile: 768; sh: 287
file content (143 lines) | stat: -rwxr-xr-x 6,052 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env perl6
# This code generates encoding tables for single byte encodings.
# Currently Windows-1252 and Windows-1251
sub process-file (Str:D $filename, Str:D $encoding) {
    my %to-hex1252;
    for $filename.IO.slurp.lines -> $line {
        next if $line.starts-with: '#';
        my ($cp1252_hex, $Unicode_hex, $comment) = $line.split: /\t/;
        if (!$cp1252_hex || !$Unicode_hex || !$comment) {
            die "'$cp1252_hex' '$Unicode_hex' '$comment'";
        }
        # Map unmapped things to 0xFFFF, so we can throw if we see them
        if $Unicode_hex ~~ /^\s+/ {
            $Unicode_hex = "0xFFFF";
        }
        $Unicode_hex ~~ s/^0x//;
        $cp1252_hex ~~ s/^0x//;
        %to-hex1252{$cp1252_hex.parse-base(16)} = $Unicode_hex.parse-base(16);
    }
    die unless elems %to-hex1252 == 256;
    %to-hex1252;
}
sub process-shift-jis-index (Str:D $filename) {
    my %indexes;
    my %unis;
    for $filename.IO.slurp.lines -> $line {
        next if $line ~~ /^\s*$/ || $line ~~ /^\s*'#'/;
        my ($index, $uni) = $line.split(/\s+/, :skip-empty);
        # Let index be index jis0208 excluding all entries whose pointer is in the range 8272 to 8835, inclusive.
        next if 8272 <= $index && $index <= 8835;
        my $uni_int = $uni.subst(/^0x/, "").parse-base(16);
        note "index $index already exists with codepoint %indexes{$index}. Adding for codepoint $uni" if %indexes{$index}:exists;
        # The index pointer for code point in index is the first pointer corresponding to code point in index, or null if code point is not in index.
        %indexes{$index} = $uni_int;
        if %unis{$uni_int}:!exists {
            %unis{$uni_int} = $index;
        }
    }
    my @cp_to_index;
    my @index_to_cp;
    my @index_to_cp_array;
    for %unis.sort(*.key.Int) -> $pair {
        my ($uni, $index) = ($pair.kv);
        push @cp_to_index, make-case($uni.fmt("0x%X"), $index);
    }
    @cp_to_index.push: "default: return SHIFTJIS_NULL;";
    my $last_seen_index = -1;
    my @data;
    my @points;
    my $max_index;
    for %indexes.sort(*.key.Int) -> $pair {
        my ($index, $uni) = ($pair.kv);
        if $last_seen_index+1 != $index {
            push @data, sprintf("\{%4i, %4i\}", $last_seen_index, $index - $last_seen_index - 1);
        }
        $last_seen_index = $index;
        push @points, $uni;
        push @index_to_cp, make-case($index, $uni.fmt("0x%X"));
        $max_index = $index if !$max_index.defined || $max_index < $index;
    }
    my $offset-values-name = "shiftjis_offset_values";
    my $codepoint-array    = "shiftjis_index_to_cp_codepoints";
    my $max_index-name     = "shiftjis_max_index".uc;
    my @index_to_cp_str_out;
    @index_to_cp_str_out.push: "#define {"{$offset-values-name}_elems".uc} @data.elems()";
    @index_to_cp_str_out.push: "#define {"{$codepoint-array}_elems".uc} @points.elems()";
    @index_to_cp_str_out.push: "#define $max_index-name $max_index";
    @index_to_cp_str_out.push: [~] "static struct shiftjis_offset $offset-values-name\[{@data.elems}] = \{", "\n", @data.join(",\n").indent(4), "\n", '};';
    use lib 'tools/lib';
    use ArrayCompose;
    use IntWidth;
    @index_to_cp_str_out.push: compose-array(
        'static MVMuint16',
        $codepoint-array,
        @points);
    @index_to_cp.push: "default: return SHIFTJIS_NULL;";
    my $cp_to_index_str = "static MVMint16 shift_jis_cp_to_index (MVMThreadContext *tc, MVMGrapheme32 codepoint) \{\n" ~
        ("switch (codepoint) \{\n" ~ @cp_to_index.join("\n").indent(4) ~ "\n}").indent(4) ~
        "\n\}\n";

    "#define SHIFTJIS_NULL -1\n" ~
    @index_to_cp_str_out.join("\n") ~ "\n" ~
    $cp_to_index_str;

}
sub MAIN {
    my $DIR = "UNIDATA/CODETABLES";
    say process-shift-jis-index("$DIR/index-jis0208.txt");
    exit;
    my @info = %(encoding => 'windows1252', filename => "$DIR/CP1252.TXT", comment => "/* Windows-1252 Latin */"),
        %( encoding => 'windows1251', filename => "$DIR/CP1251.TXT", comment => "/* Windows-1251 Cyrillic */");
    my %win1252 = process-file(@info[0]<filename>, @info[0]<encoding>);
    my %win1251 = process-file(@info[1]<filename>, @info[1]<encoding>);

    say create-windows1252_codepoints(%win1252, @info[0]<encoding>, @info[0]<comment>);
    say create-windows1252_codepoints(%win1251, @info[1]<encoding>, @info[1]<comment>);
    say create-windows1252_cp_to_char(%win1252, @info[0]<encoding>);
    say create-windows1252_cp_to_char(%win1251, @info[1]<encoding>);
}
sub create-windows1252_codepoints (%to-hex1252, $encoding, $comment) {
    sub make_line (@lines, @out) {
        if @lines {
            my Str:D $out = join(",", @lines);
            @out.push: $out;
            @lines = Empty;
        }
    }
    my @lines;
    my $count = 0;
    my @out;
    for 0..255 {
        push @lines, "0x%04X".sprintf(%to-hex1252{$_});
        make_line @lines, @out if @lines %% 8;
    }
    make_line @lines, @out;
    my $out_str = "$comment\n" ~ "static const MVMuint16 {$encoding}_codepoints[] = \{\n" ~ @out.join(",\n").indent(4) ~ "\n\};";
    $out_str;

}
sub create-windows1252_cp_to_char (%to-hex1252, $encoding) {
    my $max = %to-hex1252.values.grep({$_ != 0xFFFF}).max;
    my $out_str2 = "static MVMuint8 {$encoding}_cp_to_char(MVMint32 codepoint) \{\n";
    my $out_str3 = qq:to/END/;
    if ($max < codepoint || codepoint < 0)
        return '\\0';
    switch (codepoint) \{
    END
    my @cases;
    for %to-hex1252.keys.sort({%to-hex1252{$^a} <=> %to-hex1252{$^b}}) -> $win_cp {
        next if %to-hex1252{$win_cp} == 0xFFFF;
        # Skip codepoints from 0..127 since those are in ASCII and don't need to
        # be in the switch
        next if $win_cp <= 127;
        @cases.push: make-case %to-hex1252{$win_cp}, $win_cp;
    }
    @cases.push: ‘default: return '\0';’;
    my $indent = ' ' x 4;
    $out_str2 ~= ($out_str3 ~ $indent ~ @cases.join("\n$indent") ~ "\n\};").indent(4) ~ "\n\}";
    $out_str2;
}
sub make-case (Cool:D $from, Cool:D $to) {
    "case $from: return $to;"
}