File: generate_unicode_data.cr

package info (click to toggle)
crystal 1.14.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 24,384 kB
  • sloc: javascript: 6,400; sh: 695; makefile: 269; ansic: 121; python: 105; cpp: 77; xml: 32
file content (357 lines) | stat: -rwxr-xr-x 10,947 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
#! /usr/bin/env crystal
#
# This script generates the file src/unicode/data.cr, which contains
# compact representations of data from the Unicode Character Database
# (UnicodeData.txt, CaseFolding.txt, SpecialCasing.txt and derived files).

require "http/client"
require "ecr"
require "../src/compiler/crystal/formatter"

# Base URL of the Unicode Character Database files for `Unicode::VERSION`.
# HTTPS is used because the downloaded files are turned into generated source
# code and must not be alterable in transit.
UCD_ROOT = "https://www.unicode.org/Public/#{Unicode::VERSION}/ucd/"

# Kind of decomposition mapping found in the decomposition field of a
# UnicodeData.txt line.
enum DecompositionType
  # The decomposition field is empty.
  None
  # The field holds a mapping with no leading `<tag>`.
  Canonical
  # The field starts with `<`, i.e. the mapping is prefixed by a tag.
  Compatibility
end

# Each entry in UnicodeData.txt
# (some info is missing but we don't use it yet)
#
# * `codepoint`: the code point parsed from the first field (hexadecimal)
# * `name`, `general_category`: second and third fields, stored verbatim
# * `decomposition_type` / `decomposition_mapping`: parsed from the sixth
#   field; the mapping is `nil` when that field is empty
# * `upcase`, `downcase`: simple case mappings, `nil` when the field is empty
# * `casefold`: single-codepoint case folding looked up from CaseFolding.txt,
#   `nil` when there is none
#
# The titlecase mapping is not stored here; the parsing code records it as a
# `SpecialCase` when it differs from `upcase`.
record Entry,
  codepoint : Int32,
  name : String,
  general_category : String,
  decomposition_type : DecompositionType,
  decomposition_mapping : Array(Int32)?,
  upcase : Int32?,
  downcase : Int32?,
  casefold : Int32?

# A case mapping from one codepoint to a sequence of codepoints.
# The parsing code in this script pads `value` with zeros so that it holds
# at least three elements.
record SpecialCase,
  codepoint : Int32,
  value : Array(Int32)

# Run of consecutive codepoints `low..high` whose mapped codepoint is always
# `codepoint + delta` (see `case_ranges`).
record CaseRange, low : Int32, high : Int32, delta : Int32
# Run produced by `alternate_ranges`: `low` is the first codepoint of the run
# and `high` is the last collected codepoint plus one.
record AlternateRange, low : Int32, high : Int32
# Codepoints from `low` to `high` spaced `stride` apart (see `strides`).
record Stride, low : Int32, high : Int32, stride : Int32
# Codepoints `low..high` sharing the canonical combining class `ccc`, as read
# from DerivedCombiningClass.txt (class 0 is skipped by the parser).
record CanonicalCombiningClassRange, low : Int32, high : Int32, ccc : UInt8
# Codepoints `low..high` sharing a quick check `result` for one normalization
# form, as read from DerivedNormalizationProps.txt.
record QuickCheckRange, low : Int32, high : Int32, result : Unicode::QuickCheckResult

# Groups *entries* into `CaseRange`s.
#
# The block receives each entry and returns the codepoint it maps to, or
# `nil` when the entry has no mapping. Consecutive entries form one range as
# long as both the codepoint and the mapped codepoint advance by exactly one,
# so every codepoint in the range maps to `codepoint + delta`.
#
# Returns the ranges in the order they were encountered. A streak that is
# still open when the entries run out is emitted as well, so a mapping on the
# very last entry is not lost.
def case_ranges(entries, &)
  ranges = [] of CaseRange
  first_codepoint = nil
  last_codepoint = nil
  first_match = nil
  last_match = nil

  entries.each do |entry|
    codepoint = entry.codepoint
    match = yield entry

    continues_streak = match && last_codepoint == codepoint - 1 && last_match == match - 1
    unless continues_streak
      # The previous entry closed a streak only if it had a mapping itself.
      if last_codepoint && last_match
        ranges << CaseRange.new(first_codepoint.not_nil!, last_codepoint, first_match.not_nil! - first_codepoint.not_nil!)
      end

      # A mapped entry that does not extend the streak opens a new one.
      if match
        first_codepoint = codepoint
        first_match = match
      end
    end

    last_codepoint = codepoint
    last_match = match
  end

  # Flush the streak that reaches the end of the input.
  if last_codepoint && last_match
    ranges << CaseRange.new(first_codepoint.not_nil!, last_codepoint, first_match.not_nil! - first_codepoint.not_nil!)
  end

  ranges
end

# Merges *ranges* whose `low` codepoints are exactly two apart into
# `AlternateRange`s. Only `low` of each range is inspected; a gap other than
# two closes the current run and starts a new one.
def alternate_ranges(ranges)
  result = Array(AlternateRange).new

  run_start = nil
  previous = nil

  ranges.each do |range|
    current = range.low

    unless previous == current - 2
      # Gap differs from two: close the pending run (if any) and restart here.
      result << new_alternate_range(run_start, previous) if run_start
      run_start = current
    end

    previous = current
  end

  # Emit the run that was still open when the input ended.
  result << new_alternate_range(run_start, previous) if run_start

  result
end

# Builds the `AlternateRange` for a run that starts at *first_codepoint* and
# whose last collected codepoint is *last_codepoint* (must not be `nil`).
def new_alternate_range(first_codepoint, last_codepoint)
  # *last_codepoint* is the uppercase letter of the final pair; the range
  # has to reach one further to include its lowercase counterpart.
  upper_bound = last_codepoint.not_nil! + 1
  AlternateRange.new(first_codepoint, upper_bound)
end

# Compresses the entries selected by *targets* into `Stride`s.
#
# The block maps an entry to a value; an entry is kept when
# `targets.includes?` that value. Kept entries are then grouped into runs of
# codepoints separated by a constant distance. A run delimited by entries
# whose names end in `First>` and `Last>` is given a stride of 1.
def strides(entries, targets, &)
  result = [] of Stride

  selected = entries.select { |entry| targets.includes?(yield entry) }

  head = nil # first entry of the run being built
  tail = nil # most recently visited entry
  step = nil # distance between codepoints of the run, once known

  selected.each do |entry|
    if head
      if tail
        gap = entry.codepoint - tail.codepoint
        unless gap == step
          if head == tail
            # Second entry of a run: its distance fixes the stride.
            step = gap
          else
            # The distance changed: close the run and start over at `entry`.
            step = 1 if head.name.ends_with?("First>") && tail.name.ends_with?("Last>")
            result << Stride.new(head.codepoint, tail.codepoint, step.not_nil!)
            head = entry
            step = nil
          end
        end
      end
    else
      head = entry
    end

    tail = entry
  end

  # Emit whatever run is still open; a lone entry gets a stride of 1.
  if head && tail
    width = step
    if width
      width = 1 if head.name.ends_with?("First>") && tail.name.ends_with?("Last>")
    else
      width = 1
    end
    result << Stride.new(head.codepoint, tail.codepoint, width)
  end

  result
end

# Accumulators filled while parsing the Unicode data files below.
entries = Array(Entry).new
special_cases_downcase = Array(SpecialCase).new
special_cases_titlecase = Array(SpecialCase).new
special_cases_upcase = Array(SpecialCase).new
special_cases_casefold = Array(SpecialCase).new
# Single-codepoint case foldings, keyed by source codepoint.
casefold_mapping = {} of Int32 => Int32
canonical_combining_classes = Array(CanonicalCombiningClassRange).new
full_composition_exclusions = Set(Int32).new
# One (initially empty) list of quick check ranges per normalization form.
quick_checks = Unicode::NormalizationForm.values.to_h do |kind|
  {kind, [] of QuickCheckRange}
end

# CaseFolding.txt: collect full case foldings (statuses C and F).
# Foldings to a single codepoint go into `casefold_mapping`; all others are
# recorded as `SpecialCase`s padded with zeros to three codepoints.
url = "#{UCD_ROOT}CaseFolding.txt"
response = HTTP::Client.get(url)
# Fail loudly instead of parsing an error page or an empty redirect body.
raise "Failed to download #{url}: HTTP #{response.status_code}" unless response.success?
body = response.body
body.each_line do |line|
  line = line.strip
  next if line.empty?
  next if line.starts_with?('#')

  pieces = line.split(';')
  codepoint = pieces[0].to_i(16)
  status = pieces[1].strip[0]
  next if status != 'C' && status != 'F' # casefold uses full case folding (C and F)

  casefold = pieces[2].split.map(&.to_i(16))
  if casefold.size == 1
    casefold_mapping[codepoint] = casefold[0]
  else
    while casefold.size < 3
      casefold << 0
    end
    special_cases_casefold << SpecialCase.new(codepoint, casefold)
  end
end

# UnicodeData.txt: build one `Entry` per line and record titlecase mappings
# that differ from the uppercase mapping as special cases.
url = "#{UCD_ROOT}UnicodeData.txt"
response = HTTP::Client.get(url)
# Fail loudly instead of parsing an error page or an empty redirect body.
raise "Failed to download #{url}: HTTP #{response.status_code}" unless response.success?
body = response.body
body.each_line do |line|
  line = line.strip
  next if line.empty?

  pieces = line.split(';')
  codepoint = pieces[0].to_i(16)
  name = pieces[1]
  general_category = pieces[2]
  # don't read CanonicalCombiningClass here; the derived properties file has
  # exact ranges
  decomposition = pieces[5]
  if decomposition.starts_with?('<')
    # A leading `<tag>` marks a compatibility mapping; the codepoints follow it.
    decomposition_mapping = decomposition.partition("> ")[2].split.map(&.to_i(16))
    decomposition_type = DecompositionType::Compatibility
  else
    # An empty field means the codepoint has no decomposition at all.
    decomposition_mapping = decomposition.presence.try &.split.map(&.to_i(16))
    decomposition_type = decomposition_mapping.nil? ? DecompositionType::None : DecompositionType::Canonical
  end
  upcase = pieces[12].to_i?(16)
  downcase = pieces[13].to_i?(16)
  titlecase = pieces[14].to_i?(16)
  casefold = casefold_mapping[codepoint]?
  entries << Entry.new(
    codepoint: codepoint,
    name: name,
    general_category: general_category,
    decomposition_type: decomposition_type,
    decomposition_mapping: decomposition_mapping,
    upcase: upcase,
    downcase: downcase,
    casefold: casefold,
  )
  if titlecase && titlecase != upcase
    special_cases_titlecase << SpecialCase.new(codepoint, [titlecase, 0, 0])
  end
end

# SpecialCasing.txt: record the unconditional multi-codepoint lowercase,
# uppercase and titlecase mappings. Parsing stops at the
# "# Conditional Mappings" section.
url = "#{UCD_ROOT}SpecialCasing.txt"
response = HTTP::Client.get(url)
# Fail loudly instead of parsing an error page or an empty redirect body.
raise "Failed to download #{url}: HTTP #{response.status_code}" unless response.success?
body = response.body
body.each_line do |line|
  line = line.strip
  next if line.empty?
  break if line.starts_with?("# Conditional Mappings")
  next if line.starts_with?('#')

  pieces = line.split(';')
  codepoint = pieces[0].to_i(16)

  # Fields: 1 = lowercase, 2 = titlecase, 3 = uppercase.
  {
    {pieces[1], special_cases_downcase},
    {pieces[3], special_cases_upcase},
    {pieces[2], special_cases_titlecase},
  }.each do |field, special_cases|
    mapping = field.split.map(&.to_i(16))
    # Single-codepoint mappings are not special cases.
    next unless mapping.size > 1

    while mapping.size < 3
      mapping << 0
    end
    special_cases << SpecialCase.new(codepoint, mapping)
  end
end

# extracted/DerivedCombiningClass.txt: collect every codepoint range with a
# non-zero canonical combining class.
url = "#{UCD_ROOT}extracted/DerivedCombiningClass.txt"
response = HTTP::Client.get(url)
# Without this check a failed download would match no line and silently
# produce an empty table.
raise "Failed to download #{url}: HTTP #{response.status_code}" unless response.success?
body = response.body
body.each_line do |line|
  line = line.strip

  # Data lines look like `LOW[..HIGH] ; CCC`; anything else is ignored.
  if m = line.match(/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(\d+)/)
    ccc = m[3].to_u8
    next if ccc == 0
    low = m[1].to_i(16)
    high = m[2]?.try(&.to_i(16)) || low
    canonical_combining_classes << CanonicalCombiningClassRange.new(low, high, ccc)
  end
end

# DerivedNormalizationProps.txt: collect the Full_Composition_Exclusion
# codepoints and the NFC/NFD/NFKC/NFKD quick check ranges. Parsing stops at
# the "Expands_On_NFD" section.
url = "#{UCD_ROOT}DerivedNormalizationProps.txt"
response = HTTP::Client.get(url)
# Without this check a failed download would match no line and silently
# produce empty tables.
raise "Failed to download #{url}: HTTP #{response.status_code}" unless response.success?
body = response.body
body.each_line do |line|
  line = line.strip
  break if line.starts_with?("# Derived Property: Expands_On_NFD")

  if m = line.match(/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*Full_Composition_Exclusion/)
    low = m[1].to_i(16)
    high = m[2]?.try(&.to_i(16)) || low
    (low..high).each { |codepoint| full_composition_exclusions << codepoint }
  elsif m = line.match(/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(NFC|NFD|NFKC|NFKD)_QC\s*;\s*(N|M)/)
    low = m[1].to_i(16)
    high = m[2]?.try(&.to_i(16)) || low
    quick_check = quick_checks[Unicode::NormalizationForm.parse(m[3])]
    # The regex only admits "N" or "M" as the quick check value.
    result = m[4] == "M" ? Unicode::QuickCheckResult::Maybe : Unicode::QuickCheckResult::No
    quick_check << QuickCheckRange.new(low, high, result)
  end
end

# Derive the tables from the parsed data. The resulting top-level variables
# are presumably read by the ECR template rendered at the end of the script
# (the template is not part of this file), so their names matter.

# Downcase ranges are split in two: those with a delta of exactly 1 and the
# rest.
downcase_ranges = case_ranges entries, &.downcase
downcase_one_ranges, downcase_ranges = downcase_ranges.partition { |r| r.delta == 1 }

# Upcase ranges with a delta of -1 are dropped.
# NOTE(review): presumably because they are the counterparts of the delta-1
# downcase ranges represented by `alternate_ranges` - confirm.
upcase_ranges = case_ranges entries, &.upcase
upcase_ranges.select! { |r| r.delta != -1 }

alternate_ranges = alternate_ranges(downcase_one_ranges)

special_cases_downcase.sort_by! &.codepoint
special_cases_upcase.sort_by! &.codepoint
# Titlecase special cases equal to an upcase special case (same codepoint and
# same value) are removed.
special_cases_titlecase.reject! &.in?(special_cases_upcase)
special_cases_titlecase.sort_by! &.codepoint

casefold_ranges = case_ranges entries, &.casefold

# One list of strides per general category. The category string itself is
# passed as `targets`, so `strides` keeps the entries for which
# `category.includes?(entry.general_category)` holds.
all_strides = {} of String => Array(Stride)
categories = %w(Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Cn)

categories.each do |category|
  all_strides[category] = strides entries, category, &.general_category
end

canonical_combining_classes.sort_by! &.low

# Tuples of {codepoint, first, second} for every canonical decomposition;
# `second` is 0 when the mapping has a single codepoint.
canonical_decompositions = entries.compact_map do |entry|
  next unless entry.decomposition_type.canonical?
  mapping = entry.decomposition_mapping.not_nil!
  raise "BUG: Mapping longer than 2 codepoints" unless mapping.size <= 2
  {entry.codepoint, mapping[0], mapping[1]? || 0}
end

# Instead of storing the codepoints for each compatibility decomposition as an
# individual `Array`, we store all of them in a single `Array` and refer to its
# subsequences using index and count.
compatibility_decomposition_data = [] of Int32
compatibility_decompositions = entries.compact_map do |entry|
  next unless entry.decomposition_type.compatibility?
  mapping = entry.decomposition_mapping.not_nil!

  # We try to reuse any existing subsequences in the table that match this
  # entry's decomposition mapping. This reduces the table size by over 40%,
  # mainly due to singleton decompositions. It can be further optimized by
  # solving the shortest common superstring problem.
  index = (0..compatibility_decomposition_data.size - mapping.size).find do |i|
    (0...mapping.size).all? do |j|
      mapping[j] == compatibility_decomposition_data[i + j]
    end
  end
  # No matching subsequence: append the mapping at the end of the table.
  unless index
    index = compatibility_decomposition_data.size
    compatibility_decomposition_data.concat(mapping)
  end

  {entry.codepoint, index, mapping.size}
end

# Reverse lookup for composition: two-codepoint canonical decompositions that
# are not composition exclusions, keyed by both codepoints packed into one
# Int64 (`first` shifted left by 21 bits, `second` in the low bits).
canonical_compositions = canonical_decompositions.compact_map do |codepoint, first, second|
  next if second == 0 || full_composition_exclusions.includes?(codepoint)
  {(first.to_i64 << 21) | second, codepoint}
end

quick_checks.each_value &.sort_by! &.low

# Render the template next to this script, run the result through the Crystal
# formatter and write it to src/unicode/data.cr.
output = ECR.render "#{__DIR__}/unicode_data.ecr"
output = Crystal.format(output)
File.write("#{__DIR__}/../src/unicode/data.cr", output)