1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357
|
#! /usr/bin/env crystal
#
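# This script generates the file src/unicode/data.cr,
# which contains compact representations of data files from the
# Unicode Character Database (UnicodeData.txt, CaseFolding.txt,
# SpecialCasing.txt, DerivedCombiningClass.txt and
# DerivedNormalizationProps.txt).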
require "http/client"
require "ecr"
require "../src/compiler/crystal/formatter"
# Base URL that every downloaded data file name is appended to. The directory
# is selected by the version string in `Unicode::VERSION`.
UCD_ROOT = "http://www.unicode.org/Public/#{Unicode::VERSION}/ucd/"
# Kind of decomposition mapping recorded for an entry of UnicodeData.txt,
# derived from the entry's decomposition field.
enum DecompositionType
# The decomposition field is empty.
None
# The decomposition field lists codepoints without a leading `<tag>`.
Canonical
# The decomposition field starts with a `<tag>`.
Compatibility
end
# Each entry in UnicodeData.txt
# (some info is missing but we don't use it yet)
#
# * `codepoint`, `name`, `general_category`: fields 0, 1 and 2 of the line
#   (the codepoint is parsed as hexadecimal).
# * `decomposition_type`, `decomposition_mapping`: parsed from field 5; the
#   mapping is `nil` when that field is empty.
# * `upcase`, `downcase`: fields 12 and 13, `nil` when they are not a
#   hexadecimal number.
# * `casefold`: the single-codepoint folding read from CaseFolding.txt for
#   this codepoint, if any.
record Entry,
codepoint : Int32,
name : String,
general_category : String,
decomposition_type : DecompositionType,
decomposition_mapping : Array(Int32)?,
upcase : Int32?,
downcase : Int32?,
casefold : Int32?
# A case mapping of a single codepoint that is stored individually instead of
# as part of a range. `value` holds the mapped codepoints; the code that builds
# these records pads shorter mappings with `0` up to three elements.
record SpecialCase,
codepoint : Int32,
value : Array(Int32)
# Consecutive codepoints `low..high` whose mapping is the codepoint plus `delta`.
record CaseRange, low : Int32, high : Int32, delta : Int32
# Span built from delta-1 case ranges whose starts are two codepoints apart;
# `high` is one past the start of the last such range.
record AlternateRange, low : Int32, high : Int32
# A run of selected entries from `low` to `high` whose codepoints are `stride`
# apart.
record Stride, low : Int32, high : Int32, stride : Int32
# Codepoints `low..high` sharing the (non-zero) canonical combining class `ccc`.
record CanonicalCombiningClassRange, low : Int32, high : Int32, ccc : UInt8
# Codepoints `low..high` sharing the quick-check `result` for one normalization form.
record QuickCheckRange, low : Int32, high : Int32, result : Unicode::QuickCheckResult
# Compresses per-codepoint case mappings into `CaseRange`s.
#
# The block is called with each entry and must return the codepoint the entry
# maps to, or `nil` when the entry has no such mapping. Entries are expected in
# ascending codepoint order.
#
# A streak is a run of entries whose codepoints and mapped codepoints both
# increase by exactly one from entry to entry, so every codepoint in the streak
# is the same distance (`delta`) away from its mapping. Each streak becomes one
# `CaseRange.new(first_codepoint, last_codepoint, delta)`.
#
# Returns the ranges in the order the streaks were found.
def case_ranges(entries, &)
  ranges = [] of CaseRange

  # Start of the streak currently being built.
  first_codepoint = nil
  first_match = nil

  # Codepoint and mapping of the previous entry; `last_match` is nil when the
  # previous entry had no mapping, i.e. when no streak is open.
  last_codepoint = nil
  last_match = nil

  entries.each do |entry|
    codepoint = entry.codepoint
    match = yield entry

    continues_streak = match && last_codepoint == codepoint - 1 && last_match == match - 1

    unless continues_streak
      # Close the streak that ended on the previous entry, if there was one.
      if last_codepoint && last_match
        ranges << CaseRange.new(first_codepoint.not_nil!, last_codepoint, first_match.not_nil! - first_codepoint.not_nil!)
      end

      # A mapped entry that does not extend the previous streak starts a new one.
      if match
        first_codepoint = codepoint
        first_match = match
      end
    end

    last_codepoint = codepoint
    last_match = match
  end

  # Close a streak that is still open after the final entry; without this the
  # last range would be lost whenever the final entry has a mapping.
  if last_codepoint && last_match
    ranges << CaseRange.new(first_codepoint.not_nil!, last_codepoint, first_match.not_nil! - first_codepoint.not_nil!)
  end

  ranges
end
# Merges case ranges whose `low` codepoints are exactly two apart into
# `AlternateRange`s.
#
# Only `range.low` is looked at. Each run of ranges spaced two codepoints apart
# is turned into a single alternate range via `new_alternate_range`.
def alternate_ranges(ranges)
  result = [] of AlternateRange
  streak_start = nil
  previous = nil

  ranges.each do |range|
    low = range.low

    # Anything other than a gap of exactly two ends the current streak.
    unless previous == low - 2
      if streak_start
        result << new_alternate_range(streak_start, previous)
      end
      streak_start = low
    end

    previous = low
  end

  # Emit the streak that was still open when the input ran out.
  if streak_start
    result << new_alternate_range(streak_start, previous)
  end

  result
end
# Builds the `AlternateRange` for a streak that starts at `first_codepoint` and
# whose last uppercase codepoint is `last_codepoint`.
def new_alternate_range(first_codepoint, last_codepoint)
  # `last_codepoint` is the uppercase letter of the final pair; its lowercase
  # counterpart is the next codepoint, so the range extends one further.
  upper_bound = last_codepoint.not_nil! + 1
  AlternateRange.new(first_codepoint, upper_bound)
end
# Compresses the entries selected by the block into `Stride`s.
#
# An entry is kept when `targets.includes?` is true for the value the block
# yields for it. Kept entries are then walked in order, and runs whose
# codepoints are a constant distance apart are emitted as one
# `Stride.new(first codepoint, last codepoint, distance)`.
#
# Returns the strides in the order they were found.
def strides(entries, targets, &)
strides = [] of Stride
entries = entries.select { |entry| targets.includes?(yield entry) }
# First and most recent entry of the run being built.
first_entry = nil
last_entry = nil
# Distance between codepoints of the current run; nil until the run has
# a second entry.
stride = nil
entries.each do |entry|
if first_entry
if last_entry
current_stride = entry.codepoint - last_entry.codepoint
if current_stride == stride
# Continue stride
else
if first_entry == last_entry
# The run has a single entry so far: this entry fixes its stride.
stride = current_stride
else
# A "<..., First>" / "<..., Last>" pair is emitted with stride 1
# regardless of the distance between the two entries.
stride = 1 if first_entry.name.ends_with?("First>") && last_entry.name.ends_with?("Last>")
strides << Stride.new(first_entry.codepoint, last_entry.codepoint, stride.not_nil!)
# The entry that broke the run starts the next one.
first_entry = entry
stride = nil
end
end
end
else
first_entry = entry
end
last_entry = entry
end
# Emit the run that was still open when the entries ran out.
if first_entry && last_entry
if stride
stride = 1 if first_entry.name.ends_with?("First>") && last_entry.name.ends_with?("Last>")
strides << Stride.new(first_entry.codepoint, last_entry.codepoint, stride)
else
# A run of a single entry has no stride yet; it is emitted with stride 1.
strides << Stride.new(first_entry.codepoint, last_entry.codepoint, 1)
end
end
strides
end
# Accumulators filled while parsing the downloaded data files below.

# One record per line of UnicodeData.txt.
entries = [] of Entry
# Multi-codepoint mappings from SpecialCasing.txt (titlecase also receives
# single-codepoint titlecase mappings from UnicodeData.txt that differ from
# the uppercase mapping).
special_cases_downcase = [] of SpecialCase
special_cases_titlecase = [] of SpecialCase
special_cases_upcase = [] of SpecialCase
# Multi-codepoint foldings from CaseFolding.txt.
special_cases_casefold = [] of SpecialCase
# Single-codepoint foldings from CaseFolding.txt, keyed by codepoint.
casefold_mapping = Hash(Int32, Int32).new
canonical_combining_classes = [] of CanonicalCombiningClassRange
full_composition_exclusions = Set(Int32).new
# One (initially empty) list of quick-check ranges per normalization form.
quick_checks = Unicode::NormalizationForm.values.to_h { |kind| {kind, Array(QuickCheckRange).new} }
# CaseFolding.txt: collect the full case foldings (statuses C and F).
# Foldings to a single codepoint go into `casefold_mapping`; longer ones are
# zero-padded to three codepoints and stored as special cases.
url = "#{UCD_ROOT}CaseFolding.txt"
body = HTTP::Client.get(url).body
body.each_line do |line|
  line = line.strip
  next if line.empty? || line.starts_with?('#')

  fields = line.split(';')
  codepoint = fields[0].to_i(16)
  status = fields[1].strip[0]
  folded = fields[2].split.map(&.to_i(16))

  # casefold uses full case folding (C and F)
  next unless status == 'C' || status == 'F'

  if folded.size == 1
    casefold_mapping[codepoint] = folded[0]
  else
    until folded.size >= 3
      folded << 0
    end
    special_cases_casefold << SpecialCase.new(codepoint, folded)
  end
end
# UnicodeData.txt: build one `Entry` per line, and record single-codepoint
# titlecase mappings that differ from the uppercase mapping.
url = "#{UCD_ROOT}UnicodeData.txt"
body = HTTP::Client.get(url).body
body.each_line do |line|
  line = line.strip
  next if line.empty?

  fields = line.split(';')
  codepoint = fields[0].to_i(16)
  name = fields[1]
  general_category = fields[2]

  # The canonical combining class is deliberately not read here; the derived
  # properties file has exact ranges.
  raw_decomposition = fields[5]
  if raw_decomposition.starts_with?('<')
    # "<tag> XXXX YYYY": everything after the tag is the mapping.
    mapping = raw_decomposition.partition("> ")[2].split.map(&.to_i(16))
    kind = DecompositionType::Compatibility
  else
    mapping = raw_decomposition.presence.try &.split.map(&.to_i(16))
    kind = mapping ? DecompositionType::Canonical : DecompositionType::None
  end

  upcase = fields[12].to_i?(16)
  downcase = fields[13].to_i?(16)
  titlecase = fields[14].to_i?(16)

  entries << Entry.new(
    codepoint: codepoint,
    name: name,
    general_category: general_category,
    decomposition_type: kind,
    decomposition_mapping: mapping,
    upcase: upcase,
    downcase: downcase,
    casefold: casefold_mapping[codepoint]?,
  )

  next unless titlecase
  next if titlecase == upcase
  special_cases_titlecase << SpecialCase.new(codepoint, [titlecase, 0, 0])
end
# SpecialCasing.txt: collect the unconditional mappings that expand to more
# than one codepoint. Parsing stops at the "Conditional Mappings" section.
url = "#{UCD_ROOT}SpecialCasing.txt"
body = HTTP::Client.get(url).body
body.each_line do |line|
  line = line.strip
  next if line.empty?
  break if line.starts_with?("# Conditional Mappings")
  next if line.starts_with?('#')

  fields = line.split(';')
  codepoint = fields[0].to_i(16)

  # Field index of each mapping paired with the list that receives it,
  # processed in the order lower, upper, title.
  destinations = [
    {1, special_cases_downcase},
    {3, special_cases_upcase},
    {2, special_cases_titlecase},
  ]

  destinations.each do |field_index, special_cases|
    mapping = fields[field_index].split.map(&.to_i(16))
    next if mapping.size <= 1

    # Zero-pad to three codepoints.
    until mapping.size >= 3
      mapping << 0
    end
    special_cases << SpecialCase.new(codepoint, mapping)
  end
end
# DerivedCombiningClass.txt: collect every codepoint range whose canonical
# combining class is not zero.
url = "#{UCD_ROOT}extracted/DerivedCombiningClass.txt"
body = HTTP::Client.get(url).body
body.each_line do |line|
  m = line.strip.match(/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(\d+)/)
  next unless m

  ccc = m[3].to_u8
  next if ccc == 0

  # A line names either a single codepoint or a "low..high" range.
  low = m[1].to_i(16)
  high = m[2]?.try(&.to_i(16)) || low
  canonical_combining_classes << CanonicalCombiningClassRange.new(low, high, ccc)
end
# DerivedNormalizationProps.txt: collect the full composition exclusions and
# the quick-check ranges of every normalization form. Parsing stops at the
# Expands_On_NFD section.
url = "#{UCD_ROOT}DerivedNormalizationProps.txt"
body = HTTP::Client.get(url).body
body.each_line do |line|
  line = line.strip
  break if line.starts_with?("# Derived Property: Expands_On_NFD")

  if exclusion = line.match(/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*Full_Composition_Exclusion/)
    first = exclusion[1].to_i(16)
    last = exclusion[2]?.try(&.to_i(16)) || first
    (first..last).each do |codepoint|
      full_composition_exclusions.add(codepoint)
    end
  elsif qc = line.match(/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(NFC|NFD|NFKC|NFKD)_QC\s*;\s*(N|M)/)
    first = qc[1].to_i(16)
    last = qc[2]?.try(&.to_i(16)) || first
    ranges_for_form = quick_checks[Unicode::NormalizationForm.parse(qc[3])]
    result =
      if qc[4] == "M"
        Unicode::QuickCheckResult::Maybe
      else
        Unicode::QuickCheckResult::No
      end
    ranges_for_form << QuickCheckRange.new(first, last, result)
  end
end
# Build the case-mapping range tables from the parsed entries.
downcase_ranges = case_ranges entries, &.downcase
# Ranges with delta 1 are split off; they only feed `alternate_ranges` below.
downcase_one_ranges, downcase_ranges = downcase_ranges.partition { |r| r.delta == 1 }
upcase_ranges = case_ranges entries, &.upcase
# NOTE(review): delta -1 ranges are dropped here, presumably because the
# alternate ranges already describe those pairs - confirm against the template.
upcase_ranges.select! { |r| r.delta != -1 }
alternate_ranges = alternate_ranges(downcase_one_ranges)
special_cases_downcase.sort_by! &.codepoint
special_cases_upcase.sort_by! &.codepoint
# Drop titlecase special cases that are equal to an upcase special case.
special_cases_titlecase.reject! &.in?(special_cases_upcase)
special_cases_titlecase.sort_by! &.codepoint
casefold_ranges = case_ranges entries, &.casefold
# Strides of the entries of each general category, keyed by category name.
all_strides = {} of String => Array(Stride)
categories = %w(Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Cn)
categories.each do |category|
all_strides[category] = strides entries, category, &.general_category
end
canonical_combining_classes.sort_by! &.low
# One `{codepoint, first, second}` tuple per entry with a canonical
# decomposition; `second` is 0 when the mapping has a single codepoint.
canonical_decompositions = entries.compact_map do |entry|
next unless entry.decomposition_type.canonical?
mapping = entry.decomposition_mapping.not_nil!
# The tuple layout below only has room for two codepoints.
raise "BUG: Mapping longer than 2 codepoints" unless mapping.size <= 2
{entry.codepoint, mapping[0], mapping[1]? || 0}
end
# Instead of storing the codepoints for each compatibility decomposition as an
# individual `Array`, we store all of them in a single `Array` and refer to its
# subsequences using index and count.
compatibility_decomposition_data = [] of Int32
# One `{codepoint, index, count}` tuple per entry with a compatibility
# decomposition, where `index` and `count` locate the mapping inside
# `compatibility_decomposition_data`.
compatibility_decompositions = entries.compact_map do |entry|
next unless entry.decomposition_type.compatibility?
mapping = entry.decomposition_mapping.not_nil!
# We try to reuse any existing subsequences in the table that match this
# entry's decomposition mapping. This reduces the table size by over 40%,
# mainly due to singleton decompositions. It can be further optimized by
# solving the shortest common superstring problem.
index = (0..compatibility_decomposition_data.size - mapping.size).find do |i|
(0...mapping.size).all? do |j|
mapping[j] == compatibility_decomposition_data[i + j]
end
end
# No existing subsequence matches: append the mapping at the end of the table.
unless index
index = compatibility_decomposition_data.size
compatibility_decomposition_data.concat(mapping)
end
{entry.codepoint, index, mapping.size}
end
# Reverse of the two-codepoint canonical decompositions, skipping codepoints
# listed as full composition exclusions. The key packs both codepoints into one
# Int64: `first` shifted left by 21 bits, OR-ed with `second`
# (assumes `second` fits in 21 bits, as any Unicode codepoint does).
canonical_compositions = canonical_decompositions.compact_map do |codepoint, first, second|
next if second == 0 || full_composition_exclusions.includes?(codepoint)
{(first.to_i64 << 21) | second, codepoint}
end
quick_checks.each_value &.sort_by! &.low
# Render the template next to this script, run the result through the Crystal
# formatter, and write it to src/unicode/data.cr.
# NOTE(review): the template presumably reads the local variables defined
# above by name - confirm before renaming any of them.
output = ECR.render "#{__DIR__}/unicode_data.ecr"
output = Crystal.format(output)
File.write("#{__DIR__}/../src/unicode/data.cr", output)
|