File: generate_unicode_data.cr

package info (click to toggle)
crystal 1.14.0%2Bdfsg-1
links: PTS, VCS
area: main
in suites: sid, trixie
size: 24,384 kB
sloc: javascript: 6,400; sh: 695; makefile: 269; ansic: 121; python: 105; cpp: 77; xml: 32
file content (357 lines) | stat: -rwxr-xr-x 10,947 bytes
#! /usr/bin/env crystal
#
# This script generates the file src/unicode/data.cr
# that contains compact representations of the UnicodeData.txt
# file from the unicode specification.

require "http/client"
require "ecr"
require "../src/compiler/crystal/formatter"

UCD_ROOT = "http://www.unicode.org/Public/#{Unicode::VERSION}/ucd/"

enum DecompositionType
  None
  Canonical
  Compatibility
end

# Each entry in UnicodeData.txt
# (some info is missing but we don't use it yet)
record Entry,
  codepoint : Int32,
  name : String,
  general_category : String,
  decomposition_type : DecompositionType,
  decomposition_mapping : Array(Int32)?,
  upcase : Int32?,
  downcase : Int32?,
  casefold : Int32?

record SpecialCase,
  codepoint : Int32,
  value : Array(Int32)

record CaseRange, low : Int32, high : Int32, delta : Int32
record AlternateRange, low : Int32, high : Int32
record Stride, low : Int32, high : Int32, stride : Int32
record CanonicalCombiningClassRange, low : Int32, high : Int32, ccc : UInt8
record QuickCheckRange, low : Int32, high : Int32, result : Unicode::QuickCheckResult

def case_ranges(entries, &block)
  ranges = [] of CaseRange
  first_codepoint = nil
  last_codepoint = nil
  first_match = nil
  last_match = nil
  entries.each do |entry|
    codepoint = entry.codepoint
    match = yield entry
    if match
      if last_codepoint == codepoint - 1 && last_match == match - 1
        # Continue streak
      else
        if last_codepoint && last_match
          ranges << CaseRange.new(first_codepoint.not_nil!, last_codepoint, first_match.not_nil! - first_codepoint.not_nil!)
        end
        first_codepoint = codepoint
        first_match = match
      end
    else
      if last_codepoint && last_match
        ranges << CaseRange.new(first_codepoint.not_nil!, last_codepoint, first_match.not_nil! - first_codepoint.not_nil!)
      end
    end

    last_codepoint = codepoint
    last_match = match
  end
  ranges
end

def alternate_ranges(ranges)
  alternate = [] of AlternateRange

  first_codepoint = nil
  last_codepoint = nil

  ranges.each do |range|
    codepoint = range.low
    if last_codepoint == codepoint - 2
      # Continue streak
    else
      if first_codepoint
        alternate << new_alternate_range(first_codepoint, last_codepoint)
      end
      first_codepoint = codepoint
    end

    last_codepoint = codepoint
  end

  if first_codepoint
    alternate << new_alternate_range(first_codepoint, last_codepoint)
  end

  alternate
end

def new_alternate_range(first_codepoint, last_codepoint)
  # The last codepoint is the one for the uppercase letter and we
  # need to also consider the next codepoint for the lowercase one.
  AlternateRange.new(first_codepoint, last_codepoint.not_nil! + 1)
end

def strides(entries, targets, &)
  strides = [] of Stride

  entries = entries.select { |entry| targets.includes?(yield entry) }

  first_entry = nil
  last_entry = nil
  stride = nil

  entries.each do |entry|
    if first_entry
      if last_entry
        current_stride = entry.codepoint - last_entry.codepoint
        if current_stride == stride
          # Continue stride
        else
          if first_entry == last_entry
            stride = current_stride
          else
            stride = 1 if first_entry.name.ends_with?("First>") && last_entry.name.ends_with?("Last>")
            strides << Stride.new(first_entry.codepoint, last_entry.codepoint, stride.not_nil!)
            first_entry = entry
            stride = nil
          end
        end
      end
    else
      first_entry = entry
    end

    last_entry = entry
  end

  if first_entry && last_entry
    if stride
      stride = 1 if first_entry.name.ends_with?("First>") && last_entry.name.ends_with?("Last>")
      strides << Stride.new(first_entry.codepoint, last_entry.codepoint, stride)
    else
      strides << Stride.new(first_entry.codepoint, last_entry.codepoint, 1)
    end
  end

  strides
end

entries = [] of Entry
special_cases_downcase = [] of SpecialCase
special_cases_titlecase = [] of SpecialCase
special_cases_upcase = [] of SpecialCase
special_cases_casefold = [] of SpecialCase
casefold_mapping = Hash(Int32, Int32).new
canonical_combining_classes = [] of CanonicalCombiningClassRange
full_composition_exclusions = Set(Int32).new
quick_checks = Unicode::NormalizationForm.values.to_h { |kind| {kind, Array(QuickCheckRange).new} }

url = "#{UCD_ROOT}CaseFolding.txt"
body = HTTP::Client.get(url).body
body.each_line do |line|
  line = line.strip
  next if line.empty?
  next if line.starts_with?('#')

  pieces = line.split(';')
  codepoint = pieces[0].to_i(16)
  status = pieces[1].strip[0]
  casefold = pieces[2].split.map(&.to_i(16))
  next if status != 'C' && status != 'F' # casefold uses full case folding (C and F)
  if casefold.size == 1
    casefold_mapping[codepoint] = casefold[0]
    casefold = nil
  end
  if casefold
    while casefold.size < 3
      casefold << 0
    end
    special_cases_casefold << SpecialCase.new(codepoint, casefold)
  end
end

url = "#{UCD_ROOT}UnicodeData.txt"
body = HTTP::Client.get(url).body
body.each_line do |line|
  line = line.strip
  next if line.empty?

  pieces = line.split(';')
  codepoint = pieces[0].to_i(16)
  name = pieces[1]
  general_category = pieces[2]
  # don't read CanonicalCombiningClass here; the derived properties file has
  # exact ranges
  decomposition = pieces[5]
  if decomposition.starts_with?('<')
    decomposition_mapping = decomposition.partition("> ")[2].split.map(&.to_i(16))
    decomposition_type = DecompositionType::Compatibility
  else
    decomposition_mapping = decomposition.presence.try &.split.map(&.to_i(16))
    decomposition_type = decomposition_mapping.nil? ? DecompositionType::None : DecompositionType::Canonical
  end
  upcase = pieces[12].to_i?(16)
  downcase = pieces[13].to_i?(16)
  titlecase = pieces[14].to_i?(16)
  casefold = casefold_mapping[codepoint]?
  entries << Entry.new(
    codepoint: codepoint,
    name: name,
    general_category: general_category,
    decomposition_type: decomposition_type,
    decomposition_mapping: decomposition_mapping,
    upcase: upcase,
    downcase: downcase,
    casefold: casefold,
  )
  if titlecase && titlecase != upcase
    special_cases_titlecase << SpecialCase.new(codepoint, [titlecase, 0, 0])
  end
end

url = "#{UCD_ROOT}SpecialCasing.txt"
body = HTTP::Client.get(url).body
body.each_line do |line|
  line = line.strip
  next if line.empty?
  break if line.starts_with?("# Conditional Mappings")
  next if line.starts_with?('#')

  pieces = line.split(';')
  codepoint = pieces[0].to_i(16)

  downcase = pieces[1].split.map(&.to_i(16))
  if downcase.size > 1
    while downcase.size < 3
      downcase << 0
    end
    special_cases_downcase << SpecialCase.new(codepoint, downcase)
  end

  upcase = pieces[3].split.map(&.to_i(16))
  if upcase.size > 1
    while upcase.size < 3
      upcase << 0
    end
    special_cases_upcase << SpecialCase.new(codepoint, upcase)
  end

  titlecase = pieces[2].split.map(&.to_i(16))
  if titlecase.size > 1
    while titlecase.size < 3
      titlecase << 0
    end
    special_cases_titlecase << SpecialCase.new(codepoint, titlecase)
  end
end

url = "#{UCD_ROOT}extracted/DerivedCombiningClass.txt"
body = HTTP::Client.get(url).body
body.each_line do |line|
  line = line.strip

  if m = line.match(/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(\d+)/)
    ccc = m[3].to_u8
    next if ccc == 0
    low = m[1].to_i(16)
    high = m[2]?.try(&.to_i(16)) || low
    canonical_combining_classes << CanonicalCombiningClassRange.new(low, high, ccc)
  end
end

url = "#{UCD_ROOT}DerivedNormalizationProps.txt"
body = HTTP::Client.get(url).body
body.each_line do |line|
  line = line.strip
  break if line.starts_with?("# Derived Property: Expands_On_NFD")

  if m = line.match(/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*Full_Composition_Exclusion/)
    low = m[1].to_i(16)
    high = m[2]?.try(&.to_i(16)) || low
    (low..high).each { |codepoint| full_composition_exclusions << codepoint }
  elsif m = line.match(/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(NFC|NFD|NFKC|NFKD)_QC\s*;\s*(N|M)/)
    low = m[1].to_i(16)
    high = m[2]?.try(&.to_i(16)) || low
    quick_check = quick_checks[Unicode::NormalizationForm.parse(m[3])]
    result = m[4] == "M" ? Unicode::QuickCheckResult::Maybe : Unicode::QuickCheckResult::No
    quick_check << QuickCheckRange.new(low, high, result)
  end
end

downcase_ranges = case_ranges entries, &.downcase
downcase_one_ranges, downcase_ranges = downcase_ranges.partition { |r| r.delta == 1 }

upcase_ranges = case_ranges entries, &.upcase
upcase_ranges.select! { |r| r.delta != -1 }

alternate_ranges = alternate_ranges(downcase_one_ranges)

special_cases_downcase.sort_by! &.codepoint
special_cases_upcase.sort_by! &.codepoint
special_cases_titlecase.reject! &.in?(special_cases_upcase)
special_cases_titlecase.sort_by! &.codepoint

casefold_ranges = case_ranges entries, &.casefold

all_strides = {} of String => Array(Stride)
categories = %w(Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Cn)

categories.each do |category|
  all_strides[category] = strides entries, category, &.general_category
end

canonical_combining_classes.sort_by! &.low

canonical_decompositions = entries.compact_map do |entry|
  next unless entry.decomposition_type.canonical?
  mapping = entry.decomposition_mapping.not_nil!
  raise "BUG: Mapping longer than 2 codepoints" unless mapping.size <= 2
  {entry.codepoint, mapping[0], mapping[1]? || 0}
end

# Instead of storing the codepoints for each compatibility decomposition as an
# individual `Array`, we store all of them in a single `Array` and refer to its
# subsequences using index and count.
compatibility_decomposition_data = [] of Int32
compatibility_decompositions = entries.compact_map do |entry|
  next unless entry.decomposition_type.compatibility?
  mapping = entry.decomposition_mapping.not_nil!

  # We try to reuse any existing subsequences in the table that match this
  # entry's decomposition mapping. This reduces the table size by over 40%,
  # mainly due to singleton decompositions. It can be further optimized by
  # solving the shortest common superstring problem.
  index = (0..compatibility_decomposition_data.size - mapping.size).find do |i|
    (0...mapping.size).all? do |j|
      mapping[j] == compatibility_decomposition_data[i + j]
    end
  end
  unless index
    index = compatibility_decomposition_data.size
    compatibility_decomposition_data.concat(mapping)
  end

  {entry.codepoint, index, mapping.size}
end

canonical_compositions = canonical_decompositions.compact_map do |codepoint, first, second|
  next if second == 0 || full_composition_exclusions.includes?(codepoint)
  {(first.to_i64 << 21) | second, codepoint}
end

quick_checks.each_value &.sort_by! &.low

output = ECR.render "#{__DIR__}/unicode_data.ecr"
output = Crystal.format(output)
File.write("#{__DIR__}/../src/unicode/data.cr", output)