1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187
|
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
import GenUtils
// Parses the contents of the UnicodeData file and returns, for every scalar
// that has a canonical decomposition, the scalar paired with the scalars listed
// in its decomposition field.
//
// Each line in this data file is formatted like the following:
//
// 1B06;BALINESE LETTER AKARA TEDUNG;Lo;0;L;1B05 1B35;;;;N;;;;;
//
// Fields are separated by ';'. Field 0 is the scalar the line describes and
// field 5 (the "1B05 1B35" after the L above) is the space separated list of
// hexadecimal scalars that it decomposes to. Entries are returned in the same
// order as the lines appear in `data`, and the mappings are returned exactly as
// written in the file (they are not expanded any further here).
func getDecompData(
  from data: String
) -> [(UInt32, [UInt32])] {
  return data.split(separator: "\n").compactMap { record -> (UInt32, [UInt32])? in
    let fields = record.split(separator: ";", omittingEmptySubsequences: false)
    let mapping = fields[5]

    // Only NFD matters here, so skip the line when 1. the scalar has no
    // decomposition at all, or 2. the decomposition is tagged ("<...>"), which
    // marks a compatibility form.
    guard !mapping.isEmpty, !mapping.hasPrefix("<") else {
      return nil
    }

    let pieces = mapping.split(separator: " ").map { UInt32($0, radix: 16)! }
    let scalar = UInt32(fields[0], radix: 16)!

    return (scalar, pieces)
  }
}
// Takes the mph for the keys and the scalar to decomposition data, and appends
// everything needed for NFD decomposition lookups to `result` as static C
// arrays: the mph itself, the decomposition bytes, and the index array.
//
// - mph: The hash over the decomposable scalars; `mph.index(for:)` determines
//   the order in which scalars are emitted.
// - data: Each decomposable scalar paired with the scalars listed in its
//   decomposition. These lists may themselves contain scalars that decompose
//   further.
// - result: The string that the generated C source is appended to.
func emitDecomp(
  _ mph: Mph,
  _ data: [(UInt32, [UInt32])],
  into result: inout String
) {
  emitMph(
    mph,
    name: "_swift_stdlib_nfd_decomp",
    defineLabel: "NFD_DECOMP",
    into: &result
  )

  // Index the raw mappings by scalar so that expanding a scalar is a hash
  // lookup instead of a linear scan over all of `data`. Should a scalar be
  // listed more than once, the first mapping wins.
  let rawDecomps: [UInt32: [UInt32]] = Dictionary(
    data,
    uniquingKeysWith: { first, _ in first }
  )

  // Appends the full decomposition of `scalar` to `result`, recursively
  // expanding every scalar that has a mapping of its own.
  func decompose(_ scalar: UInt32, into result: inout [UInt32]) {
    // Scalars in the ASCII range are emitted as is without consulting the
    // table, as are scalars that have no mapping.
    guard scalar > 0x7F, let decomp = rawDecomps[scalar] else {
      result.append(scalar)
      return
    }

    for scalar in decomp {
      decompose(scalar, into: &result)
    }
  }

  // Fix up the decomposed scalars first so that every entry holds its full
  // decomposition.
  var fullData: [(UInt32, [UInt32])] = []
  fullData.reserveCapacity(data.count)

  for (scalar, rawDecomposed) in data {
    var newDecomposed: [UInt32] = []

    for rawScalar in rawDecomposed {
      decompose(rawScalar, into: &newDecomposed)
    }

    fullData.append((scalar, newDecomposed))
  }

  // Order the scalars by the index the mph assigns to them.
  var sortedData: [(UInt32, UInt16)] = fullData.map {
    ($0.0, UInt16(mph.index(for: UInt64($0.0))))
  }

  sortedData.sort { $0.1 < $1.1 }

  let indices = emitDecompDecomp(fullData, sortedData, into: &result)
  emitDecompIndices(indices, into: &result)
}
// Encodes the decompositions as UTF-8, appends them to `result` as the static C
// array `_swift_stdlib_nfd_decomp`, and returns where each scalar's
// decomposition starts within that array.
//
// Every decomposition is stored as a single size byte, holding the number of
// UTF-8 code units that follow, and then the UTF-8 encoding of each decomposed
// scalar in order.
//
// - data: Each scalar paired with the scalars it decomposes to. Every scalar in
//   `sortedData` must have an entry here.
// - sortedData: The scalars in the order they should be processed. Only the
//   scalar of each pair is read.
// - result: The string that the generated C source is appended to.
// - Returns: One (scalar, offset) pair per element of `sortedData`, in the same
//   order, where offset is the position of the decomposition's size byte.
func emitDecompDecomp(
  _ data: [(UInt32, [UInt32])],
  _ sortedData: [(UInt32, UInt16)],
  into result: inout String
) -> [(UInt32, UInt16)] {
  // Index the decompositions by scalar once up front, rather than scanning all
  // of `data` for every scalar. Should a scalar be listed more than once, the
  // first decomposition wins.
  let decompsByScalar: [UInt32: [UInt32]] = Dictionary(
    data,
    uniquingKeysWith: { first, _ in first }
  )

  var indices: [(UInt32, UInt16)] = []
  indices.reserveCapacity(sortedData.count)

  var decompResult: [UInt8] = []

  // Keep a record of decompositions because some scalars share the same
  // decomposition, so instead of emitting it twice, both scalars just point at
  // the same decomposition index.
  var uniqueDecomps: [[UInt32]: UInt16] = [:]

  for (scalar, _) in sortedData {
    guard let decomp = decompsByScalar[scalar] else {
      fatalError(
        "No decomposition for scalar 0x\(String(scalar, radix: 16, uppercase: true))"
      )
    }

    // If we've seen this decomp before, use it.
    if let idx = uniqueDecomps[decomp] {
      indices.append((scalar, idx))
      continue
    }

    // The new entry starts at the current end of the array.
    let sizeIdx = decompResult.count
    let offset = UInt16(sizeIdx)

    indices.append((scalar, offset))
    uniqueDecomps[decomp] = offset

    // This is our NFD decomposition utf8 string count. It starts out at zero
    // and grows as each decomposed scalar is encoded below.
    decompResult.append(0)

    for scalar in decomp {
      let utf8 = Unicode.Scalar(scalar)!.utf8

      decompResult[sizeIdx] += UInt8(utf8.count)
      decompResult.append(contentsOf: utf8)
    }
  }

  result += """
  static const __swift_uint8_t _swift_stdlib_nfd_decomp[\(decompResult.count)] = {
  """

  formatCollection(decompResult, into: &result) { value -> String in
    return "0x\(String(value, radix: 16, uppercase: true))"
  }

  result += "\n};\n\n"

  return indices
}
// Appends the static C array `_swift_stdlib_nfd_decomp_indices` to `result`.
//
// Every element is a single 32 bit value with the scalar stored in the low 18
// bits and the scalar's index into the decomp array stored in the high 14 bits.
//
// - indices: The (scalar, index into the decomp array) pairs, in the order they
//   should be emitted.
// - result: The string that the generated C source is appended to.
func emitDecompIndices(
  _ indices: [(UInt32, UInt16)],
  into result: inout String
) {
  result += """
  static const __swift_uint32_t _swift_stdlib_nfd_decomp_indices[\(indices.count)] = {
  """

  formatCollection(indices, into: &result) { (scalar, idx) -> String in
    // Make sure that these scalars don't exceed past 18 bits. We need the other
    // 14 bits to store the index into decomp array. Although Unicode scalars
    // can go up to 21 bits, none of the higher scalars actually decompose into
    // anything or aren't assigned yet.
    precondition(scalar <= 0x3FFFF, "Scalar does not fit in 18 bits")

    // Likewise, the index has to fit in the 14 bits that are left over.
    // Shifting a larger index would silently drop its high bits and leave the
    // scalar pointing at the wrong decomposition.
    precondition(idx <= 0x3FFF, "Decomposition index does not fit in 14 bits")

    let value = scalar | (UInt32(idx) << 18)

    return "0x\(String(value, radix: 16, uppercase: true))"
  }

  result += "\n};\n\n"
}
|