File: Decomp.swift

package info (click to toggle)
swiftlang 6.0.3-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,519,992 kB
  • sloc: cpp: 9,107,863; ansic: 2,040,022; asm: 1,135,751; python: 296,500; objc: 82,456; f90: 60,502; lisp: 34,951; pascal: 19,946; sh: 18,133; perl: 7,482; ml: 4,937; javascript: 4,117; makefile: 3,840; awk: 3,535; xml: 914; fortran: 619; cs: 573; ruby: 573
file content (187 lines) | stat: -rw-r--r-- 5,325 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

import GenUtils

// Parses the contents of the UnicodeData file and returns the flattened list
// of scalars paired with their canonical decompositions.
//
// Each line in this data file is formatted like the following:
//
//     1B06;BALINESE LETTER AKARA TEDUNG;Lo;0;L;1B05 1B35;;;;N;;;;;
//
// Fields are separated by ';'. The first field (index 0) is the scalar that
// the line describes, and the field after the L (index 5) holds the
// space-separated scalars that it decomposes to, here 1B05 1B35. Those two
// fields are all that is needed for decomposition data.
//
// - Parameter data: The text of the UnicodeData file. Lines may be terminated
//   by "\n" or by "\r\n".
// - Returns: One (scalar, decomposition) pair, in file order, for every line
//   that carries a canonical decomposition. Lines whose decomposition field is
//   empty or starts with "<", and lines with fewer than six fields, are
//   skipped.
//
// Traps if a scalar field that has to be parsed is not valid hexadecimal.
func getDecompData(
  from data: String
) -> [(UInt32, [UInt32])] {
  // Converts one hexadecimal field into its numeric value, stopping the
  // program with the offending text if the field is malformed.
  func parseScalar(_ field: Substring) -> UInt32 {
    guard let value = UInt32(field, radix: 16) else {
      fatalError("Malformed hexadecimal scalar '\(field)' in UnicodeData")
    }

    return value
  }

  var unflattened: [(UInt32, [UInt32])] = []

  // Split on any newline character rather than on "\n" alone: Swift treats
  // "\r\n" as a single Character that does not compare equal to "\n", so data
  // with CRLF line endings would otherwise never be broken into lines.
  for line in data.split(whereSeparator: \.isNewline) {
    let components = line.split(separator: ";", omittingEmptySubsequences: false)

    // A line this short has no decomposition field to read.
    guard components.count > 5 else {
      continue
    }

    let decomp = components[5]

    // We either 1. don't have a decomposition, or 2. the decomposition is for
    // a compatibility form (it starts with a "<tag>"). We only care about NFD,
    // so ignore these cases.
    guard !decomp.isEmpty, !decomp.hasPrefix("<") else {
      continue
    }

    let decomposedScalars = decomp.split(separator: " ").map(parseScalar)
    let scalar = parseScalar(components[0])

    unflattened.append((scalar, decomposedScalars))
  }

  return unflattened
}

// Takes an mph for the keys and the data values and writes the required data
// into static C arrays appended to `result`.
//
// The steps, in order:
//   1. `emitMph` is called with the name "_swift_stdlib_nfd_decomp" and the
//      define label "NFD_DECOMP".
//   2. Every decomposition in `data` is expanded recursively so that it only
//      contains scalars that have no decomposition of their own.
//   3. The scalars are ordered by the index that `mph` assigns them.
//   4. The decomposition bytes and the per-scalar indices are emitted through
//      `emitDecompDecomp` and `emitDecompIndices`.
//
// - Parameters:
//   - mph: The hash used to order the scalars (presumably a minimal perfect
//     hash built over the same scalars; confirm against the caller).
//   - data: (scalar, decomposition) pairs, e.g. as returned by
//     `getDecompData`.
//   - result: The string that the generated C source is appended to.
func emitDecomp(
  _ mph: Mph,
  _ data: [(UInt32, [UInt32])],
  into result: inout String
) {
  emitMph(
    mph,
    name: "_swift_stdlib_nfd_decomp",
    defineLabel: "NFD_DECOMP",
    into: &result
  )

  // Fix up the decomposed scalars first so that every entry holds its full
  // decomposition.

  // Index the raw decompositions by scalar so that each step of the recursive
  // expansion is a hash lookup rather than a linear scan over `data`. Should a
  // scalar be listed more than once, the first entry wins, which is what a
  // front-to-back search would find.
  let rawDecomps: [UInt32: [UInt32]] = Dictionary(
    data,
    uniquingKeysWith: { first, _ in first }
  )

  // Appends the full decomposition of `scalar` to `result`. ASCII scalars are
  // always appended unchanged without consulting the table, as is any scalar
  // that has no entry.
  func decompose(_ scalar: UInt32, into result: inout [UInt32]) {
    guard scalar > 0x7F, let parts = rawDecomps[scalar] else {
      result.append(scalar)
      return
    }

    for part in parts {
      decompose(part, into: &result)
    }
  }

  let fullyDecomposed = data.map { entry -> (UInt32, [UInt32]) in
    var expanded: [UInt32] = []

    for rawScalar in entry.1 {
      decompose(rawScalar, into: &expanded)
    }

    return (entry.0, expanded)
  }

  // Pair every scalar with its mph index and order the scalars by that index.
  var sortedData: [(UInt32, UInt16)] = []

  for (scalar, _) in fullyDecomposed {
    sortedData.append((scalar, UInt16(mph.index(for: UInt64(scalar)))))
  }

  sortedData.sort { $0.1 < $1.1 }

  let indices = emitDecompDecomp(fullyDecomposed, sortedData, into: &result)
  emitDecompIndices(indices, into: &result)
}

// Emits the `_swift_stdlib_nfd_decomp` byte array and returns, for every
// scalar, the offset of its decomposition within that array.
//
// Each distinct decomposition is stored once as a count byte followed by the
// UTF-8 encoding of its scalars; the count byte holds the number of UTF-8 code
// units that follow it.
//
// - Parameters:
//   - data: (scalar, decomposition) pairs to look decompositions up in.
//   - sortedData: The scalars in the order they should be processed. Only the
//     scalar of each pair is read here.
//   - result: The string that the generated C source is appended to.
// - Returns: One (scalar, offset) pair per element of `sortedData`, in the
//   same order.
//
// Traps if a scalar in `sortedData` has no entry in `data`, if a decomposition
// contains a value that is not a valid Unicode scalar, if a single
// decomposition needs more than 255 UTF-8 code units, or if an offset does not
// fit in a UInt16.
func emitDecompDecomp(
  _ data: [(UInt32, [UInt32])],
  _ sortedData: [(UInt32, UInt16)],
  into result: inout String
) -> [(UInt32, UInt16)] {
  var indices: [(UInt32, UInt16)] = []
  indices.reserveCapacity(sortedData.count)

  var decompResult: [UInt8] = []

  // Look decompositions up by scalar through a dictionary instead of scanning
  // `data` once per scalar. Should a scalar be listed more than once, the
  // first entry wins, which is what a front-to-back search would find.
  let decompsByScalar: [UInt32: [UInt32]] = Dictionary(
    data,
    uniquingKeysWith: { first, _ in first }
  )

  // Keep a record of decompositions because some scalars share the same
  // decomposition, so instead of emitting it twice, both scalars just point at
  // the same decomposition index.
  var uniqueDecomps: [[UInt32]: UInt16] = [:]

  for (scalar, _) in sortedData {
    guard let decomp = decompsByScalar[scalar] else {
      fatalError(
        "No decomposition for scalar 0x\(String(scalar, radix: 16, uppercase: true))"
      )
    }

    // If we've seen this decomp before, use it.
    if let idx = uniqueDecomps[decomp] {
      indices.append((scalar, idx))
      continue
    }

    // This decomposition starts at the current end of the byte array.
    let offset = UInt16(decompResult.count)
    let sizeIdx = Int(offset)

    indices.append((scalar, offset))
    uniqueDecomps[decomp] = offset

    // This is our NFD decomposition utf8 string count. It starts at zero and
    // is accumulated below as each scalar is encoded.
    decompResult.append(0)

    for rawScalar in decomp {
      guard let realScalar = Unicode.Scalar(rawScalar) else {
        fatalError(
          "Invalid Unicode scalar 0x\(String(rawScalar, radix: 16, uppercase: true))"
        )
      }

      let utf8 = realScalar.utf8

      decompResult[sizeIdx] += UInt8(utf8.count)
      decompResult.append(contentsOf: utf8)
    }
  }

  result += """
  static const __swift_uint8_t _swift_stdlib_nfd_decomp[\(decompResult.count)] = {

  """

  formatCollection(decompResult, into: &result) { value -> String in
    return "0x\(String(value, radix: 16, uppercase: true))"
  }

  result += "\n};\n\n"

  return indices
}

// Emits the `_swift_stdlib_nfd_decomp_indices` array, which packs each scalar
// together with the index of its decomposition into a single 32-bit value.
//
// The scalar occupies the low 18 bits and the index the high 14 bits.
//
// - Parameters:
//   - indices: (scalar, index) pairs, e.g. as returned by `emitDecompDecomp`.
//   - result: The string that the generated C source is appended to.
//
// Traps if a scalar needs more than 18 bits or an index needs more than 14
// bits, because such a pair cannot be packed without losing information.
func emitDecompIndices(
  _ indices: [(UInt32, UInt16)],
  into result: inout String
) {
  result += """
  static const __swift_uint32_t _swift_stdlib_nfd_decomp_indices[\(indices.count)] = {

  """

  formatCollection(indices, into: &result) { (scalar, idx) -> String in
    // Make sure that these scalars don't exceed 18 bits. We need the other
    // 14 bits to store the index into the decomp array. Although Unicode
    // scalars can go up to 21 bits, none of the higher scalars actually
    // decompose into anything, or they aren't assigned yet.
    //
    // These are checked with `precondition` so that they also hold in
    // optimized builds, where `assert` is not evaluated.
    precondition(
      scalar <= 0x3FFFF,
      "Scalar 0x\(String(scalar, radix: 16, uppercase: true)) does not fit in 18 bits"
    )

    // Likewise, the index has to fit in the remaining 14 bits: the shift below
    // discards any bits that are moved past bit 31 instead of trapping.
    precondition(
      idx <= 0x3FFF,
      "Decomposition index \(idx) does not fit in 14 bits"
    )

    let value = scalar | (UInt32(idx) << 18)

    return "0x\(String(value, radix: 16, uppercase: true))"
  }

  result += "\n};\n\n"
}