File: UnicodeData.swift

package info (click to toggle)
swiftlang 6.0.3-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,519,992 kB
  • sloc: cpp: 9,107,863; ansic: 2,040,022; asm: 1,135,751; python: 296,500; objc: 82,456; f90: 60,502; lisp: 34,951; pascal: 19,946; sh: 18,133; perl: 7,482; ml: 4,937; javascript: 4,117; makefile: 3,840; awk: 3,535; xml: 914; fortran: 619; cs: 573; ruby: 573
file content (189 lines) | stat: -rw-r--r-- 6,756 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

import SwiftShims

// A Unicode scalar paired with a `Unicode._NormData` value. This tuple is the
// element type held in `Unicode._NormDataBuffer.storage`.
internal typealias ScalarAndNormData = (
  scalar: Unicode.Scalar,
  normData: Unicode._NormData
)

extension Unicode {
  // Wraps the raw 16 bit value handed back when a scalar's normalization
  // information is looked up. The bits of that value are laid out like so:
  //
  // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  // └───┬───┘ └──── CCC ────┘ └─┘ │
  //     │                      │  └── NFD_QC
  //     │                      └── NFC_QC
  //     └── Unused
  //
  // NFD_QC: A plain Yes/No answer for the scalar's NFD quick check. Note: Yes
  //         is encoded as 0 rather than 1, so a set bit means No (the
  //         initializer below forces the bit on for precomposed hangul, which
  //         is not NFD_QC).
  //
  // NFC_QC: A Yes/No/Maybe answer for the scalar's NFC quick check. Yes,
  //         encoded as 0, means the scalar can NEVER compose with a scalar
  //         that precedes it. No, encoded as 1, means the scalar can NEVER
  //         appear within a well formed NFC string. Maybe, encoded as 2, means
  //         the scalar could appear within an NFC string, but more information
  //         is needed to decide whether it does. For now, only the Yes/No
  //         distinction matters to us.
  //
  // CCC: The scalar's canonical combining class, which drives the sorting of
  //      the scalars in a normalization segment once NFD has been computed. A
  //      scalar whose CCC is 128 can NEVER come before a scalar whose CCC is
  //      100, unless a normalization boundary separates them.
  //
  internal struct _NormData {
    // The packed bits described above.
    var rawValue: UInt16

    // Adopts an already packed value without looking anything up.
    init(rawValue: UInt16) {
      self.rawValue = rawValue
    }

    // Produces the normalization data for `scalar`. Scalars whose value is
    // strictly below `fastUpperbound` skip the table lookup entirely and get
    // an all-zero value.
    init(_ scalar: Unicode.Scalar, fastUpperbound: UInt32 = 0xC0) {
      let value = scalar.value

      if _fastPath(value < fastUpperbound) {
        // CCC = 0, NFC_QC = Yes, NFD_QC = Yes
        rawValue = 0
        return
      }

      var lookedUp = _swift_stdlib_getNormData(value)

      // Precomposed hangul is not stored in our NFD_QC data, so the lookup
      // reports these scalars as NFD_QC even though they are not. Correct
      // that here.
      let precomposedHangul: ClosedRange<UInt32> = 0xAC00 ... 0xD7A3
      if precomposedHangul.contains(value) {
        // NFD_QC = false
        lookedUp |= 0x1
      }

      rawValue = lookedUp
    }

    // The canonical combining class: drop the three quick check bits, then
    // keep the low eight bits of what remains.
    var ccc: UInt8 {
      let withoutQuickCheckBits = rawValue >> 3
      return UInt8(truncatingIfNeeded: withoutQuickCheckBits)
    }

    // True only when both NFC_QC bits are clear, i.e. NFC_QC is Yes.
    var isNFCQC: Bool {
      let nfcQCBits = rawValue & 0x6
      return nfcQCBits == 0
    }

    // True when the NFD_QC bit is clear, i.e. NFD_QC is Yes.
    var isNFDQC: Bool {
      let nfdQCBit = rawValue & 0x1
      return nfdQCBit == 0
    }
  }
}

extension Unicode {
  // The normalization buffer used by the NFC and NFD iterators. Keeping the
  // buffer bookkeeping (removal, sorting) in this wrapper means the iterators
  // do not have to carry that logic themselves.
  internal struct _NormDataBuffer {
    // The buffered scalars together with their normalization data.
    var storage: [ScalarAndNormData] = []

    // A marker recording that the storage has been fully built and that every
    // element in it must now be emitted. Emission works by reversing the
    // storage once and then popping elements off the back.
    var isReversed: Bool = false

    // Whether the buffer currently holds no elements.
    var isEmpty: Bool {
      return storage.isEmpty
    }

    // The final element of the underlying storage, or nil if there is none.
    var last: ScalarAndNormData? {
      return storage.last
    }

    // Adds an element to the end of the buffer. Must not be called while the
    // buffer is in its reversed (emitting) state.
    mutating func append(_ scalarAndNormData: ScalarAndNormData) {
      _internalInvariant(!isReversed)
      storage.append(scalarAndNormData)
    }

    // Removes and returns the buffer's first element. Note: appending is not
    // safe once this has been called, because the storage is reversed
    // internally for emission and an append would land in the wrong position.
    // Keep calling this function until it returns 'nil' before appending
    // again.
    mutating func next() -> ScalarAndNormData? {
      if storage.isEmpty {
        // Everything has been emitted; appending is permitted again.
        isReversed = false
        return nil
      }

      // The first removal flips the storage so that the front element sits at
      // the back, where it can be removed directly.
      if isReversed == false {
        storage.reverse()
        isReversed = true
      }

      let front = storage.removeLast()
      return front
    }

    // Sorts the whole buffer by canonical combining class.
    mutating func sort() {
      let everything = storage.indices

      storage._insertionSort(within: everything) { lhs, rhs in
        lhs.normData.ccc < rhs.normData.ccc
      }
    }
  }
}

extension Unicode {
  // Wraps the raw 32 bit decomposition entry handed back when a scalar's
  // canonical decomposition is looked up. The bits of that value are laid out
  // like so:
  //
  //          Top 14 bits                   Bottom 18 bits
  //
  // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  // └───────── Index ─────────┘ └───────── Hashed Scalar ─────────┘
  //
  // Index: A direct index into '_swift_stdlib_nfd_decompositions'. It lands on
  //        a size byte giving the overall size of the UTF-8 decomposition
  //        string, and the string itself follows that size byte.
  //
  // Hashed Scalar: Perfect hashing has no knowledge of the original set of
  //                keys it was built from, so the original scalar is kept in
  //                the decomposition entry. That lets us guard against scalars
  //                that happen to hash to the same index.
  //
  internal struct _DecompositionEntry {
    let rawValue: UInt32

    // Looks up the decomposition entry for `scalar`.
    init(_ scalar: Unicode.Scalar) {
      self.rawValue = _swift_stdlib_getDecompositionEntry(scalar.value)
    }

    // The original scalar, held in the bottom 18 bits of this entry.
    var hashedScalar: Unicode.Scalar {
      let bottom18Bits: UInt32 = (1 << 18) - 1
      return Unicode.Scalar(_value: rawValue & bottom18Bits)
    }

    // The index into the decomposition array, held in the top 14 bits.
    var index: Int {
      let top14Bits = rawValue >> 18
      return Int(truncatingIfNeeded: top14Bits)
    }

    // A buffer pointer over the UTF8 decomposition string.
    var utf8: UnsafeBufferPointer<UInt8> {
      let table = _swift_stdlib_nfd_decompositions._unsafelyUnwrappedUnchecked

      // The byte at our index holds the utf8 length of the decomposition.
      let sizeByte = table + index
      let length = Int(truncatingIfNeeded: sizeByte.pointee)

      // Step past the size byte to reach the start of the string itself.
      let firstCodeUnit = sizeByte + 1

      return UnsafeBufferPointer(start: firstCodeUnit, count: length)
    }
  }
}