File: NFC.swift

package info (click to toggle)
swiftlang 6.0.3-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,519,992 kB
  • sloc: cpp: 9,107,863; ansic: 2,040,022; asm: 1,135,751; python: 296,500; objc: 82,456; f90: 60,502; lisp: 34,951; pascal: 19,946; sh: 18,133; perl: 7,482; ml: 4,937; javascript: 4,117; makefile: 3,840; awk: 3,535; xml: 914; fortran: 619; cs: 573; ruby: 573
file content (223 lines) | stat: -rw-r--r-- 8,323 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

import SwiftShims

extension Unicode {
  /// A lightweight wrapper around a string-like value whose scalars are to be
  /// produced in NFC form.
  ///
  /// The wrapper only stores `base`; no work is done until it is iterated.
  internal struct _InternalNFC<S> where S: StringProtocol {
    /// The string (or substring) being wrapped.
    let base: S
  }
}

extension Unicode._InternalNFC {
  /// The state needed to turn a stream of decomposed scalars into composed
  /// ones.
  internal struct Iterator {
    // Scalars that followed the current composee but did not compose with it.
    // They are emitted, in order, after the composee itself has been returned.
    var buffer: Unicode._NormDataBuffer = .init()

    // The starter that following scalars are currently being composed into.
    //
    // Take "e\u{301}" as an example: 'e' is a starter, so it becomes the
    // composee and we advance. We then try to compose 'e' with '\u{301}',
    // which succeeds, so the composee is replaced by 'é'. Any scalars after
    // that are tried against 'é' in the same way.
    var composee: Unicode.Scalar? = nil

    // The source of decomposed scalars (and their normalization data) that
    // this iterator consumes.
    var iterator: Unicode._InternalNFD<S>.Iterator
  }
}

extension Unicode._InternalNFC.Iterator: IteratorProtocol {
  /// Attempts to compose the pair (`x`, `y`) into a single scalar.
  ///
  /// - Parameters:
  ///   - x: The left-hand scalar (the current composee).
  ///   - y: The right-hand scalar being merged into `x`.
  /// - Returns: The composed scalar, or `nil` when the pair has no
  ///   composition.
  internal func compose(
    _ x: Unicode.Scalar,
    and y: Unicode.Scalar
  ) -> Unicode.Scalar? {
    // Fast path: anything below U+0300 (ASCII and some Latin scalars) never
    // composes when it appears on the right-hand side.
    let rhsIsBelowCombiningRange = y.value < 0x300
    if _fastPath(rhsIsBelowCombiningRange) {
      return nil
    }

    // Hangul syllables are composed arithmetically rather than via the table.
    if let syllable = composeHangul(x, and: y) {
      return syllable
    }

    // Everything else goes through the composition table; `.max` is the
    // "no composition" sentinel.
    let lookedUp = _swift_stdlib_getComposition(x.value, y.value)
    return lookedUp == .max ? nil : Unicode.Scalar(_value: lookedUp)
  }

  /// Arithmetically composes a Hangul pair.
  ///
  /// Two shapes are recognised:
  ///   * (L, V)  -> LV   leading consonant + vowel
  ///   * (LV, T) -> LVT  LV syllable + tail consonant
  ///
  /// - Parameters:
  ///   - x: The left-hand scalar (an L jamo or a precomposed LV syllable).
  ///   - y: The right-hand scalar (a V jamo or a T jamo).
  /// - Returns: The composed syllable, or `nil` if the pair is neither shape.
  @inline(never)
  internal func composeHangul(
    _ x: Unicode.Scalar,
    and y: Unicode.Scalar
  ) -> Unicode.Scalar? {
    // Leading consonants (L).
    let lBase: UInt32 = 0x1100
    let lCount: UInt32 = 19
    // Vowels (V).
    let vBase: UInt32 = 0x1161
    let vCount: UInt32 = 21
    // Tail consonants (T). The base itself is one below the first real tail
    // consonant, which is why the range check further down starts at base + 1.
    let tBase: UInt32 = 0x11A7
    let tCount: UInt32 = 28
    // Number of precomposed syllables sharing one leading consonant.
    let nCount: UInt32 = 588
    // Precomposed syllables (S).
    let sBase: UInt32 = 0xAC00
    let sCount: UInt32 = 11172

    let lhs = x.value
    let rhs = y.value

    // (L, V) -> LV
    let lhsIsLeading = (lBase ..< lBase &+ lCount).contains(lhs)
    if lhsIsLeading, (vBase ..< vBase &+ vCount).contains(rhs) {
      let leadingIndex = lhs &- lBase
      let vowelIndex = rhs &- vBase
      let syllableIndex = leadingIndex &* nCount &+ vowelIndex &* tCount
      return Unicode.Scalar(_value: sBase &+ syllableIndex)
    }

    // (LV, T) -> LVT. Only syllables without a tail (index divisible by the
    // tail count) can accept one.
    let lhsIsSyllable = (sBase ..< sBase &+ sCount).contains(lhs)
    if lhsIsSyllable,
       (tBase &+ 1 ..< tBase &+ tCount).contains(rhs),
       (lhs &- sBase) % tCount == 0 {
      return Unicode.Scalar(_value: lhs &+ rhs &- tBase)
    }

    return nil
  }

  /// Produces the next scalar of the composed output.
  ///
  /// Each call first drains any scalars left in `buffer`. Once the buffer is
  /// empty it pulls scalars from the underlying `iterator`, trying to fold
  /// each one into `composee`. A scalar is returned as soon as the current
  /// composee is known to be final (a new starter was seen) or the input is
  /// exhausted.
  ///
  /// - Returns: The next output scalar, or `nil` when the buffer, the
  ///   composee, and the underlying iterator have nothing left to produce.
  internal mutating func next() -> Unicode.Scalar? {
    // Empty out our buffer before attempting to compose anything with our new
    // composee. These are scalars that failed to compose with the composee
    // that was returned on an earlier call.
    if let nextBuffered = buffer.next() {
      return nextBuffered.scalar
    }

    while let current = iterator.next() {
      guard let currentComposee = composee else {
        // If we don't have a composee at this point, we're most likely looking
        // at the start of a string. If our class is 0, then attempt to compose
        // the following scalars with this one. Otherwise, it's a one off scalar
        // that needs to be emitted.
        if current.normData.ccc == 0 {
          composee = current.scalar
          continue
        } else {
          return current.scalar
        }
      }

      // If we have any scalars in the buffer, it means those scalars couldn't
      // compose with our composee to form a new scalar. However, scalars
      // following them may still compose with our composee, so take the last
      // scalar in the buffer and get its normalization data so that we can
      // perform the check underneath this one about whether this current scalar
      // is "blocked". We get the last scalar because the scalars we receive are
      // already NFD, so the last scalar in the buffer will have the highest
      // CCC value in this normalization segment.
      guard let lastBufferedNormData = buffer.last?.normData else {
        // If we do not have any scalars in our buffer yet, then this step is
        // trivial. Attempt to compose our current scalar with whatever composee
        // we're currently building up.

        // If our right hand side scalar IS NFC_QC, then that means it can
        // never compose with any scalars previous to it. So, if our current
        // scalar is NFC_QC, then we have no composition.
        // NOTE(review): `isNFCQC` presumably corresponds to
        // NFC_Quick_Check == Yes; confirm against the `_NormData` definition.
        guard !current.normData.isNFCQC,
            let composed = compose(currentComposee, and: current.scalar) else {
          // We did not find a composition between the two. If our current class
          // is 0, then set that as the new composee and return whatever built
          // up scalar we have. Otherwise, add our current scalar to the buffer
          // so that it is emitted after the composee.

          if current.normData.ccc == 0 {
            composee = current.scalar
            return currentComposee
          }

          buffer.append(current)
          continue
        }

        // We found a composition! Record it as our new composee and repeat the
        // process.
        composee = composed
        continue
      }

      // Check that our current scalar is not blocked from our current
      // composee. Here "blocked" means the most recently buffered scalar has a
      // class (lastBufferedNormData.ccc) that is >= current.normData.ccc; a
      // current scalar of class 0 (a starter) is therefore always blocked.
      //
      // Example:
      //
      //     "z\u{0335}\u{0327}\u{0324}\u{0301}"
      //
      // In this example, there are several combining marks following a 'z', but
      // none of them actually compose with the composee 'z'. However, the last
      // scalar U+0301 does actually compose. So this check makes sure that the
      // last scalar doesn't have any scalar in between it and the composee that
      // would otherwise "block" it from composing.
      guard lastBufferedNormData.ccc < current.normData.ccc else {
        // We had a scalar block it. That means our current scalar is either a
        // starter or has a same class (preserve ordering).

        // Starters are the "start" of a new normalization segment. Set it as
        // the new composee and return our current composee. This will trigger
        // any other scalars in the buffer to be emitted before we handle
        // normalizing this new segment.
        if current.normData.ccc == 0 {
          composee = current.scalar
          return currentComposee
        }

        // A non-starter that is blocked must share the buffered scalar's
        // class: a strictly lower class would mean the input was not in the
        // canonical (NFD) order this iterator relies on.
        _internalInvariant(current.normData.ccc == lastBufferedNormData.ccc)
        buffer.append(current)
        continue
      }

      // There were no blockers! Attempt to compose the two! (Again, if our rhs
      // scalar IS NFC_QC, then it can never compose with anything previous to
      // it).
      guard !current.normData.isNFCQC,
            let composed = compose(currentComposee, and: current.scalar) else {
        // No composition found. Stick it at the end of the buffer with the rest
        // of non-composed scalars.

        buffer.append(current)
        continue
      }

      // They composed! Assign the composition as our new composee and iterate
      // to the next scalar.
      composee = composed
    }

    // If we have a leftover composee, make sure to return it. `take()` also
    // resets `composee` to nil so it is not emitted again on a later call.
    return composee.take()
  }
}

extension Unicode._InternalNFC: Sequence {
  /// Creates an iterator that composes the scalars produced by the base's
  /// `_internalNFD` iterator.
  internal func makeIterator() -> Iterator {
    let decomposedScalars = base._internalNFD.makeIterator()
    return Iterator(iterator: decomposedScalars)
  }
}

extension StringProtocol {
  /// A sequence view over `self` that yields scalars via
  /// `Unicode._InternalNFC`.
  internal var _internalNFC: Unicode._InternalNFC<Self> {
    return Unicode._InternalNFC<Self>(base: self)
  }
}