File: StringNormalization.swift

package info (click to toggle)
swiftlang 6.2.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 2,856,264 kB
  • sloc: cpp: 9,995,718; ansic: 2,234,019; asm: 1,092,167; python: 313,940; objc: 82,726; f90: 80,126; lisp: 38,373; pascal: 25,580; sh: 20,378; ml: 5,058; perl: 4,751; makefile: 4,725; awk: 3,535; javascript: 3,018; xml: 918; fortran: 664; cs: 573; ruby: 396
file content (170 lines) | stat: -rw-r--r-- 4,753 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
//===--- StringNormalization.swift ----------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2018 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

extension Unicode.Scalar {
  // Terminology used throughout this file:
  //
  // * Normalization boundary: a position in a string such that the text to its
  //   left and the text to its right can each be normalized on their own, and
  //   concatenating the two results equals normalizing the whole string.
  //
  // * Normalization segment: the code units lying between two adjacent
  //   normalization boundaries, with no boundary inside. Normalizing a segment
  //   may make it longer or shorter, and may even split it into new
  //   sub-segments.

  // Whether this scalar starts an NFC segment: it must have a canonical
  // combining class of zero and pass the NFC quick check.
  internal var _isNFCStarter: Bool {
    // Every scalar below U+0300 is NFC_QC and has a boundary before it, which
    // is why 0x300 is handed over as the fast-path upper bound.
    let data = Unicode._NormData(self, fastUpperbound: 0x300)
    guard data.ccc == 0 else { return false }
    return data.isNFCQC
  }
}

extension UnsafeBufferPointer where Element == UInt8 {
  // Whether a normalization boundary lies immediately before the code unit at
  // `offset`. Both ends of the buffer (`0` and `count`) always count as
  // boundaries. For any other offset, the byte there is expected to begin a
  // scalar (checked by an internal invariant only).
  internal func hasNormalizationBoundary(before offset: Int) -> Bool {
    guard offset != 0, offset != count else { return true }

    let leadByte = unsafe self[_unchecked: offset]
    _internalInvariant(!UTF8.isContinuation(leadByte))

    // Latin-ish fast path: a lead byte below 0xCC encodes a scalar below
    // U+0300, and all of those have a boundary before them.
    if leadByte < 0xCC { return true }

    // Otherwise decode the scalar and ask whether it starts an NFC segment.
    let scalar = unsafe _decodeScalar(self, startingAt: offset).0
    return scalar._isNFCStarter
  }

  // Whether `offset` falls on a Unicode scalar boundary, i.e. the byte there
  // is not a UTF-8 continuation byte. An offset at the end of the buffer is
  // treated as a boundary; offsets past the end trip an internal invariant.
  internal func isOnUnicodeScalarBoundary(_ offset: Int) -> Bool {
    if offset >= count {
      _internalInvariant(offset == count)
      return true
    }

    let byte = unsafe self[offset]
    return !UTF8.isContinuation(byte)
  }
}

// One step of the NFC quick check for a single scalar.
//
// - scalar: the scalar to examine.
// - prevCCC: on entry, the canonical combining class carried over from the
//   preceding scalar; on a `true` return it is overwritten with this scalar's
//   combining class. It is left untouched when the result is `false`.
//
// Returns `false` if the scalar has a non-zero combining class that is lower
// than `prevCCC` (combining marks out of order), or if the scalar itself is
// not NFC_QC. Returns `true` otherwise.
internal func _isScalarNFCQC(
  _ scalar: Unicode.Scalar,
  _ prevCCC: inout UInt8
) -> Bool {
  let data = Unicode._NormData(scalar, fastUpperbound: 0x300)
  let ccc = data.ccc

  // A combining class of zero can follow anything; a non-zero one must not be
  // smaller than the class that came right before it.
  let isOutOfOrder = ccc != 0 && prevCCC > ccc

  guard !isOutOfOrder, data.isNFCQC else {
    return false
  }

  prevCCC = ccc
  return true
}

extension _StringGutsSlice {
  // Calls `f` once per UTF-8 code unit of this slice's NFC form, in order.
  // Any error thrown by `f` is propagated to the caller.
  //
  // The work is done in three tiers, cheapest first:
  //   1. The guts already report NFC: forward the bytes as they are.
  //   2. The contents pass the NFC quick check: forward the bytes as they are.
  //   3. Otherwise: normalize scalar by scalar and forward the resulting bytes.
  internal func _withNFCCodeUnits(_ f: (UInt8) throws -> Void) rethrows {
    let slice = String(_guts)[range]

    // Tier 1: already NFC (or ASCII), nothing to normalize.
    if _fastPath(_guts.isNFC) {
      try slice.utf8.forEach(f)
      return
    }

    var passesQuickCheck = true
    var lastCCC: UInt8 = 0
    let hasFastUTF8 = _guts.isFastUTF8

    // Tier 2, step one: run the quick check, over the contiguous UTF-8 buffer
    // when one is available and over the unicode scalar view otherwise.
    if hasFastUTF8 {
      _fastNFCCheck(&passesQuickCheck, &lastCCC)
    } else {
      passesQuickCheck = slice.unicodeScalars.allSatisfy {
        _isScalarNFCQC($0, &lastCCC)
      }
    }

    // Tier 3: the quick check failed, so produce the NFC scalars and emit the
    // UTF-8 encoding of each one.
    guard passesQuickCheck else {
      for scalar in slice.unicodeScalars._internalNFC {
        try scalar.withUTF8CodeUnits { codeUnits in
          for unsafe byte in unsafe codeUnits {
            try f(byte)
          }
        }
      }
      return
    }

    // Tier 2, step two: the quick check passed, so emit the stored bytes
    // through the same access path the check used.
    if hasFastUTF8 {
      try unsafe withFastUTF8 { utf8 in
        for unsafe byte in unsafe utf8 {
          try f(byte)
        }
      }
    } else {
      for byte in slice.utf8 {
        try f(byte)
      }
    }
  }

  // Runs `_nfcQuickCheck` over this slice's fast UTF-8 buffer, storing the
  // verdict in `isNFCQC` and threading `prevCCC` through the check.
  internal func _fastNFCCheck(_ isNFCQC: inout Bool, _ prevCCC: inout UInt8) {
    isNFCQC = unsafe withFastUTF8 {
      unsafe _nfcQuickCheck($0, prevCCC: &prevCCC)
    }
  }
}

/// Runs the Unicode NFC quick check over a buffer of UTF-8 code units.
///
/// - Parameters:
///   - utf8: The code units to scan. The scan steps by lead byte, so it
///     presumably relies on the buffer holding well-formed UTF-8.
///   - prevCCC: The canonical combining class carried from one scalar to the
///     next. It is reset to zero after every scalar below U+0300 and otherwise
///     updated by `_isScalarNFCQC`.
/// - Returns: `true` if every scalar in the buffer passes the quick check;
///   `false` as soon as one scalar fails it.
internal func _nfcQuickCheck(
  _ utf8: UnsafeBufferPointer<UInt8>,
  prevCCC: inout UInt8
) -> Bool {
  var cursor = 0

  while cursor < utf8.count {
    let leadByte = unsafe utf8[cursor]

    // Lead bytes of 0xCC and above encode scalars at or above U+0300, which
    // need the full per-scalar check.
    if leadByte >= 0xCC {
      let (scalar, width) = unsafe _decodeScalar(utf8, startingAt: cursor)

      if !_isScalarNFCQC(scalar, &prevCCC) {
        return false
      }

      cursor &+= width
      continue
    }

    // Everything below U+0300 is already NFC. A lead byte below 0xC0 is a
    // single-byte (ASCII) scalar; anything else in this range is a two-byte
    // sequence.
    cursor &+= leadByte < 0xC0 ? 1 : 2

    // Scalars in this range have a combining class of zero.
    prevCCC = 0
  }

  return true
}