File: StringIndex.swift

package info (click to toggle)
swiftlang 6.0.3-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,519,992 kB
  • sloc: cpp: 9,107,863; ansic: 2,040,022; asm: 1,135,751; python: 296,500; objc: 82,456; f90: 60,502; lisp: 34,951; pascal: 19,946; sh: 18,133; perl: 7,482; ml: 4,937; javascript: 4,117; makefile: 3,840; awk: 3,535; xml: 914; fortran: 619; cs: 573; ruby: 573
file content (565 lines) | stat: -rw-r--r-- 22,737 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
//===--- StringIndex.swift ------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

import SwiftShims

/*

String's Index has the following layout:

 ┌──────────┬────────────────╥────────────────┬───────╥───────┐
 │ b63:b16  │      b15:b14   ║     b13:b8     │ b7:b4 ║ b3:b0 │
 ├──────────┼────────────────╫────────────────┼───────╫───────┤
 │ position │ transc. offset ║ grapheme cache │ rsvd  ║ flags │
 └──────────┴────────────────╨────────────────┴───────╨───────┘
                             └────── resilient ───────┘

Position, transcoded offset, and flags are fully exposed in the ABI. Grapheme
cache and reserved bits are partially resilient: the fact that there are 11 bits
with a default value of `0` is ABI, but not the layout, construction, or
interpretation of those bits. All use of grapheme cache should be behind
non-inlinable function calls. Inlinable code should not set a non-zero value to
resilient bits: doing so breaks future evolution as the meaning of those bits
isn't frozen.

- position aka `encodedOffset`: A 48-bit offset into the string's code units

- transcoded offset: a 2-bit sub-scalar offset, derived from transcoding

<resilience barrier>

- grapheme cache: A 6-bit value remembering the distance to the next extended
  grapheme cluster boundary, or 0 if unknown. The value stored (if any) must be
  calculated assuming that the index addresses a boundary itself, i.e., without
  looking back at scalars preceding the index. (Substrings that don't start on a
  `Character` boundary heavily rely on this.)

- reserved: 4 unused bits available for future flags etc. The meaning of each
  bit may change between stdlib versions. These must be set to zero if
  constructing an index in inlinable code.

<resilience barrier>

  * b0: `_isScalarAligned`

    If set, index is known to be on a Unicode scalar boundary (see below).
    (Introduced in Swift 5.1)

  * b1: `_isCharacterAligned`

    If set, the index is known to be on an extended grapheme cluster
    boundary (i.e., on a Swift `Character`.)
    (Introduced in Swift 5.7)

  * b2: UTF-8 encoding

    If set, the position is known to be expressed in UTF-8 code units.
    (Introduced in Swift 5.7)

  * b3: UTF-16 encoding

    If set, the position is known to be expressed in UTF-16 code units.
    (Introduced in Swift 5.7)

Before Swift 5.7, bits b1, b2 and b3 used to be part of the resilient slice. See
the notes on Character Alignment and Index Encoding below to see how this works.

*/
extension String {
  /// A position of a character or code unit in a string.
  @frozen
  public struct Index: Sendable {
    @usableFromInline
    internal var _rawBits: UInt64

    @inlinable @inline(__always)
    init(_ raw: UInt64) {
      self._rawBits = raw
      self._invariantCheck()
    }
  }
}

extension String.Index {
  @inlinable @inline(__always)
  internal var orderingValue: UInt64 { return _rawBits &>> 14 }

  // Whether this is at the canonical "start" position, that is encoded AND
  // transcoded offset of 0.
  @inlinable @inline(__always)
  internal var isZeroPosition: Bool { return orderingValue == 0 }

  /// The UTF-16 code unit offset corresponding to this index.
  public func utf16Offset<S: StringProtocol>(in s: S) -> Int {
    return s.utf16.distance(from: s.utf16.startIndex, to: self)
  }

  /// The offset into a string's code units for this index.
  @available(swift, deprecated: 4.2, message: """
    encodedOffset has been deprecated as most common usage is incorrect. \
    Use utf16Offset(in:) to achieve the same behavior.
    """)
  @inlinable
  public var encodedOffset: Int { return _encodedOffset }

  @inlinable @inline(__always)
  internal var _encodedOffset: Int {
    return Int(truncatingIfNeeded: _rawBits &>> 16)
  }

  @inlinable @inline(__always)
  internal var transcodedOffset: Int {
    return Int(truncatingIfNeeded: orderingValue & 0x3)
  }

  @usableFromInline
  internal var characterStride: Int? {
    let value = (_rawBits & 0x3F00) &>> 8
    return value > 0 ? Int(truncatingIfNeeded: value) : nil
  }

  @inlinable @inline(__always)
  internal init(encodedOffset: Int, transcodedOffset: Int) {
    let pos = UInt64(truncatingIfNeeded: encodedOffset)
    let trans = UInt64(truncatingIfNeeded: transcodedOffset)
    _internalInvariant(pos == pos & 0x0000_FFFF_FFFF_FFFF)
    _internalInvariant(trans <= 3)

    self.init((pos &<< 16) | (trans &<< 14))
  }

  /// Creates a new index at the specified UTF-16 code unit offset
  ///
  /// - Parameter offset: An offset in UTF-16 code units.
  public init<S: StringProtocol>(utf16Offset offset: Int, in s: S) {
    let (start, end) = (s.utf16.startIndex, s.utf16.endIndex)
    guard offset >= 0,
          let idx = s.utf16.index(start, offsetBy: offset, limitedBy: end)
    else {
      self = end.nextEncoded
      return
    }
    self = idx
  }

  /// Creates a new index at the specified code unit offset.
  ///
  /// - Parameter offset: An offset in code units.
  @available(swift, deprecated: 4.2, message: """
    encodedOffset has been deprecated as most common usage is incorrect. \
    Use String.Index(utf16Offset:in:) to achieve the same behavior.
    """)
  @inlinable
  public init(encodedOffset offset: Int) {
    self.init(_encodedOffset: offset)
  }

  @inlinable @inline(__always)
  internal init(_encodedOffset offset: Int) {
    self.init(encodedOffset: offset, transcodedOffset: 0)
  }

  @usableFromInline
  internal init(
    encodedOffset: Int, transcodedOffset: Int, characterStride: Int
  ) {
    self.init(encodedOffset: encodedOffset, transcodedOffset: transcodedOffset)
    if _slowPath(characterStride > 0x3F) { return }
    self._rawBits |= UInt64(truncatingIfNeeded: characterStride &<< 8)
    self._invariantCheck()
  }

  @usableFromInline
  internal init(encodedOffset pos: Int, characterStride char: Int) {
    self.init(encodedOffset: pos, transcodedOffset: 0, characterStride: char)
  }

  #if !INTERNAL_CHECKS_ENABLED
  @inlinable @inline(__always) internal func _invariantCheck() {}
  #else
  @usableFromInline @inline(never) @_effects(releasenone)
  internal func _invariantCheck() {
    _internalInvariant(_encodedOffset >= 0)
    if self._isCharacterAligned {
      _internalInvariant(_isScalarAligned)
    }
    if self._isScalarAligned {
      _internalInvariant_5_1(transcodedOffset == 0)
    }
  }
  #endif // INTERNAL_CHECKS_ENABLED
}

// Creation helpers, which will make migration easier if we decide to use and
// propagate the reserved bits.
extension String.Index {
  @inlinable @inline(__always)
  internal var strippingTranscoding: String.Index {
    return String.Index(_encodedOffset: self._encodedOffset)
  }

  @inlinable @inline(__always)
  internal var nextEncoded: String.Index {
    _internalInvariant(self.transcodedOffset == 0)
    return String.Index(_encodedOffset: self._encodedOffset &+ 1)
  }

  @inlinable @inline(__always)
  internal var priorEncoded: String.Index {
    _internalInvariant(self.transcodedOffset == 0)
    return String.Index(_encodedOffset: self._encodedOffset &- 1)
  }

  @inlinable @inline(__always)
  internal var nextTranscoded: String.Index {
    return String.Index(
      encodedOffset: self._encodedOffset,
      transcodedOffset: self.transcodedOffset &+ 1)
  }

  @inlinable @inline(__always)
  internal var priorTranscoded: String.Index {
    return String.Index(
      encodedOffset: self._encodedOffset,
      transcodedOffset: self.transcodedOffset &- 1)
  }


  // Get an index with an encoded offset relative to this one.
  // Note: strips any transcoded offset.
  @inlinable @inline(__always)
  internal func encoded(offsetBy n: Int) -> String.Index {
    return String.Index(_encodedOffset: self._encodedOffset &+ n)
  }

  @inlinable @inline(__always)
  internal func transcoded(withOffset n: Int) -> String.Index {
    _internalInvariant(self.transcodedOffset == 0)
    return String.Index(encodedOffset: self._encodedOffset, transcodedOffset: n)
  }
}

extension String.Index {
  @_alwaysEmitIntoClient @inline(__always) // Swift 5.7
  internal static var __scalarAlignmentBit: UInt64 { 0x1 }

  @_alwaysEmitIntoClient @inline(__always) // Swift 5.7
  internal static var __characterAlignmentBit: UInt64 { 0x2 }

  @_alwaysEmitIntoClient @inline(__always) // Swift 5.7
  internal static var __utf8Bit: UInt64 { 0x4 }

  @_alwaysEmitIntoClient @inline(__always) // Swift 5.7
  internal static var __utf16Bit: UInt64 { 0x8 }

  @_alwaysEmitIntoClient @inline(__always) // Swift 5.7
  internal static func __encodingBit(utf16: Bool) -> UInt64 {
    let utf16 = Int8(Builtin.zext_Int1_Int8(utf16._value))
    return __utf8Bit &<< utf16
  }
}

/*
  Index Scalar Alignment

  SE-0180 unifies the Index type of String and all its views and allows
  non-scalar-aligned indices to be used across views. In order to guarantee
  behavior, we often have to check and perform scalar alignment. To speed up
  these checks, we allocate a bit denoting known-to-be-scalar-aligned, so that
  the alignment check can skip the load. The below shows what views need to
  check for alignment before they can operate, and whether the indices they
  produce are aligned.

  ┌───────────────╥───────────────────────────┬─────────────────────────┐
  │ View          ║ Requires Scalar Alignment │ Produces Scalar Aligned │
  ╞═══════════════╬═══════════════════════════╪═════════════════════════╡
  │ Native UTF8   ║ no                        │ no                      │
  ├───────────────╫───────────────────────────┼─────────────────────────┤
  │ Native UTF16  ║ yes                       │ no                      │
  ╞═══════════════╬═══════════════════════════╪═════════════════════════╡
  │ Foreign UTF8  ║ yes                       │ no                      │
  ├───────────────╫───────────────────────────┼─────────────────────────┤
  │ Foreign UTF16 ║ no                        │ no                      │
  ╞═══════════════╬═══════════════════════════╪═════════════════════════╡
  │ UnicodeScalar ║ yes                       │ yes                     │
  ├───────────────╫───────────────────────────┼─────────────────────────┤
  │ Character     ║ yes                       │ yes                     │
  └───────────────╨───────────────────────────┴─────────────────────────┘

  The "requires scalar alignment" applies to any operation taking a String.Index
  that's not defined entirely in terms of other operations taking a
  String.Index. These include:

  * index(after:)
  * index(before:)
  * subscript
  * distance(from:to:) (since `to` is compared against directly)
  * UTF16View._nativeGetOffset(for:)

*/
extension String.Index {
  @_alwaysEmitIntoClient // Swift 5.1
  @inline(__always)
  internal var _isScalarAligned: Bool {
    0 != _rawBits & Self.__scalarAlignmentBit
  }

  @_alwaysEmitIntoClient // Swift 5.1
  @inline(__always)
  internal var _scalarAligned: String.Index {
    var idx = self
    idx._rawBits |= Self.__scalarAlignmentBit
    idx._invariantCheck()
    return idx
  }
}

// ### Character (a.k.a. Extended Grapheme Cluster) Alignment
//
// Swift 5.7 assigned a new bit denoting that an index is known to be
// `Character`-aligned. This is used to enable more reliable detection &
// handling of extended grapheme cluster boundaries in indexing edge cases
// introduced by SE-0180, without slowing down the usual case, when code isn't
// interchanging indices between views.
//
// Beware! In substrings whose bounds aren't `Character`-aligned, extended
// grapheme breaks are sometimes in different places than in their base string.
// (The sequence of characters in a substring depend only on the Unicode scalars
// that make up its contents, not on their surrounding context.) Therefore, such
// substrings must not look at or set this bit: indices must be reliably
// interchangeable between strings and their associated substrings, even if the
// latter are irregular.
//
// Note that `startIndex` and `endIndex` have fully inlinable implementations.
// This means that when code built on older releases runs on 5.7, this bit may
// not be set on these, even though they are always `Character`-aligned. This is
// fine -- `index(after:)` and `index(before:)` still do the right thing with
// minimal/no performance loss. (The start/end index is handled specially.)
extension String.Index {
  @_alwaysEmitIntoClient // Swift 5.7
  @inline(__always)
  internal var _isCharacterAligned: Bool {
    0 != _rawBits & Self.__characterAlignmentBit
  }

  /// Return the same index with both the scalar- and `Character`-aligned bits
  /// set.
  ///
  /// (`Character` alignment implies scalar alignment.)
  @_alwaysEmitIntoClient // Swift 5.7
  @inline(__always)
  internal var _characterAligned: String.Index {
    let r = _rawBits | Self.__characterAlignmentBit | Self.__scalarAlignmentBit
    let idx = Self(r)
    idx._invariantCheck()
    return idx
  }
}

extension String.Index {
  @_alwaysEmitIntoClient // Swift 5.7
  internal func _copyingAlignment(from index: Self) -> Self {
    let mask = Self.__scalarAlignmentBit | Self.__characterAlignmentBit
    return Self((_rawBits & ~mask) | (index._rawBits & mask))
  }
}

// ### Index Encoding
//
// Swift 5.7 introduced bookkeeping to keep track of the Unicode encoding
// associated with the position value in String indices. Indices whose position
// is an offset into UTF-8 storage come with the corresponding flag set, and a
// separate flag is set for UTF-16 indices. (Only foreign strings can be UTF-16
// encoded. As of 5.7, all foreign strings are UTF-16; but this is subject to
// change later if we ever decide to implement additional foreign forms.)
//
// In releases before 5.7, the bits corresponding to these flags were considered
// reserved, and they were both set to zero in inlinable code. This means that
// (on ABI stable platforms at least) we cannot assume that either of these bits
// will be reliably set. If they are both clear, then we must fall back to
// assuming that the index has the right encoding for whatever string it is used
// on. However, if either of these bits are set, then the other bit's value is
// also reliable -- whether it's set or cleared.
//
// The indices of ASCII strings are encoding-independent, i.e. transcoding such
// strings from UTF-8 to UTF-16 (or vice versa) does not change the position
// value of any of their indices. Therefore it isn't an error for an index to
// have both of these flags set. (The start index of every string also behaves
// this way: position zero is the same no matter how what encoding is used for
// the rest of string.)
//
// These two bits (along with the isForeignUTF8 flag in StringObject) allow
// newer versions of the Standard Library to more reliably catch runtime errors
// where client code is applying an index from a UTF-16 string to a UTF-8 one,
// or vice versa. This typically happens when indices from a UTF-16 Cocoa string
// that was verbatim bridged into Swift are accidentally applied to a mutated
// version of the same string. (The mutation turns it into a UTF-8 native
// string, where the same numerical offsets might correspond to wildly different
// logical positions.)
//
// Such code has always been broken, as the old indices are documented to be no
// longer valid after the mutation; however, in previous releases such cases
// weren't reliably detected, and if the code was only ever tested on ASCII
// strings, then the bug could lie dormant for a long time. (Until the code
// encounters a non-ASCII character and someone gets surprised that the results
// no longer make sense.)
//
// As more code gets rebuilt with Swift 5.7+, the stdlib will gradually become
// able to reliably catch and correct all such issues. The error cases are
// handled in `_StringGuts.ensureMatchingEncoding(_:)`; see there for the sordid
// details.
extension String.Index {
  @_alwaysEmitIntoClient // Swift 5.7
  @inline(__always)
  internal var _encodingBits: UInt64 {
    _rawBits & (Self.__utf8Bit | Self.__utf16Bit)
  }

  /// Returns true if the position in this index can be interpreted as an offset
  /// into UTF-8-encoded string storage.
  ///
  /// (This returns true if either we know for sure that this is an UTF-8 index,
  /// or if we don't have enough information to determine its encoding.)
  @_alwaysEmitIntoClient // Swift 5.7
  @inline(__always)
  internal var _canBeUTF8: Bool {
    // The only way an index cannot be UTF-8 is it has only the UTF-16 flag set.
    _encodingBits != Self.__utf16Bit
  }

  /// Returns true if the position in this index can be interpreted as offset
  /// into UTF-16-encoded string storage.
  ///
  /// (This returns true if either we know for sure that this is an UTF-16
  /// index, or if we don't have enough information to determine its
  /// encoding.)
  @_alwaysEmitIntoClient // Swift 5.7
  @inline(__always)
  internal var _canBeUTF16: Bool {
    // The only way an index cannot be UTF-16 is it has only the UTF-8 flag set.
    _encodingBits != Self.__utf8Bit
  }

  /// Returns true if the encoding of this index isn't known to be in conflict
  /// with the specified encoding.
  ///
  /// If the index was created by code that was built on a stdlib below 5.7,
  /// then this check may incorrectly return true on a mismatching index, but it
  /// is guaranteed to never incorrectly return false. If all loaded binaries
  /// were built in 5.7+, then this method is guaranteed to always return the
  /// correct value.
  @_alwaysEmitIntoClient // Swift 5.7
  @inline(__always)
  internal func _hasMatchingEncoding(isUTF8 utf8: Bool) -> Bool {
    _encodingBits != Self.__encodingBit(utf16: utf8)
  }

  /// Returns the same index with the UTF-8 bit set.
  @_alwaysEmitIntoClient // Swift 5.7
  @inline(__always)
  internal var _knownUTF8: Self { Self(_rawBits | Self.__utf8Bit) }

  /// Returns the same index with the UTF-16 bit set.
  @_alwaysEmitIntoClient // Swift 5.7
  @inline(__always)
  internal var _knownUTF16: Self { Self(_rawBits | Self.__utf16Bit) }

  /// Returns the same index with both UTF-8 & UTF-16 bits set.
  @_alwaysEmitIntoClient // Swift 5.7
  @inline(__always)
  internal var _encodingIndependent: Self {
    Self(_rawBits | Self.__utf8Bit | Self.__utf16Bit)
  }

  @_alwaysEmitIntoClient // Swift 5.7
  internal func _copyingEncoding(from index: Self) -> Self {
    let mask = Self.__utf8Bit | Self.__utf16Bit
    return Self((_rawBits & ~mask) | (index._rawBits & mask))
  }
}

extension String.Index: Equatable {
  @inlinable @inline(__always)
  public static func == (lhs: String.Index, rhs: String.Index) -> Bool {
    return lhs.orderingValue == rhs.orderingValue
  }
}

extension String.Index: Comparable {
  @inlinable @inline(__always)
  public static func < (lhs: String.Index, rhs: String.Index) -> Bool {
    return lhs.orderingValue < rhs.orderingValue
  }
}

extension String.Index: Hashable {
  /// Hashes the essential components of this value by feeding them into the
  /// given hasher.
  ///
  /// - Parameter hasher: The hasher to use when combining the components
  ///   of this instance.
  @inlinable
  public func hash(into hasher: inout Hasher) {
    hasher.combine(orderingValue)
  }
}

extension String.Index {
  @_alwaysEmitIntoClient
  internal var _encodingDescription: String {
    switch (_rawBits & Self.__utf8Bit != 0, _rawBits & Self.__utf16Bit != 0) {
    case (false, false): return "unknown"
    case (true, false): return "utf8"
    case (false, true): return "utf16"
    case (true, true): return "any"
    }
  }

  // The definitions below are placeholders for potential future `String.Index`
  // conformances to `CustomStringConvertible` and
  // `CustomDebugStringConvertible`. They are supplied here to make working with
  // string indices somewhat bearable while we're working on adding the actual
  // conformances.

  /// A textual representation of this instance.
  @_alwaysEmitIntoClient
  @inline(never)
  public var _description: String {
    // 23[utf8]+1
    var d = "\(_encodedOffset)[\(_encodingDescription)]"
    if transcodedOffset != 0 {
      d += "+\(transcodedOffset)"
    }
    return d
  }

  /// A textual representation of this instance, suitable for debugging.
  @_alwaysEmitIntoClient
  @inline(never)
  public var _debugDescription: String {
    var d = "String.Index("
    d += "offset: \(_encodedOffset)[\(_encodingDescription)]"
    if transcodedOffset != 0 {
      d += "+\(transcodedOffset)"
    }
    if _isCharacterAligned {
      d += ", aligned: character"
    } else if _isScalarAligned {
      d += ", aligned: scalar"
    }
    if let stride = characterStride {
      d += ", stride: \(stride)"
    }
    d += ")"
    return d
  }
}