File: Decoding.swift

package info (click to toggle)
swiftlang 6.0.3-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,519,992 kB
  • sloc: cpp: 9,107,863; ansic: 2,040,022; asm: 1,135,751; python: 296,500; objc: 82,456; f90: 60,502; lisp: 34,951; pascal: 19,946; sh: 18,133; perl: 7,482; ml: 4,937; javascript: 4,117; makefile: 3,840; awk: 3,535; xml: 914; fortran: 619; cs: 573; ruby: 573
file content (163 lines) | stat: -rw-r--r-- 4,872 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//

/*

 Provide very low-level interfaces for scalar decoding.

 These can be faster if we assume certain invariants are
 maintained. We assert, of course, because we're not monsters.

 Thus they are unsafe in the following senses:

 - They assume validly encoded contents, otherwise UB
 - They assume any pointers passed in will be live and valid
   during execution and not concurrently written to, otherwise UB
 - They assume any pointer passed in has sufficient bounds
   for decoding a scalar, otherwise UB.

 String maintains these invariants for its in-memory storage.

 */


// TODO: Design an "unsafe" and "assumingValid" API convention

/// Namespace for UTF-8 scalar decoding helpers that perform no validation.
///
/// Every entry point assumes that its input is validly encoded UTF-8 and that
/// any buffer passed in covers every code unit of the scalar being decoded.
/// Those assumptions are only checked through `_internalInvariant`; nothing
/// else in this type validates its input.
enum UnsafeAssumingValidUTF8 {
  /// Decodes a one-byte (ASCII) scalar.
  ///
  /// - Parameter x: The code unit; expected to satisfy `UTF8.isASCII`.
  /// - Returns: The scalar whose value equals `x`.
  @inlinable @inline(__always)
  static func decode(_ x: UInt8) -> Unicode.Scalar {
    _internalInvariant(UTF8.isASCII(x))
    return Unicode.Scalar(_unchecked: UInt32(x))
  }

  /// Decodes a two-byte scalar.
  ///
  /// - Parameters:
  ///   - x: The lead byte; expected to have a scalar length of 2.
  ///   - y: The continuation byte.
  /// - Returns: The scalar built from the low five bits of `x` followed by
  ///   the six payload bits of `y`.
  @inlinable @inline(__always)
  static func decode(
    _ x: UInt8, _ y: UInt8
  ) -> Unicode.Scalar {
    _internalInvariant(scalarLength(x) == 2)
    _internalInvariant(UTF8.isContinuation(y))
    let x = UInt32(x)
    // A two-byte lead is `110xxxxx`: five payload bits.
    let value = ((x & 0b0001_1111) &<< 6) | continuationPayload(y)
    return Unicode.Scalar(_unchecked: value)
  }

  /// Decodes a three-byte scalar.
  ///
  /// - Parameters:
  ///   - x: The lead byte; expected to have a scalar length of 3.
  ///   - y: The first continuation byte.
  ///   - z: The second continuation byte.
  /// - Returns: The scalar built from the low four bits of `x` followed by
  ///   the six payload bits of `y` and then of `z`.
  @inlinable @inline(__always)
  static func decode(
    _ x: UInt8, _ y: UInt8, _ z: UInt8
  ) -> Unicode.Scalar {
    _internalInvariant(scalarLength(x) == 3)
    _internalInvariant(UTF8.isContinuation(y) && UTF8.isContinuation(z))
    let x = UInt32(x)
    // A three-byte lead is `1110xxxx`: four payload bits.
    let value = ((x & 0b0000_1111) &<< 12)
    | (continuationPayload(y) &<< 6)
    | continuationPayload(z)
    return Unicode.Scalar(_unchecked: value)
  }

  /// Decodes a four-byte scalar.
  ///
  /// - Parameters:
  ///   - x: The lead byte; expected to have a scalar length of 4.
  ///   - y: The first continuation byte.
  ///   - z: The second continuation byte.
  ///   - w: The third continuation byte.
  /// - Returns: The scalar built from the low three bits of `x` followed by
  ///   the six payload bits of `y`, `z` and `w`, in that order.
  @inlinable @inline(__always)
  static func decode(
    _ x: UInt8, _ y: UInt8, _ z: UInt8, _ w: UInt8
  ) -> Unicode.Scalar {
    _internalInvariant(scalarLength(x) == 4)
    _internalInvariant(
      UTF8.isContinuation(y) && UTF8.isContinuation(z)
      && UTF8.isContinuation(w))
    let x = UInt32(x)
    // A four-byte lead is `11110xxx`: only the low three bits are payload.
    // Bit 3 is always zero for such a lead, so the narrower mask yields the
    // same value for every input that satisfies the invariant above.
    let value = ((x & 0b0000_0111) &<< 18)
    | (continuationPayload(y) &<< 12)
    | (continuationPayload(z) &<< 6)
    | continuationPayload(w)
    return Unicode.Scalar(_unchecked: value)
  }

  /// Decodes the scalar whose lead byte sits at offset `i` of `utf8`.
  ///
  /// All reads are unchecked, so `utf8` must contain every code unit of the
  /// scalar starting at `i`.
  ///
  /// - Parameters:
  ///   - utf8: The buffer to read code units from.
  ///   - i: Offset of the scalar's lead byte.
  /// - Returns: The decoded scalar together with the number of code units it
  ///   occupies, as reported by `scalarLength(_:)` for the lead byte.
  // Also, assuming we can load from those bounds...
  @inlinable
  static func decode(
    _ utf8: UnsafeByteBuffer, startingAt i: Int
  ) -> (Unicode.Scalar, scalarLength: Int) {
    let cu0 = utf8[_unchecked: i]
    let len = scalarLength(cu0)
    switch len {
    case 1: return (decode(cu0), len)
    case 2: return (decode(cu0, utf8[_unchecked: i &+ 1]), len)
    case 3: return (decode(
      cu0, utf8[_unchecked: i &+ 1], utf8[_unchecked: i &+ 2]), len)
    case 4:
      return (decode(
        cu0,
        utf8[_unchecked: i &+ 1],
        utf8[_unchecked: i &+ 2],
        utf8[_unchecked: i &+ 3]),
              len)
    default:
      // Only lengths 1 through 4 are handled; any other length traps here.
      fatalError("unreachable")//Builtin.unreachable()
    }
  }

  /// Decodes the scalar that ends immediately before offset `i` of `utf8`.
  ///
  /// - Parameters:
  ///   - utf8: The buffer to read code units from.
  ///   - i: Offset one past the scalar's final code unit.
  /// - Returns: The decoded scalar together with the number of code units it
  ///   occupies.
  @inlinable
  static func decode(
    _ utf8: UnsafeByteBuffer, endingAt i: Int
  ) -> (Unicode.Scalar, scalarLength: Int) {
    let len = scalarLength(utf8, endingAt: i)
    let (scalar, scalarLen) = decode(utf8, startingAt: i &- len)
    // The backwards scan and the lead byte should agree on the length.
    _internalInvariant(len == scalarLen)
    return (scalar, len)
  }

  /// Returns the encoded length implied by a lead byte.
  ///
  /// - Parameter x: A code unit that is not a continuation byte.
  /// - Returns: 1 when `x` is ASCII; otherwise the number of leading one
  ///   bits in `x`.
  @inlinable @inline(__always)
  static func scalarLength(_ x: UInt8) -> Int {
    _internalInvariant(!UTF8.isContinuation(x))
    if UTF8.isASCII(x) { return 1 }
    // TODO(String micro-performance): check codegen
    // Leading zeros of the complement == leading ones of `x`.
    return (~x).leadingZeroBitCount
  }

  /// Returns the length of the scalar that ends immediately before `i`.
  ///
  /// Scans backwards from `i - 1` with unchecked reads, counting one code
  /// unit plus every continuation byte stepped over, and stops at the first
  /// code unit that is not a continuation byte.
  ///
  /// - Parameters:
  ///   - utf8: The buffer to read code units from.
  ///   - i: Offset one past the scalar's final code unit.
  /// - Returns: The number of code units between the lead byte found by the
  ///   scan and `i`.
  @inlinable @inline(__always)
  static func scalarLength(
    _ utf8: UnsafeByteBuffer, endingAt i: Int
  ) -> Int {
    var len = 1
    while UTF8.isContinuation(utf8[_unchecked: i &- len]) {
      len &+= 1
    }
    // The lead byte we landed on should announce the same length.
    _internalInvariant(len == scalarLength(utf8[i &- len]))
    return len
  }

  /// Extracts the payload of a continuation byte.
  ///
  /// - Parameter x: A code unit.
  /// - Returns: The low six bits of `x`, widened to `UInt32`.
  @inlinable @inline(__always)
  static func continuationPayload(_ x: UInt8) -> UInt32 {
    return UInt32(x & 0x3F)
  }

  /// Rounds `idx` down to the offset of the lead byte of the scalar that
  /// contains it.
  ///
  /// - Parameters:
  ///   - utf8: The buffer to read code units from.
  ///   - idx: An offset into `utf8`, or `utf8.count`.
  /// - Returns: `idx` unchanged when it equals `utf8.count`; otherwise the
  ///   nearest offset at or before `idx` whose code unit is not a
  ///   continuation byte.
  @inlinable
  static func scalarAlign(
    _ utf8: UnsafeByteBuffer, _ idx: Int
  ) -> Int {
    guard _fastPath(idx != utf8.count) else { return idx }
    // The reads below are unchecked, so assert that `idx` is in bounds first.
    _internalInvariant(idx >= 0 && idx < utf8.count)

    var i = idx
    while _slowPath(UTF8.isContinuation(utf8[_unchecked: i])) {
      i &-= 1
      _internalInvariant(i >= 0,
                         "Malformed contents: starts with continuation byte")
    }
    return i
  }
}

// TODO: Validating versions that remove that aspect of
// unsafety. Stdlib has stuff on _StringGuts that could be
// at least partially refactored.

// TODO: Consider UTF-16 support, but that's normally best
// handled as a transcoding concern.