1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
|
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//
/*
Provide very low-level interfaces for scalar decoding.
These can be faster if we assume certain invariants are
maintained. We assert, of course, because we're not monsters.
Thus they are unsafe in the following senses:
- They assume validly encoded contents, otherwise UB
- They assume any pointers passed in will be live and valid
during execution and not concurrently written to, otherwise UB
- They assume any pointer passed in has sufficient bounds
for decoding a scalar, otherwise UB.
String maintains these invariants for its in-memory storage.
*/
// TODO: Design an "unsafe" and "assumingValid" API convention
/// Namespace for UTF-8 scalar decoding primitives that perform no validation
/// and read from buffers through the `_unchecked` subscript.
///
/// Every entry point assumes its input is validly encoded UTF-8 and that any
/// buffer passed in covers the bytes of the scalar being decoded. The
/// expectations are stated with `_internalInvariant`; nothing else guards
/// against malformed input.
enum UnsafeAssumingValidUTF8 {
  /// Decodes a scalar encoded as the single byte `x`.
  ///
  /// - Parameter x: A byte for which `UTF8.isASCII` holds.
  /// - Returns: The scalar whose value equals `x`.
  @inlinable @inline(__always)
  static func decode(_ x: UInt8) -> Unicode.Scalar {
    _internalInvariant(UTF8.isASCII(x))
    let value = UInt32(x)
    return Unicode.Scalar(_unchecked: value)
  }

  /// Decodes a scalar encoded as the two bytes `x`, `y`.
  ///
  /// - Parameters:
  ///   - x: Leading byte; `scalarLength(x)` is expected to be 2.
  ///   - y: Continuation byte.
  /// - Returns: The scalar built from the low 5 bits of `x` followed by the
  ///   low 6 bits of `y`.
  @inlinable @inline(__always)
  static func decode(
    _ x: UInt8, _ y: UInt8
  ) -> Unicode.Scalar {
    _internalInvariant(scalarLength(x) == 2)
    _internalInvariant(UTF8.isContinuation(y))
    let leadBits = UInt32(x) & 0b0001_1111
    let value = (leadBits &<< 6) | continuationPayload(y)
    return Unicode.Scalar(_unchecked: value)
  }

  /// Decodes a scalar encoded as the three bytes `x`, `y`, `z`.
  ///
  /// - Parameters:
  ///   - x: Leading byte; `scalarLength(x)` is expected to be 3.
  ///   - y: First continuation byte.
  ///   - z: Second continuation byte.
  /// - Returns: The scalar built from the low 4 bits of `x` followed by the
  ///   low 6 bits of each of `y` and `z`.
  @inlinable @inline(__always)
  static func decode(
    _ x: UInt8, _ y: UInt8, _ z: UInt8
  ) -> Unicode.Scalar {
    _internalInvariant(scalarLength(x) == 3)
    _internalInvariant(UTF8.isContinuation(y) && UTF8.isContinuation(z))
    let leadBits = UInt32(x) & 0b0000_1111
    let high = leadBits &<< 12
    let middle = continuationPayload(y) &<< 6
    let low = continuationPayload(z)
    return Unicode.Scalar(_unchecked: high | middle | low)
  }

  /// Decodes a scalar encoded as the four bytes `x`, `y`, `z`, `w`.
  ///
  /// - Parameters:
  ///   - x: Leading byte; `scalarLength(x)` is expected to be 4.
  ///   - y: First continuation byte.
  ///   - z: Second continuation byte.
  ///   - w: Third continuation byte.
  /// - Returns: The scalar built from the masked bits of `x` followed by the
  ///   low 6 bits of each of `y`, `z` and `w`.
  @inlinable @inline(__always)
  static func decode(
    _ x: UInt8, _ y: UInt8, _ z: UInt8, _ w: UInt8
  ) -> Unicode.Scalar {
    _internalInvariant(scalarLength(x) == 4)
    _internalInvariant(
      UTF8.isContinuation(y)
        && UTF8.isContinuation(z)
        && UTF8.isContinuation(w))
    // The mask keeps 4 bits, but when `scalarLength(x) == 4` the byte has
    // exactly four leading one bits, so bit 3 is already clear.
    let leadBits = UInt32(x) & 0b0000_1111
    let top = leadBits &<< 18
    let upper = continuationPayload(y) &<< 12
    let lower = continuationPayload(z) &<< 6
    let bottom = continuationPayload(w)
    return Unicode.Scalar(_unchecked: top | upper | lower | bottom)
  }

  // Also, assuming we can load from those bounds...

  /// Decodes the scalar whose leading byte is at position `i` of `utf8`.
  ///
  /// - Parameters:
  ///   - utf8: The buffer to read from; accessed without bounds checks.
  ///   - i: Position of the scalar's leading byte.
  /// - Returns: The decoded scalar together with the number of bytes it
  ///   occupies (1 through 4).
  @inlinable
  static func decode(
    _ utf8: UnsafeByteBuffer, startingAt i: Int
  ) -> (Unicode.Scalar, scalarLength: Int) {
    let lead = utf8[_unchecked: i]
    let len = scalarLength(lead)
    let scalar: Unicode.Scalar
    switch len {
    case 1:
      scalar = decode(lead)
    case 2:
      scalar = decode(lead, utf8[_unchecked: i &+ 1])
    case 3:
      scalar = decode(
        lead,
        utf8[_unchecked: i &+ 1],
        utf8[_unchecked: i &+ 2])
    case 4:
      scalar = decode(
        lead,
        utf8[_unchecked: i &+ 1],
        utf8[_unchecked: i &+ 2],
        utf8[_unchecked: i &+ 3])
    default:
      // Any other length means the leading byte was not a valid one.
      fatalError("unreachable") // Builtin.unreachable()
    }
    return (scalar, len)
  }

  /// Decodes the scalar whose final byte is at position `i - 1` of `utf8`.
  ///
  /// - Parameters:
  ///   - utf8: The buffer to read from; accessed without bounds checks.
  ///   - i: Position just past the scalar's last byte.
  /// - Returns: The decoded scalar together with the number of bytes it
  ///   occupies.
  @inlinable
  static func decode(
    _ utf8: UnsafeByteBuffer, endingAt i: Int
  ) -> (Unicode.Scalar, scalarLength: Int) {
    let len = scalarLength(utf8, endingAt: i)
    let start = i &- len
    let (scalar, forwardLen) = decode(utf8, startingAt: start)
    // The backward scan and the leading byte must agree on the length.
    _internalInvariant(len == forwardLen)
    return (scalar, len)
  }

  /// Returns the encoded length, in bytes, of the scalar that starts with
  /// the leading byte `x`.
  ///
  /// - Parameter x: A byte that is not a continuation byte.
  /// - Returns: 1 when `x` is ASCII; otherwise the number of leading one
  ///   bits in `x`.
  @inlinable @inline(__always)
  static func scalarLength(_ x: UInt8) -> Int {
    _internalInvariant(!UTF8.isContinuation(x))
    guard !UTF8.isASCII(x) else { return 1 }
    // TODO(String micro-performance): check codegen
    // Leading zeros of the complement == leading ones of `x`.
    let inverted = ~x
    return inverted.leadingZeroBitCount
  }

  /// Returns the encoded length, in bytes, of the scalar whose final byte is
  /// at position `i - 1` of `utf8`.
  ///
  /// Walks backward from `i - 1` over continuation bytes until it reaches a
  /// byte that is not one. The walk itself has no lower bound check.
  ///
  /// - Parameters:
  ///   - utf8: The buffer to read from; accessed without bounds checks.
  ///   - i: Position just past the scalar's last byte.
  /// - Returns: The distance from the first non-continuation byte found to
  ///   `i`.
  @inlinable @inline(__always)
  static func scalarLength(
    _ utf8: UnsafeByteBuffer, endingAt i: Int
  ) -> Int {
    var start = i &- 1
    while UTF8.isContinuation(utf8[_unchecked: start]) {
      start &-= 1
    }
    let len = i &- start
    _internalInvariant(len == scalarLength(utf8[start]))
    return len
  }

  /// Extracts the low 6 bits of `x`, widened to `UInt32`.
  ///
  /// - Parameter x: A continuation byte.
  /// - Returns: `x & 0x3F` as a `UInt32`.
  @inlinable @inline(__always)
  static func continuationPayload(_ x: UInt8) -> UInt32 {
    return UInt32(x) & 0b0011_1111
  }

  /// Moves `idx` backward to the nearest position that does not hold a
  /// continuation byte.
  ///
  /// - Parameters:
  ///   - utf8: The buffer to read from; accessed without bounds checks.
  ///   - idx: The position to align. When it equals `utf8.count` it is
  ///     returned unchanged without reading the buffer.
  /// - Returns: The greatest position at or before `idx` whose byte is not a
  ///   continuation byte, or `idx` itself when `idx == utf8.count`.
  @inlinable
  static func scalarAlign(
    _ utf8: UnsafeByteBuffer, _ idx: Int
  ) -> Int {
    if _slowPath(idx == utf8.count) { return idx }

    var aligned = idx
    while _slowPath(UTF8.isContinuation(utf8[_unchecked: aligned])) {
      aligned &-= 1
      _internalInvariant(
        aligned >= 0,
        "Malformed contents: starts with continuation byte")
    }
    return aligned
  }
}
// TODO: Validating versions that remove that aspect of
// unsafety. Stdlib has stuff on _StringGuts that could be
// at least partially refactored.
// TODO: Consider UTF-16 support, but that's normally best
// handled as a transcoding concern.
|