1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173
|
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021-2023 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//
extension UInt8 {
  // Named ASCII code units, listed in ascending code-point order.

  /// Horizontal tab (`\t`, 0x09).
  static var _tab: UInt8 { UInt8(ascii: "\t") }

  /// Line feed (`\n`, 0x0A).
  static var _lineFeed: UInt8 { UInt8(ascii: "\n") }

  /// Line tabulation, a.k.a. vertical tab (0x0B).
  static var _lineTab: UInt8 { UInt8(ascii: "\u{0B}") }

  /// Form feed (0x0C).
  static var _formFeed: UInt8 { UInt8(ascii: "\u{0C}") }

  /// Carriage return (`\r`, 0x0D).
  static var _carriageReturn: UInt8 { UInt8(ascii: "\r") }

  /// Space (0x20).
  static var _space: UInt8 { UInt8(ascii: " ") }

  /// Underscore (`_`, 0x5F).
  static var _underscore: UInt8 { UInt8(ascii: "_") }
}
// File-private ASCII code units marking the inclusive bounds of the digit,
// lowercase-letter and uppercase-letter ranges.
private var _0: UInt8 { UInt8(ascii: "0") }
private var _9: UInt8 { UInt8(ascii: "9") }
private var _a: UInt8 { UInt8(ascii: "a") }
private var _z: UInt8 { UInt8(ascii: "z") }
private var _A: UInt8 { UInt8(ascii: "A") }
private var _Z: UInt8 { UInt8(ascii: "Z") }
extension UInt8 {
  /// Whether this byte lies in the 7-bit ASCII range (high bit clear).
  var _isASCII: Bool { (self & 0x80) == 0 }

  // TODO: Bitvectors for the below

  /// Whether this byte matches `\d` (an ASCII decimal digit).
  ///
  /// The receiver is expected to be ASCII; this is checked with `assert`.
  var _asciiIsDigit: Bool {
    assert(_isASCII)
    return _0 <= self && self <= _9
  }

  /// Whether this byte matches `\h` (space or horizontal tab).
  ///
  /// The receiver is expected to be ASCII; this is checked with `assert`.
  var _asciiIsHorizontalWhitespace: Bool {
    assert(_isASCII)
    switch self {
    case ._space, ._tab:
      return true
    default:
      return false
    }
  }

  /// Whether this byte matches `\v` (line feed, carriage return, line tab
  /// or form feed).
  ///
  /// The receiver is expected to be ASCII; this is checked with `assert`.
  var _asciiIsVerticalWhitespace: Bool {
    assert(_isASCII)
    return self == ._lineFeed
      || self == ._carriageReturn
      || self == ._lineTab
      || self == ._formFeed
  }

  /// Whether this byte matches `\s`, i.e. it is either horizontal or
  /// vertical ASCII whitespace.
  ///
  /// The receiver is expected to be ASCII; this is checked with `assert`.
  var _asciiIsWhitespace: Bool {
    assert(_isASCII)
    return _asciiIsHorizontalWhitespace || _asciiIsVerticalWhitespace
  }

  /// Whether this byte matches `[a-zA-Z]`.
  ///
  /// The receiver is expected to be ASCII; this is checked with `assert`.
  var _asciiIsLetter: Bool {
    assert(_isASCII)
    switch self {
    case _a..._z, _A..._Z:
      return true
    default:
      return false
    }
  }

  /// Whether this byte matches `\w` (a letter, a digit or an underscore).
  ///
  /// The receiver is expected to be ASCII; this is checked with `assert`.
  var _asciiIsWord: Bool {
    assert(_isASCII)
    if self == ._underscore {
      return true
    }
    return _asciiIsLetter || _asciiIsDigit
  }
}
extension String {
  /// Fast-path read of the character that starts at `idx`, decided purely
  /// from UTF-8 bytes.
  ///
  /// Returns `nil` whenever the answer cannot be settled from the bytes
  /// inspected here:
  ///  - `idx == end`;
  ///  - the byte at `idx` is not ASCII;
  ///  - the byte following the character (when it lies before `end`) fails
  ///    `_isSub300StartingByte` (presumably meaning it could begin a scalar
  ///    that combines with the preceding character; confirm against that
  ///    property's definition).
  ///
  /// On success, `first` is the leading ASCII byte and `next` is the UTF-8
  /// index just past the character. A carriage return immediately followed
  /// by a line feed is reported as one unit: `crLF` is `true` and `next` is
  /// the index after the line feed.
  ///
  /// TODO: better to take isScalarSemantics parameter, we can return more
  /// results and we can give the right `next` index, not requiring the
  /// caller to re-adjust it
  /// TODO: detailed description of nuanced semantics
  ///
  /// - Parameters:
  ///   - idx: Position to inspect; asserted to be scalar-aligned and not
  ///     past `end`.
  ///   - end: Upper bound; bytes at or beyond it are never read.
  func _quickASCIICharacter(
    at idx: Index,
    limitedBy end: Index
  ) -> (first: UInt8, next: Index, crLF: Bool)? {
    // TODO: fastUTF8 version
    assert(String.Index(idx, within: unicodeScalars) != nil)
    assert(idx <= end)

    guard idx != end else { return nil }

    let lead = utf8[idx]
    if !lead._isASCII {
      assert(!self[idx].isASCII)
      return nil
    }

    let afterLead = utf8.index(after: idx)
    guard afterLead != end else {
      // Nothing follows within bounds, so the lone ASCII byte is the
      // whole character.
      return (first: lead, next: afterLead, crLF: false)
    }

    let follower = utf8[afterLead]
    if !follower._isSub300StartingByte {
      return nil
    }

    let isCRLF = lead == ._carriageReturn && follower == ._lineFeed
    guard isCRLF else {
      assert(self[idx].isASCII && self[idx] != "\r\n")
      return (first: lead, next: afterLead, crLF: false)
    }

    // CR-LF: step over the line feed as well, then apply the same
    // follower check to whatever comes after the pair.
    let afterLF = utf8.index(after: afterLead)
    if afterLF != end && !utf8[afterLF]._isSub300StartingByte {
      return nil
    }
    return (first: lead, next: afterLF, crLF: true)
  }

  /// ASCII fast path for matching the built-in character class `cc` at
  /// `idx`.
  ///
  /// Returns `nil` when `_quickASCIICharacter` cannot produce a result.
  /// Otherwise returns whether the leading ASCII byte belongs to `cc`,
  /// together with the index just past the inspected character.
  ///
  /// When `isScalarSemantics` is `true` and the character is a CR-LF pair,
  /// a successful `.verticalWhitespace` or `.whitespace` match moves the
  /// returned index back one UTF-8 position so that only the carriage
  /// return is consumed. `.newlineSequence` always keeps the pair together.
  ///
  /// - Parameters:
  ///   - cc: The character class to test against.
  ///   - idx: Position at which to match.
  ///   - end: Upper bound of the searched range.
  ///   - isScalarSemantics: Whether matching is per Unicode scalar rather
  ///     than per grapheme cluster.
  func _quickMatch(
    _ cc: _CharacterClassModel.Representation,
    at idx: Index,
    limitedBy end: Index,
    isScalarSemantics: Bool
  ) -> (next: Index, matchResult: Bool)? {
    guard let quick = _quickASCIICharacter(at: idx, limitedBy: end) else {
      return nil
    }
    let byte = quick.first
    let next = quick.next

    // TODO: bitvectors
    let matched: Bool
    // Whether this class consumes only the CR of a CR-LF pair under
    // scalar semantics.
    var splitsCRLF = false

    switch cc {
    case .any, .anyGrapheme:
      matched = true
    case .digit:
      matched = byte._asciiIsDigit
    case .horizontalWhitespace:
      matched = byte._asciiIsHorizontalWhitespace
    case .verticalWhitespace:
      matched = byte._asciiIsVerticalWhitespace
      splitsCRLF = true
    case .newlineSequence:
      matched = byte._asciiIsVerticalWhitespace
    case .whitespace:
      matched = byte._asciiIsWhitespace
      splitsCRLF = true
    case .word:
      matched = byte._asciiIsWord
    }

    if matched && splitsCRLF && isScalarSemantics && quick.crLF {
      // `next` is past the LF; back up so it sits just after the CR.
      return (utf8.index(before: next), true)
    }
    return (next, matched)
  }
}
|