1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184
|
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//
@_implementationOnly import _RegexParser
// NOTE: This is a model type. We want to be able to get one from
// an AST, but it isn't a natural thing to produce in the context
// of parsing, or to store in an AST.
struct _CharacterClassModel: Hashable {
  /// The builtin character class to match.
  let cc: Representation

  /// The semantic level (character or Unicode scalar) at which to match.
  let matchLevel: MatchingOptions.SemanticLevel

  /// Whether this character class only matches ASCII characters.
  let isStrictASCII: Bool

  /// Whether this character class matches against an inverse,
  /// e.g. `\D`, `\S`, `[^abc]`.
  let isInverted: Bool

  /// Creates a model for `cc`.
  ///
  /// - Parameter cc: The character class representation to match.
  /// - Parameter options: Supplies the semantic level (`semanticLevel`) and is
  ///   passed to `cc.isStrictAscii(options:)` to derive `isStrictASCII`.
  /// - Parameter isInverted: Whether the class is matched as its inverse.
  init(
    cc: Representation,
    options: MatchingOptions,
    isInverted: Bool
  ) {
    self.cc = cc
    self.isInverted = isInverted
    self.matchLevel = options.semanticLevel
    self.isStrictASCII = cc.isStrictAscii(options: options)
  }

  /// The kinds of builtin character class this model can represent.
  enum Representation: UInt64, Hashable {
    /// Any character
    case any = 0
    /// Any grapheme cluster
    case anyGrapheme
    /// Character.isDigit
    case digit
    /// Horizontal whitespace: `[:blank:]`, i.e.
    /// `[\p{gc=Space_Separator}\N{CHARACTER TABULATION}]`
    case horizontalWhitespace
    /// Character.isNewline
    case newlineSequence
    /// Vertical whitespace: `[\u{0A}-\u{0D}\u{85}\u{2028}\u{2029}]`
    case verticalWhitespace
    /// Character.isWhitespace
    case whitespace
    /// Character.isLetter or Character.isDigit or Character == "_"
    case word
  }

  /// Returns the end of the match of this character class in the string.
  ///
  /// - Parameter input: The string to match against.
  /// - Parameter currentPosition: The index to start matching.
  /// - Parameter end: The upper bound of the match; when `currentPosition`
  ///   is not strictly before it, the result is `nil`.
  /// - Returns: The index of the end of the match, or `nil` if there is no match.
  func matches(
    in input: String,
    at currentPosition: String.Index,
    limitedBy end: String.Index
  ) -> String.Index? {
    // FIXME: This is only called in custom character classes that contain builtin
    // character classes as members (ie: [a\w] or set operations), is there
    // any way to avoid that? Can we remove this somehow?
    if currentPosition >= end {
      // Nothing left to match before the limit.
      return nil
    }
    return input.matchBuiltinCC(
      cc,
      at: currentPosition,
      limitedBy: end,
      isInverted: isInverted,
      isStrictASCII: isStrictASCII,
      isScalarSemantics: matchLevel == .unicodeScalar)
  }
}
extension _CharacterClassModel.Representation {
  /// Returns true if this character class should be matched by strict ASCII
  /// under the given options.
  ///
  /// - Parameter options: The matching options whose ASCII-only flags are read.
  /// - Returns: `options.usesASCIIDigits` for `.digit`,
  ///   `options.usesASCIISpaces` for the whitespace and newline classes,
  ///   `options.usesASCIIWord` for `.word`, and `false` for `.any` and
  ///   `.anyGrapheme`.
  func isStrictAscii(options: MatchingOptions) -> Bool {
    switch self {
    case .digit:
      return options.usesASCIIDigits
    case .horizontalWhitespace,
         .newlineSequence,
         .verticalWhitespace,
         .whitespace:
      return options.usesASCIISpaces
    case .word:
      return options.usesASCIIWord
    case .any, .anyGrapheme:
      // Listed explicitly (instead of `default`) so that adding a new
      // representation forces a decision here at compile time.
      return false
    }
  }
}
extension _CharacterClassModel.Representation: CustomStringConvertible {
  /// A short, angle-bracketed label naming this character class,
  /// e.g. `<digit>`.
  var description: String {
    switch self {
    case .any: return "<any>"
    case .anyGrapheme: return "<any grapheme>"
    case .digit: return "<digit>"
    case .horizontalWhitespace: return "<horizontal whitespace>"
    case .newlineSequence: return "<newline sequence>"
    // Previously lacked the angle brackets used by every other case.
    case .verticalWhitespace: return "<vertical whitespace>"
    case .whitespace: return "<whitespace>"
    case .word: return "<word>"
    }
  }
}
extension _CharacterClassModel: CustomStringConvertible {
  /// The description of `cc`, prefixed with "not " when the class is inverted.
  var description: String {
    let prefix = isInverted ? "not " : ""
    return prefix + String(describing: cc)
  }
}
extension DSLTree.Atom.CharacterClass {
  /// Converts this DSLTree character class into the runtime model.
  ///
  /// Each `not…` case maps to the same representation as its positive
  /// counterpart with `isInverted` set to `true`. `.anyUnicodeScalar` is not
  /// supported and traps.
  ///
  /// - Parameter options: The matching options forwarded to the model's
  ///   initializer.
  /// - Returns: The runtime model for this character class.
  func asRuntimeModel(_ options: MatchingOptions) -> _CharacterClassModel {
    // Builds the model for a representation, optionally inverted.
    func model(
      _ representation: _CharacterClassModel.Representation,
      inverted: Bool = false
    ) -> _CharacterClassModel {
      return _CharacterClassModel(
        cc: representation, options: options, isInverted: inverted)
    }

    switch self {
    case .digit:
      return model(.digit)
    case .notDigit:
      return model(.digit, inverted: true)

    case .horizontalWhitespace:
      return model(.horizontalWhitespace)
    case .notHorizontalWhitespace:
      return model(.horizontalWhitespace, inverted: true)

    case .newlineSequence:
      return model(.newlineSequence)
    // FIXME: This is more like '.' than inverted '\R', as it is affected
    // by e.g (*CR). We should therefore really be emitting it through
    // emitDot(). For now we treat it as semantically invalid.
    case .notNewline:
      return model(.newlineSequence, inverted: true)

    case .whitespace:
      return model(.whitespace)
    case .notWhitespace:
      return model(.whitespace, inverted: true)

    case .verticalWhitespace:
      return model(.verticalWhitespace)
    case .notVerticalWhitespace:
      return model(.verticalWhitespace, inverted: true)

    case .word:
      return model(.word)
    case .notWord:
      return model(.word, inverted: true)

    case .anyGrapheme:
      return model(.anyGrapheme)
    case .anyUnicodeScalar:
      fatalError("Unsupported")
    }
  }
}
|