1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300
|
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//
@_implementationOnly import _RegexParser
@available(SwiftStdlib 5.7, *)
extension Regex {
  /// Returns a regular expression that performs case-insensitive matching.
  ///
  /// - Parameter ignoresCase: A Boolean value indicating whether case should
  ///   be ignored. The default is `true`.
  /// - Returns: The modified regular expression.
  public func ignoresCase(_ ignoresCase: Bool = true) -> Regex<RegexOutput> {
    return wrapInOption(.caseInsensitive, addingIf: ignoresCase)
  }

  /// Returns a regular expression in which only ASCII characters are treated
  /// as word characters.
  ///
  /// - Parameter useASCII: A Boolean value indicating whether only ASCII
  ///   characters should match as word characters. The default is `true`.
  /// - Returns: The modified regular expression.
  public func asciiOnlyWordCharacters(_ useASCII: Bool = true) -> Regex<RegexOutput> {
    return wrapInOption(.asciiOnlyWord, addingIf: useASCII)
  }

  /// Returns a regular expression in which only ASCII characters are treated
  /// as digits.
  ///
  /// - Parameter useASCII: A Boolean value indicating whether only ASCII
  ///   characters should match as digits. The default is `true`.
  /// - Returns: The modified regular expression.
  public func asciiOnlyDigits(_ useASCII: Bool = true) -> Regex<RegexOutput> {
    return wrapInOption(.asciiOnlyDigit, addingIf: useASCII)
  }

  /// Returns a regular expression in which only ASCII characters are treated
  /// as space characters.
  ///
  /// - Parameter useASCII: A Boolean value indicating whether only ASCII
  ///   characters should match as space characters. The default is `true`.
  /// - Returns: The modified regular expression.
  public func asciiOnlyWhitespace(_ useASCII: Bool = true) -> Regex<RegexOutput> {
    return wrapInOption(.asciiOnlySpace, addingIf: useASCII)
  }

  /// Returns a regular expression in which character classes match only
  /// ASCII characters.
  ///
  /// - Parameter useASCII: A Boolean value indicating whether only ASCII
  ///   characters should match when matching character classes. The default
  ///   is `true`.
  /// - Returns: The modified regular expression.
  public func asciiOnlyCharacterClasses(_ useASCII: Bool = true) -> Regex<RegexOutput> {
    return wrapInOption(.asciiOnlyPOSIXProps, addingIf: useASCII)
  }

  /// Returns a regular expression that determines word boundaries with the
  /// specified algorithm.
  ///
  /// - Parameter wordBoundaryKind: The algorithm to use for determining word
  ///   boundaries.
  /// - Returns: The modified regular expression.
  public func wordBoundaryKind(_ wordBoundaryKind: RegexWordBoundaryKind) -> Regex<RegexOutput> {
    // The Unicode word boundary option is added for `.default` and removed
    // for any other kind.
    let usesDefaultBoundaries = wordBoundaryKind == .default
    return wrapInOption(.unicodeWordBoundaries, addingIf: usesDefaultBoundaries)
  }

  /// Returns a regular expression where the "any" metacharacter (`.`)
  /// also matches newline characters.
  ///
  /// - Parameter dotMatchesNewlines: A Boolean value indicating whether `.`
  ///   should match a newline character. The default is `true`.
  /// - Returns: The modified regular expression.
  public func dotMatchesNewlines(_ dotMatchesNewlines: Bool = true) -> Regex<RegexOutput> {
    return wrapInOption(.singleLine, addingIf: dotMatchesNewlines)
  }

  /// Returns a regular expression where the start-of-input and end-of-input
  /// anchors (`^` and `$`) also match at the start and end of each line.
  ///
  /// Calling this method is equivalent to applying the `m` option in regex
  /// syntax. To get this behavior in the `RegexBuilder` syntax, see
  /// `Anchor.startOfLine`, `Anchor.endOfLine`, `Anchor.startOfSubject`,
  /// and `Anchor.endOfSubject`.
  ///
  /// - Parameter matchLineEndings: A Boolean value indicating whether `^` and
  ///   `$` should match the start and end of lines, respectively. The default
  ///   is `true`.
  /// - Returns: The modified regular expression.
  public func anchorsMatchLineEndings(_ matchLineEndings: Bool = true) -> Regex<RegexOutput> {
    return wrapInOption(.multiline, addingIf: matchLineEndings)
  }

  /// Returns a regular expression whose quantifiers use the specified
  /// behavior by default.
  ///
  /// Quantifier methods that are called with an explicit `behavior`
  /// parameter, such as `OneOrMore`, are not affected by this setting.
  ///
  /// Passing `.eager` or `.reluctant` to this method corresponds to applying
  /// the `(?-U)` or `(?U)` option in regex syntax, respectively.
  ///
  /// - Parameter behavior: The default behavior to use for quantifiers.
  /// - Returns: The modified regular expression.
  public func repetitionBehavior(_ behavior: RegexRepetitionBehavior) -> Regex<RegexOutput> {
    switch behavior.kind {
    case .possessive:
      return wrapInOption(.possessiveByDefault, addingIf: true)
    case .reluctant:
      return wrapInOption(.reluctantByDefault, addingIf: true)
    case .eager:
      // Eager matching is expressed by removing the reluctant-by-default
      // option.
      return wrapInOption(.reluctantByDefault, addingIf: false)
    }
  }

  /// Returns a regular expression that matches at the specified semantic
  /// level.
  ///
  /// With grapheme cluster semantics (the default), metacharacters like `.`
  /// and `\w`, custom character classes, and character class instances like
  /// `.any` match a grapheme cluster when possible, corresponding with the
  /// default string representation. Grapheme cluster semantics also compare
  /// characters using their canonical representation, corresponding with how
  /// string comparison works.
  ///
  /// With Unicode scalar semantics, metacharacters and character classes
  /// always match a single Unicode scalar value, even if that scalar
  /// comprises part of a grapheme cluster.
  ///
  /// The two semantic levels can produce different results, especially for
  /// strings that contain decomposed characters. In the following example,
  /// `queRegex` matches any 3-character string that begins with `"q"`.
  ///
  ///     let composed = "qué"
  ///     let decomposed = "que\u{301}"
  ///
  ///     let queRegex = /^q..$/
  ///
  ///     print(composed.contains(queRegex))
  ///     // Prints "true"
  ///     print(decomposed.contains(queRegex))
  ///     // Prints "true"
  ///
  /// With Unicode scalar semantics, however, the regular expression matches
  /// only the composed version of the string, because each `.` matches a
  /// single Unicode scalar value.
  ///
  ///     let queRegexScalar = queRegex.matchingSemantics(.unicodeScalar)
  ///     print(composed.contains(queRegexScalar))
  ///     // Prints "true"
  ///     print(decomposed.contains(queRegexScalar))
  ///     // Prints "false"
  ///
  /// - Parameter semanticLevel: The semantics to use during matching.
  /// - Returns: The modified regular expression.
  public func matchingSemantics(_ semanticLevel: RegexSemanticLevel) -> Regex<RegexOutput> {
    let semanticsOption: AST.MatchingOption.Kind
    switch semanticLevel.base {
    case .graphemeCluster:
      semanticsOption = .graphemeClusterSemantics
    case .unicodeScalar:
      semanticsOption = .unicodeScalarSemantics
    }
    return wrapInOption(semanticsOption, addingIf: true)
  }

  /// Returns a regular expression that uses an NSRegularExpression
  /// compatibility mode.
  ///
  /// This mode uses Unicode scalar semantics and treats a `dot` as matching
  /// newline sequences (when in the unrelated dot-matches-newlines mode).
  @_spi(Foundation)
  public var _nsreCompatibility: Regex<RegexOutput> {
    let withCompatibleDot = wrapInOption(.nsreCompatibleDot, addingIf: true)
    return withCompatibleDot.wrapInOption(.unicodeScalarSemantics, addingIf: true)
  }
}
/// The semantic level at which a regex performs matching.
///
/// A semantic level selects between matching with the same character-based
/// semantics as string comparisons and matching individual Unicode scalar
/// values. See ``Regex/matchingSemantics(_:)`` for more about changing the
/// semantic level for all or part of a regex.
@available(SwiftStdlib 5.7, *)
public struct RegexSemanticLevel: Hashable {
  /// The underlying storage that distinguishes the available levels.
  internal enum Representation {
    case graphemeCluster
    case unicodeScalar
  }

  /// The level that this value represents.
  internal var base: Representation

  /// Match at the character level.
  ///
  /// Each matched element at this semantic level is a `Character` value.
  /// This is the default semantic level.
  public static var graphemeCluster: RegexSemanticLevel {
    return Self(base: .graphemeCluster)
  }

  /// Match at the Unicode scalar level.
  ///
  /// At this semantic level, matching uses the string's `UnicodeScalarView`,
  /// and each matched element is a `UnicodeScalar` value.
  public static var unicodeScalar: RegexSemanticLevel {
    return Self(base: .unicodeScalar)
  }
}
/// The algorithm that a regex uses to locate word boundaries while matching.
///
/// See ``Regex/wordBoundaryKind(_:)`` for information about specifying the
/// word boundary kind for all or part of a regex.
@available(SwiftStdlib 5.7, *)
public struct RegexWordBoundaryKind: Hashable {
  /// The underlying storage that distinguishes the available algorithms.
  internal enum Representation {
    case unicodeLevel1
    case unicodeLevel2
  }

  /// The algorithm that this value represents.
  internal var base: Representation

  /// A word boundary algorithm that implements the "simple word boundary"
  /// Unicode recommendation.
  ///
  /// A simple word boundary is a position in the input between two characters
  /// that match `/\w\W/` or `/\W\w/`, or between the start or end of the input
  /// and a `\w` character. As a result, word boundaries depend on the
  /// option-defined behavior of `\w`.
  public static var simple: Self {
    return Self(base: .unicodeLevel1)
  }

  /// A word boundary algorithm that implements the "default word boundary"
  /// Unicode recommendation.
  ///
  /// Default word boundaries use a Unicode algorithm that handles some cases
  /// better than simple word boundaries, such as words with internal
  /// punctuation, changes in script, and Emoji.
  public static var `default`: Self {
    return Self(base: .unicodeLevel2)
  }
}
/// A value that specifies how much a quantifier attempts to match.
///
/// See ``Regex/repetitionBehavior(_:)`` for more about specifying the default
/// matching behavior for all or part of a regex.
@available(SwiftStdlib 5.7, *)
public struct RegexRepetitionBehavior: Hashable {
  /// The underlying storage that distinguishes the available behaviors.
  internal enum Kind {
    case eager
    case reluctant
    case possessive
  }

  /// The behavior that this value represents.
  var kind: Kind

  /// The DSL tree quantification kind that corresponds to this behavior.
  @_spi(RegexBuilder) public var dslTreeKind: DSLTree._AST.QuantificationKind {
    let quantification: DSLTree._AST.QuantificationKind
    switch kind {
    case .eager:
      quantification = .eager
    case .reluctant:
      quantification = .reluctant
    case .possessive:
      quantification = .possessive
    }
    return quantification
  }
}
@available(SwiftStdlib 5.7, *)
extension RegexRepetitionBehavior {
  /// Match as much of the input string as possible, backtracking when
  /// necessary.
  public static var eager: Self {
    return Self(kind: .eager)
  }

  /// Match as little of the input string as possible, expanding the matched
  /// region as necessary to complete a match.
  public static var reluctant: Self {
    return Self(kind: .reluctant)
  }

  /// Match as much of the input string as possible, performing no
  /// backtracking.
  public static var possessive: Self {
    return Self(kind: .possessive)
  }
}
// MARK: - Helper method
@available(SwiftStdlib 5.7, *)
extension RegexComponent {
  /// Wraps this component's regex in a non-capturing group that changes a
  /// single matching option.
  ///
  /// - Parameters:
  ///   - option: The matching option to change.
  ///   - shouldAdd: Pass `true` to build a sequence that adds `option`, or
  ///     `false` to build a sequence that removes it.
  /// - Returns: A regex whose root is the option-changing group around this
  ///   component's regex root.
  fileprivate func wrapInOption(
    _ option: AST.MatchingOption.Kind,
    addingIf shouldAdd: Bool
  ) -> Regex<RegexOutput> {
    let sequence: AST.MatchingOptionSequence
    if shouldAdd {
      sequence = AST.MatchingOptionSequence(
        adding: [.init(option, location: .fake)])
    } else {
      sequence = AST.MatchingOptionSequence(
        removing: [.init(option, location: .fake)])
    }
    return Regex(
      node: .nonCapturingGroup(
        .init(ast: .changeMatchingOptions(sequence)),
        regex.root))
  }
}
|