File: _CharacterClassModel.swift

package info (click to toggle)
swiftlang 6.0.3-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,519,992 kB
  • sloc: cpp: 9,107,863; ansic: 2,040,022; asm: 1,135,751; python: 296,500; objc: 82,456; f90: 60,502; lisp: 34,951; pascal: 19,946; sh: 18,133; perl: 7,482; ml: 4,937; javascript: 4,117; makefile: 3,840; awk: 3,535; xml: 914; fortran: 619; cs: 573; ruby: 573
file content (184 lines) | stat: -rw-r--r-- 5,587 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//

@_implementationOnly import _RegexParser

// NOTE: This is a model type. We want to be able to get one from
// an AST, but this isn't a natural thing to produce in the context
// of parsing or to store in an AST

struct _CharacterClassModel: Hashable {
  /// The built-in character class that this model matches.
  let cc: Representation

  /// The semantic level (character or Unicode scalar) at which matching
  /// is performed.
  let matchLevel: MatchingOptions.SemanticLevel

  /// Whether this character class only matches ASCII characters.
  let isStrictASCII: Bool

  /// Whether this character class matches against an inverse,
  /// e.g. `\D`, `\S`, `[^abc]`.
  let isInverted: Bool

  /// Creates a model for `cc`.
  ///
  /// - Parameter cc: The character class to match.
  /// - Parameter options: The matching options; they supply the semantic
  ///   level and decide whether `cc` is matched as strict ASCII.
  /// - Parameter isInverted: Whether the class matches the inverse of `cc`.
  init(
    cc: Representation,
    options: MatchingOptions,
    isInverted: Bool
  ) {
    self.cc = cc
    self.isInverted = isInverted
    self.matchLevel = options.semanticLevel
    self.isStrictASCII = cc.isStrictAscii(options: options)
  }

  /// The built-in character classes that can be modeled.
  enum Representation: UInt64, Hashable {
    /// Any character
    case any = 0
    /// Any grapheme cluster
    case anyGrapheme
    /// Character.isDigit
    case digit
    /// Horizontal whitespace: `[:blank:]`, i.e.
    /// `[\p{gc=Space_Separator}\N{CHARACTER TABULATION}]`
    case horizontalWhitespace
    /// Character.isNewline
    case newlineSequence
    /// Vertical whitespace: `[\u{0A}-\u{0D}\u{85}\u{2028}\u{2029}]`
    case verticalWhitespace
    /// Character.isWhitespace
    case whitespace
    /// Character.isLetter or Character.isDigit or Character == "_"
    case word
  }

  /// Returns the end of the match of this character class in the string.
  ///
  /// - Parameter input: The string to match against.
  /// - Parameter currentPosition: The index at which to start matching.
  /// - Parameter end: The upper bound for the match; when `currentPosition`
  ///   is not strictly before it, there is no match.
  /// - Returns: The index of the end of the match, or `nil` if there is no match.
  func matches(
    in input: String,
    at currentPosition: String.Index,
    limitedBy end: String.Index
  ) -> String.Index? {
    // FIXME: This is only called in custom character classes that contain builtin
    // character classes as members (ie: [a\w] or set operations), is there
    // any way to avoid that? Can we remove this somehow?
    if currentPosition >= end {
      // Nothing left to match against.
      return nil
    }

    return input.matchBuiltinCC(
      cc,
      at: currentPosition,
      limitedBy: end,
      isInverted: isInverted,
      isStrictASCII: isStrictASCII,
      isScalarSemantics: matchLevel == .unicodeScalar)
  }
}

extension _CharacterClassModel.Representation {
  /// Reports whether this character class should be matched as strict ASCII
  /// under the given options.
  ///
  /// - Parameter options: The matching options consulted for the
  ///   ASCII-only digit, space, and word settings.
  /// - Returns: The value of the relevant ASCII option for this class, or
  ///   `false` for `.any` and `.anyGrapheme`.
  func isStrictAscii(options: MatchingOptions) -> Bool {
    switch self {
    case .digit:
      return options.usesASCIIDigits
    case .word:
      return options.usesASCIIWord
    // Every whitespace-like class is governed by the same option.
    case .horizontalWhitespace,
         .newlineSequence,
         .verticalWhitespace,
         .whitespace:
      return options.usesASCIISpaces
    case .any, .anyGrapheme:
      return false
    }
  }
}

extension _CharacterClassModel.Representation: CustomStringConvertible {
  /// A short, angle-bracketed label naming this character class.
  ///
  /// Every case uses the same `<...>` form, including `.verticalWhitespace`.
  var description: String {
    switch self {
    case .any: return "<any>"
    case .anyGrapheme: return "<any grapheme>"
    case .digit: return "<digit>"
    case .horizontalWhitespace: return "<horizontal whitespace>"
    case .newlineSequence: return "<newline sequence>"
    case .verticalWhitespace: return "<vertical whitespace>"
    case .whitespace: return "<whitespace>"
    case .word: return "<word>"
    }
  }
}

extension _CharacterClassModel: CustomStringConvertible {
  /// The description of `cc`, prefixed with `"not "` when the class is
  /// inverted.
  var description: String {
    let label = String(describing: cc)
    guard isInverted else {
      return label
    }
    return "not " + label
  }
}

extension DSLTree.Atom.CharacterClass {
  /// Converts this DSLTree CharacterClass into our runtime representation.
  ///
  /// Each `not…` case maps to the same representation as its positive
  /// counterpart, with `isInverted` set to `true`.
  ///
  /// - Parameter options: The matching options forwarded to the model's
  ///   initializer.
  /// - Returns: The runtime model for this character class. Traps with
  ///   `fatalError` for `.anyUnicodeScalar`.
  func asRuntimeModel(_ options: MatchingOptions) -> _CharacterClassModel {
    // Builds the model for a representation; non-inverted unless stated.
    func model(
      _ representation: _CharacterClassModel.Representation,
      inverted: Bool = false
    ) -> _CharacterClassModel {
      return _CharacterClassModel(
        cc: representation, options: options, isInverted: inverted)
    }

    switch self {
    case .digit:
      return model(.digit)
    case .notDigit:
      return model(.digit, inverted: true)

    case .horizontalWhitespace:
      return model(.horizontalWhitespace)
    case .notHorizontalWhitespace:
      return model(.horizontalWhitespace, inverted: true)

    case .newlineSequence:
      return model(.newlineSequence)

    // FIXME: This is more like '.' than inverted '\R', as it is affected
    // by e.g (*CR). We should therefore really be emitting it through
    // emitDot(). For now we treat it as semantically invalid.
    case .notNewline:
      return model(.newlineSequence, inverted: true)

    case .whitespace:
      return model(.whitespace)
    case .notWhitespace:
      return model(.whitespace, inverted: true)

    case .verticalWhitespace:
      return model(.verticalWhitespace)
    case .notVerticalWhitespace:
      return model(.verticalWhitespace, inverted: true)

    case .word:
      return model(.word)
    case .notWord:
      return model(.word, inverted: true)

    case .anyGrapheme:
      return model(.anyGrapheme)
    case .anyUnicodeScalar:
      fatalError("Unsupported")
    }
  }
}