File: DelimiterLexing.swift

package info (click to toggle)
swiftlang 6.0.3-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,519,992 kB
  • sloc: cpp: 9,107,863; ansic: 2,040,022; asm: 1,135,751; python: 296,500; objc: 82,456; f90: 60,502; lisp: 34,951; pascal: 19,946; sh: 18,133; perl: 7,482; ml: 4,937; javascript: 4,117; makefile: 3,840; awk: 3,535; xml: 914; fortran: 619; cs: 573; ruby: 573
file content (390 lines) | stat: -rw-r--r-- 12,820 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//

public struct Delimiter: Hashable {
  let kind: Kind
  let poundCount: Int

  init(_ kind: Kind, poundCount: Int) {
    precondition(kind.allowsExtendedPoundSyntax || poundCount == 0)
    self.kind = kind
    self.poundCount = poundCount
  }

  var opening: String {
    String(repeating: "#", count: poundCount) + kind.opening
  }
  var closing: String {
    kind.closing + String(repeating: "#", count: poundCount)
  }

  /// Whether or not multi-line mode is permitted.
  var allowsMultiline: Bool {
    switch kind {
    case .forwardSlash:
      return poundCount > 0
    case .experimental:
      return false
    }
  }

  /// The delimiters which are currently enabled.
  static var enabledDelimiters: [Kind] { [.forwardSlash] }

  /// All known delimiters.
  static var allDelimiters: [Kind] { Kind.allCases }
}

extension Delimiter {
  enum Kind: Hashable, CaseIterable {
    case forwardSlash
    case experimental

    var openingAndClosing: (opening: String, closing: String) {
      switch self {
      case .forwardSlash: return ("/", "/")
      case .experimental: return ("#|", "|#")
      }
    }
    var opening: String { openingAndClosing.opening }
    var closing: String { openingAndClosing.closing }

    /// Whether or not extended pound syntax e.g `##/.../##` is allowed with
    /// this delimiter.
    var allowsExtendedPoundSyntax: Bool {
      switch self {
      case .forwardSlash:
        return true
      case .experimental:
        return false
      }
    }
  }
}

/// A wrapper for an `UnsafeRawPointer` representing a source location, marked
/// `@unchecked Sendable` to help errors carrying such information satisfy their
/// `Sendable` requirement.
///
/// TODO: consider further refactoring to avoid needing to lie to the compiler
/// about Sendability.
struct UnsafeSourceLocation: @unchecked Sendable {
  let ptr: UnsafeRawPointer

  init(_ ptr: UnsafeRawPointer) {
    self.ptr = ptr
  }
}

public struct DelimiterLexError: Error, CustomStringConvertible {
  public enum Kind: Hashable {
    case unterminated
    case invalidUTF8 // TODO: better range reporting
    case unknownDelimiter
    case unprintableASCII
    case multilineClosingNotOnNewline
  }

  public var kind: Kind

  var resumeLocation: UnsafeSourceLocation

  /// The pointer at which to resume lexing.
  public var resumePtr: UnsafeRawPointer { resumeLocation.ptr }

  init(_ kind: Kind, resumeAt resumePtr: UnsafeRawPointer) {
    self.kind = kind
    self.resumeLocation = UnsafeSourceLocation(resumePtr)
  }

  public var description: String {
    switch kind {
    case .unterminated: return "unterminated regex literal"
    case .invalidUTF8: return "invalid UTF-8 found in source file"
    case .unknownDelimiter: return "unknown regex literal delimiter"
    case .unprintableASCII: return "unprintable ASCII character found in source file"
    case .multilineClosingNotOnNewline: return "closing delimiter must appear on new line"
    }
  }
}

fileprivate struct DelimiterLexer {
  let start: UnsafeRawPointer
  var cursor: UnsafeRawPointer
  let end: UnsafeRawPointer

  var firstNewline: UnsafeRawPointer?
  var isMultiline: Bool { firstNewline != nil }

  let delimiters: [Delimiter.Kind]

  init(start: UnsafeRawPointer, end: UnsafeRawPointer,
       delimiters: [Delimiter.Kind]) {
    precondition(start <= end)
    self.start = start
    self.cursor = start
    self.end = end
    self.delimiters = delimiters
  }

  func ascii(_ s: Unicode.Scalar) -> UInt8 {
    assert(s.value <= 0x7F)
    return UInt8(asserting: s.value)
  }

  /// Return the byte at the current cursor, or `nil` if the end of the buffer
  /// has been reached.
  func load() -> UInt8? {
    guard cursor < end else { return nil }
    return cursor.load(as: UInt8.self)
  }

  /// Return the slice of `count` bytes from a specified cursor position, or
  /// `nil` if there are fewer than `count` bytes until the end of the buffer.
  func slice(
    at cursor: UnsafeRawPointer, _ count: Int
  ) -> UnsafeRawBufferPointer? {
    guard cursor + count <= end else { return nil }
    return UnsafeRawBufferPointer(start: cursor, count: count)
  }

  /// Return the slice of `count` bytes from the current cursor, or `nil` if
  /// there are fewer than `count` bytes until the end of the buffer.
  func slice(_ count: Int) -> UnsafeRawBufferPointer? {
    slice(at: cursor, count)
  }

  /// Advance the cursor `n` bytes.
  mutating func advanceCursor(_ n: Int = 1) {
    cursor += n
    precondition(cursor <= end, "Cannot advance past end")
  }

  /// Check to see if a byte sequence can be eaten from the current cursor.
  func canEat<C : Collection>(_ bytes: C) -> Bool where C.Element == UInt8 {
    guard let slice = slice(bytes.count) else { return false }
    return slice.elementsEqual(bytes)
  }

  /// Attempt to eat a byte sequence, returning `true` if successful.
  mutating func tryEat<C : Collection>(
    _ bytes: C
  ) -> Bool where C.Element == UInt8 {
    guard canEat(bytes) else { return false }
    advanceCursor(bytes.count)
    return true
  }

  /// Attempt to eat an ascii scalar, returning `true` if successful.
  mutating func tryEat(ascii s: Unicode.Scalar) -> Bool {
    guard load() == ascii(s) else { return false }
    advanceCursor()
    return true
  }

  /// Attempt to eat a particular closing delimiter, returning the contents of
  /// the literal, and ending pointer, or `nil` if this is not a delimiter
  /// ending.
  mutating func tryEatEnding(
    _ delimiter: Delimiter, contentsStart: UnsafeRawPointer
  ) throws -> (contents: String, end: UnsafeRawPointer)? {
    let contentsEnd = cursor
    guard tryEat(delimiter.closing.utf8) else { return nil }

    let count = contentsEnd - contentsStart
    let contents = UnsafeRawBufferPointer(
      start: contentsStart, count: count)

    // In multi-line mode, we must be on a new line. So scan backwards and make
    // sure we only have whitespace until the newline.
    if isMultiline {
      let idx = contents.lastIndex(
        where: { $0 == ascii("\n") || $0 == ascii("\r") })! + 1
      guard contents[idx...].all({ $0 == ascii(" ") || $0 == ascii("\t") })
      else {
        throw DelimiterLexError(.multilineClosingNotOnNewline, resumeAt: cursor)
      }
    }

    // Form a string from the contents and make sure it's valid UTF-8.
    let s = String(decoding: contents, as: UTF8.self)
    guard s.utf8.elementsEqual(contents) else {
      throw DelimiterLexError(.invalidUTF8, resumeAt: cursor)
    }
    return (contents: s, end: cursor)
  }

  /// Attempt to advance the lexer, throwing an error if the end of a line or
  /// the end of the buffer is reached.
  mutating func advance(escaped: Bool = false) throws {
    guard let next = load() else {
      // We've hit the end of the buffer. In multi-line mode, we don't want to
      // skip over what is likely otherwise valid Swift code, so resume from the
      // first newline.
      throw DelimiterLexError(.unterminated, resumeAt: firstNewline ?? cursor)
    }
    switch UnicodeScalar(next) {
    case let next where !next.isASCII:
      // Just advance into a UTF-8 sequence. It shouldn't matter that we'll
      // iterate through each byte as we only match against ASCII, and we
      // validate it at the end. This case is separated out so we can just deal
      // with the ASCII cases below.
      advanceCursor()

    case "\n", "\r":
      guard isMultiline else {
        throw DelimiterLexError(.unterminated, resumeAt: cursor)
      }
      advanceCursor()

    case "\0":
      // TODO: Warn to match the behavior of String literal lexer? Or should
      // we error as unprintable?
      advanceCursor()

    case "\\" where !escaped:
      // Advance again for an escape sequence.
      advanceCursor()
      try advance(escaped: true)

    case let next
      where !next.isPrintableASCII && !(isMultiline && next == "\t"):
      // Diagnose unprintable ASCII.
      // Note that tabs are allowed in multi-line literals.
      // TODO: This matches the string literal behavior, but should we allow
      // tabs for single-line regex literals too?
      // TODO: Ideally we would recover and continue to lex until the ending
      // delimiter.
      throw DelimiterLexError(.unprintableASCII, resumeAt: cursor.successor())

    default:
      advanceCursor()
    }
  }

  mutating func tryLexOpeningDelimiter(poundCount: Int) -> Delimiter? {
    for kind in delimiters {
      // If the delimiter allows extended pound syntax, or there are no pounds,
      // we just need to lex it.
      let opening = kind.opening.utf8
      if kind.allowsExtendedPoundSyntax || poundCount == 0 {
        guard tryEat(opening) else { continue }
        return Delimiter(kind, poundCount: poundCount)
      }

      // The delimiter doesn't allow extended pound syntax, so the pounds must be
      // part of the delimiter.
      guard
        poundCount < opening.count,
        opening.prefix(poundCount)
          .elementsEqual(repeatElement(ascii("#"), count: poundCount)),
        tryEat(opening.dropFirst(poundCount))
      else { continue }

      return Delimiter(kind, poundCount: 0)
    }
    return nil
  }

  /*consuming*/ mutating func lex(
  ) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
    // We can consume any number of pound signs.
    var poundCount = 0
    while tryEat(ascii: "#") {
      poundCount += 1
    }

    // Try to lex the opening delimiter.
    guard let delimiter = tryLexOpeningDelimiter(poundCount: poundCount) else {
      throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor())
    }
    let contentsStart = cursor

    // If the delimiter allows multi-line, try skipping over any whitespace to a
    // newline character. If we can do that, we enter multi-line mode.
    if delimiter.allowsMultiline {
      while let next = load() {
        switch next {
        case ascii(" "), ascii("\t"):
          advanceCursor()
          continue
        case ascii("\n"), ascii("\r"):
          firstNewline = cursor
        default:
          break
        }
        break
      }
    }
    while true {
      // Try to lex the closing delimiter.
      if let (contents, end) = try tryEatEnding(delimiter,
                                                contentsStart: contentsStart) {
        return (contents, delimiter, end)
      }
      // Try to advance the lexer.
      try advance()
    }
  }
}

/// Drop a set of regex delimiters from the input string, returning the contents
/// and the delimiter used. The input string must have valid delimiters.
func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) {
  func stripDelimiter(_ kind: Delimiter.Kind) -> (String, Delimiter)? {
    var slice = str.utf8[...]

    // Try strip any number of opening '#'s.
    var poundCount = 0
    if kind.allowsExtendedPoundSyntax {
      poundCount = slice.prefix(while: {
        $0 == UInt8(("#" as UnicodeScalar).value)
      }).count
      slice = slice.dropFirst(poundCount)
    }

    // The opening delimiter must match.
    guard var slice = slice.tryDropPrefix(kind.opening.utf8)
    else { return nil }

    // The closing delimiter may optionally match, as it may not be present in
    // invalid code.
    let delim = Delimiter(kind, poundCount: poundCount)
    if let newSlice = slice.tryDropSuffix(delim.closing.utf8) {
      slice = newSlice
    }
    let result = String(decoding: slice, as: UTF8.self)
    precondition(result.utf8.elementsEqual(slice))
    return (result, delim)
  }
  for kind in Delimiter.allDelimiters {
    if let (contents, d) = stripDelimiter(kind) {
      return (contents, d)
    }
  }
  fatalError("No valid delimiters")
}

/// Attempt to lex a regex literal between `start` and `end`, returning either
/// the contents and pointer from which to resume lexing, or an error.
func lexRegex(
  start: UnsafeRawPointer, end: UnsafeRawPointer,
  delimiters: [Delimiter.Kind] = Delimiter.enabledDelimiters
) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
  var lexer = DelimiterLexer(start: start, end: end, delimiters: delimiters)
  return try lexer.lex()
}

public func lexRegex(
  start: UnsafeRawPointer, end: UnsafeRawPointer
) throws -> (contents: String, Delimiter, end: UnsafeRawPointer)  {
  return try lexRegex(start: start, end: end, delimiters: Delimiter.enabledDelimiters)
}