File: Lexer.swift

package info (click to toggle)
swiftlang 6.2.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 2,856,264 kB
  • sloc: cpp: 9,995,718; ansic: 2,234,019; asm: 1,092,167; python: 313,940; objc: 82,726; f90: 80,126; lisp: 38,373; pascal: 25,580; sh: 20,378; ml: 5,058; perl: 4,751; makefile: 4,725; awk: 3,535; javascript: 3,018; xml: 918; fortran: 664; cs: 573; ruby: 396
file content (509 lines) | stat: -rw-r--r-- 18,630 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
import WasmParser

/// The lexical category of a token produced by `Lexer`.
///
/// Literal kinds carry their decoded payload: string tokens hold the decoded
/// bytes, numeric tokens hold an optional explicit sign and the digit pattern.
enum TokenKind: Equatable {
    case leftParen
    case rightParen
    case lineComment
    case blockComment
    case id
    case keyword
    case string([UInt8])
    case integer(FloatingPointSign?, IntegerToken)
    case float(FloatingPointSign?, FloatToken)
    case unknown

    /// `false` for line and block comments, `true` for every other kind.
    var isMeaningful: Bool {
        if case .lineComment = self { return false }
        if case .blockComment = self { return false }
        return true
    }
}

/// The textual shape of a floating-point literal recognized by the lexer.
///
/// The sign of the literal is not stored here; it travels alongside this
/// value in `TokenKind.float`.
enum FloatToken: Equatable {
    /// The literal `inf`.
    case inf
    /// The literal `nan`, or `nan:0x<digits>` in which case `hexPattern`
    /// holds the digits that followed `0x`.
    case nan(hexPattern: String?)
    /// A literal written with a `0x` prefix; the prefix itself is not part of the string.
    case hexPattern(String)
    /// A literal written without a `0x` prefix.
    case decimalPattern(String)
}

/// The textual shape of an integer literal recognized by the lexer.
///
/// The sign of the literal is not stored here; it travels alongside this
/// value in `TokenKind.integer`.
enum IntegerToken: Equatable {
    /// Digits of a literal written with a `0x` prefix; the prefix itself is not part of the string.
    case hexPattern(String)
    /// Digits of a literal written without a `0x` prefix.
    case decimalPattern(String)
}

/// A lexed token: its kind together with the span of source scalars it covers.
struct Token {
    /// Half-open range of the token inside the lexer's input.
    let range: Range<Lexer.Index>
    /// What the lexer classified this span as.
    let kind: TokenKind

    /// Returns the source text covered by this token.
    /// - Parameter lexer: The lexer whose input this token was produced from.
    func text(from lexer: Lexer) -> String {
        let scalars = lexer.cursor.input[range]
        return String(scalars)
    }

    /// Returns the source location of the first scalar of this token.
    /// - Parameter lexer: The lexer whose input this token was produced from.
    func location(in lexer: Lexer) -> Location {
        let source = lexer.cursor.input
        return Location(at: range.lowerBound, in: source)
    }
}

struct Lexer {
    typealias Index = String.UnicodeScalarView.Index
    /// A reading position over the Unicode scalars of the source text.
    ///
    /// `peek` and `match` inspect the input without moving; `next` and the
    /// `eat` family advance past what they consume.
    fileprivate struct Cursor {
        /// The scalars being lexed.
        let input: String.UnicodeScalarView
        /// Index of the next scalar to read; equals `input.endIndex` at end of input.
        var nextIndex: Index

        /// `true` once every scalar has been consumed.
        var isEOF: Bool {
            return nextIndex == input.endIndex
        }

        /// Creates a cursor positioned at the start of `input`.
        init(input: String) {
            self.init(input: input.unicodeScalars)
        }

        /// Creates a cursor positioned at the start of `input`.
        init(input: String.UnicodeScalarView) {
            self.input = input
            self.nextIndex = self.input.startIndex
        }

        /// Seek to the given offset
        /// - Parameter offset: The offset to seek
        mutating func seek(at offset: Index) {
            self.nextIndex = offset
        }

        /// Returns the scalar `offset` positions ahead without consuming anything.
        /// - Parameter offset: Non-negative number of scalars to look ahead (0 is the next scalar).
        /// - Returns: The scalar at that position, or `nil` if it lies at or beyond the end of input.
        func peek(at offset: Int = 0) throws -> Unicode.Scalar? {
            precondition(offset >= 0)
            // `limitedBy:` keeps a look-ahead that overshoots the input from trapping;
            // landing exactly on `endIndex` is also "nothing to read".
            guard
                let index = input.index(nextIndex, offsetBy: offset, limitedBy: input.endIndex),
                index < input.endIndex
            else {
                return nil
            }
            return input[index]
        }

        /// Consumes and returns the next scalar, or returns `nil` at end of input.
        mutating func next() throws -> Unicode.Scalar? {
            guard nextIndex < input.endIndex else { return nil }
            let scalar = input[nextIndex]
            nextIndex = input.index(after: nextIndex)
            return scalar
        }

        /// Consumes the next scalar if it equals `expected`.
        /// - Returns: `true` if the scalar was consumed.
        mutating func eat(_ expected: Unicode.Scalar) throws -> Bool {
            guard try peek() == expected else { return false }
            _ = try next()
            return true
        }

        /// Consumes `expected` if the upcoming scalars spell it out in full.
        /// Nothing is consumed on a partial match.
        /// - Returns: `true` if the whole string was consumed.
        mutating func eat(_ expected: String) throws -> Bool {
            guard let end = indexAfterPrefix(expected) else { return false }
            nextIndex = end
            return true
        }

        /// Consumes the next scalar if it is a member of `expectedSet`.
        /// - Returns: The consumed scalar, or `nil` if nothing was consumed.
        mutating func eatOneOf(_ expectedSet: [Unicode.Scalar]) throws -> Unicode.Scalar? {
            guard let ch = try peek(), expectedSet.contains(ch) else { return nil }
            _ = try next()
            return ch
        }

        /// Check if the next characters match the expected string without consuming them
        /// - Parameters:
        ///   - expected: The expected string
        ///   - eof: Whether if EOF is expected after the string
        /// - Returns: `true` if the next characters match the expected string
        func match(_ expected: String, eof: Bool = false) throws -> Bool {
            guard let end = indexAfterPrefix(expected) else { return false }
            return !eof || end == input.endIndex
        }

        /// Returns the index just past `expected` when the upcoming scalars
        /// spell it out, or `nil` when they do not.
        private func indexAfterPrefix(_ expected: String) -> Index? {
            var index = nextIndex
            for scalar in expected.unicodeScalars {
                guard index < input.endIndex, input[index] == scalar else {
                    return nil
                }
                index = input.index(after: index)
            }
            return index
        }

        /// Returns the current location in line-column style. Line is 1-indexed and column is 0-indexed.
        func currentSourceLocation() -> Location {
            return Location(at: nextIndex, in: self.input)
        }

        /// Builds a parser error carrying `description` and the current location.
        func createError(_ description: String) -> WatParserError {
            return WatParserError(description, location: currentSourceLocation())
        }

        /// Builds the error reported when input ends in the middle of a construct.
        func unexpectedEof() -> WatParserError {
            createError("Unexpected end-of-file")
        }
    }

    fileprivate var cursor: Cursor

    /// Creates a lexer positioned at the start of `input`.
    init(input: String) {
        cursor = Cursor(input: input.unicodeScalars)
    }

    /// Moves the lexer so that the next token is read starting at `offset`.
    /// - Parameter offset: An index into the lexer's input to continue from
    mutating func seek(at offset: Index) {
        cursor.nextIndex = offset
    }

    /// Lexes tokens until one that is not a comment is found.
    /// - Returns: The next meaningful token, or `nil` once the input is exhausted
    mutating func lex() throws -> Token? {
        while let candidate = try rawLex() {
            if candidate.kind.isMeaningful {
                return candidate
            }
        }
        return nil
    }

    /// Lexes the next token, comments included, after skipping leading whitespace.
    /// - Returns: The token, or `nil` if only whitespace (or nothing) remains
    mutating func rawLex() throws -> Token? {
        guard let head = try peekNonWhitespaceChar() else {
            return nil
        }
        guard let kind = try classifyToken(head.byte) else {
            return nil
        }
        // Classification consumed the token, so the cursor now marks its end.
        return Token(range: head.index..<cursor.nextIndex, kind: kind)
    }

    /// Returns the source location of the lexer's current position.
    func location() -> Location {
        Location(at: cursor.nextIndex, in: cursor.input)
    }

    /// Consumes the token starting at the cursor and reports its kind.
    ///
    /// Numeric literals are only produced when the whole run of id characters
    /// forms a complete number: a hex literal needs at least one digit after
    /// `0x`, a `nan:0x` payload needs at least one digit and nothing after it,
    /// and the exponent marker must fit the radix (`e`/`E` for decimal,
    /// `p`/`P` for hex). Anything else falls back to `.keyword` or `.unknown`.
    ///
    /// - Parameter initialChar: The scalar at the cursor; it has been peeked but not consumed.
    /// - Returns: The kind of the consumed token.
    /// - Throws: `WatParserError` for a lone `;`, an unterminated block comment,
    ///   and any error raised while reading a string or number.
    private mutating func classifyToken(_ initialChar: Unicode.Scalar) throws -> TokenKind? {
        switch initialChar {
        case "(":
            _ = try cursor.next()
            switch try cursor.peek() {
            case ";":
                // "(;" opens a block comment
                _ = try cursor.next()
                return try lexBlockComment()
            default: return .leftParen
            }
        case ")":
            _ = try cursor.next()
            return .rightParen
        case ";":
            _ = try cursor.next()
            // Lex ";; ..." line comment
            guard try cursor.eat(";") else {
                throw cursor.createError("Expected ';' after ';' line comment")
            }
            // The comment runs through the line terminator ("\n", "\r" or "\r\n").
            while let char = try cursor.next() {
                switch char {
                case "\r":
                    if try cursor.peek() == "\n" {
                        _ = try cursor.next()
                    }
                    return .lineComment
                case "\n":
                    return .lineComment
                default: break
                }
            }
            // source file ends with line comment
            return .lineComment
        case "\"",
            _ where isIdChar(initialChar):
            let (kind, text) = try lexReservedChars(initial: initialChar)
            switch kind {
            case .idChars:
                if initialChar == "$" {
                    return .id
                }
                do {
                    // Try to parse as integer or float, using a private cursor over the token text
                    var numberSource = Cursor(input: String.UnicodeScalarView(text))
                    var sign: FloatingPointSign? = nil
                    if let maybeSign = try numberSource.peek(),
                        let (found, _) = [(FloatingPointSign.plus, "+"), (FloatingPointSign.minus, "-")].first(where: { $1 == maybeSign })
                    {
                        sign = found
                        _ = try numberSource.next()
                    }
                    if try numberSource.match("inf", eof: true) {
                        return .float(sign, .inf)
                    }
                    if try numberSource.match("nan", eof: true) {
                        return .float(sign, .nan(hexPattern: nil))
                    }
                    if try numberSource.eat("nan:0x") {
                        let payload = try numberSource.parseHexNumber()
                        // The payload needs at least one digit and must end the token.
                        guard !payload.isEmpty, numberSource.isEOF else { return .unknown }
                        return .float(sign, .nan(hexPattern: payload))
                    }
                    var pattern: String
                    let parseFraction: () throws -> String
                    let makeFloatToken: (String) -> FloatToken
                    let exponentMarkers: [Unicode.Scalar]
                    if try numberSource.eat("0x") {
                        pattern = try numberSource.parseHexNumber()
                        // A bare "0x" has no digits and is not an integer.
                        if numberSource.isEOF, !pattern.isEmpty {
                            return .integer(sign, .hexPattern(pattern))
                        }
                        parseFraction = { try numberSource.parseHexNumber() }
                        makeFloatToken = { FloatToken.hexPattern($0) }
                        exponentMarkers = ["p", "P"]
                    } else {
                        pattern = try numberSource.parseDecimalNumber()
                        parseFraction = { try numberSource.parseDecimalNumber() }
                        makeFloatToken = { FloatToken.decimalPattern($0) }
                        exponentMarkers = ["e", "E"]
                    }
                    if !pattern.isEmpty {
                        // The token has at least single digit
                        if numberSource.isEOF {
                            // No more characters
                            return .integer(sign, .decimalPattern(pattern))
                        }
                        // Still might be a float
                        if try numberSource.eat(".") {
                            let fraction = try parseFraction()
                            pattern += "." + fraction
                        }
                        if let expCh = try numberSource.eatOneOf(exponentMarkers) {
                            pattern += String(expCh)
                            if try numberSource.eat("+") {
                                pattern += "+"
                            } else if try numberSource.eat("-") {
                                pattern += "-"
                            }
                            // The exponent is always written in decimal, even for hex floats.
                            let exponent = try numberSource.parseDecimalNumber()
                            guard !exponent.isEmpty else { return .unknown }
                            pattern += exponent
                        }
                        // Trailing characters after the number make the token invalid.
                        guard numberSource.isEOF else { return .unknown }
                        return .float(sign, makeFloatToken(pattern))
                    }
                }
                // Not a number: a token starting with a lowercase letter is a keyword.
                if ("a"..."z").contains(initialChar) {
                    return .keyword
                }
                return .unknown
            case .string(let string):
                return .string(string)
            case .unknown:
                return .unknown
            }
        default:
            // Consume the unrecognized scalar so lexing keeps making progress.
            _ = try cursor.next()
            return .unknown
        }
    }

    /// Consumes the rest of a block comment whose opening `(;` has already been consumed.
    ///
    /// Block comments nest: every `(;` inside the comment must be closed by
    /// its own `;)`. Both scalars of a delimiter are consumed together, so the
    /// `;` of a nested `(;` cannot be re-read as the start of `;)`.
    ///
    /// - Returns: `.blockComment` once the outermost comment is closed.
    /// - Throws: An unexpected end-of-file error if input ends before the comment is closed.
    private mutating func lexBlockComment() throws -> TokenKind {
        // Number of comment blocks currently open; the caller opened the first one.
        var level = 1
        while true {
            guard let char = try cursor.next() else {
                throw cursor.unexpectedEof()
            }
            switch char {
            case "(":
                if try cursor.peek() == ";" {
                    // Nested comment block: consume the whole "(;" opener
                    _ = try cursor.next()
                    level += 1
                }
            case ";":
                if try cursor.peek() == ")" {
                    _ = try cursor.next()
                    level -= 1
                    if level == 0 {
                        return .blockComment
                    }
                }
            default: break
            }
        }
    }

    /// Skips whitespace and reports the first scalar after it without consuming that scalar.
    /// - Returns: The index and value of the first non-whitespace scalar, or
    ///   `nil` if the input ends first.
    private mutating func peekNonWhitespaceChar() throws -> (index: Lexer.Index, byte: Unicode.Scalar)? {
        while let scalar = try cursor.peek() {
            // https://webassembly.github.io/spec/core/text/lexical.html#white-space
            let isWhitespace = scalar == " " || scalar == "\n" || scalar == "\t" || scalar == "\r"
            guard isWhitespace else {
                return (cursor.nextIndex, scalar)
            }
            _ = try cursor.next()
        }
        return nil
    }

    // https://webassembly.github.io/spec/core/text/values.html#text-idchar
    /// Returns `true` if `char` may appear in an identifier or keyword.
    ///
    /// The accepted set is every printable, non-space ASCII character except
    /// `"`, `,`, `;` and the three kinds of brackets.
    private func isIdChar(_ char: Unicode.Scalar) -> Bool {
        // NOTE: Intentionally not using Range here to keep fast enough even in debug mode
        let code = char.value
        if code < 0x21 || code > 0x7E {
            return false
        }
        let isExcluded =
            char == "\"" || char == "," || char == ";"
            || char == "(" || char == ")"
            || char == "[" || char == "]"
            || char == "{" || char == "}"
        return !isExcluded
    }

    /// Result of scanning a contiguous run of id characters and string literals.
    private enum ReservedKind {
        /// The run was a single string literal; holds its decoded bytes.
        case string([UInt8])
        /// The run consisted of id characters (or `$` followed by one string literal).
        case idChars
        /// Any other mixture of id characters and string literals.
        case unknown
    }

    /// Consumes a contiguous run of id characters and string literals and classifies it.
    ///
    /// - Parameter initial: The scalar at the cursor; it has been peeked but not consumed.
    /// - Returns: The classification of the run and the source text it covers.
    private mutating func lexReservedChars(initial: Unicode.Scalar) throws -> (ReservedKind, String.UnicodeScalarView.SubSequence) {
        let begin = cursor.nextIndex
        var idCharCount = 0
        var literals: [[UInt8]] = []
        var upcoming: Unicode.Scalar? = initial

        while let scalar = upcoming {
            if scalar == "\"" {
                // Consume the opening quote, then the literal through its closing quote.
                _ = try cursor.next()
                literals.append(try readString())
            } else if isIdChar(scalar) {
                _ = try cursor.next()
                idCharCount += 1
            } else {
                break
            }
            upcoming = try cursor.peek()
        }

        let text = cursor.input[begin..<cursor.nextIndex]
        let kind: ReservedKind
        switch literals.count {
        case 0:
            kind = idCharCount > 0 ? .idChars : .unknown
        case 1:
            if idCharCount == 0 {
                kind = .string(literals[0])
            } else if idCharCount == 1 && initial == "$" {
                // `$` followed by exactly one string literal is still reported as id characters.
                kind = .idChars
            } else {
                kind = .unknown
            }
        default:
            kind = .unknown
        }
        return (kind, text)
    }

    /// Reads the remainder of a string literal whose opening `"` has already been consumed.
    ///
    /// Unescaped scalars and `\u{...}` escapes are appended as UTF-8; a
    /// two-digit hex escape (`\hh`) appends one raw byte.
    ///
    /// - Returns: The decoded bytes of the literal, excluding the quotes.
    /// - Throws: An unexpected end-of-file error if the input ends before the
    ///   closing `"`, or a parser error for a malformed escape sequence.
    private mutating func readString() throws -> [UInt8] {
        var bytes: [UInt8] = []
        func append(_ scalar: Unicode.Scalar) {
            bytes.append(contentsOf: String(scalar).utf8)
        }

        while true {
            guard let char = try cursor.next() else {
                // Input ended before the closing quote: the literal is unterminated.
                throw cursor.unexpectedEof()
            }
            if char == "\"" {
                return bytes
            }
            guard char == "\\" else {
                append(char)
                continue
            }
            guard let nextChar = try cursor.next() else {
                throw cursor.unexpectedEof()
            }
            switch nextChar {
            case "\"", "'", "\\":
                append(nextChar)
            case "t": append("\t")
            case "n": append("\n")
            case "r": append("\r")
            case "u":
                // Unicode escape sequence \u{XXXX}
                guard try cursor.eat("{") else {
                    throw cursor.createError("Expected '{' after \\u unicode escape sequence")
                }
                let codePointString = try cursor.parseHexNumber()
                guard let codePoint = UInt32(codePointString, radix: 16) else {
                    throw cursor.createError("Cannot parse code point in \\u unicode escape sequence as 32-bit unsigned hex integer")
                }
                guard try cursor.eat("}") else {
                    throw cursor.createError("No closing '}' after \\u unicode escape sequence")
                }
                // Values that are not Unicode scalar values are rejected.
                guard let scalar = Unicode.Scalar(codePoint) else {
                    throw cursor.createError("Invalid code point in \\u unicode escape sequence")
                }
                append(scalar)
            case let highChar where highChar.properties.isASCIIHexDigit:
                // Raw byte escape \hh: exactly two hex digits.
                guard let lowChar = try cursor.next() else {
                    throw cursor.unexpectedEof()
                }
                guard let low = try parseHexDigit(lowChar) else {
                    throw cursor.createError("Invalid escape sequence: \(lowChar)")
                }
                guard let high = try parseHexDigit(highChar) else {
                    throw cursor.createError("Invalid escape sequence: \(highChar)")
                }
                bytes.append(high * 16 + low)
            case let other:
                throw cursor.createError("Invalid escape sequence: \(other)")
            }
        }
    }
}

/// Converts a single ASCII hexadecimal digit to its numeric value.
/// - Parameter char: The scalar to convert.
/// - Returns: A value in `0...15` for `0-9`, `a-f` or `A-F`; `nil` for any other scalar.
func parseHexDigit(_ char: Unicode.Scalar) throws -> UInt8? {
    let code = char.value
    switch code {
    case 0x30...0x39:  // "0"..."9"
        return UInt8(code - 0x30)
    case 0x61...0x66:  // "a"..."f"
        return UInt8(code - 0x61 + 10)
    case 0x41...0x46:  // "A"..."F"
        return UInt8(code - 0x41 + 10)
    default:
        return nil
    }
}

extension Lexer.Cursor {
    /// Consumes a run of ASCII hex digits with optional `_` separators.
    /// - Returns: The digits with the underscores removed (possibly empty).
    mutating func parseHexNumber() throws -> String {
        return try parseUnderscoredChars(continueParsing: { $0.properties.isASCIIHexDigit })
    }

    /// Consumes a run of decimal digits with optional `_` separators.
    /// - Returns: The digits with the underscores removed (possibly empty).
    mutating func parseDecimalNumber() throws -> String {
        return try parseUnderscoredChars(continueParsing: { $0 >= "0" && $0 <= "9" })
    }

    /// Consumes characters accepted by `continueParsing`, allowing single `_`
    /// separators between them.
    /// - Parameter continueParsing: A closure that returns `true` if the parsing should continue
    /// - Returns: The parsed string without underscores
    /// - Throws: A parser error if an underscore leads the run, follows
    ///   another underscore, or is the last character consumed.
    mutating func parseUnderscoredChars(continueParsing: (Unicode.Scalar) -> Bool) throws -> String {
        var digits = String.UnicodeScalarView()
        // Most recently consumed scalar; `nil` until something has been consumed.
        var previous: Unicode.Scalar? = nil

        scan: while let scalar = try peek() {
            switch scalar {
            case "_":
                if previous == nil {
                    throw createError("Invalid hex number, leading underscore")
                }
                if previous == "_" {
                    throw createError("Invalid hex number, consecutive underscores")
                }
            case _ where continueParsing(scalar):
                digits.append(scalar)
            default:
                break scan
            }
            previous = scalar
            _ = try next()
        }

        guard previous != "_" else {
            throw createError("Invalid hex number, trailing underscore")
        }
        return String(digits)
    }
}