File: InfluencingIdentifiers.swift

package info (click to toggle)
swiftlang 6.2.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 2,856,264 kB
  • sloc: cpp: 9,995,718; ansic: 2,234,019; asm: 1,092,167; python: 313,940; objc: 82,726; f90: 80,126; lisp: 38,373; pascal: 25,580; sh: 20,378; ml: 5,058; perl: 4,751; makefile: 4,725; awk: 3,535; javascript: 3,018; xml: 918; fortran: 664; cs: 573; ruby: 396
file content (169 lines) | stat: -rw-r--r-- 7,060 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2024 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

import Foundation

// File-local shorthand for `Pattern.UTF8Bytes`, the buffer type that `Token` uses to store its text.
fileprivate typealias UTF8Bytes = Pattern.UTF8Bytes

/// A list of tokenized identifiers, stored in manually managed buffers, that can be compared against a `Candidate`
/// to produce a textual-relatedness score.
///
/// Instances are only handed out by `withUnsafeInfluencingTokenizedIdentifiers(_:body:)`, which also releases the
/// underlying memory once `body` returns.
package struct InfluencingIdentifiers: Sendable {
  // `nonisolated(unsafe)` is fine because the underlying buffer is not modified until `deallocate` is called and the
  // struct must not be used anymore after `deallocate` was called.
  private nonisolated(unsafe) let identifiers: UnsafeBufferPointer<Identifier>

  private init(identifiers: UnsafeBufferPointer<Identifier>) {
    self.identifiers = identifiers
  }

  /// Builds an instance from the given tokenized identifiers, dropping identifiers that have no tokens.
  ///
  /// The result must be released with `deallocate()`.
  private static func allocate(copyingTokenizedIdentifiers possiblyEmptyTokenizedIdentifiers: [[String]]) -> Self {
    let tokenizedIdentifiers = possiblyEmptyTokenizedIdentifiers.filter { !$0.isEmpty }
    let allocatedIdentifiers: [Identifier] = tokenizedIdentifiers.enumerated().map {
      identifierIndex,
      tokenizedIdentifier in
      // First is 1, last is 0.9375, scale is linear. Only a small preference for the first word. Right now when
      // we have two words, it's for cases like an argument label and internal name predicting the argument type
      // or a variable name and its type predicting its value. This scoring shows a slight affinity for the name.
      //
      // Index 0 is special-cased so that a single identifier never divides by `count - 1 == 0`.
      let scoreScale =
        (identifierIndex == 0)
        ? 1 : 1 - (0.0625 * (Double(identifierIndex) / Double(tokenizedIdentifiers.count - 1)))
      return Identifier.allocate(copyingTokenizedIdentifier: tokenizedIdentifier, scoreScale: scoreScale)
    }
    return InfluencingIdentifiers(identifiers: UnsafeBufferPointer.allocate(copyOf: allocatedIdentifiers))
  }

  /// Releases every identifier and then the buffer that holds them. The instance must not be used afterwards.
  private func deallocate() {
    for identifier in identifiers {
      identifier.deallocate()
    }
    identifiers.deallocate()
  }

  /// Invoke `body` with an instance of `InfluencingIdentifiers` that refers to memory only valid during the scope of
  /// `body`.
  ///
  /// This pattern is used so that this code has no reference counting overhead. Using types like Array to represent
  /// the tokens during scoring results in reference counting costing ~30% of the work. To avoid that, we use unsafe
  /// buffer pointers, and then this method to constrain lifetimes.
  ///
  /// - Parameters:
  ///   - tokenizedIdentifiers: The influencing identifiers in most to least influencing order, each already split
  ///     into its tokens. Identifiers without any tokens are ignored.
  ///   - body: The closure to run. The value it receives must not escape the closure.
  /// - Returns: Whatever `body` returns.
  /// - Throws: Rethrows any error thrown by `body`; the memory is released in that case as well.
  package static func withUnsafeInfluencingTokenizedIdentifiers<R>(
    _ tokenizedIdentifiers: [[String]],
    body: (Self) throws -> R
  ) rethrows -> R {
    let allocatedIdentifiers = allocate(copyingTokenizedIdentifiers: tokenizedIdentifiers)
    defer { allocatedIdentifiers.deallocate() }
    return try body(allocatedIdentifiers)
  }

  /// Forwards to `hasContent` of the underlying identifier buffer.
  var hasContent: Bool {
    identifiers.hasContent
  }

  /// Returns `true` if any token range of `candidateTokenization` has the same length as `token`, the same first
  /// byte ignoring case (via `lowercasedUTF8Byte`), and remaining bytes that `equateBytes` considers equal.
  private func match(token: Token, candidate: Candidate, candidateTokenization: Pattern.Tokenization) -> Bool {
    // An empty token has no leading byte to compare. Report "no match" instead of reading `bytes[0]` out of bounds
    // should the tokenization ever contain an empty range.
    guard !token.bytes.isEmpty else {
      return false
    }
    return candidateTokenization.anySatisfy { candidateTokenRange in
      guard token.bytes.count == candidateTokenRange.count else {
        return false
      }
      let candidateToken = UnsafeBufferPointer(rebasing: candidate.bytes[candidateTokenRange])
      let leadingByteMatches = token.bytes[0].lowercasedUTF8Byte == candidateToken[0].lowercasedUTF8Byte
      return leadingByteMatches && equateBytes(token.bytes.afterFirst(), candidateToken.afterFirst())
    }
  }

  /// Returns a value between 0...1, where 0 indicates `Candidate` was not textually related to the identifiers, and
  /// higher values indicate a stronger relation.
  ///
  /// Currently, this is implemented by tokenizing the candidate and the identifiers, and then counting, per
  /// identifier, how many of its tokens match a token of the candidate. Each identifier turns that count into a score
  /// via `Identifier.score(matchedTokenCount:)` and the highest score over all identifiers is returned. If no tokens
  /// from the identifiers appear in the candidate, the result is 0.0.
  ///
  /// - Parameters:
  ///   - candidate: The candidate to compare against the identifiers.
  ///   - allocator: Used to allocate the candidate's tokenization, which is released again before returning.
  package func score(candidate: Candidate, allocator: inout UnsafeStackAllocator) -> Double {
    // Tokenizing the candidate is deferred until the first token passes the rejection filter.
    var candidateTokenization: Pattern.Tokenization? = nil
    defer { candidateTokenization?.deallocate(allocator: &allocator) }
    var score = 0.0
    for identifier in identifiers {
      // TODO: We could turn this loop inside out to walk the candidate tokens first, and skip the ones that are shorter
      // than the shortest token, or keep bit for each length we have, and skip almost all of them.
      let matchedTokenCount = identifier.tokens.countOf { token in
        if (RejectionFilter.match(pattern: token.rejectionFilter, candidate: candidate.rejectionFilter)
          == .maybe)
        {
          let candidateTokenization = candidateTokenization.lazyInitialize {
            Pattern.Tokenization.allocate(
              mixedcaseBytes: candidate.bytes,
              contentType: candidate.contentType,
              allocator: &allocator
            )
          }
          return match(token: token, candidate: candidate, candidateTokenization: candidateTokenization)
        }
        return false
      }
      score = max(score, identifier.score(matchedTokenCount: matchedTokenCount))
    }
    return score
  }
}

fileprivate extension InfluencingIdentifiers {
  /// One influencing identifier: its tokens in a manually managed buffer plus a factor applied to its score.
  struct Identifier {
    let tokens: UnsafeBufferPointer<Token>
    private let scoreScale: Double

    private init(tokens: UnsafeBufferPointer<Token>, scoreScale: Double) {
      self.scoreScale = scoreScale
      self.tokens = tokens
    }

    /// Releases every token and then the buffer that holds them.
    func deallocate() {
      tokens.forEach { $0.deallocate() }
      tokens.deallocate()
    }

    /// Creates an identifier by allocating a `Token` for each string of `tokenizedIdentifier`.
    ///
    /// The result must be released with `deallocate()`.
    static func allocate(copyingTokenizedIdentifier tokenizedIdentifier: [String], scoreScale: Double) -> Self {
      let allocatedTokens = tokenizedIdentifier.map { Token.allocate($0) }
      let tokenBuffer: UnsafeBufferPointer<Token> = UnsafeBufferPointer.allocate(copyOf: allocatedTokens)
      return Identifier(tokens: tokenBuffer, scoreScale: scoreScale)
    }

    /// Returns a value between 0...1
    ///
    /// No matched token yields 0. Otherwise the unscaled score grows linearly from 0.75 (one matched token) to 1
    /// (all tokens matched) and is multiplied by `scoreScale`.
    func score(matchedTokenCount: Int) -> Double {
      guard matchedTokenCount != 0 else {
        return 0
      }
      // A single-token identifier was matched completely; handle it separately so we never divide by 0 below.
      guard tokens.count != 1 else {
        return 1 * scoreScale
      }
      let matchedFraction = Double(matchedTokenCount - 1) / Double(tokens.count - 1)
      let unscaledScore = 0.75 + (matchedFraction * 0.25)
      return unscaledScore * scoreScale
    }
  }
}

fileprivate extension InfluencingIdentifiers {
  /// A single token of an identifier: its UTF-8 bytes and the `RejectionFilter` built from those bytes.
  struct Token {
    let bytes: UTF8Bytes
    let rejectionFilter: RejectionFilter

    private init(bytes: UTF8Bytes) {
      let filter = RejectionFilter(bytes: bytes)
      self.bytes = bytes
      self.rejectionFilter = filter
    }

    /// Creates a token from the UTF-8 view of `text`.
    ///
    /// The result must be released with `deallocate()`.
    static func allocate(_ text: String) -> Self {
      let utf8Copy: UTF8Bytes = UnsafeBufferPointer.allocate(copyOf: text.utf8)
      return Token(bytes: utf8Copy)
    }

    /// Releases the byte buffer of this token.
    func deallocate() {
      bytes.deallocate()
    }
  }
}