File: WordBreaking.swift

package info (click to toggle)
swiftlang 6.0.3-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,519,992 kB
  • sloc: cpp: 9,107,863; ansic: 2,040,022; asm: 1,135,751; python: 296,500; objc: 82,456; f90: 60,502; lisp: 34,951; pascal: 19,946; sh: 18,133; perl: 7,482; ml: 4,937; javascript: 4,117; makefile: 3,840; awk: 3,535; xml: 914; fortran: 619; cs: 573; ruby: 573
file content (98 lines) | stat: -rw-r--r-- 3,020 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//

@_spi(_Unicode)
import Swift

// TODO: Sink onto String
extension Processor {
  /// Reports whether `currentPosition` sits where a word element meets a
  /// non-word element, judged only from the elements directly on either side.
  ///
  /// - Parameters:
  ///   - usesAsciiWord: When `true`, only ASCII elements may count as word
  ///     elements.
  ///   - semanticLevel: Whether the input is inspected one `Character` or one
  ///     Unicode scalar at a time.
  /// - Returns: `false` for empty subject bounds. At the lower bound, whether
  ///   the element at the position is a word element; at the upper bound,
  ///   whether the element before it is. Anywhere else, `true` exactly when
  ///   one of the two neighbouring elements is a word element and the other
  ///   is not.
  func atSimpleBoundary(
    _ usesAsciiWord: Bool,
    _ semanticLevel: MatchingOptions.SemanticLevel
  ) -> Bool {
    // Classifies the element that starts at `position` at the requested
    // semantic level.
    func isWordElement(at position: Input.Index) -> Bool {
      let looksLikeWord: Bool
      let isASCII: Bool
      switch semanticLevel {
      case .graphemeCluster:
        // TODO: needs benchmark coverage
        let character = input[position]
        looksLikeWord = character.isWordCharacter
        isASCII = character.isASCII
      case .unicodeScalar:
        // TODO: needs benchmark coverage
        let scalar = input.unicodeScalars[position]
        looksLikeWord = scalar.properties.isAlphabetic || scalar == "_"
        isASCII = scalar.isASCII
      }
      guard looksLikeWord else { return false }
      // ASCII-only mode additionally rejects every non-ASCII element.
      return !usesAsciiWord || isASCII
    }

    // FIXME: How should we handle bounds?
    // We probably need two concepts
    guard !subjectBounds.isEmpty else { return false }

    // Nothing precedes the start of the subject, so only the element at the
    // position decides.
    guard currentPosition != subjectBounds.lowerBound else {
      return isWordElement(at: currentPosition)
    }

    // Step back by one element of the kind the semantic level works in.
    let precedingIndex: Input.Index
    switch semanticLevel {
    case .graphemeCluster:
      precedingIndex = input.index(before: currentPosition)
    case .unicodeScalar:
      precedingIndex = input.unicodeScalars.index(before: currentPosition)
    }

    // Nothing follows the end of the subject, so only the preceding element
    // decides.
    guard currentPosition != subjectBounds.upperBound else {
      return isWordElement(at: precedingIndex)
    }

    let wordBefore = isWordElement(at: precedingIndex)
    let wordAfter = isWordElement(at: currentPosition)
    return wordBefore != wordAfter
  }
}

extension String {
  /// Reports whether `i` falls on a word boundary, remembering every boundary
  /// discovered so far so that repeated queries do not rescan the string.
  ///
  /// - Parameters:
  ///   - i: The index to test. It must either equal one of the bounds of
  ///     `range` or lie inside it (checked by a debug assertion).
  ///   - range: The region being examined. Both of its bounds are always
  ///     reported as boundaries.
  ///   - cache: All boundaries found by earlier calls, or `nil` before the
  ///     first scan. Newly found boundaries are added to it; entries are never
  ///     removed.
  ///   - maxIndex: Where the next scan resumes, or `nil` before the first scan.
  ///     After a scan it holds the first index produced by
  ///     `_wordIndex(after:)` that lies past `i` or at/after
  ///     `range.upperBound`.
  /// - Returns: `true` if `i` is a bound of `range` or one of the indices
  ///   produced by stepping `_wordIndex(after:)` from `range.lowerBound`;
  ///   otherwise `false`. Always `false` for interior indices when
  ///   SwiftStdlib 5.7 is unavailable.
  func isOnWordBoundary(
    at i: String.Index,
    in range: Range<String.Index>,
    using cache: inout Set<String.Index>?,
    _ maxIndex: inout String.Index?
  ) -> Bool {
    // TODO: needs benchmark coverage
    guard i != range.lowerBound, i != range.upperBound else {
      return true
    }
    assert(range.contains(i))

    // If our index is already in our cache, then this is obviously on a
    // boundary.
    if let cache = cache, cache.contains(i) {
      return true
    }

    // If it's not in the cache AND our max index is larger than our index,
    // this index is never on a word boundary: the cache holds every boundary
    // below max index. If our index is at or beyond max index (max index
    // itself has not been recorded yet), we still need to scan forward.
    if let maxIndex = maxIndex, i < maxIndex {
      return false
    }

    if #available(SwiftStdlib 5.7, *) {
      var indices: Set<String.Index> = []
      var j = maxIndex ?? range.lowerBound

      while j < range.upperBound, j <= i {
        indices.insert(j)
        j = _wordIndex(after: j)
      }

      // The scan resumed at `maxIndex`, so `indices` only covers the newly
      // visited stretch. Merge it into the cache rather than replacing the
      // cache; otherwise boundaries found by earlier scans would be lost and
      // the `i < maxIndex` shortcut above would wrongly reject them.
      if cache == nil {
        cache = indices
      } else {
        cache?.formUnion(indices)
      }
      maxIndex = j

      return indices.contains(i)
    } else {
      return false
    }
  }
}