File: WordBreaking.swift

package info (click to toggle)
swiftlang 6.1.3-3
  • links: PTS, VCS
  • area: main
  • in suites: forky
  • size: 2,791,748 kB
  • sloc: cpp: 9,901,738; ansic: 2,201,433; asm: 1,091,827; python: 308,252; objc: 82,166; f90: 80,126; lisp: 38,358; pascal: 25,559; sh: 20,429; ml: 5,058; perl: 4,745; makefile: 4,484; awk: 3,535; javascript: 3,018; xml: 918; fortran: 664; cs: 573; ruby: 396
file content (100 lines) | stat: -rw-r--r-- 3,028 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//

internal import _RegexParser

@_spi(_Unicode)
import Swift

// TODO: Sink onto String
extension Processor {
  /// Reports whether `currentPosition` lies on a "simple" word boundary of
  /// `input`, judged by looking at the elements on either side of it.
  ///
  /// - Parameters:
  ///   - usesAsciiWord: When `true`, only ASCII elements can count as word
  ///     elements.
  ///   - semanticLevel: Selects whether the input is read as characters
  ///     (`.graphemeCluster`) or as Unicode scalars (`.unicodeScalar`).
  /// - Returns: `false` when `subjectBounds` is empty. At the lower bound,
  ///   whether the element at the current position is a word element; at the
  ///   upper bound, whether the element just before it is; anywhere else,
  ///   whether exactly one of those two elements is a word element.
  func atSimpleBoundary(
    _ usesAsciiWord: Bool,
    _ semanticLevel: MatchingOptions.SemanticLevel
  ) -> Bool {
    // Classifies the single element that starts at `position`.
    func matchesWord(at position: Input.Index) -> Bool {
      switch semanticLevel {
      case .graphemeCluster:
        // TODO: needs benchmark coverage
        let character = input[position]
        guard character.isWordCharacter else { return false }
        return !usesAsciiWord || character.isASCII
      case .unicodeScalar:
        // TODO: needs benchmark coverage
        let scalar = input.unicodeScalars[position]
        guard scalar.properties.isAlphabetic || scalar == "_" else {
          return false
        }
        return !usesAsciiWord || scalar.isASCII
      }
    }

    // FIXME: How should we handle bounds?
    // We probably need two concepts
    guard !subjectBounds.isEmpty else { return false }

    let here = currentPosition

    // Nothing precedes the lower bound, so only the element at `here` matters.
    guard here != subjectBounds.lowerBound else {
      return matchesWord(at: here)
    }

    // Step back by one element at the granularity of the semantic level.
    let previous: Input.Index
    switch semanticLevel {
    case .graphemeCluster:
      previous = input.index(before: here)
    case .unicodeScalar:
      previous = input.unicodeScalars.index(before: here)
    }

    // At the upper bound only the preceding element is inspected; the element
    // at `here` is never read.
    guard here != subjectBounds.upperBound else {
      return matchesWord(at: previous)
    }

    // In the interior, a boundary is a change in word-ness across `here`.
    return matchesWord(at: previous) != matchesWord(at: here)
  }
}

extension String {
  /// Determines whether `i` is on a word boundary within `range`, recording
  /// the boundaries it discovers in `cache` so later calls can reuse them.
  ///
  /// - Parameters:
  ///   - i: The index to test.
  ///   - range: The range being examined. Both of its bounds are always
  ///     reported as boundaries.
  ///   - cache: Indices recorded by earlier scans. Created on demand and
  ///     extended whenever the scan below advances.
  ///   - maxIndex: The index at which the previous scan stopped, or `nil` if
  ///     no scan has run yet. Updated to where this call's scan stops.
  /// - Returns: `true` if `i` is a bound of `range` or one of the indices
  ///   produced by stepping with `_wordIndex(after:)`; otherwise `false`.
  ///   When SwiftStdlib 5.7 is unavailable, indices that are neither a bound
  ///   nor already cached are reported as `false`.
  func isOnWordBoundary(
    at i: String.Index,
    in range: Range<String.Index>,
    using cache: inout Set<String.Index>?,
    _ maxIndex: inout String.Index?
  ) -> Bool {
    // TODO: needs benchmark coverage
    if i == range.lowerBound || i == range.upperBound {
      return true
    }
    assert(range.contains(i))

    // An index already recorded in the cache is known to be on a boundary.
    if cache?.contains(i) == true {
      return true
    }

    // Not cached, yet the previous scan already moved past `i`: that scan
    // stepped over `i` without recording it, so it is not on a boundary.
    // When `i` is at or beyond `maxIndex`, the scan below resumes from
    // `maxIndex` to find out.
    if let scannedUpTo = maxIndex, i < scannedUpTo {
      return false
    }

    guard #available(SwiftStdlib 5.7, *) else {
      return false
    }

    if cache == nil {
      cache = []
    }

    // Resume where the last scan stopped (or start at the range's lower
    // bound) and record every index visited until the scan passes `i` or
    // reaches the end of the range.
    var cursor = maxIndex ?? range.lowerBound
    while cursor < range.upperBound && cursor <= i {
      cache?.insert(cursor)
      cursor = _wordIndex(after: cursor)
    }
    maxIndex = cursor

    // `cache` is non-nil here; `i` is on a boundary exactly when the scan
    // landed on it.
    return cache?.contains(i) ?? false
  }
}