File: WordBreaking.swift

package info (click to toggle)
swiftlang 6.0.3-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,519,992 kB
  • sloc: cpp: 9,107,863; ansic: 2,040,022; asm: 1,135,751; python: 296,500; objc: 82,456; f90: 60,502; lisp: 34,951; pascal: 19,946; sh: 18,133; perl: 7,482; ml: 4,937; javascript: 4,117; makefile: 3,840; awk: 3,535; xml: 914; fortran: 619; cs: 573; ruby: 573
file content (66 lines) | stat: -rw-r--r-- 1,860 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

// Word breaking tests are currently only available on Darwin, awaiting a
// sensible file API...
#if _runtime(_ObjC)
import Foundation

/// Parses text in the `WordBreakTest.txt` line format and appends one entry
/// per test line to `result`.
///
/// A test line starts with `÷` and alternates break operators (`÷` for a
/// break, `×` for no break) with hexadecimal scalar values; anything after a
/// `#` is treated as a comment. Lines that do not start with `÷` are skipped.
///
/// - Parameters:
///   - data: The text to parse.
///   - result: The array that receives, for every test line, a tuple of the
///     full string built from all scalars on the line and the list of words
///     obtained by cutting that string at every `÷`.
///
/// Traps if a scalar field is not valid hexadecimal or does not name a valid
/// Unicode scalar value.
func parseWordBreakTests(
  _ data: String,
  into result: inout [(String, [String])]
) {
  // Split on any newline character: "\r\n" is a single `Character` in Swift,
  // so splitting on "\n" alone would not separate CRLF-terminated lines.
  for line in data.split(whereSeparator: \.isNewline) {
    // Only look at actual tests
    guard line.hasPrefix("÷") else {
      continue
    }

    // Drop the trailing comment, then tokenize on any whitespace. Splitting on
    // a plain space only would leave a tab glued to the final `÷` when the
    // comment is tab-separated, and that last break would go unrecognized.
    let components = line
      .prefix(while: { $0 != "#" })
      .split(whereSeparator: \.isWhitespace)

    var string = ""
    // The last element is always the word currently being accumulated.
    var words: [String] = [""]

    // Token 0 is the leading `÷`; after it, odd offsets are scalars and even
    // offsets are break operators.
    for (offset, component) in components.enumerated().dropFirst() {
      if !offset.isMultiple(of: 2) {
        guard
          let value = UInt32(component, radix: 16),
          let scalar = Unicode.Scalar(value)
        else {
          fatalError(
            "Invalid scalar '\(component)' in word break test line: \(line)")
        }

        string.unicodeScalars.append(scalar)
        words[words.count - 1].unicodeScalars.append(scalar)
      } else if component == "÷" {
        // A break starts a new word. `×` is not a break, so the current word
        // keeps growing.
        words.append("")
      }
    }

    // The closing `÷` of a line opens an empty placeholder word; discard it.
    // Only remove the last entry when it really is that placeholder so a real
    // word is never dropped.
    if words.last?.isEmpty == true {
      words.removeLast()
    }

    result.append((string, words))
  }
}

/// Word break test cases parsed from the `WordBreakTest.txt` input file.
///
/// Each element pairs a string with the list of words that
/// `parseWordBreakTests` derived from the break markers on the corresponding
/// test line. The closure runs once, when the constant is initialized.
public let wordBreakTests: [(String, [String])] = {
  // NOTE(review): `readInputFile` is defined elsewhere; presumably it returns
  // the file's contents as a `String` — confirm against its declaration.
  let contents = readInputFile("WordBreakTest.txt")

  var parsed: [(String, [String])] = []
  parseWordBreakTests(contents, into: &parsed)
  return parsed
}()
#endif