File: vector-ops.mlir

package info (click to toggle)
swiftlang 6.0.3-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,519,992 kB
  • sloc: cpp: 9,107,863; ansic: 2,040,022; asm: 1,135,751; python: 296,500; objc: 82,456; f90: 60,502; lisp: 34,951; pascal: 19,946; sh: 18,133; perl: 7,482; ml: 4,937; javascript: 4,117; makefile: 3,840; awk: 3,535; xml: 914; fortran: 619; cs: 573; ruby: 573
file content (143 lines) | stat: -rw-r--r-- 5,617 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
// RUN: mlir-opt %s -enable-arm-streaming="mode=locally enable-za" \
// RUN:   -convert-vector-to-arm-sme -convert-vector-to-llvm="enable-arm-sme" \
// RUN:   -allocate-arm-sme-tiles -test-lower-to-llvm | \
// RUN: mlir-translate -mlir-to-llvmir | \
// RUN: %lli_aarch64_cmd --march=aarch64 --mattr="+sve,+sme" \
// RUN:   --entry-function=entry \
// RUN:   --dlopen=%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s

func.func @entry() -> i32 {
  %c0 = arith.constant 0 : index
  %c1_i8 = arith.constant 1 : i8
  %c1_index = arith.constant 1 : index

  %c16 = arith.constant 16 : index
  %vscale = vector.vscale

  // "svl" refers to the Streaming Vector Length and "svl_b" the number of
  // 8-bit elements in a vector of SVL bits.
  %svl_b = arith.muli %c16, %vscale : index

  // Allocate memory and fill with ones.
  //
  // TODO: type conversion of rank > 1 vector types generates array(s) of
  // vectors. This is invalid for scalable vectors since LLVM doesn't support
  // arrays of scalable vectors. This prevents initializing 2-d vectors with
  // 'vector.store' or 'vector.transfer_write' ops until this is resolved or
  // there's a custom lowering path.
  %za_b = memref.alloca(%svl_b, %svl_b) : memref<?x?xi8>
  scf.for %i = %c0 to %svl_b step %c1_index {
    scf.for %j = %c0 to %svl_b step %c1_index {
      memref.store %c1_i8, %za_b[%i, %j] : memref<?x?xi8>
    }
  }

  // Verify memory is ones by doing a mul reduction with initial value of one.
  %init_1 = arith.constant 1 : i64
  %mul_reduce = scf.for %vnum = %c0 to %svl_b step %c1_index iter_args(%iter = %init_1) -> (i64) {
    %row = vector.load %za_b[%vnum, %c0] : memref<?x?xi8>, vector<[16]xi8>

    %inner_mul_reduce = scf.for %offset = %c0 to %svl_b step %c1_index iter_args(%inner_iter = %init_1) -> (i64) {
      %t = vector.extractelement %row[%offset : index] : vector<[16]xi8>
      %t_i64 = arith.extui %t : i8 to i64
      %inner_mul_reduce_next = arith.muli %inner_iter, %t_i64 : i64
      scf.yield %inner_mul_reduce_next : i64
    }

    %mul_reduce_next = arith.muli %iter, %inner_mul_reduce : i64
    scf.yield %mul_reduce_next : i64
  }

  // CHECK: 1
  vector.print %mul_reduce : i64

  // Verify the mul reduction works as expected.
  //
  // TODO: ZA currently isn't re-enabled after calls and is therefore disable
  // by the callee on return. Once this is resolved this can be moved to a
  // function.
  %c3 = arith.constant 3 : index
  %c4 = arith.constant 4 : i8
  %c7 = arith.constant 7 : index
  %c15 = arith.constant 15 : i8
  memref.store %c4, %za_b[%c3, %c7] : memref<?x?xi8>
  memref.store %c15, %za_b[%c7, %c3] : memref<?x?xi8>
  %mul_reduce2 = scf.for %vnum = %c0 to %svl_b step %c1_index iter_args(%iter = %init_1) -> (i64) {
    %row = vector.load %za_b[%vnum, %c0] : memref<?x?xi8>, vector<[16]xi8>

    %inner_mul_reduce = scf.for %offset = %c0 to %svl_b step %c1_index iter_args(%inner_iter = %init_1) -> (i64) {
      %t = vector.extractelement %row[%offset : index] : vector<[16]xi8>
      %t_i64 = arith.extui %t : i8 to i64
      %inner_mul_reduce_next = arith.muli %inner_iter, %t_i64 : i64
      scf.yield %inner_mul_reduce_next : i64
    }

    %mul_reduce_next = arith.muli %iter, %inner_mul_reduce : i64
    scf.yield %mul_reduce_next : i64
  }

  // 15*4=60
  // CHECK: 60
  vector.print %mul_reduce2 : i64

  // Fill memory with zeroes.
  //
  // This will get lowered to:
  //
  //   zero {za}
  //   for vnum = 0; vnum < SVLb; ++vnum;
  //     str za[vnum], [ptr]
  //     ...
  //
  %cst_0 = arith.constant dense<0> : vector<[16]x[16]xi8>
  vector.transfer_write %cst_0, %za_b[%c0, %c0] {in_bounds = [true, true]} : vector<[16]x[16]xi8>, memref<?x?xi8>

  // Verify memory is zeroed by doing an add reduction with initial value of
  // zero.
  %init_0 = arith.constant 0 : i64
  %add_reduce = scf.for %vnum = %c0 to %svl_b step %c1_index iter_args(%iter = %init_0) -> (i64) {
    %row = vector.load %za_b[%vnum, %c0] : memref<?x?xi8>, vector<[16]xi8>

    %inner_add_reduce = scf.for %offset = %c0 to %svl_b step %c1_index iter_args(%inner_iter = %init_0) -> (i64) {
      %t = vector.extractelement %row[%offset : index] : vector<[16]xi8>
      %t_i64 = arith.extui %t : i8 to i64
      %inner_add_reduce_next = arith.addi %inner_iter, %t_i64 : i64
      scf.yield %inner_add_reduce_next : i64
    }

    %add_reduce_next = arith.addi %iter, %inner_add_reduce : i64
    scf.yield %add_reduce_next : i64
  }

  // CHECK-NEXT: 0
  vector.print %add_reduce : i64

  // Verify the add reduction works as expected.
  //
  // TODO: ZA currently isn't re-enabled after calls and is therefore disable
  // by the callee on return. Once this is resolved this can be moved to a
  // function.
  memref.store %c4, %za_b[%c3, %c7] : memref<?x?xi8>
  memref.store %c15, %za_b[%c7, %c3] : memref<?x?xi8>
  %add_reduce2 = scf.for %vnum = %c0 to %svl_b step %c1_index iter_args(%iter = %init_0) -> (i64) {
    %row = vector.load %za_b[%vnum, %c0] : memref<?x?xi8>, vector<[16]xi8>

    %inner_add_reduce = scf.for %offset = %c0 to %svl_b step %c1_index iter_args(%inner_iter = %init_0) -> (i64) {
      %t = vector.extractelement %row[%offset : index] : vector<[16]xi8>
      %t_i64 = arith.extui %t : i8 to i64
      %inner_add_reduce_next = arith.addi %inner_iter, %t_i64 : i64
      scf.yield %inner_add_reduce_next : i64
    }

    %add_reduce_next = arith.addi %iter, %inner_add_reduce : i64
    scf.yield %add_reduce_next : i64
  }

  // 15+4=19
  // CHECK-NEXT: 19
  vector.print %add_reduce2 : i64

  %c0_i32 = arith.constant 0 : i32
  return %c0_i32 : i32
}