File: vector-transfer-full-partial-split.mlir

// RUN: mlir-opt %s -test-vector-transfer-full-partial-split -split-input-file | FileCheck %s
// RUN: mlir-opt %s -test-vector-transfer-full-partial-split=use-memref-copy -split-input-file | FileCheck %s --check-prefix=LINALG
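
// The split pattern rewrites a possibly out-of-bounds vector.transfer into an
// scf.if that yields either the original memref (fast, in-bounds path) or a
// memref.cast of a full-tile alloca holding the padded data (slow path),
// followed by an unconditional transfer marked in_bounds. The sketch below is
// illustrative only (the value names are made up; the CHECK and LINALG lines
// are authoritative) and shows roughly what the first read test is expected
// to become:
//
//   %idx0 = affine.apply affine_map<()[s0] -> (s0 + 4)>()[%i]
//   %d0   = memref.dim %A, %c0 : memref<?x8xf32>
//   %cmp0 = arith.cmpi sle, %idx0, %d0 : index
//   %idx1 = affine.apply affine_map<()[s0] -> (s0 + 8)>()[%j]
//   %cmp1 = arith.cmpi sle, %idx1, %c8 : index
//   %cond = arith.andi %cmp0, %cmp1 : i1
//   %src:3 = scf.if %cond -> (memref<?x8xf32>, index, index) {
//     scf.yield %A, %i, %j : memref<?x8xf32>, index, index
//   } else {
//     // Slow path: materialize the padded 4x8 tile in an alloca, either by
//     // storing the result of the original (padded) transfer_read into it
//     // (default) or by linalg.fill + memref.copy of the valid subview
//     // (use-memref-copy), then read the alloca at (0, 0).
//     scf.yield %casted_alloca, %c0, %c0 : memref<?x8xf32>, index, index
//   }
//   %res = vector.transfer_read %src#0[%src#1, %src#2], %f0
//            {in_bounds = [true, true]} : memref<?x8xf32>, vector<4x8xf32>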

// CHECK-DAG: #[[$map_p4:.*]] = affine_map<()[s0] -> (s0 + 4)>
// CHECK-DAG: #[[$map_p8:.*]] = affine_map<()[s0] -> (s0 + 8)>
// CHECK-DAG: #[[$map_2d_stride_1:.*]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>

// LINALG-DAG: #[[$map_p4:.*]] = affine_map<()[s0] -> (s0 + 4)>
// LINALG-DAG: #[[$map_p8:.*]] = affine_map<()[s0] -> (s0 + 8)>
// LINALG-DAG: #[[$map_2d_stride_1:.*]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>
// LINALG-DAG: #[[$map_2d_stride_8x1:.*]] = affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>
// LINALG-DAG: #[[$bounds_map_4:.*]] = affine_map<(d0, d1, d2) -> (d0 - d1, 4)>
// LINALG-DAG: #[[$bounds_map_8:.*]] = affine_map<(d0, d1, d2) -> (d0 - d1, 8)>

// CHECK-LABEL: split_vector_transfer_read_2d(
//  CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref
//  CHECK-SAME: %[[i:[a-zA-Z0-9]*]]: index
//  CHECK-SAME: %[[j:[a-zA-Z0-9]*]]: index

// LINALG-LABEL: split_vector_transfer_read_2d(
//  LINALG-SAME: %[[A:[a-zA-Z0-9]*]]: memref
//  LINALG-SAME: %[[i:[a-zA-Z0-9]*]]: index
//  LINALG-SAME: %[[j:[a-zA-Z0-9]*]]: index
func.func @split_vector_transfer_read_2d(%A: memref<?x8xf32>, %i: index, %j: index) -> vector<4x8xf32> {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32

  //  CHECK-DAG: %[[c8:.*]] = arith.constant 8 : index
  //  CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
  // alloca for boundary full tile
  //      CHECK: %[[alloc:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32>
  // %i + 4 <= dim(%A, 0)
  //      CHECK: %[[idx0:.*]] = affine.apply #[[$map_p4]]()[%[[i]]]
  //      CHECK: %[[d0:.*]] = memref.dim %[[A]], %[[c0]] : memref<?x8xf32>
  //      CHECK: %[[cmp0:.*]] = arith.cmpi sle, %[[idx0]], %[[d0]] : index
  // %j + 8 <= dim(%A, 1)
  //      CHECK: %[[idx1:.*]] = affine.apply #[[$map_p8]]()[%[[j]]]
  //      CHECK: %[[cmp1:.*]] = arith.cmpi sle, %[[idx1]], %[[c8]] : index
  // are both conditions true
  //      CHECK: %[[cond:.*]] = arith.andi %[[cmp0]], %[[cmp1]] : i1
  //      CHECK: %[[ifres:.*]]:3 = scf.if %[[cond]] -> (memref<?x8xf32>, index, index) {
  //               inBounds, just yield %A
  //      CHECK:   scf.yield %[[A]], %[[i]], %[[j]] : memref<?x8xf32>, index, index
  //      CHECK: } else {
  //               slow path, fill tmp alloc and yield a memref_casted version of it
  //      CHECK:   %[[slow:.*]] = vector.transfer_read %[[A]][%[[i]], %[[j]]], %cst :
  // CHECK-SAME:     memref<?x8xf32>, vector<4x8xf32>
  //      CHECK:   %[[cast_alloc:.*]] = vector.type_cast %[[alloc]] :
  // CHECK-SAME:     memref<4x8xf32> to memref<vector<4x8xf32>>
  //      CHECK:   store %[[slow]], %[[cast_alloc]][] : memref<vector<4x8xf32>>
  //      CHECK:   %[[yielded:.*]] = memref.cast %[[alloc]] :
  // CHECK-SAME:     memref<4x8xf32> to memref<?x8xf32>
  //      CHECK:   scf.yield %[[yielded]], %[[c0]], %[[c0]] :
  // CHECK-SAME:     memref<?x8xf32>, index, index
  //      CHECK: }
  //      CHECK: %[[res:.*]] = vector.transfer_read %[[ifres]]#0[%[[ifres]]#1, %[[ifres]]#2], %cst
  // CHECK-SAME:   {in_bounds = [true, true]} : memref<?x8xf32>, vector<4x8xf32>

  //  LINALG-DAG: %[[c0:.*]] = arith.constant 0 : index
  //  LINALG-DAG: %[[c4:.*]] = arith.constant 4 : index
  //  LINALG-DAG: %[[c8:.*]] = arith.constant 8 : index
  // alloca for boundary full tile
  //      LINALG: %[[alloc:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32>
  // %i + 4 <= dim(%A, 0)
  //      LINALG: %[[idx0:.*]] = affine.apply #[[$map_p4]]()[%[[i]]]
  //      LINALG: %[[d0:.*]] = memref.dim %[[A]], %[[c0]] : memref<?x8xf32>
  //      LINALG: %[[cmp0:.*]] = arith.cmpi sle, %[[idx0]], %[[d0]] : index
  // %j + 8 <= dim(%A, 1)
  //      LINALG: %[[idx1:.*]] = affine.apply #[[$map_p8]]()[%[[j]]]
  //      LINALG: %[[cmp1:.*]] = arith.cmpi sle, %[[idx1]], %[[c8]] : index
  // are both conditions true
  //      LINALG: %[[cond:.*]] = arith.andi %[[cmp0]], %[[cmp1]] : i1
  //      LINALG: %[[ifres:.*]]:3 = scf.if %[[cond]] -> (memref<?x8xf32>, index, index) {
  //               inBounds, just yield %A
  //      LINALG:   scf.yield %[[A]], %[[i]], %[[j]] : memref<?x8xf32>, index, index
  //      LINALG: } else {
  //               slow path, fill tmp alloc and yield a memref_casted version of it
  //      LINALG:   linalg.fill ins(%cst : f32) outs(%[[alloc]] : memref<4x8xf32>)
  //      LINALG:   %[[d0:.*]] = memref.dim %[[A]], %[[c0]] : memref<?x8xf32>
  //      LINALG:   %[[sv0:.*]] = affine.min #[[$bounds_map_4]](%[[d0]], %[[i]], %[[c4]])
  //      LINALG:   %[[sv1:.*]] = affine.min #[[$bounds_map_8]](%[[c8]], %[[j]], %[[c8]])
  //      LINALG:   %[[sv:.*]] = memref.subview %[[A]][%[[i]], %[[j]]] [%[[sv0]], %[[sv1]]] [1, 1]
  // LINALG-SAME:     memref<?x8xf32> to memref<?x?xf32, #[[$map_2d_stride_8x1]]>
  //      LINALG:   %[[alloc_view:.*]] = memref.subview %[[alloc]][0, 0] [%[[sv0]], %[[sv1]]] [1, 1]
  //      LINALG:   memref.copy %[[sv]], %[[alloc_view]] : memref<?x?xf32, #[[$map_2d_stride_8x1]]> to memref<?x?xf32, #{{.*}}>
  //      LINALG:   %[[yielded:.*]] = memref.cast %[[alloc]] :
  // LINALG-SAME:     memref<4x8xf32> to memref<?x8xf32>
  //      LINALG:   scf.yield %[[yielded]], %[[c0]], %[[c0]] :
  // LINALG-SAME:     memref<?x8xf32>, index, index
  //      LINALG: }
  //      LINALG: %[[res:.*]] = vector.transfer_read %[[ifres]]#0[%[[ifres]]#1, %[[ifres]]#2], %cst
  // LINALG-SAME:   {in_bounds = [true, true]} : memref<?x8xf32>, vector<4x8xf32>
  %1 = vector.transfer_read %A[%i, %j], %f0 : memref<?x8xf32>, vector<4x8xf32>

  // LINALG: return %[[res]] : vector<4x8xf32>
  return %1: vector<4x8xf32>
}
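
// Same splitting for a statically-shaped, strided source: the row bound folds
// to the constant 7, and because memref<7x8xf32, strided> is not the scf.if
// result type, the fast path additionally yields a memref.cast of %A to the
// dynamic memref<?x8xf32, strided> form so that both branches agree. This is
// an illustrative reading of the test below; its CHECK and LINALG lines
// remain the authoritative spec.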

// CHECK-LABEL: split_vector_transfer_read_strided_2d(
//  CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref
//  CHECK-SAME: %[[i:[a-zA-Z0-9]*]]: index
//  CHECK-SAME: %[[j:[a-zA-Z0-9]*]]: index

// LINALG-LABEL: split_vector_transfer_read_strided_2d(
//  LINALG-SAME: %[[A:[a-zA-Z0-9]*]]: memref
//  LINALG-SAME: %[[i:[a-zA-Z0-9]*]]: index
//  LINALG-SAME: %[[j:[a-zA-Z0-9]*]]: index
func.func @split_vector_transfer_read_strided_2d(
    %A: memref<7x8xf32, offset:?, strides:[?, 1]>,
    %i: index, %j: index) -> vector<4x8xf32> {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32

  //  CHECK-DAG: %[[c7:.*]] = arith.constant 7 : index
  //  CHECK-DAG: %[[c8:.*]] = arith.constant 8 : index
  //  CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
  // alloca for boundary full tile
  //      CHECK: %[[alloc:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32>
  // %i + 4 <= dim(%A, 0)
  //      CHECK: %[[idx0:.*]] = affine.apply #[[$map_p4]]()[%[[i]]]
  //      CHECK: %[[cmp0:.*]] = arith.cmpi sle, %[[idx0]], %[[c7]] : index
  // %j + 8 <= dim(%A, 1)
  //      CHECK: %[[idx1:.*]] = affine.apply #[[$map_p8]]()[%[[j]]]
  //      CHECK: %[[cmp1:.*]] = arith.cmpi sle, %[[idx1]], %[[c8]] : index
  // are both conditions true
  //      CHECK: %[[cond:.*]] = arith.andi %[[cmp0]], %[[cmp1]] : i1
  //      CHECK: %[[ifres:.*]]:3 = scf.if %[[cond]] -> (memref<?x8xf32, #[[$map_2d_stride_1]]>, index, index) {
  //               inBounds but not cast-compatible: yield a memref_casted form of %A
  //      CHECK:   %[[casted:.*]] = memref.cast %arg0 :
  // CHECK-SAME:     memref<7x8xf32, #[[$map_2d_stride_1]]> to memref<?x8xf32, #[[$map_2d_stride_1]]>
  //      CHECK:   scf.yield %[[casted]], %[[i]], %[[j]] :
  // CHECK-SAME:     memref<?x8xf32, #[[$map_2d_stride_1]]>, index, index
  //      CHECK: } else {
  //               slow path, fill tmp alloc and yield a memref_casted version of it
  //      CHECK:   %[[slow:.*]] = vector.transfer_read %[[A]][%[[i]], %[[j]]], %cst :
  // CHECK-SAME:     memref<7x8xf32, #[[$map_2d_stride_1]]>, vector<4x8xf32>
  //      CHECK:   %[[cast_alloc:.*]] = vector.type_cast %[[alloc]] :
  // CHECK-SAME:     memref<4x8xf32> to memref<vector<4x8xf32>>
  //      CHECK:   store %[[slow]], %[[cast_alloc]][] :
  // CHECK-SAME:     memref<vector<4x8xf32>>
  //      CHECK:   %[[yielded:.*]] = memref.cast %[[alloc]] :
  // CHECK-SAME:     memref<4x8xf32> to memref<?x8xf32, #[[$map_2d_stride_1]]>
  //      CHECK:   scf.yield %[[yielded]], %[[c0]], %[[c0]] :
  // CHECK-SAME:     memref<?x8xf32, #[[$map_2d_stride_1]]>, index, index
  //      CHECK: }
  //      CHECK: %[[res:.*]] = vector.transfer_read {{.*}} {in_bounds = [true, true]} :
  // CHECK-SAME:   memref<?x8xf32, #[[$map_2d_stride_1]]>, vector<4x8xf32>

  //  LINALG-DAG: %[[c0:.*]] = arith.constant 0 : index
  //  LINALG-DAG: %[[c4:.*]] = arith.constant 4 : index
  //  LINALG-DAG: %[[c7:.*]] = arith.constant 7 : index
  //  LINALG-DAG: %[[c8:.*]] = arith.constant 8 : index
  // alloca for boundary full tile
  //      LINALG: %[[alloc:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32>
  // %i + 4 <= dim(%A, 0)
  //      LINALG: %[[idx0:.*]] = affine.apply #[[$map_p4]]()[%[[i]]]
  //      LINALG: %[[cmp0:.*]] = arith.cmpi sle, %[[idx0]], %[[c7]] : index
  // %j + 8 <= dim(%A, 1)
  //      LINALG: %[[idx1:.*]] = affine.apply #[[$map_p8]]()[%[[j]]]
  //      LINALG: %[[cmp1:.*]] = arith.cmpi sle, %[[idx1]], %[[c8]] : index
  // are both conditions true
  //      LINALG: %[[cond:.*]] = arith.andi %[[cmp0]], %[[cmp1]] : i1
  //      LINALG: %[[ifres:.*]]:3 = scf.if %[[cond]] -> (memref<?x8xf32, #[[$map_2d_stride_1]]>, index, index) {
  //               inBounds but not cast-compatible: yield a memref_casted form of %A
  //      LINALG:   %[[casted:.*]] = memref.cast %arg0 :
  // LINALG-SAME:     memref<7x8xf32, #[[$map_2d_stride_1]]> to memref<?x8xf32, #[[$map_2d_stride_1]]>
  //      LINALG:   scf.yield %[[casted]], %[[i]], %[[j]] :
  // LINALG-SAME:     memref<?x8xf32, #[[$map_2d_stride_1]]>, index, index
  //      LINALG: } else {
  //               slow path, fill tmp alloc and yield a memref_casted version of it
  //      LINALG:   linalg.fill ins(%cst : f32) outs(%[[alloc]] : memref<4x8xf32>)
  //      LINALG:   %[[sv0:.*]] = affine.min #[[$bounds_map_4]](%[[c7]], %[[i]], %[[c4]])
  //      LINALG:   %[[sv1:.*]] = affine.min #[[$bounds_map_8]](%[[c8]], %[[j]], %[[c8]])
  //      LINALG:   %[[sv:.*]] = memref.subview %[[A]][%[[i]], %[[j]]] [%[[sv0]], %[[sv1]]] [1, 1]
  // LINALG-SAME:     memref<7x8xf32, #[[$map_2d_stride_1]]> to memref<?x?xf32, #[[$map_2d_stride_1]]>
  //      LINALG:   %[[alloc_view:.*]] = memref.subview %[[alloc]][0, 0] [%[[sv0]], %[[sv1]]] [1, 1]
  //      LINALG:   memref.copy %[[sv]], %[[alloc_view]] : memref<?x?xf32, #[[$map_2d_stride_1]]> to memref<?x?xf32, #{{.*}}>
  //      LINALG:   %[[yielded:.*]] = memref.cast %[[alloc]] :
  // LINALG-SAME:     memref<4x8xf32> to memref<?x8xf32, #[[$map_2d_stride_1]]>
  //      LINALG:   scf.yield %[[yielded]], %[[c0]], %[[c0]] :
  // LINALG-SAME:     memref<?x8xf32, #[[$map_2d_stride_1]]>, index, index
  //      LINALG: }
  //      LINALG: %[[res:.*]] = vector.transfer_read {{.*}} {in_bounds = [true, true]} :
  // LINALG-SAME:   memref<?x8xf32, #[[$map_2d_stride_1]]>, vector<4x8xf32>
  %1 = vector.transfer_read %A[%i, %j], %f0 :
    memref<7x8xf32, offset:?, strides:[?, 1]>, vector<4x8xf32>

  // CHECK: return %[[res]] : vector<4x8xf32>
  return %1 : vector<4x8xf32>
}

// -----

func.func @split_vector_transfer_write_2d(%V: vector<4x8xf32>, %A: memref<?x8xf32>, %i: index, %j: index) {
  vector.transfer_write %V, %A[%i, %j] :
    vector<4x8xf32>, memref<?x8xf32>
  return
}
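
// The write tests split the same way, but the slow-path work happens after
// the unconditional in_bounds transfer_write: the vector is always written
// into whichever buffer the scf.if selected, and only when the original write
// was out of bounds is the data moved from the alloca back into %A. A rough,
// illustrative sketch (names are made up; the CHECK and LINALG lines below
// are authoritative):
//
//   %dst:3 = scf.if %in_bounds -> (memref<?x8xf32>, index, index) {
//     scf.yield %A, %i, %j : memref<?x8xf32>, index, index
//   } else {
//     scf.yield %casted_alloca, %c0, %c0 : memref<?x8xf32>, index, index
//   }
//   vector.transfer_write %V, %dst#0[%dst#1, %dst#2]
//     {in_bounds = [true, true]} : vector<4x8xf32>, memref<?x8xf32>
//   %out_of_bounds = arith.xori %in_bounds, %true : i1
//   scf.if %out_of_bounds {
//     // Copy back: reload the full vector from the alloca and transfer_write
//     // it (masked) to %A (default), or memref.copy the in-bounds subview of
//     // the alloca into the matching subview of %A (use-memref-copy).
//   }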

// CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 + 4)>
// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 8)>
// CHECK:     func @split_vector_transfer_write_2d(
// CHECK-SAME:                                         %[[VEC:.*]]: vector<4x8xf32>,
// CHECK-SAME:                                         %[[DEST:.*]]: memref<?x8xf32>,
// CHECK-SAME:                                         %[[I:.*]]: index,
// CHECK-SAME:                                         %[[J:.*]]: index) {
// CHECK-DAG:       %[[C8:.*]] = arith.constant 8 : index
// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG:       %[[CT:.*]] = arith.constant true
// CHECK:           %[[TEMP:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32>
// CHECK:           %[[VAL_8:.*]] = affine.apply #[[MAP0]]()[%[[I]]]
// CHECK:           %[[DIM0:.*]] = memref.dim %[[DEST]], %[[C0]] : memref<?x8xf32>
// CHECK:           %[[DIM0_IN:.*]] = arith.cmpi sle, %[[VAL_8]], %[[DIM0]] : index
// CHECK:           %[[DIM1:.*]] = affine.apply #[[MAP1]]()[%[[J]]]
// CHECK:           %[[DIM1_IN:.*]] = arith.cmpi sle, %[[DIM1]], %[[C8]] : index
// CHECK:           %[[IN_BOUNDS:.*]] = arith.andi %[[DIM0_IN]], %[[DIM1_IN]] : i1
// CHECK:           %[[IN_BOUND_DEST:.*]]:3 = scf.if %[[IN_BOUNDS]] ->
// CHECK-SAME:          (memref<?x8xf32>, index, index) {
// CHECK:             scf.yield %[[DEST]], %[[I]], %[[J]] : memref<?x8xf32>, index, index
// CHECK:           } else {
// CHECK:             %[[VAL_15:.*]] = memref.cast %[[TEMP]]
// CHECK-SAME:            : memref<4x8xf32> to memref<?x8xf32>
// CHECK:             scf.yield %[[VAL_15]], %[[C0]], %[[C0]]
// CHECK-SAME:            : memref<?x8xf32>, index, index
// CHECK:           }
// CHECK:           vector.transfer_write %[[VEC]],
// CHECK-SAME:           %[[IN_BOUND_DEST:.*]]#0[%[[IN_BOUND_DEST]]#1, %[[IN_BOUND_DEST]]#2]
// CHECK-SAME:           {in_bounds = [true, true]} : vector<4x8xf32>, memref<?x8xf32>
// CHECK:           %[[OUT_BOUNDS:.*]] = arith.xori %[[IN_BOUNDS]], %[[CT]] : i1
// CHECK:           scf.if %[[OUT_BOUNDS]] {
// CHECK:             %[[CASTED:.*]] = vector.type_cast %[[TEMP]]
// CHECK-SAME:            : memref<4x8xf32> to memref<vector<4x8xf32>>
// CHECK:             %[[RESULT_COPY:.*]] = memref.load %[[CASTED]][]
// CHECK-SAME:            : memref<vector<4x8xf32>>
// CHECK:             vector.transfer_write %[[RESULT_COPY]],
// CHECK-SAME:            %[[DEST]][%[[I]], %[[J]]]
// CHECK-SAME:            : vector<4x8xf32>, memref<?x8xf32>
// CHECK:           }
// CHECK:           return
// CHECK:         }

// LINALG-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 + 4)>
// LINALG-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 8)>
// LINALG-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2) -> (d0 - d1, 4)>
// LINALG-DAG: #[[MAP3:.*]] = affine_map<(d0, d1, d2) -> (d0 - d1, 8)>
// LINALG-DAG: #[[MAP4:.*]] = affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>
// LINALG:     func @split_vector_transfer_write_2d(
// LINALG-SAME:                                         %[[VEC:.*]]: vector<4x8xf32>,
// LINALG-SAME:                                         %[[DEST:.*]]: memref<?x8xf32>,
// LINALG-SAME:                                         %[[I:.*]]: index,
// LINALG-SAME:                                         %[[J:.*]]: index) {
// LINALG-DAG:       %[[CT:.*]] = arith.constant true
// LINALG-DAG:       %[[C0:.*]] = arith.constant 0 : index
// LINALG-DAG:       %[[C4:.*]] = arith.constant 4 : index
// LINALG-DAG:       %[[C8:.*]] = arith.constant 8 : index
// LINALG:           %[[TEMP:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32>
// LINALG:           %[[IDX0:.*]] = affine.apply #[[MAP0]]()[%[[I]]]
// LINALG:           %[[DIM0:.*]] = memref.dim %[[DEST]], %[[C0]] : memref<?x8xf32>
// LINALG:           %[[DIM0_IN:.*]] = arith.cmpi sle, %[[IDX0]], %[[DIM0]] : index
// LINALG:           %[[DIM1:.*]] = affine.apply #[[MAP1]]()[%[[J]]]
// LINALG:           %[[DIM1_IN:.*]] = arith.cmpi sle, %[[DIM1]], %[[C8]] : index
// LINALG:           %[[IN_BOUNDS:.*]] = arith.andi %[[DIM0_IN]], %[[DIM1_IN]] : i1
// LINALG:           %[[IN_BOUND_DEST:.*]]:3 = scf.if %[[IN_BOUNDS]]
// LINALG-SAME:          -> (memref<?x8xf32>, index, index) {
// LINALG:             scf.yield %[[DEST]], %[[I]], %[[J]] : memref<?x8xf32>, index, index
// LINALG:           } else {
// LINALG:             %[[VAL_16:.*]] = memref.cast %[[TEMP]] : memref<4x8xf32> to memref<?x8xf32>
// LINALG:             scf.yield %[[VAL_16]], %[[C0]], %[[C0]] : memref<?x8xf32>, index, index
// LINALG:           }
// LINALG:           vector.transfer_write %[[VEC]],
// LINALG-SAME:          %[[IN_BOUND_DEST:.*]]#0[%[[IN_BOUND_DEST]]#1, %[[IN_BOUND_DEST]]#2]
// LINALG-SAME:          {in_bounds = [true, true]} : vector<4x8xf32>, memref<?x8xf32>
// LINALG:           %[[OUT_BOUNDS:.*]] = arith.xori %[[IN_BOUNDS]], %[[CT]] : i1
// LINALG:           scf.if %[[OUT_BOUNDS]] {
// LINALG:             %[[VAL_19:.*]] = memref.dim %[[DEST]], %[[C0]] : memref<?x8xf32>
// LINALG-DAG:         %[[VAL_20:.*]] = affine.min #[[MAP2]](%[[VAL_19]], %[[I]], %[[C4]])
// LINALG-DAG:         %[[VAL_21:.*]] = affine.min #[[MAP3]](%[[C8]], %[[J]], %[[C8]])
// LINALG:             %[[VAL_22:.*]] = memref.subview %[[TEMP]]
// LINALG-SAME:            [%[[I]], %[[J]]] [%[[VAL_20]], %[[VAL_21]]]
// LINALG-SAME:            [1, 1] : memref<4x8xf32> to memref<?x?xf32, #[[MAP4]]>
// LINALG:             %[[DEST_VIEW:.*]] = memref.subview %[[DEST]][0, 0] [%[[VAL_20]], %[[VAL_21]]] [1, 1]
// LINALG:             memref.copy %[[VAL_22]], %[[DEST_VIEW]]
// LINALG-SAME:            : memref<?x?xf32, #[[MAP4]]> to memref<?x?xf32, #{{.*}}>
// LINALG:           }
// LINALG:           return
// LINALG:         }

// -----

func.func @split_vector_transfer_write_strided_2d(
    %V: vector<4x8xf32>, %A: memref<7x8xf32, offset:?, strides:[?, 1]>,
    %i: index, %j: index) {
  vector.transfer_write %V, %A[%i, %j] :
    vector<4x8xf32>, memref<7x8xf32, offset:?, strides:[?, 1]>
  return
}

// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>
// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 4)>
// CHECK-DAG: #[[MAP2:.*]] = affine_map<()[s0] -> (s0 + 8)>
// CHECK:   func @split_vector_transfer_write_strided_2d(
// CHECK-SAME:                                                 %[[VEC:.*]]: vector<4x8xf32>,
// CHECK-SAME:                                                 %[[DEST:.*]]: memref<7x8xf32, #[[MAP0]]>,
// CHECK-SAME:                                                 %[[I:.*]]: index,
// CHECK-SAME:                                                 %[[J:.*]]: index) {
// CHECK-DAG:       %[[C7:.*]] = arith.constant 7 : index
// CHECK-DAG:       %[[C8:.*]] = arith.constant 8 : index
// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG:       %[[CT:.*]] = arith.constant true
// CHECK:           %[[TEMP:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32>
// CHECK:           %[[DIM0:.*]] = affine.apply #[[MAP1]]()[%[[I]]]
// CHECK:           %[[DIM0_IN:.*]] = arith.cmpi sle, %[[DIM0]], %[[C7]] : index
// CHECK:           %[[DIM1:.*]] = affine.apply #[[MAP2]]()[%[[J]]]
// CHECK:           %[[DIM1_IN:.*]] = arith.cmpi sle, %[[DIM1]], %[[C8]] : index
// CHECK:           %[[IN_BOUNDS:.*]] = arith.andi %[[DIM0_IN]], %[[DIM1_IN]] : i1
// CHECK:           %[[IN_BOUND_DEST:.*]]:3 = scf.if %[[IN_BOUNDS]]
// CHECK-SAME:          -> (memref<?x8xf32, #[[MAP0]]>, index, index) {
// CHECK:             %[[VAL_15:.*]] = memref.cast %[[DEST]]
// CHECK-SAME:            : memref<7x8xf32, #[[MAP0]]> to memref<?x8xf32, #[[MAP0]]>
// CHECK:             scf.yield %[[VAL_15]], %[[I]], %[[J]]
// CHECK-SAME:            : memref<?x8xf32, #[[MAP0]]>, index, index
// CHECK:           } else {
// CHECK:             %[[VAL_16:.*]] = memref.cast %[[TEMP]]
// CHECK-SAME:            : memref<4x8xf32> to memref<?x8xf32, #[[MAP0]]>
// CHECK:             scf.yield %[[VAL_16]], %[[C0]], %[[C0]]
// CHECK-SAME:            : memref<?x8xf32, #[[MAP0]]>, index, index
// CHECK:           }
// CHECK:           vector.transfer_write %[[VEC]],
// CHECK-SAME:          %[[IN_BOUND_DEST:.*]]#0
// CHECK-SAME:          [%[[IN_BOUND_DEST]]#1, %[[IN_BOUND_DEST]]#2]
// CHECK-SAME:          {in_bounds = [true, true]} : vector<4x8xf32>, memref<?x8xf32, #[[MAP0]]>
// CHECK:           %[[OUT_BOUNDS:.*]] = arith.xori %[[IN_BOUNDS]], %[[CT]] : i1
// CHECK:           scf.if %[[OUT_BOUNDS]] {
// CHECK:             %[[VAL_19:.*]] = vector.type_cast %[[TEMP]]
// CHECK-SAME:            : memref<4x8xf32> to memref<vector<4x8xf32>>
// CHECK:             %[[VAL_20:.*]] = memref.load %[[VAL_19]][]
// CHECK-SAME:            : memref<vector<4x8xf32>>
// CHECK:             vector.transfer_write %[[VAL_20]], %[[DEST]][%[[I]], %[[J]]]
// CHECK-SAME:            : vector<4x8xf32>, memref<7x8xf32, #[[MAP0]]>
// CHECK:           }
// CHECK:           return
// CHECK:         }

// LINALG-DAG: #[[MAP0:.*]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>
// LINALG-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 4)>
// LINALG-DAG: #[[MAP2:.*]] = affine_map<()[s0] -> (s0 + 8)>
// LINALG-DAG: #[[MAP3:.*]] = affine_map<(d0, d1, d2) -> (d0 - d1, 4)>
// LINALG-DAG: #[[MAP4:.*]] = affine_map<(d0, d1, d2) -> (d0 - d1, 8)>
// LINALG-DAG: #[[MAP5:.*]] = affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>
// LINALG:   func @split_vector_transfer_write_strided_2d(
// LINALG-SAME:                                                 %[[VEC:.*]]: vector<4x8xf32>,
// LINALG-SAME:                                                 %[[DEST:.*]]: memref<7x8xf32, #[[MAP0]]>,
// LINALG-SAME:                                                 %[[I:.*]]: index,
// LINALG-SAME:                                                 %[[J:.*]]: index) {
// LINALG-DAG:       %[[C0:.*]] = arith.constant 0 : index
// LINALG-DAG:       %[[CT:.*]] = arith.constant true
// LINALG-DAG:       %[[C7:.*]] = arith.constant 7 : index
// LINALG-DAG:       %[[C4:.*]] = arith.constant 4 : index
// LINALG-DAG:       %[[C8:.*]] = arith.constant 8 : index
// LINALG:           %[[TEMP:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32>
// LINALG:           %[[DIM0:.*]] = affine.apply #[[MAP1]]()[%[[I]]]
// LINALG:           %[[DIM0_IN:.*]] = arith.cmpi sle, %[[DIM0]], %[[C7]] : index
// LINALG:           %[[DIM1:.*]] = affine.apply #[[MAP2]]()[%[[J]]]
// LINALG:           %[[DIM1_IN:.*]] = arith.cmpi sle, %[[DIM1]], %[[C8]] : index
// LINALG:           %[[IN_BOUNDS:.*]] = arith.andi %[[DIM0_IN]], %[[DIM1_IN]] : i1
// LINALG:           %[[IN_BOUND_DEST:.*]]:3 = scf.if %[[IN_BOUNDS]]
// LINALG-SAME:          -> (memref<?x8xf32, #[[MAP0]]>, index, index) {
// LINALG:             %[[VAL_16:.*]] = memref.cast %[[DEST]]
// LINALG-SAME:            : memref<7x8xf32, #[[MAP0]]> to memref<?x8xf32, #[[MAP0]]>
// LINALG:             scf.yield %[[VAL_16]], %[[I]], %[[J]]
// LINALG-SAME:            : memref<?x8xf32, #[[MAP0]]>, index, index
// LINALG:           } else {
// LINALG:             %[[VAL_17:.*]] = memref.cast %[[TEMP]]
// LINALG-SAME:            : memref<4x8xf32> to memref<?x8xf32, #[[MAP0]]>
// LINALG:             scf.yield %[[VAL_17]], %[[C0]], %[[C0]]
// LINALG-SAME:            : memref<?x8xf32, #[[MAP0]]>, index, index
// LINALG:           }
// LINALG:           vector.transfer_write %[[VEC]],
// LINALG-SAME:          %[[IN_BOUND_DEST:.*]]#0
// LINALG-SAME:          [%[[IN_BOUND_DEST]]#1, %[[IN_BOUND_DEST]]#2]
// LINALG-SAME:          {in_bounds = [true, true]}
// LINALG-SAME:          : vector<4x8xf32>, memref<?x8xf32, #[[MAP0]]>
// LINALG:           %[[OUT_BOUNDS:.*]] = arith.xori %[[IN_BOUNDS]], %[[CT]] : i1
// LINALG:           scf.if %[[OUT_BOUNDS]] {
// LINALG-DAG:         %[[VAL_20:.*]] = affine.min #[[MAP3]](%[[C7]], %[[I]], %[[C4]])
// LINALG-DAG:         %[[VAL_21:.*]] = affine.min #[[MAP4]](%[[C8]], %[[J]], %[[C8]])
// LINALG:             %[[VAL_22:.*]] = memref.subview %[[TEMP]]
// LINALG-SAME:            [%[[I]], %[[J]]] [%[[VAL_20]], %[[VAL_21]]]
// LINALG-SAME:            [1, 1] : memref<4x8xf32> to memref<?x?xf32, #[[MAP5]]>
// LINALG:             %[[DEST_VIEW:.*]] = memref.subview %[[DEST]][0, 0] [%[[VAL_20]], %[[VAL_21]]] [1, 1]
// LINALG:             memref.copy %[[VAL_22]], %[[DEST_VIEW]]
// LINALG-SAME:            : memref<?x?xf32, #[[MAP5]]> to memref<?x?xf32, #[[MAP0]]>
// LINALG:           }
// LINALG:           return
// LINALG:         }

// -----
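
// The last two tests only check where the full-tile alloca is placed: it is
// expected to stay inside the async.execute region rather than being hoisted
// into the enclosing function, whereas for scf.for it is hoisted above the
// loop (see the comment on that test).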

func.func private @fake_side_effecting_fun(%0: vector<2x2xf32>) -> ()

// CHECK-LABEL: transfer_read_within_async_execute
func.func @transfer_read_within_async_execute(%A : memref<?x?xf32>) -> !async.token {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32
  // CHECK-NOT: alloca
  //     CHECK: async.execute
  //     CHECK:   alloca
  %token = async.execute {
    %0 = vector.transfer_read %A[%c0, %c0], %f0 : memref<?x?xf32>, vector<2x2xf32>
    func.call @fake_side_effecting_fun(%0) : (vector<2x2xf32>) -> ()
    async.yield
  }
  return %token : !async.token
}

// -----

func.func private @fake_side_effecting_fun(%0: vector<2x2xf32>) -> ()

// Ensure that `alloca`s are inserted outside of loops even though loops are
// considered allocation scopes.
// CHECK-LABEL: transfer_read_within_scf_for
func.func @transfer_read_within_scf_for(%A : memref<?x?xf32>, %lb : index, %ub : index, %step : index) {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32
  // CHECK: alloca
  // CHECK: scf.for
  // CHECK-NOT: alloca
  scf.for %i = %lb to %ub step %step {
    %0 = vector.transfer_read %A[%c0, %c0], %f0 : memref<?x?xf32>, vector<2x2xf32>
    func.call @fake_side_effecting_fun(%0) : (vector<2x2xf32>) -> ()
  }
  return
}