// File: vector-warp-distribute.mlir
//
// Package info: llvm-toolchain-15 1:15.0.6-4
//   * links: PTS, VCS
//   * area: main
//   * in suites: bookworm
//   * size: 1,554,644 kB
//   * sloc: cpp: 5,922,452; ansic: 1,012,136; asm: 674,362; python: 191,568; objc: 73,855; f90: 42,327; lisp: 31,913; pascal: 11,973; javascript: 10,144; sh: 9,421; perl: 7,447; ml: 5,527; awk: 3,523; makefile: 2,520; xml: 885; cs: 573; fortran: 567
// File content: 631 lines | stat: -rw-r--r-- 30,946 bytes
// RUN: mlir-opt %s -allow-unregistered-dialect -split-input-file -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if | FileCheck %s --check-prefix=CHECK-SCF-IF
// RUN: mlir-opt %s -allow-unregistered-dialect -split-input-file -test-vector-warp-distribute="hoist-uniform" | FileCheck --check-prefixes=CHECK-HOIST %s
// RUN: mlir-opt %s -allow-unregistered-dialect -split-input-file -test-vector-warp-distribute="hoist-uniform distribute-transfer-write" | FileCheck --check-prefixes=CHECK-D %s
// RUN: mlir-opt %s -allow-unregistered-dialect -split-input-file -test-vector-warp-distribute=propagate-distribution -canonicalize | FileCheck --check-prefixes=CHECK-PROP %s

// Test case: a two-operand, two-result vector.warp_execute_on_lane_0 op,
// verified only under the CHECK-SCF-IF prefix (the
// rewrite-warp-ops-to-scf-if run). The directives below expect:
//   * four "private" memref.global buffers in memory space 3, sized to the
//     two region arguments (128xf32, 256xf32) and the two yielded values
//     (32xf32, 64xf32);
//   * each operand stored into its buffer at offset laneid * 4 / laneid * 8,
//     followed by a gpu.barrier;
//   * the region body wrapped in scf.if on (laneid == 0), loading the full
//     vectors at offset 0 and storing the yielded values at offset 0;
//   * after a second gpu.barrier, the per-lane results loaded back at offset
//     laneid (vector<1xf32>) and laneid * 2 (vector<2xf32>).
// CHECK-SCF-IF-DAG: memref.global "private" @__shared_32xf32 : memref<32xf32, 3>
// CHECK-SCF-IF-DAG: memref.global "private" @__shared_64xf32 : memref<64xf32, 3>
// CHECK-SCF-IF-DAG: memref.global "private" @__shared_128xf32 : memref<128xf32, 3>
// CHECK-SCF-IF-DAG: memref.global "private" @__shared_256xf32 : memref<256xf32, 3>

// CHECK-SCF-IF-LABEL: func @rewrite_warp_op_to_scf_if(
//  CHECK-SCF-IF-SAME:     %[[laneid:.*]]: index,
//  CHECK-SCF-IF-SAME:     %[[v0:.*]]: vector<4xf32>, %[[v1:.*]]: vector<8xf32>)
func.func @rewrite_warp_op_to_scf_if(%laneid: index,
                                %v0: vector<4xf32>, %v1: vector<8xf32>) {
//   CHECK-SCF-IF-DAG:   %[[c0:.*]] = arith.constant 0 : index
//   CHECK-SCF-IF-DAG:   %[[c2:.*]] = arith.constant 2 : index
//   CHECK-SCF-IF-DAG:   %[[c4:.*]] = arith.constant 4 : index
//   CHECK-SCF-IF-DAG:   %[[c8:.*]] = arith.constant 8 : index
//       CHECK-SCF-IF:   %[[is_lane_0:.*]] = arith.cmpi eq, %[[laneid]], %[[c0]]

//       CHECK-SCF-IF:   %[[buffer_v0:.*]] = memref.get_global @__shared_128xf32
//       CHECK-SCF-IF:   %[[s0:.*]] = arith.muli %[[laneid]], %[[c4]]
//       CHECK-SCF-IF:   vector.store %[[v0]], %[[buffer_v0]][%[[s0]]]
//       CHECK-SCF-IF:   %[[buffer_v1:.*]] = memref.get_global @__shared_256xf32
//       CHECK-SCF-IF:   %[[s1:.*]] = arith.muli %[[laneid]], %[[c8]]
//       CHECK-SCF-IF:   vector.store %[[v1]], %[[buffer_v1]][%[[s1]]]

//   CHECK-SCF-IF-DAG:   gpu.barrier
//   CHECK-SCF-IF-DAG:   %[[buffer_def_0:.*]] = memref.get_global @__shared_32xf32
//   CHECK-SCF-IF-DAG:   %[[buffer_def_1:.*]] = memref.get_global @__shared_64xf32

//       CHECK-SCF-IF:   scf.if %[[is_lane_0]] {
  %r:2 = vector.warp_execute_on_lane_0(%laneid)[32]
      args(%v0, %v1 : vector<4xf32>, vector<8xf32>) -> (vector<1xf32>, vector<2xf32>) {
    ^bb0(%arg0: vector<128xf32>, %arg1: vector<256xf32>):
//       CHECK-SCF-IF:     %[[arg1:.*]] = vector.load %[[buffer_v1]][%[[c0]]] : memref<256xf32, 3>, vector<256xf32>
//       CHECK-SCF-IF:     %[[arg0:.*]] = vector.load %[[buffer_v0]][%[[c0]]] : memref<128xf32, 3>, vector<128xf32>
//       CHECK-SCF-IF:     %[[def_0:.*]] = "some_def"(%[[arg0]]) : (vector<128xf32>) -> vector<32xf32>
//       CHECK-SCF-IF:     %[[def_1:.*]] = "some_def"(%[[arg1]]) : (vector<256xf32>) -> vector<64xf32>
    %2 = "some_def"(%arg0) : (vector<128xf32>) -> vector<32xf32>
    %3 = "some_def"(%arg1) : (vector<256xf32>) -> vector<64xf32>
//       CHECK-SCF-IF:     vector.store %[[def_0]], %[[buffer_def_0]][%[[c0]]]
//       CHECK-SCF-IF:     vector.store %[[def_1]], %[[buffer_def_1]][%[[c0]]]
    vector.yield %2, %3 : vector<32xf32>, vector<64xf32>
  }
//       CHECK-SCF-IF:   }
//       CHECK-SCF-IF:   gpu.barrier
//       CHECK-SCF-IF:   %[[o1:.*]] = arith.muli %[[laneid]], %[[c2]]
//       CHECK-SCF-IF:   %[[r1:.*]] = vector.load %[[buffer_def_1]][%[[o1]]] : memref<64xf32, 3>, vector<2xf32>
//       CHECK-SCF-IF:   %[[r0:.*]] = vector.load %[[buffer_def_0]][%[[laneid]]] : memref<32xf32, 3>, vector<1xf32>
//       CHECK-SCF-IF:   "some_use"(%[[r0]]) : (vector<1xf32>) -> ()
//       CHECK-SCF-IF:   "some_use"(%[[r1]]) : (vector<2xf32>) -> ()
  "some_use"(%r#0) : (vector<1xf32>) -> ()
  "some_use"(%r#1) : (vector<2xf32>) -> ()
  return
}

// -----

// Test case: a warp region containing three subviews, four transfer_reads,
// two addf ops and two transfer_writes.
//   * CHECK-HOIST expects the three memref.subview ops to be printed before
//     the warp op.
//   * CHECK-D expects a two-result warp op that yields the 64xf32 and 32xf32
//     sums (as vector<2xf32> / vector<1xf32> per lane), followed by per-lane
//     transfer_writes; the vector<2xf32> write is indexed through MAP1
//     (s0 * 2 + 32).
// The destination memref of the vector<2xf32> write is matched with a
// wildcard, like the vector<1xf32> write, instead of a literal printed SSA
// number, so the directive does not depend on how many values precede it.
// NOTE(review): no RUN line in this file enables the CHECK-ALL prefix, so the
// CHECK-ALL directives in this test case are never evaluated by FileCheck.
// Confirm which pipeline they were written for before wiring them up.
// CHECK-D-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 * 2 + 32)>

// CHECK-ALL-LABEL: func @warp(
// CHECK-HOIST: memref.subview
// CHECK-HOIST: memref.subview
// CHECK-HOIST: memref.subview
// CHECK-HOIST: vector.warp_execute_on_lane_0

//     CHECK-D: %[[R:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<2xf32>, vector<1xf32>) {
//     CHECK-D:   arith.addf {{.*}} : vector<32xf32>
//     CHECK-D:   arith.addf {{.*}} : vector<64xf32>
//     CHECK-D:   vector.yield %{{.*}}, %{{.*}} : vector<64xf32>, vector<32xf32>
// CHECK-D-DAG: vector.transfer_write %[[R]]#1, %{{.*}}[%{{.*}}] {in_bounds = [true]} : vector<1xf32>, memref<128xf32
// CHECK-D-DAG: %[[ID1:.*]] = affine.apply #[[MAP1]]()[%{{.*}}]
// CHECK-D-DAG: vector.transfer_write %[[R]]#0, %{{.*}}[%[[ID1]]] {in_bounds = [true]} : vector<2xf32>, memref<128xf32

// CHECK-ALL-NOT: vector.warp_execute_on_lane_0
// CHECK-ALL: vector.transfer_read {{.*}} vector<1xf32>
// CHECK-ALL: vector.transfer_read {{.*}} vector<1xf32>
// CHECK-ALL: vector.transfer_read {{.*}} vector<2xf32>
// CHECK-ALL: vector.transfer_read {{.*}} vector<2xf32>
// CHECK-ALL: arith.addf {{.*}} : vector<1xf32>
// CHECK-ALL: arith.addf {{.*}} : vector<2xf32>
// CHECK-ALL: vector.transfer_write {{.*}} : vector<1xf32>
// CHECK-ALL: vector.transfer_write {{.*}} : vector<2xf32>

#map0 =  affine_map<(d0)[s0] -> (d0 + s0)>
func.func @warp(%laneid: index, %arg1: memref<1024xf32>, %arg2: memref<1024xf32>,
           %arg3: memref<1024xf32>, %gid : index) {
  vector.warp_execute_on_lane_0(%laneid)[32] {
    // 128-element windows of the three buffers, all starting at %gid.
    %sa = memref.subview %arg1[%gid] [128] [1] : memref<1024xf32> to memref<128xf32, #map0>
    %sb = memref.subview %arg2[%gid] [128] [1] : memref<1024xf32> to memref<128xf32, #map0>
    %sc = memref.subview %arg3[%gid] [128] [1] : memref<1024xf32> to memref<128xf32, #map0>
    %c0 = arith.constant 0 : index
    %c32 = arith.constant 32 : index
    %cst = arith.constant 0.000000e+00 : f32
    %2 = vector.transfer_read %sa[%c0], %cst : memref<128xf32, #map0>, vector<32xf32>
    %3 = vector.transfer_read %sa[%c32], %cst : memref<128xf32, #map0>, vector<32xf32>
    %4 = vector.transfer_read %sb[%c0], %cst : memref<128xf32, #map0>, vector<64xf32>
    %5 = vector.transfer_read %sb[%c32], %cst : memref<128xf32, #map0>, vector<64xf32>
    %6 = arith.addf %2, %3 : vector<32xf32>
    %7 = arith.addf %4, %5 : vector<64xf32>
    // The 32-wide sum goes to offset 0, the 64-wide sum to offset 32.
    vector.transfer_write %6, %sc[%c0] : vector<32xf32>, memref<128xf32, #map0>
    vector.transfer_write %7, %sc[%c32] : vector<64xf32>, memref<128xf32, #map0>
  }
  return
}

// -----

// Test case (CHECK-D prefix): two transfer_writes of single-element vectors
// (vector<1xf32> and vector<1x1xf32>) produced by opaque ops inside a warp
// region. The directives expect one warp op that yields both values with
// unchanged types, followed by two further warp ops, each holding one of the
// transfer_writes and consuming the matching result of the first.
// CHECK-D-LABEL: func @warp_extract(
//       CHECK-D:   %[[WARPOP:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1x1xf32>)
//       CHECK-D:     "test.dummy_op"
//       CHECK-D:     "test.dummy_op"
//       CHECK-D:     vector.yield %{{.*}}, %{{.*}} : vector<1xf32>, vector<1x1xf32>
//       CHECK-D:   }
//       CHECK-D:   vector.warp_execute_on_lane_0(%{{.*}})[32] {
//       CHECK-D:     vector.transfer_write %[[WARPOP]]#1, %{{.*}}[%{{.*}}] {{.*}} : vector<1x1xf32>
//       CHECK-D:   }
//       CHECK-D:   vector.warp_execute_on_lane_0(%{{.*}})[32] {
//       CHECK-D:     vector.transfer_write %[[WARPOP]]#0, %{{.*}}[%{{.*}}] {{.*}} : vector<1xf32>
//       CHECK-D:   }

func.func @warp_extract(%laneid: index, %arg1: memref<1024x1024xf32>, %gid : index) {
  vector.warp_execute_on_lane_0(%laneid)[32] {
    %c0 = arith.constant 0 : index
    %v = "test.dummy_op"() : () -> (vector<1xf32>)
    %v1 = "test.dummy_op"() : () -> (vector<1x1xf32>)
    vector.transfer_write %v1, %arg1[%c0, %c0] : vector<1x1xf32>, memref<1024x1024xf32>
    vector.transfer_write %v, %arg1[%c0, %c0] : vector<1xf32>, memref<1024x1024xf32>
  }
  return
}

// -----

// Test case (CHECK-PROP prefix): a three-result warp op of which only result
// #1 is used by the return. The directives expect a single-result warp op
// that yields one vector<32xf32>, with that result returned.
// CHECK-PROP-LABEL:   func @warp_dead_result(
func.func @warp_dead_result(%laneid: index) -> (vector<1xf32>) {
  // CHECK-PROP: %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>)
  %r:3 = vector.warp_execute_on_lane_0(%laneid)[32] ->
    (vector<1xf32>, vector<1xf32>, vector<1xf32>) {
    %2 = "some_def"() : () -> (vector<32xf32>)
    %3 = "some_def"() : () -> (vector<32xf32>)
    %4 = "some_def"() : () -> (vector<32xf32>)
  // CHECK-PROP:   vector.yield %{{.*}} : vector<32xf32>
    vector.yield %2, %3, %4 : vector<32xf32>, vector<32xf32>, vector<32xf32>
  }
  // CHECK-PROP: return %[[R]] : vector<1xf32>
  return %r#1 : vector<1xf32>
}

// -----

// Test case (CHECK-PROP prefix): the warp region does nothing but yield its
// own block argument, which corresponds to the distributed operand %v0. The
// directives expect the function to return its vector<4xf32> argument
// directly.
// CHECK-PROP-LABEL:   func @warp_propagate_operand(
//  CHECK-PROP-SAME:   %[[ID:.*]]: index, %[[V:.*]]: vector<4xf32>)
func.func @warp_propagate_operand(%laneid: index, %v0: vector<4xf32>)
  -> (vector<4xf32>) {
  %r = vector.warp_execute_on_lane_0(%laneid)[32]
     args(%v0 : vector<4xf32>) -> (vector<4xf32>) {
     ^bb0(%arg0 : vector<128xf32>) :
    vector.yield %arg0 : vector<128xf32>
  }
  // CHECK-PROP: return %[[V]] : vector<4xf32>
  return %r : vector<4xf32>
}

// -----

// Test case (CHECK-PROP prefix): two elementwise arith.addf ops inside a warp
// region whose results are yielded and then written per lane. The directives
// expect the warp op to yield the four addf operands instead (two
// vector<32xf32>, two vector<64xf32>) and the addf ops to be printed after
// the warp op on the per-lane types vector<1xf32> / vector<2xf32>.
// The function body defines only values it actually uses; it previously also
// declared three unused constants, one of them named %c32 while holding 0.
#map0 = affine_map<()[s0] -> (s0 * 2)>

// CHECK-PROP-LABEL:   func @warp_propagate_elementwise(
func.func @warp_propagate_elementwise(%laneid: index, %dest: memref<1024xf32>) {
  // CHECK-PROP: %[[R:.*]]:4 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xf32>, vector<2xf32>, vector<2xf32>)
  %r:2 = vector.warp_execute_on_lane_0(%laneid)[32] ->
    (vector<1xf32>, vector<2xf32>) {
    // CHECK-PROP: %[[V0:.*]] = "some_def"() : () -> vector<32xf32>
    // CHECK-PROP: %[[V1:.*]] = "some_def"() : () -> vector<32xf32>
    // CHECK-PROP: %[[V2:.*]] = "some_def"() : () -> vector<64xf32>
    // CHECK-PROP: %[[V3:.*]] = "some_def"() : () -> vector<64xf32>
    // CHECK-PROP: vector.yield %[[V0]], %[[V1]], %[[V2]], %[[V3]] : vector<32xf32>, vector<32xf32>, vector<64xf32>, vector<64xf32>
    %2 = "some_def"() : () -> (vector<32xf32>)
    %3 = "some_def"() : () -> (vector<32xf32>)
    %4 = "some_def"() : () -> (vector<64xf32>)
    %5 = "some_def"() : () -> (vector<64xf32>)
    %6 = arith.addf %2, %3 : vector<32xf32>
    %7 = arith.addf %4, %5 : vector<64xf32>
    vector.yield %6, %7 : vector<32xf32>, vector<64xf32>
  }
  // CHECK-PROP: %[[A0:.*]] = arith.addf %[[R]]#2, %[[R]]#3 : vector<2xf32>
  // CHECK-PROP: %[[A1:.*]] = arith.addf %[[R]]#0, %[[R]]#1 : vector<1xf32>
  // Per-lane offset of the two-element write: laneid * 2.
  %id2 = affine.apply #map0()[%laneid]
  // CHECK-PROP: vector.transfer_write %[[A1]], {{.*}} : vector<1xf32>, memref<1024xf32>
  // CHECK-PROP: vector.transfer_write %[[A0]], {{.*}} : vector<2xf32>, memref<1024xf32>
  vector.transfer_write %r#0, %dest[%laneid] : vector<1xf32>, memref<1024xf32>
  vector.transfer_write %r#1, %dest[%id2] : vector<2xf32>, memref<1024xf32>
  return
}

// -----

// Test case (CHECK-PROP prefix): an arith.addf on scalar f32 values inside a
// warp region. The directives expect a two-result warp op that yields the two
// "some_def" values, with the f32 addf printed after the warp op on those
// results.
// CHECK-PROP-LABEL: func @warp_propagate_scalar_arith(
//       CHECK-PROP:   %[[r:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} {
//       CHECK-PROP:     %[[some_def0:.*]] = "some_def"
//       CHECK-PROP:     %[[some_def1:.*]] = "some_def"
//       CHECK-PROP:     vector.yield %[[some_def0]], %[[some_def1]]
//       CHECK-PROP:   }
//       CHECK-PROP:   arith.addf %[[r]]#0, %[[r]]#1 : f32
func.func @warp_propagate_scalar_arith(%laneid: index) {
  %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %0 = "some_def"() : () -> (f32)
    %1 = "some_def"() : () -> (f32)
    %2 = arith.addf %0, %1 : f32
    vector.yield %2 : f32
  }
  vector.print %r : f32
  return
}

// -----

// Test case (CHECK-PROP prefix): an arith.sitofp whose only operand is a
// function argument defined outside the warp region. The directives expect no
// warp op before the sitofp, and the cast result to be returned.
// CHECK-PROP-LABEL: func @warp_propagate_cast(
//   CHECK-PROP-NOT:   vector.warp_execute_on_lane_0
//       CHECK-PROP:   %[[result:.*]] = arith.sitofp %{{.*}} : i32 to f32
//       CHECK-PROP:   return %[[result]]
func.func @warp_propagate_cast(%laneid : index, %i : i32) -> (f32) {
  %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %casted = arith.sitofp %i : i32 to f32
    vector.yield %casted : f32
  }
  return %r : f32
}

// -----

// Test case (CHECK-PROP prefix): two transfer_reads inside a warp region,
// both starting at offset 0 of %src, yielded as vector<1xf32> / vector<2xf32>
// per lane. The directives expect no warp op before the reads, a
// vector<1xf32> read indexed by the lane id, and a vector<2xf32> read indexed
// through MAP0 (s0 * 2) applied to the lane id.
// Both reads share the single zero constant %c0; the second read previously
// used a separate constant named %c32 whose value was also 0.
#map0 = affine_map<()[s0] -> (s0 * 2)>

//  CHECK-PROP-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 * 2)>

// CHECK-PROP:   func @warp_propagate_read
//  CHECK-PROP-SAME:     (%[[ID:.*]]: index
func.func @warp_propagate_read(%laneid: index, %src: memref<1024xf32>, %dest: memref<1024xf32>) {
// CHECK-PROP-NOT: warp_execute_on_lane_0
// CHECK-PROP-DAG: %[[R0:.*]] = vector.transfer_read %arg1[%[[ID]]], %{{.*}} : memref<1024xf32>, vector<1xf32>
// CHECK-PROP-DAG: %[[ID2:.*]] = affine.apply #[[MAP0]]()[%[[ID]]]
// CHECK-PROP-DAG: %[[R1:.*]] = vector.transfer_read %arg1[%[[ID2]]], %{{.*}} : memref<1024xf32>, vector<2xf32>
// CHECK-PROP: vector.transfer_write %[[R0]], {{.*}} : vector<1xf32>, memref<1024xf32>
// CHECK-PROP: vector.transfer_write %[[R1]], {{.*}} : vector<2xf32>, memref<1024xf32>
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  %r:2 = vector.warp_execute_on_lane_0(%laneid)[32] ->(vector<1xf32>, vector<2xf32>) {
    %2 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>, vector<32xf32>
    %3 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>, vector<64xf32>
    vector.yield %2, %3 : vector<32xf32>, vector<64xf32>
  }
  // Per-lane offset of the two-element write: laneid * 2.
  %id2 = affine.apply #map0()[%laneid]
  vector.transfer_write %r#0, %dest[%laneid] : vector<1xf32>, memref<1024xf32>
  vector.transfer_write %r#1, %dest[%id2] : vector<2xf32>, memref<1024xf32>
  return
}

// -----

// Test case (CHECK-PROP prefix): a vector<1xf32> value broadcast to
// vector<32xf32> and yielded as a vector<1xf32> warp result. The directives
// expect the warp op to yield the "some_def" value itself as vector<1xf32>
// and that result to be printed.
// CHECK-PROP-LABEL: func @fold_vector_broadcast(
//       CHECK-PROP:   %[[r:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>)
//       CHECK-PROP:     %[[some_def:.*]] = "some_def"
//       CHECK-PROP:     vector.yield %[[some_def]] : vector<1xf32>
//       CHECK-PROP:   vector.print %[[r]] : vector<1xf32>
func.func @fold_vector_broadcast(%laneid: index) {
  %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
    %0 = "some_def"() : () -> (vector<1xf32>)
    %1 = vector.broadcast %0 : vector<1xf32> to vector<32xf32>
    vector.yield %1 : vector<32xf32>
  }
  vector.print %r : vector<1xf32>
  return
}

// -----

// Test case (CHECK-PROP prefix): a vector<1xf32> value broadcast to
// vector<64xf32> and yielded as a vector<2xf32> warp result. The directives
// expect the warp op to yield the vector<1xf32> source, and a
// vector.broadcast to vector<2xf32> printed after the warp op.
// CHECK-PROP-LABEL: func @extract_vector_broadcast(
//       CHECK-PROP:   %[[r:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>)
//       CHECK-PROP:     %[[some_def:.*]] = "some_def"
//       CHECK-PROP:     vector.yield %[[some_def]] : vector<1xf32>
//       CHECK-PROP:   %[[broadcasted:.*]] = vector.broadcast %[[r]] : vector<1xf32> to vector<2xf32>
//       CHECK-PROP:   vector.print %[[broadcasted]] : vector<2xf32>
func.func @extract_vector_broadcast(%laneid: index) {
  %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
    %0 = "some_def"() : () -> (vector<1xf32>)
    %1 = vector.broadcast %0 : vector<1xf32> to vector<64xf32>
    vector.yield %1 : vector<64xf32>
  }
  vector.print %r : vector<2xf32>
  return
}

// -----

// Test case (CHECK-PROP prefix): same shape as the vector-source broadcast
// test, but the broadcast source is a scalar f32. The directives expect the
// warp op to yield the f32 value, and a vector.broadcast from f32 to
// vector<2xf32> printed after the warp op.
// CHECK-PROP-LABEL: func @extract_scalar_vector_broadcast(
//       CHECK-PROP:   %[[r:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (f32)
//       CHECK-PROP:     %[[some_def:.*]] = "some_def"
//       CHECK-PROP:     vector.yield %[[some_def]] : f32
//       CHECK-PROP:   %[[broadcasted:.*]] = vector.broadcast %[[r]] : f32 to vector<2xf32>
//       CHECK-PROP:   vector.print %[[broadcasted]] : vector<2xf32>
func.func @extract_scalar_vector_broadcast(%laneid: index) {
  %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
    %0 = "some_def"() : () -> (f32)
    %1 = vector.broadcast %0 : f32 to vector<64xf32>
    vector.yield %1 : vector<64xf32>
  }
  vector.print %r : vector<2xf32>
  return
}

// -----

// Test case (CHECK-PROP prefix): an scf.for with one vector<128xf32>
// iter_arg nested inside a warp region. The directives expect:
//   * a first warp op that yields only the loop's initial value;
//   * the scf.for printed outside any warp op, carrying vector<4xf32>;
//   * a new warp op inside the loop body that takes the iter_arg through
//     args(...) and yields the updated accumulator.
// CHECK-PROP-LABEL:   func @warp_scf_for(
// CHECK-PROP: %[[INI:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>) {
// CHECK-PROP:   %[[INI1:.*]] = "some_def"() : () -> vector<128xf32>
// CHECK-PROP:   vector.yield %[[INI1]] : vector<128xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[F:.*]] = scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[FARG:.*]] = %[[INI]]) -> (vector<4xf32>) {
// CHECK-PROP:   %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG]] : vector<4xf32>) -> (vector<4xf32>) {
// CHECK-PROP:    ^bb0(%[[ARG:.*]]: vector<128xf32>):
// CHECK-PROP:      %[[ACC:.*]] = "some_def"(%[[ARG]]) : (vector<128xf32>) -> vector<128xf32>
// CHECK-PROP:      vector.yield %[[ACC]] : vector<128xf32>
// CHECK-PROP:   }
// CHECK-PROP:   scf.yield %[[W]] : vector<4xf32>
// CHECK-PROP: }
// CHECK-PROP: "some_use"(%[[F]]) : (vector<4xf32>) -> ()
func.func @warp_scf_for(%arg0: index) {
  %c128 = arith.constant 128 : index
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %0 = vector.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) {
    %ini = "some_def"() : () -> (vector<128xf32>)
    %3 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini) -> (vector<128xf32>) {
      %acc = "some_def"(%arg4) : (vector<128xf32>) -> (vector<128xf32>)
      scf.yield %acc : vector<128xf32>
    }
    vector.yield %3 : vector<128xf32>
  }
  "some_use"(%0) : (vector<4xf32>) -> ()
  return
}

// -----

// Test case (CHECK-PROP prefix): an scf.for with two iter_args whose loop
// body yields them in swapped order (%acc2 first, then %acc1). The directives
// expect the loop printed outside the warp op with a warp op inside its body
// that yields ACC2, ACC1 in that same swapped order, and an scf.yield that
// forwards the inner warp results #0, #1 unchanged.
// CHECK-PROP-LABEL:   func @warp_scf_for_swap(
// CHECK-PROP: %[[INI:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) {
// CHECK-PROP:   %[[INI1:.*]] = "some_def"() : () -> vector<128xf32>
// CHECK-PROP:   %[[INI2:.*]] = "some_def"() : () -> vector<128xf32>
// CHECK-PROP:   vector.yield %[[INI1]], %[[INI2]] : vector<128xf32>, vector<128xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[F:.*]]:2 = scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[FARG1:.*]] = %[[INI]]#0, %[[FARG2:.*]] = %[[INI]]#1) -> (vector<4xf32>, vector<4xf32>) {
// CHECK-PROP:   %[[W:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG1]], %[[FARG2]] : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>, vector<4xf32>) {
// CHECK-PROP:    ^bb0(%[[ARG1:.*]]: vector<128xf32>, %[[ARG2:.*]]: vector<128xf32>):
// CHECK-PROP:      %[[ACC1:.*]] = "some_def"(%[[ARG1]]) : (vector<128xf32>) -> vector<128xf32>
// CHECK-PROP:      %[[ACC2:.*]] = "some_def"(%[[ARG2]]) : (vector<128xf32>) -> vector<128xf32>
// CHECK-PROP:      vector.yield %[[ACC2]], %[[ACC1]] : vector<128xf32>, vector<128xf32>
// CHECK-PROP:   }
// CHECK-PROP:   scf.yield %[[W]]#0, %[[W]]#1 : vector<4xf32>, vector<4xf32>
// CHECK-PROP: }
// CHECK-PROP: "some_use"(%[[F]]#0) : (vector<4xf32>) -> ()
// CHECK-PROP: "some_use"(%[[F]]#1) : (vector<4xf32>) -> ()
func.func @warp_scf_for_swap(%arg0: index) {
  %c128 = arith.constant 128 : index
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %0:2 = vector.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>, vector<4xf32>) {
    %ini1 = "some_def"() : () -> (vector<128xf32>)
    %ini2 = "some_def"() : () -> (vector<128xf32>)
    %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini1, %arg5 = %ini2) -> (vector<128xf32>, vector<128xf32>) {
      %acc1 = "some_def"(%arg4) : (vector<128xf32>) -> (vector<128xf32>)
      %acc2 = "some_def"(%arg5) : (vector<128xf32>) -> (vector<128xf32>)
      // Deliberately swapped relative to the iter_args order.
      scf.yield %acc2, %acc1 : vector<128xf32>, vector<128xf32>
    }
    vector.yield %3#0, %3#1 : vector<128xf32>, vector<128xf32>
  }
  "some_use"(%0#0) : (vector<4xf32>) -> ()
  "some_use"(%0#1) : (vector<4xf32>) -> ()
  return
}

// -----

// Test case (CHECK-PROP prefix): an scf.for without iter_args or results
// nested inside a result-less warp op. The directives expect, on consecutive
// output lines, the scf.for on the outside with a warp op in its body that
// contains only "some_op".
// CHECK-PROP-LABEL:   func @warp_scf_for_swap_no_yield(
// CHECK-PROP:           scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
// CHECK-PROP-NEXT:        vector.warp_execute_on_lane_0(%{{.*}})[32] {
// CHECK-PROP-NEXT:          "some_op"() : () -> ()
// CHECK-PROP-NEXT:        }
// CHECK-PROP-NEXT:      }
func.func @warp_scf_for_swap_no_yield(%arg0: index) {
  %c128 = arith.constant 128 : index
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  vector.warp_execute_on_lane_0(%arg0)[32] {
    scf.for %arg3 = %c0 to %c128 step %c1 {
      "some_op"() : () -> ()
    }
  }
  return
}

// -----

// Test case (CHECK-PROP prefix): a three-result warp op that yields an opaque
// value plus the two results of an scf.for over transfer_read / addf. The
// directives expect:
//   * one remaining warp op that yields only the opaque "some_def" value
//     (checked on consecutive lines);
//   * no further warp op; the initial reads, the loop, and the reads and addf
//     ops in its body all appear on vector<4xf32>.
// The function body defines only constants it actually uses; it previously
// also declared an unused %c256.
#map = affine_map<()[s0] -> (s0 * 4)>
#map1 = affine_map<()[s0] -> (s0 * 128 + 128)>
#map2 = affine_map<()[s0] -> (s0 * 4 + 128)>

// CHECK-PROP-LABEL:   func @warp_scf_for_multiple_yield(
//       CHECK-PROP:   vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) {
//  CHECK-PROP-NEXT:     "some_def"() : () -> vector<32xf32>
//  CHECK-PROP-NEXT:     vector.yield %{{.*}} : vector<32xf32>
//  CHECK-PROP-NEXT:   }
//   CHECK-PROP-NOT:   vector.warp_execute_on_lane_0
//       CHECK-PROP:   vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32>
//       CHECK-PROP:   vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32>
//       CHECK-PROP:   %{{.*}}:2 = scf.for {{.*}} -> (vector<4xf32>, vector<4xf32>) {
//   CHECK-PROP-NOT:     vector.warp_execute_on_lane_0
//       CHECK-PROP:     vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32>
//       CHECK-PROP:     vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32>
//       CHECK-PROP:     arith.addf {{.*}} : vector<4xf32>
//       CHECK-PROP:     arith.addf {{.*}} : vector<4xf32>
//       CHECK-PROP:     scf.yield {{.*}} : vector<4xf32>, vector<4xf32>
//       CHECK-PROP:   }
func.func @warp_scf_for_multiple_yield(%arg0: index, %arg1: memref<?xf32>, %arg2: memref<?xf32>) {
  %c128 = arith.constant 128 : index
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  %0:3 = vector.warp_execute_on_lane_0(%arg0)[32] ->
  (vector<1xf32>, vector<4xf32>, vector<4xf32>) {
    %def = "some_def"() : () -> (vector<32xf32>)
    // Initial accumulators: two 128-element reads of %arg2 at offsets 0 and 128.
    %r1 = vector.transfer_read %arg2[%c0], %cst {in_bounds = [true]} : memref<?xf32>, vector<128xf32>
    %r2 = vector.transfer_read %arg2[%c128], %cst {in_bounds = [true]} : memref<?xf32>, vector<128xf32>
    %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %r1, %arg5 = %r2)
    -> (vector<128xf32>, vector<128xf32>) {
      // Read offsets into %arg1 are derived from the induction variable.
      %o1 = affine.apply #map1()[%arg3]
      %o2 = affine.apply #map2()[%arg3]
      %4 = vector.transfer_read %arg1[%o1], %cst {in_bounds = [true]} : memref<?xf32>, vector<128xf32>
      %5 = vector.transfer_read %arg1[%o2], %cst {in_bounds = [true]} : memref<?xf32>, vector<128xf32>
      %6 = arith.addf %4, %arg4 : vector<128xf32>
      %7 = arith.addf %5, %arg5 : vector<128xf32>
      scf.yield %6, %7 : vector<128xf32>, vector<128xf32>
    }
    vector.yield %def, %3#0, %3#1 :  vector<32xf32>, vector<128xf32>, vector<128xf32>
  }
  // Per-lane write offsets: laneid * 4 and laneid * 4 + 128.
  %1 = affine.apply #map()[%arg0]
  vector.transfer_write %0#1, %arg2[%1] {in_bounds = [true]} : vector<4xf32>, memref<?xf32>
  %2 = affine.apply #map2()[%arg0]
  vector.transfer_write %0#2, %arg2[%2] {in_bounds = [true]} : vector<4xf32>, memref<?xf32>
  "some_use"(%0#0) : (vector<1xf32>) -> ()
  return
}

// -----

// Test case (CHECK-PROP prefix): a vector.reduction <add> over vector<32xf32>
// inside a warp region, yielded as a scalar. The directives expect:
//   * a warp op that yields the vector<32xf32> input as vector<1xf32>;
//   * a vector.extract of element 0 from that result;
//   * five gpu.shuffle xor steps with offsets 1, 2, 4, 8, 16 and width 32,
//     each followed by an arith.addf that accumulates the shuffled value;
//   * the final sum returned as f32.
// CHECK-PROP-LABEL: func @vector_reduction(
//  CHECK-PROP-SAME:     %[[laneid:.*]]: index)
//   CHECK-PROP-DAG:   %[[c1:.*]] = arith.constant 1 : i32
//   CHECK-PROP-DAG:   %[[c2:.*]] = arith.constant 2 : i32
//   CHECK-PROP-DAG:   %[[c4:.*]] = arith.constant 4 : i32
//   CHECK-PROP-DAG:   %[[c8:.*]] = arith.constant 8 : i32
//   CHECK-PROP-DAG:   %[[c16:.*]] = arith.constant 16 : i32
//   CHECK-PROP-DAG:   %[[c32:.*]] = arith.constant 32 : i32
//       CHECK-PROP:   %[[warp_op:.*]] = vector.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<1xf32>) {
//       CHECK-PROP:     vector.yield %{{.*}} : vector<32xf32>
//       CHECK-PROP:   }
//       CHECK-PROP:   %[[a:.*]] = vector.extract %[[warp_op]][0] : vector<1xf32>
//       CHECK-PROP:   %[[r0:.*]], %{{.*}} = gpu.shuffle  xor %[[a]], %[[c1]], %[[c32]]
//       CHECK-PROP:   %[[a0:.*]] = arith.addf %[[a]], %[[r0]]
//       CHECK-PROP:   %[[r1:.*]], %{{.*}} = gpu.shuffle  xor %[[a0]], %[[c2]], %[[c32]]
//       CHECK-PROP:   %[[a1:.*]] = arith.addf %[[a0]], %[[r1]]
//       CHECK-PROP:   %[[r2:.*]], %{{.*}} = gpu.shuffle  xor %[[a1]], %[[c4]], %[[c32]]
//       CHECK-PROP:   %[[a2:.*]] = arith.addf %[[a1]], %[[r2]]
//       CHECK-PROP:   %[[r3:.*]], %{{.*}} = gpu.shuffle  xor %[[a2]], %[[c8]], %[[c32]]
//       CHECK-PROP:   %[[a3:.*]] = arith.addf %[[a2]], %[[r3]]
//       CHECK-PROP:   %[[r4:.*]], %{{.*}} = gpu.shuffle  xor %[[a3]], %[[c16]], %[[c32]]
//       CHECK-PROP:   %[[a4:.*]] = arith.addf %[[a3]], %[[r4]]
//       CHECK-PROP:   return %[[a4]] : f32
func.func @vector_reduction(%laneid: index) -> (f32) {
  %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %0 = "some_def"() : () -> (vector<32xf32>)
    %1 = vector.reduction <add>, %0 : vector<32xf32> into f32
    vector.yield %1 : f32
  }
  return %r : f32
}

// -----

// Test case (CHECK-D prefix): a reduction result added to a value read from
// a rank-0 memref and written back through a rank-0 vector<f32>
// transfer_write. The directives expect a warp op that returns a vector<f32>
// and a second warp op containing the transfer_write of that result.
// NOTE(review): this function reuses the symbol name of the preceding test
// case and has no label directive of its own; the two only coexist because
// they sit in separate chunks of the split input.
func.func @vector_reduction(%laneid: index, %m0: memref<4x2x32xf32>, %m1: memref<f32>) {
  %c0 = arith.constant 0: index
  %f0 = arith.constant 0.0: f32
  //     CHECK-D: %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<f32>) {
  //     CHECK-D: vector.warp_execute_on_lane_0(%{{.*}})[32] {
  //     CHECK-D:   vector.transfer_write %[[R]], %{{.*}}[] : vector<f32>, memref<f32>
  vector.warp_execute_on_lane_0(%laneid)[32] {
    %0 = vector.transfer_read %m0[%c0, %c0, %c0], %f0 {in_bounds = [true]} : memref<4x2x32xf32>, vector<32xf32>
    %1 = vector.transfer_read %m1[], %f0 : memref<f32>, vector<f32>
    %2 = vector.extractelement %1[] : vector<f32>
    %3 = vector.reduction <add>, %0 : vector<32xf32> into f32
    %4 = arith.addf %3, %2 : f32
    %5 = vector.broadcast %4 : f32 to vector<f32>
    vector.transfer_write %5, %m1[] : vector<f32>, memref<f32>
  }
  return
}

// -----

// Test case (CHECK-PROP prefix): a vector.reduction <add> over vector<64xf32>,
// i.e. two elements per lane for a warp size of 32. The directives expect a
// warp op that yields the input as vector<2xf32>, a per-lane vector.reduction
// of that vector<2xf32> into f32, and then the same five-step gpu.shuffle xor
// / arith.addf sequence (offsets 1, 2, 4, 8, 16, width 32) as the 32-element
// reduction test.
// CHECK-PROP-LABEL: func @vector_reduction_large(
//  CHECK-PROP-SAME:     %[[laneid:.*]]: index)
//   CHECK-PROP-DAG:   %[[c1:.*]] = arith.constant 1 : i32
//   CHECK-PROP-DAG:   %[[c2:.*]] = arith.constant 2 : i32
//   CHECK-PROP-DAG:   %[[c4:.*]] = arith.constant 4 : i32
//   CHECK-PROP-DAG:   %[[c8:.*]] = arith.constant 8 : i32
//   CHECK-PROP-DAG:   %[[c16:.*]] = arith.constant 16 : i32
//   CHECK-PROP-DAG:   %[[c32:.*]] = arith.constant 32 : i32
//       CHECK-PROP:   %[[warp_op:.*]] = vector.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<2xf32>) {
//       CHECK-PROP:     vector.yield %{{.*}} : vector<64xf32>
//       CHECK-PROP:   }
//       CHECK-PROP:   %[[a:.*]] = vector.reduction <add>, %[[warp_op]] : vector<2xf32> into f32
//       CHECK-PROP:   %[[r0:.*]], %{{.*}} = gpu.shuffle  xor %[[a]], %[[c1]], %[[c32]]
//       CHECK-PROP:   %[[a0:.*]] = arith.addf %[[a]], %[[r0]]
//       CHECK-PROP:   %[[r1:.*]], %{{.*}} = gpu.shuffle  xor %[[a0]], %[[c2]], %[[c32]]
//       CHECK-PROP:   %[[a1:.*]] = arith.addf %[[a0]], %[[r1]]
//       CHECK-PROP:   %[[r2:.*]], %{{.*}} = gpu.shuffle  xor %[[a1]], %[[c4]], %[[c32]]
//       CHECK-PROP:   %[[a2:.*]] = arith.addf %[[a1]], %[[r2]]
//       CHECK-PROP:   %[[r3:.*]], %{{.*}} = gpu.shuffle  xor %[[a2]], %[[c8]], %[[c32]]
//       CHECK-PROP:   %[[a3:.*]] = arith.addf %[[a2]], %[[r3]]
//       CHECK-PROP:   %[[r4:.*]], %{{.*}} = gpu.shuffle  xor %[[a3]], %[[c16]], %[[c32]]
//       CHECK-PROP:   %[[a4:.*]] = arith.addf %[[a3]], %[[r4]]
//       CHECK-PROP:   return %[[a4]] : f32
func.func @vector_reduction_large(%laneid: index) -> (f32) {
  %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %0 = "some_def"() : () -> (vector<64xf32>)
    %1 = vector.reduction <add>, %0 : vector<64xf32> into f32
    vector.yield %1 : f32
  }
  return %r : f32
}

// -----

// Test case (CHECK-PROP prefix): a vector.reduction <add> over vector<64xf32>
// with an f32 accumulator operand. The directives expect a two-result warp op
// that yields the vector (as vector<2xf32>) and the accumulator (as f32), the
// per-lane reduction and five gpu.shuffle xor / arith.addf steps on result
// #0, and a final arith.addf that adds result #1 (the accumulator) before the
// return.
// CHECK-PROP-LABEL: func @vector_reduction_acc(
//  CHECK-PROP-SAME:     %[[laneid:.*]]: index)
//   CHECK-PROP-DAG:   %[[c1:.*]] = arith.constant 1 : i32
//   CHECK-PROP-DAG:   %[[c2:.*]] = arith.constant 2 : i32
//   CHECK-PROP-DAG:   %[[c4:.*]] = arith.constant 4 : i32
//   CHECK-PROP-DAG:   %[[c8:.*]] = arith.constant 8 : i32
//   CHECK-PROP-DAG:   %[[c16:.*]] = arith.constant 16 : i32
//   CHECK-PROP-DAG:   %[[c32:.*]] = arith.constant 32 : i32
//       CHECK-PROP:   %[[warp_op:.*]]:2 = vector.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<2xf32>, f32) {
//       CHECK-PROP:     vector.yield %{{.*}}, %{{.*}} : vector<64xf32>, f32
//       CHECK-PROP:   }
//       CHECK-PROP:   %[[a:.*]] = vector.reduction <add>, %[[warp_op]]#0 : vector<2xf32> into f32
//       CHECK-PROP:   %[[r0:.*]], %{{.*}} = gpu.shuffle  xor %[[a]], %[[c1]], %[[c32]]
//       CHECK-PROP:   %[[a0:.*]] = arith.addf %[[a]], %[[r0]]
//       CHECK-PROP:   %[[r1:.*]], %{{.*}} = gpu.shuffle  xor %[[a0]], %[[c2]], %[[c32]]
//       CHECK-PROP:   %[[a1:.*]] = arith.addf %[[a0]], %[[r1]]
//       CHECK-PROP:   %[[r2:.*]], %{{.*}} = gpu.shuffle  xor %[[a1]], %[[c4]], %[[c32]]
//       CHECK-PROP:   %[[a2:.*]] = arith.addf %[[a1]], %[[r2]]
//       CHECK-PROP:   %[[r3:.*]], %{{.*}} = gpu.shuffle  xor %[[a2]], %[[c8]], %[[c32]]
//       CHECK-PROP:   %[[a3:.*]] = arith.addf %[[a2]], %[[r3]]
//       CHECK-PROP:   %[[r4:.*]], %{{.*}} = gpu.shuffle  xor %[[a3]], %[[c16]], %[[c32]]
//       CHECK-PROP:   %[[a4:.*]] = arith.addf %[[a3]], %[[r4]]
//       CHECK-PROP:   %[[a5:.*]] = arith.addf %[[a4]], %[[warp_op]]#1
//       CHECK-PROP:   return %[[a5]] : f32
func.func @vector_reduction_acc(%laneid: index) -> (f32) {
  %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %0 = "some_def"() : () -> (vector<64xf32>)
    %1 = "some_def"() : () -> (f32)
    %2 = vector.reduction <add>, %0, %1 : vector<64xf32> into f32
    vector.yield %2 : f32
  }
  return %r : f32
}

// -----

// Test case (CHECK-PROP prefix): two arith.addf ops inside the warp region,
// one of which (%5) uses the same value %2 for both operands. The directives
// expect a warp op with exactly two vector<1xf32> results, no arith.addf
// between the start of the warp op and its terminator, and a vector.yield of
// two vector<32xf32> values.
// CHECK-PROP-LABEL:   func @warp_duplicate_yield(
func.func @warp_duplicate_yield(%laneid: index) -> (vector<1xf32>, vector<1xf32>) {
  //   CHECK-PROP: %{{.*}}:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xf32>)
  %r:2 = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>, vector<1xf32>) {
    %2 = "some_def"() : () -> (vector<32xf32>)
    %3 = "some_def"() : () -> (vector<32xf32>)
    %4 = arith.addf %2, %3 : vector<32xf32>
    %5 = arith.addf %2, %2 : vector<32xf32>
// CHECK-PROP-NOT:   arith.addf
//     CHECK-PROP:   vector.yield %{{.*}}, %{{.*}} : vector<32xf32>, vector<32xf32>
    vector.yield %4, %5 : vector<32xf32>, vector<32xf32>
  }
  return %r#0, %r#1 : vector<1xf32>, vector<1xf32>
}

// -----

// Test case (CHECK-PROP prefix): a splat constant dense<2.0> of type
// vector<32xf32> yielded from the warp region. The directives expect a
// dense<2.000000e+00> constant of the per-lane type vector<1xf32> that is
// returned directly.
// CHECK-PROP-LABEL: func @warp_constant(
//       CHECK-PROP:   %[[C:.*]] = arith.constant dense<2.000000e+00> : vector<1xf32>
//       CHECK-PROP:   return %[[C]] : vector<1xf32>
func.func @warp_constant(%laneid: index) -> (vector<1xf32>) {
  %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
    %cst = arith.constant dense<2.0> : vector<32xf32>
    vector.yield %cst : vector<32xf32>
  }
  return %r : vector<1xf32>
}

// -----

// Test case (CHECK-PROP prefix): a vector.extract of element 0 from a
// vector<1xf32> inside the warp region, yielded as f32. The directives expect
// the warp op to yield the vector<1xf32> source unchanged and the
// vector.extract to be printed after the warp op on its result.
// CHECK-PROP-LABEL: func.func @vector_extract_simple(
//       CHECK-PROP:   %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) {
//       CHECK-PROP:     %[[V:.*]] = "some_def"() : () -> vector<1xf32>
//       CHECK-PROP:     vector.yield %[[V]] : vector<1xf32>
//       CHECK-PROP:   }
//       CHECK-PROP:   %[[E:.*]] = vector.extract %[[R]][0] : vector<1xf32>
//       CHECK-PROP:   return %[[E]] : f32
func.func @vector_extract_simple(%laneid: index) -> (f32) {
  %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %0 = "some_def"() : () -> (vector<1xf32>)
    %1 = vector.extract %0[0] : vector<1xf32>
    vector.yield %1 : f32
  }
  return %r : f32
}