File: vector-warp-distribute.mlir

package info (click to toggle)
swiftlang 6.0.3-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,519,992 kB
  • sloc: cpp: 9,107,863; ansic: 2,040,022; asm: 1,135,751; python: 296,500; objc: 82,456; f90: 60,502; lisp: 34,951; pascal: 19,946; sh: 18,133; perl: 7,482; ml: 4,937; javascript: 4,117; makefile: 3,840; awk: 3,535; xml: 914; fortran: 619; cs: 573; ruby: 573
file content (1175 lines) | stat: -rw-r--r-- 58,819 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
// RUN: mlir-opt %s -allow-unregistered-dialect -split-input-file -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if | FileCheck %s --check-prefix=CHECK-SCF-IF
// RUN: mlir-opt %s -allow-unregistered-dialect -split-input-file -test-vector-warp-distribute="hoist-uniform" | FileCheck --check-prefixes=CHECK-HOIST %s
// RUN: mlir-opt %s -allow-unregistered-dialect -split-input-file -test-vector-warp-distribute="hoist-uniform distribute-transfer-write" | FileCheck --check-prefixes=CHECK-D %s
// RUN: mlir-opt %s -allow-unregistered-dialect -split-input-file -test-vector-warp-distribute=propagate-distribution -canonicalize | FileCheck --check-prefixes=CHECK-PROP %s
// RUN: mlir-opt %s -allow-unregistered-dialect -split-input-file -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" -canonicalize | FileCheck --check-prefixes=CHECK-DIST-AND-PROP %s

// CHECK-SCF-IF-DAG: #[[$TIMES2:.*]] = affine_map<()[s0] -> (s0 * 2)>
// CHECK-SCF-IF-DAG: #[[$TIMES4:.*]] = affine_map<()[s0] -> (s0 * 4)>
// CHECK-SCF-IF-DAG: #[[$TIMES8:.*]] = affine_map<()[s0] -> (s0 * 8)>
// CHECK-SCF-IF-DAG: memref.global "private" @__shared_32xf32 : memref<32xf32, 3>
// CHECK-SCF-IF-DAG: memref.global "private" @__shared_64xf32 : memref<64xf32, 3>
// CHECK-SCF-IF-DAG: memref.global "private" @__shared_128xf32 : memref<128xf32, 3>
// CHECK-SCF-IF-DAG: memref.global "private" @__shared_256xf32 : memref<256xf32, 3>

// Lowering of a warp op with distributed arguments and results to scf.if
// (exercised with the rewrite-warp-ops-to-scf-if option). Expected output:
//   * every lane stores its slice of %v0 / %v1 into the 128- and 256-element
//     globals (memory space 3) at offsets laneid * 4 and laneid * 8;
//   * after a gpu.barrier, the region guarded by `laneid == 0` reloads the
//     full 128- and 256-element vectors, runs the body, and stores the two
//     results into the 32- and 64-element globals;
//   * after a second gpu.barrier, every lane reloads its own slice of the
//     results (1 element at laneid, 2 elements at laneid * 2).
// CHECK-SCF-IF-LABEL: func @rewrite_warp_op_to_scf_if(
//  CHECK-SCF-IF-SAME:     %[[laneid:.*]]: index,
//  CHECK-SCF-IF-SAME:     %[[v0:.*]]: vector<4xf32>, %[[v1:.*]]: vector<8xf32>)
func.func @rewrite_warp_op_to_scf_if(%laneid: index,
                                %v0: vector<4xf32>, %v1: vector<8xf32>) {
//   CHECK-SCF-IF-DAG:   %[[c0:.*]] = arith.constant 0 : index
//       CHECK-SCF-IF:   %[[is_lane_0:.*]] = arith.cmpi eq, %[[laneid]], %[[c0]]

//       CHECK-SCF-IF:   %[[buffer_v0:.*]] = memref.get_global @__shared_128xf32
//       CHECK-SCF-IF:   %[[s0:.*]] = affine.apply #[[$TIMES4]]()[%[[laneid]]]
//       CHECK-SCF-IF:   vector.transfer_write %[[v0]], %[[buffer_v0]][%[[s0]]]
//       CHECK-SCF-IF:   %[[buffer_v1:.*]] = memref.get_global @__shared_256xf32
//       CHECK-SCF-IF:   %[[s1:.*]] = affine.apply #[[$TIMES8]]()[%[[laneid]]]
//       CHECK-SCF-IF:   vector.transfer_write %[[v1]], %[[buffer_v1]][%[[s1]]]

//   CHECK-SCF-IF-DAG:   gpu.barrier
//   CHECK-SCF-IF-DAG:   %[[buffer_def_0:.*]] = memref.get_global @__shared_32xf32
//   CHECK-SCF-IF-DAG:   %[[buffer_def_1:.*]] = memref.get_global @__shared_64xf32

//       CHECK-SCF-IF:   scf.if %[[is_lane_0]] {
  %r:2 = vector.warp_execute_on_lane_0(%laneid)[32]
      args(%v0, %v1 : vector<4xf32>, vector<8xf32>) -> (vector<1xf32>, vector<2xf32>) {
    ^bb0(%arg0: vector<128xf32>, %arg1: vector<256xf32>):
//       CHECK-SCF-IF:     %[[arg1:.*]] = vector.transfer_read %[[buffer_v1]][%[[c0]]], %{{.*}} {in_bounds = [true]} : memref<256xf32, 3>, vector<256xf32>
//       CHECK-SCF-IF:     %[[arg0:.*]] = vector.transfer_read %[[buffer_v0]][%[[c0]]], %{{.*}} {in_bounds = [true]} : memref<128xf32, 3>, vector<128xf32>
//       CHECK-SCF-IF:     %[[def_0:.*]] = "some_def"(%[[arg0]]) : (vector<128xf32>) -> vector<32xf32>
//       CHECK-SCF-IF:     %[[def_1:.*]] = "some_def"(%[[arg1]]) : (vector<256xf32>) -> vector<64xf32>
    %2 = "some_def"(%arg0) : (vector<128xf32>) -> vector<32xf32>
    %3 = "some_def"(%arg1) : (vector<256xf32>) -> vector<64xf32>
//       CHECK-SCF-IF:     vector.transfer_write %[[def_0]], %[[buffer_def_0]][%[[c0]]]
//       CHECK-SCF-IF:     vector.transfer_write %[[def_1]], %[[buffer_def_1]][%[[c0]]]
    vector.yield %2, %3 : vector<32xf32>, vector<64xf32>
  }
//       CHECK-SCF-IF:   }
//       CHECK-SCF-IF:   gpu.barrier
//       CHECK-SCF-IF:   %[[o1:.*]] = affine.apply #[[$TIMES2]]()[%[[laneid]]]
//       CHECK-SCF-IF:   %[[r1:.*]] = vector.transfer_read %[[buffer_def_1]][%[[o1]]], %{{.*}} {in_bounds = [true]} : memref<64xf32, 3>, vector<2xf32>
//       CHECK-SCF-IF:   %[[r0:.*]] = vector.transfer_read %[[buffer_def_0]][%[[laneid]]], %{{.*}} {in_bounds = [true]} : memref<32xf32, 3>, vector<1xf32>
//       CHECK-SCF-IF:   "some_use"(%[[r0]]) : (vector<1xf32>) -> ()
//       CHECK-SCF-IF:   "some_use"(%[[r1]]) : (vector<2xf32>) -> ()
  "some_use"(%r#0) : (vector<1xf32>) -> ()
  "some_use"(%r#1) : (vector<2xf32>) -> ()
  return
}

// -----

// CHECK-D-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 * 2 + 32)>

// A warp op whose body takes three subviews, reads two pairs of vectors
// (32 and 64 elements), adds each pair and writes both sums back. The same
// input is checked under three option sets:
//   * hoist-uniform: the three memref.subview ops appear before the warp op;
//   * hoist-uniform + distribute-transfer-write: the warp op yields the 64- and
//     32-element sums and the writes happen outside it on vector<2xf32> and
//     vector<1xf32> values;
//   * all of the above + propagate-distribution + canonicalize: no warp op is
//     left; reads, adds and writes all operate on 1- and 2-element vectors.
// CHECK-DIST-AND-PROP-LABEL: func @warp(
// CHECK-HOIST: memref.subview
// CHECK-HOIST: memref.subview
// CHECK-HOIST: memref.subview
// CHECK-HOIST: vector.warp_execute_on_lane_0

//     CHECK-D: %[[R:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<2xf32>, vector<1xf32>) {
//     CHECK-D:   arith.addf {{.*}} : vector<32xf32>
//     CHECK-D:   arith.addf {{.*}} : vector<64xf32>
//     CHECK-D:   vector.yield %{{.*}}, %{{.*}} : vector<64xf32>, vector<32xf32>
// CHECK-D-DAG: vector.transfer_write %[[R]]#1, %{{.*}}[%{{.*}}] {in_bounds = [true]} : vector<1xf32>, memref<128xf32
// CHECK-D-DAG: %[[ID1:.*]] = affine.apply #[[MAP1]]()[%{{.*}}]
// CHECK-D-DAG: vector.transfer_write %[[R]]#0, %{{.*}}[%[[ID1]]] {in_bounds = [true]} : vector<2xf32>, memref<128xf32

// CHECK-DIST-AND-PROP-NOT: vector.warp_execute_on_lane_0
// CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} vector<1xf32>
// CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} vector<1xf32>
// CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} vector<2xf32>
// CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} vector<2xf32>
// CHECK-DIST-AND-PROP: arith.addf {{.*}} : vector<1xf32>
// CHECK-DIST-AND-PROP: arith.addf {{.*}} : vector<2xf32>
// CHECK-DIST-AND-PROP: vector.transfer_write {{.*}} : vector<1xf32>
// CHECK-DIST-AND-PROP: vector.transfer_write {{.*}} : vector<2xf32>

func.func @warp(%laneid: index, %arg1: memref<1024xf32>, %arg2: memref<1024xf32>,
           %arg3: memref<1024xf32>, %gid : index) {
  vector.warp_execute_on_lane_0(%laneid)[32] {
    %sa = memref.subview %arg1[%gid] [128] [1] : memref<1024xf32> to memref<128xf32, strided<[1], offset: ?>>
    %sb = memref.subview %arg2[%gid] [128] [1] : memref<1024xf32> to memref<128xf32, strided<[1], offset: ?>>
    %sc = memref.subview %arg3[%gid] [128] [1] : memref<1024xf32> to memref<128xf32, strided<[1], offset: ?>>
    %c0 = arith.constant 0 : index
    %c32 = arith.constant 32 : index
    %cst = arith.constant 0.000000e+00 : f32
    %2 = vector.transfer_read %sa[%c0], %cst : memref<128xf32, strided<[1], offset: ?>>, vector<32xf32>
    %3 = vector.transfer_read %sa[%c32], %cst : memref<128xf32, strided<[1], offset: ?>>, vector<32xf32>
    %4 = vector.transfer_read %sb[%c0], %cst : memref<128xf32, strided<[1], offset: ?>>, vector<64xf32>
    %5 = vector.transfer_read %sb[%c32], %cst : memref<128xf32, strided<[1], offset: ?>>, vector<64xf32>
    %6 = arith.addf %2, %3 : vector<32xf32>
    %7 = arith.addf %4, %5 : vector<64xf32>
    vector.transfer_write %6, %sc[%c0] : vector<32xf32>, memref<128xf32, strided<[1], offset: ?>>
    vector.transfer_write %7, %sc[%c32] : vector<64xf32>, memref<128xf32, strided<[1], offset: ?>>
  }
  return
}

// -----

// Transfer writes of single-element vectors (vector<1xf32>, vector<1x1xf32>).
// With hoist-uniform + distribute-transfer-write the expected output keeps the
// two producers in a first warp op that yields both values unchanged, and
// places each write in its own separate warp op consuming those results.
// CHECK-D-LABEL: func @warp_extract(
//       CHECK-D:   %[[WARPOP:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1x1xf32>)
//       CHECK-D:     "test.dummy_op"
//       CHECK-D:     "test.dummy_op"
//       CHECK-D:     vector.yield %{{.*}}, %{{.*}} : vector<1xf32>, vector<1x1xf32>
//       CHECK-D:   }
//       CHECK-D:   vector.warp_execute_on_lane_0(%{{.*}})[32] {
//       CHECK-D:     vector.transfer_write %[[WARPOP]]#1, %{{.*}}[%{{.*}}] {{.*}} : vector<1x1xf32>
//       CHECK-D:   }
//       CHECK-D:   vector.warp_execute_on_lane_0(%{{.*}})[32] {
//       CHECK-D:     vector.transfer_write %[[WARPOP]]#0, %{{.*}}[%{{.*}}] {{.*}} : vector<1xf32>
//       CHECK-D:   }

func.func @warp_extract(%laneid: index, %arg1: memref<1024x1024xf32>, %gid : index) {
  vector.warp_execute_on_lane_0(%laneid)[32] {
    %c0 = arith.constant 0 : index
    %v = "test.dummy_op"() : () -> (vector<1xf32>)
    %v1 = "test.dummy_op"() : () -> (vector<1x1xf32>)
    vector.transfer_write %v1, %arg1[%c0, %c0] : vector<1x1xf32>, memref<1024x1024xf32>
    vector.transfer_write %v, %arg1[%c0, %c0] : vector<1xf32>, memref<1024x1024xf32>
  }
  return
}

// -----

// The warp op produces three results but only %r#1 is used. After
// propagate-distribution + canonicalize the warp op is expected to have a
// single result (and a single yielded value), which is what gets returned.
// CHECK-PROP-LABEL:   func @warp_dead_result(
func.func @warp_dead_result(%laneid: index) -> (vector<1xf32>) {
  // CHECK-PROP: %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>)
  %r:3 = vector.warp_execute_on_lane_0(%laneid)[32] ->
    (vector<1xf32>, vector<1xf32>, vector<1xf32>) {
    %2 = "some_def"() : () -> (vector<32xf32>)
    %3 = "some_def"() : () -> (vector<32xf32>)
    %4 = "some_def"() : () -> (vector<32xf32>)
  // CHECK-PROP:   vector.yield %{{.*}} : vector<32xf32>
    vector.yield %2, %3, %4 : vector<32xf32>, vector<32xf32>, vector<32xf32>
  }
  // CHECK-PROP: return %[[R]] : vector<1xf32>
  return %r#1 : vector<1xf32>
}

// -----

// The warp op body only yields its own block argument. The expected output
// returns the distributed operand %v0 directly, i.e. the warp result is
// replaced by the value that was passed in through `args`.
// CHECK-PROP-LABEL:   func @warp_propagate_operand(
//  CHECK-PROP-SAME:   %[[ID:.*]]: index, %[[V:.*]]: vector<4xf32>)
func.func @warp_propagate_operand(%laneid: index, %v0: vector<4xf32>)
  -> (vector<4xf32>) {
  %r = vector.warp_execute_on_lane_0(%laneid)[32]
     args(%v0 : vector<4xf32>) -> (vector<4xf32>) {
     ^bb0(%arg0 : vector<128xf32>) :
    vector.yield %arg0 : vector<128xf32>
  }
  // CHECK-PROP: return %[[V]] : vector<4xf32>
  return %r : vector<4xf32>
}

// -----

#map0 = affine_map<()[s0] -> (s0 * 2)>

// Elementwise ops whose operands are produced inside the warp op are expected
// to move out of it: the warp op ends up yielding the four "some_def" values
// (distributed as 1- and 2-element vectors) and the two arith.addf ops run
// outside the warp op on those distributed values before being written.
// CHECK-PROP-LABEL:   func @warp_propagate_elementwise(
func.func @warp_propagate_elementwise(%laneid: index, %dest: memref<1024xf32>) {
  // CHECK-PROP: %[[R:.*]]:4 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xf32>, vector<2xf32>, vector<2xf32>)
  %r:2 = vector.warp_execute_on_lane_0(%laneid)[32] ->
    (vector<1xf32>, vector<2xf32>) {
    // CHECK-PROP: %[[V0:.*]] = "some_def"() : () -> vector<32xf32>
    // CHECK-PROP: %[[V1:.*]] = "some_def"() : () -> vector<32xf32>
    // CHECK-PROP: %[[V2:.*]] = "some_def"() : () -> vector<64xf32>
    // CHECK-PROP: %[[V3:.*]] = "some_def"() : () -> vector<64xf32>
    // CHECK-PROP: vector.yield %[[V0]], %[[V1]], %[[V2]], %[[V3]] : vector<32xf32>, vector<32xf32>, vector<64xf32>, vector<64xf32>
    %2 = "some_def"() : () -> (vector<32xf32>)
    %3 = "some_def"() : () -> (vector<32xf32>)
    %4 = "some_def"() : () -> (vector<64xf32>)
    %5 = "some_def"() : () -> (vector<64xf32>)
    %6 = arith.addf %2, %3 : vector<32xf32>
    %7 = arith.addf %4, %5 : vector<64xf32>
    vector.yield %6, %7 : vector<32xf32>, vector<64xf32>
  }
  // CHECK-PROP: %[[A0:.*]] = arith.addf %[[R]]#2, %[[R]]#3 : vector<2xf32>
  // CHECK-PROP: %[[A1:.*]] = arith.addf %[[R]]#0, %[[R]]#1 : vector<1xf32>
  // Each lane writes its 2-element slice at laneid * 2 (see #map0).
  %id2 = affine.apply #map0()[%laneid]
  // CHECK-PROP: vector.transfer_write %[[A1]], {{.*}} : vector<1xf32>, memref<1024xf32>
  // CHECK-PROP: vector.transfer_write %[[A0]], {{.*}} : vector<2xf32>, memref<1024xf32>
  vector.transfer_write %r#0, %dest[%laneid] : vector<1xf32>, memref<1024xf32>
  vector.transfer_write %r#1, %dest[%id2] : vector<2xf32>, memref<1024xf32>
  return
}

// -----

// Same idea as elementwise propagation, but on scalars: the f32 addf is
// expected to move out of the warp op, which then yields the two "some_def"
// scalars as separate results.
// CHECK-PROP-LABEL: func @warp_propagate_scalar_arith(
//       CHECK-PROP:   %[[r:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} {
//       CHECK-PROP:     %[[some_def0:.*]] = "some_def"
//       CHECK-PROP:     %[[some_def1:.*]] = "some_def"
//       CHECK-PROP:     vector.yield %[[some_def0]], %[[some_def1]]
//       CHECK-PROP:   }
//       CHECK-PROP:   arith.addf %[[r]]#0, %[[r]]#1 : f32
func.func @warp_propagate_scalar_arith(%laneid: index) {
  %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %0 = "some_def"() : () -> (f32)
    %1 = "some_def"() : () -> (f32)
    %2 = arith.addf %0, %1 : f32
    vector.yield %2 : f32
  }
  vector.print %r : f32
  return
}

// -----

// A cast whose only operand (%i) is defined outside the warp op. The expected
// output contains no warp op at all: the arith.sitofp is emitted directly in
// the function body and its result is returned.
// CHECK-PROP-LABEL: func @warp_propagate_cast(
//   CHECK-PROP-NOT:   vector.warp_execute_on_lane_0
//       CHECK-PROP:   %[[result:.*]] = arith.sitofp %{{.*}} : i32 to f32
//       CHECK-PROP:   return %[[result]]
func.func @warp_propagate_cast(%laneid : index, %i : i32) -> (f32) {
  %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %casted = arith.sitofp %i : i32 to f32
    vector.yield %casted : f32
  }
  return %r : f32
}

// -----

#map0 = affine_map<()[s0] -> (s0 * 2)>

//  CHECK-PROP-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 * 2)>

// Transfer reads inside the warp op are expected to be replaced by per-lane
// reads outside of it: a 1-element read at laneid and a 2-element read at
// laneid * 2. No warp op should remain.
// The function-name expectation below is a plain match rather than a label
// match. NOTE(review): presumably so that the MAP0 capture made just before it
// stays in scope for the affine.apply expectation - confirm before changing.
// CHECK-PROP:   func @warp_propagate_read
//  CHECK-PROP-SAME:     (%[[ID:.*]]: index
func.func @warp_propagate_read(%laneid: index, %src: memref<1024xf32>, %dest: memref<1024xf32>) {
// CHECK-PROP-NOT: warp_execute_on_lane_0
// CHECK-PROP-DAG: %[[R0:.*]] = vector.transfer_read %arg1[%[[ID]]], %{{.*}} : memref<1024xf32>, vector<1xf32>
// CHECK-PROP-DAG: %[[ID2:.*]] = affine.apply #[[MAP0]]()[%[[ID]]]
// CHECK-PROP-DAG: %[[R1:.*]] = vector.transfer_read %arg1[%[[ID2]]], %{{.*}} : memref<1024xf32>, vector<2xf32>
// CHECK-PROP: vector.transfer_write %[[R0]], {{.*}} : vector<1xf32>, memref<1024xf32>
// CHECK-PROP: vector.transfer_write %[[R1]], {{.*}} : vector<2xf32>, memref<1024xf32>
  // Both reads start at index 0; the expected per-lane offsets above (laneid
  // and laneid * 2, with no extra constant term) rely on that.
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  %r:2 = vector.warp_execute_on_lane_0(%laneid)[32] ->(vector<1xf32>, vector<2xf32>) {
    %2 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>, vector<32xf32>
    %3 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>, vector<64xf32>
    vector.yield %2, %3 : vector<32xf32>, vector<64xf32>
  }
  %id2 = affine.apply #map0()[%laneid]
  vector.transfer_write %r#0, %dest[%laneid] : vector<1xf32>, memref<1024xf32>
  vector.transfer_write %r#1, %dest[%id2] : vector<2xf32>, memref<1024xf32>
  return
}

// -----

// Broadcast from vector<1xf32> to vector<32xf32> yielded as a vector<1xf32>
// warp result. The expected output has the warp op yield the vector<1xf32>
// source directly; no vector.broadcast is expected between the warp op and
// the print.
// CHECK-PROP-LABEL: func @fold_vector_broadcast(
//       CHECK-PROP:   %[[r:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>)
//       CHECK-PROP:     %[[some_def:.*]] = "some_def"
//       CHECK-PROP:     vector.yield %[[some_def]] : vector<1xf32>
//       CHECK-PROP:   vector.print %[[r]] : vector<1xf32>
func.func @fold_vector_broadcast(%laneid: index) {
  %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
    %0 = "some_def"() : () -> (vector<1xf32>)
    %1 = vector.broadcast %0 : vector<1xf32> to vector<32xf32>
    vector.yield %1 : vector<32xf32>
  }
  vector.print %r : vector<1xf32>
  return
}

// -----

// Broadcast from vector<1xf32> to vector<64xf32> yielded as a vector<2xf32>
// warp result. The expected output has the warp op yield the vector<1xf32>
// source, with a vector.broadcast to vector<2xf32> emitted after the warp op.
// CHECK-PROP-LABEL: func @extract_vector_broadcast(
//       CHECK-PROP:   %[[r:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>)
//       CHECK-PROP:     %[[some_def:.*]] = "some_def"
//       CHECK-PROP:     vector.yield %[[some_def]] : vector<1xf32>
//       CHECK-PROP:   %[[broadcasted:.*]] = vector.broadcast %[[r]] : vector<1xf32> to vector<2xf32>
//       CHECK-PROP:   vector.print %[[broadcasted]] : vector<2xf32>
func.func @extract_vector_broadcast(%laneid: index) {
  %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
    %0 = "some_def"() : () -> (vector<1xf32>)
    %1 = vector.broadcast %0 : vector<1xf32> to vector<64xf32>
    vector.yield %1 : vector<64xf32>
  }
  vector.print %r : vector<2xf32>
  return
}

// -----

// Broadcast from a scalar f32 to vector<64xf32> yielded as a vector<2xf32>
// warp result. The expected output has the warp op yield the f32 scalar, with
// a vector.broadcast from f32 to vector<2xf32> emitted after the warp op.
// CHECK-PROP-LABEL: func @extract_scalar_vector_broadcast(
//       CHECK-PROP:   %[[r:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (f32)
//       CHECK-PROP:     %[[some_def:.*]] = "some_def"
//       CHECK-PROP:     vector.yield %[[some_def]] : f32
//       CHECK-PROP:   %[[broadcasted:.*]] = vector.broadcast %[[r]] : f32 to vector<2xf32>
//       CHECK-PROP:   vector.print %[[broadcasted]] : vector<2xf32>
func.func @extract_scalar_vector_broadcast(%laneid: index) {
  %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
    %0 = "some_def"() : () -> (f32)
    %1 = vector.broadcast %0 : f32 to vector<64xf32>
    vector.yield %1 : vector<64xf32>
  }
  vector.print %r : vector<2xf32>
  return
}

// -----

// An scf.for nested inside a warp op. The expected output moves the loop
// outside: a first warp op produces the (distributed) initial value, the
// scf.for iterates on vector<4xf32>, and the loop body is wrapped in a new
// warp op that receives the iteration argument through `args`.
// CHECK-PROP-LABEL:   func @warp_scf_for(
// CHECK-PROP: %[[INI:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>) {
// CHECK-PROP:   %[[INI1:.*]] = "some_def"() : () -> vector<128xf32>
// CHECK-PROP:   vector.yield %[[INI1]] : vector<128xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[F:.*]] = scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[FARG:.*]] = %[[INI]]) -> (vector<4xf32>) {
// CHECK-PROP:   %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG]] : vector<4xf32>) -> (vector<4xf32>) {
// CHECK-PROP:    ^bb0(%[[ARG:.*]]: vector<128xf32>):
// CHECK-PROP:      %[[ACC:.*]] = "some_def"(%[[ARG]]) : (vector<128xf32>) -> vector<128xf32>
// CHECK-PROP:      vector.yield %[[ACC]] : vector<128xf32>
// CHECK-PROP:   }
// CHECK-PROP:   scf.yield %[[W]] : vector<4xf32>
// CHECK-PROP: }
// CHECK-PROP: "some_use"(%[[F]]) : (vector<4xf32>) -> ()
func.func @warp_scf_for(%arg0: index) {
  %c128 = arith.constant 128 : index
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %0 = vector.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) {
    %ini = "some_def"() : () -> (vector<128xf32>)
    %3 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini) -> (vector<128xf32>) {
      %acc = "some_def"(%arg4) : (vector<128xf32>) -> (vector<128xf32>)
      scf.yield %acc : vector<128xf32>
    }
    vector.yield %3 : vector<128xf32>
  }
  "some_use"(%0) : (vector<4xf32>) -> ()
  return
}

// -----

// Like @warp_scf_for, but the loop body also uses a value defined inside the
// warp op and outside the loop (%use_from_above). In the expected output that
// value becomes a second result of the first warp op and is passed to the
// warp op inside the loop as an additional `args` operand.
// CHECK-PROP-LABEL:   func @warp_scf_for_use_from_above(
// CHECK-PROP: %[[INI:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) {
// CHECK-PROP:   %[[INI1:.*]] = "some_def"() : () -> vector<128xf32>
// CHECK-PROP:   %[[USE:.*]] = "some_def_above"() : () -> vector<128xf32>
// CHECK-PROP:   vector.yield %[[INI1]], %[[USE]] : vector<128xf32>, vector<128xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[F:.*]] = scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[FARG:.*]] = %[[INI]]#0) -> (vector<4xf32>) {
// CHECK-PROP:   %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG]], %[[INI]]#1 : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>) {
// CHECK-PROP:    ^bb0(%[[ARG0:.*]]: vector<128xf32>, %[[ARG1:.*]]: vector<128xf32>):
// CHECK-PROP:      %[[ACC:.*]] = "some_def"(%[[ARG0]], %[[ARG1]]) : (vector<128xf32>, vector<128xf32>) -> vector<128xf32>
// CHECK-PROP:      vector.yield %[[ACC]] : vector<128xf32>
// CHECK-PROP:   }
// CHECK-PROP:   scf.yield %[[W]] : vector<4xf32>
// CHECK-PROP: }
// CHECK-PROP: "some_use"(%[[F]]) : (vector<4xf32>) -> ()
func.func @warp_scf_for_use_from_above(%arg0: index) {
  %c128 = arith.constant 128 : index
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %0 = vector.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) {
    %ini = "some_def"() : () -> (vector<128xf32>)
    %use_from_above = "some_def_above"() : () -> (vector<128xf32>)
    %3 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini) -> (vector<128xf32>) {
      %acc = "some_def"(%arg4, %use_from_above) : (vector<128xf32>, vector<128xf32>) -> (vector<128xf32>)
      scf.yield %acc : vector<128xf32>
    }
    vector.yield %3 : vector<128xf32>
  }
  "some_use"(%0) : (vector<4xf32>) -> ()
  return
}

// -----

// Loop with two iteration arguments whose scf.yield swaps them (%acc2 first,
// %acc1 second). The expected output keeps the swap inside the inner warp
// op's vector.yield, while the outer scf.yield forwards the warp results in
// their natural order (#0, #1).
// CHECK-PROP-LABEL:   func @warp_scf_for_swap(
// CHECK-PROP: %[[INI:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) {
// CHECK-PROP:   %[[INI1:.*]] = "some_def"() : () -> vector<128xf32>
// CHECK-PROP:   %[[INI2:.*]] = "some_def"() : () -> vector<128xf32>
// CHECK-PROP:   vector.yield %[[INI1]], %[[INI2]] : vector<128xf32>, vector<128xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[F:.*]]:2 = scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[FARG1:.*]] = %[[INI]]#0, %[[FARG2:.*]] = %[[INI]]#1) -> (vector<4xf32>, vector<4xf32>) {
// CHECK-PROP:   %[[W:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG1]], %[[FARG2]] : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>, vector<4xf32>) {
// CHECK-PROP:    ^bb0(%[[ARG1:.*]]: vector<128xf32>, %[[ARG2:.*]]: vector<128xf32>):
// CHECK-PROP:      %[[ACC1:.*]] = "some_def"(%[[ARG1]]) : (vector<128xf32>) -> vector<128xf32>
// CHECK-PROP:      %[[ACC2:.*]] = "some_def"(%[[ARG2]]) : (vector<128xf32>) -> vector<128xf32>
// CHECK-PROP:      vector.yield %[[ACC2]], %[[ACC1]] : vector<128xf32>, vector<128xf32>
// CHECK-PROP:   }
// CHECK-PROP:   scf.yield %[[W]]#0, %[[W]]#1 : vector<4xf32>, vector<4xf32>
// CHECK-PROP: }
// CHECK-PROP: "some_use"(%[[F]]#0) : (vector<4xf32>) -> ()
// CHECK-PROP: "some_use"(%[[F]]#1) : (vector<4xf32>) -> ()
func.func @warp_scf_for_swap(%arg0: index) {
  %c128 = arith.constant 128 : index
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %0:2 = vector.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>, vector<4xf32>) {
    %ini1 = "some_def"() : () -> (vector<128xf32>)
    %ini2 = "some_def"() : () -> (vector<128xf32>)
    %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini1, %arg5 = %ini2) -> (vector<128xf32>, vector<128xf32>) {
      %acc1 = "some_def"(%arg4) : (vector<128xf32>) -> (vector<128xf32>)
      %acc2 = "some_def"(%arg5) : (vector<128xf32>) -> (vector<128xf32>)
      scf.yield %acc2, %acc1 : vector<128xf32>, vector<128xf32>
    }
    vector.yield %3#0, %3#1 : vector<128xf32>, vector<128xf32>
  }
  "some_use"(%0#0) : (vector<4xf32>) -> ()
  "some_use"(%0#1) : (vector<4xf32>) -> ()
  return
}

// -----

// Loop with no iteration arguments and a warp op with no results. The
// expected output still places the scf.for on the outside, with a result-less
// warp op containing "some_op" as the only thing in the loop body.
// CHECK-PROP-LABEL:   func @warp_scf_for_swap_no_yield(
// CHECK-PROP:           scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
// CHECK-PROP-NEXT:        vector.warp_execute_on_lane_0(%{{.*}})[32] {
// CHECK-PROP-NEXT:          "some_op"() : () -> ()
// CHECK-PROP-NEXT:        }
// CHECK-PROP-NEXT:      }
func.func @warp_scf_for_swap_no_yield(%arg0: index) {
  %c128 = arith.constant 128 : index
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  vector.warp_execute_on_lane_0(%arg0)[32] {
    scf.for %arg3 = %c0 to %c128 step %c1 {
      "some_op"() : () -> ()
    }
  }
  return
}

// -----

#map = affine_map<()[s0] -> (s0 * 4)>
#map1 = affine_map<()[s0] -> (s0 * 128 + 128)>
#map2 = affine_map<()[s0] -> (s0 * 4 + 128)>

// Warp op yielding one value unrelated to the loop (%def) plus both results
// of an scf.for whose body is made of transfer reads and elementwise adds.
// Expected output: only a single warp op remains, yielding just the
// "some_def" value; the initial reads, the loop and everything in its body
// are outside any warp op and operate on vector<4xf32>.
// CHECK-PROP-LABEL:   func @warp_scf_for_multiple_yield(
//       CHECK-PROP:   vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) {
//  CHECK-PROP-NEXT:     "some_def"() : () -> vector<32xf32>
//  CHECK-PROP-NEXT:     vector.yield %{{.*}} : vector<32xf32>
//  CHECK-PROP-NEXT:   }
//   CHECK-PROP-NOT:   vector.warp_execute_on_lane_0
//       CHECK-PROP:   vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32>
//       CHECK-PROP:   vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32>
//       CHECK-PROP:   %{{.*}}:2 = scf.for {{.*}} -> (vector<4xf32>, vector<4xf32>) {
//   CHECK-PROP-NOT:     vector.warp_execute_on_lane_0
//       CHECK-PROP:     vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32>
//       CHECK-PROP:     vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32>
//       CHECK-PROP:     arith.addf {{.*}} : vector<4xf32>
//       CHECK-PROP:     arith.addf {{.*}} : vector<4xf32>
//       CHECK-PROP:     scf.yield {{.*}} : vector<4xf32>, vector<4xf32>
//       CHECK-PROP:   }
func.func @warp_scf_for_multiple_yield(%arg0: index, %arg1: memref<?xf32>, %arg2: memref<?xf32>) {
  %c256 = arith.constant 256 : index
  %c128 = arith.constant 128 : index
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  %0:3 = vector.warp_execute_on_lane_0(%arg0)[32] ->
  (vector<1xf32>, vector<4xf32>, vector<4xf32>) {
    %def = "some_def"() : () -> (vector<32xf32>)
    %r1 = vector.transfer_read %arg2[%c0], %cst {in_bounds = [true]} : memref<?xf32>, vector<128xf32>
    %r2 = vector.transfer_read %arg2[%c128], %cst {in_bounds = [true]} : memref<?xf32>, vector<128xf32>
    %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %r1, %arg5 = %r2)
    -> (vector<128xf32>, vector<128xf32>) {
      %o1 = affine.apply #map1()[%arg3]
      %o2 = affine.apply #map2()[%arg3]
      %4 = vector.transfer_read %arg1[%o1], %cst {in_bounds = [true]} : memref<?xf32>, vector<128xf32>
      %5 = vector.transfer_read %arg1[%o2], %cst {in_bounds = [true]} : memref<?xf32>, vector<128xf32>
      %6 = arith.addf %4, %arg4 : vector<128xf32>
      %7 = arith.addf %5, %arg5 : vector<128xf32>
      scf.yield %6, %7 : vector<128xf32>, vector<128xf32>
    }
    vector.yield %def, %3#0, %3#1 :  vector<32xf32>, vector<128xf32>, vector<128xf32>
  }
  %1 = affine.apply #map()[%arg0]
  vector.transfer_write %0#1, %arg2[%1] {in_bounds = [true]} : vector<4xf32>, memref<?xf32>
  %2 = affine.apply #map2()[%arg0]
  vector.transfer_write %0#2, %arg2[%2] {in_bounds = [true]} : vector<4xf32>, memref<?xf32>
  "some_use"(%0#0) : (vector<1xf32>) -> ()
  return
}

// -----

// An add-reduction of a vector<32xf32> inside a 32-lane warp op. Expected
// output: the warp op yields the un-reduced vector (one element per lane);
// outside it, the element is extracted and combined across lanes with five
// `gpu.shuffle xor` + arith.addf steps using offsets 1, 2, 4, 8, 16 and
// width 32. The last sum is the returned f32.
// CHECK-PROP-LABEL: func @vector_reduction(
//  CHECK-PROP-SAME:     %[[laneid:.*]]: index)
//   CHECK-PROP-DAG:   %[[c1:.*]] = arith.constant 1 : i32
//   CHECK-PROP-DAG:   %[[c2:.*]] = arith.constant 2 : i32
//   CHECK-PROP-DAG:   %[[c4:.*]] = arith.constant 4 : i32
//   CHECK-PROP-DAG:   %[[c8:.*]] = arith.constant 8 : i32
//   CHECK-PROP-DAG:   %[[c16:.*]] = arith.constant 16 : i32
//   CHECK-PROP-DAG:   %[[c32:.*]] = arith.constant 32 : i32
//       CHECK-PROP:   %[[warp_op:.*]] = vector.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<1xf32>) {
//       CHECK-PROP:     vector.yield %{{.*}} : vector<32xf32>
//       CHECK-PROP:   }
//       CHECK-PROP:   %[[a:.*]] = vector.extract %[[warp_op]][0] : vector<1xf32>
//       CHECK-PROP:   %[[r0:.*]], %{{.*}} = gpu.shuffle  xor %[[a]], %[[c1]], %[[c32]]
//       CHECK-PROP:   %[[a0:.*]] = arith.addf %[[a]], %[[r0]]
//       CHECK-PROP:   %[[r1:.*]], %{{.*}} = gpu.shuffle  xor %[[a0]], %[[c2]], %[[c32]]
//       CHECK-PROP:   %[[a1:.*]] = arith.addf %[[a0]], %[[r1]]
//       CHECK-PROP:   %[[r2:.*]], %{{.*}} = gpu.shuffle  xor %[[a1]], %[[c4]], %[[c32]]
//       CHECK-PROP:   %[[a2:.*]] = arith.addf %[[a1]], %[[r2]]
//       CHECK-PROP:   %[[r3:.*]], %{{.*}} = gpu.shuffle  xor %[[a2]], %[[c8]], %[[c32]]
//       CHECK-PROP:   %[[a3:.*]] = arith.addf %[[a2]], %[[r3]]
//       CHECK-PROP:   %[[r4:.*]], %{{.*}} = gpu.shuffle  xor %[[a3]], %[[c16]], %[[c32]]
//       CHECK-PROP:   %[[a4:.*]] = arith.addf %[[a3]], %[[r4]]
//       CHECK-PROP:   return %[[a4]] : f32
func.func @vector_reduction(%laneid: index) -> (f32) {
  %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %0 = "some_def"() : () -> (vector<32xf32>)
    %1 = vector.reduction <add>, %0 : vector<32xf32> into f32
    vector.yield %1 : f32
  }
  return %r : f32
}

// -----

// Reduction fed by transfer_reads inside the warp region, accumulated with a
// value loaded from a 0-d memref and written back to that memref. The
// expectations require a first warp op returning a vector<f32>, followed by a
// second warp op that contains the transfer_write of that result.
func.func @vector_reduction(%laneid: index, %m0: memref<4x2x32xf32>, %m1: memref<f32>) {
  %zero_idx = arith.constant 0 : index
  %pad = arith.constant 0.0 : f32
  //     CHECK-D: %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<f32>) {
  //     CHECK-D: vector.warp_execute_on_lane_0(%{{.*}})[32] {
  //     CHECK-D:   vector.transfer_write %[[R]], %{{.*}}[] : vector<f32>, memref<f32>
  vector.warp_execute_on_lane_0(%laneid)[32] {
    %row = vector.transfer_read %m0[%zero_idx, %zero_idx, %zero_idx], %pad {in_bounds = [true]} : memref<4x2x32xf32>, vector<32xf32>
    %acc_vec = vector.transfer_read %m1[], %pad : memref<f32>, vector<f32>
    %acc = vector.extractelement %acc_vec[] : vector<f32>
    %row_sum = vector.reduction <add>, %row : vector<32xf32> into f32
    %total = arith.addf %row_sum, %acc : f32
    %total_vec = vector.broadcast %total : f32 to vector<f32>
    vector.transfer_write %total_vec, %m1[] : vector<f32>, memref<f32>
  }
  return
}

// -----

// Add-reduction of 64 elements on a 32-lane warp. Each lane is expected to
// receive a vector<2xf32> slice, reduce it locally with vector.reduction, and
// then combine across lanes with xor shuffles (offsets 1, 2, 4, 8, 16;
// width 32), each followed by an arith.addf.
// CHECK-PROP-LABEL: func @vector_reduction_large(
//  CHECK-PROP-SAME:     %[[laneid:.*]]: index)
//   CHECK-PROP-DAG:   %[[c1:.*]] = arith.constant 1 : i32
//   CHECK-PROP-DAG:   %[[c2:.*]] = arith.constant 2 : i32
//   CHECK-PROP-DAG:   %[[c4:.*]] = arith.constant 4 : i32
//   CHECK-PROP-DAG:   %[[c8:.*]] = arith.constant 8 : i32
//   CHECK-PROP-DAG:   %[[c16:.*]] = arith.constant 16 : i32
//   CHECK-PROP-DAG:   %[[c32:.*]] = arith.constant 32 : i32
//       CHECK-PROP:   %[[warp_op:.*]] = vector.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<2xf32>) {
//       CHECK-PROP:     vector.yield %{{.*}} : vector<64xf32>
//       CHECK-PROP:   }
//       CHECK-PROP:   %[[a:.*]] = vector.reduction <add>, %[[warp_op]] : vector<2xf32> into f32
//       CHECK-PROP:   %[[r0:.*]], %{{.*}} = gpu.shuffle  xor %[[a]], %[[c1]], %[[c32]]
//       CHECK-PROP:   %[[a0:.*]] = arith.addf %[[a]], %[[r0]]
//       CHECK-PROP:   %[[r1:.*]], %{{.*}} = gpu.shuffle  xor %[[a0]], %[[c2]], %[[c32]]
//       CHECK-PROP:   %[[a1:.*]] = arith.addf %[[a0]], %[[r1]]
//       CHECK-PROP:   %[[r2:.*]], %{{.*}} = gpu.shuffle  xor %[[a1]], %[[c4]], %[[c32]]
//       CHECK-PROP:   %[[a2:.*]] = arith.addf %[[a1]], %[[r2]]
//       CHECK-PROP:   %[[r3:.*]], %{{.*}} = gpu.shuffle  xor %[[a2]], %[[c8]], %[[c32]]
//       CHECK-PROP:   %[[a3:.*]] = arith.addf %[[a2]], %[[r3]]
//       CHECK-PROP:   %[[r4:.*]], %{{.*}} = gpu.shuffle  xor %[[a3]], %[[c16]], %[[c32]]
//       CHECK-PROP:   %[[a4:.*]] = arith.addf %[[a3]], %[[r4]]
//       CHECK-PROP:   return %[[a4]] : f32
func.func @vector_reduction_large(%laneid: index) -> f32 {
  %sum = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %input = "some_def"() : () -> vector<64xf32>
    %reduced = vector.reduction <add>, %input : vector<64xf32> into f32
    vector.yield %reduced : f32
  }
  return %sum : f32
}

// -----

// 64-element add-reduction with an accumulator operand. The expectations
// yield the accumulator as a second (f32) warp result, run the per-lane
// reduction and the xor-shuffle/addf chain on the vector part only, and add
// the accumulator once at the very end.
// CHECK-PROP-LABEL: func @vector_reduction_acc(
//  CHECK-PROP-SAME:     %[[laneid:.*]]: index)
//   CHECK-PROP-DAG:   %[[c1:.*]] = arith.constant 1 : i32
//   CHECK-PROP-DAG:   %[[c2:.*]] = arith.constant 2 : i32
//   CHECK-PROP-DAG:   %[[c4:.*]] = arith.constant 4 : i32
//   CHECK-PROP-DAG:   %[[c8:.*]] = arith.constant 8 : i32
//   CHECK-PROP-DAG:   %[[c16:.*]] = arith.constant 16 : i32
//   CHECK-PROP-DAG:   %[[c32:.*]] = arith.constant 32 : i32
//       CHECK-PROP:   %[[warp_op:.*]]:2 = vector.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<2xf32>, f32) {
//       CHECK-PROP:     vector.yield %{{.*}}, %{{.*}} : vector<64xf32>, f32
//       CHECK-PROP:   }
//       CHECK-PROP:   %[[a:.*]] = vector.reduction <add>, %[[warp_op]]#0 : vector<2xf32> into f32
//       CHECK-PROP:   %[[r0:.*]], %{{.*}} = gpu.shuffle  xor %[[a]], %[[c1]], %[[c32]]
//       CHECK-PROP:   %[[a0:.*]] = arith.addf %[[a]], %[[r0]]
//       CHECK-PROP:   %[[r1:.*]], %{{.*}} = gpu.shuffle  xor %[[a0]], %[[c2]], %[[c32]]
//       CHECK-PROP:   %[[a1:.*]] = arith.addf %[[a0]], %[[r1]]
//       CHECK-PROP:   %[[r2:.*]], %{{.*}} = gpu.shuffle  xor %[[a1]], %[[c4]], %[[c32]]
//       CHECK-PROP:   %[[a2:.*]] = arith.addf %[[a1]], %[[r2]]
//       CHECK-PROP:   %[[r3:.*]], %{{.*}} = gpu.shuffle  xor %[[a2]], %[[c8]], %[[c32]]
//       CHECK-PROP:   %[[a3:.*]] = arith.addf %[[a2]], %[[r3]]
//       CHECK-PROP:   %[[r4:.*]], %{{.*}} = gpu.shuffle  xor %[[a3]], %[[c16]], %[[c32]]
//       CHECK-PROP:   %[[a4:.*]] = arith.addf %[[a3]], %[[r4]]
//       CHECK-PROP:   %[[a5:.*]] = arith.addf %[[a4]], %[[warp_op]]#1
//       CHECK-PROP:   return %[[a5]] : f32
func.func @vector_reduction_acc(%laneid: index) -> f32 {
  %sum = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %input = "some_def"() : () -> vector<64xf32>
    %init = "some_def"() : () -> f32
    %reduced = vector.reduction <add>, %input, %init : vector<64xf32> into f32
    vector.yield %reduced : f32
  }
  return %sum : f32
}

// -----

// Two different arith.addf results, both depending on the same definition,
// are yielded from the warp region. The expectations keep a two-result warp
// op and forbid any arith.addf between the warp op header and its
// vector.yield.
// CHECK-PROP-LABEL:   func @warp_duplicate_yield(
func.func @warp_duplicate_yield(%laneid: index) -> (vector<1xf32>, vector<1xf32>) {
  //   CHECK-PROP: %{{.*}}:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xf32>)
  %res:2 = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>, vector<1xf32>) {
    %lhs = "some_def"() : () -> vector<32xf32>
    %rhs = "some_def"() : () -> vector<32xf32>
    %mixed = arith.addf %lhs, %rhs : vector<32xf32>
    %doubled = arith.addf %lhs, %lhs : vector<32xf32>
// CHECK-PROP-NOT:   arith.addf
//     CHECK-PROP:   vector.yield %{{.*}}, %{{.*}} : vector<32xf32>, vector<32xf32>
    vector.yield %mixed, %doubled : vector<32xf32>, vector<32xf32>
  }
  return %res#0, %res#1 : vector<1xf32>, vector<1xf32>
}

// -----

// A splat constant yielded from the warp region. The expectations replace the
// whole warp op with a vector<1xf32> splat constant of the same value that is
// returned directly.
// CHECK-PROP-LABEL: func @warp_constant(
//       CHECK-PROP:   %[[C:.*]] = arith.constant dense<2.000000e+00> : vector<1xf32>
//       CHECK-PROP:   return %[[C]] : vector<1xf32>
func.func @warp_constant(%laneid: index) -> vector<1xf32> {
  %dist = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
    %splat = arith.constant dense<2.0> : vector<32xf32>
    vector.yield %splat : vector<32xf32>
  }
  return %dist : vector<1xf32>
}

// -----

// TODO: We could use warp shuffles instead of broadcasting the entire vector.

// Scalar extract at static position 9 from a 64-element vector that is held
// as two elements per lane. The expectations yield the vector distributed
// (vector<2xf32>), extract in-lane index 1, and fetch the value with an idx
// shuffle whose source-lane operand is the i32 constant 5.
// CHECK-PROP-LABEL: func.func @vector_extract_1d(
//   CHECK-PROP-DAG:   %[[C5_I32:.*]] = arith.constant 5 : i32
//   CHECK-PROP-DAG:   %[[C1:.*]] = arith.constant 1 : index
//       CHECK-PROP:   %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<2xf32>) {
//       CHECK-PROP:     %[[V:.*]] = "some_def"() : () -> vector<64xf32>
//       CHECK-PROP:     vector.yield %[[V]] : vector<64xf32>
//       CHECK-PROP:   }
//       CHECK-PROP:   %[[E:.*]] = vector.extractelement %[[R]][%[[C1]] : index] : vector<2xf32>
//       CHECK-PROP:   %[[SHUFFLED:.*]], %{{.*}} = gpu.shuffle  idx %[[E]], %[[C5_I32]]
//       CHECK-PROP:   return %[[SHUFFLED]] : f32
func.func @vector_extract_1d(%laneid: index) -> f32 {
  %elem = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %source = "some_def"() : () -> vector<64xf32>
    %picked = vector.extract %source[9] : vector<64xf32>
    vector.yield %picked : f32
  }
  return %elem : f32
}

// -----

// Extract of row 2 from a vector<5x96xf32> where the extracted row is
// distributed (96 -> 3 elements per lane). The expectations yield the source
// as vector<5x3xf32> and perform the row extract outside the warp op.
// CHECK-PROP-LABEL: func.func @vector_extract_2d(
//       CHECK-PROP:   %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<5x3xf32>) {
//       CHECK-PROP:     %[[V:.*]] = "some_def"
//       CHECK-PROP:     vector.yield %[[V]] : vector<5x96xf32>
//       CHECK-PROP:   }
//       CHECK-PROP:   %[[E:.*]] = vector.extract %[[W]][2] : vector<5x3xf32>
//       CHECK-PROP:   return %[[E]]
func.func @vector_extract_2d(%laneid: index) -> vector<3xf32> {
  %row = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<3xf32>) {
    %matrix = "some_def"() : () -> vector<5x96xf32>
    %slice = vector.extract %matrix[2] : vector<5x96xf32>
    vector.yield %slice : vector<96xf32>
  }
  return %row : vector<3xf32>
}

// -----

// Scalar extract at [1, 2] from a 2-D vector. The expectations yield the
// whole vector<5x96xf32> from the warp op without shrinking it and perform
// the same extract outside the warp op.
// CHECK-PROP-LABEL: func.func @vector_extract_2d_broadcast_scalar(
//       CHECK-PROP:   %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<5x96xf32>) {
//       CHECK-PROP:     %[[V:.*]] = "some_def"
//       CHECK-PROP:     vector.yield %[[V]] : vector<5x96xf32>
//       CHECK-PROP:   }
//       CHECK-PROP:   %[[E:.*]] = vector.extract %[[W]][1, 2] : vector<5x96xf32>
//       CHECK-PROP:   return %[[E]]
func.func @vector_extract_2d_broadcast_scalar(%laneid: index) -> f32 {
  %elem = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %matrix = "some_def"() : () -> vector<5x96xf32>
    %picked = vector.extract %matrix[1, 2] : vector<5x96xf32>
    vector.yield %picked : f32
  }
  return %elem : f32
}

// -----

// Extract of row 2 where the warp result type equals the yielded type
// (vector<96xf32>, i.e. not shrunk per lane). The expectations yield the full
// vector<5x96xf32> and redo the extract outside the warp op.
// CHECK-PROP-LABEL: func.func @vector_extract_2d_broadcast(
//       CHECK-PROP:   %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<5x96xf32>) {
//       CHECK-PROP:     %[[V:.*]] = "some_def"
//       CHECK-PROP:     vector.yield %[[V]] : vector<5x96xf32>
//       CHECK-PROP:   }
//       CHECK-PROP:   %[[E:.*]] = vector.extract %[[W]][2] : vector<5x96xf32>
//       CHECK-PROP:   return %[[E]]
func.func @vector_extract_2d_broadcast(%laneid: index) -> vector<96xf32> {
  %row = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<96xf32>) {
    %matrix = "some_def"() : () -> vector<5x96xf32>
    %slice = vector.extract %matrix[2] : vector<5x96xf32>
    vector.yield %slice : vector<96xf32>
  }
  return %row : vector<96xf32>
}

// -----

// Extract of slice 2 from a 3-D vector whose middle dimension is distributed
// (128 -> 4 per lane). The expectations yield the source as
// vector<8x4x96xf32> and perform the extract outside the warp op.
// CHECK-PROP-LABEL: func.func @vector_extract_3d(
//       CHECK-PROP:   %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<8x4x96xf32>) {
//       CHECK-PROP:     %[[V:.*]] = "some_def"
//       CHECK-PROP:     vector.yield %[[V]] : vector<8x128x96xf32>
//       CHECK-PROP:   }
//       CHECK-PROP:   %[[E:.*]] = vector.extract %[[W]][2] : vector<8x4x96xf32>
//       CHECK-PROP:   return %[[E]]
func.func @vector_extract_3d(%laneid: index) -> vector<4x96xf32> {
  %plane = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x96xf32>) {
    %cube = "some_def"() : () -> vector<8x128x96xf32>
    %slice = vector.extract %cube[2] : vector<8x128x96xf32>
    vector.yield %slice : vector<128x96xf32>
  }
  return %plane : vector<4x96xf32>
}

// -----

// extractelement on a 0-d vector. The expectations yield the vector<f32>
// from the warp op and perform the extractelement outside of it.
// CHECK-PROP-LABEL: func.func @vector_extractelement_0d(
//       CHECK-PROP:   %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<f32>) {
//       CHECK-PROP:     %[[V:.*]] = "some_def"() : () -> vector<f32>
//       CHECK-PROP:     vector.yield %[[V]] : vector<f32>
//       CHECK-PROP:   }
//       CHECK-PROP:   %[[E:.*]] = vector.extractelement %[[R]][] : vector<f32>
//       CHECK-PROP:   return %[[E]] : f32
func.func @vector_extractelement_0d(%laneid: index) -> f32 {
  %elem = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %rank0 = "some_def"() : () -> vector<f32>
    %picked = vector.extractelement %rank0[] : vector<f32>
    vector.yield %picked : f32
  }
  return %elem : f32
}

// -----

// extractelement at index 0 of a single-element vector. The expectations
// place the index constant before the warp op, yield the vector<1xf32>
// unchanged, and perform the extractelement outside the warp op.
// CHECK-PROP-LABEL: func.func @vector_extractelement_1element(
//       CHECK-PROP:   %[[C0:.*]] = arith.constant 0 : index
//       CHECK-PROP:   %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) {
//       CHECK-PROP:     %[[V:.*]] = "some_def"() : () -> vector<1xf32>
//       CHECK-PROP:     vector.yield %[[V]] : vector<1xf32>
//       CHECK-PROP:   }
//       CHECK-PROP:   %[[E:.*]] = vector.extractelement %[[R]][%[[C0]] : index] : vector<1xf32>
//       CHECK-PROP:   return %[[E]] : f32
func.func @vector_extractelement_1element(%laneid: index) -> f32 {
  %elem = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %single = "some_def"() : () -> vector<1xf32>
    %idx0 = arith.constant 0 : index
    %picked = vector.extractelement %single[%idx0 : index] : vector<1xf32>
    vector.yield %picked : f32
  }
  return %elem : f32
}

// -----

// extractelement at a dynamic position from a 96-element vector held as three
// elements per lane. The expectations compute the source lane with
// `pos ceildiv 3` and the in-lane index with `pos mod 3`, extract in-lane,
// and fetch the value with an idx shuffle of width 32.
//       CHECK-PROP: #[[$map:.*]] = affine_map<()[s0] -> (s0 ceildiv 3)>
//       CHECK-PROP: #[[$map1:.*]] = affine_map<()[s0] -> (s0 mod 3)>
// CHECK-PROP-LABEL: func.func @vector_extractelement_1d(
//  CHECK-PROP-SAME:     %[[LANEID:.*]]: index, %[[POS:.*]]: index
//   CHECK-PROP-DAG:   %[[C32:.*]] = arith.constant 32 : i32
//       CHECK-PROP:   %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<3xf32>) {
//       CHECK-PROP:     %[[V:.*]] = "some_def"
//       CHECK-PROP:     vector.yield %[[V]] : vector<96xf32>
//       CHECK-PROP:   }
//       CHECK-PROP:   %[[FROM_LANE:.*]] = affine.apply #[[$map]]()[%[[POS]]]
//       CHECK-PROP:   %[[DISTR_POS:.*]] = affine.apply #[[$map1]]()[%[[POS]]]
//       CHECK-PROP:   %[[EXTRACTED:.*]] = vector.extractelement %[[W]][%[[DISTR_POS]] : index] : vector<3xf32>
//       CHECK-PROP:   %[[FROM_LANE_I32:.*]] = arith.index_cast %[[FROM_LANE]] : index to i32
//       CHECK-PROP:   %[[SHUFFLED:.*]], %{{.*}} = gpu.shuffle  idx %[[EXTRACTED]], %[[FROM_LANE_I32]], %[[C32]] : f32
//       CHECK-PROP:   return %[[SHUFFLED]]
func.func @vector_extractelement_1d(%laneid: index, %pos: index) -> f32 {
  %elem = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
    %source = "some_def"() : () -> vector<96xf32>
    %picked = vector.extractelement %source[%pos : index] : vector<96xf32>
    vector.yield %picked : f32
  }
  return %elem : f32
}

// -----

// A transfer_read inside the warp region whose distributed result is written
// at a lane-dependent index outside of it. The expectations require the warp
// op to disappear, leaving a per-lane vector<1x1xf32> read of the source
// memref at [0, laneid] that feeds the write.
// The source memref is captured from the signature rather than matched by its
// printed SSA name, so the test does not depend on value naming.
// CHECK-PROP-LABEL: func @lane_dependent_warp_propagate_read
//  CHECK-PROP-SAME:   %[[ID:.*]]: index, %[[SRC:[^ :]*]]: memref<1x1024xf32>
func.func @lane_dependent_warp_propagate_read(
    %laneid: index, %src: memref<1x1024xf32>, %dest: memref<1x1024xf32>) {
  // CHECK-PROP-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-PROP-NOT: vector.warp_execute_on_lane_0
  // CHECK-PROP-DAG: %[[R0:.*]] = vector.transfer_read %[[SRC]][%[[C0]], %[[ID]]], %{{.*}} : memref<1x1024xf32>, vector<1x1xf32>
  // CHECK-PROP: vector.transfer_write %[[R0]], {{.*}} : vector<1x1xf32>, memref<1x1024xf32>
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x1xf32>) {
    %2 = vector.transfer_read %src[%c0, %c0], %cst : memref<1x1024xf32>, vector<1x32xf32>
    vector.yield %2 : vector<1x32xf32>
  }
  vector.transfer_write %r, %dest[%c0, %laneid] : vector<1x1xf32>, memref<1x1024xf32>
  return
}

// -----

// A transfer_read whose result has another use inside the warp region
// ("blocking_use") in addition to being yielded. The expectations require the
// read, the blocking use and the yield to stay inside the warp op, in that
// order and on consecutive lines.
// The function anchor is a label so that matching for this test is isolated
// from neighbouring test cases.
// CHECK-PROP-LABEL: func @dont_duplicate_read
func.func @dont_duplicate_read(
  %laneid: index, %src: memref<1024xf32>) -> vector<1xf32> {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
//       CHECK-PROP:   vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) {
//  CHECK-PROP-NEXT:     vector.transfer_read
//  CHECK-PROP-NEXT:     "blocking_use"
//  CHECK-PROP-NEXT:     vector.yield
  %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
    %2 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>, vector<32xf32>
    "blocking_use"(%2) : (vector<32xf32>) -> ()
    vector.yield %2 : vector<32xf32>
  }
  return %r : vector<1xf32>
}

// -----

// The same value is yielded twice from the warp region. The expectations
// require a warp op with a single vector<1xf32> result that is yielded once
// and returned for both function results.
// The function anchor is a label so that matching for this test is isolated
// from neighbouring test cases.
// CHECK-PROP-LABEL: func @dedup
func.func @dedup(%laneid: index, %v0: vector<4xf32>, %v1: vector<4xf32>)
    -> (vector<1xf32>, vector<1xf32>) {

  // CHECK-PROP: %[[SINGLE_RES:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>) {
  %r:2 = vector.warp_execute_on_lane_0(%laneid)[32]
      args(%v0, %v1 : vector<4xf32>, vector<4xf32>) -> (vector<1xf32>, vector<1xf32>) {
    ^bb0(%arg0: vector<128xf32>, %arg1: vector<128xf32>):

    // CHECK-PROP: %[[SINGLE_VAL:.*]] = "some_def"(%{{.*}}) : (vector<128xf32>) -> vector<32xf32>
    %2 = "some_def"(%arg0) : (vector<128xf32>) -> vector<32xf32>

    // CHECK-PROP: vector.yield %[[SINGLE_VAL]] : vector<32xf32>
    vector.yield %2, %2 : vector<32xf32>, vector<32xf32>
  }

  // CHECK-PROP: return %[[SINGLE_RES]], %[[SINGLE_RES]] : vector<1xf32>, vector<1xf32>
  return %r#0, %r#1 : vector<1xf32>, vector<1xf32>
}

// -----

// Warp op whose operands and results have identical types inside and outside
// the region (f32, vector<f32>, vector<1xf32>, vector<1x1xf32>). The
// expectations for the scf.if lowering are: inside the scf.if, every operand
// is read from a memref in memory space 3 (at index 0 where indexed), the
// payload ops run, and every result is written to such a memref; after a
// gpu.barrier, each result is read back and returned. No vector.yield may
// remain.
// The function anchor is a label so that matching for this test is isolated
// from neighbouring test cases.
// CHECK-SCF-IF-LABEL: func @warp_execute_has_broadcast_semantics
func.func @warp_execute_has_broadcast_semantics(%laneid: index, %s0: f32, %v0: vector<f32>, %v1: vector<1xf32>, %v2: vector<1x1xf32>)
    -> (f32, vector<f32>, vector<1xf32>, vector<1x1xf32>) {
  // CHECK-SCF-IF-DAG: %[[C0:.*]] = arith.constant 0 : index

  // CHECK-SCF-IF: scf.if{{.*}}{
  %r:4 = vector.warp_execute_on_lane_0(%laneid)[32]
      args(%s0, %v0, %v1, %v2 : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>) -> (f32, vector<f32>, vector<1xf32>, vector<1x1xf32>) {
    ^bb0(%bs0: f32, %bv0: vector<f32>, %bv1: vector<1xf32>, %bv2: vector<1x1xf32>):

      // CHECK-SCF-IF: vector.transfer_read {{.*}}[%[[C0]], %[[C0]]]{{.*}} {in_bounds = [true, true]} : memref<1x1xf32, 3>, vector<1x1xf32>
      // CHECK-SCF-IF: vector.transfer_read {{.*}}[%[[C0]]]{{.*}} {in_bounds = [true]} : memref<1xf32, 3>, vector<1xf32>
      // CHECK-SCF-IF: vector.transfer_read {{.*}}[]{{.*}} : memref<f32, 3>, vector<f32>
      // CHECK-SCF-IF: memref.load {{.*}}[%[[C0]]] : memref<1xf32, 3>
      // CHECK-SCF-IF: "some_def_0"(%{{.*}}) : (f32) -> f32
      // CHECK-SCF-IF: "some_def_1"(%{{.*}}) : (vector<f32>) -> vector<f32>
      // CHECK-SCF-IF: "some_def_1"(%{{.*}}) : (vector<1xf32>) -> vector<1xf32>
      // CHECK-SCF-IF: "some_def_1"(%{{.*}}) : (vector<1x1xf32>) -> vector<1x1xf32>
      // CHECK-SCF-IF: memref.store {{.*}}[%[[C0]]] : memref<1xf32, 3>
      // CHECK-SCF-IF: vector.transfer_write {{.*}}[] : vector<f32>, memref<f32, 3>
      // CHECK-SCF-IF: vector.transfer_write {{.*}}[%[[C0]]] {in_bounds = [true]} : vector<1xf32>, memref<1xf32, 3>
      // CHECK-SCF-IF: vector.transfer_write {{.*}}[%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<1x1xf32>, memref<1x1xf32, 3>

      %rs0 = "some_def_0"(%bs0) : (f32) -> f32
      %rv0 = "some_def_1"(%bv0) : (vector<f32>) -> vector<f32>
      %rv1 = "some_def_1"(%bv1) : (vector<1xf32>) -> vector<1xf32>
      %rv2 = "some_def_1"(%bv2) : (vector<1x1xf32>) -> vector<1x1xf32>

      // CHECK-SCF-IF-NOT: vector.yield
      vector.yield %rs0, %rv0, %rv1, %rv2 : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>
  }

  // CHECK-SCF-IF: gpu.barrier
  // CHECK-SCF-IF: %[[RV2:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]]{{.*}} {in_bounds = [true, true]} : memref<1x1xf32, 3>, vector<1x1xf32>
  // CHECK-SCF-IF: %[[RV1:.*]] = vector.transfer_read {{.*}}[%[[C0]]]{{.*}} {in_bounds = [true]} : memref<1xf32, 3>, vector<1xf32>
  // CHECK-SCF-IF: %[[RV0:.*]] = vector.transfer_read {{.*}}[]{{.*}} : memref<f32, 3>, vector<f32>
  // CHECK-SCF-IF: %[[RS0:.*]] = memref.load {{.*}}[%[[C0]]] : memref<1xf32, 3>
  // CHECK-SCF-IF: return %[[RS0]], %[[RV0]], %[[RV1]], %[[RV2]] : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>
  return %r#0, %r#1, %r#2, %r#3 : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>
}

// -----

// Operands/results distributed along different dimensions: %v0 grows along
// dim 0 (1 -> 32) and %v1 along dim 1 (2 -> 64) inside the region. The
// expectations for the scf.if lowering therefore index the memory-space-3
// buffers at [laneid, 0, 0] and [0, laneid * 2, 0] for the per-lane accesses
// and at [0, 0, 0] for the full-size accesses inside the scf.if.
// Index constants are matched through the captured C0 variable and padding
// values through a wildcard, so the test does not depend on printed SSA
// value names.
// CHECK-SCF-IF-DAG: #[[$TIMES2:.*]] = affine_map<()[s0] -> (s0 * 2)>

// CHECK-SCF-IF-LABEL: func @warp_execute_nd_distribute
// CHECK-SCF-IF-SAME: (%[[LANEID:.*]]: index
func.func @warp_execute_nd_distribute(%laneid: index, %v0: vector<1x64x1xf32>, %v1: vector<1x2x128xf32>)
    -> (vector<1x64x1xf32>, vector<1x2x128xf32>) {
  // CHECK-SCF-IF-DAG: %[[C0:.*]] = arith.constant 0 : index

  // CHECK-SCF-IF:  vector.transfer_write %{{.*}}, %{{.*}}[%[[LANEID]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x64x1xf32>, memref<32x64x1xf32, 3>
  // CHECK-SCF-IF:  %[[RID:.*]] = affine.apply #[[$TIMES2]]()[%[[LANEID]]]
  // CHECK-SCF-IF:  vector.transfer_write %{{.*}}, %{{.*}}[%[[C0]], %[[RID]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x2x128xf32>, memref<1x64x128xf32, 3>
  // CHECK-SCF-IF:  gpu.barrier

  // CHECK-SCF-IF: scf.if{{.*}}{
  %r:2 = vector.warp_execute_on_lane_0(%laneid)[32]
      args(%v0, %v1 : vector<1x64x1xf32>, vector<1x2x128xf32>) -> (vector<1x64x1xf32>, vector<1x2x128xf32>) {
    ^bb0(%arg0: vector<32x64x1xf32>, %arg1: vector<1x64x128xf32>):

  // CHECK-SCF-IF-DAG: %[[SR0:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<32x64x1xf32, 3>, vector<32x64x1xf32>
  // CHECK-SCF-IF-DAG: %[[SR1:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<1x64x128xf32, 3>, vector<1x64x128xf32>
  //     CHECK-SCF-IF: %[[W0:.*]] = "some_def_0"(%[[SR0]]) : (vector<32x64x1xf32>) -> vector<32x64x1xf32>
  //     CHECK-SCF-IF: %[[W1:.*]] = "some_def_1"(%[[SR1]]) : (vector<1x64x128xf32>) -> vector<1x64x128xf32>
  // CHECK-SCF-IF-DAG: vector.transfer_write %[[W0]], %{{.*}}[%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<32x64x1xf32>, memref<32x64x1xf32, 3>
  // CHECK-SCF-IF-DAG: vector.transfer_write %[[W1]], %{{.*}}[%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x64x128xf32>, memref<1x64x128xf32, 3>

      %r0 = "some_def_0"(%arg0) : (vector<32x64x1xf32>) -> vector<32x64x1xf32>
      %r1 = "some_def_1"(%arg1) : (vector<1x64x128xf32>) -> vector<1x64x128xf32>

      // CHECK-SCF-IF-NOT: vector.yield
      vector.yield %r0, %r1 : vector<32x64x1xf32>, vector<1x64x128xf32>
  }

  //     CHECK-SCF-IF: gpu.barrier
  //     CHECK-SCF-IF: %[[WID:.*]] = affine.apply #[[$TIMES2]]()[%[[LANEID]]]
  // CHECK-SCF-IF-DAG: %[[R0:.*]] = vector.transfer_read %{{.*}}[%[[LANEID]], %[[C0]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<32x64x1xf32, 3>, vector<1x64x1xf32>
  // CHECK-SCF-IF-DAG: %[[R1:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[WID]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<1x64x128xf32, 3>, vector<1x2x128xf32>
  //     CHECK-SCF-IF: return %[[R0]], %[[R1]] : vector<1x64x1xf32>, vector<1x2x128xf32>
  return %r#0, %r#1 : vector<1x64x1xf32>, vector<1x2x128xf32>
}

// -----

// insertelement at a dynamic position into a 96-element vector held as three
// elements per lane. The expectations yield the distributed vector and the
// scalar from the warp op, compare the lane id with `pos ceildiv 3`, and use
// an scf.if so that only the matching lane inserts at `pos mod 3`; the other
// lanes forward their slice unchanged.
//       CHECK-PROP:   #[[$MAP:.*]] = affine_map<()[s0] -> (s0 ceildiv 3)>
//       CHECK-PROP:   #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 mod 3)>
// CHECK-PROP-LABEL: func @vector_insertelement_1d(
//  CHECK-PROP-SAME:     %[[LANEID:.*]]: index, %[[POS:.*]]: index
//       CHECK-PROP:   %[[W:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} -> (vector<3xf32>, f32)
//       CHECK-PROP:   %[[INSERTING_LANE:.*]] = affine.apply #[[$MAP]]()[%[[POS]]]
//       CHECK-PROP:   %[[INSERTING_POS:.*]] = affine.apply #[[$MAP1]]()[%[[POS]]]
//       CHECK-PROP:   %[[SHOULD_INSERT:.*]] = arith.cmpi eq, %[[LANEID]], %[[INSERTING_LANE]] : index
//       CHECK-PROP:   %[[R:.*]] = scf.if %[[SHOULD_INSERT]] -> (vector<3xf32>) {
//       CHECK-PROP:     %[[INSERT:.*]] = vector.insertelement %[[W]]#1, %[[W]]#0[%[[INSERTING_POS]] : index]
//       CHECK-PROP:     scf.yield %[[INSERT]]
//       CHECK-PROP:   } else {
//       CHECK-PROP:     scf.yield %[[W]]#0
//       CHECK-PROP:   }
//       CHECK-PROP:   return %[[R]]
func.func @vector_insertelement_1d(%laneid: index, %pos: index) -> vector<3xf32> {
  %dist = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<3xf32>) {
    %dest = "some_def"() : () -> vector<96xf32>
    %scalar = "another_def"() : () -> f32
    %updated = vector.insertelement %scalar, %dest[%pos : index] : vector<96xf32>
    vector.yield %updated : vector<96xf32>
  }
  return %dist : vector<3xf32>
}

// -----

// insertelement at a dynamic position where the warp result keeps the full
// vector<96xf32> type. The expectations yield the vector and the scalar from
// the warp op and perform the insertelement at the original position outside
// of it.
// CHECK-PROP-LABEL: func @vector_insertelement_1d_broadcast(
//  CHECK-PROP-SAME:     %[[LANEID:.*]]: index, %[[POS:.*]]: index
//       CHECK-PROP:   %[[W:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} -> (vector<96xf32>, f32)
//       CHECK-PROP:     %[[VEC:.*]] = "some_def"
//       CHECK-PROP:     %[[VAL:.*]] = "another_def"
//       CHECK-PROP:     vector.yield %[[VEC]], %[[VAL]]
//       CHECK-PROP:   vector.insertelement %[[W]]#1, %[[W]]#0[%[[POS]] : index] : vector<96xf32>
func.func @vector_insertelement_1d_broadcast(%laneid: index, %pos: index) -> vector<96xf32> {
  %full = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<96xf32>) {
    %dest = "some_def"() : () -> vector<96xf32>
    %scalar = "another_def"() : () -> f32
    %updated = vector.insertelement %scalar, %dest[%pos : index] : vector<96xf32>
    vector.yield %updated : vector<96xf32>
  }
  return %full : vector<96xf32>
}

// -----

// insertelement into a 0-d vector. The expectations yield the vector<f32> and
// the scalar from the warp op and perform the insertelement outside of it.
// CHECK-PROP-LABEL: func @vector_insertelement_0d(
//       CHECK-PROP:   %[[W:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} -> (vector<f32>, f32)
//       CHECK-PROP:     %[[VEC:.*]] = "some_def"
//       CHECK-PROP:     %[[VAL:.*]] = "another_def"
//       CHECK-PROP:     vector.yield %[[VEC]], %[[VAL]]
//       CHECK-PROP:   vector.insertelement %[[W]]#1, %[[W]]#0[] : vector<f32>
func.func @vector_insertelement_0d(%laneid: index) -> vector<f32> {
  %rank0 = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<f32>) {
    %dest = "some_def"() : () -> vector<f32>
    %scalar = "another_def"() : () -> f32
    %updated = vector.insertelement %scalar, %dest[] : vector<f32>
    vector.yield %updated : vector<f32>
  }
  return %rank0 : vector<f32>
}

// -----

// Scalar vector.insert at static position 76 into a 96-element vector held as
// three elements per lane. The expectations compare the lane id with the
// index constant 26 and, inside an scf.if, insert at in-lane index 1 on the
// matching lane; the other lanes forward their slice unchanged.
// CHECK-PROP-LABEL: func @vector_insert_1d(
//  CHECK-PROP-SAME:     %[[LANEID:.*]]: index
//   CHECK-PROP-DAG:   %[[C1:.*]] = arith.constant 1 : index
//   CHECK-PROP-DAG:   %[[C26:.*]] = arith.constant 26 : index
//       CHECK-PROP:   %[[W:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} -> (vector<3xf32>, f32)
//       CHECK-PROP:     %[[VEC:.*]] = "some_def"
//       CHECK-PROP:     %[[VAL:.*]] = "another_def"
//       CHECK-PROP:     vector.yield %[[VEC]], %[[VAL]]
//       CHECK-PROP:   %[[SHOULD_INSERT:.*]] = arith.cmpi eq, %[[LANEID]], %[[C26]]
//       CHECK-PROP:   %[[R:.*]] = scf.if %[[SHOULD_INSERT]] -> (vector<3xf32>) {
//       CHECK-PROP:     %[[INSERT:.*]] = vector.insertelement %[[W]]#1, %[[W]]#0[%[[C1]] : index]
//       CHECK-PROP:     scf.yield %[[INSERT]]
//       CHECK-PROP:   } else {
//       CHECK-PROP:     scf.yield %[[W]]#0
//       CHECK-PROP:   }
//       CHECK-PROP:   return %[[R]]
func.func @vector_insert_1d(%laneid: index) -> vector<3xf32> {
  %dist = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<3xf32>) {
    %dest = "some_def"() : () -> vector<96xf32>
    %scalar = "another_def"() : () -> f32
    %updated = vector.insert %scalar, %dest[76] : f32 into vector<96xf32>
    vector.yield %updated : vector<96xf32>
  }
  return %dist : vector<3xf32>
}

// -----

// Insert of a row into a 2-D vector where the inserted row itself is
// distributed (96 -> 3 per lane, destination 4x96 -> 4x3). The expectations
// yield the distributed source and destination from the warp op and perform
// a plain vector.insert at row 2 outside of it.
// CHECK-PROP-LABEL: func @vector_insert_2d_distr_src(
//  CHECK-PROP-SAME:     %[[LANEID:.*]]: index
//       CHECK-PROP:   %[[W:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} -> (vector<3xf32>, vector<4x3xf32>)
//       CHECK-PROP:     %[[VEC:.*]] = "some_def"
//       CHECK-PROP:     %[[VAL:.*]] = "another_def"
//       CHECK-PROP:     vector.yield %[[VAL]], %[[VEC]]
//       CHECK-PROP:   %[[INSERT:.*]] = vector.insert %[[W]]#0, %[[W]]#1 [2] : vector<3xf32> into vector<4x3xf32>
//       CHECK-PROP:   return %[[INSERT]]
func.func @vector_insert_2d_distr_src(%laneid: index) -> vector<4x3xf32> {
  %dist = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x3xf32>) {
    %dest = "some_def"() : () -> vector<4x96xf32>
    %row = "another_def"() : () -> vector<96xf32>
    %updated = vector.insert %row, %dest[2] : vector<96xf32> into vector<4x96xf32>
    vector.yield %updated : vector<4x96xf32>
  }
  return %dist : vector<4x3xf32>
}

// -----

// Insert of a row at static position 79 into a 2-D vector whose leading
// dimension is distributed (128 -> 4 rows per lane). The expectations yield
// the full row and the distributed destination, compare the lane id with the
// index constant 19, and insert at local row 3 inside an scf.if on the
// matching lane; the other lanes forward their slice unchanged.
// CHECK-PROP-LABEL: func @vector_insert_2d_distr_pos(
//  CHECK-PROP-SAME:     %[[LANEID:.*]]: index
//       CHECK-PROP:   %[[C19:.*]] = arith.constant 19 : index
//       CHECK-PROP:   %[[W:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} -> (vector<96xf32>, vector<4x96xf32>)
//       CHECK-PROP:     %[[VEC:.*]] = "some_def"
//       CHECK-PROP:     %[[VAL:.*]] = "another_def"
//       CHECK-PROP:     vector.yield %[[VAL]], %[[VEC]]
//       CHECK-PROP:   %[[SHOULD_INSERT:.*]] = arith.cmpi eq, %[[LANEID]], %[[C19]]
//       CHECK-PROP:   %[[R:.*]] = scf.if %[[SHOULD_INSERT]] -> (vector<4x96xf32>) {
//       CHECK-PROP:     %[[INSERT:.*]] = vector.insert %[[W]]#0, %[[W]]#1 [3] : vector<96xf32> into vector<4x96xf32>
//       CHECK-PROP:     scf.yield %[[INSERT]]
//       CHECK-PROP:   } else {
//       CHECK-PROP:     scf.yield %[[W]]#1
//       CHECK-PROP:   }
//       CHECK-PROP:   return %[[R]]
func.func @vector_insert_2d_distr_pos(%laneid: index) -> vector<4x96xf32> {
  %dist = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x96xf32>) {
    %dest = "some_def"() : () -> vector<128x96xf32>
    %row = "another_def"() : () -> vector<96xf32>
    %updated = vector.insert %row, %dest[79] : vector<96xf32> into vector<128x96xf32>
    vector.yield %updated : vector<128x96xf32>
  }
  return %dist : vector<4x96xf32>
}

// -----

// Insert of a row into a 2-D vector where the warp result keeps the full
// vector<4x96xf32> type. The expectations yield the row and the destination
// unchanged from the warp op and perform the vector.insert at row 2 outside
// of it.
// CHECK-PROP-LABEL: func @vector_insert_2d_broadcast(
//  CHECK-PROP-SAME:     %[[LANEID:.*]]: index
//       CHECK-PROP:   %[[W:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} -> (vector<96xf32>, vector<4x96xf32>)
//       CHECK-PROP:     %[[VEC:.*]] = "some_def"
//       CHECK-PROP:     %[[VAL:.*]] = "another_def"
//       CHECK-PROP:     vector.yield %[[VAL]], %[[VEC]]
//       CHECK-PROP:   %[[INSERT:.*]] = vector.insert %[[W]]#0, %[[W]]#1 [2] : vector<96xf32> into vector<4x96xf32>
//       CHECK-PROP:   return %[[INSERT]]
func.func @vector_insert_2d_broadcast(%laneid: index) -> vector<4x96xf32> {
  %full = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x96xf32>) {
    %dest = "some_def"() : () -> vector<4x96xf32>
    %row = "another_def"() : () -> vector<96xf32>
    %updated = vector.insert %row, %dest[2] : vector<96xf32> into vector<4x96xf32>
    vector.yield %updated : vector<4x96xf32>
  }
  return %full : vector<4x96xf32>
}

// -----

// Check that we don't propagate transfer_reads that have dependencies on
// values inside the warp_execute_on_lane_0.
// In this case, propagating would create a transfer_read that depends on the
// extractelement defined in the body.

// The transfer_read index is produced by a gather/extract/index_cast/
// extractelement chain inside the warp region, so the expectations keep the
// whole chain and the transfer_read inside the warp op and return the warp
// result directly.
// CHECK-PROP-LABEL: func @transfer_read_no_prop(
//  CHECK-PROP-SAME:     %[[IN2:[^ :]*]]: vector<1x2xindex>,
//  CHECK-PROP-SAME:     %[[AR1:[^ :]*]]: memref<1x4x2xi32>,
//  CHECK-PROP-SAME:     %[[AR2:[^ :]*]]: memref<1x4x1024xf32>)
//   CHECK-PROP-DAG:   %[[C0:.*]] = arith.constant 0 : index
//   CHECK-PROP-DAG:   %[[THREADID:.*]] = gpu.thread_id  x
//       CHECK-PROP:   %[[W:.*]] = vector.warp_execute_on_lane_0(%[[THREADID]])[32] args(%[[IN2]]
//       CHECK-PROP:     %[[GATHER:.*]] = vector.gather %[[AR1]][{{.*}}]
//       CHECK-PROP:     %[[EXTRACT:.*]] = vector.extract %[[GATHER]][0] : vector<1x64xi32>
//       CHECK-PROP:     %[[CAST:.*]] = arith.index_cast %[[EXTRACT]] : vector<64xi32> to vector<64xindex>
//       CHECK-PROP:     %[[EXTRACTELT:.*]] = vector.extractelement %[[CAST]][{{.*}}: i32] : vector<64xindex>
//       CHECK-PROP:     %[[TRANSFERREAD:.*]] = vector.transfer_read %[[AR2]][%[[C0]], %[[EXTRACTELT]], %[[C0]]],
//       CHECK-PROP:     vector.yield %[[TRANSFERREAD]] : vector<64xf32>
//       CHECK-PROP:   return %[[W]]
func.func @transfer_read_no_prop(%in2: vector<1x2xindex>, %ar1 :  memref<1x4x2xi32>, %ar2 : memref<1x4x1024xf32>)-> vector<2xf32> {
  %0 = gpu.thread_id  x
  %c0_i32 = arith.constant 0 : i32
  %c0 = arith.constant 0 : index
  // Pass-through value and mask operands of the gather below.
  %cst = arith.constant dense<0> : vector<1x64xi32>
  %cst_0 = arith.constant dense<true> : vector<1x64xi1>
  // Padding value of the transfer_read below.
  %cst_6 = arith.constant 0.000000e+00 : f32

  %18 = vector.warp_execute_on_lane_0(%0)[32] args(%in2 : vector<1x2xindex>) -> (vector<2xf32>) {
  ^bb0(%arg4: vector<1x64xindex>):
    %28 = vector.gather %ar1[%c0, %c0, %c0] [%arg4], %cst_0, %cst : memref<1x4x2xi32>, vector<1x64xindex>, vector<1x64xi1>, vector<1x64xi32> into vector<1x64xi32>
    %29 = vector.extract %28[0] : vector<1x64xi32>
    %30 = arith.index_cast %29 : vector<64xi32> to vector<64xindex>
    // This body-defined index is what ties the transfer_read to the region.
    %36 = vector.extractelement %30[%c0_i32 : i32] : vector<64xindex>
    %37 = vector.transfer_read %ar2[%c0, %36, %c0], %cst_6 {in_bounds = [true]} : memref<1x4x1024xf32>, vector<64xf32>
    vector.yield %37 : vector<64xf32>
  }
  return %18 : vector<2xf32>
}

// -----

// Check that we don't fold vector.broadcast when each thread doesn't get the
// same value.

// A rank-expanding broadcast (vector<64xf32> to vector<1x64xf32>) whose
// result is distributed as vector<1x2xf32>. The expectations keep both the
// definition and the broadcast inside the warp op and print the warp result.
// CHECK-PROP-LABEL: func @dont_fold_vector_broadcast(
//       CHECK-PROP:   %[[r:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (vector<1x2xf32>)
//       CHECK-PROP:     %[[some_def:.*]] = "some_def"
//       CHECK-PROP:     %[[broadcast:.*]] = vector.broadcast %[[some_def]] : vector<64xf32> to vector<1x64xf32>
//       CHECK-PROP:     vector.yield %[[broadcast]] : vector<1x64xf32>
//       CHECK-PROP:   vector.print %[[r]] : vector<1x2xf32>
func.func @dont_fold_vector_broadcast(%laneid: index) {
  %dist = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x2xf32>) {
    %flat = "some_def"() : () -> vector<64xf32>
    %expanded = vector.broadcast %flat : vector<64xf32> to vector<1x64xf32>
    vector.yield %expanded : vector<1x64xf32>
  }
  vector.print %dist : vector<1x2xf32>
  return
}