File: amx-low-intrinsics-no-amx-bitcast.ll

package info (click to toggle)
llvm-toolchain-14 1%3A14.0.6-12
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,496,180 kB
  • sloc: cpp: 5,593,972; ansic: 986,872; asm: 585,869; python: 184,223; objc: 72,530; lisp: 31,119; f90: 27,793; javascript: 9,780; pascal: 9,762; sh: 9,482; perl: 7,468; ml: 5,432; awk: 3,523; makefile: 2,538; xml: 953; cs: 573; fortran: 567
file content (211 lines) | stat: -rw-r--r-- 17,080 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s

define dso_local void @test_no_bitcast(i32* %A_mem, i32* %B_mem, i32* %C_mem) local_unnamed_addr #0 {
; CHECK-LABEL: @test_no_bitcast(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[C_MEM:%.*]] to i8*
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_ROWS_HEADER:%.*]]
; CHECK:       tileload.scalarize.rows.header:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH:%.*]] ]
; CHECK-NEXT:    [[VEC_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP10:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH]] ]
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_ROWS_BODY:%.*]]
; CHECK:       tileload.scalarize.rows.body:
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_HEADER:%.*]]
; CHECK:       tileload.scalarize.cols.header:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TILELOAD_SCALARIZE_COLS_STEP:%.*]], [[TILELOAD_SCALARIZE_COLS_LATCH:%.*]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW]], [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TMP10]], [[TILELOAD_SCALARIZE_COLS_LATCH]] ]
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_BODY:%.*]]
; CHECK:       tileload.scalarize.cols.body:
; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[TILELOAD_SCALARIZE_ROWS_IV]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[TILELOAD_SCALARIZE_COLS_IV]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP1]], 4
; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], [[TMP2]]
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP0]] to i32*
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i64 [[TMP4]]
; CHECK-NEXT:    [[TMP7:%.*]] = mul i16 [[TILELOAD_SCALARIZE_ROWS_IV]], 16
; CHECK-NEXT:    [[TMP8:%.*]] = add i16 [[TMP7]], [[TILELOAD_SCALARIZE_COLS_IV]]
; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP6]], align 4
; CHECK-NEXT:    [[TMP10]] = insertelement <256 x i32> [[VEC_PHI]], i32 [[TMP9]], i16 [[TMP8]]
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_LATCH]]
; CHECK:       tileload.scalarize.cols.latch:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_STEP]] = add i16 [[TILELOAD_SCALARIZE_COLS_IV]], 1
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_COLS_STEP]], 4
; CHECK-NEXT:    br i1 [[TILELOAD_SCALARIZE_COLS_COND]], label [[TILELOAD_SCALARIZE_COLS_HEADER]], label [[TILELOAD_SCALARIZE_ROWS_LATCH]]
; CHECK:       tileload.scalarize.rows.latch:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_STEP]] = add i16 [[TILELOAD_SCALARIZE_ROWS_IV]], 1
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_ROWS_STEP]], 4
; CHECK-NEXT:    br i1 [[TILELOAD_SCALARIZE_ROWS_COND]], label [[TILELOAD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]]
; CHECK:       continue:
; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <256 x i32> [[TMP10]] to x86_amx
; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[A_MEM:%.*]] to i8*
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_ROWS_HEADER2:%.*]]
; CHECK:       tileload.scalarize.rows.header2:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_IV5:%.*]] = phi i16 [ 0, [[CONTINUE]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP6:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH4:%.*]] ]
; CHECK-NEXT:    [[VEC_PHI_ROW14:%.*]] = phi <256 x i32> [ zeroinitializer, [[CONTINUE]] ], [ [[TMP22:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH4]] ]
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_ROWS_BODY3:%.*]]
; CHECK:       tileload.scalarize.rows.body3:
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_HEADER8:%.*]]
; CHECK:       tileload.scalarize.cols.header8:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_IV11:%.*]] = phi i16 [ 0, [[TILELOAD_SCALARIZE_ROWS_BODY3]] ], [ [[TILELOAD_SCALARIZE_COLS_STEP12:%.*]], [[TILELOAD_SCALARIZE_COLS_LATCH10:%.*]] ]
; CHECK-NEXT:    [[VEC_PHI15:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW14]], [[TILELOAD_SCALARIZE_ROWS_BODY3]] ], [ [[TMP22]], [[TILELOAD_SCALARIZE_COLS_LATCH10]] ]
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_BODY9:%.*]]
; CHECK:       tileload.scalarize.cols.body9:
; CHECK-NEXT:    [[TMP13:%.*]] = zext i16 [[TILELOAD_SCALARIZE_ROWS_IV5]] to i64
; CHECK-NEXT:    [[TMP14:%.*]] = zext i16 [[TILELOAD_SCALARIZE_COLS_IV11]] to i64
; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], 4
; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[TMP15]], [[TMP14]]
; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP12]] to i32*
; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i64 [[TMP16]]
; CHECK-NEXT:    [[TMP19:%.*]] = mul i16 [[TILELOAD_SCALARIZE_ROWS_IV5]], 16
; CHECK-NEXT:    [[TMP20:%.*]] = add i16 [[TMP19]], [[TILELOAD_SCALARIZE_COLS_IV11]]
; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* [[TMP18]], align 4
; CHECK-NEXT:    [[TMP22]] = insertelement <256 x i32> [[VEC_PHI15]], i32 [[TMP21]], i16 [[TMP20]]
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_LATCH10]]
; CHECK:       tileload.scalarize.cols.latch10:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_STEP12]] = add i16 [[TILELOAD_SCALARIZE_COLS_IV11]], 1
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_COND13:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_COLS_STEP12]], 4
; CHECK-NEXT:    br i1 [[TILELOAD_SCALARIZE_COLS_COND13]], label [[TILELOAD_SCALARIZE_COLS_HEADER8]], label [[TILELOAD_SCALARIZE_ROWS_LATCH4]]
; CHECK:       tileload.scalarize.rows.latch4:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_STEP6]] = add i16 [[TILELOAD_SCALARIZE_ROWS_IV5]], 1
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_COND7:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_ROWS_STEP6]], 4
; CHECK-NEXT:    br i1 [[TILELOAD_SCALARIZE_ROWS_COND7]], label [[TILELOAD_SCALARIZE_ROWS_HEADER2]], label [[CONTINUE1:%.*]]
; CHECK:       continue1:
; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <256 x i32> [[TMP22]] to x86_amx
; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i32* [[B_MEM:%.*]] to i8*
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_ROWS_HEADER17:%.*]]
; CHECK:       tileload.scalarize.rows.header17:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_IV20:%.*]] = phi i16 [ 0, [[CONTINUE1]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP21:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH19:%.*]] ]
; CHECK-NEXT:    [[VEC_PHI_ROW29:%.*]] = phi <256 x i32> [ zeroinitializer, [[CONTINUE1]] ], [ [[TMP34:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH19]] ]
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_ROWS_BODY18:%.*]]
; CHECK:       tileload.scalarize.rows.body18:
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_HEADER23:%.*]]
; CHECK:       tileload.scalarize.cols.header23:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_IV26:%.*]] = phi i16 [ 0, [[TILELOAD_SCALARIZE_ROWS_BODY18]] ], [ [[TILELOAD_SCALARIZE_COLS_STEP27:%.*]], [[TILELOAD_SCALARIZE_COLS_LATCH25:%.*]] ]
; CHECK-NEXT:    [[VEC_PHI30:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW29]], [[TILELOAD_SCALARIZE_ROWS_BODY18]] ], [ [[TMP34]], [[TILELOAD_SCALARIZE_COLS_LATCH25]] ]
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_BODY24:%.*]]
; CHECK:       tileload.scalarize.cols.body24:
; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TILELOAD_SCALARIZE_ROWS_IV20]] to i64
; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TILELOAD_SCALARIZE_COLS_IV26]] to i64
; CHECK-NEXT:    [[TMP27:%.*]] = mul i64 [[TMP25]], 4
; CHECK-NEXT:    [[TMP28:%.*]] = add i64 [[TMP27]], [[TMP26]]
; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i8* [[TMP24]] to i32*
; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i32, i32* [[TMP29]], i64 [[TMP28]]
; CHECK-NEXT:    [[TMP31:%.*]] = mul i16 [[TILELOAD_SCALARIZE_ROWS_IV20]], 16
; CHECK-NEXT:    [[TMP32:%.*]] = add i16 [[TMP31]], [[TILELOAD_SCALARIZE_COLS_IV26]]
; CHECK-NEXT:    [[TMP33:%.*]] = load i32, i32* [[TMP30]], align 4
; CHECK-NEXT:    [[TMP34]] = insertelement <256 x i32> [[VEC_PHI30]], i32 [[TMP33]], i16 [[TMP32]]
; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_LATCH25]]
; CHECK:       tileload.scalarize.cols.latch25:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_STEP27]] = add i16 [[TILELOAD_SCALARIZE_COLS_IV26]], 1
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_COND28:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_COLS_STEP27]], 4
; CHECK-NEXT:    br i1 [[TILELOAD_SCALARIZE_COLS_COND28]], label [[TILELOAD_SCALARIZE_COLS_HEADER23]], label [[TILELOAD_SCALARIZE_ROWS_LATCH19]]
; CHECK:       tileload.scalarize.rows.latch19:
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_STEP21]] = add i16 [[TILELOAD_SCALARIZE_ROWS_IV20]], 1
; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_COND22:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_ROWS_STEP21]], 4
; CHECK-NEXT:    br i1 [[TILELOAD_SCALARIZE_ROWS_COND22]], label [[TILELOAD_SCALARIZE_ROWS_HEADER17]], label [[CONTINUE16:%.*]]
; CHECK:       continue16:
; CHECK-NEXT:    [[TMP35:%.*]] = bitcast <256 x i32> [[TMP34]] to x86_amx
; CHECK-NEXT:    br label [[TILEDPBSSD_SCALARIZE_ROWS_HEADER:%.*]]
; CHECK:       tiledpbssd.scalarize.rows.header:
; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[CONTINUE16]] ], [ [[TILEDPBSSD_SCALARIZE_ROWS_STEP:%.*]], [[TILEDPBSSD_SCALARIZE_ROWS_LATCH:%.*]] ]
; CHECK-NEXT:    [[VEC_C_PHI_ROW:%.*]] = phi <256 x i32> [ [[TMP10]], [[CONTINUE16]] ], [ [[TMP52:%.*]], [[TILEDPBSSD_SCALARIZE_ROWS_LATCH]] ]
; CHECK-NEXT:    [[VEC_D_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[CONTINUE16]] ], [ [[TMP54:%.*]], [[TILEDPBSSD_SCALARIZE_ROWS_LATCH]] ]
; CHECK-NEXT:    br label [[TILEDPBSSD_SCALARIZE_ROWS_BODY:%.*]]
; CHECK:       tiledpbssd.scalarize.rows.body:
; CHECK-NEXT:    br label [[TILEDPBSSD_SCALARIZE_COLS_HEADER:%.*]]
; CHECK:       tiledpbssd.scalarize.cols.header:
; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILEDPBSSD_SCALARIZE_ROWS_BODY]] ], [ [[TILEDPBSSD_SCALARIZE_COLS_STEP:%.*]], [[TILEDPBSSD_SCALARIZE_COLS_LATCH:%.*]] ]
; CHECK-NEXT:    [[VEC_C_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_ROW]], [[TILEDPBSSD_SCALARIZE_ROWS_BODY]] ], [ [[TMP52]], [[TILEDPBSSD_SCALARIZE_COLS_LATCH]] ]
; CHECK-NEXT:    [[VEC_D_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_ROW]], [[TILEDPBSSD_SCALARIZE_ROWS_BODY]] ], [ [[TMP54]], [[TILEDPBSSD_SCALARIZE_COLS_LATCH]] ]
; CHECK-NEXT:    [[TMP36:%.*]] = mul i16 [[TILEDPBSSD_SCALARIZE_ROWS_IV]], 16
; CHECK-NEXT:    [[TMP37:%.*]] = add i16 [[TMP36]], [[TILEDPBSSD_SCALARIZE_COLS_IV]]
; CHECK-NEXT:    br label [[TILEDPBSSD_SCALARIZE_COLS_BODY:%.*]]
; CHECK:       tiledpbssd.scalarize.cols.body:
; CHECK-NEXT:    br label [[TILEDPBSSD_SCALARIZE_INNER_HEADER:%.*]]
; CHECK:       tiledpbssd.scalarize.inner.header:
; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_INNER_IV:%.*]] = phi i16 [ 0, [[TILEDPBSSD_SCALARIZE_COLS_BODY]] ], [ [[TILEDPBSSD_SCALARIZE_INNER_STEP:%.*]], [[TILEDPBSSD_SCALARIZE_INNER_LATCH:%.*]] ]
; CHECK-NEXT:    [[VEC_C_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_COL]], [[TILEDPBSSD_SCALARIZE_COLS_BODY]] ], [ [[TMP52]], [[TILEDPBSSD_SCALARIZE_INNER_LATCH]] ]
; CHECK-NEXT:    br label [[TILEDPBSSD_SCALARIZE_INNER_BODY:%.*]]
; CHECK:       tiledpbssd.scalarize.inner.body:
; CHECK-NEXT:    [[TMP38:%.*]] = mul i16 [[TILEDPBSSD_SCALARIZE_ROWS_IV]], 16
; CHECK-NEXT:    [[TMP39:%.*]] = add i16 [[TMP38]], [[TILEDPBSSD_SCALARIZE_INNER_IV]]
; CHECK-NEXT:    [[TMP40:%.*]] = mul i16 [[TILEDPBSSD_SCALARIZE_INNER_IV]], 16
; CHECK-NEXT:    [[TMP41:%.*]] = add i16 [[TMP40]], [[TILEDPBSSD_SCALARIZE_COLS_IV]]
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <256 x i32> [[VEC_C_INNER_PHI]], i16 [[TMP37]]
; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <256 x i32> [[TMP22]], i16 [[TMP39]]
; CHECK-NEXT:    [[TMP44:%.*]] = bitcast i32 [[TMP43]] to <4 x i8>
; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <256 x i32> [[TMP34]], i16 [[TMP41]]
; CHECK-NEXT:    [[TMP46:%.*]] = bitcast i32 [[TMP45]] to <4 x i8>
; CHECK-NEXT:    [[TMP47:%.*]] = sext <4 x i8> [[TMP46]] to <4 x i32>
; CHECK-NEXT:    [[TMP48:%.*]] = sext <4 x i8> [[TMP44]] to <4 x i32>
; CHECK-NEXT:    [[TMP49:%.*]] = mul <4 x i32> [[TMP48]], [[TMP47]]
; CHECK-NEXT:    [[TMP50:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP49]])
; CHECK-NEXT:    [[TMP51:%.*]] = add i32 [[TMP42]], [[TMP50]]
; CHECK-NEXT:    [[TMP52]] = insertelement <256 x i32> [[VEC_C_INNER_PHI]], i32 [[TMP51]], i16 [[TMP37]]
; CHECK-NEXT:    br label [[TILEDPBSSD_SCALARIZE_INNER_LATCH]]
; CHECK:       tiledpbssd.scalarize.inner.latch:
; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_INNER_STEP]] = add i16 [[TILEDPBSSD_SCALARIZE_INNER_IV]], 1
; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_INNER_COND:%.*]] = icmp ne i16 [[TILEDPBSSD_SCALARIZE_INNER_STEP]], 4
; CHECK-NEXT:    br i1 [[TILEDPBSSD_SCALARIZE_INNER_COND]], label [[TILEDPBSSD_SCALARIZE_INNER_HEADER]], label [[TILEDPBSSD_SCALARIZE_COLS_LATCH]]
; CHECK:       tiledpbssd.scalarize.cols.latch:
; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_COLS_STEP]] = add i16 [[TILEDPBSSD_SCALARIZE_COLS_IV]], 1
; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILEDPBSSD_SCALARIZE_COLS_STEP]], 4
; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <256 x i32> [[TMP52]], i16 [[TMP37]]
; CHECK-NEXT:    [[TMP54]] = insertelement <256 x i32> [[VEC_D_PHI_COL]], i32 [[TMP53]], i16 [[TMP37]]
; CHECK-NEXT:    br i1 [[TILEDPBSSD_SCALARIZE_COLS_COND]], label [[TILEDPBSSD_SCALARIZE_COLS_HEADER]], label [[TILEDPBSSD_SCALARIZE_ROWS_LATCH]]
; CHECK:       tiledpbssd.scalarize.rows.latch:
; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_ROWS_STEP]] = add i16 [[TILEDPBSSD_SCALARIZE_ROWS_IV]], 1
; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILEDPBSSD_SCALARIZE_ROWS_STEP]], 4
; CHECK-NEXT:    br i1 [[TILEDPBSSD_SCALARIZE_ROWS_COND]], label [[TILEDPBSSD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE31:%.*]]
; CHECK:       continue31:
; CHECK-NEXT:    [[TMP55:%.*]] = bitcast <256 x i32> [[TMP54]] to x86_amx
; CHECK-NEXT:    br label [[TILESTORE_SCALARIZE_ROWS_HEADER:%.*]]
; CHECK:       tilestore.scalarize.rows.header:
; CHECK-NEXT:    [[TILESTORE_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[CONTINUE31]] ], [ [[TILESTORE_SCALARIZE_ROWS_STEP:%.*]], [[TILESTORE_SCALARIZE_ROWS_LATCH:%.*]] ]
; CHECK-NEXT:    br label [[TILESTORE_SCALARIZE_ROWS_BODY:%.*]]
; CHECK:       tilestore.scalarize.rows.body:
; CHECK-NEXT:    br label [[TILESTORE_SCALARIZE_COLS_HEADER:%.*]]
; CHECK:       tilestore.scalarize.cols.header:
; CHECK-NEXT:    [[TILESTORE_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILESTORE_SCALARIZE_ROWS_BODY]] ], [ [[TILESTORE_SCALARIZE_COLS_STEP:%.*]], [[TILESTORE_SCALARIZE_COLS_LATCH:%.*]] ]
; CHECK-NEXT:    br label [[TILESTORE_SCALARIZE_COLS_BODY:%.*]]
; CHECK:       tilestore.scalarize.cols.body:
; CHECK-NEXT:    [[TMP56:%.*]] = zext i16 [[TILESTORE_SCALARIZE_ROWS_IV]] to i64
; CHECK-NEXT:    [[TMP57:%.*]] = zext i16 [[TILESTORE_SCALARIZE_COLS_IV]] to i64
; CHECK-NEXT:    [[TMP58:%.*]] = mul i64 [[TMP56]], 4
; CHECK-NEXT:    [[TMP59:%.*]] = add i64 [[TMP58]], [[TMP57]]
; CHECK-NEXT:    [[TMP60:%.*]] = bitcast i8* [[TMP0]] to i32*
; CHECK-NEXT:    [[TMP61:%.*]] = getelementptr i32, i32* [[TMP60]], i64 [[TMP59]]
; CHECK-NEXT:    [[TMP62:%.*]] = mul i16 [[TILESTORE_SCALARIZE_ROWS_IV]], 16
; CHECK-NEXT:    [[TMP63:%.*]] = add i16 [[TMP62]], [[TILESTORE_SCALARIZE_COLS_IV]]
; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <256 x i32> [[TMP54]], i16 [[TMP63]]
; CHECK-NEXT:    store i32 [[TMP64]], i32* [[TMP61]], align 4
; CHECK-NEXT:    br label [[TILESTORE_SCALARIZE_COLS_LATCH]]
; CHECK:       tilestore.scalarize.cols.latch:
; CHECK-NEXT:    [[TILESTORE_SCALARIZE_COLS_STEP]] = add i16 [[TILESTORE_SCALARIZE_COLS_IV]], 1
; CHECK-NEXT:    [[TILESTORE_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILESTORE_SCALARIZE_COLS_STEP]], 4
; CHECK-NEXT:    br i1 [[TILESTORE_SCALARIZE_COLS_COND]], label [[TILESTORE_SCALARIZE_COLS_HEADER]], label [[TILESTORE_SCALARIZE_ROWS_LATCH]]
; CHECK:       tilestore.scalarize.rows.latch:
; CHECK-NEXT:    [[TILESTORE_SCALARIZE_ROWS_STEP]] = add i16 [[TILESTORE_SCALARIZE_ROWS_IV]], 1
; CHECK-NEXT:    [[TILESTORE_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILESTORE_SCALARIZE_ROWS_STEP]], 4
; CHECK-NEXT:    br i1 [[TILESTORE_SCALARIZE_ROWS_COND]], label [[TILESTORE_SCALARIZE_ROWS_HEADER]], label [[CONTINUE32:%.*]]
; CHECK:       continue32:
; CHECK-NEXT:    ret void
;
entry:
  %0 = bitcast i32* %C_mem to i8*
  %1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 4, i16 16, i8* %0, i64 16)
  %2 = bitcast i32* %A_mem to i8*
  %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 4, i16 16, i8* %2, i64 16)
  %4 = bitcast i32* %B_mem to i8*
  %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 4, i16 16, i8* %4, i64 16)
  %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 4, i16 16, i16 16, x86_amx %1, x86_amx %3, x86_amx %5)
  tail call void @llvm.x86.tilestored64.internal(i16 4, i16 16, i8* %0, i64 16, x86_amx %6)
  ret void
}

declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)

attributes #0 = { noinline nounwind optnone }