File: amx-combine.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s

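; A tile-to-vector cast whose only use is a store is folded into a single
; tilestored64 of the tile value, eliminating the cast.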
define void @combine_store(ptr %p) {
; CHECK-LABEL: @combine_store(
; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P:%.*]], i64 64, x86_amx [[T1]])
; CHECK-NEXT:    ret void
;
  %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
  %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
  store <256 x i32> %t2, ptr %p, align 64
  ret void
}

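; When the cast result has a second user (here the return), the vector
; value is additionally materialized through a stack slot.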
define <256 x i32> @combine_store_2user(ptr %p) {
; CHECK-LABEL: @combine_store_2user(
; CHECK-NEXT:    [[TMP1:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[TMP1]], i64 64, x86_amx [[T1]])
; CHECK-NEXT:    [[TMP2:%.*]] = load <256 x i32>, ptr [[TMP1]], align 1024
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P:%.*]], i64 64, x86_amx [[T1]])
; CHECK-NEXT:    ret <256 x i32> [[TMP2]]
;
  %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
  %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
  store <256 x i32> %t2, ptr %p, align 64
  ret <256 x i32> %t2
}

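; A vector load whose only use is a vector-to-tile cast is folded into a
; single tileloadd64.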
define void @combine_load(ptr %p, ptr %p2) {
; CHECK-LABEL: @combine_load(
; CHECK-NEXT:    [[TMP1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr [[P:%.*]], i64 64)
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P2:%.*]], i64 64, x86_amx [[TMP1]])
; CHECK-NEXT:    ret void
;
  %t1 = load <256 x i32>, ptr %p, align 64
  %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t1)
  call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %p2, i64 64, x86_amx %t2)
  ret void
}

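; The fold still applies with an intervening store to the same pointer:
; the tileloadd64 is emitted at the original load's position, so it reads
; the value from before the store.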
define void @combine_cast_across_store(ptr %p, ptr %p2) {
; CHECK-LABEL: @combine_cast_across_store(
; CHECK-NEXT:    [[TMP1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr [[P:%.*]], i64 64)
; CHECK-NEXT:    store <256 x i32> zeroinitializer, ptr [[P]], align 64
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P2:%.*]], i64 64, x86_amx [[TMP1]])
; CHECK-NEXT:    ret void
;
  %t1 = load <256 x i32>, ptr %p, align 64
  store <256 x i32> zeroinitializer, ptr %p, align 64
  %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t1)
  call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %p2, i64 64, x86_amx %t2)
  ret void
}

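; With a second user of the cast, the loaded vector is spilled to a stack
; slot and reloaded from there as a tile.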
define <256 x i32> @combine_load_2user(ptr %p, ptr %p2) {
; CHECK-LABEL: @combine_load_2user(
; CHECK-NEXT:    [[TMP1:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT:    [[T1:%.*]] = load <256 x i32>, ptr [[P:%.*]], align 64
; CHECK-NEXT:    store <256 x i32> [[T1]], ptr [[TMP1]], align 1024
; CHECK-NEXT:    [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr [[TMP1]], i64 64)
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P2:%.*]], i64 64, x86_amx [[TMP2]])
; CHECK-NEXT:    ret <256 x i32> [[T1]]
;
  %t1 = load <256 x i32>, ptr %p, align 64
  %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t1)
  call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %p2, i64 64, x86_amx %t2)
  %t3 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t2)
  ret <256 x i32> %t3
}

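; As above, but the tile also feeds a tdpbssd; note that the reload's
; shape and stride differ from the 2-user case.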
define <256 x i32> @combine_load_3user(ptr %p, ptr %p2) {
; CHECK-LABEL: @combine_load_3user(
; CHECK-NEXT:    [[TMP1:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT:    [[T1:%.*]] = load <256 x i32>, ptr [[P:%.*]], align 64
; CHECK-NEXT:    store <256 x i32> [[T1]], ptr [[TMP1]], align 1024
; CHECK-NEXT:    [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 16, ptr [[TMP1]], i64 16)
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P2:%.*]], i64 64, x86_amx [[TMP2]])
; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 16, i16 16, i16 64, x86_amx [[TMP2]], x86_amx [[TMP2]], x86_amx [[TMP2]])
; CHECK-NEXT:    ret <256 x i32> [[T1]]
;
  %t1 = load <256 x i32>, ptr %p, align 64
  %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t1)
  call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %p2, i64 64, x86_amx %t2)
  %t3 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t2)
  call x86_amx @llvm.x86.tdpbssd.internal(i16 16, i16 16, i16 64, x86_amx %t2, x86_amx %t2, x86_amx %t2)
  ret <256 x i32> %t3
}

; The shape of tile b depends on %a.col, which is only loaded after the
; tile data itself, so the vector is staged through a stack slot and
; reloaded as a tile once the shape is available.
%struct.__tile1024i_str = type <{ i16, i16, [60 x i8], <256 x i32> }>
define void @test_tile_dpbssd(ptr byval(%struct.__tile1024i_str) align 64 %a, ptr byval(%struct.__tile1024i_str) align 64 %b, ptr byval(%struct.__tile1024i_str) align 64 %c) {
; CHECK-LABEL: @test_tile_dpbssd(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT:    [[B_ROW_PTR:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 2
; CHECK-NEXT:    [[B_ROW:%.*]] = load i16, ptr [[B_ROW_PTR]], align 2
; CHECK-NEXT:    [[B_TILE_PTR:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 64
; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[B_ROW]] to i64
; CHECK-NEXT:    [[B_TILE:%.*]] = load <256 x i32>, ptr [[B_TILE_PTR]], align 64
; CHECK-NEXT:    store <256 x i32> [[B_TILE]], ptr [[TMP0]], align 1024
; CHECK-NEXT:    [[A_ROW:%.*]] = load i16, ptr [[A:%.*]], align 64
; CHECK-NEXT:    [[A_COL_PTR:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2
; CHECK-NEXT:    [[A_COL:%.*]] = load i16, ptr [[A_COL_PTR]], align 2
; CHECK-NEXT:    [[TMP2:%.*]] = udiv i16 [[A_COL]], 4
; CHECK-NEXT:    [[A_TILE_PTR:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 64
; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[A_COL]] to i64
; CHECK-NEXT:    [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[A_ROW]], i16 [[A_COL]], ptr [[A_TILE_PTR]], i64 [[TMP3]])
; CHECK-NEXT:    [[C_TILE_PTR:%.*]] = getelementptr inbounds [[STRUCT___TILE1024I_STR:%.*]], ptr [[C:%.*]], i64 0, i32 3
; CHECK-NEXT:    [[TMP5:%.*]] = sext i16 [[B_ROW]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[A_ROW]], i16 [[B_ROW]], ptr [[C_TILE_PTR]], i64 [[TMP5]])
; CHECK-NEXT:    [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP2]], i16 [[B_ROW]], ptr [[TMP0]], i64 [[TMP1]])
; CHECK-NEXT:    [[RES:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[A_ROW]], i16 [[B_ROW]], i16 [[A_COL]], x86_amx [[TMP6]], x86_amx [[TMP4]], x86_amx [[TMP7]])
; CHECK-NEXT:    ret void
;
entry:
  %b.row.ptr = getelementptr inbounds i8, ptr %b, i64 2
  %b.row = load i16, ptr %b.row.ptr, align 2
  %b.tile.ptr = getelementptr inbounds i8, ptr %b, i64 64
  %b.tile = load <256 x i32>, ptr %b.tile.ptr, align 64
  %a.row = load i16, ptr %a, align 64
  %a.col.ptr = getelementptr inbounds i8, ptr %a, i64 2
  %a.col = load i16, ptr %a.col.ptr, align 2
  %a.tile.ptr = getelementptr inbounds i8, ptr %a, i64 64
  %a.tile = load <256 x i32>, ptr %a.tile.ptr, align 64
  %c.tile.ptr = getelementptr inbounds %struct.__tile1024i_str, ptr %c, i64 0, i32 3
  %c.tile = load <256 x i32>, ptr %c.tile.ptr, align 64
  %c.amx = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %c.tile)
  %a.amx = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %a.tile)
  %b.amx = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %b.tile)
  %res = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %a.row, i16 %b.row, i16 %a.col, x86_amx %c.amx, x86_amx %a.amx, x86_amx %b.amx)
  ret void
}

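; The store fold also applies to the <256 x i8> cast variant.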
define void @combine_v256i8amcast_with_store(ptr %src_ptr, ptr %dst_ptr) {
; CHECK-LABEL: @combine_v256i8amcast_with_store(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TILE:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 32, ptr [[SRC_PTR:%.*]], i64 64)
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[DST_PTR:%.*]], i64 32, x86_amx [[TILE]])
; CHECK-NEXT:    ret void
;
entry:
  %tile = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 32, ptr %src_ptr, i64 64)
  %vec = call <256 x i8> @llvm.x86.cast.tile.to.vector.v256i8(x86_amx %tile)
  store <256 x i8> %vec, ptr %dst_ptr, align 256
  ret void
}

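; The load fold also applies to the <256 x i8> cast variant.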
define void @combine_v256i8amcast_with_load(ptr %src_ptr, ptr %dst_ptr) {
; CHECK-LABEL: @combine_v256i8amcast_with_load(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 32, ptr [[SRC_PTR:%.*]], i64 32)
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[DST_PTR:%.*]], i64 32, x86_amx [[TMP0]])
; CHECK-NEXT:    ret void
;
entry:
  %vec = load <256 x i8>, ptr %src_ptr, align 256
  %tile = call x86_amx @llvm.x86.cast.vector.to.tile.v256i8(<256 x i8> %vec)
  call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr %dst_ptr, i64 32, x86_amx %tile)
  ret void
}

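; Declarations for the AMX cast and tile intrinsics exercised above.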
declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>)
declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx)
declare x86_amx @llvm.x86.cast.vector.to.tile.v256i8(<256 x i8>)
declare <256 x i8> @llvm.x86.cast.tile.to.vector.v256i8(x86_amx)
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)