File: amx-combine.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s
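; This file tests the X86 lower-amx-type IR pass: plain <256 x i32> loads and
; stores that feed (or consume) the llvm.x86.cast.vector.to.tile /
; llvm.x86.cast.tile.to.vector intrinsics are expected to be combined into
; llvm.x86.tileloadd64.internal / llvm.x86.tilestored64.internal calls, falling
; back to an alloca-backed spill when the vector value has additional users.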

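; A tilezero result that is cast to a vector and stored is folded into a
; single tilestored64 to the destination pointer.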
define void @combine_store(ptr%p) {
; CHECK-LABEL: @combine_store(
; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P:%.*]], i64 64, x86_amx [[T1]])
; CHECK-NEXT:    ret void
;
  %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
  %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
  store <256 x i32> %t2, ptr %p, align 64
  ret void
}

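; The cast result has a second user (it is also returned), so the vector value
; is spilled to an alloca and reloaded; the store to %p still becomes a
; tilestored64.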
define <256 x i32> @combine_store_2user(ptr%p) {
; CHECK-LABEL: @combine_store_2user(
; CHECK-NEXT:    [[TMP1:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[TMP1]], i64 64, x86_amx [[T1]])
; CHECK-NEXT:    [[TMP2:%.*]] = load <256 x i32>, ptr [[TMP1]], align 1024
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P:%.*]], i64 64, x86_amx [[T1]])
; CHECK-NEXT:    ret <256 x i32> [[TMP2]]
;
  %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
  %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
  store <256 x i32> %t2, ptr %p, align 64
  ret <256 x i32> %t2
}

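; A vector load whose only use is a cast to a tile is folded into a
; tileloadd64 from the source pointer.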
define void @combine_load(ptr%p, ptr%p2) {
; CHECK-LABEL: @combine_load(
; CHECK-NEXT:    [[TMP1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr [[P:%.*]], i64 64)
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P2:%.*]], i64 64, x86_amx [[TMP1]])
; CHECK-NEXT:    ret void
;
  %t1 = load <256 x i32>, ptr %p, align 64
  %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t1)
  call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %p2, i64 64, x86_amx %t2)
  ret void
}

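; The load is still combined into a tileloadd64 even though a store to the
; same pointer sits between the load and the cast.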
define void @combine_cast_across_store(ptr%p, ptr%p2) {
; CHECK-LABEL: @combine_cast_across_store(
; CHECK-NEXT:    [[TMP1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr [[P:%.*]], i64 64)
; CHECK-NEXT:    store <256 x i32> zeroinitializer, ptr [[P]], align 64
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P2:%.*]], i64 64, x86_amx [[TMP1]])
; CHECK-NEXT:    ret void
;
  %t1 = load <256 x i32>, ptr %p, align 64
  store <256 x i32> zeroinitializer, ptr %p, align 64
  %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t1)
  call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %p2, i64 64, x86_amx %t2)
  ret void
}

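; The loaded vector has a second user (it is returned), so it is kept and
; spilled to an alloca, and the tile is re-materialised with a tileloadd64
; from the spill slot.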
define <256 x i32> @combine_load_2user(ptr%p, ptr%p2) {
; CHECK-LABEL: @combine_load_2user(
; CHECK-NEXT:    [[TMP1:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT:    [[T1:%.*]] = load <256 x i32>, ptr [[P:%.*]], align 64
; CHECK-NEXT:    store <256 x i32> [[T1]], ptr [[TMP1]], align 1024
; CHECK-NEXT:    [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr [[TMP1]], i64 64)
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P2:%.*]], i64 64, x86_amx [[TMP2]])
; CHECK-NEXT:    ret <256 x i32> [[T1]]
;
  %t1 = load <256 x i32>, ptr %p, align 64
  %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t1)
  call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %p2, i64 64, x86_amx %t2)
  %t3 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t2)
  ret <256 x i32> %t3
}

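; As above, but the tile also feeds a tdpbssd; note the reload from the alloca
; uses a 16x16 shape while the tilestored64 keeps the 16x64 shape.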
define <256 x i32> @combine_load_3user(ptr%p, ptr%p2) {
; CHECK-LABEL: @combine_load_3user(
; CHECK-NEXT:    [[TMP1:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT:    [[T1:%.*]] = load <256 x i32>, ptr [[P:%.*]], align 64
; CHECK-NEXT:    store <256 x i32> [[T1]], ptr [[TMP1]], align 1024
; CHECK-NEXT:    [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 16, ptr [[TMP1]], i64 16)
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P2:%.*]], i64 64, x86_amx [[TMP2]])
; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 16, i16 16, i16 64, x86_amx [[TMP2]], x86_amx [[TMP2]], x86_amx [[TMP2]])
; CHECK-NEXT:    ret <256 x i32> [[T1]]
;
  %t1 = load <256 x i32>, ptr %p, align 64
  %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t1)
  call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %p2, i64 64, x86_amx %t2)
  %t3 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t2)
  call x86_amx @llvm.x86.tdpbssd.internal(i16 16, i16 16, i16 64, x86_amx %t2, x86_amx %t2, x86_amx %t2)
  ret <256 x i32> %t3
}

; The shape is loaded after the tile data.
%struct.__tile1024i_str = type <{ i16, i16, [60 x i8], <256 x i32> }>
define void @test_tile_dpbssd(ptr byval(%struct.__tile1024i_str) align 64 %a, ptr byval(%struct.__tile1024i_str) align 64 %b, ptr byval(%struct.__tile1024i_str) align 64 %c) {
; CHECK-LABEL: @test_tile_dpbssd(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT:    [[B_ROW_PTR:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 2
; CHECK-NEXT:    [[B_ROW:%.*]] = load i16, ptr [[B_ROW_PTR]], align 2
; CHECK-NEXT:    [[B_TILE_PTR:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 64
; CHECK-NEXT:    [[B_TILE:%.*]] = load <256 x i32>, ptr [[B_TILE_PTR]], align 64
; CHECK-NEXT:    store <256 x i32> [[B_TILE]], ptr [[TMP0]], align 1024
; CHECK-NEXT:    [[A_ROW:%.*]] = load i16, ptr [[A:%.*]], align 64
; CHECK-NEXT:    [[A_COL_PTR:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2
; CHECK-NEXT:    [[A_COL:%.*]] = load i16, ptr [[A_COL_PTR]], align 2
; CHECK-NEXT:    [[TMP1:%.*]] = udiv i16 [[A_COL]], 4
; CHECK-NEXT:    [[A_TILE_PTR:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 64
; CHECK-NEXT:    [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[A_ROW]], i16 [[A_COL]], ptr [[A_TILE_PTR]], i64 64)
; CHECK-NEXT:    [[C_TILE_PTR:%.*]] = getelementptr inbounds [[STRUCT___TILE1024I_STR:%.*]], ptr [[C:%.*]], i64 0, i32 3
; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[A_ROW]], i16 [[B_ROW]], ptr [[C_TILE_PTR]], i64 64)
; CHECK-NEXT:    [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[B_ROW]], ptr [[TMP0]], i64 64)
; CHECK-NEXT:    [[RES:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[A_ROW]], i16 [[B_ROW]], i16 [[A_COL]], x86_amx [[TMP3]], x86_amx [[TMP2]], x86_amx [[TMP4]])
; CHECK-NEXT:    ret void
;
entry:
  %b.row.ptr= getelementptr inbounds i8, ptr %b, i64 2
  %b.row = load i16, ptr %b.row.ptr, align 2
  %b.tile.ptr = getelementptr inbounds i8, ptr %b, i64 64
  %b.tile = load <256 x i32>, ptr %b.tile.ptr, align 64
  %a.row = load i16, ptr %a, align 64
  %a.col.ptr = getelementptr inbounds i8, ptr %a, i64 2
  %a.col = load i16, ptr %a.col.ptr, align 2
  %a.tile.ptr = getelementptr inbounds i8, ptr %a, i64 64
  %a.tile = load <256 x i32>, ptr %a.tile.ptr, align 64
  %c.tile.ptr = getelementptr inbounds %struct.__tile1024i_str, ptr %c, i64 0, i32 3
  %c.tile = load <256 x i32>, ptr %c.tile.ptr, align 64
  %c.amx = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %c.tile)
  %a.amx = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %a.tile)
  %b.amx = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %b.tile)
  %res = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %a.row, i16 %b.row, i16 %a.col, x86_amx %c.amx, x86_amx %a.amx, x86_amx %b.amx)
  ret void
}

declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>)
declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx)
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)