File: lds-dma-waits.ll

package info (click to toggle)
llvm-toolchain-19 1%3A19.1.7-3~deb12u1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm-proposed-updates
  • size: 1,998,492 kB
  • sloc: cpp: 6,951,680; ansic: 1,486,157; asm: 913,598; python: 232,024; f90: 80,126; objc: 75,281; lisp: 37,276; pascal: 16,990; sh: 10,009; ml: 5,058; perl: 4,724; awk: 3,523; makefile: 3,167; javascript: 2,504; xml: 892; fortran: 664; cs: 573
file content (154 lines) | stat: -rw-r--r-- 8,580 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GCN,GFX9
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck %s --check-prefixes=GCN,GFX10

@lds.0 = internal addrspace(3) global [64 x float] poison, align 16
@lds.1 = internal addrspace(3) global [64 x float] poison, align 16
@lds.2 = internal addrspace(3) global [64 x float] poison, align 16
@lds.3 = internal addrspace(3) global [64 x float] poison, align 16
@lds.4 = internal addrspace(3) global [64 x float] poison, align 16
@lds.5 = internal addrspace(3) global [64 x float] poison, align 16
@lds.6 = internal addrspace(3) global [64 x float] poison, align 16
@lds.7 = internal addrspace(3) global [64 x float] poison, align 16
@lds.8 = internal addrspace(3) global [64 x float] poison, align 16
@lds.9 = internal addrspace(3) global [64 x float] poison, align 16

declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)

; GCN-LABEL: {{^}}buffer_load_lds_dword_2_arrays:
; GCN-COUNT-4: buffer_load_dword
; GCN: s_waitcnt vmcnt(2)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(0)
; GCN: ds_read_b32
define amdgpu_kernel void @buffer_load_lds_dword_2_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
main_body:
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 4, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.1, i32 4, i32 8, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.1, i32 4, i32 12, i32 0, i32 0, i32 0)
  %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
  %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
  %val.0 = load float, ptr addrspace(3) %gep.0, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.1 = load float, ptr addrspace(3) %gep.1, align 4
  %tmp.0 = insertelement <2 x float> undef, float %val.0, i32 0
  %res = insertelement <2 x float> %tmp.0, float %val.1, i32 1
  store <2 x float> %res, ptr addrspace(1) %out
  ret void
}

; On gfx9 if there is a pending FLAT operation, and this is a VMem or LGKM
; waitcnt and the target can report early completion, then we need to force a waitcnt 0.

; GCN-LABEL: {{^}}global_load_lds_dword_2_arrays:
; GCN-COUNT-4: global_load_dword
; GFX9: s_waitcnt vmcnt(0)
; GFX9-COUNT-2: ds_read_b32
; GFX10: s_waitcnt vmcnt(2)
; GFX10: ds_read_b32
; GFX10: s_waitcnt vmcnt(0)
; GFX10: ds_read_b32
define amdgpu_kernel void @global_load_lds_dword_2_arrays(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
main_body:
  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 4, i32 0)
  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 8, i32 0)
  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 12, i32 0)
  %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
  %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
  %val.0 = load float, ptr addrspace(3) %gep.0, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.1 = load float, ptr addrspace(3) %gep.1, align 4
  %tmp.0 = insertelement <2 x float> undef, float %val.0, i32 0
  %res = insertelement <2 x float> %tmp.0, float %val.1, i32 1
  store <2 x float> %res, ptr addrspace(1) %out
  ret void
}

; There are 8 pseudo registers defined to track LDS DMA dependencies.
; When exhausted we default to vmcnt(0).

; GCN-LABEL: {{^}}buffer_load_lds_dword_10_arrays:
; GCN-COUNT-10: buffer_load_dword
; GCN: s_waitcnt vmcnt(8)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(7)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(6)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(5)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(4)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(3)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(2)
; GCN-NOT: s_waitcnt vmcnt
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(0)
; GCN: ds_read_b32
define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, ptr addrspace(1) %out) {
main_body:
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.1, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.2, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.3, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.4, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.5, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.6, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.7, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.8, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.9, i32 4, i32 0, i32 0, i32 0, i32 0)
  %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
  %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
  %gep.2 = getelementptr float, ptr addrspace(3) @lds.2, i32 %i2
  %gep.3 = getelementptr float, ptr addrspace(3) @lds.3, i32 %i2
  %gep.4 = getelementptr float, ptr addrspace(3) @lds.4, i32 %i2
  %gep.5 = getelementptr float, ptr addrspace(3) @lds.5, i32 %i2
  %gep.6 = getelementptr float, ptr addrspace(3) @lds.6, i32 %i2
  %gep.7 = getelementptr float, ptr addrspace(3) @lds.7, i32 %i2
  %gep.8 = getelementptr float, ptr addrspace(3) @lds.8, i32 %i2
  %gep.9 = getelementptr float, ptr addrspace(3) @lds.9, i32 %i2
  %val.0 = load float, ptr addrspace(3) %gep.0, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.1 = load float, ptr addrspace(3) %gep.1, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.2 = load float, ptr addrspace(3) %gep.2, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.3 = load float, ptr addrspace(3) %gep.3, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.4 = load float, ptr addrspace(3) %gep.4, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.5 = load float, ptr addrspace(3) %gep.5, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.6 = load float, ptr addrspace(3) %gep.6, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.7 = load float, ptr addrspace(3) %gep.7, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.8 = load float, ptr addrspace(3) %gep.8, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.9 = load float, ptr addrspace(3) %gep.9, align 4
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
  %out.gep.4 = getelementptr float, ptr addrspace(1) %out, i32 4
  %out.gep.5 = getelementptr float, ptr addrspace(1) %out, i32 5
  %out.gep.6 = getelementptr float, ptr addrspace(1) %out, i32 6
  %out.gep.7 = getelementptr float, ptr addrspace(1) %out, i32 7
  %out.gep.8 = getelementptr float, ptr addrspace(1) %out, i32 8
  %out.gep.9 = getelementptr float, ptr addrspace(1) %out, i32 9
  store float %val.0, ptr addrspace(1) %out
  store float %val.1, ptr addrspace(1) %out.gep.1
  store float %val.2, ptr addrspace(1) %out.gep.2
  store float %val.3, ptr addrspace(1) %out.gep.3
  store float %val.4, ptr addrspace(1) %out.gep.4
  store float %val.5, ptr addrspace(1) %out.gep.5
  store float %val.6, ptr addrspace(1) %out.gep.6
  store float %val.7, ptr addrspace(1) %out.gep.7
  store float %val.8, ptr addrspace(1) %out.gep.8
  store float %val.9, ptr addrspace(1) %out.gep.9
  ret void
}

declare void @llvm.amdgcn.wave.barrier()