File: amdgpu-kernel-arg-pointer-type.cu

// REQUIRES: x86-registered-target
// REQUIRES: amdgpu-registered-target

// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -o - | FileCheck --check-prefixes=COMMON,CHECK %s
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -disable-O0-optnone -o - | opt -S -O2 | FileCheck %s --check-prefixes=COMMON,OPT
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -x hip %s -o - | FileCheck -check-prefix=HOST %s

#include "Inputs/cuda.h"

// The struct coerced from `struct S` is generated without all of its generic
// pointers being lowered into global ones.

// In the host-side compilation, generic pointers are not coerced.
// HOST-NOT: %struct.S.coerce
// HOST-NOT: %struct.T.coerce

// HOST: define{{.*}} void @_Z22__device_stub__kernel1Pi(ptr noundef %x)
// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel1Pi(ptr addrspace(1){{.*}} %x.coerce)
// CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
// OPT: [[VAL:%.*]] = load i32, ptr addrspace(1) %x.coerce, align 4{{$}}
// OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1
// OPT: store i32 [[INC]], ptr addrspace(1) %x.coerce, align 4
// OPT: ret void
__global__ void kernel1(int *x) {
  x[0]++;
}
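
// For illustration only (a sketch, not part of the checked test; `hipMalloc`
// is a HIP runtime call that the minimal Inputs/cuda.h stub used here does
// not declare): a kernel argument such as `x` can only point into
// device-visible memory, e.g.
//
//   int *d_x;
//   hipMalloc((void **)&d_x, sizeof(int));
//   kernel1<<<1, 1>>>(d_x);
//
// which is why the compiler may coerce the generic `int *` parameter to a
// global pointer (`ptr addrspace(1)`).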

// HOST: define{{.*}} void @_Z22__device_stub__kernel2Ri(ptr noundef nonnull align 4 dereferenceable(4) %x)
// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel2Ri(ptr addrspace(1){{.*}} nonnull align 4 dereferenceable(4) %x.coerce)
// CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
// OPT: [[VAL:%.*]] = load i32, ptr addrspace(1) %x.coerce, align 4{{$}}
// OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1
// OPT: store i32 [[INC]], ptr addrspace(1) %x.coerce, align 4
// OPT: ret void
__global__ void kernel2(int &x) {
  x++;
}
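
// A reference parameter lowers to a pointer, so it is coerced to a global
// pointer in the same way; the `nonnull align 4 dereferenceable(4)`
// attributes reflect the reference semantics and are preserved across the
// coercion.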

// HOST: define{{.*}} void @_Z22__device_stub__kernel3PU3AS2iPU3AS1i(ptr addrspace(2) noundef %x, ptr addrspace(1) noundef %y)
// CHECK-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(ptr addrspace(2){{.*}} %x, ptr addrspace(1){{.*}} %y)
// CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
__global__ void kernel3(__attribute__((address_space(2))) int *x,
                        __attribute__((address_space(1))) int *y) {
  y[0] = x[0];
}
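
// Pointers that already carry an explicit address space are passed through
// unchanged (on AMDGPU, addrspace(1) is the global address space), so no
// coercion and no addrspacecast back to the generic space is expected here.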

// COMMON-LABEL: define{{.*}} void @_Z4funcPi(ptr{{.*}} %x)
// CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
__device__ void func(int *x) {
  x[0]++;
}
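
// Ordinary `__device__` functions are not kernel entry points: they may be
// called with pointers into any address space, so the generic `int *`
// parameter is kept as-is and no coercion takes place.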

struct S {
  int *x;
  float *y;
};
// A by-val struct is passed by indirect alias (a mix of by-ref and indirect
// by-val). However, the enhanced address-space inference pass should still be
// able to assume that the pointers it contains are global ones.
//
// HOST: define{{.*}} void @_Z22__device_stub__kernel41S(ptr %s.coerce0, ptr %s.coerce1)
// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel41S(ptr addrspace(4){{.*}} byref(%struct.S) align 8 %0)
// OPT: [[P0:%.*]] = load ptr, ptr addrspace(4) %0, align 8
// OPT: [[G0:%.*]] ={{.*}} addrspacecast ptr [[P0]] to ptr addrspace(1)
// OPT: [[R1:%.*]] = getelementptr inbounds i8, ptr addrspace(4) %0, i64 8
// OPT: [[P1:%.*]] = load ptr, ptr addrspace(4) [[R1]], align 8
// OPT: [[G1:%.*]] ={{.*}} addrspacecast ptr [[P1]] to ptr addrspace(1)
// OPT: [[V0:%.*]] = load i32, ptr addrspace(1) [[G0]], align 4, !amdgpu.noclobber ![[MD:[0-9]+]]
// OPT: [[INC:%.*]] = add nsw i32 [[V0]], 1
// OPT: store i32 [[INC]], ptr addrspace(1) [[G0]], align 4
// OPT: [[V1:%.*]] = load float, ptr addrspace(1) [[G1]], align 4
// OPT: [[ADD:%.*]] = fadd contract float [[V1]], 1.000000e+00
// OPT: store float [[ADD]], ptr addrspace(1) [[G1]], align 4
// OPT: ret void
__global__ void kernel4(struct S s) {
  s.x[0]++;
  s.y[0] += 1.f;
}
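
// For illustration only (a sketch; the HIP runtime is not available in this
// test): the aggregate is passed by value through the kernarg segment, which
// on AMDGPU lives in the constant address space (addrspace(4)), hence the
// `byref(%struct.S)` argument above. Its member pointers must still refer to
// device-visible memory, e.g.
//
//   S s;
//   hipMalloc((void **)&s.x, sizeof(int));
//   hipMalloc((void **)&s.y, sizeof(float));
//   kernel4<<<1, 1>>>(s);
//
// so the optimizer can still infer that the pointers loaded from the struct
// are global.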

// If a pointer to a struct is passed, only the pointer itself is coerced into a global one.
// HOST: define{{.*}} void @_Z22__device_stub__kernel5P1S(ptr noundef %s)
// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel5P1S(ptr addrspace(1){{.*}} %s.coerce)
__global__ void kernel5(struct S *s) {
  s->x[0]++;
  s->y[0] += 1.f;
}
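
// Note that only the argument `s` itself is known to be global; the pointers
// loaded from it (`s->x`, `s->y`) are runtime values and remain generic
// pointers in the IR.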

struct T {
  float *x[2];
};
// A by-val array is passed by indirect alias (a mix of by-ref and indirect
// by-val). However, the enhanced address-space inference pass should still be
// able to assume that the pointers it contains are global ones.
//
// HOST: define{{.*}} void @_Z22__device_stub__kernel61T(ptr %t.coerce0, ptr %t.coerce1)
// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel61T(ptr addrspace(4){{.*}} byref(%struct.T) align 8 %0)
// OPT: [[P0:%.*]] = load ptr, ptr addrspace(4) %0, align 8
// OPT: [[G0:%.*]] ={{.*}} addrspacecast ptr [[P0]] to ptr addrspace(1)
// OPT: [[R1:%.*]] = getelementptr inbounds i8, ptr addrspace(4) %0, i64 8
// OPT: [[P1:%.*]] = load ptr, ptr addrspace(4) [[R1]], align 8
// OPT: [[G1:%.*]] ={{.*}} addrspacecast ptr [[P1]] to ptr addrspace(1)
// OPT: [[V0:%.*]] = load float, ptr addrspace(1) [[G0]], align 4, !amdgpu.noclobber ![[MD]]
// OPT: [[ADD0:%.*]] = fadd contract float [[V0]], 1.000000e+00
// OPT: store float [[ADD0]], ptr addrspace(1) [[G0]], align 4
// OPT: [[V1:%.*]] = load float, ptr addrspace(1) [[G1]], align 4
// OPT: [[ADD1:%.*]] = fadd contract float [[V1]], 2.000000e+00
// OPT: store float [[ADD1]], ptr addrspace(1) [[G1]], align 4
// OPT: ret void
__global__ void kernel6(struct T t) {
  t.x[0][0] += 1.f;
  t.x[1][0] += 2.f;
}

// Check that coerced pointers retain the noalias attribute when qualified with __restrict.
// HOST: define{{.*}} void @_Z22__device_stub__kernel7Pi(ptr noalias noundef %x)
// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel7Pi(ptr addrspace(1) noalias{{.*}} %x.coerce)
__global__ void kernel7(int *__restrict x) {
  x[0]++;
}
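
// `__restrict` lowers to LLVM's `noalias` parameter attribute, and the
// coercion to a global pointer must carry it over so that later
// alias-analysis-based optimizations still apply to accesses through `x`.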

// Single-element struct.
struct SS {
  float *x;
};
// HOST: define{{.*}} void @_Z22__device_stub__kernel82SS(ptr %a.coerce)
// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel82SS(ptr addrspace(1){{.*}} %a.coerce)
// CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
// OPT: [[VAL:%.*]] = load float, ptr addrspace(1) %a.coerce, align 4{{$}}
// OPT: [[INC:%.*]] = fadd contract float [[VAL]], 3.000000e+00
// OPT: store float [[INC]], ptr addrspace(1) %a.coerce, align 4
// OPT: ret void
__global__ void kernel8(struct SS a) {
  *a.x += 3.f;
}
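
// As the CHECK lines above show, a single-element struct is flattened by the
// argument-lowering code, so its lone pointer member is coerced to a global
// pointer directly, just like the plain pointer argument of kernel1.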