1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148
|
; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes COMMON,IR,IRC
; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-nvcl | FileCheck %s --check-prefixes COMMON,IR,IRO
; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes COMMON,PTX,PTXC
; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-nvcl| FileCheck %s --check-prefixes COMMON,PTX,PTXO
; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %}
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
%class.outer = type <{ %class.inner, i32, [4 x i8] }>
%class.inner = type { ptr, ptr }
%class.padded = type { i8, i32 }
; Check that nvptx-lower-args preserves arg alignment
; COMMON-LABEL: load_alignment
define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %arg) {
entry:
; IR: call void @llvm.memcpy.p0.p101.i64(ptr align 8
; PTX: ld.param.u64
; PTX-NOT: ld.param.u8
%arg.idx.val = load ptr, ptr %arg, align 8
%arg.idx1 = getelementptr %class.outer, ptr %arg, i64 0, i32 0, i32 1
%arg.idx1.val = load ptr, ptr %arg.idx1, align 8
%arg.idx2 = getelementptr %class.outer, ptr %arg, i64 0, i32 1
%arg.idx2.val = load i32, ptr %arg.idx2, align 8
%arg.idx.val.val = load i32, ptr %arg.idx.val, align 4
%add.i = add nsw i32 %arg.idx.val.val, %arg.idx2.val
store i32 %add.i, ptr %arg.idx1.val, align 4
; let the pointer escape so we still create a local copy this test uses to
; check the load alignment.
%tmp = call ptr @escape(ptr nonnull %arg.idx2)
ret void
}
; Check that nvptx-lower-args copies padding as the struct may have been a union
; COMMON-LABEL: load_padding
define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) {
; PTX: {
; PTX-NEXT: .local .align 8 .b8 __local_depot1[8];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.u64 %SPL, __local_depot1;
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
; PTX-NEXT: ld.param.u64 %rd1, [load_padding_param_0];
; PTX-NEXT: st.u64 [%SP], %rd1;
; PTX-NEXT: add.u64 %rd2, %SP, 0;
; PTX-NEXT: { // callseq 1, 0
; PTX-NEXT: .param .b64 param0;
; PTX-NEXT: st.param.b64 [param0], %rd2;
; PTX-NEXT: .param .b64 retval0;
; PTX-NEXT: call.uni (retval0),
; PTX-NEXT: escape,
; PTX-NEXT: (
; PTX-NEXT: param0
; PTX-NEXT: );
; PTX-NEXT: ld.param.b64 %rd3, [retval0];
; PTX-NEXT: } // callseq 1
; PTX-NEXT: ret;
%tmp = call ptr @escape(ptr nonnull align 16 %arg)
ret void
}
; COMMON-LABEL: ptr_generic
define ptx_kernel void @ptr_generic(ptr %out, ptr %in) {
; IRC: %in3 = addrspacecast ptr %in to ptr addrspace(1)
; IRC: %in4 = addrspacecast ptr addrspace(1) %in3 to ptr
; IRC: %out1 = addrspacecast ptr %out to ptr addrspace(1)
; IRC: %out2 = addrspacecast ptr addrspace(1) %out1 to ptr
; PTXC: cvta.to.global.u64
; PTXC: cvta.to.global.u64
; PTXC: ld.global.u32
; PTXC: st.global.u32
; OpenCL can't make assumptions about incoming pointer, so we should generate
; generic pointers load/store.
; IRO-NOT: addrspacecast
; PTXO-NOT: cvta.to.global
; PTXO: ld.u32
; PTXO: st.u32
%v = load i32, ptr %in, align 4
store i32 %v, ptr %out, align 4
ret void
}
; COMMON-LABEL: ptr_nongeneric
define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(4) %in) {
; IR-NOT: addrspacecast
; PTX-NOT: cvta.to.global
; PTX: ld.const.u32
; PTX st.global.u32
%v = load i32, ptr addrspace(4) %in, align 4
store i32 %v, ptr addrspace(1) %out, align 4
ret void
}
; COMMON-LABEL: ptr_as_int
define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) {
; IR: [[P:%.*]] = inttoptr i64 %i to ptr
; IRC: [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
; IRC: addrspacecast ptr addrspace(1) [[P1]] to ptr
; IRO-NOT: addrspacecast
; PTXC-DAG: ld.param.u64 [[I:%rd.*]], [ptr_as_int_param_0];
; PTXC-DAG: ld.param.u32 [[V:%r.*]], [ptr_as_int_param_1];
; PTXC: cvta.to.global.u64 %[[P:rd.*]], [[I]];
; PTXC: st.global.u32 [%[[P]]], [[V]];
; PTXO-DAG: ld.param.u64 %[[P:rd.*]], [ptr_as_int_param_0];
; PTXO-DAG: ld.param.u32 [[V:%r.*]], [ptr_as_int_param_1];
; PTXO: st.u32 [%[[P]]], [[V]];
%p = inttoptr i64 %i to ptr
store i32 %v, ptr %p, align 4
ret void
}
%struct.S = type { i64 }
; COMMON-LABEL: ptr_as_int_aggr
define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%struct.S) align 8 %s, i32 noundef %v) {
; IR: [[S:%.*]] = addrspacecast ptr %s to ptr addrspace(101)
; IR: [[I:%.*]] = load i64, ptr addrspace(101) [[S]], align 8
; IR: [[P0:%.*]] = inttoptr i64 [[I]] to ptr
; IRC: [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
; IRC: [[P:%.*]] = addrspacecast ptr addrspace(1) [[P1]] to ptr
; IRO-NOT: addrspacecast
; PTXC-DAG: ld.param.u64 [[I:%rd.*]], [ptr_as_int_aggr_param_0];
; PTXC-DAG: ld.param.u32 [[V:%r.*]], [ptr_as_int_aggr_param_1];
; PTXC: cvta.to.global.u64 %[[P:rd.*]], [[I]];
; PTXC: st.global.u32 [%[[P]]], [[V]];
; PTXO-DAG: ld.param.u64 %[[P:rd.*]], [ptr_as_int_aggr_param_0];
; PTXO-DAG: ld.param.u32 [[V:%r.*]], [ptr_as_int_aggr_param_1];
; PTXO: st.u32 [%[[P]]], [[V]];
%i = load i64, ptr %s, align 8
%p = inttoptr i64 %i to ptr
store i32 %v, ptr %p, align 4
ret void
}
; Function Attrs: convergent nounwind
declare dso_local ptr @escape(ptr) local_unnamed_addr
|