File: vector-loads.ll

; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}

; Even though general vector types are not supported in PTX, we can still
; optimize loads/stores with pseudo-vector instructions of the form:
;
; ld.v2.f32 {%f0, %f1}, [%r0]
;
; which will load two floats at once into scalar registers.
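;
; The tests below cover f32 and i32 vectors of width 2, 4, and 8 (vectors wider
; than 4 elements are split across multiple vector loads), plus f16 vectors
; whose alignment limits the width of the generated loads.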

; CHECK-LABEL: foo
define void @foo(ptr %a) {
; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}
  %t1 = load <2 x float>, ptr %a
  %t2 = fmul <2 x float> %t1, %t1
  store <2 x float> %t2, ptr %a
  ret void
}

; CHECK-LABEL: foo2
define void @foo2(ptr %a) {
; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
  %t1 = load <4 x float>, ptr %a
  %t2 = fmul <4 x float> %t1, %t1
  store <4 x float> %t2, ptr %a
  ret void
}

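; An <8 x float> access is wider than the widest PTX vector load (v4), so it is
; expected to be split into two ld.v4.f32 instructions.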
; CHECK-LABEL: foo3
define void @foo3(ptr %a) {
; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
; CHECK-NEXT: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
  %t1 = load <8 x float>, ptr %a
  %t2 = fmul <8 x float> %t1, %t1
  store <8 x float> %t2, ptr %a
  ret void
}



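; The same lowering applies to integer vectors, using ld.v2.u32 and ld.v4.u32.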
; CHECK-LABEL: foo4
define void @foo4(ptr %a) {
; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}
  %t1 = load <2 x i32>, ptr %a
  %t2 = mul <2 x i32> %t1, %t1
  store <2 x i32> %t2, ptr %a
  ret void
}

; CHECK-LABEL: foo5
define void @foo5(ptr %a) {
; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
  %t1 = load <4 x i32>, ptr %a
  %t2 = mul <4 x i32> %t1, %t1
  store <4 x i32> %t2, ptr %a
  ret void
}

; CHECK-LABEL: foo6
define void @foo6(ptr %a) {
; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
; CHECK-NEXT: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
  %t1 = load <8 x i32>, ptr %a
  %t2 = mul <8 x i32> %t1, %t1
  store <8 x i32> %t2, ptr %a
  ret void
}

; The following test previously failed because the address computation was
; still too complex when the LoadStoreVectorizer (LSV) was run.
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; CHECK-LABEL: foo_complex
define void @foo_complex(ptr nocapture readonly align 16 dereferenceable(134217728) %alloc0) {
  %t0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !1
  %t1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  %t2 = lshr i32 %t1, 8
  %t3 = shl nuw nsw i32 %t1, 9
  %ttile_origin.2 = and i32 %t3, 130560
  %tstart_offset_x_mul = shl nuw nsw i32 %t0, 1
  %t4 = or i32 %ttile_origin.2, %tstart_offset_x_mul
  %t6 = or i32 %t4, 1
  %t8 = or i32 %t4, 128
  %t9 = zext i32 %t8 to i64
  %t10 = or i32 %t4, 129
  %t11 = zext i32 %t10 to i64
  %t20 = zext i32 %t2 to i64
  %t27 = getelementptr inbounds [1024 x [131072 x i8]], ptr %alloc0, i64 0, i64 %t20, i64 %t9
; CHECK: ld.v2.u8
  %t28 = load i8, ptr %t27, align 2
  %t31 = getelementptr inbounds [1024 x [131072 x i8]], ptr %alloc0, i64 0, i64 %t20, i64 %t11
  %t32 = load i8, ptr %t31, align 1
  %t33 = icmp ult i8 %t28, %t32
  %t34 = select i1 %t33, i8 %t32, i8 %t28
  store i8 %t34, ptr %t31
; CHECK: ret
  ret void
}

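; The f16 tests below check that the known alignment of the <8 x half> load
; determines the vector width: 16-byte alignment allows v4.b16 loads, while
; 4-byte alignment only permits v2.b16 loads.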
; CHECK-LABEL: extv8f16_global_a16(
define void @extv8f16_global_a16(ptr addrspace(1) noalias readonly align 16 %dst, ptr addrspace(1) noalias readonly align 16 %src) #0 {
; CHECK: ld.global.v4.b16 {%f
; CHECK: ld.global.v4.b16 {%f
  %v = load <8 x half>, ptr addrspace(1) %src, align 16
  %ext = fpext <8 x half> %v to <8 x float>
; CHECK: st.global.v4.f32
; CHECK: st.global.v4.f32
  store <8 x float> %ext, ptr addrspace(1) %dst, align 16
  ret void
}

; CHECK-LABEL: extv8f16_global_a4(
define void @extv8f16_global_a4(ptr addrspace(1) noalias readonly align 16 %dst, ptr addrspace(1) noalias readonly align 16 %src) #0 {
; CHECK: ld.global.v2.b16 {%f
; CHECK: ld.global.v2.b16 {%f
; CHECK: ld.global.v2.b16 {%f
; CHECK: ld.global.v2.b16 {%f
  %v = load <8 x half>, ptr addrspace(1) %src, align 4
  %ext = fpext <8 x half> %v to <8 x float>
; CHECK: st.global.v4.f32
; CHECK: st.global.v4.f32
  store <8 x float> %ext, ptr addrspace(1) %dst, align 16
  ret void
}


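; The same alignment-driven lowering is expected in the generic address space,
; where the loads and stores drop the .global qualifier.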
; CHECK-LABEL: extv8f16_generic_a16(
define void @extv8f16_generic_a16(ptr noalias readonly align 16 %dst, ptr noalias readonly align 16 %src) #0 {
; CHECK: ld.v4.b16 {%f
; CHECK: ld.v4.b16 {%f
  %v = load <8 x half>, ptr %src, align 16
  %ext = fpext <8 x half> %v to <8 x float>
; CHECK: st.v4.f32
; CHECK: st.v4.f32
  store <8 x float> %ext, ptr %dst, align 16
  ret void
}

; CHECK-LABEL: extv8f16_generic_a4(
define void @extv8f16_generic_a4(ptr noalias readonly align 16 %dst, ptr noalias readonly align 16 %src) #0 {
; CHECK: ld.v2.b16 {%f
; CHECK: ld.v2.b16 {%f
; CHECK: ld.v2.b16 {%f
; CHECK: ld.v2.b16 {%f
  %v = load <8 x half>, ptr %src, align 4
  %ext = fpext <8 x half> %v to <8 x float>
; CHECK: st.v4.f32
; CHECK: st.v4.f32
  store <8 x float> %ext, ptr %dst, align 16
  ret void
}


!1 = !{i32 0, i32 64}