File: load-vector-private.ll

package info (click to toggle)
intel-graphics-compiler 1.0.17791.18-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 102,312 kB
  • sloc: cpp: 935,343; lisp: 286,143; ansic: 16,196; python: 3,279; yacc: 2,487; lex: 1,642; pascal: 300; sh: 174; makefile: 27
file content (242 lines) | stat: -rw-r--r-- 16,450 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
;=========================== begin_copyright_notice ============================
;
; Copyright (C) 2023 Intel Corporation
;
; SPDX-License-Identifier: MIT
;
;============================ end_copyright_notice =============================

;
; RUN: %opt %use_old_pass_manager% -enable-debugify -GenXLoadStoreLowering -march=genx64 -mcpu=Gen9 -mtriple=spir64-unknown-unknown -enable-ldst-lowering=true -mattr=+ocl_runtime -S < %s 2>&1 | FileCheck %s
; RUN: %opt %use_old_pass_manager% -enable-debugify -GenXLoadStoreLowering -march=genx64 -mcpu=XeHPC -mtriple=spir64-unknown-unknown -enable-ldst-lowering=true -mattr=+ocl_runtime -S < %s 2>&1 | FileCheck --check-prefix=CHECK-LSC %s
;
; COM: The two invocations above differ only in -mcpu and in the check prefix:
; COM: the Gen9 output is matched against the default prefix, the XeHPC output
; COM: against the LSC prefix. Both pass -enable-debugify and merge stderr into
; COM: stdout (2>&1), which is what lets the debugify result line be matched
; COM: below.
; COM: NOTE(review): the per-function directives further down are plain ordered
; COM: matches with no per-function label anchors, so a match is not pinned to
; COM: the function it is written in.
; CHECK-NOT: WARNING
; CHECK: CheckModuleDebugify: PASS
; CHECK-LSC-NOT: WARNING
; CHECK-LSC: CheckModuleDebugify: PASS

; COM: Basic test on load lowering pass
; COM: simplest vector load from addrspace(0)

; COM: Every pointer parameter in this file is written without an addrspace
; COM: qualifier, and pointers are declared 64 bits wide (p:64:64) below.
target datalayout = "e-p:64:64-i64:64-n8:16:32:64"
target triple = "genx64-unknown-unknown"

; Address space 0 (private) operations are lowered into svm/ugm intrinsics

define void @replace_load_i8_block(<16 x i8>* %pi8) {
  ; COM: <16 x i8> load written without an explicit alignment. %loaded is never
  ; COM: used; only the lowering of the load itself is inspected.
  ; COM: Default prefix: a call to llvm.genx.svm.block.ld returning <16 x i8>.
  ; COM: LSC prefix: a call to llvm.vc.internal.lsc.load.ugm returning <2 x i64>
  ; COM: with a single-lane predicate and a scalar i64 address.
  ; COM: Both expectations are identical to @replace_load_i8_block_owalign.
  ; CHECK: call <16 x i8> @llvm.genx.svm.block.ld.v16i8.i64(i64 %{{[0-9a-zA-Z.]+}})
  ; CHECK-LSC: call <2 x i64> @llvm.vc.internal.lsc.load.ugm.v2i64.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 4, i8 2, <2 x i8> zeroinitializer, i64 0, i64 %{{[0-9a-zA-Z.]+}}, i16 1, i32 0, <2 x i64> undef)
  %loaded = load <16 x i8>, <16 x i8>* %pi8
  ret void
}

define void @replace_load_i16_block(<16 x i16>* %pi16) {
  ; COM: <16 x i16> load written without an explicit alignment; the result is
  ; COM: unused.
  ; COM: Default prefix: llvm.genx.svm.block.ld returning <16 x i16>.
  ; COM: LSC prefix: llvm.vc.internal.lsc.load.ugm returning <4 x i64> (the same
  ; COM: 32 bytes expressed as i64 elements).
  ; COM: Both expectations are identical to @replace_load_i16_block_owalign.
  ; CHECK: call <16 x i16> @llvm.genx.svm.block.ld.v16i16.i64(i64 %{{[0-9a-zA-Z.]+}})
  ; CHECK-LSC: call <4 x i64> @llvm.vc.internal.lsc.load.ugm.v4i64.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 4, i8 4, <2 x i8> zeroinitializer, i64 0, i64 %{{[0-9a-zA-Z.]+}}, i16 1, i32 0, <4 x i64> undef)
  %loaded = load <16 x i16>, <16 x i16>* %pi16
  ret void
}

define void @replace_load_i32_block(<16 x i32>* %pi32) {
  ; COM: <16 x i32> load written without an explicit alignment; the result is
  ; COM: unused.
  ; COM: Default prefix: llvm.genx.svm.block.ld returning <16 x i32>.
  ; COM: LSC prefix: llvm.vc.internal.lsc.load.ugm returning <8 x i64> (the same
  ; COM: 64 bytes expressed as i64 elements).
  ; COM: Both expectations are identical to @replace_load_i32_block_owalign.
  ; CHECK: call <16 x i32> @llvm.genx.svm.block.ld.v16i32.i64(i64 %{{[0-9a-zA-Z.]+}})
  ; CHECK-LSC: call <8 x i64> @llvm.vc.internal.lsc.load.ugm.v8i64.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 4, i8 5, <2 x i8> zeroinitializer, i64 0, i64 %{{[0-9a-zA-Z.]+}}, i16 1, i32 0, <8 x i64> undef)
  %loaded = load <16 x i32>, <16 x i32>* %pi32
  ret void
}

define void @replace_load_i64_block(<16 x i64>* %pi64) {
  ; COM: <16 x i64> load written without an explicit alignment; the result is
  ; COM: unused.
  ; COM: Default prefix: llvm.genx.svm.block.ld returning <16 x i64>.
  ; COM: LSC prefix: llvm.vc.internal.lsc.load.ugm returning <16 x i64>.
  ; COM: Both expectations are identical to @replace_load_i64_block_owalign.
  ; CHECK: call <16 x i64> @llvm.genx.svm.block.ld.v16i64.i64(i64 %{{[0-9a-zA-Z.]+}})
  ; CHECK-LSC: call <16 x i64> @llvm.vc.internal.lsc.load.ugm.v16i64.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 4, i8 6, <2 x i8> zeroinitializer, i64 0, i64 %{{[0-9a-zA-Z.]+}}, i16 1, i32 0, <16 x i64> undef)
  %loaded = load <16 x i64>, <16 x i64>* %pi64
  ret void
}

define void @replace_load_i8_block_unaligned(<16 x i8>* %pi8) {
  ; COM: <16 x i8> load with align 1; the result is unused.
  ; COM: Instead of a block load, both prefixes expect a 16-lane form (v16i1
  ; COM: predicate, v16i64 addresses): llvm.genx.svm.gather returning <64 x i8>
  ; COM: for the default prefix, llvm.vc.internal.lsc.load.ugm returning
  ; COM: <16 x i32> for the LSC prefix.
  ; COM: Only the callee name and its type mangling are matched here; the call
  ; COM: operands are not checked.
  ; CHECK: call <64 x i8> @llvm.genx.svm.gather.v64i8.v16i1.v16i64
  ; CHECK-LSC: call <16 x i32> @llvm.vc.internal.lsc.load.ugm.v16i32.v16i1.v2i8.v16i64
  %loaded = load <16 x i8>, <16 x i8>* %pi8, align 1
  ret void
}

define void @replace_load_i16_block_unaligned(<16 x i16>* %pi16) {
  ; COM: <16 x i16> load with align 1; the result is unused.
  ; COM: The expected callees are textually the same as in the i8 align-1 test:
  ; COM: llvm.genx.svm.gather returning <64 x i8> (default prefix) and
  ; COM: llvm.vc.internal.lsc.load.ugm returning <16 x i32> (LSC prefix), both
  ; COM: with a v16i1 predicate and v16i64 addresses.
  ; COM: Only the callee name and its type mangling are matched; operands are
  ; COM: not checked.
  ; CHECK: call <64 x i8> @llvm.genx.svm.gather.v64i8.v16i1.v16i64
  ; CHECK-LSC: call <16 x i32> @llvm.vc.internal.lsc.load.ugm.v16i32.v16i1.v2i8.v16i64
  %loaded = load <16 x i16>, <16 x i16>* %pi16, align 1
  ret void
}

define void @replace_load_i32_block_unaligned(<16 x i32>* %pi32) {
  ; COM: <16 x i32> load with align 1; the result is unused.
  ; COM: Default prefix: llvm.genx.svm.gather returning <16 x i32>.
  ; COM: LSC prefix: llvm.vc.internal.lsc.load.ugm returning <16 x i32>.
  ; COM: Both use a v16i1 predicate and v16i64 addresses. Only the callee name
  ; COM: and its type mangling are matched; operands are not checked.
  ; CHECK: call <16 x i32> @llvm.genx.svm.gather.v16i32.v16i1.v16i64
  ; CHECK-LSC: call <16 x i32> @llvm.vc.internal.lsc.load.ugm.v16i32.v16i1.v2i8.v16i64
  %loaded = load <16 x i32>, <16 x i32>* %pi32, align 1
  ret void
}

define void @replace_load_i64_block_unaligned(<16 x i64>* %pi64) {
  ; COM: <16 x i64> load with align 1; the result is unused.
  ; COM: Default prefix: llvm.genx.svm.gather returning <16 x i64>.
  ; COM: LSC prefix: llvm.vc.internal.lsc.load.ugm returning <16 x i64>.
  ; COM: Both use a v16i1 predicate and v16i64 addresses. Only the callee name
  ; COM: and its type mangling are matched; operands are not checked.
  ; CHECK: call <16 x i64> @llvm.genx.svm.gather.v16i64.v16i1.v16i64
  ; CHECK-LSC: call <16 x i64> @llvm.vc.internal.lsc.load.ugm.v16i64.v16i1.v2i8.v16i64
  %loaded = load <16 x i64>, <16 x i64>* %pi64, align 1
  ret void
}

define void @replace_load_i8_block_dwalign(<16 x i8>* %pi8) {
  ; COM: <16 x i8> load with align 4; the result is unused.
  ; COM: Default prefix: llvm.genx.svm.block.ld.unaligned returning <16 x i8>.
  ; COM: LSC prefix: llvm.vc.internal.lsc.load.ugm returning <4 x i32> (the same
  ; COM: 16 bytes expressed as i32 elements).
  ; COM: NOTE(review): the second i8 operand is 3 here but 4 in the align-8 and
  ; COM: align-16 variants -- presumably the element-size encoding; confirm
  ; COM: against the intrinsic definition.
  ; CHECK: call <16 x i8> @llvm.genx.svm.block.ld.unaligned.v16i8.i64(i64 %{{[0-9a-zA-Z.]+}})
  ; CHECK-LSC: call <4 x i32> @llvm.vc.internal.lsc.load.ugm.v4i32.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 3, i8 4, <2 x i8> zeroinitializer, i64 0, i64 %{{[0-9a-zA-Z.]+}}, i16 1, i32 0, <4 x i32> undef)
  %loaded = load <16 x i8>, <16 x i8>* %pi8, align 4
  ret void
}

define void @replace_load_i16_block_dwalign(<16 x i16>* %pi16) {
  ; COM: <16 x i16> load with align 4; the result is unused.
  ; COM: Default prefix: llvm.genx.svm.block.ld.unaligned returning <16 x i16>.
  ; COM: LSC prefix: llvm.vc.internal.lsc.load.ugm returning <8 x i32> (the same
  ; COM: 32 bytes expressed as i32 elements), single-lane predicate, scalar
  ; COM: i64 address.
  ; CHECK: call <16 x i16> @llvm.genx.svm.block.ld.unaligned.v16i16.i64(i64 %{{[0-9a-zA-Z.]+}})
  ; CHECK-LSC: call <8 x i32> @llvm.vc.internal.lsc.load.ugm.v8i32.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 3, i8 5, <2 x i8> zeroinitializer, i64 0, i64 %{{[0-9a-zA-Z.]+}}, i16 1, i32 0, <8 x i32> undef)
  %loaded = load <16 x i16>, <16 x i16>* %pi16, align 4
  ret void
}

define void @replace_load_i32_block_dwalign(<16 x i32>* %pi32) {
  ; COM: <16 x i32> load with align 4; the result is unused.
  ; COM: Default prefix: llvm.genx.svm.block.ld.unaligned returning <16 x i32>.
  ; COM: LSC prefix: llvm.vc.internal.lsc.load.ugm returning <16 x i32>,
  ; COM: single-lane predicate, scalar i64 address.
  ; CHECK: call <16 x i32> @llvm.genx.svm.block.ld.unaligned.v16i32.i64(i64 %{{[0-9a-zA-Z.]+}})
  ; CHECK-LSC: call <16 x i32> @llvm.vc.internal.lsc.load.ugm.v16i32.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 3, i8 6, <2 x i8> zeroinitializer, i64 0, i64 %{{[0-9a-zA-Z.]+}}, i16 1, i32 0, <16 x i32> undef)
  %loaded = load <16 x i32>, <16 x i32>* %pi32, align 4
  ret void
}

define void @replace_load_i64_block_dwalign(<16 x i64>* %pi64) {
  ; COM: <16 x i64> load with align 4; the result is unused.
  ; COM: Default prefix: llvm.genx.svm.block.ld.unaligned returning <16 x i64>.
  ; COM: LSC prefix: llvm.vc.internal.lsc.load.ugm returning <32 x i32> (the
  ; COM: same 128 bytes expressed as i32 elements rather than i64).
  ; CHECK: call <16 x i64> @llvm.genx.svm.block.ld.unaligned.v16i64.i64(i64 %{{[0-9a-zA-Z.]+}})
  ; CHECK-LSC: call <32 x i32> @llvm.vc.internal.lsc.load.ugm.v32i32.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 3, i8 7, <2 x i8> zeroinitializer, i64 0, i64 %{{[0-9a-zA-Z.]+}}, i16 1, i32 0, <32 x i32> undef)
  %loaded = load <16 x i64>, <16 x i64>* %pi64, align 4
  ret void
}

define void @replace_load_i8_block_qwalign(<16 x i8>* %pi8) {
  ; COM: <16 x i8> load with align 8; the result is unused.
  ; COM: Default prefix: still the .unaligned block form,
  ; COM: llvm.genx.svm.block.ld.unaligned returning <16 x i8>.
  ; COM: LSC prefix: llvm.vc.internal.lsc.load.ugm returning <2 x i64>, the same
  ; COM: line as in the align-16 variant of this test.
  ; CHECK: call <16 x i8> @llvm.genx.svm.block.ld.unaligned.v16i8.i64(i64 %{{[0-9a-zA-Z.]+}})
  ; CHECK-LSC: call <2 x i64> @llvm.vc.internal.lsc.load.ugm.v2i64.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 4, i8 2, <2 x i8> zeroinitializer, i64 0, i64 %{{[0-9a-zA-Z.]+}}, i16 1, i32 0, <2 x i64> undef)
  %loaded = load <16 x i8>, <16 x i8>* %pi8, align 8
  ret void
}

define void @replace_load_i16_block_qwalign(<16 x i16>* %pi16) {
  ; COM: <16 x i16> load with align 8; the result is unused.
  ; COM: Default prefix: llvm.genx.svm.block.ld.unaligned returning <16 x i16>.
  ; COM: LSC prefix: llvm.vc.internal.lsc.load.ugm returning <4 x i64>, the same
  ; COM: line as in the align-16 variant of this test.
  ; CHECK: call <16 x i16> @llvm.genx.svm.block.ld.unaligned.v16i16.i64(i64 %{{[0-9a-zA-Z.]+}})
  ; CHECK-LSC: call <4 x i64> @llvm.vc.internal.lsc.load.ugm.v4i64.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 4, i8 4, <2 x i8> zeroinitializer, i64 0, i64 %{{[0-9a-zA-Z.]+}}, i16 1, i32 0, <4 x i64> undef)
  %loaded = load <16 x i16>, <16 x i16>* %pi16, align 8
  ret void
}

define void @replace_load_i32_block_qwalign(<16 x i32>* %pi32) {
  ; COM: <16 x i32> load with align 8; the result is unused.
  ; COM: Default prefix: llvm.genx.svm.block.ld.unaligned returning <16 x i32>.
  ; COM: LSC prefix: llvm.vc.internal.lsc.load.ugm returning <8 x i64>, the same
  ; COM: line as in the align-16 variant of this test.
  ; CHECK: call <16 x i32> @llvm.genx.svm.block.ld.unaligned.v16i32.i64(i64 %{{[0-9a-zA-Z.]+}})
  ; CHECK-LSC: call <8 x i64> @llvm.vc.internal.lsc.load.ugm.v8i64.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 4, i8 5, <2 x i8> zeroinitializer, i64 0, i64 %{{[0-9a-zA-Z.]+}}, i16 1, i32 0, <8 x i64> undef)
  %loaded = load <16 x i32>, <16 x i32>* %pi32, align 8
  ret void
}

define void @replace_load_i64_block_qwalign(<16 x i64>* %pi64) {
  ; COM: <16 x i64> load with align 8; the result is unused.
  ; COM: Default prefix: llvm.genx.svm.block.ld.unaligned returning <16 x i64>.
  ; COM: LSC prefix: llvm.vc.internal.lsc.load.ugm returning <16 x i64>, the
  ; COM: same line as in the align-16 variant of this test.
  ; CHECK: call <16 x i64> @llvm.genx.svm.block.ld.unaligned.v16i64.i64(i64 %{{[0-9a-zA-Z.]+}})
  ; CHECK-LSC: call <16 x i64> @llvm.vc.internal.lsc.load.ugm.v16i64.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 4, i8 6, <2 x i8> zeroinitializer, i64 0, i64 %{{[0-9a-zA-Z.]+}}, i16 1, i32 0, <16 x i64> undef)
  %loaded = load <16 x i64>, <16 x i64>* %pi64, align 8
  ret void
}

define void @replace_load_i8_block_owalign(<16 x i8>* %pi8) {
  ; COM: <16 x i8> load with align 16; the result is unused.
  ; COM: Default prefix: the plain block form (no .unaligned suffix),
  ; COM: llvm.genx.svm.block.ld returning <16 x i8>.
  ; COM: LSC prefix: llvm.vc.internal.lsc.load.ugm returning <2 x i64>.
  ; COM: Both lines equal those of @replace_load_i8_block (no explicit align).
  ; CHECK: call <16 x i8> @llvm.genx.svm.block.ld.v16i8.i64(i64 %{{[0-9a-zA-Z.]+}})
  ; CHECK-LSC: call <2 x i64> @llvm.vc.internal.lsc.load.ugm.v2i64.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 4, i8 2, <2 x i8> zeroinitializer, i64 0, i64 %{{[0-9a-zA-Z.]+}}, i16 1, i32 0, <2 x i64> undef)
  %loaded = load <16 x i8>, <16 x i8>* %pi8, align 16
  ret void
}

define void @replace_load_i16_block_owalign(<16 x i16>* %pi16) {
  ; COM: <16 x i16> load with align 16; the result is unused.
  ; COM: Default prefix: llvm.genx.svm.block.ld (no .unaligned suffix)
  ; COM: returning <16 x i16>.
  ; COM: LSC prefix: llvm.vc.internal.lsc.load.ugm returning <4 x i64>.
  ; COM: Both lines equal those of @replace_load_i16_block (no explicit align).
  ; CHECK: call <16 x i16> @llvm.genx.svm.block.ld.v16i16.i64(i64 %{{[0-9a-zA-Z.]+}})
  ; CHECK-LSC: call <4 x i64> @llvm.vc.internal.lsc.load.ugm.v4i64.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 4, i8 4, <2 x i8> zeroinitializer, i64 0, i64 %{{[0-9a-zA-Z.]+}}, i16 1, i32 0, <4 x i64> undef)
  %loaded = load <16 x i16>, <16 x i16>* %pi16, align 16
  ret void
}

define void @replace_load_i32_block_owalign(<16 x i32>* %pi32) {
  ; COM: <16 x i32> load with align 16; the result is unused.
  ; COM: Default prefix: llvm.genx.svm.block.ld (no .unaligned suffix)
  ; COM: returning <16 x i32>.
  ; COM: LSC prefix: llvm.vc.internal.lsc.load.ugm returning <8 x i64>.
  ; COM: Both lines equal those of @replace_load_i32_block (no explicit align).
  ; CHECK: call <16 x i32> @llvm.genx.svm.block.ld.v16i32.i64(i64 %{{[0-9a-zA-Z.]+}})
  ; CHECK-LSC: call <8 x i64> @llvm.vc.internal.lsc.load.ugm.v8i64.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 4, i8 5, <2 x i8> zeroinitializer, i64 0, i64 %{{[0-9a-zA-Z.]+}}, i16 1, i32 0, <8 x i64> undef)
  %loaded = load <16 x i32>, <16 x i32>* %pi32, align 16
  ret void
}

define void @replace_load_i64_block_owalign(<16 x i64>* %pi64) {
  ; COM: <16 x i64> load with align 16; the result is unused.
  ; COM: Default prefix: llvm.genx.svm.block.ld (no .unaligned suffix)
  ; COM: returning <16 x i64>.
  ; COM: LSC prefix: llvm.vc.internal.lsc.load.ugm returning <16 x i64>.
  ; COM: Both lines equal those of @replace_load_i64_block (no explicit align).
  ; CHECK: call <16 x i64> @llvm.genx.svm.block.ld.v16i64.i64(i64 %{{[0-9a-zA-Z.]+}})
  ; CHECK-LSC: call <16 x i64> @llvm.vc.internal.lsc.load.ugm.v16i64.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 4, i8 6, <2 x i8> zeroinitializer, i64 0, i64 %{{[0-9a-zA-Z.]+}}, i16 1, i32 0, <16 x i64> undef)
  %loaded = load <16 x i64>, <16 x i64>* %pi64, align 16
  ret void
}

define void @replace_load_i8_block_1023bytes(<1023 x i8>* %pi8) {
  ; COM: <1023 x i8> load written without an explicit alignment; the result is
  ; COM: unused. The single load is expected to be split into several calls.
  ; COM: Every offset is added to the first captured base address, not to the
  ; COM: previous chunk's address.
  ; COM:
  ; COM: Default prefix, llvm.genx.svm.block.ld chunks:
  ; COM:   7 x 128 bytes at offsets 0, 128, ..., 768
  ; COM:   64 bytes at 896, 32 bytes at 960, 16 bytes at 992  (1008 bytes so far)
  ; COM:   then one llvm.genx.svm.gather with a 15-lane predicate; presumably
  ; COM:   this covers the remaining 15 bytes (operands are not matched).
  ; COM:
  ; COM: LSC prefix, llvm.vc.internal.lsc.load.ugm chunks of i64 elements:
  ; COM:   64 at 0, 32 at 512, 16 at 768, 8 at 896, 4 at 960, 3 at 992
  ; COM:   (1016 bytes so far), then one 7-lane call with per-lane i64
  ; COM:   addresses and an all-true predicate; presumably the last 7 bytes.
  ; CHECK: call <128 x i8> @llvm.genx.svm.block.ld.v128i8.i64(i64 [[ADDR0:%[0-9a-zA-Z.]+]])
  ; CHECK: [[ADDR128:%[0-9a-zA-Z.]+]] = add i64 [[ADDR0]], 128
  ; CHECK: call <128 x i8> @llvm.genx.svm.block.ld.v128i8.i64(i64 [[ADDR128]])
  ; CHECK: [[ADDR256:%[0-9a-zA-Z.]+]] = add i64 [[ADDR0]], 256
  ; CHECK: call <128 x i8> @llvm.genx.svm.block.ld.v128i8.i64(i64 [[ADDR256]])
  ; CHECK: [[ADDR384:%[0-9a-zA-Z.]+]] = add i64 [[ADDR0]], 384
  ; CHECK: call <128 x i8> @llvm.genx.svm.block.ld.v128i8.i64(i64 [[ADDR384]])
  ; CHECK: [[ADDR512:%[0-9a-zA-Z.]+]] = add i64 [[ADDR0]], 512
  ; CHECK: call <128 x i8> @llvm.genx.svm.block.ld.v128i8.i64(i64 [[ADDR512]])
  ; CHECK: [[ADDR640:%[0-9a-zA-Z.]+]] = add i64 [[ADDR0]], 640
  ; CHECK: call <128 x i8> @llvm.genx.svm.block.ld.v128i8.i64(i64 [[ADDR640]])
  ; CHECK: [[ADDR768:%[0-9a-zA-Z.]+]] = add i64 [[ADDR0]], 768
  ; CHECK: call <128 x i8> @llvm.genx.svm.block.ld.v128i8.i64(i64 [[ADDR768]])
  ; CHECK: [[ADDR896:%[0-9a-zA-Z.]+]] = add i64 [[ADDR0]], 896
  ; CHECK: call <64 x i8> @llvm.genx.svm.block.ld.v64i8.i64(i64 [[ADDR896]])
  ; CHECK: [[ADDR960:%[0-9a-zA-Z.]+]] = add i64 [[ADDR0]], 960
  ; CHECK: call <32 x i8> @llvm.genx.svm.block.ld.v32i8.i64(i64 [[ADDR960]])
  ; CHECK: [[ADDR992:%[0-9a-zA-Z.]+]] = add i64 [[ADDR0]], 992
  ; CHECK: call <16 x i8> @llvm.genx.svm.block.ld.v16i8.i64(i64 [[ADDR992]])
  ; CHECK: call <60 x i8> @llvm.genx.svm.gather.v60i8.v15i1.v15i64
  ; CHECK-LSC: call <64 x i64> @llvm.vc.internal.lsc.load.ugm.v64i64.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 4, i8 8, <2 x i8> zeroinitializer, i64 0, i64 [[ADDR:%[0-9a-zA-Z.]+]], i16 1, i32 0, <64 x i64> undef)
  ; CHECK-LSC: [[ADDR512:%[^ ]+]] = add i64 [[ADDR]], 512
  ; CHECK-LSC: call <32 x i64> @llvm.vc.internal.lsc.load.ugm.v32i64.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 4, i8 7, <2 x i8> zeroinitializer, i64 0, i64 [[ADDR512]], i16 1, i32 0, <32 x i64> undef)
  ; CHECK-LSC: [[ADDR768:%[^ ]+]] = add i64 [[ADDR]], 768
  ; CHECK-LSC: call <16 x i64> @llvm.vc.internal.lsc.load.ugm.v16i64.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 4, i8 6, <2 x i8> zeroinitializer, i64 0, i64 [[ADDR768]], i16 1, i32 0, <16 x i64> undef)
  ; CHECK-LSC: [[ADDR896:%[^ ]+]] = add i64 [[ADDR]], 896
  ; CHECK-LSC: call <8 x i64> @llvm.vc.internal.lsc.load.ugm.v8i64.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 4, i8 5, <2 x i8> zeroinitializer, i64 0, i64 [[ADDR896]], i16 1, i32 0, <8 x i64> undef)
  ; CHECK-LSC: [[ADDR960:%[^ ]+]] = add i64 [[ADDR]], 960
  ; CHECK-LSC: call <4 x i64> @llvm.vc.internal.lsc.load.ugm.v4i64.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 4, i8 4, <2 x i8> zeroinitializer, i64 0, i64 [[ADDR960]], i16 1, i32 0, <4 x i64> undef)
  ; CHECK-LSC: [[ADDR992:%[^ ]+]] = add i64 [[ADDR]], 992
  ; CHECK-LSC: call <3 x i64> @llvm.vc.internal.lsc.load.ugm.v3i64.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 4, i8 3, <2 x i8> zeroinitializer, i64 0, i64 [[ADDR992]], i16 1, i32 0, <3 x i64> undef)
  ; CHECK-LSC: call <7 x i32> @llvm.vc.internal.lsc.load.ugm.v7i32.v7i1.v2i8.v7i64(<7 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i8 3, i8 5, i8 1, <2 x i8> zeroinitializer, i64 0, <7 x i64> %{{[0-9a-zA-Z]+}}, i16 1, i32 0, <7 x i32> undef)
  %loaded = load <1023 x i8>, <1023 x i8>* %pi8
  ret void
}

define void @replace_load_i8_block_1023bytes_dwalign(<1023 x i8>* %pi8) {
  ; COM: <1023 x i8> load with align 4; the result is unused. The single load
  ; COM: is expected to be split into several calls. Every offset is added to
  ; COM: the first captured base address, not to the previous chunk's address.
  ; COM:
  ; COM: Default prefix, llvm.genx.svm.block.ld.unaligned chunks at the same
  ; COM: offsets and sizes as in @replace_load_i8_block_1023bytes:
  ; COM:   7 x 128 bytes at 0..768, then 64 at 896, 32 at 960, 16 at 992,
  ; COM:   followed by one llvm.genx.svm.gather with a 15-lane predicate.
  ; COM:
  ; COM: LSC prefix, llvm.vc.internal.lsc.load.ugm chunks of i32 elements:
  ; COM:   64 at 0, 64 at 256, 64 at 512, 32 at 768, 16 at 896, 8 at 960,
  ; COM:   4 at 992, 3 at 1008 (1020 bytes so far), then one 3-lane call with
  ; COM:   per-lane i64 addresses and an all-true predicate; presumably the
  ; COM:   last 3 bytes.
  ; CHECK: call <128 x i8> @llvm.genx.svm.block.ld.unaligned.v128i8.i64(i64 [[ADDRDW0:%[0-9a-zA-Z.]+]])
  ; CHECK: [[ADDRDW128:%[0-9a-zA-Z.]+]] = add i64 [[ADDRDW0]], 128
  ; CHECK: call <128 x i8> @llvm.genx.svm.block.ld.unaligned.v128i8.i64(i64 [[ADDRDW128]])
  ; CHECK: [[ADDRDW256:%[0-9a-zA-Z.]+]] = add i64 [[ADDRDW0]], 256
  ; CHECK: call <128 x i8> @llvm.genx.svm.block.ld.unaligned.v128i8.i64(i64 [[ADDRDW256]])
  ; CHECK: [[ADDRDW384:%[0-9a-zA-Z.]+]] = add i64 [[ADDRDW0]], 384
  ; CHECK: call <128 x i8> @llvm.genx.svm.block.ld.unaligned.v128i8.i64(i64 [[ADDRDW384]])
  ; CHECK: [[ADDRDW512:%[0-9a-zA-Z.]+]] = add i64 [[ADDRDW0]], 512
  ; CHECK: call <128 x i8> @llvm.genx.svm.block.ld.unaligned.v128i8.i64(i64 [[ADDRDW512]])
  ; CHECK: [[ADDRDW640:%[0-9a-zA-Z.]+]] = add i64 [[ADDRDW0]], 640
  ; CHECK: call <128 x i8> @llvm.genx.svm.block.ld.unaligned.v128i8.i64(i64 [[ADDRDW640]])
  ; CHECK: [[ADDRDW768:%[0-9a-zA-Z.]+]] = add i64 [[ADDRDW0]], 768
  ; CHECK: call <128 x i8> @llvm.genx.svm.block.ld.unaligned.v128i8.i64(i64 [[ADDRDW768]])
  ; CHECK: [[ADDRDW896:%[0-9a-zA-Z.]+]] = add i64 [[ADDRDW0]], 896
  ; CHECK: call <64 x i8> @llvm.genx.svm.block.ld.unaligned.v64i8.i64(i64 [[ADDRDW896]])
  ; CHECK: [[ADDRDW960:%[0-9a-zA-Z.]+]] = add i64 [[ADDRDW0]], 960
  ; CHECK: call <32 x i8> @llvm.genx.svm.block.ld.unaligned.v32i8.i64(i64 [[ADDRDW960]])
  ; CHECK: [[ADDRDW992:%[0-9a-zA-Z.]+]] = add i64 [[ADDRDW0]], 992
  ; CHECK: call <16 x i8> @llvm.genx.svm.block.ld.unaligned.v16i8.i64(i64 [[ADDRDW992]])
  ; CHECK: call <60 x i8> @llvm.genx.svm.gather.v60i8.v15i1.v15i64
  ; CHECK-LSC: call <64 x i32> @llvm.vc.internal.lsc.load.ugm.v64i32.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 3, i8 8, <2 x i8> zeroinitializer, i64 0, i64 [[ADDRDW:%[0-9a-zA-Z.]+]], i16 1, i32 0, <64 x i32> undef)
  ; CHECK-LSC: [[ADDRDW256:%[^ ]+]] = add i64 [[ADDRDW]], 256
  ; CHECK-LSC: call <64 x i32> @llvm.vc.internal.lsc.load.ugm.v64i32.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 3, i8 8, <2 x i8> zeroinitializer, i64 0, i64 [[ADDRDW256]], i16 1, i32 0, <64 x i32> undef)
  ; CHECK-LSC: [[ADDRDW512:%[^ ]+]] = add i64 [[ADDRDW]], 512
  ; CHECK-LSC: call <64 x i32> @llvm.vc.internal.lsc.load.ugm.v64i32.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 3, i8 8, <2 x i8> zeroinitializer, i64 0, i64 [[ADDRDW512]], i16 1, i32 0, <64 x i32> undef)
  ; CHECK-LSC: [[ADDRDW768:%[^ ]+]] = add i64 [[ADDRDW]], 768
  ; CHECK-LSC: call <32 x i32> @llvm.vc.internal.lsc.load.ugm.v32i32.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 3, i8 7, <2 x i8> zeroinitializer, i64 0, i64 [[ADDRDW768]], i16 1, i32 0, <32 x i32> undef)
  ; CHECK-LSC: [[ADDRDW896:%[^ ]+]] = add i64 [[ADDRDW]], 896
  ; CHECK-LSC: call <16 x i32> @llvm.vc.internal.lsc.load.ugm.v16i32.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 3, i8 6, <2 x i8> zeroinitializer, i64 0, i64 [[ADDRDW896]], i16 1, i32 0, <16 x i32> undef)
  ; CHECK-LSC: [[ADDRDW960:%[^ ]+]] = add i64 [[ADDRDW]], 960
  ; CHECK-LSC: call <8 x i32> @llvm.vc.internal.lsc.load.ugm.v8i32.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 3, i8 5, <2 x i8> zeroinitializer, i64 0, i64 [[ADDRDW960]], i16 1, i32 0, <8 x i32> undef)
  ; CHECK-LSC: [[ADDRDW992:%[^ ]+]] = add i64 [[ADDRDW]], 992
  ; CHECK-LSC: call <4 x i32> @llvm.vc.internal.lsc.load.ugm.v4i32.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 3, i8 4, <2 x i8> zeroinitializer, i64 0, i64 [[ADDRDW992]], i16 1, i32 0, <4 x i32> undef)
  ; CHECK-LSC: [[ADDRDW1008:%[^ ]+]] = add i64 [[ADDRDW]], 1008
  ; CHECK-LSC: call <3 x i32> @llvm.vc.internal.lsc.load.ugm.v3i32.v1i1.v2i8.i64(<1 x i1> <i1 true>, i8 3, i8 3, i8 3, <2 x i8> zeroinitializer, i64 0, i64 [[ADDRDW1008]], i16 1, i32 0, <3 x i32> undef)
  ; CHECK-LSC: call <3 x i32> @llvm.vc.internal.lsc.load.ugm.v3i32.v3i1.v2i8.v3i64(<3 x i1> <i1 true, i1 true, i1 true>, i8 3, i8 5, i8 1, <2 x i8> zeroinitializer, i64 0, <3 x i64> %{{[0-9a-zA-Z]+}}, i16 1, i32 0, <3 x i32> undef)
  %loaded = load <1023 x i8>, <1023 x i8>* %pi8, align 4
  ret void
}