File: predicated-load_fence_split.ll

package info (click to toggle)
intel-graphics-compiler2 2.16.0-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 106,644 kB
  • sloc: cpp: 805,640; lisp: 287,672; ansic: 16,414; python: 3,952; yacc: 2,588; lex: 1,666; pascal: 313; sh: 186; makefile: 35
file content (79 lines) | stat: -rw-r--r-- 4,022 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
;=========================== begin_copyright_notice ============================
;
; Copyright (C) 2025 Intel Corporation
;
; SPDX-License-Identifier: MIT
;
;============================ end_copyright_notice =============================

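; This test verifies that a memory fence / thread-group barrier blocks the merging
; of predicated loads across it:
;    load i8; load i8; fence + barrier; load i8; load i8
;      -> load <2 x i8>; fence + barrier; load <2 x i8>
;    The four loads must not be combined into a single <4 x i8> load.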

; REQUIRES: llvm-14-plus, regkeys

; RUN: igc_opt --opaque-pointers %s -S -inputocl -igc-ldstcombine -regkey=EnableLdStCombine=5 \
; RUN:           -platformbmg \
; RUN: | FileCheck %s



 ; CHECK-LABEL: define spir_kernel void @test_fence
 ; CHECK:  call <2 x i8> @llvm.genx.GenISA.PredicatedLoad.v2i8.p1.v2i8(ptr addrspace(1) %{{.*}}, i64 1, i1 true, <2 x i8> <i8 2, i8 3>)
 ; CHECK: call void @llvm.genx.GenISA.memoryfence
 ; CHECK:  call <2 x i8> @llvm.genx.GenISA.PredicatedLoad.v2i8.p1.v2i8(ptr addrspace(1) %{{.*}}, i64 1, i1 true, <2 x i8> <i8 4, i8 5>)
 ; CHECK-NOT: <4 x i8> @llvm.genx.GenISA.PredicatedLoad
 ; CHECK: ret void

 ; Function Attrs: convergent nounwind
; Kernel under test. It issues four single-byte predicated loads from the
; adjacent byte offsets base, base|1, base|2 and base|3 of %s, with a memory
; fence and a thread-group barrier between the second and the third load. The
; four bytes are then packed into a <4 x i8> and stored to %d.
; The FileCheck patterns at the top of the file expect the loads on each side
; of the fence to be combined into one <2 x i8> predicated load, and no
; <4 x i8> predicated load to appear.
define spir_kernel void @test_fence(i32 addrspace(1)* %d, i8 addrspace(1)* %s, <8 x i32> %r0, <8 x i32> %payloadHeader, <3 x i32> %enqueuedLocalSize, i16 %localIdX, i16 %localIdY, i16 %localIdZ) #0 {
entry:
  ; Per-work-item index: enqueuedLocalSize[0] * r0[1] + zext(localIdX) + payloadHeader[0].
  ; NOTE(review): going by the value names this is presumably the global id in X
  ; (group id * local size + local id + global offset) - confirm against the IGC
  ; implicit-argument layout.
  %payloadHeader.scalar = extractelement <8 x i32> %payloadHeader, i32 0
  %enqueuedLocalSize.scalar = extractelement <3 x i32> %enqueuedLocalSize, i32 0
  %r0.scalar17 = extractelement <8 x i32> %r0, i32 1
  %mul.i.i.i = mul i32 %enqueuedLocalSize.scalar, %r0.scalar17
  %localIdX2 = zext i16 %localIdX to i32
  %add.i.i.i = add i32 %mul.i.i.i, %localIdX2
  %add4.i.i.i = add i32 %add.i.i.i, %payloadHeader.scalar
  ; base = index * 4. The two low bits of %mul are zero, so the `or` with 1, 2
  ; and 3 below addresses the three bytes that follow base.
  %mul = shl nsw i32 %add4.i.i.i, 2
  %idxprom = sext i32 %mul to i64
  %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %s, i64 %idxprom
  ; Load #0 (byte at base). Every load in this function passes i64 1 and
  ; i1 true; only the trailing i8 constant differs (2, 3, 4, 5). The expected
  ; output shows those constants gathered into the vector operand of the
  ; merged load.
  %0 = call i8 @llvm.genx.GenISA.PredicatedLoad.i8.p1i8.i8(i8 addrspace(1)* %arrayidx, i64 1, i1 true, i8 2)
  %add = or i32 %mul, 1
  %idxprom2 = sext i32 %add to i64
  %arrayidx3 = getelementptr inbounds i8, i8 addrspace(1)* %s, i64 %idxprom2
  ; Load #1 (byte at base|1); last load before the fence.
  %1 = call i8 @llvm.genx.GenISA.PredicatedLoad.i8.p1i8.i8(i8 addrspace(1)* %arrayidx3, i64 1, i1 true, i8 3)
  ; Fence followed by thread-group barrier: this is the point the load
  ; combiner is expected not to merge across.
  call void @llvm.genx.GenISA.memoryfence(i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false)
  call void @llvm.genx.GenISA.threadgroupbarrier()
  %add5 = or i32 %mul, 2
  %idxprom6 = sext i32 %add5 to i64
  %arrayidx7 = getelementptr inbounds i8, i8 addrspace(1)* %s, i64 %idxprom6
  ; Load #2 (byte at base|2); first load after the barrier.
  %2 = call i8 @llvm.genx.GenISA.PredicatedLoad.i8.p1i8.i8(i8 addrspace(1)* %arrayidx7, i64 1, i1 true, i8 4)
  %add9 = or i32 %mul, 3
  %idxprom10 = sext i32 %add9 to i64
  %arrayidx11 = getelementptr inbounds i8, i8 addrspace(1)* %s, i64 %idxprom10
  ; Load #3 (byte at base|3).
  %3 = call i8 @llvm.genx.GenISA.PredicatedLoad.i8.p1i8.i8(i8 addrspace(1)* %arrayidx11, i64 1, i1 true, i8 5)
  ; Pack the four loaded bytes into lanes 0..3 of a <4 x i8>.
  %vecinit14.assembled.vect = insertelement <4 x i8> undef, i8 %0, i32 0
  %vecinit14.assembled.vect33 = insertelement <4 x i8> %vecinit14.assembled.vect, i8 %1, i32 1
  %vecinit14.assembled.vect34 = insertelement <4 x i8> %vecinit14.assembled.vect33, i8 %2, i32 2
  %vecinit14.assembled.vect35 = insertelement <4 x i8> %vecinit14.assembled.vect34, i8 %3, i32 3
  ; %d is indexed as i32 with the unshifted index; that 4-byte element is
  ; reinterpreted as <4 x i8> and written with a single store.
  %idxprom15 = sext i32 %add4.i.i.i to i64
  %arrayidx16 = getelementptr inbounds i32, i32 addrspace(1)* %d, i64 %idxprom15
  %4 = bitcast i32 addrspace(1)* %arrayidx16 to <4 x i8> addrspace(1)*
  store <4 x i8> %vecinit14.assembled.vect35, <4 x i8> addrspace(1)* %4, align 4
  ret void
}

; Function Attrs: convergent nounwind
; Memory-fence intrinsic taking eight i1 flags; @test_fence calls it between
; the second and the third load.
; NOTE(review): the meaning of the individual flags is not visible in this
; file - see the GenISA intrinsic definitions.
declare void @llvm.genx.GenISA.memoryfence(i1, i1, i1, i1, i1, i1, i1, i1) #1

; Function Attrs: convergent nounwind
; Thread-group barrier intrinsic; @test_fence calls it right after the fence.
declare void @llvm.genx.GenISA.threadgroupbarrier() #1

; Function Attrs: nounwind readonly
; Scalar i8 predicated load from an addrspace(1) pointer.
; NOTE(review): the operands are presumably (pointer, alignment, predicate,
; value used when the predicate is false) - confirm against the GenISA
; intrinsic definitions.
declare i8 @llvm.genx.GenISA.PredicatedLoad.i8.p1i8.i8(i8 addrspace(1)*, i64, i1, i8) #2

; Attribute groups referenced above:
;   #0 - the kernel @test_fence
;   #1 - the memoryfence and threadgroupbarrier intrinsics
;   #2 - the PredicatedLoad intrinsic
attributes #0 = { convergent nounwind "less-precise-fpmad"="true" }
attributes #1 = { convergent nounwind }
attributes #2 = { nounwind readonly }