File: extend-vectorization-factor-for-unprofitable-memops.ll

package info (click to toggle)
llvm-toolchain-14 1%3A14.0.6-12
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,496,180 kB
  • sloc: cpp: 5,593,972; ansic: 986,872; asm: 585,869; python: 184,223; objc: 72,530; lisp: 31,119; f90: 27,793; javascript: 9,780; pascal: 9,762; sh: 9,482; perl: 7,468; ml: 5,432; awk: 3,523; makefile: 2,538; xml: 953; cs: 573; fortran: 567
file content (123 lines) | stat: -rw-r--r-- 4,094 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
; RUN: opt -loop-vectorize -mtriple=arm64-apple-darwin -S %s | FileCheck %s

; Test cases for extending the vectorization factor when small memory
; operations are not profitable.

; Test with a loop that contains memory accesses of i8 and i32 types. The
; default maximum VF for NEON is 4. While we don't have an instruction to
; load 4 x i8, vectorization might still be profitable.
define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) {
; CHECK-LABEL: @test_load_i8_store_i32(
; CHECK:       <4 x i8>
;
; Each iteration reads one i8 from %src, zero-extends it to i32, adds %off and
; writes the i32 result to the same index of %dst. The exit test runs after the
; body, on the incremented index compared against %N.
entry:
  br label %body

body:
  %idx = phi i64 [ 0, %entry ], [ %idx.next, %body ]
  ; Element addresses for this iteration.
  %src.addr = getelementptr inbounds i8, i8* %src, i64 %idx
  %dst.addr = getelementptr inbounds i32, i32* %dst, i64 %idx
  ; Narrow load, widened before the arithmetic.
  %byte = load i8, i8* %src.addr, align 1
  %wide = zext i8 %byte to i32
  %sum = add i32 %wide, %off
  store i32 %sum, i32* %dst.addr
  ; Advance and test for loop exit.
  %idx.next = add nuw nsw i64 %idx, 1
  %done = icmp eq i64 %idx.next, %N
  br i1 %done, label %exit, label %body

exit:
  ret void
}

; Same as test_load_i8_store_i32, but with types flipped for load and store.
define void @test_load_i32_store_i8(i32* noalias %src, i8* noalias %dst, i32 %off, i64 %N) {
; CHECK-LABEL: @test_load_i32_store_i8(
; CHECK:     <4 x i8>
;
; Each iteration reads one i32 from %src, adds %off, truncates the sum to i8
; and writes it to the same index of %dst. The exit test runs after the body,
; on the incremented index compared against %N.
entry:
  br label %body

body:
  %idx = phi i64 [ 0, %entry ], [ %idx.next, %body ]
  ; Element addresses for this iteration.
  %src.addr = getelementptr inbounds i32, i32* %src, i64 %idx
  %dst.addr = getelementptr inbounds i8, i8* %dst, i64 %idx
  ; Wide load (explicitly marked align 1), narrowed after the arithmetic.
  %word = load i32, i32* %src.addr, align 1
  %sum = add i32 %word, %off
  %narrow = trunc i32 %sum to i8
  store i8 %narrow, i8* %dst.addr
  ; Advance and test for loop exit.
  %idx.next = add nuw nsw i64 %idx, 1
  %done = icmp eq i64 %idx.next, %N
  br i1 %done, label %exit, label %body

exit:
  ret void
}

; All memory operations use i32, so all of them are profitable with VF 4.
define void @test_load_i32_store_i32(i32* noalias %src, i32* noalias %dst, i8 %off, i64 %N) {
; CHECK-LABEL: @test_load_i32_store_i32(
; CHECK: vector.body:
; CHECK:   <4 x i32>
;
; Both memory accesses are i32; only the intermediate arithmetic is done in
; i8. Each iteration loads an i32 from %src, truncates it to i8, adds %off,
; zero-extends the sum back to i32 and stores it to the same index of %dst.
entry:
  br label %body

body:
  %idx = phi i64 [ 0, %entry ], [ %idx.next, %body ]
  ; Element addresses for this iteration.
  %src.addr = getelementptr inbounds i32, i32* %src, i64 %idx
  %dst.addr = getelementptr inbounds i32, i32* %dst, i64 %idx
  ; i32 load (explicitly marked align 1), i8 arithmetic, i32 store.
  %word = load i32, i32* %src.addr, align 1
  %narrow = trunc i32 %word to i8
  %sum = add i8 %narrow, %off
  %wide = zext i8 %sum to i32
  store i32 %wide, i32* %dst.addr
  ; Advance and test for loop exit.
  %idx.next = add nuw nsw i64 %idx, 1
  %done = icmp eq i64 %idx.next, %N
  br i1 %done, label %exit, label %body

exit:
  ret void
}

; Test with loop body that requires a large number of vector registers if the
; vectorization factor is large. Make sure the register estimates limit the
; vectorization factor.
define void @test_load_i8_store_i64_large(i8* noalias %src, i64* noalias %dst, i64* noalias %dst.2, i64* noalias %dst.3, i64* noalias %dst.4, i64* noalias %dst.5, i64 %off, i64 %off.2, i64 %N) {
; CHECK-LABEL: @test_load_i8_store_i64_large(
; CHECK: <2 x i64>
;
; Loop body with one i8 load, two i64 loads and five i64 stores, so that many
; i64 values are live at once. All pointer arguments are noalias and every
; access uses the induction variable %iv as its index.
entry:
  br label %loop

loop:
  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
  %gep.src = getelementptr inbounds i8, i8* %src, i64 %iv
  ; Read the current contents of dst.3[iv] and dst.5[iv]; both locations are
  ; overwritten by the stores at the end of the body.
  %gep.dst.3 = getelementptr inbounds i64, i64* %dst.3, i64 %iv
  %lv.dst.3 = load i64, i64* %gep.dst.3, align 1
  %gep.dst.5 = getelementptr inbounds i64, i64* %dst.5, i64 %iv
  %lv.dst.5 = load i64, i64* %gep.dst.5, align 1

  ; Narrow i8 load, widened to i64 before the chain of adds.
  %lv = load i8, i8* %gep.src, align 1
  %lv.ext = zext i8 %lv to i64
  %add = add i64 %lv.ext, %off
  %add.2 = add i64 %add, %off.2
  %gep.dst = getelementptr inbounds i64, i64* %dst, i64 %iv
  %gep.dst.2 = getelementptr inbounds i64, i64* %dst.2, i64 %iv

  ; Combine the sums with the values loaded from dst.3 and dst.5.
  %add.3 = add i64 %add.2, %lv.dst.3
  %add.4 = add i64 %add.3, %add
  %gep.dst.4 = getelementptr inbounds i64, i64* %dst.4, i64 %iv
  %add.5 = add i64 %add.2, %lv.dst.5
  ; Five i64 stores, one per destination array.
  store i64 %add.2, i64* %gep.dst.2
  store i64 %add, i64* %gep.dst
  store i64 %add.3, i64* %gep.dst.3
  store i64 %add.4, i64* %gep.dst.4
  store i64 %add.5, i64* %gep.dst.5

  ; The exit test runs on the incremented index, after the body.
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %N
  br i1 %exitcond.not, label %exit, label %loop

exit:
  ret void
}