File: 0001-WG-vectorizer-Re-enable-LLVM-vectorizers.patch

package info (click to toggle)
pocl 6.0-7
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 25,320 kB
  • sloc: lisp: 149,513; ansic: 103,778; cpp: 54,947; python: 1,513; sh: 949; ruby: 255; pascal: 226; tcl: 180; makefile: 175; java: 72; xml: 49
file content (146 lines) | stat: -rw-r--r-- 5,272 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
From ff169087f30537c5318ab048f2cc093be9e364f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pekka=20J=C3=A4=C3=A4skel=C3=A4inen?=
 <pekka.jaaskelainen@intel.com>
Date: Thu, 15 Aug 2024 17:20:46 +0300
Subject: [PATCH] WG-vectorizer: Re-enable LLVM vectorizers

The LLVM vectorizers were accidentally disabled during the
transition to the new pass manager (PM). This commit re-enables
them, which also exposes some new inefficiencies (to be addressed
in follow-up commits).

We should also add a performance regression test, or at least a
"smoke test", for the WG vectorization.

(cherry picked from commit 19dc70bbe927a6f6210ccd45b6a4c2a49fe6da96)
---
 lib/CL/pocl_llvm_api.h |  9 +++++++--
 lib/CL/pocl_llvm_wg.cc | 45 +++++++++++++++++++++++++++++++-----------
 2 files changed, 41 insertions(+), 13 deletions(-)

diff --git a/lib/CL/pocl_llvm_api.h b/lib/CL/pocl_llvm_api.h
index 0542974f1..9b90c923a 100644
--- a/lib/CL/pocl_llvm_api.h
+++ b/lib/CL/pocl_llvm_api.h
@@ -30,6 +30,7 @@
 #include <llvm/IR/DiagnosticPrinter.h>
 #include <llvm/IR/Module.h>
 #include <llvm/Support/raw_os_ostream.h>
+#include <llvm/Target/TargetMachine.h>
 
 #include <map>
 #include <string>
@@ -101,8 +102,12 @@ POCL_EXPORT bool getModuleBoolMetadata (const llvm::Module &mod,
  * SizeL - optimize for size
  * Vectorize - whether to invoke the vectorizer (only used for legacy PM)
  */
-POCL_EXPORT void populateModulePM (void *Passes, void *Module, unsigned OptL,
-                                   unsigned SizeL, bool Vectorize = true);
+POCL_EXPORT void populateModulePM (void *Passes,
+                                   void *Module,
+                                   unsigned OptL,
+                                   unsigned SizeL,
+                                   bool Vectorize = true,
+                                   llvm::TargetMachine *TM = nullptr);
 
 extern std::string CurrentWgMethod;
 
diff --git a/lib/CL/pocl_llvm_wg.cc b/lib/CL/pocl_llvm_wg.cc
index 5eff8ffbd..041c75aad 100644
--- a/lib/CL/pocl_llvm_wg.cc
+++ b/lib/CL/pocl_llvm_wg.cc
@@ -200,7 +200,7 @@ llvm::Error PoCLModulePassManager::build(std::string PoclPipeline,
   // devices do not want to vectorize intra work-item at this
   // stage.
   Vectorize = ((CurrentWgMethod == "loopvec" || CurrentWgMethod == "cbs") &&
-               (Dev->spmd == CL_FALSE));
+               (!Dev->spmd));
   PTO.SLPVectorization = Vectorize;
   PTO.LoopVectorization = Vectorize;
   OptimizeLevel = OLevel;
@@ -273,7 +273,6 @@ llvm::Error PoCLModulePassManager::build(std::string PoclPipeline,
 #endif
 
   pocl::registerFunctionAnalyses(PB);
-
   // Register all the basic analyses with the managers.
   PB.registerModuleAnalyses(MAM);
   PB.registerCGSCCAnalyses(CGAM);
@@ -313,7 +312,7 @@ void PoCLModulePassManager::run(llvm::Module &Bitcode) {
   PM.run(Bitcode, MAM);
 #ifdef SEPARATE_OPTIMIZATION_FROM_POCL_PASSES
   populateModulePM(nullptr, (void *)&Bitcode, OptimizeLevel, SizeLevel,
-                   Vectorize);
+                   Vectorize, Machine.get());
 #endif
 }
 
@@ -532,7 +531,7 @@ static void addStage2PassesToPipeline(cl_device_id Dev,
 
   // NOTE: if you add a new PoCL pass here,
   // don't forget to register it in registerPassBuilderPasses
-  if (Dev->spmd == CL_FALSE) {
+  if (!Dev->spmd) {
     addPass(Passes, "simplifycfg");
     addPass(Passes, "loop-simplify");
 
@@ -1528,7 +1527,7 @@ int pocl_llvm_codegen(cl_device_id Device, cl_program program, void *Modp,
 }
 
 void populateModulePM(void *Passes, void *Module, unsigned OptL, unsigned SizeL,
-                      bool Vectorize) {
+                      bool Vectorize, TargetMachine *TM) {
 #if LLVM_MAJOR < MIN_LLVM_NEW_PASSMANAGER
   PassManagerBuilder Builder;
   Builder.OptLevel = OptL;
@@ -1555,18 +1554,42 @@ void populateModulePM(void *Passes, void *Module, unsigned OptL, unsigned SizeL,
     LegacyPasses->run(*Mod);
   }
 #else
+
+  PipelineTuningOptions PTO;
+
+  // Let the loopvec decide when to unroll.
+  PTO.LoopUnrolling = false;
+#if LLVM_MAJOR > 16
+  PTO.UnifiedLTO = false;
+#endif
+  PTO.SLPVectorization = Vectorize;
+  PTO.LoopVectorization = Vectorize;
+
+#ifdef DEBUG_NEW_PASS_MANAGER
+  PrintPassOptions PrintPassOpts;
+  PassInstrumentationCallbacks PIC;
+  llvm::LLVMContext Context; // for SI
+  std::unique_ptr<StandardInstrumentations> SI;
+  PrintPassOpts.Verbose = true;
+  PrintPassOpts.SkipAnalyses = false;
+  PrintPassOpts.Indent = true;
+  SI.reset(new StandardInstrumentations(Context,
+                                        true,  // debug logging
+                                        false, // verify each
+                                        PrintPassOpts));
+  SI->registerCallbacks(PIC, &MAM);
+
+  PassBuilder PB(TM, PTO, std::nullopt, &PIC);
+#else
+  PassBuilder PB(TM, PTO);
+#endif
+
   // Create the analysis managers.
   LoopAnalysisManager LAM;
   FunctionAnalysisManager FAM;
   CGSCCAnalysisManager CGAM;
   ModuleAnalysisManager MAM;
 
-  // Create the new pass manager builder.
-  // Take a look at the PassBuilder constructor parameters for more
-  // customization, e.g. specifying a TargetMachine or various debugging
-  // options.
-  PassBuilder PB;
-
   // Register all the basic analyses with the managers.
   PB.registerModuleAnalyses(MAM);
   PB.registerCGSCCAnalyses(CGAM);
-- 
2.47.1