File: onednn_acl_thread_local_scheduler.patch

package info (click to toggle)
tensorflow 2.14.1%2Bdfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 359,396 kB
  • sloc: cpp: 2,418,453; python: 736,954; java: 20,254; ansic: 18,962; sh: 9,279; pascal: 7,941; objc: 1,584; xml: 988; ada: 727; cs: 273; perl: 150; makefile: 92
file content (98 lines) | stat: -rw-r--r-- 4,396 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
 *******************************************************************************
 Copyright 2023 Arm Limited and affiliates.
 SPDX-License-Identifier: Apache-2.0

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 *******************************************************************************
diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
index d7d83badcb..1a7bcd74ed 100644
--- a/src/cpu/aarch64/acl_thread.cpp
+++ b/src/cpu/aarch64/acl_thread.cpp
@@ -41,14 +41,24 @@ void acl_thread_bind() {
 #endif

 #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
-void acl_set_custom_scheduler() {
-    static std::once_flag flag_once;
-    // Create threadpool scheduler
-    std::shared_ptr<arm_compute::IScheduler> threadpool_scheduler
-            = std::make_unique<ThreadpoolScheduler>();
+// Installs a ThreadpoolScheduler as the ACL scheduler, at most once per
+// calling thread (the once_flag below is thread_local). intra_threads is
+// the thread count handed to that scheduler; only the value passed on a
+// thread's first call takes effect. The default lives on the declaration.
+void acl_set_custom_scheduler(int intra_threads) {
+    static thread_local std::once_flag flag_once;
     // set CUSTOM scheduler in ACL
     std::call_once(flag_once,
-            [&]() { arm_compute::Scheduler::set(threadpool_scheduler); });
+            [&]() {
+                // Create threadpool scheduler
+                std::shared_ptr<arm_compute::IScheduler> threadpool_scheduler
+                        = std::make_shared<ThreadpoolScheduler>();
+                // Clamp first: set_num_threads presumably takes an unsigned
+                // count (TODO confirm), so a negative int would wrap around.
+                threadpool_scheduler->set_num_threads(
+                        intra_threads > 0 ? intra_threads : 0);
+                arm_compute::Scheduler::set(threadpool_scheduler);
+            });
 }

 void acl_set_threadpool_num_threads() {
diff --git a/src/cpu/aarch64/acl_thread.hpp b/src/cpu/aarch64/acl_thread.hpp
index 46dde5eb05..13b3910515 100644
--- a/src/cpu/aarch64/acl_thread.hpp
+++ b/src/cpu/aarch64/acl_thread.hpp
@@ -34,7 +34,7 @@ void acl_thread_bind();

 #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
 // Retrieve threadpool size during primitive execution and set ThreadpoolScheduler num_threads
-void acl_set_custom_scheduler();
+void acl_set_custom_scheduler(int intra_threads = 0);
 void acl_set_threadpool_num_threads();
 #endif

diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.cpp b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
index 418d7f30f9..7eb8a052b0 100644
--- a/src/cpu/aarch64/acl_threadpool_scheduler.cpp
+++ b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
@@ -102,8 +102,6 @@ void ThreadpoolScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints,
 void ThreadpoolScheduler::run_workloads(
         std::vector<arm_compute::IScheduler::Workload> &workloads) {

-    arm_compute::lock_guard<std::mutex> lock(this->_run_workloads_mutex);
-
     const unsigned int num_threads
             = std::min(static_cast<unsigned int>(_num_threads),
                     static_cast<unsigned int>(workloads.size()));
diff --git a/src/cpu/cpu_engine.cpp b/src/cpu/cpu_engine.cpp
index 4ee70a405c..e9211f42e0 100644
--- a/src/cpu/cpu_engine.cpp
+++ b/src/cpu/cpu_engine.cpp
@@ -47,6 +47,12 @@ status_t cpu_engine_t::create_stream(stream_t **stream, unsigned flags) {
 #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_THREADPOOL
 status_t cpu_engine_t::create_stream(stream_t **stream,
         dnnl::threadpool_interop::threadpool_iface *threadpool) {
+    // Install the ACL scheduler before the stream is created. A null
+    // threadpool is not dereferenced; 0 is passed instead, which is the
+    // default intra_threads value of acl_set_custom_scheduler.
+    const int intra_threads = threadpool ? threadpool->get_num_threads() : 0;
+    dnnl::impl::cpu::aarch64::acl_thread_utils::acl_set_custom_scheduler(
+            intra_threads);
     return safe_ptr_assign<stream_t>(
             *stream, new cpu_stream_t(this, threadpool));
 }
diff --git a/src/cpu/cpu_engine.hpp b/src/cpu/cpu_engine.hpp
index 7aa077e4ef..2938650963 100644
--- a/src/cpu/cpu_engine.hpp
+++ b/src/cpu/cpu_engine.hpp
@@ -175,11 +175,6 @@ public:
         // dnnl_get_max_threads() == OMP_NUM_THREADS
         dnnl::impl::cpu::aarch64::acl_thread_utils::acl_thread_bind();
 #endif
-
-#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
-        // Set ACL scheduler for threadpool runtime
-        dnnl::impl::cpu::aarch64::acl_thread_utils::acl_set_custom_scheduler();
-#endif
 #endif
         return status::success;
     };