File: 0013-add-arm-and-ppc-memory-barriers.patch

Package: rocr-runtime 6.4.3+dfsg-4

From: Cordell Bloor <cgmb@debian.org>
Date: Sat, 25 Oct 2025 09:10:37 +0000
Subject: add arm and ppc memory barriers

I'm not entirely certain that these are correct, but perhaps
an expert can take a look when it gets forwarded upstream.
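
For reference, the intended mapping is roughly the sketch below. The helper
name (pcie_store_fence) and the final fallback branch are illustrative only
and do not appear in the patch, which open-codes this ladder at each call
site before publishing a packet header to a device-memory queue over PCIe.

    #include <atomic>
    #if defined(__x86_64__) || defined(_M_X64)
    #include <emmintrin.h>
    #endif

    // Illustrative sketch of the per-architecture store fence.
    static inline void pcie_store_fence() {
    #if defined(__x86_64__) || defined(_M_X64)
      _mm_sfence();                         // order prior stores; flush earlier WC stores
    #elif defined(__aarch64__) || defined(_M_ARM64)
      asm volatile("dmb st" ::: "memory");  // order earlier stores before later stores
    #elif defined(__ppc__) || defined(__powerpc__) || defined(__powerpc64__)
      asm volatile("eieio" ::: "memory");   // order earlier stores to memory and device space
    #else
      std::atomic_thread_fence(std::memory_order_seq_cst);  // conservative fallback (not in this patch)
    #endif
    }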

Forwarded: no
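
As a quick sanity check, the inline assembly can be test-assembled with the
Debian cross toolchains. The file name below is hypothetical and the probe is
not part of the package build; it assumes g++-aarch64-linux-gnu and
g++-powerpc64le-linux-gnu are installed.

    // barrier_probe.cpp -- standalone assembly probe, not part of the build
    int main() {
    #if defined(__aarch64__)
      asm volatile("dsb st" ::: "memory");
      asm volatile("dsb sy" ::: "memory");
      asm volatile("dmb st" ::: "memory");
    #elif defined(__ppc__) || defined(__powerpc__) || defined(__powerpc64__)
      asm volatile("eieio" ::: "memory");
      asm volatile("sync" ::: "memory");
    #endif
      return 0;
    }

    $ aarch64-linux-gnu-g++ -c barrier_probe.cpp
    $ powerpc64le-linux-gnu-g++ -c barrier_probe.cpp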
---
 runtime/hsa-runtime/core/inc/amd_gpu_agent.h         | 12 ++++++++++++
 runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp   |  6 ++++++
 runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp |  8 +++++++-
 runtime/hsa-runtime/core/runtime/intercept_queue.cpp | 18 ++++++++++++++++++
 4 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
index 0045289..8abd650 100644
--- a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
+++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
@@ -435,9 +435,21 @@ class GpuAgent : public GpuAgentInt {
   /// @brief Force a WC flush on PCIe devices by doing a write and then read-back
   __forceinline void PcieWcFlush(void *ptr, size_t size) const {
     if (!xgmi_cpu_gpu_) {
+#if defined(__x86_64__) || defined(_M_X64)
       _mm_sfence();
+#elif defined(__aarch64__) || defined(_M_ARM64)
+      asm volatile("dsb st" ::: "memory");
+#elif defined(__ppc__) || defined(__powerpc__) || defined(__powerpc64__)
+      asm volatile("eieio" ::: "memory");
+#endif
       *((uint8_t*)ptr + size - 1) = *((uint8_t*)ptr + size - 1);
+#if defined(__x86_64__) || defined(_M_X64)
       _mm_mfence();
+#elif defined(__aarch64__) || defined(_M_ARM64)
+      asm volatile("dsb sy" ::: "memory");
+#elif defined(__ppc__) || defined(__powerpc__) || defined(__powerpc64__)
+      asm volatile("sync" ::: "memory");
+#endif
       auto readback = *(reinterpret_cast<volatile uint8_t*>(ptr) + size - 1);
     }
   }
diff --git a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
index f8613db..d9e5228 100644
--- a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
@@ -1665,7 +1665,13 @@ void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b, hsa_fence_scope
   memcpy(&queue_slot[1], &slot_data[1], slot_size_b - sizeof(uint32_t));
   if (core::Runtime::runtime_singleton_->flag().dev_mem_queue() && !agent_->is_xgmi_cpu_gpu()) {
     // Ensure the packet body is written as header may get reordered when writing over PCIE
+#if defined(__x86_64__) || defined(_M_X64)
     _mm_sfence();
+#elif defined(__aarch64__) || defined(_M_ARM64)
+    asm volatile("dmb st" ::: "memory");
+#elif defined(__ppc__) || defined(__powerpc__) || defined(__powerpc64__)
+    asm volatile("eieio" ::: "memory");
+#endif
   }
   atomic::Store(&queue_slot[0], slot_data[0], std::memory_order_release);
 
diff --git a/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp b/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp
index 3286f91..acc6e0f 100644
--- a/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp
@@ -892,7 +892,13 @@ void BlitKernel::PopulateQueue(uint64_t index, uint64_t code_handle, void* args,
   std::atomic_thread_fence(std::memory_order_release);
   if (core::Runtime::runtime_singleton_->flag().dev_mem_queue() && !queue_->needsPcieOrdering()) {
     // Ensure the packet body is written as header may get reordered when writing over PCIE
-    _mm_sfence();
+#if defined(__x86_64__) || defined(_M_X64)
+    _mm_sfence();
+#elif defined(__aarch64__) || defined(_M_ARM64)
+    asm volatile("dmb st" ::: "memory");
+#elif defined(__ppc__) || defined(__powerpc__) || defined(__powerpc64__)
+    asm volatile("eieio" ::: "memory");
+#endif
   }
   queue_buffer[index & queue_bitmask_].header = kDispatchPacketHeader;
 
diff --git a/runtime/hsa-runtime/core/runtime/intercept_queue.cpp b/runtime/hsa-runtime/core/runtime/intercept_queue.cpp
index a86dabb..9c8a66f 100644
--- a/runtime/hsa-runtime/core/runtime/intercept_queue.cpp
+++ b/runtime/hsa-runtime/core/runtime/intercept_queue.cpp
@@ -258,7 +258,13 @@ uint64_t InterceptQueue::Submit(const AqlPacket* packets, uint64_t count) {
       ring[barrier & mask].barrier_and.completion_signal = Signal::Convert(async_doorbell_);
       if (Runtime::runtime_singleton_->flag().dev_mem_queue() && !needsPcieOrdering()) {
         // Ensure the packet body is written as header may get reordered when writing over PCIE
+#if defined(__x86_64__) || defined(_M_X64)
         _mm_sfence();
+#elif defined(__aarch64__) || defined(_M_ARM64)
+        asm volatile("dmb st" ::: "memory");
+#elif defined(__ppc__) || defined(__powerpc__) || defined(__powerpc64__)
+        asm volatile("eieio" ::: "memory");
+#endif
       }
       atomic::Store(&ring[barrier & mask].barrier_and.header, kBarrierHeader,
                     std::memory_order_release);
@@ -305,7 +311,13 @@ uint64_t InterceptQueue::Submit(const AqlPacket* packets, uint64_t count) {
       if (write_index != 0) {
         if (Runtime::runtime_singleton_->flag().dev_mem_queue() && !needsPcieOrdering()) {
           // Ensure the packet body is written as header may get reordered when writing over PCIE
+#if defined(__x86_64__) || defined(_M_X64)
           _mm_sfence();
+#elif defined(__aarch64__) || defined(_M_ARM64)
+          asm volatile("dmb st" ::: "memory");
+#elif defined(__ppc__) || defined(__powerpc__) || defined(__powerpc64__)
+          asm volatile("eieio" ::: "memory");
+#endif
         }
         atomic::Store(&ring[write & mask].packet.header, packets[first_written_packet_index].packet.header,
                       std::memory_order_release);
@@ -374,7 +386,13 @@ void InterceptQueue::StoreRelaxed(hsa_signal_value_t value) {
     handler.first(&ring[i & mask], 1, i, handler.second, PacketWriter);
     if (Runtime::runtime_singleton_->flag().dev_mem_queue() && !needsPcieOrdering()) {
       // Ensure the packet body is written as header may get reordered when writing over PCIE
+#if defined(__x86_64__) || defined(_M_X64)
       _mm_sfence();
+#elif defined(__aarch64__) || defined(_M_ARM64)
+      asm volatile("dmb st" ::: "memory");
+#elif defined(__ppc__) || defined(__powerpc__) || defined(__powerpc64__)
+      asm volatile("eieio" ::: "memory");
+#endif
     }
     // Invalidate consumed packet.
     atomic::Store(&ring[i & mask].packet.header, kInvalidHeader, std::memory_order_release);