File: Downgrade-ROCm-dependency-to-5.5.patch

package info (click to toggle)
ggml 0.9.4-1
links: PTS, VCS
area: main
in suites: sid
size: 17,128 kB
sloc: cpp: 107,161; ansic: 36,329; lisp: 9,094; python: 1,558; objc: 1,045; sh: 773; makefile: 59
file content (143 lines) | stat: -rw-r--r-- 5,439 bytes
From: Christian Kastner <ckk@debian.org>
Date: Sun, 28 Sep 2025 23:05:58 +0200
Subject: Downgrade ROCm dependency to 5.5

The downgrade is necessary to have this package buildable in trixie and
in current unstable.

This requires a fixed rocm-hipamd 5.7.1, see

    http://bugs.debian.org/1116585

Origin: https://github.com/ggml-org/llama.cpp/pull/15296
---
 src/ggml-cuda/common.cuh    | 14 +++++++++-----
 src/ggml-cuda/ggml-cuda.cu  | 24 ++++++++++++++++++++++++
 src/ggml-cuda/vendors/hip.h | 16 ++++++++++++++++
 src/ggml-hip/CMakeLists.txt |  4 ++--
 4 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/src/ggml-cuda/common.cuh b/src/ggml-cuda/common.cuh
index 3b13491..2a9422a 100644
--- a/src/ggml-cuda/common.cuh
+++ b/src/ggml-cuda/common.cuh
@@ -486,21 +486,25 @@ static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b
 }
 
 static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
-#if defined(GGML_USE_HIP)
+#if defined(GGML_USE_HIP) && HIP_VERSION >= 50700000
     return half2(__hmax(a.x, b.x), __hmax(a.y, b.y));
-#elif CUDART_VERSION >= CUDART_HMAX
+#elif !defined(GGML_USE_HIP) && CUDART_VERSION >= CUDART_HMAX
     return __hmax2(a, b);
-#else
+#elif !defined(GGML_USE_HIP)
     half2 ret;
     reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
     reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
     return ret;
+#else
+    GGML_UNUSED(a);
+    GGML_UNUSED(b);
+    NO_DEVICE_CODE;
 #endif
 }
 
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 #pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
@@ -509,7 +513,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #else
    GGML_UNUSED(x);
    NO_DEVICE_CODE;
-#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 }
 
 #if (defined(CUDART_VERSION) && CUDART_VERSION < CUDART_HMASK) || defined(GGML_USE_HIP) || \
diff --git a/src/ggml-cuda/ggml-cuda.cu b/src/ggml-cuda/ggml-cuda.cu
index 4d85c5d..057af99 100644
--- a/src/ggml-cuda/ggml-cuda.cu
+++ b/src/ggml-cuda/ggml-cuda.cu
@@ -183,6 +183,30 @@ static int ggml_cuda_parse_id(char devName[]) {
 #endif // defined(GGML_USE_HIP)
 
 static ggml_cuda_device_info ggml_cuda_init() {
+#if defined(GGML_USE_HIP)
+    // Workaround for a rocBLAS bug when using multiple graphics cards:
+    // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
+    {
+        int major_version = 0;
+        size_t version_length = 0;
+        if (rocblas_get_version_string_size(&version_length) == rocblas_status_success) {
+            std::vector<char> version(version_length+1, '\0');
+            if (rocblas_get_version_string(version.data(), version.size()) == rocblas_status_success) {
+                version.resize(::strlen(version.data()));
+                int parsed_value = 0;
+                if (std::from_chars(version.data(), version.data() + version.size(), parsed_value).ec == std::errc()) {
+                    major_version = parsed_value;
+                }
+            }
+        }
+        if (major_version < 4) {
+            GGML_LOG_DEBUG(GGML_CUDA_NAME " calling rocblas_initialize as a workaround for a rocBLAS bug\n");
+            rocblas_initialize();
+            CUDA_CHECK(cudaDeviceSynchronize());
+        }
+    }
+#endif
+
     ggml_cuda_device_info info = {};
 
     cudaError_t err = cudaGetDeviceCount(&info.device_count);
diff --git a/src/ggml-cuda/vendors/hip.h b/src/ggml-cuda/vendors/hip.h
index 37386af..9f41a87 100644
--- a/src/ggml-cuda/vendors/hip.h
+++ b/src/ggml-cuda/vendors/hip.h
@@ -5,6 +5,8 @@
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
 #include <hip/hip_bf16.h>
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
 
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
@@ -263,3 +265,17 @@ static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigne
     }
     return c;
 }
+
+#if HIP_VERSION < 50600000
+// __shfl_xor() for half2 was added in ROCm 5.6
+static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
+    typedef union half2_b32 {
+        half2 val;
+        int   b32;
+    } half2_b32_t;
+    half2_b32_t tmp;
+    tmp.val = var;
+    tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
+    return tmp.val;
+}
+#endif
diff --git a/src/ggml-hip/CMakeLists.txt b/src/ggml-hip/CMakeLists.txt
index d327b90..852de97 100644
--- a/src/ggml-hip/CMakeLists.txt
+++ b/src/ggml-hip/CMakeLists.txt
@@ -46,8 +46,8 @@ if (GGML_HIP_ROCWMMA_FATTN)
     endif()
 endif()
 
-if (${hip_VERSION} VERSION_LESS 6.1)
-    message(FATAL_ERROR "At least ROCM/HIP V6.1 is required")
+if (${hip_VERSION} VERSION_LESS 5.5)
+    message(FATAL_ERROR "At least ROCM/HIP V5.5 is required")
 endif()
 
 message(STATUS "HIP and hipBLAS found")