1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238
|
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c23746e7f..bc326c8b5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,10 +23,10 @@ endif()
set(version 1.1.0)
# Check support for CUDA/HIP in Cmake
-project(composable_kernel VERSION ${version} LANGUAGES CXX)
+project(composable_kernel VERSION ${version} LANGUAGES CXX HIP)
include(CTest)
-find_package(Python3 3.6 COMPONENTS Interpreter REQUIRED)
+find_package(Python3 COMPONENTS Interpreter REQUIRED)
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
@@ -227,27 +227,6 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
-## OpenMP
-if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
- # workaround issue hipcc in rocm3.5 cannot find openmp
- set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
- set(OpenMP_CXX_FLAGS "-fopenmp=libomp -Wno-unused-command-line-argument")
- set(OpenMP_CXX_LIB_NAMES "libomp" "libgomp" "libiomp5")
- set(OpenMP_libomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
- set(OpenMP_libgomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
- set(OpenMP_libiomp5_LIBRARY ${OpenMP_CXX_LIB_NAMES})
-else()
- find_package(OpenMP REQUIRED)
-endif()
-
-message("OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}")
-message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
-message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
-message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
-
-link_libraries(${OpenMP_gomp_LIBRARY})
-link_libraries(${OpenMP_pthread_LIBRARY})
-
## HIP
find_package(HIP REQUIRED)
# Override HIP version in config.h, if necessary.
@@ -269,12 +248,6 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
message(STATUS "CK_HIP_VERSION_PATCH overridden with ${CK_OVERRIDE_HIP_VERSION_PATCH}")
endif()
message(STATUS "Build with HIP ${HIP_VERSION}")
-link_libraries(hip::device)
-if(CK_hip_VERSION VERSION_GREATER_EQUAL 6.0.23494)
- add_compile_definitions(__HIP_PLATFORM_AMD__=1)
-else()
- add_compile_definitions(__HIP_PLATFORM_HCC__=1)
-endif()
## tidy
include(EnableCompilerWarnings)
@@ -541,11 +514,3 @@ rocm_install(FILES
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
-
-rocm_create_package(
- NAME composablekernel
- DESCRIPTION "High Performance Composable Kernel for AMD GPUs"
- MAINTAINER "MIOpen Kernels Dev Team <dl.MIOpen@amd.com>"
- LDCONFIG
- HEADER_ONLY
-)
diff --git a/example/ck_tile/01_fmha/generate.py b/example/ck_tile/01_fmha/generate.py
index 51fecd07b..5ed371995 100644
--- a/example/ck_tile/01_fmha/generate.py
+++ b/example/ck_tile/01_fmha/generate.py
@@ -566,7 +566,7 @@ def write_blobs(output_dir : Optional[str], kernel_filter : Optional[str], recei
def list_blobs(output_file : Optional[str], kernel_filter : Optional[str], receipt, mask_impl) -> None:
assert output_file is not None
file_path = Path(output_file)
- with file_path.open('a') as f:
+ with file_path.open('w') as f:
_, kernels = get_blobs(kernel_filter, receipt, mask_impl)
for kernel in kernels:
f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
diff --git a/include/ck/host_utility/hip_check_error.hpp b/include/ck/host_utility/hip_check_error.hpp
index c0894f1d7..559481fee 100644
--- a/include/ck/host_utility/hip_check_error.hpp
+++ b/include/ck/host_utility/hip_check_error.hpp
@@ -6,19 +6,7 @@
#include <sstream>
#include <hip/hip_runtime.h>
-// To be removed, which really does not tell the location of failed HIP functional call
-inline void hip_check_error(hipError_t x)
-{
- if(x != hipSuccess)
- {
- std::ostringstream ss;
- ss << "HIP runtime error: " << hipGetErrorString(x) << ". "
- << "hip_check_error.hpp"
- << ": " << __LINE__ << "in function: " << __func__;
- throw std::runtime_error(ss.str());
- }
-}
-
+#ifndef HIP_CHECK_ERROR
#define HIP_CHECK_ERROR(retval_or_funcall) \
do \
{ \
@@ -32,3 +20,9 @@ inline void hip_check_error(hipError_t x)
throw std::runtime_error(ostr.str()); \
} \
} while(0)
+#endif
+
+#ifndef hip_check_error
+#define hip_check_error HIP_CHECK_ERROR
+#endif
+
diff --git a/include/ck_tile/core/utility/transpose_vectors.hpp b/include/ck_tile/core/utility/transpose_vectors.hpp
index a164c3f94..293ead89a 100644
--- a/include/ck_tile/core/utility/transpose_vectors.hpp
+++ b/include/ck_tile/core/utility/transpose_vectors.hpp
@@ -11,6 +11,9 @@
namespace ck_tile {
+template <typename... Ts>
+constexpr bool always_false = false;
+
// S: scalar type (or it can be non-scalar type)
// NX: # of vector before transpose
// NY: # of vector after transpose
@@ -117,9 +120,11 @@ struct transpose_vectors
}
else
{
- static_assert(false, "not implemented");
+ static_assert(always_false<S_, number<NX>, number<NY>>, "not implemented");
}
}
};
+
} // namespace ck_tile
+
diff --git a/include/ck_tile/host/hip_check_error.hpp b/include/ck_tile/host/hip_check_error.hpp
index 3acdb4d87..cc26e184f 100644
--- a/include/ck_tile/host/hip_check_error.hpp
+++ b/include/ck_tile/host/hip_check_error.hpp
@@ -8,20 +8,7 @@
#include <stdexcept>
#include <hip/hip_runtime.h>
-namespace ck_tile {
-// To be removed, which really does not tell the location of failed HIP functional call
-CK_TILE_HOST void hip_check_error(hipError_t x)
-{
- if(x != hipSuccess)
- {
- std::ostringstream ss;
- ss << "HIP runtime error: " << hipGetErrorString(x) << ". " << __FILE__ << ": " << __LINE__
- << "in function: " << __func__;
- throw std::runtime_error(ss.str());
- }
-}
-} // namespace ck_tile
-
+#ifndef HIP_CHECK_ERROR
#define HIP_CHECK_ERROR(retval_or_funcall) \
do \
{ \
@@ -34,3 +21,9 @@ CK_TILE_HOST void hip_check_error(hipError_t x)
throw std::runtime_error(ostr.str()); \
} \
} while(0)
+#endif
+
+#ifndef hip_check_error
+#define hip_check_error HIP_CHECK_ERROR
+#endif
+
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
index c035e7e56..8c5f36d2e 100644
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -59,8 +59,14 @@ function(add_instance_library INSTANCE_NAME)
endforeach()
#only continue if there are some source files left on the list
if(ARGN)
+ set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
add_library(${INSTANCE_NAME} OBJECT ${ARGN})
+ # Always disable debug symbol and C debug assert due to
+ # - Linker error: ... relocation truncated to fit ..., caused by object files to be linked are too huge.
+ # - https://github.com/ROCmSoftwarePlatform/composable_kernel/issues/622
+ target_compile_options(${INSTANCE_NAME} PRIVATE -g0 -DNDEBUG)
target_compile_features(${INSTANCE_NAME} PUBLIC)
+ target_compile_definitions(${INSTANCE_NAME} PRIVATE "__HIP_PLATFORM_AMD__=1" "__HIP_PLATFORM_HCC__=1")
set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
clang_tidy_check(${INSTANCE_NAME})
set(result 0)
--- ./include/ck/utility/amd_buffer_addressing.hpp 2024-09-05 10:12:33.343091000 +0800
+++ ./include/ck/utility/amd_buffer_addressing_new.hpp 2024-09-05 10:12:20.276686000 +0800
@@ -991,7 +991,8 @@
asm volatile("s_mov_b32 m0, %0; \n\t"
"buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
"v"(global_offset_bytes),
- "s"(src_resource));
+ "s"(src_resource)
+ : "memory");
#else
// LDS pointer must be attributed with the LDS address space.
__attribute__((address_space(3))) uint32_t* lds_ptr =
--- ./include/ck_tile/core/arch/amd_buffer_addressing.hpp 2024-09-05 10:18:28.884031000 +0800
+++ ./include/ck_tile/core/arch/amd_buffer_addressing_new.hpp 2024-09-05 10:17:29.434931000 +0800
@@ -26,7 +26,12 @@
CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t size = 0xffffffff)
{
buffer_resource res{ptr, size, CK_TILE_BUFFER_RESOURCE_3RD_DWORD};
- return __builtin_bit_cast(int32x4_t, res);
+ int32x4_t r = __builtin_bit_cast(int32x4_t, res);
+ r.x = __builtin_amdgcn_readfirstlane(r.x);
+ r.y = __builtin_amdgcn_readfirstlane(r.y);
+ r.z = __builtin_amdgcn_readfirstlane(r.z);
+ r.w = __builtin_amdgcn_readfirstlane(r.w);
+ return r;
}
// TODO: glc/slc/...
@@ -2016,7 +2021,8 @@
asm volatile("s_mov_b32 m0, %0; \n\t"
"buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
"v"(global_offset_bytes),
- "s"(src_resource));
+ "s"(src_resource)
+ : "memory");
#else
// LDS pointer must be attributed with the LDS address space.
__attribute__((address_space(3))) uint32_t* lds_ptr =
|