From: Cordell Bloor <cgmb@slerp.xyz>
Date: Wed, 3 Jul 2024 11:25:24 -0600
Subject: use xnack-specialized assembly kernels with gfx90a

This change passes the xnack-specialized targets gfx90a:xnack+ and
gfx90a:xnack- to Tensile as its architecture list when rocBLAS itself is
built for the non-specialized gfx90a target. Building for the
non-specialized target helps to reduce the library binary size, and with
this change it does so without affecting the assembly kernels in Tensile.

Applied-Upstream: https://github.com/ROCm/rocBLAS/commit/6a267fdd2bfa9c64c4f7b08bd36025c00da605b2
Forwarded: not-needed
---
 CMakeLists.txt | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1ea8e9d..3a0c9b8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -129,8 +129,11 @@ set( AMDGPU_TARGETS "${target_list}" CACHE STRING "AMD GPU targets to compile fo
 
 if(NOT SKIP_LIBRARY)
   if( BUILD_WITH_TENSILE )
+    list(TRANSFORM AMDGPU_TARGETS REPLACE "^gfx90a$" "gfx90a:xnack+;gfx90a:xnack-" OUTPUT_VARIABLE Tensile_ARCHITECTURE_INIT)
+    list(REMOVE_DUPLICATES Tensile_ARCHITECTURE_INIT)
+
     # we will have expanded "all" for tensile to ensure consistency as we have local rules
-    set( Tensile_ARCHITECTURE "${AMDGPU_TARGETS}" CACHE STRING "Tensile to use which architecture?" FORCE)
+    set( Tensile_ARCHITECTURE "${Tensile_ARCHITECTURE_INIT}" CACHE STRING "Tensile to use which architecture?" FORCE)
 
     set( Tensile_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/tensile/Tensile")
 
