From 15027e5133e0536b5ac6c296f8f9c62f9b02deb3 Mon Sep 17 00:00:00 2001
From: Cordell Bloor <cgmb@slerp.xyz>
Date: Mon, 23 Jun 2025 14:06:30 -0600
Subject: [PATCH] Revert removal of reset gpu partition

Upstream removed the functions rsmi_dev_compute_partition_reset and
rsmi_dev_memory_partition_reset in ROCm 6.3 without changing the
SOVERSION. This patch restores those functions by reverting upstream
commit a1295714f2f6eefa5c55bf5779d17d163ed0e3d0.

Forwarded: no
---
 include/rocm_smi/rocm_smi.h                   | 40 ++++++++++
 python_smi_tools/README.md                    | 13 +--
 python_smi_tools/rocm_smi.py                  | 74 ++++++++++++++++-
 rocm_smi/example/rocm_smi_example.cc          | 10 +++
 src/rocm_smi.cc                               | 80 ++++++++++++++++++-
 .../functional/computepartition_read_write.cc | 38 ++++-----
 .../functional/memorypartition_read_write.cc  | 25 +++++-
 7 files changed, 252 insertions(+), 28 deletions(-)

--- a/include/rocm_smi/rocm_smi.h
+++ b/include/rocm_smi/rocm_smi.h
@@ -4242,6 +4242,25 @@
                                rsmi_compute_partition_type_t compute_partition);
 
 /**
+ *  @brief Reverts a selected device's compute partition setting back to its
+ *  boot state.
+ *
+ *  @details Given a device index @p dv_ind , this function will attempt to
+ *  revert its compute partition setting back to its boot state.
+ *
+ *  @param[in] dv_ind a device index
+ *
+ *  @retval ::RSMI_STATUS_SUCCESS call was successful
+ *  @retval ::RSMI_STATUS_PERMISSION function requires root access
+ *  @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
+ *  support this function, or the device's boot partition state is unknown
+ *  @retval ::RSMI_STATUS_BUSY A resource or mutex could not be acquired
+ *  because it is already being used - device is busy
+ *
+ */
+rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind);
+
+/**
  *  @brief Retrieves the partition_id for a desired device
  *
  *  @details
@@ -4366,6 +4385,27 @@
 rsmi_dev_memory_partition_set(uint32_t dv_ind,
                               rsmi_memory_partition_type_t memory_partition);
 
+/**
+ *  @brief Reverts a selected device's memory partition setting back to its
+ *  boot state.
+ *
+ *  @details Given a device index @p dv_ind , this function will attempt to
+ *  revert its current memory partition setting back to its boot state.
+ *
+ *  @param[in] dv_ind a device index
+ *
+ *  @retval ::RSMI_STATUS_SUCCESS call was successful
+ *  @retval ::RSMI_STATUS_PERMISSION function requires root access
+ *  @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
+ *  support this function, or the device's boot partition state is unknown
+ *  @retval ::RSMI_STATUS_AMDGPU_RESTART_ERR could not successfully restart
+ *  the amdgpu driver
+ *  @retval ::RSMI_STATUS_BUSY A resource or mutex could not be acquired
+ *  because it is already being used - device is busy
+ *
+ */
+rsmi_status_t rsmi_dev_memory_partition_reset(uint32_t dv_ind);
+
 /** @} */  // end of memory_partition
 
 /*****************************************************************************/
--- a/python_smi_tools/README.md
+++ b/python_smi_tools/README.md
@@ -15,8 +15,8 @@
 
 ## Version
 
-The SMI will report two "versions", ROCM-SMI version and other is ROCM-SMI-LIB version.
-- ROCM-SMI version is the CLI/tool version number with commit ID appended after + sign.
+The SMI will report two "versions", ROCM-SMI version and other is ROCM-SMI-LIB version.  
+- ROCM-SMI version is the CLI/tool version number with commit ID appended after + sign.  
 - ROCM-SMI-LIB version is the library package version number.
 ```
 ROCM-SMI version: 2.0.0+8e78352
@@ -44,7 +44,7 @@
                 [--showtopoaccess] [--showtopoweight] [--showtopohops] [--showtopotype] [--showtoponuma]
                 [--showenergycounter] [--shownodesbw] [--showcomputepartition] [--showmemorypartition] [-r]
                 [--resetfans] [--resetprofile] [--resetpoweroverdrive] [--resetxgmierr] [--resetperfdeterminism]
-                [--setclock TYPE LEVEL] [--setsclk LEVEL [LEVEL ...]]
+                [--resetcomputepartition] [--resetmemorypartition] [--setclock TYPE LEVEL] [--setsclk LEVEL [LEVEL ...]]
                 [--setmclk LEVEL [LEVEL ...]] [--setpcie LEVEL [LEVEL ...]] [--setslevel SCLKLEVEL SCLK SVOLT]
                 [--setmlevel MCLKLEVEL MCLK MVOLT] [--setvc POINT SCLK SVOLT] [--setsrange SCLKMIN SCLKMAX]
                 [--setextremum min|max sclk|mclk CLK] [--setmrange MCLKMIN MCLKMAX] [--setfan LEVEL]
@@ -185,7 +185,8 @@
                                                                    state
   --resetxgmierr                                                   Reset XGMI error count
   --resetperfdeterminism                                           Disable performance determinism
-
+  --resetcomputepartition                                          Resets to boot compute partition state
+  --resetmemorypartition                                           Resets to boot memory partition state
 
 Auto-response options:
   --autorespond RESPONSE                                           Response to automatically provide for all prompts
@@ -199,8 +200,8 @@
 ```
 
 ## Detailed Option Descriptions
-`--setextremum <min/max> <sclk or mclk> <value in MHz to set to>`
-Provided ASIC support, users can now set a maximum or minimum sclk or mclk value through our Python CLI tool (`rocm-smi --setextremum max sclk 1500`). See example below.
+`--setextremum <min/max> <sclk or mclk> <value in MHz to set to>`  
+Provided ASIC support, users can now set a maximum or minimum sclk or mclk value through our Python CLI tool (`rocm-smi --setextremum max sclk 1500`). See example below.  
 
 ```shell
 $ sudo /opt/rocm/bin/rocm-smi --setextremum max sclk 2100
--- a/python_smi_tools/rocm_smi.py
+++ b/python_smi_tools/rocm_smi.py
@@ -1174,6 +1174,72 @@
     printLogSpacer()
 
 
+def resetComputePartition(deviceList):
+    """ Restore the compute partition of each device to its boot state
+
+    :param deviceList: List of DRM devices (can be a single-item list)
+    """
+    printLogSpacer(" Reset compute partition to its boot state ")
+    for device in deviceList:
+        before = getComputePartition(device)
+        ret = rocmsmi.rsmi_dev_compute_partition_reset(device)
+        if rsmi_ret_ok(ret, device, 'reset_compute_partition', silent=True):
+            after = getComputePartition(device)
+            msg = ("Successfully reset compute partition (" + before +
+                   ") to boot state (" + after + ")")
+        elif ret == rsmi_status_t.RSMI_STATUS_PERMISSION:
+            msg = 'Permission denied'
+        elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
+            msg = 'Not supported on the given system'
+        elif ret == rsmi_status_t.RSMI_STATUS_BUSY:
+            msg = 'Device is currently busy, try again later'
+        else:
+            rsmi_ret_ok(ret, device, 'reset_compute_partition')
+            printErrLog(device, 'Failed to reset the compute partition to boot state')
+            continue
+        printLog(device, msg, None)
+    printLogSpacer()
+
+
+def resetMemoryPartition(deviceList):
+    """ Reset current memory partition to its boot state
+
+    :param deviceList: List of DRM devices (can be a single-item list)
+    """
+    printLogSpacer(" Reset memory partition to its boot state ")
+    for device in deviceList:
+        originalPartition = getMemoryPartition(device)
+        # Show a progress bar from a separate process while the reset runs
+        t1 = multiprocessing.Process(target=showProgressbar,
+                            args=("Resetting memory partition",13,))
+        t1.start()
+        start = time.time()
+        ret = rocmsmi.rsmi_dev_memory_partition_reset(device)
+        duration = time.time() - start
+        if t1.is_alive():
+            t1.terminate()
+            t1.join()
+        # For longer runs, add an extra line before the output so that it
+        # does not override the progress bar; the printLog calls below pass it
+        addExtraLine = duration >= 0.1
+        if rsmi_ret_ok(ret, device, 'reset_memory_partition', silent=True):
+            resetBootState = getMemoryPartition(device)
+            printLog(device, "Successfully reset memory partition (" +
+                originalPartition + ") to boot state (" +
+                resetBootState + ")", None, addExtraLine)
+        elif ret == rsmi_status_t.RSMI_STATUS_PERMISSION:
+            printLog(device, 'Permission denied', None, addExtraLine)
+        elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
+            printLog(device, 'Not supported on the given system', None, addExtraLine)
+        elif ret == rsmi_status_t.RSMI_STATUS_BUSY:
+            printLog(device, 'Device is currently busy, try again later',
+                     None, addExtraLine)
+        else:
+            rsmi_ret_ok(ret, device, 'reset_memory_partition')
+            printErrLog(device, 'Failed to reset memory partition to boot state')
+    printLogSpacer()
+
+
 def setClockRange(deviceList, clkType, minvalue, maxvalue, autoRespond):
     """ Set the range for the specified clktype in the PowerPlay table for a list of devices.
 
@@ -4256,6 +4322,8 @@
                                   action='store_true')
     groupActionReset.add_argument('--resetxgmierr', help='Reset XGMI error count', action='store_true')
     groupActionReset.add_argument('--resetperfdeterminism', help='Disable performance determinism', action='store_true')
+    groupActionReset.add_argument('--resetcomputepartition', help='Resets to boot compute partition state', action='store_true')
+    groupActionReset.add_argument('--resetmemorypartition', help='Resets to boot memory partition state', action='store_true')
     groupAction.add_argument('--setclock',
                              help='Set Clock Frequency Level(s) for specified clock (requires manual Perf level)',
                              metavar=('TYPE','LEVEL'), nargs=2)
@@ -4343,7 +4411,7 @@
             or args.setpoweroverdrive or args.resetpoweroverdrive or args.rasenable or args.rasdisable or \
             args.rasinject or args.gpureset or args.setperfdeterminism or args.setslevel or args.setmlevel or \
             args.setvc or args.setsrange or args.setextremum or args.setmrange or args.setclock or \
-            args.setcomputepartition or args.setmemorypartition:
+            args.setcomputepartition or args.setmemorypartition or args.resetcomputepartition or args.resetmemorypartition:
         relaunchAsSudo()
 
     # If there is one or more device specified, use that for all commands, otherwise use a
@@ -4593,6 +4661,10 @@
         resetXgmiErr(deviceList)
     if args.resetperfdeterminism:
         resetPerfDeterminism(deviceList)
+    if args.resetcomputepartition:
+        resetComputePartition(deviceList)
+    if args.resetmemorypartition:
+        resetMemoryPartition(deviceList)
     if args.rasenable:
         setRas(deviceList, 'enable', args.rasenable[0], args.rasenable[1])
     if args.rasdisable:
--- a/rocm_smi/example/rocm_smi_example.cc
+++ b/rocm_smi/example/rocm_smi_example.cc
@@ -652,6 +652,11 @@
     std::cout << "\n" << "\n";
   }
 
+  std::cout << "About to initate compute partition reset..." << "\n";
+  ret = rsmi_dev_compute_partition_reset(dv_ind);
+  CHK_RSMI_NOT_SUPPORTED_RET(ret)
+  std::cout << "Done resetting compute partition." << "\n";
+
   std::string myComputePartition = originalComputePartition;
   if (myComputePartition.empty() == false) {
     std::cout << "Resetting back to original compute partition to "
@@ -704,6 +709,11 @@
               << "." << "\n\n\n";
   }
 
+  std::cout << "About to initate memory partition reset...\n";
+  ret = rsmi_dev_memory_partition_reset(dv_ind);
+  CHK_RSMI_NOT_SUPPORTED_RET(ret)
+  std::cout << "Done resetting memory partition.\n";
+
   std::string myMemPart = originalMemoryPartition;
   if (myMemPart.empty() == false) {
     std::cout << "Resetting memory partition to " << originalMemoryPartition
--- a/src/rocm_smi.cc
+++ b/src/rocm_smi.cc
@@ -762,7 +762,7 @@
    * Add domain to full pci_id:
    * BDFID = ((DOMAIN & 0xFFFFFFFF) << 32) | ((PARTITION_ID & 0xF) << 28) |
    * ((BUS & 0xFF) << 8) | ((DEVICE & 0x1F) <<3 ) | (FUNCTION & 0x7)
-   *
+   * 
    * bits [63:32] = domain
    * bits [31:28] or bits [2:0] = partition id
    * bits [27:16] = reserved
@@ -5344,6 +5344,84 @@
      << " | Returning = "
      << getRSMIStatusString(ret) << " |";
   LOG_TRACE(ss);
+  return ret;
+  CATCH
+}
+
+rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind) {
+  // Sets the compute partition of device dv_ind back to its boot state.
+  TRY
+  std::ostringstream ss;
+  ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind;
+  LOG_TRACE(ss);
+  REQUIRE_ROOT_ACCESS
+  DEVICE_MUTEX
+  GET_DEV_FROM_INDX
+  rsmi_status_t ret = RSMI_STATUS_NOT_SUPPORTED;
+
+  // 0 (the 1st index) is passed to the boot-state lookup: partition sets cause
+  // rocm-smi indexes to fluctuate since nodes are grouped in respect to the
+  // primary node, so only the 1st node/device id is used to reset.
+  std::string bootState =
+          dev->readBootPartitionState<rsmi_compute_partition_type_t>(0);
+
+  // Initiate the reset by re-applying the boot state through the setter.
+  // An "UNKNOWN" boot state cannot be restored, so ret keeps its initial
+  // RSMI_STATUS_NOT_SUPPORTED - likely the device does not support it.
+  if (bootState != "UNKNOWN") {
+    rsmi_compute_partition_type_t compute_partition =
+      mapStringToRSMIComputePartitionTypes.at(bootState);
+    ret = rsmi_dev_compute_partition_set(dv_ind, compute_partition);
+  }
+  ss << __PRETTY_FUNCTION__
+     << " | ======= end ======= "
+     << " | Success - if original boot state was not unknown or valid setting"
+     << " | Device #: " << dv_ind
+     << " | Type: "
+     << devInfoTypesStrings.at(amd::smi::kDevComputePartition)
+     << " | Data: " << bootState
+     << " | Returning = "
+     << getRSMIStatusString(ret) << " |";
+  LOG_TRACE(ss);
+  return ret;
+  CATCH
+}
+
+rsmi_status_t rsmi_dev_memory_partition_reset(uint32_t dv_ind) {
+  // Sets the memory partition of device dv_ind back to its boot state.
+  TRY
+  std::ostringstream ss;
+  ss << __PRETTY_FUNCTION__ << "| ======= start =======, " << dv_ind;
+  LOG_TRACE(ss);
+  REQUIRE_ROOT_ACCESS
+  DEVICE_MUTEX
+  GET_DEV_FROM_INDX
+  rsmi_status_t ret = RSMI_STATUS_NOT_SUPPORTED;
+
+  // 0 (the 1st index) is passed to the boot-state lookup: partition sets cause
+  // rocm-smi indexes to fluctuate since nodes are grouped in respect to the
+  // primary node, so only the 1st node/device id is used to reset.
+  std::string bootState =
+          dev->readBootPartitionState<rsmi_memory_partition_type_t>(0);
+
+  // Initiate the reset by re-applying the boot state through the setter.
+  // An "UNKNOWN" boot state cannot be restored, so ret keeps its initial
+  // RSMI_STATUS_NOT_SUPPORTED - likely the device does not support it.
+  if (bootState != "UNKNOWN") {
+    rsmi_memory_partition_type_t memory_partition =
+      mapStringToMemoryPartitionTypes.at(bootState);
+    ret = rsmi_dev_memory_partition_set(dv_ind, memory_partition);
+  }
+  ss << __PRETTY_FUNCTION__
+     << " | ======= end ======= "
+     << " | Success - if original boot state was not unknown or valid setting"
+     << " | Device #: " << dv_ind
+     << " | Type: "
+     << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition)
+     << " | Data: " << bootState
+     << " | Returning = "
+     << getRSMIStatusString(ret) << " |";
+  LOG_TRACE(ss);
   return ret;
   CATCH
 }
--- a/tests/rocm_smi_test/functional/computepartition_read_write.cc
+++ b/tests/rocm_smi_test/functional/computepartition_read_write.cc
@@ -428,7 +428,7 @@
      * [0:SPX, 1:CPX, 2:CPX, 3:CPX, 4:CPX, 5:CPX, 6:SPX, 7:SPX] <- set 1 to CPX
      * [0:SPX, 1:SPX, 2:SPX, 3:SPX] <- reset(1)
      * ...
-     *
+     * 
      */
     std::string final_partition_state = "UNKNOWN";
 
@@ -609,29 +609,29 @@
                 << "========" << std::endl;
     }
     std::string oldPartition = current_char_computePartition;
-    rsmi_compute_partition_type_t updatePartition =
-         static_cast<rsmi_compute_partition_type_t>(
-          mapStringToRSMIComputePartitionTypes.at(
-            std::string(orig_char_computePartition)));
-    ret = rsmi_dev_compute_partition_set(dv_ind, updatePartition);
-
-
-    ret = rsmi_dev_compute_partition_get(dv_ind, current_char_computePartition, 255);
-    if (strcmp(oldPartition.c_str(), current_char_computePartition) !=
-       0) {
-        devicePartitionUpdated = true;
-        final_partition_state = current_char_computePartition;
-    } else {
-        devicePartitionUpdated = false;
+    bool wasResetSuccess = false;
+    ret = rsmi_dev_compute_partition_reset(dv_ind);
+    IF_VERB(STANDARD) {
+      std::cout << "\t**"
+                << "rsmi_dev_compute_partition_reset(" << dv_ind << "): "
+                << amd::smi::getRSMIStatusString(ret, false) << "\n";
+    }
+    ASSERT_TRUE((ret == RSMI_STATUS_SUCCESS) ||
+                (ret == RSMI_STATUS_NOT_SUPPORTED) ||
+                (ret == RSMI_STATUS_BUSY));
+    if (ret == RSMI_STATUS_SUCCESS) {
+      wasResetSuccess = true;
     }
+    ret = rsmi_dev_compute_partition_get(dv_ind, current_char_computePartition,
+                                        255);
     CHK_ERR_ASRT(ret)
     IF_VERB(STANDARD) {
       std::cout << "\t**" << "Current compute partition: "
                 << current_char_computePartition << "\n"
-                << "\t**" << "Old Partition partition (before setting to original): "
-                << oldPartition << "\n"
                 << "\t**" << "Original compute partition: "
                 << orig_char_computePartition << "\n"
+                << "\t**" << "Reset Successful: "
+                << (wasResetSuccess ? "TRUE" : "FALSE") << "\n"
                 << "\t**" << "Partitions Updated: "
                 << (devicePartitionUpdated ? "TRUE" : "FALSE") << "\n";
     }
@@ -643,8 +643,8 @@
       checkPartitionIdChanges(dv_ind, std::string(current_char_computePartition),
                             isVerbose, false);
     }
-    if (devicePartitionUpdated) {
-      EXPECT_STRNE(oldPartition.c_str(), current_char_computePartition);
+    if (wasResetSuccess && devicePartitionUpdated) {
+      ASSERT_STRNE(oldPartition.c_str(), current_char_computePartition);
       IF_VERB(STANDARD) {
       std::cout << "\t**"
                 << "Confirmed prior partition (" << oldPartition << ") is not "
--- a/tests/rocm_smi_test/functional/memorypartition_read_write.cc
+++ b/tests/rocm_smi_test/functional/memorypartition_read_write.cc
@@ -411,7 +411,13 @@
                 << " ) ========" << std::endl;
     }
     std::string oldMode = current_memory_partition;
-
+    bool wasResetSuccess = false;
+    ret = rsmi_dev_memory_partition_reset(dv_ind);
+    ASSERT_TRUE((ret == RSMI_STATUS_SUCCESS) ||
+                (ret == RSMI_STATUS_NOT_SUPPORTED));
+    if (ret == RSMI_STATUS_SUCCESS) {
+      wasResetSuccess = true;
+    }
     ret = rsmi_dev_memory_partition_get(dv_ind, current_memory_partition, 255);
     CHK_ERR_ASRT(ret)
     IF_VERB(STANDARD) {
@@ -419,6 +425,23 @@
                 << "Current memory partition: " << current_memory_partition
                 << std::endl;
     }
+    if (wasResetSuccess && wasSetSuccess) {
+      ASSERT_STRNE(oldMode.c_str(), current_memory_partition);
+      IF_VERB(STANDARD) {
+      std::cout << "\t**"
+                << "Confirmed prior memory partition (" << oldMode << ") is "
+                << "not equal to current memory partition ("
+                << current_memory_partition << ")" << std::endl;
+      }
+    } else {
+      ASSERT_STREQ(oldMode.c_str(), current_memory_partition);
+      IF_VERB(STANDARD) {
+      std::cout << "\t**"
+                << "Confirmed prior memory partition (" << oldMode << ") is "
+                << "equal to current memory partition ("
+                << current_memory_partition << ")" << std::endl;
+      }
+    }
 
     new_memory_partition
       = mapStringToRSMIMemoryPartitionTypes.at(orig_memory_partition);
