File: sysman_os_diagnostics_imp.cpp

package info (click to toggle)
intel-compute-runtime 25.44.36015.8-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 79,632 kB
  • sloc: cpp: 931,547; lisp: 2,074; sh: 719; makefile: 162; python: 21
file content (137 lines) | stat: -rw-r--r-- 6,388 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
/*
 * Copyright (C) 2023-2025 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "level_zero/sysman/source/api/diagnostics/linux/sysman_os_diagnostics_imp.h"

#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/helpers/sleep.h"
#include "shared/source/helpers/string.h"

#include "level_zero/sysman/source/shared/firmware_util/sysman_firmware_util.h"
#include "level_zero/sysman/source/shared/linux/sysman_fs_access_interface.h"

namespace L0 {
namespace Sysman {

// Directory entry name "device".
const std::string LinuxDiagnosticsImp::deviceDir = "device";

// File name of the sysfs node found at /sys/class/drm/card<n>/invalidate_lmem_mmaps
const std::string LinuxDiagnosticsImp::invalidateLmemFile = "invalidate_lmem_mmaps";
// File name of the sysfs node found at /sys/class/drm/card<n>/quiesce_gpu
const std::string LinuxDiagnosticsImp::quiescentGpuFile = "quiesce_gpu";
// This implementation is a no-op: supportedDiagTests is left exactly as the caller passed it in
// and pOsSysman is not used.
void OsDiagnostics::getSupportedDiagTestsFromFW(void *pOsSysman, std::vector<std::string> &supportedDiagTests) {}

// Quiesces the GPU and then invalidates the LMEM mappings ahead of a diagnostics run.
//
// Before running diagnostics all active workloads need to be closed.
// Writing 1 to /sys/class/drm/card<n>/quiesce_gpu signals KMD to close and clear
// all allocations. ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE is reported until the kworker
// confirms that all allocations are closed and the GPU is wedged.
// The GPU will only be unwedged after a warm/cold reset.
// Writing 1 to /sys/class/drm/card<n>/invalidate_lmem_mmaps invalidates all memory
// mappings that reference LMEMBAR and also prevents new ones from being created.
// LMEM memory mappings are invalidated only when the sysfs entry quiesce_gpu is set.
//
// Returns:
//   ZE_RESULT_SUCCESS                     - both sysfs writes succeeded.
//   ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE  - quiesce_gpu still reported "in use" after all retries.
//   error from gpuProcessCleanup()        - process cleanup between retries failed.
//   any other sysfs write error           - returned as reported by pSysfsAccess->write().
ze_result_t LinuxDiagnosticsImp::waitForQuiescentCompletion() {
    constexpr uint32_t maxQuiesceAttempts = 10;
    const int intVal = 1;
    ze_result_t result = ZE_RESULT_ERROR_UNKNOWN;

    // The number of attempts is bounded because we could end up in an infinite loop
    // if the cleanup and a process start are out of sync.
    for (uint32_t attempt = 0; attempt < maxQuiesceAttempts; attempt++) {
        result = pSysfsAccess->write(quiescentGpuFile, intVal);
        if (ZE_RESULT_SUCCESS == result) {
            break;
        }
        if (ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE != result) {
            // Anything other than "in use" is not retried.
            return result;
        }

        // Sleep for 1 second on every retry; this gives KMD time to clear all allocations and wedge the system.
        NEO::sleep(std::chrono::seconds(1));
        auto processResult = pLinuxSysmanImp->gpuProcessCleanup(true);
        if (ZE_RESULT_SUCCESS != processResult) {
            return processResult;
        }
    }

    // All attempts were consumed while the node kept reporting "in use": the GPU is not quiesced,
    // so do not go on to invalidate the LMEM mappings and do not report success.
    if (ZE_RESULT_SUCCESS != result) {
        NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): SysfsAccess->write() could not quiesce the GPU via %s after retries and returning error:0x%x \n", __FUNCTION__, quiescentGpuFile.c_str(), result);
        return result;
    }

    result = pSysfsAccess->write(invalidateLmemFile, intVal);
    if (ZE_RESULT_SUCCESS != result) {
        NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): SysfsAccess->write() failed to write into %s and returning error:0x%x \n", __FUNCTION__, invalidateLmemFile.c_str(), result);
    }
    return result;
}

// Runs the firmware diagnostics selected by osDiagType and resets the device afterwards.
//
// Steps, in order: set diagnosticsReset, release the sysman device resources, run
// gpuProcessCleanup(true), wait for quiescence, invoke fwRunDiagTests(), reset the device
// (cold when *pResult is ZES_DIAG_RESULT_REBOOT_FOR_REPAIR, warm otherwise) and finally
// return whatever reInitSysmanDeviceResources() reports.
// A failure in cleanup, quiescence, the firmware call or the reset is logged through
// printDebugString and returned immediately.
//
// pResult: receives the diagnostics result from fwRunDiagTests(); it is dereferenced here.
ze_result_t LinuxDiagnosticsImp::osRunDiagTestsinFW(zes_diag_result_t *pResult) {
    pLinuxSysmanImp->diagnosticsReset = true;
    pLinuxSysmanImp->releaseSysmanDeviceResources();

    ze_result_t status = pLinuxSysmanImp->gpuProcessCleanup(true);
    if (status != ZE_RESULT_SUCCESS) {
        NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): gpuProcessCleanup() failed and returning error:0x%x \n", __FUNCTION__, status);
        return status;
    }

    status = waitForQuiescentCompletion();
    if (status != ZE_RESULT_SUCCESS) {
        NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): waitForQuiescentCompletion() failed and returning error:0x%x \n", __FUNCTION__, status);
        return status;
    }

    status = pFwInterface->fwRunDiagTests(osDiagType, pResult);
    if (status != ZE_RESULT_SUCCESS) {
        NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): fwRunDiagTests() failed and returning error:0x%x \n", __FUNCTION__, status);
        return status;
    }

    const bool memoryPprRequested = (osDiagType == "MEMORY_PPR");
    if (memoryPprRequested) {
        pLinuxSysmanImp->isMemoryDiagnostics = true;
    }

    const bool coldResetNeeded = (ZES_DIAG_RESULT_REBOOT_FOR_REPAIR == *pResult);
    if (!coldResetNeeded) {
        // At least a warm reset is required to bring the machine out of the wedged state.
        status = pLinuxSysmanImp->osWarmReset();
        if (status != ZE_RESULT_SUCCESS) {
            NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): osWarmReset() failed and returning error:0x%x \n", __FUNCTION__, status);
            return status;
        }
    } else {
        status = pLinuxSysmanImp->osColdReset();
        if (status != ZE_RESULT_SUCCESS) {
            NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): osColdReset() failed and returning error:0x%x \n", __FUNCTION__, status);
            return status;
        }
    }

    return pLinuxSysmanImp->reInitSysmanDeviceResources();
}

// Fills pProperties with the sub-device information and the name of this diagnostics suite.
// haveTests is always reported as 0 because osGetDiagTests is unsupported.
void LinuxDiagnosticsImp::osGetDiagProperties(zes_diag_properties_t *pProperties) {
    // The suite name is copied from osDiagType into the fixed-size name field.
    const char *suiteName = osDiagType.c_str();
    const auto suiteNameLength = osDiagType.size();
    strncpy_s(pProperties->name, ZES_STRING_PROPERTY_SIZE, suiteName, suiteNameLength);

    pProperties->haveTests = 0; // osGetDiagTests is Unsupported
    pProperties->subdeviceId = subdeviceId;
    pProperties->onSubdevice = isSubdevice;
}

// Individual test enumeration is not implemented here: the call is logged and
// ZE_RESULT_ERROR_UNSUPPORTED_FEATURE is returned. Neither pCount nor pTests is accessed.
ze_result_t LinuxDiagnosticsImp::osGetDiagTests(uint32_t *pCount, zes_diag_test_t *pTests) {
    constexpr ze_result_t unsupported = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s() returning UNSUPPORTED_FEATURE \n", __FUNCTION__);
    return unsupported;
}

// Runs the diagnostics through osRunDiagTestsinFW(). The start and end arguments are not
// used; only pResult is forwarded.
ze_result_t LinuxDiagnosticsImp::osRunDiagTests(uint32_t start, uint32_t end, zes_diag_result_t *pResult) {
    const ze_result_t status = osRunDiagTestsinFW(pResult);
    return status;
}

// Stores the diagnostics suite name in osDiagType and caches the Linux sysman object together
// with its firmware-util interface and sysfs accessor.
// pOsSysman is statically cast to LinuxSysmanImp, so it must actually point to one.
LinuxDiagnosticsImp::LinuxDiagnosticsImp(OsSysman *pOsSysman, const std::string &diagTests) : osDiagType(diagTests) {
    auto *linuxSysman = static_cast<LinuxSysmanImp *>(pOsSysman);
    pLinuxSysmanImp = linuxSysman;
    pFwInterface = linuxSysman->getFwUtilInterface();
    pSysfsAccess = &linuxSysman->getSysfsAccess();
}

// Factory: builds a LinuxDiagnosticsImp for the given suite name and hands it back
// as an owning pointer to the OsDiagnostics interface.
std::unique_ptr<OsDiagnostics> OsDiagnostics::create(OsSysman *pOsSysman, const std::string &diagTests) {
    return std::make_unique<LinuxDiagnosticsImp>(pOsSysman, diagTests);
}

} // namespace Sysman
} // namespace L0