File: os_ras_imp_gt.cpp

package info (click to toggle)
intel-compute-runtime 25.48.36300.8-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 80,652 kB
  • sloc: cpp: 939,022; lisp: 2,090; sh: 722; makefile: 162; python: 21
file content (357 lines) | stat: -rw-r--r-- 18,312 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
/*
 * Copyright (C) 2021-2025 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/debug_settings/debug_settings_manager.h"

#include "level_zero/tools/source/sysman/linux/fs_access.h"
#include "level_zero/tools/source/sysman/linux/os_sysman_imp.h"
#include "level_zero/tools/source/sysman/linux/pmu/pmu.h"
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp.h"
#include "level_zero/tools/source/sysman/sysman_imp.h"

#include <linux/perf_event.h>

namespace L0 {

// Names of the uncorrectable error events, grouped by experimental RAS error category.
// Each name is combined with an "error--" or "error-gt<N>--" prefix to form the name of the
// PMU config file looked up in the i915 events directory, so every name must appear only once
// per category: a repeated name would be opened and accumulated twice.
static const std::map<zes_ras_error_category_exp_t, std::vector<std::string>> categoryToListOfEventsUncorrectable = {
    {ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS,
     {"fatal-array-bist", "fatal-idi-parity", "fatal-l3-double",
      "fatal-l3-ecc-checker",
      "fatal-sqidi", "fatal-tlb", "fatal-l3bank"}},
    {ZES_RAS_ERROR_CATEGORY_EXP_RESET,
     {"engine-reset"}},
    {ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS,
     {"eu-attention"}},
    {ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS,
     {"soc-fatal-psf-0", "soc-fatal-psf-1", "soc-fatal-psf-2", "soc-fatal-psf-csc-0",
      "soc-fatal-psf-csc-1", "soc-fatal-psf-csc-2", "soc-fatal-punit",
      "sgunit-fatal", "soc-nonfatal-punit", "sgunit-nonfatal", "gsc-nonfatal-mia-shutdown",
      "gsc-nonfatal-aon-parity", "gsc-nonfatal-rom-parity", "gsc-nonfatal-fuse-crc-check",
      "gsc-nonfatal-selfmbist", "gsc-nonfatal-fuse-pull", "gsc-nonfatal-sram-ecc", "gsc-nonfatal-glitch-det",
      "gsc-nonfatal-ucode-parity", "gsc-nonfatal-mia-int", "gsc-nonfatal-wdg-timeout",
      "soc-nonfatal-mdfi-east", "soc-nonfatal-mdfi-south",
      "soc-nonfatal-cd0-mdfi", "soc-fatal-iosf-pciaer", "soc-fatal-iosf-pcierr", "soc-fatal-pciaer",
      "soc-fatal-pcierr", "soc-fatal-serr-spi", "soc-fatal-serr-srcs", "soc-fatal-ur-response", "soc-fatal-ur",
      "soc-fatal-hbm-mca", "soc-fatal-hbm-punit-mca"}},
    {ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS,
     {"fatal-fpu", "fatal-eu-grf", "fatal-sampler", "fatal-slm",
      "fatal-guc", "fatal-eu-ic", "fatal-subslice"}},
    {ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS,
     {"driver-object-migration", "driver-engine-other", "driver-ggtt",
      "driver-gt-interrupt", "driver-gt-other", "driver-guc-communication",
      "driver-rps"}},
    {ZES_RAS_ERROR_CATEGORY_EXP_L3FABRIC_ERRORS,
     {"soc-fatal-mdfi-east", "soc-fatal-mdfi-south", "soc-fatal-mdfi-west", "fatal-l3-fabric", "soc-fatal-cd0-mdfi"}}};

// Names of the correctable error events, grouped by experimental RAS error category.
// This table is consulted when the RAS source handles ZES_RAS_ERROR_TYPE_CORRECTABLE; each name
// is combined with an "error--" or "error-gt<N>--" prefix to form the PMU config file name.
static const std::map<zes_ras_error_category_exp_t, std::vector<std::string>> categoryToListOfEventsCorrectable = {
    {ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS,
     {"correctable-l3-sng", "correctable-l3bank"}},
    {ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS,
     {"sgunit-correctable", "gsc-correctable-sram-ecc"}},
    {ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS,
     {"correctable-eu-grf", "correctable-eu-ic", "correctable-guc", "correctable-sampler", "correctable-slm", "correctable-subslice"}}};

// Closes the descriptor stored in fd, if there is one, and resets fd to -1 so that
// a later call on the same variable is a no-op.
static void closeFd(int64_t &fd) {
    if (fd == -1) {
        return; // nothing is open
    }
    NEO::SysCalls::close(static_cast<int>(fd));
    fd = -1;
}

// Translates an experimental RAS error category into its zes_ras_error_cat_t counterpart.
// osRasGetState() skips every experimental category that has no entry in this table.
static const std::map<zes_ras_error_category_exp_t, zes_ras_error_cat_t> rasErrorCatExpToErrorCat = {
    {ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS, ZES_RAS_ERROR_CAT_CACHE_ERRORS},
    {ZES_RAS_ERROR_CATEGORY_EXP_RESET, ZES_RAS_ERROR_CAT_RESET},
    {ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS, ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS},
    {ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS, ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS},
    {ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS, ZES_RAS_ERROR_CAT_COMPUTE_ERRORS},
    {ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS, ZES_RAS_ERROR_CAT_DRIVER_ERRORS},
    {ZES_RAS_ERROR_CATEGORY_EXP_DISPLAY_ERRORS, ZES_RAS_ERROR_CAT_DISPLAY_ERRORS}};

// Lists the PMU event files that the i915 driver exposes for the device.
//
// pLinuxSysmanImp - provides the sysfs and filesystem accessors used for the lookup.
// listOfEvents    - receives the entries found in the events directory.
// eventDirectory  - optional; when not null it receives the path of the events directory.
//
// Returns ZE_RESULT_SUCCESS when the directory was listed, otherwise
// ZE_RESULT_ERROR_UNSUPPORTED_FEATURE.
static ze_result_t readI915EventsDirectory(LinuxSysmanImp *pLinuxSysmanImp, std::vector<std::string> &listOfEvents, std::string *eventDirectory) {
    // All supported errors are enumerated in sysfs under /sys/devices/i915_<bdf>/events/.
    // For a device in PCI slot 0000:01:00.0 that is /sys/devices/i915_0000_01_00.0/events/,
    // so the directory name is derived from the target of the "device" symlink.
    const std::string deviceLink("device");
    std::string deviceLinkTarget;
    if (pLinuxSysmanImp->getSysfsAccess().readSymLink(deviceLink, deviceLinkTarget) != ZE_RESULT_SUCCESS) {
        NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed to read Symlink from %s and returning error:0x%x \n", __FUNCTION__, deviceLink.c_str(), ZE_RESULT_ERROR_UNSUPPORTED_FEATURE);
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    // The last path component of the link target is the PCI address; ':' becomes '_' in the directory name.
    std::string pciAddress = deviceLinkTarget.substr(deviceLinkTarget.find_last_of('/') + 1);
    std::replace(pciAddress.begin(), pciAddress.end(), ':', '_');
    const std::string eventsPath = "/sys/devices/i915_" + pciAddress + "/events";
    if (eventDirectory != nullptr) {
        *eventDirectory = eventsPath;
    }

    if (pLinuxSysmanImp->getFsAccess().listDirectory(eventsPath, listOfEvents) != ZE_RESULT_SUCCESS) {
        NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed to list directories from %s and returning error:0x%x \n", __FUNCTION__, eventsPath.c_str(), ZE_RESULT_ERROR_UNSUPPORTED_FEATURE);
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }
    return ZE_RESULT_SUCCESS;
}

// Parses the hexadecimal number that follows the first '=' in strVal (for example
// "config=0x10" yields 16). When there is no '=', the whole string is parsed.
// Returns 0 when no hexadecimal number can be extracted.
static uint64_t convertHexToUint64(std::string strVal) {
    const auto separator = strVal.find('=');
    const std::string::size_type valueStart = (separator == std::string::npos) ? 0 : separator + 1;
    std::stringstream parser;
    parser << strVal.substr(valueStart);
    uint64_t parsedValue = 0;
    parser >> std::hex >> parsedValue;
    return parsedValue;
}

// Reports whether at least one error event named in categoryToListOfEvents is present in eventList.
//
// categoryToListOfEvents - error names grouped by category; taken by const reference so the
//                          table (and all of its strings) is not copied on every call.
// eventList              - names of the PMU config files found in the events directory.
// deviceHandle           - device whose sub-device information selects the file-name prefix.
//
// Returns true as soon as one matching config file is found, false otherwise.
static bool getErrorType(const std::map<zes_ras_error_category_exp_t, std::vector<std::string>> &categoryToListOfEvents, const std::vector<std::string> &eventList, ze_device_handle_t deviceHandle) {
    ze_bool_t onSubDevice = false;
    uint32_t subDeviceId = 0;
    SysmanDeviceImp::getSysmanDeviceInfo(deviceHandle, subDeviceId, onSubDevice, true);
    // Naming convention of files containing config values for errors
    // error--<Name of error> Ex:- error--engine-reset  (config file with no subdevice)
    // error-gt<N>--<Name of error> Ex:- error-gt0--engine-reset (config file with subdevices)
    // error--<Name of error> Ex:- error--driver-object-migration  (config file for device level errors)
    const std::string deviceLevelPrefix = "error--";
    std::string errorPrefix = deviceLevelPrefix; // prefix string of the file containing config value for pmu counters
    if (onSubDevice == true) {
        errorPrefix = "error-gt" + std::to_string(subDeviceId) + "--";
    }
    for (auto const &rasErrorCatToListOfEvents : categoryToListOfEvents) {
        for (auto const &nameOfError : rasErrorCatToListOfEvents.second) {
            // driver-object-migration occurs at device level, so it never carries a gt<N> prefix
            const std::string &errorPrefixLocal = (nameOfError == "driver-object-migration") ? deviceLevelPrefix : errorPrefix;
            if (std::find(eventList.begin(), eventList.end(), errorPrefixLocal + nameOfError) != eventList.end()) {
                return true;
            }
        }
    }
    return false;
}

void LinuxRasSourceGt::closeFds() {
    for (auto &memberFd : memberFds) {
        closeFd(memberFd);
    }
    memberFds.clear();
    closeFd(groupFd);
}

// Closes the group and member file descriptors still held by this RAS source.
LinuxRasSourceGt::~LinuxRasSourceGt() {
    closeFds();
}

void LinuxRasSourceGt::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle) {
    LinuxSysmanImp *pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
    std::vector<std::string> listOfEvents = {};
    ze_result_t result = readI915EventsDirectory(pLinuxSysmanImp, listOfEvents, nullptr);
    if (result != ZE_RESULT_SUCCESS) {
        return;
    }
    if (getErrorType(categoryToListOfEventsCorrectable, listOfEvents, deviceHandle) == true) {
        errorType.insert(ZES_RAS_ERROR_TYPE_CORRECTABLE);
    }
    if (getErrorType(categoryToListOfEventsUncorrectable, listOfEvents, deviceHandle) == true) {
        errorType.insert(ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
    }
}

// Reads the PMU counter group and fills state.category with the per-category error totals.
//
// state - receives one total per zes_ras_error_cat_t category.
// clear - when true, the open descriptors are closed and both state.category and the stored
//         initial counts are zeroed before the counters are (re)initialized.
//
// Returns ZE_RESULT_SUCCESS on success, or ZE_RESULT_ERROR_UNSUPPORTED_FEATURE when no counter
// group could be opened or the group could not be read.
ze_result_t LinuxRasSourceGt::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    if (clear == true) {
        closeFds();
        memset(state.category, 0, maxRasErrorCategoryCount * sizeof(uint64_t));
        memset(initialErrorCount, 0, maxRasErrorCategoryExpCount * sizeof(uint64_t));
    }
    initRasErrors(clear);
    if (groupFd < 0) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    auto numEvents = memberFds.size() + 1;             // Add 1 for group Fd
    std::vector<std::uint64_t> data(2 + numEvents, 0); // In data[], event count starts from second index, first value gives number of events and second value is for timestamp
    if (pPmuInterface->pmuRead(static_cast<int>(groupFd), data.data(), sizeof(uint64_t) * data.size()) < 0) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }
    /* The data buffer retrieved after reading pmu counters is parsed to get the error count for each suberror category */
    uint64_t initialIndex = 2; // Initial index in the buffer from which the data be parsed begins
    for (auto const &categoryAndEventCount : errorCategoryToEventCount) {
        const auto expCategory = categoryAndEventCount.first;
        const uint64_t eventsInCategory = categoryAndEventCount.second;
        const auto mappedCategory = rasErrorCatExpToErrorCat.find(expCategory);
        if (mappedCategory != rasErrorCatExpToErrorCat.end()) {
            uint64_t errorCount = 0;
            for (uint64_t j = 0; j < eventsInCategory; j++) {
                errorCount += data[initialIndex + j];
            }
            // Index the output with the translated zes_ras_error_cat_t value, not with the
            // experimental enum value, so the result does not depend on the two enums
            // sharing the same numeric values.
            state.category[mappedCategory->second] = errorCount + initialErrorCount[expCategory];
        }
        // Categories without a translation are skipped, but their counters still occupy buffer slots.
        initialIndex += eventsInCategory;
    }

    return ZE_RESULT_SUCCESS;
}

// Reads the PMU counter group and writes one entry per initialized error category into pState,
// stopping after numCategoriesRequested entries.
//
// Returns ZE_RESULT_SUCCESS on success, or ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE when no
// counter group could be opened or the group could not be read.
ze_result_t LinuxRasSourceGt::osRasGetStateExp(uint32_t numCategoriesRequested, zes_ras_state_exp_t *pState) {
    initRasErrors(false);
    if (groupFd < 0) {
        return ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE;
    }

    // Buffer layout: [0] number of events, [1] time value, [2..] one counter per event (group leader + members)
    const auto counterCount = memberFds.size() + 1;
    std::vector<std::uint64_t> data(2 + counterCount, 0);
    const auto bytesToRead = sizeof(uint64_t) * data.size();
    if (pPmuInterface->pmuRead(static_cast<int>(groupFd), data.data(), bytesToRead) < 0) {
        return ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE;
    }

    // Walk the counters category by category; each category owns a run of consecutive slots
    uint64_t readPos = 2;
    uint32_t outIdx = 0u;
    for (auto const &categoryAndEventCount : errorCategoryToEventCount) {
        if (outIdx >= numCategoriesRequested) {
            break;
        }
        uint64_t categoryTotal = 0;
        uint64_t consumed = 0;
        while (consumed < categoryAndEventCount.second) {
            categoryTotal += data[readPos + consumed];
            consumed++;
        }
        pState[outIdx].category = categoryAndEventCount.first;
        pState[outIdx].errorCounter = categoryTotal + initialErrorCount[categoryAndEventCount.first];
        readPos += consumed;
        outIdx++;
    }

    return ZE_RESULT_SUCCESS;
}

// Clears the error state of one category: the open descriptors are closed, the category's bit
// is set in clearStatus and its stored initial count is zeroed.
// Returns ZE_RESULT_ERROR_NOT_AVAILABLE when the category has no entry in
// errorCategoryToEventCount, ZE_RESULT_SUCCESS otherwise.
ze_result_t LinuxRasSourceGt::osRasClearStateExp(zes_ras_error_category_exp_t category) {
    const bool categoryInitialized = errorCategoryToEventCount.find(category) != errorCategoryToEventCount.end();
    if (!categoryInitialized) {
        return ZE_RESULT_ERROR_NOT_AVAILABLE;
    }
    closeFds();
    clearStatus |= (1 << category);
    initialErrorCount[category] = 0;
    return ZE_RESULT_SUCCESS;
}

// Returns the number of categories in the event table that matches this source's error type:
// the uncorrectable table for ZES_RAS_ERROR_TYPE_UNCORRECTABLE, the correctable table otherwise.
uint32_t LinuxRasSourceGt::osRasGetCategoryCount() {
    const bool isUncorrectable = (osRasErrorType == ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
    const auto &eventTable = isUncorrectable ? categoryToListOfEventsUncorrectable : categoryToListOfEventsCorrectable;
    return static_cast<uint32_t>(eventTable.size());
}

// Reads the PMU config string of one error event.
//
// eventDirectory       - directory that holds the event config files.
// listOfEvents         - names of the files known to exist in that directory.
// errorFileToGetConfig - name of the config file to read.
// pmuConfig            - receives the file contents.
//
// Returns ZE_RESULT_ERROR_UNKNOWN when the file is not in listOfEvents, otherwise the result
// of reading the file.
ze_result_t LinuxRasSourceGt::getPmuConfig(
    const std::string &eventDirectory,
    const std::vector<std::string> &listOfEvents,
    const std::string &errorFileToGetConfig,
    std::string &pmuConfig) {
    const bool eventIsListed = std::find(listOfEvents.begin(), listOfEvents.end(), errorFileToGetConfig) != listOfEvents.end();
    if (eventIsListed) {
        return pFsAccess->read(eventDirectory + "/" + errorFileToGetConfig, pmuConfig);
    }
    NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed to find %s from list of events and returning error:0x%x \n", __FUNCTION__, errorFileToGetConfig.c_str(), ZE_RESULT_ERROR_UNKNOWN);
    return ZE_RESULT_ERROR_UNKNOWN;
}

// Reads the current count of one error from its sysfs node.
//
// nameOfError     - PMU event name; every '-' is turned into '_' to obtain the sysfs node name.
// errorCounterDir - sysfs directory that contains the error counter nodes.
// errorVal        - receives the value read from the node.
//
// Returns the result of the sysfs read.
ze_result_t LinuxRasSourceGt::getBootUpErrorCountFromSysfs(
    std::string nameOfError,
    const std::string &errorCounterDir,
    uint64_t &errorVal) {
    for (char &ch : nameOfError) {
        if (ch == '-') {
            ch = '_';
        }
    }
    return pSysfsAccess->read(errorCounterDir + "/" + nameOfError, errorVal);
}

// Opens one PMU counter per supported error event (as a single counter group) and records, for
// every category, how many counters were opened and the error count already accumulated in sysfs.
//
// clear - when true, the counts already present in sysfs are not added to the initial error count.
//
// Does nothing when the group is already open. On return groupFd is negative when no group
// leader could be opened; callers use that to detect failure.
void LinuxRasSourceGt::initRasErrors(ze_bool_t clear) {

    // if already initialized
    if (groupFd >= 0) {
        return;
    }

    std::string eventDirectory;
    std::vector<std::string> listOfEvents = {};
    ze_result_t result = readI915EventsDirectory(pLinuxSysmanImp, listOfEvents, &eventDirectory);
    if (result != ZE_RESULT_SUCCESS) {
        return;
    }
    // Refer to the matching static table instead of copying it (a copy duplicates every string)
    const std::map<zes_ras_error_category_exp_t, std::vector<std::string>> *pCategoryToListOfEvents = nullptr;
    if (osRasErrorType == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
        pCategoryToListOfEvents = &categoryToListOfEventsCorrectable;
    } else if (osRasErrorType == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
        pCategoryToListOfEvents = &categoryToListOfEventsUncorrectable;
    }
    if (pCategoryToListOfEvents == nullptr) {
        return; // no event table for this error type, nothing to initialize
    }
    std::string errorPrefix = "error--";                  // prefix string of the file containing config value for pmu counters
    std::string errorCounterDir = "gt/gt0/error_counter"; // Directory containing the sysfs nodes which in turn contains initial value of error count
    if (isSubdevice == true) {
        errorPrefix = "error-gt" + std::to_string(subdeviceId) + "--";
        errorCounterDir = "gt/gt" + std::to_string(subdeviceId) + "/error_counter";
    }
    // Following loop retrieves initial count of errors from sysfs and pmu config values for each ras error
    // PMU: error--<Name of error> Ex:- error--engine-reset  (config with no subdevice)
    // PMU: error-gt<N>--<Name of error> Ex:- error-gt0--engine-reset (config with subdevices)
    // PMU: error--<Name of error> Ex:- error--driver-object-migration  (config for device level errors)
    // Sysfs: card0/gt/gt0/error_counter/<Name of error> Ex:- gt/gt0/error_counter/engine_reset  (sysfs with no subdevice)
    // Sysfs: card0/gt/gt<N>/error_counter/<Name of error> Ex:- gt/gt1/error_counter/engine_reset  (sysfs with subdevices)
    // Sysfs: error_counter/<Name of error> Ex:- error_counter/driver_object_migration  (sysfs for error which occur at device level)
    for (auto const &rasErrorCatToListOfEvents : *pCategoryToListOfEvents) {
        uint64_t eventCount = 0;
        uint64_t errorCount = 0;
        for (auto const &nameOfError : rasErrorCatToListOfEvents.second) {
            std::string errorPrefixLocal = errorPrefix;
            std::string errorCounterDirLocal = errorCounterDir;
            if (nameOfError == "driver-object-migration") { // check for errors which occur at device level
                errorCounterDirLocal = "error_counter";
                errorPrefixLocal = "error--";
            }
            uint64_t initialErrorVal = 0;
            if ((clear == false) && (getAbsoluteCount(rasErrorCatToListOfEvents.first) == true)) {
                result = getBootUpErrorCountFromSysfs(nameOfError, errorCounterDirLocal, initialErrorVal);
                if (result != ZE_RESULT_SUCCESS) {
                    continue;
                }
            }
            std::string pmuConfig;
            result = getPmuConfig(eventDirectory, listOfEvents, errorPrefixLocal + nameOfError, pmuConfig);
            if (result != ZE_RESULT_SUCCESS) {
                continue;
            }
            uint64_t config = convertHexToUint64(std::move(pmuConfig));
            if (groupFd < 0) {
                groupFd = pPmuInterface->pmuInterfaceOpen(config, -1, PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP); // To get file descriptor of the group leader
                if (groupFd < 0) {
                    return;
                }
            } else {
                // The rest of the group members are created with subsequent calls with groupFd being set to the file descriptor of the group leader
                const auto memberFd = pPmuInterface->pmuInterfaceOpen(config, static_cast<int>(groupFd), PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP);
                if (memberFd < 0) {
                    // An event that could not be opened is not part of the group, so a group read
                    // returns no value for it. Counting it would shift the buffer offsets of every
                    // event that follows and attribute counts to the wrong category.
                    continue;
                }
                memberFds.push_back(memberFd);
            }
            eventCount++;
            errorCount += initialErrorVal;
        }
        clearStatus &= ~(1 << rasErrorCatToListOfEvents.first);
        initialErrorCount[rasErrorCatToListOfEvents.first] = errorCount;
        errorCategoryToEventCount[rasErrorCatToListOfEvents.first] = eventCount;
    }
}

// Stores the sysman context, the error type handled by this source and the sub-device selection,
// then caches the PMU, filesystem and sysfs accessors obtained from pLinuxSysmanImp.
LinuxRasSourceGt::LinuxRasSourceGt(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) : pLinuxSysmanImp(pLinuxSysmanImp), osRasErrorType(type), isSubdevice(onSubdevice), subdeviceId(subdeviceId) {
    // Inside the body the name pLinuxSysmanImp refers to the constructor parameter, which shadows the member
    pPmuInterface = pLinuxSysmanImp->getPmuInterface();
    pFsAccess = &pLinuxSysmanImp->getFsAccess();
    pSysfsAccess = &pLinuxSysmanImp->getSysfsAccess();
}

} // namespace L0