File: kernel_hw.h

package info (click to toggle)
intel-compute-runtime 20.44.18297-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 34,780 kB
  • sloc: cpp: 379,729; lisp: 4,931; python: 299; sh: 196; makefile: 8
file content (142 lines) | stat: -rw-r--r-- 7,033 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
/*
 * Copyright (C) 2019-2020 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once

#include "shared/source/command_container/command_encoder.h"
#include "shared/source/helpers/string.h"

#include "level_zero/core/source/kernel/kernel_imp.h"
#include "level_zero/core/source/module/module.h"

#include "igfxfmid.h"

#include <algorithm>

namespace L0 {

// Kernel implementation specialized per GFX core family. Provides the
// KernelImp overrides whose encoding depends on GfxFamily, the family type
// that NEO::GfxFamilyMapper associates with gfxCoreFamily.
template <GFXCORE_FAMILY gfxCoreFamily>
struct KernelHw : public KernelImp {
    using KernelImp::KernelImp;
    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;

    // Encodes the buffer surface state for a pointer kernel argument.
    //
    // argIndex - index into the descriptor's explicit args; the entry is read
    //            as an NEO::ArgDescPointer.
    // address  - buffer address to bind for this argument.
    // alloc    - allocation backing the buffer; provides the underlying size and
    //            GPU address and is forwarded to encodeBuffer().
    //
    // The surface state is written into surfaceStateHeapData at the argument's
    // bindful offset. The misaligned part of the address is patched into
    // cross-thread data at argInfo.bufferOffset when that patch succeeds.
    void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) override {
        uint64_t baseAddress = castToUint64(address);
        auto sshAlignmentMask = NEO::EncodeSurfaceState<GfxFamily>::getSurfaceBaseAddressAlignmentMask();

        // Remove misaligned bytes, accounted for in bufferOffset patch token
        baseAddress &= sshAlignmentMask;

        auto offset = ptrDiff(address, reinterpret_cast<void *>(baseAddress));
        size_t sizeTillEndOfSurface = alloc->getUnderlyingBufferSize() - offset;
        auto argInfo = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as<NEO::ArgDescPointer>();
        bool offsetWasPatched = NEO::patchNonPointer(ArrayRef<uint8_t>(this->crossThreadData.get(), this->crossThreadDataSize),
                                                     argInfo.bufferOffset, static_cast<uint32_t>(offset));
        if (false == offsetWasPatched) {
            // fallback to handling offset in surface state: bind the unmasked
            // address, which is then expected to already satisfy the alignment mask
            baseAddress = reinterpret_cast<uintptr_t>(address);
            DEBUG_BREAK_IF(baseAddress != (baseAddress & sshAlignmentMask));
            offset = 0;
        }

        auto surfaceStateAddress = ptrOffset(surfaceStateHeapData.get(), argInfo.bindful);
        uint64_t bufferAddressForSsh = baseAddress;
        auto alignment = NEO::EncodeSurfaceState<GfxFamily>::getSurfaceBaseAddressAlignment();
        size_t bufferSizeForSsh = ptrDiff(alloc->getGpuAddress(), bufferAddressForSsh);
        bufferSizeForSsh += sizeTillEndOfSurface; // take address alignment offset into account
        bufferSizeForSsh = alignUp(bufferSizeForSsh, alignment);

        auto mocs = this->module->getDevice()->getMOCS(true, false);
        NEO::Device *neoDevice = module->getDevice()->getNEODevice();
        NEO::EncodeSurfaceState<GfxFamily>::encodeBuffer(surfaceStateAddress, bufferAddressForSsh, bufferSizeForSsh, mocs,
                                                         false, false, false, neoDevice->getNumAvailableDevices(),
                                                         alloc, neoDevice->getGmmHelper());
    }

    // Creates a new KernelHw that shares this kernel's immutable data and module
    // and receives copies of its argument handlers, residency list, group size,
    // thread configuration and heap / cross-thread / per-thread data buffers.
    //
    // Returns the clone; the caller owns it.
    std::unique_ptr<Kernel> clone() const override {
        std::unique_ptr<Kernel> ret{new KernelHw<gfxCoreFamily>};
        auto cloned = static_cast<KernelHw<gfxCoreFamily> *>(ret.get());

        cloned->kernelImmData = kernelImmData;
        cloned->module = module;
        cloned->kernelArgHandlers.assign(this->kernelArgHandlers.begin(), this->kernelArgHandlers.end());
        cloned->residencyContainer.assign(this->residencyContainer.begin(), this->residencyContainer.end());

        if (printfBuffer != nullptr) {
            // The copied residency list still refers to the source kernel's
            // printf buffer. Drop that entry before the clone sets up its own:
            // move it to the back (if it is not there already) and shrink by one.
            // If it is not in the list, there is nothing to remove; rend() must
            // never be passed to iter_swap because it is not dereferenceable.
            auto &clonedResidency = cloned->residencyContainer;
            const auto sourcePrintfIt = std::find(clonedResidency.rbegin(), clonedResidency.rend(), this->printfBuffer);
            if (sourcePrintfIt != clonedResidency.rend()) {
                if (sourcePrintfIt != clonedResidency.rbegin()) {
                    std::iter_swap(sourcePrintfIt, clonedResidency.rbegin());
                }
                clonedResidency.resize(clonedResidency.size() - 1);
            }
            // NOTE(review): this runs before the clone's crossThreadData is
            // allocated and copied below; if createPrintfBuffer() patches
            // cross-thread data, that patch would not survive - confirm against
            // KernelImp::createPrintfBuffer.
            cloned->createPrintfBuffer();
        }

        std::copy(this->groupSize, this->groupSize + 3, cloned->groupSize);
        cloned->numThreadsPerThreadGroup = this->numThreadsPerThreadGroup;
        cloned->threadExecutionMask = this->threadExecutionMask;

        // Each heap below is deep-copied only when the source actually has data.
        if (this->surfaceStateHeapDataSize > 0) {
            cloned->surfaceStateHeapData.reset(new uint8_t[this->surfaceStateHeapDataSize]);
            memcpy_s(cloned->surfaceStateHeapData.get(),
                     this->surfaceStateHeapDataSize,
                     this->surfaceStateHeapData.get(), this->surfaceStateHeapDataSize);
            cloned->surfaceStateHeapDataSize = this->surfaceStateHeapDataSize;
        }

        if (this->crossThreadDataSize != 0) {
            cloned->crossThreadData.reset(new uint8_t[this->crossThreadDataSize]);
            memcpy_s(cloned->crossThreadData.get(),
                     this->crossThreadDataSize,
                     this->crossThreadData.get(),
                     this->crossThreadDataSize);
            cloned->crossThreadDataSize = this->crossThreadDataSize;
        }

        if (this->dynamicStateHeapDataSize != 0) {
            cloned->dynamicStateHeapData.reset(new uint8_t[this->dynamicStateHeapDataSize]);
            memcpy_s(cloned->dynamicStateHeapData.get(),
                     this->dynamicStateHeapDataSize,
                     this->dynamicStateHeapData.get(), this->dynamicStateHeapDataSize);
            cloned->dynamicStateHeapDataSize = this->dynamicStateHeapDataSize;
        }

        if (this->perThreadDataForWholeThreadGroup != nullptr) {
            // Per-thread data is held through a raw pointer managed with
            // alignedMalloc/alignedFree; the copy uses the allocated size and a
            // 32-byte alignment.
            alignedFree(cloned->perThreadDataForWholeThreadGroup);
            cloned->perThreadDataForWholeThreadGroup = reinterpret_cast<uint8_t *>(alignedMalloc(perThreadDataSizeForWholeThreadGroupAllocated, 32));
            memcpy_s(cloned->perThreadDataForWholeThreadGroup,
                     this->perThreadDataSizeForWholeThreadGroupAllocated,
                     this->perThreadDataForWholeThreadGroup,
                     this->perThreadDataSizeForWholeThreadGroupAllocated);
            cloned->perThreadDataSizeForWholeThreadGroupAllocated = this->perThreadDataSizeForWholeThreadGroupAllocated;
            cloned->perThreadDataSizeForWholeThreadGroup = this->perThreadDataSizeForWholeThreadGroup;
            cloned->perThreadDataSize = this->perThreadDataSize;
        }

        return ret;
    }

    // Updates kernelRequiresGenerationOfLocalIdsByRuntime with the result of
    // EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(),
    // evaluated for the current group size and the attributes of
    // kernelDescriptor (local id channel count, workgroup walk order, the
    // requiresWorkgroupWalkOrder flag and SIMD size).
    void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override {
        // The query takes the local work sizes as a size_t array.
        size_t localWorkSizes[3];
        localWorkSizes[0] = this->groupSize[0];
        localWorkSizes[1] = this->groupSize[1];
        localWorkSizes[2] = this->groupSize[2];

        kernelRequiresGenerationOfLocalIdsByRuntime = NEO::EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(
            kernelDescriptor.kernelAttributes.numLocalIdChannels,
            localWorkSizes,
            std::array<uint8_t, 3>{
                {kernelDescriptor.kernelAttributes.workgroupWalkOrder[0],
                 kernelDescriptor.kernelAttributes.workgroupWalkOrder[1],
                 kernelDescriptor.kernelAttributes.workgroupWalkOrder[2]}},
            kernelDescriptor.kernelAttributes.flags.requiresWorkgroupWalkOrder,
            requiredWorkgroupOrder,
            kernelDescriptor.kernelAttributes.simdSize);
    }
};

} // namespace L0