File: bcs_split.inl

package info (click to toggle)
intel-compute-runtime 25.44.36015.8-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 79,632 kB
  • sloc: cpp: 931,547; lisp: 2,074; sh: 719; makefile: 162; python: 21
file content (142 lines) | stat: -rw-r--r-- 6,602 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
/*
 * Copyright (C) 2025 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

namespace L0 {

// Splits a single copy of `size` bytes across the command lists returned by
// getCmdListsForSplit(direction, size) and makes `cmdList` wait for all resulting sub-copies.
//
// For every sub command list the function, in this order: takes unique ownership of the list's CSR,
// calls checkAvailableSpace, adds the waits (optional barrier event, in-order dependency of `cmdList`,
// the caller's wait events), invokes `appendCall` for that engine's slice and calls flushImmediate.
// After the loop, `cmdList` waits on the collected sub-copy events and signals the signal/marker events.
//
// Parameters:
//   cmdList                        - immediate command list the split is issued for; dereferenced unconditionally.
//   copyParams                     - variant whose alternatives expose `dst` and `src`; each slice receives the same
//                                    alternative with both pointers advanced by the bytes already handed out.
//   size                           - total number of bytes to copy.
//   hSignalEvent                   - signal event handle, converted with Event::fromHandle; the resulting pointer is
//                                    null-checked before most uses below.
//   numWaitEvents / phWaitEvents   - forwarded to addEventsToCmdList on every sub command list.
//   performMigration               - not referenced anywhere in this function body.
//   hasRelaxedOrderingDependencies - forwarded to the space check, wait programming and flush calls.
//   direction                      - forwarded to getCmdListsForSplit.
//   estimatedCmdBufferSize         - forwarded to checkAvailableSpace on every sub command list.
//   appendCall                     - callback invoked once per sub command list as
//                                    appendCall(subCmdList, localCopyParams, localSize, eventHandle, aggregatedEventIncrementVal).
//
// Returns:
//   ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY if events.obtainForSplit yields no value,
//   ZE_RESULT_ERROR_INVALID_ARGUMENT if cmdList->handleCounterBasedEventOperations returns false,
//   otherwise the value returned by the last `appendCall` invocation (ZE_RESULT_SUCCESS if the loop never runs).
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t BcsSplit::appendSplitCall(CommandListCoreFamilyImmediate<gfxCoreFamily> *cmdList,
                                      const BcsSplitParams::CopyParams &copyParams,
                                      size_t size,
                                      ze_event_handle_t hSignalEvent,
                                      uint32_t numWaitEvents,
                                      ze_event_handle_t *phWaitEvents,
                                      bool performMigration,
                                      bool hasRelaxedOrderingDependencies,
                                      NEO::TransferDirection direction,
                                      size_t estimatedCmdBufferSize,
                                      AppendCallFuncT<gfxCoreFamily> appendCall) {

    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;

    const auto aggregatedEventsMode = this->events.aggregatedEventsMode;
    auto signalEvent = Event::fromHandle(hSignalEvent);

    ze_result_t result = ZE_RESULT_SUCCESS;
    auto cmdListsForSplit = this->getCmdListsForSplit(direction, size);
    // Number of engines that still need a slice; decremented at the end of every loop iteration.
    auto engineCount = cmdListsForSplit.size();
    size_t markerEventIndex = 0;
    uint64_t aggregatedEventIncrementVal = 1;

    // The signal event is passed directly to every sub-copy only when aggregated events mode is on, the
    // command list reports additional blit properties, the event is an aggregated event and its increment
    // value divides evenly by the number of engines.
    // NOTE(review): the modulo/division by engineCount assumes getCmdListsForSplit never returns an empty
    // list, and the dereference assumes isAggregatedEvent is false for a null event - confirm both.
    const bool useSignalEventForSubcopy = aggregatedEventsMode && cmdList->isUsingAdditionalBlitProperties() && Event::isAggregatedEvent(signalEvent) &&
                                          (signalEvent->getInOrderIncrementValue(1) % engineCount == 0);

    if (useSignalEventForSubcopy) {
        // Each engine contributes an equal share of the event's increment value.
        aggregatedEventIncrementVal = signalEvent->getInOrderIncrementValue(1) / engineCount;
    } else {
        // Otherwise an index into the split event arrays (barrier/subcopy/marker) is obtained for this call.
        auto markerEventIndexRet = this->events.obtainForSplit(Context::fromHandle(cmdList->getCmdListContext()), maxEventCountInPool<GfxFamily>);
        if (!markerEventIndexRet.has_value()) {
            return ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY;
        }
        markerEventIndex = *markerEventIndexRet;
    }

    // Only for a command list without in-order execution that reports a required barrier: signal the barrier
    // event from the main list here; every sub command list waits on it inside the loop.
    auto barrierRequired = !cmdList->isInOrderExecutionEnabled() && cmdList->isBarrierRequired();
    if (barrierRequired) {
        cmdList->appendSignalEvent(this->events.barrier[markerEventIndex]->toHandle(), false);
    }

    // Base index into events.subcopy for the non-aggregated case (see copyEventIndex below).
    auto subcopyEventIndex = markerEventIndex * this->cmdLists.size();
    // Sub-copy event handles the main command list waits on after the loop.
    StackVec<ze_event_handle_t, 16> eventHandles;

    // NOTE(review): this early return happens after a marker index may have been obtained and after the
    // barrier signal may have been appended - confirm nothing needs to be undone on this path.
    if (!cmdList->handleCounterBasedEventOperations(signalEvent, false)) {
        return ZE_RESULT_ERROR_INVALID_ARGUMENT;
    }

    // Bytes not yet assigned to an engine.
    auto totalSize = size;
    for (size_t i = 0; i < cmdListsForSplit.size(); i++) {
        auto subCmdList = static_cast<CommandListCoreFamilyImmediate<gfxCoreFamily> *>(cmdListsForSplit[i]);

        // Held for the rest of this iteration; its address is also handed to flushImmediate below.
        auto lock = subCmdList->getCsr(false)->obtainUniqueOwnership();

        subCmdList->checkAvailableSpace(numWaitEvents, hasRelaxedOrderingDependencies, estimatedCmdBufferSize, false);

        if (barrierRequired) {
            auto barrierEventHandle = this->events.barrier[markerEventIndex]->toHandle();
            subCmdList->addEventsToCmdList(1u, &barrierEventHandle, nullptr, hasRelaxedOrderingDependencies, false, true, false, false);
        }

        // Make the sub command list wait on the main list's in-order counter value and allocation offset.
        if (cmdList->hasInOrderDependencies()) {
            auto &inOrderExecInfo = cmdList->getInOrderExecInfo();
            subCmdList->appendWaitOnInOrderDependency(inOrderExecInfo, nullptr, inOrderExecInfo->getCounterValue(), inOrderExecInfo->getAllocationOffset(), hasRelaxedOrderingDependencies, false, false, false, false);
        }
        subCmdList->addEventsToCmdList(numWaitEvents, phWaitEvents, nullptr, hasRelaxedOrderingDependencies, false, false, false, false);

        // Done on the first sub command list only, and only when the signal event is not used by the sub-copies.
        if (!useSignalEventForSubcopy && signalEvent && i == 0u) {
            subCmdList->appendEventForProfilingAllWalkers(signalEvent, nullptr, nullptr, true, true, false, true);
        }

        // Remaining bytes divided by remaining engines; both shrink at the end of each iteration, so the
        // last engine (engineCount == 1) receives everything that is left, including any remainder.
        auto localSize = totalSize / engineCount;

        BcsSplitParams::CopyParams localCopyParams;

        // Rebuild the same variant alternative with dst/src advanced by the bytes already assigned
        // (size - totalSize).
        std::visit([&](auto &&arg) {
            using T = std::decay_t<decltype(arg)>;
            localCopyParams = T{ptrOffset(arg.dst, size - totalSize),
                                ptrOffset(arg.src, size - totalSize)};
        },
                   copyParams);

        // Aggregated mode: all engines use the subcopy event at markerEventIndex.
        // Otherwise: engine i uses its own entry at subcopyEventIndex + i.
        auto copyEventIndex = aggregatedEventsMode ? markerEventIndex : subcopyEventIndex + i;
        auto eventHandle = useSignalEventForSubcopy ? signalEvent : this->events.subcopy[copyEventIndex]->toHandle();

        if (aggregatedEventsMode && !useSignalEventForSubcopy) {
            // Add the subcopy event's device counter allocation to this sub command list's residency container.
            subCmdList->getCmdContainer().addToResidencyContainer(this->events.subcopy[copyEventIndex]->getInOrderExecInfo()->getDeviceCounterAllocation());
        }

        // NOTE(review): `result` is overwritten on every iteration and the return value of flushImmediate is
        // not used, so only the last appendCall outcome reaches the caller - confirm this is intended.
        result = appendCall(subCmdList, localCopyParams, localSize, eventHandle, aggregatedEventIncrementVal);
        subCmdList->flushImmediate(result, true, !hasRelaxedOrderingDependencies, hasRelaxedOrderingDependencies, NEO::AppendOperations::nonKernel, false, nullptr, true, &lock, nullptr);

        // In aggregated mode eventHandle is identical for every engine, so it is collected once (i == 0);
        // otherwise each engine's own handle is collected.
        if ((aggregatedEventsMode && i == 0) || !aggregatedEventsMode) {
            eventHandles.push_back(eventHandle);
        }

        totalSize -= localSize;
        engineCount--;

        if (signalEvent) {
            signalEvent->appendAdditionalCsr(subCmdList->getCsr(false));
        }
    }

    const bool dualStreamCopyOffload = cmdList->isDualStreamCopyOffloadOperation(cmdList->isCopyOffloadEnabled());

    // The main command list waits on the collected sub-copy events.
    cmdList->addEventsToCmdList(static_cast<uint32_t>(eventHandles.size()), eventHandles.data(), nullptr, hasRelaxedOrderingDependencies, false, true, false, dualStreamCopyOffload);

    const auto isCopyCmdList = cmdList->isCopyOnly(dualStreamCopyOffload);

    // When the sub-copies did not receive the signal event themselves, the main list signals it here.
    if (!useSignalEventForSubcopy && signalEvent) {
        cmdList->appendSignalEventPostWalker(signalEvent, nullptr, nullptr, !isCopyCmdList, false, isCopyCmdList);
    }

    // Outside aggregated mode the marker event for this split is signaled from the main list.
    if (!aggregatedEventsMode) {
        cmdList->appendSignalEventPostWalker(this->events.marker[markerEventIndex], nullptr, nullptr, !isCopyCmdList, false, isCopyCmdList);
    }

    if (cmdList->isInOrderExecutionEnabled()) {
        cmdList->appendSignalInOrderDependencyCounter(signalEvent, dualStreamCopyOffload, false, false, useSignalEventForSubcopy);
    }
    cmdList->handleInOrderDependencyCounter(signalEvent, false, dualStreamCopyOffload);

    // In aggregated mode with an obtained marker index, the marker event receives the main list's in-order
    // exec info; the assignment is done while holding events.mtx.
    if (aggregatedEventsMode && !useSignalEventForSubcopy) {
        std::lock_guard<std::mutex> lock(events.mtx);
        cmdList->assignInOrderExecInfoToEvent(this->events.marker[markerEventIndex]);
    }

    return result;
}

} // namespace L0