1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374
|
/*
* Copyright (C) 2018-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/command_stream/csr_definitions.h"
#include "shared/source/command_stream/wait_status.h"
#include "shared/source/direct_submission/direct_submission_hw.h"
#include "shared/source/direct_submission/dispatchers/blitter_dispatcher.h"
#include "shared/source/direct_submission/dispatchers/render_dispatcher.h"
#include "shared/source/helpers/dirty_state_helpers.h"
#include "shared/source/helpers/pipeline_select_args.h"
#include "shared/source/helpers/state_base_address_helper.h"
namespace NEO {
// Forward declarations. TagNodeBase and PipeControlArgs are only used below
// through pointers/references in member-function signatures, so their full
// definitions are not needed by this header.
class TagNodeBase;
// NOTE(review): DeviceCommandStreamReceiver is declared here but is not
// referenced by the class declaration that follows in this header.
template <typename GfxFamily>
class DeviceCommandStreamReceiver;
struct PipeControlArgs;
// Command stream receiver specialised per graphics family.
//
// Derives from CommandStreamReceiver and overrides its submission, wait and
// direct-submission hooks. Only a handful of trivial members are defined
// inline here; every other member function is declared only and is defined
// out of line.
//
// GfxFamily supplies the hardware command types (MI_BATCH_BUFFER_START,
// PIPE_CONTROL) and selects the STATE_BASE_ADDRESS type through
// StateBaseAddressTypeHelper.
template <typename GfxFamily>
class CommandStreamReceiverHw : public CommandStreamReceiver {
    using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
    using STATE_BASE_ADDRESS = typename StateBaseAddressTypeHelper<GfxFamily>::type;

    // Working state shared by the handleImmediateFlush*/dispatchImmediateFlush*
    // helpers declared below; each of them receives it by reference.
    // All fields start zeroed / false / null.
    struct ImmediateFlushData {
        PipelineSelectArgs pipelineSelectArgs{};
        size_t estimatedSize = 0;
        void *endPtr = nullptr;
        size_t csrStartOffset = 0;
        bool pipelineSelectFullConfigurationNeeded = false;
        bool pipelineSelectDirty = false;
        bool frontEndFullConfigurationNeeded = false;
        bool frontEndDirty = false;
        bool stateComputeModeFullConfigurationNeeded = false;
        bool stateComputeModeDirty = false;
        bool stateBaseAddressFullConfigurationNeeded = false;
        bool stateBaseAddressDirty = false;
        bool contextOneTimeInit = false;
        bool stateCacheFlushRequired = false;
    };

  public:
    // Factory: allocates a CommandStreamReceiverHw<GfxFamily> with `new` and
    // returns it through the base-class pointer. The result is a raw pointer;
    // presumably the caller takes ownership (confirm against call sites).
    static CommandStreamReceiver *create(ExecutionEnvironment &executionEnvironment,
                                         uint32_t rootDeviceIndex,
                                         const DeviceBitfield deviceBitfield) {
        return new CommandStreamReceiverHw<GfxFamily>(executionEnvironment, rootDeviceIndex, deviceBitfield);
    }

    CommandStreamReceiverHw(ExecutionEnvironment &executionEnvironment,
                            uint32_t rootDeviceIndex,
                            const DeviceBitfield deviceBitfield);
    ~CommandStreamReceiverHw() override;

    // --- Submission entry points (mostly overrides of CommandStreamReceiver) ---
    SubmissionStatus flush(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) override;

    CompletionStamp flushTask(LinearStream &commandStream, size_t commandStreamStart,
                              const IndirectHeap *dsh, const IndirectHeap *ioh, const IndirectHeap *ssh,
                              TaskCountType taskLevel, DispatchFlags &dispatchFlags, Device &device) override;

    void addPipeControlFlushTaskIfNeeded(LinearStream &commandStreamCSR, TaskCountType taskLevel);

    CompletionStamp flushBcsTask(LinearStream &commandStreamTask, size_t commandStreamTaskStart, const DispatchBcsFlags &dispatchBcsFlags, const HardwareInfo &hwInfo) override;

    CompletionStamp flushImmediateTask(LinearStream &immediateCommandStream, size_t immediateCommandStreamStart,
                                       ImmediateDispatchFlags &dispatchFlags, Device &device) override;

    CompletionStamp flushImmediateTaskStateless(LinearStream &immediateCommandStream, size_t immediateCommandStreamStart,
                                                ImmediateDispatchFlags &dispatchFlags, Device &device) override;

    void forcePipeControl(NEO::LinearStream &commandStreamCSR);

    bool flushBatchedSubmissions() override;
    void programHardwareContext(LinearStream &cmdStream) override;
    size_t getCmdsSizeForHardwareContext() const override;

    // --- Batch-buffer start/end helpers ---
    static void addBatchBufferEnd(LinearStream &commandStream, void **patchLocation);
    void programEndingCmd(LinearStream &commandStream, void **patchLocation, bool directSubmissionEnabled, bool hasRelaxedOrderingDependencies, bool isBcs);
    void addBatchBufferStart(MI_BATCH_BUFFER_START *commandBufferMemory, uint64_t startAddress, bool secondary);

    // --- Size queries; each returns a size_t (presumably a byte count of the
    // commands the matching program* routine emits — confirm in the .inl) ---
    size_t getRequiredStateBaseAddressSize(const Device &device) const;
    size_t getRequiredCmdStreamSize(const DispatchFlags &dispatchFlags, Device &device);
    size_t getRequiredCmdStreamSizeAligned(const DispatchFlags &dispatchFlags, Device &device);
    size_t getRequiredCmdStreamSize(const DispatchBcsFlags &dispatchBcsFlags);
    size_t getRequiredCmdStreamSizeAligned(const DispatchBcsFlags &dispatchBcsFlags);
    size_t getRequiredCmdStreamHeaplessSize(const DispatchFlags &dispatchFlags, Device &device);
    size_t getRequiredCmdStreamHeaplessSizeAligned(const DispatchFlags &dispatchFlags, Device &device);
    size_t getRequiredCmdSizeForPreamble(Device &device) const;
    size_t getCmdSizeForPreemption(const DispatchFlags &dispatchFlags) const;
    size_t getCmdSizeForEpilogue(const DispatchFlags &dispatchFlags) const;
    size_t getCmdSizeForEpilogueCommands(const DispatchFlags &dispatchFlags) const;
    size_t getCmdSizeForL3Config() const;
    size_t getCmdSizeForPipelineSelect() const;
    size_t getCmdSizeForEngineMode(const DispatchFlags &dispatchFlags) const;
    size_t getCmdSizeForPerDssBackedBuffer(const HardwareInfo &hwInfo);
    size_t getCmdSizeForActivePartitionConfig() const;
    size_t getCmdSizeForStallingCommands(const DispatchFlags &dispatchFlags) const;
    size_t getCmdSizeForStallingNoPostSyncCommands() const;
    size_t getCmdSizeForStallingPostSyncCommands() const;
    size_t getCmdSizeForComputeMode();

    MOCKABLE_VIRTUAL bool hasSharedHandles();

    bool isPipelineSelectAlreadyProgrammed() const;
    void programComputeMode(LinearStream &csr, DispatchFlags &dispatchFlags, const HardwareInfo &hwInfo);

    WaitStatus waitForTaskCountWithKmdNotifyFallback(TaskCountType taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, QueueThrottle throttle) override;

    // NOTE(review): "Addres" (single 's') is the established spelling of these
    // two member names; renaming them would break existing callers.
    void collectStateBaseAddresPatchInfo(
        uint64_t commandBufferAddress,
        uint64_t commandOffset,
        const LinearStream *dsh,
        const LinearStream *ioh,
        const LinearStream *ssh,
        uint64_t generalStateBase,
        bool imagesSupported);

    void collectStateBaseAddresIohPatchInfo(uint64_t commandBufferAddress, uint64_t commandOffset, const LinearStream &ioh);

    void resetKmdNotifyHelper(KmdNotifyHelper *newHelper);

    // This receiver always reports the `hardware` type.
    CommandStreamReceiverType getType() const override {
        return CommandStreamReceiverType::hardware;
    }

    TaskCountType flushBcsTask(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking, Device &device) override;

    SubmissionStatus flushTagUpdate() override;
    SubmissionStatus flushMiFlushDW(bool initializeProlog);
    SubmissionStatus flushPipeControl(bool stateCacheFlush);
    SubmissionStatus flushSmallTask(LinearStream &commandStreamTask,
                                    size_t commandStreamStartTask);
    MOCKABLE_VIRTUAL SubmissionStatus flushHandler(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency);
    SubmissionStatus sendRenderStateCacheFlush() override;

    bool isUpdateTagFromWaitEnabled() override;
    void updateTagFromWait() override;

    bool isMultiOsContextCapable() const override;

    MemoryCompressionState getMemoryCompressionState(bool auxTranslationRequired) const override;

    // --- Direct submission ---
    // True when the render direct-submission pointer is non-null.
    bool isDirectSubmissionEnabled() const override {
        return directSubmission.get() != nullptr;
    }

    // True when the blitter direct-submission pointer is non-null.
    bool isBlitterDirectSubmissionEnabled() const override {
        return blitterDirectSubmission.get() != nullptr;
    }

    bool directSubmissionRelaxedOrderingEnabled() const override;
    uint32_t getDirectSubmissionRelaxedOrderingQueueDepth() const override;

    void stopDirectSubmission(bool blocking, bool needsLock) override;

    QueueThrottle getLastDirectSubmissionThrottle() override;

    // Default implementation reports true; virtual so derived receivers can
    // override it.
    virtual bool isKmdWaitModeActive() { return true; }

    bool initDirectSubmission() override;
    GraphicsAllocation *getClearColorAllocation() override;

    TagAllocatorBase *getTimestampPacketAllocator() override;
    std::unique_ptr<TagAllocatorBase> createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer &rootDeviceIndices) override;

    void postInitFlagsSetup() override;
    void programActivePartitionConfig(LinearStream &csr);

    // The compute barrier is emitted as the stalling barrier without post-sync.
    void programComputeBarrierCommand(LinearStream &cmdStream) override {
        programStallingNoPostSyncCommandsForBarrier(cmdStream);
    }

    // Size counterpart of programComputeBarrierCommand(): forwards to the
    // no-post-sync stalling-command size query.
    size_t getCmdsSizeForComputeBarrierCommand() const override {
        return getCmdSizeForStallingNoPostSyncCommands();
    }

    void programStallingCommandsForBarrier(LinearStream &cmdStream, TimestampPacketContainer *barrierTimestampPacketNodes, const bool isDcFlushRequired) override;

    SubmissionStatus initializeDeviceWithFirstSubmission(Device &device) override;

    // Mutable accessors for the per-heap dirty-state trackers.
    HeapDirtyState &getDshState() {
        return dshState;
    }
    HeapDirtyState &getSshState() {
        return sshState;
    }
    HeapDirtyState &getIohState() {
        return iohState;
    }

    void dispatchRayTracingStateCommand(LinearStream &cmdStream, Device &device);
    uint64_t getScratchPatchAddress();

    // --- Heapless-mode programming ---
    SubmissionStatus programHeaplessProlog(Device &device);
    MOCKABLE_VIRTUAL void programHeaplessStateProlog(Device &device, LinearStream &commandStream);
    void programStateBaseAddressHeapless(Device &device, LinearStream &commandStream);
    void programComputeModeHeapless(Device &device, LinearStream &commandStream);
    void handleAllocationsResidencyForflushTaskStateless(const IndirectHeap *dsh, const IndirectHeap *ioh, const IndirectHeap *ssh, Device &device);

    bool submitDependencyUpdate(TagNodeBase *tag) override;
    void unblockPagingFenceSemaphore(uint64_t pagingFenceValue) override;

  protected:
    // Heapful / heapless variants of flushTask; both share flushTask's
    // parameter list and override base-class virtuals.
    CompletionStamp flushTaskHeapful(LinearStream &commandStream, size_t commandStreamStart,
                                     const IndirectHeap *dsh, const IndirectHeap *ioh, const IndirectHeap *ssh,
                                     TaskCountType taskLevel, DispatchFlags &dispatchFlags, Device &device) override;

    CompletionStamp flushTaskHeapless(LinearStream &commandStream, size_t commandStreamStart,
                                      const IndirectHeap *dsh, const IndirectHeap *ioh, const IndirectHeap *ssh,
                                      TaskCountType taskLevel, DispatchFlags &dispatchFlags, Device &device) override;

    // --- Command programming helpers; each writes into the LinearStream it
    // is handed (bodies are defined out of line) ---
    void programPreemption(LinearStream &csr, DispatchFlags &dispatchFlags);
    void programL3(LinearStream &csr, uint32_t &newL3Config, bool isBcs);
    void programPreamble(LinearStream &csr, Device &device, uint32_t &newL3Config);
    void programPipelineSelect(LinearStream &csr, PipelineSelectArgs &pipelineSelectArgs);
    void programEpilogue(LinearStream &csr, Device &device, void **batchBufferEndLocation, DispatchFlags &dispatchFlags);
    void programEpliogueCommands(LinearStream &csr, const DispatchFlags &dispatchFlags);
    void programPerDssBackedBuffer(LinearStream &scr, Device &device, DispatchFlags &dispatchFlags);
    void programStateSip(LinearStream &cmdStream, Device &device);
    void programVFEState(LinearStream &csr, DispatchFlags &dispatchFlags, uint32_t maxFrontEndThreads);
    void programStallingNoPostSyncCommandsForBarrier(LinearStream &cmdStream);
    void programStallingPostSyncCommandsForBarrier(LinearStream &cmdStream, TagNodeBase &tagNode, bool dcFlushRequired);
    void programEngineModeCommands(LinearStream &csr, const DispatchFlags &dispatchFlags);
    void programEngineModeEpliogue(LinearStream &csr, const DispatchFlags &dispatchFlags);
    void programActivePartitionConfigFlushTask(LinearStream &csr);

    void programEnginePrologue(LinearStream &csr);
    size_t getCmdSizeForPrologue() const;
    void programExceptions(LinearStream &csr, Device &device);
    size_t getCmdSizeForExceptions() const;
    size_t getCmdSizeForHeaplessPrologue(Device &device) const;
    void handleAllocationsResidencyForHeaplessProlog(LinearStream &linearStream, Device &device);

    void addPipeControlBeforeStateSip(LinearStream &commandStream, Device &device);
    void addPipeControlBefore3dState(LinearStream &commandStream, DispatchFlags &dispatchFlags);
    bool are4GbHeapsAvailable() const;

    void createScratchSpaceController();

    bool detectInitProgrammingFlagsRequired(const DispatchFlags &dispatchFlags) const;
    bool checkPlatformSupportsNewResourceImplicitFlush() const;
    bool checkPlatformSupportsGpuIdleImplicitFlush() const;
    void configurePostSyncWriteOffset();
    void unregisterDirectSubmissionFromController();

    void handleFrontEndStateTransition(const DispatchFlags &dispatchFlags);
    void handlePipelineSelectStateTransition(const DispatchFlags &dispatchFlags);
    void handleStateBaseAddressStateTransition(const DispatchFlags &dispatchFlags, bool &isStateBaseAddressDirty);
    void updateStreamTaskCount(LinearStream &stream, TaskCountType newTaskCount);

    // --- State base address programming ---
    inline void programStateBaseAddress(const IndirectHeap *dsh,
                                        const IndirectHeap *ioh,
                                        const IndirectHeap *ssh,
                                        DispatchFlags &dispatchFlags,
                                        Device &device, LinearStream &commandStreamCSR,
                                        bool stateBaseAddressDirty);

    inline void reprogramStateBaseAddress(const IndirectHeap *dsh,
                                          const IndirectHeap *ioh,
                                          const IndirectHeap *ssh,
                                          DispatchFlags &dispatchFlags,
                                          Device &device, LinearStream &commandStreamCSR,
                                          bool force32BitAllocations, bool sshDirty, bool bindingTablePoolCommandNeeded);

    inline void programStateBaseAddressCommon(const IndirectHeap *dsh,
                                              const IndirectHeap *ioh,
                                              const IndirectHeap *ssh,
                                              StateBaseAddressProperties *sbaProperties,
                                              uint64_t generalStateBaseAddress,
                                              uint64_t indirectObjectStateBaseAddress,
                                              PipelineSelectArgs &pipelineSelectArgs,
                                              Device &device,
                                              LinearStream &csrCommandStream,
                                              bool dispatchBindingTableCommand,
                                              bool areMultipleSubDevicesInContext,
                                              bool setGeneralStateBaseAddress);

    inline void emitTagUpdateWithoutDCFlush(LinearStream &commandStream);

    // Both pipe-control pointers and the two bool flags are non-const
    // references, i.e. in/out parameters. handleFlushTaskSubmission() below
    // takes the same two pointers by value.
    inline void processBarrierWithPostSync(LinearStream &commandStreamTask,
                                           DispatchFlags &dispatchFlags,
                                           bool &levelClosed,
                                           void *&currentPipeControlForNooping,
                                           void *&epiloguePipeControlLocation,
                                           bool &hasStallingCmdsOnTaskStream,
                                           PipeControlArgs &args);

    inline CompletionStamp handleFlushTaskSubmission(BatchBuffer &&batchBuffer,
                                                     const DispatchFlags &dispatchFlags,
                                                     Device &device,
                                                     void *currentPipeControlForNooping,
                                                     void *epiloguePipeControlLocation,
                                                     PipeControlArgs &args,
                                                     bool submitTask,
                                                     bool submitCSR,
                                                     bool hasStallingCmdsOnTaskStream,
                                                     bool levelClosed,
                                                     bool implicitFlush);

    inline CompletionStamp updateTaskCountAndGetCompletionStamp(bool levelClosed);
    inline void programSamplerCacheFlushBetweenRedescribedSurfaceReads(LinearStream &commandStreamCSR);
    bool bcsRelaxedOrderingAllowed(const BlitPropertiesContainer &blitPropertiesContainer, bool hasStallingCmds) const;

    // --- Immediate-flush pipeline. Declared as handle*/dispatch* pairs per
    // state category; all of them exchange state through ImmediateFlushData ---
    inline void handleImmediateFlushPipelineSelectState(ImmediateDispatchFlags &dispatchFlags, ImmediateFlushData &flushData);
    inline void dispatchImmediateFlushPipelineSelectCommand(ImmediateFlushData &flushData, LinearStream &csrStream);

    inline void handleImmediateFlushFrontEndState(ImmediateDispatchFlags &dispatchFlags, ImmediateFlushData &flushData);
    inline void dispatchImmediateFlushFrontEndCommand(ImmediateFlushData &flushData, Device &device, LinearStream &csrStream);

    inline void handleImmediateFlushStateComputeModeState(ImmediateDispatchFlags &dispatchFlags, ImmediateFlushData &flushData);
    inline void dispatchImmediateFlushStateComputeModeCommand(ImmediateFlushData &flushData, LinearStream &csrStream);

    inline void handleImmediateFlushStateBaseAddressState(ImmediateDispatchFlags &dispatchFlags, ImmediateFlushData &flushData, Device &device);
    inline void dispatchImmediateFlushStateBaseAddressCommand(ImmediateFlushData &flushData, LinearStream &csrStream, Device &device);

    inline void handleImmediateFlushOneTimeContextInitState(ImmediateDispatchFlags &dispatchFlags, ImmediateFlushData &flushData, Device &device);
    inline void dispatchImmediateFlushOneTimeContextInitCommand(ImmediateFlushData &flushData, LinearStream &csrStream, Device &device);

    inline void handleImmediateFlushJumpToImmediate(ImmediateFlushData &flushData);
    inline void dispatchImmediateFlushJumpToImmediateCommand(LinearStream &immediateCommandStream,
                                                             size_t immediateCommandStreamStart,
                                                             ImmediateFlushData &flushData,
                                                             LinearStream &csrStream);

    inline void dispatchImmediateFlushClientBufferCommands(ImmediateDispatchFlags &dispatchFlags,
                                                           LinearStream &immediateCommandStream,
                                                           ImmediateFlushData &flushData);

    void handleImmediateFlushStatelessAllocationsResidency(size_t csrEstimatedSize,
                                                           LinearStream &csrStream,
                                                           Device &device);

    inline void handleImmediateFlushAllocationsResidency(Device &device,
                                                         LinearStream &immediateCommandStream,
                                                         ImmediateFlushData &flushData,
                                                         LinearStream &csrStream);

    inline CompletionStamp handleImmediateFlushSendBatchBuffer(LinearStream &immediateCommandStream,
                                                               size_t immediateCommandStreamStart,
                                                               ImmediateDispatchFlags &dispatchFlags,
                                                               ImmediateFlushData &flushData,
                                                               LinearStream &csrStream);

    inline void handleBatchedDispatchImplicitFlush(uint64_t globalMemorySize, bool implicitFlush);

    inline BatchBuffer prepareBatchBufferForSubmission(LinearStream &commandStreamTask,
                                                       size_t commandStreamStartTask,
                                                       LinearStream &commandStreamCSR,
                                                       size_t commandStreamStartCSR,
                                                       DispatchFlags &dispatchFlags,
                                                       Device &device,
                                                       bool submitTask,
                                                       bool submitCSR,
                                                       bool hasStallingCmdsOnTaskStream);

    inline void chainCsrWorkToTask(LinearStream &commandStreamCSR,
                                   LinearStream &commandStreamTask,
                                   size_t commandStreamStartTask,
                                   void *bbEndLocation,
                                   size_t &chainedBatchBufferStartOffset,
                                   GraphicsAllocation *&chainedBatchBuffer);

    // --- Data members (declaration order is part of the object layout) ---
    // Dirty-state trackers exposed through getDshState()/getIohState()/getSshState().
    HeapDirtyState dshState;
    HeapDirtyState iohState;
    HeapDirtyState sshState;

    CsrSizeRequestFlags csrSizeRequestFlags = {};

    bool wasSubmittedToSingleSubdevice = false;

    // Null until set; isDirectSubmissionEnabled() and
    // isBlitterDirectSubmissionEnabled() test these for non-null.
    std::unique_ptr<DirectSubmissionHw<GfxFamily, RenderDispatcher<GfxFamily>>> directSubmission;
    std::unique_ptr<DirectSubmissionHw<GfxFamily, BlitterDispatcher<GfxFamily>>> blitterDirectSubmission;

    size_t cmdStreamStart = 0;
    // Starts at uint32_t max (presumably a "nothing sent yet" sentinel — confirm).
    uint32_t latestSentBcsWaValue = std::numeric_limits<uint32_t>::max();
};
} // namespace NEO
|