/*========================== begin_copyright_notice ============================ Copyright (C) 2017-2026 Intel Corporation SPDX-License-Identifier: MIT ============================= end_copyright_notice ===========================*/ #pragma once #include "IGC/common/StringMacros.hpp" #include "usc.h" #include "usc_gen7.h" #include "usc_gen9.h" #include "common/Stats.hpp" #include "common/Types.hpp" #include "common/allocator.h" #include "common/igc_resourceDimTypes.h" // hack #include "common/debug/Debug.hpp" #include "common/debug/Dump.hpp" #include #include #include #include #include #include "Compiler/CISACodeGen/ShaderUnits.hpp" #include "Compiler/CISACodeGen/Platform.hpp" #include "Compiler/CISACodeGen/DriverInfo.hpp" #include "Compiler/CISACodeGen/helper.h" #include "Compiler/MetaDataApi/MetaDataApi.h" #include "Compiler/MetaDataApi/IGCMetaDataHelper.h" #include "Compiler/CodeGenContextWrapper.hpp" #include "visa/include/RelocationInfo.h" #include #include "AdaptorOCL/OCL/KernelAnnotations.hpp" #include "AdaptorOCL/OCL/sp/spp_g8.h" #include "GenISAIntrinsics/GenIntrinsics.h" #include "GenISAIntrinsics/GenIntrinsicInst.h" #include "common/LLVMWarningsPush.hpp" #include #include #include #include #include "llvm/IR/Function.h" #include "llvm/IR/ValueMap.h" #include #include "llvm/IR/AssemblyAnnotationWriter.h" #include #include "common/LLVMWarningsPop.hpp" #include "CodeGenPublicEnums.h" #include "AdaptorOCL/TranslationBlock.h" #include "AdaptorCommon/RayTracing/API/BVHInfo.h" #include "common/MDFrameWork.h" #include #include "Probe/Assertion.h" #include #include "llvmWrapper/IR/Module.h" #include "Compiler/UserAddrSpaceMD.hpp" /************************************************************************ This file contains the interface structure and functions to communicate between front ends and code generator ************************************************************************/ namespace llvm { class Module; class Function; } // namespace llvm #define 
MAX_VSHADER_INPUT_REGISTERS_PACKAGEABLE 32 namespace IGCOpts { // Pass level optimizations static const std::string LowerGEPForPrivMemPass = "IGC-LowerGEPForPrivMem"; static const std::string AddressArithmeticSinkingPass = "IGC-AddressArithmeticSinking"; static const std::string PreRASchedulerPass = "IGC-PreRAScheduler"; static const std::string MergeURBWritePass = "IGC-MergeURBWrites"; static const std::string ConstantCoalescingPass = "IGC-ConstantCoalescing"; static const std::string SinkLoadOptPass = "IGC-SinkLoadOpt"; // Non-pass optimizations static const std::string AllowSimd32Slicing = "IGC-AllowSimd32Slicing"; } // namespace IGCOpts namespace IGC { struct BifLLVMModule; class CodeGenContext; struct SIMDInfoStruct { uint32_t simd8 = 0; uint32_t simd16 = 0; uint32_t simd32 = 0; uint32_t dual_simd8 = 0; uint32_t quad_simd8_dynamic = 0; }; struct SProgramOutput { public: typedef std::vector SymbolListTy; typedef std::vector RelocListTy; typedef std::vector FuncAttrListTy; // function scope symbols struct ZEBinFuncSymbolTable { SymbolListTy function; // function symbols SymbolListTy sampler; // sampler symbols SymbolListTy local; // local symbols }; // function scope gtpin info struct ZEBinFuncGTPinInfo { std::string name; void *buffer = nullptr; unsigned bufferSize = 0; }; typedef std::vector FuncGTPinInfoListTy; public: void *m_programBin = nullptr; //genIsa) unsigned int m_debugDataSize = 0; // GenISA) unsigned int m_debugDataGenISASize = 0; //; // Pair of name for the section (1st elem) and VISA asm text (2nd elem). 
std::vector m_VISAAsm; // Optional statistics std::optional m_NumGRFSpill; std::optional m_NumGRFFill; std::optional m_NumSends; std::optional m_NumCycles; std::optional m_NumSendStallCycles; unsigned int m_numThreads = 0; unsigned int m_perThreadArgumentStackSize = 0; void Destroy() { if (m_programBin) { IGC::aligned_free(m_programBin); } if (m_debugData) { IGC::aligned_free(m_debugData); } if (m_debugDataGenISA) { IGC::aligned_free(m_debugDataGenISA); } } void init(bool roundPower2KBytes, unsigned int scratchSpaceSizeLimitT, bool useScratchSpacePrivateMemory, bool SepSpillPvtSS, bool SeparateScratchWA) { m_roundPower2KBytes = roundPower2KBytes; m_scratchSpaceSizeLimit = scratchSpaceSizeLimitT; m_UseScratchSpacePrivateMemory = useScratchSpacePrivateMemory; m_SeparatingSpillAndPrivateScratchMemorySpace = SepSpillPvtSS; m_EnableSeparateScratchWA = SeparateScratchWA; } // if IGC needs scratch for private memory, we use slot0 for private // if IGC does not need scratch for private, slot0 is used for spill // if we want to use both private and spill in single slot, we need // to add them together unsigned int getScratchSpaceUsageInSlot0() const { unsigned int result = (m_UseScratchSpacePrivateMemory ? 
m_scratchSpaceUsedByShader : 0); if (result == 0) { result = (m_scratchSpaceUsedBySpills + m_scratchSpaceUsedByGtpin); } else if (!m_SeparatingSpillAndPrivateScratchMemorySpace) { result += (m_scratchSpaceUsedBySpills + m_scratchSpaceUsedByGtpin); } else if (m_EnableSeparateScratchWA) { // \TODO: doubts about driver-compiler interface, conservatively set the size // to the max of two slots result = std::max(result, m_scratchSpaceUsedBySpills + m_scratchSpaceUsedByGtpin); } result = roundSize(result); IGC_ASSERT(result <= m_scratchSpaceSizeLimit); return result; } // slot1 is used for spilling only when m_SeparatingSpillAndPrivateScratchMemorySpace is on // and Slot0 is used for IGC private memory unsigned int getScratchSpaceUsageInSlot1() const { unsigned int slot0_offset = (m_UseScratchSpacePrivateMemory ? m_scratchSpaceUsedByShader : 0); unsigned int result = 0; if (m_SeparatingSpillAndPrivateScratchMemorySpace && slot0_offset > 0) { if (m_EnableSeparateScratchWA) { // \TODO: doubts about driver-compiler interface, conservatively set the size // to the max of two slots result = std::max(slot0_offset, m_scratchSpaceUsedBySpills + m_scratchSpaceUsedByGtpin); } else { result = m_scratchSpaceUsedBySpills + m_scratchSpaceUsedByGtpin; } } result = roundSize(result); IGC_ASSERT(result <= m_scratchSpaceSizeLimit); return result; } unsigned int getScratchSpaceUsageInStateless() const { return roundSize(!m_UseScratchSpacePrivateMemory ? m_scratchSpaceUsedByShader : 0); } void setScratchSpaceUsedByShader(unsigned int scratchSpaceUsedByShader) { m_scratchSpaceUsedByShader = scratchSpaceUsedByShader; } private: unsigned int roundSize(unsigned int size) const { if (m_roundPower2KBytes) { size = roundPower2KBbyte(size); } else { size = roundPower2Byte(size); } return size; } unsigned int roundPower2KBbyte(unsigned int size) const { return (size ? 
iSTD::RoundPower2(iSTD::Max(int_cast(size), static_cast(sizeof(KILOBYTE)))) : 0); } // XeHP_SDV+ : we round to one of values: pow(2, (0, 6, 7, 8...18)) unsigned int roundPower2Byte(unsigned int size) const { unsigned int ret = (size ? iSTD::RoundPower2(int_cast(size)) : 0); // round any value in (0,32] to 64 BYTEs ret = ((ret > 0 && ret <= 32) ? 64 : ret); return ret; } }; enum InstrStatTypes { SROA_PROMOTED, LICM_STAT, TOTAL_TYPES }; enum InstrStatStage { BEGIN, END, EXCEED_THRESHOLD, TOTAL_STAGE }; struct SInstrTypes { bool CorrelatedValuePropagationEnable{}; bool hasMultipleBB{}; bool hasCmp{}; bool hasSwitch{}; bool hasPhi{}; bool hasLoadStore{}; bool hasIndirectCall{}; bool hasInlineAsm{}; bool hasInlineAsmPointerAccess{}; bool hasIndirectBranch{}; bool hasFunctionAddressTaken{}; bool hasSel{}; bool hasPointer{}; bool hasLocalLoadStore{}; bool hasGlobalLoad{}; // has (stateless) loads from global addresspace bool hasGlobalStore{}; // has (stateless) stores to global addresspace bool hasStorageBufferLoad{}; // has (stateful) loads from storage buffers (UAV/SSBO) bool hasStorageBufferStore{}; // has (stateful) stores to storage buffers (UAV/SSBO) bool hasSubroutines{}; bool hasPrimitiveAlloca{}; bool hasNonPrimitiveAlloca{}; bool hasReadOnlyArray{}; bool hasBuiltin{}; bool hasFRem{}; bool psHasSideEffect{}; //=0) { // pushableAddress += // *(uint32_t*)(pShaderRuntimeData + 4*pushableOffsetGrfOffset); // } // pushableAddress += m_offset; // // m_pushableOffsetGrfOffset is also used when isBindless is true and // contains the GRF offset that was used to calculate the Surface State // Offset of the buffer. It must contain one of the values provided by // frontend in pushInfo.bindlessPushInfo metadata. int m_pushableAddressGrfOffset = -1; int m_pushableOffsetGrfOffset = -1; // Immediate offset in bytes added to the start of the simple push region. 
uint m_offset = 0; // Data size in bytes, must be a multiple of GRF size uint m_size = 0; bool isStateless = false; bool isBindless = false; }; struct ConstantPayloadInfo { int DerivedConstantsOffset = -1; }; struct SResInfoFoldingOutput { uint32_t textureID; bool value[4]; }; enum SIMDInfoBit { SIMD_SELECTED, // 0: if the SIMD is selected. If 1, all the other bits are // ignored. SIMD_RETRY, // 1: is a retry SIMD_SKIP_HW, // 2: skip this SIMD due to HW restriction / WA. SIMD_SKIP_REGPRES, // 3: skip this SIMD due to register pressure early // out. SIMD_SKIP_SPILL, // 4: skip this SIMD due to spill or high chance of // spilling. SIMD_SKIP_STALL, // 5: skip this SIMD due to stall cycle or thread // occupancy heuristic. SIMD_SKIP_THGRPSIZE, // 6: skip due to threadGroupSize heuristic(CS / OCL // only). SIMD_SKIP_PERF, // 7: skip this SIMD due to performance concern (dx12 + // discard, MRT, etc) or other reasons. SIMD_SKIP_ML, // 8: skip this SIMD due to ML engine prediction. SIMD_FORCE_CONTENT, // 9: force this simd due to shader content (simd32 if // WaveActive, barriers + interlocks) SIMD_FORCE_HINT, // 10: force this simd by hint(s) (now for WaveSize only) SIMD_INFO_RESERVED // 11: *** If new entry is added, make sure it still // fits in m_SIMDInfo *** }; struct SKernelProgram { SProgramOutput simd1; SProgramOutput simd8; SProgramOutput simd16; SProgramOutput simd32; unsigned int bindingTableEntryCount = 0; char *gatherMap = nullptr; unsigned int gatherMapSize = 0; unsigned int ConstantBufferLength = 0; unsigned int ConstantBufferMask = 0; unsigned int MaxNumberOfThreads = 0; bool isMessageTargetDataCacheDataPort = false; unsigned int NOSBufferSize = 0; unsigned int ConstantBufferLoaded = 0; uint64_t UavLoaded = 0; unsigned int ShaderResourceLoaded[4] = {}; unsigned int RenderTargetLoaded = 0; bool hasControlFlow = false; unsigned int bufferSlot = 0; unsigned int statelessCBPushedSize = 0; bool hasEvalSampler = false; std::vector m_ResInfoFoldingOutput; // 
GenUpdateCB outputs void *m_ConstantBufferReplaceShaderPatterns = nullptr; uint m_ConstantBufferReplaceShaderPatternsSize = 0; uint m_ConstantBufferUsageMask = 0; uint m_ConstantBufferReplaceSize = 0; SSimplePushInfo simplePushInfoArr[g_c_maxNumberOfBufferPushed]; SIMDInfoStruct SIMDInfo; void *m_StagingCtx = nullptr; bool m_RequestStage2 = false; uint numSyncRTStacks = 0; }; /// Gen10+, corresponds to 3DSTATE_VF_SGVS_2 as described below struct SVertexFetchSGVExtendedParameters { struct { bool enabled = false; // SlotNum; uint64_t ShaderHash = 0; // raygen specific fields // TODO: need to separate out bindless and raygen into two structs // for both DX and Vulkan. // dynamically select between the 1D and 2D layout at runtime based // on the size of the dispatch. uint32_t DimX1D = 0; uint32_t DimY1D = 0; uint32_t DimX2D = 0; uint32_t DimY2D = 0; // Shaders that satisfy `isPrimaryShaderIdentifier()` can also have // a collection of other names that they go by. std::vector Aliases; // if the shader was created by cloning another shader // this will contain the name of the original shader std::string OriginatingShaderName; // We maintain this information to provide to GTPin. These are all // offsets in bytes from the base of GRF. uint32_t GlobalPtrOffset = 0; // pointer to RTGlobals uint32_t LocalPtrOffset = 0; // pointer to local root sig (except for raygen!) uint32_t StackIDsOffset = 0; // stack ID vector base // Shader has LSC store messages with non-default L1 cache control bool HasLscStoresWithNonDefaultL1CacheControls = false; // This is just diagnostic information to track whether we picked // the retry shader bool BetterThanPrev = false; // Informs if the kernel requires extra allocation of the sync HW stack bool UsesSyncHWStack = false; }; struct SRayTracingShadersGroup { // This is the default shader that is executed when the RTUnit // encounters a null shader. It is optional because there is // no need to compile it for collection state objects. 
std::optional callStackHandler; typedef llvm::SmallVector BindlessShaderVec; // These are the raygen shaders BindlessShaderVec m_DispatchPrograms; // Non raygen shaders BindlessShaderVec m_CallableShaders; // Continuation shaders BindlessShaderVec m_Continuations; std::optional prologueKernel; }; struct SRayTracingPipelineConfig { unsigned int maxTraceRecursionDepth = 0; unsigned int pipelineFlags = 0; }; struct SRayTracingShaderConfig { unsigned MaxPayloadSizeInBytes = 0; unsigned MaxAttributeSizeInBytes = 0; }; struct SOpenCLKernelInfo { struct SResourceInfo { enum { RES_UAV, RES_SRV, RES_OTHER } Type; int Index; }; SOpenCLKernelInfo() {}; std::string m_kernelName = {}; QWORD m_ShaderHashCode = {}; iOpenCL::ThreadPayload m_threadPayload = {}; iOpenCL::ExecutionEnvironment m_executionEnvironment = {}; SKernelProgram m_kernelProgram = {}; // ----- Information for zebin ----- // // Cross-thread payload arguments zebin::PayloadArgumentsTy m_zePayloadArgs; // BTI information for payload arguments zebin::BindingTableIndicesTy m_zeBTIArgs; // Kernel attributes. 
zeinfo's user_attributes of kernels zebin::zeInfoUserAttribute m_zeUserAttributes; // Kernel args info zebin::ArgsInfoTy m_zeKernelArgsInfo; // Inline samplers zebin::InlineSamplersTy m_zeInlineSamplers; // Analysis result of if there are non-kernel-argument ld/st in the kernel // If all false, we can avoid expensive memory setting of each kernel during runtime int m_hasNonKernelArgLoad = -1; int m_hasNonKernelArgStore = -1; int m_hasNonKernelArgAtomic = -1; }; struct SOpenCLKernelCostExpInfo { zebin::KCMArgsSymTy argsSym; zebin::KCMLoopCountExpsTy loopLCE; zebin::KCMLoopCostsTy kernelCost; }; struct SOpenCLProgramInfo { struct ZEBinRelocTable { std::vector globalReloc; std::vector globalConstReloc; }; // program scope symbols struct ZEBinProgramSymbolTable { using SymbolSeq = std::vector; SymbolSeq global; // global symbols SymbolSeq globalConst; // global constant symbols SymbolSeq globalStringConst; // global string constant symbols }; typedef std::vector ZEBinGlobalHostAccessTable; std::unique_ptr m_initConstantAnnotation; std::unique_ptr m_initConstantStringAnnotation; std::unique_ptr m_initGlobalAnnotation; ZEBinRelocTable m_GlobalPointerAddressRelocAnnotation; ZEBinProgramSymbolTable m_zebinSymbolTable; ZEBinGlobalHostAccessTable m_zebinGlobalHostAccessTable; bool m_hasCrossThreadOffsetRelocations = false; bool m_hasPerThreadOffsetRelocations = false; }; class CBTILayout { public: unsigned int GetSystemThreadBindingTableIndex(void) const; unsigned int GetBindingTableEntryCount(void) const; unsigned int GetTextureIndex(unsigned int index) const; unsigned int GetUavIndex(unsigned int index) const; unsigned int GetRenderTargetIndex(unsigned int index) const; unsigned int GetConstantBufferIndex(unsigned int index) const; unsigned int GetTextureIndexSize() const { return m_pLayout->maxResourceIdx - m_pLayout->minResourceIdx; } unsigned int GetUavIndexSize() const { return m_pLayout->maxUAVIdx - m_pLayout->minUAVIdx; } unsigned int GetRenderTargetIndexSize() const 
{ return m_pLayout->maxColorBufferIdx - m_pLayout->minColorBufferIdx; } unsigned int GetConstantBufferIndexSize() const { return m_pLayout->maxConstantBufferIdx - m_pLayout->minConstantBufferIdx; } unsigned int GetNullSurfaceIdx() const; unsigned int GetTGSMIndex() const; unsigned int GetScratchSurfaceBindingTableIndex() const; unsigned int GetStatelessBindingTableIndex() const; unsigned int GetImmediateConstantBufferOffset() const; unsigned int GetDrawIndirectBufferIndex() const; const USC::SShaderStageBTLayout *GetBtLayout() const { return m_pLayout; }; const std::vector &GetColorBufferMappingTable() const { return m_ColorBufferMappings; } CBTILayout(const USC::SShaderStageBTLayout *pLayout) : m_pLayout(pLayout) {} CBTILayout(const USC::SShaderStageBTLayout *pLayout, const std::vector &colorBufferMappings) : m_pLayout(pLayout), m_ColorBufferMappings(colorBufferMappings) {} protected: const USC::SShaderStageBTLayout *m_pLayout; // Vulkan front end provides a separate vector with color buffer mappings. const std::vector m_ColorBufferMappings; }; // This is insanely ugly, but it's the prettiest solution we could // think of that preserves the GFX code. // This is temporary and will go away once image access between // OCL and GFX is unified. // This happens because in GFX the layout comes from the driver and is // immutable, while in OCL we need to change the layout mid-codegen. 
class COCLBTILayout : public CBTILayout { public: COCLBTILayout(const USC::SShaderStageBTLayout *pLayout) : CBTILayout(pLayout) {} USC::SShaderStageBTLayout *getModifiableLayout(); }; class RetryManager { public: RetryManager(); ~RetryManager(); RetryManager(const RetryManager &) = delete; RetryManager &operator=(const RetryManager &) = delete; bool AdvanceState(); void DecreaseState(); bool AllowLICM(llvm::Function *F = nullptr) const; bool AllowPromotePrivateMemory(llvm::Function *F = nullptr) const; bool AllowVISAPreRAScheduler(llvm::Function *F = nullptr) const; bool AllowCodeSinking(llvm::Function *F = nullptr) const; bool AllowAddressArithmeticSinking(llvm::Function *F = nullptr) const; bool AllowCloneAddressArithmetic(llvm::Function *F = nullptr) const; bool AllowCodeScheduling(llvm::Function *F = nullptr) const; bool AllowSimd32Slicing(llvm::Function *F = nullptr) const; bool AllowLargeURBWrite(llvm::Function *F = nullptr) const; bool AllowConstantCoalescing(llvm::Function *F = nullptr) const; bool AllowLargeGRF(llvm::Function *F = nullptr) const; bool ForceIndirectCallsInSyncRT() const; bool AllowRaytracingSpillCompaction() const; bool AllowLoadSinking(llvm::Function *F = nullptr) const; void SetFirstStateId(int id); bool IsFirstTry() const; bool IsLastTry() const; bool Trigger2xGRFRetry() const; unsigned GetRetryId() const; unsigned GetPerFuncRetryStateId(llvm::Function *F) const; void Enable(ShaderType ty = ShaderType::UNKNOWN); void Disable(bool DisablePerKernel = false); void SetSpillSize(unsigned int spillSize); unsigned int GetLastSpillSize() const; unsigned int numInstructions = 0; // For OCL the retry manager will work on per-kernel basis, that means // Disable() will disable only specific kernel. Other kernels still can // be retried. To keep the old behavior for other shader types, Disable() // will check the field and keep the old behavior. If other shader // types want to follow OCL this has to be set, see CodeGenContext // constructor. 
bool perKernel; /// the set of OCL kernels that were compiled std::map previousKernels; /// the set of OCL kernels that need to recompile std::set kernelSet; /// the set of selected OCL kernels that go through early retry std::set earlyRetryKernelSet; /// the set of OCL kernels that need to skip recompilation std::set kernelSkip; // Check if current shader is better than previous one bool IsBetterThanPrevious(CShaderProgram *pCurrent, float threshold = 1.0f); // Get the previous compilation of the current kernel CShaderProgram *GetPrevious(CShaderProgram *pCurrent, bool ReleaseUPtr = false); // Collect compilation of the current kernel void Collect(CShaderProgram::UPtr pCurrent); // Set of functions within a function group that should be retried std::set PerFuncRetrySet; void ClearSpillParams(); // save entry for given SIMD mode, to avoid recompile for next retry. void SaveSIMDEntry(SIMDMode simdMode, CShader *shader); CShader *GetSIMDEntry(SIMDMode simdMode); bool AnyKernelSpills() const; // Try to pick up the simd mode & kernel based on heuristics and fill // programOutput. If returning true, then stop the further retry. bool PickupKernels(CodeGenContext *cgCtx); private: unsigned stateId; unsigned prevStateId; // For debugging purposes, it can be useful to start on a particular // ID rather than id 0. unsigned firstStateId; // internal knob to disable retry manager. 
bool enabled; // shader type for shader specific opt ShaderType shaderType; unsigned lastSpillSize = 0; // cache the compiled kernel during retry struct CacheEntry { SIMDMode simdMode; CShader *shader; }; CacheEntry cache[3] = { {SIMDMode::SIMD8, nullptr}, {SIMDMode::SIMD16, nullptr}, {SIMDMode::SIMD32, nullptr}, }; CacheEntry *GetCacheEntry(SIMDMode simdMode); }; /// This class: /// Add intrinsic cache to LLVM context /// Add llvm metadata cache class LLVMContextWrapper : public llvm::LLVMContext { LLVMContextWrapper(LLVMContextWrapper &) = delete; LLVMContextWrapper &operator=(LLVMContextWrapper &) = delete; public: LLVMContextWrapper(bool createResourceDimTypes = true); /// ref count the LLVMContext as now CodeGenContext owns it unsigned int refCount = 0; /// IntrinsicIDCache - Cache of intrinsic pointer to numeric ID mappings /// requested in this context typedef llvm::ValueMap SafeIntrinsicIDCacheTy; SafeIntrinsicIDCacheTy m_SafeIntrinsicIDCache; /// metadata caching UserAddrSpaceMD m_UserAddrSpaceMD; // structType caching : for unique identified struct type llvm::SmallVector m_allLayoutStructTypes; void AddRef(); void Release(); // TODO: Remove after switch to LLVM 16 opaque pointers. // In order to get rid of `Reapply_hasSetOpaquePointersValue.patch` patch, we're implementing // check if pointer type was set in IGC. bool IGC_IsPointerModeAlreadySet = false; }; struct RoutingIndex { unsigned int resourceRangeID; unsigned int indexIntoRange; unsigned int routeTo; unsigned int lscCacheCtrl; }; class CodeGenContext { private: // For assigning a unique Function ID within CodeGenContext. std::unordered_map m_functionIDs; bool m_enableDumpUseShorterName = false; public: /// input: hash key ShaderHash hash; ShaderType type; // This variable should probably only be set if there is one shader in // the module. For example, raytracing and OpenCL modules can have an // arbitrary number of shaders, so it's unclear what setting this would // mean in that case. 
std::string shaderName = ""; /// input: Platform features supported const CPlatform &platform; /// input: binding table layout used by the driver const CBTILayout &btiLayout; /// information about the driver const CDriverInfo &m_DriverInfo; /// output: driver instrumentation TimeStats *m_compilerTimeStats = nullptr; ShaderStats *m_sumShaderStats = nullptr; /// output: list of buffer IDs which are promoted to direct AS // Map of promoted buffer ids with their respective buffer offsets if needed. Buffer offset will be -1 if no need of // buffer offset std::map m_buffersPromotedToDirectAS; PushConstantMode m_pushConstantMode = PushConstantMode::DEFAULT; static constexpr uint32_t DEFAULT_TOTAL_GRF_NUM = 128; SInstrTypes m_instrTypes = {}; SInstrTypes m_instrTypesAfterOpts = {}; // The module contains global variables with private address space. // When this is true, the flag "ForceGlobalMemoryAllocation" is enabled as a WA bool m_hasGlobalInPrivateAddressSpace = false; ///// used for instruction statistic before/after pass int instrStat[TOTAL_TYPES][TOTAL_STAGE]; // Module flag for subroutines/stackcalls enabled bool m_enableSubroutine = false; // Module flag for function pointers enabled bool m_enableFunctionPointer = false; // Module flag for when we need to compile multiple SIMD sizes to support SIMD variants bool m_enableSimdVariantCompilation = false; // Module flag to indicate if non-inlinable stack functions are present bool m_hasStackCalls = false; // Flag to determine if early Z culling should be called for certain patterns bool m_ForceEarlyZMathCheck = false; // Adding multiversioning to partially redundant samples, if AIL is on. 
bool m_enableSampleMultiversioning = false; // Re-enabling SIMD16 for compute shader if spill oversizes on SIMD32 bool m_fallbackCSSIMD16 = false; bool m_src1RemovedForBlendOpt = false; llvm::AssemblyAnnotationWriter *annotater = nullptr; RetryManager m_retryManager; // Used scratch space for private variables llvm::DenseMap m_ScratchSpaceUsage; // shader stat for opt customization uint32_t m_tempCount = 0; uint32_t m_sampler = 0; uint32_t m_inputCount = 0; uint32_t m_dxbcCount = 0; uint32_t m_ConstantBufferCount = 0; uint32_t m_numGradientSinked = 0; std::vector m_indexableTempSize; bool m_highPsRegisterPressure = 0; // Record previous simd for code patching CShader *m_prevShader = nullptr; // For IR dump after pass unsigned m_numPasses = 0; bool m_threadCombiningOptDone = false; void *m_ConstantBufferReplaceShaderPatterns = nullptr; uint m_ConstantBufferReplaceShaderPatternsSize = 0; uint m_ConstantBufferUsageMask = 0; uint m_ConstantBufferReplaceSize = 0; // tracking next available GRF offset for constants payload unsigned int m_constantPayloadNextAvailableGRFOffset = 0; ConstantPayloadInfo m_constantPayloadOffsets; // Contains the data (bytecode, enabling bit) for BIF functions // provided externally. size_t m_numBifModules = 0; BifLLVMModule *m_bifModules = nullptr; // If this flag is enabled, STOC level emulation will be added to every AnyHitShader. bool m_enableSubTriangleOpacityEmulation = false; void *gtpin_init = nullptr; bool m_hasLegacyDebugInfo = false; bool m_hasEmu64BitInsts = false; bool m_hasDPEmu = false; bool m_hasDPDivSqrtEmu = false; bool m_hasDPConvEmu = false; // Flag for staged compilation CG_FLAG_t m_CgFlag = FLAG_CG_ALL_SIMDS; // Staging context passing from Stage 1 for compile continuation CG_CTX_t *m_StagingCtx = nullptr; // We determine whether generating SIMD32 based on SIMD16's result // For staged compilation, we record if SIMD32 will be generated in Stage1, and // pass it to Stage2. 
bool m_doSimd32Stage2 = false; bool m_doSimd16Stage2 = false; std::string m_savedBitcodeString; SInstrTypes m_savedInstrTypes; bool m_hasVendorExtension = false; // Kernels for which recompilation should be forced. std::vector m_kernelsWithForcedRetry; std::vector m_hsIdxMap; std::vector m_dsIdxMap; std::vector m_gsIdxMap; std::vector m_hsNonDefaultIdxMap; std::vector m_dsNonDefaultIdxMap; std::vector m_gsNonDefaultIdxMap; std::vector m_psIdxMap; DWORD LtoUsedMask = 0; uint32_t HdcEnableIndexSize = 0; std::vector HdcEnableIndexValues; SIMDInfoStruct m_SIMDInfo; // Raytracing (any shader type) BVHInfo bvhInfo; // Immediate constant buffer promotion is enabled for all optimization except for Direct storage case bool m_disableICBPromotion = false; // Ignore per module fast math flag and use only per instruction fast math flags // Add few changes to CustomUnsafeOptPass related to fast flag propagation bool m_checkFastFlagPerInstructionInCustomUnsafeOptPass = false; // Specifies if this compilation uses indirect addressing with // differently aligned types. This can result in cross grf boundary // access in inactive channels of address register. bool m_mayHaveUnalignedAddressRegister = false; // Map to store global offsets in original global buffer std::map inlineProgramScopeGlobalOffsets; std::vector entry_names; uint m_spillAllowed = 0; uint m_spillAllowedFor256GRF = 0; uint m_ForceSIMDRPELimit = 0; private: // For storing emitted warning messages std::unordered_set m_emittedWarnings; // For storing error message std::stringstream oclErrorMessage; // For storing warning message std::stringstream oclWarningMessage; std::unique_ptr RemarksFile; protected: // Objects pointed to by these pointers are owned by this class. 
LLVMContextWrapper *llvmCtxWrapper; /// input: LLVM module IGCLLVM::Module *module = nullptr; /// input: IGC MetaData Utils IGC::IGCMD::MetaDataUtils *m_pMdUtils = nullptr; IGC::ModuleMetaData *modMD = nullptr; uint32_t m_NumGRFPerThread = 0; virtual void setFlagsPerCtx(); public: CodeGenContext(ShaderType _type, ///< shader type const CBTILayout &_bitLayout, ///< binding table layout to be used in code gen const CPlatform &_platform, ///< IGC HW platform description const CDriverInfo &driverInfo, ///< Queries to know runtime features support const bool createResourceDimTypes = true, LLVMContextWrapper *LLVMContext = nullptr) ///< LLVM context to use, if null a new one will be ///< created : type(_type), platform(_platform), btiLayout(_bitLayout), m_DriverInfo(driverInfo), llvmCtxWrapper(LLVMContext) { if (llvmCtxWrapper == nullptr) { initLLVMContextWrapper(createResourceDimTypes); } else { llvmCtxWrapper->AddRef(); } m_indexableTempSize.resize(64); for (uint i = 0; i < TOTAL_TYPES; i++) { for (uint j = 0; j < TOTAL_STAGE; j++) { instrStat[i][j] = 0; } } // Per context flag/key adjustment setFlagsPerCtx(); // Set retry behavior for Disable() m_retryManager.perKernel = (type == ShaderType::OPENCL_SHADER); m_ForceSIMDRPELimit = IGC_GET_FLAG_VALUE(ForceSIMDRPELimit); } CodeGenContext(CodeGenContext &) = delete; CodeGenContext &operator=(CodeGenContext &) = delete; // TODO: Right now CodeGenContext::print method must be manually updated for each // new member added. Modify the printer to automatically support new members based // on some "printable" metadata available with member's definition. // Possible solution: TableGen. 
void print(llvm::raw_ostream &stream) const; void initLLVMContextWrapper(bool createResourceDimTypes = true); llvm::LLVMContext *getLLVMContext() const; IGC::IGCMD::MetaDataUtils *getMetaDataUtils() const; IGCLLVM::Module *getModule() const; std::vector getEntryNames() const; void setModule(llvm::Module *m); void setEntryNames(llvm::Module *m); void clearEntryNames(); // Several clients explicitly delete module without resetting module to null. // This causes the issue later when the dtor is invoked (trying to delete a // dangling pointer again). This function is used to replace any explicit // delete in order to prevent deleting dangling pointers happening. void deleteModule(); IGC::ModuleMetaData *getModuleMetaData() const; unsigned int getRegisterPointerSizeInBits(unsigned int AS) const; bool enableFunctionCall() const; void CheckEnableSubroutine(llvm::Module &M); void checkDPEmulationEnabled(); virtual void InitVarMetaData(); virtual ~CodeGenContext(); CodeGenContext(const CodeGenContext &) = delete; CodeGenContext &operator=(const CodeGenContext &) = delete; void clear(); void clearMD(); void EmitMessage(std::ostream &OS, const char *errorstr, const llvm::Value *context) const; void EmitError(const char *errorstr, const llvm::Value *context); void EmitWarning(const char *warningstr, const llvm::Value *context = nullptr); inline bool HasError() const { return !this->oclErrorMessage.str().empty(); } inline bool HasWarning() const { return !this->oclWarningMessage.str().empty(); } inline const std::string GetWarning() { return this->oclWarningMessage.str(); } inline const std::string GetError() { return this->oclErrorMessage.str(); } inline const std::string GetErrorAndWarning() { return GetWarning() + GetError(); } CompOptions &getCompilerOption(); virtual void resetOnRetry(bool isSubmodule = false); virtual int32_t getNumThreadsPerEU() const; virtual uint32_t getExpGRFSize() const; virtual uint32_t getNumGRFPerThread(bool returnDefault = true); virtual void 
setNumGRFPerThread(uint32_t value) { m_NumGRFPerThread = value; } virtual bool isAutoGRFSelectionEnabled() const { return false; }; virtual bool forceGlobalMemoryAllocation() const; virtual bool allocatePrivateAsGlobalBuffer() const; virtual bool noLocalToGenericOptionEnabled() const; virtual bool mustDistinguishBetweenPrivateAndGlobalPtr() const; virtual bool enableTakeGlobalAddress() const; virtual int16_t getVectorCoalescingControl() const; virtual uint32_t getPrivateMemoryMinimalSizePerThread() const; virtual uint32_t getIntelScratchSpacePrivateMemoryMinimalSizePerThread() const; bool isPOSH() const; virtual bool isBufferBoundsChecking() const; virtual uint64_t getMinimumValidAddress() const; bool allowATOB(); UserAddrSpaceMD &getUserAddrSpaceMD() { IGC_ASSERT(llvmCtxWrapper); return llvmCtxWrapper->m_UserAddrSpaceMD; } llvm::SmallVector &getLayoutStructTypes() { IGC_ASSERT(llvmCtxWrapper); return llvmCtxWrapper->m_allLayoutStructTypes; } bool isSWSubTriangleOpacityCullingEmulationEnabled() const; enum Action { Set, Clear }; // ModifySIMDInfo is used by both Set and ClearSIMDInfo. Since Clear // function doesn't have bit information, it defaults to // SIMD_INFO_RESERVED if the argument is not passed. bit will not be // used when action is Action::Clear void ModifySIMDInfo(SIMDMode simd, ShaderDispatchMode mode, Action action, SIMDInfoBit bit = SIMD_INFO_RESERVED) { uint32_t bit_value = 1UL << bit; bool clear = action == Action::Clear ? true : false; switch (mode) { case ShaderDispatchMode::NOT_APPLICABLE: switch (simd) { case SIMDMode::SIMD8: m_SIMDInfo.simd8 = clear ? 0 : m_SIMDInfo.simd8 | bit_value; break; case SIMDMode::SIMD16: m_SIMDInfo.simd16 = clear ? 0 : m_SIMDInfo.simd16 | bit_value; break; case SIMDMode::SIMD32: m_SIMDInfo.simd32 = clear ? 0 : m_SIMDInfo.simd32 | bit_value; break; default: IGC_ASSERT_MESSAGE(0, "Unknown SIMD Mode"); break; } break; case ShaderDispatchMode::DUAL_SIMD8: m_SIMDInfo.dual_simd8 = clear ? 
0 : m_SIMDInfo.dual_simd8 | bit_value; break; case ShaderDispatchMode::QUAD_SIMD8_DYNAMIC: m_SIMDInfo.quad_simd8_dynamic = clear ? 0 : m_SIMDInfo.quad_simd8_dynamic | bit_value; break; default: IGC_ASSERT_MESSAGE(0, "Unknown SIMD Mode"); break; } } void SetSIMDInfo(SIMDInfoBit bit, SIMDMode simd, ShaderDispatchMode mode) { IGC_ASSERT(bit < SIMD_INFO_RESERVED); ModifySIMDInfo(simd, mode, Action::Set, bit); } void ClearSIMDInfo(SIMDMode simd, ShaderDispatchMode mode) { ModifySIMDInfo(simd, mode, Action::Clear); } SIMDInfoStruct GetSIMDInfo() const { return m_SIMDInfo; } SIMDMode GetSIMDMode() const; virtual std::optional knownSIMDSize() const { return std::nullopt; } // This can be paired with `EncodeAS4GFXResource()` to get a unique // index. uint32_t getUniqueIndirectIdx() { return getModuleMetaData()->CurUniqueIndirectIdx++; } // Frontends may elect to compute indices in their own way. If so, // they should call this at the end to mark the max index they have // reserved so that later passes can ensure that `getUniqueIndirectIdx()` // won't collide with any indices from the frontend. void setUniqueIndirectIdx(uint32_t NewVal) { uint32_t &CurVal = getModuleMetaData()->CurUniqueIndirectIdx; CurVal = std::max(CurVal, NewVal); } // Use this when you want to know about a particular function's // rayquery usage. bool hasSyncRTCalls(llvm::Function *F) const { auto *MMD = getModuleMetaData(); auto funcMDItr = MMD->FuncMD.find(F); bool hasRQCall = (funcMDItr != MMD->FuncMD.end() && funcMDItr->second.hasSyncRTCalls); return hasRQCall; } // Use this to determine if any shaders in the module use rayquery. bool hasSyncRTCalls() const { return (getModuleMetaData()->rtInfo.RayQueryAllocSizeInBytes != 0); } // For creating internal names with function IDs. 
void createFunctionIDs(); int getFunctionID(llvm::Function *F); std::string getFunctionDumpName(int functionId); bool dumpUseShorterName() const { return m_enableDumpUseShorterName; } // For remarks void initializeRemarkEmitter(const ShaderHash &hash); bool syncRTCallsNeedSplitting() { if (platform.supportRayTracingSIMD32()) return false; // In general, we don't want to compile SIMD32 for rayquery. // Determine if we are forced to do so. if (type != ShaderType::COMPUTE_SHADER) return false; auto &csInfo = getModuleMetaData()->csInfo; if (IGC_IS_FLAG_ENABLED(ForceCSSIMD32) || IGC_GET_FLAG_VALUE(ForceCSSimdSize4RQ) == 32) return true; if (IGC_IS_FLAG_ENABLED(ForceCSSIMD16)) return false; if (csInfo.forcedSIMDSize == 32) return true; if (csInfo.forcedSIMDSize == 16) return false; if (csInfo.waveSize == 32) return true; return false; } bool hasSpills(uint mscratchSpaceUsedBySpills, uint numGRF) { if (numGRF == 256 && m_spillAllowedFor256GRF) return (mscratchSpaceUsedBySpills > m_spillAllowedFor256GRF); else return (mscratchSpaceUsedBySpills > m_spillAllowed); } bool useStatelessToStateful() { return (m_instrTypes.hasLoadStore && m_DriverInfo.SupportsStatelessToStatefulBufferTransformation() && !getModuleMetaData()->compOpt.GreaterThan4GBBufferRequired && !platform.hasEfficient64bEnabled() && IGC_IS_FLAG_ENABLED(EnableStatelessToStateful) && !m_instrTypes.hasInlineAsmPointerAccess); } bool supportsVRT() const { return platform.supportsVRT() && m_DriverInfo.supportsVRT() && (getModuleMetaData()->compOpt.EnableVRT && IGC_IS_FLAG_ENABLED(EnableVRT)); } }; struct SComputeShaderSecondCompileInput { bool secondCompile; bool isRowMajor; int numChannelsUsed; int runtimeVal_LoopCount; int runtimeVal_ResWidthOrHeight; int runtimeVal_ConstBufferSize; SComputeShaderSecondCompileInput() : secondCompile(false), isRowMajor(false), numChannelsUsed(0), runtimeVal_LoopCount(0), runtimeVal_ResWidthOrHeight(0), runtimeVal_ConstBufferSize(0) {} }; struct SComputeShaderWalkOrder { 
ThreadIDLayout m_threadIDLayout = ThreadIDLayout::X; CS_WALK_ORDER m_walkOrder = CS_WALK_ORDER::WO_XYZ; EMIT_LOCAL_MASK m_emitMask = EMIT_LOCAL_MASK::EM_NONE; // true if HW generates localIDs and puts them to payload // false if SW generates localIDs and prolog kernel loads them from memory bool m_enableHWGenerateLID = false; }; void OptimizeIR(CodeGenContext *ctx); /** * Fold derived constants. Load CB data from CBptr with index & offset, * calculate the new data based on LLVM bitcode and store results to pNewCB. * Then driver will push pNewCB to thread payload. */ void FoldDerivedConstant(char *bitcode, uint bitcodeSize, void *CBptr[15], std::function getResInfoCB, uint *pNewCB); } // namespace IGC