File: ShaderCodeGen.cpp

/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2022 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include "Compiler/CISACodeGen/ShaderCodeGen.hpp"
#include "Compiler/Legalizer/PeepholeTypeLegalizer.hpp"
#include "Compiler/CISACodeGen/layout.hpp"
#include "Compiler/CISACodeGen/DeSSA.hpp"
#include "Compiler/CISACodeGen/GenCodeGenModule.h"
#include "Compiler/CISACodeGen/AdvCodeMotion.h"
#include "Compiler/CISACodeGen/RematAddressArithmetic.h"
#include "Compiler/CISACodeGen/AdvMemOpt.h"
#include "Compiler/CISACodeGen/Emu64OpsPass.h"
#include "Compiler/CISACodeGen/PullConstantHeuristics.hpp"
#include "Compiler/CISACodeGen/PushAnalysis.hpp"
#include "Compiler/CISACodeGen/ScalarizerCodeGen.hpp"
#include "Compiler/CISACodeGen/CodeSinking.hpp"
#include "Compiler/CISACodeGen/AddressArithmeticSinking.hpp"
#include "Compiler/CISACodeGen/SinkCommonOffsetFromGEP.h"
#include "Compiler/CISACodeGen/ConstantCoalescing.hpp"
#include "Compiler/CISACodeGen/CheckInstrTypes.hpp"
#include "Compiler/CISACodeGen/EstimateFunctionSize.h"
#include "Compiler/CISACodeGen/PassTimer.hpp"
#include "Compiler/CISACodeGen/FixAddrSpaceCast.h"
#include "Compiler/CISACodeGen/FixupExtractValuePair.h"
#include "Compiler/CISACodeGen/GenIRLowering.h"
#include "Compiler/CISACodeGen/GenSimplification.h"
#include "Compiler/CISACodeGen/LoopDCE.h"
#include "Compiler/CISACodeGen/LdShrink.h"
#include "Compiler/CISACodeGen/MemOpt.h"
#include "Compiler/CISACodeGen/MemOpt2.h"
#include "Compiler/CISACodeGen/PreRARematFlag.h"
#include "Compiler/CISACodeGen/PreRAScheduler.hpp"
#include "Compiler/CISACodeGen/PromoteConstantStructs.hpp"
#include "Compiler/CISACodeGen/ResolveGAS.h"
#include "Compiler/CISACodeGen/ResolvePredefinedConstant.h"
#include "Compiler/CISACodeGen/Simd32Profitability.hpp"
#include "Compiler/CISACodeGen/SimplifyConstant.h"
#include "Compiler/CISACodeGen/TimeStatsCounter.h"
#include "Compiler/CISACodeGen/TypeDemote.h"
#include "Compiler/CISACodeGen/UniformAssumptions.hpp"
#include "Compiler/CISACodeGen/VectorProcess.hpp"
#include "Compiler/CISACodeGen/RuntimeValueLegalizationPass.h"
#include "Compiler/CISACodeGen/InsertGenericPtrArithmeticMetadata.hpp"
#include "Compiler/CISACodeGen/LowerGEPForPrivMem.hpp"
#include "Compiler/CISACodeGen/POSH_RemoveNonPositionOutput.h"
#include "Compiler/CISACodeGen/RegisterEstimator.hpp"
#include "Compiler/CISACodeGen/RayTracingShaderLowering.hpp"
#include "Compiler/CISACodeGen/RayTracingStatefulPass.h"
#include "Compiler/CISACodeGen/LSCCacheOptimizationPass.h"
#include "Compiler/CISACodeGen/LSCControlsAnalysisPass.h"
#include "Compiler/ConvertMSAAPayloadTo16Bit.hpp"
#include "Compiler/MSAAInsertDiscard.hpp"
#include "Compiler/CISACodeGen/PromoteInt8Type.hpp"
#include "Compiler/CISACodeGen/PrepareLoadsStoresPass.h"
#include "Compiler/CISACodeGen/HFpackingOpt.hpp"
#include "Compiler/CISACodeGen/EvaluateFreeze.hpp"

#include "Compiler/CISACodeGen/SLMConstProp.hpp"
#include "Compiler/Optimizer/OpenCLPasses/DebuggerSupport/ImplicitGIDPass.hpp"
#include "Compiler/Optimizer/OpenCLPasses/DebuggerSupport/ImplicitGIDRestoring.hpp"
#include "Compiler/Optimizer/OpenCLPasses/GenericAddressResolution/GenericAddressDynamicResolution.hpp"
#include "Compiler/Optimizer/OpenCLPasses/PrivateMemory/PrivateMemoryUsageAnalysis.hpp"
#include "Compiler/Optimizer/OpenCLPasses/PrivateMemory/PrivateMemoryResolution.hpp"
#include "Compiler/Optimizer/OpenCLPasses/PrivateMemory/PrivateMemoryToSLM.hpp"
#include "Compiler/Optimizer/OpenCLPasses/ProgramScopeConstants/ProgramScopeConstantResolution.hpp"
#include "Compiler/Optimizer/OpenCLPasses/WIFuncs/WIFuncResolution.hpp"
#include "Compiler/Optimizer/OpenCLPasses/BreakConstantExpr/BreakConstantExpr.hpp"
#include "Compiler/Optimizer/OpenCLPasses/ReplaceUnsupportedIntrinsics/ReplaceUnsupportedIntrinsics.hpp"
#include "Compiler/Optimizer/PreCompiledFuncImport.hpp"
#include "Compiler/Optimizer/OpenCLPasses/AddressSpaceAliasAnalysis/AddressSpaceAliasAnalysis.h"
#include "Compiler/Optimizer/OpenCLPasses/UndefinedReferences/UndefinedReferencesPass.hpp"
#include "Compiler/Optimizer/OpenCLPasses/StatelessToStateful/StatelessToStateful.hpp"
#include "Compiler/Optimizer/OpenCLPasses/DisableLoopUnrollOnRetry/DisableLoopUnrollOnRetry.hpp"
#include "Compiler/Optimizer/OpenCLPasses/TransformUnmaskedFunctionsPass.h"
#include "Compiler/Optimizer/OpenCLPasses/UnreachableHandling/UnreachableHandling.hpp"
#include "Compiler/Optimizer/OpenCLPasses/WIFuncs/WIFuncResolution.hpp"
#include "Compiler/Optimizer/OpenCLPasses/RegPressureLoopControl/RegPressureLoopControl.hpp"
#include "Compiler/Optimizer/MCSOptimization.hpp"
#include "Compiler/Optimizer/RectListOptimizationPass.hpp"
#include "Compiler/Optimizer/GatingSimilarSamples.hpp"
#include "Compiler/Optimizer/IntDivConstantReduction.hpp"
#include "Compiler/Optimizer/IntDivRemCombine.hpp"
#include "Compiler/Optimizer/SynchronizationObjectCoalescing.hpp"
#include "Compiler/Optimizer/RuntimeValueVectorExtractPass.h"
#include "Compiler/MetaDataApi/PurgeMetaDataUtils.hpp"
#include "Compiler/HandleLoadStoreInstructions.hpp"
#include "Compiler/CustomSafeOptPass.hpp"
#include "Compiler/CustomUnsafeOptPass.hpp"
#include "Compiler/CustomLoopOpt.hpp"
#include "Compiler/GenUpdateCB.h"
#include "Compiler/PromoteResourceToDirectAS.h"
#include "Compiler/PromoteStatelessToBindless.h"
#if defined(_DEBUG) && !defined(ANDROID)
#include "Compiler/VerificationPass.hpp"
#endif
#include "Compiler/FixInvalidFuncNamePass.hpp"
#include "Compiler/LegalizationPass.hpp"
#include "Compiler/LowPrecisionOptPass.hpp"
#include "Compiler/WorkaroundAnalysisPass.h"
#include "Compiler/MetaDataApi/MetaDataApi.h"
#include "Compiler/MetaDataUtilsWrapper.h"
#include "Compiler/MetaDataApi/IGCMetaDataHelper.h"
#include "Compiler/CodeGenContextWrapper.hpp"
#include "Compiler/DynamicTextureFolding.h"
#include "Compiler/SampleMultiversioning.hpp"
#include "Compiler/ThreadCombining.hpp"
#include "Compiler/InitializePasses.h"
#include "Compiler/GenRotate.hpp"
#include "Compiler/Optimizer/Scalarizer.h"
#include "Compiler/RemoveCodeAssumptions.hpp"
#include "common/debug/Debug.hpp"
#include "common/igc_regkeys.hpp"
#include "common/debug/Dump.hpp"
#include "common/MemStats.h"
#include <iStdLib/utility.h>
#include "common/LLVMWarningsPush.hpp"
#include "llvm/Config/llvm-config.h"
#include "llvm/ADT/PostOrderIterator.h"
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Verifier.h>
#include <llvm/Analysis/CFGPrinter.h>
#include <llvm/Analysis/Passes.h>
#include <llvm/Pass.h>
#include <llvm/IR/LegacyPassManager.h>
#include <llvm/Transforms/IPO.h>
#include <llvm/Transforms/IPO/AlwaysInliner.h>
#include <llvm/Transforms/Scalar.h>
#include <llvm/Transforms/Scalar/GVN.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/Function.h>
#include <llvm/Linker/Linker.h>
#include <llvm/Analysis/ScopedNoAliasAA.h>
#include <llvm/Analysis/TargetLibraryInfo.h>
#include <llvm/ADT/StringExtras.h>
#include <llvm/IRReader/IRReader.h>
#include <llvm/Support/MathExtras.h>
#include <llvm/Support/MemoryBuffer.h>
#include <llvm/Support/ErrorHandling.h>
#include <llvm/Support/SourceMgr.h>
#include <llvm/Transforms/IPO/FunctionAttrs.h>
#include <llvmWrapper/Transforms/Utils.h>
#include <llvm/Transforms/Scalar/InstSimplifyPass.h>
#include <llvmWrapper/Transforms/Scalar.h>
#include <llvmWrapper/Bitcode/BitcodeWriter.h>
#include <llvm/Transforms/InstCombine/InstCombine.h>
#include "common/LLVMWarningsPop.hpp"
#include <sstream>
#include "Compiler/CISACodeGen/PatternMatchPass.hpp"
#include "Compiler/CISACodeGen/EmitVISAPass.hpp"
#include "Compiler/CISACodeGen/CoalescingEngine.hpp"
#include "Compiler/GenTTI.h"
#include "Compiler/GenRotate.hpp"
#include "Compiler/SampleCmpToDiscard.h"
#include "Compiler/Optimizer/IGCInstCombiner/IGCInstructionCombining.hpp"
#include "DebugInfo.hpp"
#include "AdaptorCommon/RayTracing/RayTracingPasses.hpp"
#include "AdaptorCommon/RayTracing/RayTracingAddressSpaceAliasAnalysis.h"
#include "Compiler/SamplerPerfOptPass.hpp"
#include "Compiler/CISACodeGen/HalfPromotion.h"
#include "Compiler/CISACodeGen/AnnotateUniformAllocas.h"
#include "Probe/Assertion.h"
#include "Compiler/CISACodeGen/PartialEmuI64OpsPass.h"


/***********************************************************************************
This file contains the generic code generation functions for all the shaders.
The class CShader is inherited by each specific type of shader to add
shader-specific information.
************************************************************************************/

using namespace llvm;
using namespace IGC;
using namespace IGC::IGCMD;
using namespace IGC::Debug;

namespace IGC
{
const int LOOP_ROTATION_HEADER_INST_THRESHOLD = 32;
const int LOOP_NUM_THRESHOLD = 2000;
const int LOOP_INST_THRESHOLD = 65000;
const int INST_THRESHOLD = 80000;


void AddAnalysisPasses(CodeGenContext& ctx, IGCPassManager& mpm)
{
    COMPILER_TIME_START(&ctx, TIME_CG_Add_Analysis_Passes);

    bool isOptDisabled = ctx.getModuleMetaData()->compOpt.OptDisable;
    TODO("remove the following once all IGC passes are registered to PassRegistery in their constructor")
    initializeLoopInfoWrapperPassPass(*PassRegistry::getPassRegistry());

    mpm.add(createTimeStatsCounterPass(&ctx, TIME_CG_Analysis, STATS_COUNTER_START));

    // transform pull constants and inputs into push constants and inputs
    mpm.add(new PushAnalysis());
    mpm.add(CreateSampleCmpToDiscardPass());

    if (!isOptDisabled)
    {
        mpm.add(llvm::createDeadCodeEliminationPass());
    }

    // The 1st thing we do when getting into the IGC middle end is to split critical-edges:
    // PushAnalysis requires WIAnalysis
    // WIAnalysis requires dominator and post-dominator analysis
    // WIAnalysis also requires BreakCriticalEdge because it assumes that
    // potential phi-moves will be placed at those blocks
    mpm.add(llvm::createBreakCriticalEdgesPass());
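    // Illustrative sketch (not a pass in the pipeline): a critical edge runs
    // from a block with multiple successors to a block with multiple
    // predecessors, e.g. in LLVM IR:
    //   A:  br i1 %c, label %B, label %C   ; A has two successors
    //   C:  %p = phi i32 [ ... ], [ ... ]  ; C has several predecessors
    // Splitting A->C inserts an empty block on that edge, giving phi-related
    // copies a unique insertion point, which is what WIAnalysis assumes.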



    if (IGC_IS_FLAG_DISABLED(DisableMemOpt2) &&
        (ctx.type == ShaderType::COMPUTE_SHADER || (ctx.m_DriverInfo.WAEnableMemOpt2ForOCL())) &&
        !isOptDisabled)
    {
        mpm.add(createMemOpt2Pass(16));
    }

    // Only apply limited code-sinking, and only to certain shader types:
    // VS input has a URB-reuse issue still to be resolved, and the
    // performance benefit needs to be understood better.
    mpm.add(new CodeSinking(true));


    // Run flag re-materialization if it's beneficial.
    if (ctx.m_DriverInfo.benefitFromPreRARematFlag() &&
        IGC_IS_FLAG_ENABLED(EnablePreRARematFlag)) {
        mpm.add(createPreRARematFlagPass());
    }
    // Peephole framework for generic type legalization
    mpm.add(new Legalizer::PeepholeTypeLegalizer());
    if (IGC_IS_FLAG_ENABLED(ForcePromoteI8) ||
        (IGC_IS_FLAG_ENABLED(EnablePromoteI8) && !ctx.platform.supportByteALUOperation()))
    {
        mpm.add(createPromoteInt8TypePass());
    }

    // need this before WIAnalysis:
    // insert phi to prevent changing of WIAnalysis result by later code-motion
    mpm.add(llvm::createLCSSAPass());
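    // Sketch of the LCSSA form this produces: a value defined in a loop and
    // used outside it is routed through a phi in the exit block, e.g.
    //   loop:  %v = add i32 %i, 1
    //   exit:  %v.lcssa = phi i32 [ %v, %loop ]
    // so later code motion cannot silently change the WIAnalysis result for
    // the out-of-loop use.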
    // Fixup extract value pairs.
    mpm.add(createExtractValuePairFixupPass());

    if (IGC_IS_FLAG_ENABLED(EnableUnmaskedFunctions) &&
        IGC_IS_FLAG_ENABLED(LateInlineUnmaskedFunc))
    {
        mpm.add(new InlineUnmaskedFunctionsPass());
        // Newly created memcpy intrinsics are lowered
        mpm.add(createReplaceUnsupportedIntrinsicsPass());
        // Split complex constant expressions into simpler ones
        mpm.add(new BreakConstantExpr());
        // Expand newly created allocas
        mpm.add(createSROAPass());
        // Run legalization pass to expand non-supported instructions
        // like shufflevector. The code below is just copied and
        // pasted as is.
        bool preserveNan = !ctx.getCompilerOption().NoNaNs;
        mpm.add(new Legalization(preserveNan));
        // Some clean up passes.
        mpm.add(llvm::createEarlyCSEPass());
        mpm.add(new BreakConstantExpr());
        mpm.add(llvm::createCFGSimplificationPass());
        mpm.add(createDeadCodeEliminationPass());
        // Create functions groups after unmasked functions inlining
        mpm.add(createGenXCodeGenModulePass());
        // Allocate non-primitive allocas. This piece of code is copied
        // from the non-late-inlining path below.
        if (ctx.m_instrTypes.hasNonPrimitiveAlloca)
        {
            mpm.add(createBreakCriticalEdgesPass());
            mpm.add(createAnnotateUniformAllocasPass());

            if (IGC_IS_FLAG_DISABLED(DisablePromotePrivMem) &&
                ctx.m_retryManager.AllowPromotePrivateMemory())
            {
                mpm.add(createPromotePrivateArrayToReg());
                mpm.add(createCFGSimplificationPass());
            }
        }
        mpm.add(createPromoteMemoryToRegisterPass());
        // Resolving private memory allocas
        mpm.add(CreatePrivateMemoryResolution());
    }

    // Evaluates LLVM 10+ freeze instructions so EmitPass does not need to handle them
    mpm.add(createEvaluateFreezePass());
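    // For reference, a freeze pins an undef/poison input to some fixed value:
    //   %y = freeze i32 %x   ; %y is an arbitrary-but-fixed i32 if %x is poison
    // Evaluating these here means EmitPass never sees the opcode.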

    // clean up constexpressions after EarlyCSE
    mpm.add(new BreakConstantExpr());

    // This is for dumping register pressure info
    if (IGC_IS_FLAG_ENABLED(ForceRPE)) {
        mpm.add(new RegisterEstimator());
    }

    mpm.add(createFixInvalidFuncNamePass());

    // Collect stats after all the optimizations. This info can be dumped to the cos file.
    mpm.add(new CheckInstrTypes(&(ctx.m_instrTypesAfterOpts), nullptr));

    //
    // Generally, passes that change IR should be prior to this place!
    //

    // let CleanPHINode be right before Layout
    mpm.add(createCleanPHINodePass());
    // Let Layout be the last pass before Emit Pass
    mpm.add(new Layout());

    mpm.add(createTimeStatsCounterPass(&ctx, TIME_CG_Analysis, STATS_COUNTER_END));

    COMPILER_TIME_END(&ctx, TIME_CG_Add_Analysis_Passes);
} // AddAnalysisPasses

static void UpdateInstTypeHint(CodeGenContext& ctx)
{
    // WA: save original values, as the preRA heuristic is based on them.
    // We need to fix the preRA pass heuristic or get rid of the preRA pass altogether.
    unsigned int numBB = ctx.m_instrTypes.numBB;
    unsigned int numSample = ctx.m_instrTypes.numSample;
    unsigned int numInsts = ctx.m_instrTypes.numInsts;
    bool hasUnmaskedRegion = ctx.m_instrTypes.hasUnmaskedRegion;
    IGCPassManager mpm(&ctx, "UpdateOptPre");
    mpm.add(new CheckInstrTypes(&(ctx.m_instrTypes), nullptr));
    mpm.run(*ctx.getModule());
    ctx.m_instrTypes.numBB = numBB;
    ctx.m_instrTypes.numSample = numSample;
    ctx.m_instrTypes.numInsts = numInsts;
    ctx.m_instrTypes.hasLoadStore = true;
    ctx.m_instrTypes.hasUnmaskedRegion = hasUnmaskedRegion;
}

// forward declaration
llvm::ModulePass* createPruneUnusedArgumentsPass();

void AddLegalizationPasses(CodeGenContext& ctx, IGCPassManager& mpm, PSSignature* pSignature)
{
    COMPILER_TIME_START(&ctx, TIME_CG_Add_Legalization_Passes);

    mpm.add(createTimeStatsCounterPass(&ctx, TIME_CG_Legalization, STATS_COUNTER_START));

    // update type of instructions to know what passes are needed.
    UpdateInstTypeHint(ctx);
    // check again after full inlining if subroutines are still present
    ctx.CheckEnableSubroutine(*ctx.getModule());

    MetaDataUtils* pMdUtils = ctx.getMetaDataUtils();
    bool isOptDisabled = ctx.getModuleMetaData()->compOpt.OptDisable;
    bool fastCompile = ctx.getModuleMetaData()->compOpt.FastCompilation;
    bool highAllocaPressure = ctx.m_instrTypes.numAllocaInsts > IGC_GET_FLAG_VALUE(AllocaRAPressureThreshold);
    bool isPotentialHPCKernel = (ctx.m_instrTypes.numInsts > IGC_GET_FLAG_VALUE(HPCInstNumThreshold)) ||
        (ctx.m_instrTypes.numGlobalInsts > IGC_GET_FLAG_VALUE(HPCGlobalInstNumThreshold)) || IGC_GET_FLAG_VALUE(HPCFastCompilation);
    highAllocaPressure = IGC_GET_FLAG_VALUE(DisableFastRAWA) ? false : highAllocaPressure;
    isPotentialHPCKernel = IGC_GET_FLAG_VALUE(DisableFastRAWA) ? false : isPotentialHPCKernel;

    if (highAllocaPressure || isPotentialHPCKernel)
    {
        IGC_SET_FLAG_VALUE(FastCompileRA, 1);
        IGC_SET_FLAG_VALUE(HybridRAWithSpill, 1);
    }
    // If Unmasked regions are present, disable loop-invariant code motion after
    // Unmasked functions are inlined at the end of the optimization phase.
    if (IGC_IS_FLAG_ENABLED(EnableUnmaskedFunctions) &&
        IGC_IS_FLAG_DISABLED(LateInlineUnmaskedFunc) &&
        ctx.m_instrTypes.hasUnmaskedRegion) {
        IGC_SET_FLAG_VALUE(allowLICM, false);
    }

    if (IGC_IS_FLAG_ENABLED(ForceAllPrivateMemoryToSLM) ||
        IGC_IS_FLAG_ENABLED(ForcePrivateMemoryToSLMOnBuffers))
    {
        DummyPass* dummypass = new DummyPass();
        TargetIRAnalysis GenTTgetIIRAnalysis([&](const Function& F) {
            GenIntrinsicsTTIImpl GTTI(&ctx, dummypass);
            return TargetTransformInfo(GTTI);
        });
        mpm.add(new TargetTransformInfoWrapperPass(GenTTgetIIRAnalysis));
    }

    // Disable all target library functions.
    // Right now we don't support any standard functions in codegen;
    // maybe we'll support some at some point to take advantage of LLVM optimizations.
    TargetLibraryInfoImpl TLI;
    TLI.disableAllFunctions();
    mpm.add(new llvm::TargetLibraryInfoWrapperPass(TLI));

    // Add Metadata API immutable pass
    mpm.add(new MetaDataUtilsWrapper(pMdUtils, ctx.getModuleMetaData()));
    // Add CodeGen Context Wrapper immutable pass
    mpm.add(new CodeGenContextWrapper(&ctx));
    //Add alias analysis pass
    mpm.add(createAddressSpaceAAWrapperPass());

    if (ctx.type == ShaderType::RAYTRACING_SHADER || ctx.hasSyncRTCalls())
    {
        if (IGC_IS_FLAG_DISABLED(DisableRTAliasAnalysis))
            mpm.add(createRayTracingAddressSpaceAAWrapperPass());
    }

    mpm.add(createExternalAAWrapperPass(&addAddressSpaceAAResult));
    mpm.add(createScopedNoAliasAAWrapperPass());

    TODO("remove the following once all IGC passes are registered to PassRegistery in their constructor")
    initializeWIAnalysisPass(*PassRegistry::getPassRegistry());
    initializeSimd32ProfitabilityAnalysisPass(*PassRegistry::getPassRegistry());
    initializeGenXFunctionGroupAnalysisPass(*PassRegistry::getPassRegistry());

    if (ctx.m_threadCombiningOptDone)
    {
        mpm.add(createLoopCanonicalization());
        mpm.add(llvm::createLoopDeletionPass());
        mpm.add(llvm::createBreakCriticalEdgesPass());
        mpm.add(llvm::createLoopRotatePass(LOOP_ROTATION_HEADER_INST_THRESHOLD));
        mpm.add(llvm::createLowerSwitchPass());

        int LoopUnrollThreshold = ctx.m_DriverInfo.GetLoopUnrollThreshold();

        if (LoopUnrollThreshold > 0 && (ctx.m_tempCount < 64))
        {
            mpm.add(IGCLLVM::createLoopUnrollPass(2, LoopUnrollThreshold, -1, 1));
        }

        mpm.add(createBarrierNoopPass());

        if (ctx.m_retryManager.AllowLICM() && IGC_IS_FLAG_ENABLED(allowLICM))
        {
            mpm.add(llvm::createLICMPass());
        }
        mpm.add(llvm::createLoopSimplifyPass());
    }


    // Lower/Resolve OCL inlined constants.
    if (ctx.m_DriverInfo.NeedLoweringInlinedConstants()) {
        // Run additional constant breaking which is assumed by the constant
        // resolver.
        mpm.add(new BreakConstantExpr());
        mpm.add(new ProgramScopeConstantResolution());
    }

    bool needDPEmu = (IGC_IS_FLAG_ENABLED(ForceDPEmulation) ||
        (ctx.m_DriverInfo.NeedFP64(ctx.platform.getPlatformInfo().eProductFamily) && ctx.platform.hasNoFP64Inst()));
    bool hasDPDivSqrtEmu = !ctx.platform.hasNoFP64Inst() && !ctx.platform.hasCorrectlyRoundedMacros() && ctx.m_DriverInfo.NeedFP64DivSqrt();
    uint32_t theEmuKind = (needDPEmu ? EmuKind::EMU_DP : 0);
    theEmuKind |= (hasDPDivSqrtEmu ? EmuKind::EMU_DP_DIV_SQRT : 0);
    theEmuKind |= (ctx.m_DriverInfo.NeedI64BitDivRem() ? EmuKind::EMU_I64DIVREM : 0);
    theEmuKind |=
        ((IGC_IS_FLAG_ENABLED(ForceSPDivEmulation) ||
            (ctx.m_DriverInfo.NeedIEEESPDiv() && !ctx.platform.hasCorrectlyRoundedMacros()))
        ? EmuKind::EMU_SP_DIV : 0);
    if (ctx.platform.preferFP32Emu() && IGC_IS_FLAG_DISABLED(Force32BitIntDivRemEmu)) {
        // Prefer using FP32 emulation even though DP support is available
        theEmuKind |= EmuKind::EMU_I32DIVREM_SP;
    }
    else if (!ctx.platform.hasNoFP64Inst() &&
            (IGC_IS_FLAG_ENABLED(Force32BitIntDivRemEmu) ||
            ctx.getCompilerOption().ForceInt32DivRemEmu ||
            (ctx.platform.Enable32BitIntDivRemEmu() &&
                !ctx.getCompilerOption().ForceInt32DivRemEmuSP &&
                IGC_IS_FLAG_DISABLED(Force32BitIntDivRemEmuSP))))
    {
        // Use DP (and float) operations to emulate int32 div/rem
        theEmuKind |= EmuKind::EMU_I32DIVREM;
    }
    else if (ctx.platform.Enable32BitIntDivRemEmu() ||
                ctx.getCompilerOption().ForceInt32DivRemEmuSP ||
                IGC_IS_FLAG_ENABLED(Force32BitIntDivRemEmuSP))
    {
        // Use SP floating operations to emulate int32 div/rem
        theEmuKind |= EmuKind::EMU_I32DIVREM_SP;
    }
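    // Reading aid for the chain above (informal): EMU_I32DIVREM emulates i32
    // div/rem via double-precision FP, EMU_I32DIVREM_SP via single-precision
    // FP; the SP flavor is picked when FP64 is unavailable or explicitly
    // preferred through the preferFP32Emu()/ForceInt32DivRemEmuSP knobs.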

    if (IGC_IS_FLAG_ENABLED(RayTracingKeepUDivRemWA))
    {
        theEmuKind &= ~EmuKind::EMU_I32DIVREM;
        theEmuKind &= ~EmuKind::EMU_I32DIVREM_SP;
    }

    if (theEmuKind > 0 || IGC_IS_FLAG_ENABLED(EnableTestIGCBuiltin))
    {
        // Need to break constant expr as PreCompiledFuncImport does not handle it.
        mpm.add(new BreakConstantExpr());
        mpm.add(new PreCompiledFuncImport(&ctx, theEmuKind));
        mpm.add(createAlwaysInlinerLegacyPass());

        // Using DCE here as AlwaysInliner does not completely remove dead functions.
        // Once AlwaysInliner can delete all of them, this DCE is no longer needed.
        // mpm.add(createDeadCodeEliminationPass());
        //
        // DCE doesn't remove dead control flow; ADCE does (currently).
        // Otherwise you'd have to call createCFGSimplificationPass and DCE
        // iteratively.
        mpm.add(llvm::createAggressiveDCEPass());
        // TODO: we probably should be running other passes on the result

        if (!IGC::ForceAlwaysInline(&ctx))
        {
            mpm.add(new PurgeMetaDataUtils());
        }
    }

    // Find rotate pattern.
    //   Invoked after DP emulation so that it'd handle emulation functions.
    if (ctx.platform.supportRotateInstruction()) {
        mpm.add(createGenRotatePass());
    }
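    // The canonical pattern matched (sketch): for 32-bit x and n,
    //   (x << n) | (x >> (32 - n))
    // collapses to a single hardware rotate-left instruction.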

    mpm.add(createReplaceUnsupportedIntrinsicsPass());

    if (IGC_IS_FLAG_DISABLED(DisablePromoteToDirectAS) &&
        !ctx.getModuleMetaData()->compOpt.IsLibraryCompilation)
    {
        // Promotes indirect resource access to direct
        mpm.add(new BreakConstantExpr());
        mpm.add(new PromoteResourceToDirectAS());
    }

    if (ctx.m_instrTypes.hasReadOnlyArray)
    {
        mpm.add(createDeadCodeEliminationPass());
        mpm.add(createSROAPass());
    }

    if (ctx.m_instrTypes.hasGenericAddressSpacePointers)
    {
        if (IGC_IS_FLAG_ENABLED(EnableGASResolver))
        {
            mpm.add(createSROAPass());
            mpm.add(createFixAddrSpaceCastPass());
            mpm.add(createResolveGASPass());
        }
        mpm.add(createGenericAddressDynamicResolutionPass());
    }

    // Resolve private memory to registers.
    if (!isOptDisabled)
    {
        // In case of late inlining of Unmasked functions, allocate
        // non-primitive allocas after inlining is done. Otherwise
        // RegAlloc may not be able to allocate registers for all
        // virtual registers. This piece of code is duplicated at the
        // place where inlining is done.
        if (ctx.m_instrTypes.hasNonPrimitiveAlloca &&
            !(IGC_IS_FLAG_ENABLED(EnableUnmaskedFunctions) &&
              IGC_IS_FLAG_ENABLED(LateInlineUnmaskedFunc)))
        {
            mpm.add(createBreakCriticalEdgesPass());
            mpm.add(createAnnotateUniformAllocasPass());

            if (IGC_IS_FLAG_DISABLED(DisablePromotePrivMem) &&
                ctx.m_retryManager.AllowPromotePrivateMemory())
            {
                mpm.add(createPromotePrivateArrayToReg());
                mpm.add(createCFGSimplificationPass());
            }
        }
        mpm.add(createPromoteMemoryToRegisterPass());
    }
    else
    {
        if (IGC_IS_FLAG_ENABLED(AllowMem2Reg))
            mpm.add(createPromoteMemoryToRegisterPass());
    }

    if (ctx.type == ShaderType::OPENCL_SHADER ||
        ctx.type == ShaderType::COMPUTE_SHADER)
    {
        if (IGC_IS_FLAG_ENABLED(ForceAllPrivateMemoryToSLM))
        {
            mpm.add(new PrivateMemoryToSLM(
                IGC_IS_FLAG_ENABLED(EnableOptReportPrivateMemoryToSLM)));
            mpm.add(createInferAddressSpacesPass());
        }
        else if (IGC_IS_FLAG_ENABLED(ForcePrivateMemoryToSLMOnBuffers))
        {
            std::string forcedBuffers(
                IGC_GET_REGKEYSTRING(ForcePrivateMemoryToSLMOnBuffers));

            mpm.add(new PrivateMemoryToSLM(
                forcedBuffers,
                IGC_IS_FLAG_ENABLED(EnableOptReportPrivateMemoryToSLM)));
            mpm.add(createInferAddressSpacesPass());
        }
    }

    if (ctx.m_instrTypes.numOfLoop)
    {
        // need to run loop simplify to canonicalize loops and merge latches
        mpm.add(createLoopCanonicalization());
        mpm.add(createLoopSimplifyPass());
    }

    if (ctx.enableFunctionCall() || ctx.type == ShaderType::RAYTRACING_SHADER)
    {
        // Sort functions if subroutine/indirect fcall is enabled.
        mpm.add(llvm::createGlobalDCEPass());
        mpm.add(new PurgeMetaDataUtils());
        mpm.add(createGenXCodeGenModulePass());
    }

    // Remove all uses of implicit arg intrinsics after inlining by lowering them to kernel args
    mpm.add(new LowerImplicitArgIntrinsics());

    // Resolving private memory allocas.
    // In case of late inlining of Unmasked functions, postpone memory
    // resolution until inlining is done, as new allocas are created
    // during inlining.
    if (!(IGC_IS_FLAG_ENABLED(EnableUnmaskedFunctions) &&
          IGC_IS_FLAG_ENABLED(LateInlineUnmaskedFunc)))
    {
        mpm.add(CreatePrivateMemoryResolution());
    }
    // Should help MemOpt pass to merge more loads
    mpm.add(createSinkCommonOffsetFromGEPPass());
    // Run MemOpt
    if (!isOptDisabled &&
        ctx.m_instrTypes.hasLoadStore && IGC_IS_FLAG_DISABLED(DisableMemOpt) && !ctx.getModuleMetaData()->disableMemOptforNegativeOffsetLoads) {

        if ((ctx.type == ShaderType::RAYTRACING_SHADER || ctx.hasSyncRTCalls()) &&
            IGC_IS_FLAG_DISABLED(DisablePrepareLoadsStores))
        {
            mpm.add(createPrepareLoadsStoresPass());
        }

        // run AdvMemOpt and MemOpt back-to-back so that we only
        // need to run WIAnalysis once
        if (IGC_IS_FLAG_ENABLED(EnableAdvMemOpt))
            mpm.add(createAdvMemOptPass());

        bool AllowNegativeSymPtrsForLoad =
            ctx.type == ShaderType::RAYTRACING_SHADER ||
            ctx.type == ShaderType::OPENCL_SHADER;

        bool AllowVector8LoadStore =
            IGC_IS_FLAG_ENABLED(EnableVector8LoadStore) ||
            ((ctx.type == ShaderType::RAYTRACING_SHADER || ctx.hasSyncRTCalls()) && ctx.platform.supports8DWLSCMessage());

        mpm.add(createMemOptPass(AllowNegativeSymPtrsForLoad, AllowVector8LoadStore));

        if (ctx.type == ShaderType::RAYTRACING_SHADER &&
            static_cast<RayDispatchShaderContext&>(ctx).doSpillWidening())
        {
            mpm.add(createRTSpillShrinkPass());
            mpm.add(createMemOptPass(AllowNegativeSymPtrsForLoad, AllowVector8LoadStore));
        }

        if ((ctx.type == ShaderType::RAYTRACING_SHADER || ctx.hasSyncRTCalls())
            && IGC_IS_FLAG_ENABLED(EnableLSCCacheOptimization))
        {
            // Optimize store instructions for utilizing the LSC-L1 cache.
            // This only runs for shaders with raytracing functionality.
            mpm.add(createLSCCacheOptimizationPass());
        }

        mpm.add(createIGCInstructionCombiningPass());
        if (ctx.type == ShaderType::OPENCL_SHADER &&
            static_cast<OpenCLProgramContext&>(ctx).m_InternalOptions.KernelDebugEnable)
        {
            mpm.add(new ImplicitGIDRestoring());
        }
    }

    if (ctx.type == ShaderType::RAYTRACING_SHADER)
    {
        if (IGC_IS_FLAG_ENABLED(EnableStackIDReleaseScheduling))
            mpm.add(createStackIDSchedulingPass());
        // This will help eliminate some redundant loads in some cases. Needs
        // to run before the ldraw*/storeraw* intrinsics are created for
        // raytracing memory.
        mpm.add(createEarlyCSEPass());
        if (IGC_IS_FLAG_DISABLED(DisableRTMemDSE))
            mpm.add(createRayTracingMemDSEPass());
        // Convert load/store to ldraw*/storeraw*
        mpm.add(createRaytracingStatefulPass());
        if (IGC_IS_FLAG_DISABLED(DisableRayTracingConstantCoalescing))
        {
            // block load RTGlobals
            mpm.add(createRayTracingConstantCoalescingPass());
        }
        // lift raygen shader global and local pointers to inline data access.
        mpm.add(CreateBindlessInlineDataPass());
    }
    else if (ctx.hasSyncRTCalls())
    {
        mpm.add(createRaytracingStatefulPass());
    }

    if (ctx.type == ShaderType::OPENCL_SHADER &&
        static_cast<OpenCLProgramContext&>(ctx).
            m_InternalOptions.PromoteStatelessToBindless &&
        (static_cast<OpenCLProgramContext&>(ctx).
            m_InternalOptions.UseBindlessLegacyMode ||
            !ctx.getModuleMetaData()->compOpt.GreaterThan4GBBufferRequired)
        )
    {
        mpm.add(new PromoteStatelessToBindless());
    }

    if (!isOptDisabled &&
        ctx.m_instrTypes.hasLoadStore &&
        ctx.m_DriverInfo.SupportsStatelessToStatefulBufferTransformation() &&
        !ctx.getModuleMetaData()->compOpt.GreaterThan4GBBufferRequired &&
        IGC_IS_FLAG_ENABLED(EnableStatelessToStateful) &&
        !ctx.m_instrTypes.hasInlineAsmPointerAccess)
    {
        bool hasBufOff = (IGC_IS_FLAG_ENABLED(EnableSupportBufferOffset) ||
                            ctx.getModuleMetaData()->compOpt.HasBufferOffsetArg);
        mpm.add(new StatelessToStateful(hasBufOff));
    }

    // Light cleanup for subroutines after cloning. Note that the constant
    // propagation order is reversed, compared to the opt sequence in
    // OptimizeIR. There is a substantial gain with CFG simplification after
    // interprocedural constant propagation.
    if (ctx.m_enableSubroutine && !isOptDisabled)
    {
        mpm.add(createPruneUnusedArgumentsPass());

#if LLVM_VERSION_MAJOR >= 12
        mpm.add(createIPSCCPPass());
#else
        if (!ctx.m_hasStackCalls)
        {
            // Don't run IPConstantProp when stackcalls are present.
            // Let global constants be relocated inside stack funcs.
            // We cannot process SLM constants inside stackcalls, so don't propagate them.
            mpm.add(createIPConstantPropagationPass());
        }
        mpm.add(createConstantPropagationPass());
#endif

        mpm.add(createDeadCodeEliminationPass());
        mpm.add(createCFGSimplificationPass());
    }
    // Since we don't support switch statements, switch lowering is needed after the last CFG simplification
    mpm.add(llvm::createLowerSwitchPass());
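    // Sketch of the lowering: LowerSwitch rewrites
    //   switch i32 %v, label %def [ i32 0, label %a
    //                               i32 1, label %b ]
    // into a chain of icmp + conditional branches ending at %def.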

    // There's no particular reason for this exact place, but it should be after LowerGEPForPrivMem
    if (IGC_IS_FLAG_ENABLED(EnableSplitIndirectEEtoSel))
    {
        mpm.add(createSplitIndirectEEtoSelPass());
    }

    // This pass can create constant expressions
    if (ctx.m_DriverInfo.HasDoubleLoadStore())
    {
        mpm.add(new HandleLoadStoreInstructions());
    }

    // Split big vector & 3-element load/store, etc.
    mpm.add(createVectorPreProcessPass());
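    // E.g. (sketch): a <3 x i32> load may be split into a <2 x i32> load plus
    // a scalar i32 load, since 3-element messages are not natively supported;
    // oversized vectors are likewise chopped into legal widths.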

    // Create Gen IR lowering.
    //   To replace SLM pointers if they are constants, break constants first.
    if (ctx.m_instrTypes.hasLocalLoadStore) {
        mpm.add(new BreakConstantExpr());
    }

    bool KeepGEPs;
    // In case of late inlining of Unmasked functions, postpone memory
    // resolution until inlining is done, as new allocas are created
    // during inlining.
    if (IGC_IS_FLAG_ENABLED(EnableUnmaskedFunctions) &&
        IGC_IS_FLAG_ENABLED(LateInlineUnmaskedFunc))
    {
        KeepGEPs = true;
    }
    else
    {
        KeepGEPs = false;
    }
    mpm.add(createGenIRLowerPass());

    if (KeepGEPs)
    {
        mpm.add(createSeparateConstOffsetFromGEPPass());
    }
    else
    {
        // Also break and lower GEP constexpr.
        mpm.add(new BreakConstantExpr());
        mpm.add(createGEPLoweringPass());
    }

    mpm.add(new WorkaroundAnalysis());

    if (!isOptDisabled) {
        // Removing code assumptions can enable some InstructionCombining optimizations.
        // Last instruction combining pass needs to be before Legalization pass, as it can produce illegal instructions.
        mpm.add(new RemoveCodeAssumptions());
        mpm.add(createIGCInstructionCombiningPass());
    }

    if (!isOptDisabled)
    {
        // Optimize lower-level IR
        if (!fastCompile && !highAllocaPressure && !isPotentialHPCKernel)
        {
            mpm.add(createIGCInstructionCombiningPass());
            if (ctx.type == ShaderType::OPENCL_SHADER &&
                static_cast<OpenCLProgramContext&>(ctx).m_InternalOptions.KernelDebugEnable)
            {
                mpm.add(new ImplicitGIDRestoring());
            }
        }
        mpm.add(new GenSpecificPattern());
        // Cases with DPDivSqrtEmu grow significantly.
        // We can disable EarlyCSE when m_hasDPDivSqrtEmu is true,
        // which gives values shorter lifetimes so we can avoid spills.
        if (!fastCompile && !highAllocaPressure && !isPotentialHPCKernel && !ctx.m_hasDPDivSqrtEmu)
        {
            mpm.add(createEarlyCSEPass());
        }
        else if (highAllocaPressure || isPotentialHPCKernel)
        {
            mpm.add(createSinkingPass());
        }
        if (!fastCompile && !highAllocaPressure && !isPotentialHPCKernel &&
            IGC_IS_FLAG_ENABLED(allowLICM) && ctx.m_retryManager.AllowLICM())
        {
            mpm.add(createLICMPass());
            if (ctx.type == ShaderType::OPENCL_SHADER ||
                ctx.type == ShaderType::COMPUTE_SHADER)
            {
                mpm.add(new RegPressureLoopControl());
            }
        }
        mpm.add(createAggressiveDCEPass());
        // As the DPC++ FE applies LICM, we cannot always reduce register
        // pressure just by turning off LICM in IGC, so apply address arithmetic sinking.
        if (ctx.m_retryManager.AllowAddressArithmeticSinking() &&
            ctx.type == ShaderType::OPENCL_SHADER)
        {
            mpm.add(new AddressArithmeticSinking());
        }
    }

    // Enabling half promotion AIL for compute shaders only at this point.
    // If needed ctx.type check can be removed to apply for all shader types
    if (IGC_IS_FLAG_ENABLED(ForceHalfPromotion) ||
        (ctx.getModuleMetaData()->compOpt.WaForceHalfPromotion && ctx.type == ShaderType::COMPUTE_SHADER) ||
        (!ctx.platform.supportFP16() && IGC_IS_FLAG_ENABLED(EnableHalfPromotion)))
    {
        mpm.add(new HalfPromotion());
        mpm.add(createGVNPass());
        mpm.add(createDeadCodeEliminationPass());
    }

    // Run address remat after GVN as it may hoist address calculations and
    // create PHI nodes with addresses.
    mpm.add(createRematAddressArithmeticPass());


    // Run type demotion if it's beneficial.
    if (ctx.m_DriverInfo.benefitFromTypeDemotion() &&
        IGC_IS_FLAG_ENABLED(EnableTypeDemotion)) {
        mpm.add(createTypeDemotePass());
    }

    // Do Genx strengthreduction (do things like fdiv -> inv + mul)
    if (!isOptDisabled)
    {
        mpm.add(createGenStrengthReductionPass());
        mpm.add(createVectorBitCastOptPass());
    }
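    // Example of the fdiv strength reduction mentioned above (sketch, applied
    // only when precision rules allow):
    //   %q = fdiv float %a, %b
    // becomes an inverse plus a multiply:
    //   %inv = <1.0 / %b>  ;  %q = fmul float %a, %inv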

    if (ctx.m_instrTypes.hasUniformAssumptions) {
        mpm.add(new UniformAssumptions());
    }

    // NanHandlingPass needs to be before Legalization since it might make
    // some changes and require Legalization to "legalize"
    if (IGC_IS_FLAG_DISABLED(DisableBranchSwaping) && ctx.m_DriverInfo.BranchSwapping())
    {
        mpm.add(createNanHandlingPass());
    }

    // TODO: move to use instruction flags
    // to figure out if we need to preserve Nan
    bool preserveNan = !ctx.getCompilerOption().NoNaNs;

    // Legalizer does not handle constant expressions
    mpm.add(new BreakConstantExpr());
    mpm.add(new Legalization(preserveNan));

    // Scalarizer in codegen to handle the vector instructions
    mpm.add(new ScalarizerCodeGen());

    // Coalesce scalar loads into loads of larger quantity.
    // This requires and preserves uniform analysis; we should keep other
    // passes that use uniformness together to avoid re-running it several times.
    if (IGC_IS_FLAG_DISABLED(DisableConstantCoalescing) && ctx.m_retryManager.AllowConstantCoalescing() && !ctx.getModuleMetaData()->compOpt.DisableConstantCoalescing)
    {
        mpm.add(createBreakCriticalEdgesPass());
        mpm.add(new ConstantCoalescing());
    }
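    // Sketch of the coalescing: four adjacent dword loads from one constant
    // buffer (offsets 0, 4, 8, 12) can be fused into a single 4-dword block
    // load whose components are then extracted, cutting the message count.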

    if (ctx.type == ShaderType::RAYTRACING_SHADER || ctx.hasSyncRTCalls())
    {
        if (IGC_IS_FLAG_DISABLED(DisableLSCControlsForRayTracing))
            mpm.add(CreateLSCControlsAnalysisPass());

        // We do raytracing lowering a little earlier than the others here
        // to take advantage of the instruction simplification below.
        mpm.add(CreateRayTracingShaderLowering());
    }

    if (ctx.type == ShaderType::RAYTRACING_SHADER)
    {
        if (ctx.platform.WaPredicatedStackIDRelease())
            mpm.add(createRayTracingPredicatedStackIDReleasePass());

        if (IGC_IS_FLAG_DISABLED(DisableRTFenceElision))
            mpm.add(createSynchronizationObjectCoalescing());
    }

    // Instruction combining may merge instructions back into unsupported intrinsics.
    // Therefore last Replace Unsupported Intrinsics Pass must be after last
    // Instruction combining pass.
    // Replace Unsupported Intrinsics Pass may generate new 64 bit operations.
    // Therefore last 64bit emulation pass must be after the last Replace Unsupported Intrinsics Pass.
    mpm.add(createReplaceUnsupportedIntrinsicsPass());

    // When needDPEmu is true, enable Emu64Ops as well for now until
    // DPEmu is able to get rid of all 64bit integer ops fully.
    if ((needDPEmu && IGC_IS_FLAG_ENABLED(DPEmuNeedI64Emu)) ||
        (ctx.m_DriverInfo.Enable64BitEmu() &&
            (IGC_GET_FLAG_VALUE(Enable64BitEmulation) ||
            (IGC_GET_FLAG_VALUE(Enable64BitEmulationOnSelectedPlatform) &&
            ctx.platform.need64BitEmulation()))) ||
            ctx.platform.hasPartialInt64Support()
        )
    {
        mpm.add(new BreakConstantExpr());

        // Emu64OpsPass requires that we are working on legal types, specifically
        // that i128 uses are expanded to i64. This is why we need to run PeepholeTypeLegalizer
        // beforehand.
        mpm.add(new Legalizer::PeepholeTypeLegalizer());
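        // E.g. (sketch): an i128 add is expanded into two i64 adds plus a
        // carry, so the Emu64 passes below only ever see 64-bit integer ops.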
        // Lower all GEPs now as Emu64 doesn't know how to handle them.
        if (KeepGEPs)
        {
            mpm.add(createGEPLoweringPass());
            mpm.add(llvm::createEarlyCSEPass());
        }
        // Run dead code elimination pass right before Emu64OpsPass,
        // as legalization passes do not always clear unused (operating
        // on illegal types) instructions.
        mpm.add(llvm::createDeadCodeEliminationPass());

        if (ctx.platform.hasPartialEmuI64Enabled())
        {
            mpm.add(createPartialEmuI64OpsPass());
        }
        else
        {
            mpm.add(createEmu64OpsPass());
        }

        ctx.m_hasEmu64BitInsts = true;
        if (!isOptDisabled)
        {
            mpm.add(new GenSpecificPattern());
        }
    }

    if (ctx.m_instrTypes.hasRuntimeValueVector)
    {
        // Legalize RuntimeValue calls for push analysis
        mpm.add(new RuntimeValueLegalizationPass());
    }

    mpm.add(createInstSimplifyLegacyPass());
    // This pass inserts bitcasts for vector loads/stores.
    // This pass could be moved further toward EmitPass.
    mpm.add(createVectorProcessPass());

    // handling constant expressions created by vectorProcess pass
    mpm.add(new BreakConstantExpr());

    mpm.add(new LowPrecisionOpt());


    mpm.add(new WAFMinFMax());

    // Preferred to be added after llvm instruction combining, otherwise 'generic.arith'
    // metadata may get lost during optimizations.
    mpm.add(new InsertGenericPtrArithmeticMetadata());

    mpm.add(createTimeStatsCounterPass(&ctx, TIME_CG_Legalization, STATS_COUNTER_END));

    COMPILER_TIME_END(&ctx, TIME_CG_Add_Legalization_Passes);
} // AddLegalizationPasses

void AddCodeGenPasses(
    CodeGenContext& ctx,
    CShaderProgram::KernelShaderMap& shaders,
    IGCPassManager& Passes,
    SIMDMode simdMode,
    bool canAbortOnSpill,
    ShaderDispatchMode shaderMode,
    PSSignature* pSignature)
{
    // Generate CISA
    COMPILER_TIME_START(&ctx, TIME_CG_Add_CodeGen_Passes);
    Passes.add(new EmitPass(shaders, simdMode, canAbortOnSpill, shaderMode, pSignature));
    COMPILER_TIME_END(&ctx, TIME_CG_Add_CodeGen_Passes);
}


// check based on performance measures.
bool SimdEarlyCheck(CodeGenContext* ctx)
{
    if (ctx->m_sampler < 11 || ctx->m_inputCount < 16 || ctx->m_tempCount < 40 || ctx->m_dxbcCount < 280 || ctx->m_ConstantBufferCount < 500)
    {
        if (ctx->m_tempCount < 90 && ctx->m_ConstantBufferCount < 210)
        {
            return true;
        }
    }
    return false;
}

bool ForceSimdWA(ComputeShaderContext& ctx, SIMDMode & forceSimd, SIMDMode minSimdMode, SIMDMode maxSimdMode)
{
    // WA for better utilization of SIMD lanes
    if (ctx.platform.needsWAForThreadsUtilization() &&
        ctx.getModuleMetaData()->csInfo.waveSize == 0)
    {
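        // Rationale (informal): e.g. a 16x1x1 thread group compiled at SIMD32
        // fills only half the lanes of one HW thread; forcing SIMD8 (or SIMD16
        // for 32x1x1) spreads the group across threads with fully used lanes.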
        unsigned sizeX = ctx.GetThreadGroupSizeX();
        unsigned sizeY = ctx.GetThreadGroupSizeY();
        unsigned sizeZ = ctx.GetThreadGroupSizeZ();

        // Force SIMD8 on thread group size 16x1x1
        if (sizeX == 16 && sizeY == 1 && sizeZ == 1 &&
            minSimdMode >= SIMDMode::SIMD8)
        {
            forceSimd = SIMDMode::SIMD8;
            return true;
        }
        // Force SIMD16 or lower on thread group size 32x1x1
        else if (sizeX == 32 && sizeY == 1 && sizeZ == 1 &&
            minSimdMode <= SIMDMode::SIMD16 &&
            maxSimdMode >= SIMDMode::SIMD16)
        {
            forceSimd = SIMDMode::SIMD16;
            return true;
        }
    }

    forceSimd = SIMDMode::UNKNOWN;
    return false;
}

void destroyShaderMap(CShaderProgram::KernelShaderMap& shaders)
{
    for (auto i : shaders)
    {
        CShaderProgram* shader = i.second;
        COMPILER_SHADER_STATS_PRINT(shader->m_shaderStats,
            shader->GetContext()->type, shader->GetContext()->hash, "");
        COMPILER_SHADER_STATS_SUM(shader->GetContext()->m_sumShaderStats,
            shader->m_shaderStats, shader->GetContext()->type);
        COMPILER_SHADER_STATS_DEL(shader->m_shaderStats);
        delete shader;
    }
}


void unify_opt_PreProcess(CodeGenContext* pContext)
{
    TODO("hasBuiltin should be calculated based on module");
    if (IGC_IS_FLAG_ENABLED(DisableLLVMGenericOptimizations))
    {
        pContext->getModuleMetaData()->compOpt.OptDisable = true;
    }

    if (IGC_GET_FLAG_VALUE(StripDebugInfo) == FLAG_DEBUG_INFO_STRIP_ALL)
    {
        StripDebugInfo(*pContext->getModule());
    }
    else if (IGC_GET_FLAG_VALUE(StripDebugInfo) == FLAG_DEBUG_INFO_STRIP_NONLINE)
    {
        stripNonLineTableDebugInfo(*pContext->getModule());
    }

    IGCPassManager mpm(pContext, "OPTPre");
    mpm.add(new CheckInstrTypes(&(pContext->m_instrTypes), &pContext->metrics));

    if (pContext->isPOSH())
    {
        mpm.add(createRemoveNonPositionOutputPass());
    }

    mpm.run(*pContext->getModule());

    // If the module does not contain declarations of called functions,
    // indirect calls are the only way to detect function pointer usage.
    if (pContext->m_instrTypes.hasIndirectCall)
        pContext->m_enableFunctionPointer = true;

    if (pContext->getMetaDataUtils()->size_FunctionsInfo() == 1 &&
        !pContext->m_instrTypes.hasSubroutines)
    {
        pContext->m_instrTypes.numBB =
            pContext->getMetaDataUtils()->begin_FunctionsInfo()->first->getBasicBlockList().size();
        pContext->m_instrTypes.hasMultipleBB = (pContext->m_instrTypes.numBB != 1);
    }
    else
    {
        pContext->m_instrTypes.hasMultipleBB = true;
    }

    // Conservatively assume loads/stores are present; over-approximating this
    // hint only enables additional passes later on.
    pContext->m_instrTypes.hasLoadStore = true;

    pContext->m_instrTypes.CorrelatedValuePropagationEnable =
        (pContext->m_instrTypes.hasMultipleBB &&
        (pContext->m_instrTypes.hasSel ||
        pContext->m_instrTypes.hasCmp ||
        pContext->m_instrTypes.hasSwitch ||
            pContext->m_instrTypes.hasLoadStore));
}

static bool extensiveShader(CodeGenContext* pContext)
{
    return (pContext->type == ShaderType::OPENCL_SHADER &&
        pContext->m_instrTypes.numInsts > INST_THRESHOLD &&
        pContext->m_instrTypes.numLoopInsts > LOOP_INST_THRESHOLD &&
        pContext->m_instrTypes.numOfLoop > LOOP_NUM_THRESHOLD &&
        pContext->m_instrTypes.numBB == 0 &&
        pContext->m_instrTypes.numSample == 0 &&
        pContext->m_instrTypes.hasSubroutines);
}

// When we do not run optimizations, we still need to run the always-inline
// pass, otherwise codegen will fail.
static void alwaysInlineForNoOpt(CodeGenContext* pContext, bool NoOpt)
{
    if (NoOpt)
    {
        MetaDataUtils* pMdUtils = pContext->getMetaDataUtils();
        IGCPassManager mpm(pContext, "OPTPost");
        mpm.add(new MetaDataUtilsWrapper(pMdUtils, pContext->getModuleMetaData()));
        mpm.add(new CodeGenContextWrapper(pContext));
        mpm.add(createAlwaysInlinerLegacyPass());
        mpm.add(new PurgeMetaDataUtils());
        mpm.run(*pContext->getModule());
    }
}


void OptimizeIR(CodeGenContext* const pContext)
{
    IGC_ASSERT(nullptr != pContext);
    MetaDataUtils* pMdUtils = pContext->getMetaDataUtils();
    IGC_ASSERT(nullptr != pContext->getModuleMetaData());
    bool NoOpt = pContext->getModuleMetaData()->compOpt.OptDisable;

    alwaysInlineForNoOpt(pContext, NoOpt);

    if (pContext->type == ShaderType::OPENCL_SHADER)
    {
        if (((OpenCLProgramContext*)pContext)->m_InternalOptions.KernelDebugEnable)
        {
            IGCPassManager mpm(pContext, "CleanImplicitId");
            mpm.add(new CleanImplicitIds());
            mpm.run(*pContext->getModule());
        }
    }
    if (NoOpt)
    {
        return;
    }

    IGCPassManager mpm(pContext, "OPT");

#if defined(_DEBUG) || defined(_INTERNAL)
    // do verifyModule for debug/release_internal only.
    if (false == pContext->m_hasLegacyDebugInfo)
    {
        IGC_ASSERT(nullptr != pContext->getModule());
        IGC_ASSERT(false == llvm::verifyModule(*pContext->getModule(), &dbgs()));
    }
#endif

    COMPILER_TIME_START(pContext, TIME_OptimizationPasses);
    // scope to force destructors before mem usage sampling
    {
        unify_opt_PreProcess(pContext);
        /// Keeps track of the Dump objects so that we can free them after the pass manager has been run

        // Right now we don't support any standard functions in codegen;
        // maybe we'll support some at some point to take advantage of LLVM optimizations.
        TargetLibraryInfoImpl TLI;
        TLI.disableAllFunctions();

        mpm.add(new MetaDataUtilsWrapper(pMdUtils, pContext->getModuleMetaData()));

        mpm.add(new CodeGenContextWrapper(pContext));
        DummyPass* dummypass = new DummyPass();
        mpm.add(dummypass);
        TargetIRAnalysis GenTTgetIIRAnalysis([&](const Function& F) {
            GenIntrinsicsTTIImpl GTTI(pContext, dummypass);
            return TargetTransformInfo(GTTI);
        });

        mpm.add(new TargetTransformInfoWrapperPass(GenTTgetIIRAnalysis));
#if defined(_DEBUG) && !defined(__ANDROID__)
        // IGC IR Verification pass checks that we get a correct IR after the Unification.
        mpm.add(new VerificationPass());
#endif
        mpm.add(new llvm::TargetLibraryInfoWrapperPass(TLI));
        initializeWIAnalysisPass(*PassRegistry::getPassRegistry());

        // Do inter-procedural constant propagation early.
        if (pContext->m_enableSubroutine)
        {
            // Here, we propagate function attributes across calls.  Remaining
            // function calls that were conservatively marked as 'convergent'
            // in ProcessBuiltinMetaData can have that attribute stripped if
            // possible which potentially allows late stage code sinking of
            // those calls by the instruction combiner.
            mpm.add(createPostOrderFunctionAttrsLegacyPass());
#if LLVM_VERSION_MAJOR >= 12
            mpm.add(createIPSCCPPass());
#else
            mpm.add(createConstantPropagationPass());

            // Don't run IPConstantProp if there are stackcalls
            if (!pContext->m_hasStackCalls)
                mpm.add(createIPConstantPropagationPass());
#endif
        }
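
        // Illustrative sketch of what the inter-procedural propagation above
        // enables (hypothetical subroutine, not from this module):
        //
        //   define internal i32 @foo(i32 %x) { ret i32 %x }
        //   %r = call i32 @foo(i32 7)     ; IPSCCP can fold %r to 7
        //
        // Dropping a conservatively placed 'convergent' attribute likewise
        // frees the instruction combiner to sink such calls later.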

        if (IGC_IS_FLAG_ENABLED(MSAA16BitPayloadEnable) &&
            pContext->platform.support16bitMSAAPayload())
        {
            mpm.add(new ConvertMSAAPayloadTo16Bit());
        }

        if (IGC_GET_FLAG_VALUE(MSAAClearedKernel) > 0)
        {
            mpm.add(new MSAAInsertDiscard());
        }
        mpm.add(createSamplerPerfOptPass());

        // enable this only when Pooled EU is not supported
        if ((IGC_IS_FLAG_ENABLED(EnableThreadCombiningOpt) ||
             IGC_IS_FLAG_ENABLED(EnableForceThreadCombining) ||
             IGC_IS_FLAG_ENABLED(EnableForceGroupSize)) &&
            (pContext->type == ShaderType::COMPUTE_SHADER) &&
            !pContext->platform.supportPooledEU() &&
            pContext->platform.supportsThreadCombining() &&
            SimdEarlyCheck(pContext))
        {
            initializePostDominatorTreeWrapperPassPass(*PassRegistry::getPassRegistry());
            mpm.add(new ThreadCombining());
            mpm.add(createAlwaysInlinerLegacyPass());
            mpm.add(createPromoteMemoryToRegisterPass());
        }
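
        // ThreadCombining presumably merges the work of several logical
        // compute threads into one hardware thread when pooled EUs are
        // unavailable; the always-inliner and mem2reg runs that follow it
        // clean up the merged kernel body.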

        if ((!IGC_IS_FLAG_ENABLED(DisableDynamicTextureFolding) && pContext->getModuleMetaData()->inlineDynTextures.size() > 0) ||
            (!IGC_IS_FLAG_ENABLED(DisableDynamicResInfoFolding)))
        {
            mpm.add(new DynamicTextureFolding());
        }

        if (IGC_IS_FLAG_ENABLED(EnableSLMConstProp) &&
            pContext->type == ShaderType::COMPUTE_SHADER)
        {
            mpm.add(createSLMConstPropPass());
        }

        if (pContext->m_DriverInfo.CodeSinkingBeforeCFGSimplification())
        {
            mpm.add(new CodeSinking(true));
        }
        mpm.add(llvm::createCFGSimplificationPass());

        mpm.add(llvm::createBasicAAWrapperPass());
        mpm.add(createAddressSpaceAAWrapperPass());

        if (pContext->type == ShaderType::RAYTRACING_SHADER || pContext->hasSyncRTCalls())
        {
            if (IGC_IS_FLAG_DISABLED(DisableRTAliasAnalysis))
                mpm.add(createRayTracingAddressSpaceAAWrapperPass());
        }

        mpm.add(createExternalAAWrapperPass(&addAddressSpaceAAResult));

        if (pContext->m_instrTypes.hasLoadStore)
        {
            mpm.add(llvm::createDeadStoreEliminationPass());
            mpm.add(createMarkReadOnlyLoadPass());
        }

        mpm.add(createLogicalAndToBranchPass());
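        // Illustrative sketch of the pass above (inferred from the pass name,
        // not from this file): an eagerly evaluated conjunction is split into
        // control flow, e.g.
        //
        //   %c = and i1 %a, %b        ; %b is costly to compute
        //   ==>
        //   br i1 %a, %evalB, %skip   ; only compute %b when %a is true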
        mpm.add(llvm::createEarlyCSEPass());

        if (IGC_IS_FLAG_ENABLED(EnableHFpacking) &&
            pContext->type == ShaderType::COMPUTE_SHADER)
        {
            mpm.add(createHFpackingOptPass());
        }

        if (pContext->m_instrTypes.CorrelatedValuePropagationEnable)
        {
            mpm.add(llvm::createCorrelatedValuePropagationPass());
        }

        mpm.add(new BreakConstantExpr());
        mpm.add(new IGCConstProp());
        mpm.add(new CustomSafeOptPass());
        if (!pContext->m_DriverInfo.WADisableCustomPass())
        {
            mpm.add(new CustomUnsafeOptPass());
        }

        if (IGC_IS_FLAG_ENABLED(EmulateFDIV))
        {
            mpm.add(createGenFDIVEmulation());
        }

        mpm.add(createIGCInstructionCombiningPass());
        if (pContext->type == ShaderType::OPENCL_SHADER &&
            static_cast<OpenCLProgramContext*>(pContext)->m_InternalOptions.KernelDebugEnable)
        {
            mpm.add(new ImplicitGIDRestoring());
        }
        mpm.add(new FCmpPaternMatch());
        mpm.add(llvm::createDeadCodeEliminationPass()); // this should be done both before/after constant propagation

        if (pContext->m_instrTypes.hasGenericAddressSpacePointers &&
            IGC_IS_FLAG_ENABLED(EnableGASResolver))
        {
            mpm.add(createSROAPass());
            mpm.add(createFixAddrSpaceCastPass());
            mpm.add(createResolveGASPass());
        }
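
        // Sketch of the generic-address-space resolution above (illustrative;
        // addrspace numbering follows IGC's convention of 4 == generic):
        //
        //   %g = addrspacecast i32 addrspace(3)* %p to i32 addrspace(4)*
        //   %v = load i32, i32 addrspace(4)* %g
        //   ==>
        //   %v = load i32, i32 addrspace(3)* %p   ; direct SLM access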

        if (IGC_IS_FLAG_ENABLED(SampleMultiversioning) || pContext->m_enableSampleMultiversioning)
        {
            if (pContext->m_instrTypes.numOfLoop == 0)
                mpm.add(new SampleMultiversioning(pContext));
        }

        bool disableGOPT = ( (IsStage1FastestCompile(pContext->m_CgFlag, pContext->m_StagingCtx) ||
                               IGC_GET_FLAG_VALUE(ForceFastestSIMD)) &&
                             ((IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_DISABLE_GOPT) ||
                               IGC_GET_FLAG_VALUE(FastestS1Experiments) == FCEXP_NO_EXPRIMENT ||
                               pContext->getModuleMetaData()->compOpt.DisableFastestGopt));
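
        // In short: under stage-1 "fastest" compilation, the multiple-BB
        // global-optimization block below is skipped either on explicit
        // request (FCEXP_DISABLE_GOPT, DisableFastestGopt) or by default
        // when no experiment is selected (FCEXP_NO_EXPRIMENT).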

        if (pContext->m_instrTypes.hasMultipleBB && !disableGOPT)
        {
            // disable loop unroll for excessively large shaders
            if (pContext->m_instrTypes.numOfLoop)
            {
                mpm.add(createLoopDeadCodeEliminationPass());
                mpm.add(createLoopCanonicalization());
                mpm.add(llvm::createLoopDeletionPass());
                mpm.add(llvm::createBreakCriticalEdgesPass());
                mpm.add(llvm::createLoopRotatePass(LOOP_ROTATION_HEADER_INST_THRESHOLD));
                mpm.add(llvm::createLCSSAPass());
                mpm.add(llvm::createLoopSimplifyPass());

                if (pContext->m_retryManager.AllowLICM() && IGC_IS_FLAG_ENABLED(allowLICM))
                {
                    int licmTh = IGC_GET_FLAG_VALUE(LICMStatThreshold);
                    mpm.add(new InstrStatistic(pContext, LICM_STAT, InstrStatStage::BEGIN, licmTh));
                    mpm.add(llvm::createLICMPass());
                    mpm.add(new InstrStatistic(pContext, LICM_STAT, InstrStatStage::END, licmTh));
                }


                if (!pContext->m_retryManager.IsFirstTry())
                {
                    mpm.add(new DisableLoopUnrollOnRetry());
                }


                mpm.add(createIGCInstructionCombiningPass());

                if (IGC_IS_FLAG_ENABLED(EnableLoopHoistConstant))
                {
                    mpm.add(createLoopHoistConstant());
                }

                if (pContext->type == ShaderType::OPENCL_SHADER &&
                    static_cast<OpenCLProgramContext*>(pContext)->m_InternalOptions.KernelDebugEnable)
                {
                    mpm.add(new ImplicitGIDRestoring());
                }
                if (IGC_IS_FLAG_ENABLED(EnableAdvCodeMotion) &&
                    pContext->type == ShaderType::OPENCL_SHADER &&
                    !pContext->m_instrTypes.hasSwitch)
                {
                    mpm.add(createAdvCodeMotionPass(IGC_GET_FLAG_VALUE(AdvCodeMotionControl)));
                }

                int LoopUnrollThreshold = pContext->m_DriverInfo.GetLoopUnrollThreshold();

                // override the LoopUnrollThreshold if the registry key is set
                if (IGC_GET_FLAG_VALUE(SetLoopUnrollThreshold) != 0)
                {
                    LoopUnrollThreshold = IGC_GET_FLAG_VALUE(SetLoopUnrollThreshold);
                }

                // if the shader contains indexable_temp, keep loop unrolling enabled
                bool unroll = IGC_IS_FLAG_DISABLED(DisableLoopUnroll);
                bool hasIndexTemp = (pContext->m_indexableTempSize[0] > 0);
                bool disableLoopUnrollStage1 =
                    IsStage1FastestCompile(pContext->m_CgFlag, pContext->m_StagingCtx) &&
                       (IGC_GET_FLAG_VALUE(FastestS1Experiments) == FCEXP_NO_EXPRIMENT ||
                        (IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_DISABLE_UNROLL));
                if ((LoopUnrollThreshold > 0 &&
                     unroll &&
                     !disableLoopUnrollStage1)
                    || hasIndexTemp)
                {
                    mpm.add(IGCLLVM::createLoopUnrollPass());
                }

                // Due to what looks like a bug in LICM, we need to break the LoopPassManager between
                // LoopUnroll and LICM.
                mpm.add(createBarrierNoopPass());

                if (pContext->m_retryManager.AllowLICM() && IGC_IS_FLAG_ENABLED(allowLICM))
                {
                    mpm.add(llvm::createLICMPass());
                }

                // Second unrolling with the same threshold.
                if (LoopUnrollThreshold > 0 && !IGC_IS_FLAG_ENABLED(DisableLoopUnroll))
                {
                    mpm.add(IGCLLVM::createLoopUnrollPass());
                }

                mpm.add(llvm::createLoopLoadEliminationPass());

                if (!extensiveShader(pContext) && pContext->m_instrTypes.hasNonPrimitiveAlloca)
                {
                if (pContext->m_DriverInfo.NeedCountSROA())
                {
                    mpm.add(new InstrStatistic(pContext, SROA_PROMOTED, InstrStatStage::BEGIN, 300));
                    mpm.add(createSROAPass());
                    mpm.add(new InstrStatistic(pContext, SROA_PROMOTED, InstrStatStage::END, 300));
                }
                else
                {
                    mpm.add(createSROAPass());
                }
            }
            }

            // Note:
            // Call the reassociation pass before IGCConstProp(EnableSimplifyGEP)
            // to preserve the expression evaluation order that IGCConstProp
            // creates.
            // Limit this optimization to GPGPU only, because GPGPU code tends
            // to have more address computation.
            // Do not apply reordering on vertex shaders, just as
            // CustomUnsafeOptPass does not.
            if (IGC_IS_FLAG_ENABLED(OCLEnableReassociate) &&
                pContext->type == ShaderType::OPENCL_SHADER)
            {
                mpm.add(createReassociatePass());
            }

            mpm.add(createPromoteConstantStructsPass());

            if (IGC_IS_FLAG_ENABLED(EnableGVN))
            {
                mpm.add(llvm::createGVNPass());
            }
            mpm.add(createGenOptLegalizer());

            mpm.add(llvm::createSCCPPass());

            mpm.add(llvm::createDeadCodeEliminationPass());
            if (!extensiveShader(pContext))
                mpm.add(llvm::createAggressiveDCEPass());

            mpm.add(new BreakConstantExpr());
            mpm.add(new IGCConstProp(IGC_IS_FLAG_ENABLED(EnableSimplifyGEP)));

            if (IGC_IS_FLAG_DISABLED(DisableImmConstantOpt))
            {
                // If we have ICBs, need to emit clamp code so OOB access doesn't occur
                if (pContext->getModuleMetaData()->immConstant.zeroIdxs.size())
                {
                    mpm.add(createClampICBOOBAccess());
                }

                mpm.add(createIGCIndirectICBPropagaionPass());
            }
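
            // Hypothetical illustration of the ICB clamp above (names are
            // invented for the example): an indexed immediate-constant-buffer
            // read gets its index clamped so out-of-bounds access cannot occur:
            //
            //   val = icb[idx];
            //   ==>
            //   val = icb[min(idx, ICB_SIZE - 1)];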

            mpm.add(new GenUpdateCB());

            if (!pContext->m_instrTypes.hasAtomics && !extensiveShader(pContext))
            {
                if (pContext->type == ShaderType::OPENCL_SHADER)
                {
                    // Add CFGSimplification for clean-up before JumpThreading.
                    mpm.add(llvm::createCFGSimplificationPass());
                }

                // jump threading currently causes the atomic_flag test from c11 conformance to fail.  Right now,
                // only do jump threading if we don't have atomics as using atomics as locks seems to be the most common
                // case of violating the no independent forward progress clause from the spec.
                mpm.add(llvm::createJumpThreadingPass());
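
                // Classic jump-threading example (illustrative only):
                //
                //   if (c) { x = 1; } else { x = 2; }
                //   if (c) { use(x); }
                //
                // The second test of 'c' is threaded away, so 'use(x)' is
                // reached directly from the 'x = 1' path.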
            }
            mpm.add(llvm::createCFGSimplificationPass());
            mpm.add(llvm::createEarlyCSEPass());
            if (pContext->m_instrTypes.hasNonPrimitiveAlloca)
            {
                // run custom safe opts to potentially get rid of indirect
                // addressing of private arrays, see visitLoadInst
                mpm.add(new CustomSafeOptPass());
                mpm.add(createSROAPass());
            }

            // Use CFGSimplification to do clean-up. Needs to be invoked before lowerSwitch.
            mpm.add(llvm::createCFGSimplificationPass());

            if (IGC_IS_FLAG_DISABLED(DisableFlattenSmallSwitch))
            {
                mpm.add(createFlattenSmallSwitchPass());
            }
            // some optimizations can create switch statements we don't support
            mpm.add(llvm::createLowerSwitchPass());

            // Preferably added after all LowerSwitch passes have run, as switch
            // lowering can benefit from an unreachable instruction in the
            // default switch case.
            mpm.add(new UnreachableHandling());
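
            // Illustrative example of the interaction noted above: given
            //
            //   switch (v) { case 0: ...; case 1: ...;
            //                default: __builtin_unreachable(); }
            //
            // the lowering can omit range checks for the default case entirely.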

            // Conditions apply just as above due to problems with atomics
            // (see comment above for details).
            if (!pContext->m_instrTypes.hasAtomics && !extensiveShader(pContext))
            {
                // After lowering 'switch', run jump threading to remove redundant jumps.
                mpm.add(llvm::createJumpThreadingPass());
            }

            // run instruction combining to clean up the code after CFG optimizations
            mpm.add(createIGCInstructionCombiningPass());
            if (pContext->type == ShaderType::OPENCL_SHADER &&
                static_cast<OpenCLProgramContext*>(pContext)->m_InternalOptions.KernelDebugEnable)
            {
                mpm.add(new ImplicitGIDRestoring());
            }

            mpm.add(llvm::createDeadCodeEliminationPass());
            mpm.add(llvm::createEarlyCSEPass());


            // need to be before code sinking
            mpm.add(createInsertBranchOptPass());

            mpm.add(new CustomSafeOptPass());
            if (!pContext->m_DriverInfo.WADisableCustomPass())
            {
                mpm.add(new CustomUnsafeOptPass());
            }
        }
        else
        {
            if (pContext->m_instrTypes.hasMultipleBB)
            {
                IGC_ASSERT(disableGOPT);
                // disable loop unroll for excessively large shaders
                if (pContext->m_instrTypes.numOfLoop)
                {
                    mpm.add(llvm::createLoopRotatePass(LOOP_ROTATION_HEADER_INST_THRESHOLD));


                    int LoopUnrollThreshold = pContext->m_DriverInfo.GetLoopUnrollThreshold();

                    // override the LoopUnrollThreshold if the registry key is set
                    if (IGC_GET_FLAG_VALUE(SetLoopUnrollThreshold) != 0)
                    {
                        LoopUnrollThreshold = IGC_GET_FLAG_VALUE(SetLoopUnrollThreshold);
                    }

                    // if the shader contains indexable_temp, keep loop unrolling enabled
                    bool unroll = IGC_IS_FLAG_DISABLED(DisableLoopUnroll);
                    bool hasIndexTemp = (pContext->m_indexableTempSize[0] > 0);
                    // Enable loop unrolling for stage 1 for now, due to persistent regressions
                    bool disableLoopUnrollStage1 =
                        IsStage1FastestCompile(pContext->m_CgFlag, pContext->m_StagingCtx) &&
                           (//IGC_GET_FLAG_VALUE(FastestS1Experiments) == FCEXP_NO_EXPRIMENT ||
                            (IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_DISABLE_UNROLL));
                    if ((LoopUnrollThreshold > 0 &&
                         unroll &&
                         !disableLoopUnrollStage1)
                        || hasIndexTemp)
                    {
                        mpm.add(IGCLLVM::createLoopUnrollPass());
                    }
                }

                if (IGC_IS_FLAG_ENABLED(EnableGVN))
                {
                    mpm.add(llvm::createGVNPass());
                }
            }
            if (IGC_IS_FLAG_DISABLED(DisableImmConstantOpt))
            {
                // If we have ICBs, need to emit clamp code so OOB access doesn't occur
                if (pContext->getModuleMetaData()->immConstant.zeroIdxs.size())
                {
                    mpm.add(createClampICBOOBAccess());
                }

                mpm.add(createIGCIndirectICBPropagaionPass());
            }

            // single basic block
            if (!pContext->m_DriverInfo.WADisableCustomPass())
            {
                mpm.add(llvm::createEarlyCSEPass());
                mpm.add(new CustomSafeOptPass());
                mpm.add(new CustomUnsafeOptPass());
            }
            mpm.add(createGenOptLegalizer());
            mpm.add(createInsertBranchOptPass());
        }

        // If we have ICBs, need to emit clamp code so OOB access doesn't occur
        if (pContext->getModuleMetaData()->immConstant.zeroIdxs.size() && IGC_IS_FLAG_ENABLED(DisableImmConstantOpt))
        {
            mpm.add(createClampICBOOBAccess());
        }

        if (pContext->m_instrTypes.hasRuntimeValueVector)
        {
            // Optimize extracts from RuntimeValue vectors. This should be executed
            // after constant propagation and loop unrolling.
            mpm.add(createVectorBitCastOptPass());
            mpm.add(new RuntimeValueVectorExtractPass());
        }

        if (pContext->m_enableSubroutine &&
            getFunctionControl(pContext) == FLAG_FCALL_DEFAULT)
        {
            mpm.add(createEstimateFunctionSizePass(EstimateFunctionSize::AL_Kernel));
            mpm.add(createSubroutineInlinerPass());
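
            // Descriptive sketch (inferred from the pass names): EstimateFunctionSize
            // sizes each kernel's call tree at kernel granularity (AL_Kernel) so
            // SubroutineInliner can choose which calls to inline and which to keep
            // as subroutines under that size budget.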
        }
        else
        {
            // Inline all remaining functions with always inline attribute.
            mpm.add(createAlwaysInlinerLegacyPass());
        }
        if ((pContext->m_DriverInfo.NeedExtraPassesAfterAlwaysInlinerPass() || pContext->m_enableSubroutine)
            && pContext->m_instrTypes.hasNonPrimitiveAlloca)
        {
            mpm.add(createSROAPass());
        }

#if LLVM_VERSION_MAJOR >= 7
        mpm.add(new TrivialLocalMemoryOpsElimination());
#endif
        mpm.add(createGenSimplificationPass());

        if (pContext->m_instrTypes.hasLoadStore)
        {
            mpm.add(llvm::createDeadStoreEliminationPass());
            mpm.add(llvm::createMemCpyOptPass());
            mpm.add(createLdShrinkPass());
        }

        mpm.add(llvm::createDeadCodeEliminationPass());

        if (IGC_IS_FLAG_ENABLED(EnableIntDivRemCombine)) {
            // Simplify rem if the quotient is available.
            //
            // Run GVN first so that non-trivially-equal operand pairs such as
            //   = foo / (2*x + 1)
            //   = foo % (2*x + 1)
            // can be reduced as well.
            if (IGC_IS_FLAG_ENABLED(EnableGVN)) {
                mpm.add(llvm::createGVNPass());
            }
            mpm.add(createIntDivRemCombinePass());
        }
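
        // Worked example of the rem combine above (illustrative): when the
        // quotient q = a / b is already available,
        //
        //   r = a % b   ==>   r = a - q * b
        //
        // replacing a second divide with a multiply and a subtract.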
        if (IGC_IS_FLAG_ENABLED(EnableConstIntDivReduction)) {
            // Reduce division/remainder by constant divisors/moduli to
            // more efficient sequences of multiplies, shifts, and adds.
            mpm.add(createIntDivConstantReductionPass());
        }
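
        // Sketch of the constant-divisor strength reduction above
        // (illustrative; exact magic constants omitted): for unsigned x,
        //
        //   x / 7   ==>   (mulhi(x, magic) + adjust) >> shift
        //
        // avoiding a hardware integer divide.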

        mpm.add(createMergeMemFromBranchOptPass());

        mpm.add(createConstantMergePass());

        mpm.add(CreateMCSOptimization());

        if (pContext->type == ShaderType::GEOMETRY_SHADER)
            mpm.add(createRectListOptimizationPass());

        mpm.add(CreateGatingSimilarSamples());

        if (!IGC::ForceAlwaysInline(pContext))
        {
            mpm.add(new PurgeMetaDataUtils());
        }
        // mpm.add(llvm::createDeadCodeEliminationPass()); // this should be done both before/after constant propagation

        if (IGC_IS_FLAG_ENABLED(EnableUnmaskedFunctions) &&
            IGC_IS_FLAG_DISABLED(LateInlineUnmaskedFunc))
        {
            mpm.add(new InlineUnmaskedFunctionsPass());
        }

        if (pContext->m_instrTypes.numOfLoop)
        {
            mpm.add(createDeadPHINodeEliminationPass());
        }


        if (IGC_IS_FLAG_ENABLED(EnableMadLoopSlice)) {
            mpm.add(createMadLoopSlicePass());
        }

        mpm.run(*pContext->getModule());
    } // end scope
    COMPILER_TIME_END(pContext, TIME_OptimizationPasses);

    //pContext->shaderEntry->viewCFG();
    DumpLLVMIR(pContext, "optimized");
    MEM_SNAPSHOT(IGC::SMS_AFTER_OPTIMIZER);
} // OptimizeIR

}  // namespace IGC