File: CoalescingEngine.cpp

package info (click to toggle)
intel-graphics-compiler 1.0.12504.6-1%2Bdeb12u1
links: PTS, VCS
area: main
in suites: bookworm
size: 83,912 kB
sloc: cpp: 910,147; lisp: 202,655; ansic: 15,197; python: 4,025; yacc: 2,241; lex: 1,570; pascal: 244; sh: 104; makefile: 25
file content (1562 lines) | stat: -rw-r--r-- 62,135 bytes
/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

//===-------- CoalescingEngine.cpp - Pass and analysis for coalescing-------===//
//
//  Intel Extention to LLVM core
//===----------------------------------------------------------------------===//
//
// This pass is an implementation of idea of coalescing originated from USC
// but re-designed to work (and take advantage of) on SSA form.
//
//===----------------------------------------------------------------------===//

#include "Compiler/CISACodeGen/CoalescingEngine.hpp"
#include "Compiler/CISACodeGen/DeSSA.hpp"
#include "Compiler/CISACodeGen/ShaderCodeGen.hpp"
#include "Compiler/CISACodeGen/PatternMatchPass.hpp"
#include "Compiler/MetaDataApi/MetaDataApi.h"
#include "Compiler/CISACodeGen/helper.h"
#include "PayloadMapping.hpp"
#include "common/debug/Debug.hpp"
#include "common/debug/Dump.hpp"
#include "common/igc_regkeys.hpp"
#include "common/LLVMWarningsPush.hpp"
#include <llvm/IR/InstVisitor.h>
#include <llvm/ADT/SmallVector.h>
#include "common/LLVMWarningsPop.hpp"
#include "Compiler/IGCPassSupport.h"
#include "Probe/Assertion.h"

using namespace llvm;
using namespace IGC;
using namespace IGC::Debug;
using namespace IGC::IGCMD;

char CoalescingEngine::ID = 0;
#define PASS_FLAG "CoalescingEngine"
#define PASS_DESCRIPTION "coalesce moves coming payloads, insert and extract element"
#define PASS_CFG_ONLY true
#define PASS_ANALYSIS true
IGC_INITIALIZE_PASS_BEGIN(CoalescingEngine, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(WIAnalysis)
IGC_INITIALIZE_PASS_DEPENDENCY(LiveVarsAnalysis)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenPatternMatch)
IGC_INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(DeSSA)
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_END(CoalescingEngine, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)

namespace IGC
{

    CoalescingEngine::CoalescingEngine() : FunctionPass(ID)
    {
        initializeCoalescingEnginePass(*PassRegistry::getPassRegistry());
    }

    void CoalescingEngine::getAnalysisUsage(llvm::AnalysisUsage& AU) const
    {
        AU.setPreservesAll();
        AU.addRequired<llvm::DominatorTreeWrapperPass>();
        AU.addRequired<WIAnalysis>();
        AU.addRequired<LiveVarsAnalysis>();
        AU.addRequired<CodeGenPatternMatch>();
        AU.addRequired<DeSSA>();
        AU.addRequired<MetaDataUtilsWrapper>();
        AU.addRequired<CodeGenContextWrapper>();
    }

    void CoalescingEngine::CCTuple::print(raw_ostream& OS, const Module*) const
    {
        OS << "CC Tuple";

        auto IT = OffsetToCCMap.begin();
        while (IT != OffsetToCCMap.end())
        {
            OS << IT->first << " : ";
            IT++;
        }

    }

    void CoalescingEngine::CCTuple::dump() const {
        print(ods());
    }

    void CoalescingEngine::print(raw_ostream& OS, const Module*) const
    {
        OS << "CC Tuple list: \n";


        for (auto itr = m_CCTupleList.begin(),
            iend = m_CCTupleList.end();
            itr != iend; ++itr)
        {
            CCTuple* ccTuple = *itr;
            ccTuple->print(OS);
        }

        OS << "CC Tuple list end. \n";

    }

    void CoalescingEngine::dump() const {
        print(ods());
    }

    /// \brief Entry point.
    bool CoalescingEngine::runOnFunction(Function& MF) {
        if (IGC_IS_FLAG_ENABLED(DisablePayloadCoalescing))
        {
            return false;
        }

        MetaDataUtils* pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
        m_ModuleMetadata = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
        m_CodeGenContext = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
        if (!isEntryFunc(pMdUtils, &MF))
        {
            return false;
        }

        auto pCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
        m_Platform = pCtx->platform;
        m_PayloadMapping = PayloadMapping(pCtx);

        DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
        WIA = &getAnalysis<WIAnalysis>();
        CG = &getAnalysis<CodeGenPatternMatch>();
        LV = &getAnalysis<LiveVarsAnalysis>().getLiveVars();
        if (IGC_IS_FLAG_DISABLED(DisableDeSSA)) {
            m_DeSSA = &getAnalysis<DeSSA>();
        }
        else {
            m_DeSSA = nullptr;
        }

        for (Function::iterator I = MF.begin(), E = MF.end();
            I != E; ++I) {
            ProcessBlock(&(*I));
        }

        for (df_iterator<DomTreeNode*> DI = df_begin(DT->getRootNode()),
            DE = df_end(DT->getRootNode()); DI != DE; ++DI) {

            IncrementalCoalesce(DI->getBlock());
        }
        return false;
    }

    /// Coalesce instructions within a single-BB
    void CoalescingEngine::IncrementalCoalesce(BasicBlock* MBB)
    {
        std::vector<Instruction*>& DefInstrs = BBProcessingDefs[MBB];
        std::sort(DefInstrs.begin(), DefInstrs.end(), MIIndexCompare(LV));

        for (std::vector<Instruction*>::const_iterator BBI = DefInstrs.begin(),
            BBE = DefInstrs.end(); BBI != BBE; ++BBI)
        {
            Instruction* DefMI = *BBI;

            auto RI = ValueNodeMap.find((Value*)DefMI);
            if (RI == ValueNodeMap.end()) {
                //Intrinsics themselves are not mapped to a node, they are
                //tuple generators.

                //NOTE: Please leave this commented code snippet! It is useful when debugging coalescing regressions:
                //      (used to isolate coalescing to particular shader to check failing/passing using binary search)
                //DWORD hash = (DWORD) static_cast<IGC::CodeGenContext&>(MBB->getParent()->getContext()).hash.getAsmHash();
                ////if (hash >= 0xA27Affff && hash <= 0xA3CFFF00)
                //if (hash >= 0xA3C00000 && hash <= 0xA3CFFFFF)
                //{
                //    continue;
                //}

                if (GenIntrinsicInst * intrinsic = llvm::dyn_cast<llvm::GenIntrinsicInst>(DefMI))
                {
                    GenISAIntrinsic::ID IID = intrinsic->getIntrinsicID();
                    if ((isURBWriteIntrinsic(intrinsic) && !(IGC_IS_FLAG_ENABLED(DisablePayloadCoalescing_URB))) ||
                        (IID == GenISAIntrinsic::GenISA_RTWrite && !(IGC_IS_FLAG_ENABLED(DisablePayloadCoalescing_RT))))
                    {
                        ProcessTuple(DefMI);
                    }
                }

                if (isSampleInstruction(DefMI) && !IGC_IS_FLAG_ENABLED(DisablePayloadCoalescing_Sample))
                {
                    ProcessTuple(DefMI);
                }
                else if (isLdInstruction(DefMI))
                {
                    ProcessTuple(DefMI);
                }
            }
        }
    }

    static bool IsBindlessSampler(Instruction* inst)
    {
        if (SampleIntrinsic * sample = dyn_cast<SampleIntrinsic>(inst))
        {
            Value* sampler = sample->getSamplerValue();
            uint bufferIndex = 0;
            bool directIndexing = false;
            if (sampler->getType()->isPointerTy())
            {
                BufferType bufType = DecodeAS4GFXResource(
                    sampler->getType()->getPointerAddressSpace(), directIndexing, bufferIndex);
                if (bufType == BINDLESS_SAMPLER)
                {
                    return true;
                }
            }
        }
        return false;
    }

    /// Core algorithm.
    void CoalescingEngine::ProcessTuple(Instruction* tupleGeneratingInstruction)
    {
        const uint numOperands = m_PayloadMapping.GetNumPayloadElements(tupleGeneratingInstruction);
        // bindless sampler always have a header
        bool needsHeader = IsBindlessSampler(tupleGeneratingInstruction);

        //There should be an early exit in the case we have split-send available
        //and this instruction is sure to generate two-Element payload.
        if (m_Platform.supportSplitSend() && !needsHeader)
        {
            if (numOperands < 3)
            {
                return;
            }
        }
        else
        {
            if (numOperands < 2)
            {
                return;
            }

        }

        IGC_ASSERT(numOperands >= 2);

        //No result, but has side effects of updating the split mapping.
        DecideSplit(tupleGeneratingInstruction);

        uint numSplits = GetNumSplitParts(tupleGeneratingInstruction);
        for (uint numPart = 0; numPart < numSplits; numPart++)
        {
            SetCurrentPart(tupleGeneratingInstruction, numPart);
            const uint numOperands = GetNumPayloadElements(tupleGeneratingInstruction);
            bool isAnyNodeAnchored = false, isAnyNodeCoalescable = false;
            SmallPtrSet<CCTuple*, 8> touchedTuplesSet;
            SmallVector<CCTuple*, 8> touchedTuples;

            //Step: Prepare.
            PrepareTuple(
                numOperands,
                tupleGeneratingInstruction,
                //out:
                touchedTuplesSet,
                touchedTuples,
                isAnyNodeAnchored,
                isAnyNodeCoalescable);

            if (!isAnyNodeCoalescable) {
                continue;
            }


            //So at this point, there is at least one node that is candidate for coalescing.
            //It might be already a part of CCtuple, or it might be yet unconstrained.

            //Tuple Generating Instruction enforces sequential order of registers allocated to values.
            //Each value might belong to (up to) one CCtuple.
            //If it belongs to CCTuple, it has an unique offset in that CCTuple.
            //Three cases are considered:
            //1) No value have CCtuple assigned -> create new CCTuple and assign offsets.
            //    a) if platform supports split-send, then some (arbitrary) split point should be determined
            //2) At least one value belongs to a CCtuple (we say it is 'anchored' (constrained) to that tuple)
            //   and all values belong to the same CCtuple
            //3) More than one value belong to a tuple, and they belong to different (>1) tuples: NOT implemented


            //Case 1) : create new CCTuple
            if (!isAnyNodeAnchored)
            {
                CreateTuple(numOperands, tupleGeneratingInstruction);
            }
            else {
                //Case 2) : at least one value is 'anchored'(constrained) to a CCtuple
                //Sub-case: All arguments belong to the same tuple.
                if (touchedTuples.size() == 1) {
                    // Step I: find an anchoring element.
                    // If arguments of this tuple-instruction are already constrained by CCtuple, it means
                    // that they also constrain the tuple that would be generated by this payload.
                    // They 'anchor' the new tuple, so pick the element that anchors the tuple and
                    // determine its offset.

                    //E.g. :
                    // ..     v1 (0) v2 (1) CCtuple
                    // v0 (0) v1 (1) v2 (2) (this payload)
                    // v1 will have: thisTupleStartOffset = 1, rootTupleStartOffset = 0
                    // CCtuple will be 'grown' with element v0, that is at relative offset =  -1
                    // v0(-1) v1 (0) v2 (1) CCtuple

                    CCTuple* ccTuple = NULL;
                    int rootTupleStartOffset = 0;
                    int thisTupleStartOffset = 0;

                    DetermineAnchor(
                        numOperands,
                        tupleGeneratingInstruction,
                        //out:
                        ccTuple,
                        rootTupleStartOffset,
                        thisTupleStartOffset);

                    IGC_ASSERT(ccTuple);

                    /* Step II: Interference checking */
                    int offsetDiff = rootTupleStartOffset - thisTupleStartOffset;

                    bool interferes = false;

                    //Heuristic: we do not want to grow the ccTuple size too much
                    // [c0 c1 c2 c3  c4  c5  c6  c7]
                    //(offsetDiff=4)[e0' e1' e2' e3' e4' e5' e6' e7']
                    // here, tuple numElements = 8, and offsetDiff = 4, so it would result in
                    // 12 element CC tuple if joined together. We want to prevent growing
                    // more than MaxTupleSize elements.
                    if (abs(offsetDiff) > 0 && ccTuple->GetNumElements() >= MaxTupleSize)
                    {
                        interferes = true;
                    }

                    ProcessInterferencesElementFunctor interferencesFunctor(
                        false /*forceEviction */,
                        tupleGeneratingInstruction,
                        offsetDiff,
                        ccTuple,
                        this);

                    if (!interferes)
                    {
                        interferes = InterferenceCheck(
                            numOperands,
                            tupleGeneratingInstruction,
                            offsetDiff,
                            ccTuple,
                            //out:
                            &interferencesFunctor);

                        if (!interferes &&
                            (ccTuple->HasNonHomogeneousElements() ||
                                m_PayloadMapping.HasNonHomogeneousPayloadElements(
                                    tupleGeneratingInstruction)
                                )
                            )
                        {
                            interferes = CheckIntersectionForNonHomogeneous(
                                numOperands,
                                tupleGeneratingInstruction,
                                offsetDiff,
                                ccTuple);
                        }
                    }

                    //If 'interferes' is false, it means that whole transaction could be committed,
                    //provided that elements in dominatorsForDisplacement are displaced, and other nodes are attached.
                    //If interferes is true, then no element will be attached to the ccTuple.
                    if (!interferes) {
                        SmallPtrSet<Value*, 8> touchedValuesSet;
                        for (uint i = 0; i < numOperands; i++) {
                            Value* val = GetPayloadElementToValueMapping(tupleGeneratingInstruction, i);

                            ccTuple->CheckIndexForNonInitializedSlot(i + offsetDiff);

                            if (touchedValuesSet.count(val)) {
                                continue;
                            }
                            else {
                                touchedValuesSet.insert(val);
                            }

                            if (IsValConstOrIsolated(val)) {
                                //do nothing
                            }
                            else {
                                Value* RootV = getRegRoot(val);
                                if (!RootV) //bail out if isolated
                                    continue;

                                //We do not expect to get Argument here, it must have been isolated.
                                IGC_ASSERT(dyn_cast<llvm::Instruction>(val));
                                IGC_ASSERT(ValueNodeMap.count(val));
                                //auto RI = ValueNodeMap.find(val);
                                //ElementNode *Node = RI->second;
                                ElementNode* Node = ValueNodeMap[val];

                                if (!interferencesFunctor.GetComputedValuesForIsolation().count(val))
                                {
                                    ccTuple->AttachNodeAtIndex(i + offsetDiff, Node, *this);
                                    IGC_ASSERT(getRegRoot(val));
                                }
                            }
                        }//for
                    }
                    else  // (interferes) is true
                    {
                        CreateTuple(numOperands, tupleGeneratingInstruction);
                    }
                }
                else {
                    //Case 3)
                    //More than one value belong to a tuple, and they belong to different (>1) tuples.
                    //FIXME: not implemented yet
                    //Need to implement code for joining two non-overlapping tuples.
                    //This probably does not bring much benefit, but is nevertheless complex to implement.
                    //Having two congruence class tuples, we need to find whether piecewise congruent classes
                    //of the tuples interfere. Here a 'two-set interference check' due to Boissinot could be
                    //used (first sorting elements)
                }
            }
        }
    }

    ///
    bool CoalescingEngine::CheckIntersectionForNonHomogeneous(
        const uint numOperands,
        Instruction* tupleGeneratingInstruction,
        const int offsetDiff,
        CCTuple* ccTuple)
    {
        //Picturesque description:
        // Hi - homogeneous slots
        // <left_part>[H_0 H_1 .. H_n]<right_part>

        //ccTuple:
        // <left_part> [H_0  H_1  ..  H_n]<right_part>
        //
        //1) this tuple instruction contains non-homogeneous part:
        // <left_part'>[H_0' H_1' .. H_n']<right_part'>
        //2) or this tuple instruction does not contain non-homogeneous part
        //             [H_0' H_1' .. H_n']

        bool interferes = false;
        if (ccTuple->HasNonHomogeneousElements())
        {
            // Finding a supremum instruction for a homogeneous part is implemented
            // only for render target write instructions (RTWritIntrinsic).
            // An only other possible combination for non-homogeneous instructions comprises
            // RTWritIntrinsic and RTDualBlendSourceIntrinsic.
            // In some cases there is an opportunity to coalesce their payloads but there exists
            // a danger that they are compiled in different SIMD modes so there is a safe assumption
            // that they cannot be coalesced.
            // A comparison of the payload of RTWritIntrinsic and RTDualBlendSourceIntrinsic:
            // +----------------------------+----------------------------+
            // | RTWritIntrinsic            | RTDualBlendSourceIntrinsic |
            // +----------------------------+----------------------------+
            // | src0 Alpha (optional)      | src0 Alpha (unavailable)*  |
            // +----------------------------+----------------------------+
            // | output mask (optional)     | output mask (optional)     |
            // +----------------------------+----------------------------+
            // | -                          | R0 channel                 |
            // +----------------------------+----------------------------+
            // | -                          | G0 channel                 |
            // +----------------------------+----------------------------+
            // | -                          | B0 channel                 |
            // +----------------------------+----------------------------+
            // | -                          | A0 channel                 |
            // +----------------------------+----------------------------+
            // | R0 channel                 | R1 channel                 |
            // +----------------------------+----------------------------+
            // | G0 channel                 | G1 channel                 |
            // +----------------------------+----------------------------+
            // | B0 channel                 | B1 channel                 |
            // +----------------------------+----------------------------+
            // | A0 channel                 | A1 channel                 |
            // +----------------------------+----------------------------+
            // | Depth (optional)           | Depth (optional)           |
            // +----------------------------+----------------------------+
            // | Stencil (optional)         | Stencil (optional)         |
            // +----------------------------+----------------------------+
            // All optional fields except for src0 Alpha are consistent for both instruction called in a fragment shader.
            // * RTDualBlendSourceIntrinsic doesn't have such an argument but it is defined in its payload.
            if (offsetDiff == 0 &&
                ccTuple->GetNumElements() == numOperands &&
                llvm::isa<llvm::RTWritIntrinsic>(ccTuple->GetRoot()) &&
                llvm::isa<llvm::RTWritIntrinsic>(tupleGeneratingInstruction))
            {
                if (m_PayloadMapping.HasNonHomogeneousPayloadElements(tupleGeneratingInstruction))
                {
                    //TODO: test for 'supremum' of TGI and ccTuple root element can be obtained
                    const Instruction* supremum = m_PayloadMapping.GetSupremumOfNonHomogeneousPart(
                        tupleGeneratingInstruction,
                        ccTuple->GetRoot());
                    if (supremum)
                    {
                    }
                    else
                    {
                        interferes = true;
                    }
                }
            }
            else
            {
                //Since ccTuple contains non-homogeneous, and this homogeneous
                //part is not perfectly aligned to ccTuple, we cannot reason about
                //interferences safely, thus assume interferes.
                interferes = true;
            }
        }
        else
        {
            //ccTuple:
            //             [H_0  H_1  ..  H_n]
            //
            //1) this tuple instruction contains non-homogeneous part:
            // <left_part'>[H_0' H_1' .. H_n']<right_part'>

            if (m_PayloadMapping.HasNonHomogeneousPayloadElements(tupleGeneratingInstruction))
            {
                if (offsetDiff == 0 && ccTuple->GetNumElements() == numOperands)
                {
                    ccTuple->SetHasNonHomogeneousElements(tupleGeneratingInstruction);
                }
                else
                {
                    interferes = true;
                }
            }
        }

        return interferes;
    }

    void CoalescingEngine::DecideSplit(Instruction* tupleGeneratingInstruction)
    {
        if (m_PayloadMapping.DoPeelFirstElement(tupleGeneratingInstruction))
        {
            splitPoint_[tupleGeneratingInstruction] = 1;
            return;
        }

        uint splitPoint = 0;
        const uint numOperands = m_PayloadMapping.GetNumPayloadElements(tupleGeneratingInstruction);
        //No sense entering here if we have less than 2 operands.
        IGC_ASSERT(numOperands >= 2);
        Value* val = m_PayloadMapping.GetPayloadElementToValueMapping(tupleGeneratingInstruction, 0);
        if (IsValConstOrIsolated(val) || !getRegRoot(val))
        {
            splitPoint = 1;
        }

        val = m_PayloadMapping.GetPayloadElementToValueMapping(tupleGeneratingInstruction, numOperands - 1);
        if (IsValConstOrIsolated(val) || !getRegRoot(val))
        {
            splitPoint = numOperands - 1;
        }

        if (splitPoint > 0 && m_PayloadMapping.DoesAllowSplit(tupleGeneratingInstruction))
        {
            splitPoint_[tupleGeneratingInstruction] = splitPoint;
        }

    }


    /// Determines if any value that is an argument of (possible) tuple generating instruction
    /// can participate in payload coalescing. If so, it also determines if there are any
    /// 'anchored'(constrained) values and the number of constraining tuples.
    void CoalescingEngine::PrepareTuple(
        const uint numOperands,
        Instruction* tupleGeneratingInstruction,
        SmallPtrSet<CCTuple*, 8> & touchedTuplesSet,
        SmallVector<CCTuple*, 8> & touchedTuples,
        bool& isAnyNodeAnchored,
        bool& isAnyNodeCoalescable)
    {
        uint numConstants = 0;
        //Step: Prepare.
        for (uint i = 0; i < numOperands; i++)
        {
            Value* val = GetPayloadElementToValueMapping(tupleGeneratingInstruction, i);

            if (IsValConstOrIsolated(val) || !getRegRoot(val))
            {
                numConstants++;
                continue;
            }

            Value* RootV = getRegRoot(val);
            IGC_ASSERT(RootV);
            {
                isAnyNodeCoalescable = true;

                IGC_ASSERT(ValueNodeMap.count(RootV));
                auto RI = ValueNodeMap.find(RootV);
                ElementNode* Node = RI->second;

                auto CCI = NodeCCTupleMap.find(Node);
                if (CCI != NodeCCTupleMap.end())
                {
                    CCTuple* ccTuple = NodeCCTupleMap[Node];
                    if (!touchedTuplesSet.count(ccTuple))
                    {
                        touchedTuplesSet.insert(ccTuple);
                        touchedTuples.push_back(ccTuple);
                    }

                    isAnyNodeAnchored = true;
                }

                // Do not coalesce values having excessive number of uses.
                // This otfen introduces many anti-dependencies.
                const unsigned MAX_USE_COUNT = 20;
                if (val->hasNUsesOrMore(MAX_USE_COUNT))
                {
                    isAnyNodeAnchored = true;
                }
            }
        }

        //heuristic:
        //we do not want to create 'wide' root variables, where most of the slots
        //will be anyway non-coalesced (due to immediate constants/isolated variables)
        //vISA will anyway not benefit, while this increases pressure and makes coloring
        //with round-robin heuristic harder
        if (numOperands > 2 && numConstants > (numOperands / 2))
        {
            if (!m_PayloadMapping.HasNonHomogeneousPayloadElements(tupleGeneratingInstruction))
            {
                //force non-coalescing:
                isAnyNodeCoalescable = false;
            }
        }
    }

    /// Creates fresh tuple.
    /// Fresh tuple consists of nodes which do not have any tuple assigned yet.
    /// Precondition is that at least one value is coalescable (otherwise does not make sense).
    void CoalescingEngine::CreateTuple(
        const uint numOperands,
        Instruction* tupleGeneratingInstruction)
    {
        CCTuple* ccTuple = new CCTuple();
        m_CCTupleList.push_back(ccTuple);

        for (uint i = 0; i < numOperands; i++) {
            Value* val = GetPayloadElementToValueMapping(tupleGeneratingInstruction, i);

            if (IsValConstOrIsolated(val))
            {
                //It is important to initialize CC with empty slots, in order to get the proper
                //size of the vISA variable that will back CCtuple that corresponds to payload.
                ccTuple->ResizeBounds(i);
                continue;
            }

            Value* RootV = getRegRoot(val);
            if (!RootV) { //isolated
                ccTuple->ResizeBounds(i);
                continue;
            }

            IGC_ASSERT(ValueNodeMap.count(RootV));
            ElementNode* RootNode = ValueNodeMap[RootV];

            auto CCTI = NodeCCTupleMap.find(RootNode);
            if (CCTI == NodeCCTupleMap.end())
            {
                NodeCCTupleMap[RootNode] = ccTuple;
                NodeOffsetMap[RootNode] = i;
                ccTuple->InitializeIndexWithCCRoot(i, RootNode);
                CurrentDominatingParent[RootV] = RootV;
                ImmediateDominatingParent[RootV] = NULL;
            }
            else
            {
                //This must have been a value that is copied to payload multiple times.
                //OR: The new CCtuple has been isolated, and this is the element that
                //belongs to other tuple.
                //Since this value belongs to another tuple, we should not increase the
                //current tuple's size, no?
                //ccTuple->ResizeBounds(i);
            }
        } // loop over arguments

        if (m_PayloadMapping.HasNonHomogeneousPayloadElements(tupleGeneratingInstruction))
        {
            ccTuple->SetHasNonHomogeneousElements(tupleGeneratingInstruction);
        }
    }

    /// Given a tuple generating instruction, picks out an 'anchored'(constrained)
    /// value and determines the relative offset
    /// output:
    /// ccTuple              - a tuple that is constraining a picked value
    /// thisTupleStartOffset - the offset of the value in the current tuple generating
    ///                        instruction
    /// rootTupleStartOffset - the offset of the value in constraining tuple
    /// Assumption: it is already checked all values belong to exactly one tuple!
    /// (otherwise the result might pick an arbitrary tuple)
    void CoalescingEngine::DetermineAnchor(
        const uint numOperands,
        const Instruction* tupleGeneratingInstruction,
        CCTuple*& ccTuple,
        int& rootTupleStartOffset,
        int& thisTupleStartOffset)
    {
        for (uint i = 0; i < numOperands; i++) {
            Value* val = GetPayloadElementToValueMapping(tupleGeneratingInstruction, i);

            if (IsValConstOrIsolated(val)) {
                continue;
            }
            Value* RootV = getRegRoot(val);
            if (!RootV)
                continue;

            IGC_ASSERT(ValueNodeMap.count(RootV));
            ElementNode* RootNode = ValueNodeMap[RootV];

            auto CCTI = NodeCCTupleMap.find(RootNode);
            if (CCTI != NodeCCTupleMap.end()) {
                //Element is constrained by ccTuple.
                IGC_ASSERT(NodeOffsetMap.count(RootNode));
                rootTupleStartOffset = NodeOffsetMap[RootNode];
                thisTupleStartOffset = i;
                ccTuple = NodeCCTupleMap[RootNode];
                //Just a sanity check.. :
                IGC_ASSERT(RootNode == ccTuple->GetCCNodeWithIndex(NodeOffsetMap[RootNode]));
                break;
            }

        }//for
    }

    /// Prepares the slot for moves (e.g. induced by const, isolated, uniform to vector)
    /// into payload by evicting any value that is currently occupying this slot.
    /// Assumption is that heuristic has determined it is profitable to do so.
    /// input: bool evictFullCongruenceClass - tells whether the full congruence class
    /// should be evicted, or only the element that occupies the tuple instruction
    /// for the lifetime of a 'mov' (this is sufficient for issuing a mov to the payload slot).
    void CoalescingEngine::PrepareInsertionSlot(
        CCTuple* ccTuple,
        const int index,
        Instruction* inst,
        const bool evictFullCongruenceClass)
    {
        ElementNode* RootNode = ccTuple->OffsetToCCMap[index];
        IGC_ASSERT(RootNode);

        if (evictFullCongruenceClass) {
            Value* NewParent = GetActualDominatingParent(RootNode->value, inst);

            while (NewParent) {
                if (getRegRoot(NewParent)) {
                    isolateReg(NewParent);
                }
                NewParent = ImmediateDominatingParent[NewParent];
            }

            //it might turn out, that rootNode does not dominate 'inst'
            //since it is in another branch of DT
            //do not forget to delete it as well
            if (getRegRoot(RootNode->value)) {
                isolateReg(RootNode->value);
            }

        }
        else {
            //Evict dominating parent from CC.
            Value* NewParent = GetActualDominatingParent(RootNode->value, inst);
            if (NewParent)
                isolateReg(NewParent);
        }
    }

    bool CoalescingEngine::IsInsertionSlotAvailable(
        CCTuple* ccTuple,
        const int index,
        Instruction* tupleGeneratingInstruction,
        const bool canExtend)
    {
        bool avaiable = true;

        //ccTuple->OffsetToCCMap.count(index) == 0
        if (!canExtend)
        {
            const int tupleNumElements = ccTuple->GetNumElements();

            if (!(0 <= index && index < tupleNumElements))
            {
                return false;
            }
        }

        ElementNode* nodeToCheck = ccTuple->GetCCNodeWithIndex(index);

        if (nodeToCheck == NULL) {
            //tuple   :  X       CC1   CC2
            //payload :  <slot>   v1        v2
            //Means: Tuple has no live interval (X) that is live at the point the payload is
            //defined. It means, that a given 'register slot' can be used to issue a 'mov'
            //with an immediate (or isolated) value.
        }
        else {
            //tuple   :  CC0       CC1   CC2
            // ....       | <- lifetime
            //payload :  <slot>    v1       v2
            // .....      | <- lifetime
            //Here we need to check whether the value in CC is live
            //and intersects with the desired <slot> .

            //Sanity check: tuple contains the root element.

            //IGC_ASSERT(getRegRoot(nodeToCheck->value) == nodeToCheck->value);
            Value* RootV = nodeToCheck->value;
            Value* NewParent = GetActualDominatingParent(RootV, tupleGeneratingInstruction);
            //FIXME: what if we do not have pure ssa form? Look at de-ssa comment.
            if (NewParent && LV->isLiveAt(NewParent, tupleGeneratingInstruction)) {
                avaiable = false;
                //interferes = true;
                //break;
            }
        }

        return avaiable;
    }

    /// Processes the tuple elements in sequential order.
    /// A function object is passed as an argument, which responds to
    /// specific events (whether given payload elements is const/isolated/copied etc.)
    /// It is meant to be used as a template method in various context where different
    /// functors could be passed, but the walking mechanism is the same.
    void CoalescingEngine::ProcessElements(const uint numOperands,
        Instruction* tupleInst,
        const int offsetDiff,
        CCTuple* ccTuple,
        ElementFunctor* functor)
    {
        SmallPtrSet<Value*, 8> touchedValuesSet;

        for (uint i = 0; i < numOperands; i++) {
            functor->SetIndex(i);
            Value* val = GetPayloadElementToValueMapping(tupleInst, i);
            if (touchedValuesSet.count(val)) {
                if (functor->visitCopy())
                    continue;
                else
                    break;
            }
            else {
                touchedValuesSet.insert(val);
            }
            //Case 1: Constant
            if (llvm::isa<llvm::Constant>(val)) {
                if (functor->visitConstant())
                    continue;
                else
                    break;
            }
            else //Case 2: Variable
            {
                if (IsValIsolated(val) || !getRegRoot(val))
                {
                    //If value is isolated, a copy <slot> will be necessary.
                    if (functor->visitIsolated())
                        continue;
                    else
                        break;
                }

                IGC_ASSERT(ValueNodeMap.count(val));
                ElementNode* Node = ValueNodeMap[val];
                ElementNode* ccRootNode = ccTuple->GetCCNodeWithIndex(i + offsetDiff);

                //Interferes if and only if: live at this definition point AND has different value
                if (ccRootNode == Node) {
                    //No interference, since it is the same value. We have got a reuse of the same value in
                    //different tuple.
                    if (functor->visitAnchored())
                        continue;
                    else
                        break;
                }
                else
                {
                    IGC_ASSERT(llvm::isa<llvm::Instruction>(val));
                    //Here comes the meat, and the most interesting case:
                    if (ccRootNode != NULL)
                    {
                        Value* CCRootV = ccRootNode->value;
                        Value* dominating = nullptr;
                        Value* dominated = nullptr;

                        SymmetricInterferenceTest(
                            CCRootV, dyn_cast<llvm::Instruction>(val),
                            dominating,
                            dominated);

                        //Case 1):
                        //   (dominating=null)
                        //
                        //!   (value)
                        //
                        //    CC dominance tree:
                        //!   v67 <- (dominated)
                        //    ^
                        //    |
                        //!   v190
                        //
                        //!   (tupleInst)
                        // Value has no dominator inside CC class, but dominates elements in CC class
                        // and its lifetime spans until (but not including) tupleInst.
                        if (dominating == nullptr && dominated != nullptr)
                        {
                            //IGC_ASSERT(LV->isLiveAt(val, tupleInst));
                            //Interference check: value is live at dominated
                            if (functor->visitInterfering(val, false))
                                continue;
                            else
                                break;
                        }
                        else if (dominating != nullptr && dominated != nullptr)
                        {
                            //Case 2):
                            // CC dominance tree
                            //!  v67
                            //    ^
                            //    |
                            //!  v121 (dominating)
                            //    ^
                            //!   |  <-----(value)
                            //    |
                            //!  v190 <- (dominated)
                            //TODO: it would be possible to 'fit' v181 between v121 and v190 in some
                            //      cases, but this would require redesign of the 'dominance forest' logic
                            //      (regarding current dominating and immediate dominator handling)
                            //      For now, just assume there is an interference.
                            if (functor->visitInterfering(val, false))
                                continue;
                            else
                                break;
                        }
                        else if (dominating != nullptr && dominated == nullptr)
                        {
                            //Case 3):
                            // CC dominance tree
                            //!  v121
                            //    ^
                            //    |
                            //!  v190 <- (dominating)
                            //    |
                            //!   |    ----- (value)
                            //    |
                            //    | (dominated == null)
                            if (LV->isLiveAt(dominating, dyn_cast<llvm::Instruction>(val)))
                            {
                                if (functor->visitInterfering(val, false))
                                    continue;
                                else
                                    break;
                            }
                        }
                        else
                        {
                            //It is possible that the ccRootNode is not empty, but all
                            //elements that belong to it have been isolated in previously
                            //dominating tuples.
                            //IGC_ASSERT(0);
                        }
                    }
                    else
                    {
                        //ccRootNode == NULL -> congruence class not occupied yet
                        if (functor->visitPackedNonInterfering(val))
                            continue;
                        else
                            break;
                    }
                } //if( ccRootNode == Node )
            } //if( llvm::isa<llvm::Constant>( val ) )
        } //for loop
    }

    /// Here we have two major differences with the implementation in de-ssa:
    /// 1) Transactional character: even if one 'element' of CC-tuple does not interfere,
    /// some other element in CC-tuple might be interfering, rendering the whole group
    /// non-'coalescable'.
    /// 2) Values in payload coalescing are by default isolated, so if there is an interference,
    /// do nothing, no need for explicit isolation.
    bool CoalescingEngine::InterferenceCheck(
        const uint numOperands,
        Instruction* tupleInst,
        const int offsetDiff,
        CCTuple* ccTuple,
        ProcessInterferencesElementFunctor* interferencesFunctor)
    {
        SmallPtrSet<Value*, 8> touchedValuesSet;
        GatherWeightElementFunctor gatherFunctor;
        ProcessElements(numOperands, tupleInst, offsetDiff, ccTuple, &gatherFunctor);
        bool forceEviction =
            (gatherFunctor.GetNumInsertionSlotsRequired() + gatherFunctor.GetNumNeedsDisplacement() <=
                gatherFunctor.GetNumAlignedAnchors());


        interferencesFunctor->SetForceEviction(forceEviction);

        ProcessElements(numOperands, tupleInst, offsetDiff, ccTuple, interferencesFunctor);

        return interferencesFunctor->IsInterfering();
    }

    /// Given an instruction and numOperands (corresponding to the number of coalesce-partaking
    /// payload elements), return whether any 'value' that is an argument of an intrinsic
    /// takes part in the coalescing CC tuple. If yes, return non-zero value that is
    /// the representative tuple for this coalescing.
    /// output: zeroBasedPayloadElementOffset - an offset of this instructions payload
    /// relative to the containing ccTuple - this is necessary to correctly slice the
    /// the relevant part of the payload from ccTuple (e.g. for preparing a payload for URB writes)
    ///     offset [payload]
    ///     [ccTuple ...... ]
    /// note: obviously, if return value is null, then the meaning of zeroBasedPayloadElementOffset
    /// is undefined
    CoalescingEngine::CCTuple* CoalescingEngine::IsAnyValueCoalescedInCCTuple(
        llvm::Instruction* inst,
        const uint numOperands,
        int& zeroBasedPayloadElementOffset,
        Value*& outVal)
    {
        outVal = nullptr;
        zeroBasedPayloadElementOffset = 0; //provide some meaningful default
        CoalescingEngine::CCTuple* ccTuple = nullptr;
        uint index = 0;
        while (index < numOperands)
        {
            Value* val = GetPayloadElementToValueMapping(inst, index);

            if (!isa<Constant>(val) && GetValueCCTupleMapping(val))
            {
                ccTuple = GetValueCCTupleMapping(val);

                Value* valRoot = getRegRoot(val);
                IGC_ASSERT(valRoot);
                ElementNode* Node = ValueNodeMap[valRoot];
                int offset = NodeOffsetMap[Node];
                zeroBasedPayloadElementOffset = (offset - ccTuple->GetLeftBound()) - index;
                outVal = val;
                return ccTuple;
            }
            index++;
        }
        return ccTuple;
    }

    /// Return true if payload tuple might be completed (either by directly
    /// aliasing or by inserting an appropriate copy instruction).
    bool CoalescingEngine::IsPayloadCovered(
        llvm::Instruction* inst,
        CCTuple* ccTuple,
        const uint numOperands,
        const int payloadToCCTupleRelativeOffset)
    {
        uint relativeOffset = 0;
        bool payloadCovered = false;

        if (ccTuple)
        {
            SmallPtrSet<Value*, 8> touchedValuesSet;

            //index = 0;
            payloadCovered = true;
            for (uint index = 0; index < numOperands; index++)
            {
                Value* val = GetPayloadElementToValueMapping(inst, index);
                const int ccIndex = index + payloadToCCTupleRelativeOffset;

                if (touchedValuesSet.count(val)) {
                    if (!IsInsertionSlotAvailable(ccTuple, ccIndex, inst, false)) {
                        payloadCovered = false;
                        break;
                    }
                    continue;
                }
                else {
                    touchedValuesSet.insert(val);
                }

                if (IsValConstOrIsolated(val)) {
                    if (!IsInsertionSlotAvailable(ccTuple, ccIndex, inst, false)) {
                        payloadCovered = false;
                        break;
                    }
                }
                else {
                    if (CoalescingEngine::CCTuple * thisCCTuple = GetValueCCTupleMapping(val)) {
                        if (thisCCTuple == ccTuple) {
                            relativeOffset = GetValueOffsetInCCTuple(val);
                            if (relativeOffset != (ccIndex)) {
                                payloadCovered = false;
                                break;
                            }
                        }
                        else {
                            //different cc tuple
                            payloadCovered = false;
                            break;
                        }

                    }
                    else {
                        if (!IsInsertionSlotAvailable(ccTuple, ccIndex, inst, false)) {
                            payloadCovered = false;
                            break;
                        }
                    }
                }//if constant
            }//while
        } //if cctuple

        return payloadCovered;
    }


    /// Uses the tuple information provided by coalescing engine in order
    /// to generate optimized sequence of 'movs' for preparing a payload.
    CVariable* CoalescingEngine::PrepareExplicitPayload(
        CShader* outProgram,
        CEncoder* encoder,
        SIMDMode simdMode,
        const DataLayout* pDL,
        llvm::Instruction* inst,
        int& payloadOffsetInBytes)
    {
        SetCurrentPart(inst, 0);
        const uint numOperands = m_PayloadMapping.GetNumPayloadElements(inst);
        const uint grfSize = outProgram->GetContext()->platform.getGRFSize();
        const uint numSubVarsPerOperand = numLanes(simdMode) / (grfSize / 4);
        IGC_ASSERT(numSubVarsPerOperand == 1 || numSubVarsPerOperand == 2);
        CoalescingEngine::CCTuple* ccTuple = nullptr;
        int payloadToCCTupleRelativeOffset = 0;
        Value* dummyValPtr = nullptr;
        ccTuple = IsAnyValueCoalescedInCCTuple(inst, numOperands, payloadToCCTupleRelativeOffset, dummyValPtr);
        bool payloadCovered = IsPayloadCovered(inst, ccTuple, numOperands, payloadToCCTupleRelativeOffset);
        bool payloadOffsetComputed = false;
        payloadOffsetInBytes = 0;
        CVariable* payload = nullptr;

        if (payloadToCCTupleRelativeOffset < 0)
        {
            payloadCovered = false;
        }

        //Check for EOT payload.
        if (payloadCovered) {
            if (IGCLLVM::TerminatorInst * terminator = inst->getParent()->getTerminator())
            {
                if (terminator != &(*terminator->getParent()->begin()))
                {
                    IGC_ASSERT(dyn_cast<GenIntrinsicInst>(inst));
                    IGC_ASSERT(terminator->getPrevNode());
                    if (terminator->getPrevNode() == inst)
                    {
                        //heuristic:
                        //this inst is an EOT node. Be very careful about payload size:
                        //coalescing: hard constraint
                        if (ccTuple->GetNumElements() > MaxTupleSize)
                        {
                            payloadCovered = false;
                        }
                    }
                }
            }
        }

        if (payloadCovered) {
            SmallPtrSet<Value*, 8> touchedValuesSet;

            for (uint index = 0; index < numOperands; index++)
            {
                Value* val = m_PayloadMapping.GetPayloadElementToValueMapping(inst, index);

                CVariable* dataElement = outProgram->GetSymbol(val);
                //FIXME: need to do additional checks for size
                uint subVar = payloadToCCTupleRelativeOffset + index * numSubVarsPerOperand;
                encoder->SetDstSubVar(subVar);
                payload = outProgram->GetCCTupleToVariableMapping(ccTuple);

                if (touchedValuesSet.count(val)) {
                    encoder->Copy(payload, dataElement);
                    encoder->Push();
                    continue;
                }
                else {
                    touchedValuesSet.insert(val);
                }

                if (IsValConstOrIsolated(val)) {
                    encoder->Copy(payload, dataElement);
                    encoder->Push();
                }
                else
                {
                    if (GetValueCCTupleMapping(val))
                    {
                        if (!payloadOffsetComputed)
                        {
                            payloadOffsetComputed = true;

                            payloadOffsetInBytes = payloadToCCTupleRelativeOffset *
                                GetSingleElementWidth(simdMode, pDL, val);
                        }
                    }
                    else
                    {
                        //this one actually encompasses the case for !getRegRoot(val)
                        encoder->Copy(payload, dataElement);
                        encoder->Push();
                    }
                }//if constant
            }//for

        }
        else
        {
            payload = outProgram->GetNewVariable(numOperands * numLanes(simdMode), ISA_TYPE_F,
                outProgram->GetContext()->platform.getGRFSize() == 64 ? EALIGN_32WORD : EALIGN_HWORD,
                "CEExplicitPayload");

            for (uint i = 0; i < numOperands; i++)
            {
                Value* val = m_PayloadMapping.GetPayloadElementToValueMapping(inst, i);
                CVariable* data = outProgram->GetSymbol(val);
                encoder->SetDstSubVar(i * numSubVarsPerOperand);
                encoder->Copy(payload, data);
                encoder->Push();
            }
        }

        return payload;
    }

    // Prepares payload for the uniform URB Write messages that are used in
    // mesh and task shader stages.
    // On DG2 platform a uniform URB write is a simd1 message that writes data
    // from the fist DWORDs of each GRF of payload, e.g.:
    //   (W)     mov (1|M0)               r16.0<1>:ud   0x0:ud
    //   (W)     mov (1|M0)               r17.0<1>:f   -1.0:f
    //   (W)     mov (1|M0)               r18.0<1>:f    0.0:f
    //   (W)     mov (1|M0)               r19.0<1>:f    1.0:f
    //   (W)     send.urb (1|M0)          null     r2   r16:4  0x0  0x020827F7
    CVariable* CoalescingEngine::PrepareUniformUrbWritePayload(
        CShader* shader,
        CEncoder* encoder,
        llvm::GenIntrinsicInst* inst)
    {
        IGC_ASSERT(inst->getIntrinsicID() == GenISAIntrinsic::GenISA_URBWrite);
        CVariable* payload = nullptr;
        const uint numOperands = m_PayloadMapping.GetNumPayloadElements(inst);
        const uint grfSize = shader->GetContext()->platform.getGRFSize();
        {
            // Payload spans multiple GRFs, only the first DWORD of each GRF
            // will be populated with (uniform) data.
            const uint elemsPerGrf = grfSize / CVariable::GetCISADataTypeSize(ISA_TYPE_F);
            payload = shader->GetNewVariable(
                numOperands * elemsPerGrf, ISA_TYPE_F,
                (grfSize == 64 ? EALIGN_32WORD : EALIGN_HWORD),
                "CEExplicitPayload");
        }
        for (uint i = 0; i < numOperands; i++)
        {
            Value* val = m_PayloadMapping.GetPayloadElementToValueMapping(inst, i);
            CVariable* data = shader->GetSymbol(val);
            IGC_ASSERT(isUniform(val) && data->IsUniform());
            encoder->SetNoMask();
            encoder->SetSimdSize(SIMDMode::SIMD1);
            {
                encoder->SetDstSubVar(i);
            }
            encoder->Copy(payload, data);
        }
        return payload;
    }

    // Returns URB write payload for a split simd8 message in a simd16 mesh/task
    // shader.
    // TODO: This method does not use the CoalescingEngine currently and mearly
    // copies payload to a newly created variable.
    CVariable* CoalescingEngine::PrepareSplitUrbWritePayload(
        CShader* outProgram,
        CEncoder* encoder,
        SIMDMode  simdMode, ///< must be SIMD16
        uint32_t  splitPartNo, ///< can be either 0 - lower quarter, or 1 upper quarter
        llvm::Instruction* inst)
    {
        SetCurrentPart(inst, 0);
        IGC_ASSERT(outProgram->m_SIMDSize == SIMDMode::SIMD16);
        IGC_ASSERT(splitPartNo < 2);
        const GenIntrinsicInst* const intrinsicInst = dyn_cast<GenIntrinsicInst>(inst);
        IGC_ASSERT(nullptr != intrinsicInst);
        IGC_ASSERT(intrinsicInst->getIntrinsicID() == GenISAIntrinsic::GenISA_URBWrite);

        const uint numOperands = m_PayloadMapping.GetNumPayloadElements(inst);
        CVariable* payload =
            outProgram->GetNewVariable(numOperands * numLanes(SIMDMode::SIMD8), ISA_TYPE_F, outProgram->GetContext()->platform.getGRFSize() == 64 ? EALIGN_32WORD : EALIGN_HWORD, CName::NONE);

        for (uint i = 0; i < numOperands; i++)
        {
            Value* val = m_PayloadMapping.GetPayloadElementToValueMapping(inst, i);
            CVariable* data = outProgram->GetSymbol(val);
            encoder->SetSimdSize(SIMDMode::SIMD8);
            encoder->SetMask((splitPartNo == 0) ? EMASK_Q1 : EMASK_Q2);
            encoder->SetSrcSubVar(0, data->IsUniform() ? 0 : splitPartNo);
            encoder->SetDstSubVar(i);
            encoder->Copy(payload, data);
            encoder->Push();
        }
        return payload;
    }

    CoalescingEngine::ElementNode*
        CoalescingEngine::ElementNode::getLeader() {
        ElementNode* N = this;
        ElementNode* Parent = parent.getPointer();
        ElementNode* Grandparent = Parent->parent.getPointer();

        while (Parent != Grandparent) {
            N->parent.setPointer(Grandparent);
            N = Grandparent;
            Parent = Parent->parent.getPointer();
            Grandparent = Parent->parent.getPointer();
        }

        return Parent;
    }

    Value* CoalescingEngine::getRegRoot(Value* Val) const {
        auto RI = ValueNodeMap.find(Val);
        if (RI == ValueNodeMap.end())
            return nullptr;
        ElementNode* Node = RI->second;
        if (Node->parent.getInt() & ElementNode::kRegisterIsolatedFlag)
            return 0x0;
        return Node->getLeader()->value;
    }

    Value* CoalescingEngine::getPHIRoot(Instruction* PHI) const {
        IGC_ASSERT(dyn_cast<PHINode>(PHI));
        auto RI = ValueNodeMap.find(PHI);
        IGC_ASSERT(RI != ValueNodeMap.end());
        ElementNode* DestNode = RI->second;
        if (DestNode->parent.getInt() & ElementNode::kPHIIsolatedFlag)
            return 0x0;
        for (unsigned i = 0; i < PHI->getNumOperands(); i++) {
            Value* SrcVal = PHI->getOperand(i);
            if (isa<Instruction>(SrcVal)) {  // skip constant source
                Value* SrcRoot = getRegRoot(SrcVal);
                if (SrcRoot)
                    return SrcRoot;
            }
        }
        return 0x0;
    }



    void CoalescingEngine::unionRegs(Value* Val1, Value* Val2) {
        ElementNode* Node1 = ValueNodeMap[Val1]->getLeader();
        ElementNode* Node2 = ValueNodeMap[Val2]->getLeader();

        if (Node1->rank > Node2->rank) {
            Node2->parent.setPointer(Node1->getLeader());
        }
        else if (Node1->rank < Node2->rank) {
            Node1->parent.setPointer(Node2->getLeader());
        }
        else if (Node1 != Node2) {
            Node2->parent.setPointer(Node1->getLeader());
            Node1->rank++;
        }
    }

    // For now, return true if V is dessa-aliased/InsEltMap-ed/phi-coalesced.
    bool CoalescingEngine::isCoalescedByDeSSA(Value* V) const
    {
        if (m_DeSSA && m_DeSSA->getRootValue(V))
            return true;
        return false;
    }

    void CoalescingEngine::ProcessBlock(llvm::BasicBlock* bb)
    {
        llvm::BasicBlock::InstListType& instructionList = bb->getInstList();
        llvm::BasicBlock::InstListType::iterator I, E;

        //Loop through instructions top to bottom
        for (I = instructionList.begin(), E = instructionList.end(); I != E; ++I) {
            llvm::Instruction& inst = (*I);
            visit(inst);
        }
    }

    bool CoalescingEngine::MatchSingleInstruction(llvm::GenIntrinsicInst* inst)
    {
        GenISAIntrinsic::ID IID = inst->getIntrinsicID();
        if (isSampleInstruction(inst) ||
            isLdInstruction(inst) ||
            isURBWriteIntrinsic(inst) ||
            IID == GenISAIntrinsic::GenISA_RTWrite)
        {
            uint numOperands = inst->getNumOperands();
            for (uint i = 0; i < numOperands; i++)
            {
                Value* val = inst->getOperand(i);
                if (llvm::isa<llvm::Constant>(val))
                {
                    continue;
                }

                if (!ValueNodeMap.count(val))
                {
                    Instruction* DefMI = dyn_cast<Instruction>(val);
                    if (DefMI)
                    {
                        BBProcessingDefs[DefMI->getParent()].push_back(DefMI);
                    }
                    ValueNodeMap[val] = new (Allocator) ElementNode(val);
                }
            }

            //Add itself to processing stack as well.
            BBProcessingDefs[inst->getParent()].push_back(inst);
        }

        return true;
    }

    void CoalescingEngine::visitCallInst(llvm::CallInst& I)
    {
        //we do not care if we do not match
        //bool match = true;
        if (GenIntrinsicInst * CI = llvm::dyn_cast<GenIntrinsicInst>(&I))
        {
            switch (CI->getIntrinsicID())
            {
            case GenISAIntrinsic::GenISA_ROUNDNE:
            case GenISAIntrinsic::GenISA_imulH:
            case GenISAIntrinsic::GenISA_umulH:
            case GenISAIntrinsic::GenISA_uaddc:
            case GenISAIntrinsic::GenISA_usubb:
            case GenISAIntrinsic::GenISA_bfrev:
                break;
            case GenISAIntrinsic::GenISA_intatomicraw:
            case GenISAIntrinsic::GenISA_floatatomicraw:
            case GenISAIntrinsic::GenISA_icmpxchgatomicraw:
            case GenISAIntrinsic::GenISA_fcmpxchgatomicraw:
            case GenISAIntrinsic::GenISA_dwordatomicstructured:
            case GenISAIntrinsic::GenISA_floatatomicstructured:
            case GenISAIntrinsic::GenISA_cmpxchgatomicstructured:
            case GenISAIntrinsic::GenISA_fcmpxchgatomicstructured:
            case GenISAIntrinsic::GenISA_intatomictyped:
            case GenISAIntrinsic::GenISA_icmpxchgatomictyped:
            case GenISAIntrinsic::GenISA_ldstructured:
            case GenISAIntrinsic::GenISA_atomiccounterinc:
            case GenISAIntrinsic::GenISA_atomiccounterpredec:
                break;
            case GenISAIntrinsic::GenISA_GradientX:
            case GenISAIntrinsic::GenISA_GradientY:
            case GenISAIntrinsic::GenISA_GradientXfine:
            case GenISAIntrinsic::GenISA_GradientYfine:
                break;
            default:
                MatchSingleInstruction(CI);
                // no pattern for the rest of the intrinsics
                break;
            }
        }
        else if (IntrinsicInst * CI = llvm::dyn_cast<IntrinsicInst>(&I))
        {
            switch (CI->getIntrinsicID())
            {
            case Intrinsic::sqrt:
            case Intrinsic::log2:
            case Intrinsic::cos:
            case Intrinsic::sin:
            case Intrinsic::pow:
            case Intrinsic::floor:
            case Intrinsic::ceil:
            case Intrinsic::trunc:
            case Intrinsic::maxnum:
            case Intrinsic::minnum:
                break;
            case Intrinsic::exp2:
                break;

            default:
                break;
            }
        }
    }

    void CoalescingEngine::visitCastInst(llvm::CastInst& I)
    {

    }
    void CoalescingEngine::visitBinaryOperator(llvm::BinaryOperator& I)
    {

    }

    void CoalescingEngine::visitCmpInst(llvm::CmpInst& I)
    {

    }

    void CoalescingEngine::visitPHINode(llvm::PHINode& I)
    {

    }

    void CoalescingEngine::visitUnaryInstruction(llvm::UnaryInstruction& I)
    {

    }

    void CoalescingEngine::visitSelectInst(llvm::SelectInst& I)
    {

    }

    void CoalescingEngine::visitBitCastInst(llvm::BitCastInst& I)
    {

    }

    void CoalescingEngine::visitInstruction(llvm::Instruction& I)
    {

    }

    void CoalescingEngine::visitLoadInst(LoadInst& I)
    {

    }

    void CoalescingEngine::visitStoreInst(StoreInst& I)
    {

    }

} //namespace IGC