File: GenXLowering.cpp

package info (click to toggle)
intel-graphics-compiler 1.0.17791.18-1
links: PTS, VCS
area: main
in suites: sid
size: 102,312 kB
sloc: cpp: 935,343; lisp: 286,143; ansic: 16,196; python: 3,279; yacc: 2,487; lex: 1,642; pascal: 300; sh: 174; makefile: 27
file content (5307 lines) | stat: -rw-r--r-- 196,228 bytes
/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2024 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

//
/// GenXLowering
/// ------------
///
/// GenXLowering is a function pass that lowers certain LLVM IR instructions
/// that the rest of the GenX backend cannot deal with, and implements some
/// peephole optimizations.
///
/// It also performs a few other tasks:
///
/// 1. It implements add sinking for a variable index in a region/element
///    access. This ensures that, in a sequence of operations to calculate a
///    variable index for a region/element access, any add constant is sunk to
///    the end, such that it can become a constant offset in an indirect
///    operand, and give GenXAddressCommoning more chance to common up address
///    calculations.
///
/// 2. It splits struct values where possible, by splitting all struct phi nodes
///    before running the main pass, then removing an extractvalue by using the
///    corresponding insertvalue's input instead. Any struct value used as an
///    arg or return value still remains, and needs to be dealt with by register
///    allocation.
///
/// 3. It widens some byte vector operations to short vector.
///
///    Gen has restrictions on byte operands. The jitter copes with that, but
///    sometimes it needs to do even-odd splitting, which can lead to suboptimal
///    code if cmps and predicates are involved.
///    Here we attempt to pick up the common cases by converting a byte
///    operation to short.
///
///    Note that we might end up with the extends being baled into the
///    instruction anyway, resulting in a byte operation in vISA.
///
/// 4. Certain uses of shufflevector are lowered:
///
///    a. a splat (copy of one element across a vector);
///    b. a boolean slice (extract of a subvector) becomes rdpredregion;
///    c. a boolean unslice (insert subvector) becomes wrpredregion.
///    d. non-boolean shufflevector is lowered to sequence of rd/wrregions
///
///    The only case of shufflevector allowed is shufflevector of
///    predicate and undef with replicated mask.
///
/// 5. A Trunc is lowered to a bitcast then a region/element read with a stride.
///    GenXCoalescing will coalesce the bitcast, and possibly bale in the region
///    read, so this will hopefully save an instruction or two.
///
/// 6. Certain floating point comparison instructions are lowered.
///
/// **IR restriction**: LLVM IR instructions not supported after this pass:
///
/// * insertelement
/// * extractelement
/// * trunc
/// * zext/sext/uitofp from (vector of) i1
/// * select on vector of i1
/// * ``llvm.uadd.with.overflow`` (the other
///   overflowing arithmetic intrinsics are not allowed by the GenX backend
///   anyway.)
/// * ``llvm.genx.*imad``
///
///
/// **IR restriction**: rdpredregion intrinsic (which is generated by this pass
/// from certain cases of shufflevector, and represents a use of part of a
/// predicate) can only be used in select, wrregion, wrpredpredregion.
///
/// **IR restriction**: wrpredregion intrinsic (which is generated by this pass
/// from certain cases of shufflevector, and represents the write of part of a
/// predicate) must have a compare as its "new value" input.
///
/// **IR restriction**: No phi node of struct type after this pass. This is only
/// a general rule; subsequent passes have been known to reintroduce them so
/// GenXLiveness has another go at splitting them up.
///
//===----------------------------------------------------------------------===//

#include "GenX.h"
#include "GenXGotoJoin.h"
#include "GenXIntrinsics.h"
#include "GenXModule.h"
#include "GenXSubtarget.h"
#include "GenXTargetMachine.h"
#include "GenXUtil.h"
#include "GenXVisa.h"

#include "IGC/common/debug/DebugMacros.hpp"
#include "visa_igc_common_header.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#include "llvmWrapper/IR/Constants.h"
#include "llvmWrapper/IR/DerivedTypes.h"
#include "llvmWrapper/IR/Instructions.h"
#include "llvmWrapper/Support/TypeSize.h"

#include "vc/Support/GenXDiagnostic.h"
#include "vc/Utils/GenX/GlobalVariable.h"

#include "Probe/Assertion.h"
#include <algorithm>
#include <iterator>
#include <numeric>

#define DEBUG_TYPE "genx-lowering"

using namespace llvm;
using namespace genx;

// Hidden command-line switch "-enable-genx-byte-widening"; on by default.
// NOTE(review): presumably gates the byte-to-short widening described in the
// file header (item 3); the option is not read in this part of the file, so
// confirm against its users.
static cl::opt<bool>
    EnableGenXByteWidening("enable-genx-byte-widening", cl::init(true),
                           cl::Hidden, cl::desc("Enable GenX byte widening."));
namespace {

// Diagnostic for lowering problems.
//
// Carries a human-readable description of the failing instruction together
// with the source location (file, line, column) taken from the instruction's
// debug location when one is present.
class DiagnosticInfoLowering : public DiagnosticInfo {
private:
  // Full diagnostic text; built once in the constructor.
  std::string Description;
  // Source file name; left empty when the instruction has no debug location.
  StringRef Filename;
  // Source line; 0 when the instruction has no debug location.
  unsigned Line;
  // Source column; 0 when the instruction has no debug location.
  unsigned Col;

  // Diagnostic kind shared by all instances of this class.
  static const int KindID;

  static int getKindID() { return KindID; }

public:
  // Inst is the instruction that could not be lowered, Desc explains why.
  DiagnosticInfoLowering(const Instruction *Inst, const Twine &Desc,
                         DiagnosticSeverity Severity = DS_Error);
  // Prints "<file>:<line>[:<col>]: <description>" to DP.
  void print(DiagnosticPrinter &DP) const override;
  // LLVM-style RTTI support: matches diagnostics whose kind is KindID.
  static bool classof(const DiagnosticInfo *DI) {
    return DI->getKind() == getKindID();
  }
};

// Kind identifier of DiagnosticInfoLowering, taken from LLVM's allocator of
// plugin diagnostic kinds when this static member is initialized.
const int DiagnosticInfoLowering::KindID =
    llvm::getNextAvailablePluginDiagnosticKind();

// Build a lowering diagnostic for Inst.
//
// The source location is copied from the instruction's debug location when it
// has one; otherwise the file name stays empty and line/column stay 0. The
// description consists of a fixed prefix, the printed instruction and the
// caller-supplied Desc.
DiagnosticInfoLowering::DiagnosticInfoLowering(const Instruction *Inst,
                                               const Twine &Desc,
                                               DiagnosticSeverity Severity)
    : DiagnosticInfo(getKindID(), Severity), Line(0), Col(0) {
  const DebugLoc &Loc = Inst->getDebugLoc();
  if (Loc) {
    Filename = Loc->getFilename();
    Line = Loc.getLine();
    Col = Loc.getCol();
  }

  std::string InstText;
  vc::printToString(InstText, *Inst);

  Description = "GenXLowering failed for instruction ";
  Description += InstText;
  Description += ": ";
  Description += Desc.str();
}

// Print the diagnostic as "<file>:<line>[:<col>]: <description>".
// "<unknown>" stands in for a missing file name, and the column part is
// omitted when the column is 0.
void DiagnosticInfoLowering::print(DiagnosticPrinter &DP) const {
  std::string Where =
      Filename.empty() ? std::string("<unknown>") : Filename.str();
  Where += ":";
  Where += std::to_string(Line);
  if (Col) {
    Where += ":";
    Where += std::to_string(Col);
  }
  Where += ": ";
  DP << Where << Description;
}

// GenXLowering : function pass that lowers LLVM IR instructions the rest of
// the GenX backend cannot deal with (see the file header for the details).
class GenXLowering : public FunctionPass {
  // Dominator tree; set in runOnFunction, and null when the analysis is not
  // available there.
  DominatorTree *DT = nullptr;
  // Subtarget of the GenX target machine; set at the start of runOnFunction.
  const GenXSubtarget *ST = nullptr;
  // Instructions queued for deletion; erased at the end of runOnFunction.
  SmallVector<Instruction *, 8> ToErase;
  // Data layout of the module that owns the function being processed.
  const DataLayout *DL = nullptr;

public:
  static char ID;
  explicit GenXLowering() : FunctionPass(ID), DT(nullptr) {}
  StringRef getPassName() const override { return "GenX lowering"; }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnFunction(Function &F) override;

private:
  // Lowering helpers. Most of their definitions are outside this part of the
  // file.
  // NOTE(review): presumably each one returns true when it changed the IR --
  // confirm against the definitions.
  bool widenSIMD8GatherScatter(CallInst *CI, unsigned IID);
  bool lowerLSCTyped2DBlock(CallInst *CI, unsigned IID);
  bool lowerMediaWalkerAPIs(CallInst *CI, unsigned IID);
  bool translateSLMOWord(CallInst *CI, unsigned IID);
  bool splitGatherScatter(CallInst *CI, unsigned IID);
  bool processTwoAddressOpnd(CallInst *CI);
  bool processInst(Instruction *Inst);
  bool lowerAbs(CallInst *CI);
  bool lowerAllAny(CallInst *CI);
  bool lowerRdRegion(Instruction *Inst);
  bool lowerWrRegion(Instruction *Inst);
  bool lowerRdPredRegion(CallInst *Inst);
  bool lowerWrPredRegion(Instruction *Inst);
  bool lowerInsertElement(Instruction *Inst);
  bool lowerExtractElement(Instruction *Inst);
  Value *scaleInsertExtractElementIndex(Value *IdxVal, Type *ElTy,
                                        Instruction *InsertBefore);
  bool lowerTrunc(Instruction *Inst);
  bool lowerCast(Instruction *Inst);
  bool lowerBoolScalarSelect(SelectInst *SI);
  bool lowerBoolVectorSelect(SelectInst *SI);
  bool lowerBoolShuffle(ShuffleVectorInst *Inst);
  bool lowerBoolShuffleReplicatedSlice(ShuffleVectorInst *Inst);
  bool lowerBoolSplat(ShuffleVectorInst *SI, Value *In, unsigned Idx);
  bool lowerShuffle(ShuffleVectorInst *Inst);
  void lowerShuffleSplat(ShuffleVectorInst *SI,
                         ShuffleVectorAnalyzer::SplatInfo Splat);
  bool lowerShuffleToSelect(ShuffleVectorInst *Inst);
  void lowerShuffleToMove(ShuffleVectorInst *SI);
  bool lowerShr(Instruction *Inst);
  bool lowerExtractValue(ExtractValueInst *Inst);
  bool lowerInsertValue(InsertValueInst *Inst);
  bool lowerUAddWithOverflow(CallInst *CI);
  bool lowerUAddWithSat(CallInst *CI);
  bool lowerUSubWithSat(CallInst *CI);
  bool lowerCtpop(CallInst *CI);
  bool lowerFCmpInst(FCmpInst *Inst);
  bool lowerCttz(CallInst *Inst);
  bool lowerCtlz(CallInst *Inst);
  bool lowerUnorderedFCmpInst(FCmpInst *Inst);
  bool lowerSqrt(CallInst *CI);
  bool widenByteOp(Instruction *Inst);
  bool lowerGenXMul(CallInst *CI, unsigned IntrinsicID);
  bool lowerGenXMulSat(CallInst *CI, unsigned IntrinsicID);
  bool lowerGenXIMad(CallInst *CI, unsigned IntrinsicID);
  bool lowerMul64(Instruction *Inst);
  bool lowerLzd(Instruction *Inst);
  bool lowerTrap(CallInst *CI);
  bool lowerDebugTrap(CallInst *CI);
  bool lowerFMulAdd(CallInst *CI);
  bool lowerPowI(CallInst *CI);
  bool lowerAddcSubb(CallInst *CI, unsigned IntrinsicID);
  bool lower64Bitreverse(CallInst *CI);
  bool lowerBitreverse(CallInst *CI);
  bool lowerFunnelShift(CallInst *CI, unsigned IntrinsicID);
  bool lowerMathIntrinsic(CallInst *CI, GenXIntrinsic::ID GenXID,
                          bool IsHalfAllowed = false);
  bool lowerFastMathIntrinsic(CallInst *CI, GenXIntrinsic::ID GenXID);
  bool lowerStackSave(CallInst *CI);
  bool lowerStackRestore(CallInst *CI);
  bool lowerHardwareThreadID(CallInst *CI);
  bool lowerLogicalThreadID(CallInst *CI);
  bool lowerNamedBarrierArrive(CallInst *CI);
  bool lowerDpas(CallInst *CI);

  Value *swapLowHighHalves(IRBuilder<> &Builder, Value *Arg) const;
  bool lowerByteSwap(CallInst *CI);

  // Reduction lowering: a generic worker parameterized by the combining
  // operation, plus entry points selecting it by binary opcode or intrinsic.
  template <typename BuilderOp>
  bool lowerReduction(CallInst *CI, Value *Src, Value *Start,
                      BuilderOp Builder);

  bool lowerReduction(CallInst *CI, Instruction::BinaryOps Opcode);
  bool lowerReduction(CallInst *CI, Intrinsic::ID);

  bool lowerCopySign(CallInst *CI);

  bool generatePredicatedWrrForNewLoad(CallInst *CI);
};

} // end namespace

// Pass identification object; handed to the FunctionPass constructor by
// GenXLowering's own constructor.
char GenXLowering::ID = 0;
// Declaration of the pass initializer called from createGenXLoweringPass().
// NOTE(review): its definition is expected to come from the INITIALIZE_PASS_*
// macros below -- standard LLVM legacy pass registration; confirm.
namespace llvm {
void initializeGenXLoweringPass(PassRegistry &);
}
INITIALIZE_PASS_BEGIN(GenXLowering, "GenXLowering", "GenXLowering", false,
                      false)
INITIALIZE_PASS_END(GenXLowering, "GenXLowering", "GenXLowering", false, false)

// Factory for the pass: runs the pass initializer against the global pass
// registry, then returns a newly allocated GenXLowering instance.
FunctionPass *llvm::createGenXLoweringPass() {
  PassRegistry &Registry = *PassRegistry::getPassRegistry();
  initializeGenXLoweringPass(Registry);
  return new GenXLowering();
}

// Declare the analyses this pass depends on and keeps valid.
// TargetPassConfig is required because runOnFunction obtains the subtarget
// through it. The dominator tree is only marked as preserved, not required:
// runOnFunction picks it up only if it happens to be available.
void GenXLowering::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.addPreserved<DominatorTreeWrapperPass>();
  AU.addPreserved<LoopInfoWrapperPass>();
  AU.addPreserved<GenXModule>();
}

/***********************************************************************
 * GenXLowering::runOnFunction : lower the instructions of one function
 *    as required by the GenX backend.
 *
 * Basic blocks are visited in post-order of a depth first traversal of
 * the CFG, and the instructions of each block are visited from last to
 * first, so that a def is seen after its uses (ignoring phi node uses).
 * This helps peephole optimizations, which generally want to be
 * approached from the top down. For example, add sinking in the index
 * of an indirect region/element wants to see the trunc before the trunc
 * is lowered to a bitcast and an element access.
 *
 * Instructions queued in ToErase during lowering are erased at the end.
 * The function always reports the IR as modified.
 */
bool GenXLowering::runOnFunction(Function &F) {
  LLVM_DEBUG(dbgs() << "GenXLowering started\n");

  // The dominator tree is optional: it is used only when it is available.
  DT = nullptr;
  if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
    DT = &DTWP->getDomTree();
  auto &TM = getAnalysis<TargetPassConfig>().getTM<GenXTargetMachine>();
  ST = &TM.getGenXSubtarget();
  DL = &F.getParent()->getDataLayout();

  // Struct-typed phi nodes are split before any instruction is lowered.
  splitStructPhis(&F);

  // Snapshot the visiting order up front, because lowering can split a basic
  // block while we are iterating.
  SmallVector<BasicBlock *, 8> Worklist;
  for (BasicBlock *BB : post_order(&F.getEntryBlock()))
    Worklist.push_back(BB);

  for (BasicBlock *BB : Worklist) {
    // Walk the block backwards. Since the predecessor is looked up only after
    // the current instruction has been processed, anything inserted before it
    // gets processed as well.
    Instruction *Cur = BB->getTerminator();
    for (;;) {
      processInst(Cur);
      BasicBlock *Owner = Cur->getParent();
      if (Cur != &Owner->front()) {
        Cur = Cur->getPrevNode();
        continue;
      }
      if (Owner == BB)
        break;
      // We are at the start of a block other than BB, so lowering must have
      // split a block. Continue from the end of the preceding block.
      Cur = Owner->getPrevNode()->getTerminator();
    }
  }

  // Now it is safe to delete everything that was queued for erasure.
  for (Instruction *Dead : ToErase)
    Dead->eraseFromParent();
  ToErase.clear();
  return true;
}

// Optimize the two address operand of CI, if it has one.
//
// An instruction with a two address operand should be predicated. If the
// predicate is a constant all-true value (scalar or splat), the old value is
// completely overwritten, so it is replaced with undef, which allows more
// optimizations to kick in.
//
// Only the first constant argument with 1-bit scalar type is examined.
// Returns true if the old value operand was replaced.
bool GenXLowering::processTwoAddressOpnd(CallInst *CI) {
  auto OpNum = getTwoAddressOperandNum(CI);
  // Nothing to do without a two address operand; write regions, whose
  // operand number is 0, are skipped as well.
  if (!OpNum || *OpNum == 0)
    return false;

  Type *OldValTy = CI->getArgOperand(*OpNum)->getType();
  IGC_ASSERT_MESSAGE(OldValTy == CI->getType(),
                     "two address op type out of sync");

  for (unsigned Idx = 0, NumArgs = IGCLLVM::getNumArgOperands(CI);
       Idx < NumArgs; ++Idx) {
    auto *PredC = dyn_cast<Constant>(CI->getArgOperand(Idx));
    if (!PredC || PredC->getType()->getScalarSizeInBits() != 1)
      continue;

    // Found the predicate candidate: check whether it is all true.
    if (PredC->getType()->isVectorTy())
      PredC = PredC->getSplatValue();
    if (!PredC || !PredC->isOneValue())
      return false;

    CI->setOperand(*OpNum, UndefValue::get(OldValTy));
    return true;
  }

  return false;
}

// Returns true if Inst is one of the new load intrinsics, i.e. the
// gather*_scaled2 family, which has neither predicate nor old value arguments.
static bool isNewLoadInst(CallInst *Inst) {
  const unsigned IID = GenXIntrinsic::getGenXIntrinsicID(Inst);
  return IID == GenXIntrinsic::genx_gather4_scaled2 ||
         IID == GenXIntrinsic::genx_gather_scaled2 ||
         IID == GenXIntrinsic::genx_gather4_masked_scaled2 ||
         IID == GenXIntrinsic::genx_gather_masked_scaled2;
}

// Returns the wrregion that is the one and only user of the load Inst,
// or nullptr if the load has several uses or its user is not a wrregion.
static CallInst *getLoadWrregion(CallInst *Inst) {
  IGC_ASSERT_MESSAGE(isNewLoadInst(Inst), "Expected new load intrinsics");
  if (!Inst->hasOneUse())
    return nullptr;

  if (auto *User = dyn_cast<CallInst>(Inst->user_back()))
    if (GenXIntrinsic::isWrRegion(User))
      return User;
  return nullptr;
}

// Returns the select that is the one and only user of the load Inst and takes
// the load as its true value; returns nullptr otherwise.
// TODO: maybe just lower every select to wrregion in lowerSelect?
// TODO: handle inverted selects (load used as the false value). The mask
// would need to be regenerated in that case.
static SelectInst *getLoadSelect(CallInst *Inst) {
  IGC_ASSERT_MESSAGE(isNewLoadInst(Inst), "Expected new load intrinsics");
  if (!Inst->hasOneUse())
    return nullptr;

  auto *Sel = dyn_cast<SelectInst>(Inst->user_back());
  const bool LoadIsTrueValue = Sel && Sel->getTrueValue() == Inst;
  return LoadIsTrueValue ? Sel : nullptr;
}

// Generate the predicate for a wrregion of a split load.
//
// OldPred      - predicate of the original (unsplit) operation.
// Offset       - index of the first predicate element used by this split.
// Width        - number of elements in the generated predicate.
// NumChannels  - number of channels; the same Width / NumChannels predicate
//                elements are repeated for every channel.
// InsertBefore - insertion point for the generated shufflevector.
// DL           - debug location for the generated shufflevector.
// Name         - name for the generated shufflevector.
//
// Returns OldPred itself if it is a scalar constant, a scalar i1 true if it is
// an all-ones constant, and otherwise a new shufflevector selecting the
// required elements.
static Value *generatePredicateForLoadWrregion(
    Value *OldPred, unsigned Offset, unsigned Width, unsigned NumChannels,
    Instruction *InsertBefore, const DebugLoc &DL, const Twine &Name) {

  // Constant predicates need no instructions at all.
  if (isa<ConstantInt>(OldPred))
    return OldPred;
  if (auto *ConstPred = dyn_cast<Constant>(OldPred))
    if (ConstPred->isAllOnesValue())
      return ConstantInt::get(Type::getInt1Ty(OldPred->getContext()), 1);

  // If the old predicate is itself a rdpredregion or a shufflevector, read
  // straight from its source (adjusting the offset) to avoid reading the
  // predicate twice.
  Value *Source = OldPred;
  if (GenXIntrinsic::getGenXIntrinsicID(OldPred) ==
      GenXIntrinsic::genx_rdpredregion) {
    auto *RdPred = cast<CallInst>(OldPred);
    Offset += cast<ConstantInt>(RdPred->getArgOperand(1))->getZExtValue();
    Source = RdPred->getArgOperand(0);
  } else if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(OldPred)) {
    Offset += ShuffleVectorAnalyzer::getReplicatedSliceDescriptor(Shuffle)
                  .InitialOffset;
    Source = Shuffle->getOperand(0);
  }

  // Build the shuffle mask: indices Offset, Offset + 1, ... repeated once per
  // channel.
  SmallVector<Constant *, 16> MaskElts(Width);
  IGC_ASSERT(NumChannels);
  const unsigned ChannelWidth = Width / NumChannels;
  Type *I32Ty = IntegerType::getInt32Ty(Source->getContext());
  for (unsigned Ch = 0; Ch < NumChannels; ++Ch)
    for (unsigned Lane = 0; Lane < ChannelWidth; ++Lane)
      MaskElts[ChannelWidth * Ch + Lane] =
          ConstantInt::get(I32Ty, Offset + Lane);
  Constant *ShuffleMask = ConstantVector::get(MaskElts);

  Value *UndefSrc = UndefValue::get(Source->getType());
  auto *NewPred =
      new ShuffleVectorInst(Source, UndefSrc, ShuffleMask, Name, InsertBefore);
  NewPred->setDebugLoc(DL);
  return NewPred;
}

// Create a wrregion that writes NewVal into OldVal at byte offset Offset under
// predicate Pred. The region shape is derived from the type of NewVal, and the
// new instruction is inserted before InsertBefore using its debug location.
static Value *generatePredicatedWrregion(Value *OldVal, Value *NewVal,
                                         Value *Pred, unsigned Offset,
                                         Instruction *InsertBefore,
                                         const llvm::Twine &Name = "") {
  Region WriteR(NewVal->getType());
  WriteR.Mask = Pred;
  WriteR.Offset = Offset;

  const DebugLoc &Loc = InsertBefore->getDebugLoc();
  return WriteR.createWrRegion(OldVal, NewVal, Name, InsertBefore, Loc);
}

// Generate a partial write for the result of one piece of a split 1-channel
// load instruction.
// Initially we could have the following sequence for an illegal load (using
// gather_scaled as an example):
//   res = gather_scaled <32>
//   mask = rdpredregion <32> pred, offset
//   newV = wrregion <32> oldV, res, wroffset, mask
// After splitting we want to get as little extra code as possible.
// To achieve this we generate the following pattern:
// bale {
//   res1 = gather_scaled <16>
//   mask1 = rdpredregion <16> pred, offset
//   partialV = wrregion <16> oldV, res1, mask1
// }
// bale {
//   res2 = gather_scaled <16>
//   mask2 = rdpredregion <16> pred, offset + 16
//   newV = wrregion <16> partialV, res2, wroffset + 16 * elemsize, mask2
// }
// Bale markers show how this will be baled later.
//
// Target            - value the piece is written into.
// InitialOffset     - byte offset of the whole load result inside Target.
// Load              - the split load whose result is written.
// OldPred           - predicate of the original operation.
// AccumulatedOffset - number of elements covered by the preceding pieces.
// InsertBefore      - insertion point for the generated instructions.
static Value *generate1ChannelWrrregion(Value *Target, unsigned InitialOffset,
                                        CallInst *Load, Value *OldPred,
                                        unsigned AccumulatedOffset,
                                        Instruction *InsertBefore) {
  const DebugLoc &Loc = Load->getDebugLoc();
  Type *LoadTy = Load->getType();
  const unsigned NumElts =
      cast<IGCLLVM::FixedVectorType>(LoadTy)->getNumElements();
  const unsigned EltBytes = LoadTy->getScalarSizeInBits() / 8;

  Value *SplitPred = generatePredicateForLoadWrregion(
      OldPred, AccumulatedOffset, NumElts, 1, InsertBefore, Loc,
      "load1.pred.split");

  Region JoinR(LoadTy);
  JoinR.Mask = SplitPred;
  // The element offset of this piece is converted to bytes.
  JoinR.Offset = InitialOffset + AccumulatedOffset * EltBytes;
  return JoinR.createWrRegion(Target, Load, "load1.join", InsertBefore, Loc);
}

// Generate partial writes for the result of one piece of a split N-channel
// load.
// For channelled loads we also need to shuffle the results of the split
// instructions in order to write them back to the destination in the expected
// order. Temporary splits should always be predicated in case of atomics,
// because the latter load and store at the same time.
// Example for gather4_scaled (with two channels enabled). Before:
//   res = gather4_scaled <32> RG
//   mask = rdpredregion <64> pred, offset ; mask is replicated across channels
//   newV = wrregion <64> oldV, res, wroffset, mask
// After:
// bale {
//   res1temp = gather4_scaled <16> RG ; create temporary (unnecessary in
//                                     ; case of non-atomics)
//   splitmask1 = rdpredregion <32> pred, offset ; replicated
//   res1 = wrregion <32> undef, res1temp, 0, splitmask1
// }
// bale {
//   res1R = rdregion <16> res1, 0
//   mask1R = rdpredregion <16> pred, offset ; same for all channels
//   partialVR = wrregion <16> oldV, res1R, wroffset, mask1R
// }
// bale {
//   res1G = rdregion <16> res1, 16 * elemsize
//   mask1G = rdpredregion <16> pred, offset
//   partialV = wrregion <16> partialVR, res1G, wroffset + 32 * elemsize, mask1G
// }
// bale {
//   res2temp = gather4_scaled <16> RG ; second temporary
//   splitmask2 = rdpredregion <32> pred, offset + 16
//   res2 = wrregion <32> undef, res2temp, 0, splitmask2
// }
// bale {
//   res2R = rdregion <16> res2, 0
//   mask2R = rdpredregion <16> pred, offset + 16
//   newVR = wrregion <16> partialV, res2R, wroffset + 16 * elemsize, mask2R
// }
// bale {
//   res2G = rdregion <16> res2, 16 * elemsize
//   mask2G = rdpredregion <16> pred, offset + 16
//   newV = wrregion <16> newVR, res2G, wroffset + 48 * elemsize, mask2G
// }
// As can be seen, splitting of channelled loads is quite expensive.
// We should hope that later passes (like region collapsing) can optimize it
// by analyzing how the resulting value was assembled.
//
// Target        - value the channels are written into.
// InitialOffset - byte offset of the whole load result inside Target.
// Load          - the split load whose result is written.
// OldPred       - predicate of the original operation.
// SplitNum      - index of this piece among the splits.
// NumSplits     - total number of splits.
// NumChannels   - number of channels returned by the load.
// InsertBefore  - insertion point for the generated instructions.
static Value *generateNChannelWrregion(Value *Target, unsigned InitialOffset,
                                       CallInst *Load, Value *OldPred,
                                       unsigned SplitNum, unsigned NumSplits,
                                       unsigned NumChannels,
                                       Instruction *InsertBefore) {
  const DebugLoc &Loc = Load->getDebugLoc();
  Type *LoadTy = Load->getType();
  const unsigned NumElts =
      cast<IGCLLVM::FixedVectorType>(LoadTy)->getNumElements();
  IGC_ASSERT(NumChannels);
  const unsigned LanesPerChannel = NumElts / NumChannels;
  const unsigned PredOffset = LanesPerChannel * SplitNum;

  // Write the load into a predicated temporary first.
  Value *SplitPred = generatePredicateForLoadWrregion(
      OldPred, PredOffset, NumElts, NumChannels, InsertBefore, Loc,
      "loadN.pred.split");
  Region TempR(LoadTy);
  TempR.Mask = SplitPred;
  Value *Temp = TempR.createWrRegion(UndefValue::get(LoadTy), Load,
                                     "loadN.split", InsertBefore, Loc);

  // Then move the temporary into the target one channel at a time.
  const unsigned EltBytes = LoadTy->getScalarSizeInBits() / 8;
  Type *ChannelTy =
      IGCLLVM::FixedVectorType::get(LoadTy->getScalarType(), LanesPerChannel);
  Region ReadR(ChannelTy);
  Region WriteR(ChannelTy);
  for (unsigned Ch = 0; Ch != NumChannels; ++Ch) {
    ReadR.Offset = LanesPerChannel * Ch * EltBytes;
    Value *ChannelVal = ReadR.createRdRegion(Temp, "loadN.channel.read.join",
                                             InsertBefore, Loc);
    WriteR.Mask = generatePredicateForLoadWrregion(
        OldPred, PredOffset, LanesPerChannel, 1, InsertBefore, Loc,
        "loadN.channel.pred.join");
    // Within the target, a channel occupies LanesPerChannel * NumSplits
    // elements, and this piece sits PredOffset elements into it.
    WriteR.Offset =
        InitialOffset +
        (PredOffset + LanesPerChannel * NumSplits * Ch) * EltBytes;
    Target = WriteR.createWrRegion(Target, ChannelVal, "loadN.channel.join",
                                   InsertBefore, Loc);
  }
  return Target;
}

// Try to infer the predicate that must be used when splitting Load:
// - if the load's single user is a wrregion, return that wrregion's
//   predicate;
// - if the load's single user is a select taking the load as its true value,
//   return the select's condition;
// - otherwise fall back to the load's own mask operand, or to a scalar i1 true
//   when the load has no mask or the mask is an all-ones constant.
//
// When a wrregion or select user is found, its predicate is returned because
// exactly this predicate must be used during splitting.
Value *getPredicateForLoadSplitting(CallInst *Load) {
  IGC_ASSERT(Load);
  Value *OwnMask = getMaskOperand(Load);
  CallInst *WrrUser = getLoadWrregion(Load);
  SelectInst *SelUser = getLoadSelect(Load);

  // A suitable user dictates the predicate for the split instructions.
  if (WrrUser)
    return WrrUser->getArgOperand(
        GenXIntrinsic::GenXRegion::PredicateOperandNum);
  if (SelUser)
    return SelUser->getCondition();

  // No suitable user to infer the mask from. Without a mask, or with a
  // constant all-ones mask, the load is not predicated.
  auto *ConstMask = dyn_cast_or_null<Constant>(OwnMask);
  if (!OwnMask || (ConstMask && ConstMask->isAllOnesValue()))
    return ConstantInt::get(IntegerType::getInt1Ty(Load->getContext()), 1);

  return OwnMask;
}

// Get the target for the wrregions of a split load.
// Returns a tuple consisting of:
//  1. Target for the wrregions
//  2. Predicate
//  3. Initial offset inside the target
//  4. Instruction to replace later
//
// The target is the old value of the load's wrregion user (when that wrregion
// is OK to be a raw operand), the false value of the load's select user, or
// undef when neither applies.
static std::tuple<Value *, Value *, unsigned, Instruction *>
getLoadTarget(CallInst *Load, const GenXSubtarget *ST) {
  Value *Pred = getPredicateForLoadSplitting(Load);
  CallInst *WrrUser = getLoadWrregion(Load);
  SelectInst *SelUser = getLoadSelect(Load);

  // If the wrregion can be represented as a raw operand, its target and
  // offset can be reused.
  if (WrrUser &&
      genx::isValueRegionOKForRaw(WrrUser, true /* IsWrite */, ST)) {
    // TODO: mark wrregion to be erased once issue with ToErase and
    // iteration order will be resolved.
    Value *OldVal =
        WrrUser->getArgOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum);
    Value *Index =
        WrrUser->getArgOperand(GenXIntrinsic::GenXRegion::WrIndexOperandNum);
    unsigned StartOffset = cast<ConstantInt>(Index)->getZExtValue();
    return {OldVal, Pred, StartOffset, WrrUser};
  }

  if (!WrrUser && SelUser) {
    Value *FalseVal = SelUser->getFalseValue();
    return {FalseVal, Pred, 0, SelUser};
  }

  // No reusable target: start from undef and replace the load itself.
  Value *Fresh = UndefValue::get(Load->getType());
  return {Fresh, Pred, 0, Load};
}

// Decompose an instruction width into a sequence of legal widths.
// InstWidth is the width of an instruction, must not be 0.
// InitWidth is the max legal width for that instruction, must be a power of 2.
// Currently, a width is considered legal if it is a power of 2 that is less
// than or equal to InitWidth. The result lists the widths in non-increasing
// order, and they sum up to InstWidth.
static std::vector<unsigned> calculateLegalWidths(unsigned InstWidth,
                                                  unsigned InitWidth) {
  IGC_ASSERT_MESSAGE(InstWidth, "InstWidth cannot be 0");
  IGC_ASSERT_MESSAGE(isPowerOf2_32(InitWidth), "InitWidth must be power of 2");
  std::vector<unsigned> Result;
  unsigned Remaining = InstWidth;
  unsigned Chunk = InitWidth;
  while (Remaining != 0) {
    if (Remaining < Chunk) {
      // TODO: some intrinsics support very limited exec size range. Legal exec
      // sizes must be calculated more accurately.
      Chunk /= 2;
      continue;
    }
    Result.push_back(Chunk);
    Remaining -= Chunk;
  }

  return Result;
}

/***********************************************************************
 * splitGatherScatter : lower gather/scatter/atomic to the width supported
 * by the hardware platform.
 *
 * This performs two functions:
 *
 * 1. If the operation is wider than what hardware can support, splits it
 *    into the legal width.
 *    gather4/scatter4_* are split into instructions of the same width, whereas
 *    the other intrinsics supported by this function are split more flexibly,
 *    into a mix of legal widths.
 *
 * 2. For typed gather4/scatter4, when r or both v and r are zero, replace
 *    with undef so that they are not encoded in the vISA instruction and the
 *    message skips them.
 */
bool GenXLowering::splitGatherScatter(CallInst *CI, unsigned IID) {
  // Operand positions of gather4_typed/scatter4_typed (used in the switch
  // below). NONEED is the sentinel meaning "this intrinsic has no such
  // operand".
  enum {
    MASK_IDX = 0,
    PRED_IDX = 1,
    SURF_IDX = 2,
    U_IDX = 3,
    DATA_IDX = 6,
    NONEED = 11
  };

  // Per-intrinsic operand layout, filled in by the switch below.
  unsigned MaskIdx = NONEED;      // channel mask operand
  unsigned PredIdx = NONEED;      // predicate operand
  unsigned AddrIdx = NONEED;      // (first) address operand
  unsigned DataIdx = NONEED;      // data / old-value operand
  unsigned AtomicSrcIdx = NONEED; // first atomic source operand
  bool IsTyped = false;
  int AtomicNumSrc = (-1); // -1 means not-an-atomic
  unsigned NumVectorElements = 1;

  // Describe the operand layout of every supported intrinsic; anything else is
  // left untouched.
  switch (IID) {
  case GenXIntrinsic::genx_typed_atomic_add:
  case GenXIntrinsic::genx_typed_atomic_and:
  // to be open-sourced soon (need to check)
  case GenXIntrinsic::genx_typed_atomic_fadd:
  case GenXIntrinsic::genx_typed_atomic_fsub:
  case GenXIntrinsic::genx_typed_atomic_fmax:
  case GenXIntrinsic::genx_typed_atomic_fmin:
  case GenXIntrinsic::genx_typed_atomic_imax:
  case GenXIntrinsic::genx_typed_atomic_imin:
  case GenXIntrinsic::genx_typed_atomic_max:
  case GenXIntrinsic::genx_typed_atomic_min:
  case GenXIntrinsic::genx_typed_atomic_or:
  case GenXIntrinsic::genx_typed_atomic_sub:
  case GenXIntrinsic::genx_typed_atomic_xchg:
  case GenXIntrinsic::genx_typed_atomic_xor:
    AtomicSrcIdx = 2;
    PredIdx = 0;
    AddrIdx = 3;
    IsTyped = true;
    AtomicNumSrc = 1;
    break;
  case GenXIntrinsic::genx_typed_atomic_dec:
  case GenXIntrinsic::genx_typed_atomic_inc:
    PredIdx = 0;
    AddrIdx = 2;
    IsTyped = true;
    AtomicNumSrc = 0;
    break;
  case GenXIntrinsic::genx_typed_atomic_cmpxchg:
  case GenXIntrinsic::genx_typed_atomic_fcmpwr:
    AtomicSrcIdx = 2;
    PredIdx = 0;
    AddrIdx = 4;
    IsTyped = true;
    AtomicNumSrc = 2;
    break;
  case GenXIntrinsic::genx_scatter4_typed:
  case GenXIntrinsic::genx_gather4_typed:
    DataIdx = DATA_IDX;
    MaskIdx = MASK_IDX;
    PredIdx = PRED_IDX;
    AddrIdx = U_IDX;
    IsTyped = true;
    break;
  case GenXIntrinsic::genx_scatter4_scaled:
  case GenXIntrinsic::genx_gather4_scaled:
    DataIdx = 6;
    PredIdx = 0;
    MaskIdx = 1;
    AddrIdx = 5;
    break;
  case GenXIntrinsic::genx_gather4_scaled2:
    MaskIdx = 0;
    AddrIdx = 4;
    break;
  case GenXIntrinsic::genx_gather4_masked_scaled2:
    PredIdx = 5;
    MaskIdx = 0;
    AddrIdx = 4;
    break;
  case GenXIntrinsic::genx_svm_scatter4_scaled:
  case GenXIntrinsic::genx_svm_gather4_scaled:
    DataIdx = 5;
    PredIdx = 0;
    MaskIdx = 1;
    AddrIdx = 4;
    break;
  case GenXIntrinsic::genx_scatter_scaled:
  case GenXIntrinsic::genx_gather_scaled:
    DataIdx = 6;
    PredIdx = 0;
    AddrIdx = 5;
    break;
  case GenXIntrinsic::genx_gather_scaled2:
    AddrIdx = 4;
    break;
  case GenXIntrinsic::genx_gather_masked_scaled2:
    PredIdx = 5;
    AddrIdx = 4;
    break;
  case GenXIntrinsic::genx_svm_scatter:
  case GenXIntrinsic::genx_svm_gather:
    DataIdx = 3;
    PredIdx = 0;
    AddrIdx = 2;
    break;
  case GenXIntrinsic::genx_svm_atomic_dec:
  case GenXIntrinsic::genx_svm_atomic_inc:
    DataIdx = 2;
    PredIdx = 0;
    AddrIdx = 1;
    AtomicNumSrc = 0;
    break;
  case GenXIntrinsic::genx_svm_atomic_add:
  case GenXIntrinsic::genx_svm_atomic_and:
  case GenXIntrinsic::genx_svm_atomic_fmax:
  case GenXIntrinsic::genx_svm_atomic_fmin:
  case GenXIntrinsic::genx_svm_atomic_imax:
  case GenXIntrinsic::genx_svm_atomic_imin:
  case GenXIntrinsic::genx_svm_atomic_max:
  case GenXIntrinsic::genx_svm_atomic_min:
  case GenXIntrinsic::genx_svm_atomic_or:
  case GenXIntrinsic::genx_svm_atomic_sub:
  case GenXIntrinsic::genx_svm_atomic_xchg:
  case GenXIntrinsic::genx_svm_atomic_xor:
    DataIdx = 3;
    PredIdx = 0;
    AddrIdx = 1;
    AtomicSrcIdx = 2;
    AtomicNumSrc = 1;
    break;
  case GenXIntrinsic::genx_svm_atomic_cmpxchg:
  case GenXIntrinsic::genx_svm_atomic_fcmpwr:
    DataIdx = 4;
    PredIdx = 0;
    AddrIdx = 1;
    AtomicSrcIdx = 2;
    AtomicNumSrc = 2;
    break;
  case GenXIntrinsic::genx_dword_atomic_add:
  case GenXIntrinsic::genx_dword_atomic_and:
  case GenXIntrinsic::genx_dword_atomic_fadd:
  case GenXIntrinsic::genx_dword_atomic_fsub:
  case GenXIntrinsic::genx_dword_atomic_fmax:
  case GenXIntrinsic::genx_dword_atomic_fmin:
  case GenXIntrinsic::genx_dword_atomic_imax:
  case GenXIntrinsic::genx_dword_atomic_imin:
  case GenXIntrinsic::genx_dword_atomic_max:
  case GenXIntrinsic::genx_dword_atomic_min:
  case GenXIntrinsic::genx_dword_atomic_or:
  case GenXIntrinsic::genx_dword_atomic_sub:
  case GenXIntrinsic::genx_dword_atomic_xchg:
  case GenXIntrinsic::genx_dword_atomic_xor:
    DataIdx = 4;
    PredIdx = 0;
    AddrIdx = 2;
    AtomicSrcIdx = 3;
    AtomicNumSrc = 1;
    break;
  case GenXIntrinsic::genx_dword_atomic_cmpxchg:
  case GenXIntrinsic::genx_dword_atomic_fcmpwr:
    DataIdx = 5;
    PredIdx = 0;
    AddrIdx = 2;
    AtomicSrcIdx = 3;
    AtomicNumSrc = 2;
    break;
  case GenXIntrinsic::genx_dword_atomic_dec:
  case GenXIntrinsic::genx_dword_atomic_inc:
    DataIdx = 3;
    PredIdx = 0;
    AddrIdx = 2;
    AtomicNumSrc = 0;
    break;

  default:
    return false;
  }

  // nulling unused inputs for typed gather/scatter/atomic
  // Note that this rewrites operands of CI in place, and it happens even if
  // the function later decides that no split is needed and returns false.
  if (IsTyped) {
    // V and R are the two operands that follow the U address at AddrIdx.
    Constant *V = dyn_cast<Constant>(CI->getArgOperand(AddrIdx + 1));
    Constant *R = dyn_cast<Constant>(CI->getArgOperand(AddrIdx + 2));
    // Only continue when R is known to be zero.
    if (R && R->isNullValue()) {
      CI->setOperand(AddrIdx + 2, UndefValue::get(R->getType()));
      // V may only be dropped when R has been dropped as well.
      if (V && V->isNullValue())
        CI->setOperand(AddrIdx + 1, UndefValue::get(V->getType()));
    }
    // check if LOD is zero for atomic
    if (AtomicNumSrc >= 0) {
      Constant *LOD = dyn_cast<Constant>(CI->getArgOperand(AddrIdx + 3));
      if (LOD && LOD->isNullValue())
        CI->setOperand(AddrIdx + 3, UndefValue::get(LOD->getType()));
    }
  }
  // Deduce intrinsic width: check predicate if exists, then check address
  // vector.
  unsigned WidthOperand = NONEED;
  if (PredIdx != NONEED)
    WidthOperand = PredIdx;
  else if (AddrIdx != NONEED)
    WidthOperand = AddrIdx;
  else
    IGC_ASSERT_EXIT_MESSAGE(0, "Cannot infer execution width of intrinsic "
                               "(checked pred and addr operands)");
  auto Width =
      cast<IGCLLVM::FixedVectorType>(CI->getArgOperand(WidthOperand)->getType())
          ->getNumElements();
  // Widest piece to emit: 8 lanes for typed messages, 16 otherwise; typed
  // messages also get 16 when the subtarget reports a GRF larger than 32
  // bytes.
  unsigned TargetWidth = IsTyped ? 8 : 16;
  if (IsTyped && ST && ST->getGRFByteSize() > 32)
    TargetWidth = 16;
  // If exec size isn't a power of 2, it must be split. For example, exec size
  // 12 -> 8 and 4.
  if (Width <= TargetWidth && isPowerOf2_64(Width))
    return false;
  IGC_ASSERT(TargetWidth);
  const std::vector<unsigned> Widths = calculateLegalWidths(Width, TargetWidth);
  IGC_ASSERT(!Widths.empty());
  unsigned NumChannels = NumVectorElements;
  if (MaskIdx != NONEED) {
    vc::checkArgOperandIsConstantInt(*CI, MaskIdx, "channel mask");
    NumChannels =
        (unsigned)cast<ConstantInt>(CI->getArgOperand(MaskIdx))->getZExtValue();
    // Number of enabled channels = number of set bits among the low four bits
    // of the channel mask.
    NumChannels = (NumChannels & 1) + ((NumChannels & 2) >> 1) +
                  ((NumChannels & 4) >> 2) + ((NumChannels & 8) >> 3);
    // Multi-channel (gather4/scatter4) intrinsics are only handled when every
    // piece has the same width.
    IGC_ASSERT(std::all_of(Widths.begin(), Widths.end(),
                           [FirstWidth = Widths.front()](unsigned NextWidth) {
                             return FirstWidth == NextWidth;
                           }));
  }

  // Number of data elements per address. Only svm gather/scatter carries more
  // than one.
  unsigned NumBlks = 1;
  if (IID == GenXIntrinsic::genx_svm_scatter ||
      IID == GenXIntrinsic::genx_svm_gather) {
    const unsigned NumBlocksOpIDX = 1;
    vc::checkArgOperandIsConstantInt(*CI, NumBlocksOpIDX, "log2 num blocks");
    NumBlks = (unsigned)cast<ConstantInt>(CI->getArgOperand(NumBlocksOpIDX))
                  ->getZExtValue();
    NumBlks = (1 << NumBlks);
    auto ElmSz =
        CI->getArgOperand(DataIdx)->getType()->getScalarSizeInBits() / 8;
    // For 1- and 2-byte elements the count is raised so that every address
    // covers at least 4 bytes of data.
    // NOTE(review): presumably the data operand is dword-padded per address
    // for sub-dword elements -- confirm against the intrinsic definition.
    if (ElmSz == 1 && NumBlks < 4)
      NumBlks = 4;
    else if (ElmSz == 2 && NumBlks < 2)
      NumBlks = 2;
  }
  const DebugLoc &DL = CI->getDebugLoc();
  // NewResult accumulates the joined result of all pieces. It stays null when
  // the call does not return a large enough vector, in which case the call is
  // handled as a scatter (see the else branch inside the loop below).
  Value *NewResult = nullptr;
  if (CI->getType() && CI->getType()->isVectorTy() &&
      cast<IGCLLVM::FixedVectorType>(CI->getType())->getNumElements() >=
          Width * NumChannels * NumBlks) {
    // Start from the old-value operand if there is one, otherwise from undef.
    if (DataIdx != NONEED)
      NewResult = CI->getArgOperand(DataIdx);
    else
      NewResult = UndefValue::get(CI->getType());
  }

  // For "new load" intrinsics (as reported by isNewLoadInst) the pieces are
  // merged into the target returned by getLoadTarget, and the instruction
  // whose uses get replaced may differ from CI.
  bool IsNewLoad = isNewLoadInst(CI);
  Value *LoadPred = nullptr;
  unsigned InitialOffset = 0;
  Instruction *InstToReplace = CI;
  if (IsNewLoad)
    std::tie(NewResult, LoadPred, InitialOffset, InstToReplace) =
        getLoadTarget(CI, ST);

  // Emit one narrower call per legal width. AccumulatedOffset is the index of
  // the first lane (in elements) handled by the current piece.
  unsigned AccumulatedOffset = 0;
  for (auto CurWidth : Widths) {
    SmallVector<Value *, 8> Args;
    // initialize the args with the old values
    for (unsigned ArgI = 0; ArgI < IGCLLVM::getNumArgOperands(CI); ++ArgI)
      Args.push_back(CI->getArgOperand(ArgI));
    // Predicate
    // A constant predicate is sliced at compile time; otherwise a predicate
    // read-region is emitted.
    if (PredIdx != NONEED) {
      Value *V = CI->getArgOperand(PredIdx);
      if (auto C = dyn_cast<Constant>(V))
        Args[PredIdx] = getConstantSubvector(C, AccumulatedOffset, CurWidth);
      else
        Args[PredIdx] = Region::createRdPredRegion(
            V, AccumulatedOffset, CurWidth, "predsplit", CI, DL);
    }
    // address source
    // Typed messages have several consecutive address operands starting at
    // AddrIdx: three for gather4/scatter4, four for atomics (the fourth being
    // the LOD operand handled above).
    unsigned NumAddrs = 1;
    if (IsTyped)
      NumAddrs = (AtomicNumSrc >= 0) ? 4 : 3;
    for (unsigned AddrI = 0; AddrI < NumAddrs; ++AddrI) {
      Value *V = CI->getArgOperand(AddrIdx + AddrI);
      Region R(V);
      R.Width = R.NumElements = CurWidth;
      R.Offset = AccumulatedOffset * V->getType()->getScalarSizeInBits() /
                 8; // in bytes
      Args[AddrIdx + AddrI] = R.createRdRegion(V, "addrsplit", CI, DL);
    }
    // data source
    // We need to construct a new vector with CurWidth * NumBlks elements per
    // enabled channel. In the original data operand the channels are Width *
    // NumBlks elements apart; in the new one they are packed CurWidth *
    // NumBlks elements apart.
    if (DataIdx != NONEED) {
      Value *V = CI->getArgOperand(DataIdx);
      auto DataTy = IGCLLVM::FixedVectorType::get(
          V->getType()->getScalarType(), CurWidth * NumChannels * NumBlks);
      auto ElmSz = V->getType()->getScalarSizeInBits() / 8;
      Value *NewVec = UndefValue::get(DataTy);
      // An undef data operand stays undef; no regions need to be emitted.
      if (!isa<UndefValue>(V)) {
        for (unsigned Channel = 0; Channel < NumChannels; ++Channel) {
          Region RdR(V);
          RdR.Width = RdR.NumElements = CurWidth * NumBlks;
          RdR.Offset =
              ElmSz * (Width * NumBlks * Channel + AccumulatedOffset * NumBlks);
          auto Rd = RdR.createRdRegion(V, "datasplit", CI, DL);
          if (NumChannels > 1) {
            Region WrR(DataTy);
            WrR.Width = WrR.NumElements = CurWidth * NumBlks;
            WrR.Offset = ElmSz * CurWidth * NumBlks * Channel;
            NewVec = WrR.createWrRegion(NewVec, Rd, "datasplit", CI, DL);
          } else
            NewVec = Rd;
        }
      }
      Args[DataIdx] = NewVec;
    }
    // atomic source operands
    // AtomicNumSrc consecutive operands starting at AtomicSrcIdx, each sliced
    // like an address operand.
    if (AtomicSrcIdx != NONEED) {
      for (int SrcI = 0; SrcI < AtomicNumSrc; ++SrcI) {
        Value *V = CI->getArgOperand(AtomicSrcIdx + SrcI);
        Region R(V);
        R.Width = R.NumElements = CurWidth;
        R.Offset = AccumulatedOffset * V->getType()->getScalarSizeInBits() /
                   8; // in bytes
        Args[AtomicSrcIdx + SrcI] = R.createRdRegion(V, "addrsplit", CI, DL);
      }
    }
    // now create the new narrower instruction
    if (NewResult) {
      // Value-producing case (gather / atomic / load).
      Type *DstTy = nullptr;
      if (DataIdx != NONEED)
        DstTy = Args[DataIdx]->getType();
      else {
        DstTy = IGCLLVM::FixedVectorType::get(CI->getType()->getScalarType(),
                                              CurWidth * NumBlks * NumChannels);
      }
      SmallVector<Type *, 4> Tys = {DstTy};

      // These two overloaded args can go in
      // arbitrary order in operand list
      // so their types are appended in operand order; an absent operand
      // (NONEED) contributes no type.
      unsigned IdxMin = std::min(AddrIdx, PredIdx);
      unsigned IdxMax = std::max(AddrIdx, PredIdx);
      if (IdxMin != NONEED)
        Tys.push_back(Args[IdxMin]->getType());
      if (IdxMax != NONEED)
        Tys.push_back(Args[IdxMax]->getType());

      auto Decl = GenXIntrinsic::getAnyDeclaration(CI->getModule(), IID, Tys);
      auto *Gather = CallInst::Create(Decl, Args, CI->getName() + ".split", CI);
      Gather->setDebugLoc(DL);
      Gather->copyMetadata(*CI);
      if (IsNewLoad) {
        // New loads write the piece into the load target through dedicated
        // helpers. For the multi-channel helper the piece is identified by its
        // index (AccumulatedOffset / CurWidth), which relies on all pieces
        // having equal width.
        if (NumChannels == 1)
          NewResult =
              generate1ChannelWrrregion(NewResult, InitialOffset, Gather,
                                        LoadPred, AccumulatedOffset, CI);
        else
          NewResult = generateNChannelWrregion(
              NewResult, InitialOffset, Gather, LoadPred,
              AccumulatedOffset / CurWidth, Widths.size(), NumChannels, CI);

        AccumulatedOffset += CurWidth;
        continue;
      }
      // Join the results together, starting with the old value.
      auto ElmSz = DstTy->getScalarSizeInBits() / 8;
      if (NumChannels > 1) {
        // Per channel: read the packed channel out of the piece and write it
        // to that channel's slot (Width * NumBlks elements apart) in the full
        // result.
        Region RdR(Gather);
        RdR.Width = RdR.NumElements = CurWidth * NumBlks;
        Region WrR(NewResult);
        WrR.Width = WrR.NumElements = CurWidth * NumBlks;
        WrR.Mask = nullptr;
        for (unsigned Channel = 0; Channel != NumChannels; ++Channel) {
          RdR.Offset = ElmSz * CurWidth * NumBlks * Channel;
          auto Rd = RdR.createRdRegion(Gather, "joint", CI, DL);
          WrR.Offset =
              ElmSz * (Width * NumBlks * Channel + NumBlks * AccumulatedOffset);
          NewResult = WrR.createWrRegion(NewResult, Rd, "join", CI, DL);
        }
      } else {
        // Single channel: the whole piece is written at its lane offset.
        Region WrR(NewResult);
        WrR.Width = WrR.NumElements = CurWidth * NumBlks;
        WrR.Offset = ElmSz * AccumulatedOffset * NumBlks;
        WrR.Mask = nullptr;
        NewResult = WrR.createWrRegion(NewResult, Gather, "join", CI, DL);
      }
    } else {
      // No result to join: the call is a scatter, which must have a data
      // operand and no users.
      IGC_ASSERT(CI->use_empty());
      IGC_ASSERT(DataIdx != NONEED);
      // Create the target-wide scatter instructions.
      // NOTE(review): this indexes Args[PredIdx] unconditionally, i.e. it
      // assumes every intrinsic reaching this branch has a predicate operand.
      Type *Tys[] = {Args[PredIdx]->getType(), Args[AddrIdx]->getType(),
                     Args[DataIdx]->getType()};
      auto Decl = GenXIntrinsic::getAnyDeclaration(CI->getModule(), IID, Tys);
      auto NewInst = CallInst::Create(Decl, Args, "", CI);
      NewInst->setDebugLoc(DL);
      NewInst->copyMetadata(*CI);
    }
    AccumulatedOffset += CurWidth;
  }

  // Redirect users to the joined result and queue the replaced instruction(s)
  // for erasure.
  if (NewResult)
    InstToReplace->replaceAllUsesWith(NewResult);

  if (InstToReplace != CI)
    ToErase.push_back(InstToReplace);
  ToErase.push_back(CI);
  return true;
}

// Build a constant vector of n elements of type ITy holding the arithmetic
// sequence 0, step, 2 * step, ...
static Constant *getConstVector(Type *ITy, unsigned int n, unsigned int step) {
  std::vector<Constant *> Elements;
  for (unsigned Idx = 0, Val = 0; Idx != n; ++Idx, Val += step)
    Elements.push_back(ConstantInt::get(ITy, Val));
  return ConstantVector::get(Elements);
}

/***********************************************************************
 * translateSLMOWord : lower SLM OWord loads/stores to gathers/scatters
 * on legacy platforms such as SKL.
 *
 * Only the 1-, 2-, 4- and 8-OWord cases are supported.
 */
// Returns true if CI was lowered (and queued in ToErase), false if CI is not
// an OWord load/store or does not address SLM through a constant surface
// index.
bool GenXLowering::translateSLMOWord(CallInst *CI, unsigned IID) {
  LLVMContext &CTX = CI->getContext();
  auto CIntTy = IntegerType::getInt32Ty(CTX);
  const DebugLoc &DL = CI->getDebugLoc();
  switch (IID) {
  case GenXIntrinsic::genx_oword_ld:
  case GenXIntrinsic::genx_oword_ld_unaligned: {
    constexpr unsigned BtiIdx = 1;
    constexpr unsigned AddrIdx = 2;
    Value *BtiV = CI->getArgOperand(BtiIdx);
    // only slm need this lowering
    if (!isa<ConstantInt>(BtiV))
      return false;
    if (cast<ConstantInt>(BtiV)->getZExtValue() !=
        visa::ReservedSurfaceIndex::RSI_Slm)
      return false;

    IRBuilder<> Builder(CI);
    Value *AddrV = CI->getArgOperand(AddrIdx);
    // Only the aligned form has its address shifted left by 4 (multiplied by
    // 16).
    // NOTE(review): presumably genx_oword_ld takes the address in OWord units
    // while the unaligned form already takes bytes -- confirm against the
    // intrinsic definitions.
    if (IID == GenXIntrinsic::genx_oword_ld) {
      AddrV =
          Builder.CreateShl(AddrV, llvm::ConstantInt::get(AddrV->getType(), 4));
    }
    auto *OrigVT = cast<IGCLLVM::FixedVectorType>(CI->getType());
    unsigned EltSize = OrigVT->getScalarSizeInBits();
    unsigned EltCount = OrigVT->getNumElements();
    // 1-oword is 16 bytes, using simd4 dword gather-scaled
    // 2-oword is 32 bytes, using simd8 dword gather-scaled
    // 4-oword is 64 bytes, using simd16 dword gather-scaled
    // 8-oword is 128 bytes, using 2*simd16 dword gather-scaled
    unsigned DWordCnt = (EltSize * EltCount) / 32;
    IGC_ASSERT(DWordCnt == 4 || DWordCnt == 8 || DWordCnt == 16 ||
               DWordCnt == 32);
    unsigned SimdWidth = (DWordCnt == 32) ? 16 : DWordCnt;
    // NewVT is the whole loaded value viewed as dwords; GatherVT is the result
    // type of one gather.
    auto NewVT = IGCLLVM::FixedVectorType::get(CIntTy, DWordCnt);
    auto GatherVT = IGCLLVM::FixedVectorType::get(CIntTy, SimdWidth);
    // generate gather-scaled
    // Per-lane offsets 0, 4, 8, ...: one dword per lane.
    auto VOffset = getConstVector(CIntTy, SimdWidth, 4);
    // create constant for predicate
    auto PredVTy =
        IGCLLVM::FixedVectorType::get(IntegerType::getInt1Ty(CTX), SimdWidth);
    auto OnePredV = Constant::getAllOnesValue(PredVTy);
    auto ScaleC = ConstantInt::get(Type::getInt16Ty(CTX), 0);
    std::string IntrName =
        std::string(GenXIntrinsic::getGenXIntrinsicPrefix()) + "gather.scaled";
    auto ID = GenXIntrinsic::lookupGenXIntrinsicID(IntrName);
    // create constant for num-blocks, 2 means 4-bytes
    auto NumBlksC = ConstantInt::get(CIntTy, 2);
    // create the intrinsic call
    Function *NewFDecl = GenXIntrinsic::getGenXDeclaration(
        CI->getModule(), ID, {GatherVT, PredVTy, VOffset->getType()});
    Instruction *NewInst = nullptr;
    if (DWordCnt == SimdWidth) {
      // 1, 2 or 4 owords: a single gather produces the whole value.
      NewInst = IntrinsicInst::Create(NewFDecl,
                                      {OnePredV, NumBlksC, ScaleC, BtiV, AddrV,
                                       VOffset, UndefValue::get(GatherVT)},
                                      CI->getName() + ".gather", CI);
      NewInst->setDebugLoc(DL);
      LLVM_DEBUG(dbgs() << "SLM OWord Load:\n");
      LLVM_DEBUG(CI->dump());
      LLVM_DEBUG(dbgs() << "Translated to gather:\n");
      LLVM_DEBUG(NewInst->dump());
    } else { // two gathers are needed for 8 owords
      // 1st gather
      auto New1st =
          IntrinsicInst::Create(NewFDecl,
                                {OnePredV, NumBlksC, ScaleC, BtiV, AddrV,
                                 VOffset, UndefValue::get(GatherVT)},
                                CI->getName() + ".gather1", CI);
      New1st->setDebugLoc(DL);
      // 2nd gather
      // It reads the second 16 dwords, 64 bytes past the first.
      AddrV = Builder.CreateAdd(AddrV,
                                llvm::ConstantInt::get(AddrV->getType(), 64));
      auto New2nd =
          IntrinsicInst::Create(NewFDecl,
                                {OnePredV, NumBlksC, ScaleC, BtiV, AddrV,
                                 VOffset, UndefValue::get(GatherVT)},
                                CI->getName() + ".gather2", CI);
      New2nd->setDebugLoc(DL);
      // write region, 1st half
      Region R(NewVT);
      R.Width = SimdWidth;
      R.NumElements = SimdWidth;
      R.Stride = 1;
      R.VStride = 0;
      R.Offset = 0;
      auto PartialV = R.createWrRegion(UndefValue::get(NewVT), New1st, "", CI,
                                       CI->getDebugLoc());
      // write region, 2nd half
      // Offset 64 = 16 dwords * 4 bytes.
      R.Offset = 64;
      NewInst = R.createWrRegion(PartialV, New2nd, "", CI, CI->getDebugLoc());
      LLVM_DEBUG(dbgs() << "SLM OWord Load:\n");
      LLVM_DEBUG(CI->dump());
      LLVM_DEBUG(dbgs() << "Translated to gather:\n");
      LLVM_DEBUG(New1st->dump());
      LLVM_DEBUG(New2nd->dump());
    }
    // cast back if required
    Value *Casted = NewInst;
    if (NewVT != OrigVT)
      Casted = CastInst::CreateBitOrPointerCast(
          Casted, OrigVT, Casted->getName() + VALUE_NAME(".cast"), CI);
    CI->replaceAllUsesWith(Casted);
    ToErase.push_back(CI);
    return true;
  }
  case GenXIntrinsic::genx_oword_st: {
    constexpr unsigned DataIdx = 2;
    constexpr unsigned AddrIdx = 1;
    constexpr unsigned BtiIdx = 0;
    Value *BtiV = CI->getArgOperand(BtiIdx);
    // Only slm need this lowering
    if (!isa<ConstantInt>(BtiV))
      return false;
    if (cast<ConstantInt>(BtiV)->getZExtValue() !=
        visa::ReservedSurfaceIndex::RSI_Slm)
      return false;

    IRBuilder<> Builder(CI);
    Value *AddrV = CI->getArgOperand(AddrIdx);
    // Same shift by 4 as for genx_oword_ld above.
    AddrV =
        Builder.CreateShl(AddrV, llvm::ConstantInt::get(AddrV->getType(), 4));

    Value *Datum = CI->getArgOperand(DataIdx);
    auto *OrigVT = cast<IGCLLVM::FixedVectorType>(Datum->getType());
    unsigned EltSize = OrigVT->getScalarSizeInBits();
    unsigned EltCount = OrigVT->getNumElements();
    // 1-oword is 16 bytes, using simd4 dword scatter-scaled
    // 2-oword is 32 bytes, using simd8 dword scatter-scaled
    // 4-oword is 64 bytes, using simd16 dword scatter-scaled
    // 8-oword is 128 bytes, using 2*simd16 dword scatter-scaled
    unsigned DWordCnt = (EltSize * EltCount) / 32;
    IGC_ASSERT(DWordCnt == 4 || DWordCnt == 8 || DWordCnt == 16 ||
               DWordCnt == 32);
    // View the stored value as a vector of dwords.
    auto NewVT = IGCLLVM::FixedVectorType::get(CIntTy, DWordCnt);
    IGC_ASSERT_MESSAGE(CastInst::isBitCastable(NewVT, OrigVT),
                       "We expect resulting vectors to be bitcastable");
    if (NewVT != OrigVT)
      Datum = CastInst::CreateBitOrPointerCast(
          Datum, NewVT, Datum->getName() + VALUE_NAME(".cast"), CI);
    unsigned SimdWidth = (DWordCnt == 32) ? 16 : DWordCnt;

    // generate scatter-scaled
    // Per-lane offsets 0, 4, 8, ...: one dword per lane.
    auto VOffset = getConstVector(CIntTy, SimdWidth, 4);
    // create constant for predicate
    auto PredVTy =
        IGCLLVM::FixedVectorType::get(IntegerType::getInt1Ty(CTX), SimdWidth);
    auto OnePredV = Constant::getAllOnesValue(PredVTy);
    auto ScaleC = ConstantInt::get(Type::getInt16Ty(CTX), 0);
    // create constant for num-blocks
    auto NumBlksC = ConstantInt::get(CIntTy, 2);
    std::string IntrName =
        std::string(GenXIntrinsic::getGenXIntrinsicPrefix()) + "scatter.scaled";
    auto ID = GenXIntrinsic::lookupGenXIntrinsicID(IntrName);
    // create the intrinsic call
    auto ScatterVT = IGCLLVM::FixedVectorType::get(CIntTy, SimdWidth);
    Function *NewFDecl = GenXIntrinsic::getGenXDeclaration(
        CI->getModule(), ID, {PredVTy, VOffset->getType(), ScatterVT});
    if (DWordCnt == SimdWidth) {
      // create one scatter
      auto NewInst = Builder.CreateCall(
          NewFDecl, {OnePredV, NumBlksC, ScaleC, BtiV, AddrV, VOffset, Datum});
      NewInst->setDebugLoc(DL);
      LLVM_DEBUG(dbgs() << "SLM OWord Store:\n");
      LLVM_DEBUG(CI->dump());
      LLVM_DEBUG(dbgs() << "Translated to scatter:\n");
      LLVM_DEBUG(NewInst->dump());
    } else { // 8-oword (i.e 32 dword) case
      // scatter the 1st 16 dwords
      // read region then scatter
      Region R(ScatterVT);
      R.Width = SimdWidth;
      R.NumElements = SimdWidth;
      R.Stride = 1;
      R.VStride = 0;
      R.Offset = 0;
      auto Datum1st = R.createRdRegion(Datum, "", CI, CI->getDebugLoc());
      auto New1st =
          Builder.CreateCall(NewFDecl, {OnePredV, NumBlksC, ScaleC, BtiV, AddrV,
                                        VOffset, Datum1st});
      New1st->setDebugLoc(DL);
      // scatter the 2nd 16 dwords
      // read region then scatter
      // Both the address and the region offset advance by 64 bytes
      // (16 dwords).
      AddrV = Builder.CreateAdd(AddrV,
                                llvm::ConstantInt::get(AddrV->getType(), 64));
      R.Offset = 64;
      auto Datum2nd = R.createRdRegion(Datum, "", CI, CI->getDebugLoc());
      auto New2nd =
          Builder.CreateCall(NewFDecl, {OnePredV, NumBlksC, ScaleC, BtiV, AddrV,
                                        VOffset, Datum2nd});
      New2nd->setDebugLoc(DL);
      LLVM_DEBUG(dbgs() << "SLM OWord Store:\n");
      LLVM_DEBUG(CI->dump());
      LLVM_DEBUG(dbgs() << "Translated to scatter:\n");
      LLVM_DEBUG(New1st->dump());
      LLVM_DEBUG(New2nd->dump());
    }
    // The store has no result, so there are no uses to replace.
    ToErase.push_back(CI);
    return true;
  }
  default:
    break;
  }
  return false;
}

/***************************************************************************
 * ExpandPredicate : widens the predicate operand PredIdx of CI to SIMDWidth
 * lanes.
 *
 * The original lanes keep their values and positions; every added lane is
 * false, so the extra lanes of the widened operation are disabled.
 *
 * A constant predicate is widened at compile time. Any vector constant whose
 * lanes can be read individually is accepted (explicit constant vectors as
 * well as zeroinitializer and undef). A non-constant predicate is written
 * into an all-false vector with a predicate write-region inserted before CI.
 *
 * Returns the operand itself when it already has SIMDWidth lanes, otherwise
 * the widened value. Doesn't change the call instruction itself.
 */
static Value *ExpandPredicate(CallInst *CI, unsigned PredIdx,
                              unsigned SIMDWidth) {
  Value *PredV = CI->getArgOperand(PredIdx);
  auto Width =
      cast<IGCLLVM::FixedVectorType>(PredV->getType())->getNumElements();
  if (Width == SIMDWidth)
    return PredV;
  IGC_ASSERT(Width < SIMDWidth);
  auto PredV32Ty = IGCLLVM::FixedVectorType::get(
      IntegerType::getInt1Ty(CI->getContext()), SIMDWidth);
  auto ZeroPred32 = Constant::getNullValue(PredV32Ty);
  if (auto *PredC = dyn_cast<Constant>(PredV)) {
    Constant *FalseLane = Constant::getNullValue(
        cast<VectorType>(PredV32Ty)->getElementType());
    SmallVector<Constant *, 32> PredArray;
    // Read the lanes through getAggregateElement rather than casting to
    // ConstantVector: an all-false predicate is a ConstantAggregateZero and an
    // undef predicate is an UndefValue, neither of which is a ConstantVector.
    for (unsigned i = 0; i < Width; ++i) {
      Constant *Lane = PredC->getAggregateElement(i);
      IGC_ASSERT_MESSAGE(Lane, "cannot read a lane of the constant predicate");
      PredArray.push_back(Lane);
    }
    // Disable all the lanes that were added by widening.
    PredArray.append(SIMDWidth - Width, FalseLane);
    return ConstantVector::get(PredArray);
  }
  // Non-constant predicate: write it into the low lanes of an all-false
  // vector.
  Region R(ZeroPred32);
  R.Width = Width;
  R.NumElements = Width;
  return R.createWrPredRegion(ZeroPred32, PredV, 0, "", CI, CI->getDebugLoc());
}

/***************************************************************************
 * ExpandAddrOrData : expands ArgIdx argument of CI instruction to the type
 * corresponding to SIMDWidth.
 *
 * Matrix of N x ArgWidth elements expands to matrix of N x SIMDWidth elements.
 * First elements of the rows of initial matrix become the first elements of
 * corresponding rows of a new matrix.
 *
 * TODO: After wrregion lowering we will get one mov for every row, as dst
 * register doesn't have horizontal stride. So it is possible to optimize the
 * code by copying only those rows that are used.
 *
 * Returns new value/instruction/wrregion that represents expanded argument.
 * Doesn't change the call instruction itself.
 */
static Value *ExpandAddrOrData(CallInst *CI, unsigned ArgIdx, unsigned ArgWidth,
                               unsigned NumChannels, unsigned SIMDWidth) {
  Value *const Src = CI->getArgOperand(ArgIdx);
  // Rows already have the requested width: the argument is used as is.
  if (ArgWidth == SIMDWidth)
    return Src;

  IGC_ASSERT(ArgWidth < SIMDWidth);
  auto *const SrcTy = cast<IGCLLVM::FixedVectorType>(Src->getType());
  const auto SrcNumElts = SrcTy->getNumElements();
  IGC_ASSERT(ArgWidth);
  IGC_ASSERT_MESSAGE(SrcNumElts >= ArgWidth * NumChannels,
                     "there must be enough data and it should be a matrix with "
                     "ArgWidth width");
  IGC_ASSERT_MESSAGE(SrcNumElts % ArgWidth == 0,
                     "there must be enough data and it should be a matrix with "
                     "ArgWidth width");

  // The source is a matrix of NumRows rows by ArgWidth elements; the widened
  // value keeps the row count and gives every row SIMDWidth elements.
  const auto NumRows = SrcNumElts / ArgWidth;
  auto *const WideTy = IGCLLVM::FixedVectorType::get(SrcTy->getElementType(),
                                                     SIMDWidth * NumRows);
  auto *const WideUndef = UndefValue::get(WideTy);
  // Widening undef yields undef; no write-region is needed.
  if (isa<UndefValue>(Src))
    return WideUndef;

  // Write all source elements into the undef wide value, ArgWidth elements
  // per row.
  Region WrR(WideUndef);
  WrR.Width = ArgWidth;
  WrR.NumElements = SrcNumElts;
  // With several channels, consecutive rows start SIMDWidth elements apart.
  if (NumChannels > 1)
    WrR.VStride = SIMDWidth;
  return WrR.createWrRegion(WideUndef, Src, "", CI, CI->getDebugLoc());
}

// Widen new load intrinsics. Currently only gather4_scaled2 is supported.
// Example for gather4_scaled2 with two channels.
// Before:
//   x = <16> gather4_scaled2 <8> addrs
// After:
//   newaddrs = <16> wrregion undef, addrs
//   tmp = <32> gather4_scaled2 newaddrs
// bale {
//   chR = rdregion <8> tmp, 0
//   partialx = wrregion <8> oldval, chR, 0
// }
// bale {
//   chG = rdregion <8> tmp, 16 * 4
//   x = wrregion <8> partialx, chG, 8 * 4
// }
// Handles genx_gather4_scaled2 and genx_gather4_masked_scaled2 only; returns
// false (leaving CI untouched) for any other intrinsic or when a channel is
// wider than 8 elements. On success the replaced instructions are appended to
// ToErase and true is returned.
static bool widenNewSIMD8Load(CallInst *CI, unsigned IID,
                              const GenXSubtarget *ST,
                              SmallVectorImpl<Instruction *> &ToErase) {
  switch (IID) {
  default:
    return false;
  case GenXIntrinsic::genx_gather4_scaled2:
  case GenXIntrinsic::genx_gather4_masked_scaled2:
    break;
  }

  // Probably valid only for gather.
  // Need to calculate it when other intrinsics will be normalized.
  constexpr unsigned WidenFactor = 2;

  // Operand 0 is the channel mask; each set bit is one enabled channel.
  unsigned NumChannels =
      countPopulation(cast<ConstantInt>(CI->getOperand(0))->getZExtValue());
  IGC_ASSERT(NumChannels);
  const DebugLoc &DL = CI->getDebugLoc();
  // The result holds NumChannels channels of ChannelWidth elements each; the
  // widened load doubles the width of every channel.
  unsigned LoadWidth =
      cast<IGCLLVM::FixedVectorType>(CI->getType())->getNumElements();
  unsigned NewLoadWidth = LoadWidth * WidenFactor;
  unsigned ChannelWidth = LoadWidth / NumChannels;
  unsigned NewChannelWidth = ChannelWidth * WidenFactor;
  Type *ElemType = CI->getType()->getScalarType();

  // Wider than simd8: nothing to widen.
  if (ChannelWidth > 8)
    return false;

  // Find where the load result is merged to: the value to write into, the
  // predicate of that merge (if any), the offset within it and the
  // instruction whose uses have to be replaced.
  Value *NewResult;
  Value *OldPred;
  unsigned InitialOffset;
  Instruction *InstToReplace;
  std::tie(NewResult, OldPred, InitialOffset, InstToReplace) =
      getLoadTarget(CI, ST);

  if (InstToReplace != CI)
    ToErase.push_back(InstToReplace);
  ToErase.push_back(CI);

  // Generate new addresses.
  // The original addresses (operand 4) are written into the low part of an
  // undef vector of NewChannelWidth elements.
  Value *NewAddrs;
  {
    Value *Addrs = CI->getOperand(4);
    Type *AddrsType = Addrs->getType();
    Type *NewAddrsType = IGCLLVM::FixedVectorType::get(
        AddrsType->getScalarType(), NewChannelWidth);
    Region Rgn(AddrsType);
    NewAddrs = Rgn.createWrRegion(UndefValue::get(NewAddrsType), Addrs,
                                  "load.addrs.wide", CI, DL);
  }

  // Generate new wide load.
  Value *NewLoad;
  Value *NewPred; // only set and used for the masked variant
  {
    Type *NewLoadType = IGCLLVM::FixedVectorType::get(ElemType, NewLoadWidth);
    SmallVector<Value *, 8> Args(CI->arg_begin(), CI->arg_end());
    // Overloaded types of the intrinsic: result, addresses and, for the
    // masked variant, the predicate.
    SmallVector<Type *, 3> Tys;
    Tys.push_back(NewLoadType);
    switch (IID) {
    case GenXIntrinsic::genx_gather4_masked_scaled2: {
      IGC_ASSERT_MESSAGE(ChannelWidth == 8,
                         "Unexpected gather4_masked_scaled width");
      Args[4] = NewAddrs;
      // Expand predicate to the size of NewAddrs
      // The original predicate (operand 5) replaces the one reported by
      // getLoadTarget as the mask of the final write-back.
      OldPred = CI->getOperand(5);
      NewPred =
          ExpandPredicate(CI, 5 /* PredIdx */,
                          cast<IGCLLVM::FixedVectorType>(NewAddrs->getType())
                              ->getNumElements());
      Args[5] = NewPred;
      Tys.push_back(NewAddrs->getType());
      Tys.push_back(NewPred->getType());
      break;
    }
    case GenXIntrinsic::genx_gather4_scaled2: {
      IGC_ASSERT_MESSAGE(ChannelWidth == 8, "Unexpected gather4_scaled width");
      Tys.push_back(NewAddrs->getType());
      Args[4] = NewAddrs;
      break;
    }
    default:
      IGC_ASSERT_EXIT_MESSAGE(0, "Unexpected load");
    }

    Function *Decl = GenXIntrinsic::getGenXDeclaration(
        CI->getModule(), static_cast<GenXIntrinsic::ID>(IID), Tys);
    auto *NewCI = CallInst::Create(Decl, Args, CI->getName() + "wide", CI);
    NewCI->setDebugLoc(DL);
    NewLoad = NewCI;
  }

  // Shuffle channels.
  // Channel i occupies NewChannelWidth elements in the wide load but only its
  // first ChannelWidth elements are meaningful; they are copied to channel i
  // of the result, under the original predicate. Region offsets are in bytes.
  unsigned ElemByteSize = ElemType->getScalarSizeInBits() / 8;
  Type *ChannelType = IGCLLVM::FixedVectorType::get(ElemType, ChannelWidth);
  Region RdR(ChannelType);
  Region WrR(ChannelType);
  for (unsigned i = 0; i < NumChannels; ++i) {
    RdR.Offset = i * NewChannelWidth * ElemByteSize;
    Value *NewChannel =
        RdR.createRdRegion(NewLoad, "load.wide.channel.read", CI, DL);
    WrR.Offset = InitialOffset + i * ChannelWidth * ElemByteSize;
    WrR.Mask = OldPred;
    NewResult = WrR.createWrRegion(NewResult, NewChannel,
                                   "load.wide.channel.write", CI, DL);
  }

  InstToReplace->replaceAllUsesWith(NewResult);
  return true;
}

/***********************************************************************
 * widenSIMD8GatherScatter: widen simd8 gather and scatter
 *
 * This performs two functions:
 *
 * 1. If the operation is simd8 rather than simd16, expand it into
 * simd16 instructions because the simd8 gather/scatter/atomic
 * is either removed by hw (typed gather/scatter/atomic) or semantically
 * changed by hw (untyped gather4/scatter4, data payload no longer
 * contiguous in case of simd8).
 *
 * 2. For a simd16 scatter4/gather4_typed, when r or both v and r are zero,
 * replace with undef so that they are not encoded in the vISA instruction and
 * the message skips them.
 *
 * Return: true if CI was replaced by a 16-wide instruction (CI is then
 * queued in ToErase). False if IID is not one of the intrinsics handled
 * here, or if the predicate width is already 16 or 32. Note that function 2
 * rewrites operands of CI in place and runs before the width check, so CI
 * may have been modified even when false is returned. For calls accepted by
 * isNewLoadInst the result of widenNewSIMD8Load is returned instead.
 */
bool GenXLowering::widenSIMD8GatherScatter(CallInst *CI, unsigned IID) {
  if (isNewLoadInst(CI))
    return widenNewSIMD8Load(CI, IID, ST, ToErase);

  // Operand positions used for scatter4_typed/gather4_typed below.
  // NONEED is the sentinel meaning "this intrinsic has no such operand".
  enum {
    MASK_IDX = 0,
    PRED_IDX = 1,
    SURF_IDX = 2,
    U_IDX = 3,
    DATA_IDX = 6,
    NONEED = 11
  };
  constexpr int WidenSIMD = 16;

  unsigned MaskIdx = NONEED;
  unsigned PredIdx = NONEED;
  unsigned AddrIdx = NONEED;
  unsigned DataIdx = NONEED;
  bool IsTyped = false;
  int AtomicNumSrc = (-1); // -1 means not an atomic

  // Select the operand layout for the intrinsic. For typed intrinsics AddrIdx
  // is the first of three consecutive address operands (AddrIdx, +1, +2);
  // typed atomics additionally have a LOD operand at AddrIdx + 3.
  switch (IID) {
  case GenXIntrinsic::genx_typed_atomic_add:
  case GenXIntrinsic::genx_typed_atomic_and:
  case GenXIntrinsic::genx_typed_atomic_fadd:
  case GenXIntrinsic::genx_typed_atomic_fsub:
  case GenXIntrinsic::genx_typed_atomic_fmax:
  case GenXIntrinsic::genx_typed_atomic_fmin:
  case GenXIntrinsic::genx_typed_atomic_imax:
  case GenXIntrinsic::genx_typed_atomic_imin:
  case GenXIntrinsic::genx_typed_atomic_max:
  case GenXIntrinsic::genx_typed_atomic_min:
  case GenXIntrinsic::genx_typed_atomic_or:
  case GenXIntrinsic::genx_typed_atomic_sub:
  case GenXIntrinsic::genx_typed_atomic_xchg:
  case GenXIntrinsic::genx_typed_atomic_xor: {
    DataIdx = 2;
    PredIdx = 0;
    AddrIdx = 3;
    IsTyped = true;
    AtomicNumSrc = 1;
  } break;
  case GenXIntrinsic::genx_typed_atomic_dec:
  case GenXIntrinsic::genx_typed_atomic_inc: {
    // No data source operand: DataIdx stays NONEED.
    PredIdx = 0;
    AddrIdx = 2;
    IsTyped = true;
    AtomicNumSrc = 0;
  } break;
  case GenXIntrinsic::genx_typed_atomic_cmpxchg:
  case GenXIntrinsic::genx_typed_atomic_fcmpwr: {
    // Two consecutive data source operands starting at DataIdx.
    DataIdx = 2;
    PredIdx = 0;
    AddrIdx = 4;
    IsTyped = true;
    AtomicNumSrc = 2;
  } break;
  case GenXIntrinsic::genx_scatter4_typed:
  case GenXIntrinsic::genx_gather4_typed: {
    DataIdx = DATA_IDX;
    MaskIdx = MASK_IDX;
    PredIdx = PRED_IDX;
    AddrIdx = U_IDX;
    IsTyped = true;
  } break;
  case GenXIntrinsic::genx_scatter4_scaled:
  case GenXIntrinsic::genx_gather4_scaled: {
    DataIdx = 6;
    PredIdx = 0;
    MaskIdx = 1;
    AddrIdx = 5;
  } break;
  case GenXIntrinsic::genx_svm_scatter4_scaled:
  case GenXIntrinsic::genx_svm_gather4_scaled: {
    DataIdx = 5;
    PredIdx = 0;
    MaskIdx = 1;
    AddrIdx = 4;
  } break;
  default:
    return false;
  }

  // Function 2: replace constant-zero trailing address operands (and a
  // constant-zero LOD for atomics) with undef directly on the original call.
  if (IsTyped) {
    Constant *V = dyn_cast<Constant>(CI->getArgOperand(AddrIdx + 1));
    Constant *R = dyn_cast<Constant>(CI->getArgOperand(AddrIdx + 2));
    // Only continue when R is known to be zero.
    if (R && R->isNullValue()) {
      CI->setOperand(AddrIdx + 2, UndefValue::get(R->getType()));
      if (V && V->isNullValue())
        CI->setOperand(AddrIdx + 1, UndefValue::get(V->getType()));
    }
    // check if LOD is zero for atomic
    if (AtomicNumSrc >= 0) {
      Constant *LOD = dyn_cast<Constant>(CI->getArgOperand(AddrIdx + 3));
      if (LOD && LOD->isNullValue())
        CI->setOperand(AddrIdx + 3, UndefValue::get(LOD->getType()));
    }
  }

  // The execution width is taken from the predicate operand's vector type.
  auto Width =
      cast<IGCLLVM::FixedVectorType>(CI->getArgOperand(PredIdx)->getType())
          ->getNumElements();
  if (Width == 16 || Width == 32)
    return false;

  IGC_ASSERT(Width == 1 || Width == 2 || Width == 4 || Width == 8);
  // legacy intrinsic, if width is 1/2/4/8, expand it into 16 wide
  // on PVC in legacy (no-translation) mode
  const DebugLoc &DL = CI->getDebugLoc();
  unsigned NumChannels = 1;
  if (MaskIdx != NONEED) {
    // The mask operand must be a constant; the channel count is the number
    // of bits set among its low four bits.
    NumChannels =
        (unsigned)cast<ConstantInt>(CI->getArgOperand(MaskIdx))->getZExtValue();
    NumChannels = (NumChannels & 1) + ((NumChannels & 2) >> 1) +
                  ((NumChannels & 4) >> 2) + ((NumChannels & 8) >> 3);
  }

  // Start from a copy of the original arguments and overwrite the ones that
  // have to be widened.
  SmallVector<Value *, 8> Args;
  for (unsigned i = 0; i < IGCLLVM::getNumArgOperands(CI); ++i) {
    Args.push_back(CI->getArgOperand(i));
  }
  Args[PredIdx] = ExpandPredicate(CI, PredIdx, WidenSIMD);

  // Widen the address: write the original Width elements into a 16-element
  // undef vector with a wrregion.
  Value *AddrV = CI->getArgOperand(AddrIdx);
  auto AddrV16Ty = IGCLLVM::FixedVectorType::get(
      cast<VectorType>(AddrV->getType())->getElementType(), 16);
  auto UndefAddr16 = UndefValue::get(AddrV16Ty);
  Region R(UndefAddr16);
  R.Width = Width;
  R.NumElements = Width;
  auto WAddrR = R.createWrRegion(UndefAddr16, AddrV, "", CI, DL);
  Args[AddrIdx] = WAddrR;
  // two more address arguments for typed-gather4
  if (IsTyped) {
    // An undef operand stays undef (no wrregion is emitted for it).
    AddrV = CI->getArgOperand(AddrIdx + 1);
    if (isa<UndefValue>(AddrV))
      Args[AddrIdx + 1] = UndefAddr16;
    else
      Args[AddrIdx + 1] = R.createWrRegion(UndefAddr16, AddrV, "", CI, DL);
    AddrV = CI->getArgOperand(AddrIdx + 2);
    if (isa<UndefValue>(AddrV))
      Args[AddrIdx + 2] = UndefAddr16;
    else
      Args[AddrIdx + 2] = R.createWrRegion(UndefAddr16, AddrV, "", CI, DL);
    //  typed-atomic, LOD argument
    if (AtomicNumSrc >= 0) {
      auto LODV = CI->getArgOperand(AddrIdx + 3);
      if (isa<UndefValue>(LODV))
        Args[AddrIdx + 3] = UndefAddr16;
      else
        Args[AddrIdx + 3] = R.createWrRegion(UndefAddr16, LODV, "", CI, DL);
    }
  }

  // Widen the data operand(s): one for gather/scatter, AtomicNumSrc (0, 1 or
  // 2) for atomics.
  unsigned NumSrc = (AtomicNumSrc >= 0) ? AtomicNumSrc : 1;
  for (unsigned i = 0; i < NumSrc; ++i) {
    Args[DataIdx + i] =
        ExpandAddrOrData(CI, DataIdx + i, Width, NumChannels, WidenSIMD);
  }

  Instruction *NewInst = nullptr;
  switch (IID) {
  case GenXIntrinsic::genx_scatter4_typed:
  case GenXIntrinsic::genx_scatter4_scaled:
  case GenXIntrinsic::genx_svm_scatter4_scaled: {
    // Create the 16 wide scatter4 instructions.
    Type *Tys[] = {Args[PredIdx]->getType(), Args[AddrIdx]->getType(),
                   Args[DataIdx]->getType()};
    auto Decl = GenXIntrinsic::getGenXDeclaration(CI->getModule(),
                                                  (GenXIntrinsic::ID)IID, Tys);
    NewInst = CallInst::Create(Decl, Args, "", CI);
    NewInst->setDebugLoc(DL);
    ToErase.push_back(CI);
  } break;
  default: {
    // Create the 16 wide gather4 or atomic instructions
    // When there is no data operand (atomic inc/dec), the result type is
    // derived from the original call's element type instead.
    auto DataV16Ty =
        (DataIdx == NONEED)
            ? IGCLLVM::FixedVectorType::get(
                  cast<VectorType>(CI->getType())->getElementType(), WidenSIMD)
            : Args[DataIdx]->getType();
    Type *Tys[] = {DataV16Ty, Args[PredIdx]->getType(),
                   Args[AddrIdx]->getType()};
    auto Decl = GenXIntrinsic::getAnyDeclaration(CI->getModule(), IID, Tys);
    NewInst = CallInst::Create(Decl, Args, CI->getName() + ".expand", CI);
    NewInst->setDebugLoc(DL);
    // for gather, need to generate read-region
    // The region picks Width elements out of each group of WidenSIMD elements
    // of the widened result; the vertical stride is only set when more than
    // one channel is present.
    Region R(CI->getType());
    IGC_ASSERT(WidenSIMD);
    R.Width = Width;
    R.NumElements =
        (cast<IGCLLVM::FixedVectorType>(NewInst->getType())->getNumElements() /
         WidenSIMD) *
        Width;
    if (NumChannels > 1) {
      R.VStride = WidenSIMD;
    }
    auto NewVec = R.createRdRegion(NewInst, "", CI, DL);
    CI->replaceAllUsesWith(NewVec);
    ToErase.push_back(CI);
  } break;
  }
  return true;
}

/***********************************************************************
 * lowerLSCTyped2DBlock : handle padding for the typed 2d block messages
 *
 * The data vector of a typed 2D block load/store is re-created with a padded
 * element count: the row pitch is genx::roundedVal(width in bytes, 4), and
 * the total (height * pitch) is rounded up to a multiple of the number of
 * elements in 64 bytes. A rdregion (load) or wrregion (store) converts
 * between the original and the padded vector.
 *
 * Return:  true if the call was replaced (and queued in ToErase), false if
 *          the data vector already has the padded size
 */
bool GenXLowering::lowerLSCTyped2DBlock(CallInst *CI, unsigned IID) {
  const bool IsLoad = IID == vc::InternalIntrinsic::lsc_load_2d_tgm_bti;
  const bool IsStore = IID == vc::InternalIntrinsic::lsc_store_2d_tgm_bti;
  IGC_ASSERT(IsLoad || IsStore);

  // Operands common to load and store; the store carries its data last.
  auto *CacheOpts = CI->getOperand(0);
  auto *BTIV = CI->getOperand(1);
  auto *HeightV = cast<ConstantInt>(CI->getOperand(2));
  auto *WidthV = cast<ConstantInt>(CI->getOperand(3));
  auto *XOffV = CI->getOperand(4);
  auto *YOffV = CI->getOperand(5);
  Value *StoreDataV = IsStore ? CI->getOperand(6) : nullptr;

  // The payload type is the stored value's type, or the load's result type.
  auto *PayloadTy = StoreDataV ? StoreDataV->getType() : CI->getType();
  auto *PayloadVTy = cast<IGCLLVM::FixedVectorType>(PayloadTy);
  auto *ElementTy = PayloadVTy->getElementType();
  auto NElements = PayloadVTy->getNumElements();

  // Block dimensions, width being counted in elements.
  auto Width = WidthV->getZExtValue();
  auto Height = HeightV->getZExtValue();

  auto ElementSize = DL->getTypeSizeInBits(ElementTy) / genx::ByteBits;

  auto WidthBytes = Width * ElementSize;
  auto PitchBytes =
      genx::roundedVal(WidthBytes, static_cast<decltype(WidthBytes)>(4));

  IGC_ASSERT_EXIT_MESSAGE(PitchBytes >= 4 && PitchBytes <= 64,
                          "Invalid 2d block width");

  auto Pitch = PitchBytes / ElementSize;
  auto PaddedElements = Height * Pitch;

  // Load writes the whole register, so round up to a full 64 bytes.
  auto GRFElements = 64u / ElementSize;
  if (PaddedElements % GRFElements)
    PaddedElements = (PaddedElements + GRFElements) & ~(GRFElements - 1);

  // Nothing to do when the payload already has the padded size.
  if (NElements == PaddedElements)
    return false;

  auto *PaddedVTy = IGCLLVM::FixedVectorType::get(ElementTy, PaddedElements);

  // Overload types: (result, cache opts) for load, (cache opts, data) for
  // store.
  SmallVector<Type *, 2> OverloadTys;
  if (IsStore) {
    OverloadTys.push_back(CacheOpts->getType());
    OverloadTys.push_back(PaddedVTy);
  } else {
    OverloadTys.push_back(PaddedVTy);
    OverloadTys.push_back(CacheOpts->getType());
  }

  auto *Decl = vc::getAnyDeclaration(CI->getModule(), IID, OverloadTys);

  // Region that maps the original payload onto the padded vector.
  vc::CMRegion R(ElementTy);
  R.NumElements = NElements;
  R.Stride = 1;
  R.Offset = 0;

  const bool IsExactBlock = NElements == Width * Height;
  if (IsExactBlock) {
    // One row of Width elements every Pitch elements.
    R.Width = Width;
    R.VStride = Pitch;
  } else {
    // Single contiguous run.
    R.Width = NElements;
    R.VStride = 0;
  }

  SmallVector<Value *, 7> Args = {CacheOpts, BTIV,  HeightV,
                                  WidthV,    XOffV, YOffV};

  if (IsLoad) {
    // Padded load followed by a rdregion extracting the original payload.
    auto *PaddedLoad = CallInst::Create(
        Decl, Args, CI->getName() + VALUE_NAME(".padding"), CI);
    PaddedLoad->setDebugLoc(CI->getDebugLoc());
    auto *Extracted =
        R.createRdRegion(PaddedLoad, CI->getName() + VALUE_NAME(".rdregion"),
                         CI, CI->getDebugLoc());
    CI->replaceAllUsesWith(Extracted);
  } else if (IsStore) {
    // Write the original data into an undef padded vector, then store that.
    IGC_ASSERT_EXIT(StoreDataV);
    auto *PaddedData = R.createWrRegion(UndefValue::get(PaddedVTy), StoreDataV,
                                        StoreDataV->getName() + ".wrregion", CI,
                                        CI->getDebugLoc());
    Args.push_back(PaddedData);
    auto *PaddedStore = CallInst::Create(Decl, Args, "", CI);
    PaddedStore->setDebugLoc(CI->getDebugLoc());
  }

  ToErase.push_back(CI);

  return true;
}

/***********************************************************************
 * lowerMediaWalkerAPIs : lower media walker intrinsic calls
 *
 * When the subtarget requests media walker translation:
 *   genx_thread_x  -> genx_group_id_x
 *   genx_thread_y  -> genx_group_id_y
 *   genx_get_color -> an error is emitted, the call is left in place
 *
 * Return:  whether the call was replaced, and thus is now marked for erasing
 */
bool GenXLowering::lowerMediaWalkerAPIs(CallInst *CI, unsigned IID) {
  if (!ST->translateMediaWalker())
    return false;

  auto ReplacementIID = GenXIntrinsic::not_any_intrinsic;
  switch (IID) {
  case GenXIntrinsic::genx_thread_x:
    ReplacementIID = GenXIntrinsic::genx_group_id_x;
    break;
  case GenXIntrinsic::genx_thread_y:
    ReplacementIID = GenXIntrinsic::genx_group_id_y;
    break;
  case GenXIntrinsic::genx_get_color:
    CI->getContext().emitError(CI,
                               "get_color not supported on " + ST->getCPU());
    break;
  default:
    break;
  }

  // Nothing to substitute (unknown intrinsic or the get_color error path).
  if (!GenXIntrinsic::isAnyNonTrivialIntrinsic(ReplacementIID))
    return false;

  // Call the group-id intrinsic and adapt its result to the original type.
  IRBuilder<> Builder(CI);
  auto GroupIdFn =
      GenXIntrinsic::getGenXDeclaration(CI->getModule(), ReplacementIID);
  Value *GroupId = Builder.CreateCall(GroupIdFn);
  GroupId = Builder.CreateTruncOrBitCast(GroupId, CI->getType());
  CI->replaceAllUsesWith(GroupId);
  ToErase.push_back(CI);
  return true;
}

/***********************************************************************
 * generatePredicatedWrrForNewLoad : Generate predicated wrr for the result
 *                                   of a load that needs no splits
 *
 * If getLoadSelect finds a select on the load result, that select is
 * replaced by a predicated wrregion: the load result is written into the
 * select's false value under the select's condition.
 *
 * Return: true if predicated wrr was generated
 */

bool GenXLowering::generatePredicatedWrrForNewLoad(CallInst *CI) {
  IGC_ASSERT_MESSAGE(isNewLoadInst(CI), "New load expected");

  auto *Select = getLoadSelect(CI);
  if (!Select)
    return false;

  Value *PassThru = Select->getFalseValue();
  Value *LoadPred = Select->getCondition();
  // The wrregion is inserted right after the select it replaces.
  Value *Merged =
      generatePredicatedWrregion(PassThru, CI, LoadPred, 0 /* Offset */,
                                 Select->getNextNode(), "lowerpred");
  Select->replaceAllUsesWith(Merged);
  ToErase.push_back(Select);
  return true;
}

/***********************************************************************
 * processInst : process one instruction in GenXLowering
 *
 * Dispatches on the instruction kind (and, for calls, on the intrinsic ID)
 * to the matching lowering routine.
 *
 * Return:  whether any change was made, and thus the current instruction
 *          is now marked for erasing
 */
bool GenXLowering::processInst(Instruction *Inst) {
  if (isa<InsertElementInst>(Inst))
    return lowerInsertElement(Inst);
  if (isa<ExtractElementInst>(Inst))
    return lowerExtractElement(Inst);
  // TruncInst is tested before the generic CastInst check so that truncs get
  // their own lowering.
  if (isa<TruncInst>(Inst))
    return lowerTrunc(Inst);
  if (isa<CastInst>(Inst))
    return lowerCast(Inst);
  if (auto SI = dyn_cast<SelectInst>(Inst)) {
    // Selects producing i1 (scalar or vector) are lowered specially; the
    // vector form is used when the result type equals the condition type.
    if (SI->getType()->getScalarType()->isIntegerTy(1)) {
      if (SI->getType() == SI->getCondition()->getType())
        return lowerBoolVectorSelect(SI);
      return lowerBoolScalarSelect(SI);
    }
    // Widen byte op if it necessary.
    return widenByteOp(SI);
  }
  if (auto SI = dyn_cast<ShuffleVectorInst>(Inst)) {
    if (SI->getType()->getScalarType()->isIntegerTy(1))
      return lowerBoolShuffle(SI);
    return lowerShuffle(SI);
  }
  if (isa<BinaryOperator>(Inst)) {
    // Byte widening takes priority over the opcode-specific lowerings.
    if (widenByteOp(Inst))
      return true;
    if (Inst->getOpcode() == Instruction::AShr ||
        Inst->getOpcode() == Instruction::LShr)
      return lowerShr(Inst);
    if (Inst->getOpcode() == Instruction::Mul)
      return lowerMul64(Inst);
    return false;
  }
  if (Inst->getOpcode() == Instruction::ICmp)
    return widenByteOp(Inst);
  if (auto *CI = dyn_cast<FCmpInst>(Inst))
    return lowerFCmpInst(CI);

  if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
    // Inline asm calls are left untouched.
    if (CI->isInlineAsm())
      return false;
    processTwoAddressOpnd(CI);
    // IntrinsicID stays not_any_intrinsic for indirect calls (no callee).
    unsigned IntrinsicID = GenXIntrinsic::not_any_intrinsic;
    if (Function *Callee = CI->getCalledFunction()) {
      IntrinsicID = vc::getAnyIntrinsicID(Callee);
      IGC_ASSERT(IGCLLVM::getNumArgOperands(CI) < GenXIntrinsicInfo::OPNDMASK);
    }
    IGC_ASSERT_EXIT(ST);
    // use gather/scatter to implement SLM oword load/store on
    // legacy platforms
    if (!ST->hasSLMOWord()) {
      if (translateSLMOWord(CI, IntrinsicID))
        return true;
    }
    // SIMD8 gather/scatter widening is only attempted when the GRF is larger
    // than 32 bytes.
    if (ST->getGRFByteSize() > 32) {
      if (widenSIMD8GatherScatter(CI, IntrinsicID))
        return true;
    }
    // split gather/scatter/atomic into the width legal to the target
    if (splitGatherScatter(CI, IntrinsicID))
      return true;
    else if (isNewLoadInst(CI))
      return generatePredicatedWrrForNewLoad(CI);

    switch (IntrinsicID) {
    case GenXIntrinsic::genx_all:
    case GenXIntrinsic::genx_any:
      return lowerAllAny(CI);
    case GenXIntrinsic::genx_rdregioni:
    case GenXIntrinsic::genx_rdregionf:
      return lowerRdRegion(Inst);
    case GenXIntrinsic::genx_wrregioni:
    case GenXIntrinsic::genx_wrregionf:
      return lowerWrRegion(Inst);
    case GenXIntrinsic::genx_rdpredregion:
      return lowerRdPredRegion(CI);
    case GenXIntrinsic::genx_wrpredregion:
      return lowerWrPredRegion(Inst);
    case GenXIntrinsic::not_any_intrinsic:
      break;
    case GenXIntrinsic::genx_absi:
      break; // ignore
    case GenXIntrinsic::genx_thread_x:
    case GenXIntrinsic::genx_thread_y:
    case GenXIntrinsic::genx_get_color:
      return lowerMediaWalkerAPIs(CI, IntrinsicID);
    // Note: the default label sits here, in the middle of the switch; every
    // intrinsic without an explicit case is ignored together with the
    // constant intrinsics below.
    default:
    case GenXIntrinsic::genx_constantpred:
    case GenXIntrinsic::genx_constanti:
    case GenXIntrinsic::genx_constantf:
      break; // ignore
    case GenXIntrinsic::genx_vload: {
      // Replace with a volatile load; if the result is unused the call is
      // simply dropped.
      if (!Inst->use_empty()) {
        Value *Ptr = Inst->getOperand(0);
        LoadInst *LI = new LoadInst(Inst->getType(), Ptr, "",
                                    /*volatile*/ true, Inst);
        LI->takeName(Inst);
        LI->setDebugLoc(Inst->getDebugLoc());
        Inst->replaceAllUsesWith(LI);
      }
      ToErase.push_back(Inst);
      return true;
    }
    case GenXIntrinsic::genx_vstore: {
      // Replace with a volatile store. Note that the local ST below shadows
      // the ST member used earlier in this function.
      Value *Val = Inst->getOperand(0);
      Value *Ptr = Inst->getOperand(1);
      auto ST = new StoreInst(Val, Ptr, /*volatile*/ true, Inst);
      ST->setDebugLoc(Inst->getDebugLoc());
      ToErase.push_back(Inst);
      return true;
    }
    case vc::InternalIntrinsic::lsc_load_2d_tgm_bti:
    case vc::InternalIntrinsic::lsc_store_2d_tgm_bti:
      return lowerLSCTyped2DBlock(CI, IntrinsicID);
    case GenXIntrinsic::genx_ssmul:
    case GenXIntrinsic::genx_sumul:
    case GenXIntrinsic::genx_usmul:
    case GenXIntrinsic::genx_uumul:
      return lowerGenXMul(CI, IntrinsicID);
    case GenXIntrinsic::genx_addc:
    case GenXIntrinsic::genx_subb:
      return lowerAddcSubb(CI, IntrinsicID);
    case GenXIntrinsic::genx_ssmul_sat:
    case GenXIntrinsic::genx_sumul_sat:
    case GenXIntrinsic::genx_usmul_sat:
    case GenXIntrinsic::genx_uumul_sat:
      return lowerGenXMulSat(CI, IntrinsicID);
    case GenXIntrinsic::genx_simad:
    case GenXIntrinsic::genx_uimad:
      return lowerGenXIMad(CI, IntrinsicID);
    case GenXIntrinsic::genx_lzd:
      return lowerLzd(Inst);
    case Intrinsic::debugtrap:
      return lowerDebugTrap(CI);
    case Intrinsic::trap:
      return lowerTrap(CI);
    case Intrinsic::ctpop:
      return lowerCtpop(CI);
    case Intrinsic::cttz:
      return lowerCttz(CI);
    case Intrinsic::ctlz:
      return lowerCtlz(CI);
    case Intrinsic::sqrt:
      return lowerSqrt(CI);
    case Intrinsic::sadd_sat:
    case Intrinsic::ssub_sat:
      // Signed saturating add/sub are not supported: report a fatal error.
      vc::fatal(Inst->getContext(), "GenXLowering",
                "Sorry not implemented: GenX backend cannot handle this "
                "intrinsic yet",
                Inst);
      break;
    case Intrinsic::uadd_sat:
      return lowerUAddWithSat(CI);
    case Intrinsic::usub_sat:
      return lowerUSubWithSat(CI);
    case Intrinsic::uadd_with_overflow:
      return lowerUAddWithOverflow(CI);
    case Intrinsic::sadd_with_overflow:
    case Intrinsic::ssub_with_overflow:
    case Intrinsic::usub_with_overflow:
    case Intrinsic::smul_with_overflow:
    case Intrinsic::umul_with_overflow:
      // Only uadd.with.overflow is lowered; the others emit an error and the
      // call is left unchanged.
      Inst->getContext().emitError(
          Inst, "GenX backend cannot handle overflowing intrinsics yet");
      break;
    case Intrinsic::assume:
      // llvm.assume is simply removed.
      ToErase.push_back(Inst);
      return true;
    case Intrinsic::expect:
      llvm_unreachable("Expect intrinsic should be lowered before");
    case Intrinsic::fmuladd:
      return lowerFMulAdd(CI);
    case Intrinsic::powi:
      return lowerPowI(CI);
    case Intrinsic::bitreverse:
      return lowerBitreverse(CI);
    case Intrinsic::bswap:
      return lowerByteSwap(CI);
    case Intrinsic::fshl:
    case Intrinsic::fshr:
      return lowerFunnelShift(CI, IntrinsicID);
    case Intrinsic::abs:
      return lowerAbs(CI);
    case Intrinsic::ceil:
      return lowerMathIntrinsic(CI, GenXIntrinsic::genx_rndu);
    case Intrinsic::floor:
      return lowerMathIntrinsic(CI, GenXIntrinsic::genx_rndd);
    case Intrinsic::trunc:
      return lowerMathIntrinsic(CI, GenXIntrinsic::genx_rndz);
    case Intrinsic::stacksave:
      return lowerStackSave(CI);
    case Intrinsic::stackrestore:
      return lowerStackRestore(CI);
    case Intrinsic::vector_reduce_add:
      return lowerReduction(CI, Instruction::Add);
    case Intrinsic::vector_reduce_mul:
      return lowerReduction(CI, Instruction::Mul);
    case Intrinsic::vector_reduce_fadd:
      return lowerReduction(CI, Instruction::FAdd);
    case Intrinsic::vector_reduce_fmul:
      return lowerReduction(CI, Instruction::FMul);
    case Intrinsic::vector_reduce_fmax:
      return lowerReduction(CI, Intrinsic::maxnum);
    case Intrinsic::vector_reduce_fmin:
      return lowerReduction(CI, Intrinsic::minnum);
    case Intrinsic::copysign:
      return lowerCopySign(CI);
    case GenXIntrinsic::genx_get_hwid:
      return lowerHardwareThreadID(CI);
    case vc::InternalIntrinsic::logical_thread_id:
      return lowerLogicalThreadID(CI);
    case GenXIntrinsic::genx_nbarrier_arrive:
      return lowerNamedBarrierArrive(CI);
    case GenXIntrinsic::genx_dpas:
    case GenXIntrinsic::genx_dpas_nosrc0:
    case GenXIntrinsic::genx_dpas2:
      // The genx_dpasw and genx_dpasw_nosrc0 are intentionally not handled
      // here, because they don't support bfloat accumulator.
      return lowerDpas(CI);
    }
    // Reached from every case that ends in break: nothing was replaced.
    return false;
  }
  if (ExtractValueInst *EV = dyn_cast<ExtractValueInst>(Inst))
    return lowerExtractValue(EV);
  if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Inst))
    return lowerInsertValue(IV);
  return false;
}

/***********************************************************************
 * lowerAllAny : handle all and any intrinsics
 *
 * Rewrites genx.all/genx.any so that the operand is a predicate (i1) vector:
 *  - an operand that is a sext/zext of an i1 value is replaced by that value;
 *  - any other non-i1 operand is compared against zero (icmp ne);
 *  - a single-element operand needs no intrinsic at all, the predicate is
 *    just bitcast to i1.
 * A multi-element operand that already is a predicate vector is left alone.
 *
 * Return:  whether any change was made, and thus the current instruction
 *          is now marked for erasing
 *
 */
bool GenXLowering::lowerAllAny(CallInst *CI) {
  auto *Op = CI->getArgOperand(0);

  IRBuilder<> Builder(CI);

  auto *SrcTy = cast<IGCLLVM::FixedVectorType>(Op->getType());
  auto NElem = SrcTy->getNumElements();
  Type *Int1Ty = Type::getInt1Ty(CI->getContext());
  Type *Ty = IGCLLVM::FixedVectorType::get(Int1Ty, NElem);

  const bool IsPredSrc = SrcTy->isIntOrIntVectorTy(1);

  // Already in the legal form.
  if (IsPredSrc && NElem > 1)
    return false;

  // The extension may only be stripped when its source is a predicate: the
  // new call (or bitcast) below requires an i1-typed argument. An extension
  // from a wider integer (e.g. zext <N x i8> to <N x i32>) must go through
  // the compare path instead, otherwise the argument type would not match
  // the <N x i1> declaration.
  Value *PredExtSrc = nullptr;
  if (isa<SExtInst>(Op) || isa<ZExtInst>(Op)) {
    Value *ExtSrc = cast<Instruction>(Op)->getOperand(0);
    if (ExtSrc->getType()->isIntOrIntVectorTy(1))
      PredExtSrc = ExtSrc;
  }

  Value *Arg = nullptr;

  if (PredExtSrc)
    Arg = PredExtSrc;
  else if (!IsPredSrc)
    Arg = Builder.CreateICmpNE(Op, ConstantInt::get(SrcTy, 0));
  else
    Arg = Op;

  Value *NewInst = nullptr;

  if (NElem > 1) {
    // Re-create the same intrinsic, now overloaded on the predicate type.
    auto IntrID = GenXIntrinsic::getGenXIntrinsicID(CI);
    auto *Fn = GenXIntrinsic::getGenXDeclaration(CI->getModule(), IntrID, {Ty});
    NewInst =
        Builder.CreateCall(Fn, {Arg}, CI->getName() + VALUE_NAME(".lowered"));
  } else
    // all/any of a single element is that element itself.
    NewInst = Builder.CreateBitCast(Arg, Int1Ty,
                                    CI->getName() + VALUE_NAME(".lowered"));

  CI->replaceAllUsesWith(NewInst);

  ToErase.push_back(CI);
  return true;
}

/***********************************************************************
 * lowerRdRegion : handle read region instruction
 *
 * Return:  whether any change was made, and thus the current instruction
 *          is now marked for erasing
 *
 * The only action is add sinking on the index operand. The instruction
 * itself is updated in place rather than replaced, so the result is
 * always false.
 */
bool GenXLowering::lowerRdRegion(Instruction *Inst) {
  // Sink add in address calculation.
  Use &IndexUse =
      Inst->getOperandUse(GenXIntrinsic::GenXRegion::RdIndexOperandNum);
  IndexUse.set(sinkAdd(IndexUse.get()));
  return false;
}

/***********************************************************************
 * lowerWrRegion : handle write region instruction
 *
 * Return:  whether any change was made, and thus the current instruction
 *          is now marked for erasing
 *
 * 1. Add sinking is done on the index operand. This updates the
 *    instruction in place and does not by itself make the result true.
 *
 * 2. The wrregion is then offered to widenByteOp (a predicated byte
 *    wrregion may be widened); its result is what gets returned.
 */
bool GenXLowering::lowerWrRegion(Instruction *Inst) {
  // Sink add in address calculation.
  Use &IndexUse =
      Inst->getOperandUse(GenXIntrinsic::GenXRegion::WrIndexOperandNum);
  IndexUse.set(sinkAdd(IndexUse.get()));
  // See if a predicated byte wrregion can be widened.
  const bool Widened = widenByteOp(Inst);
  return Widened;
}

/***********************************************************************
 * lowerRdPredRegion : handle read predicate region instruction
 *
 * Return:  whether any change was made, and thus the current instruction
 *          is now marked for erasing
 *
 * rdpredregion is a GenX backend internal intrinsic, and was thus created
 * within this GenXLowering pass. It is kept only if the source has at most
 * DWordBits elements and every use is in a select, a wrregion, another
 * rdpredregion, a wrpredpredregion, or a call that may access memory. In that
 * case any rdpredregion user is folded with this one into a single
 * rdpredregion of the original source. Otherwise the rdpredregion is lowered
 * to zext-to-i16 + rdregion + icmp ne.
 */
bool GenXLowering::lowerRdPredRegion(CallInst *Inst) {
  // True for a user that prevents the rdpredregion from being kept.
  auto ForcesLowering = [](auto *U) {
    if (isa<SelectInst>(U))
      return false;

    const unsigned UserIID = vc::getAnyIntrinsicID(U);
    const bool IsPredConsumer = GenXIntrinsic::isWrRegion(UserIID) ||
                                UserIID == GenXIntrinsic::genx_rdpredregion ||
                                UserIID == GenXIntrinsic::genx_wrpredpredregion;
    if (IsPredConsumer)
      return false;

    // Anything that is not a call, or a call that does not touch memory,
    // cannot take the rdpredregion directly.
    if (auto *Call = dyn_cast<CallInst>(U))
      return Call->doesNotAccessMemory();
    return true;
  };

  const bool AllUsersLegal = llvm::none_of(Inst->users(), ForcesLowering);

  Value *Src = Inst->getArgOperand(0);
  auto *StartC = cast<ConstantInt>(Inst->getArgOperand(1));
  auto *SrcTy = cast<IGCLLVM::FixedVectorType>(Src->getType());
  auto *ResTy = cast<IGCLLVM::FixedVectorType>(Inst->getType());

  const unsigned Start = StartC->getZExtValue();
  const unsigned Size = ResTy->getNumElements();
  const unsigned SrcSize = SrcTy->getNumElements();

  const bool KeepAsIs = AllUsersLegal && SrcSize <= DWordBits;

  if (KeepAsIs) {
    // The rdpredregion stays. A rdpredregion of a rdpredregion is replaced by
    // one rdpredregion reading straight from the original source.
    for (auto *U : Inst->users()) {
      auto *Inner = dyn_cast<Instruction>(U);
      const bool IsNestedRdPred =
          Inner &&
          vc::getAnyIntrinsicID(Inner) == GenXIntrinsic::genx_rdpredregion;
      if (!IsNestedRdPred)
        continue;

      const unsigned InnerStart =
          cast<ConstantInt>(Inner->getOperand(1))->getZExtValue();
      const unsigned InnerSize =
          cast<IGCLLVM::FixedVectorType>(Inner->getType())->getNumElements();
      auto *Folded =
          Region::createRdPredRegion(Src, Start + InnerStart, InnerSize, "",
                                     Inner, Inner->getDebugLoc());
      Folded->takeName(Inner);
      Inner->replaceAllUsesWith(Folded);
      ToErase.push_back(Inner);
    }
    return false;
  }

  // Need to lower it further.
  IRBuilder<> Builder(Inst);

  // Convert input to vector of short.
  auto *ShortTy = Builder.getInt16Ty();
  auto *SrcShortTy = IGCLLVM::FixedVectorType::get(ShortTy, SrcSize);
  auto SrcShort = Builder.CreateZExt(Src, SrcShortTy);

  // Use rdregion to extract the region.
  Region Sub(SrcShort);
  Sub.getSubregion(Start, Size);
  auto *Extracted = Sub.createRdRegion(SrcShort, "", Inst, Inst->getDebugLoc());

  // Convert back to predicate.
  auto *Pred = Builder.CreateICmpNE(
      Extracted, Constant::getNullValue(Extracted->getType()));

  Inst->replaceAllUsesWith(Pred);
  ToErase.push_back(Inst);
  return true;
}

/***********************************************************************
 * lowerWrPredRegion : handle write predicate region instruction
 *
 * Return:  whether any change was made, and thus the current instruction
 *          is now marked for erasing
 *
 * wrpredregion is a GenX backend internal intrinsic, and was thus created
 * within this GenXLowering pass. It is left alone when its "new value" input
 * is a compare. Otherwise both inputs are zero-extended to vectors of i16,
 * combined with a wrregion, and the result is compared against zero to get
 * a predicate again.
 */
bool GenXLowering::lowerWrPredRegion(Instruction *Inst) {
  Value *NewPred = Inst->getOperand(1);
  if (isa<CmpInst>(NewPred))
    return false;

  // Need to lower it further.
  const DebugLoc &DL = Inst->getDebugLoc();
  Value *OldPred = Inst->getOperand(0);
  Type *ShortTy = Type::getInt16Ty(Inst->getContext());

  // Zero-extends a predicate vector to a vector of short, inserting the cast
  // before Inst.
  auto ZExtToShort = [&](Value *Pred, const char *Suffix) {
    auto *PredTy = cast<IGCLLVM::FixedVectorType>(Pred->getType());
    Type *WideTy =
        IGCLLVM::FixedVectorType::get(ShortTy, PredTy->getNumElements());
    auto Ext = CastInst::Create(Instruction::ZExt, Pred, WideTy,
                                Inst->getName() + Suffix, Inst);
    Ext->setDebugLoc(DL);
    return Ext;
  };

  // Convert "old value" input, then "new value" input.
  auto OldShort = ZExtToShort(OldPred, ".lower1");
  auto NewShort = ZExtToShort(NewPred, ".lower2");

  // Use wrregion to write the new value into the old value, starting at the
  // constant offset held in operand 2.
  const auto StartIdx = cast<ConstantInt>(Inst->getOperand(2))->getZExtValue();
  const auto NewSize =
      cast<IGCLLVM::FixedVectorType>(NewShort->getType())->getNumElements();
  Region R(OldShort);
  R.getSubregion(StartIdx, NewSize);
  auto Merged = R.createWrRegion(OldShort, NewShort,
                                 Inst->getName() + ".lower3", Inst, DL);

  // Convert back to predicate.
  auto Pred = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, Merged,
                              Constant::getNullValue(Merged->getType()),
                              Inst->getName() + ".lower4", Inst);
  Pred->setDebugLoc(DL);

  // Replace uses and erase.
  Inst->replaceAllUsesWith(Pred);
  ToErase.push_back(Inst);
  return true;
}

/***********************************************************************
 * lowerInsertElement : lower InsertElement to wrregion, multiplying the
 *      index by the element size
 *
 * Three cases are distinguished:
 *  - single-element result: bitcast of the inserted scalar;
 *  - boolean vector: zext to i16, insertelement, compare against zero;
 *  - everything else: wrregion with a scaled indirect index.
 *
 * Return:  whether any change was made, and thus the current instruction
 *          is now marked for erasing
 */
bool GenXLowering::lowerInsertElement(Instruction *Inst) {
  auto *ResTy = dyn_cast<IGCLLVM::FixedVectorType>(Inst->getType());
  IGC_ASSERT(ResTy);
  const unsigned NumElts = ResTy->getNumElements();
  const DebugLoc &DL = Inst->getDebugLoc();

  Value *Vec = Inst->getOperand(0);
  Value *Elt = Inst->getOperand(1);
  Value *Idx = Inst->getOperand(2);

  Instruction *Lowered = nullptr;
  if (NumElts == 1) {
    // Special case - a 1 element result (usually turning a scalar into a 1
    // element vector) is just a bitcast of the inserted value. The index does
    // not matter: if it is not zero the result is undef anyway. Neither does
    // the original vector (usually undef), since it is overwritten or undef.
    Lowered = CastInst::Create(Instruction::BitCast, Elt, ResTy,
                               Inst->getName(), Inst);
    Lowered->setDebugLoc(DL);
  } else if (Inst->getType()->getScalarType()->isIntegerTy(1)) {
    // Boolean insertelement. We have to cast everything to i16, do the
    // insertelement, and cast it back again. All this gets further lowered
    // subsequently.
    auto ShortTy = Type::getIntNTy(Inst->getContext(), 16);
    auto ShortVecTy = IGCLLVM::FixedVectorType::get(ShortTy, NumElts);
    auto VecAsShort = CastInst::Create(Instruction::ZExt, Vec, ShortVecTy,
                                       Vec->getName() + ".casti16", Inst);
    VecAsShort->setDebugLoc(DL);
    auto EltAsShort = CastInst::Create(Instruction::ZExt, Elt, ShortTy,
                                       Elt->getName() + ".casti16", Inst);
    EltAsShort->setDebugLoc(DL);
    auto ShortInsert =
        InsertElementInst::Create(VecAsShort, EltAsShort, Idx, "", Inst);
    ShortInsert->takeName(Inst);
    ShortInsert->setDebugLoc(DL);
    Lowered = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, ShortInsert,
                              Constant::getNullValue(ShortVecTy),
                              ShortInsert->getName() + ".casti1", Inst);
    Lowered->setDebugLoc(DL);
  } else {
    // Cast and scale the index, then sink adds in the address calculation.
    Value *ByteIdx = scaleInsertExtractElementIndex(Idx, Elt->getType(), Inst);
    ByteIdx = sinkAdd(ByteIdx);
    // Create the new wrregion
    Region R(Elt);
    R.Indirect = ByteIdx;
    Lowered = R.createWrRegion(Vec, Elt, Inst->getName(),
                               Inst /*InsertBefore*/, DL);
  }

  // Change uses and mark the old inst for erasing.
  Inst->replaceAllUsesWith(Lowered);
  ToErase.push_back(Inst);
  return true;
}

/***********************************************************************
 * lowerExtractElement : lower ExtractElement to rdregion, multiplying the
 *      index by the element size
 *
 * Return:  whether any change was made, and thus the current instruction
 *          is now marked for erasing
 */
bool GenXLowering::lowerExtractElement(Instruction *Inst) {
  Instruction *Replacement = nullptr;
  if (Inst->getType()->isIntegerTy(1)) {
    // Extracting a bool: widen the whole source vector to i16, extract from
    // the widened vector, then compare against zero to get an i1 back. Each
    // of these pieces is lowered further later on.
    const DebugLoc &Loc = Inst->getDebugLoc();
    Value *BoolVec = Inst->getOperand(0);
    auto *Int16Ty = Type::getIntNTy(Inst->getContext(), 16);
    unsigned NumElts =
        cast<IGCLLVM::FixedVectorType>(BoolVec->getType())->getNumElements();
    auto *WideVecTy = IGCLLVM::FixedVectorType::get(Int16Ty, NumElts);
    auto *WideVec = CastInst::Create(Instruction::ZExt, BoolVec, WideVecTy,
                                     BoolVec->getName() + ".casti16", Inst);
    WideVec->setDebugLoc(Loc);
    auto *WideElt =
        ExtractElementInst::Create(WideVec, Inst->getOperand(1), "", Inst);
    WideElt->takeName(Inst);
    WideElt->setDebugLoc(Loc);
    Replacement = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, WideElt,
                                  Constant::getNullValue(Int16Ty),
                                  WideElt->getName() + ".casti1", Inst);
    Replacement->setDebugLoc(Loc);
  } else {
    // Non-bool element: turn the element index into an i16 byte offset ...
    Value *ByteOffset = scaleInsertExtractElementIndex(Inst->getOperand(1),
                                                       Inst->getType(), Inst);
    // ... push any constant add towards the end of the address computation ...
    ByteOffset = sinkAdd(ByteOffset);
    // ... and read the element through an indirect, scalar-result rdregion.
    Region ElementRgn(Inst);
    ElementRgn.Indirect = ByteOffset;
    Replacement = ElementRgn.createRdRegion(
        Inst->getOperand(0), Inst->getName(), Inst /*InsertBefore*/,
        Inst->getDebugLoc(), true /*AllowScalar*/);
  }
  // Redirect all users to the replacement and queue the original for erasing.
  Inst->replaceAllUsesWith(Replacement);
  ToErase.push_back(Inst);
  return true;
}

/***********************************************************************
 * scaleInsertExtractElementIndex : scale index by element byte size,
 *      and ensure it is an i16
 */
Value *GenXLowering::scaleInsertExtractElementIndex(Value *IdxVal, Type *ElTy,
                                                    Instruction *InsertBefore) {
  static_assert(genx::ByteBits);
  IntegerType *Int16Ty = Type::getInt16Ty(IdxVal->getContext());
  // Element size in whole bytes, as reported by the data layout.
  const unsigned BytesPerElement =
      DL->getTypeSizeInBits(ElTy) / genx::ByteBits;

  // A constant index is folded straight into a constant byte offset.
  if (auto *ConstIdx = dyn_cast<ConstantInt>(IdxVal))
    return ConstantInt::get(Int16Ty,
                            ConstIdx->getSExtValue() * BytesPerElement);

  // Variable index: first bring it to i16 (treated as unsigned).
  Instruction *Offset = CastInst::CreateIntegerCast(
      IdxVal, Int16Ty, false /*isSigned*/, "cast", InsertBefore);
  Offset->setDebugLoc(InsertBefore->getDebugLoc());

  // Single-byte elements need no scaling.
  if (BytesPerElement <= 1)
    return Offset;

  // Scale by the element size with a left shift by log2(size).
  Constant *ShiftAmount = ConstantInt::get(Int16Ty, genx::log2(BytesPerElement));
  Offset = BinaryOperator::Create(Instruction::Shl, Offset, ShiftAmount,
                                  "scale", InsertBefore);
  Offset->setDebugLoc(InsertBefore->getDebugLoc());
  return Offset;
}

/***********************************************************************
 * lowerTrunc : lower a TruncInst
 *
 * Return:  whether any change was made, and thus the current instruction
 *          is now marked for erasing
 *
 * A Trunc is lowered to a bitcast then a region/element read with a stride.
 * GenXCoalescing will coalesce the bitcast, so this will hopefully save
 * an instruction.
 */
bool GenXLowering::lowerTrunc(Instruction *Inst) {
  Value *InValue = Inst->getOperand(0);
  // Check for the trunc's input being a sext/zext where the original element
  // size is the same as the result of the trunc. We can just remove the
  // whole thing then. (This can arise from GenXReduceIntSize.)
  if (auto *CI = dyn_cast<CastInst>(InValue)) {
    if ((isa<SExtInst>(CI) || isa<ZExtInst>(CI)) &&
        CI->getOperand(0)->getType() == Inst->getType()) {
      // Just replace uses with the original unextended value.
      Inst->replaceAllUsesWith(CI->getOperand(0));
      ToErase.push_back(Inst);
      return true;
    }
  }

  // Lower a trunc to i1 (scalar or vector, e.g. "trunc i8 %v to i1" or
  // "trunc <32 x i16> %v to <32 x i1>") into "cmp.ne (%v & 1), 0".
  // ConstantInt::get splats the constant when given a vector type, so this
  // single path covers both the scalar and the vector form.
  if (Inst->getType()->isIntOrIntVectorTy(1)) {
    IRBuilder<> Builder(Inst);
    Value *V =
        Builder.CreateAnd(InValue, ConstantInt::get(InValue->getType(), 1));
    // The builder may fold to a constant, so only tag real instructions.
    if (auto *I = dyn_cast<Instruction>(V))
      I->setDebugLoc(Inst->getDebugLoc());
    V = Builder.CreateICmpNE(V, ConstantInt::get(V->getType(), 0));
    if (auto *I = dyn_cast<Instruction>(V))
      I->setDebugLoc(Inst->getDebugLoc());
    Inst->replaceAllUsesWith(V);
    ToErase.push_back(Inst);
    return true;
  }

  // Work out element types and element count; a scalar counts as 1 element.
  Type *InElementTy = InValue->getType();
  Type *OutElementTy = Inst->getType();
  unsigned NumElements = 1;
  if (auto *VT = dyn_cast<IGCLLVM::FixedVectorType>(InElementTy)) {
    InElementTy = VT->getElementType();
    OutElementTy = cast<VectorType>(OutElementTy)->getElementType();
    NumElements = VT->getNumElements();
  }

  // Number of output-sized elements that make up one input element; this is
  // the stride at which the truncated values are read from the bitcast.
  IGC_ASSERT(OutElementTy->getPrimitiveSizeInBits());
  unsigned Stride = InElementTy->getPrimitiveSizeInBits() /
                    OutElementTy->getPrimitiveSizeInBits();
  // Create the new bitcast: view the input as a vector of output elements.
  Instruction *BC = CastInst::Create(
      Instruction::BitCast, InValue,
      IGCLLVM::FixedVectorType::get(OutElementTy, Stride * NumElements),
      Inst->getName(), Inst /*InsertBefore*/);
  BC->setDebugLoc(Inst->getDebugLoc());
  // Create the new rdregion: one row of NumElements elements, Stride apart.
  Region R(BC);
  R.NumElements = NumElements;
  R.Stride = Stride;
  R.Width = NumElements;
  R.VStride = R.Stride * R.Width;
  Instruction *NewInst = R.createRdRegion(
      BC, Inst->getName(), Inst /*InsertBefore*/, Inst->getDebugLoc(),
      !isa<VectorType>(Inst->getType()) /*AllowScalar*/);
  // Change uses and mark the old inst for erasing.
  Inst->replaceAllUsesWith(NewInst);
  ToErase.push_back(Inst);
  return true;
}

/***********************************************************************
 * lowerCast : lower a CastInst
 *
 * Return:  whether any change was made, and thus the current instruction
 *          is now marked for erasing
 */
bool GenXLowering::lowerCast(Instruction *Inst) {
  Value *Src = Inst->getOperand(0);
  Type *DstTy = Inst->getType();
  const bool SrcIsBool = Src->getType()->getScalarType()->isIntegerTy(1);
  const bool IsBitCast = Inst->getOpcode() == Instruction::BitCast;

  // If it is a bitcast from a vector of i1 predicate to a non-i1 vector,
  // create a bitcast to scalar and then a bitcast to vector due to visa
  // requirements.
  if (IsBitCast && SrcIsBool && DstTy->isVectorTy() &&
      !DstTy->getScalarType()->isIntegerTy(1)) {
    IRBuilder<> Builder(Inst);
    // Intermediate scalar integer as wide as the whole destination vector.
    Type *ScalarIntTy =
        Type::getIntNTy(Inst->getContext(), DL->getTypeSizeInBits(DstTy));
    const char *const Name = "pred";
    Value *Scalar = Builder.CreateBitCast(Src, ScalarIntTy, Name);
    Value *Res = Builder.CreateBitCast(Scalar, DstTy, Name);
    Inst->replaceAllUsesWith(Res);
    ToErase.push_back(Inst);
    return true;
  }

  // Anything else that does not convert from (vector of) i1 is left alone,
  // as are the remaining i1 bitcasts.
  if (!SrcIsBool || IsBitCast)
    return false;

  // A zext/sext/uitofp/sitofp from (vector of) i1 is turned into a select
  // between the value that "true" converts to and zero.
  int OneVal = 0;
  switch (Inst->getOpcode()) {
  case Instruction::ZExt:
  case Instruction::UIToFP:
    // Unsigned interpretation: true is 1.
    OneVal = 1;
    break;
  case Instruction::SExt:
  case Instruction::SIToFP:
    // Signed interpretation: a set 1-bit integer is -1.
    OneVal = -1;
    break;
  default:
    IGC_ASSERT_MESSAGE(0, "unknown opcode in lowerCast");
    // Without asserts, leave the instruction untouched rather than replace
    // it with a select whose two arms are both zero.
    return false;
  }

  Instruction *NewInst = nullptr;
  if (DstTy->isFPOrFPVectorTy())
    NewInst = SelectInst::Create(Src, ConstantFP::get(DstTy, OneVal),
                                 ConstantFP::get(DstTy, 0), Inst->getName(),
                                 Inst);
  else
    NewInst = SelectInst::Create(Src, ConstantInt::get(DstTy, OneVal),
                                 ConstantInt::get(DstTy, 0), Inst->getName(),
                                 Inst);
  NewInst->setDebugLoc(Inst->getDebugLoc());
  Inst->replaceAllUsesWith(NewInst);
  ToErase.push_back(Inst);
  return true;
}

/***********************************************************************
 * lowerBoolScalarSelect : lower a SelectInst on vector of i1
 *
 * Return:  whether any change was made, and thus the current instruction
 *          is now marked for erasing
 *
 * This is a select on vector of i1 where the condition is scalar. This only
 * happens in simd control flow where an LLVM pass has optimized away the
 * conditional branch. We restore the conditional branch and create an
 * if..else..endif.
 */
bool GenXLowering::lowerBoolScalarSelect(SelectInst *SI) {
  // Control flow built here (Head is the block that held the select, Join is
  // the block the select ends up in after the split):
  //
  //        Head
  //        /  |
  // false /   | true
  //      /    |
  //  False    |
  //      \    |
  //       \   |
  //        \  |
  //        Join
  //
  BasicBlock *HeadBB = SI->getParent();

  // Split right before the select. The helper makes the new block the "true"
  // target, so swap the successors to run it on a false condition instead.
  auto *FalseTerm =
      SplitBlockAndInsertIfThen(SI->getCondition(), SI, false, nullptr, DT);
  (cast<BranchInst>(HeadBB->getTerminator()))->swapSuccessors();
  BasicBlock *FalseBB = FalseTerm->getParent();
  BasicBlock *JoinBB = FalseBB->getSingleSuccessor();
  FalseBB->setName("select.false");
  JoinBB->setName("select.true");

  // The select becomes a phi at the top of the join block: the true value
  // arrives from the head block, the false value from the new block.
  PHINode *Merge = PHINode::Create(SI->getType(), /*NumReservedValues=*/2, "",
                                   &JoinBB->front());
  Merge->takeName(SI);
  Merge->addIncoming(SI->getTrueValue(), HeadBB);
  Merge->addIncoming(SI->getFalseValue(), FalseBB);
  Merge->setDebugLoc(SI->getDebugLoc());
  SI->replaceAllUsesWith(Merge);
  ToErase.push_back(SI);

  // The direct head -> join edge is critical; give it a block of its own.
  BasicBlock *CritBB = SplitEdge(HeadBB, JoinBB, DT);
  CritBB->setName("select.crit");
  return true;
}

/***********************************************************************
 * lowerBoolVectorSelect : lower a SelectInst on (vector of) i1
 *
 * Return:  whether any change was made, and thus the current instruction
 *          is now marked for erasing
 *
 * A select on (vector of) i1 is lowered to the equivalent and/or/xor
 * instructions. No simplification is done even if an input is a constant.
 *
 * However, if the selector looks like an EM value, and the "true" operand is
 * a cmp, it is instead lowered to an llvm.genx.wrpredpredregion. Baling will
 * bale the cmp into it, resulting in a masked cmp instruction that sets bits
 * of the flag only if the corresponding EM bit is set.
 *
 * FIXME: I have seen a case where the two inputs are all false and all true.
 * Rather than try and simplify that here in the GenX backend, we should
 * try and work out how to stop LLVM generating it in the first place.
 */
bool GenXLowering::lowerBoolVectorSelect(SelectInst *Inst) {
  const DebugLoc &Loc = Inst->getDebugLoc();

  if (isa<CmpInst>(Inst->getTrueValue())) {
    // The selector counts as an EM value either directly, or when it is a
    // shufflevector that slices an EM value starting at index 0.
    Value *Selector = Inst->getCondition();
    bool SelectorIsEM = GotoJoin::isEMValue(Selector);
    if (!SelectorIsEM)
      if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Selector))
        if (ShuffleVectorAnalyzer(Shuffle).getAsSlice() == 0)
          SelectorIsEM = GotoJoin::isEMValue(Shuffle->getOperand(0));

    if (SelectorIsEM) {
      // Emit llvm.genx.wrpredpredregion at index 0 with the "new value"
      // operand as wide as the whole vector. Legalization may split it later.
      auto *PredWrite = Region::createWrPredPredRegion(
          Inst->getFalseValue(), Inst->getTrueValue(), 0, Inst->getCondition(),
          "", Inst, Loc);
      PredWrite->takeName(Inst);
      Inst->replaceAllUsesWith(PredWrite);
      ToErase.push_back(Inst);
      return true;
    }
  }

  // Generic form: (cond & true) | (false & ~cond). Nothing is simplified,
  // even for constant inputs.
  Value *Cond = Inst->getOperand(0);
  Value *TrueArm = Inst->getOperand(1);
  Value *FalseArm = Inst->getOperand(2);

  Instruction *TakenTrue = BinaryOperator::Create(BinaryOperator::And, Cond,
                                                  TrueArm, Inst->getName(), Inst);
  TakenTrue->setDebugLoc(Loc);

  Instruction *NotCond = BinaryOperator::Create(
      BinaryOperator::Xor, Cond, Constant::getAllOnesValue(Inst->getType()),
      Inst->getName(), Inst);
  NotCond->setDebugLoc(Loc);

  Instruction *TakenFalse = BinaryOperator::Create(
      BinaryOperator::And, FalseArm, NotCond, Inst->getName(), Inst);
  TakenFalse->setDebugLoc(Loc);

  Instruction *Combined = BinaryOperator::Create(
      BinaryOperator::Or, TakenTrue, TakenFalse, Inst->getName(), Inst);
  Combined->setDebugLoc(Loc);

  Inst->replaceAllUsesWith(Combined);
  ToErase.push_back(Inst);
  return true;
}

/***********************************************************************
 * lowerBoolShuffle : lower a shufflevector (element type i1)
 *
 * Return:  whether any change was made, and thus the current instruction
 *          is now marked for erasing
 *
 * We handle four cases:
 *
 * 1. A slice of the vector, which can be turned into rdpredregion.
 *
 * 2. A splat. By default we need to lower that to a select to
 *    0 or -1 then a bitcast to the vector of i1. But if the input is the
 *    result of a cmp then we can splat the cmp as an optimization.
 *
 * 3. An unslice of the vector, which can be turned into wrpredregion.
 *
 * 4. General case. Like in the splat case we convert input via select and
 *    result is then bitcasted back to vector of i1. Converted vectors are
 *    then handled by lowerShuffleToMove
 */
bool GenXLowering::lowerBoolShuffle(ShuffleVectorInst *SI) {
  ShuffleVectorAnalyzer Analyzer(SI);
  const unsigned ResultWidth =
      cast<IGCLLVM::FixedVectorType>(SI->getType())->getNumElements();

  // Case 1: a slice. llvm.genx.rdpredregion takes its element count from the
  // return type, which must have 4, 8 or 16 elements; other widths fall
  // through to the remaining cases.
  int SliceStart = Analyzer.getAsSlice();
  const bool RdPredWidthOk =
      ResultWidth == 4 || ResultWidth == 8 || ResultWidth == 16;
  if (SliceStart >= 0 && RdPredWidthOk) {
    auto *PredRead =
        Region::createRdPredRegion(SI->getOperand(0), SliceStart, ResultWidth,
                                   "", SI, SI->getDebugLoc());
    PredRead->takeName(SI);
    SI->replaceAllUsesWith(PredRead);
    ToErase.push_back(SI);
    return true;
  }

  // Case 2: a splat.
  auto SplatDesc = Analyzer.getAsSplat();
  if (SplatDesc.Input)
    return lowerBoolSplat(SI, SplatDesc.Input, SplatDesc.Index);

  // Case 3: an unslice. Operand 0 of SI is the "old value"; the "new value"
  // is operand 0 of the shufflevector feeding operand 1 of SI. The resulting
  // wrpredregion may be lowered further by GenXLowering if its "new value"
  // turns out not to be a compare.
  int UnsliceStart = Analyzer.getAsUnslice();
  if (UnsliceStart >= 0) {
    auto *Inner = cast<ShuffleVectorInst>(SI->getOperand(1));
    auto *PredWrite =
        Region::createWrPredRegion(SI->getOperand(0), Inner->getOperand(0),
                                   UnsliceStart, "", SI, SI->getDebugLoc());
    PredWrite->takeName(SI);
    SI->replaceAllUsesWith(PredWrite);
    // Drop SI's reference to the inner shuffle so that the inner shuffle can
    // be erased right away when SI was its last user.
    SI->setOperand(1, UndefValue::get(Inner->getType()));
    ToErase.push_back(SI);
    if (Inner->use_empty())
      Inner->eraseFromParent();
    return true;
  }

  // Replicated slices get their own handling (which may decline to lower).
  if (Analyzer.isReplicatedSlice())
    return lowerBoolShuffleReplicatedSlice(SI);

  // Case 4: anything else. Convert both i1 inputs to i16 vectors with a
  // select, shuffle the i16 vectors with the same mask (lowered later as a
  // non-bool shuffle), and compare against zero to return to i1.
  IRBuilder<> Builder(SI);
  const unsigned InputWidth =
      cast<IGCLLVM::FixedVectorType>(SI->getOperand(0)->getType())
          ->getNumElements();
  Constant *InputOnes = ConstantVector::getSplat(
      IGCLLVM::getElementCount(InputWidth), Builder.getInt16(1));
  Constant *InputZeros = ConstantVector::getSplat(
      IGCLLVM::getElementCount(InputWidth), Builder.getInt16(0));
  Value *WideLHS = Builder.CreateSelect(SI->getOperand(0), InputOnes, InputZeros);
  Value *WideRHS = Builder.CreateSelect(SI->getOperand(1), InputOnes, InputZeros);
  Value *WideShuffle = Builder.CreateShuffleVector(
      WideLHS, WideRHS, IGCLLVM::getShuffleMaskForBitcode(SI), SI->getName());
  Constant *ResultZeros = ConstantVector::getSplat(
      IGCLLVM::getElementCount(ResultWidth), Builder.getInt16(0));
  Value *BoolResult = Builder.CreateICmpNE(WideShuffle, ResultZeros);
  SI->replaceAllUsesWith(BoolResult);
  ToErase.push_back(SI);

  return true;
}

// Fold a replicated-slice shuffle of another replicated-slice shuffle into a
// single shufflevector on the inner shuffle's operands.
//
// Returns false (and changes nothing) unless operand 0 of Inst is itself a
// replicated-slice shufflevector and operand 1 of Inst is undef. Otherwise
// Inst is replaced, queued for erasing, and true is returned.
bool GenXLowering::lowerBoolShuffleReplicatedSlice(ShuffleVectorInst *Inst) {
  auto *Src = dyn_cast<ShuffleVectorInst>(Inst->getOperand(0));
  if (!Src)
    return false;

  if (!isa<UndefValue>(Inst->getOperand(1)))
    return false;

  ShuffleVectorAnalyzer SVA(Src);
  if (!SVA.isReplicatedSlice())
    return false;

  SmallVector<int, 32> InstMask;
  SmallVector<int, 32> SrcMask;

  Inst->getShuffleMask(InstMask);
  Src->getShuffleMask(SrcMask);

  // Compose the two masks: each lane of Inst that reads lane Idx of Src reads
  // whatever Src read for that lane. Indices at or beyond the width of Src
  // address Inst's second operand, which is known to be undef here, so they
  // become undef lanes instead of indexing past the end of SrcMask.
  const int SrcWidth = static_cast<int>(SrcMask.size());
  transform(InstMask, InstMask.begin(), [&](int Idx) {
    if (Idx < 0 || Idx >= SrcWidth)
      return UndefMaskElem;
    return SrcMask[Idx];
  });

  IRBuilder<> Builder(Inst);

  auto *NewShuffle = Builder.CreateShuffleVector(Src->getOperand(0),
                                                 Src->getOperand(1), InstMask);
  NewShuffle->takeName(Inst);
  Inst->replaceAllUsesWith(NewShuffle);
  // Detach Inst from Src so that Src can be erased immediately when Inst was
  // its only user.
  Inst->setOperand(0, UndefValue::get(Src->getType()));

  if (Src->use_empty())
    Src->eraseFromParent();

  ToErase.push_back(Inst);

  return true;
}

/***********************************************************************
 * lowerBoolSplat : lower a shufflevector (element type i1) that is a splat
 *
 * Return:  whether any change was made, and thus the current instruction
 *          is now marked for erasing
 */
bool GenXLowering::lowerBoolSplat(ShuffleVectorInst *SI, Value *In,
                                  unsigned Idx) {
  // True when Ty is a fixed vector with exactly Width elements.
  auto IsFixedVectorOfWidth = [](const Type *Ty, unsigned Width) {
    IGC_ASSERT(Ty);
    const auto *VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
    if (!VTy)
      return false;
    return VTy->getNumElements() == Width;
  };

  unsigned Width =
      cast<IGCLLVM::FixedVectorType>(SI->getType())->getNumElements();

  // Input as wide as the result: widen it to i16 with a select, replicate
  // element Idx with a zero-stride rdregion, and compare back down to i1.
  if (IsFixedVectorOfWidth(In->getType(), Width)) {
    IRBuilder<> B(SI);
    Constant *C1 = ConstantVector::getSplat(IGCLLVM::getElementCount(Width),
                                            B.getInt16(1));
    Constant *C0 = ConstantVector::getSplat(IGCLLVM::getElementCount(Width),
                                            B.getInt16(0));
    Value *V = B.CreateSelect(In, C1, C0);
    Region R(V);
    R.NumElements = Width;
    R.Stride = 0;
    R.VStride = 0;
    R.Offset = Idx * R.ElementBytes;
    V = R.createRdRegion(V, "splat", SI, SI->getDebugLoc());
    V = B.CreateICmpNE(V, C0);
    SI->replaceAllUsesWith(V);
    ToErase.push_back(SI);
    return true;
  }

  // This is a splat. See if the input is a cmp, possibly via a bitcast.
  // Only look through bitcasts whose source is itself i1-typed (i1 <-> v1i1):
  // a bitcast from any other type (e.g. i16 -> <16 x i1>) must stay in place,
  // because the code below treats In as a bool or a vector of bools indexed
  // by Idx.
  if (auto *BC = dyn_cast<BitCastInst>(In))
    if (BC->getOperand(0)->getType()->isIntOrIntVectorTy(1))
      In = BC->getOperand(0);
  if (auto *Cmp = dyn_cast<CmpInst>(In)) {
    // Create a splatted version of the cmp: splat element Idx of each operand
    // and compare the splats.
    Value *CmpOpnds[2];
    Region R(Cmp->getOperand(0));
    R.NumElements = Width;
    R.Width = R.NumElements;
    R.Stride = 0;
    R.VStride = 0;
    R.Offset = Idx * R.ElementBytes;
    for (unsigned i = 0; i != 2; ++i) {
      auto *Opnd = Cmp->getOperand(i);
      if (auto *C = dyn_cast<Constant>(Opnd)) {
        // Constant operand: splat the selected element directly.
        Constant *SplatC =
            C->getType()->isVectorTy() ? C->getAggregateElement(Idx) : C;
        CmpOpnds[i] = ConstantVector::getSplat(
            IGCLLVM::getElementCount(R.NumElements), SplatC);
        continue;
      }
      if (!isa<VectorType>(Opnd->getType())) {
        // Scalar operand: view it as a 1-vector before reading the region.
        auto *NewBC =
            CastInst::Create(Instruction::BitCast, Opnd,
                             IGCLLVM::FixedVectorType::get(Opnd->getType(), 1),
                             Opnd->getName() + ".bc", Cmp);
        NewBC->setDebugLoc(Cmp->getDebugLoc());
        Opnd = NewBC;
      }
      CmpOpnds[i] =
          R.createRdRegion(Opnd, Cmp->getOperand(i)->getName() + ".splat",
                           Cmp /*InsertBefore*/, Cmp->getDebugLoc());
    }
    auto *NewCmp = CmpInst::Create(
        Cmp->getOpcode(), Cmp->getPredicate(), CmpOpnds[0], CmpOpnds[1],
        Cmp->getName() + ".splat", Cmp /*InsertBefore*/);
    NewCmp->setDebugLoc(Cmp->getDebugLoc());
    SI->replaceAllUsesWith(NewCmp);
    ToErase.push_back(SI);
    return true;
  }

  // Default code. Reduce the input to a scalar i1 first.
  if (IsFixedVectorOfWidth(In->getType(), 1)) {
    // First convert v1i1 to i1.
    auto *NewBC = CastInst::Create(Instruction::BitCast, In,
                                   In->getType()->getScalarType(),
                                   In->getName() + ".scalar", SI);
    NewBC->setDebugLoc(SI->getDebugLoc());
    In = NewBC;
  } else if (isa<VectorType>(In->getType())) {
    // Wider vector: pick out the splatted element.
    auto *Index = ConstantInt::get(Type::getInt32Ty(SI->getContext()), Idx);
    auto *Extract =
        ExtractElementInst::Create(In, Index, SI->getName() + ".scalar", SI);
    Extract->setDebugLoc(SI->getDebugLoc());
    In = Extract;
  }

  // For widths 8, 16 and 32: select an all-ones or all-zeros integer with
  // Width bits and bitcast it to the vector of i1.
  if (Width == 8 || Width == 16 || Width == 32) {
    auto *IntTy = Type::getIntNTy(SI->getContext(), Width);
    auto *Sel = SelectInst::Create(In, Constant::getAllOnesValue(IntTy),
                                   Constant::getNullValue(IntTy),
                                   SI->getName() + ".sel", SI);
    Sel->setDebugLoc(SI->getDebugLoc());
    auto *NewBC =
        CastInst::Create(Instruction::BitCast, Sel, SI->getType(), "", SI);
    NewBC->takeName(SI);
    NewBC->setDebugLoc(SI->getDebugLoc());
    SI->replaceAllUsesWith(NewBC);
    ToErase.push_back(SI);
    return true;
  }

  // Any other width: select an i16 0/1, view it as a 1-vector, replicate it
  // Width times with a zero-stride rdregion and compare against zero.
  IRBuilder<> Builder(SI);
  auto *Val = Builder.CreateSelect(In, Builder.getInt16(1), Builder.getInt16(0),
                                   SI->getName() + ".sel");
  if (auto *Inst = dyn_cast<Instruction>(Val))
    Inst->setDebugLoc(SI->getDebugLoc());
  Val = Builder.CreateBitCast(
      Val, IGCLLVM::FixedVectorType::get(Builder.getInt16Ty(), 1));
  if (auto *Inst = dyn_cast<Instruction>(Val))
    Inst->setDebugLoc(SI->getDebugLoc());

  Region R(Val);
  R.Offset = 0;
  R.Width = 1;
  R.Stride = R.VStride = 0;
  R.NumElements = Width;
  Val = R.createRdRegion(Val, "", SI, SI->getDebugLoc());
  Val = Builder.CreateICmpNE(Val, ConstantVector::getNullValue(Val->getType()));
  Val->takeName(SI);
  if (auto *Inst = dyn_cast<Instruction>(Val))
    Inst->setDebugLoc(SI->getDebugLoc());
  SI->replaceAllUsesWith(Val);
  ToErase.push_back(SI);
  return true;
}

/***********************************************************************
 * lowerShuffleSplat : lower a ShuffleInst (element type not i1) when it is
 *                     a splat (repetition of the same element)
 */
void GenXLowering::lowerShuffleSplat(ShuffleVectorInst *SI,
                                     ShuffleVectorAnalyzer::SplatInfo Splat) {
  const DebugLoc &Loc = SI->getDebugLoc();

  // The region read below needs a vector source, so a scalar input is first
  // bitcast to a vector with a single element.
  Value *Source = Splat.Input;
  if (!isa<VectorType>(Source->getType())) {
    auto *AsVector = CastInst::Create(
        Instruction::BitCast, Source,
        IGCLLVM::FixedVectorType::get(Source->getType(), 1), SI->getName(), SI);
    AsVector->setDebugLoc(Loc);
    Source = AsVector;
  }

  // A splat is a rdregion that reads the same element for every lane: both
  // strides are zero and the offset addresses the splatted element.
  Region SplatRgn(Source);
  SplatRgn.NumElements =
      cast<IGCLLVM::FixedVectorType>(SI->getType())->getNumElements();
  SplatRgn.Width = SplatRgn.NumElements;
  SplatRgn.Stride = 0;
  SplatRgn.VStride = 0;
  SplatRgn.Offset = Splat.Index * SplatRgn.ElementBytes;

  Instruction *Read =
      SplatRgn.createRdRegion(Source, "", SI /*InsertBefore*/, Loc);
  Read->takeName(SI);
  Read->setDebugLoc(Loc);
  SI->replaceAllUsesWith(Read);
  ToErase.push_back(SI);
}

/***********************************************************************
 * lowerShuffle : lower a ShuffleInst (element type not i1)
 *
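 * Mostly these are splats, which are lowered to a rdregion with zero strides.
 * A shuffle that takes every element from the same lane of one of its operands
 * is lowered to a select; any other shuffle is lowered to a sequence of
 * rdregion/wrregion moves.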
 */
bool GenXLowering::lowerShuffle(ShuffleVectorInst *SI) {
  // Try the lowerings in order: splat, select, and finally the general
  // rdregion/wrregion move sequence. One of them always applies.
  auto SplatDesc = ShuffleVectorAnalyzer(SI).getAsSplat();
  if (SplatDesc.Input)
    lowerShuffleSplat(SI, SplatDesc);
  else if (!lowerShuffleToSelect(SI))
    lowerShuffleToMove(SI);
  return true;
}

// Lower those shufflevector that can be implemented efficiently as select.
bool GenXLowering::lowerShuffleToSelect(ShuffleVectorInst *SI) {
  const int ResultWidth =
      cast<IGCLLVM::FixedVectorType>(SI->getType())->getNumElements();

  // Every operand must be exactly as wide as the result.
  for (int OpIdx = 0, OpEnd = SI->getNumOperands(); OpIdx < OpEnd; ++OpIdx) {
    auto *OpTy =
        cast<IGCLLVM::FixedVectorType>(SI->getOperand(OpIdx)->getType());
    if (OpTy->getNumElements() != ResultWidth)
      return false;
  }

  // Every defined lane must come from the same lane of one of the two inputs
  // (an undef mask entry is reported as a negative value and is acceptable).
  for (int Lane = 0; Lane < ResultWidth; ++Lane) {
    const int Picked = SI->getMaskValue(Lane);
    const bool FromFirst = Picked == Lane;
    const bool FromSecond = Picked == Lane + ResultWidth;
    if (Picked >= 0 && !FromFirst && !FromSecond)
      return false;
  }

  // Build the constant i1 selector: 1 takes the lane from operand 0 (also
  // used for undef lanes), 0 takes it from operand 1.
  IRBuilder<> Builder(SI);
  Type *BoolTy = Builder.getInt1Ty();
  SmallVector<Constant *, 16> Selector;
  Selector.reserve(ResultWidth);
  for (int Lane = 0; Lane < ResultWidth; ++Lane) {
    const int Picked = SI->getMaskValue(Lane);
    const bool UseFirst = Picked == Lane || Picked < 0;
    Selector.push_back(ConstantInt::get(BoolTy, UseFirst ? 1 : 0));
  }

  Value *SelectorVec = ConstantVector::get(Selector);
  auto *Sel = SelectInst::Create(SelectorVec, SI->getOperand(0),
                                 SI->getOperand(1), "", SI);
  Sel->takeName(SI);
  Sel->setDebugLoc(SI->getDebugLoc());
  SI->replaceAllUsesWith(Sel);
  ToErase.push_back(SI);
  return true;
}

// Returns the first position in [First, Last) whose mask value is not the
// undef marker (-1), or Last when every value in the range is undef.
template <typename Iter> Iter skipUndefs(Iter First, Iter Last) {
  constexpr int UndefMarker = -1;
  return std::find_if_not(
      First, Last, [=](int MaskVal) { return MaskVal == UndefMarker; });
}

/***********************************************************************
 * lowerShuffleToMove : lower a ShuffleInst (element type is not i1) to a
 *                      sequence of rd/wrregion intrinsics
 */
void GenXLowering::lowerShuffleToMove(ShuffleVectorInst *SI) {
  ShuffleVectorAnalyzer Analyzer(SI);
  std::vector<ShuffleVectorAnalyzer::OperandRegionInfo> RdRegions;
  std::vector<Region> WrRegions;
  auto MaskVals = SI->getShuffleMask();

  // Walk the mask, skipping undef entries. At each defined position ask the
  // analyzer for the operand region that matches a prefix of the remaining
  // mask, and pair it with a write region covering the same result lanes.
  auto Pos = skipUndefs(MaskVals.begin(), MaskVals.end());
  while (Pos != MaskVals.end()) {
    int Idx = Pos - MaskVals.begin();
    auto OpRegion = Analyzer.getMaskRegionPrefix(Idx);
    IGC_ASSERT_MESSAGE(OpRegion.R.NumElements > 0,
                       "should've match at least 1 element region");
    Region WrRegion(SI);
    WrRegion.Offset = Idx * WrRegion.ElementBytes;
    WrRegion.NumElements = WrRegion.Width = OpRegion.R.NumElements;
    RdRegions.push_back(std::move(OpRegion));
    WrRegions.push_back(std::move(WrRegion));
    // Continue after the lanes just covered.
    Pos = skipUndefs(std::next(Pos, RdRegions.back().R.NumElements),
                     MaskVals.end());
  }

  // Produce the value for each read region: the operand itself when the
  // region spans it entirely, otherwise a rdregion of the operand.
  std::vector<Value *> RdRegionInsts;
  for (ShuffleVectorAnalyzer::OperandRegionInfo &OpRegion : RdRegions) {
    Value *Piece = nullptr;
    if (OpRegion.R.isWhole(OpRegion.Op->getType()))
      Piece = OpRegion.Op;
    else
      Piece = OpRegion.R.createRdRegion(
          OpRegion.Op, SI->getName() + ".shuffle.rd", SI, SI->getDebugLoc());
    RdRegionInsts.push_back(Piece);
  }

  // Assemble the replacement for SI.
  const unsigned ResultWidth =
      cast<IGCLLVM::FixedVectorType>(SI->getType())->getNumElements();
  Value *Result = nullptr;
  if (WrRegions.empty()) {
    // Nothing but undef lanes.
    Result = UndefValue::get(SI->getType());
  } else if (WrRegions.size() == 1 &&
             WrRegions.front().NumElements == ResultWidth) {
    // A single piece already covers the whole result.
    Result = RdRegionInsts.back();
  } else {
    // General case: chain wrregions, starting from undef, one per piece.
    Result = UndefValue::get(SI->getType());
    for (size_t I = 0, E = WrRegions.size(); I != E; ++I)
      Result = WrRegions[I].createWrRegion(Result, RdRegionInsts[I],
                                           SI->getName() + ".shuffle.wr", SI,
                                           SI->getDebugLoc());
  }

  SI->replaceAllUsesWith(Result);
  ToErase.push_back(SI);
}

/***********************************************************************
 * lowerShr : lower Shl followed by AShr/LShr by the same amount
 *    into trunc+sext/zext
 *
 * Return:  whether any change was made, and thus the current instruction
 *          is now marked for erasing
 *
 * See convertShlShr below.
 */
bool GenXLowering::lowerShr(Instruction *Inst) {
  // convertShlShr rewires the users of Inst but erases nothing itself.
  if (convertShlShr(Inst) == nullptr)
    return false; // pattern did not match, nothing changed

  ToErase.push_back(Inst);
  // The feeding shl dies too when the shift right was its only user.
  auto *FeedingShl = cast<Instruction>(Inst->getOperand(0));
  if (FeedingShl->hasOneUse())
    ToErase.push_back(FeedingShl);
  return true;
}

/***********************************************************************
 * convertShlShr : convert Shl followed by AShr/LShr by the same amount
 *    into trunc+sext/zext
 *
 * Enter:   Inst = the AShr or LShr instruction
 *
 * Return:  0 if no conversion done, else the new SExt/ZExt instruction.
 *          The original AShr/LShr is now unused, but neither original
 *          instruction is erased.
 *
 * This is the opposite to what instruction combining does! We want to change
 * it back to trunc then extend because the trunc can then be lowered into
 * a region, and the extend can sometimes be baled into whatever uses it.
 *
 * This is a separately callable global function so it can also be used
 * from GenXReduceIntSize, which for other reasons of convenience runs before
 * GenXLowering.
 */
Instruction *llvm::genx::convertShlShr(Instruction *Inst) {
  unsigned NumBits = Inst->getType()->getScalarType()->getPrimitiveSizeInBits();
  // The shift amount must be a constant ...
  auto *C = dyn_cast<Constant>(Inst->getOperand(1));
  if (!C)
    return nullptr;
  // ... and the shifted value must be a shl by that very same constant.
  auto *Shl = dyn_cast<Instruction>(Inst->getOperand(0));
  if (!Shl)
    return nullptr;
  if (Shl->getOpcode() != Instruction::Shl)
    return nullptr;
  if (Shl->getOperand(1) != C)
    return nullptr;
  // For vectors every lane must shift by the same amount.
  if (isa<VectorType>(C->getType())) {
    C = C->getSplatValue();
    if (!C)
      return nullptr;
  }
  // The amount may still be something other than a plain integer constant
  // (e.g. undef or a constant expression); give up instead of asserting.
  auto *ShiftAmt = dyn_cast<ConstantInt>(C);
  if (!ShiftAmt)
    return nullptr;
  // Clamp the amount to NumBits so that an over-wide shift can neither
  // overflow the conversion nor wrap the subtraction below.
  unsigned ShiftBits = ShiftAmt->getLimitedValue(NumBits);
  if (ShiftBits >= NumBits)
    return nullptr;
  unsigned RemainingBits = NumBits - ShiftBits;
  if (RemainingBits != 8 && RemainingBits != 16)
    return nullptr;
  // We have Shl+AShr or Shl+LShr that can be turned into trunc+sext/zext.
  Type *ConvTy = Type::getIntNTy(Inst->getContext(), RemainingBits);
  if (auto *VT = dyn_cast<IGCLLVM::FixedVectorType>(Inst->getType()))
    ConvTy = IGCLLVM::FixedVectorType::get(ConvTy, VT->getNumElements());
  auto *Trunc = CastInst::Create(Instruction::Trunc, Shl->getOperand(0), ConvTy,
                                 "", Inst);
  Trunc->takeName(Shl);
  Trunc->setDebugLoc(Inst->getDebugLoc());
  // An arithmetic shift right replicates the sign bit, a logical one fills
  // with zeros.
  auto *Ext = CastInst::Create(Inst->getOpcode() == Instruction::AShr
                                   ? Instruction::SExt
                                   : Instruction::ZExt,
                               Trunc, Inst->getType(), "", Inst);
  Ext->takeName(Inst);
  Ext->setDebugLoc(Inst->getDebugLoc());
  Inst->replaceAllUsesWith(Ext);
  return Ext;
}

/***********************************************************************
 * lowerExtractValue : remove extractvalue if possible
 *
 * Return:  whether any change was made, and thus the current instruction
 *          is now marked for erasing
 *
 * If we can trace the input of the extractvalue to the point where the
 * value was inserted, use that value instead.
 *
 * Because we have already split struct phi nodes, we should just be left
 * with insertvalue/extractvalue pairs that we can remove here. The
 * exception is when a struct is passed in to or returned from a call.
 * Then we leave the extractvalue for later handling in the register
 * allocator.
 */
bool GenXLowering::lowerExtractValue(ExtractValueInst *Inst) {
  const ArrayRef<unsigned> EVIndices = Inst->getIndices();
  // Invariant: applying Indices to V yields the value that Inst extracts.
  ArrayRef<unsigned> Indices = EVIndices;
  Value *V = Inst->getAggregateOperand();
  while (!Indices.empty()) {
    auto *IV = dyn_cast<InsertValueInst>(V);
    if (!IV)
      break;
    // We have an insertvalue. See how many of the indices agree.
    ArrayRef<unsigned> IVIndices = IV->getIndices();
    unsigned Match = 0;
    while (Match < Indices.size() && Match < IVIndices.size() &&
           Indices[Match] == IVIndices[Match])
      ++Match;
    if (Match == IVIndices.size()) {
      // The insertvalue wrote exactly the element we want, or an aggregate
      // that contains it. Continue from the inserted value with the indices
      // that are still left.
      V = IV->getInsertedValueOperand();
      Indices = Indices.slice(Match);
      continue;
    }
    if (Match == Indices.size()) {
      // The insertvalue overwrote only a part of the (sub)aggregate we are
      // extracting, so no single existing value represents it. Stop tracing.
      break;
    }
    // The index paths diverge: this insertvalue wrote some other element.
    // Go back to the previous insertvalue.
    V = IV->getAggregateOperand();
  }
  if (Indices.empty()) {
    // We have found the struct element value V.
    Inst->replaceAllUsesWith(V);
    ToErase.push_back(Inst);
    return true;
  }
  // No index was consumed: leave the extractvalue as it is.
  if (Indices.size() == EVIndices.size())
    return false;
  // We used up some indices; create a new extractvalue that applies the
  // remaining ones to the value we traced to (not to the original aggregate,
  // which the remaining indices do not describe).
  Instruction *NewEV =
      ExtractValueInst::Create(V, Indices, Inst->getName(), Inst);
  NewEV->setDebugLoc(Inst->getDebugLoc());
  Inst->replaceAllUsesWith(NewEV);
  ToErase.push_back(Inst);
  return true;
}

/***********************************************************************
 * lowerInsertValue : remove insertvalue if possible
 *
 * Return:  whether any change was made, and thus the current instruction
 *          is now marked for erasing
 *
 * In most cases, by the time we get to an insertvalue, it will be unused
 * because of extractvalue removal.
 *
 * In a case where it is still used (probably because this function has an
 * arg or return value that is a struct, or we call a function like that),
 * the struct value is dealt with in register allocation.
 */
bool GenXLowering::lowerInsertValue(InsertValueInst *Inst) {
  // A still-used insertvalue is left alone.
  if (!Inst->use_empty())
    return false;
  // Nobody reads the result any more, so just queue it for erasing.
  ToErase.push_back(Inst);
  return true;
}

/***********************************************************************
 * lowerUAddWithOverflow : lower llvm.uadd.with.overflow
 *
 * This could potentially be implemented with the vISA addc instruction.
 * However an intrinsic for that would need extra GenX backend support for
 * returning a struct containing two vectors, and that support does not exist
 * now.
 *
 * So for now we use the old DEC Alpha trick of comparing the result with
 * one of the operands.
 */
bool GenXLowering::lowerUAddWithOverflow(CallInst *CI) {
  const DebugLoc &DL = CI->getDebugLoc();
  Value *LHS = CI->getArgOperand(0);
  Value *RHS = CI->getArgOperand(1);

  // The plain (wrapping) sum.
  auto *Sum = BinaryOperator::Create(Instruction::Add, LHS, RHS,
                                     CI->getName() + ".add", CI);
  Sum->setDebugLoc(DL);

  // Overflow flag. An unsigned add has overflowed exactly when the result is
  // smaller than an operand; in that case it is smaller than both operands,
  // so comparing against either one is enough.
  auto *Overflow = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, Sum,
                                   RHS, CI->getName() + ".cmp", CI);
  Overflow->setDebugLoc(DL);

  // Gather the extractvalue users first, because detaching them below edits
  // the use list of CI.
  SmallVector<ExtractValueInst *, 4> Extracts;
  for (User *U : CI->users())
    if (auto *EVI = dyn_cast<ExtractValueInst>(U))
      Extracts.push_back(EVI);

  // Field 0 of the result is the sum, any other field is the overflow flag.
  for (ExtractValueInst *EVI : Extracts) {
    Value *Replacement = Sum;
    if (EVI->getIndices()[0])
      Replacement = Overflow;
    EVI->replaceAllUsesWith(Replacement);
    EVI->setOperand(0, UndefValue::get(CI->getType()));
    ToErase.push_back(EVI);
  }

  // Whatever still uses the intrinsic result wants the whole struct, so
  // rebuild it from the two pieces.
  if (!CI->use_empty()) {
    auto *Rebuilt = InsertValueInst::Create(UndefValue::get(CI->getType()), Sum,
                                            0, CI->getName() + ".insertadd", CI);
    Rebuilt->setDebugLoc(DL);
    Rebuilt = InsertValueInst::Create(Rebuilt, Overflow, 1,
                                      CI->getName() + ".insertcmp", CI);
    Rebuilt->setDebugLoc(DL);
    CI->replaceAllUsesWith(Rebuilt);
  }
  ToErase.push_back(CI);
  return true;
}

// Shared helper of the add+sat and sub+sat lowerings.
// Emits, before InsertPoint, a call to genx_usadd_sat (IsSignedSrc) or
// genx_uuadd_sat (!IsSignedSrc) on Arg0 and Arg1. The new call takes its
// result type, name and debug location from CI.
static CallInst *buildUAddWithSat(CallInst *CI, Value *Arg0, Value *Arg1,
                                  Instruction *InsertPoint, bool IsSignedSrc) {
  IGC_ASSERT(CI);
  const auto IID = IsSignedSrc ? GenXIntrinsic::genx_usadd_sat
                               : GenXIntrinsic::genx_uuadd_sat;
  // The intrinsic is overloaded on the result type and the source type.
  Type *OverloadTys[] = {CI->getType(), Arg0->getType()};
  auto *SatAddFn =
      GenXIntrinsic::getGenXDeclaration(CI->getModule(), IID, OverloadTys);

  IRBuilder<> Builder(InsertPoint);
  Value *CallArgs[] = {Arg0, Arg1};
  auto *SatAdd = Builder.CreateCall(SatAddFn, CallArgs, CI->getName());
  SatAdd->setDebugLoc(CI->getDebugLoc());
  return SatAdd;
}

// llvm.uadd.sat
// Maps directly onto the unsigned saturating add built by buildUAddWithSat.
bool GenXLowering::lowerUAddWithSat(CallInst *CI) {
  IGC_ASSERT(CI);
  CallInst *SatAdd =
      buildUAddWithSat(CI, CI->getArgOperand(0), CI->getArgOperand(1),
                       /*InsertPoint*/ CI, /*IsSignedSrc*/ false);
  CI->replaceAllUsesWith(SatAdd);
  ToErase.push_back(CI);
  return true;
}

// llvm.usub.sat
// sat(a - b) is emitted as sat(a + (-b)): negate the second operand, then
// build a saturating add whose sources are flagged as signed.
// NOTE(review): both operands are handed over as signed sources; confirm that
// this matches llvm.usub.sat when an operand has its top bit set.
bool GenXLowering::lowerUSubWithSat(CallInst *CI) {
  IGC_ASSERT(CI);
  IRBuilder<> Builder(CI);
  Value *Minuend = CI->getArgOperand(0);
  Value *NegSubtrahend = Builder.CreateNeg(CI->getArgOperand(1), CI->getName());
  CallInst *SatAdd = buildUAddWithSat(CI, Minuend, NegSubtrahend,
                                      /*InsertPoint*/ CI, /*IsSignedSrc*/ true);
  CI->replaceAllUsesWith(SatAdd);
  ToErase.push_back(CI);
  return true;
}

// Lower the debug trap call CI: read the predefined CR0 register, set the
// SWExceptionControl bit in CR0.1 and write the register back. CI itself is
// queued for erasing.
bool GenXLowering::lowerDebugTrap(CallInst *CI) {
  constexpr unsigned SWExceptionControl = 29;
  constexpr unsigned CR0Mask = 1 << SWExceptionControl;

  Module *M = CI->getModule();
  IRBuilder<> Builder(CI);
  const DebugLoc &DL = CI->getDebugLoc();

  // CR0 is accessed as a vector of four dwords.
  auto *Cr0Ty = IGCLLVM::FixedVectorType::get(Builder.getInt32Ty(), 4);
  auto *ReadFn = vc::getAnyDeclaration(M, GenXIntrinsic::genx_read_predef_reg,
                                       {Cr0Ty, Cr0Ty});
  auto *WriteFn = vc::getAnyDeclaration(
      M, GenXIntrinsic::genx_write_predef_reg, {Cr0Ty, Cr0Ty});

  auto *Cr0Id = Builder.getInt32(PreDefined_Vars::PREDEFINED_CR0);
  auto *OldCr0 = Builder.CreateCall(ReadFn, {Cr0Id, UndefValue::get(Cr0Ty)});

  // Scalar region that selects CR0.1 (one element at byte offset DWordBytes).
  Region Cr0Dw1{OldCr0};
  Cr0Dw1.NumElements = 1;
  Cr0Dw1.Width = 1;
  Cr0Dw1.VStride = 0;
  Cr0Dw1.Stride = 0;
  Cr0Dw1.Offset = DWordBytes;
  IGC_ASSERT(Cr0Dw1.isScalar());

  // Read-modify-write of CR0.1.
  auto *OldDw1 = Cr0Dw1.createRdRegion(OldCr0, "cr0.1", CI, DL);
  auto *NewDw1 = Builder.CreateOr(OldDw1, CR0Mask);
  auto *NewCr0 = Cr0Dw1.createWrRegion(OldCr0, NewDw1, "cr0.new", CI, DL);

  Builder.CreateCall(WriteFn, {Cr0Id, NewCr0});

  ToErase.push_back(CI);
  return true;
}

// Lower the trap call CI to a raw send (without result) that carries the r0
// payload and has the EOT modifier set. CI itself is queued for erasing.
bool GenXLowering::lowerTrap(CallInst *CI) {
  Module *M = CI->getModule();
  IRBuilder<> Builder(CI);

  // The payload is one GRF worth of dwords, read through genx_r0.
  const unsigned PayloadDWords = ST->getGRFByteSize() / DWordBytes;
  auto *PayloadTy =
      IGCLLVM::FixedVectorType::get(Builder.getInt32Ty(), PayloadDWords);
  auto *PayloadFunc =
      vc::getAnyDeclaration(M, GenXIntrinsic::genx_r0, {PayloadTy});
  auto *Payload = Builder.CreateCall(PayloadFunc, {});

  // Shared function the message goes to: Gateway (3) on targets with LSC
  // messages, Thread Spawner (7) otherwise.
  const int SFID = ST->hasLSCMessages() ? 3 : 7;

  Value *Args[] = {
      Builder.getInt8(2),           // modifier (EOT)
      Builder.getInt8(0),           // log2(exec size)
      Builder.getTrue(),            // predicate
      Builder.getInt8(1),           // number of source registers
      Builder.getInt8(SFID),        // shared function ID
      Builder.getInt32(0),          // extended message descriptor
      Builder.getInt32(0x02000010), // message descriptor
      Payload,
  };

  auto *SendFunc =
      vc::getAnyDeclaration(M, GenXIntrinsic::genx_raw_send2_noresult,
                            {Builder.getInt1Ty(), PayloadTy});
  Builder.CreateCall(SendFunc, Args);

  ToErase.push_back(CI);
  return true;
}

// Lower the ctpop call CI to genx_cbit. The cbit result is built with i32
// elements and then zero-extended or truncated to the type of CI.
bool GenXLowering::lowerCtpop(CallInst *CI) {
  IRBuilder<> Builder(CI);
  Builder.SetCurrentDebugLocation(CI->getDebugLoc());

  Type *OrigTy = CI->getType();

  // i32, or a vector of i32 as wide as the original result.
  Type *CountTy = IntegerType::getInt32Ty(CI->getContext());
  if (auto *VT = dyn_cast<IGCLLVM::FixedVectorType>(OrigTy))
    CountTy = IGCLLVM::FixedVectorType::get(CountTy, VT->getNumElements());

  auto *CBitDecl = GenXIntrinsic::getGenXDeclaration(
      CI->getModule(), GenXIntrinsic::genx_cbit, {CountTy, OrigTy});
  Value *Count = Builder.CreateCall(CBitDecl, CI->getOperand(0), CI->getName());

  // Bring the count back to the width the users expect.
  Value *Result = Builder.CreateZExtOrTrunc(Count, OrigTy);
  CI->replaceAllUsesWith(Result);
  ToErase.push_back(CI);

  return true;
}

// Lower fcmp predicates that GenX cannot deal with.
// ORD, UNO, UEQ and ONE are rewritten here into combinations of other
// compares; every other predicate is passed on to lowerUnorderedFCmpInst.
bool GenXLowering::lowerFCmpInst(FCmpInst *Inst) {
  IRBuilder<> Builder(Inst);
  Builder.SetCurrentDebugLocation(Inst->getDebugLoc());
  Value *A = Inst->getOperand(0);
  Value *B = Inst->getOperand(1);

  Value *New = nullptr;
  switch (Inst->getPredicate()) {
  case CmpInst::FCMP_ORD: {
    // True if ordered (no nans).
    //   %c = fcmp ord %a %b
    // becomes
    //   %1 = fcmp oeq %a %a
    //   %2 = fcmp oeq %b %b
    //   %c = and %1 %2
    Value *AIsNum = Builder.CreateFCmpOEQ(A, A);
    Value *BIsNum = Builder.CreateFCmpOEQ(B, B);
    New = Builder.CreateAnd(AIsNum, BIsNum);
    break;
  }
  case CmpInst::FCMP_UNO: {
    // True if unordered: isnan(X) | isnan(Y).
    //   %c = fcmp uno %a %b
    // becomes
    //   %1 = fcmp une %a %a
    //   %2 = fcmp une %b %b
    //   %c = or %1 %2
    Value *AIsNan = Builder.CreateFCmpUNE(A, A);
    Value *BIsNan = Builder.CreateFCmpUNE(B, B);
    New = Builder.CreateOr(AIsNan, BIsNan);
    break;
  }
  case CmpInst::FCMP_UEQ: {
    // UEQ cannot be replaced with NOT ONE because we do not have ONE.
    //   %c = fcmp ueq %a %b
    // becomes
    //   %1 = fcmp olt %a %b
    //   %2 = fcmp ogt %a %b
    //   %3 = or %1 %2
    //   %c = not %3
    Value *Less = Builder.CreateFCmpOLT(A, B);
    Value *Greater = Builder.CreateFCmpOGT(A, B);
    Value *Differs = Builder.CreateOr(Less, Greater);
    New = Builder.CreateNot(Differs);
    break;
  }
  case CmpInst::FCMP_ONE: {
    //   %c = fcmp one %a %b
    // becomes
    //   %1 = fcmp olt %a %b
    //   %2 = fcmp ogt %a %b
    //   %c = or %1 %2
    Value *Less = Builder.CreateFCmpOLT(A, B);
    Value *Greater = Builder.CreateFCmpOGT(A, B);
    New = Builder.CreateOr(Less, Greater);
    break;
  }
  default:
    return lowerUnorderedFCmpInst(Inst);
  }

  // Common tail of all the rewrites above.
  Inst->replaceAllUsesWith(New);
  ToErase.push_back(Inst);
  return true;
}

// Lower cttz through bitreverse + ctlz:
//   T Cttz(T src) {
//     T src_reverse = reverse_bits(src);
//     return count_leading_zeros(src_reverse);
//   }
// The second argument of the cttz call is forwarded unchanged to ctlz.
bool GenXLowering::lowerCttz(CallInst *CI) {
  IRBuilder<> Builder(CI);
  Type *Ty = CI->getType();
  Value *Src = CI->getArgOperand(0);
  Value *ZeroFlag = CI->getArgOperand(1);

  Value *Reversed = Builder.CreateIntrinsic(Intrinsic::bitreverse, {Ty}, {Src});
  Value *LeadingZeros =
      Builder.CreateIntrinsic(Intrinsic::ctlz, {Ty}, {Reversed, ZeroFlag});

  CI->replaceAllUsesWith(LeadingZeros);
  ToErase.push_back(CI);
  return true;
}

// The hardware only supports the ctlz operation for 32-bit integers, so 8 and
// 16 bit ctlz (scalar or vector) is lowered as follows, shown for i8:
//   1. zext i8 -> i32
//   2. ctlz.i32
//   3. res = ctlz.i32 - (32 - 8)
//   4. trunc back to the original type
// Any other element width is left untouched and false is returned.
bool GenXLowering::lowerCtlz(CallInst *CI) {
  Type *OrigTy = CI->getType();
  const unsigned OpTypeWidth =
      cast<IntegerType>(OrigTy->getScalarType())->getBitWidth();
  if (OpTypeWidth != 8 && OpTypeWidth != 16)
    return false;

  IRBuilder<> Builder(CI);
  Type *WideTy = Builder.getInt32Ty();
  if (auto *SrcVTy = dyn_cast<IGCLLVM::FixedVectorType>(OrigTy))
    WideTy = IGCLLVM::FixedVectorType::get(WideTy, SrcVTy->getNumElements());

  Value *Wide = Builder.CreateZExt(CI->getOperand(0), WideTy);
  Value *WideCount = Builder.CreateIntrinsic(Intrinsic::ctlz, {WideTy},
                                             {Wide, CI->getArgOperand(1)});
  // The zero extension added (32 - width) leading zeros; take them off again.
  Constant *ExtraZeros = ConstantInt::get(WideTy, (32 - OpTypeWidth));
  Value *Result = Builder.CreateSub(WideCount, ExtraZeros);
  Result = Builder.CreateTrunc(Result, OrigTy);

  CI->replaceAllUsesWith(Result);
  ToErase.push_back(CI);
  return true;
}

// UNE is the only unordered fcmp predicate that is supported. Every other
// unordered predicate that reaches this function is rewritten as the negation
// of the compare with the inverse predicate. Ordered predicates and UNE are
// left alone (returns false).
bool GenXLowering::lowerUnorderedFCmpInst(FCmpInst *Inst) {
  const CmpInst::Predicate Pred = Inst->getPredicate();
  if (CmpInst::isOrdered(Pred) || Pred == CmpInst::FCMP_UNE)
    return false;

  // For UNO and UEQ we have replacement in lowerFCmpInst.
  IGC_ASSERT(Pred != CmpInst::FCMP_UNO);
  IGC_ASSERT(Pred != CmpInst::FCMP_UEQ);

  // Both new instructions go right after Inst: first the inverse compare,
  // then its negation.
  Instruction *InsertPt = Inst->getNextNode();
  const DebugLoc &DL = Inst->getDebugLoc();

  CmpInst *Inverse = CmpInst::Create(
      Inst->getOpcode(), CmpInst::getInversePredicate(Pred),
      Inst->getOperand(0), Inst->getOperand(1),
      Inst->getName() + ".ordered.inversed", InsertPt);
  Inverse->setDebugLoc(DL);

  Instruction *Negated =
      BinaryOperator::CreateNot(Inverse, Inverse->getName() + ".not", InsertPt);
  Negated->setDebugLoc(DL);

  Inst->replaceAllUsesWith(Negated);
  ToErase.push_back(Inst);

  return true;
}

// Lower llvm.sqrt to a GenX sqrt intrinsic.
// genx_sqrt is used for non-double types when the call carries the afn flag
// or the subtarget has no IEEE div/sqrt; genx_ieee_sqrt is used otherwise.
bool GenXLowering::lowerSqrt(CallInst *CI) {
  IGC_ASSERT_MESSAGE(vc::getAnyIntrinsicID(CI) == Intrinsic::sqrt,
                     "llvm.sqrt expected");
  Type *ResTy = CI->getType();
  const bool IsDouble = ResTy->getScalarType()->isDoubleTy();
  const bool IsFast =
      !IsDouble && (CI->hasApproxFunc() || !ST->hasIEEEDivSqrt());

  GenXIntrinsic::ID SqrtID = GenXIntrinsic::genx_ieee_sqrt;
  if (IsFast)
    SqrtID = GenXIntrinsic::genx_sqrt;

  auto *SqrtDecl =
      GenXIntrinsic::getGenXDeclaration(CI->getModule(), SqrtID, {ResTy});
  IRBuilder<> Builder(CI);
  Value *Result =
      Builder.CreateCall(SqrtDecl, {CI->getArgOperand(0)}, CI->getName());

  CI->replaceAllUsesWith(Result);
  ToErase.push_back(CI);
  return true;
}

// Some GenX-specific mul intrinsics require special lowering.
// Handles genx_uumul / genx_ssmul that produce a 64-bit result from narrower
// operands when MulDDQ is not used:
//  - i8 and i16 sources: multiply into i16 / i32 and extend the product;
//  - other sources: use genx_uimad / genx_simad with a zero addend and
//    combine the two halves it returns.
bool GenXLowering::lowerGenXMul(CallInst *CI, unsigned IID) {
  // DxD -> Q mul should be handled specially if MulDDQ is NOT desired
  if (ST->useMulDDQ())
    return false;
  const bool IsMixedSign =
      IID == GenXIntrinsic::genx_sumul || IID == GenXIntrinsic::genx_usmul;
  if (IsMixedSign)
    return false;
  IGC_ASSERT(IID == GenXIntrinsic::genx_uumul ||
             IID == GenXIntrinsic::genx_ssmul);
  const bool IsSigned = IID == GenXIntrinsic::genx_ssmul;

  // Only a 64-bit result computed from non-64-bit sources is of interest.
  if (!CI->getType()->getScalarType()->isIntegerTy(64))
    return false;
  Value *LH = CI->getOperand(0);
  Value *RH = CI->getOperand(1);
  IGC_ASSERT(LH->getType() == RH->getType());
  Type *SrcTy = LH->getType();
  Type *SrcEltTy = SrcTy->getScalarType();
  if (SrcEltTy->isIntegerTy(64))
    return false;

  IRBuilder<> B(CI);

  const bool IsI8 = SrcEltTy->isIntegerTy(8);
  if (IsI8 || SrcEltTy->isIntegerTy(16)) {
    // The result can't exceed 32 bit. Get rid of 64-bit multiplication
    Type *MulTy = B.getIntNTy(IsI8 ? 16 : 32);
    if (auto *SrcVTy = dyn_cast<IGCLLVM::FixedVectorType>(SrcTy))
      MulTy = IGCLLVM::FixedVectorType::get(MulTy, SrcVTy->getNumElements());

    auto *MulFunc =
        GenXIntrinsic::getAnyDeclaration(CI->getModule(), IID, {MulTy, SrcTy});
    auto *Mul = B.CreateCall(MulFunc, {LH, RH}, CI->getName());

    Value *Ext = nullptr;
    if (IsSigned)
      Ext = B.CreateSExt(Mul, CI->getType(), CI->getName() + ".sext");
    else
      Ext = B.CreateZExt(Mul, CI->getType(), CI->getName() + ".zext");
    CI->replaceAllUsesWith(Ext);
    ToErase.push_back(CI);
    return true;
  }

  // Remaining case: multiply through imad with a zero addend.
  const auto IMadIID =
      IsSigned ? GenXIntrinsic::genx_simad : GenXIntrinsic::genx_uimad;
  auto *IMadFunc = GenXIntrinsic::getGenXDeclaration(CI->getModule(), IMadIID,
                                                     {SrcTy, SrcTy});
  auto *Zero = Constant::getNullValue(SrcTy);
  auto *IMad = B.CreateCall(IMadFunc, {LH, RH, Zero});

  // Field 1 of the imad result is the low half, field 0 the high half.
  auto *MulLo = B.CreateExtractValue(IMad, {1}, CI->getName() + ".lo.");
  auto *MulHi = B.CreateExtractValue(IMad, {0}, CI->getName() + ".hi.");

  IGC_ASSERT(MulLo->getType() == MulHi->getType());
  const bool HalvesAreScalar = !MulLo->getType()->isVectorTy();
  if (HalvesAreScalar) {
    // The halves are combined as vectors, so wrap scalars into <1 x i32>.
    auto *VTy = IGCLLVM::FixedVectorType::get(B.getInt32Ty(), 1);
    MulLo = B.CreateBitCast(MulLo, VTy, MulLo->getName() + "v.");
    MulHi = B.CreateBitCast(MulHi, VTy, MulHi->getName() + "v.");
  }

  IVSplitter SplitBuilder(*CI);
  Value *Result = SplitBuilder.combineLoHiSplit(
      {MulLo, MulHi}, CI->getName() + ".", CI->getType()->isIntegerTy());
  CI->replaceAllUsesWith(Result);
  ToErase.push_back(CI);
  return true;
}

// Lower integer DWxDW mul with saturation since it is not supported on HW.
// The product is computed by genx_ssmul / genx_uumul in a type twice as wide
// as the operands and then narrowed with the matching genx_*trunc_sat.
// Operand widths other than 32 bits are left alone (returns false).
bool GenXLowering::lowerGenXMulSat(CallInst *CI, unsigned IntrinsicID) {
  Type *ResType = CI->getType();
  IGC_ASSERT(ResType->isIntOrIntVectorTy());

  IGC_ASSERT(CI->getOperand(0)->getType() == CI->getOperand(1)->getType());
  Type *OpType = CI->getOperand(0)->getType();
  IGC_ASSERT(OpType->isIntOrIntVectorTy());

  const unsigned OpTypeWidth =
      cast<IntegerType>(OpType->getScalarType())->getBitWidth();
  IGC_ASSERT_MESSAGE(OpTypeWidth != 64, "i64 types are not supported");

  if (OpTypeWidth != 32)
    return false;

  // Decode the signedness of the result and of the operands from the
  // intrinsic: in the <r><o>mul_sat names, <r> describes the result and <o>
  // the operands.
  bool IsSignedRes = false;
  bool IsSignedOps = false;
  switch (IntrinsicID) {
  case GenXIntrinsic::genx_uumul_sat:
    break;
  case GenXIntrinsic::genx_usmul_sat:
    IsSignedOps = true;
    break;
  case GenXIntrinsic::genx_sumul_sat:
    IsSignedRes = true;
    break;
  case GenXIntrinsic::genx_ssmul_sat:
    IsSignedRes = true;
    IsSignedOps = true;
    break;
  default:
    llvm_unreachable("Inst should be *mul.sat intrinsic");
  }

  // Create type that doesn't overflow in multiplication.
  Type *MulType = IntegerType::get(OpType->getContext(), 2 * OpTypeWidth);
  if (auto *OpVTy = dyn_cast<IGCLLVM::FixedVectorType>(OpType))
    MulType = IGCLLVM::FixedVectorType::get(MulType, OpVTy->getNumElements());

  IRBuilder<> B(CI);
  Module *M = CI->getModule();

  // Full-width product.
  const auto MulIID =
      IsSignedOps ? GenXIntrinsic::genx_ssmul : GenXIntrinsic::genx_uumul;
  auto *MulFunc =
      GenXIntrinsic::getGenXDeclaration(M, MulIID, {MulType, OpType});
  auto *Mul = B.CreateCall(MulFunc, {CI->getOperand(0), CI->getOperand(1)},
                           CI->getName());

  // Saturating truncation back to the result type.
  GenXIntrinsic::ID TruncSatIID;
  if (IsSignedRes)
    TruncSatIID = IsSignedOps ? GenXIntrinsic::genx_sstrunc_sat
                              : GenXIntrinsic::genx_sutrunc_sat;
  else
    TruncSatIID = IsSignedOps ? GenXIntrinsic::genx_ustrunc_sat
                              : GenXIntrinsic::genx_uutrunc_sat;
  auto *TruncSatFunc =
      GenXIntrinsic::getGenXDeclaration(M, TruncSatIID, {ResType, MulType});
  auto *Result = B.CreateCall(TruncSatFunc, {Mul}, CI->getName() + ".sat");

  CI->replaceAllUsesWith(Result);
  ToErase.push_back(CI);
  return true;
}

// Generic lowering for mul64.
// GenX backend does not support 64-bit multiplication, so it is lowered to a
// sequence of 32-bit operations on the lo/hi halves of the operands:
//   {Cari, ResL} = full product of A.Lo * B.Lo
//   ResH         = A.Hi * B.Lo + (Cari + A.Lo * B.Hi)
// Returns false when Inst is not an i64 operation.
bool GenXLowering::lowerMul64(Instruction *Inst) {
  IVSplitter SplitBuilder(*Inst);
  if (!SplitBuilder.IsI64Operation())
    return false;

  IRBuilder<> Builder(Inst);

  auto A = SplitBuilder.splitOperandLoHi(0);
  auto B = SplitBuilder.splitOperandLoHi(1);

  Module *M = Inst->getModule();
  Value *ResL = nullptr;
  Value *Cari = nullptr;

  if (!ST->useMulDDQ()) {
    // create uimad intrinsic for DxD->Q multiplication
    Type *Ty = A.Lo->getType();
    auto *UMadIFunc = GenXIntrinsic::getGenXDeclaration(
        M, GenXIntrinsic::genx_uimad, {Ty, Ty});

    auto *Zero = Constant::getNullValue(Ty);
    Value *IMadArgs[] = {A.Lo, B.Lo, Zero};
    auto *UMadI = Builder.CreateCall(UMadIFunc, IMadArgs);

    // Field 1 holds the low half, field 0 the high half.
    ResL = Builder.CreateExtractValue(UMadI, {1});
    Cari = Builder.CreateExtractValue(UMadI, {0}, ".cari");
  } else {
    // Create uumul intrinsic for DxD->Q multiplication
    Type *Tys[] = {Inst->getType(), A.Lo->getType()};
    Function *UUMulFunc =
        GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_uumul, Tys);

    Value *MulArgs[] = {A.Lo, B.Lo};
    auto *UUMul = Builder.CreateCall(UUMulFunc, MulArgs);
    auto Halves = SplitBuilder.splitValueLoHi(*UUMul);

    Cari = Halves.Hi;
    ResL = Halves.Lo;
  }

  // Add the two cross products to the high half.
  auto *CrossLoHi = Builder.CreateMul(A.Lo, B.Hi);
  auto *Partial = Builder.CreateAdd(Cari, CrossLoHi);
  auto *CrossHiLo = Builder.CreateMul(A.Hi, B.Lo);
  auto *ResH = Builder.CreateAdd(CrossHiLo, Partial);

  // create the bitcast to the destination-type
  auto *Replace = SplitBuilder.combineLoHiSplit({ResL, ResH}, "mul64",
                                                Inst->getType()->isIntegerTy());
  Inst->replaceAllUsesWith(Replace);
  ToErase.push_back(Inst);
  return true;
}

// Pair of values holding the low and the high parts of a result that is
// produced in two halves (used below for madw/imad results).
// Note that brace-initialization follows the member order: {Lo, Hi}.
struct LoHiRes {
  Value *Lo; // low part of the result
  Value *Hi; // high part of the result
};

// The result of madw is a single vector. This function reads the low parts
// from the OpWidth elements that start at element 0 and the high parts from
// the OpWidth elements that start at the middle of the vector (ResWidth / 2),
// where OpWidth is the width of the first madw operand.
// The rdregions are created at the insert point of IRB and the pieces are
// returned as struct {Lo, Hi}.
static LoHiRes getLoHiFromMadw(CallInst *Madw, IRBuilder<> &IRB) {
  IGC_ASSERT(Madw);
  IGC_ASSERT_MESSAGE(Madw->getType()->isVectorTy(),
                     "result of madw must be i32 vector type");

  auto *OpVTy =
      cast<IGCLLVM::FixedVectorType>(Madw->getArgOperand(0)->getType());
  auto *ResVTy = cast<IGCLLVM::FixedVectorType>(Madw->getType());
  const unsigned OpWidth = OpVTy->getNumElements();
  const unsigned HiStart = ResVTy->getNumElements() / 2;

  // High parts.
  Region HiRegion{Madw};
  HiRegion.getSubregion(HiStart, OpWidth);
  auto *Hi = HiRegion.createRdRegion(Madw, Madw->getName() + ".hi",
                                     &*IRB.GetInsertPoint(),
                                     IRB.getCurrentDebugLocation());

  // Low parts.
  Region LoRegion{Madw};
  LoRegion.getSubregion(0, OpWidth);
  auto *Lo = LoRegion.createRdRegion(Madw, Madw->getName() + ".lo",
                                     &*IRB.GetInsertPoint(),
                                     IRB.getCurrentDebugLocation());

  return {Lo, Hi};
}

// Emit one genx_smadw (Signed) or genx_umadw call at the insert point of IRB.
// Args are the three madw operands; all of them must have the same vector
// type whose width is a power of two not greater than one GRF of dwords.
// The result type is a vector of 2 * (GRF size in dwords) elements.
CallInst *buildLegalMadw(ArrayRef<Value *> Args, bool Signed, const Twine &Name,
                         const GenXSubtarget &ST, IRBuilder<> &IRB) {
  IGC_ASSERT_MESSAGE(Args.size() == 3, "madw must have 3 arguments");
  IGC_ASSERT_MESSAGE(Args[0]->getType()->isVectorTy() &&
                         Args[0]->getType()->isIntOrIntVectorTy(),
                     "madw support only i32 vector types");

  const unsigned TargetWidth = ST.getGRFByteSize() / genx::DWordBytes;
  auto *VecTy = cast<IGCLLVM::FixedVectorType>(Args[0]->getType());
  IGC_ASSERT(std::all_of(Args.begin(), Args.end(), [VecTy](auto Arg) {
    return Arg->getType() == VecTy;
  }));
  const unsigned OpWidth = VecTy->getNumElements();
  IGC_ASSERT_MESSAGE(OpWidth <= TargetWidth && isPowerOf2_32(OpWidth),
                     "attempt to build madw with incorrect argument width");

  // The result always spans two GRFs worth of elements.
  auto *RetTy =
      IGCLLVM::FixedVectorType::get(VecTy->getScalarType(), 2 * TargetWidth);
  Type *OverloadTys[2] = {RetTy, VecTy};

  const auto IID =
      Signed ? GenXIntrinsic::genx_smadw : GenXIntrinsic::genx_umadw;
  Module *M = IRB.GetInsertPoint()->getModule();
  Function *MadwFunc = GenXIntrinsic::getGenXDeclaration(M, IID, OverloadTys);
  return IRB.CreateCall(MadwFunc, Args, Name);
}

// Join one result of madw to another.
// Writes the JoinedRes.(Lo|Hi) vector into the Res.(Lo|Hi) vector starting at
// element StartIdx and returns the updated {Lo, Hi} pair. The wrregions are
// created at the insert point of IRB.
static LoHiRes joinLoHiResults(const LoHiRes &Res, const LoHiRes &JoinedRes,
                               unsigned StartIdx, IRBuilder<> &IRB) {
  IGC_ASSERT_MESSAGE(JoinedRes.Hi->getType() == JoinedRes.Lo->getType(),
                     "Lo and Hi parts of madw result must have the same type");
  IGC_ASSERT_MESSAGE(Res.Hi->getType() == Res.Lo->getType(),
                     "Lo and Hi parts of madw result must have the same type");
  IGC_ASSERT_MESSAGE(JoinedRes.Hi->getType()->isVectorTy(),
                     "result of madw must be i32 vector");
  IGC_ASSERT_MESSAGE(Res.Hi->getType()->isVectorTy(),
                     "result of madw must be i32 vector");

  // Lo and Hi have the same type, so one region describes both writes.
  const unsigned PieceWidth =
      cast<IGCLLVM::FixedVectorType>(JoinedRes.Hi->getType())->getNumElements();
  Region Dest{Res.Hi};
  Dest.getSubregion(StartIdx, PieceWidth);

  Value *NewHi = Dest.createWrRegion(
      Res.Hi, JoinedRes.Hi,
      JoinedRes.Hi->getName() + ".join.hi" + Twine(StartIdx),
      &*IRB.GetInsertPoint(), IRB.getCurrentDebugLocation());

  Value *NewLo = Dest.createWrRegion(
      Res.Lo, JoinedRes.Lo,
      JoinedRes.Lo->getName() + ".join.lo" + Twine(StartIdx),
      &*IRB.GetInsertPoint(), IRB.getCurrentDebugLocation());

  return {NewLo, NewHi};
}

// Return the Size elements of Opnd that start at element StartIdx.
// A constant operand is sliced directly with getConstantSubvector; for any
// other value a rdregion is created at the insert point of IRB.
Value *getSplitOperand(Value *Opnd, unsigned StartIdx, unsigned Size,
                       IRBuilder<> &IRB) {
  if (auto *ConstOpnd = dyn_cast<Constant>(Opnd))
    return getConstantSubvector(ConstOpnd, StartIdx, Size);

  Region Piece{Opnd->getType()};
  Piece.getSubregion(StartIdx, Size);
  return Piece.createRdRegion(Opnd,
                              Opnd->getName() + ".split" + Twine(StartIdx),
                              &*IRB.GetInsertPoint(),
                              IRB.getCurrentDebugLocation());
}

// Build imad out of madw calls and return its {Lo, Hi} result.
// If the operand width is a power of two that fits into one GRF of dwords, a
// single madw is emitted. Otherwise the operands are cut into pieces of such
// widths, one madw is emitted per piece and the partial results are joined.
static LoHiRes buildIMadWithMadw(ArrayRef<Value *> Args, bool Signed,
                                 const Twine &Name, const GenXSubtarget &ST,
                                 IRBuilder<> &IRB) {
  IGC_ASSERT_MESSAGE(Args.size() == 3, "madw must have 3 arguments");
  IGC_ASSERT_MESSAGE(Args[0]->getType()->isVectorTy() &&
                         Args[0]->getType()->isIntOrIntVectorTy(),
                     "madw support only i32 vector types");

  auto *OpTy = cast<IGCLLVM::FixedVectorType>(Args[0]->getType());
  IGC_ASSERT(std::all_of(Args.begin(), Args.end(),
                         [OpTy](auto Arg) { return Arg->getType() == OpTy; }));

  const unsigned OpWidth = OpTy->getNumElements();
  const unsigned TargetWidth = ST.getGRFByteSize() / genx::DWordBytes;

  // Already legal: no splitting needed.
  const bool IsLegalWidth = OpWidth <= TargetWidth && isPowerOf2_32(OpWidth);
  if (IsLegalWidth)
    return getLoHiFromMadw(buildLegalMadw(Args, Signed, Name, ST, IRB), IRB);

  LoHiRes Res = {UndefValue::get(OpTy), UndefValue::get(OpTy)};
  for (unsigned StartIdx = 0, SplitWidth = 0; StartIdx < OpWidth;
       StartIdx += SplitWidth) {
    // Take the largest power of two that fits into the remaining elements,
    // capped by the GRF width.
    SplitWidth = std::min(
        TargetWidth, static_cast<unsigned>(PowerOf2Floor(OpWidth - StartIdx)));

    std::array<Value *, 3> SplitArgs;
    unsigned ArgNo = 0;
    for (Value *Arg : Args)
      SplitArgs[ArgNo++] = getSplitOperand(Arg, StartIdx, SplitWidth, IRB);

    CallInst *Madw = buildLegalMadw(SplitArgs, Signed,
                                    Name + ".split" + Twine(StartIdx), ST, IRB);
    Res = joinLoHiResults(Res, getLoHiFromMadw(Madw, IRB), StartIdx, IRB);
  }

  return Res;
}

// Build imad out of a DxD->Q multiplication and return its {Lo, Hi} result.
// Args[0] * Args[1] is computed with genx_ssmul (Signed) or genx_uumul into a
// vector of i64; Args[2], unless it is a zero constant, is extended to i64
// and added; the 64-bit result is then split into its lo and hi halves.
static LoHiRes buildIMadWithMulDDQ(ArrayRef<Value *> Args, bool Signed,
                                   const Twine &Name, IRBuilder<> &IRB) {
  IGC_ASSERT_MESSAGE(Args.size() == 3, "imad must have 3 arguments");
  IGC_ASSERT_MESSAGE(Args[0]->getType()->isVectorTy() &&
                         Args[0]->getType()->isIntOrIntVectorTy(32),
                     "i32 vector type expected");

  auto *OpTy = Args[0]->getType();
  IGC_ASSERT(std::all_of(Args.begin(), Args.end(),
                         [OpTy](auto Arg) { return Arg->getType() == OpTy; }));

  // i64 vector of the same width as the operands.
  const unsigned Width = cast<IGCLLVM::FixedVectorType>(OpTy)->getNumElements();
  auto *VInt64Ty = IGCLLVM::FixedVectorType::get(
      Type::getInt64Ty(OpTy->getContext()), Width);

  Type *Tys[] = {VInt64Ty, OpTy};
  Module *M = IRB.GetInsertPoint()->getModule();
  const auto MulIID =
      Signed ? GenXIntrinsic::genx_ssmul : GenXIntrinsic::genx_uumul;
  Function *I64MulFunc = GenXIntrinsic::getGenXDeclaration(M, MulIID, Tys);

  Value *Res = IRB.CreateCall(I64MulFunc, {Args[0], Args[1]}, Name + ".mul64");

  // If the last argument of imad isn't zeroinitializer we should produce add
  auto *ConstAddend = dyn_cast<Constant>(Args[2]);
  const bool AddendIsZero = ConstAddend && ConstAddend->isZeroValue();
  if (!AddendIsZero) {
    const auto ExtOpcode = Signed ? Instruction::SExt : Instruction::ZExt;
    auto *Ext = CastInst::Create(ExtOpcode, Args[2], VInt64Ty,
                                 Args[2]->getName() + ".cast",
                                 &*IRB.GetInsertPoint());
    Ext->setDebugLoc(IRB.getCurrentDebugLocation());
    Res = IRB.CreateAdd(Res, Ext, Name + ".add64");
  }

  unsigned BaseOpIdx = 0;
  IVSplitter SplitBuilder(*IRB.GetInsertPoint(), &BaseOpIdx);
  auto Halves = SplitBuilder.splitValueLoHi(*Res);

  return {Halves.Lo, Halves.Hi};
}

// Redirect all users of IMad to the lowered result Res.
// extractvalue users are replaced directly (index 0 -> Res.Hi, index 1 ->
// Res.Lo) and erased. If any other user exists, the struct value is rebuilt
// with insertvalues at the insert point of IRB and replaces IMad.
void replaceUsesOfIMad(CallInst *IMad, const LoHiRes &Res, IRBuilder<> &IRB) {
  SmallVector<Instruction *, 2> DeadExtracts;
  bool HasStructUser = false;

  for (const auto &U : IMad->uses()) {
    User *Usr = U.getUser();
    IGC_ASSERT_MESSAGE(isa<Instruction>(Usr),
                       "expected that user is instruction");
    auto *ExtVal = dyn_cast<ExtractValueInst>(Usr);
    if (!ExtVal) {
      HasStructUser = true;
      continue;
    }
    unsigned Idx = *ExtVal->idx_begin();
    IGC_ASSERT_MESSAGE(
        ExtVal->getNumIndices() == 1 && (Idx == 0 || Idx == 1),
        "index of extractvalue from imad result must be 0 or 1");
    // Index 0 is the Hi part, index 1 is the Lo part.
    ExtVal->replaceAllUsesWith(Idx == 0 ? Res.Hi : Res.Lo);
    DeadExtracts.push_back(ExtVal);
  }

  // Erase the extractvalues only now (not while walking the use list), so
  // that they are not seen as imad users by the replacement below.
  for (Instruction *Dead : DeadExtracts)
    Dead->eraseFromParent();

  if (!HasStructUser)
    return;

  Value *Rebuilt =
      IRB.CreateInsertValue(UndefValue::get(IMad->getType()), Res.Hi, 0);
  Rebuilt = IRB.CreateInsertValue(Rebuilt, Res.Lo, 1);
  IMad->replaceAllUsesWith(Rebuilt);
}

// Lower genx_simad / genx_uimad.
// The lowering goes through a 64-bit multiplication when the subtarget uses
// MulDDQ, and through a sequence of madw otherwise. Scalar operands are
// handled as <1 x T> vectors and the results are cast back afterwards.
// After that imad isn't supported.
bool GenXLowering::lowerGenXIMad(CallInst *CI, unsigned IntrinsicID) {
  IGC_ASSERT_MESSAGE(IntrinsicID == GenXIntrinsic::genx_simad ||
                         IntrinsicID == GenXIntrinsic::genx_uimad,
                     "expected genx_*imad");

  std::array<Value *, 3> Args = {CI->getArgOperand(0), CI->getArgOperand(1),
                                 CI->getArgOperand(2)};
  auto *OpTy = CI->getArgOperand(0)->getType();
  IGC_ASSERT(std::all_of(Args.begin(), Args.end(),
                         [OpTy](auto Arg) { return Arg->getType() == OpTy; }));

  IRBuilder<> Builder{CI};
  const bool IsScalar = !OpTy->isVectorTy();
  if (IsScalar) {
    // The builders below work on vectors only.
    auto *VecTy = IGCLLVM::FixedVectorType::get(OpTy, 1);
    for (Value *&Arg : Args)
      Arg = Builder.CreateBitCast(Arg, VecTy, Arg->getName() + ".bitcast");
  }

  const bool Signed = (IntrinsicID == GenXIntrinsic::genx_simad);
  LoHiRes Res =
      ST->useMulDDQ()
          ? buildIMadWithMulDDQ(Args, Signed, CI->getName(), Builder)
          : buildIMadWithMadw(Args, Signed, CI->getName(), *ST, Builder);

  if (IsScalar) {
    // Back from <1 x T> to the scalar type, Hi first and then Lo.
    Res.Hi =
        Builder.CreateBitCast(Res.Hi, OpTy, Res.Hi->getName() + ".bitcast");
    Res.Lo =
        Builder.CreateBitCast(Res.Lo, OpTy, Res.Lo->getName() + ".bitcast");
  }

  replaceUsesOfIMad(CI, Res, Builder);

  ToErase.push_back(CI);
  return true;
}

// Lower an lzd whose operand is a 64-bit integer into two 32-bit lzd calls.
// Returns false (nothing changed) if the splitter does not report an i64
// operation; otherwise the instruction is replaced and queued for erasure.
bool GenXLowering::lowerLzd(Instruction *Inst) {
  const unsigned OpIndex = 0;
  IVSplitter SplitBuilder(*Inst, &OpIndex);
  if (!SplitBuilder.IsI64Operation())
    return false;

  IRBuilder<> Builder(Inst);

  auto Halves = SplitBuilder.splitOperandLoHi(0);
  Type *HalfTy = Halves.Lo->getType();

  auto *VTy32 = cast<VectorType>(HalfTy);
  IGC_ASSERT(VTy32->getScalarType() == Builder.getInt32Ty());
  auto *AllZero = ConstantInt::getNullValue(VTy32);
  auto *ThirtyTwo = ConstantInt::get(VTy32, 32);

  auto *LzdDecl = GenXIntrinsic::getAnyDeclaration(
      Inst->getModule(), GenXIntrinsic::genx_lzd, {HalfTy});

  // The 64-bit count is assembled from the two 32-bit counts:
  //   CountLo = lzd(Lo)
  //   CountHi = lzd(Hi)
  //   Result  = (Hi == 0) ? (CountLo + 32) : CountHi
  Value *CountLo = Builder.CreateCall(LzdDecl, Halves.Lo, "lower.lzd64.lo.");
  Value *CountHi = Builder.CreateCall(LzdDecl, Halves.Hi, "lower.lzd64.hi.");
  Value *HiIsZero =
      Builder.CreateICmpEQ(Halves.Hi, AllZero, "lower.lzd64.hicmp.");
  Value *CountViaLo =
      Builder.CreateAdd(CountLo, ThirtyTwo, "lower.lzd64.lores.");
  Value *Result =
      Builder.CreateSelect(HiIsZero, CountViaLo, CountHi, "lower.lzd32.");

  // TODO: allow lzd to have type of destination != operand type
  const bool WantsI64Result =
      Inst->getType()->getScalarType()->isIntegerTy(64);
  if (WantsI64Result)
    Result = Builder.CreateZExt(Result,
                                VectorType::getExtendedElementVectorType(VTy32),
                                "lower.lzd64.");

  // Match the scalar/vector shape of the original instruction if required.
  if (auto *Reshaped =
          scalarizeOrVectorizeIfNeeded(cast<Instruction>(Result), Inst))
    Result = Reshaped;

  Inst->replaceAllUsesWith(Result);
  Result->takeName(Inst);
  ToErase.push_back(Inst);
  return true;
}

// Lower a 64-bit llvm.bitreverse into two 32-bit bitreverse calls.
// Always returns true; the original call is queued for erasure.
bool GenXLowering::lower64Bitreverse(CallInst *CI) {
  // %1 = call i64 @llvm.bitreverse.i64(i64 %in)
  //  to
  // {%inL, %inH} = split.32 %in
  // %inRL = bitreverse.32 %inL
  // %inRH = bitreverse.32 %inH
  // %res  = combine.64 {lo = %inRH, hi = %inRL}

  auto *InType = CI->getType();
  IGC_ASSERT(InType->getScalarSizeInBits() == 64);
  IRBuilder<> IRB{CI};
  auto Split = IVSplitter(*CI).splitValueLoHi(*CI->getOperand(0));
  auto *ResTy = Split.Lo->getType();
  Value *LoReverse =
      IRB.CreateIntrinsic(Intrinsic::bitreverse, {ResTy}, {Split.Lo});
  Value *HiReverse =
      IRB.CreateIntrinsic(Intrinsic::bitreverse, {ResTy}, {Split.Hi});
  // Bit i of the result is bit (63 - i) of the input, so besides reversing
  // each half the halves themselves must trade places: the reversed high half
  // becomes the low half of the result and vice versa.
  Value *Result = IVSplitter(*CI).combineLoHiSplit(
      {HiReverse, LoReverse}, CI->getName() + ".", InType->isIntegerTy());

  CI->replaceAllUsesWith(Result);
  ToErase.push_back(CI);
  return true;
}

// Lower llvm.bitreverse through the 32-bit genx.bfrev intrinsic. Example:
//   %1 = call i8 @llvm.bitreverse.i8(i8 %in)
// becomes
//   %1.zext = zext i8 %in to i32
//   %halfres.bigger = call i32 @genx.bfrev.i32(i32 %1.zext)
//   %res.bigger = lshr i32 %halfres.bigger, 24    ; 32 - 8
//   %res = trunc i32 %res.bigger to i8
// Elements wider than 32 bits are delegated to lower64Bitreverse.
bool GenXLowering::lowerBitreverse(CallInst *CI) {
  IGC_ASSERT(CI);
  LLVMContext &Ctx = CI->getContext();

  Value *Src = CI->getOperand(0);
  Type *SrcTy = Src->getType();

  // i32, or <N x i32> when the source is a fixed vector of N elements.
  Type *WideTy = Type::getInt32Ty(Ctx);
  if (auto *SrcVTy = dyn_cast<IGCLLVM::FixedVectorType>(SrcTy))
    WideTy = IGCLLVM::FixedVectorType::get(WideTy, SrcVTy->getNumElements());

  auto *BfrevDecl = GenXIntrinsic::getGenXDeclaration(
      CI->getModule(), GenXIntrinsic::genx_bfrev, {WideTy});

  const auto SrcBits = SrcTy->getScalarSizeInBits();
  const int ShiftSize = 32 - SrcBits;
  if (ShiftSize < 0)
    return lower64Bitreverse(CI);

  Value *ShiftAmount = ConstantInt::get(WideTy, ShiftSize);

  IRBuilder<> Builder(CI);
  Value *Widened = Builder.CreateZExt(Src, WideTy);
  Value *Reversed = Builder.CreateCall(BfrevDecl->getFunctionType(), BfrevDecl,
                                       {Widened}, "bfRev");
  // After the 32-bit reverse the interesting bits sit at the top; move them
  // down unless the source was already 32 bits wide.
  if (ShiftSize > 0)
    Reversed = Builder.CreateLShr(Reversed, ShiftAmount, "lshl");
  Value *Narrowed = Builder.CreateTrunc(Reversed, SrcTy);

  CI->replaceAllUsesWith(Narrowed);
  ToErase.push_back(CI);

  return true;
}

// Swap the low and high halves of every element of the integer vector Arg.
// New instructions are emitted through Builder at its current insert point.
// Returns either a genx_rol call (same type as Arg) or the final wrregion
// call, whose type is the half-width vector <2*K x iN/2>.
Value *GenXLowering::swapLowHighHalves(IRBuilder<> &Builder, Value *Arg) const {
  IGC_ASSERT(isa<IGCLLVM::FixedVectorType>(Arg->getType()));

  Module *M = Builder.GetInsertPoint()->getModule();

  auto *VTy = cast<IGCLLVM::FixedVectorType>(Arg->getType());
  auto *ETy = VTy->getElementType();

  auto ElementBits = ETy->getIntegerBitWidth();
  auto NumElements = VTy->getNumElements();

  // If target platform supports bit rotate operations, it's more efficient to
  // rotate a vector element by a half of its bit width as follows
  //   %res = @rol(<K x iN> %arg, N / 2)
  if ((ElementBits < QWordBits && ST->hasBitRotate()) || ST->has64BitRotate()) {
    auto *Func = GenXIntrinsic::getAnyDeclaration(M, GenXIntrinsic::genx_rol,
                                                  {VTy, VTy});
    // Splat of the rotate amount (half of the element width).
    auto *AmountV =
        ConstantVector::getSplat(IGCLLVM::getElementCount(NumElements),
                                 ConstantInt::get(ETy, ElementBits / 2));
    return Builder.CreateCall(Func, {Arg, AmountV});
  }

  // Swap low and high halves for each vector element as follows:
  //   %cast = bitcast <K x iN> %arg to <2*K x iN/2>
  //   %stride = 2
  //   %lo = @rdregion(%cast, %stride, 0)
  //   %hi = @rdregion(%cast, %stride, (N/2) / ByteBits)
  //   %tmp = @wrregion(undef, %lo, %stride, (N/2) / ByteBits)
  //   %res = @wrregion(%tmp, %hi, %stride, 0)
  auto *SliceETy = Builder.getIntNTy(ElementBits / 2);
  auto *SliceVTy = IGCLLVM::FixedVectorType::get(SliceETy, NumElements * 2);
  auto *SliceHalfVTy = IGCLLVM::FixedVectorType::get(SliceETy, NumElements);

  auto *Cast = Builder.CreateBitCast(Arg, SliceVTy);

  auto *RdRgnFunc = GenXIntrinsic::getAnyDeclaration(
      M, GenXIntrinsic::genx_rdregioni,
      {SliceHalfVTy, SliceVTy, Builder.getInt16Ty()});
  auto *WrRgnFunc = GenXIntrinsic::getAnyDeclaration(
      M, GenXIntrinsic::genx_wrregioni,
      {SliceVTy, SliceHalfVTy, Builder.getInt16Ty(), Builder.getInt1Ty()});

  // Region parameters: one element per row, rows two half-elements apart,
  // i.e. every other half-width element starting at the given offset.
  auto *VStride = Builder.getInt32(2);
  auto *Width = Builder.getInt32(1);
  auto *Stride = Builder.getInt32(0);
  auto *ParentWidth = UndefValue::get(Builder.getInt32Ty());

  // Offsets of the low and the high half inside an element, in bytes.
  auto *LoOffset = Builder.getInt16(0);
  auto *HiOffset = Builder.getInt16(SliceETy->getIntegerBitWidth() / ByteBits);

  auto *Lo = Builder.CreateCall(
      RdRgnFunc, {Cast, VStride, Width, Stride, LoOffset, ParentWidth});
  auto *Hi = Builder.CreateCall(
      RdRgnFunc, {Cast, VStride, Width, Stride, HiOffset, ParentWidth});

  // Write the low halves into the high positions, then the high halves into
  // the low positions; both writes use an all-true predicate.
  auto *LoToHi = Builder.CreateCall(
      WrRgnFunc, {UndefValue::get(SliceVTy), Lo, VStride, Width, Stride,
                  HiOffset, ParentWidth, Builder.getTrue()});
  auto *HiToLo =
      Builder.CreateCall(WrRgnFunc, {LoToHi, Hi, VStride, Width, Stride,
                                     LoOffset, ParentWidth, Builder.getTrue()});

  return HiToLo;
}

//
// Byte reversal lowering.
// Expands
//   %res = call iN @llvm.bswap.iN(iN %arg)
// (undoing some of LLVM's InstCombine transformations) into a chain of
// half-swaps, each of which is a rdregion/wrregion pair or a rotate-left.
//
bool GenXLowering::lowerByteSwap(CallInst *CI) {
  IGC_ASSERT(CI);
  Type *BSwapTy = CI->getType();
  IGC_ASSERT(BSwapTy->isIntOrIntVectorTy());

  const unsigned ElementBits = BSwapTy->getScalarSizeInBits();
  IGC_ASSERT_MESSAGE(isPowerOf2_32(ElementBits) && ElementBits >= WordBits &&
                         ElementBits <= QWordBits,
                     "Unexpected integer type of llvm.bswap intrinsic");

  // A scalar is treated as a one-element vector.
  unsigned Lanes = 1;
  if (isa<IGCLLVM::FixedVectorType>(BSwapTy))
    Lanes = cast<IGCLLVM::FixedVectorType>(BSwapTy)->getNumElements();
  const unsigned TotalBits = ElementBits * Lanes;

  llvm::IRBuilder<> Builder(CI);

  // Swap halves of ever wider slices: first the two bytes of each word, then
  // the two words of each dword, and so on up to the element width.
  Value *Current = CI->getArgOperand(0);
  for (auto SliceBits = WordBits; SliceBits <= ElementBits; SliceBits *= 2) {
    auto *SliceVTy = IGCLLVM::FixedVectorType::get(
        Builder.getIntNTy(SliceBits), TotalBits / SliceBits);
    Value *AsSlices = Builder.CreateBitCast(Current, SliceVTy);
    Current = swapLowHighHalves(Builder, AsSlices);
  }

  Value *Final = Builder.CreateBitCast(Current, BSwapTy);
  CI->replaceAllUsesWith(Final);
  ToErase.push_back(CI);
  return true;
}

// Lower llvm.fshl / llvm.fshr.
//
// CI          - the funnel shift call (operands: Fst, Snd, ShiftAmnt).
// IntrinsicID - Intrinsic::fshl or Intrinsic::fshr.
//
// When both data operands are the same value and the subtarget has a rotate
// of the required width, the call becomes genx_rol/genx_ror. Otherwise it is
// expanded into plain shifts. Always returns true; CI is queued for erasure.
bool GenXLowering::lowerFunnelShift(CallInst *CI, unsigned IntrinsicID) {
  IGC_ASSERT(CI);
  unsigned BitWidth = CI->getType()->getScalarSizeInBits();
  IGC_ASSERT(isPowerOf2_32(BitWidth) && BitWidth <= 64);
  Value *Fst = CI->getOperand(0), *Snd = CI->getOperand(1);
  Value *ShiftAmnt = CI->getOperand(2);
  const bool IsLeft = IntrinsicID == Intrinsic::fshl;
  IRBuilder<> Builder(CI);
  // If Fst == Snd, funnel shift is equivalent to rotate. Lower to appropriate
  // intrinsic on supported platforms (>= ICLLP)
  // 64-bit rotates are supported only on PVC+. On unsupported platforms lower
  // it as generic funnel shift, so it will be emulated later.
  if (((ST->hasBitRotate() && BitWidth != 64) || ST->has64BitRotate()) &&
      Fst == Snd) {
    auto RotateIntrinsicID =
        IsLeft ? GenXIntrinsic::genx_rol : GenXIntrinsic::genx_ror;
    auto *Decl = GenXIntrinsic::getGenXDeclaration(
        CI->getModule(), RotateIntrinsicID, {CI->getType(), CI->getType()});
    Value *Rotate = Builder.CreateCall(Decl, {Fst, ShiftAmnt}, "rotate");
    CI->replaceAllUsesWith(Rotate);
    ToErase.push_back(CI);
    return true;
  }

  // With 1-bit elements the shift amount modulo the width is always zero, so
  // the result is simply the first (fshl) or the second (fshr) operand.
  if (BitWidth == 1) {
    CI->replaceAllUsesWith(IsLeft ? Fst : Snd);
    ToErase.push_back(CI);
    return true;
  }

  // Otherwise, emulate it with next sequence:
  // ShiftAmnt &= (BitWidth - 1)
  // Inv = (BitWidth - 1) - ShiftAmnt
  // For fshl: Res = (Fst << ShiftAmnt) | ((Snd >> 1) >> Inv)
  // For fshr: Res = ((Fst << 1) << Inv) | (Snd >> ShiftAmnt)
  //
  // The complementary shift is split in two steps on purpose: a single shift
  // by (BitWidth - ShiftAmnt) would shift by the full bit width when
  // ShiftAmnt is zero, which yields poison in LLVM IR instead of zero.
  ShiftAmnt = Builder.CreateAnd(ShiftAmnt, BitWidth - 1, "shiftamnt");
  Constant *One = ConstantInt::get(CI->getType(), 1);
  Constant *MaxShift = ConstantInt::get(CI->getType(), BitWidth - 1);
  Value *ComplementShiftAmnt =
      Builder.CreateSub(MaxShift, ShiftAmnt, "complementshiftamnt");
  if (IsLeft) {
    Fst = Builder.CreateShl(Fst, ShiftAmnt, "fstpart");
    Snd = Builder.CreateLShr(Snd, One, "sndpart.pre");
    Snd = Builder.CreateLShr(Snd, ComplementShiftAmnt, "sndpart");
  } else {
    Fst = Builder.CreateShl(Fst, One, "fstpart.pre");
    Fst = Builder.CreateShl(Fst, ComplementShiftAmnt, "fstpart");
    Snd = Builder.CreateLShr(Snd, ShiftAmnt, "sndpart");
  }
  Value *Res = Builder.CreateOr(Fst, Snd, "funnelshift");
  CI->replaceAllUsesWith(Res);
  ToErase.push_back(CI);
  return true;
}

// Replace the call with an llvm.fma call that takes the same operands and
// has the same result type. Always returns true; CI is queued for erasure.
bool GenXLowering::lowerFMulAdd(CallInst *CI) {
  IGC_ASSERT(CI);

  Function *FmaDecl = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::fma,
                                                {CI->getType()});
  SmallVector<Value *, 3> Operands{CI->args()};

  // The replacement is inserted right before the original call and inherits
  // its debug location.
  CallInst *Replacement =
      CallInst::Create(FmaDecl, Operands, CI->getName(), CI);
  Replacement->setDebugLoc(CI->getDebugLoc());

  CI->replaceAllUsesWith(Replacement);
  ToErase.push_back(CI);
  return true;
}

// Lower llvm.powi(x, n) into llvm.pow(x, sitofp(n)) marked as approximate
// (afn). Always returns true; CI is queued for erasure.
bool GenXLowering::lowerPowI(CallInst *CI) {
  IGC_ASSERT(CI);
  IRBuilder<> IRB{CI};
  Type *ResTy = CI->getType();
  auto *Decl = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::pow,
                                         {ResTy});

  Value *Power = CI->getOperand(1);
  Value *FPPower = nullptr;
  if (ResTy->isVectorTy() && !Power->getType()->isVectorTy()) {
    // For a vector powi the exponent remains a scalar integer. A direct
    // SIToFP from a scalar to a vector type is not valid IR, so convert to
    // the scalar floating-point type first and broadcast the result.
    auto *ScalarPower = IRB.CreateSIToFP(Power, ResTy->getScalarType());
    FPPower = IRB.CreateVectorSplat(
        cast<IGCLLVM::FixedVectorType>(ResTy)->getNumElements(), ScalarPower);
  } else {
    FPPower = IRB.CreateSIToFP(Power, ResTy);
  }

  auto *Pow = IRB.CreateCall(Decl, {CI->getOperand(0), FPPower}, CI->getName());
  Pow->setHasApproxFunc(true);
  CI->replaceAllUsesWith(Pow);
  ToErase.push_back(CI);
  return true;
}

// Operand indices of the two sources of addc/subb. They are passed by address
// to IVSplitter when the 64-bit operands are split into 32-bit halves.
constexpr unsigned AIdx = 0;
constexpr unsigned BIdx = 1;

// Get structure index for value: maps genx_addc / genx_subb / genx_add3c to
// the index of the arithmetic result inside the structure returned by that
// intrinsic. Any other intrinsic ID is treated as unreachable.
static inline auto getXId(unsigned IntrinsicID) {
  switch (IntrinsicID) {
  case GenXIntrinsic::genx_addc:
    return llvm::GenXIntrinsic::GenXResult::IdxAddc_Add;
  case GenXIntrinsic::genx_subb:
    return llvm::GenXIntrinsic::GenXResult::IdxSubb_Sub;
  case GenXIntrinsic::genx_add3c:
    return llvm::GenXIntrinsic::GenXResult::IdxAdd3c_Add;
  default:
    IGC_ASSERT_UNREACHABLE();
  }
}

// Get structure index for carry/borrow: maps genx_addc / genx_subb /
// genx_add3c to the index of the carry (or borrow) flag inside the structure
// returned by that intrinsic. Any other intrinsic ID is treated as
// unreachable.
static inline auto getCId(unsigned IntrinsicID) {
  switch (IntrinsicID) {
  case GenXIntrinsic::genx_addc:
    return llvm::GenXIntrinsic::GenXResult::IdxAddc_Carry;
  case GenXIntrinsic::genx_subb:
    return llvm::GenXIntrinsic::GenXResult::IdxSubb_Borrow;
  case GenXIntrinsic::genx_add3c:
    return llvm::GenXIntrinsic::GenXResult::IdxAdd3c_Carry;
  default:
    IGC_ASSERT_UNREACHABLE();
  }
}

/*
  VISA inst expected sequence:
  [addc|subb].32 x1 c1 a1 b1     (1)
  [addc|subb].32 x2 c2 a2 b2     (2)
  [addc|subb].32 x2 c3 x2 c1     (2)
  or.32       c  c2 c3        (3)

  {a1, a2} = rdregion.32 a
  {b1, b2} = rdregion.32 b
  {x1, c1} = [addc|subb].32 a1 b1
  {x_, c_} = [addc|subb].32 a2 b2
  {x2, c2} = [addc|subb].32 x_ c1
  c32 = or.32 c2 c_
  c = zextend.64 c32
  // Result
  x = wrregion.64 {x1, x2}
  result = struct {x, c}
*/

// The two members extracted from the structure returned by an
// addc/subb/add3c call.
struct AddcRes {
  Value *X; // arithmetic result
  Value *C; // carry or borrow
};

// Emit two extractvalue instructions on the struct produced by Addc: the
// member at idxX (result) and the member at idxC (carry/borrow).
AddcRes GenExtractFromStruct(IRBuilder<> &IRB, CallInst *Addc, unsigned idxX,
                             unsigned idxC) {
  AddcRes Parts;
  Parts.X = IRB.CreateExtractValue(Addc, {idxX}, Addc->getName() + ".X.");
  Parts.C = IRB.CreateExtractValue(Addc, {idxC}, Addc->getName() + ".C.");
  return Parts;
}

// Build the replacement for a 64-bit addc/subb call out of three 32-bit
// calls of the same intrinsic plus an `or` of the carries. All new
// instructions are inserted before CI. Returns the insertvalue chain that
// forms a structure of CI's type ({result, carry}); CI itself is not touched.
// IntrinsicID selects between genx_addc and genx_subb.
auto *Generate32bitSequence(CallInst *CI, unsigned IntrinsicID) {
  auto *A = CI->getOperand(0);
  auto *B = CI->getOperand(1);
  auto *InType = A->getType();

  IRBuilder<> IRB{CI};
  auto Size = cast<IGCLLVM::FixedVectorType>(InType)->getNumElements();
  auto Opc = GenXIntrinsic::ID(IntrinsicID);
  auto *RetTy = CI->getType();
  // 32-bit vector with the same number of elements as the 64-bit input.
  auto *ConvertTy = IGCLLVM::FixedVectorType::get(IRB.getInt32Ty(), Size);

  auto ASplit = IVSplitter(*CI, &AIdx).splitValueLoHi(*A);
  auto BSplit = IVSplitter(*CI, &BIdx).splitValueLoHi(*B);

  auto *Decl = GenXIntrinsic::getGenXDeclaration(CI->getModule(), Opc,
                                                 {ConvertTy, ConvertTy});
  // {x1, c1} = addc.32 a1 b1
  auto AddcLo = GenExtractFromStruct(
      IRB,
      IRB.CreateCall(Decl, {ASplit.Lo, BSplit.Lo}, CI->getName() + "addc.1."),
      getXId(Opc), getCId(Opc));
  // {x_, c_} = addc.32 a2 b2
  auto Addc_ = GenExtractFromStruct(
      IRB,
      IRB.CreateCall(Decl, {ASplit.Hi, BSplit.Hi}, CI->getName() + "addc.2."),
      getXId(Opc), getCId(Opc));
  // {x2, c2} = addc.32 x_ c1
  auto AddcHi = GenExtractFromStruct(
      IRB, IRB.CreateCall(Decl, {Addc_.X, AddcLo.C}, CI->getName() + "addc.3."),
      getXId(Opc), getCId(Opc));
  // c = or.32 c2 c_
  auto *C = cast<llvm::Instruction>(IRB.CreateOr(AddcHi.C, Addc_.C));
  // ==> Pack result to structure
  // The 64-bit value is rebuilt from the low result x1 and the high result x2.
  Value *Result =
      IVSplitter(*C).combineLoHiSplit({AddcLo.X, AddcHi.X}, "Result", false);
  // Struct = CI->getType()
  auto *PrevVal = UndefValue::get(RetTy);
  // Widen the 32-bit carry to the type of the combined result.
  C = cast<llvm::Instruction>(IRB.CreateZExt(C, Result->getType()));

  auto *ArrResult = IRB.CreateInsertValue(PrevVal, Result, getXId(Opc));
  ArrResult = IRB.CreateInsertValue(ArrResult, C, getCId(Opc));
  return ArrResult;
}

/*
  VISA inst expected sequence:

  [addc|subb].32 x1 c1 a1 b1
  add3o.32 x2 pc2  a2 [-]b2 [-]c1
  zext.64 c pc
  Result = {{x1, x2}, c}

  Builds the replacement for a 64-bit addc/subb call from one 32-bit
  addc/subb on the low halves and one genx_add3c on the high halves. All new
  instructions are inserted before CI. Returns the insertvalue chain that
  forms a structure of CI's type; CI itself is not touched.
*/
auto *GenerateAdd3Sequence(CallInst *CI, unsigned IntrinsicID) {

  auto *A = CI->getOperand(0);
  auto *B = CI->getOperand(1);
  auto *InType = A->getType();

  IRBuilder<> IRB{CI};
  auto Size = cast<IGCLLVM::FixedVectorType>(InType)->getNumElements();
  auto Opc = GenXIntrinsic::ID(IntrinsicID);
  auto *RetTy = CI->getType();

  IVSplitter SplitBuilderA(*CI, &AIdx);
  IVSplitter SplitBuilderB(*CI, &BIdx);
  auto ASplit = SplitBuilderA.splitValueLoHi(*A);
  auto BSplit = SplitBuilderB.splitValueLoHi(*B);

  // 32-bit vector with the same number of elements as the 64-bit input.
  auto *ConvertTy = IGCLLVM::FixedVectorType::get(IRB.getInt32Ty(), Size);

  auto *Decl = GenXIntrinsic::getGenXDeclaration(CI->getModule(), Opc,
                                                 {ConvertTy, ConvertTy});
  // {x1, c1} = [addc|subb].32 a1 b1
  auto AddcLo = GenExtractFromStruct(
      IRB, CallInst::Create(Decl, {ASplit.Lo, BSplit.Lo}, CI->getName(), CI),
      getXId(Opc), getCId(Opc));

  // genx_add3c is declared here with an i1-vector carry and a 32-bit result.
  auto *MaskTy = IGCLLVM::FixedVectorType::get(IRB.getInt1Ty(), Size);
  auto IntrOpc = GenXIntrinsic::genx_add3c;
  auto *Add3Funct = GenXIntrinsic::getGenXDeclaration(CI->getModule(), IntrOpc,
                                                      {MaskTy, ConvertTy});

  auto *BHI = BSplit.Hi;
  auto *LoC = AddcLo.C;
  // For subb both the high half of B and the low borrow are negated so that
  // the three-operand add performs the subtraction.
  if (Opc == GenXIntrinsic::genx_subb) {
    BHI = IRB.CreateNeg(BHI, BHI->getName() + ".neg");
    LoC = IRB.CreateNeg(LoC, LoC->getName() + ".neg");
  }

  // add3o.32 x2 pc2   a2 [-]b2 [-]c1
  auto Add3c = GenExtractFromStruct(
      IRB,
      IRB.CreateCall(Add3Funct, {ASplit.Hi, BHI, LoC}, CI->getName() + ".add3"),
      getXId(IntrOpc), getCId(IntrOpc));

  // Rebuild the 64-bit value from the low result x1 and the high result x2.
  Value *Result = IVSplitter(*cast<Instruction>(Add3c.C))
                      .combineLoHiSplit({AddcLo.X, Add3c.X}, "Result", false);

  // zext.64 c pc
  auto *C = IRB.CreateZExt(Add3c.C, InType);

  auto *PrevVal = UndefValue::get(RetTy);
  auto *ArrResult = IRB.CreateInsertValue(PrevVal, Result, getXId(Opc));
  ArrResult = IRB.CreateInsertValue(ArrResult, C, getCId(Opc));
  return ArrResult;
}

// Lower a genx_addc / genx_subb call on a fixed vector of 64-bit integers
// into a sequence of 32-bit operations.
//
// Returns false (nothing changed) when the first operand is not a fixed
// vector of i64. Otherwise all uses of CI are replaced, CI is queued for
// erasure and true is returned.
bool GenXLowering::lowerAddcSubb(CallInst *CI, unsigned IntrinsicID) {
  IGC_ASSERT(CI);

  // Only the type is needed here. The operand may just as well be a function
  // argument or a constant, so it must not be cast to Instruction.
  auto *InType = CI->getOperand(0)->getType();
  if (!InType->getScalarType()->isIntegerTy(64) ||
      !isa<IGCLLVM::FixedVectorType>(InType))
    return false;

  if (ST->hasAdd3Bfn())
    CI->replaceAllUsesWith(GenerateAdd3Sequence(CI, IntrinsicID));
  else
    CI->replaceAllUsesWith(Generate32bitSequence(CI, IntrinsicID));

  ToErase.push_back(CI);

  return true;
}

// Replace llvm.abs with genx_absi applied to the same first operand.
// Always returns true; CI is queued for erasure.
bool GenXLowering::lowerAbs(CallInst *CI) {
  IGC_ASSERT(CI && vc::getAnyIntrinsicID(CI) == Intrinsic::abs);

  IRBuilder<> Builder(CI);
  auto *AbsDecl = vc::getAnyDeclaration(CI->getModule(),
                                        GenXIntrinsic::genx_absi,
                                        {CI->getType()});

  auto *Replacement = Builder.CreateCall(AbsDecl, {CI->getArgOperand(0)});
  Replacement->takeName(CI);
  CI->replaceAllUsesWith(Replacement);

  ToErase.push_back(CI);
  return true;
}

// Replace a math intrinsic call with the GenX intrinsic GenXID, forwarding
// all operands. Only float (and, when IsHalfAllowed is set, half) element
// types are accepted; anything else is reported through vc::fatal.
bool GenXLowering::lowerMathIntrinsic(CallInst *CI, GenXIntrinsic::ID GenXID,
                                      bool IsHalfAllowed) {
  IGC_ASSERT(CI);

  Type *ElemTy = CI->getType()->getScalarType();
  const bool IsSupported =
      ElemTy->isFloatTy() || (IsHalfAllowed && ElemTy->isHalfTy());
  if (!IsSupported) {
    vc::fatal(CI->getContext(), "GenXLowering",
              "Sorry there is only f16 and f32 native instruction", CI);
    return false;
  }

  auto *NativeDecl = GenXIntrinsic::getGenXDeclaration(CI->getModule(), GenXID,
                                                       {CI->getType()});
  SmallVector<Value *, 2> Operands{CI->args()};

  IRBuilder<> Builder{CI};
  auto *Replacement = Builder.CreateCall(NativeDecl, Operands, CI->getName());
  CI->replaceAllUsesWith(Replacement);
  ToErase.push_back(CI);
  return true;
}

// Variant of lowerMathIntrinsic for intrinsics that exist only in a low
// precision native form: the call must carry the approximate-function flag,
// otherwise vc::fatal is invoked. Half element types are allowed.
bool GenXLowering::lowerFastMathIntrinsic(CallInst *CI,
                                          GenXIntrinsic::ID GenXID) {
  const bool IsApproxAllowed = CI->hasApproxFunc();
  if (!IsApproxAllowed) {
    vc::fatal(CI->getContext(), "GenXLowering",
              "Sorry there is only low precision native instruction", CI);
  }
  return lowerMathIntrinsic(CI, GenXID, /*IsHalfAllowed=*/true);
}

// Lower a stack-save call: read the PREDEFINED_FE_SP predefined register as
// an i64 and convert it to the pointer type of the call.
// Always returns true; CI is queued for erasure.
bool GenXLowering::lowerStackSave(CallInst *CI) {
  IRBuilder<> IRB{CI};

  auto *Int64Ty = IRB.getInt64Ty();
  auto *ReadDecl = GenXIntrinsic::getGenXDeclaration(
      CI->getModule(), GenXIntrinsic::genx_read_predef_reg,
      {Int64Ty, Int64Ty});

  auto *SpRegId = IRB.getInt32(PreDefined_Vars::PREDEFINED_FE_SP);
  auto *SpValue = IRB.CreateCall(ReadDecl, {SpRegId, UndefValue::get(Int64Ty)});
  auto *SpPtr = IRB.CreateIntToPtr(SpValue, CI->getType());

  CI->replaceAllUsesWith(SpPtr);
  SpPtr->takeName(CI);

  ToErase.push_back(CI);

  return true;
}

// Lower a stack-restore call: convert the pointer operand to i64 and write it
// to the PREDEFINED_FE_SP predefined register.
// Always returns true; CI is queued for erasure.
bool GenXLowering::lowerStackRestore(CallInst *CI) {
  IRBuilder<> IRB{CI};

  auto *Int64Ty = IRB.getInt64Ty();
  auto *WriteDecl = GenXIntrinsic::getGenXDeclaration(
      CI->getModule(), GenXIntrinsic::genx_write_predef_reg,
      {Int64Ty, Int64Ty});

  auto *NewSp = IRB.CreatePtrToInt(CI->getOperand(0), Int64Ty);
  auto *SpRegId = IRB.getInt32(PreDefined_Vars::PREDEFINED_FE_SP);
  IRB.CreateCall(WriteDecl, {SpRegId, NewSp});

  ToErase.push_back(CI);

  return true;
}

// Lower a hardware-thread-id query into a read of a predefined register.
// If the subtarget provides the id directly, PREDEFINED_HW_TID is read as is.
// Otherwise PREDEFINED_SR0 is read, the reserved bit ranges reported by the
// subtarget are squeezed out, and the value is masked with
// (max threads per sub-device - 1).
// Always returns true; CI is queued for erasure.
bool GenXLowering::lowerHardwareThreadID(CallInst *CI) {
  IRBuilder<> IRB{CI};

  auto *Ty = CI->getType();
  auto *ReadDecl = GenXIntrinsic::getGenXDeclaration(
      CI->getModule(), GenXIntrinsic::genx_read_predef_reg, {Ty, Ty});

  const bool IsDirect = ST->getsHWTIDFromPredef();
  auto RegID = IsDirect ? PREDEFINED_HW_TID : PREDEFINED_SR0;
  Value *Res =
      IRB.CreateCall(ReadDecl, {IRB.getInt32(RegID), UndefValue::get(Ty)});

  if (!IsDirect) {
    // Drop reserved bits
    for (auto &[Offset, Width] : ST->getThreadIdReservedBits()) {
      // Bits below Offset stay in place, bits above the reserved range move
      // down by Width:
      //   res = (src & mask) | ((src >> width) & ~mask)
      auto LowMask = (1 << Offset) - 1;

      auto *Shifted = IRB.CreateLShr(Res, ConstantInt::get(Ty, Width));
      auto *LowPart = IRB.CreateAnd(Res, ConstantInt::get(Ty, LowMask));
      auto *HighPart = IRB.CreateAnd(Shifted, ConstantInt::get(Ty, ~LowMask));
      Res = IRB.CreateOr(HighPart, LowPart);
    }

    auto *ThreadMask =
        ConstantInt::get(Ty, ST->getMaxThreadsNumPerSubDevice() - 1);
    Res = IRB.CreateAnd(Res, ThreadMask);
  }

  CI->replaceAllUsesWith(Res);
  ToErase.push_back(CI);
  return true;
}

// Extract the bit fields listed in Fields (pairs of {offset, width}) from
// From and pack them one after another, starting at bit position InsertTo,
// OR-ing them into To (To may be null, in which case the first field starts
// the value). InsertTo is advanced by the width of every processed field.
// Returns the accumulated value.
static Value *extractBitfields(IRBuilder<> &IRB, Value *To, Value *From,
                               ArrayRef<std::pair<int, int>> Fields,
                               int &InsertTo) {
  auto *Ty = From->getType();
  for (auto &Field : Fields) {
    const auto Offset = Field.first;
    const auto Width = Field.second;

    auto FieldMask = ((1 << Width) - 1) << Offset;
    // Positive: the field must move towards bit 0; negative: away from it.
    auto Distance = Offset - InsertTo;

    Value *Placed = IRB.CreateAnd(From, ConstantInt::get(Ty, FieldMask));
    if (Distance > 0)
      Placed = IRB.CreateLShr(Placed, ConstantInt::get(Ty, Distance));
    else if (Distance < 0)
      Placed = IRB.CreateShl(Placed, ConstantInt::get(Ty, -Distance));

    if (To)
      To = IRB.CreateOr(To, Placed);
    else
      To = Placed;
    InsertTo += Width;
  }
  IGC_ASSERT_EXIT(To);
  return To;
}

// Lower a logical-thread-id query.
// When the subtarget provides the id in a predefined register, or when the
// number of threads per EU is a power of two and there is no preemption, the
// hardware thread id lowering is used. Otherwise the id is assembled from the
// thread, EU and subslice bit fields of the predefined registers.
// Always returns true; CI is queued for erasure.
bool GenXLowering::lowerLogicalThreadID(CallInst *CI) {
  const unsigned NumThreads = ST->getNumThreadsPerEU();
  const bool IsPow2Threads = isPowerOf2_32(NumThreads);
  if (ST->getsHWTIDFromPredef() || (IsPow2Threads && !ST->hasPreemption()))
    return lowerHardwareThreadID(CI);

  IRBuilder<> IRB{CI};

  auto *Ty = CI->getType();
  auto *ReadDecl = GenXIntrinsic::getGenXDeclaration(
      CI->getModule(), GenXIntrinsic::genx_read_predef_reg, {Ty, Ty});

  int InsertTo = 0;
  Value *SR0 = IRB.CreateCall(
      ReadDecl, {IRB.getInt32(PREDEFINED_SR0), UndefValue::get(Ty)});
  Value *TID =
      extractBitfields(IRB, nullptr, SR0, ST->getThreadIdBits(), InsertTo);

  // Power-of-two thread count: the thread id forms the low bits and the
  // remaining fields are packed right above it. Otherwise the remaining
  // fields are packed from bit 0 and combined with the thread id by
  // multiplication below.
  Value *Res = nullptr;
  if (IsPow2Threads)
    Res = TID;
  else
    InsertTo = 0;

  Res = extractBitfields(IRB, Res, SR0, ST->getEUIdBits(), InsertTo);

  // With preemption the subslice id is read from MSG0 instead of SR0.
  Value *SubsliceReg = SR0;
  if (ST->hasPreemption())
    SubsliceReg = IRB.CreateCall(
        ReadDecl, {IRB.getInt32(PREDEFINED_MSG0), UndefValue::get(Ty)});
  Res = extractBitfields(IRB, Res, SubsliceReg, ST->getSubsliceIdBits(),
                         InsertTo);

  if (!IsPow2Threads) {
    IGC_ASSERT_EXIT(Res);
    IGC_ASSERT_EXIT(TID);
    // id = (packed EU/subslice fields) * NumThreads + thread id
    auto *Scaled = IRB.CreateMul(Res, ConstantInt::get(Ty, NumThreads));
    Res = IRB.CreateAdd(Scaled, TID);
  }
  IGC_ASSERT_EXIT(Res);

  CI->replaceAllUsesWith(Res);
  ToErase.push_back(CI);
  return true;
}

// Rewrite a dpas call whose result and/or accumulator operand is a vector of
// i16 so that those types become vectors of bfloat: the accumulator is
// bitcast on the way in and the result is bitcast back on the way out.
// Returns false if neither type is an i16 vector; otherwise true, with CI
// queued for erasure.
bool GenXLowering::lowerDpas(CallInst *CI) {
  const auto IID = vc::getAnyIntrinsicID(CI);
  IRBuilder<> Builder(CI);
  bool Changed = false;

  auto *ResTy = cast<IGCLLVM::FixedVectorType>(CI->getType());
  if (ResTy->isIntOrIntVectorTy(16)) {
    ResTy = IGCLLVM::FixedVectorType::get(Builder.getBFloatTy(),
                                          ResTy->getNumElements());
    Changed = true;
  }

  SmallVector<Value *, 10> Operands(CI->args());

  // Every variant except genx_dpas_nosrc0 carries the accumulator as the
  // first operand.
  if (IID != GenXIntrinsic::genx_dpas_nosrc0) {
    Value *Acc = CI->getOperand(0);
    auto *AccTy = cast<IGCLLVM::FixedVectorType>(Acc->getType());
    if (AccTy->isIntOrIntVectorTy(16)) {
      auto *BFAccTy = IGCLLVM::FixedVectorType::get(Builder.getBFloatTy(),
                                                    AccTy->getNumElements());
      Operands[0] = Builder.CreateBitCast(Acc, BFAccTy);
      Changed = true;
    }
  }

  if (!Changed)
    return false;

  // Overload types: the result type followed by the type of every overloaded
  // operand.
  SmallVector<Type *, 6> OverloadTys = {ResTy};
  for (unsigned Idx = 0, End = Operands.size(); Idx != End; ++Idx)
    if (vc::isOverloadedArg(IID, Idx))
      OverloadTys.push_back(Operands[Idx]->getType());

  auto *NewDecl = vc::getAnyDeclaration(CI->getModule(), IID, OverloadTys);
  auto *NewCall = Builder.CreateCall(NewDecl, Operands);
  auto *BackCast = Builder.CreateBitCast(NewCall, CI->getType());

  NewCall->takeName(CI);
  CI->replaceAllUsesWith(BackCast);
  ToErase.push_back(CI);
  return true;
}

// Lower genx_nbarrier_arrive into a genx_raw_send2_noresult call carrying the
// named barrier payload. Emits an error and returns false if the subtarget
// has no named barriers; otherwise returns true with CI queued for erasure.
bool GenXLowering::lowerNamedBarrierArrive(CallInst *CI) {
  IGC_ASSERT(vc::getAnyIntrinsicID(CI) == GenXIntrinsic::genx_nbarrier_arrive);
  if (!ST->hasNBarrier()) {
    CI->getContext().emitError(CI, "Named barriers are not suppported by " +
                                       ST->getCPU());
    return false;
  }

  IRBuilder<> Builder(CI);
  Module *M = CI->getModule();
  auto *DwordTy = Builder.getInt32Ty();

  // The payload is one GRF worth of dwords.
  const unsigned DwordsPerGRF = ST->getGRFByteSize() / DWordBytes;
  auto *PayloadTy = IGCLLVM::FixedVectorType::get(DwordTy, DwordsPerGRF);

  // Prepare named barrier message payload as follows:
  //   payload[2][31:24]: number of consumers
  //   payload[2][23:16]: number of producers
  //   payload[2][15:14]: thread role
  //   payload[2][4:0]: barrier id
  Value *BarrierId = Builder.CreateZExt(CI->getArgOperand(0), DwordTy);
  Value *Role = Builder.CreateZExt(CI->getArgOperand(1), DwordTy);
  Value *NumProducers = Builder.CreateZExt(CI->getArgOperand(2), DwordTy);
  Value *NumConsumers = Builder.CreateZExt(CI->getArgOperand(3), DwordTy);

  Value *Header = Builder.CreateAdd(BarrierId, Builder.CreateShl(Role, 14));
  Header = Builder.CreateAdd(Header, Builder.CreateShl(NumProducers, 16));
  Header = Builder.CreateAdd(Header, Builder.CreateShl(NumConsumers, 24));

  Value *Payload =
      Builder.CreateInsertElement(UndefValue::get(PayloadTy), Header, 2);

  SmallVector<Value *, 8> SendArgs = {
      Builder.getInt8(0),           // modifier (none)
      Builder.getInt8(0),           // log2(exec size)
      Builder.getTrue(),            // predicate
      Builder.getInt8(1),           // number of source registers
      Builder.getInt8(3),           // Gateway
      Builder.getInt32(0),          // extended message descriptor
      Builder.getInt32(0x02000004), // message descriptor: barrier
      Payload,
  };

  auto *SendDecl =
      vc::getAnyDeclaration(M, GenXIntrinsic::genx_raw_send2_noresult,
                            {Builder.getInt1Ty(), PayloadTy});

  Builder.CreateCall(SendDecl, SendArgs);
  ToErase.push_back(CI);

  return true;
}

// Generic lowering of a vector reduction call.
//
// CI      - the reduction call being replaced.
// Src     - the vector being reduced.
// Start   - optional start value combined with the reduced scalar (may be
//           null).
// Builder - callable taking two values and returning the combined value; it
//           is used both for vector-wide and for the final scalar step.
//
// The source is first accumulated linearly in chunks of LinearGrain elements
// (when it is longer than one chunk), then halved repeatedly; elements that do
// not fit this scheme (the "tail") are folded in when the accumulator width
// matches a set bit of the tail width.
//
// Returns false for a floating-point reduction without the reassoc flag
// (ordered reductions are not supported); otherwise true, with CI queued for
// erasure.
template <typename BuilderOp>
bool GenXLowering::lowerReduction(CallInst *CI, Value *Src, Value *Start,
                                  BuilderOp Builder) {
  const auto &DebugLoc = CI->getDebugLoc();

  auto *Ty = CI->getType();
  // VC doesn't support lowering of ordered floating-point reduction
  if (Ty->isFloatingPointTy() && !CI->hasAllowReassoc())
    return false;

  auto *SrcVTy = cast<IGCLLVM::FixedVectorType>(Src->getType());
  auto SrcWidth = SrcVTy->getNumElements();

  // Number of elements of the result type that fit into two GRFs.
  const uint64_t MaxSimd = 2 * ST->getGRFByteSize() * genx::ByteBits /
                           DL->getTypeStoreSizeInBits(Ty);
  const auto LinearGrain = std::min<uint64_t>(32, MaxSimd);
  auto TailWidth = SrcWidth % LinearGrain;
  const auto LinearWidth = SrcWidth - TailWidth;
  auto TailIndex = LinearWidth;

  auto *Acc = Src;

  if (LinearWidth > LinearGrain) {
    // Several whole grains: accumulate them one by one into a grain-wide
    // vector.
    IGC_ASSERT(LinearWidth % LinearGrain == 0);
    auto *AccTy = IGCLLVM::FixedVectorType::get(Ty, LinearGrain);

    vc::CMRegion R(AccTy, DL);
    R.Offset = 0;

    Acc = R.createRdRegion(Src, "", CI, DebugLoc);

    const auto GrainBytes = LinearGrain * R.ElementBytes;
    R.Offset = GrainBytes;
    for (; R.getOffsetInElements() < LinearWidth; R.Offset += GrainBytes) {
      auto *NewRgn = R.createRdRegion(Src, "", CI, DebugLoc);
      Acc = Builder(Acc, NewRgn);
    }
    SrcWidth = LinearGrain;
  } else if (!isPowerOf2_32(SrcWidth)) {
    // At most one grain and not a power of two: reduce the largest
    // power-of-two prefix by halving and treat the rest as the tail.
    TailIndex = PowerOf2Floor(SrcWidth);
    IGC_ASSERT_EXIT(TailIndex);
    TailWidth = SrcWidth % TailIndex;
    SrcWidth = TailIndex;
  } else {
    // Power-of-two source of at most one grain: the halving loop below
    // consumes every element, so there is no tail. Without this reset a
    // source narrower than the grain would leave TailWidth == SrcWidth and
    // trip the final assertion.
    TailWidth = 0;
  }

  for (SrcWidth /= 2; SrcWidth > 0; SrcWidth /= 2) {
    auto *OpTy = IGCLLVM::FixedVectorType::get(Ty, SrcWidth);
    vc::CMRegion R(OpTy, DL);

    // Combine the lower and the upper half of the accumulator.
    R.Offset = 0;
    auto *Op0 = R.createRdRegion(Acc, "", CI, DebugLoc);

    R.Offset = R.ElementBytes * SrcWidth;
    auto *Op1 = R.createRdRegion(Acc, "", CI, DebugLoc);

    Acc = Builder(Op0, Op1);

    if ((TailWidth & SrcWidth) != 0) {
      // A tail chunk of exactly the current width is pending: fold it in.
      vc::CMRegion RTail(OpTy, DL);
      RTail.Offset = TailIndex * RTail.ElementBytes;
      auto *Tail = RTail.createRdRegion(Src, "", CI, DebugLoc);

      Acc = Builder(Acc, Tail);
      TailIndex += SrcWidth;
      TailWidth -= SrcWidth;
    }
  }

  IGC_ASSERT(TailWidth == 0);

  IRBuilder<> IRB(CI);
  auto *Res = IRB.CreateBitCast(Acc, Ty);
  if (Start)
    Res = Builder(Res, Start);

  CI->replaceAllUsesWith(Res);
  ToErase.push_back(CI);
  return true;
}

// Lower a reduction whose combining step is the binary operator Opcode.
// For FAdd and FMul the call carries a start value as operand 0 and the
// vector as operand 1; for every other opcode the vector is operand 0.
bool GenXLowering::lowerReduction(CallInst *CI, Instruction::BinaryOps Opcode) {
  const bool HasStartValue =
      Opcode == Instruction::FAdd || Opcode == Instruction::FMul;

  Value *Start = HasStartValue ? CI->getArgOperand(0) : nullptr;
  Value *Src = CI->getArgOperand(HasStartValue ? 1 : 0);

  IRBuilder<> Builder(CI);
  auto Combine = [&](Value *LHS, Value *RHS) {
    return Builder.CreateBinOp(Opcode, LHS, RHS);
  };

  return lowerReduction(CI, Src, Start, Combine);
}

// Lower a reduction whose combining step is the binary intrinsic IID.
// The vector is operand 0 of the call and there is no start value.
bool GenXLowering::lowerReduction(CallInst *CI, Intrinsic::ID IID) {
  Value *Src = CI->getArgOperand(0);
  Value *NoStart = nullptr;

  IRBuilder<> Builder(CI);
  auto Combine = [&](Value *LHS, Value *RHS) {
    return Builder.CreateBinaryIntrinsic(IID, LHS, RHS);
  };

  return lowerReduction(CI, Src, NoStart, Combine);
}

// Lower a copysign call using 16-bit integer operations: the value is viewed
// as a vector of words, and in the word of each element selected below the
// top bit is taken from the sign operand while the remaining 15 bits are
// taken from the magnitude operand.
// Always returns true; CI is queued for erasure.
bool GenXLowering::lowerCopySign(CallInst *CI) {
  IRBuilder<> Builder(CI);

  auto *Ty = CI->getType()->getScalarType();
  auto ElementSize = Ty->getPrimitiveSizeInBits();
  // Number of 16-bit words per element.
  auto Stride = ElementSize / genx::WordBits;
  IGC_ASSERT(ElementSize % genx::WordBits == 0);
  IGC_ASSERT(Stride == 1 || Stride == 2 || Stride == 4);

  // A scalar is treated as a single element.
  auto NumElements = 1;
  if (auto *VTy = dyn_cast<IGCLLVM::FixedVectorType>(CI->getType()))
    NumElements = VTy->getNumElements();
  auto CastNumElements = NumElements * Stride;

  auto *Int16Ty = Builder.getInt16Ty();
  // CastTy views the whole value as words; LowerTy holds one word per element.
  auto *CastTy = IGCLLVM::FixedVectorType::get(Int16Ty, CastNumElements);
  auto *LowerTy = IGCLLVM::FixedVectorType::get(Int16Ty, NumElements);

  auto *Mag = CI->getOperand(0);
  auto *Sign = CI->getOperand(1);

  auto *MagCast = Builder.CreateBitCast(Mag, CastTy);
  auto *MagInt = MagCast;
  auto *SignInt = Builder.CreateBitCast(Sign, CastTy);

  vc::CMRegion R(LowerTy, DL);
  auto &DebugLoc = CI->getDebugLoc();

  if (Stride > 1) {
    // Elements wider than a word: pick the last word of every element
    // (byte offset (Stride - 1) * WordBytes, one word per row, rows Stride
    // words apart).
    R.VStride = Stride;
    R.Width = 1;
    R.Stride = 0;
    R.Offset = (Stride - 1) * genx::WordBytes;

    MagInt = R.createRdRegion(MagInt, "", CI, DebugLoc);
    SignInt = R.createRdRegion(SignInt, "", CI, DebugLoc);
  }

  auto *MagMask = ConstantInt::get(Int16Ty, 0x7FFF);
  auto *SignMask = ConstantInt::get(Int16Ty, 0x8000);

  // Keep the low 15 bits of the magnitude word and the top bit of the sign
  // word, then merge them.
  auto *MagAbs = Builder.CreateAnd(
      MagInt, Builder.CreateVectorSplat(NumElements, MagMask));
  auto *SignBit = Builder.CreateAnd(
      SignInt, Builder.CreateVectorSplat(NumElements, SignMask));

  auto *Res = Builder.CreateOr(MagAbs, SignBit);

  // Put the patched words back into the word view of the magnitude; the other
  // words of each element are left as they were.
  if (Stride > 1)
    Res = R.createWrRegion(MagCast, Res, "", CI, DebugLoc);

  Res = Builder.CreateBitCast(Res, CI->getType());

  Res->takeName(CI);
  CI->replaceAllUsesWith(Res);
  ToErase.push_back(CI);

  return true;
}

/***********************************************************************
 * widenByteOp : widen a vector byte operation to short if that might
 *               improve code
 *
 * Enter:   Inst = instruction to consider: a compare of byte vectors, a
 *                 select or binary operator producing a byte vector, or a
 *                 predicated wrregion into a byte vector
 *
 * Return:  whether any change was made. If so, the instruction that has
 *          been replaced by its widened form is now marked for erasing
 *          (pushed onto ToErase). That is Inst itself, except for a
 *          predicated wrregion: there the wrregion is kept (made
 *          unpredicated) and it is the select split out of it that is
 *          widened and marked for erasing.
 *
 * Gen has restrictions on byte operands. The jitter copes with that, but
 * sometimes it needs to do even-odd splitting, which can lead to suboptimal
 * code if cmps and predicates are involved.
 * Here we attempt to pick up the common cases by converting a byte operation
 * to short.
 *
 * Note that we might end up with the extends being baled into the instruction
 * anyway, resulting in a byte operation in vISA.
 */
bool GenXLowering::widenByteOp(Instruction *Inst) {
  if (!EnableGenXByteWidening)
    return false;
  // A compare produces a predicate, so it is its operand type that tells us
  // whether this is a byte operation.
  Type *Ty = Inst->getType();
  if (isa<CmpInst>(Inst))
    Ty = Inst->getOperand(0)->getType();
  if (!isa<VectorType>(Ty) || !Ty->getScalarType()->isIntegerTy(8))
    return false; // not byte operation
  if (Inst->use_empty())
    return false; // result unused
  // Check the uses: if any use of a non-compare is a phi, stop widening.
  if (!isa<CmpInst>(Inst)) {
    for (const User *U : Inst->users())
      if (isa<PHINode>(U))
        return false;
  }
  // For a predicated wrregion, widen by separating the predication into a
  // rdregion and select, which can then be widened.
  if (GenXIntrinsic::isWrRegion(Inst)) {
    Region R = makeRegionFromBaleInfo(Inst, BaleInfo());
    if (R.NumElements == 1 || !R.Mask)
      return false;
    // Can only do this if the predicate is the right size. (We could handle
    // the wrong size case by adding an rdpredregion, but then we would need
    // to ensure that GenXLegalization can cope with an arbitrary size
    // rdpredregion.)
    if (cast<IGCLLVM::FixedVectorType>(R.Mask->getType())->getNumElements() !=
        R.NumElements)
      return false;
    // Create the rdregion (reading the region of the old value that the
    // wrregion would overwrite) and the select between new and old elements.
    auto NewRd =
        R.createRdRegion(Inst->getOperand(0), Inst->getName() + ".byteselrdr",
                         Inst, Inst->getDebugLoc());
    auto NewSel =
        SelectInst::Create(R.Mask, Inst->getOperand(1), NewRd, "", Inst);
    NewSel->takeName(Inst);
    NewSel->setDebugLoc(Inst->getDebugLoc());
    // Modify the existing wrregion: it now writes the select result
    // unconditionally (all-ones predicate).
    Inst->setName(NewSel->getName() + ".byteselwrr");
    Inst->setOperand(1, NewSel);
    Inst->setOperand(GenXIntrinsic::GenXRegion::PredicateOperandNum,
                     Constant::getAllOnesValue(R.Mask->getType()));
    // Fall through for the select to get widened.
    Inst = NewSel;
  }
  // Do the widening for:
  // 1. a compare or select
  // 2. used in a zext that indicates that the user has probably already been
  //    widened by this code. (Only the first user is inspected; this is a
  //    heuristic.)
  const bool Widen = isa<CmpInst>(Inst) || isa<SelectInst>(Inst) ||
                     isa<ZExtInst>(*Inst->user_begin());
  if (!Widen)
    return false;
  // Widen to short.
  // Decide whether to zero or sign extend. Also decide whether the result is
  // guaranteed to have all 0 bits in the extended part.
  Instruction::CastOps ExtOpcode = Instruction::ZExt;
  bool ExtendedIsZero = false;
  switch (Inst->getOpcode()) {
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::AShr:
    // Operations whose result depends on the sign of the operands must see
    // sign-extended inputs. (For example srem i8 -7, 3 is -1, whereas with
    // zero-extended operands it would be computed as 249 % 3 == 0.)
    ExtOpcode = Instruction::SExt;
    break;
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::LShr:
    // With zero-extended operands these cannot set any of the upper bits.
    ExtendedIsZero = true;
    break;
  case Instruction::ICmp:
    if (cast<CmpInst>(Inst)->isSigned())
      ExtOpcode = Instruction::SExt;
    break;
  default:
    break;
  }
  // Get the range of operands to process. For a call only the arguments are
  // taken (not the callee); for a select the condition (operand 0) is copied
  // as is and only the two value operands are extended.
  unsigned StartIdx = 0, EndIdx = Inst->getNumOperands();
  if (auto CI = dyn_cast<CallInst>(Inst))
    EndIdx = IGCLLVM::getNumArgOperands(CI);
  else if (isa<SelectInst>(Inst))
    StartIdx = 1;
  // Extend the operands. Constants are extended with a constant expression,
  // anything else with a new cast instruction inserted before Inst.
  auto ExtTy = IGCLLVM::FixedVectorType::get(
      Type::getInt16Ty(Inst->getContext()),
      cast<IGCLLVM::FixedVectorType>(Inst->getOperand(StartIdx)->getType())
          ->getNumElements());
  SmallVector<Value *, 4> Opnds;
  for (unsigned Idx = 0; Idx != EndIdx; ++Idx) {
    Value *Opnd = Inst->getOperand(Idx);
    if (Idx >= StartIdx) {
      if (auto C = dyn_cast<Constant>(Opnd))
        Opnd = ConstantExpr::getCast(ExtOpcode, C, ExtTy);
      else {
        auto NewExt = CastInst::Create(ExtOpcode, Opnd, ExtTy,
                                       Inst->getName() + ".byteext", Inst);
        NewExt->setDebugLoc(Inst->getDebugLoc());
        Opnd = NewExt;
      }
    }
    Opnds.push_back(Opnd);
  }
  // Create the replacement instruction.
  Instruction *NewInst = nullptr;
  if (auto *BO = dyn_cast<BinaryOperator>(Inst))
    NewInst = BinaryOperator::Create(BO->getOpcode(), Opnds[0], Opnds[1], "",
                                     Inst);
  else if (auto CI = dyn_cast<CmpInst>(Inst))
    NewInst = CmpInst::Create(CI->getOpcode(), CI->getPredicate(), Opnds[0],
                              Opnds[1], "", CI);
  else if (isa<SelectInst>(Inst))
    NewInst = SelectInst::Create(Opnds[0], Opnds[1], Opnds[2], "", Inst);
  else
    IGC_ASSERT_EXIT_MESSAGE(0, "unhandled instruction in widenByteOp");
  NewInst->takeName(Inst);
  NewInst->setDebugLoc(Inst->getDebugLoc());
  if (ExtendedIsZero) {
    // We know that the extended part of the result contains 0 bits. If we
    // find that any use is a zext (probably from also being byte widened
    // in this code), we can replace the use directly and save the
    // trunc/zext pair. First put the uses in a vector as the use list will
    // change under our feet.
    SmallVector<Use *, 4> Uses;
    for (Use &U : Inst->uses())
      Uses.push_back(&U);
    for (Use *U : Uses) {
      auto *ZExtUser = dyn_cast<ZExtInst>(U->getUser());
      // Only a zext to exactly the widened type can be replaced directly.
      if (!ZExtUser || ZExtUser->getType() != NewInst->getType())
        continue;
      ZExtUser->replaceAllUsesWith(NewInst);
      ToErase.push_back(ZExtUser);
      // Remove the use of Inst from the zext so we can tell whether there
      // are any uses left below.
      *U = UndefValue::get(Inst->getType());
    }
  }
  if (!Inst->use_empty()) {
    // Truncate the result back to bytes for the remaining users. A compare
    // already has the right (predicate) type, so needs no truncation.
    if (!isa<CmpInst>(Inst)) {
      NewInst = CastInst::Create(Instruction::Trunc, NewInst, Inst->getType(),
                                 Inst->getName() + ".bytetrunc", Inst);
      NewInst->setDebugLoc(Inst->getDebugLoc());
    }
    // Replace uses.
    Inst->replaceAllUsesWith(NewInst);
  }
  ToErase.push_back(Inst);
  return true;
}