File: LSCCacheOptimizationPass.cpp

package info (click to toggle)
intel-graphics-compiler 1.0.12504.6-1%2Bdeb12u1
links: PTS, VCS
area: main
in suites: bookworm
size: 83,912 kB
sloc: cpp: 910,147; lisp: 202,655; ansic: 15,197; python: 4,025; yacc: 2,241; lex: 1,570; pascal: 244; sh: 104; makefile: 25
file content (501 lines) | stat: -rw-r--r-- 28,299 bytes
/*========================== begin_copyright_notice ============================

Copyright (C) 2021-2022 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

/**
 * @file  LSCCacheOptimizationPass.cpp
 * @author  Konstantin Rebrov
 *
 * @brief  This file implements the LSCCacheOptimizationPass
 * This pass performs an optimization upon the Load Store Cache, makes eligible store instructions go into the L1 cache
 * Instead of the L3 cache, which is the default setting.
 * It utilizes the L1 cache for store instructions that are in these four memory regions:
 * RTAsynctack, SWStack, SWHotZone, RTSynctack
 *
 * @details  This pass examines all store instructions, and if a store instruction is identified as being eligible,
 * and if the optimization is necessary and possible, the pass performs a Read Write Modify operation on the store instruction.
 *
 * The requirements for a store instruction to go into the L1 cache are:
 *   The address for the store needs to be 16-byte aligned.
 *   The size of the stored data needs to be a multiple of 16 bytes.
 *   .ca on load and store instructions should not be marked as uncached
 * If these requirements are not satisfied then the store instruction is expanded to include the padding around the stored data.
 * This padding is preliminarily loaded into a virtual register to save those values.
 *
 * green blocks represent a dword that we don't want to overwrite (leave it as is)
 *   this is a padding blocks surrounding the (red) original memory location of the store instruction
 *
 * red blocks represent a dword that we actually do want to overwrite, specifically the old value
 *   this is the memory location that we want to store the new value into
 *   it is the region of memory which is addressed to by the original pointer operand of the input store instruction
 *
 * blue blocks represent the new value that we want to overwrite on top of the red dword
 */

#include "LSCCacheOptimizationPass.h"

#include "AdaptorCommon/RayTracing/MemRegionAnalysis.h"  // for getRegionOffset(), RTMemRegion
#include "getCacheOpts.h"                                // for getCacheOptsStoreInst()
#include "GenISAIntrinsics/GenIntrinsicInst.h"           // for llvm::GenIntrinsicInst
#include "AdaptorCommon/RayTracing/RTStackFormat.h"      // for RTStackFormat::LSC_WRITE_GRANULARITY
#include "IGCPassSupport.h"

#include "common/LLVMWarningsPush.hpp"                   // for suppressing LLVM warnings
#include "llvmWrapper/Support/Alignment.h"
#include <llvm/IR/LLVMContext.h>                         // for llvm::LLVMContext
#include <llvm/IR/IRBuilder.h>                           // for llvm::IRBuilder
#include <llvm/IR/Function.h>                            // for llvm::Function
#include <llvm/IR/BasicBlock.h>                          // for llvm::BasicBlock
#include <llvm/IR/Value.h>                               // for llvm::Value
#include <llvm/IR/Type.h>                                // for llvm::Type
#include <llvm/IR/DerivedTypes.h>                        // for llvm::VectorType
#include "llvmWrapper/IR/DerivedTypes.h"
#include <llvm/IR/Constant.h>                            // for llvm::Constant
#include <llvm/IR/Constants.h>                           // for llvm::ConstantInt, llvm::ConstantFP, llvm::ConstantVector, llvm::ConstantDataVector, llvm::UndefValue
#include <llvm/IR/Instruction.h>                         // for llvm::Instruction
#include <llvm/IR/Instructions.h>                        // for llvm::StoreInst, llvm::CallInst
#include <llvm/ADT/Optional.h>                           // for llvm::Optional
#include <llvm/ADT/APInt.h>                              // for llvm::APInt, llvm::ArrayRef
#include <llvm/Support/raw_ostream.h>                    // for llvm::raw_ostream, llvm::raw_string_ostream, llvm::outs(), llvm::errs()
#include "common/LLVMWarningsPop.hpp"                    // for suppressing LLVM warnings

#include <climits>    // for CHAR_BIT
#include <cstdint>    // for std::uint64_t
#include <string>     // for std::string
#include <vector>     // for std::vector
#include <algorithm>  // for std::find

using namespace IGC;

using RTStackFormat::LSC_WRITE_GRANULARITY;

using namespace llvm;
using namespace llvm::GenISAIntrinsic;

using std::uint64_t;
using std::string;
using std::vector;


/**
 * @param builder  An IRBuilder to create instructions.
 *
 * @param data_type  The data type of the zeroed vector, can be floating point or integer.
 *
 * @param num_elements  The number of elements in the zeroed vector.
 *
 * @return  An rvalue which is a zeroed vector with num_elements of the specified data_type
 *          It can be used as an argument for a constructed instruction, such as insertelement
 */
inline Value* getZeroedVector(IRBuilder<>& builder, Type* data_type, uint64_t num_elements) {
    if (data_type->isFloatingPointTy()) {
        return ConstantDataVector::getSplat((unsigned)num_elements, ConstantFP::get(data_type, 0.0));
    }
    else {
        return ConstantDataVector::getSplat((unsigned)num_elements, ConstantInt::get(data_type, 0ull));
    }
}


/**
 * This utility function being given a source rvalue, which is usually a vector, extracts elements marked by a range out of that vector
 * and caches those resulting rvalues into a container. We might want to use those rvalues in future instructions.
 * This is a half-open range marked by the two last parameters: [begin, end)
 * The range is intended to be used for extracting elements from only part of the vector.
 *
 * @param builder  An IRBuilder to create instructions.
 *
 * @param source_rvalue  The rvalue from which to extract the elements.
 *                       It can be rvalues of both scalar and vector data types, both % virtual registers as well as literal values.
 *                       This function can extract elements from both green vectors and blue vectors.
 *
 * @param extracted_elements  Each element is extracted and saved as a Value* into this container.
 *
 * @param begin  This index points to the first element in the range.
 *
 * @param end  This index points to one after the last element in the range.
 */
static void extract_elements(IRBuilder<>& builder, Value* source_rvalue, vector<Value*>& extracted_elements, uint64_t begin, uint64_t end) {
    IGC_ASSERT(begin <= end && extracted_elements.size() == (end - begin));
    if (begin == end)
        return;

    Type* type = source_rvalue->getType();
    // If the source rvalue is a vector of integers.
    if (isa<VectorType>(type)) {
        vector<Value*>::iterator i = extracted_elements.begin();
        for (uint64_t extract_at = begin; extract_at < end; ++extract_at, ++i) {
            auto* element = builder.CreateExtractElement(source_rvalue, extract_at);
            *i = element;
        }
    }
    // If the source rvalue is a scalar integer.
    else {
        extracted_elements[0] = source_rvalue;
    }
}


/**
 * This utility function being given a source rvalue, which is usually a vector, extracts N elements out of that vector
 * and caches those resulting rvalues into a container. We might want to use those rvalues in future instructions.
 *
 * @param builder  An IRBuilder to create instructions.
 *
 * @param source_rvalue  The rvalue from which to extract the elements.
 *                       It can be rvalues of both scalar and vector data types, both % virtual registers as well as literal values.
 *                       This function can extract elements from both green vectors and blue vectors.
 *
 * @param extracted_elements  Each element is extracted and saved as a Value* into this container.
 *
 * @param num_elements  The number of elements to extract.
 */
static void extract_elements(IRBuilder<>& builder, Value* source_rvalue, vector<Value*>& extracted_elements, uint64_t num_elements) {
    return extract_elements(builder, source_rvalue, extracted_elements, 0, num_elements);
}


/**
 * This function takes a container having rvalues of individual elements, and inserts each one of them into the given llvm vector,
 * the positions within that vector where to insert the elements are marked by the range.
 * This is a half-open range marked by the two last parameters: [begin, end)
 * The range is intended to be used if we want to insert elements into only part of the vector.
 *
 * @param builder  An IRBuilder to create instructions.
 *
 * @param source_rvalue  The rvalue vector which will be the base for the insertions.
 *                       In other words this is the vector into which we insert the elements.
 *
 * @param extracted_elements  A container containing Value* objects to insert into the vector.
 *
 * @param begin  This index points to the first element in the range.
 *
 * @param end  This index points to one after the last element in the range.
 *
 * @return Value*  A virtual register of the vector with all the inserted elements.
 */
static Value* insert_elements(IRBuilder<>& builder, Value* source_rvalue, const vector<Value*>& extracted_elements, uint64_t begin, uint64_t end) {
    Value* temp_vector = source_rvalue;

    IGC_ASSERT(begin <= end && extracted_elements.size() == (end - begin));

    vector<Value*>::const_iterator i = extracted_elements.cbegin();
    for (uint64_t insert_at = begin; insert_at < end; ++insert_at, ++i) {
        // Use the previous vector to compute the insertelement insruction for the current vector,
        // overwriting the current vector into the variable, discarding the information for the previous vector.
        temp_vector = builder.CreateInsertElement(temp_vector, *i, insert_at);
    }
    return temp_vector;
}


bool LSCCacheOptimizationPass::runOnFunction(Function& function)
{
    m_CGCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
    changed_IR = false;
    current_function = &function;
    current_context = &function.getContext();

#if 0
    // This method is only used for testing purposes.
    // In order to insert a 48 wide store instruction into the Function, if it doesn't already have one.
    // Most shaders that I've run it through don't have such an instruction, so I have to manually insert one
    // like that in order to test the fourth case.
    create_48_wide_store(function);
#endif

    // This method indirectly calls visitStoreInst() to process the LLVM IR function.
    // It basically performs the RMW operation on any eligible instructions.
    visit(function);
    return changed_IR;
}


void LSCCacheOptimizationPass::visitStoreInst(StoreInst& storeInst)
{
    auto cacheOpts = getCacheOptsStorePolicy(storeInst, *m_CGCtx);

    // cacheOpts will be not None for the following memory regions:
    // RTAsynctack, SWStack, SWHotZone, RTSynctack
    if (!cacheOpts)
        return;

    auto store_cache_policy = cacheOpts.getValue();
    if (store_cache_policy == LSC_L1UC_L3UC || store_cache_policy == LSC_L1UC_L3C_WB) {
        // The cache policy is uncached for the L1 cache.
        // unsuccessful early exit
        return;
    }

    auto& DL = current_function->getParent()->getDataLayout();

    uint64_t offset = 0;
    uint64_t region_size = 0;

    const Instruction* address = dyn_cast<Instruction>(storeInst.getPointerOperand());
    auto Region = getRegionOffset(address, &DL, &offset, &region_size);
    if (!Region)
        return;

    IGC_ASSERT(offset < region_size);  // the offset is within the bounds of the memory region

    Value* value = storeInst.getValueOperand();

    Type* type = value->getType();
    uint64_t data_size = 0;  // measured in bytes
    uint64_t element_size = 0;  // measured in bytes
    Type* element_type;
    // If the stored data is a vector variable.
    if (IGCLLVM::FixedVectorType* vectorType = dyn_cast<IGCLLVM::FixedVectorType>(type)) {
        element_type = vectorType->getElementType();
        uint64_t num_elements = vectorType->getNumElements();
        element_size = DL.getTypeSizeInBits(element_type) / CHAR_BIT;
        data_size = num_elements * element_size;
    }
    // If the stored data is a scalar variable.
    else {
        data_size = element_size = DL.getTypeSizeInBits(type) / CHAR_BIT;
        element_type = type;
    }

    // If the offset is a multiple of 16 bytes (16 byte aligned store address)
    // && the size of the stored data is a multiple of 16 bytes
    if ((offset % LSC_WRITE_GRANULARITY == 0) && (data_size % LSC_WRITE_GRANULARITY == 0)) {
        // successful early exit
        return;
    }

    // Convert the address from the start of the memory region into
    // the offset from the start of the previous nearest 16 byte boundary.
    offset %= LSC_WRITE_GRANULARITY;

    // Create an IRBuilder with an insertion point set to the given intrinsic_call instruction.
    // IRBuilder automatically inserts instructions when it creates them,
    // and the inserted instructions (dynamically allocated) are deleted when the function is destroyed.
    IRBuilder<> builder(&storeInst);
    auto* initial_pointer = storeInst.getPointerOperand();
    unsigned addrspace = storeInst.getPointerAddressSpace();

    uint64_t right_boundary;  // measured in bytes
    // If the stored data straddles across three 16 bytes size chunks,
    // we break it up into two stores: first 32 bytes and last 16 bytes
    /* This is the fourth case. */
    if (offset + data_size > 32) {
        right_boundary = 48;
        // get the left pad
        // get the right pad
        // get the blue blocks
        // construct and store the first 32 bytes vector
        // construct and store the last 16 bytes vector

        uint64_t num_blue_blocks = data_size / element_size;

        uint64_t num_green_blocks_left = offset / element_size;
        uint64_t num_blue_blocks_left = (32 - offset) / element_size;
        uint64_t num_total_blocks_left = num_green_blocks_left + num_blue_blocks_left;

        uint64_t num_green_blocks_right = (right_boundary - (offset + data_size)) / element_size;
        uint64_t num_blue_blocks_right = (offset + data_size - 32) / element_size;
        uint64_t num_total_blocks_right = num_green_blocks_right + num_blue_blocks_right;

        /* First do the GGRR 32 wide store */
        // %0 = bitcast <>* %baseAddress to i8*
        auto* bitcast1 = builder.CreateBitCast(initial_pointer, builder.getInt8PtrTy(addrspace));
        // %1 = getelementptr i8, i8* %0, i64 -offset
        auto* left_green_address = builder.CreateGEP(builder.getInt8Ty(), bitcast1, builder.getInt64(-1 * offset));
        // %2 = bitcast i8* %1 to <num_green_blocks_left x iN>*
        auto* left_green_vector_pointer = builder.CreateBitCast(left_green_address, IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_green_blocks_left)->getPointerTo(addrspace));
        // %3 = load <num_green_blocks_left x iN>, <num_green_blocks_left x iN>* %2
        auto* left_green_vector_rvalue = builder.CreateLoad(IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_green_blocks_left), left_green_vector_pointer);

        // The static_cast is needed to remove this warning or error, which appears only in QuickBuild Windows
        // https://ubit-gfx.intel.com/build/9814699/step_status
        //   error C2220: the following warning is treated as an error
        //   warning C4244: 'argument': conversion from 'uint64_t' to 'const unsigned int', possible loss of data
        vector<Value*> green_elements_left(static_cast<const unsigned int>(num_green_blocks_left));
        extract_elements(builder, left_green_vector_rvalue, green_elements_left, num_green_blocks_left);
        vector<Value*> blue_elements_left(static_cast<const unsigned int>(num_blue_blocks_left));
        extract_elements(builder, value, blue_elements_left, num_blue_blocks_left);

        auto* initial_vector_left = UndefValue::get(IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_total_blocks_left));
        auto* intermediate_vector_left = insert_elements(builder, initial_vector_left, green_elements_left, 0, num_green_blocks_left);
        auto* final_vector_left = insert_elements(builder, intermediate_vector_left, blue_elements_left, num_green_blocks_left, num_total_blocks_left);

        auto* final_vector_pointer_left = builder.CreateBitCast(left_green_address, IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_total_blocks_left)->getPointerTo(addrspace));
        builder.CreateAlignedStore(final_vector_left, final_vector_pointer_left, IGCLLVM::Align(LSC_WRITE_GRANULARITY));

        auto* right_part_address = builder.CreateGEP(builder.getInt8Ty(), bitcast1, builder.getInt64(32 - offset));

        vector<Value*> blue_elements_right(static_cast<const unsigned int>(num_blue_blocks_right));
        extract_elements(builder, value, blue_elements_right, num_blue_blocks_left, num_blue_blocks);

        vector<Value*> green_elements_right(static_cast<const unsigned int>(num_green_blocks_right));
        auto* right_green_address = builder.CreateGEP(builder.getInt8Ty(), bitcast1, builder.getInt64(data_size));
        auto* right_green_vector_pointer = builder.CreateBitCast(right_green_address, IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_green_blocks_right)->getPointerTo(addrspace));
        auto* right_green_vector_rvalue = builder.CreateLoad(IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_green_blocks_right), right_green_vector_pointer);
        extract_elements(builder, right_green_vector_rvalue, green_elements_right, num_green_blocks_right);

        auto* initial_vector_right = UndefValue::get(IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_total_blocks_right));
        auto* intermediate_vector_right = insert_elements(builder, initial_vector_right, blue_elements_right, 0, num_blue_blocks_right);
        auto* final_vector_right = insert_elements(builder, intermediate_vector_right, green_elements_right, num_blue_blocks_right, num_total_blocks_right);

        auto* final_vector_pointer_right = builder.CreateBitCast(right_part_address, IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_total_blocks_right)->getPointerTo(addrspace));
        builder.CreateAlignedStore(final_vector_right, final_vector_pointer_right, IGCLLVM::Align(LSC_WRITE_GRANULARITY));
    }
    // If the data straddles across one or two 16 byte size chunks,
    // we do only one load or store, but we can optimize it to
    // load only the green blocks if they are contiguous
    else {
        right_boundary = (offset + data_size > 16) ? 32 : 16;
        // the red blocks are on the left side, contiguous green blocks on the right side
        // load only the contiguous green blocks
        /* This is the first case. */
        if (offset == 0) {
            // get the right pad
            // get the blue blocks
            // construct and store the right_boundary size vector

            uint64_t num_green_blocks = (right_boundary - data_size) / element_size;
            uint64_t num_blue_blocks = data_size / element_size;

            // %0 = bitcast <>* %baseAddress to i8*
            auto* bitcast1 = builder.CreateBitCast(initial_pointer, builder.getInt8PtrTy(addrspace));
            // %1 = getelementptr i8, i8* %0, i64 data_size
            auto* green_address = builder.CreateGEP(builder.getInt8Ty(), bitcast1, builder.getInt64(data_size));
            // %2 = bitcast i8* %1 to <num_green_blocks x iN>*
            auto* green_vector_pointer = builder.CreateBitCast(green_address, IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_green_blocks)->getPointerTo(addrspace));
            // %3 = load <num_green_blocks x iN>, <num_green_blocks x iN>* %2
            auto* green_vector_rvalue = builder.CreateLoad(IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_green_blocks), green_vector_pointer);

            vector<Value*> green_elements(static_cast<const unsigned int>(num_green_blocks));
            extract_elements(builder, green_vector_rvalue, green_elements, num_green_blocks);
            vector<Value*> blue_elements(static_cast<const unsigned int>(num_blue_blocks));
            extract_elements(builder, value, blue_elements, num_blue_blocks);

            uint64_t num_elements = num_green_blocks + num_blue_blocks;
            auto* initial_vector = UndefValue::get(IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_elements));
            auto* intermediate_vector = insert_elements(builder, initial_vector, blue_elements, 0, num_blue_blocks);
            auto* final_vector = insert_elements(builder, intermediate_vector, green_elements, num_blue_blocks, num_elements);

            auto* final_vector_pointer = builder.CreateBitCast(initial_pointer, IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_elements)->getPointerTo(addrspace));
            builder.CreateAlignedStore(final_vector, final_vector_pointer, IGCLLVM::Align(LSC_WRITE_GRANULARITY));
        }
        // the red blocks are on the right side, contiguous green blocks on the left side
        // load only the contiguous green blocks
        /* This is the second case. */
        else if (offset + data_size == right_boundary) {
            // get the left pad
            // get the blue blocks
            // construct and store the right_boundary size vector

            uint64_t num_green_blocks = offset / element_size;
            uint64_t num_blue_blocks = data_size / element_size;

            // %0 = bitcast <>* %baseAddress to i8*
            auto* bitcast1 = builder.CreateBitCast(initial_pointer, builder.getInt8PtrTy(addrspace));
            // %1 = getelementptr i8, i8* %0, i64 -offset
            auto* green_address = builder.CreateGEP(builder.getInt8Ty(), bitcast1, builder.getInt64(-1 * offset));
            // %2 = bitcast i8* %1 to <num_green_blocks x iN>*
            auto* green_vector_pointer = builder.CreateBitCast(green_address, IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_green_blocks)->getPointerTo(addrspace));
            // %3 = load <num_green_blocks x iN>, <num_green_blocks x iN>* %2
            auto* green_vector_rvalue = builder.CreateLoad(IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_green_blocks), green_vector_pointer);

            vector<Value*> green_elements(static_cast<const unsigned int>(num_green_blocks));
            extract_elements(builder, green_vector_rvalue, green_elements, num_green_blocks);
            vector<Value*> blue_elements(static_cast<const unsigned int>(num_blue_blocks));
            extract_elements(builder, value, blue_elements, num_blue_blocks);

            uint64_t num_elements = num_green_blocks + num_blue_blocks;
            auto* initial_vector = UndefValue::get(IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_elements));
            auto* intermediate_vector = insert_elements(builder, initial_vector, green_elements, 0, num_green_blocks);
            auto* final_vector = insert_elements(builder, intermediate_vector, blue_elements, num_green_blocks, num_elements);

            auto* final_vector_pointer = builder.CreateBitCast(green_address, IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_elements)->getPointerTo(addrspace));
            builder.CreateAlignedStore(final_vector, final_vector_pointer, IGCLLVM::Align(LSC_WRITE_GRANULARITY));
        }
        // the red blocks in the middle, around them discontinuous green blocks
        // load the green blocks on the left, red blocks in the middle, and green blocks on the right
        /* This is the third case. */
        else {
            // get the whole thing
            // get the blue blocks
            // construct and store the right_boundary size vector, overwriting the red blocks in the middle

            uint64_t num_total_blocks = right_boundary / element_size;

            // %0 = bitcast <>* %baseAddress to i8*
            auto* bitcast1 = builder.CreateBitCast(initial_pointer, builder.getInt8PtrTy(addrspace));
            // %1 = getelementptr i8, i8* %0, i64 -offset
            auto* starting_address = builder.CreateGEP(builder.getInt8Ty(), bitcast1, builder.getInt64(-1 * offset));
            // %2 = bitcast i8* %1 to <num_total_blocks x iN>*
            auto* full_vector_pointer = builder.CreateBitCast(starting_address, IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_total_blocks)->getPointerTo(addrspace));
            // %3 = load <num_total_blocks x iN>, <num_total_blocks x iN>* %2
            auto* full_vector_rvalue = builder.CreateLoad(IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_total_blocks), full_vector_pointer);

            uint64_t left_padding_size_blocks = offset / element_size;
            uint64_t num_blue_blocks = data_size / element_size;
            vector<Value*> blue_elements(static_cast<const unsigned int>(num_blue_blocks));
            extract_elements(builder, value, blue_elements, num_blue_blocks);

            auto* final_vector = insert_elements(builder, full_vector_rvalue, blue_elements, left_padding_size_blocks, left_padding_size_blocks + num_blue_blocks);
            builder.CreateAlignedStore(final_vector, full_vector_pointer, IGCLLVM::Align(LSC_WRITE_GRANULARITY));
        }
    }

    storeInst.eraseFromParent();
    changed_IR = true;
}


#if 0
bool LSCCacheOptimizationPass::create_48_wide_store(Function& function)
{
    Function::iterator bb = function.begin(), bb_end = function.end();
    for (; bb != bb_end; ++bb) {
        for (BasicBlock::iterator i = bb->begin(), i_end = bb->end(); i != i_end; ++i) {
            // Loop through all instructions in a function, search for a call to a GenIntrinsicInst::GenISA_AsyncStackPtr
            // which would look something like this:
            // %perLaneAsyncStackPointer24 = call noalias align 128 dereferenceable(256) %"struct.RTStackFormat::RTStack" addrspace(1)* @"llvm.genx.GenISA.AsyncStackPtr.p1struct.RTStackFormat::RTStack.i64"(i64 %19)
            if (auto* intrinsic_call = dyn_cast<GenIntrinsicInst>(i)) {
                if (intrinsic_call->getIntrinsicID() == llvm::GenISAIntrinsic::GenISA_AsyncStackPtr) {
                    // Create an IRBuilder with an insertion point set to the given intrinsic_call instruction.
                    // IRBuilder automatically inserts instructions when it creates them,
                    // and the inserted instructions (dynamically allocated) are deleted when the function is destroyed.
                    IRBuilder<> builder(intrinsic_call);
                    Type* return_type = intrinsic_call->getFunctionType()->getReturnType();
                    unsigned addrspace = return_type->getPointerAddressSpace();

                    uint64_t offset = 8;          // in bytes
                    uint64_t num_red_blocks = 8;  // in dwords
                    Type* element_type = builder.getInt32Ty();
                    // %0 = bitcast <>* %baseAddress to i8*
                    auto* bitcast1 = builder.CreateBitCast(intrinsic_call, builder.getInt8PtrTy(addrspace));
                    // %1 = getelementptr i8, i8* %0, i64 offset
                    auto* red_address = builder.CreateGEP(builder.getInt8Ty(), bitcast1, builder.getInt64(offset));
                    // %2 = bitcast i8* %1 to <num_red_blocks x iN>*
                    auto* red_vector_pointer = builder.CreateBitCast(red_address, IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_red_blocks)->getPointerTo(addrspace));
                    auto* red_vector_rvalue = getZeroedVector(builder, element_type, num_red_blocks);
                    builder.CreateStore(red_vector_rvalue, red_vector_pointer);

                    changed_IR = true;
                    return true;
                }
            }
        }
    }
    return false;
}
#endif

char LSCCacheOptimizationPass::ID = 0;
#define PASS_FLAG "LSC-Cache-Optimization-pass"
#define PASS_DESCRIPTION "Load/Store cache optimization pass"
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS false
IGC_INITIALIZE_PASS_BEGIN(LSCCacheOptimizationPass, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
IGC_INITIALIZE_PASS_END(LSCCacheOptimizationPass, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)