File: IBiF_matrix_generator.cpp

package info (click to toggle)
intel-graphics-compiler2 2.18.5-1
links: PTS, VCS
area: main
in suites: sid
size: 107,080 kB
sloc: cpp: 807,289; lisp: 287,855; ansic: 16,414; python: 4,004; yacc: 2,588; lex: 1,666; pascal: 313; sh: 186; makefile: 35
file content (1560 lines) | stat: -rw-r--r-- 59,480 bytes
/*========================== begin_copyright_notice ============================

Copyright (C) 2025 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

//
// See README.md for more information about Matrix builtins and this generator.
//

#include <assert.h>

#include <algorithm>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_set>
using namespace std;

// Element / container widths, in bits.
static constexpr int         BITS_8                     = 8;
static constexpr int         BITS_16                    = 16;
static constexpr int         BITS_32                    = 32;
static constexpr int         BITS_64                    = 64;
// Sub-group sizes referenced by the generator.
static constexpr int         SUB_GROUP_8                = 8;
static constexpr int         SUB_GROUP_16               = 16;
static constexpr int         SUB_GROUP_32               = 32;
// Largest row size, in bits (8 * 64 = 512), that ImplementSmallLoad2DBlock accepts for a 2D block load.
static constexpr int         MAX_ROW_BITS_2D_BLOCK_LOAD = 8 * 64;
// Names of the functions registered so far; CheckIfFunctionNameIsUnique consults and fills it.
static unordered_set<string> CreatedFuncsSet;
// Collects one comment line for every function skipped because its name was already registered.
static string                CreatedFuncsWarningLog;

// Returns the "auto-generated, do not modify" comment banner.
static string FileHeader()
{
    string banner;
    banner += "// This file is auto-generated by IBiF_matrix_generator.cpp\n";
    banner += "// Do not modify it directly\n";
    banner += "//\n";
    return banner;
}

//
// Math and string helper functions
//
// Converts a bit count into bytes. The count must be a whole number of bytes.
static int Bytes(int bits)
{
    const int bitsPerByte = 8;
    assert(bits % bitsPerByte == 0);
    return bits / bitsPerByte;
}

// Returns the smallest power of two that is greater than or equal to `v`.
// `v` must be positive.
static int CeilPow2(int v)
{
    assert(v > 0 && "CeilPow2 input must be a positive non-zero integer");

    // Copy the highest set bit of (v - 1) into every lower bit position,
    // which yields (2^k - 1); adding one then gives the power of two.
    int filled = v - 1;
    for (int shift = 1; shift <= 16; shift *= 2)
    {
        filled |= filled >> shift;
    }
    return filled + 1;
}

// Formats `number` as decimal text, or returns an empty string when it is 1 or less.
static string ToStringAbove1(int number)
{
    return number > 1 ? to_string(number) : string();
}

// Replaces all occurrences of `toReplace` with `newText` in `source` and returns the result.
//
// Matching proceeds left to right and resumes after the inserted text, so text
// introduced by `newText` is never matched again.
// An empty `toReplace` leaves `source` unchanged: an empty pattern matches at
// every position, which would otherwise make the loop insert `newText` forever.
static string Replace(string source, const string &toReplace, const string &newText)
{
    if (toReplace.empty()) return source;

    size_t pos = 0;
    while ((pos = source.find(toReplace, pos)) != std::string::npos)
    {
        source.replace(pos, toReplace.length(), newText);
        // Skip past the replacement so it is not scanned again.
        pos += newText.length();
    }
    return source;
}

//
// enums & structs declarations
//
// Matrix layouts handled by the generator.
// MatrixSpec maps each of them to an OrderType.
enum LayoutType
{
    Layout_PackedA_RowMajor,
    Layout_PackedA_ColumnMajor,
    Layout_PackedB_RowMajor,
    Layout_PackedB_ColumnMajor,
    Layout_PackedB_PackedB,
    Layout_Accumulator_RowMajor,
    Layout_Accumulator_ColumnMajor,
};

// Returns the enumerator name of `layout` without its "Layout_" prefix.
// An unknown value trips an assert and yields "?".
static string ToString(LayoutType layout)
{
    // Indexed by enumerator value; keep in the same order as LayoutType.
    static const char *const layoutNames[] = {
        "PackedA_RowMajor",
        "PackedA_ColumnMajor",
        "PackedB_RowMajor",
        "PackedB_ColumnMajor",
        "PackedB_PackedB",
        "Accumulator_RowMajor",
        "Accumulator_ColumnMajor",
    };
    const int nameCount = static_cast<int>(sizeof(layoutNames) / sizeof(layoutNames[0]));
    static_assert(Layout_Accumulator_ColumnMajor == 6, "layoutNames must cover every LayoutType");

    const int index = static_cast<int>(layout);
    if (index >= 0 && index < nameCount)
    {
        return layoutNames[index];
    }

    assert(false && "Unknown Layout value in ToString");
    return "?";
}

// Memory order of a matrix. MatrixSpec derives it from the LayoutType
// and the load generators branch on it.
enum OrderType
{
    Order_RowMajor,
    Order_ColMajor,
    // Per MatrixSpec: packs sub-32-bit data into 32-bit elements;
    // MatrixSpec falls back to Order_RowMajor when BitWidth >= 32.
    Order_Vnni,
};

// Address spaces for which load functions are generated.
enum AddrSpace
{
    AddrSpace_Global,
    AddrSpace_Local,
    AddrSpace_Generic,
};

// Returns the lower-case name of `as` ("global", "local" or "generic").
// An unknown value trips an assert and yields "?".
static string ToString(AddrSpace as)
{
    if (as == AddrSpace_Global) return "global";
    if (as == AddrSpace_Local) return "local";
    if (as == AddrSpace_Generic) return "generic";

    assert(false && "Unknown AddrSpace value in ToString");
    return "?";
}

// Describes one matrix (sub-group size, layout, shape, element bit width) and
// derives the values consumed by the load generators in this file.
struct MatrixSpec
{
    // Values stored as passed to the constructor.
    int        SubGroupSize;
    LayoutType Layout;
    int        Rows, Cols;
    int        BitWidth;

    // Values derived by the constructor. The default initializers guarantee that
    // no member is read uninitialized, even when asserts are compiled out.
    OrderType Order            = Order_RowMajor; // memory order implied by Layout
    int       VnniFactor       = 1;              // BITS_32 / BitWidth for sub-32-bit PackedB layouts, otherwise 1
    int       VnniedRows       = 0;              // Rows * VnniFactor
    int       VnniedCols       = 0;              // Cols / VnniFactor
    int       WiRows           = 0;              // rows per work item, see the constructor
    int       ContribBitWidth  = 0;              // load granularity in bits, see the constructor
    int       DpasSubGroupSize = 0;              // SubGroupSize, with 32 mapped to 16

    // Stores the inputs and computes every derived member.
    // `sgSize` and `bitWidth` must be positive: both end up as divisors below.
    MatrixSpec(int sgSize, LayoutType layout, int rows, int cols, int bitWidth)
        : SubGroupSize(sgSize), Layout(layout), Rows(rows), Cols(cols), BitWidth(bitWidth)
    {
        assert(SubGroupSize > 0 && "MatrixSpec sub group size must be positive");
        assert(BitWidth > 0 && "MatrixSpec bit width must be positive");

        switch (Layout)
        {
        case Layout_PackedA_RowMajor:
            Order = Order_RowMajor;
            break;
        case Layout_PackedA_ColumnMajor:
            Order = Order_ColMajor;
            break;
        case Layout_PackedB_RowMajor:
            Order = Order_Vnni;
            break;
        case Layout_PackedB_ColumnMajor:
            Order = Order_ColMajor;
            break;
        case Layout_PackedB_PackedB:
            Order = Order_RowMajor;
            break;
        case Layout_Accumulator_RowMajor:
            Order = Order_RowMajor;
            break;
        case Layout_Accumulator_ColumnMajor:
            Order = Order_ColMajor;
            break;
        default:
            // Order keeps its default initializer value in builds without asserts.
            assert(false && "Unknown Layout value in MatrixSpec");
            break;
        }

        // Vnni packs smaller data into 32 bit elements.
        // For BitWidth >= 32 it needs to be the same as RowMajor.
        if (BitWidth >= BITS_32 && Order == Order_Vnni) Order = Order_RowMajor;

        // Calculate Vnni dimensions for PackedB layouts when BitWidth is under 32 bits.
        VnniFactor = 1;
        if (BitWidth < BITS_32 &&
            (Layout == Layout_PackedB_RowMajor || Layout == Layout_PackedB_ColumnMajor ||
             Layout == Layout_PackedB_PackedB))
        {
            VnniFactor = BITS_32 / BitWidth;
        }
        VnniedRows = Rows * VnniFactor;
        VnniedCols = Cols / VnniFactor;

        // Even if our kernel is compiled for sub group size 32
        // we are still using sub group 16 dpas instructions.
        // This impacts in what shape the final loaded data has to be in.
        DpasSubGroupSize = SubGroupSize;
        if (DpasSubGroupSize == SUB_GROUP_32) DpasSubGroupSize = SUB_GROUP_16;

        // ContribBitWidth calculates the optimal size/granularity of the load that we can perform.
        // It's a concept mostly ported as is from the old macro-heavy OpenCL C implementation.
        // It's a bit complicated because it's predicting what value the load implementations will need -
        //   - and load implementations themselves have a lot of special cases around this too.
        // It might be a good idea to refactor this and let the individual load implementations
        //   calculate this value themselves instead of calculating it ahead of the time here.
        {
            ContribBitWidth = Cols * BitWidth / DpasSubGroupSize;

            // Limit ContribBitWidth for LoadLarge
            ContribBitWidth = std::min(BITS_32, ContribBitWidth);

            // ContribBitWidth shouldn't be smaller than BitWidth
            ContribBitWidth = std::max(ContribBitWidth, BitWidth);

            // Special case - perhaps this could be removed with refactoring
            if (Layout == Layout_PackedA_RowMajor && SubGroupSize == SUB_GROUP_16 &&
                Cols == 32 && BitWidth == BITS_16)
                ContribBitWidth = BITS_16;
            // Special case - when bitwidth is 16 and layout is accumulator, the contribBitWidth is also 16
            if ((Layout == Layout_Accumulator_RowMajor ||
                 Layout == Layout_Accumulator_ColumnMajor) &&
                BitWidth == BITS_16)
                ContribBitWidth = BITS_16;

            // Vnni loads are only generated with 32-bit contributions.
            assert(Order != Order_Vnni || ContribBitWidth == BITS_32);
        }

        // WiRows - amount of rows per one work item (1 out of SubGroupSize).
        // It's used for calculations and to distinguish between
        // SubGroupSize 16 and SubGroupSize 32 loads.
        {
            int totalBits     = Rows * Cols * BitWidth;
            int canHandleBits = ContribBitWidth * SubGroupSize;
            // Division rounded up: a partially filled row still needs a slot.
            WiRows = totalBits / canHandleBits + (totalBits % canHandleBits ? 1 : 0);
        }
    }
};

//
// Helper functions
//
// Maps a bit width (8, 16, 32 or 64) to the matching unsigned type name.
// Any other width trips an assert and yields "?".
static string GetUnsignedType(int bitWidth)
{
    if (bitWidth == BITS_8) return "uchar";
    if (bitWidth == BITS_16) return "ushort";
    if (bitWidth == BITS_32) return "uint";
    if (bitWidth == BITS_64) return "ulong";

    assert(false && "Unexpected unsigned type bit width");
    return "?";
}

// Maps a bit width to the suffix appended to "intel_sub_group_block_read"
// by the vector load generators ("_uc", "_us", "" or "_ul").
// Any other width trips an assert and yields an empty string.
static string GetVectorLoadSuffix(int bitWidth)
{
    if (bitWidth == BITS_8) return "_uc";
    if (bitWidth == BITS_16) return "_us";
    if (bitWidth == BITS_32) return "";
    if (bitWidth == BITS_64) return "_ul";

    assert(false && "Unsupported bitWidth in GetVectorLoadSuffix");
    return "";
}

// Builds the text of a vector literal: "(vecType)(prefix0, prefix1, ...)"
// with `count` numbered elements.
static string ConstructVector(string vecType, string varPrefix, int count)
{
    string elements;
    for (int index = 0; index < count; ++index)
    {
        if (index != 0) elements += ", ";
        elements += varPrefix;
        elements += to_string(index);
    }
    return "(" + vecType + ")(" + elements + ")";
}

// Composes the name of the generated load function for `spec`.
//
// The name is made of: the checked / non-checked prefix, the layout name, an
// "_SG<n>" tag when the dpas sub group size is above 8, the vnni-ed shape, the
// element bit width, the rows per work item, the address space (non-checked
// variant only) and a fixed trailing suffix.
static string GetMatrixFunctionName(MatrixSpec spec, AddrSpace addr, bool isChecked)
{
    string name = isChecked ? "__builtin_spriv_OpJointMatrixLoadCheckedINTEL_"
                            : "__builtin_spriv_OpJointMatrixLoadINTEL_";
    name += ToString(spec.Layout);

    const bool needsSubGroupTag = spec.DpasSubGroupSize > SUB_GROUP_8;
    if (needsSubGroupTag)
    {
        name += "_SG";
        name += to_string(spec.DpasSubGroupSize);
    }

    const string shape = to_string(spec.VnniedRows) + "x" + to_string(spec.VnniedCols);
    name += "_" + shape;
    name += "_i" + to_string(spec.BitWidth);
    name += "_" + to_string(spec.WiRows);

    // The checked variant does not carry the address space in its name.
    if (!isChecked)
    {
        name += "_";
        name += ToString(addr);
    }

    name += "_v8i8_pi32_i32";
    return name;
}

// Registers `funcName` in CreatedFuncsSet.
// Returns true when the name was not registered before. Otherwise appends a
// "skipped" comment line to CreatedFuncsWarningLog and returns false.
static bool CheckIfFunctionNameIsUnique(string funcName)
{
    // insert() reports through `.second` whether the name was newly added.
    const bool newlyRegistered = CreatedFuncsSet.insert(funcName).second;
    if (newlyRegistered) return true;

    CreatedFuncsWarningLog +=
        "/* Skipped creation of a function with duplicate name: " + funcName + "*/\n";
    return false;
}

//
// Small load implementations
//

// For a few special cases we need to shuffle the data to put the data
// in required memory order after doing 2D block load.
//
// Returns the generated source text that shuffles the block load result `res`
// between work items and stores it into `dst`.
//   spec               - matrix description; BitWidth, SubGroupSize and WiRows are read.
//   resultTypeBitWidth - bit width of the block load result type; must be BITS_32.
// The placeholders "Cols", "SubGroupSize" and "ResultType" are left in the
// returned text; ImplementSmallLoad2DBlock substitutes them.
static string
ImplementSmallLoad2DBlockDataShuffle(MatrixSpec spec, int resultTypeBitWidth)
{
    string elemType = GetUnsignedType(spec.BitWidth);
    assert(resultTypeBitWidth == BITS_32);
    int loadPackFactor = (resultTypeBitWidth / spec.BitWidth);
    int packFactor     = 2;

    int shuffleCount = spec.SubGroupSize == SUB_GROUP_16 ? SUB_GROUP_16 : SUB_GROUP_8;
    if (spec.BitWidth == BITS_16) shuffleCount = spec.WiRows;

    string s;
    s += "int slid = get_sub_group_local_id();\n";
    s += "ShuffleType *data = (ShuffleType *)&res;\n";
    s += "ShuffleType tdata;\n";
    s += "for (int i = 0; i < ShuffleCount; i++) {\n";

    if (spec.BitWidth == BITS_16)
        s += "int from_slid = slid % Cols + (i % PackFactor) * Cols;\n";
    else
        s += "int from_slid = (slid * PackFactor) % SubGroupSize + i % PackFactor;\n";

    // Compare the sub group size against the sub group constant (the value is
    // the same as BITS_16, but this is a size, not a bit width).
    if (spec.SubGroupSize == SUB_GROUP_16)
    {
        s += "ElemType tmp0 = sub_group_shuffle((*data)[(i / PackFactor) + (i / "
             "(LoadPackFactor * PackFactor) + 0) * LoadPackFactor], from_slid);\n";
        s += "ElemType tmp1 = sub_group_shuffle((*data)[(i / PackFactor) + (i / "
             "(LoadPackFactor * PackFactor) + 1) * LoadPackFactor], from_slid);\n";
    }
    else
    {
        s += "ElemType tmp0 = sub_group_shuffle((*data)[(i / PackFactor) * PackFactor + "
             "0], from_slid);\n";
        s += "ElemType tmp1 = sub_group_shuffle((*data)[(i / PackFactor) * PackFactor + "
             "1], from_slid);\n";
    }

    s += "tdata[i] = slid < " + to_string(spec.SubGroupSize / packFactor) +
         " ? tmp0 : tmp1;\n";
    s += "}\n";
    s += "*(__private ResultType *)dst = *(__private ResultType *)&tdata;\n";

    // Substitution order matters: "LoadPackFactor" contains "PackFactor",
    // so the longer placeholder has to be replaced first.
    s = Replace(s, "ElemType", elemType);
    s = Replace(s, "ShuffleType", elemType + to_string(shuffleCount));
    s = Replace(s, "ShuffleCount", to_string(shuffleCount));
    s = Replace(s, "LoadPackFactor", to_string(loadPackFactor));
    s = Replace(s, "PackFactor", to_string(packFactor));
    return s;
}

// Generates the source text of a small load implemented with a 2D block load builtin.
//   spec      - matrix description.
//   isChecked - true:  emit the checked variant, which reads `x`, `y`, `width`,
//                      `height` and `stride` in the generated code;
//               false: emit the variant wrapped in a
//                      "JointMatrixLoadStoreOpt >= BLOCK2D_IMPL" check that ends with "return;".
// Returns an empty string when the parameters are rejected (sub group size below 16,
// more than 32 rows, or a block row that exceeds MAX_ROW_BITS_2D_BLOCK_LOAD).
static string ImplementSmallLoad2DBlock(MatrixSpec spec, bool isChecked)
{
    // Reject cases where we couldn't construct a proper load using 2D block load.
    if (spec.SubGroupSize < SUB_GROUP_16 || spec.Rows > 32) return "";

    // retNum is different from spec.WiRows only in special Layout_PackedA_ColumnMajor implementation that uses shuffling.
    int retNum = spec.WiRows;

    // resultTypeBitWidth is the same as blockBitWidth in all cases except for Vnni loads.
    // Where we load values as ushort16 but store it into uint8 to achieve proper memory layout.
    int resultTypeBitWidth = spec.ContribBitWidth;
    int blockBitWidth      = spec.ContribBitWidth;
    int blockHeight        = 0;

    // blockFunc encodes compile-time 2D block load parameters which are mangled into the function name.
    // Examples:
    // __builtin_IB_subgroup_block_read_flat_cacheopts_u16_wi1_m1k16v1
    // __builtin_IB_subgroup_block_read_flat_cacheopts_transpose_u32_wi4_m16_k4
    string blockFunc;

    // This scope calculates proper values for the above
    // block parameters which will be used to construct 2D block load call.
    {
        int blockWidth = 0;

        if (spec.Order == Order_RowMajor)
        {
            int contribCols = spec.Cols / (spec.ContribBitWidth / spec.BitWidth);
            blockWidth      = contribCols;
            blockHeight     = spec.Rows;
        }
        else if (spec.Order == Order_ColMajor)
        {
            blockWidth  = spec.Rows;
            blockHeight = spec.VnniedCols;
        }
        else if (spec.Order == Order_Vnni)
        {
            blockWidth  = spec.VnniedCols;
            blockHeight = spec.VnniedRows;
            blockBitWidth =
                spec.BitWidth; // Vnni uses special vnni-transform 2D block load operation which needs to operate on native small elements.
        }

        // This layout overrides the ColMajor values computed above.
        if (spec.Layout == Layout_PackedA_ColumnMajor)
        {
            // retNum exists only for this load implementation. It is the number of elements in the return vector from block2d call.
            // In other cases it is equal to spec.WiRows but in this case, because we use 32-bit data size to load data,
            // while "contrib type" is still 16-bit, we need to use different return type.
            retNum        = (spec.WiRows / (BITS_32 / spec.BitWidth)) * (spec.Cols / 16);
            blockBitWidth = BITS_32;
            resultTypeBitWidth = BITS_32;

            int contribRows = spec.Rows / (BITS_32 / spec.BitWidth);
            blockWidth      = contribRows;
            blockHeight     = spec.Cols;
        }

        // Reject cases where blockRowSize is too big for 2D block load.
        // NOTE(review): blockWidth is multiplied by spec.BitWidth here, while some branches above
        // express blockWidth in blockBitWidth-sized elements - confirm this is the intended unit.
        int blockRowSizeInBits = blockWidth * spec.BitWidth;
        if (blockRowSizeInBits > MAX_ROW_BITS_2D_BLOCK_LOAD) return "";

        // Assemble the builtin name: base, optional transpose/transform tag,
        // element bit width, return vector length and the block shape.
        blockFunc = "__builtin_IB_subgroup_block_"s + "read" + "_flat_cacheopts";
        if (spec.Order == Order_ColMajor)
            blockFunc += "_transpose";
        else if (spec.Order == Order_Vnni)
            blockFunc += "_transform";

        blockFunc += "_u" + to_string(blockBitWidth);
        blockFunc += "_wi" + to_string(retNum);

        // The shape suffix is spelled differently for each order.
        if (spec.Order == Order_ColMajor)
        {
            blockFunc += "_m" + to_string(blockHeight) + "_k" + to_string(blockWidth);
        }
        else if (spec.Order == Order_Vnni)
        {
            blockFunc += "_k" + to_string(blockHeight) + "n" + to_string(blockWidth);
        }
        else
        {
            blockFunc +=
                "_m" + to_string(blockHeight) + "k" + to_string(blockWidth) + "v1";
        }
    }

    // Prepare a call to 2D block load.
    string s;
    if (isChecked)
    {
        // Checked variant: coordinates and surface dimensions come from the generated function's arguments.
        s += "long offset = as_long(mem);\n";
        s += "int pack_factor = " + to_string(blockBitWidth / spec.BitWidth) + ";\n";
        s += "int2 coords = (int2)(x / pack_factor, y);\n";
        s += "int width_bytes = ElemBytes * width - 1;\n";
        s += "int pitch_bytes = ElemBytes * stride - 1;\n";
        s += "int height_minus_one = height - 1;\n";
        s += "ResultType BlockFunc(long, int, int, int, int2, int);\n";
        s += "ResultType res = BlockFunc(offset, width_bytes, height_minus_one, "
             "pitch_bytes, coords, cacheOpt);\n";
    }
    else
    {
        // Check if BLOCK2D_IMPL implementation is available.
        s += "if (BIF_FLAG_CTRL_GET(JointMatrixLoadStoreOpt) >= BLOCK2D_IMPL) {\n";
        s += "long offset = as_long(mem);\n";
        // The generated code rounds the address down to a multiple of 64 (mask ~0x3f)
        // and passes the remainder as the x coordinate in block elements.
        s += "long baseoffset = offset & (~0x3f);\n";
        s +=
            "long x = (offset - baseoffset) / " + to_string(Bytes(blockBitWidth)) + ";\n";
        s += "int2 coords = (int2)(x, 0);\n";
        s += "int width_bytes = ElemBytes * stride - 1;\n";
        s += "int pitch_bytes = width_bytes;\n";
        s += "int height_minus_one = " + to_string(blockHeight) + " - 1;\n";
        s += "ResultType BlockFunc(long, int, int, int, int2, int);\n";
        s += "ResultType res = BlockFunc(baseoffset, width_bytes, height_minus_one, "
             "pitch_bytes, coords, cacheOpt);\n";
    }

    // There are matrix combinations that couldn't be handled with 2D block load instructions directly.
    // For performance reasons, we want to use 2D block load, as it's the fastest way to load big chunks of data.
    // For a few special matrix cases, we use a 2D block load instruction,
    // and then we use shuffle instructions to transform the order of the data in the memory.
    bool dataRequiresShuffle =
        (spec.Layout == Layout_PackedA_ColumnMajor &&
         ((spec.BitWidth == BITS_16 && spec.SubGroupSize == SUB_GROUP_32) ||
          spec.BitWidth == BITS_8));

    if (dataRequiresShuffle)
        s += ImplementSmallLoad2DBlockDataShuffle(spec, resultTypeBitWidth);
    else // If shuffling wasn't required we can just store the load results.
        s += "*(__private ResultType *)dst = res;\n";

    // Close scope: if (BIF_FLAG_CTRL_GET(JointMatrixLoadStoreOpt) >= BLOCK2D_IMPL) {
    if (!isChecked)
    {
        s += "return;\n";
        s += "}\n";
    }

    // resultType can be equal to strings like: "uint", "uint2", "uint4", "ushort8", "ushort16" etc
    string resultType =
        GetUnsignedType(resultTypeBitWidth) + ToStringAbove1(CeilPow2(retNum));
    // Replace template strings.
    // "Cols" and "SubGroupSize" only occur in the text produced by the shuffle helper.
    s = Replace(s, "Cols", to_string(spec.Cols));
    s = Replace(s, "SubGroupSize", to_string(spec.SubGroupSize));
    s = Replace(s, "ResultType", resultType);
    s = Replace(s, "BlockFunc", blockFunc);
    s = Replace(s, "ElemBytes", to_string(Bytes(spec.BitWidth)));
    return s;
}

// Generates the source text of a small load that reads the whole matrix with a
// single continuous sub group block read.
//   spec - matrix description.
//   addr - address space used for the pointer type of the read.
// Returns an empty string when this kind of load cannot be used for `spec`.
static string ImplementSmallLoadVectorContinuous(MatrixSpec spec, AddrSpace addr)
{
    // WiRows has to be a non-zero multiple of Rows.
    const bool wiRowsIsMultipleOfRows =
        spec.WiRows >= spec.Rows && (spec.WiRows % spec.Rows) == 0;
    if (!wiRowsIsMultipleOfRows) return "";

    // Only row major data with one of the supported row counts qualifies.
    const bool rowCountSupported =
        spec.Rows == 1 || spec.Rows == 2 || spec.Rows == 4 || spec.Rows == 8 ||
        (spec.Rows == 16 && spec.ContribBitWidth <= BITS_16);
    if (spec.Order != Order_RowMajor || !rowCountSupported) return "";

    // The stride comparison is emitted only when WiRows equals Rows
    // (divisibility was already established above).
    const bool emitStrideCheck = !(spec.WiRows > spec.Rows);

    // Open scope: is the VECTOR_CONT_IMPL implementation available?
    string code = "if (BIF_FLAG_CTRL_GET(JointMatrixLoadStoreOpt) >= VECTOR_CONT_IMPL";
    if (emitStrideCheck)
    {
        code += " && stride == ";
        code += to_string(spec.Cols);
    }
    code += ") {\n";

    // Declare the read function, load the data and store it.
    code += "AlignedType OVERLOADABLE VecFunc(const MemType *);\n";
    code += "AlignedType res = VecFunc((MemType *)mem);\n";
    code += "*(__private ResultType *)dst = *(__private ResultType *)&res;\n";

    // Close scope.
    code += "return;\n";
    code += "}\n";

    // Resolve the placeholders. Types become strings like "ushort", "uint", "uint2", "uint4".
    const string contribType = GetUnsignedType(spec.ContribBitWidth);
    const string readFunc    = "intel_sub_group_block_read" +
                            GetVectorLoadSuffix(spec.ContribBitWidth) +
                            ToStringAbove1(spec.WiRows);
    const string alignedType = contribType + ToStringAbove1(CeilPow2(spec.WiRows));
    const string resultType  = contribType + ToStringAbove1(spec.WiRows);
    const string memType     = "__" + ToString(addr) + " " + contribType;

    code = Replace(code, "VecFunc", readFunc);
    code = Replace(code, "AlignedType", alignedType);
    code = Replace(code, "ResultType", resultType);
    code = Replace(code, "MemType", memType);
    return code;
}

// Generates the source text of a small load built from one sub group block read
// per loop iteration, wrapped in a "JointMatrixLoadStoreOpt >= VECTOR_IMPL" check.
//   spec - matrix description.
//   addr - address space used for the source pointer type.
// Returns an empty string when this kind of load cannot be used for `spec`.
static string ImplementSmallLoadVector(MatrixSpec spec, AddrSpace addr)
{
    // Reject cases where we couldn't construct a proper load using vector loads.
    if (spec.WiRows < spec.Rows || ((spec.WiRows % spec.Rows) != 0)) return "";

    // Only row major / vnni orders are handled, and not the
    // single-row case with sub group size 32.
    if (!((spec.Order == Order_RowMajor || spec.Order == Order_Vnni) &&
          (spec.Rows != 1 || spec.SubGroupSize != SUB_GROUP_32)))
        return "";

    string s;
    // Check if VECTOR_IMPL implementation is available.
    s += "if (BIF_FLAG_CTRL_GET(JointMatrixLoadStoreOpt) >= VECTOR_IMPL) {\n";

    int packFactor = spec.ContribBitWidth / spec.BitWidth;
    s += "long packed_stride = stride / " + to_string(packFactor) + ";\n";
    s += "__private ContribType *wi_contrib = (__private ContribType *)dst;\n";
    s += "MemType *src = (MemType *)mem;\n";

    if (spec.Order == Order_Vnni && spec.WiRows == spec.Rows)
    {
        // Special vnni transform vector load implementation.
        int iterationCount = BITS_32 / spec.BitWidth;
        assert(iterationCount >= 1);

        s += "for (int i = 0; i < Rows; i++) {\n";
        for (int iter = 0; iter < iterationCount; iter++)
        {
            s += "ElemType rowIterIndex = VecFunc(src + (IterCount * i + IterIndex) * "
                 "packed_stride);\n";
            // Resolved immediately so that every emitted line gets its own index
            // ("rowIterIndex" becomes "row0", "row1", ...).
            s = Replace(s, "IterIndex", to_string(iter));
        }

        // "ElemTypeIterCount" turns into a vector type name such as "uchar4"
        // once IterCount and ElemType are substituted below.
        if (iterationCount > 1)
            s += "wi_contrib[i] = as_uint(" +
                 ConstructVector("ElemTypeIterCount", "row", iterationCount) + ");\n";
        else
            s += "wi_contrib[i] = row0;\n";

        s += "}\n";

        string vecFunc =
            "intel_sub_group_block_"s + "read" + GetVectorLoadSuffix(spec.BitWidth);
        s = Replace(s, "VecFunc", vecFunc);
        s = Replace(s, "IterCount", to_string(iterationCount));
    }
    else
    {
        // Regular vector load implementation.
        // ratio > 1 means one matrix row is spread over several work item rows.
        int ratio = spec.WiRows / spec.Rows;
        if (ratio != 1) s += "int ratio = " + to_string(spec.WiRows / spec.Rows) + ";\n";

        s += "for (int i = 0; i < WiRows; i++)\n";
        if (ratio != 1)
            s += "wi_contrib[i] = VecFunc(src + (i/ratio)*packed_stride + "
                 "(i%ratio)*SubGroupSize);\n";
        else
            s += "wi_contrib[i] = VecFunc(src + i*packed_stride);\n";

        string vecFunc = "intel_sub_group_block_"s + "read" +
                         GetVectorLoadSuffix(spec.ContribBitWidth);
        s = Replace(s, "VecFunc", vecFunc);
    }

    // Close scope: if (BIF_FLAG_CTRL_GET(JointMatrixLoadStoreOpt) >= VECTOR_IMPL) {
    s += "return;\n";
    s += "}\n";

    // Replace template strings.
    // "WiRows" must be substituted before "Rows", which is its substring.
    s = Replace(s, "ElemType", GetUnsignedType(spec.BitWidth));
    s = Replace(s, "ContribType", GetUnsignedType(spec.ContribBitWidth));
    s = Replace(
        s,
        "MemType",
        "__" + ToString(addr) + " " + GetUnsignedType(spec.ContribBitWidth));
    s = Replace(s, "WiRows", to_string(spec.WiRows));
    s = Replace(s, "SubGroupSize", to_string(spec.SubGroupSize));
    s = Replace(s, "Rows", to_string(spec.Rows));
    return s;
}

// Generates the source text of a small load that reads the data element by
// element, indexed by the sub group local id. Unlike the other small load
// generators in this file it never returns an empty string and emits no
// JointMatrixLoadStoreOpt check.
//   spec - matrix description.
//   addr - address space used for the source pointer type.
static string ImplementSmallLoadScalar(MatrixSpec spec, AddrSpace addr)
{
    string s;
    s += "int slid = get_sub_group_local_id();\n";

    int packFactor = spec.ContribBitWidth / spec.BitWidth;
    assert(packFactor > 0);
    // Number of contrib-sized elements per matrix row.
    int sgCols     = spec.Cols / packFactor;
    int skipFactor = spec.SubGroupSize / sgCols;

    // Special case with its own indexing; it substitutes its placeholders
    // and returns right away.
    if (spec.Layout == Layout_PackedA_ColumnMajor && spec.BitWidth == 8 &&
        spec.ContribBitWidth == BITS_16)
    {
        s += "for (int i = 0; i < ElemNum; i++)\n";
        s += "dst[i] = mem[(i % PackFactor) * stride + ((slid * PackFactor) % Cols)";
        s += "* stride + (i / PackFactor) * SkipFactor + (slid * PackFactor) / Cols];\n";

        int elemNum = (spec.Rows * spec.Cols) / spec.SubGroupSize;
        s           = Replace(s, "ElemNum", to_string(elemNum));
        s           = Replace(s, "PackFactor", to_string(packFactor));
        s           = Replace(s, "SkipFactor", to_string(skipFactor));
        s           = Replace(s, "Cols", to_string(spec.Cols));
        return s;
    }

    s += "long packed_stride = stride / " + to_string(packFactor) + ";\n";
    s += "__private ContribType *wi_contrib = (__private ContribType *)dst;\n";

    if (spec.Order == Order_Vnni)
    {
        // Vnni: read BITS_32 / BitWidth consecutive rows and pack them into one contribution.
        s += "AddrSpace ElemType *src = (AddrSpace ElemType *)mem;\n";
        s += "for (int i = 0; i < " + to_string(spec.Rows) + "; i++) {\n";

        int iterationCount = BITS_32 / spec.BitWidth;
        assert(iterationCount >= 1);
        for (int iter = 0; iter < iterationCount; iter++)
        {
            s += "ElemType rowIterIndex = src[(IterCount * i + IterIndex) * stride + "
                 "slid];\n";
            // Resolved immediately so that every emitted line gets its own index.
            s = Replace(s, "IterIndex", to_string(iter));
        }

        // "ElemTypeIterCount" turns into a vector type name once IterCount and
        // ElemType are substituted below.
        if (iterationCount > 1)
            s += "wi_contrib[i] = as_uint(" +
                 ConstructVector("ElemTypeIterCount", "row", iterationCount) + ");\n";
        else
            s += "wi_contrib[i] = row0;\n";

        s += "}\n";
        s += "return;\n";

        s = Replace(s, "IterCount", to_string(iterationCount));
    }
    else if (spec.SubGroupSize >= sgCols)
    {
        // The sub group covers SkipFactor rows per iteration; positions past the
        // last row are written as 0 by the generated code.
        s += "AddrSpace ContribType *src = (AddrSpace ContribType *)mem;\n";
        s += "for (int i = 0; i < WiRows; i++) {\n";

        s += "if ((i*SkipFactor + slid/SgCols) < Rows)\n";
        if (spec.Order == Order_RowMajor)
            s += "wi_contrib[i] = src[(slid/SgCols + i*SkipFactor)*packed_stride + "
                 "(slid%SgCols)];";
        else if (spec.Order == Order_ColMajor)
            s += "wi_contrib[i] = src[(slid/SgCols + i*SkipFactor) + "
                 "(slid%SgCols)*packed_stride];";

        s += "else\n";
        s += "wi_contrib[i] = 0;\n";
        s += "}\n";
    }
    else
    {
        // A row is wider than the sub group: each row is read in Ratio pieces.
        s += "AddrSpace ContribType *src = (AddrSpace ContribType *)mem;\n";
        s += "for (int i = 0; i < WiRows; i++)\n";

        int ratio = spec.WiRows / spec.Rows;
        if (ratio == 1)
            s += "wi_contrib[i] = src[i*stride + slid];\n";
        else
            s += "wi_contrib[i] = src[(i/Ratio)*stride + (i%Ratio)*SubGroupSize + "
                 "slid];\n";

        s = Replace(s, "Ratio", to_string(ratio));
    }

    // Replace template strings.
    // "WiRows" must be substituted before "Rows", which is its substring.
    s = Replace(s, "SgCols", to_string(sgCols));
    s = Replace(s, "SkipFactor", to_string(skipFactor));
    s = Replace(s, "AddrSpace", "__" + ToString(addr));
    s = Replace(s, "ElemType", GetUnsignedType(spec.BitWidth));
    s = Replace(s, "ContribType", GetUnsignedType(spec.ContribBitWidth));
    s = Replace(s, "WiRows", to_string(spec.WiRows));
    s = Replace(s, "SubGroupSize", to_string(spec.SubGroupSize));
    s = Replace(s, "Rows", to_string(spec.Rows));
    return s;
}

//
// Small load function creators
//

// Defines the non-checked load function of `spec` for one address space.
// Returns the complete function definition, or an empty string when a function
// with the same name was already created.
static string DefineSmallLoadForAddressSpace(MatrixSpec spec, AddrSpace addr)
{
    const string funcName = GetMatrixFunctionName(spec, addr, false);
    if (!CheckIfFunctionNameIsUnique(funcName)) return "";

    // Implementations available in every address space, in the order they are tried.
    auto commonLoads = [&spec](AddrSpace space) {
        string impl = ImplementSmallLoadVectorContinuous(spec, space);
        impl += ImplementSmallLoadVector(spec, space);
        impl += ImplementSmallLoadScalar(spec, space);
        return impl;
    };

    string def = "INLINE void " + funcName;
    def += "(__private char *dst, char *mem, long stride, int cacheOpt) {\n";

    if (addr != AddrSpace_Generic)
    {
        // 2D block load is generated for global memory only.
        if (addr == AddrSpace_Global) def += ImplementSmallLoad2DBlock(spec, false);
        def += commonLoads(addr);
    }
    else
    {
        // Generic pointer: the generated code decides between the global and the local path.
        def += "__builtin_assume((__global char*)mem != 0);\n";
        def += "int memIsGlobal = (0 != SPIRV_BUILTIN(GenericCastToPtrExplicit, "
               "_p1i8_p4i8_i32, _ToGlobal)(__builtin_astype((mem), __generic char*), "
               "StorageWorkgroup));\n";

        def += "if (memIsGlobal) {\n";
        def += ImplementSmallLoad2DBlock(spec, false);
        def += commonLoads(AddrSpace_Global);

        def += "} else { /* mem is local */\n";
        def += commonLoads(AddrSpace_Local);
        def += "}\n";
    }

    def += "}\n\n";
    return def;
}

// Defines the small loads of `spec`: one function per address space (generic,
// local, global - in that order) followed by the checked load when a 2D block
// implementation exists for `spec`.
static string DefineSmallLoad(MatrixSpec spec)
{
    string defs;
    const AddrSpace spaces[] = {AddrSpace_Generic, AddrSpace_Local, AddrSpace_Global};
    for (AddrSpace space : spaces)
    {
        defs += DefineSmallLoadForAddressSpace(spec, space);
    }

    // An empty body means a 2D block load could not be implemented
    // for these parameters, so there is no checked load to define.
    const string checkedBody = ImplementSmallLoad2DBlock(spec, true);
    if (checkedBody.empty()) return defs;

    const string checkedName = GetMatrixFunctionName(spec, AddrSpace_Global, true);
    if (!CheckIfFunctionNameIsUnique(checkedName)) return defs;

    defs += "INLINE void " + checkedName;
    defs += "(__private char *dst, char *mem, int y, int x, int height, int width, "
            "long stride, int cacheOpt) {\n";
    defs += checkedBody;
    defs += "}\n\n";
    return defs;
}

// Defines the small loads (3 address spaces + checked) for every row count in
// [rows_start, rows_end], going from the largest row count down to the smallest.
// All other parameters are taken from `spec`.
static string
DefineSmallLoadPermuteRows(MatrixSpec spec, int rows_start = 1, int rows_end = 8)
{
    string defs;
    int rows = rows_end;
    while (rows >= rows_start)
    {
        const MatrixSpec rowSpec(
            spec.SubGroupSize, spec.Layout, rows, spec.Cols, spec.BitWidth);
        defs += DefineSmallLoad(rowSpec);
        --rows;
    }
    return defs;
}

//
// Large load implementations
//
// Builds the optional "vector continuous" fast path of a large load: each matrix
// row is fetched with one multi-element sub-group block read and the components
// are scattered into the per-slice positions of `dst`.
//
// spec     - shape/layout description; Rows, BitWidth and ContribBitWidth are
//            substituted into the template.
// addr     - address space substituted for the AddrSpace placeholder.
// numLoads - number of sub-loads the large shape is split into (2 or 4 have a
//            template here).
//
// Returns the generated OpenCL C snippet wrapped in a
// `JointMatrixLoadStoreOpt == VECTOR_CONT_IMPL` guard that ends with `return;`,
// or an empty string when no template applies to the (numLoads, Layout) pair.
static string
ImplementLargeLoadVectorContinuous(MatrixSpec spec, AddrSpace addr, int numLoads)
{
    string s;
    if (numLoads == 2 && spec.Layout == Layout_PackedA_RowMajor)
    {
        /* Optimization for big shapes 1d load, where the number of columns is a multiple of the sub-group size.
        Specifically, for sub-group size 16 and 32 columns, we can load 2 elements in one instruction. */
        s += "for (int i = 0; i < Rows; i++) {\n";
        s += "  ushort2 row = intel_sub_group_block_read_us2((AddrSpace ushort *)(mem + "
             "i * stride * ElemByteWidth));\n";
        s += "  *((__private ushort *)(dst +  i         * ContribByteWidth)) = row.x;\n";
        s += "  *((__private ushort *)(dst + (i + Rows) * ContribByteWidth)) = row.y;\n";
        s += "}\n";
    }
    else if (numLoads == 4)
    {
        /* Optimization for big shapes 1d load, where the number of columns is a multiple of the sub-group size.
        Specifically, for sub-group size 16 and 64 columns, we can load 4 elements in one instruction. */
        if (spec.Layout == Layout_Accumulator_RowMajor ||
            spec.Layout == Layout_PackedB_PackedB)
        {
            // One uint4 read per row; component k goes to slice k of dst.
            s += "for (int i = 0; i < Rows; i++) {\n";
            s += "  uint4 row = intel_sub_group_block_read4((AddrSpace uint *)(mem + i * "
                 "stride * ElemByteWidth));\n";
            s += "  *((__private uint *)(dst +  i           * ContribByteWidth)) = "
                 "row.x;\n";
            s += "  *((__private uint *)(dst + (i + Rows  ) * ContribByteWidth)) = "
                 "row.y;\n";
            s += "  *((__private uint *)(dst + (i + Rows*2) * ContribByteWidth)) = "
                 "row.z;\n";
            s += "  *((__private uint *)(dst + (i + Rows*3) * ContribByteWidth)) = "
                 "row.w;\n";
            s += "}\n";
        }
        else
        {
            // Two consecutive memory rows (2*i and 2*i + 1) are read as ushort4 and
            // their matching components are paired into one 32-bit value per slice.
            // NOTE(review): the pointer is cast to `uint *` although the `_us4`
            // read is the 16-bit variant - confirm the cast type is intended.
            s += "for (int i = 0; i < Rows; i++) {\n";
            s += "  ushort4 row0 = intel_sub_group_block_read_us4((AddrSpace uint *)(mem "
                 "+ (2*i    ) * stride * ElemByteWidth));\n";
            s += "  ushort4 row1 = intel_sub_group_block_read_us4((AddrSpace uint *)(mem "
                 "+ (2*i + 1) * stride * ElemByteWidth));\n";
            s += "  *((__private uint *)(dst +  i           * ContribByteWidth)) = "
                 "as_int((ushort2)(row0.x, row1.x));\n";
            s += "  *((__private uint *)(dst + (i + Rows  ) * ContribByteWidth)) = "
                 "as_int((ushort2)(row0.y, row1.y));\n";
            s += "  *((__private uint *)(dst + (i + Rows*2) * ContribByteWidth)) = "
                 "as_int((ushort2)(row0.z, row1.z));\n";
            s += "  *((__private uint *)(dst + (i + Rows*3) * ContribByteWidth)) = "
                 "as_int((ushort2)(row0.w, row1.w));\n";
            s += "}\n";
        }
    }

    // No template matched: the caller falls back to the base implementation only.
    if (!s.size()) return "";

    // Add VECTOR_CONT_IMPL guards
    s = "if (BIF_FLAG_CTRL_GET(JointMatrixLoadStoreOpt) == VECTOR_CONT_IMPL) {\n" + s +
        "return;\n}\n";

    // Replace template words
    s = Replace(s, "Rows", to_string(spec.Rows));
    s = Replace(s, "AddrSpace", "__" + ToString(addr));
    s = Replace(s, "ElemByteWidth", to_string(Bytes(spec.BitWidth)));
    s = Replace(s, "ContribByteWidth", to_string(Bytes(spec.ContribBitWidth)));
    // None of the templates above contains "ElemBytes"; this call is kept for
    // symmetry with the other generators.
    s = Replace(s, "ElemBytes", to_string(Bytes(spec.BitWidth)));
    return s;
}

// ImplementLargeLoadBase - default implementation for large shapes, which reuses
// smaller loads.
//
// The large matrix described by `spec` is split into `numLoads` sub-matrices; the
// generated body computes one destination pointer and one source offset per
// sub-matrix and calls the matching small-load builtin for each of them.
//
// spec      - description of the large shape.
// addr      - address space used to pick the name of the small-load builtin.
// numLoads  - number of sub-loads; only 2 and 4 produce code, any other value
//             leaves the body empty.
// isChecked - when true the checked (y, x, height, width) call form is generated,
//             otherwise the plain (dst, mem, stride, cacheOpt) form.
//
// Returns the generated OpenCL C statements (without the enclosing function).
static string
ImplementLargeLoadBase(MatrixSpec spec, AddrSpace addr, int numLoads, bool isChecked)
{
    // spec.WiRows divided evenly between the sub-loads; used to step `dst`.
    int wiRowsPerLoad = spec.WiRows / numLoads;

    struct StridesResult
    {
        int row, col;
    };

    // Starts as the full shape and is shrunk along the split dimension below, so
    // it ends up describing the shape of a single sub-load.
    StridesResult strides = {spec.Rows, spec.Cols};
    if (numLoads == 2)
    {
        switch (spec.Layout)
        {
        default:
            break;

        /* Explanation of calculation for row stride and column stride.
            PackedA_RowMajor 16x16, sub_group_size=16, using 2 stores example:
            Each subgroup stores 2 of 8x16 slices. Hence, row_stride (# of rows between consecutive stores) = R / 2 = 16 / 2 = 8
            and column_stride (# of columns between consecutive stores) = C = 16.
            NOTE(review): the code below halves the column count, not the row count, which does not match this example - confirm. */
        case Layout_PackedA_RowMajor:
        case Layout_Accumulator_RowMajor:
            strides.col /= 2;
            break;
        }
    }
    else if (numLoads == 4)
    {
        switch (spec.Layout)
        {
        default:
            break;

        /* PackedA_RowMajor 32x16, sub_group_size=8, using 4 stores example:
            Each subgroup stores 4 of 8x16 slices. Hence, row_stride = R / 4 = 32 / 4 = 8 and column_stride = C = 16. */
        case Layout_PackedA_RowMajor:
            strides.row /= 4;
            break;

        /* PackedB_RowMajor, 16x32 (VNNI shape 8x64), sub_group_size=8, using 4 stores example.
            Each subgroup stores 4 of 16x8 slices. Since the shape for matrix B is in VNNI format on device, we store 16 x 8 slice as 8x16.
            Hence, row_stride = R (VNNI'ed) = 8 and column_stride = C (VNNI'ed) / 4 = 64 / 4 = 16. */
        case Layout_PackedB_RowMajor:
        /* PackedB_PackedB, d16 R=16, C=128 (orig shape: d16 32x64), sub_group_size=16, using 8 stores:
            1 store operation handles d32 8x16 (d16 8x32). Hence, row_stride = R /2 = 8 and column_stride = C / 4 = 128 / 4 = 32. */
        case Layout_PackedB_PackedB:
        /* Accumulator_RowMajor 32x32, sub_group_size=8, using 4 stores example:
            Each subgroup stores 4 of 32x8 slices. Hence, row_stride = R = 32 and column_stride = C / 4 = 32 / 4 = 8. */
        case Layout_Accumulator_RowMajor:
            strides.col /= 4;
            break;
        }
    }

    // Helper function
    auto GetMemOffset = [](MatrixSpec spec, StridesResult strides, int index) -> string {
        /* GetMemOffset calculates the memory offset, used in big shapes implementation */
        // Calculates memory offset for multiple loads/stores, where RowStride and ColumnStride are shape of one store
        // and NumLoads is stride in units equal to RowStride or ColumnStride, depending on order in which small matrices are
        // loaded/stored as big matrix.
        // For example, if matrix has shape 32x32 and is being stored using 8 stores 8x16 in that col-major order:
        // 0, 4 <-- each number is matrix 8x16 and index.
        // 1, 5
        // 2, 6
        // 3, 7
        // then parameters would be:
        // RowStride = 8, ColumnStride = 16, NumLoads = 4
        //
        // The returned expression still contains the "NumLoads" placeholder and the
        // runtime variable `stride`; "NumLoads" is substituted by the caller.

        bool
            useRowMajor = // Split into 4 slices col-wise. Host memory location increments by column_stride.
            (spec.Layout == Layout_PackedB_PackedB ||
             spec.Layout == Layout_Accumulator_RowMajor ||
             spec.Layout == Layout_PackedB_RowMajor);

        string colMajor =
            "((IterIndex % NumLoads)*RowStride*stride + (IterIndex / "
            "NumLoads)*ColStride)";
        string rowMajor =
            "((IterIndex / NumLoads)*RowStride*stride + (IterIndex % "
            "NumLoads)*ColStride)";
        string result = (useRowMajor ? rowMajor : colMajor);
        result        = Replace(result, "IterIndex", to_string(index));

        int rowStride = strides.row;
        int colStride = strides.col;
        // PackedB_RowMajor is split into 4 slices col-wise. Host memory location increments by column_stride converted to original shape.
        if (spec.Layout == Layout_PackedB_RowMajor) colStride /= 2;

        result = Replace(result, "RowStride", to_string(rowStride));
        result = Replace(result, "ColStride", to_string(colStride));
        // Presumably a no-op: "IterIndex" was already substituted above (assumes
        // Replace substitutes every occurrence - TODO confirm).
        result = Replace(result, "IterIndex", to_string(index));
        return result;
    };

    string s;
    if (isChecked)
    {
        if (numLoads == 2)
        {
            // Prepare dst pointers
            s += "__private char *dst0 = dst;\n";
            s += "__private char *dst1 = dst + WiRowsPerLoad * ContribByteWidth;\n";
            // Prepare x offsets. The second slice starts at a hard-coded offset of
            // 16 columns (not derived from strides.col). These two statements are
            // emitted without a trailing newline.
            s += "int x0 = x;";
            s += "int x1 = x + 16;";
            // Call load sub-functions
            s += "LoadFunc(dst0, mem, y, x0, height, width, stride, cacheOpt);\n";
            s += "LoadFunc(dst1, mem, y, x1, height, width, stride, cacheOpt);\n";
        }
        else if (numLoads == 4)
        {
            // Prepare dst pointers
            s += "__private char *dst0 = dst;\n";
            s += "__private char *dst1 = dst +     WiRowsPerLoad * ContribByteWidth;\n";
            s += "__private char *dst2 = dst + 2 * WiRowsPerLoad * ContribByteWidth;\n";
            s += "__private char *dst3 = dst + 3 * WiRowsPerLoad * ContribByteWidth;\n";
            // Prepare x offsets (emitted without a trailing newline)
            s += "int x0 = x + " + GetMemOffset(spec, strides, 0) + ";";
            s += "int x1 = x + " + GetMemOffset(spec, strides, 1) + ";";
            s += "int x2 = x + " + GetMemOffset(spec, strides, 2) + ";";
            s += "int x3 = x + " + GetMemOffset(spec, strides, 3) + ";";
            // Call load sub-functions
            s += "LoadFunc(dst0, mem, y, x0, height, width, stride, cacheOpt);\n";
            s += "LoadFunc(dst1, mem, y, x1, height, width, stride, cacheOpt);\n";
            s += "LoadFunc(dst2, mem, y, x2, height, width, stride, cacheOpt);\n";
            s += "LoadFunc(dst3, mem, y, x3, height, width, stride, cacheOpt);\n";
        }
    }
    else
    {
        if (numLoads == 2)
        {
            // Prepare dst pointers
            s += "__private char *dst0 = dst;\n";
            s += "__private char *dst1 = dst + WiRowsPerLoad * ContribByteWidth;\n";
            // Prepare mem (source) pointers. The second slice starts at a
            // hard-coded offset of 16 elements (not derived from strides.col).
            s += "char *mem0 = mem;\n";
            s += "char *mem1 = mem + 16 * ElemByteWidth;\n";
            // Call load sub-functions
            s += "LoadFunc(dst0, mem0, stride, cacheOpt);\n";
            s += "LoadFunc(dst1, mem1, stride, cacheOpt);\n";
        }
        else if (numLoads == 4)
        {
            // Prepare dst pointers
            s += "__private char *dst0 = dst;\n";
            s += "__private char *dst1 = dst +     WiRowsPerLoad * ContribByteWidth;\n";
            s += "__private char *dst2 = dst + 2 * WiRowsPerLoad * ContribByteWidth;\n";
            s += "__private char *dst3 = dst + 3 * WiRowsPerLoad * ContribByteWidth;\n";
            // Prepare mem (source) pointers; GetMemOffset yields an offset in
            // elements, which is scaled to bytes here.
            s += "char *mem0 = mem + " + GetMemOffset(spec, strides, 0) +
                 " * ElemByteWidth;\n";
            s += "char *mem1 = mem + " + GetMemOffset(spec, strides, 1) +
                 " * ElemByteWidth;\n";
            s += "char *mem2 = mem + " + GetMemOffset(spec, strides, 2) +
                 " * ElemByteWidth;\n";
            s += "char *mem3 = mem + " + GetMemOffset(spec, strides, 3) +
                 " * ElemByteWidth;\n";
            // Call load sub-functions
            s += "LoadFunc(dst0, mem0, stride, cacheOpt);\n";
            s += "LoadFunc(dst1, mem1, stride, cacheOpt);\n";
            s += "LoadFunc(dst2, mem2, stride, cacheOpt);\n";
            s += "LoadFunc(dst3, mem3, stride, cacheOpt);\n";
        }
    }

    // Replace template strings.
    // The sub-load builtin is the small load whose shape equals the computed
    // per-slice strides; it is expected to be emitted by the small-load listing.
    MatrixSpec subMatrixSpec(
        spec.SubGroupSize, spec.Layout, strides.row, strides.col, spec.BitWidth);
    string loadFunc = GetMatrixFunctionName(subMatrixSpec, addr, isChecked);

    // The order of the substitutions matters: "WiRowsPerLoad" contains "Rows" as a
    // substring and therefore has to be replaced before "Rows".
    s = Replace(s, "LoadFunc", loadFunc);
    s = Replace(s, "AddrSpace", "__" + ToString(addr));
    s = Replace(s, "ElemByteWidth", "ElemBytes");
    s = Replace(s, "ElemBytes", to_string(Bytes(spec.BitWidth)));
    s = Replace(s, "ContribByteWidth", to_string(Bytes(spec.ContribBitWidth)));
    s = Replace(s, "WiRowsPerLoad", to_string(wiRowsPerLoad));
    s = Replace(s, "NumLoads", to_string(numLoads));
    s = Replace(s, "Rows", to_string(spec.Rows));
    // "LoadArgs" does not occur in any template built above.
    s = Replace(s, "LoadArgs", "");
    return s;
}

//
// Large load function creators
//

// Emits the unchecked large-load builtin for a single address space.
// Returns an empty string when CheckIfFunctionNameIsUnique rejects the name.
// For the generic address space the generated function tests at run time whether
// `mem` casts to global memory and runs the global or the local body accordingly.
static string
DefineLargeLoadForAddressSpace(MatrixSpec spec, AddrSpace addr, int numLoads)
{
    const string funcName = GetMatrixFunctionName(spec, addr, false);
    if (!CheckIfFunctionNameIsUnique(funcName)) return "";

    // Body for one concrete address space: optional vector-continuous fast path
    // followed by the base implementation built from smaller loads.
    auto emitBodyFor = [&spec, numLoads](AddrSpace space) -> string {
        string body = ImplementLargeLoadVectorContinuous(spec, space, numLoads);
        body += ImplementLargeLoadBase(spec, space, numLoads, false);
        return body;
    };

    string out = "INLINE void " + funcName;
    out += "(__private char *dst, char *mem, long stride, int cacheOpt) {\n";

    if (addr != AddrSpace_Generic)
    {
        out += emitBodyFor(addr);
    }
    else
    {
        out += "__builtin_assume((__global char*)mem != 0);\n";
        out += "int memIsGlobal = (0 != SPIRV_BUILTIN(GenericCastToPtrExplicit, "
               "_p1i8_p4i8_i32, _ToGlobal)(__builtin_astype((mem), __generic char*), "
               "StorageWorkgroup));\n";

        out += "if (memIsGlobal) {\n";
        out += emitBodyFor(AddrSpace_Global);

        out += "} else { /* mem is local */\n";
        out += emitBodyFor(AddrSpace_Local);
        out += "}\n";
    }

    out += "}\n";
    return out;
}

// Emits the large-load builtins for the generic, local and global address spaces
// (in that order) and, when spec.DpasSubGroupSize is at least SUB_GROUP_16, the
// checked global variant as well. The shape is split into 2 sub-loads when the
// contribution width is 16 bits and into 4 sub-loads otherwise.
static string DefineLargeLoad(MatrixSpec spec)
{
    const int numLoads = (spec.ContribBitWidth == BITS_16) ? 2 : 4;

    string out = DefineLargeLoadForAddressSpace(spec, AddrSpace_Generic, numLoads);
    out += DefineLargeLoadForAddressSpace(spec, AddrSpace_Local, numLoads);
    out += DefineLargeLoadForAddressSpace(spec, AddrSpace_Global, numLoads);

    // The checked API large load is only generated for sub-group sizes >= 16.
    if (spec.DpasSubGroupSize < SUB_GROUP_16) return out;

    const string checkedName = GetMatrixFunctionName(spec, AddrSpace_Global, true);
    if (!CheckIfFunctionNameIsUnique(checkedName)) return out;

    out += "INLINE void " + checkedName;
    out += "(__private char *dst, char *mem, int y, int x, int height, int width, "
           "long stride, int cacheOpt) {\n";
    out += ImplementLargeLoadBase(spec, AddrSpace_Global, numLoads, true);
    out += "}\n\n";
    return out;
}

//
// Special large load function creators
//
// Emits the unchecked 1x64 "special large" load builtin for one address space.
//
// The generated function selects a strategy at run time through the
// JointMatrixLoadStoreOpt flag:
//   - a single 2D block load (emitted for global memory only),
//   - sub-group block reads (one 4-wide read, or four single reads),
//   - a per-work-item scalar gather as the final fallback.
//
// spec - shape description; BitWidth, ContribBitWidth and WiRows are substituted
//        into the templates. A BitWidth of 32 selects the m4k16 block read, any
//        other width the m2k32 one.
// addr - address space of the emitted function. For the generic address space the
//        generated code tests at run time whether `mem` casts to global memory
//        and runs the global or the local body accordingly.
//
// Returns the generated function definition, or an empty string when
// CheckIfFunctionNameIsUnique rejects the function name.
static string DefineSpecialLarge1x64AddrSpace(MatrixSpec spec, AddrSpace addr)
{
    string funcName = GetMatrixFunctionName(spec, addr, false);
    if (!CheckIfFunctionNameIsUnique(funcName)) return "";

    string implBlock2D =
        "if (BIF_FLAG_CTRL_GET(JointMatrixLoadStoreOpt) >= BLOCK2D_IMPL) {\n"
        "  long offset = as_long(mem);\n"
        // Align the base address down to a 64-byte boundary.
        "  long baseoffset = offset & (~0x3f);\n"
        // 1x64 is loaded as 4x16 (32-bit) or 2x32 (16-bit); the value passed is
        // the row width in bytes minus one.
        "  int width_bytes = ElemBytes * Width_1x64 - 1;\n"
        // Row count minus one.
        "  int height_minus_one = Height_1x64 - 1;\n"
        // JointMatrices are expected to be contiguous in memory, without padding
        // at the end of a row, so the pitch equals the width.
        "  int pitch_bytes = width_bytes;\n"
        // Distance from the aligned base, in elements.
        "  long x = (offset - baseoffset) / ElemBytes;\n"
        "  int2 coords = (int2)(x, 0);\n"
        "  ElemType4 BlockLoadFunc(long, int, int, int, int2, int);\n"
        "  ElemType4 res = BlockLoadFunc(baseoffset, width_bytes, height_minus_one, "
        "pitch_bytes, coords, cacheOpt);\n"
        "  *(__private ElemType4 *)dst = res;\n"
        "  return;\n"
        "}\n";

    // Both sub-group read paths go through the AddrSpace placeholder so that the
    // local variant (and the "mem is local" branch of the generic variant) reads
    // through a __local pointer instead of a hard-coded __global one.
    string implVectors =
        "if(BIF_FLAG_CTRL_GET(JointMatrixLoadStoreOpt) >= VECTOR_CONT_IMPL) { \n"
        "       *(__private ElemType4 *) dst = VecFunc4((AddrSpace ElemType *)mem); \n"
        "   return; \n"
        "} \n"
        "if(BIF_FLAG_CTRL_GET(JointMatrixLoadStoreOpt) >= VECTOR_IMPL) {\n"
        "  __private ElemType *wi_contrib = (__private ElemType *)dst;\n"
        "  for (int i = 0; i < 4; i++)\n"
        "    wi_contrib[i] = VecFunc((AddrSpace ElemType *)mem + i*16);\n"
        "  return;\n"
        "}\n";

    // Fallback: every work item picks its own element out of each 16-wide slice.
    string implScalar =
        "AddrSpace ElemType *ptr = (AddrSpace ElemType *)mem;\n"
        "int slid = get_sub_group_local_id();\n"
        "__private ElemType *wi_contrib = (__private ElemType *)dst;\n"
        "for (int i = 0; i < 4; i++)\n"
        "  wi_contrib[i] = ptr[i*16 + slid];\n";

    string s;
    s += "INLINE void " + funcName;
    s += "(__private char *dst, char *mem, long stride, int cacheOpt) {\n";

    if (addr == AddrSpace_Generic)
    {
        s += "__builtin_assume((__global char*)mem != 0);\n";
        s += "int memIsGlobal = (0 != SPIRV_BUILTIN(GenericCastToPtrExplicit, "
             "_p1i8_p4i8_i32, _ToGlobal)(__builtin_astype((mem), __generic char*), "
             "StorageWorkgroup));\n";
        s += "if (memIsGlobal) {\n";
        s += implBlock2D;
        s += implVectors;
        s += implScalar;
        // Resolve the placeholder for the global branch before the local branch
        // is appended, so each branch gets its own address space.
        s = Replace(s, "AddrSpace", "__" + ToString(AddrSpace_Global));
        s += "} else { /* mem is local */\n";
        s += implVectors;
        s += implScalar;
        s = Replace(s, "AddrSpace", "__" + ToString(AddrSpace_Local));
        s += "}\n";
    }
    else if (addr == AddrSpace_Global)
    {
        s += implBlock2D;
        s += implVectors;
        s += implScalar;
    }
    else
    {
        // 2D block loads are only emitted for global memory.
        s += implVectors;
        s += implScalar;
    }

    s += "}\n\n";
    string vecFunc4 = "intel_sub_group_block_read" +
                      GetVectorLoadSuffix(spec.ContribBitWidth) +
                      ToStringAbove1(spec.WiRows);
    string vecFunc =
        "intel_sub_group_block_read" + GetVectorLoadSuffix(spec.ContribBitWidth);
    if (spec.BitWidth == 32)
    {
        string blockLoadFunc =
            "__builtin_IB_subgroup_block_read_flat_uElemBits_wiWiRows_m4k16v1";
        s = Replace(s, "BlockLoadFunc", blockLoadFunc);
        s = Replace(s, "Width_1x64", string("16"));
        s = Replace(s, "Height_1x64", string("4"));
    }
    else
    {
        string blockLoadFunc =
            "__builtin_IB_subgroup_block_read_flat_uElemBits_wiWiRows_m2k32v1";
        s = Replace(s, "BlockLoadFunc", blockLoadFunc);
        s = Replace(s, "Width_1x64", string("32"));
        s = Replace(s, "Height_1x64", string("2"));
    }
    // Order matters: "VecFunc4" must be substituted before its prefix "VecFunc",
    // and "WiRows"/"ElemBits" only after the block-load name that contains them
    // has been inserted.
    s = Replace(s, "ElemBits", to_string(spec.BitWidth));
    s = Replace(s, "VecFunc4", vecFunc4);
    s = Replace(s, "VecFunc", vecFunc);
    s = Replace(s, "AddrSpace", "__" + ToString(addr));
    s = Replace(s, "ElemBytes", to_string(Bytes(spec.BitWidth)));
    s = Replace(s, "ElemType", GetUnsignedType(spec.BitWidth));
    s = Replace(s, "WiRows", to_string(spec.WiRows));
    return s;
}

// Emits the special 1x64 load builtins: the unchecked variants for the generic,
// local and global address spaces (in that order), then the checked global
// variant, which is built from four checked 1x16 loads placed 16 columns apart.
static string DefineSpecialLarge1x64(MatrixSpec spec)
{
    string out = DefineSpecialLarge1x64AddrSpace(spec, AddrSpace_Generic);
    out += DefineSpecialLarge1x64AddrSpace(spec, AddrSpace_Local);
    out += DefineSpecialLarge1x64AddrSpace(spec, AddrSpace_Global);

    // Checked API special large load
    const string checkedName = GetMatrixFunctionName(spec, AddrSpace_Global, true);
    if (CheckIfFunctionNameIsUnique(checkedName))
    {
        // Name of the checked 1x16 sub-load called for every slice.
        const string subLoad =
            "__builtin_spriv_OpJointMatrixLoadCheckedINTEL_Accumulator_RowMajor_"
            "SG16_1x16_i32_1_v8i8_pi32_i32";
        const int sliceCount = 4;

        out += "INLINE void " + checkedName;
        out += "(__private char *dst, char *mem, int y, int x, int height, int width, "
               "long stride, int cacheOpt) {\n";

        // load 1x64 as 4 loads 1x16: first the destination pointers ...
        for (int slice = 0; slice < sliceCount; slice++)
        {
            const string idx = to_string(slice);
            out += "__private char *dst" + idx + " = dst + " + idx +
                   " * 1 * sizeof(int);\n";
        }
        // ... then one sub-load call per slice.
        for (int slice = 0; slice < sliceCount; slice++)
        {
            const string idx = to_string(slice);
            out += subLoad + "(dst" + idx + ", mem, y, x + " + idx +
                   " * 16, height, width, stride, cacheOpt);\n";
        }
        out += "}\n\n";
    }

    out = Replace(out, "ElemBits", to_string(spec.BitWidth));
    out = Replace(out, "ElemBytes", to_string(Bytes(spec.BitWidth)));
    return out;
}

//
// Listings of load functions
//
// Lists every small-load shape that has to be generated and concatenates their
// definitions in listing order. MatrixSpec arguments are
// (sub-group size, layout, rows, columns, element bit width).
static string DefineAllSmallLoads()
{
    string out;

    // Emits one shape for every row count from 8 down to 1.
    auto permuteRows = [&out](MatrixSpec shape) {
        out += DefineSmallLoadPermuteRows(shape);
    };
    // Emits exactly the given shape.
    auto single = [&out](MatrixSpec shape) { out += DefineSmallLoad(shape); };

    // PackedA, i8:
    permuteRows(MatrixSpec(SUB_GROUP_8, Layout_PackedA_RowMajor, 8, 32, BITS_8));

    permuteRows(MatrixSpec(SUB_GROUP_16, Layout_PackedA_RowMajor, 8, 32, BITS_8));
    single(MatrixSpec(SUB_GROUP_16, Layout_PackedA_ColumnMajor, 8, 32, BITS_8));

    permuteRows(MatrixSpec(SUB_GROUP_32, Layout_PackedA_RowMajor, 8, 32, BITS_8));
    single(MatrixSpec(SUB_GROUP_32, Layout_PackedA_ColumnMajor, 8, 32, BITS_8));

    // PackedA, i16:
    permuteRows(MatrixSpec(SUB_GROUP_8, Layout_PackedA_RowMajor, 8, 16, BITS_16));

    permuteRows(MatrixSpec(SUB_GROUP_16, Layout_PackedA_RowMajor, 8, 16, BITS_16));
    single(MatrixSpec(SUB_GROUP_16, Layout_PackedA_RowMajor, 1, 32, BITS_16));
    single(MatrixSpec(SUB_GROUP_16, Layout_PackedA_ColumnMajor, 8, 16, BITS_16));

    permuteRows(MatrixSpec(SUB_GROUP_32, Layout_PackedA_RowMajor, 8, 16, BITS_16));
    single(MatrixSpec(SUB_GROUP_32, Layout_PackedA_ColumnMajor, 8, 16, BITS_16));

    // PackedA, i32 (tf32):
    single(MatrixSpec(SUB_GROUP_16, Layout_PackedA_RowMajor, 8, 8, BITS_32));

    single(MatrixSpec(SUB_GROUP_32, Layout_PackedA_RowMajor, 8, 8, BITS_32));


    // PackedB, i8:
    single(MatrixSpec(SUB_GROUP_8, Layout_PackedB_RowMajor, 8, 32, BITS_8));
    single(MatrixSpec(SUB_GROUP_8, Layout_PackedB_ColumnMajor, 8, 32, BITS_8));
    single(MatrixSpec(SUB_GROUP_8, Layout_PackedB_PackedB, 8, 32, BITS_8));

    single(MatrixSpec(SUB_GROUP_16, Layout_PackedB_RowMajor, 8, 64, BITS_8));
    single(MatrixSpec(SUB_GROUP_16, Layout_PackedB_ColumnMajor, 8, 64, BITS_8));
    single(MatrixSpec(SUB_GROUP_16, Layout_PackedB_PackedB, 8, 64, BITS_8));

    single(MatrixSpec(SUB_GROUP_32, Layout_PackedB_RowMajor, 8, 64, BITS_8));
    single(MatrixSpec(SUB_GROUP_32, Layout_PackedB_ColumnMajor, 8, 64, BITS_8));
    single(MatrixSpec(SUB_GROUP_32, Layout_PackedB_PackedB, 8, 64, BITS_8));

    // PackedB, i16:
    single(MatrixSpec(SUB_GROUP_8, Layout_PackedB_RowMajor, 8, 16, BITS_16));
    single(MatrixSpec(SUB_GROUP_8, Layout_PackedB_ColumnMajor, 8, 16, BITS_16));
    single(MatrixSpec(SUB_GROUP_8, Layout_PackedB_PackedB, 8, 16, BITS_16));

    single(MatrixSpec(SUB_GROUP_16, Layout_PackedB_RowMajor, 8, 32, BITS_16));
    single(MatrixSpec(SUB_GROUP_16, Layout_PackedB_ColumnMajor, 8, 32, BITS_16));
    single(MatrixSpec(SUB_GROUP_16, Layout_PackedB_PackedB, 8, 32, BITS_16));

    single(MatrixSpec(SUB_GROUP_32, Layout_PackedB_RowMajor, 8, 32, BITS_16));
    single(MatrixSpec(SUB_GROUP_32, Layout_PackedB_ColumnMajor, 8, 32, BITS_16));
    single(MatrixSpec(SUB_GROUP_32, Layout_PackedB_PackedB, 8, 32, BITS_16));

    // PackedB, i32 (tf32):
    single(MatrixSpec(SUB_GROUP_16, Layout_PackedB_RowMajor, 8, 16, BITS_32));

    single(MatrixSpec(SUB_GROUP_32, Layout_PackedB_RowMajor, 8, 16, BITS_32));


    // Accumulator, i16:
    permuteRows(MatrixSpec(SUB_GROUP_16, Layout_Accumulator_RowMajor, 8, 16, BITS_16));

    // Accumulator, i32:
    /* Load accumulator is a special case of load packed A, both are row major: */
    permuteRows(MatrixSpec(SUB_GROUP_8, Layout_Accumulator_RowMajor, 8, 8, BITS_32));
    permuteRows(MatrixSpec(SUB_GROUP_8, Layout_Accumulator_ColumnMajor, 8, 8, BITS_32));

    permuteRows(MatrixSpec(SUB_GROUP_16, Layout_Accumulator_RowMajor, 8, 16, BITS_32));
    permuteRows(
        MatrixSpec(SUB_GROUP_16, Layout_Accumulator_ColumnMajor, 8, 16, BITS_32));

    permuteRows(MatrixSpec(SUB_GROUP_32, Layout_Accumulator_RowMajor, 8, 16, BITS_32));
    permuteRows(
        MatrixSpec(SUB_GROUP_32, Layout_Accumulator_ColumnMajor, 8, 16, BITS_32));


    //
    // Special loads used by big shape loads:
    //

    // PackedA, i16:
    single(MatrixSpec(SUB_GROUP_8, Layout_PackedA_RowMajor, 32, 16, BITS_16));

    single(MatrixSpec(SUB_GROUP_16, Layout_PackedA_RowMajor, 16, 16, BITS_16));
    single(MatrixSpec(SUB_GROUP_16, Layout_PackedA_RowMajor, 32, 16, BITS_16));

    // PackedB, i16:
    single(MatrixSpec(SUB_GROUP_16, Layout_PackedB_RowMajor, 16, 32, BITS_16));
    single(MatrixSpec(SUB_GROUP_16, Layout_PackedB_PackedB, 16, 32, BITS_16));

    // Accumulator, i16:
    single(MatrixSpec(SUB_GROUP_16, Layout_Accumulator_RowMajor, 16, 16, BITS_16));
    single(MatrixSpec(SUB_GROUP_16, Layout_Accumulator_RowMajor, 32, 32, BITS_16));

    // Accumulator, i32:
    single(MatrixSpec(SUB_GROUP_8, Layout_Accumulator_RowMajor, 32, 8, BITS_32));

    single(MatrixSpec(SUB_GROUP_16, Layout_Accumulator_RowMajor, 16, 16, BITS_32));
    single(MatrixSpec(SUB_GROUP_16, Layout_Accumulator_RowMajor, 32, 16, BITS_32));

    return out;
}

// Lists every large-load shape that has to be generated (regular large loads
// first, then the special 1x64 loads) and concatenates their definitions in
// listing order. MatrixSpec arguments are
// (sub-group size, layout, rows, columns, element bit width).
static string DefineAllLargeLoads()
{
    string out;
    auto large   = [&out](MatrixSpec shape) { out += DefineLargeLoad(shape); };
    auto special = [&out](MatrixSpec shape) { out += DefineSpecialLarge1x64(shape); };

    // PackedA, i16:
    // technically it can be implemented using 1 load 32x32, but in that case to put together
    // 8 matrices for dpas, we would need 64 mov instructions
    // each matrix would be composed like that: (wi[0], wi[2], ..., wi[14]), (wi[1], wi[3], ..., wi[15]), ...
    // since only 16 elements are contiguous in memory for each 8x16 matrix for dpas
    // hence implementing load as 2 loads 32x16. That way we get 2 contiguous (for dpas) matrices,
    // which can be easily split on 4 8x16 matrices each.
    large(MatrixSpec(SUB_GROUP_16, Layout_PackedA_RowMajor, 32, 32, BITS_16));

    // PackedB, i16:
    large(MatrixSpec(SUB_GROUP_8, Layout_PackedB_PackedB, 8, 64, BITS_16));
    large(MatrixSpec(SUB_GROUP_8, Layout_PackedB_RowMajor, 8, 64, BITS_16));

    large(MatrixSpec(SUB_GROUP_16, Layout_PackedB_PackedB, 8, 128, BITS_16));
    large(MatrixSpec(SUB_GROUP_16, Layout_PackedB_PackedB, 16, 128, BITS_16));
    large(MatrixSpec(SUB_GROUP_16, Layout_PackedB_RowMajor, 8, 128, BITS_16));
    large(MatrixSpec(SUB_GROUP_16, Layout_PackedB_RowMajor, 16, 128, BITS_16));

    // Accumulator, i32:
    large(MatrixSpec(SUB_GROUP_8, Layout_Accumulator_RowMajor, 32, 32, BITS_32));

    large(MatrixSpec(SUB_GROUP_16, Layout_Accumulator_RowMajor, 32, 64, BITS_32));

    // Accumulator, i16:
    large(MatrixSpec(SUB_GROUP_16, Layout_Accumulator_RowMajor, 32, 64, BITS_16));

    //
    // Special large loads
    //

    // Accumulator, i32 - 1x64
    special(MatrixSpec(SUB_GROUP_16, Layout_Accumulator_RowMajor, 1, 64, BITS_32));

    // Accumulator, i16 - 1x64
    special(MatrixSpec(SUB_GROUP_16, Layout_Accumulator_RowMajor, 1, 64, BITS_16));
    return out;
}

//
// main function prepares outputString and saves it to file
//
int main(int argc, char **argv)
{
    string outputString = FileHeader();
    outputString += DefineAllSmallLoads();
    outputString += DefineAllLargeLoads();
    outputString += CreatedFuncsWarningLog;

    const char *outputPath = "IBiF_matrix_generated.h";
    if (argc > 1)
    {
        outputPath = argv[1];
    }

    // Write outputString to file
    ofstream outputFile(outputPath);
    outputFile << outputString;
    outputFile.close();

    int errorCode = (outputFile ? 0 : 1);
    if (errorCode)
    {
        cerr << "Generation of matrix functions finished with write error. Output file: "
             << outputPath << "\n";
    }
    else
    {
        cout << "Generation of matrix functions finished. Output file: " << outputPath
             << "\n";
    }
    return errorCode;
}