1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
|
/*
* Copyright (C) 2018-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/local_id_gen.h"
#include <array>
namespace NEO {
// Writes x/y/z local-ID vectors into the buffer `b`, one group of three rows per hardware thread.
//
// Template parameters:
//   Vec  - vector type; must expose numChannels, zero(), one(), store(), the comparison and
//          arithmetic operators used below, and a blend(a, b, mask) overload.
//   simd - SIMD width; it is the per-thread stride along x and, divided by Vec::numChannels,
//          the number of passes made over the buffer.
//
// Parameters:
//   b                   - destination buffer; advanced by Vec::numChannels uint16_t per pass.
//   localWorkgroupSize  - work-group size, indexed by the entries of dimensionsOrder.
//   threadsPerWorkGroup - number of row groups (x, y, z) written per pass.
//   dimensionsOrder     - selects which work-group dimension acts as x, y and z, and at which
//                         row slot each vector is stored.
//   chooseMaxRowSize    - forces a 32-element row size even when simd != 32.
template <typename Vec, int simd>
inline void generateLocalIDsSimd(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup,
                                 const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) {
    const int passes = simd / Vec::numChannels;

    uint32_t slotX = dimensionsOrder[0];
    uint32_t slotY = dimensionsOrder[1];
    uint32_t slotZ = dimensionsOrder[2];

    const Vec limitX(localWorkgroupSize[slotX]);
    const Vec limitY(localWorkgroupSize[slotY]);

    auto zero = Vec::zero();
    auto one = Vec::one();

    // Size in bytes of one row (x, y or z) of a single thread.
    const auto rowBytes = ((simd == 32 || chooseMaxRowSize) ? 32 : 16) * sizeof(uint16_t);

    Vec xWrap;
    Vec yWrap;

    // One carry step: every lane with x >= lwsX gives up lwsX and bumps y; every lane whose y
    // then reaches lwsY gives up lwsY and bumps z. The masks are left in xWrap / yWrap so the
    // callers can decide whether another step is required.
    auto carry = [&](Vec &x, Vec &y, Vec &z) {
        xWrap = x >= limitX;
        auto xExcess = blend(limitX, zero, xWrap); // xWrap ? lwsX : 0
        x -= xExcess;
        auto yCarry = blend(one, zero, xWrap); // xWrap ? 1 : 0
        y += yCarry;

        yWrap = y >= limitY;
        auto yExcess = blend(limitY, zero, yWrap); // yWrap ? lwsY : 0
        y -= yExcess;
        auto zCarry = blend(one, zero, yWrap); // yWrap ? 1 : 0
        z += zCarry;
    };

    // Express the per-thread advance of `simd` work-items as an (x, y, z) delta.
    Vec stepX(simd);
    Vec stepY = zero;
    Vec stepZ = zero;
    do {
        carry(stepX, stepY, stepZ);
    } while (xWrap || yWrap);

    // Each pass fills Vec::numChannels lanes of every row.
    int pass = 0;
    do {
        auto cursor = b;

        // initialLocalID is declared outside this block; presumably it holds the starting
        // lane indices - confirm against its definition.
        Vec x(&initialLocalID[pass * Vec::numChannels]);
        Vec y = zero;
        Vec z = zero;

        // Fold the starting lane values into the work-group extents.
        do {
            carry(x, y, z);
        } while (xWrap);

        for (size_t thread = 0; thread < threadsPerWorkGroup; ++thread) {
            x.store(ptrOffset(cursor, slotX * rowBytes));
            y.store(ptrOffset(cursor, slotY * rowBytes));
            z.store(ptrOffset(cursor, slotZ * rowBytes));

            // Next thread's rows start three row sizes further on.
            cursor = ptrOffset(cursor, 3 * rowBytes);

            // Advance the IDs to the next thread; a single carry step follows each advance.
            x += stepX;
            y += stepY;
            z += stepZ;
            carry(x, y, z);
        }

        // Move on to the lanes handled by the next pass.
        b = ptrOffset(b, Vec::numChannels * sizeof(uint16_t));
    } while (++pass < passes);
}
} // namespace NEO
|