File: local_id_gen.cpp

package info (click to toggle)
intel-compute-runtime 20.44.18297-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 34,780 kB
  • sloc: cpp: 379,729; lisp: 4,931; python: 299; sh: 196; makefile: 8
file content (138 lines) | stat: -rw-r--r-- 6,176 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
/*
 * Copyright (C) 2017-2020 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/helpers/local_id_gen.h"

#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/utilities/cpu_info.h"

#include <array>

namespace NEO {

struct uint16x8_t;
struct uint16x16_t;

// This is the initial value of SIMD for local ID
// computation.  It correlates to the SIMD lane.
// Must be 32byte aligned for AVX2 usage
ALIGNAS(32)
const uint16_t initialLocalID[] = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};

// Lookup table for generating LocalIDs based on the SIMD of the kernel
void (*LocalIDHelper::generateSimd8)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 8>;
void (*LocalIDHelper::generateSimd16)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 16>;
void (*LocalIDHelper::generateSimd32)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 32>;

// Initialize the lookup table based on CPU capabilities
LocalIDHelper::LocalIDHelper() {
    bool supportsAVX2 = CpuInfo::getInstance().isFeatureSupported(CpuInfo::featureAvX2);
    if (supportsAVX2) {
        LocalIDHelper::generateSimd8 = generateLocalIDsSimd<uint16x8_t, 8>;
        LocalIDHelper::generateSimd16 = generateLocalIDsSimd<uint16x16_t, 16>;
        LocalIDHelper::generateSimd32 = generateLocalIDsSimd<uint16x16_t, 32>;
    }
}

LocalIDHelper LocalIDHelper::initializer;

//traditional function to generate local IDs
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
    auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
    bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
    if (useLayoutForImages) {
        generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
    } else if (simd == 32) {
        LocalIDHelper::generateSimd32(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, grfSize != 32);
    } else if (simd == 16) {
        LocalIDHelper::generateSimd16(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, grfSize != 32);
    } else if (simd == 8) {
        LocalIDHelper::generateSimd8(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, grfSize != 32);
    } else {
        generateLocalIDsForSimdOne(buffer, localWorkgroupSize, dimensionsOrder, grfSize);
    }
}

bool isCompatibleWithLayoutForImages(const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, uint16_t simd) {
    uint8_t xMask = simd == 8u ? 0b1 : 0b11;
    uint8_t yMask = 0b11;
    return dimensionsOrder.at(0) == 0 &&
           dimensionsOrder.at(1) == 1 &&
           (localWorkgroupSize.at(0) & xMask) == 0 &&
           (localWorkgroupSize.at(1) & yMask) == 0 &&
           localWorkgroupSize.at(2) == 1u;
}

inline void generateLocalIDsWithLayoutForImages(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t simd) {
    uint8_t rowWidth = simd == 32u ? 32u : 16u;
    uint8_t xDelta = simd == 8u ? 2u : 4u;                                                    // difference between corresponding values in consecutive X rows
    uint8_t yDelta = (simd == 8u || localWorkgroupSize.at(1) == 4u) ? 4u : rowWidth / xDelta; // difference between corresponding values in consecutive Y rows

    auto buffer = reinterpret_cast<uint16_t *>(b);
    uint16_t offset = 0u;
    auto numGrfs = (localWorkgroupSize.at(0) * localWorkgroupSize.at(1) * localWorkgroupSize.at(2) + (simd - 1)) / simd;
    uint8_t xMask = simd == 8u ? 0b1 : 0b11;
    uint16_t x = 0u;
    uint16_t y = 0u;
    for (auto grfId = 0; grfId < numGrfs; grfId++) {
        auto rowX = buffer + offset;
        auto rowY = buffer + offset + rowWidth;
        auto rowZ = buffer + offset + 2 * rowWidth;
        uint16_t extraX = 0u;
        uint16_t extraY = 0u;

        for (uint8_t i = 0u; i < simd; i++) {
            if (i > 0) {
                extraX++;
                if (extraX == xDelta) {
                    extraX = 0u;
                }
                if ((i & xMask) == 0) {
                    extraY++;
                    if (y + extraY == localWorkgroupSize.at(1)) {
                        extraY = 0;
                        x += xDelta;
                    }
                }
            }
            if (x == localWorkgroupSize.at(0)) {
                x = 0u;
                y += yDelta;
                if (y >= localWorkgroupSize.at(1)) {
                    y = 0u;
                }
            }
            rowX[i] = x + extraX;
            rowY[i] = y + extraY;
            rowZ[i] = 0u;
        }
        x += xDelta;
        offset += 3 * rowWidth;
    }
}

void generateLocalIDsForSimdOne(void *b, const std::array<uint16_t, 3> &localWorkgroupSize,
                                const std::array<uint8_t, 3> &dimensionsOrder, uint32_t grfSize) {
    uint32_t xDimNum = dimensionsOrder[0];
    uint32_t yDimNum = dimensionsOrder[1];
    uint32_t zDimNum = dimensionsOrder[2];

    for (int i = 0; i < localWorkgroupSize[zDimNum]; i++) {
        for (int j = 0; j < localWorkgroupSize[yDimNum]; j++) {
            for (int k = 0; k < localWorkgroupSize[xDimNum]; k++) {
                static_cast<uint16_t *>(b)[0] = k;
                static_cast<uint16_t *>(b)[1] = j;
                static_cast<uint16_t *>(b)[2] = i;
                b = ptrOffset(b, grfSize);
            }
        }
    }
}

} // namespace NEO