1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
|
#version 450 core
#extension GL_KHR_memory_scope_semantics : enable
#extension GL_KHR_cooperative_matrix : enable
#extension GL_EXT_shader_explicit_arithmetic_types : enable
#extension GL_NV_cooperative_matrix2 : enable
layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
buffer BufType {
float16_t x[];
} Buf;
uint32_t addr(const in uint32_t x, const in uint32_t y) {
return y*64+x;
}
uint32_t foo() { return 124; }
float16_t relu(const in uint32_t row, const in uint32_t col, const in float16_t x) { return max(x, float16_t(0)); }
float16_t add(const in uint32_t row, const in uint32_t col, const in float16_t x, const in float16_t y) { return x+y; }
float16_t combineSum(const in float16_t a, const in float16_t b) { return a + b; }
float16_t combineMax(const in float16_t a, const in float16_t b) { return max(a, b); }
layout(constant_id = 0) const uint32_t Dim = 32;
void main()
{
coopmat<float16_t, gl_ScopeWorkgroup, 64, 32, gl_MatrixUseA> A;
coopmat<float16_t, gl_ScopeWorkgroup, 64, 32, gl_MatrixUseB> B;
coopmat<float16_t, gl_ScopeWorkgroup, 64, 32, gl_MatrixUseAccumulator> Acc;
A = coopmat<float16_t, gl_ScopeWorkgroup, 64, 32, gl_MatrixUseA>(Acc);
B = coopmat<float16_t, gl_ScopeWorkgroup, 64, 32, gl_MatrixUseB>(Acc);
coopmat<float16_t, gl_ScopeWorkgroup, 32, 64, gl_MatrixUseB> tr;
coopMatTransposeNV(tr, Acc);
coopMatReduceNV(Acc, Acc, gl_CooperativeMatrixReduceRowNV, combineSum);
coopMatReduceNV(Acc, Acc, gl_CooperativeMatrixReduceColumnNV, combineSum);
coopMatReduceNV(Acc, Acc, gl_CooperativeMatrixReduceRowAndColumnNV, combineSum);
coopmat<float16_t, gl_ScopeWorkgroup, 32, 16, gl_MatrixUseAccumulator> Acc2x2;
coopMatReduceNV(Acc2x2, Acc, gl_CooperativeMatrixReduce2x2NV, combineMax);
//coopMatLoadTensorNV(A, Buf.x, foo(), addr);
//coopMatStoreTensorNV(A, Buf.x, foo(), addr);
coopMatPerElementNV(Acc, Acc, relu);
coopMatPerElementNV(Acc, Acc, add, float16_t(1.0));
coopmat<float16_t, gl_ScopeWorkgroup, 64, 32, gl_MatrixUseAccumulator> Accf16;
coopmat<float32_t, gl_ScopeWorkgroup, 64, 32, gl_MatrixUseAccumulator> Accf32;
coopmat<uint32_t, gl_ScopeWorkgroup, 64, 32, gl_MatrixUseAccumulator> Accu32;
coopmat<int32_t, gl_ScopeWorkgroup, 64, 32, gl_MatrixUseAccumulator> Accs32;
coopmat<uint8_t, gl_ScopeWorkgroup, 64, 32, gl_MatrixUseA>(Accu32);
coopmat<int8_t, gl_ScopeWorkgroup, 64, 32, gl_MatrixUseA>(Accu32);
coopmat<float16_t, gl_ScopeWorkgroup, 64, 32, gl_MatrixUseA>(Accu32);
coopmat<uint8_t, gl_ScopeWorkgroup, 64, 32, gl_MatrixUseA>(Accs32);
coopmat<int8_t, gl_ScopeWorkgroup, 64, 32, gl_MatrixUseA>(Accs32);
coopmat<float16_t, gl_ScopeWorkgroup, 64, 32, gl_MatrixUseA>(Accs32);
coopmat<uint8_t, gl_ScopeWorkgroup, 64, 32, gl_MatrixUseA>(Accf16);
coopmat<int8_t, gl_ScopeWorkgroup, 64, 32, gl_MatrixUseA>(Accf16);
coopmat<float16_t, gl_ScopeWorkgroup, 64, 32, gl_MatrixUseA>(Accf16);
coopmat<uint8_t, gl_ScopeWorkgroup, 64, 32, gl_MatrixUseA>(Accf32);
coopmat<int8_t, gl_ScopeWorkgroup, 64, 32, gl_MatrixUseA>(Accf32);
coopmat<float16_t, gl_ScopeWorkgroup, 64, 32, gl_MatrixUseA>(Accf32);
coopmat<float, gl_ScopeWorkgroup, Dim, Dim, gl_MatrixUseAccumulator> li, mijm1;
mijm1 = coopmat<float, gl_ScopeWorkgroup, Dim, Dim, gl_MatrixUseAccumulator>(-1.0/0.0);
}
|