File: mxfp.cpp

package info (click to toggle)
intel-graphics-compiler2 2.28.4-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 792,744 kB
  • sloc: cpp: 5,761,745; ansic: 466,928; lisp: 312,143; python: 114,790; asm: 44,736; pascal: 10,930; sh: 8,033; perl: 7,914; ml: 3,625; awk: 3,523; yacc: 2,747; javascript: 2,667; lex: 1,898; f90: 1,028; cs: 573; xml: 474; makefile: 344; objc: 162
file content (124 lines) | stat: -rw-r--r-- 4,031 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/*========================== begin_copyright_notice ============================

Copyright (C) 2024 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include <cm-cl/math.h>
#include <cm-cl/vector.h>

using namespace cm;

namespace {
// First reduction step: folds 32 rows of 32 16-bit elements into 16 rows.
//
// Src is read as 32 consecutive rows of 32 elements. Every element is ANDed
// with 0x7FFF, which clears its top bit (presumably the sign bit of a 16-bit
// floating-point encoding, making this an absolute value - confirm with the
// caller). Neighbouring element pairs (2k, 2k + 1) of each row are then
// combined with math::maximum, giving 16 values per source row.
//
// Output row P packs the two source rows it covers in an interleaved layout:
// even lanes hold the pair maxima of source row 2P, odd lanes hold the pair
// maxima of source row 2P + 1.
CM_NODEBUG CM_INLINE vector<int16_t, 32 * 16>
reduce_first_step(vector<int16_t, 32 * 32> Src) {
  constexpr int NumRows = 16;
  constexpr int16_t AbsMask = 0x7FFF;

  vector<int16_t, 32 * NumRows> Res;

#pragma unroll
  for (int Pair = 0; Pair < NumRows; ++Pair) {
    const int UpperOffset = 2 * Pair * 32;
    const int LowerOffset = (2 * Pair + 1) * 32;
    const int DstOffset = Pair * 32;

    // Destination row and its even/odd lane selections. Assigning to these
    // selections is the only way Res gets written in this function.
    auto DstRow = Res.template select<32, 1>(DstOffset);
    auto DstEven = DstRow.template select<16, 2>(0);
    auto DstOdd = DstRow.template select<16, 2>(1);

    // Source row 2P -> even lanes of the destination row.
    vector<int16_t, 32> Upper = Src.template select<32, 1>(UpperOffset);
    Upper = Upper & AbsMask;
    vector<int16_t, 16> UpperEven = Upper.template select<16, 2>(0);
    vector<int16_t, 16> UpperOdd = Upper.template select<16, 2>(1);
    DstEven = math::maximum(UpperOdd, UpperEven); // argument order is important

    // Source row 2P + 1 -> odd lanes of the destination row. Note that the
    // operands are passed in the opposite order compared to the row above.
    vector<int16_t, 32> Lower = Src.template select<32, 1>(LowerOffset);
    Lower = Lower & AbsMask;
    vector<int16_t, 16> LowerEven = Lower.template select<16, 2>(0);
    vector<int16_t, 16> LowerOdd = Lower.template select<16, 2>(1);
    DstOdd = math::maximum(LowerEven, LowerOdd); // argument order is important
  }

  return Res;
}

// Subsequent reduction step: halves the number of 32-element rows.
//
// Src is read as Height consecutive rows of 32 elements. For every row the
// lower 16 lanes are combined with the upper 16 lanes via math::maximum,
// giving 16 values per source row. Output row P interleaves the two source
// rows it covers: even lanes hold the result for source row 2P, odd lanes
// hold the result for source row 2P + 1.
//
// Unlike reduce_first_step, no masking is applied here.
template <int Height>
CM_NODEBUG CM_INLINE vector<int16_t, 32 * Height / 2>
reduce_next_step(vector<int16_t, 32 * Height> Src) {
  constexpr int NumRows = Height / 2;

  vector<int16_t, 32 * NumRows> Res;

#pragma unroll
  for (int Pair = 0; Pair < NumRows; ++Pair) {
    const int UpperOffset = 2 * Pair * 32;
    const int LowerOffset = (2 * Pair + 1) * 32;
    const int DstOffset = Pair * 32;

    // Destination row and its even/odd lane selections. Assigning to these
    // selections is the only way Res gets written in this function.
    auto DstRow = Res.template select<32, 1>(DstOffset);
    auto DstEven = DstRow.template select<16, 2>(0);
    auto DstOdd = DstRow.template select<16, 2>(1);

    // Source row 2P: max of its two halves -> even destination lanes.
    vector<int16_t, 32> Upper = Src.template select<32, 1>(UpperOffset);
    vector<int16_t, 16> UpperLow = Upper.template select<16, 1>(0);
    vector<int16_t, 16> UpperHigh = Upper.template select<16, 1>(16);
    DstEven = math::maximum(UpperHigh, UpperLow); // argument order is important

    // Source row 2P + 1: max of its two halves -> odd destination lanes.
    vector<int16_t, 32> Lower = Src.template select<32, 1>(LowerOffset);
    vector<int16_t, 16> LowerLow = Lower.template select<16, 1>(0);
    vector<int16_t, 16> LowerHigh = Lower.template select<16, 1>(16);
    DstOdd = math::maximum(LowerHigh, LowerLow); // argument order is important
  }

  return Res;
}

// Reduces 32 rows of 32 16-bit elements down to a single 32-element vector by
// running reduce_first_step and then four reduce_next_step passes
// (32 -> 16 -> 8 -> 4 -> 2 -> 1 rows).
//
// Each returned lane is the maximum over the masked elements of one source
// row. Because every step interleaves the two rows it merges, the lanes do
// not come out in row order: tracing the interleaving through the five steps,
// lane j carries the source row whose 5-bit index is j with its bits
// reversed.
CM_NODEBUG CM_INLINE vector<int16_t, 32> reduce(vector<int16_t, 32 * 32> Src) {
  vector<int16_t, 32 * 16> Rows16 = reduce_first_step(Src);
  vector<int16_t, 32 * 8> Rows8 = reduce_next_step<16>(Rows16);
  vector<int16_t, 32 * 4> Rows4 = reduce_next_step<8>(Rows8);
  vector<int16_t, 32 * 2> Rows2 = reduce_next_step<4>(Rows4);
  vector<int16_t, 32> Rows1 = reduce_next_step<2>(Rows2);
  return Rows1;
}

// Reorders the 32 lanes of Src.
//
// The input is reinterpreted as 16 32-bit elements (two 16-bit lanes each)
// and passed to math::upconvert_4bit_lut<0> together with an index vector.
// The index table {0, 8, 4, 12, ...} is the 4-bit bit-reversal permutation;
// multiplying by 0x11 copies each 4-bit index into both nibbles of the low
// byte of its 32-bit element before the vector is reinterpreted as bytes.
// NOTE(review): the exact semantics of upconvert_4bit_lut are not visible in
// this file; presumably it gathers the 32-bit source elements according to
// these indices - confirm against the cm-cl math header.
//
// The shuffled data is then de-interleaved: its even 16-bit lanes become
// result lanes 0..15 and its odd 16-bit lanes become result lanes 16..31.
CM_NODEBUG CM_INLINE vector<int16_t, 32> linearize(vector<int16_t, 32> Src) {
  // Source viewed as 16 dwords.
  vector<uint32_t, 16> SrcDwords = Src.template format<uint32_t>();

  // Per-dword selector values: bit-reversed 4-bit indices, duplicated into
  // both nibbles of the low byte.
  cl_vector<uint32_t, 16> BitReversed = {0, 8, 4, 12, 2, 10, 6, 14,
                                         1, 9, 5, 13, 3, 11, 7, 15};
  vector<uint32_t, 16> Selectors(BitReversed);
  Selectors *= 0x11;
  vector<uint8_t, 64> SelectorBytes = Selectors.template format<uint8_t>();

  auto PermutedDwords = math::upconvert_4bit_lut<0>(SrcDwords, SelectorBytes);
  auto Permuted = PermutedDwords.template format<int16_t>();

  vector<int16_t, 32> Res;

  // Even 16-bit lanes of the shuffled data fill the lower half of the result.
  auto PermutedEven = Permuted.template select<16, 2>(0);
  auto LowerHalf = Res.template select<16, 1>(0);
  LowerHalf = PermutedEven;

  // Odd 16-bit lanes fill the upper half.
  auto PermutedOdd = Permuted.template select<16, 2>(1);
  auto UpperHalf = Res.template select<16, 1>(16);
  UpperHalf = PermutedOdd;

  return Res;
}
} // namespace

// C-linkage entry point for the 32x32 reduction. Forwards the 1024-element
// cl_vector argument to reduce() and hands its 32-element result back as a
// cl_vector.
CM_NODEBUG CM_INLINE extern "C" cl_vector<int16_t, 32>
__vc_builtin_mxfp_reduce_32x32(cl_vector<int16_t, 32 * 32> Src) {
  vector<int16_t, 32> Reduced = reduce(Src);
  return Reduced.cl_vector();
}

// C-linkage entry point for the lane reordering. Forwards the 32-element
// cl_vector argument to linearize() and hands its result back as a
// cl_vector.
CM_NODEBUG CM_INLINE extern "C" cl_vector<int16_t, 32>
__vc_builtin_mxfp_linearize(cl_vector<int16_t, 32> Src) {
  vector<int16_t, 32> Linearized = linearize(Src);
  return Linearized.cl_vector();
}