File: IGCBiF_Intrinsics_Dpas.cl

package info (click to toggle)
intel-graphics-compiler 1.0.12504.6-1%2Bdeb12u1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 83,912 kB
  • sloc: cpp: 910,147; lisp: 202,655; ansic: 15,197; python: 4,025; yacc: 2,241; lex: 1,570; pascal: 244; sh: 104; makefile: 25
file content (461 lines) | stat: -rw-r--r-- 26,036 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
/*========================== begin_copyright_notice ============================

Copyright (C) 2020-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#ifndef IGCBIF_INTRINSICS_DPAS_CL
#define IGCBIF_INTRINSICS_DPAS_CL
// This file is to be included into IGCBiF_Intrinsics.cl!

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Names of SIMD8 dpas builtin functions are in the form:
//    __builtin_IB_idpas_<a's precision>_<b's precision>_<depth>_<repeatCount>  // experimental, non-subgroup
//    __builtin_IB_fdpas_<a's precision>_<b's precision>_<depth>_<repeatCount>  // experimental, non-subgroup
//    __builtin_IB_sub_group_idpas_<a's precision>_<b's precision>_<depth>_<repeatCount>
//    __builtin_IB_sub_group_fdpas_<a's precision>_<b's precision>_<depth>_<repeatCount>
//        Note that for fdpas, a and b must have the same precision!
//
//  retty, accty:
//     f
//  Precision:
//     idpas
//         u8/s8 : unsigned/signed 8 bits
//         u4/s4 : unsigned/signed 4 bits
//         u2/s2 : unsigned/signed 2 bits
//     fdpas
//         bf    : bfloat16
//         hf    : fp16 (half)
//  depth : 8
//  repeatCount: 1|2|4|8
//     There is no official support for 3|5|6|7, even though igc already handles
//     these repeat counts, because other related features (types for 5/6/7,
//     block read/write for vector3/5/6/7) are not supported yet.
//
// Note that dpasw has the same format as dpas.
//


// DPAS_DEPTH_8: declares the signed/unsigned variants of one depth-8 dpas builtin.
//
// A single use expands to four prototypes, one for every signed (s) / unsigned (u)
// pairing of a's and b's precision:
//     <prefix>_s<abits>_s<bbits>_8_<rcount>
//     <prefix>_s<abits>_u<bbits>_8_<rcount>
//     <prefix>_u<abits>_s<bbits>_8_<rcount>
//     <prefix>_u<abits>_u<bbits>_8_<rcount>
// Every prototype takes (retty acc, aty a, bty b), returns retty and carries
// __attribute__((const)).
//
// Parameters:
//   prefix : function's prefix, such as __builtin_IB_idpas
//   retty  : function's return type; also the type of acc
//   aty    : type of argument a
//   bty    : type of argument b
//   abits  : number of bits of argument a's precision
//   bbits  : number of bits of argument b's precision
//   rcount : repeat count
//
#define DPAS_DEPTH_8(prefix, retty, aty, bty, abits, bbits, rcount)           \
    retty prefix##_s##abits##_s##bbits##_8_##rcount(retty acc, aty a, bty b)  \
        __attribute__((const));                                               \
    retty prefix##_s##abits##_u##bbits##_8_##rcount(retty acc, aty a, bty b)  \
        __attribute__((const));                                               \
    retty prefix##_u##abits##_s##bbits##_8_##rcount(retty acc, aty a, bty b)  \
        __attribute__((const));                                               \
    retty prefix##_u##abits##_u##bbits##_8_##rcount(retty acc, aty a, bty b)  \
        __attribute__((const));

//
// WI (per-work-item, non-subgroup) version of dpas.
//    Name suffix: <a's precision>_<b's precision>_<depth>_<rcount>
//    Argument 'a' must be uniform. These functions can be used
//    in SIMD8 (preferred), SIMD16, etc.
//    Only depth 8 with repeat count 1 is declared in this section.
// These builtins are experimental.
//
// Each DPAS_DEPTH_8 line below declares the four signed/unsigned variants
// (s/s, s/u, u/s, u/u) for the given pair of a/b precisions.
//
DPAS_DEPTH_8( __builtin_IB_idpas,  int,  int8,    int8,  8, 8, 1 )
DPAS_DEPTH_8( __builtin_IB_idpas,  int,  short8,  int8,  4, 8, 1 )
DPAS_DEPTH_8( __builtin_IB_idpas,  int,  char8,   int8,  2, 8, 1 )
DPAS_DEPTH_8( __builtin_IB_idpas,  int,  int8,    int4,  8, 4, 1 )
DPAS_DEPTH_8( __builtin_IB_idpas,  int,  int8,    int2,  8, 2, 1 )

// 2xint8 (double throughput, a & b are doubled in size)
DPAS_DEPTH_8( __builtin_IB_idpas,  int,  int8,    int8,  4, 4, 1 )
DPAS_DEPTH_8( __builtin_IB_idpas,  int,  short8,  int8,  2, 4, 1 )
DPAS_DEPTH_8( __builtin_IB_idpas,  int,  int8,    int4,  4, 2, 1 )
DPAS_DEPTH_8( __builtin_IB_idpas,  int,  short8,  int4,  2, 2, 1 )

// Float dpas: a and b share one precision (bf = bfloat16, hf = half).
// Both a and b are declared as int8; acc and the result are float.
float __builtin_IB_fdpas_bf_bf_8_1 (float acc, int8 a, int8 b) __attribute__((const));
float __builtin_IB_fdpas_hf_hf_8_1 (float acc, int8 a, int8 b) __attribute__((const));

//
// Sub group version of dpas.
//    Name suffix: <a's precision>_<b's precision>_<depth>_<rcount>
//    These builtins are named "sub_group" because each work-item needs to
//    read data from other SIMD lanes; a single work-item's data alone is not
//    enough to do the computation.
//
//    Note that these functions work for SIMD8 only.
//
//    In the declarations below, acc/return and 'a' are scalars for rcount 1
//    and <rcount>-wide vectors otherwise, while b's type is the same for
//    every repeat count. Each DPAS_DEPTH_8 line declares the four
//    signed/unsigned variants for its a/b precision pair.
//
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int,   int,     int8,  8, 8, 1 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int,   short,   int8,  4, 8, 1 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int,   char,    int8,  2, 8, 1 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int,   int,     int4,  8, 4, 1 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int,   int,     int2,  8, 2, 1 )

DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int2,  int2,    int8,  8, 8, 2 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int2,  short2,  int8,  4, 8, 2 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int2,  char2,   int8,  2, 8, 2 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int2,  int2,    int4,  8, 4, 2 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int2,  int2,    int2,  8, 2, 2 )

DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int4,  int4,    int8,  8, 8, 4 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int4,  short4,  int8,  4, 8, 4 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int4,  char4,   int8,  2, 8, 4 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int4,  int4,    int4,  8, 4, 4 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int4,  int4,    int2,  8, 2, 4 )

DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int8,  int8,    int8,  8, 8, 8 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int8,  short8,  int8,  4, 8, 8 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int8,  char8,   int8,  2, 8, 8 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int8,  int8,    int4,  8, 4, 8 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int8,  int8,    int2,  8, 2, 8 )

// 2xint8 (double throughput, a & b are doubled in size)
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int,   int,     int8,  4, 4, 1 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int,   short,   int8,  2, 4, 1 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int,   int,     int4,  4, 2, 1 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int,   short,   int4,  2, 2, 1 )

DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int2,  int2,    int8,  4, 4, 2 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int2,  short2,  int8,  2, 4, 2 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int2,  int2,    int4,  4, 2, 2 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int2,  short2,  int4,  2, 2, 2 )

DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int4,  int4,    int8,  4, 4, 4 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int4,  short4,  int8,  2, 4, 4 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int4,  int4,    int4,  4, 2, 4 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int4,  short4,  int4,  2, 2, 4 )

DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int8,  int8,    int8,  4, 4, 8 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int8,  short8,  int8,  2, 4, 8 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int8,  int8,    int4,  4, 2, 8 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpas,  int8,  short8,  int4,  2, 2, 8 )

// Sub group float dpas (SIMD8): float acc/result, scalar for rcount 1 and an
// <rcount>-wide vector otherwise; 'a' is int-based with the same width and
// b is int8 for every repeat count.

// bfloat16 (deprecated)
//    These names omit the <a's precision>_<b's precision> part; their
//    signatures match the _bf_bf_ builtins declared right after them.
float  __builtin_IB_sub_group_fdpas_8_1 (float  acc, int  a, int8 b) __attribute__((const)); // deprecated
float2 __builtin_IB_sub_group_fdpas_8_2 (float2 acc, int2 a, int8 b) __attribute__((const)); // deprecated
float4 __builtin_IB_sub_group_fdpas_8_4 (float4 acc, int4 a, int8 b) __attribute__((const)); // deprecated
float8 __builtin_IB_sub_group_fdpas_8_8 (float8 acc, int8 a, int8 b) __attribute__((const)); // deprecated

// bfloat16 (a and b are both bf)
float  __builtin_IB_sub_group_fdpas_bf_bf_8_1 (float  acc, int  a, int8 b) __attribute__((const));
float2 __builtin_IB_sub_group_fdpas_bf_bf_8_2 (float2 acc, int2 a, int8 b) __attribute__((const));
float4 __builtin_IB_sub_group_fdpas_bf_bf_8_4 (float4 acc, int4 a, int8 b) __attribute__((const));
float8 __builtin_IB_sub_group_fdpas_bf_bf_8_8 (float8 acc, int8 a, int8 b) __attribute__((const));

// half (a and b are both hf)
float  __builtin_IB_sub_group_fdpas_hf_hf_8_1 (float  acc, int  a, int8 b) __attribute__((const));
float2 __builtin_IB_sub_group_fdpas_hf_hf_8_2 (float2 acc, int2 a, int8 b) __attribute__((const));
float4 __builtin_IB_sub_group_fdpas_hf_hf_8_4 (float4 acc, int4 a, int8 b) __attribute__((const));
float8 __builtin_IB_sub_group_fdpas_hf_hf_8_8 (float8 acc, int8 a, int8 b) __attribute__((const));

//
// dpasw: the size of 'a' is half that of the corresponding dpas builtin.
//    Not all combinations of precisions are supported. For a dpas builtin in
//    which 'a' is either 2, 4, or 8 GRFs (simd8), its corresponding dpasw is
//    supported. In this way, 'a' is fetched evenly from both EU0 and EU1.
//
//    The following shows what is supported and what is not:
//      1. r = 1 : no builtin supported.
//      2. r = 2 : (int8)   8-bit precision of 'a', and
//                 (2xint8) 4-bit precision of 'a'.
//      3. r = 4 : all combinations of a and b except a=2-bit and b=8-bit.
//      4. r = 8 : all combinations
//
//    Names follow the dpas format with "dpasw" in place of "dpas"; acc, b and
//    the return type are declared exactly as in the dpas builtin of the same
//    suffix.
//
DPAS_DEPTH_8( __builtin_IB_sub_group_idpasw,  int2,  int,     int8,  8, 8, 2 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpasw,  int2,  int,     int4,  8, 4, 2 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpasw,  int2,  int,     int2,  8, 2, 2 )

DPAS_DEPTH_8( __builtin_IB_sub_group_idpasw,  int4,  int2,    int8,  8, 8, 4 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpasw,  int4,  short2,  int8,  4, 8, 4 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpasw,  int4,  int2,    int4,  8, 4, 4 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpasw,  int4,  int2,    int2,  8, 2, 4 )

DPAS_DEPTH_8( __builtin_IB_sub_group_idpasw,  int8,  int4,    int8,  8, 8, 8 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpasw,  int8,  short4,  int8,  4, 8, 8 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpasw,  int8,  char4,   int8,  2, 8, 8 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpasw,  int8,  int4,    int4,  8, 4, 8 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpasw,  int8,  int4,    int2,  8, 2, 8 )

// 2xint8 (double throughput)
DPAS_DEPTH_8( __builtin_IB_sub_group_idpasw,  int2,  int,     int8,  4, 4, 2 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpasw,  int2,  int,     int4,  4, 2, 2 )

DPAS_DEPTH_8( __builtin_IB_sub_group_idpasw,  int4,  int2,    int8,  4, 4, 4 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpasw,  int4,  short2,  int8,  2, 4, 4 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpasw,  int4,  int2,    int4,  4, 2, 4 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpasw,  int4,  short2,  int4,  2, 2, 4 )

DPAS_DEPTH_8( __builtin_IB_sub_group_idpasw,  int8,  int4,    int8,  4, 4, 8 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpasw,  int8,  short4,  int8,  2, 4, 8 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpasw,  int8,  int4,    int4,  4, 2, 8 )
DPAS_DEPTH_8( __builtin_IB_sub_group_idpasw,  int8,  short4,  int4,  2, 2, 8 )

// bfloat16 (float acc/result; 'a' is half the width used by fdpas_bf_bf)
float2 __builtin_IB_sub_group_fdpasw_bf_bf_8_2 (float2 acc, int  a, int8 b) __attribute__((const));
float4 __builtin_IB_sub_group_fdpasw_bf_bf_8_4 (float4 acc, int2 a, int8 b) __attribute__((const));
float8 __builtin_IB_sub_group_fdpasw_bf_bf_8_8 (float8 acc, int4 a, int8 b) __attribute__((const));

// half (float acc/result; 'a' is half the width used by fdpas_hf_hf)
float2 __builtin_IB_sub_group_fdpasw_hf_hf_8_2 (float2 acc, int  a, int8 b) __attribute__((const));
float4 __builtin_IB_sub_group_fdpasw_hf_hf_8_4 (float4 acc, int2 a, int8 b) __attribute__((const));
float8 __builtin_IB_sub_group_fdpasw_hf_hf_8_8 (float8 acc, int4 a, int8 b) __attribute__((const));

// Names of SIMD16 dpas builtin functions are in the form:
//    __builtin_IB_sub_group16_idpas_<a's precision>_<b's precision>_<depth>_<repeatCount>
//    __builtin_IB_sub_group16_fdpas_<retty>_<accty>_<a's precision>_<b's precision>_<depth>_<repeatCount>
//        Note that for fdpas, a and b must have the same precision!
//
// In addition to the operand types supported on XeHP_SDV, PVC supports
// additional types. Here are all supported types:
//
// retty, accty:
//    f, bf, hf
//
//  Precision:
//     idpas
//         u8/s8 : unsigned/signed 8 bits
//         u4/s4 : unsigned/signed 4 bits
//         u2/s2 : unsigned/signed 2 bits
//     fdpas
//         bf    : bfloat16
//         hf    : fp16 (half)
//         bf8   : bfloat8
//         tf32  : tensorFloat
//  depth : 8
//  repeatCount: 1|2|4|8
//     There is no official support for 3|5|6|7, even though igc already handles
//     these repeat counts, because other related features (types for 5/6/7,
//     block read/write for vector3/5/6/7) are not supported yet.
//
//   For SIMD16 DPAS, the base type of 'a' is halved for each work-item compared
//   with XeHP_SDV SIMD8 dpas.  Thus, a's type is changed from int to short, and
//   from short to char.  As there is no 4-bit integer type, the SIMD16 version of
//   a simd8 dpas with char as a's base type is not provided, so there is no
//   support via intrinsics for now. This affects the src1-src2 pair
//   [2-bit, 8-bit precision] only.
//
//   Also, the intrinsic is not overloaded: the substring "sub_group16" is used to
//   distinguish it from XeHP_SDV's simd8 intrinsic "sub_group". For fdpas, the
//   return type and acc's type are encoded in the names; see the name formats
//   listed at the top of this comment.
//
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int,   short,   int8,  8, 8, 1 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int,   char,    int8,  4, 8, 1 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int,   short,   int4,  8, 4, 1 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int,   short,   int2,  8, 2, 1 )

DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int2,  short2,  int8,  8, 8, 2 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int2,  char2,   int8,  4, 8, 2 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int2,  short2,  int4,  8, 4, 2 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int2,  short2,  int2,  8, 2, 2 )

DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int4,  short4,  int8,  8, 8, 4 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int4,  char4,   int8,  4, 8, 4 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int4,  short4,  int4,  8, 4, 4 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int4,  short4,  int2,  8, 2, 4 )

DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int8,  short8,  int8,  8, 8, 8 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int8,  char8,   int8,  4, 8, 8 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int8,  short8,  int4,  8, 4, 8 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int8,  short8,  int2,  8, 2, 8 )

// 2xint8 (double throughput, a & b are doubled in size)
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int,   short,   int8,  4, 4, 1 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int,   char,    int8,  2, 4, 1 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int,   short,   int4,  4, 2, 1 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int,   char,    int4,  2, 2, 1 )

DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int2,  short2,  int8,  4, 4, 2 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int2,  char2,   int8,  2, 4, 2 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int2,  short2,  int4,  4, 2, 2 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int2,  char2,   int4,  2, 2, 2 )

DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int4,  short4,  int8,  4, 4, 4 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int4,  char4,   int8,  2, 4, 4 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int4,  short4,  int4,  4, 2, 4 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int4,  char4,   int4,  2, 2, 4 )

DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int8,  short8,  int8,  4, 4, 8 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int8,  char8,   int8,  2, 4, 8 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int8,  short8,  int4,  4, 2, 8 )
DPAS_DEPTH_8( __builtin_IB_sub_group16_idpas,  int8,  char8,   int4,  2, 2, 8 )

//
//  float/bfloat sub_group dpas naming convention (PVC, simd16 only)
//    __builtin_IB_sub_group16_fdpas_<rty>_<accty>_<aty>_<bty>_<depth>_<rcount>
//        rty:  return base type:  f (float32), hf(half), bf(bfloat)
//      accty:  acc's base type:   f (float32), hf(half), bf(bfloat)
//    aty/bty:  a/b's base type:  bf (bfloat),  hf(half)
//                                bf8, tf32(tensorFloat)
//
//  Note that OCL has no bfloat type, so short is used in its place.
//
//  For each a/b type and repeat count, the bf and hf groups below declare all
//  four return/acc combinations: (f, f), (bf|hf, f), (f, bf|hf), (bf|hf, bf|hf).
//  The return value, acc and 'a' are scalars for rcount 1 and <rcount>-wide
//  vectors otherwise; b is always int8.
//
//  NOTE(review): bf8 is listed as an a/b type above, but no bf8 fdpas builtin
//  is declared in this section.
//

// bfloat16, rcount = 1, simd16
float  __builtin_IB_sub_group16_fdpas_f_f_bf_bf_8_1   (float  acc, short  a, int8 b) __attribute__((const));
short  __builtin_IB_sub_group16_fdpas_bf_f_bf_bf_8_1  (float  acc, short  a, int8 b) __attribute__((const));
float  __builtin_IB_sub_group16_fdpas_f_bf_bf_bf_8_1  (short  acc, short  a, int8 b) __attribute__((const));
short  __builtin_IB_sub_group16_fdpas_bf_bf_bf_bf_8_1 (short  acc, short  a, int8 b) __attribute__((const));

// bfloat16, rcount = 2, simd16
float2  __builtin_IB_sub_group16_fdpas_f_f_bf_bf_8_2   (float2  acc, short2  a, int8 b) __attribute__((const));
short2  __builtin_IB_sub_group16_fdpas_bf_f_bf_bf_8_2  (float2  acc, short2  a, int8 b) __attribute__((const));
float2  __builtin_IB_sub_group16_fdpas_f_bf_bf_bf_8_2  (short2  acc, short2  a, int8 b) __attribute__((const));
short2  __builtin_IB_sub_group16_fdpas_bf_bf_bf_bf_8_2 (short2  acc, short2  a, int8 b) __attribute__((const));

// bfloat16, rcount = 4, simd16
float4  __builtin_IB_sub_group16_fdpas_f_f_bf_bf_8_4   (float4  acc, short4  a, int8 b) __attribute__((const));
short4  __builtin_IB_sub_group16_fdpas_bf_f_bf_bf_8_4  (float4  acc, short4  a, int8 b) __attribute__((const));
float4  __builtin_IB_sub_group16_fdpas_f_bf_bf_bf_8_4  (short4  acc, short4  a, int8 b) __attribute__((const));
short4  __builtin_IB_sub_group16_fdpas_bf_bf_bf_bf_8_4 (short4  acc, short4  a, int8 b) __attribute__((const));

// bfloat16, rcount = 8, simd16
float8  __builtin_IB_sub_group16_fdpas_f_f_bf_bf_8_8   (float8  acc, short8  a, int8 b) __attribute__((const));
short8  __builtin_IB_sub_group16_fdpas_bf_f_bf_bf_8_8  (float8  acc, short8  a, int8 b) __attribute__((const));
float8  __builtin_IB_sub_group16_fdpas_f_bf_bf_bf_8_8  (short8  acc, short8  a, int8 b) __attribute__((const));
short8  __builtin_IB_sub_group16_fdpas_bf_bf_bf_bf_8_8 (short8  acc, short8  a, int8 b) __attribute__((const));

// half, rcount = 1, simd16
float  __builtin_IB_sub_group16_fdpas_f_f_hf_hf_8_1   (float acc,  short  a, int8 b) __attribute__((const));
half   __builtin_IB_sub_group16_fdpas_hf_f_hf_hf_8_1  (float acc,  short  a, int8 b) __attribute__((const));
float  __builtin_IB_sub_group16_fdpas_f_hf_hf_hf_8_1  (half  acc,  short  a, int8 b) __attribute__((const));
half   __builtin_IB_sub_group16_fdpas_hf_hf_hf_hf_8_1 (half  acc,  short  a, int8 b) __attribute__((const));

// half, rcount = 2, simd16
float2  __builtin_IB_sub_group16_fdpas_f_f_hf_hf_8_2   (float2 acc,  short2  a, int8 b) __attribute__((const));
half2   __builtin_IB_sub_group16_fdpas_hf_f_hf_hf_8_2  (float2 acc,  short2  a, int8 b) __attribute__((const));
float2  __builtin_IB_sub_group16_fdpas_f_hf_hf_hf_8_2  (half2  acc,  short2  a, int8 b) __attribute__((const));
half2   __builtin_IB_sub_group16_fdpas_hf_hf_hf_hf_8_2 (half2  acc,  short2  a, int8 b) __attribute__((const));

// half, rcount = 4, simd16
float4  __builtin_IB_sub_group16_fdpas_f_f_hf_hf_8_4   (float4 acc,  short4  a, int8 b) __attribute__((const));
half4   __builtin_IB_sub_group16_fdpas_hf_f_hf_hf_8_4  (float4 acc,  short4  a, int8 b) __attribute__((const));
float4  __builtin_IB_sub_group16_fdpas_f_hf_hf_hf_8_4  (half4  acc,  short4  a, int8 b) __attribute__((const));
half4   __builtin_IB_sub_group16_fdpas_hf_hf_hf_hf_8_4 (half4  acc,  short4  a, int8 b) __attribute__((const));

// half, rcount = 8, simd16
float8  __builtin_IB_sub_group16_fdpas_f_f_hf_hf_8_8   (float8 acc,  short8  a, int8 b) __attribute__((const));
half8   __builtin_IB_sub_group16_fdpas_hf_f_hf_hf_8_8  (float8 acc,  short8  a, int8 b) __attribute__((const));
float8  __builtin_IB_sub_group16_fdpas_f_hf_hf_hf_8_8  (half8  acc,  short8  a, int8 b) __attribute__((const));
half8   __builtin_IB_sub_group16_fdpas_hf_hf_hf_hf_8_8 (half8  acc,  short8  a, int8 b) __attribute__((const));



// tf32: only the float-return / float-acc combination is declared.
// NOTE(review): 'a' uses short elements here, while the tf32 conversion
// builtins later in this file describe tf32 as taking 32-bit storage --
// confirm that these operand types are intended.

// tf32, rcount = 1, simd16
float   __builtin_IB_sub_group16_fdpas_f_f_tf32_tf32_8_1  (float  acc, short  a, int8 b) __attribute__((const));

// tf32, rcount = 2, simd16
float2  __builtin_IB_sub_group16_fdpas_f_f_tf32_tf32_8_2  (float2 acc, short2 a, int8 b) __attribute__((const));

// tf32, rcount = 4, simd16
float4  __builtin_IB_sub_group16_fdpas_f_f_tf32_tf32_8_4  (float4 acc, short4 a, int8 b) __attribute__((const));

// tf32, rcount = 8, simd16
float8  __builtin_IB_sub_group16_fdpas_f_f_tf32_tf32_8_8  (float8 acc, short8 a, int8 b) __attribute__((const));


//
// Pure bfloat16/half float DPAS (acc and return type are also bfloat16/half).
// bfloat16 values are declared as short; half values use the half type.
//
// WI (non-subgroup) versions, rcount = 1 only.
short __builtin_IB_bfdpas_bf_bf_8_1 (short acc, int8 a, int8 b) __attribute__((const));  // deprecated
half  __builtin_IB_hfdpas_hf_hf_8_1 (half acc, int8 a, int8 b) __attribute__((const));   // deprecated
// Pure sub group version of fdpas.
//    Name suffix: <a's precision>_<b's precision>_<depth>_<rcount>
//    acc/return and 'a' are <rcount>-wide (scalar for rcount 1); b is int8.
// pure bfloat16  -- deprecated
short  __builtin_IB_sub_group_bfdpas_bf_bf_8_1 (short  acc, int  a, int8 b) __attribute__((const));
short2 __builtin_IB_sub_group_bfdpas_bf_bf_8_2 (short2 acc, int2 a, int8 b) __attribute__((const));
short4 __builtin_IB_sub_group_bfdpas_bf_bf_8_4 (short4 acc, int4 a, int8 b) __attribute__((const));
short8 __builtin_IB_sub_group_bfdpas_bf_bf_8_8 (short8 acc, int8 a, int8 b) __attribute__((const));
// pure half -- deprecated
half  __builtin_IB_sub_group_hfdpas_hf_hf_8_1 (half  acc, int  a, int8 b) __attribute__((const));
half2 __builtin_IB_sub_group_hfdpas_hf_hf_8_2 (half2 acc, int2 a, int8 b) __attribute__((const));
half4 __builtin_IB_sub_group_hfdpas_hf_hf_8_4 (half4 acc, int4 a, int8 b) __attribute__((const));
half8 __builtin_IB_sub_group_hfdpas_hf_hf_8_8 (half8 acc, int8 a, int8 b) __attribute__((const));


// bf <--> float conversion
//    bf : there is no igc type for bf yet, so short is used as an *opaque*
//         type for it.
//    The numeric suffix of each builtin is the vector width of its operands
//    and result (1 = scalar).
//
// float -> bf conversion builtins (rte rounding mode)
short   __builtin_IB_ftobf_1 (float   a) __attribute__((const));
short2  __builtin_IB_ftobf_2 (float2  a) __attribute__((const));
short3  __builtin_IB_ftobf_3 (float3  a) __attribute__((const));
short4  __builtin_IB_ftobf_4 (float4  a) __attribute__((const));
short8  __builtin_IB_ftobf_8 (float8  a) __attribute__((const));
short16 __builtin_IB_ftobf_16(float16 a) __attribute__((const));

// bf -> float conversion builtins (precise conversion)
float   __builtin_IB_bftof_1 (short   a) __attribute__((const));
float2  __builtin_IB_bftof_2 (short2  a) __attribute__((const));
float3  __builtin_IB_bftof_3 (short3  a) __attribute__((const));
float4  __builtin_IB_bftof_4 (short4  a) __attribute__((const));
float8  __builtin_IB_bftof_8 (short8  a) __attribute__((const));
float16 __builtin_IB_bftof_16(short16 a) __attribute__((const));

// 2 floats --> packed 2 bf (rte rounding mode)
//    Each result element is an int holding the two converted bf values.
//    NOTE(review): which of a/b lands in the low half of the int is not
//    visible from these declarations -- confirm against the implementation.
int   __builtin_IB_2fto2bf_1 (float   a, float   b) __attribute__((const));
int2  __builtin_IB_2fto2bf_2 (float2  a, float2  b) __attribute__((const));
int3  __builtin_IB_2fto2bf_3 (float3  a, float3  b) __attribute__((const));
int4  __builtin_IB_2fto2bf_4 (float4  a, float4  b) __attribute__((const));
int8  __builtin_IB_2fto2bf_8 (float8  a, float8  b) __attribute__((const));
int16 __builtin_IB_2fto2bf_16(float16 a, float16 b) __attribute__((const));


// bf8 <--> half float conversion
//    bf8 : there is no igc type for bf8 yet, so char is used as an *opaque*
//          type for it.
//    Rounding: RTNE
//    The numeric suffix of each builtin is the vector width of its operand
//    and result (1 = scalar).
//
// hf -> bf8 conversion builtins (rte rounding mode)
char   __builtin_IB_hftobf8_1 (half   a) __attribute__((const));
char2  __builtin_IB_hftobf8_2 (half2  a) __attribute__((const));
char3  __builtin_IB_hftobf8_3 (half3  a) __attribute__((const));
char4  __builtin_IB_hftobf8_4 (half4  a) __attribute__((const));
char8  __builtin_IB_hftobf8_8 (half8  a) __attribute__((const));
char16 __builtin_IB_hftobf8_16(half16 a) __attribute__((const));

// bf8 -> hf conversion builtins (precise conversion)
half   __builtin_IB_bf8tohf_1 (char   a) __attribute__((const));
half2  __builtin_IB_bf8tohf_2 (char2  a) __attribute__((const));
half3  __builtin_IB_bf8tohf_3 (char3  a) __attribute__((const));
half4  __builtin_IB_bf8tohf_4 (char4  a) __attribute__((const));
half8  __builtin_IB_bf8tohf_8 (char8  a) __attribute__((const));
half16 __builtin_IB_bf8tohf_16(char16 a) __attribute__((const));


// tf32 <--> float conversion
//    tf32 : there is no igc type for tf32, so int is used as an *opaque*
//           type for it.
//           (tf32: 19 bits, taking 32bit storage)
//    The numeric suffix of each builtin is the vector width of its operand
//    and result (1 = scalar).
//
// f -> tf32 conversion builtins (rte rounding mode)
int   __builtin_IB_ftotf32_1 (float   a) __attribute__((const));
int2  __builtin_IB_ftotf32_2 (float2  a) __attribute__((const));
int3  __builtin_IB_ftotf32_3 (float3  a) __attribute__((const));
int4  __builtin_IB_ftotf32_4 (float4  a) __attribute__((const));
int8  __builtin_IB_ftotf32_8 (float8  a) __attribute__((const));
int16 __builtin_IB_ftotf32_16(float16 a) __attribute__((const));

// tf32 -> f conversion builtins (precise conversion, use shl a, 13)
float   __builtin_IB_tf32tof_1 (int   a) __attribute__((const));
float2  __builtin_IB_tf32tof_2 (int2  a) __attribute__((const));
float3  __builtin_IB_tf32tof_3 (int3  a) __attribute__((const));
float4  __builtin_IB_tf32tof_4 (int4  a) __attribute__((const));
float8  __builtin_IB_tf32tof_8 (int8  a) __attribute__((const));
float16 __builtin_IB_tf32tof_16(int16 a) __attribute__((const));

// Stochastic rounding : srnd d  a  r
//      d: destination (result), bf8 | hf
//      a: value to convert, hf | f
//      r: random number; has the same type as a
//  bf8 results are declared as char. The numeric suffix of each builtin is
//  the vector width of its operands and result (1 = scalar).
//
//  HF -> BF8
char   __builtin_IB_srnd_hftobf8_1 (half   a, half   r) __attribute__((const));
char2  __builtin_IB_srnd_hftobf8_2 (half2  a, half2  r) __attribute__((const));
char3  __builtin_IB_srnd_hftobf8_3 (half3  a, half3  r) __attribute__((const));
char4  __builtin_IB_srnd_hftobf8_4 (half4  a, half4  r) __attribute__((const));
char8  __builtin_IB_srnd_hftobf8_8 (half8  a, half8  r) __attribute__((const));
char16 __builtin_IB_srnd_hftobf8_16(half16 a, half16 r) __attribute__((const));


// F -> HF
half   __builtin_IB_srnd_ftohf_1 (float   a, float   r) __attribute__((const));
half2  __builtin_IB_srnd_ftohf_2 (float2  a, float2  r) __attribute__((const));
half3  __builtin_IB_srnd_ftohf_3 (float3  a, float3  r) __attribute__((const));
half4  __builtin_IB_srnd_ftohf_4 (float4  a, float4  r) __attribute__((const));
half8  __builtin_IB_srnd_ftohf_8 (float8  a, float8  r) __attribute__((const));
half16 __builtin_IB_srnd_ftohf_16(float16 a, float16 r) __attribute__((const));

#endif // IGCBIF_INTRINSICS_DPAS_CL