File: RayDispatchGlobalData.h

package info (click to toggle)
intel-graphics-compiler2 2.16.0-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 106,644 kB
  • sloc: cpp: 805,640; lisp: 287,672; ansic: 16,414; python: 3,952; yacc: 2,588; lex: 1,666; pascal: 313; sh: 186; makefile: 35
file content (386 lines) | stat: -rw-r--r-- 17,271 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
/*========================== begin_copyright_notice ============================

Copyright (C) 2020-2025 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#pragma once

#include <stddef.h>
#include <stdint.h>
#undef HAS_INCLUDE_TYPE_TRAITS
#ifdef _MSC_VER
#if !defined(__clang__) || (__clang_major__ > 16)
#define HAS_INCLUDE_TYPE_TRAITS
#endif
#endif // _MSC_VER
#ifdef HAS_INCLUDE_TYPE_TRAITS
#include <type_traits>
#endif

// This code defines a set of macros, that allows us to intentionally disable certain warnings
// in a way that is both concise and portable across compilers.
// We need to disable the warnings in such way because this C++ file may be compiled with Clang, GCC, and Microsoft
// Visual Studio Compiler This works around (at least) two known problems:
//  - QuickBuild treats compiler warnings as errors
//  - RTStackReflection.exe generates warnings when compiling this file
//
// The only macros that are used by the main code are:
//     DISABLE_WARNING_PUSH
//     DISABLE_WARNING_POP
//     and the macros for the individual warnings
// The others are lower level internal utilities, used only by higher level macros.
//
// Explanation of code in original resource : https://www.fluentcpp.com/2019/08/30/how-to-disable-a-warning-in-cpp/

// clang-format off
#if defined(__clang__)
    #define DO_PRAGMA_DIAG(X) _Pragma(#X)                                                     // internal utility
    #define DISABLE_WARNING_PUSH         DO_PRAGMA_DIAG(GCC diagnostic push)
    #define DISABLE_WARNING_POP          DO_PRAGMA_DIAG(GCC diagnostic pop)
    #define DISABLE_WARNING(warningName) DO_PRAGMA_DIAG(GCC diagnostic ignored #warningName)  // internal utility
    // Add warnings that you want to disable here:
    #define DISABLE_WARNING_ANONYMOUS_STRUCT_UNION   \
        DISABLE_WARNING(-Wgnu-anonymous-struct)  // anonymous structs are a GNU extension
    #define DISABLE_WARNING_PADDING_AT_END_OF_STRUCT
    #define DISABLE_WARNING_ANON_TYPES_IN_ANON_UNION \
        DISABLE_WARNING(-Wnested-anon-types)     // anonymous types declared in an anonymous union are an extension
#elif defined(__GNUC__)
    #define DO_PRAGMA_DIAG(X) _Pragma(#X)                                                     // internal utility
    #define DISABLE_WARNING_PUSH         DO_PRAGMA_DIAG(GCC diagnostic push)
    #define DISABLE_WARNING_POP          DO_PRAGMA_DIAG(GCC diagnostic pop)
    // Treats only this header file as a system header, AKA disables all warnings in this header file only.
    // The effect does not extend into any file that includes this header file.
    #pragma GCC system_header
    // define the macros that are used in the code down below to prevent compiler failure
    // NOTE: Apparently GCC compiler doesn't support disabling individual warnings.
    #define DISABLE_WARNING_ANONYMOUS_STRUCT_UNION
    #define DISABLE_WARNING_PADDING_AT_END_OF_STRUCT
    #define DISABLE_WARNING_ANON_TYPES_IN_ANON_UNION
#elif defined(_MSC_VER)
    #define DISABLE_WARNING_PUSH           __pragma(warning( push ))
    #define DISABLE_WARNING_POP            __pragma(warning( pop ))
    #define DISABLE_WARNING(warningNumber) __pragma(warning( disable : warningNumber ))  // internal utility
    // Add warnings that you want to disable here:
    #define DISABLE_WARNING_ANONYMOUS_STRUCT_UNION   \
        DISABLE_WARNING(4201)  // nonstandard extension used: nameless struct/union
    #define DISABLE_WARNING_PADDING_AT_END_OF_STRUCT \
        DISABLE_WARNING(4820)  // 'MemTravStack::<unnamed-tag>': '4' bytes padding added after data member 'MemTravStack::<unnamed-tag>::offset'
    #define DISABLE_WARNING_ANON_TYPES_IN_ANON_UNION
#else
    // define the macros that are used in the code down below to prevent compiler failure
    // NOTE: internal utility macros should not be defined here
    #define DISABLE_WARNING_PUSH
    #define DISABLE_WARNING_POP
    // Add warnings that you want to disable here:
    #define DISABLE_WARNING_ANONYMOUS_STRUCT_UNION
    #define DISABLE_WARNING_PADDING_AT_END_OF_STRUCT
    #define DISABLE_WARNING_ANON_TYPES_IN_ANON_UNION
#endif
// clang-format on

// save the current pragma state, save the current compiler settings select the
// warnings that we want to disable, for this file only
DISABLE_WARNING_PUSH
DISABLE_WARNING_ANONYMOUS_STRUCT_UNION
DISABLE_WARNING_PADDING_AT_END_OF_STRUCT
DISABLE_WARNING_ANON_TYPES_IN_ANON_UNION

namespace IGC {
// A class to be used, by the UMD, as a base for implementation
// of a class for populating the content of RayDispatchGlobalData
// structure.
class RayDispatchGlobalDataAdaptor {
public:
  uint64_t GetRayStackBufferAddress() const;
  uint64_t GetCallStackHandlerPtr() const;
  uint32_t GetStackSizePerRay() const;
  uint32_t GetNumDSSRTStacks() const;
  uint32_t GetMaxBVHLevels() const;
  uint64_t GetHitGroupTable() const;
  uint64_t GetMissShaderTable() const;
  uint64_t GetCallableShaderTable() const;
  uint32_t GetHitGroupStride() const;
  uint32_t GetMissStride() const;
  uint32_t GetCallableStride() const;
  uint32_t GetDispatchWidth() const;
  uint32_t GetDispatchHeight() const;
  uint32_t GetDispatchDepth() const;
  uint32_t GetSWStackSizePerRay() const;
  uint64_t GetBindlessHeapBasePtr() const { return 0; };
  uint64_t GetPrintfBufferBasePtr() const { return 0; };
  uint64_t GetProfilingBufferGpuVa() const { return 0; };
  uint64_t GetStatelessScratchPtr() const { return 0; };
  uint32_t GetBaseSSHOffset() const { return 0; };
  uint32_t GetUberTilesMap() const { return 0; };
};

// Layout used to pass global data to the shaders
struct RayDispatchGlobalData {
  // Stack size is encoded in # of 64 byte chunks.
  // e.g., 0x0 = 64 bytes, 0x1 = 128 bytes, etc.
  static constexpr uint32_t StackChunkSize = 64;
  static constexpr uint32_t LargeStack = StackChunkSize * 64;

  struct RayDispatchGlobalDataCommon {
    // For anything that appears above this line that we want to duplicate below
    // for a more packed layout, we prefix it with a 'p' to denote that it is a
    // 'private' copy of the value.  The strategy here is to order the fields
    // from top to bottom in order from least frequently used to most.
    uint64_t printfBufferBasePtr;  // base pointer of printf buffer
    uint64_t ProfilingBufferGpuVa; // GTPin buffer to collect profiling data
    union {
      uint64_t padding;
    };
    union {
      uint64_t statelessScratchPtr; // Stateless scratch buffer pointer
    };
    uint64_t pCallableShaderBasePtr; // base pointer of callable shader record array (8-bytes alignment)
    uint32_t pCallableShaderStride;  // stride of callable shader records (8-bytes alignment)
    uint32_t pNumDSSRTStacks;        // number of stacks per DSS
    union {
      uint64_t bindlessHeapBasePtr; // base pointer of bindless heap
    };
    uint64_t pHitGroupBasePtr;   // base pointer of hit group shader record array (16-bytes alignment)
    uint64_t pMissShaderBasePtr; // base pointer of miss shader record array (8-bytes alignment)
    uint32_t pHitGroupStride;    // stride of hit group shader records (16-bytes alignment)
    uint32_t pMissShaderStride;  // stride of miss shader records (8-bytes alignment)
    uint64_t pRtMemBasePtr;      // base address of the allocated stack memory

    union {
      struct {
        uint32_t baseSSHOffset; // (index/bindless offset) of first memory region for stateful indirect accesses
        uint32_t paddingBits3;  // 32-bits of padding
      };

    };

    uint64_t uberTilesMap; // base address of the uber tiles map used for AtomicPull model
    uint32_t paddingBits4; // 1 dword of padding to make sure the struct size is multiple of 8 on all platforms

    uint32_t pStackSizePerRay;          // maximal stack size of a ray
    uint32_t swStackSizePerRay;         // size in bytes per ray of the SWStack
    uint32_t dispatchRaysDimensions[3]; // dispatch dimensions of the thread grid

    template <class TAPIAdaptor> void populate(const TAPIAdaptor &umd) {
      printfBufferBasePtr = umd.GetPrintfBufferBasePtr();
      ProfilingBufferGpuVa = umd.GetProfilingBufferGpuVa();
      statelessScratchPtr = umd.GetStatelessScratchPtr();
      pCallableShaderBasePtr = umd.GetCallableShaderTable();
      pCallableShaderStride = umd.GetCallableStride();
      bindlessHeapBasePtr = umd.GetBindlessHeapBasePtr();
      pHitGroupBasePtr = umd.GetHitGroupTable();
      pMissShaderBasePtr = umd.GetMissShaderTable();
      pHitGroupStride = umd.GetHitGroupStride();
      pMissShaderStride = umd.GetMissStride();
      pRtMemBasePtr = umd.GetRayStackBufferAddress();
      baseSSHOffset = umd.GetBaseSSHOffset();
      pStackSizePerRay = umd.GetStackSizePerRay();
      swStackSizePerRay = umd.GetSWStackSizePerRay();
      pNumDSSRTStacks = umd.GetNumDSSRTStacks();
      dispatchRaysDimensions[0] = umd.GetDispatchWidth();
      dispatchRaysDimensions[1] = umd.GetDispatchHeight();
      dispatchRaysDimensions[2] = umd.GetDispatchDepth();
      uberTilesMap = umd.GetUberTilesMap();

    }
  };

  union RT {
    struct Xe {
      template <class TAPIAdaptor> void populate(const TAPIAdaptor &umd) {
        rtMemBasePtr = umd.GetRayStackBufferAddress();
        callStackHandlerPtr = umd.GetCallStackHandlerPtr();
        stack_size_info.stackSizePerRay = umd.GetStackSizePerRay();
        num_stacks_info.numDSSRTStacks = umd.GetNumDSSRTStacks();
        rt_data_info.maxBVHLevels = umd.GetMaxBVHLevels();

        common.populate(umd);
      }

      // Cached by HW (32 bytes)
      uint64_t rtMemBasePtr;        // base address of the allocated stack memory
      uint64_t callStackHandlerPtr; // this is the KSP of the continuation handler that is invoked by BTD when the read
                                    // KSP is 0
      union {
        uint32_t stackSizePerRay; // maximal stack size of a ray
        struct {
          uint32_t sizePerRay : 8;
          uint32_t MBZ1 : 24;
        };
      } stack_size_info;
      union {
        uint32_t numDSSRTStacks; // number of stacks per DSS
        struct {
          uint32_t numRTStacks : 12;
          uint32_t MBZ2 : 20;
        };
      } num_stacks_info;
      union {
        uint32_t maxBVHLevels; // the maximal number of supported instancing levels
        struct {
          uint32_t bvhLevels : 3;
          uint32_t MBZ3 : 29;
        };
      } rt_data_info;
      // In addition to the dword of padding to align `common`, we also
      // add 8 dwords so Xe and Xe3 both have the same RTGlobals size.
      uint32_t paddingBits[1 + 8]; // padding

      // HW doesn't read anything below this point.
      RayDispatchGlobalDataCommon common;
    } xe;
    struct Xe3 {
      template <class TAPIAdaptor> void populate(const TAPIAdaptor &umd) {
        rtMemBasePtr = umd.GetRayStackBufferAddress();
        callStackHandlerPtr = umd.GetCallStackHandlerPtr();
        stack_size_info.stackSizePerRay = umd.GetStackSizePerRay();
        num_stacks_info.numRTStacks = umd.GetNumDSSRTStacks();

       // _pad1_mbz higher 16 bits must be zero.
        num_stacks_info.numRTStacks = (num_stacks_info.numRTStacks & 0x0000FFFF);

        constexpr uint32_t strideMask = (1 << 13) - 1;
        const uint32_t hgs = umd.GetHitGroupStride() & strideMask;
        const uint32_t mss = umd.GetMissStride() & strideMask;
        rt_data_info.packedData = (umd.GetMaxBVHLevels() << 0) | (hgs << 3) | (mss << 16);

        hitGroupBasePtr = umd.GetHitGroupTable();
        missShaderBasePtr = umd.GetMissShaderTable();

        common.populate(umd);
      }

      uint64_t rtMemBasePtr;        // base address of the allocated stack memory
      uint64_t callStackHandlerPtr; // this is the KSP of the continuation handler that is invoked by BTD when the read
                                    // KSP is 0
      union {
        uint32_t stackSizePerRay; // async-RT stack size in 64 byte blocks
        uint32_t _pad0_mbz : 32;
      } stack_size_info;
      union {
        uint32_t numRTStacks; // number of stacks per DSS
        struct {
          uint32_t numDSSRTStacks : 16; // number of asynch stacks per DSS
          uint32_t _pad1_mbz : 16;
        };

      } num_stacks_info;
      union {
        uint32_t packedData;
        struct {
          uint32_t maxBVHLevels : 3;      // the maximal number of supported instancing levels (0->8, 1->1, 2->2, ...)
          uint32_t hitGroupStride : 13;   // stride of hit group shader records (16-bytes alignment)
          uint32_t missShaderStride : 13; // stride of miss shader records (8-bytes alignment)
          uint32_t _pad2_mbz : 3;
        };
      } rt_data_info;
      uint32_t flags : 1; // per context control flags
      uint32_t pad_mbz : 31;
      uint64_t hitGroupBasePtr;   // base pointer of hit group shader record array (16-bytes alignment)
      uint64_t missShaderBasePtr; // base pointer of miss shader record array (8-bytes alignment)
      uint32_t _align_mbz[4];     // pad hardware section to 64 bytes

      // HW doesn't read anything below this point.
      RayDispatchGlobalDataCommon common;
    } xe3;
  } rt;
};

// The actual pointer will probably be aligned to a greater value than this,
// but just specify a value large enough that we can definitely do block reads.
constexpr uint32_t RTGlobalsAlign = 256;

// The 'stackSizePerRay' from 'RayDispatchGlobalData' counts the # of 64B
// chunks of stack so this must be, at minimum, 64-byte aligned.
constexpr uint32_t RTStackAlign = 128;
static_assert(RTStackAlign % RayDispatchGlobalData::StackChunkSize == 0, "no?");

static_assert((sizeof(RayDispatchGlobalData::RT::Xe) - sizeof(RayDispatchGlobalData::RayDispatchGlobalDataCommon)) %
                      64 ==
                  0,
              "Unexpected GlobalData alignment");
static_assert((sizeof(RayDispatchGlobalData::RT::Xe3) - sizeof(RayDispatchGlobalData::RayDispatchGlobalDataCommon)) %
                      64 ==
                  0,
              "Unexpected GlobalData alignment");

static_assert(sizeof(RayDispatchGlobalData) == 192, "unexpected size?");

static_assert(sizeof(RayDispatchGlobalData::RT::Xe) == sizeof(RayDispatchGlobalData), "unexpected size?");
static_assert(sizeof(RayDispatchGlobalData::RT::Xe3) == sizeof(RayDispatchGlobalData), "unexpected size?");
static_assert(offsetof(RayDispatchGlobalData::RT::Xe, common) == offsetof(RayDispatchGlobalData::RT::Xe3, common),
              "unexpected size?");
#ifdef HAS_INCLUDE_TYPE_TRAITS
static_assert(std::is_standard_layout<RayDispatchGlobalData>::value, "no?");
#endif // HAS_INCLUDE_TYPE_TRAITS

// This data is passed in as inline data into the raygeneration shader
// by the UMD.
struct RayDispatchInlinedData {
  uint64_t RayDispatchDescriptorAddress; // ShaderRecord*, r2.0:uq
  uint64_t RayDispatchGlobalDataPtr;     //                r2.1:uq
  uint64_t AtomicPullGlobalQueuePtr;     //                r2.2:uq

  static constexpr unsigned NumElts = 3;
};


template <typename Type> constexpr Type Align(const Type value, const size_t alignment) {
  // alignment must be power of 2.
  Type mask = static_cast<Type>(alignment) - 1;
  return (value + mask) & ~mask;
}

// Maximum size of a block read that we will do from RayDispatchGlobalData.
static constexpr uint32_t MAX_BLOCK_SIZE = 256;

struct RTGlobalsAllocationData {
  // Globals layout should look like this:
  // RayDispatchGlobalData #0 <- main
  // global root signature
  // RayDispatchGlobalData #1 <- auxiliary
  // ...
  // RayDispatchGlobalData #n
  uint32_t totalSize;
  uint32_t rootSigOffset;
  uint32_t auxRTGlobalsStartOffset;
  uint32_t auxRTGlobalsIncrementSize;
};

inline RTGlobalsAllocationData GlobalsAllocationSize(uint32_t GlobalRootSigSize, uint32_t numOfAuxRTGlobals = 0) {
  RTGlobalsAllocationData data = {0, 0, 0, 0};
  constexpr uint32_t RTGlobalsPointerIncrementSize = uint32_t(Align(sizeof(RayDispatchGlobalData), RTGlobalsAlign));
  constexpr uint32_t GlobalRootSigOffset = uint32_t(Align(sizeof(RayDispatchGlobalData), sizeof(uint64_t)));

  uint32_t TotalSize = GlobalRootSigOffset + GlobalRootSigSize + MAX_BLOCK_SIZE;

  data.rootSigOffset = GlobalRootSigOffset;
  data.totalSize = Align(TotalSize, 64);

  if (numOfAuxRTGlobals) {
    data.auxRTGlobalsStartOffset = Align(data.totalSize, RTGlobalsAlign);
    data.auxRTGlobalsIncrementSize = RTGlobalsPointerIncrementSize;
    data.totalSize = Align(data.auxRTGlobalsStartOffset + numOfAuxRTGlobals * data.auxRTGlobalsIncrementSize, 64);
  }

  return data;
}

enum class RT_TILE_LAYOUT { _1D, _2D };

// We only really care about the 'Y' dimension right now. Even though a dispatch
// like (1024, 1, 1024) could be labelled as 2D, we would better off labelling
// it as 1D. Regardless, we mostly expect the dimensions to be populated
// from X -> Y -> Z so we wouldn't do well on, for example, (1, 1024, 1024).
//
// Pass in the Width, Height, Depth from D3D12_DISPATCH_RAYS_DESC.
constexpr RT_TILE_LAYOUT selectRTTileLayout(uint32_t /*Width*/, uint32_t Height, uint32_t /*Depth*/) {
  return (Height == 1) ? RT_TILE_LAYOUT::_1D : RT_TILE_LAYOUT::_2D;
}

} // namespace IGC

DISABLE_WARNING_POP // restore the previously saved pragma state, restore former compiler settings