File: device_enqueue.h

package info (click to toggle)
intel-compute-runtime 20.44.18297-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 34,780 kB
  • sloc: cpp: 379,729; lisp: 4,931; python: 299; sh: 196; makefile: 8
file content (134 lines) | stat: -rw-r--r-- 4,643 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
/*
 * Copyright (C) 2017-2020 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once

// Uncomment this macro to build "empty" schedulers
//#define WA_DISABLE_SCHEDULERS 1

#if !defined(__OPENCL_VERSION__)
#include <cstdint>

typedef uint32_t uint;
typedef uint64_t ulong;
#endif

#define OCLRT_SIZEOF_MEDIA_INTERFACE_DESCRIPTOR_LOAD_DEVICE_CMD (4 * sizeof(uint))
#define OCLRT_SIZEOF_MEDIA_CURBE_LOAD_DEVICE_CMD (4 * sizeof(uint))
#define OCLRT_SIZEOF_MEDIA_STATE_FLUSH (2 * sizeof(uint))
#define OCLRT_SIZEOF_MI_ATOMIC_CMD (11 * sizeof(uint))
#define OCLRT_SIZEOF_MEDIA_VFE_STATE_CMD (9 * sizeof(uint))
#define OCLRT_SIZEOF_MI_ARB_CHECK (1 * sizeof(uint))

#define OCLRT_SIZEOF_MEDIA_INTERFACE_DESCRIPTOR_LOAD_DEVICE_CMD_DWORD_OFFSET (4)
#define OCLRT_SIZEOF_MI_ATOMIC_CMD_DWORD_OFFSET (11)
#define OCLRT_SIZEOF_MEDIA_CURBE_LOAD_DEVICE_CMD_DWORD_OFFSET (4)
#define OCLRT_IMM_LOAD_REGISTER_CMD_DEVICE_CMD_DWORD_OFFSET (3)

#define OCLRT_SIZEOF_MSFLUSH_DWORD (2)
#define OCLRT_SIZEOF_MI_ARB_CHECK_DWORD (1)
#define OCLRT_SIZEOF_MEDIA_VFE_STATE_DWORD (9)

#define OCLRT_BATCH_BUFFER_END_CMD (83886080)

//Constant buffer stuff
#define COMPILER_DATA_PARAMETER_GLOBAL_SURFACE (49)

#define SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT (50)

#define SCHEDULER_DATA_PARAMETER_GLOBAL_POINTER_SHIFT (63)
#define SCHEDULER_DATA_PARAMETER_SAMPLER_SHIFT (51)
#define SCHEDULER_DATA_PARAMETER_SAMPLER_ADDED_VALUE (2 * SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT)

#define CS_PREFETCH_SIZE (8 * 64)

#define ALL_BITS_SET_DWORD_MASK (0xffffffff)
#define DWORD_SIZE_IN_BITS (32)

#define CL_sRGB 0x10BF
#define CL_sRGBX 0x10C0
#define CL_sRGBA 0x10C1
#define CL_sBGRA 0x10C2

//scheduler currently can spawn up to 8 GPGPU_WALKERS between scheduler runs, so it needs 8 * 3 HW threads for scheduling blocks + 1 HW thread to scheduler next scheduler
//each HW group consist of 3 HW threads that are capable of scheduling 1 block

//!!! Make sure value of this define equals MAX_NUMBER_OF_PARALLEL_GPGPU_WALKERS in DeviceEnqueueInternalTypes.h
#define PARALLEL_SCHEDULER_HW_GROUPS (8)
#define PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP (3)
#define PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20 (3)
#define PARALLEL_SCHEDULER_HW_GROUPS_IN_THREADS (PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP * PARALLEL_SCHEDULER_HW_GROUPS)

#define PARALLEL_SCHEDULER_NUMBER_HW_THREADS (PARALLEL_SCHEDULER_HW_GROUPS_IN_THREADS + 1)

//parallel scheduler 2.0 is compiled in simd8
#define PARALLEL_SCHEDULER_COMPILATION_SIZE_20 (8)

#define HW_GROUP_ID_SHIFT(COMPILATION_SIZE) ((COMPILATION_SIZE & 0x10) ? 4 : 3)

#define GRF_SIZE (32)
#define SIZEOF_3GRFS (3 * GRF_SIZE)

//estimation for dynamic payload size
#define SCHEDULER_DYNAMIC_PAYLOAD_SIZE (PARALLEL_SCHEDULER_NUMBER_HW_THREADS * SIZEOF_3GRFS)

//assume that max DSH per walker is 9472B ( assuming registers can take up to 4KB, and max dynamic payload is around 96B * 56(HW threads) it should be fine.
#define MAX_DSH_SIZE_PER_ENQUEUE 9472

#define MAX_BINDING_TABLE_INDEX (253)
#define MAX_SSH_PER_KERNEL_SIZE (MAX_BINDING_TABLE_INDEX * 64) //max SSH that can be one kernel. It is 253 binding table entries multiplied by the Surface State size.

#define OCLRT_ARG_OFFSET_TO_SAMPLER_OBJECT_ID(ArgOffset) (ArgOffset + MAX_SSH_PER_KERNEL_SIZE)
#define OCLRT_IMAGE_MAX_OBJECT_ID (MAX_SSH_PER_KERNEL_SIZE - 1)
#define OCLRT_SAMPLER_MIN_OBJECT_ID (MAX_SSH_PER_KERNEL_SIZE)

typedef enum tagDebugDataTypes {
    DBG_DEFAULT = 0,
    DBG_COMMAND_QUEUE = 1,
    DBG_EVENTS_UPDATE = 2,
    DBG_EVENTS_NUMBER = 3,
    DBG_STACK_UPDATE = 4,
    DBG_BEFORE_PATCH = 5,
    DBG_KERNELID = 6,
    DBG_DSHOFFSET = 7,
    DBG_IDOFFSET = 8,
    DBG_AFTER_PATCH = 9,
    DBG_UNSPECIFIED = 10,
    DBG_ENQUEUES_NUMBER = 11,
    DBG_LOCAL_ID,
    DBG_WKG_ID,
    DBG_SCHEDULER_END,
    // Add here new debug enums
    DBG_MAX
} DebugDataTypes;
// Struct for debugging kernels
typedef struct
{
    DebugDataTypes m_dataType;
    uint m_dataSize;
} DebugDataInfo;
typedef struct
{
    enum DDBFlags { DDB_HAS_DATA_INFO = 1,
                    DDB_SCHEDULER_PROFILING = 2,
                    DDB_COMMAND_QUEUE_RAW = 4 } ddbFlags;
    uint m_size;
    uint m_stackTop;    //index of data stack
    uint m_dataInfoTop; //index of the top of DataInfo stack, this stacks grows with decrementing address
    uint m_stackBottom;
    uint m_dataInfoBottom; //index of the bottom of DataInfo
    uint m_dataInfoSize;
    uint m_flags;

    uint m_offset;    //current offset indicates free place
    uint m_data[100]; //buffer
} DebugDataBuffer;

#pragma pack(push)
#pragma pack(4)
#include "DeviceEnqueueInternalTypes.h"
#pragma pack(pop)