File: gpu_context.h

// Standard library headers used directly by the helpers below
// (printf/fprintf, abort, memset, fixed-width integer types).
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#if defined(TEST_OPENCL)
// Implement OpenCL custom context.

#define CL_TARGET_OPENCL_VERSION 120
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

// Create the global context. This is just a helper function not called by Halide.
inline bool create_opencl_context(cl_context &cl_ctx, cl_command_queue &cl_q) {
    cl_int err = 0;

    const cl_uint maxPlatforms = 4;
    cl_platform_id platforms[maxPlatforms];
    cl_uint platformCount = 0;

    err = clGetPlatformIDs(maxPlatforms, platforms, &platformCount);
    if (err != CL_SUCCESS) {
        printf("clGetPlatformIDs failed (%d)\n", err);
        return false;
    }

    cl_platform_id platform = nullptr;

    if (platformCount > 0) {
        platform = platforms[0];
    }
    if (platform == nullptr) {
        printf("Failed to get platform\n");
        return false;
    }

    cl_device_type device_type = CL_DEVICE_TYPE_ALL;

    // Make sure we have a device
    const cl_uint maxDevices = 4;
    cl_device_id devices[maxDevices];
    cl_uint deviceCount = 0;
    err = clGetDeviceIDs(platform, device_type, maxDevices, devices, &deviceCount);
    if (err != CL_SUCCESS) {
        printf("clGetDeviceIDs failed (%d)\n", err);
        return false;
    }
    if (deviceCount == 0) {
        printf("Failed to get device\n");
        return false;
    }

    cl_device_id dev = devices[deviceCount - 1];

    // Create context and command queue.
    cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
                                          0};
    cl_ctx = clCreateContext(properties, 1, &dev, nullptr, nullptr, &err);
    if (err != CL_SUCCESS) {
        printf("clCreateContext failed (%d)\n", err);
        return false;
    }

    cl_q = clCreateCommandQueue(cl_ctx, dev, 0, &err);
    if (err != CL_SUCCESS) {
        printf("clCreateCommandQueue failed (%d)\n", err);
        return false;
    }
    return true;
}

inline void destroy_opencl_context(cl_context cl_ctx, cl_command_queue cl_q) {
    clReleaseCommandQueue(cl_q);
    clReleaseContext(cl_ctx);
}
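
// Illustrative usage sketch, not called by Halide or the tests; the function
// name here is hypothetical. Create the shared context once, use it for the
// lifetime of the program, then tear it down.
inline bool example_opencl_context_roundtrip() {
    cl_context ctx = nullptr;
    cl_command_queue q = nullptr;
    if (!create_opencl_context(ctx, q)) {
        return false;
    }
    // ... hand ctx/q to whatever needs a shared OpenCL context ...
    destroy_opencl_context(ctx, q);
    return true;
}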

#elif defined(TEST_CUDA)
// Implement CUDA custom context.
#include <cuda.h>

inline bool create_cuda_context(CUcontext &cuda_ctx) {
    // Initialize CUDA
    CUresult err = cuInit(0);
    if (err != CUDA_SUCCESS) {
        printf("cuInit failed (%d)\n", err);
        return false;
    }

    // Make sure we have a device
    int deviceCount = 0;
    err = cuDeviceGetCount(&deviceCount);
    if (err != CUDA_SUCCESS) {
        printf("cuGetDeviceCount failed (%d)\n", err);
        return false;
    }
    if (deviceCount <= 0) {
        printf("No CUDA devices available\n");
        return false;
    }

    CUdevice dev;
    // Get device
    CUresult status = CUDA_ERROR_NO_DEVICE;
    // Try to get a device >0 first, since 0 should be our display device
    // For now, don't try devices > 2 to maintain compatibility with previous behavior.
    if (deviceCount > 2) deviceCount = 2;
    for (int id = deviceCount - 1; id >= 0; id--) {
        status = cuDeviceGet(&dev, id);
        if (status == CUDA_SUCCESS) break;
    }

    if (status != CUDA_SUCCESS) {
        printf("Failed to get CUDA device\n");
        // Return false rather than the raw CUresult: a nonzero error code
        // would convert to true and falsely signal success.
        return false;
    }

    // Create context
    err = cuCtxCreate(&cuda_ctx, 0, dev);
    if (err != CUDA_SUCCESS) {
        printf("cuCtxCreate failed (%d)\n", err);
        return false;
    }

    return true;
}

inline void destroy_cuda_context(CUcontext cuda_ctx) {
    cuCtxDestroy(cuda_ctx);
}
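
// Illustrative usage sketch, not called by Halide or the tests; the function
// name here is hypothetical.
inline bool example_cuda_context_roundtrip() {
    CUcontext ctx = nullptr;
    if (!create_cuda_context(ctx)) {
        return false;
    }
    // ... hand ctx to whatever needs a shared CUDA context ...
    destroy_cuda_context(ctx);
    return true;
}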

#elif defined(TEST_METAL) && defined(__OBJC__)
#include <Metal/MTLCommandQueue.h>
#include <Metal/MTLDevice.h>

inline bool create_metal_context(id<MTLDevice> &device, id<MTLCommandQueue> &queue) {
    device = MTLCreateSystemDefaultDevice();
    if (device == nullptr) {
        NSArray<id<MTLDevice>> *devices = MTLCopyAllDevices();
        if (devices != nullptr) {
            device = devices[0];
        }
    }
    if (device == nullptr) {
        printf("Failed to find Metal device.\n");
        return false;
    }
    queue = [device newCommandQueue];
    if (queue == nullptr) {
        printf("Failed to create Metal command queue.\n");
        return false;
    }
    return true;
}

inline void destroy_metal_context(id<MTLDevice> device, id<MTLCommandQueue> queue) {
    [queue release];
    [device release];
}
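
// Illustrative usage sketch, not called by Halide or the tests; the function
// name here is hypothetical. Like the helpers above, it is only compiled for
// Objective-C++ translation units.
inline bool example_metal_context_roundtrip() {
    id<MTLDevice> device = nullptr;
    id<MTLCommandQueue> queue = nullptr;
    if (!create_metal_context(device, queue)) {
        return false;
    }
    // ... encode and commit command buffers against queue ...
    destroy_metal_context(device, queue);
    return true;
}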

#elif defined(TEST_WEBGPU)

#if defined(__EMSCRIPTEN__)
#include <webgpu/webgpu_cpp.h>
#else
#include "mini_webgpu.h"
#endif

extern "C" {
// TODO: Remove all of this when wgpuInstanceProcessEvents() is supported.
// See https://github.com/halide/Halide/issues/7248
#ifdef WITH_DAWN_NATIVE
// From <unistd.h>, used to spin-lock while waiting for device initialization.
int usleep(uint32_t);
#else
// Defined by Emscripten, and used to yield execution to asynchronous Javascript
// work in combination with Emscripten's "Asyncify" mechanism.
void emscripten_sleep(unsigned int ms);
#endif
}

inline bool create_webgpu_context(WGPUInstance *instance_out, WGPUAdapter *adapter_out, WGPUDevice *device_out, WGPUBuffer *staging_buffer_out) {
    struct Results {
        WGPUInstance instance = nullptr;
        WGPUAdapter adapter = nullptr;
        WGPUDevice device = nullptr;
        WGPUBuffer staging_buffer = nullptr;
        bool success = true;
    } results;

    results.instance = wgpuCreateInstance(nullptr);

    auto request_adapter_callback = [](WGPURequestAdapterStatus status, WGPUAdapter adapter, char const *message, void *userdata) {
        auto *results = (Results *)userdata;

        if (status != WGPURequestAdapterStatus_Success) {
            results->success = false;
            return;
        }
        results->adapter = adapter;

        // Use the defaults for most limits: filling every field with 0xFF bytes
        // sets each limit to WGPU_LIMIT_*_UNDEFINED, which tells the
        // implementation to pick its default value.
        WGPURequiredLimits requestedLimits{};
        requestedLimits.nextInChain = nullptr;
        memset(&requestedLimits.limits, 0xFF, sizeof(WGPULimits));

        // TODO: Enable for Emscripten when wgpuAdapterGetLimits is supported.
        // See https://github.com/halide/Halide/issues/7248
#ifdef WITH_DAWN_NATIVE
        WGPUSupportedLimits supportedLimits{};
        supportedLimits.nextInChain = nullptr;
        if (!wgpuAdapterGetLimits(adapter, &supportedLimits)) {
            results->success = false;
            return;
        } else {
            // Raise the limits on buffer size and workgroup storage size.
            requestedLimits.limits.maxBufferSize = supportedLimits.limits.maxBufferSize;
            requestedLimits.limits.maxStorageBufferBindingSize = supportedLimits.limits.maxStorageBufferBindingSize;
            requestedLimits.limits.maxComputeWorkgroupStorageSize = supportedLimits.limits.maxComputeWorkgroupStorageSize;
        }
#endif

        auto device_lost_callback = [](WGPUDeviceLostReason reason,
                                       char const *message,
                                       void *userdata) {
            // Apparently this should not be treated as a fatal error
            if (reason == WGPUDeviceLostReason_Destroyed) {
                return;
            }
            fprintf(stderr, "WGPU Device Lost: %d %s\n", (int)reason, message);
            abort();
        };

        WGPUDeviceDescriptor desc{};
        desc.nextInChain = nullptr;
        desc.label = nullptr;
        desc.requiredFeatureCount = 0;
        desc.requiredFeatures = nullptr;
        desc.requiredLimits = &requestedLimits;
        desc.deviceLostCallback = device_lost_callback;

        auto request_device_callback = [](WGPURequestDeviceStatus status,
                                          WGPUDevice device,
                                          char const *message,
                                          void *userdata) {
            auto *results = (Results *)userdata;
            if (status != WGPURequestDeviceStatus_Success) {
                results->success = false;
                return;
            }
            results->device = device;

            // Create a staging buffer for transfers.
            constexpr int kStagingBufferSize = 4 * 1024 * 1024;
            WGPUBufferDescriptor desc{};
            desc.nextInChain = nullptr;
            desc.label = nullptr;
            desc.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead;
            desc.size = kStagingBufferSize;
            desc.mappedAtCreation = false;
            results->staging_buffer = wgpuDeviceCreateBuffer(device, &desc);
            if (results->staging_buffer == nullptr) {
                results->success = false;
                return;
            }
        };

        wgpuAdapterRequestDevice(adapter, &desc, request_device_callback, userdata);
    };

    wgpuInstanceRequestAdapter(results.instance, nullptr, request_adapter_callback, &results);

    // Wait for device initialization to complete.
    while (!results.device && results.success) {
        // TODO: Use wgpuInstanceProcessEvents() when it is supported.
        // See https://github.com/halide/Halide/issues/7248
#ifndef WITH_DAWN_NATIVE
        emscripten_sleep(10);
#else
        usleep(1000);
#endif
    }

    *instance_out = results.instance;
    *adapter_out = results.adapter;
    *device_out = results.device;
    *staging_buffer_out = results.staging_buffer;
    return results.success;
}

inline void destroy_webgpu_context(WGPUInstance instance, WGPUAdapter adapter, WGPUDevice device, WGPUBuffer staging_buffer) {
    wgpuBufferRelease(staging_buffer);
    wgpuDeviceRelease(device);
    wgpuAdapterRelease(adapter);
    wgpuInstanceRelease(instance);
}
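
// Illustrative usage sketch, not called by Halide or the tests; the function
// name here is hypothetical.
inline bool example_webgpu_context_roundtrip() {
    WGPUInstance instance = nullptr;
    WGPUAdapter adapter = nullptr;
    WGPUDevice device = nullptr;
    WGPUBuffer staging_buffer = nullptr;
    if (!create_webgpu_context(&instance, &adapter, &device, &staging_buffer)) {
        return false;
    }
    // ... submit GPU work against device; read results back via staging_buffer ...
    destroy_webgpu_context(instance, adapter, device, staging_buffer);
    return true;
}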

#endif