1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296
|
#if defined(TEST_OPENCL)
// Implement OpenCL custom context.
#define CL_TARGET_OPENCL_VERSION 120
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
// Create the global context. This is just a helper function not called by Halide.
//
// Picks the first OpenCL platform and the last device (of any type) reported
// for it among the first `maxDevices` entries, then creates a context and a
// command queue on that device.
//
// cl_ctx: receives the new context (nullptr if creation failed).
// cl_q:   receives the new command queue.
// Returns true on success. On failure an error is printed, false is returned,
// and no OpenCL object created by this function is left alive.
inline bool create_opencl_context(cl_context &cl_ctx, cl_command_queue &cl_q) {
    cl_int err = 0;

    const cl_uint maxPlatforms = 4;
    cl_platform_id platforms[maxPlatforms];
    cl_uint platformCount = 0;

    err = clGetPlatformIDs(maxPlatforms, platforms, &platformCount);
    if (err != CL_SUCCESS) {
        printf("clGetPlatformIDs failed (%d)\n", err);
        return false;
    }

    cl_platform_id platform = nullptr;
    if (platformCount > 0) {
        platform = platforms[0];
    }
    if (platform == nullptr) {
        printf("Failed to get platform\n");
        return false;
    }

    cl_device_type device_type = CL_DEVICE_TYPE_ALL;

    // Make sure we have a device
    const cl_uint maxDevices = 4;
    cl_device_id devices[maxDevices];
    cl_uint deviceCount = 0;
    err = clGetDeviceIDs(platform, device_type, maxDevices, devices, &deviceCount);
    if (err != CL_SUCCESS) {
        printf("clGetDeviceIDs failed (%d)\n", err);
        return false;
    }
    // clGetDeviceIDs reports the number of devices *available*, which can be
    // larger than the number of entries it was allowed to write. Clamp so the
    // index below never runs past the end of `devices`.
    if (deviceCount > maxDevices) {
        deviceCount = maxDevices;
    }
    if (deviceCount == 0) {
        printf("Failed to get device\n");
        return false;
    }

    cl_device_id dev = devices[deviceCount - 1];

    // Create context and command queue.
    cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
                                          0};
    cl_ctx = clCreateContext(properties, 1, &dev, nullptr, nullptr, &err);
    if (err != CL_SUCCESS) {
        printf("clCreateContext failed (%d)\n", err);
        return false;
    }

    cl_q = clCreateCommandQueue(cl_ctx, dev, 0, &err);
    if (err != CL_SUCCESS) {
        printf("clCreateCommandQueue failed (%d)\n", err);
        // Don't leak the context that was created just above.
        clReleaseContext(cl_ctx);
        cl_ctx = nullptr;
        return false;
    }

    return true;
}
// Release the objects produced by create_opencl_context(): the command queue
// first, then the context it was created on.
inline void destroy_opencl_context(cl_context cl_ctx, cl_command_queue cl_q) {
    clReleaseCommandQueue(cl_q);
    clReleaseContext(cl_ctx);
}
#elif defined(TEST_CUDA)
// Implement CUDA custom context.
#include <cuda.h>
// Create a CUDA driver-API context.
//
// Initializes the driver, then tries device ordinals from high to low
// (restricted to ordinals 0 and 1) and creates a context on the first one
// that cuDeviceGet accepts.
//
// cuda_ctx: receives the new context on success.
// Returns true on success; on any failure prints a message and returns false.
inline bool create_cuda_context(CUcontext &cuda_ctx) {
    // Initialize CUDA
    CUresult err = cuInit(0);
    if (err != CUDA_SUCCESS) {
        printf("cuInit failed (%d)\n", err);
        return false;
    }

    // Make sure we have a device
    int deviceCount = 0;
    err = cuDeviceGetCount(&deviceCount);
    if (err != CUDA_SUCCESS) {
        printf("cuDeviceGetCount failed (%d)\n", err);
        return false;
    }
    if (deviceCount <= 0) {
        printf("No CUDA devices available\n");
        return false;
    }

    // Get device
    // Try to get a device >0 first, since 0 should be our display device
    // For now, don't try devices > 2 to maintain compatibility with previous behavior.
    if (deviceCount > 2) {
        deviceCount = 2;
    }
    CUdevice dev;
    bool found = false;
    for (int ordinal = deviceCount - 1; ordinal >= 0; ordinal--) {
        if (cuDeviceGet(&dev, ordinal) == CUDA_SUCCESS) {
            found = true;
            break;
        }
    }
    if (!found) {
        printf("Failed to get CUDA device\n");
        // Must be `false`: returning the non-zero CUresult here would convert
        // to `true` and report a failure as success.
        return false;
    }

    // Create context
    err = cuCtxCreate(&cuda_ctx, 0, dev);
    if (err != CUDA_SUCCESS) {
        printf("cuCtxCreate failed (%d)\n", err);
        return false;
    }

    return true;
}
// Destroy a context previously obtained from create_cuda_context().
inline void destroy_cuda_context(CUcontext cuda_ctx) {
    cuCtxDestroy(cuda_ctx);
}
#elif defined(TEST_METAL) && defined(__OBJC__)
#include <Metal/MTLCommandQueue.h>
#include <Metal/MTLDevice.h>
// Create a Metal device and a command queue on it.
//
// Uses the system default device when there is one, otherwise falls back to
// the first device returned by MTLCopyAllDevices().
//
// device: receives an owned (+1) device reference on success, nil on failure.
// queue:  receives an owned (+1) command queue on success.
// Returns true on success; on failure prints a message and returns false.
//
// This file manages reference counts manually (see destroy_metal_context),
// so ownership is balanced explicitly here.
inline bool create_metal_context(id<MTLDevice> &device, id<MTLCommandQueue> &queue) {
    device = MTLCreateSystemDefaultDevice();
    if (device == nullptr) {
        NSArray<id<MTLDevice>> *devices = MTLCopyAllDevices();
        // -firstObject yields nil for an empty array, whereas devices[0]
        // would raise NSRangeException. Take our own reference to the device
        // before dropping the array that MTLCopyAllDevices() handed us.
        device = [devices.firstObject retain];
        [devices release];
    }
    if (device == nullptr) {
        printf("Failed to find Metal device.\n");
        return false;
    }

    queue = [device newCommandQueue];
    if (queue == nullptr) {
        printf("Failed to create Metal command queue.\n");
        // Don't leak the device reference acquired above.
        [device release];
        device = nullptr;
        return false;
    }

    return true;
}
// Give up the references handed out by create_metal_context(): the command
// queue first, then the device.
inline void destroy_metal_context(id<MTLDevice> device, id<MTLCommandQueue> queue) {
    [queue release];
    [device release];
}
#elif defined(TEST_WEBGPU)
#if defined(__EMSCRIPTEN__)
#include <webgpu/webgpu_cpp.h>
#else
#include "mini_webgpu.h"
#endif
// Forward declarations of the sleep primitive used by create_webgpu_context()
// to wait between polls for device initialization. Exactly one of the two is
// declared, selected by WITH_DAWN_NATIVE.
extern "C" {
// TODO: Remove all of this when wgpuInstanceProcessEvents() is supported.
// See https://github.com/halide/Halide/issues/7248
#ifdef WITH_DAWN_NATIVE
// From <unistd.h>, used to sleep between polls while waiting for device
// initialization.
// NOTE(review): <unistd.h> declares the parameter as useconds_t; this assumes
// it is compatible with uint32_t on the target platform -- confirm.
int usleep(uint32_t);
#else
// Defined by Emscripten, and used to yield execution to asynchronous Javascript
// work in combination with Emscripten's "Asyncify" mechanism.
void emscripten_sleep(unsigned int ms);
#endif
}
// Create a WebGPU instance, adapter, device and a staging buffer.
//
// The adapter and device are requested through callbacks; this function then
// polls (sleeping between checks) until either a device has been stored or one
// of the callbacks has reported failure.
//
// instance_out, adapter_out, device_out, staging_buffer_out: receive whatever
//   was obtained. They are written unconditionally, so on failure some of them
//   may be nullptr while others hold live objects.
// Returns true if every step succeeded, false otherwise.
//
// NOTE(review): on failure, objects that were already created are not released
// here -- confirm whether callers are expected to clean them up.
inline bool create_webgpu_context(WGPUInstance *instance_out, WGPUAdapter *adapter_out, WGPUDevice *device_out, WGPUBuffer *staging_buffer_out) {
// State shared with the callbacks through their `userdata` pointer.
struct Results {
WGPUInstance instance = nullptr;
WGPUAdapter adapter = nullptr;
WGPUDevice device = nullptr;
WGPUBuffer staging_buffer = nullptr;
bool success = true;
} results;
// NOTE(review): the returned instance is not checked for nullptr before use.
results.instance = wgpuCreateInstance(nullptr);
// Invoked with the outcome of the adapter request; on success it records the
// adapter and goes on to request a device from it.
auto request_adapter_callback = [](WGPURequestAdapterStatus status, WGPUAdapter adapter, char const *message, void *userdata) {
auto *results = (Results *)userdata;
if (status != WGPURequestAdapterStatus_Success) {
results->success = false;
return;
}
results->adapter = adapter;
// Use the defaults for most limits.
WGPURequiredLimits requestedLimits{};
requestedLimits.nextInChain = nullptr;
// Every byte of the limits struct is set to 0xFF -- presumably the
// "undefined, use the default" sentinel for each field; confirm against the
// WebGPU header in use.
memset(&requestedLimits.limits, 0xFF, sizeof(WGPULimits));
// TODO: Enable for Emscripten when wgpuAdapterGetLimits is supported.
// See https://github.com/halide/Halide/issues/7248
#ifdef WITH_DAWN_NATIVE
WGPUSupportedLimits supportedLimits{};
supportedLimits.nextInChain = nullptr;
if (!wgpuAdapterGetLimits(adapter, &supportedLimits)) {
results->success = false;
return;
} else {
// Raise the limits on buffer size and workgroup storage size.
requestedLimits.limits.maxBufferSize = supportedLimits.limits.maxBufferSize;
requestedLimits.limits.maxStorageBufferBindingSize = supportedLimits.limits.maxStorageBufferBindingSize;
requestedLimits.limits.maxComputeWorkgroupStorageSize = supportedLimits.limits.maxComputeWorkgroupStorageSize;
}
#endif
// Aborts the process on any device loss other than an explicit destroy.
auto device_lost_callback = [](WGPUDeviceLostReason reason,
char const *message,
void *userdata) {
// Apparently this should not be treated as a fatal error
if (reason == WGPUDeviceLostReason_Destroyed) {
return;
}
// NOTE(review): `message` is passed to %s without a null check, and the
// output has no trailing newline -- confirm both are intended.
fprintf(stderr, "WGPU Device Lost: %d %s", (int)reason, message);
abort();
};
WGPUDeviceDescriptor desc{};
desc.nextInChain = nullptr;
desc.label = nullptr;
desc.requiredFeatureCount = 0;
desc.requiredFeatures = nullptr;
desc.requiredLimits = &requestedLimits;
desc.deviceLostCallback = device_lost_callback;
// Invoked with the outcome of the device request; on success it records the
// device and then creates the staging buffer.
auto request_device_callback = [](WGPURequestDeviceStatus status,
WGPUDevice device,
char const *message,
void *userdata) {
auto *results = (Results *)userdata;
if (status != WGPURequestDeviceStatus_Success) {
results->success = false;
return;
}
// The device is published before the staging buffer exists.
// NOTE(review): the wait loop in the enclosing function exits as soon as
// `device` is non-null; if this callback can run concurrently with that loop,
// the buffer may not be set yet when the loop exits -- confirm the callback
// threading model.
results->device = device;
// Create a staging buffer for transfers.
constexpr int kStagingBufferSize = 4 * 1024 * 1024;
WGPUBufferDescriptor desc{};
desc.nextInChain = nullptr;
desc.label = nullptr;
desc.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead;
desc.size = kStagingBufferSize;
desc.mappedAtCreation = false;
results->staging_buffer = wgpuDeviceCreateBuffer(device, &desc);
if (results->staging_buffer == nullptr) {
results->success = false;
return;
}
};
// Forward the same Results pointer so the device callback can report back.
wgpuAdapterRequestDevice(adapter, &desc, request_device_callback, userdata);
};
wgpuInstanceRequestAdapter(results.instance, nullptr, request_adapter_callback, &results);
// Wait for device initialization to complete.
// The loop ends once a device has been stored or a callback cleared `success`.
while (!results.device && results.success) {
// TODO: Use wgpuInstanceProcessEvents() when it is supported.
// See https://github.com/halide/Halide/issues/7248
#ifndef WITH_DAWN_NATIVE
emscripten_sleep(10);
#else
usleep(1000);
#endif
}
// Hand back everything gathered so far, whether or not setup succeeded.
*instance_out = results.instance;
*adapter_out = results.adapter;
*device_out = results.device;
*staging_buffer_out = results.staging_buffer;
return results.success;
}
// Release the objects produced by create_webgpu_context(), in reverse order of
// creation: staging buffer, device, adapter, and finally the instance.
inline void destroy_webgpu_context(WGPUInstance instance,
                                   WGPUAdapter adapter,
                                   WGPUDevice device,
                                   WGPUBuffer staging_buffer) {
    wgpuBufferRelease(staging_buffer);
    wgpuDeviceRelease(device);
    wgpuAdapterRelease(adapter);
    wgpuInstanceRelease(instance);
}
#endif
|