/*******************************************************************************
* Copyright 2019-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
/// @example gpu_opencl_interop.cpp
/// @copybrief gpu_opencl_interop_cpp
/// > Annotated version: @ref gpu_opencl_interop_cpp
/// @page gpu_opencl_interop_cpp Getting started on GPU with OpenCL extensions API
/// This C++ API example demonstrates programming for Intel(R) Processor
/// Graphics with OpenCL* extensions API in oneDNN.
///
/// > Example code: @ref gpu_opencl_interop.cpp
///
/// The workflow includes following steps:
/// - Create a GPU engine. It uses OpenCL as the runtime in this sample.
/// - Create a GPU memory descriptor/object.
/// - Create an OpenCL kernel for GPU data initialization
/// - Access a GPU memory via OpenCL interoperability interface
/// - Access a GPU command queue via OpenCL interoperability interface
/// - Execute an OpenCL kernel with the related GPU command queue and GPU memory
/// - Create an operation descriptor/operation primitive descriptor/primitive.
/// - Execute the primitive with the initialized GPU memory
/// - Validate the result by mapping the OpenCL memory via OpenCL interoperability
/// interface
///
/// @page gpu_opencl_interop_cpp
/// @section gpu_opencl_interop_cpp_headers Public headers
///
/// To start using oneDNN, we must first include the @ref dnnl.hpp
/// header file in the application. We also include CL/cl.h for using
/// OpenCL APIs and @ref dnnl_debug.h, which contains some debugging
/// facilities such as returning a string representation
/// for common oneDNN C types.
/// All C++ API types and functions reside in the `dnnl` namespace.
/// For simplicity of the example we import this namespace.
/// @page gpu_opencl_interop_cpp
/// @snippet gpu_opencl_interop.cpp Prologue
// [Prologue]
#include <iostream>
#include <numeric>
#include <stdexcept>
#include <string>

#include <CL/cl.h>

#include "oneapi/dnnl/dnnl.hpp"
#include "oneapi/dnnl/dnnl_ocl.hpp"

#include "example_utils.hpp"
using namespace dnnl;
using namespace std;
// [Prologue]
// Evaluates an OpenCL call (or status expression) and, if the status is not
// CL_SUCCESS, prints the failing expression together with the source file,
// line, and numeric status code, then terminates the example via exit(1).
#define OCL_CHECK(x) \
    do { \
        cl_int s = (x); \
        if (s != CL_SUCCESS) { \
            std::cout << "[" << __FILE__ << ":" << __LINE__ << "] '" << #x \
                      << "' failed (status code: " << s << ")." << std::endl; \
            exit(1); \
        } \
    } while (0)
/// Builds an OpenCL program from the source string @p ocl_code in the
/// context @p ocl_ctx and returns the kernel named @p kernel_name.
///
/// On a kernel compilation failure the OpenCL build log is printed before
/// the example terminates; without the log a build error in the embedded
/// kernel source is essentially impossible to diagnose.
///
/// The temporary cl_program is released before returning; the caller owns
/// the returned cl_kernel and must release it with clReleaseKernel().
cl_kernel create_init_opencl_kernel(
        cl_context ocl_ctx, const char *kernel_name, const char *ocl_code) {
    cl_int err;
    const char *sources[] = {ocl_code};
    cl_program ocl_program
            = clCreateProgramWithSource(ocl_ctx, 1, sources, nullptr, &err);
    OCL_CHECK(err);

    err = clBuildProgram(ocl_program, 0, nullptr, nullptr, nullptr, nullptr);
    if (err != CL_SUCCESS) {
        // Dump the build log for the first device the program was created
        // for, so the user can see the actual compiler diagnostics.
        cl_device_id dev;
        if (clGetProgramInfo(ocl_program, CL_PROGRAM_DEVICES, sizeof(dev),
                    &dev, nullptr)
                == CL_SUCCESS) {
            size_t log_size = 0;
            clGetProgramBuildInfo(ocl_program, dev, CL_PROGRAM_BUILD_LOG, 0,
                    nullptr, &log_size);
            std::string log(log_size, '\0');
            clGetProgramBuildInfo(ocl_program, dev, CL_PROGRAM_BUILD_LOG,
                    log_size, &log[0], nullptr);
            std::cout << "OpenCL build log:\n" << log << std::endl;
        }
    }
    OCL_CHECK(err);

    cl_kernel ocl_kernel = clCreateKernel(ocl_program, kernel_name, &err);
    OCL_CHECK(err);
    OCL_CHECK(clReleaseProgram(ocl_program));
    return ocl_kernel;
}
/// @page gpu_opencl_interop_cpp
/// @section gpu_opencl_interop_cpp_tutorial gpu_opencl_interop_tutorial() function
///
void gpu_opencl_interop_tutorial() {
    /// @page gpu_opencl_interop_cpp
    /// @subsection gpu_opencl_interop_cpp_sub1 Engine and stream
    ///
    /// All oneDNN primitives and memory objects are attached to a
    /// particular @ref dnnl::engine, which is an abstraction of a
    /// computational device (see also @ref dev_guide_basic_concepts). The
    /// primitives are created and optimized for the device to which they are
    /// attached, and the memory objects refer to memory residing on the
    /// corresponding device. In particular, that means neither memory objects
    /// nor primitives that were created for one engine can be used on
    /// another.
    ///
    /// To create engines, we must specify the @ref dnnl::engine::kind
    /// and the index of the device of the given kind. In this example we use
    /// the first available GPU engine, so the index for the engine is 0.
    /// This example assumes OpenCL is the runtime for GPU. In that case,
    /// during engine creation, an OpenCL context is also created and attached
    /// to the created engine.
    ///
    /// @snippet gpu_opencl_interop.cpp Initialize engine
    // [Initialize engine]
    engine eng(engine::kind::gpu, 0);
    // [Initialize engine]
    /// In addition to an engine, all primitives require a @ref dnnl::stream
    /// for the execution. The stream encapsulates an execution context and is
    /// tied to a particular engine.
    ///
    /// In this example, a GPU stream is created.
    /// This example assumes OpenCL is the runtime for GPU. During stream
    /// creation, an OpenCL command queue is also created and attached to this
    /// stream.
    ///
    /// @snippet gpu_opencl_interop.cpp Initialize stream
    // [Initialize stream]
    dnnl::stream strm(eng);
    // [Initialize stream]
    /// @subsection gpu_opencl_interop_cpp_sub2 Wrapping data into oneDNN memory object
    ///
    /// Next, we create a memory object. We need to specify dimensions of our
    /// memory by passing a memory::dims object. Then we create a memory
    /// descriptor with these dimensions, with the dnnl::memory::data_type::f32
    /// data type, and with the dnnl::memory::format_tag::nchw memory format.
    /// Finally, we construct a memory object and pass the memory descriptor.
    /// The library allocates memory internally.
    /// @snippet gpu_opencl_interop.cpp memory alloc
    // [memory alloc]
    memory::dims tz_dims = {2, 3, 4, 5};
    // N = total number of elements (product of all dimensions).
    const size_t N = std::accumulate(tz_dims.begin(), tz_dims.end(), (size_t)1,
            std::multiplies<size_t>());
    memory::desc mem_d(
            tz_dims, memory::data_type::f32, memory::format_tag::nchw);
    memory mem(mem_d, eng);
    // [memory alloc]
    /// @subsection gpu_opencl_interop_cpp_sub3 Initialize the data by executing a custom OpenCL kernel
    /// We are going to create an OpenCL kernel that will initialize our data.
    /// It requires writing a bit of C code to create an OpenCL program from a
    /// string literal source. The kernel initializes the data by the
    /// 0, -1, 2, -3, ... sequence: `data[i] = (-1)^i * i`.
    /// @snippet gpu_opencl_interop.cpp ocl kernel
    // [ocl kernel]
    const char *ocl_code
            = "__kernel void init(__global float *data) {"
              "    int id = get_global_id(0);"
              "    data[id] = (id % 2) ? -id : id;"
              "}";
    // [ocl kernel]
    /// Create/build the OpenCL kernel with the `create_init_opencl_kernel()`
    /// function. Refer to the full code example for the
    /// `create_init_opencl_kernel()` function.
    /// @snippet gpu_opencl_interop.cpp oclkernel create
    // [oclkernel create]
    const char *kernel_name = "init";
    cl_kernel ocl_init_kernel = create_init_opencl_kernel(
            ocl_interop::get_context(eng), kernel_name, ocl_code);
    // [oclkernel create]
    /// The next step is to execute our OpenCL kernel by setting its arguments
    /// and enqueueing to an OpenCL queue. You can extract the underlying OpenCL
    /// buffer from the memory object using the interoperability interface:
    /// dnnl::ocl_interop::get_mem_object() . For simplicity we can just
    /// construct a stream, extract the underlying OpenCL queue, and enqueue
    /// the kernel to this queue.
    /// @snippet gpu_opencl_interop.cpp oclexecution
    // [oclexecution]
    cl_mem ocl_buf = ocl_interop::get_mem_object(mem);
    OCL_CHECK(clSetKernelArg(ocl_init_kernel, 0, sizeof(ocl_buf), &ocl_buf));
    cl_command_queue ocl_queue = ocl_interop::get_command_queue(strm);
    // One work-item per element; global size N, implementation-chosen local size.
    OCL_CHECK(clEnqueueNDRangeKernel(ocl_queue, ocl_init_kernel, 1, nullptr, &N,
            nullptr, 0, nullptr, nullptr));
    // [oclexecution]
    /// @subsection gpu_opencl_interop_cpp_sub4 Create and execute a primitive
    /// There are three steps to create an operation primitive in oneDNN:
    /// 1. Create an operation descriptor.
    /// 2. Create a primitive descriptor.
    /// 3. Create a primitive.
    ///
    /// Let's create the primitive to perform the ReLU (rectified linear unit)
    /// operation: x = max(0, x). An operation descriptor has no dependency on a
    /// specific engine - it just describes some operation. On the contrary,
    /// primitive descriptors are attached to a specific engine and represent
    /// some implementation for this engine. A primitive object is a realization
    /// of a primitive descriptor, and its construction is usually much
    /// "heavier".
    /// @snippet gpu_opencl_interop.cpp relu creation
    // [relu creation]
    auto relu_pd = eltwise_forward::primitive_desc(eng, prop_kind::forward,
            algorithm::eltwise_relu, mem_d, mem_d, 0.0f);
    auto relu = eltwise_forward(relu_pd);
    // [relu creation]
    /// Next, execute the primitive.
    /// @snippet gpu_opencl_interop.cpp relu exec
    // [relu exec]
    relu.execute(strm, {{DNNL_ARG_SRC, mem}, {DNNL_ARG_DST, mem}});
    strm.wait();
    // [relu exec]
    ///
    ///@note
    ///    Our memory object `mem` serves as both input and output parameter
    ///    (the ReLU is computed in place).
    ///
    ///
    ///@note
    ///    Primitive submission on GPU is asynchronous; however, the user can
    ///    call dnnl::stream::wait() to synchronize the stream and ensure that
    ///    all previously submitted primitives are completed.
    ///
    /// @page gpu_opencl_interop_cpp
    /// @subsection gpu_opencl_interop_cpp_sub5 Validate the results
    ///
    /// Before running validation codes, we need to copy the OpenCL memory to
    /// the host. This can be done using OpenCL API. For convenience, we use a
    /// utility function read_from_dnnl_memory() implementing required OpenCL API
    /// calls. After we read the data to the host, we can run validation codes
    /// on the host accordingly.
    /// @snippet gpu_opencl_interop.cpp Check the results
    // [Check the results]
    std::vector<float> mem_data(N);
    read_from_dnnl_memory(mem_data.data(), mem);
    for (size_t i = 0; i < N; i++) {
        // The init kernel wrote (-1)^i * i, so ReLU zeroes the odd
        // (negative) entries and keeps the even (non-negative) ones.
        float expected = (i % 2) ? 0.0f : (float)i;
        if (mem_data[i] != expected) {
            std::cout << "Expect " << expected << " but got " << mem_data[i]
                      << "." << std::endl;
            throw std::logic_error("Accuracy check failed.");
        }
    }
    // [Check the results]
    OCL_CHECK(clReleaseKernel(ocl_init_kernel));
}
int main(int argc, char **argv) {
    // This example takes no command-line arguments.
    (void)argc;
    (void)argv;
    // Run the tutorial through the shared error-handling wrapper, which
    // converts exceptions (e.g. no GPU engine available) into a readable
    // message and an appropriate exit status.
    return handle_example_errors(
            {engine::kind::gpu}, gpu_opencl_interop_tutorial);
}
/// @page gpu_opencl_interop_cpp Getting started on GPU with OpenCL extensions API
///
/// <b></b>
///
/// Upon compiling and running the example, the output should be just:
///
/// ~~~
/// Example passed.
/// ~~~
///