File: gpu_opencl_interop.cpp

package info (click to toggle)
onednn 3.9.1%2Bds-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 79,124 kB
  • sloc: cpp: 850,217; ansic: 37,403; lisp: 16,757; python: 3,463; asm: 831; sh: 78; javascript: 66; makefile: 41
file content (273 lines) | stat: -rw-r--r-- 11,387 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
/*******************************************************************************
* Copyright 2019-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

/// @example  gpu_opencl_interop.cpp
/// @copybrief gpu_opencl_interop_cpp
/// > Annotated version: @ref gpu_opencl_interop_cpp

/// @page  gpu_opencl_interop_cpp Getting started on GPU with OpenCL extensions API
/// This C++ API example demonstrates programming for Intel(R) Processor
/// Graphics with OpenCL* extensions API in oneDNN.
///
/// > Example code: @ref gpu_opencl_interop.cpp
///
/// The workflow includes following steps:
///   - Create a GPU engine. It uses OpenCL as the runtime in this sample.
///   - Create a GPU memory descriptor/object.
///   - Create an OpenCL kernel for GPU data initialization
///   - Access a GPU memory via OpenCL interoperability interface
///   - Access a GPU command queue via OpenCL interoperability interface
///   - Execute a OpenCL kernel with related GPU command queue and GPU memory
///   - Create operation descriptor/operation primitives descriptor/primitive .
///   - Execute the primitive with the initialized GPU memory
///   - Validate the result by mapping the OpenCL memory via OpenCL interoperability
///     interface
///

/// @page gpu_opencl_interop_cpp
/// @section gpu_opencl_interop_cpp_headers Public headers
///
/// To start using oneDNN, we must first include the @ref dnnl.hpp
/// header file in the application. We also include CL/cl.h for using
/// OpenCL APIs and @ref dnnl_debug.h, which  contains some debugging
/// facilities such as returning a string representation
/// for common oneDNN C types.
/// All C++ API types and functions reside in the `dnnl` namespace.
/// For simplicity of the example we import this namespace.
/// @page gpu_opencl_interop_cpp
/// @snippet  gpu_opencl_interop.cpp Prologue
// [Prologue]
#include <iostream>
#include <numeric>
#include <stdexcept>

#include <CL/cl.h>

#include "oneapi/dnnl/dnnl.hpp"
#include "oneapi/dnnl/dnnl_ocl.hpp"

#include "example_utils.hpp"

using namespace dnnl;
using namespace std;
// [Prologue]

#define OCL_CHECK(x) \
    do { \
        cl_int s = (x); \
        if (s != CL_SUCCESS) { \
            std::cout << "[" << __FILE__ << ":" << __LINE__ << "] '" << #x \
                      << "' failed (status code: " << s << ")." << std::endl; \
            exit(1); \
        } \
    } while (0)

cl_kernel create_init_opencl_kernel(
        cl_context ocl_ctx, const char *kernel_name, const char *ocl_code) {
    cl_int err;
    const char *sources[] = {ocl_code};
    cl_program ocl_program
            = clCreateProgramWithSource(ocl_ctx, 1, sources, nullptr, &err);
    OCL_CHECK(err);

    OCL_CHECK(
            clBuildProgram(ocl_program, 0, nullptr, nullptr, nullptr, nullptr));

    cl_kernel ocl_kernel = clCreateKernel(ocl_program, kernel_name, &err);
    OCL_CHECK(err);

    OCL_CHECK(clReleaseProgram(ocl_program));
    return ocl_kernel;
}

/// @page gpu_opencl_interop_cpp
/// @section gpu_opencl_interop_cpp_tutorial gpu_opencl_interop_tutorial() function
///
void gpu_opencl_interop_tutorial() {
    /// @page gpu_opencl_interop_cpp
    /// @subsection gpu_opencl_interop_cpp_sub1 Engine and stream
    ///
    /// All oneDNN primitives and memory objects are attached to a
    /// particular @ref dnnl::engine, which is an abstraction of a
    /// computational device (see also @ref dev_guide_basic_concepts). The
    /// primitives are created and optimized for the device to which they are
    /// attached, and the memory objects refer to memory residing on the
    /// corresponding device. In particular, that means neither memory objects
    /// nor primitives that were created for one engine can be used on
    /// another.
    ///
    /// To create engines, we must specify the @ref dnnl::engine::kind
    /// and the index of the device of the given kind. In this example we use
    /// the first available GPU engine, so the index for the engine is 0.
    /// This example assumes OpenCL being a runtime for GPU. In such case,
    /// during engine creation, an OpenCL context is also created and attaches
    /// to the created engine.
    ///
    /// @snippet  gpu_opencl_interop.cpp Initialize engine
    // [Initialize engine]
    engine eng(engine::kind::gpu, 0);
    // [Initialize engine]

    /// In addition to an engine, all primitives require a @ref dnnl::stream
    /// for the execution. The stream encapsulates an execution context and is
    /// tied to a particular engine.
    ///
    /// In this example, a GPU stream is created.
    /// This example assumes OpenCL being a runtime for GPU. During stream creation,
    /// an OpenCL command queue is also created and attaches to this stream.
    ///
    /// @snippet  gpu_opencl_interop.cpp Initialize stream
    // [Initialize stream]
    dnnl::stream strm(eng);
    // [Initialize stream]

    /// @subsection  gpu_opencl_interop_cpp_sub2 Wrapping data into oneDNN memory object
    ///
    /// Next, we create a memory object. We need to specify dimensions of our
    /// memory by passing a memory::dims object. Then we create a memory
    /// descriptor with these dimensions, with the dnnl::memory::data_type::f32
    /// data type, and with the dnnl::memory::format_tag::nchw memory format.
    /// Finally, we construct a memory object and pass the memory descriptor.
    /// The library allocates memory internally.
    /// @snippet  gpu_opencl_interop.cpp memory alloc
    //  [memory alloc]
    memory::dims tz_dims = {2, 3, 4, 5};
    const size_t N = std::accumulate(tz_dims.begin(), tz_dims.end(), (size_t)1,
            std::multiplies<size_t>());

    memory::desc mem_d(
            tz_dims, memory::data_type::f32, memory::format_tag::nchw);

    memory mem(mem_d, eng);
    //  [memory alloc]

    /// @subsection  gpu_opencl_interop_cpp_sub3 Initialize the data by executing a custom OpenCL kernel
    /// We are going to create an OpenCL kernel that will initialize our data.
    /// It requires writing a bit of C code to create an OpenCL program from a
    /// string literal source. The kernel initializes the data by the
    /// 0, -1, 2, -3, ... sequence: `data[i] = (-1)^i * i`.
    /// @snippet  gpu_opencl_interop.cpp ocl kernel
    //  [ocl kernel]
    const char *ocl_code
            = "__kernel void init(__global float *data) {"
              "    int id = get_global_id(0);"
              "    data[id] = (id % 2) ? -id : id;"
              "}";
    //  [ocl kernel]

    /// Create/Build Opencl kernel by `create_init_opencl_kernel()` function.
    /// Refer to the full code example for the `create_init_opencl_kernel()`
    /// function.
    /// @snippet  gpu_opencl_interop.cpp oclkernel create
    // [oclkernel create]
    const char *kernel_name = "init";
    cl_kernel ocl_init_kernel = create_init_opencl_kernel(
            ocl_interop::get_context(eng), kernel_name, ocl_code);
    //  [oclkernel create]

    /// The next step is to execute our OpenCL kernel by setting its arguments
    /// and enqueueing to an OpenCL queue. You can extract the underlying OpenCL
    /// buffer from the memory object using  the interoperability interface:
    /// dnnl::memory::get_ocl_mem_object() . For simplicity we can just construct a
    /// stream, extract the underlying OpenCL queue, and enqueue the kernel to
    /// this queue.
    /// @snippet  gpu_opencl_interop.cpp oclexecution
    // [oclexecution]
    cl_mem ocl_buf = ocl_interop::get_mem_object(mem);
    OCL_CHECK(clSetKernelArg(ocl_init_kernel, 0, sizeof(ocl_buf), &ocl_buf));

    cl_command_queue ocl_queue = ocl_interop::get_command_queue(strm);
    OCL_CHECK(clEnqueueNDRangeKernel(ocl_queue, ocl_init_kernel, 1, nullptr, &N,
            nullptr, 0, nullptr, nullptr));
    // [oclexecution]

    /// @subsection gpu_opencl_interop_cpp_sub4 Create and execute a primitive
    /// There are three steps to create an operation primitive in oneDNN:
    /// 1. Create an operation descriptor.
    /// 2. Create a primitive descriptor.
    /// 3. Create a primitive.
    ///
    /// Let's create the primitive to perform the ReLU (rectified linear unit)
    /// operation: x = max(0, x). An operation descriptor has no dependency on a
    /// specific engine - it just describes some operation. On the contrary,
    /// primitive descriptors are attached to a specific engine and represent
    /// some implementation for this engine. A primitive object is a realization
    /// of a primitive descriptor, and its construction is usually much
    /// "heavier".
    /// @snippet gpu_opencl_interop.cpp relu creation
    //  [relu creation]
    auto relu_pd = eltwise_forward::primitive_desc(eng, prop_kind::forward,
            algorithm::eltwise_relu, mem_d, mem_d, 0.0f);
    auto relu = eltwise_forward(relu_pd);
    //  [relu creation]

    /// Next, execute the primitive.
    /// @snippet gpu_opencl_interop.cpp relu exec
    // [relu exec]
    relu.execute(strm, {{DNNL_ARG_SRC, mem}, {DNNL_ARG_DST, mem}});
    strm.wait();
    // [relu exec]
    ///
    ///@note
    ///    Our primitive mem serves as both input and output parameter.
    ///
    ///
    ///@note
    ///    Primitive submission on GPU is asynchronous; However, the user can
    ///    call dnnl:stream::wait() to synchronize the stream and ensure that all
    ///    previously submitted primitives are completed.
    ///

    /// @page gpu_opencl_interop_cpp
    /// @subsection gpu_opencl_interop_cpp_sub5 Validate the results
    ///
    /// Before running validation codes, we need to copy the OpenCL memory to
    /// the host. This can be done using OpenCL API. For convenience, we use a
    /// utility function read_from_dnnl_memory() implementing required OpenCL API
    /// calls. After we read the data to the host, we can run validation codes
    /// on the host accordingly.
    /// @snippet gpu_opencl_interop.cpp Check the results
    // [Check the results]
    std::vector<float> mem_data(N);
    read_from_dnnl_memory(mem_data.data(), mem);
    for (size_t i = 0; i < N; i++) {
        float expected = (i % 2) ? 0.0f : (float)i;
        if (mem_data[i] != expected) {
            std::cout << "Expect " << expected << " but got " << mem_data[i]
                      << "." << std::endl;
            throw std::logic_error("Accuracy check failed.");
        }
    }
    // [Check the results]

    OCL_CHECK(clReleaseKernel(ocl_init_kernel));
}

int main(int argc, char **argv) {
    return handle_example_errors(
            {engine::kind::gpu}, gpu_opencl_interop_tutorial);
}

/// @page  gpu_opencl_interop_cpp Getting started on GPU with OpenCL extensions API
///
/// <b></b>
///
/// Upon compiling and running the example, the output should be just:
///
/// ~~~
/// Example passed.
/// ~~~
///