File: performance_profiling.cpp

package info (click to toggle)
onednn 3.9.1%2Bds-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 79,124 kB
  • sloc: cpp: 850,217; ansic: 37,403; lisp: 16,757; python: 3,463; asm: 831; sh: 78; javascript: 66; makefile: 41
file content (570 lines) | stat: -rw-r--r-- 26,975 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
/*******************************************************************************
* Copyright 2019-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

/// @example performance_profiling.cpp
/// @copybrief performance_profiling_cpp
/// > Annotated version: @ref performance_profiling_cpp

/// @page performance_profiling_cpp Performance Profiling Example
/// This example demonstrates the best practices for application performance
/// optimizations with oneDNN.
///
/// > Example code: @ref performance_profiling.cpp
///
/// This example uses [ONEDNN_VERBOSE](@ref dev_guide_verbose) trace output
/// to tune oneDNN code to align
/// with the [best practices](@ref dev_guide_inference).
///
/// It assumes knowledge of memory formats and their usage in
/// oneDNN. You can read more about this topic
/// [here](@ref memory_format_propagation_cpp).
///
/// Additionally, see the [article for recommended environment for
/// running benchmarks](@ref dev_guide_performance_settings).
///
/// The example has three different implementations of the mathematical
/// operation:
/// 1. *Naive implementation* executes 2D convolution followed by
/// ReLU on the data in **NCHW** format. This implementation
/// does not align with oneDNN best practices and results in
/// suboptimal performance.
/// 2. *Blocked format implementation* executes the same sequence of
/// operations on the **blocked format** optimized for convolution
/// performance. This implementation uses `format_tag=ANY` to create a
/// convolution memory descriptor to determine the data format optimal
/// for the convolution implementation. It then **propagates the blocked
/// format** to the non-intensive ReLU. This implementation results
/// in better overall performance than the naive implementation.
/// 3. *Fused implementation* executes convolution fused with ReLU on
/// blocked data format. This implementation uses
/// `format_tag=ANY` to create a convolution memory descriptor, and then
/// adds ReLU as a **post-op** to the convolution primitive. This version
/// implements all of the best practices for inference resulting in the
/// best overall performance.
///
/// @section performance_profiling_cpp_walkthrough Walkthrough
///
/// The program in \ref performance_profiling.cpp includes all three
/// implementations introduced above. You can select the specific implementation
/// using command line options.
///
/// After compilation, you can execute each implementation with:
/// ~~~sh
/// ./program.exe [cpu|gpu] [implementation]
/// ~~~
///
/// Before you run the program, set your `ONEDNN_VERBOSE` environment
/// variable to `profile_exec`:
/// ~~~sh
/// export ONEDNN_VERBOSE=profile_exec
/// ~~~
///
/// The program starts by creating oneDNN memory objects in **NCHW**
/// format. These are called `user_` because they are meant to represent the
/// user's source data entering oneDNN with the NCHW format.
/// @page performance_profiling_cpp
/// @snippet performance_profiling.cpp Set dimensions
/// @page performance_profiling_cpp
/// @note Here the library allocates memory.
/// @page performance_profiling_cpp
/// @snippet performance_profiling.cpp Create memory objects
/// @page performance_profiling_cpp
/// @note You can change the batch size to easily increase/decrease the workload.
///
/// The following descriptions of each implementation will reference each other,
/// and are meant to be read in order.
///

#include <iostream>
#include <stdexcept>
#include <vector>

#include "oneapi/dnnl/dnnl.hpp"

#include "example_utils.hpp"

using namespace dnnl;

// [Prologue]

// Convolution geometry shared by every implementation in this example:
// a stride of 4 and zero padding for each of the two spatial dimensions
const memory::dims strides {4, 4};
const memory::dims padding {0, 0};

// [Prologue]
//
// Fills the memory object `m` with the constant `v`, treating its contents
// as an array of floats
void init_data(memory &m, float v) {
    // number of floats = byte size reported by the descriptor / sizeof(float)
    const size_t n_elems = m.get_desc().get_size() / sizeof(float);

    // stage the constant in a host-side buffer, then copy it into `m`
    std::vector<float> staging(n_elems, v);
    write_to_dnnl_memory(staging.data(), m);
}

// Builds a standalone (non-fused) ReLU primitive for `data` and runs it on
// stream `s`, overwriting `data` with the result
void create_and_execute_relu(memory &data, engine &eng, stream &s) {
    // the same descriptor serves as source and destination, so relu simply
    // works on whatever format `data` currently has
    const auto data_md = data.get_desc();

    // describe and instantiate the eltwise primitive
    eltwise_forward::primitive_desc relu_pd(eng, prop_kind::forward_inference,
            algorithm::eltwise_relu, data_md, data_md, 0.f, 0.f);
    eltwise_forward relu(relu_pd);

    // in-place execution: source and destination are the same memory object
    relu.execute(s, {{DNNL_ARG_SRC, data}, {DNNL_ARG_DST, data}});
}

// [Create post_op attr with relu]
// Returns a primitive attribute whose post-op chain holds a single ReLU, so
// that a primitive created with it applies ReLU as part of its own execution
primitive_attr create_attr_with_relu_post_op() {
    // post-op chain consisting of one eltwise relu
    post_ops relu_chain;
    relu_chain.append_eltwise(algorithm::eltwise_relu, 0.f, 0.f);

    // attach the chain to a fresh attribute object
    primitive_attr fused_attr;
    fused_attr.set_post_ops(relu_chain);

    return fused_attr;
}
// [Create post_op attr with relu]

// Implementation for naive convolution on nchw (data) and oihw (weights),
// followed by execution of non-fused relu.
//
// user_src / user_wei: convolution inputs, used directly without reordering.
// user_dst: receives the convolution output and is then overwritten in-place
//           by relu.
// NOTE(review): user_dst is taken by value; presumably dnnl::memory is a
// shared handle so the caller's buffer is the one written -- confirm.
// eng / s: engine and stream on which the primitives are created and executed.
// The function waits on the stream before returning.
void conv_relu_naive(const memory &user_src, const memory &user_wei,
        memory user_dst, engine &eng, stream &s) {
    /// @section performance_profiling_cpp_implementation1 Naive Implementation
    /// This implementation is launched with the following shell code:
    /// ~~~sh
    /// ./program.exe cpu naive
    /// ~~~
    /// The program will call the implementation defined in the function
    /// `conv_relu_naive()`.
    ///
    /// First it sets the dimensions and format for convolution memory
    /// descriptors (`_md`) to match `user_` values--one `md` each for source,
    /// destination, and weight data. Then it uses those `md` to create the
    /// convolution primitive descriptor `conv_pd`, which tells oneDNN to use
    /// plain format (NCHW) for the convolution.
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create mem_desc
    // [Create mem_desc]
    // copy the dimensions and format from user's memory
    auto conv_src_md = memory::desc(user_src.get_desc());
    auto conv_wei_md = memory::desc(user_wei.get_desc());
    auto conv_dst_md = memory::desc(user_dst.get_desc());
    // [Create mem_desc]
    /// @page performance_profiling_cpp
    /// Next the program creates a convolution primitive descriptor `conv_pd`
    /// and convolution primitive `conv`. The primitive descriptor is built
    /// from the `md` above, so the convolution uses their NCHW format.
    /// Finally it adds the convolution primitive `conv` to the stream `s`,
    /// and then executes the `create_and_execute_relu(user_dst)` function.
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create conv_prim_desc
    // [Create conv_prim_desc]
    // create a convolution primitive descriptor
    auto conv_pd = convolution_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::convolution_direct,
            conv_src_md, conv_wei_md, conv_dst_md, strides, padding, padding);
    // [Create conv_prim_desc]
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create conv_primitive
    // [Create conv_primitive]
    // create convolution primitive
    auto conv = convolution_forward(conv_pd);
    // [Create conv_primitive]
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Add to stream
    // [Add to stream]
    // execute convolution by adding it to the stream s
    conv.execute(s,
            {{DNNL_ARG_SRC, user_src}, {DNNL_ARG_WEIGHTS, user_wei},
                    {DNNL_ARG_DST, user_dst}});
    // [Add to stream]
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create and execute relu
    // [Create and execute relu]
    // execute relu (on convolution's destination format, whatever it is)
    create_and_execute_relu(user_dst, eng, s);
    s.wait();
    // [Create and execute relu]
    /// @page performance_profiling_cpp
    /// @note The function for creation and execution of ReLU primitive is
    /// defined elsewhere to keep this example clean. It is a non-intensive
    /// operation, so the `create_and_execute_relu()` function uses whatever
    /// the input data format is at the time it is called.
    ///
    /// Using NCHW data format may result in suboptimal performance for compute
    /// intensive primitives, as shown in the following ONEDNN_VERBOSE output
    /// by the convolution and relu execution
    /// times of 38.3 and 2.9 milliseconds, respectively.
    ///
    /// *ONEDNN_VERBOSE output (see configuration notice\*):*
    /// ~~~sh
    /// onednn_verbose,v0,exec,cpu,convolution,gemm:jit,forward_inference,src_f32::blocked:abcd:f0 wei_f32::blocked:abcd:f0 bia_undef::undef::f0 dst_f32::blocked:abcd:f0,,alg:convolution_direct,mb128_ic3oc96_ih227oh55kh11sh4dh0ph0_iw227ow55kw11sw4dw0pw0,38.314
    /// onednn_verbose,v0,exec,cpu,eltwise,jit:avx512_common,forward_inference,data_f32::blocked:abcd:f0 diff_undef::undef::f0,,alg:eltwise_relu alpha:0 beta:0,128x96x55x55,2.87695
    /// ~~~
    /// In *Blocked format implementation*, we will incorporate the best
    /// practice of letting oneDNN determine the optimal format
    /// for convolution primitive.
}

// Implementation for convolution on blocked format for data and
// weights, followed by execution of non-fused relu.
//
// user_src / user_wei: inputs in the user's format; each is reordered into
//                      the format chosen by the convolution when they differ.
// user_dst: receives the final result in the user's format (a reorder back
//           is executed when the convolution picked a different format).
// eng / s: engine and stream on which the primitives are created and executed.
// The function waits on the stream before returning.
void conv_relu_blocked(memory user_src, memory user_wei, memory user_dst,
        engine &eng, stream &s) {
    /// @page performance_profiling_cpp
    /// @section performance_profiling_cpp_implementation2 Blocked format implementation
    /// This implementation is launched with the following shell code:
    /// ~~~sh
    /// ./program.exe cpu blocked
    /// ~~~
    /// The program will call the implementation defined in the function
    /// `conv_relu_blocked()`.
    ///
    /// First it creates the md as in **naive implementation**. Next it changes
    /// the dnnl::memory::format_tag for each md to `ANY`. Then it uses those
    /// md to create the convolution primitive descriptor conv_pd, which tells
    /// oneDNN to use whatever format it recommends for the convolution.
    /// oneDNN will choose a friendly blocked format.
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create mem_desc with tag=any
    // [Create mem_desc with tag=any]
    // copy the dimensions and data type from user's memory and set format tag
    // to "any" to allow convolution to pick the best implementation
    auto conv_src_md = memory::desc(user_src.get_desc().get_dims(),
            user_src.get_desc().get_data_type(), memory::format_tag::any);
    auto conv_wei_md = memory::desc(user_wei.get_desc().get_dims(),
            user_wei.get_desc().get_data_type(), memory::format_tag::any);
    auto conv_dst_md = memory::desc(user_dst.get_desc().get_dims(),
            user_dst.get_desc().get_data_type(), memory::format_tag::any);
    // [Create mem_desc with tag=any]

    /// Next the program creates a convolution primitive descriptor conv_pd and
    /// convolution primitive conv as in naive implementation.
    /// However, in this implementation the md use `format_tag::any`, so the
    /// memory formats are chosen by the primitive descriptor conv_pd.
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create conv_prim_desc implementation2
    // [Create conv_prim_desc implementation2]
    // create a convolution primitive descriptor and primitive
    auto conv_pd = convolution_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::convolution_direct,
            conv_src_md, conv_wei_md, conv_dst_md, strides, padding, padding);
    // [Create conv_prim_desc implementation2]
    /// Since the resulting convolution primitive may expect
    /// blocked data, conditional reorders are inserted to convert
    /// input data to the expected format if required.
    /// Each conditional is triggered only when the format selected by
    /// conv_pd differs from the format of the corresponding `user_` memory:
    ///
    /// @note The reorders are applied using oneDNN `reorder` primitive.
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Conditionally create and execute reorder prims
    // [Conditionally create and execute reorder prims]
    // prepare convolution source
    memory conv_src = user_src;
    if (conv_pd.src_desc() != user_src.get_desc()) {
        conv_src = memory(conv_pd.src_desc(), eng);
        auto r_pd = reorder::primitive_desc(user_src, conv_src);
        reorder(r_pd).execute(s, user_src, conv_src);
    }

    // prepare convolution weights
    memory conv_wei = user_wei;
    if (conv_pd.weights_desc() != user_wei.get_desc()) {
        conv_wei = memory(conv_pd.weights_desc(), eng);
        auto r_pd = reorder::primitive_desc(user_wei, conv_wei);
        reorder(r_pd).execute(s, user_wei, conv_wei);
    }

    // prepare convolution destination
    // (only allocated here; it is reordered back to user_dst after relu)
    memory conv_dst = user_dst;
    if (conv_pd.dst_desc() != user_dst.get_desc())
        conv_dst = memory(conv_pd.dst_desc(), eng);
    // [Conditionally create and execute reorder prims]
    /// Finally it creates the convolution primitive `conv` and adds it to the
    /// stream `s` with the reordered data (`conv_src`, `conv_wei`, `conv_dst`)
    /// as inputs and then executes the
    /// `create_and_execute_relu(conv_dst)` function.
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create conv_primitive implementation2
    // [Create conv_primitive implementation2]
    // create convolution primitive
    auto conv = convolution_forward(conv_pd);
    // [Create conv_primitive implementation2]
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Add to stream implementation2
    // [Add to stream implementation2]
    // execute convolution by adding it to the stream s
    conv.execute(s,
            {{DNNL_ARG_SRC, conv_src}, {DNNL_ARG_WEIGHTS, conv_wei},
                    {DNNL_ARG_DST, conv_dst}});
    // [Add to stream implementation2]
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create and execute relu implementation2
    // [Create and execute relu implementation2]
    // execute relu (on convolution's destination format, whatever it is)
    create_and_execute_relu(conv_dst, eng, s);
    // [Create and execute relu implementation2]
    // reorder data to the user's format if needed.
    if (conv_pd.dst_desc() != user_dst.get_desc()) {
        auto r_pd = reorder::primitive_desc(conv_dst, user_dst);
        reorder(r_pd).execute(s, conv_dst, user_dst);
    }
    s.wait();
    /// @page performance_profiling_cpp
    /// Blocked memory format is recommended for oneDNN primitive
    /// execution and provides better performance, as shown in the
    /// ONEDNN_VERBOSE output by the convolution and relu execution times of
    /// 18.3 and 2.7 milliseconds (down from 38.3 and 2.9 in
    /// *naive implementation*), respectively.
    /// In this implementation, there is an additional reorder operation that
    /// executes before and after the conv + relu. This small cost is worth
    /// the gain from executing in blocked format. In fact, it becomes
    /// negligible when chaining together multiple oneDNN operations in
    /// succession. In these situations, you can do one reorder at the beginning
    /// and one at the end of the chain, and only pay the reorder penalty at
    /// those points in the execution.
    ///
    /// *ONEDNN_VERBOSE output (see configuration notice\*):*
    /// ~~~sh
    /// onednn_verbose,v0,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:abcd:f0 dst_f32::blocked:Acdb16a:f0,,,96x3x11x11,0.0310059
    /// onednn_verbose,v0,exec,cpu,convolution,jit:avx512_common,forward_inference,src_f32::blocked:abcd:f0 wei_f32::blocked:Acdb16a:f0 bia_undef::undef::f0 dst_f32::blocked:aBcd16b:f0,,alg:convolution_direct,mb128_ic3oc96_ih227oh55kh11sh4dh0ph0_iw227ow55kw11sw4dw0pw0,18.3101
    /// onednn_verbose,v0,exec,cpu,eltwise,jit:avx512_common,forward_inference,data_f32::blocked:aBcd16b:f0 diff_undef::undef::f0,,alg:eltwise_relu alpha:0 beta:0,128x96x55x55,2.66895
    /// onednn_verbose,v0,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:aBcd16b:f0 dst_f32::blocked:abcd:f0,,,128x96x55x55,4.80396
    /// ~~~
    /// This inference implementation is closer to best practices than
    /// *naive implementation* because it uses oneDNN recommended memory
    /// format. *fused implementation* will further optimize the performance by
    /// fusing convolution with ReLU using oneDNN
    /// [post-ops](@ref dev_guide_attributes_post_ops).
}

// Implementation for convolution on blocked format for data and
// weights and the relu operation fused via a post-op attribute added to the
// convolution prim_descriptor.
//
// user_src / user_wei: inputs in the user's format; each is reordered into
//                      the format chosen by the convolution when they differ.
// user_dst: receives the final result in the user's format (a reorder back
//           is executed when the convolution picked a different format).
// eng / s: engine and stream on which the primitives are created and executed.
// The function waits on the stream before returning.
void conv_relu_fused(memory user_src, memory user_wei, memory user_dst,
        const engine &eng, stream &s) {
    /// @section performance_profiling_cpp_implementation3 Fused Implementation
    /// This implementation is launched with the following shell code:
    /// ~~~sh
    /// ./program.exe cpu fused
    /// ~~~
    /// The program will call the implementation defined in the function
    /// `conv_relu_fused()`.
    /// @page performance_profiling_cpp
    ///
    /// First the memory descriptors are created as in
    /// *blocked format implementation*.
    // copy the dimensions and data type from user's memory and set format tag
    // to any to allow convolution to pick the best implementation
    auto conv_src_md = memory::desc(user_src.get_desc().get_dims(),
            user_src.get_desc().get_data_type(), memory::format_tag::any);
    auto conv_wei_md = memory::desc(user_wei.get_desc().get_dims(),
            user_wei.get_desc().get_data_type(), memory::format_tag::any);
    auto conv_dst_md = memory::desc(user_dst.get_desc().get_dims(),
            user_dst.get_desc().get_data_type(), memory::format_tag::any);

    /// Then in preparation for the convolution prim descriptor, a ReLU post-op
    /// is built and added to the primitive attribute `attr`:
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create post_op attr with relu

    /// Next the convolution prim descriptor is created, which inherits the ReLU
    /// post-op by way of the attributes `attr`:
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create prim_desc with attr
    // [Create prim_desc with attr]
    // create an attribute for fused relu
    auto attr = create_attr_with_relu_post_op();

    // create a convolution primitive descriptor
    auto conv_pd = convolution_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::convolution_direct,
            conv_src_md, conv_wei_md, conv_dst_md, strides, padding, padding,
            attr);
    // [Create prim_desc with attr]
    /// Then conditional reorders are applied as in *blocked format
    /// implementation* to convert `user_` format NCHW to blocked. Finally, it
    /// creates the convolution primitive `conv` and adds it to the stream `s`
    /// with the reordered data (`conv_src`, `conv_wei`, `conv_dst`).
    // prepare convolution source
    memory conv_src = user_src;
    if (conv_pd.src_desc() != user_src.get_desc()) {
        conv_src = memory(conv_pd.src_desc(), eng);
        auto r_pd = reorder::primitive_desc(user_src, conv_src);
        reorder(r_pd).execute(s, user_src, conv_src);
    }

    // prepare convolution weights
    memory conv_wei = user_wei;
    if (conv_pd.weights_desc() != user_wei.get_desc()) {
        conv_wei = memory(conv_pd.weights_desc(), eng);
        auto r_pd = reorder::primitive_desc(user_wei, conv_wei);
        reorder(r_pd).execute(s, user_wei, conv_wei);
    }

    // prepare convolution destination
    // (only allocated here; it is reordered back to user_dst after execution)
    memory conv_dst = user_dst;
    if (conv_pd.dst_desc() != user_dst.get_desc())
        conv_dst = memory(conv_pd.dst_desc(), eng);
    /// @page performance_profiling_cpp
    /// @note There is no separate addition to the stream for the ReLU
    /// operation because it has been added as a post-op to the `conv` primitive.
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create conv_primitive implementation3
    // [Create conv_primitive implementation3]
    // create convolution primitive
    auto conv = convolution_forward(conv_pd);
    // [Create conv_primitive implementation3]
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Add to stream implementation3
    // [Add to stream implementation3]
    // execute convolution by adding it to the stream s
    conv.execute(s,
            {{DNNL_ARG_SRC, conv_src}, {DNNL_ARG_WEIGHTS, conv_wei},
                    {DNNL_ARG_DST, conv_dst}});
    // [Add to stream implementation3]
    // reorder data to user's format if needed
    if (conv_pd.dst_desc() != user_dst.get_desc()) {
        auto r_pd = reorder::primitive_desc(conv_dst, user_dst);
        reorder(r_pd).execute(s, conv_dst, user_dst);
    }
    s.wait();
    /// @page performance_profiling_cpp
    /// This implementation complies with best practices for f32 inference by
    /// using the oneDNN recommended blocked format for convolution and
    /// adding ReLU as a post-op to execute a fused version of conv + ReLU.
    /// The consequence of following best practices can be seen in the execution
    /// time of the fused primitive of 18.0 milliseconds.
    ///
    /// *ONEDNN_VERBOSE output (see configuration notice\*):*
    /// ~~~sh
    /// onednn_verbose,v0,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:abcd:f0 dst_f32::blocked:Acdb16a:f0,,,96x3x11x11,0.0148926
    /// onednn_verbose,v0,exec,cpu,convolution,jit:avx512_common,forward_inference,src_f32::blocked:abcd:f0 wei_f32::blocked:Acdb16a:f0 bia_undef::undef::f0 dst_f32::blocked:aBcd16b:f0,post_ops:'eltwise_relu;';,alg:convolution_direct,mb128_ic3oc96_ih227oh55kh11sh4dh0ph0_iw227ow55kw11sw4dw0pw0,17.968
    /// onednn_verbose,v0,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:aBcd16b:f0 dst_f32::blocked:abcd:f0,,,128x96x55x55,4.66797
    /// ~~~
}

/// @page performance_profiling_cpp
/// @section performance_profiling_cpp_roundup Performance summary
///
/// | Implementation | Time, ms | Cumulative speedup |
/// | :--            |      --: |                --: |
/// | Naive          |     41.2 |                1.0 |
/// | Blocked format |     21.0 |                2.0 |
/// | Fused          |     18.0 |                2.3 |
///
/// **  **
/// @page performance_profiling_cpp
/// @section performance_profiling_cpp_config Configuration Notice
/// @note This example is meant to demonstrate oneDNN best practices.
/// @note It is not meant for benchmarking purposes. The platform is not fully
/// @note optimized, so the primitive execution times are only relevant in
/// @note relation to the other times in this example.
///
/// Runtime Settings:
/// * OMP_NUM_THREADS=14
/// * KMP_AFFINITY=granularity=fine,compact
///
/// Platform:
/// * CPU: Intel(R) Xeon(R) Platinum 8180 CPU @ 2.50GHz
/// * Thread(s) per core:    1
/// * Core(s) per socket:    28
/// * Socket(s):             2
/// * NUMA node(s):          2
/// * RAM (DDR4): 192 GB

// Driver of the example: creates the engine, the stream and the synthetic
// user tensors, then runs the implementation(s) requested on the command line.
//
// engine_kind: kind of engine to create (index 0 is used).
// argc / argv: program arguments; with two or fewer arguments all
//              implementations run ("validation"), with exactly three the
//              implementation is named by argv[2].
// Throws std::invalid_argument when the selection is not one of
// "validation", "naive", "blocked" or "fused".
void performance_profiling(engine::kind engine_kind, int argc, char **argv) {
    // engine of the requested kind and a stream bound to it
    engine eng(engine_kind, 0);
    stream s(eng);

    // [Set dimensions]
    // set dimensions for synthetic data and weights
    const memory::dim BATCH = 128;
    const memory::dim IC = 3, OC = 96;
    const memory::dim IH = 227, KH = 11, OH = 55;
    const memory::dim IW = 227, KW = 11, OW = 55;
    // [Set dimensions]

    // [Create memory objects]
    // create oneDNN memory objects for user's tensors (in nchw and oihw formats)
    auto user_src = memory({{BATCH, IC, IH, IW}, memory::data_type::f32,
                                   memory::format_tag::nchw},
            eng);
    auto user_wei = memory({{OC, IC, KH, KW}, memory::data_type::f32,
                                   memory::format_tag::oihw},
            eng);
    auto user_dst = memory({{BATCH, OC, OH, OW}, memory::data_type::f32,
                                   memory::format_tag::nchw},
            eng);
    // [Create memory objects]

    // synthetic contents: constant source, destination and weights
    init_data(user_src, 1);
    init_data(user_dst, -1);
    init_data(user_wei, .5);

    // pick the implementation name; any argument count above three leaves it
    // empty, which is rejected below
    const std::string implementation = (argc <= 2)
            ? "validation"
            : (argc == 3 ? argv[2] : "");

    // "validation" switches every implementation on
    const bool run_all = implementation == "validation";
    const bool run_naive = run_all || implementation == "naive";
    const bool run_blocked = run_all || implementation == "blocked";
    const bool run_fused = run_all || implementation == "fused";

    // nothing selected means the name was not recognized
    if (!run_naive && !run_blocked && !run_fused) {
        std::cout << "The implementation can be one of:\n"
                  << " - naive: NCHW format without fusion\n"
                  << " - blocked: format propagation without fusion\n"
                  << " - fused: format propagation with fusion\n"
                  << " - validation: runs all implementations\n\n"
                  << "Validation will run if no parameters are specified.\n\n";

        throw std::invalid_argument("Incorrect input arguments.");
    }

    if (run_naive) {
        // conv + relu on plain formats, no fusion
        std::cout << "Implementation: naive.\n";
        conv_relu_naive(user_src, user_wei, user_dst, eng, s);
        std::cout << "Conv + ReLU w/ nchw format completed.\n";
    }

    if (run_blocked) {
        // conv + relu on library-chosen formats, no fusion
        std::cout << "Implementation: blocked.\n";
        conv_relu_blocked(user_src, user_wei, user_dst, eng, s);
        std::cout << "Conv + ReLU w/ blocked format completed.\n";
    }

    if (run_fused) {
        // conv with relu fused as a post-op
        std::cout << "Implementation: fused.\n";
        conv_relu_fused(user_src, user_wei, user_dst, eng, s);
        std::cout << "Conv + ReLU w/ fusing completed.\n";
    }
}

int main(int argc, char **argv) {
    engine::kind engine_kind = parse_engine_kind(argc, argv, 1);
    return handle_example_errors(
            performance_profiling, engine_kind, argc, argv);
}