1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570
|
/*******************************************************************************
* Copyright 2019-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
/// @example performance_profiling.cpp
/// @copybrief performance_profiling_cpp
/// > Annotated version: @ref performance_profiling_cpp
/// @page performance_profiling_cpp Performance Profiling Example
/// This example demonstrates the best practices for application performance
/// optimizations with oneDNN.
///
/// > Example code: @ref performance_profiling.cpp
///
/// This example uses [ONEDNN_VERBOSE](@ref dev_guide_verbose) trace output
/// to tune oneDNN code to align
/// with the [best practices](@ref dev_guide_inference).
///
/// It assumes knowledge of memory formats and their usage in
/// oneDNN. You can read more about this topic
/// [here](@ref memory_format_propagation_cpp).
///
/// Additionally, see the [article for recommended environment for
/// running benchmarks](@ref dev_guide_performance_settings).
///
/// The example has three different implementations of the mathematical
/// operation:
/// 1. *Naive implementation* executes 2D convolution followed by
/// ReLU on the data in **NCHW** format. This implementation
/// does not align with oneDNN best practices and results in
/// suboptimal performance.
/// 2. *Blocked format implementation* executes the same operations
/// sequence on the **blocked format** optimized for convolution
/// performance. This implementation uses `format_tag=ANY` to create a
/// convolution memory descriptor to determine the data format optimal
/// for the convolution implementation. It then **propagates the blocked
/// format** to the non-intensive ReLU. This implementation results
/// in better overall performance than the naive implementation.
/// 3. *Fused implementation* executes convolution fused with ReLU on
/// blocked data format. This implementation uses
/// `format_tag=ANY` to create a convolution memory descriptor, and then
/// adds ReLU as a **post-op** to the convolution primitive. This version
/// implements all of the best practices for inference resulting in the
/// best overall performance.
///
/// @section performance_profiling_cpp_walkthrough Walkthrough
///
/// The program in \ref performance_profiling.cpp includes all three
/// implementations introduced above. You can select the specific implementation
/// using command line options.
///
/// After compilation, you can execute each implementation with:
/// ~~~sh
/// ./program.exe [cpu|gpu] [implementation]
/// ~~~
///
/// Before you run the program, set your `ONEDNN_VERBOSE` environment
/// variable to `profile_exec`:
/// ~~~sh
/// export ONEDNN_VERBOSE=profile_exec
/// ~~~
///
/// The program starts by creating oneDNN memory objects in **NCHW**
/// format. These are called `user_` because they are meant to represent the
/// user's source data entering oneDNN with the NCHW format.
/// @page performance_profiling_cpp
/// @snippet performance_profiling.cpp Set dimensions
/// @page performance_profiling_cpp
/// @note Here the library allocates memory.
/// @page performance_profiling_cpp
/// @snippet performance_profiling.cpp Create memory objects
/// @page performance_profiling_cpp
/// @note You can change the batch size to easily increase/decrease the workload.
///
/// The following descriptions of each implementation will reference each other,
/// and are meant to be read in order.
///
#include <iostream>
#include <stdexcept>
#include <vector>
#include "oneapi/dnnl/dnnl.hpp"
#include "example_utils.hpp"
using namespace dnnl;
// [Prologue]
// Convolution geometry shared by all implementations below: a stride of 4
// and zero padding for each of the two spatial dimensions
const memory::dims strides {4, 4};
const memory::dims padding {0, 0};
// [Prologue]
//
// Fills memory object `m` with copies of the value `v`.
// The buffer is treated as an array of float: the element count is the byte
// size reported by the memory descriptor divided by sizeof(float). The data is
// staged in a host vector and handed to the write_to_dnnl_memory() helper.
void init_data(memory &m, float v) {
    const size_t bytes = m.get_desc().get_size();
    std::vector<float> staging(bytes / sizeof(float), v);
    write_to_dnnl_memory(staging.data(), m);
}
// Runs a stand-alone (non-fused) ReLU on `data`, in place, using engine `eng`
// and submitting the work to stream `s`. The function does not wait on the
// stream; callers do that themselves.
void create_and_execute_relu(memory &data, engine &eng, stream &s) {
    // the primitive simply adopts the layout of the memory it is given:
    // the very same descriptor describes both its source and destination
    const auto data_md = data.get_desc();

    eltwise_forward::primitive_desc relu_pd(eng, prop_kind::forward_inference,
            algorithm::eltwise_relu, data_md, data_md, 0.f, 0.f);
    eltwise_forward relu(relu_pd);

    // source and destination are the same memory object => in-place update
    relu.execute(s, {{DNNL_ARG_SRC, data}, {DNNL_ARG_DST, data}});
}
// [Create post_op attr with relu]
// builds the primitive attribute that carries ReLU as a fused post-op
primitive_attr create_attr_with_relu_post_op() {
    primitive_attr attr;

    // a post-op chain consisting of a single eltwise ReLU entry
    post_ops relu_ops;
    relu_ops.append_eltwise(algorithm::eltwise_relu, 0.f, 0.f);

    // attach the chain to the attribute handed back to the caller
    attr.set_post_ops(relu_ops);
    return attr;
}
// [Create post_op attr with relu]
// Implementation for naive convolution on nchw (data) and oihw (weights),
// followed by execution of non-fused relu.
// The convolution reads user_src and user_wei and writes user_dst directly
// (no reorders); relu then runs in place on user_dst and the function waits
// on the stream before returning.
void conv_relu_naive(const memory &user_src, const memory &user_wei,
memory user_dst, engine &eng, stream &s) {
/// @section performance_profiling_cpp_implementation1 Naive Implementation
/// This implementation is launched with the following shell code:
/// ~~~sh
/// ./program.exe cpu naive
/// ~~~
/// The program will call the implementation defined in the function
/// `conv_relu_naive()`.
///
/// First it sets the dimensions and format for convolution memory
/// descriptors (`_md`) to match `user_` values--one `md` each for source,
/// destination, and weight data. Then it uses those `md` to create the
/// convolution primitive descriptor `conv_pd`, which tells oneDNN to use
/// plain format (NCHW) for the convolution.
/// @page performance_profiling_cpp
/// @snippet performance_profiling.cpp Create mem_desc
// [Create mem_desc]
// copy the dimensions and format from user's memory
auto conv_src_md = memory::desc(user_src.get_desc());
auto conv_wei_md = memory::desc(user_wei.get_desc());
auto conv_dst_md = memory::desc(user_dst.get_desc());
// [Create mem_desc]
/// @page performance_profiling_cpp
/// Next the program creates a convolution primitive descriptor `conv_pd`
/// and convolution primitive `conv`. These structs will inherit
/// NCHW format from the `md` objects passed to `conv_pd`. Finally it adds
/// the convolution primitive `conv` to the stream `s`, and then
/// executes the `create_and_execute_relu(user_dst, eng, s)` function.
/// @page performance_profiling_cpp
/// @snippet performance_profiling.cpp Create conv_prim_desc
// [Create conv_prim_desc]
// create a convolution primitive descriptor
auto conv_pd = convolution_forward::primitive_desc(eng,
prop_kind::forward_inference, algorithm::convolution_direct,
conv_src_md, conv_wei_md, conv_dst_md, strides, padding, padding);
// [Create conv_prim_desc]
/// @page performance_profiling_cpp
/// @snippet performance_profiling.cpp Create conv_primitive
// [Create conv_primitive]
// create convolution primitive
auto conv = convolution_forward(conv_pd);
// [Create conv_primitive]
/// @page performance_profiling_cpp
/// @snippet performance_profiling.cpp Add to stream
// [Add to stream]
// execute convolution by adding it to the stream s
conv.execute(s,
{{DNNL_ARG_SRC, user_src}, {DNNL_ARG_WEIGHTS, user_wei},
{DNNL_ARG_DST, user_dst}});
// [Add to stream]
/// @page performance_profiling_cpp
/// @snippet performance_profiling.cpp Create and execute relu
// [Create and execute relu]
// execute relu (on convolution's destination format, whatever it is)
create_and_execute_relu(user_dst, eng, s);
// wait on the stream before returning to the caller
s.wait();
// [Create and execute relu]
/// @page performance_profiling_cpp
/// @note The function for creation and execution of ReLU primitive is
/// defined elsewhere to keep this example clean. It is a non-intensive
/// operation, so the `create_and_execute_relu()` function uses whatever
/// the input data format is at the time it is called.
///
/// Using NCHW data format may result in suboptimal performance for compute
/// intensive primitives, as shown in the following ONEDNN_VERBOSE output
/// by the convolution and relu execution
/// times of 38.3 and 2.9 milliseconds, respectively.
///
/// *ONEDNN_VERBOSE output (see configuration notice\*):*
/// ~~~sh
/// onednn_verbose,v0,exec,cpu,convolution,gemm:jit,forward_inference,src_f32::blocked:abcd:f0 wei_f32::blocked:abcd:f0 bia_undef::undef::f0 dst_f32::blocked:abcd:f0,,alg:convolution_direct,mb128_ic3oc96_ih227oh55kh11sh4dh0ph0_iw227ow55kw11sw4dw0pw0,38.314
/// onednn_verbose,v0,exec,cpu,eltwise,jit:avx512_common,forward_inference,data_f32::blocked:abcd:f0 diff_undef::undef::f0,,alg:eltwise_relu alpha:0 beta:0,128x96x55x55,2.87695
/// ~~~
/// In *Blocked format implementation*, we will incorporate the best
/// practice of letting oneDNN determine the optimal format
/// for convolution primitive.
}
// Implementation for convolution on blocked format for data and
// weights, followed by execution of non-fused relu.
// Inputs are reordered into the layouts reported by the convolution
// primitive descriptor when they differ from the user's layouts, and the
// result is reordered back into user_dst before the stream is waited on.
void conv_relu_blocked(memory user_src, memory user_wei, memory user_dst,
engine &eng, stream &s) {
/// @page performance_profiling_cpp
/// @section performance_profiling_cpp_implementation2 Blocked format implementation
/// This implementation is launched with the following shell code:
/// ~~~sh
/// ./program.exe cpu blocked
/// ~~~
/// The program will call the implementation defined in the function
/// `conv_relu_blocked()`.
///
/// First it creates the md as in **naive implementation**. Next it changes
/// the dnnl::memory::format_tag for each md to `ANY`. Then it uses those
/// md to create the convolution primitive descriptor conv_pd, which tells
/// oneDNN to use whatever format it recommends for the convolution.
/// oneDNN will choose a friendly blocked format.
/// @page performance_profiling_cpp
/// @snippet performance_profiling.cpp Create mem_desc with tag=any
// [Create mem_desc with tag=any]
// copy the dimensions and data type from user's memory and set format tag
// to "any" to allow convolution to pick the best implementation
auto conv_src_md = memory::desc(user_src.get_desc().get_dims(),
user_src.get_desc().get_data_type(), memory::format_tag::any);
auto conv_wei_md = memory::desc(user_wei.get_desc().get_dims(),
user_wei.get_desc().get_data_type(), memory::format_tag::any);
auto conv_dst_md = memory::desc(user_dst.get_desc().get_dims(),
user_dst.get_desc().get_data_type(), memory::format_tag::any);
// [Create mem_desc with tag=any]
/// Next the program creates a convolution primitive descriptor conv_pd and
/// convolution primitive conv as in naive implementation.
/// However, in this implementation the structs will inherit the blocked
/// format that the library selects for the `any`-tagged md.
/// @page performance_profiling_cpp
/// @snippet performance_profiling.cpp Create conv_prim_desc implementation2
// [Create conv_prim_desc implementation2]
// create a convolution primitive descriptor and primitive
auto conv_pd = convolution_forward::primitive_desc(eng,
prop_kind::forward_inference, algorithm::convolution_direct,
conv_src_md, conv_wei_md, conv_dst_md, strides, padding, padding);
// [Create conv_prim_desc implementation2]
/// Since the resulting convolution primitive may expect
/// blocked data, conditional reorders are inserted to convert
/// input data to blocked format if required.
/// The user data is in plain NCHW/OIHW format, so a conditional is triggered
/// whenever `conv_pd` reports a different layout (in the trace below this
/// happens for the weights and for the destination):
///
/// @note The reorders are applied using oneDNN `reorder` primitive.
/// @page performance_profiling_cpp
/// @snippet performance_profiling.cpp Conditionally create and execute reorder prims
// [Conditionally create and execute reorder prims]
// prepare convolution source
memory conv_src = user_src;
if (conv_pd.src_desc() != user_src.get_desc()) {
conv_src = memory(conv_pd.src_desc(), eng);
auto r_pd = reorder::primitive_desc(user_src, conv_src);
reorder(r_pd).execute(s, user_src, conv_src);
}
// prepare convolution weights
memory conv_wei = user_wei;
if (conv_pd.weights_desc() != user_wei.get_desc()) {
conv_wei = memory(conv_pd.weights_desc(), eng);
auto r_pd = reorder::primitive_desc(user_wei, conv_wei);
reorder(r_pd).execute(s, user_wei, conv_wei);
}
// prepare convolution destination
// (only a new memory object is created here; the reorder back to the
// user's layout happens after convolution and relu have been submitted)
memory conv_dst = user_dst;
if (conv_pd.dst_desc() != user_dst.get_desc())
conv_dst = memory(conv_pd.dst_desc(), eng);
// [Conditionally create and execute reorder prims]
/// Finally it creates the convolution primitive `conv` and adds it to the
/// stream `s` with the reordered data (`conv_src`, `conv_wei`, `conv_dst`)
/// as inputs and then executes the
/// `create_and_execute_relu(conv_dst)` function.
/// @page performance_profiling_cpp
/// @snippet performance_profiling.cpp Create conv_primitive implementation2
// [Create conv_primitive implementation2]
// create convolution primitive
auto conv = convolution_forward(conv_pd);
// [Create conv_primitive implementation2]
/// @page performance_profiling_cpp
/// @snippet performance_profiling.cpp Add to stream implementation2
// [Add to stream implementation2]
// execute convolution by adding it to the stream s
conv.execute(s,
{{DNNL_ARG_SRC, conv_src}, {DNNL_ARG_WEIGHTS, conv_wei},
{DNNL_ARG_DST, conv_dst}});
// [Add to stream implementation2]
/// @page performance_profiling_cpp
/// @snippet performance_profiling.cpp Create and execute relu implementation2
// [Create and execute relu implementation2]
// execute relu (on convolution's destination format, whatever it is)
create_and_execute_relu(conv_dst, eng, s);
// [Create and execute relu implementation2]
// reorder data to the user's format if needed.
if (conv_pd.dst_desc() != user_dst.get_desc()) {
auto r_pd = reorder::primitive_desc(conv_dst, user_dst);
reorder(r_pd).execute(s, conv_dst, user_dst);
}
// wait on the stream before returning to the caller
s.wait();
/// @page performance_profiling_cpp
/// Blocked memory format is recommended for oneDNN primitive
/// execution and provides better performance, as shown in the
/// ONEDNN_VERBOSE output by the convolution and relu execution times of
/// 18.3 and 2.7 milliseconds (down from 38.3 and 2.9 in
/// *naive implementation*), respectively.
/// In this implementation, there is an additional reorder operation that
/// executes before and after the conv + relu. This small cost is worth
/// the gain from executing in blocked format. In fact, it becomes
/// negligible when chaining together multiple oneDNN operations in
/// succession. In these situations, you can do one reorder at the beginning
/// and one at the end of the chain, and only pay the reorder penalty at
/// those points in the execution.
///
/// *ONEDNN_VERBOSE output (see configuration notice\*):*
/// ~~~sh
/// onednn_verbose,v0,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:abcd:f0 dst_f32::blocked:Acdb16a:f0,,,96x3x11x11,0.0310059
/// onednn_verbose,v0,exec,cpu,convolution,jit:avx512_common,forward_inference,src_f32::blocked:abcd:f0 wei_f32::blocked:Acdb16a:f0 bia_undef::undef::f0 dst_f32::blocked:aBcd16b:f0,,alg:convolution_direct,mb128_ic3oc96_ih227oh55kh11sh4dh0ph0_iw227ow55kw11sw4dw0pw0,18.3101
/// onednn_verbose,v0,exec,cpu,eltwise,jit:avx512_common,forward_inference,data_f32::blocked:aBcd16b:f0 diff_undef::undef::f0,,alg:eltwise_relu alpha:0 beta:0,128x96x55x55,2.66895
/// onednn_verbose,v0,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:aBcd16b:f0 dst_f32::blocked:abcd:f0,,,128x96x55x55,4.80396
/// ~~~
/// This inference implementation is closer to best practices than
/// *naive implementation* because it uses oneDNN recommended memory
/// format. *fused implementation* will further optimize the performance by
/// fusing convolution with ReLU using oneDNN
/// [post-ops](@ref dev_guide_attributes_post_ops).
}
// Implementation for convolution on blocked format for data and
// weights and the relu operation fused via a post-op attribute added to the
// convolution prim_descriptor.
// As in conv_relu_blocked(), inputs are reordered only when the layouts
// reported by the primitive descriptor differ from the user's, and the result
// is reordered back into user_dst before the stream is waited on.
void conv_relu_fused(memory user_src, memory user_wei, memory user_dst,
const engine &eng, stream &s) {
/// @section performance_profiling_cpp_implementation3 Fused Implementation
/// This implementation is launched with the following shell code:
/// ~~~sh
/// ./program.exe cpu fused
/// ~~~
/// The program will call the implementation defined in the function
/// `conv_relu_fused()`.
/// @page performance_profiling_cpp
///
/// First the memory descriptors are created as in *blocked format
/// implementation*, that is, with `format_tag=ANY`.
// copy the dimensions and data type from user's memory and set format tag
// to any to allow convolution to pick the best implementation
auto conv_src_md = memory::desc(user_src.get_desc().get_dims(),
user_src.get_desc().get_data_type(), memory::format_tag::any);
auto conv_wei_md = memory::desc(user_wei.get_desc().get_dims(),
user_wei.get_desc().get_data_type(), memory::format_tag::any);
auto conv_dst_md = memory::desc(user_dst.get_desc().get_dims(),
user_dst.get_desc().get_data_type(), memory::format_tag::any);
/// Then in preparation for the convolution prim descriptor, a ReLU post-op
/// is built and added to the primitive attribute `attr`:
/// @page performance_profiling_cpp
/// @snippet performance_profiling.cpp Create post_op attr with relu
/// Next the convolution prim descriptor is created, which inherits the ReLU
/// post-op by way of the attributes `attr`:
/// @page performance_profiling_cpp
/// @snippet performance_profiling.cpp Create prim_desc with attr
// [Create prim_desc with attr]
// create an attribute for fused relu
auto attr = create_attr_with_relu_post_op();
// create a convolution primitive descriptor
auto conv_pd = convolution_forward::primitive_desc(eng,
prop_kind::forward_inference, algorithm::convolution_direct,
conv_src_md, conv_wei_md, conv_dst_md, strides, padding, padding,
attr);
// [Create prim_desc with attr]
/// Then conditional reorders are applied as in *blocked format
/// implementation* to convert `user_` format NCHW to blocked. Finally, it
/// creates the convolution primitive `conv` and adds it to the stream `s`
/// with the reordered data (`conv_src`, `conv_wei`, `conv_dst`).
// prepare convolution source
memory conv_src = user_src;
if (conv_pd.src_desc() != user_src.get_desc()) {
conv_src = memory(conv_pd.src_desc(), eng);
auto r_pd = reorder::primitive_desc(user_src, conv_src);
reorder(r_pd).execute(s, user_src, conv_src);
}
// prepare convolution weights
memory conv_wei = user_wei;
if (conv_pd.weights_desc() != user_wei.get_desc()) {
conv_wei = memory(conv_pd.weights_desc(), eng);
auto r_pd = reorder::primitive_desc(user_wei, conv_wei);
reorder(r_pd).execute(s, user_wei, conv_wei);
}
// prepare convolution destination
// (only a new memory object is created here; the reorder back to the
// user's layout happens after the convolution has been submitted)
memory conv_dst = user_dst;
if (conv_pd.dst_desc() != user_dst.get_desc())
conv_dst = memory(conv_pd.dst_desc(), eng);
/// @page performance_profiling_cpp
/// @note There is no separate addition to the stream for the ReLU
/// operation because it has been added as a post-op to the `conv` primitive.
/// @page performance_profiling_cpp
/// @snippet performance_profiling.cpp Create conv_primitive implementation3
// [Create conv_primitive implementation3]
// create convolution primitive
auto conv = convolution_forward(conv_pd);
// [Create conv_primitive implementation3]
/// @page performance_profiling_cpp
/// @snippet performance_profiling.cpp Add to stream implementation3
// [Add to stream implementation3]
// execute convolution by adding it to the stream s
conv.execute(s,
{{DNNL_ARG_SRC, conv_src}, {DNNL_ARG_WEIGHTS, conv_wei},
{DNNL_ARG_DST, conv_dst}});
// [Add to stream implementation3]
// reorder data to user's format if needed
if (conv_pd.dst_desc() != user_dst.get_desc()) {
auto r_pd = reorder::primitive_desc(conv_dst, user_dst);
reorder(r_pd).execute(s, conv_dst, user_dst);
}
// wait on the stream before returning to the caller
s.wait();
/// @page performance_profiling_cpp
/// This implementation complies with best practices for f32 inference by
/// using the oneDNN recommended blocked format for convolution and
/// adding ReLU as a post-op to execute a fused version of conv + ReLU.
/// The benefit of following best practices can be seen in the execution
/// time of the fused primitive of 18.0 milliseconds.
///
/// *ONEDNN_VERBOSE output (see configuration notice\*):*
/// ~~~sh
/// onednn_verbose,v0,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:abcd:f0 dst_f32::blocked:Acdb16a:f0,,,96x3x11x11,0.0148926
/// onednn_verbose,v0,exec,cpu,convolution,jit:avx512_common,forward_inference,src_f32::blocked:abcd:f0 wei_f32::blocked:Acdb16a:f0 bia_undef::undef::f0 dst_f32::blocked:aBcd16b:f0,post_ops:'eltwise_relu;';,alg:convolution_direct,mb128_ic3oc96_ih227oh55kh11sh4dh0ph0_iw227ow55kw11sw4dw0pw0,17.968
/// onednn_verbose,v0,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:aBcd16b:f0 dst_f32::blocked:abcd:f0,,,128x96x55x55,4.66797
/// ~~~
}
/// @page performance_profiling_cpp
/// @section performance_profiling_cpp_roundup Performance summary
///
/// | Implementation | Time, ms | Cumulative speedup |
/// | :-- | --: | --: |
/// | Naive | 41.2 | 1.0 |
/// | Blocked format | 21.0 | 2.0 |
/// | Fused | 18.0 | 2.3 |
///
/// ** **
/// @page performance_profiling_cpp
/// @section performance_profiling_cpp_config Configuration Notice
/// @note This example is meant to demonstrate oneDNN best practices.
/// It is not meant for benchmarking purposes. The platform is not fully
/// optimized, so the primitive execution times are only relevant in
/// relation to the other times in this example.
///
/// Runtime Settings:
/// * OMP_NUM_THREADS=14
/// * KMP_AFFINITY=granularity=fine,compact
///
/// Platform:
/// * CPU: Intel(R) Xeon(R) Platinum 8180 CPU @ 2.50GHz
/// * Thread(s) per core: 1
/// * Core(s) per socket: 28
/// * Socket(s): 2
/// * NUMA node(s): 2
/// * RAM (DDR4): 192 GB
// Example driver: builds the synthetic user tensors on an engine of the
// requested kind, selects the implementation from argv[2] (all of them when
// no implementation is given) and runs the selected conv + ReLU variant(s).
// Throws std::invalid_argument when the selection is not recognized.
void performance_profiling(engine::kind engine_kind, int argc, char **argv) {
    // engine with index 0 of the requested kind, and a stream created on it
    engine eng(engine_kind, 0);
    stream s(eng);

    // [Set dimensions]
    // set dimensions for synthetic data and weights
    const memory::dim BATCH = 128;
    const memory::dim IC = 3, OC = 96;
    const memory::dim IH = 227, KH = 11, OH = 55;
    const memory::dim IW = 227, KW = 11, OW = 55;
    // [Set dimensions]

    // [Create memory objects]
    // create oneDNN memory objects for user's tensors (in nchw and oihw formats)
    const auto f32 = memory::data_type::f32;
    memory user_src(
            memory::desc({BATCH, IC, IH, IW}, f32, memory::format_tag::nchw),
            eng);
    memory user_wei(
            memory::desc({OC, IC, KH, KW}, f32, memory::format_tag::oihw),
            eng);
    memory user_dst(
            memory::desc({BATCH, OC, OH, OW}, f32, memory::format_tag::nchw),
            eng);
    // [Create memory objects]

    // synthetic contents: constant source, weights and destination
    init_data(user_src, 1.f);
    init_data(user_dst, -1.f);
    init_data(user_wei, 0.5f);

    // Pick the implementation: no explicit choice means "validation" (run
    // everything); exactly one extra argument names the implementation; any
    // other argument count leaves the name empty so it is rejected below.
    const std::string implementation
            = (argc <= 2) ? "validation" : (argc == 3 ? argv[2] : "");

    const bool run_all = (implementation == "validation");
    const bool run_naive = run_all || implementation == "naive";
    const bool run_blocked = run_all || implementation == "blocked";
    const bool run_fused = run_all || implementation == "fused";

    if (!run_naive && !run_blocked && !run_fused) {
        std::cout << "The implementation can be one of:\n"
                  << " - naive: NCHW format without fusion\n"
                  << " - blocked: format propagation without fusion\n"
                  << " - fused: format propagation with fusion\n"
                  << " - validation: runs all implementations\n\n"
                  << "Validation will run if no parameters are specified.\n\n";
        throw std::invalid_argument("Incorrect input arguments.");
    }

    if (run_naive) {
        // conv + relu, plain layout, no fusion
        std::cout << "Implementation: naive.\n";
        conv_relu_naive(user_src, user_wei, user_dst, eng, s);
        std::cout << "Conv + ReLU w/ nchw format completed.\n";
    }

    if (run_blocked) {
        // conv + relu, library-selected layout, no fusion
        std::cout << "Implementation: blocked.\n";
        conv_relu_blocked(user_src, user_wei, user_dst, eng, s);
        std::cout << "Conv + ReLU w/ blocked format completed.\n";
    }

    if (run_fused) {
        // conv with relu fused as a post-op
        std::cout << "Implementation: fused.\n";
        conv_relu_fused(user_src, user_wei, user_dst, eng, s);
        std::cout << "Conv + ReLU w/ fusing completed.\n";
    }
}
int main(int argc, char **argv) {
engine::kind engine_kind = parse_engine_kind(argc, argv, 1);
return handle_example_errors(
performance_profiling, engine_kind, argc, argv);
}
|