1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232
|
/*******************************************************************************
* Copyright 2019-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
/// @example cpu_sgemm_and_matmul.cpp
/// > Annotated version: @ref cpu_sgemm_and_matmul_cpp
///
/// @page cpu_sgemm_and_matmul_cpp_short
/// C++ API example demonstrating [MatMul](@ref dev_guide_matmul)
/// as a replacement for SGEMM functions.
///
/// Concepts:
/// - Create primitive once, use multiple times
/// - Run-time tensor shapes: #DNNL_RUNTIME_DIM_VAL
/// - Scales: dnnl::primitive_attr::set_scales_mask()
///
/// @page cpu_sgemm_and_matmul_cpp MatMul Tutorial: Comparison with SGEMM
/// @copydetails cpu_sgemm_and_matmul_cpp_short
///
/// We will show two modes for the MatMul primitive:
/// 1. The shapes of the input and output matrices are passed at execution time.
///    This enables you to create a primitive only once and use it for different
///    matrices, just like a normal SGEMM call (though through a handle -- the
///    oneDNN primitive).
///    To indicate the unknown dimensions and floating point values, you should
///    use #DNNL_RUNTIME_DIM_VAL and #DNNL_RUNTIME_F32_VAL respectively.
/// 2. The shapes of the input and output matrices are passed at creation time,
///    as in the usual oneDNN programming model.
///    This enables creating a highly specialized kernel for the given problem
///    sizes, at the cost of generality.
///
/// Users are free to choose between these two options, as well as any
/// intermediate ones (e.g., specifying some of the parameters at creation time
/// while leaving the others until execution time). This enables balancing
/// between flexibility and performance.
///
/// @note
/// The more you specify at creation time, the better the performance.
///
/// @include cpu_sgemm_and_matmul.cpp
#include <algorithm>
#include <cassert>
#include <cctype>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <limits>
#include <random>
#include <stdexcept>
#include <vector>
#include "oneapi/dnnl/dnnl.hpp"
#include "example_utils.hpp"
using namespace dnnl;
namespace {
// Overwrites every element of `v` with a pseudo-random float drawn from a
// uniform distribution over [-1, 1).
//
// The generator is default-constructed (default seed) on every call, so the
// produced sequence is the same for each invocation: two vectors of equal
// length end up with identical contents.
void init_vector(std::vector<float> &v) {
    std::mt19937 rng;
    std::uniform_real_distribution<float> dist(-1.f, 1.f);
    for (size_t idx = 0; idx < v.size(); ++idx) {
        v[idx] = dist(rng);
    }
}
// Compares `v2` against the reference `v1` using the L2 norm of their
// difference relative to the L2 norm of `v1`, and prints a short report to
// stdout.
//
// Parameters:
//   v1      - reference values.
//   v2      - values to validate against `v1`; must have the same length.
//   K       - reduction dimension of the product; widens the tolerance as
//             log(K) (clamped so that K < 2 behaves like K == 2).
//   message - title printed as the first line of the report.
//
// Returns 0 when the vectors have equal length and the error is within the
// tolerance, 1 otherwise.
int compare_vectors(const std::vector<float> &v1, const std::vector<float> &v2,
        int64_t K, const char *message) {
    // Vectors of different lengths cannot describe the same matrix. Report
    // that as a failure instead of reading `v2` out of bounds (shorter `v2`)
    // or silently ignoring its tail (longer `v2`).
    if (v1.size() != v2.size()) {
        printf("%s\n\tSize mismatch: %zu vs %zu\n\tAccuracy check: FAILED\n",
                message, v1.size(), v2.size());
        return 1;
    }

    double v1_l2 = 0, diff_l2 = 0;
    for (size_t n = 0; n < v1.size(); ++n) {
        float diff = v1[n] - v2[n];
        v1_l2 += v1[n] * v1[n];
        diff_l2 += diff * diff;
    }
    v1_l2 = std::sqrt(v1_l2);
    diff_l2 = std::sqrt(diff_l2);

    // Finding a reasonable (tight and accurate) threshold is quite a
    // difficult problem.
    // The implementation testing might also use special data filling to
    // alleviate issues related to the finite precision arithmetic.
    // However, in simple cases the machine epsilon multiplied by log(K) should
    // work reasonably well.
    const double threshold = std::numeric_limits<float>::epsilon()
            * std::log(std::max(2., (double)K));
    bool ok = diff_l2 <= threshold * v1_l2;

    // Identical inputs (including all-zero or empty ones) have zero relative
    // error; avoid printing the NaN that 0/0 would produce.
    const double relative_error = diff_l2 == 0 ? 0. : diff_l2 / v1_l2;

    printf("%s\n\tL2 Norms"
           "\n\t\tReference matrix:%g\n\t\tError:%g\n\t\tRelative_error:%g\n"
           "\tAccuracy check: %s\n",
            message, v1_l2, diff_l2, relative_error, ok ? "OK" : "FAILED");
    return ok ? 0 : 1;
}
} // namespace
// How many times each implementation (SGEMM and dynamic MatMul) is executed
// back-to-back before the results are compared.
int number_of_runs {1};
// The beta value baked into the dynamic MatMul primitive at creation time;
// any run-time beta passed to the helpers below must equal it.
float fixed_beta {0.0f};
// Returns the CPU engine (index 0) used by every primitive, memory object and
// stream in this example. The engine is a function-local static, so it is
// constructed on the first call and the same instance is returned afterwards.
const engine &eng() {
    static const engine cpu_engine(engine::kind::cpu, 0);
    return cpu_engine;
}
// Builds a _dynamic_ MatMul primitive, i.e. one whose matrix shapes and alpha
// are supplied only at execution time.
// Warning: the current limitation is that beta has to be known in advance;
// the value of the global fixed_beta is baked into the primitive here.
matmul dynamic_matmul_create() {
    const auto f32 = memory::data_type::f32;

    // A and B: both dimensions and both strides are deferred to run time, so
    // the same {runtime, runtime} pair serves as their shape and strides.
    const memory::dims runtime_2d
            = {DNNL_RUNTIME_DIM_VAL, DNNL_RUNTIME_DIM_VAL};
    // C: the leading dimension is deferred, the innermost stride is fixed
    // to 1.
    const memory::dims c_strides = {DNNL_RUNTIME_DIM_VAL, 1};

    memory::desc a_md(runtime_2d, f32, runtime_2d);
    memory::desc b_md(runtime_2d, f32, runtime_2d);
    memory::desc c_md(runtime_2d, f32, c_strides);

    // Attributes: alpha is passed dynamically as the scale attached to the
    // weights argument (mask 0); beta, when non-zero, becomes a sum post-op.
    primitive_attr attr;
    attr.set_scales_mask(DNNL_ARG_WEIGHTS, /* mask */ 0);
    if (fixed_beta != 0.f) {
        post_ops sum_po;
        sum_po.append_sum(fixed_beta);
        attr.set_post_ops(sum_po);
    }

    // Primitive descriptor first, then the primitive itself
    matmul::primitive_desc pd(eng(), a_md, b_md, c_md, attr);
    return matmul(pd);
}
// Execute a _dynamic_ MatMul primitive created earlier. All the parameters are
// passed at run-time (except for beta, which has to be specified at the
// primitive creation time due to the current limitation).
//
// Parameters mirror an SGEMM call:
//   matmul_p       - primitive to execute.
//   transA, transB - 'n'/'N' selects strides {ld, 1} for the matrix; any other
//                    character selects the transposed strides {1, ld}.
//   M, N, K        - A is wrapped as M x K, B as K x N, C as M x N.
//   alpha          - passed to the primitive as the scale of the weights (B).
//   A, lda, B, ldb - input buffers and their leading dimensions.
//   beta           - must be equal to fixed_beta.
//   C, ldc         - output buffer, wrapped with strides {ldc, 1}.
//
// Throws std::logic_error if beta differs from fixed_beta.
void dynamic_matmul_execute(matmul &matmul_p, char transA, char transB,
        int64_t M, int64_t N, int64_t K, float alpha, const float *A,
        int64_t lda, const float *B, int64_t ldb, float beta, float *C,
        int64_t ldc) {
    using dims = memory::dims;

    if (beta != fixed_beta)
        throw std::logic_error("Run-time beta is not yet supported.");

    // Translate transA and transB.
    // The <cctype> functions are only defined for values representable as
    // unsigned char (or EOF), so convert before calling std::tolower.
    const bool a_plain
            = std::tolower(static_cast<unsigned char>(transA)) == 'n';
    const bool b_plain
            = std::tolower(static_cast<unsigned char>(transB)) == 'n';
    dims a_strides = a_plain ? dims {lda, 1} : dims {1, lda};
    dims b_strides = b_plain ? dims {ldb, 1} : dims {1, ldb};

    // Wrap raw pointers into oneDNN memories (with proper shapes).
    // memory takes a non-const handle, hence the explicit const_cast for the
    // inputs, which are bound as source and weights below.
    memory A_m({{M, K}, memory::data_type::f32, a_strides}, eng(),
            const_cast<float *>(A));
    memory B_m({{K, N}, memory::data_type::f32, b_strides}, eng(),
            const_cast<float *>(B));
    memory C_m({{M, N}, memory::data_type::f32, {ldc, 1}}, eng(),
            static_cast<void *>(C));

    // Prepare oneDNN memory for alpha. It points at the by-value parameter,
    // which stays alive until the stream has been waited on below.
    memory alpha_m({{1}, memory::data_type::f32, {1}}, eng(), &alpha);

    // Execute the MatMul primitive
    stream s(eng());
    matmul_p.execute(s,
            {{DNNL_ARG_SRC, A_m}, {DNNL_ARG_WEIGHTS, B_m}, {DNNL_ARG_DST, C_m},
                    {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, alpha_m}});
    s.wait();
}
void sgemm_and_matmul_with_params(char transA, char transB, int64_t M,
int64_t N, int64_t K, float alpha, float beta) {
if (beta != fixed_beta)
throw std::logic_error("Run-time beta is not yet supported.");
// Allocate and initialize matrices
std::vector<float> A(M * K);
init_vector(A);
std::vector<float> B(K * N);
init_vector(B);
std::vector<float> C_sgemm(M * N);
init_vector(C_sgemm);
std::vector<float> C_dynamic_matmul = C_sgemm;
// Prepare leading dimensions
int64_t lda = tolower(transA) == 'n' ? K : M;
int64_t ldb = tolower(transB) == 'n' ? N : K;
int64_t ldc = N;
// 1. Execute sgemm
for (int run = 0; run < number_of_runs; ++run)
dnnl_sgemm(transA, transB, M, N, K, alpha, A.data(), lda, B.data(), ldb,
beta, C_sgemm.data(), ldc);
// 2.a Create dynamic MatMul
auto dynamic_matmul = dynamic_matmul_create();
// 2.b Execute
for (int run = 0; run < number_of_runs; ++run)
dynamic_matmul_execute(dynamic_matmul, transA, transB, M, N, K, alpha,
A.data(), lda, B.data(), ldb, beta, C_dynamic_matmul.data(),
ldc);
int rc = 0;
rc |= compare_vectors(
C_sgemm, C_dynamic_matmul, K, "Compare SGEMM vs dynamic MatMul");
if (rc) throw std::logic_error("The resulting matrices diverged too much.");
}
// Example body: compares SGEMM with the dynamic MatMul on one fixed problem
// (A not transposed, B transposed, M=10, N=20, K=30, alpha=1.1) using the
// beta value the primitive is created with.
void sgemm_and_matmul() {
    const char transA = 'N';
    const char transB = 'T';
    const int64_t M = 10;
    const int64_t N = 20;
    const int64_t K = 30;
    const float alpha = 1.1f;

    sgemm_and_matmul_with_params(transA, transB, M, N, K, alpha, fixed_beta);
}
// Program entry point. Hands the CPU engine kind and the example body to
// handle_example_errors and returns its result as the process exit code.
int main(int argc, char **argv) {
    // Command-line arguments are not used by this example.
    (void)argc;
    (void)argv;

    return handle_example_errors({engine::kind::cpu}, sgemm_and_matmul);
}
|