// !!!! PLEASE READ !!!!
// Minimize the headers (transitively) included from _avx*.cc files. Functions
// defined in those headers are compiled with platform-dependent compiler
// options, and the resulting code can be reused by other translation units,
// causing illegal-instruction errors at run time.
// Common utilities for writing performance kernels and easy dispatching of
// different backends.
/*
The general workflow is as follows. Say we want to implement a function
called void foo(int a, float b).
In foo.h, do:
void foo(int a, float b);
In foo_avx512.cc, do:
void foo__avx512(int a, float b) {
[actual avx512 implementation]
}
In foo_avx2.cc, do:
void foo__avx2(int a, float b) {
[actual avx2 implementation]
}
In foo_avx.cc, do:
void foo__avx(int a, float b) {
[actual avx implementation]
}
In foo.cc, do:
// The base implementation should *always* be provided.
void foo__base(int a, float b) {
[base, possibly slow implementation]
}
decltype(foo__base) foo__avx512;
decltype(foo__base) foo__avx2;
decltype(foo__base) foo__avx;
void foo(int a, float b) {
// You should always order things by their preference, faster
// implementations earlier in the function.
AVX512_DO(foo, a, b);
AVX2_DO(foo, a, b);
AVX_DO(foo, a, b);
BASE_DO(foo, a, b);
}
*/
// Details: this functionality covers both build-time and run-time
// architecture support.
//
// During build time:
// The build system should provide the flags CAFFE2_PERF_WITH_AVX512,
// CAFFE2_PERF_WITH_AVX2, and CAFFE2_PERF_WITH_AVX, which correspond to the
// __AVX512F__, __AVX512DQ__, __AVX512VL__, __AVX2__, and __AVX__ macros the
// compiler provides. (This header also checks CAFFE2_PERF_WITH_SVE, which
// enables the Arm SVE dispatch macro.) Note that we do not test the compiler
// macros directly but rely on the build system flags, because the common
// files (like foo.cc above) will always be built without __AVX512F__,
// __AVX512DQ__, __AVX512VL__, __AVX2__, and __AVX__.
// During run time:
// We use cpuinfo to identify CPU support and run the proper functions.
#pragma once
#if defined(CAFFE2_PERF_WITH_SVE) || defined(CAFFE2_PERF_WITH_AVX512) || \
defined(CAFFE2_PERF_WITH_AVX2) || defined(CAFFE2_PERF_WITH_AVX)
#include <cpuinfo.h>
#endif
// DO macros: these should be used in your entry function, similar to foo()
// above, that routes implementations based on CPU capability.
//
// BASE_DO(funcname, args...) expands to an unconditional
// `return funcname__base(args...);`. It performs no CPU check, so any
// statement placed after it in the entry function is unreachable; use it as
// the last dispatch step. The expansion already ends with a semicolon, so a
// semicolon written after the invocation only adds an empty statement.
#define BASE_DO(funcname, ...) return funcname##__base(__VA_ARGS__);
// SVE_DO(funcname, args...): dispatch to the Arm SVE implementation.
//
// With CAFFE2_PERF_WITH_SVE defined, the macro expands to a braced block that
// returns funcname__sve(args...) from the enclosing function when both
// cpuinfo_initialize() and cpuinfo_has_arm_sve() return true; otherwise
// control continues with whatever follows the macro.
//
// The check result is stored in a function-local `static const bool isDo`, so
// the cpuinfo calls run when that variable is initialized rather than on
// every call of the entry function.
//
// Notes for callers:
//  - The expansion contains a `return`, so it must be used directly in the
//    body of the entry function whose result it provides.
//  - The arguments are expanded inside the block that declares `isDo`; an
//    argument expression naming a caller variable `isDo` would refer to the
//    macro's flag instead.
#ifdef CAFFE2_PERF_WITH_SVE
#define SVE_DO(funcname, ...) \
{ \
static const bool isDo = cpuinfo_initialize() && cpuinfo_has_arm_sve(); \
if (isDo) { \
return funcname##__sve(__VA_ARGS__); \
} \
}
#else // CAFFE2_PERF_WITH_SVE
// SVE support not built: the macro expands to nothing and the entry function
// proceeds to the next dispatch step.
#define SVE_DO(funcname, ...)
#endif // CAFFE2_PERF_WITH_SVE
// AVX512_DO(funcname, args...): dispatch to the AVX-512 implementation.
//
// With CAFFE2_PERF_WITH_AVX512 defined, the macro expands to a braced block
// that returns funcname__avx512(args...) from the enclosing function when
// cpuinfo_initialize() returns true and the CPU reports all three of AVX512F,
// AVX512DQ and AVX512VL (cpuinfo_has_x86_avx512f/dq/vl). If any check fails,
// control continues with whatever follows the macro.
//
// The check result is stored in a function-local `static const bool isDo`, so
// the cpuinfo calls run when that variable is initialized rather than on
// every call of the entry function.
//
// Notes for callers:
//  - The expansion contains a `return`, so it must be used directly in the
//    body of the entry function whose result it provides.
//  - The arguments are expanded inside the block that declares `isDo`; an
//    argument expression naming a caller variable `isDo` would refer to the
//    macro's flag instead.
#ifdef CAFFE2_PERF_WITH_AVX512
#define AVX512_DO(funcname, ...) \
{ \
static const bool isDo = cpuinfo_initialize() && \
cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512dq() && \
cpuinfo_has_x86_avx512vl(); \
if (isDo) { \
return funcname##__avx512(__VA_ARGS__); \
} \
}
#else // CAFFE2_PERF_WITH_AVX512
// AVX-512 support not built: the macro expands to nothing and the entry
// function proceeds to the next dispatch step.
#define AVX512_DO(funcname, ...)
#endif // CAFFE2_PERF_WITH_AVX512
// AVX2_DO / AVX2_FMA_DO: dispatch to the AVX2 implementations.
//
// With CAFFE2_PERF_WITH_AVX2 defined:
//  - AVX2_DO(funcname, args...) returns funcname__avx2(args...) from the
//    enclosing function when cpuinfo_initialize() and cpuinfo_has_x86_avx2()
//    both return true.
//  - AVX2_FMA_DO(funcname, args...) returns funcname__avx2_fma(args...) when,
//    in addition, cpuinfo_has_x86_fma3() returns true.
// If the check fails, control continues with whatever follows the macro.
//
// Each expansion is a braced block holding its own function-local
// `static const bool isDo`, so the cpuinfo calls run when that variable is
// initialized rather than on every call of the entry function.
//
// Notes for callers:
//  - The expansions contain a `return`, so they must be used directly in the
//    body of the entry function whose result they provide.
//  - The arguments are expanded inside the block that declares `isDo`; an
//    argument expression naming a caller variable `isDo` would refer to the
//    macro's flag instead.
#ifdef CAFFE2_PERF_WITH_AVX2
#define AVX2_DO(funcname, ...) \
{ \
static const bool isDo = cpuinfo_initialize() && cpuinfo_has_x86_avx2(); \
if (isDo) { \
return funcname##__avx2(__VA_ARGS__); \
} \
}
#define AVX2_FMA_DO(funcname, ...) \
{ \
static const bool isDo = cpuinfo_initialize() && cpuinfo_has_x86_avx2() && \
cpuinfo_has_x86_fma3(); \
if (isDo) { \
return funcname##__avx2_fma(__VA_ARGS__); \
} \
}
#else // CAFFE2_PERF_WITH_AVX2
// AVX2 support not built: both macros expand to nothing and the entry
// function proceeds to the next dispatch step.
#define AVX2_DO(funcname, ...)
#define AVX2_FMA_DO(funcname, ...)
#endif // CAFFE2_PERF_WITH_AVX2
// AVX_DO / AVX_F16C_DO: dispatch to the AVX implementations.
//
// With CAFFE2_PERF_WITH_AVX defined:
//  - AVX_DO(funcname, args...) returns funcname__avx(args...) from the
//    enclosing function when cpuinfo_initialize() and cpuinfo_has_x86_avx()
//    both return true.
//  - AVX_F16C_DO(funcname, args...) returns funcname__avx_f16c(args...) when,
//    in addition, cpuinfo_has_x86_f16c() returns true.
// If the check fails, control continues with whatever follows the macro.
//
// Each expansion is a braced block holding its own function-local
// `static const bool isDo`, so the cpuinfo calls run when that variable is
// initialized rather than on every call of the entry function.
//
// Notes for callers:
//  - The expansions contain a `return`, so they must be used directly in the
//    body of the entry function whose result they provide.
//  - The arguments are expanded inside the block that declares `isDo`; an
//    argument expression naming a caller variable `isDo` would refer to the
//    macro's flag instead.
#ifdef CAFFE2_PERF_WITH_AVX
#define AVX_DO(funcname, ...) \
{ \
static const bool isDo = cpuinfo_initialize() && cpuinfo_has_x86_avx(); \
if (isDo) { \
return funcname##__avx(__VA_ARGS__); \
} \
}
#define AVX_F16C_DO(funcname, ...) \
{ \
static const bool isDo = cpuinfo_initialize() && cpuinfo_has_x86_avx() && \
cpuinfo_has_x86_f16c(); \
if (isDo) { \
return funcname##__avx_f16c(__VA_ARGS__); \
} \
}
#else // CAFFE2_PERF_WITH_AVX
// AVX support not built: both macros expand to nothing and the entry function
// proceeds to the next dispatch step.
#define AVX_DO(funcname, ...)
#define AVX_F16C_DO(funcname, ...)
#endif // CAFFE2_PERF_WITH_AVX
|