# Bazel BUILD file for oneDNN (oneAPI Deep Neural Network Library, formerly
# MKL-DNN), used as a TensorFlow third-party overlay.
load("@org_tensorflow//tensorflow/tsl:tsl.bzl", "tf_openmp_copts")
load("@org_tensorflow//third_party/mkl:build_defs.bzl", "if_mkl")
load("@org_tensorflow//third_party/mkl_dnn:build_defs.bzl", "if_mkldnn_openmp")
load("@org_tensorflow//third_party/mkl:build_defs.bzl", "if_mkl_ml")
load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
# Make the oneDNN license file visible to packages that redistribute it.
exports_files(["LICENSE"])
# dnnl_config.h.in substitutions shared by every runtime configuration:
# CPU-only (no GPU/SYCL), all primitives and all CPU ISAs enabled.  The
# per-primitive and per-ISA BUILD_* flags only take effect when the
# corresponding BUILD_*_ALL flag is 0, so their values below are inert.
_CMAKE_COMMON_LIST = {
    # GPU support is compiled out entirely.
    "#cmakedefine DNNL_GPU_RUNTIME DNNL_RUNTIME_${DNNL_GPU_RUNTIME}": "#define DNNL_GPU_RUNTIME DNNL_RUNTIME_NONE",
    "#cmakedefine DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE": "#undef DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE",
    "#cmakedefine DNNL_WITH_SYCL": "#undef DNNL_WITH_SYCL",
    "#cmakedefine DNNL_WITH_LEVEL_ZERO": "#undef DNNL_WITH_LEVEL_ZERO",
    "#cmakedefine DNNL_SYCL_CUDA": "#undef DNNL_SYCL_CUDA",
    "#cmakedefine DNNL_SYCL_HIP": "#undef DNNL_SYCL_HIP",
    "#cmakedefine DNNL_ENABLE_STACK_CHECKER": "#undef DNNL_ENABLE_STACK_CHECKER",
    "#cmakedefine DNNL_EXPERIMENTAL": "#undef DNNL_EXPERIMENTAL",
    "#cmakedefine ONEDNN_BUILD_GRAPH": "#undef ONEDNN_BUILD_GRAPH",
    "#cmakedefine01 BUILD_TRAINING": "#define BUILD_TRAINING 1",
    "#cmakedefine01 BUILD_INFERENCE": "#define BUILD_INFERENCE 0",
    # All primitives are built; the individual BUILD_<PRIMITIVE> values
    # below are ignored while BUILD_PRIMITIVE_ALL is 1.
    "#cmakedefine01 BUILD_PRIMITIVE_ALL": "#define BUILD_PRIMITIVE_ALL 1",
    "#cmakedefine01 BUILD_BATCH_NORMALIZATION": "#define BUILD_BATCH_NORMALIZATION 0",
    "#cmakedefine01 BUILD_BINARY": "#define BUILD_BINARY 0",
    "#cmakedefine01 BUILD_CONCAT": "#define BUILD_CONCAT 0",
    "#cmakedefine01 BUILD_CONVOLUTION": "#define BUILD_CONVOLUTION 0",
    "#cmakedefine01 BUILD_DECONVOLUTION": "#define BUILD_DECONVOLUTION 0",
    "#cmakedefine01 BUILD_ELTWISE": "#define BUILD_ELTWISE 0",
    "#cmakedefine01 BUILD_INNER_PRODUCT": "#define BUILD_INNER_PRODUCT 0",
    "#cmakedefine01 BUILD_LAYER_NORMALIZATION": "#define BUILD_LAYER_NORMALIZATION 0",
    "#cmakedefine01 BUILD_LRN": "#define BUILD_LRN 0",
    "#cmakedefine01 BUILD_MATMUL": "#define BUILD_MATMUL 0",
    "#cmakedefine01 BUILD_POOLING": "#define BUILD_POOLING 0",
    "#cmakedefine01 BUILD_PRELU": "#define BUILD_PRELU 0",
    "#cmakedefine01 BUILD_REDUCTION": "#define BUILD_REDUCTION 0",
    "#cmakedefine01 BUILD_REORDER": "#define BUILD_REORDER 0",
    "#cmakedefine01 BUILD_RESAMPLING": "#define BUILD_RESAMPLING 0",
    "#cmakedefine01 BUILD_RNN": "#define BUILD_RNN 0",
    "#cmakedefine01 BUILD_SHUFFLE": "#define BUILD_SHUFFLE 0",
    "#cmakedefine01 BUILD_SOFTMAX": "#define BUILD_SOFTMAX 0",
    "#cmakedefine01 BUILD_SUM": "#define BUILD_SUM 0",
    # All CPU ISAs are built; the individual BUILD_<ISA> values below are
    # ignored while BUILD_PRIMITIVE_CPU_ISA_ALL is 1.
    "#cmakedefine01 BUILD_PRIMITIVE_CPU_ISA_ALL": "#define BUILD_PRIMITIVE_CPU_ISA_ALL 1",
    "#cmakedefine01 BUILD_SSE41": "#define BUILD_SSE41 0",
    "#cmakedefine01 BUILD_AVX2": "#define BUILD_AVX2 0",
    "#cmakedefine01 BUILD_AVX512": "#define BUILD_AVX512 0",
    "#cmakedefine01 BUILD_AMX": "#define BUILD_AMX 0",
    # No GPU ISAs (GPU runtime is NONE above).
    "#cmakedefine01 BUILD_PRIMITIVE_GPU_ISA_ALL": "#define BUILD_PRIMITIVE_GPU_ISA_ALL 0",
    "#cmakedefine01 BUILD_GEN9": "#define BUILD_GEN9 0",
    "#cmakedefine01 BUILD_GEN11": "#define BUILD_GEN11 0",
    "#cmakedefine01 BUILD_XELP": "#define BUILD_XELP 0",
    "#cmakedefine01 BUILD_XEHPG": "#define BUILD_XEHPG 0",
    "#cmakedefine01 BUILD_XEHPC": "#define BUILD_XEHPC 0",
    "#cmakedefine01 BUILD_XEHP": "#define BUILD_XEHP 0",
}
# dnnl_config.h substitutions for the OpenMP CPU runtime, merged with the
# runtime-independent common list below.
_DNNL_RUNTIME_OMP = {
    "#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_OMP",
    "#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_OMP",
}

_DNNL_RUNTIME_OMP.update(_CMAKE_COMMON_LIST)
# dnnl_config.h substitutions for the threadpool CPU runtime (the default),
# merged with the runtime-independent common list.
_DNNL_RUNTIME_THREADPOOL = {
    "#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_THREADPOOL",
    "#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_THREADPOOL",
}

_DNNL_RUNTIME_THREADPOOL.update(_CMAKE_COMMON_LIST)
# Generate dnnl_config.h from the CMake template, picking the CPU threading
# runtime at analysis time: OpenMP when building with oneDNN OpenMP enabled,
# threadpool otherwise.
expand_template(
    name = "dnnl_config_h",
    out = "include/oneapi/dnnl/dnnl_config.h",
    substitutions = select({
        "@org_tensorflow//third_party/mkl_dnn:build_with_mkldnn_openmp": _DNNL_RUNTIME_OMP,
        "//conditions:default": _DNNL_RUNTIME_THREADPOOL,
    }),
    template = "include/oneapi/dnnl/dnnl_config.h.in",
)
# Create the file dnnl_version.h with DNNL version numbers.
# Currently, the version numbers are hard coded here. If DNNL is upgraded then
# the version numbers have to be updated manually. The version numbers can be
# obtained from the PROJECT_VERSION settings in CMakeLists.txt. The variable is
# set to "version_major.version_minor.version_patch". The git hash version can
# be set to NA.
# TODO(agramesh1): Automatically get the version numbers from CMakeLists.txt.
expand_template(
    name = "dnnl_version_h",
    out = "include/oneapi/dnnl/dnnl_version.h",
    substitutions = {
        # Hard-coded oneDNN version (v3.2.1); must be bumped manually when
        # the oneDNN sources are upgraded.
        "@DNNL_VERSION_MAJOR@": "3",
        "@DNNL_VERSION_MINOR@": "2",
        "@DNNL_VERSION_PATCH@": "1",
        # The git hash is not available from a source tarball build.
        "@DNNL_VERSION_HASH@": "N/A",
    },
    template = "include/oneapi/dnnl/dnnl_version.h.in",
)
# Compiler flags shared by every oneDNN compilation target in this file.
_COPTS_LIST = select({
    # MSVC does not understand -fexceptions (exceptions are on by default).
    "@org_tensorflow//tensorflow/tsl:windows": [],
    "//conditions:default": ["-fexceptions"],
}) + [
    # Build the standalone oneDNN GEMM rather than the MKL/CBLAS-backed one.
    "-UUSE_MKL",
    "-UUSE_CBLAS",
    # Allow runtime dispatch up to the maximum CPU ISA detected at runtime.
    "-DDNNL_ENABLE_MAX_CPU_ISA",
    # Enable Intel ITT task annotations (profiler instrumentation).
    "-DDNNL_ENABLE_ITT_TASKS",
] + tf_openmp_copts()
# Include search paths shared by every oneDNN compilation target; oneDNN
# sources use includes relative to these roots.
_INCLUDES_LIST = [
    "include",
    "src",
    "src/common",
    "src/common/ittnotify",
    "src/cpu",
    "src/cpu/gemm",
    "src/cpu/x64/xbyak",
]
# Headers shared by every oneDNN compilation target, declared as
# textual_hdrs because oneDNN headers are not all self-contained.
_TEXTUAL_HDRS_LIST = glob([
    "include/**/*",
    "src/common/*.hpp",
    "src/common/ittnotify/**/*.h",
    # Bazel's "**" matches zero or more directories, so this one pattern
    # also covers the formerly separate "src/cpu/*.hpp" and
    # "src/cpu/jit_utils/**/*.hpp" entries.
    "src/cpu/**/*.hpp",
    "src/cpu/x64/xbyak/*.h",
]) + [
    # Headers generated by the expand_template rules above.
    ":dnnl_config_h",
    ":dnnl_version_h",
]
# Large autogen files take too long time to compile with usual optimization
# flags. These files just generate binary kernels and are not the hot spots,
# so we factor them out to lower compiler optimizations in ":onednn_autogen".
# Using -O1 to enable optimizations to reduce stack consumption. (With -O0,
# compiler doesn't clean up stack from temporary objects.)
# Autogenerated GEMM kernel generators, compiled separately at -O1 (see the
# comment above for the rationale).
cc_library(
    name = "onednn_autogen",
    srcs = glob(["src/cpu/x64/gemm/**/*_kern_autogen*.cpp"]),
    copts = [
        "-O1",
        # NOTE(review): presumably undefined to silence _FORTIFY_SOURCE
        # warnings once optimization is lowered to -O1 — confirm against
        # the toolchain's default defines.
        "-U_FORTIFY_SOURCE",
    ] + _COPTS_LIST,
    includes = _INCLUDES_LIST,
    textual_hdrs = _TEXTUAL_HDRS_LIST,
    visibility = ["//visibility:public"],
)
# The main oneDNN library: all CPU sources except the aarch64/rv64 backends
# and the autogenerated GEMM kernels (which live in ":onednn_autogen").
cc_library(
    name = "mkl_dnn",
    srcs = glob(
        [
            "src/common/*.cpp",
            "src/cpu/*.cpp",
            "src/cpu/**/*.cpp",
            "src/common/ittnotify/*.c",
            "src/cpu/jit_utils/**/*.cpp",
        ],
        exclude = [
            "src/cpu/aarch64/**",
            "src/cpu/rv64/**",
            # Must mirror the srcs glob of ":onednn_autogen" — note the
            # trailing wildcard after "_kern_autogen".  Without it, files
            # such as foo_kern_autogen_bar.cpp match only the autogen
            # target's glob and would be compiled into BOTH libraries,
            # causing duplicate-symbol link errors.
            "src/cpu/x64/gemm/**/*_kern_autogen*.cpp",
        ],
    ),
    copts = _COPTS_LIST,
    includes = _INCLUDES_LIST,
    # TODO(penpornk): Use lrt_if_needed from tensorflow.bzl instead.
    linkopts = select({
        "@org_tensorflow//tensorflow/tsl:linux_aarch64": ["-lrt"],
        "@org_tensorflow//tensorflow/tsl:linux_x86_64": ["-lrt"],
        "@org_tensorflow//tensorflow/tsl:linux_ppc64le": ["-lrt"],
        "//conditions:default": [],
    }),
    textual_hdrs = _TEXTUAL_HDRS_LIST,
    visibility = ["//visibility:public"],
    deps = [":onednn_autogen"] + if_mkl_ml(
        ["@org_tensorflow//third_party/mkl:intel_binary_blob"],
        [],
    ),
)