File: common.h

package info (click to toggle)
pytorch-cuda 2.6.0%2Bdfsg-7
  • links: PTS, VCS
  • area: contrib
  • in suites: forky, sid, trixie
  • size: 161,620 kB
  • sloc: python: 1,278,832; cpp: 900,322; ansic: 82,710; asm: 7,754; java: 3,363; sh: 2,811; javascript: 2,443; makefile: 597; ruby: 195; xml: 84; objc: 68
file content (140 lines) | stat: -rw-r--r-- 6,149 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
// !!!! PLEASE READ !!!!
// Minimize the headers included (directly or transitively) from _avx*.cc
// files: functions defined in those headers are compiled with
// platform-dependent compiler options and may be reused by other translation
// units, causing illegal-instruction errors at run time.

// Common utilities for writing performance kernels and easy dispatching of
// different backends.
/*
The general workflow shall be as follows, say we want to
implement a functionality called void foo(int a, float b).

In foo.h, do:
   void foo(int a, float b);

In foo_avx512.cc, do:
   void foo__avx512(int a, float b) {
     [actual avx512 implementation]
   }

In foo_avx2.cc, do:
   void foo__avx2(int a, float b) {
     [actual avx2 implementation]
   }

In foo_avx.cc, do:
   void foo__avx(int a, float b) {
     [actual avx implementation]
   }

In foo.cc, do:
   // The base implementation should *always* be provided.
   void foo__base(int a, float b) {
     [base, possibly slow implementation]
   }
   decltype(foo__base) foo__avx512;
   decltype(foo__base) foo__avx2;
   decltype(foo__base) foo__avx;
   void foo(int a, float b) {
     // You should always order things by their preference, faster
     // implementations earlier in the function.
     AVX512_DO(foo, a, b);
     AVX2_DO(foo, a, b);
     AVX_DO(foo, a, b);
     BASE_DO(foo, a, b);
   }

*/
// Details: this functionality covers both build-time and run-time
// architecture support.
//
// During build time:
//    The build system should provide the flags CAFFE2_PERF_WITH_AVX512,
//    CAFFE2_PERF_WITH_AVX2, and CAFFE2_PERF_WITH_AVX, which correspond to the
//    __AVX512F__, __AVX512DQ__, __AVX512VL__, __AVX2__, and __AVX__ flags the
//    compiler provides. Note that we do not use the compiler flags but rely on
//    the build system flags, because the common files (like foo.cc above) will
//    always be built without __AVX512F__, __AVX512DQ__, __AVX512VL__, __AVX2__
//    and __AVX__. The CAFFE2_PERF_WITH_SVE flag gates the SVE_DO macro below
//    in the same way.
// During run time:
//    We use cpuinfo to identify CPU support and run the proper functions.
#pragma once
#if defined(CAFFE2_PERF_WITH_SVE) || defined(CAFFE2_PERF_WITH_AVX512) || \
    defined(CAFFE2_PERF_WITH_AVX2) || defined(CAFFE2_PERF_WITH_AVX)
#include <cpuinfo.h>
#endif

// DO macros: these should be used in your entry function, similar to foo()
// above, that routes implementations based on CPU capability. Every non-empty
// expansion contains a `return`, so the macros are only meaningful inside the
// body of the dispatching function.

// BASE_DO(funcname, args...): unconditionally return funcname##__base(args...)
// from the enclosing function. The expansion already ends with a semicolon.
#define BASE_DO(funcname, ...) return funcname##__base(__VA_ARGS__);

// SVE_DO(funcname, args...): return funcname##__sve(args...) from the
// enclosing function when cpuinfo_initialize() succeeds and cpuinfo reports
// ARM SVE support. The probe result is held in a block-scope static const
// bool; when it is false, control falls through to whatever follows the macro.
#if !defined(CAFFE2_PERF_WITH_SVE)
// Built without CAFFE2_PERF_WITH_SVE: expands to nothing, so the arguments
// are never evaluated and funcname##__sve is never referenced.
#define SVE_DO(funcname, ...)
#else // CAFFE2_PERF_WITH_SVE
#define SVE_DO(funcname, ...)                          \
  {                                                    \
    static const bool sveSupported =                   \
        cpuinfo_initialize() && cpuinfo_has_arm_sve(); \
    if (sveSupported) {                                \
      return funcname##__sve(__VA_ARGS__);             \
    }                                                  \
  }
#endif // CAFFE2_PERF_WITH_SVE

// AVX512_DO(funcname, args...): return funcname##__avx512(args...) from the
// enclosing function when cpuinfo_initialize() succeeds and cpuinfo reports
// all of AVX512F, AVX512DQ and AVX512VL. The probe result is held in a
// block-scope static const bool; when it is false, control falls through to
// whatever follows the macro.
#if !defined(CAFFE2_PERF_WITH_AVX512)
// Built without CAFFE2_PERF_WITH_AVX512: expands to nothing, so the arguments
// are never evaluated and funcname##__avx512 is never referenced.
#define AVX512_DO(funcname, ...)
#else // CAFFE2_PERF_WITH_AVX512
#define AVX512_DO(funcname, ...)                  \
  {                                               \
    static const bool avx512Supported =           \
        cpuinfo_initialize() &&                   \
        cpuinfo_has_x86_avx512f() &&              \
        cpuinfo_has_x86_avx512dq() &&             \
        cpuinfo_has_x86_avx512vl();               \
    if (avx512Supported) {                        \
      return funcname##__avx512(__VA_ARGS__);     \
    }                                             \
  }
#endif // CAFFE2_PERF_WITH_AVX512

// AVX2_DO(funcname, args...): return funcname##__avx2(args...) from the
// enclosing function when cpuinfo_initialize() succeeds and cpuinfo reports
// AVX2.
// AVX2_FMA_DO(funcname, args...): return funcname##__avx2_fma(args...) when
// cpuinfo_initialize() succeeds and cpuinfo reports both AVX2 and FMA3.
// Each macro keeps its probe result in a block-scope static const bool; when
// it is false, control falls through to whatever follows the macro.
#if !defined(CAFFE2_PERF_WITH_AVX2)
// Built without CAFFE2_PERF_WITH_AVX2: both macros expand to nothing, so the
// arguments are never evaluated and the __avx2 / __avx2_fma symbols are never
// referenced.
#define AVX2_DO(funcname, ...)
#define AVX2_FMA_DO(funcname, ...)
#else // CAFFE2_PERF_WITH_AVX2
#define AVX2_DO(funcname, ...)                          \
  {                                                     \
    static const bool avx2Supported =                   \
        cpuinfo_initialize() && cpuinfo_has_x86_avx2(); \
    if (avx2Supported) {                                \
      return funcname##__avx2(__VA_ARGS__);             \
    }                                                   \
  }
#define AVX2_FMA_DO(funcname, ...)                      \
  {                                                     \
    static const bool avx2FmaSupported =                \
        cpuinfo_initialize() &&                         \
        cpuinfo_has_x86_avx2() &&                       \
        cpuinfo_has_x86_fma3();                         \
    if (avx2FmaSupported) {                             \
      return funcname##__avx2_fma(__VA_ARGS__);         \
    }                                                   \
  }
#endif // CAFFE2_PERF_WITH_AVX2

// AVX_DO(funcname, args...): return funcname##__avx(args...) from the
// enclosing function when cpuinfo_initialize() succeeds and cpuinfo reports
// AVX.
// AVX_F16C_DO(funcname, args...): return funcname##__avx_f16c(args...) when
// cpuinfo_initialize() succeeds and cpuinfo reports both AVX and F16C.
// Each macro keeps its probe result in a block-scope static const bool; when
// it is false, control falls through to whatever follows the macro.
#if !defined(CAFFE2_PERF_WITH_AVX)
// Built without CAFFE2_PERF_WITH_AVX: both macros expand to nothing, so the
// arguments are never evaluated and the __avx / __avx_f16c symbols are never
// referenced.
#define AVX_DO(funcname, ...)
#define AVX_F16C_DO(funcname, ...)
#else // CAFFE2_PERF_WITH_AVX
#define AVX_DO(funcname, ...)                          \
  {                                                    \
    static const bool avxSupported =                   \
        cpuinfo_initialize() && cpuinfo_has_x86_avx(); \
    if (avxSupported) {                                \
      return funcname##__avx(__VA_ARGS__);             \
    }                                                  \
  }
#define AVX_F16C_DO(funcname, ...)                     \
  {                                                    \
    static const bool avxF16cSupported =               \
        cpuinfo_initialize() &&                        \
        cpuinfo_has_x86_avx() &&                       \
        cpuinfo_has_x86_f16c();                        \
    if (avxF16cSupported) {                            \
      return funcname##__avx_f16c(__VA_ARGS__);        \
    }                                                  \
  }
#endif // CAFFE2_PERF_WITH_AVX