#include <c10/util/numa.h>
C10_DEFINE_bool(caffe2_cpu_numa_enabled, false, "Use NUMA whenever possible.");
#if defined(__linux__) && defined(C10_USE_NUMA) && !defined(C10_MOBILE)
#include <numa.h>
#include <numaif.h>
#include <sched.h> // sched_getcpu (GNU extension), used by GetCurrentNUMANode
#include <unistd.h>
#define C10_ENABLE_NUMA
#endif
// This code used to have a lot of VLOGs. However, because allocation might be
// triggered during static initialization, it's unsafe to invoke VLOG here.
namespace c10 {
#ifdef C10_ENABLE_NUMA
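
// True only when both the caffe2_cpu_numa_enabled flag is set and libnuma
// reports kernel NUMA support (numa_available() >= 0).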
bool IsNUMAEnabled() {
  return FLAGS_caffe2_cpu_numa_enabled && numa_available() >= 0;
}
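
// Binds the calling thread and its future memory allocations to the given
// NUMA node via numa_bind(). No-op for a negative id or when NUMA is
// disabled; an id past numa_max_node() is a hard error.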
void NUMABind(int numa_node_id) {
  if (numa_node_id < 0) {
    return;
  }
  if (!IsNUMAEnabled()) {
    return;
  }

  TORCH_CHECK(
      numa_node_id <= numa_max_node(),
      "NUMA node id ",
      numa_node_id,
      " is unavailable");

  auto bm = numa_allocate_nodemask();
  numa_bitmask_setbit(bm, numa_node_id);
  numa_bind(bm);
  numa_bitmask_free(bm);
}
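
// Looks up which NUMA node backs `ptr` using get_mempolicy() with
// MPOL_F_NODE | MPOL_F_ADDR. Returns -1 when NUMA is disabled.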
int GetNUMANode(const void* ptr) {
  if (!IsNUMAEnabled()) {
    return -1;
  }
  AT_ASSERT(ptr);

  int numa_node = -1;
  TORCH_CHECK(
      get_mempolicy(
          &numa_node,
          NULL,
          0,
          const_cast<void*>(ptr),
          MPOL_F_NODE | MPOL_F_ADDR) == 0,
      "Unable to get memory policy, errno:",
      errno);
  return numa_node;
}
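
// Number of NUMA nodes configured in the system, or -1 when NUMA is
// disabled.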
int GetNumNUMANodes() {
  if (!IsNUMAEnabled()) {
    return -1;
  }
  return numa_num_configured_nodes();
}
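
// Migrates the pages backing [ptr, ptr + size) to `numa_node_id` with
// mbind(). No-op for a negative id or when NUMA is disabled.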
void NUMAMove(void* ptr, size_t size, int numa_node_id) {
  if (numa_node_id < 0) {
    return;
  }
  if (!IsNUMAEnabled()) {
    return;
  }
  AT_ASSERT(ptr);

  // mbind operates on whole pages, so round the start address down to a
  // page boundary and extend the length by the offset we dropped.
  uintptr_t page_start_ptr =
      ((reinterpret_cast<uintptr_t>(ptr)) & ~(getpagesize() - 1));
  ptrdiff_t offset = reinterpret_cast<uintptr_t>(ptr) - page_start_ptr;
  // Use a single unsigned long as the node mask to avoid extra dynamic
  // allocation and NUMA API calls; this caps the node id at the number of
  // bits in a word.
  AT_ASSERT(
      numa_node_id >= 0 &&
      static_cast<unsigned>(numa_node_id) < sizeof(unsigned long) * 8);
  unsigned long mask = 1UL << numa_node_id;
  TORCH_CHECK(
      mbind(
          reinterpret_cast<void*>(page_start_ptr),
          size + offset,
          MPOL_BIND,
          &mask,
          sizeof(mask) * 8,
          MPOL_MF_MOVE | MPOL_MF_STRICT) == 0,
      "Could not move memory to a NUMA node");
}
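
// NUMA node of the CPU the calling thread is currently running on, or -1
// when NUMA is disabled.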
int GetCurrentNUMANode() {
  if (!IsNUMAEnabled()) {
    return -1;
  }

  auto n = numa_node_of_cpu(sched_getcpu());
  return n;
}
#else // C10_ENABLE_NUMA
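
// Fallback stubs for builds without NUMA support: queries report -1,
// IsNUMAEnabled() is false, and bind/move calls are no-ops.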
bool IsNUMAEnabled() {
  return false;
}

void NUMABind(int /*numa_node_id*/) {}

int GetNUMANode(const void* /*ptr*/) {
  return -1;
}

int GetNumNUMANodes() {
  return -1;
}

void NUMAMove(void* /*ptr*/, size_t /*size*/, int /*numa_node_id*/) {}

int GetCurrentNUMANode() {
  return -1;
}
#endif // C10_ENABLE_NUMA
} // namespace c10
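
// A minimal usage sketch, kept as a comment. `buf`/`nbytes` and PinToNode0
// are hypothetical names, and the caffe2_cpu_numa_enabled flag must be set
// (on a NUMA-capable Linux build) for any of these calls to take effect:
//
//   void PinToNode0(void* buf, size_t nbytes) {
//     c10::NUMABind(0);              // bind thread + future allocations
//     c10::NUMAMove(buf, nbytes, 0); // migrate an existing buffer
//     if (c10::IsNUMAEnabled()) {
//       AT_ASSERT(c10::GetNUMANode(buf) == 0);
//     }
//   }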