#pragma once
#include <cstddef>
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/logging.h"
#include <nccl.h>
#include <unordered_map>
// Compile-time check that the NCCL headers being built against are at least
// version major.minor.patch. Compares (NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH)
// lexicographically, so it is safe to use in #if preprocessor guards.
#define NCCL_VERSION_MIN(major, minor, patch) \
((NCCL_MAJOR > major) || \
((NCCL_MAJOR == major) && \
((NCCL_MINOR > minor) || \
((NCCL_MINOR == minor) && (NCCL_PATCH >= patch)))))
namespace caffe2 {
namespace nccl {
// Evaluates a NCCL call once and, if it did not return ncclSuccess, fails via
// CAFFE_ENFORCE_EQ with the failing file, line, and the human-readable message
// from ncclGetErrorString(). Wrapped in do/while(0) so it behaves as a single
// statement (safe in unbraced if/else).
#define CAFFE_NCCL_CHECK(condition) \
do { \
ncclResult_t status = (condition); \
CAFFE_ENFORCE_EQ( \
status, \
ncclSuccess, \
" ", \
"Error at: ", \
__FILE__, \
__LINE__, \
": ", \
ncclGetErrorString(status)); \
} while (0)
// One participant in a NCCL collective: a (source, destination, device)
// triple. Pointers are non-owning; the tensors must outlive the execution.
struct NCCLElement {
const TensorCUDA* src{nullptr}; // input tensor for this device (not owned)
TensorCUDA* dst{nullptr}; // output tensor for this device (not owned)
int device{0}; // CUDA device ordinal the tensors live on
};
// Describes a single NCCL collective invocation across a set of devices.
struct NCCLExecution {
int stream_gpu_id{0}; // GPU that owns `stream`
cudaStream_t stream{nullptr}; // stream the collective is enqueued on
std::vector<NCCLElement> elements; // one entry per participating device
size_t root{0}; // index into `elements` used as the root
// for rooted collectives (Broadcast/Reduce)
};
// Called when the last NCCL op is destructed and all lazily created
// NCCLContext instances can safely be destroyed. Implementation lives in the
// corresponding .cc/.cu translation unit.
void destroyContexts();
// Static entry points for NCCL collectives over tensors whose elements are of
// type T. Each method runs the collective described by `ex` (devices, streams,
// src/dst tensors, and root); definitions live in the matching source file.
// Errors surface via CAFFE_NCCL_CHECK / CAFFE_ENFORCE in the implementations.
template <typename T>
class NCCL {
public:
// Element-wise reduce across all elements; every dst receives the result.
static void AllReduce(const NCCLExecution& ex);
// Copy the root element's src to every dst.
static void Broadcast(const NCCLExecution& ex);
// Element-wise reduce across all elements into the root's dst only.
static void Reduce(const NCCLExecution& ex);
// Concatenate every src; every dst receives the full gathered result.
static void AllGather(const NCCLExecution& ex);
// Reduce across all elements, then scatter disjoint chunks of the result
// so each dst receives its own portion.
static void ReduceScatter(const NCCLExecution& ex);
};
} // namespace nccl
} // namespace caffe2