File: mpi_ops_gpu.cc

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 139,252 kB
  • sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (75 lines) | stat: -rw-r--r-- 2,971 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#include "caffe2/core/context_gpu.h"
#include "caffe2/mpi/mpi_ops.h"
#include "caffe2/operators/operator_fallback_gpu.h"

namespace caffe2 {

// Here is a bunch of MPI macro definitions that allow us to see if the MPI
// version supports CUDA aware MPI functions or not.
//
// After this section both of the following macros are always explicitly
// defined to 0 or 1:
//   CAFFE2_HAS_CUDA_MPI_BASICS    - broadcast / reduce / allgather /
//                                   send / recv accept device pointers.
//   CAFFE2_HAS_CUDA_MPI_ALLREDUCE - allreduce accepts device pointers.

#if OPEN_MPI
// Encode the OpenMPI version as one comparable integer (e.g. 2.1.5 -> 20105).
// Parenthesized so the expansion cannot be misparsed inside #if expressions.
#define CAFFE2_OMPI_VERSION                              \
  (OMPI_MAJOR_VERSION * 10000 + OMPI_MINOR_VERSION * 100 + \
   OMPI_RELEASE_VERSION)
#if CAFFE2_OMPI_VERSION >= 20000
// OpenMPI 2.x now supports compile time check whether CUDA is supported.
#include "mpi-ext.h" /* Needed for CUDA-aware check */
#if MPIX_CUDA_AWARE_SUPPORT
#define CAFFE2_HAS_CUDA_MPI_BASICS 1
#define CAFFE2_HAS_CUDA_MPI_ALLREDUCE 1
#else // MPIX_CUDA_AWARE_SUPPORT
// Define explicitly to 0 instead of relying on the preprocessor treating
// undefined identifiers as 0 in the #if checks below (silences -Wundef).
#define CAFFE2_HAS_CUDA_MPI_BASICS 0
#define CAFFE2_HAS_CUDA_MPI_ALLREDUCE 0
#endif // MPIX_CUDA_AWARE_SUPPORT
#else // CAFFE2_OMPI_VERSION >= 20000
// In the case of OpenMPI 1.x, we don't have compile-time flags to
// figure out if CUDA is supported; as a result, we will assume that
// the user has built OpenMPI with CUDA support.
// CUDA-aware MPIBroadcast is introduced after OpenMPI 1.7.
#if CAFFE2_OMPI_VERSION >= 10700
#define CAFFE2_HAS_CUDA_MPI_BASICS 1
#else // CAFFE2_OMPI_VERSION >= 10700
#define CAFFE2_HAS_CUDA_MPI_BASICS 0
#endif // CAFFE2_OMPI_VERSION >= 10700
// CUDA-aware MPIAllreduce is introduced after OpenMPI 1.8.5.
#if CAFFE2_OMPI_VERSION >= 10805
#define CAFFE2_HAS_CUDA_MPI_ALLREDUCE 1
#else // CAFFE2_OMPI_VERSION >= 10805
#define CAFFE2_HAS_CUDA_MPI_ALLREDUCE 0
#endif // CAFFE2_OMPI_VERSION >= 10805
#endif // CAFFE2_OMPI_VERSION >= 20000
#else // !OPEN_MPI
// We have not really tested against other MPI environments, so let's go for a
// safe path and basically say we don't have cuda-aware functions.
#define CAFFE2_HAS_CUDA_MPI_BASICS 0
#define CAFFE2_HAS_CUDA_MPI_ALLREDUCE 0
#endif // OPEN_MPI

// We allow a macro to force using fallback functions.
#ifdef CAFFE2_FORCE_FALLBACK_CUDA_MPI
#undef CAFFE2_HAS_CUDA_MPI_BASICS
#undef CAFFE2_HAS_CUDA_MPI_ALLREDUCE
#define CAFFE2_HAS_CUDA_MPI_BASICS 0
#define CAFFE2_HAS_CUDA_MPI_ALLREDUCE 0
#endif // CAFFE2_FORCE_FALLBACK_CUDA_MPI

// Creating a common world exchanges communicator metadata only, so it can be
// registered for CUDA unconditionally — no CUDA-aware MPI support is needed.
REGISTER_CUDA_OPERATOR(
    MPICreateCommonWorld,
    MPICreateCommonWorldOp<CUDAContext>);
#if CAFFE2_HAS_CUDA_MPI_BASICS
// CUDA-aware MPI available: register the native CUDA ops, which can hand
// device buffers directly to the MPI calls.
REGISTER_CUDA_OPERATOR(MPIBroadcast, MPIBroadcastOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(MPIReduce, MPIReduceOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(MPIAllgather, MPIAllgatherOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(MPISendTensor, MPISendTensorOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(MPIReceiveTensor, MPIReceiveTensorOp<CUDAContext>);
#else
// No CUDA-aware MPI: use GPUFallbackOp (from operator_fallback_gpu.h), which
// runs the CPU implementation of each op, shuttling tensors between device
// and host around the MPI call.
REGISTER_CUDA_OPERATOR(MPIBroadcast, GPUFallbackOp);
REGISTER_CUDA_OPERATOR(MPIReduce, GPUFallbackOp);
REGISTER_CUDA_OPERATOR(MPIAllgather, GPUFallbackOp);
REGISTER_CUDA_OPERATOR(MPISendTensor, GPUFallbackOp);
// SkipIndices<1, 2>: blobs at indices 1 and 2 are excluded from the fallback's
// device<->host copying — presumably non-tensor metadata (e.g. source rank /
// tag); NOTE(review): confirm against MPIReceiveTensorOp's blob layout.
REGISTER_CUDA_OPERATOR(MPIReceiveTensor, GPUFallbackOpEx<SkipIndices<1, 2>>);
#endif

// Allreduce gets its own gate: CUDA-aware support for it arrived later than
// for the basic collectives (see the version macros above).
#if CAFFE2_HAS_CUDA_MPI_ALLREDUCE
REGISTER_CUDA_OPERATOR(MPIAllreduce, MPIAllreduceOp<float, CUDAContext>);
#else
REGISTER_CUDA_OPERATOR(MPIAllreduce, GPUFallbackOp);
#endif

} // namespace caffe2