File: use_cupy.py

# Demonstrate how to work with Python GPU arrays using CUDA-aware MPI.
# We choose the CuPy library for simplicity, but any CUDA array which
# has the __cuda_array_interface__ attribute defined will work.
#
# Run this script using the following command:
# mpiexec -n 2 python use_cupy.py

import cupy

from mpi4py import MPI

comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()
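
# Optional setup (a sketch, assuming a multi-GPU node): bind each rank to its
# own device before allocating any CuPy arrays, so ranks do not all share GPU 0.
cupy.cuda.Device(rank % cupy.cuda.runtime.getDeviceCount()).use()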

# Allreduce: sum sendbuf from every rank into recvbuf on all ranks
sendbuf = cupy.arange(10, dtype="i")
recvbuf = cupy.empty_like(sendbuf)
# Always make sure the GPU buffer is ready before any MPI operation
cupy.cuda.get_current_stream().synchronize()
comm.Allreduce(sendbuf, recvbuf)
assert cupy.allclose(recvbuf, sendbuf * size)
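
# A further sketch (same assumptions as above): Allreduce can also operate in
# place on a single GPU buffer; `inplace_buf` is just an illustrative name.
inplace_buf = cupy.arange(10, dtype="i")
cupy.cuda.get_current_stream().synchronize()
comm.Allreduce(MPI.IN_PLACE, inplace_buf)
assert cupy.allclose(inplace_buf, cupy.arange(10, dtype="i") * size)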

# Bcast: broadcast the buffer from rank 0 to every rank
if rank == 0:
    buf = cupy.arange(100, dtype=cupy.complex64)
else:
    buf = cupy.empty(100, dtype=cupy.complex64)
cupy.cuda.get_current_stream().synchronize()
comm.Bcast(buf)
assert cupy.allclose(buf, cupy.arange(100, dtype=cupy.complex64))
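
# A further sketch (same assumptions): collectives such as Gather take GPU
# buffers directly as well; the `gather_*` names are illustrative only.
gather_send = cupy.full(5, rank, dtype="i")
gather_recv = cupy.empty(5 * size, dtype="i") if rank == 0 else None
cupy.cuda.get_current_stream().synchronize()
comm.Gather(gather_send, gather_recv, root=0)
if rank == 0:
    assert cupy.allclose(gather_recv, cupy.repeat(cupy.arange(size, dtype="i"), 5))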

# Send-Recv: blocking point-to-point transfer of a GPU buffer from rank 0 to rank 1
if rank == 0:
    buf = cupy.arange(20, dtype=cupy.float64)
    cupy.cuda.get_current_stream().synchronize()
    comm.Send(buf, dest=1, tag=88)
else:
    buf = cupy.empty(20, dtype=cupy.float64)
    cupy.cuda.get_current_stream().synchronize()
    comm.Recv(buf, source=0, tag=88)
    assert cupy.allclose(buf, cupy.arange(20, dtype=cupy.float64))
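
# A further sketch (same two-process assumption): the nonblocking variants also
# accept GPU buffers; synchronize before posting the request and leave the
# buffer untouched until Wait() completes. The tag value 77 is arbitrary.
if rank == 0:
    nb_buf = cupy.arange(20, dtype=cupy.float64)
    cupy.cuda.get_current_stream().synchronize()
    req = comm.Isend(nb_buf, dest=1, tag=77)
    req.Wait()
elif rank == 1:
    nb_buf = cupy.empty(20, dtype=cupy.float64)
    cupy.cuda.get_current_stream().synchronize()
    req = comm.Irecv(nb_buf, source=0, tag=77)
    req.Wait()
    assert cupy.allclose(nb_buf, cupy.arange(20, dtype=cupy.float64))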