File: opal_datatype_cuda.c

Package: openmpi 3.1.3-11 (Debian buster)

/*
 * Copyright (c) 2011-2014 NVIDIA Corporation.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "opal_config.h"

#include <errno.h>
#include <string.h>
#include <unistd.h>

#include "opal/align.h"
#include "opal/util/output.h"
#include "opal/datatype/opal_convertor.h"
#include "opal/datatype/opal_datatype_cuda.h"

static bool initialized = false;
int opal_cuda_verbose = 0;
static int opal_cuda_enabled = 0; /* Starts out disabled */
static int opal_cuda_output = 0;
static void opal_cuda_support_init(void);
static int (*common_cuda_initialization_function)(opal_common_cuda_function_table_t *) = NULL;
static opal_common_cuda_function_table_t ftable;

/* This function allows the common cuda code to register an
 * initialization function that gets called the first time an attempt
 * is made to send or receive a GPU pointer.  This allows us to delay
 * some CUDA initialization until after MPI_Init().
 */
void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *)) {
    common_cuda_initialization_function = fptr;
}
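
/*
 * Registration sketch (illustrative, not part of the original file): the
 * common CUDA code supplies a function with this signature and fills in the
 * hooks that the rest of this file dispatches through.  The "example_*"
 * names below are hypothetical; only the table member names are real.
 *
 *     static int example_cuda_init(opal_common_cuda_function_table_t *table)
 *     {
 *         table->gpu_is_gpu_buffer   = example_is_gpu_buffer;
 *         table->gpu_cu_memcpy       = example_cu_memcpy;
 *         table->gpu_cu_memcpy_async = example_cu_memcpy_async;
 *         table->gpu_memmove         = example_memmove;
 *         return 0;    // 0 => opal_cuda_support_init() enables CUDA support
 *     }
 *
 *     opal_cuda_add_initialization_function(&example_cuda_init);
 */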

/**
 * This function is called when a convertor is instantiated.  It has to call
 * the opal_cuda_support_init() function once to figure out if CUDA support
 * is enabled or not.  If CUDA is not enabled, then short circuit out
 * for all future calls.
 */
void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf)
{
    /* Only do the initialization on the first GPU access */
    if (!initialized) {
        opal_cuda_support_init();
    }

    /* This is needed to handle the case where the convertor is not fully
     * initialized, such as when trying to do a sendi with the convertor on the stack */
    convertor->cbmemcpy = (memcpy_fct_t)&opal_cuda_memcpy;

    /* If not enabled, then nothing else to do */
    if (!opal_cuda_enabled) {
        return;
    }

    if (ftable.gpu_is_gpu_buffer(pUserBuf, convertor)) {
        convertor->flags |= CONVERTOR_CUDA;
    }
}
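
/*
 * Usage sketch (illustrative, not part of the original file): once this has
 * run on a convertor, GPU buffers are flagged and all copies are redirected
 * through opal_cuda_memcpy():
 *
 *     mca_cuda_convertor_init(convertor, user_buffer);
 *     if (convertor->flags & CONVERTOR_CUDA) {
 *         // user_buffer was identified as GPU memory
 *     }
 */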

/* Checks whether either of the given pointers is a GPU buffer.
 *
 * @param dest One pointer to check
 * @param src  Another pointer to check
 */
bool opal_cuda_check_bufs(char *dest, char *src)
{
    /* Only do the initialization on the first GPU access */
    if (!initialized) {
        opal_cuda_support_init();
    }

    if (!opal_cuda_enabled) {
        return false;
    }

    if (ftable.gpu_is_gpu_buffer(dest, NULL) || ftable.gpu_is_gpu_buffer(src, NULL)) {
        return true;
    } else {
        return false;
    }
}
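
/*
 * Usage sketch (illustrative, not part of the original file): a caller can
 * use this check to decide whether a plain memcpy is safe for a pair of
 * buffers:
 *
 *     if (opal_cuda_check_bufs(dst, src)) {
 *         (void)opal_cuda_memcpy_sync(dst, src, len);   // at least one side is GPU memory
 *     } else {
 *         memcpy(dst, src, len);
 *     }
 */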

/*
 * With CUDA enabled, all contiguous copies will pass through this function.
 * Therefore, the first check is whether the convertor is flagged as a CUDA
 * (GPU buffer) convertor.  Note that if any of the CUDA calls fail, the
 * program aborts, as there is no way to recover.
 */
void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t* convertor)
{
    int res;

    if (!(convertor->flags & CONVERTOR_CUDA)) {
        return memcpy(dest, src, size);
    }

    if (convertor->flags & CONVERTOR_CUDA_ASYNC) {
        res = ftable.gpu_cu_memcpy_async(dest, (void *)src, size, convertor);
    } else {
        res = ftable.gpu_cu_memcpy(dest, (void *)src, size);
    }

    if (res != 0) {
        opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
                    res, dest, src, (int)size);
        abort();
    } else {
        return dest;
    }
}
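
/*
 * Usage sketch (illustrative, not part of the original file): pack/unpack
 * code does not call this directly; it goes through the callback installed
 * by mca_cuda_convertor_init(), which resolves to this function:
 *
 *     convertor->cbmemcpy(dst, src, len, convertor);
 */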

/*
 * This function is needed in cases where we do not have contiguous
 * datatypes.  The current code has macros that cannot handle a convertor
 * argument to the memcpy call.
 */
void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size)
{
    int res;
    res = ftable.gpu_cu_memcpy(dest, src, size);
    if (res != 0) {
        opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
                    res, dest, src, (int)size);
        abort();
    } else {
        return dest;
    }
}

/*
 * In some cases, an implementation of memmove is needed.  This is not fast,
 * but it is not often needed.
 */
void *opal_cuda_memmove(void *dest, void *src, size_t size)
{
    int res;

    res = ftable.gpu_memmove(dest, src, size);
    if(res != 0){
        opal_output(0, "CUDA: Error in gpu memmove: res=%d, dest=%p, src=%p, size=%d",
                    res, dest, src, (int)size);
        abort();
    }
    return dest;
}

/**
 * This function is called once to check whether the program is running in a
 * CUDA environment.
 */
static void opal_cuda_support_init(void)
{
    if (initialized) {
        return;
    }

    /* Set the verbosity level used by the CUDA-related code. */
    opal_cuda_output = opal_output_open(NULL);
    opal_output_set_verbosity(opal_cuda_output, opal_cuda_verbose);

    /* Call back into the common CUDA initialization routine.  This is only
     * set if some work has already been done in the common CUDA code. */
    if (NULL != common_cuda_initialization_function) {
        if (0 == common_cuda_initialization_function(&ftable)) {
            opal_cuda_enabled = 1;
        }
    }

    if (1 == opal_cuda_enabled) {
        opal_output_verbose(10, opal_cuda_output,
                            "CUDA: enabled successfully, CUDA device pointers will work");
    } else {
        opal_output_verbose(10, opal_cuda_output,
                            "CUDA: not enabled, CUDA device pointers will not work");
    }

    initialized = true;
}
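
/*
 * Note (illustrative, not part of the original file): the verbosity used
 * above is taken from the global opal_cuda_verbose, so raising it before the
 * first GPU access makes the enable/disable decision visible:
 *
 *     opal_cuda_verbose = 10;      // before the first GPU buffer is touched
 *     ...                          // first GPU access triggers opal_cuda_support_init()
 */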

/**
 * Tell the convertor that copies will be asynchronous CUDA copies.  The
 * flags are cleared when the convertor is reinitialized.
 */
void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream)
{
    convertor->flags |= CONVERTOR_CUDA_ASYNC;
    convertor->stream = stream;
}
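
/*
 * Usage sketch (illustrative, not part of the original file; "my_stream" is a
 * hypothetical CUDA stream obtained elsewhere): after this call, copies made
 * through convertor->cbmemcpy, i.e. opal_cuda_memcpy(), take the
 * gpu_cu_memcpy_async() path.
 *
 *     opal_cuda_set_copy_function_async(convertor, my_stream);
 */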