/*
 * Copyright (c) 2011-2014 NVIDIA Corporation.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
#include "opal_config.h"
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include "opal/align.h"
#include "opal/util/output.h"
#include "opal/datatype/opal_convertor.h"
#include "opal/datatype/opal_datatype_cuda.h"
/* Set to true at the end of opal_cuda_support_init(); guards the one-time
 * CUDA detection so it runs only once. */
static bool initialized = false;
/* Verbosity level applied to the CUDA output stream when it is opened in
 * opal_cuda_support_init().  Not static, so presumably set from outside
 * this file before the first GPU access -- TODO confirm. */
int opal_cuda_verbose = 0;
/* Becomes 1 only if the registered initialization function returns 0. */
static int opal_cuda_enabled = 0; /* Starts out disabled */
/* Output stream handle returned by opal_output_open() and used for the
 * verbose CUDA messages in this file. */
static int opal_cuda_output = 0;
/* Forward declaration: one-time CUDA support detection, defined below. */
static void opal_cuda_support_init(void);
/* Callback registered through opal_cuda_add_initialization_function();
 * NULL until something registers one. */
static int (*common_cuda_initialization_function)(opal_common_cuda_function_table_t *) = NULL;
/* Table of GPU helper functions.  It is handed to the registered
 * initialization function, which is expected to fill it in. */
static opal_common_cuda_function_table_t ftable;
/**
 * Register the callback used to bring up CUDA support lazily.
 *
 * The callback is not invoked here.  It is called at most once, from
 * opal_cuda_support_init(), the first time mca_cuda_convertor_init() or
 * opal_cuda_check_bufs() runs, and it receives the address of this file's
 * function table.  A return value of 0 from the callback enables CUDA
 * support.  Registering after that first-time initialization has already
 * run has no effect, because the initialization is never repeated.
 *
 * @param fptr  Initialization callback (NULL leaves CUDA support disabled).
 */
void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *))
{
    common_cuda_initialization_function = fptr;
}
/**
 * Hook run when a convertor is instantiated.
 *
 * Triggers the one-time CUDA support detection if it has not happened yet,
 * installs opal_cuda_memcpy() as the convertor's copy callback, and, when
 * CUDA support is enabled and the user buffer is reported to be a GPU
 * buffer, tags the convertor with CONVERTOR_CUDA.
 *
 * @param convertor  Convertor being set up.
 * @param pUserBuf   User buffer the convertor will operate on.
 */
void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf)
{
    /* CUDA detection is deferred until the first convertor or buffer check. */
    if (!initialized) {
        opal_cuda_support_init();
    }

    /* Always install the copy callback, even with CUDA disabled: the
     * convertor may not be fully initialized, e.g. a sendi that uses a
     * convertor living on the stack. */
    convertor->cbmemcpy = (memcpy_fct_t)&opal_cuda_memcpy;

    /* The buffer is only classified when CUDA support is enabled; the
     * short-circuit keeps the function table untouched otherwise. */
    if (opal_cuda_enabled && ftable.gpu_is_gpu_buffer(pUserBuf, convertor)) {
        convertor->flags |= CONVERTOR_CUDA;
    }
}
/**
 * Report whether either of two pointers refers to a GPU buffer.
 *
 * Runs the one-time CUDA support detection on first use.  When CUDA
 * support is not enabled the answer is always false and the function
 * table is not consulted.
 *
 * @param dest  First pointer to check.
 * @param src   Second pointer to check (only examined if dest is not a
 *              GPU buffer).
 * @return true if at least one pointer is reported as a GPU buffer.
 */
bool opal_cuda_check_bufs(char *dest, char *src)
{
    /* CUDA detection is deferred until the first convertor or buffer check. */
    if (!initialized) {
        opal_cuda_support_init();
    }

    if (!opal_cuda_enabled) {
        return false;
    }

    return ftable.gpu_is_gpu_buffer(dest, NULL) || ftable.gpu_is_gpu_buffer(src, NULL);
}
/**
 * Copy callback installed on every convertor by mca_cuda_convertor_init().
 *
 * If the convertor is not flagged CONVERTOR_CUDA this is a plain memcpy().
 * Otherwise the copy goes through the function table: the asynchronous
 * entry when CONVERTOR_CUDA_ASYNC is set, the synchronous one otherwise.
 * A non-zero result from the table is treated as unrecoverable: the error
 * is logged and the process aborts.
 *
 * @param dest       Destination buffer.
 * @param src        Source buffer.
 * @param size       Number of bytes to copy.
 * @param convertor  Convertor whose flags select the copy path.
 * @return dest on success (does not return on failure).
 */
void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t* convertor)
{
    int res;

    /* Host-only convertor: no GPU involvement. */
    if (!(convertor->flags & CONVERTOR_CUDA)) {
        return memcpy(dest, src, size);
    }

    if (convertor->flags & CONVERTOR_CUDA_ASYNC) {
        res = ftable.gpu_cu_memcpy_async(dest, (void *)src, size, convertor);
    } else {
        res = ftable.gpu_cu_memcpy(dest, (void *)src, size);
    }

    if (res != 0) {
        /* Print size as size_t: narrowing to int misreports copies of
         * 2 GiB or more. */
        opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%zu",
                    res, dest, src, size);
        abort();
    }
    return dest;
}
/*
* This function is needed in cases where we do not have contiguous
* datatypes. The current code has macros that cannot handle a convertor
* argument to the memcpy call.
*/
void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size)
{
int res;
res = ftable.gpu_cu_memcpy(dest, src, size);
if (res != 0) {
opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
res, dest, src, (int)size);
abort();
} else {
return dest;
}
}
/*
* In some cases, need an implementation of memmove. This is not fast, but
* it is not often needed.
*/
void *opal_cuda_memmove(void *dest, void *src, size_t size)
{
int res;
res = ftable.gpu_memmove(dest, src, size);
if(res != 0){
opal_output(0, "CUDA: Error in gpu memmove: res=%d, dest=%p, src=%p, size=%d",
res, dest, src, (int)size);
abort();
}
return dest;
}
/**
 * One-time detection of CUDA support.
 *
 * Opens the output stream used for CUDA diagnostics, applies the configured
 * verbosity, and calls the registered initialization function (if any) with
 * this file's function table.  CUDA support is marked enabled only when
 * that function returns 0.  Later calls return immediately.
 */
static void opal_cuda_support_init(void)
{
    /* Already ran once: nothing to do. */
    if (initialized) {
        return;
    }

    /* Dedicated output stream so CUDA messages have their own verbosity. */
    opal_cuda_output = opal_output_open(NULL);
    opal_output_set_verbosity(opal_cuda_output, opal_cuda_verbose);

    /* The callback is only present if the common cuda code registered one;
     * without it, or if it reports failure, CUDA stays disabled. */
    if (NULL != common_cuda_initialization_function &&
        0 == common_cuda_initialization_function(&ftable)) {
        opal_cuda_enabled = 1;
    }

    if (1 != opal_cuda_enabled) {
        opal_output_verbose(10, opal_cuda_output,
                            "CUDA: not enabled, CUDA device pointers will not work");
    } else {
        opal_output_verbose(10, opal_cuda_output,
                            "CUDA: enabled successfully, CUDA device pointers will work");
    }

    initialized = true;
}
/**
 * Switch a convertor to asynchronous CUDA copies.
 *
 * Records the stream on the convertor and sets CONVERTOR_CUDA_ASYNC, which
 * makes opal_cuda_memcpy() take its asynchronous path for convertors that
 * are also flagged CONVERTOR_CUDA.  Per the original note, the flag is
 * cleared again when the convertor is reinitialized (not visible in this
 * file).
 *
 * @param convertor  Convertor to modify.
 * @param stream     Opaque stream handle stored on the convertor.
 */
void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream)
{
    convertor->stream = stream;
    convertor->flags |= CONVERTOR_CUDA_ASYNC;
}
|