/*
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates.
* All Rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
#include "opal_config.h"
#include <cuda.h>
#include "accelerator_cuda.h"
#include "opal/mca/accelerator/base/base.h"
#include "opal/mca/dl/base/base.h"
#include "opal/runtime/opal_params.h"
#include "opal/util/argv.h"
#include "opal/util/printf.h"
#include "opal/util/proc.h"
#include "opal/util/show_help.h"
#include "opal/sys/atomic.h"
/* Define global variables, used in accelerator_cuda.c */
/* CUDA stream handle; NULL until opal_accelerator_cuda_delayed_init()
 * creates it with cuStreamCreate(), destroyed in accelerator_cuda_finalize(). */
CUstream opal_accelerator_cuda_memcpy_stream = NULL;
/* Constructed in accelerator_cuda_init() and destructed in
 * accelerator_cuda_finalize(); presumably serializes use of the stream
 * above in accelerator_cuda.c -- it is not taken anywhere in this file. */
opal_mutex_t opal_accelerator_cuda_stream_lock = {0};
/* Device count as reported by cuDeviceGetCount() during delayed init;
 * stays 0 if that call has not run or did not succeed. */
int opal_accelerator_cuda_num_devices = 0;
/* Initialization lock for delayed cuda initialization.
 * Constructed in accelerator_cuda_init() and held by
 * opal_accelerator_cuda_delayed_init() while it performs the one-time setup. */
static opal_mutex_t accelerator_cuda_init_lock;
/* Set to true (after a write barrier) once delayed initialization has
 * finished successfully; read without the lock on the fast path. */
static bool accelerator_cuda_init_complete = false;
/* Two-level stringification: STRINGIFY(x) expands x first (if it is itself
 * a macro) and then turns the result into a string literal via STRINGIFY2.
 * NOTE(review): neither macro is referenced in the visible part of this
 * file -- confirm whether they are still needed. */
#define STRINGIFY2(x) #x
#define STRINGIFY(x) STRINGIFY2(x)
/* Dummy variable whose value is never used. Its address is registered with
 * cuMemHostRegister() during delayed init and unregistered with
 * cuMemHostUnregister() at finalize time.
 * This is used to detect if the user has done a device reset prior to
 * MPI_Finalize. This is a workaround to avoid SEGVs.
 */
static int checkmem;
/* Cleared to 0 in accelerator_cuda_finalize() when unregistering checkmem
 * fails; the memcpy stream is only destroyed while this is non-zero. */
static int ctx_ok = 1;
/*
 * Public string showing the accelerator cuda component version number.
 * Built at compile time by concatenating a fixed prefix with OPAL_VERSION.
 */
const char *opal_accelerator_cuda_component_version_string
= "OPAL cuda accelerator MCA component version " OPAL_VERSION;
/*
 * Local functions: forward declarations so they can be referenced from the
 * component structure below before their definitions.
 */
/* MCA component hooks (open / close / parameter registration) */
static int accelerator_cuda_open(void);
static int accelerator_cuda_close(void);
static int accelerator_cuda_component_register(void);
/* Accelerator framework hooks (module init / finalize) */
static opal_accelerator_base_module_t* accelerator_cuda_init(void);
static void accelerator_cuda_finalize(opal_accelerator_base_module_t* module);
/*
 * Instantiate the public component struct with all of our public
 * information and pointers to our public functions in it.
 */
opal_accelerator_cuda_component_t mca_accelerator_cuda_component = {{
/* First, the mca_component_t struct containing meta information
 * about the component itself */
.base_version =
{
/* Indicate that we are an accelerator v1.0.0 component, as named by
 * the macro below (which also implies a specific MCA version) */
OPAL_ACCELERATOR_BASE_VERSION_1_0_0,
/* Component name and version */
.mca_component_name = "cuda",
MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION,
OPAL_RELEASE_VERSION),
/* Component open, close and parameter-registration functions */
.mca_open_component = accelerator_cuda_open,
.mca_close_component = accelerator_cuda_close,
.mca_register_component_params = accelerator_cuda_component_register,
},
/* Next the MCA v1.0.0 component meta data */
.base_data =
{ /* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT},
/* Accelerator-framework entry points: module creation and teardown */
.accelerator_init = accelerator_cuda_init,
.accelerator_finalize = accelerator_cuda_finalize,
}};
/**
 * MCA "open" hook for the cuda accelerator component.
 *
 * The component keeps no state that needs setting up at open time, so this
 * is a no-op.
 *
 * @return OPAL_SUCCESS always.
 */
static int accelerator_cuda_open(void)
{
    /* Nothing to set up here. */
    return OPAL_SUCCESS;
}
/**
 * MCA "close" hook for the cuda accelerator component.
 *
 * No work is done here; CUDA resources are released in
 * accelerator_cuda_finalize().
 *
 * @return OPAL_SUCCESS always.
 */
static int accelerator_cuda_close(void)
{
    /* Nothing to tear down here. */
    return OPAL_SUCCESS;
}
/**
 * MCA parameter-registration hook for the cuda accelerator component.
 *
 * This component currently registers no MCA parameters of its own.
 *
 * @return OPAL_SUCCESS always.
 */
static int accelerator_cuda_component_register(void)
{
    /* No component-specific parameters to register. */
    return OPAL_SUCCESS;
}
int opal_accelerator_cuda_delayed_init()
{
int result = OPAL_SUCCESS;
CUcontext cuContext;
/* Double checked locking to avoid having to
* grab locks post lazy-initialization. */
opal_atomic_rmb();
if (true == accelerator_cuda_init_complete) {
return OPAL_SUCCESS;
}
OPAL_THREAD_LOCK(&accelerator_cuda_init_lock);
/* If already initialized, just exit */
if (true == accelerator_cuda_init_complete) {
goto out;
}
cuDeviceGetCount(&opal_accelerator_cuda_num_devices);
/* Check to see if this process is running in a CUDA context. If
* so, all is good. If not, then disable registration of memory. */
result = cuCtxGetCurrent(&cuContext);
if (CUDA_SUCCESS != result) {
opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent failed");
goto out;
} else if ((CUDA_SUCCESS == result) && (NULL == cuContext)) {
opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent returned NULL context");
result = OPAL_ERROR;
goto out;
} else {
opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent succeeded");
}
/* Create stream for use in cuMemcpyAsync synchronous copies */
result = cuStreamCreate(&opal_accelerator_cuda_memcpy_stream, 0);
if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true,
OPAL_PROC_MY_HOSTNAME, result);
goto out;
}
result = cuMemHostRegister(&checkmem, sizeof(int), 0);
if (result != CUDA_SUCCESS) {
/* If registering the memory fails, print a message and continue.
* This is not a fatal error. */
opal_show_help("help-accelerator-cuda.txt", "cuMemHostRegister during init failed", true,
&checkmem, sizeof(int), OPAL_PROC_MY_HOSTNAME, result, "checkmem");
} else {
opal_output_verbose(20, opal_accelerator_base_framework.framework_output,
"CUDA: cuMemHostRegister OK on test region");
}
result = OPAL_SUCCESS;
opal_atomic_wmb();
accelerator_cuda_init_complete = true;
out:
OPAL_THREAD_UNLOCK(&accelerator_cuda_init_lock);
return result;
}
/**
 * Accelerator-framework init hook: decide whether the cuda module is usable.
 *
 * Both component locks are constructed unconditionally. If CUDA support has
 * been turned off (opal_cuda_support is false) no module is offered.
 * Otherwise delayed initialization is attempted once here; its result is
 * deliberately ignored, and the module is returned regardless of whether
 * that attempt succeeded.
 *
 * @return pointer to opal_accelerator_cuda_module, or NULL when CUDA support
 *         is disabled.
 */
static opal_accelerator_base_module_t* accelerator_cuda_init(void)
{
    OBJ_CONSTRUCT(&opal_accelerator_cuda_stream_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&accelerator_cuda_init_lock, opal_mutex_t);

    /* Only go on with CUDA specific initialization when the user has not
     * turned the support off. This check has to come after MCA parameter
     * registration. */
    if (opal_cuda_support) {
        /* Best effort: a failure here is not reported to the caller. */
        (void) opal_accelerator_cuda_delayed_init();
        return &opal_accelerator_cuda_module;
    }

    return NULL;
}
/**
 * Accelerator-framework finalize hook: release what init / delayed init set up.
 *
 * Besides destroying the memcpy stream and the two locks, this resets the
 * file-level state (stream handle and "init complete" flag). Without the
 * reset, a later accelerator_cuda_init() followed by delayed init would take
 * the "already initialized" early return and keep using a destroyed stream.
 *
 * @param module  the module being finalized (unused).
 */
static void accelerator_cuda_finalize(opal_accelerator_base_module_t* module)
{
    CUresult result;

    (void) module;

    /* This call is in here to make sure the context is still valid.
     * This was the one way of checking which did not cause problems
     * while calling into the CUDA library. This check will detect if
     * a user has called cudaDeviceReset prior to MPI_Finalize. If so,
     * then this call will fail and we skip cleaning up CUDA resources. */
    result = cuMemHostUnregister(&checkmem);
    /* Re-evaluated on every call so one failed finalize does not disable
     * stream cleanup for later init/finalize cycles. */
    ctx_ok = (CUDA_SUCCESS == result) ? 1 : 0;

    if ((NULL != opal_accelerator_cuda_memcpy_stream) && ctx_ok) {
        cuStreamDestroy(opal_accelerator_cuda_memcpy_stream);
    }
    /* Whether it was destroyed here or skipped because the context check
     * failed, the handle must not be used again. */
    opal_accelerator_cuda_memcpy_stream = NULL;

    /* Force the next delayed init to redo its work instead of returning
     * early with stale state. */
    accelerator_cuda_init_complete = false;

    OBJ_DESTRUCT(&opal_accelerator_cuda_stream_lock);
    OBJ_DESTRUCT(&accelerator_cuda_init_lock);
}
|