1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
|
/*
* Copyright (c) 2004-2023 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2014-2015 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "coll_cuda.h"
#include <stdio.h>
#include "ompi/op/op.h"
#include "opal/datatype/opal_convertor.h"
/*
* reduce_log_inter
*
* Function: - reduction using O(N) algorithm
* Accepts: - same as MPI_Reduce()
* Returns: - MPI_SUCCESS or error code
*/
int
mca_coll_cuda_reduce(const void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
int root, struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_cuda_module_t *s = (mca_coll_cuda_module_t*) module;
int rank = ompi_comm_rank(comm);
ptrdiff_t gap;
char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL;
size_t bufsize;
int rc;
bufsize = opal_datatype_span(&dtype->super, count, &gap);
rc = mca_coll_cuda_check_buf((void *)sbuf);
if (rc < 0) {
return rc;
}
if ((MPI_IN_PLACE != sbuf) && (rc > 0)) {
sbuf1 = (char*)malloc(bufsize);
if (NULL == sbuf1) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
mca_coll_cuda_memcpy(sbuf1, sbuf, bufsize);
sbuf = sbuf1 - gap;
}
rc = mca_coll_cuda_check_buf(rbuf);
if (rc < 0) {
return rc;
}
if ((rank == root) && (rc > 0)) {
rbuf1 = (char*)malloc(bufsize);
if (NULL == rbuf1) {
if (NULL != sbuf1) free(sbuf1);
return OMPI_ERR_OUT_OF_RESOURCE;
}
mca_coll_cuda_memcpy(rbuf1, rbuf, bufsize);
rbuf2 = rbuf; /* save away original buffer */
rbuf = rbuf1 - gap;
}
rc = s->c_coll.coll_reduce((void *) sbuf, rbuf, count,
dtype, op, root, comm,
s->c_coll.coll_reduce_module);
if (NULL != sbuf1) {
free(sbuf1);
}
if (NULL != rbuf1) {
rbuf = rbuf2;
mca_coll_cuda_memcpy(rbuf, rbuf1, bufsize);
free(rbuf1);
}
return rc;
}
int
mca_coll_cuda_reduce_local(const void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
mca_coll_base_module_t *module)
{
ptrdiff_t gap;
char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL;
size_t bufsize;
int rc;
bufsize = opal_datatype_span(&dtype->super, count, &gap);
rc = mca_coll_cuda_check_buf((void *)sbuf);
if (rc < 0) {
return rc;
}
if ((MPI_IN_PLACE != sbuf) && (rc > 0)) {
sbuf1 = (char*)malloc(bufsize);
if (NULL == sbuf1) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
mca_coll_cuda_memcpy(sbuf1, sbuf, bufsize);
sbuf = sbuf1 - gap;
}
rc = mca_coll_cuda_check_buf(rbuf);
if (rc < 0) {
return rc;
}
if (rc > 0) {
rbuf1 = (char*)malloc(bufsize);
if (NULL == rbuf1) {
if (NULL != sbuf1) free(sbuf1);
return OMPI_ERR_OUT_OF_RESOURCE;
}
mca_coll_cuda_memcpy(rbuf1, rbuf, bufsize);
rbuf2 = rbuf; /* save away original buffer */
rbuf = rbuf1 - gap;
}
ompi_op_reduce(op, (void *)sbuf, rbuf, count, dtype);
rc = OMPI_SUCCESS;
if (NULL != sbuf1) {
free(sbuf1);
}
if (NULL != rbuf1) {
rbuf = rbuf2;
mca_coll_cuda_memcpy(rbuf, rbuf1, bufsize);
free(rbuf1);
}
return rc;
}
|