File: btl_ofi.h

package info (click to toggle)
openmpi 5.0.8-10
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 201,692 kB
  • sloc: ansic: 613,078; makefile: 42,351; sh: 11,194; javascript: 9,244; f90: 7,052; java: 6,404; perl: 5,179; python: 1,859; lex: 740; fortran: 61; cpp: 20; tcl: 12
file content (396 lines) | stat: -rw-r--r-- 14,205 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2018 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2018-2019 Intel, Inc.  All rights reserved.
 * Copyright (c) 2020      Amazon.com, Inc. or its affiliates.
 *                         All Rights reserved.
 * Copyright (c) 2022      Triad National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/**
 * @file
 */
#ifndef MCA_BTL_OFI_H
#define MCA_BTL_OFI_H

#include "opal_config.h"
#include <string.h>
#include <sys/types.h>

/* Open MPI includes */
#include "opal/mca/btl/base/base.h"
#include "opal/mca/btl/base/btl_base_error.h"
#include "opal/mca/btl/btl.h"
#include "opal/mca/mpool/mpool.h"
#include "opal/mca/pmix/pmix-internal.h"
#include "opal/mca/rcache/base/base.h"
#include "opal/util/event.h"

#include "opal/class/opal_hash_table.h"

#include <rdma/fabric.h>
#include <rdma/fi_cm.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_endpoint.h>
#include <rdma/fi_errno.h>
#include <rdma/fi_rma.h>

BEGIN_C_DECLS
/* Capacity of mca_btl_ofi_component_t.modules[], i.e. the upper bound on
 * the number of OFI BTL modules the component can hold. */
#define MCA_BTL_OFI_MAX_MODULES  16
/* NOTE(review): presumably the default for the component's num_cqe_read
 * field -- confirm at the site where that field is initialized. */
#define MCA_BTL_OFI_NUM_CQE_READ 64

/* NOTE(review): these look like defaults for tunables (rd_num and
 * progress_threshold exist as component fields); where they are applied is
 * not visible in this header -- confirm. */
#define MCA_BTL_OFI_DEFAULT_RD_NUM             10
#define MCA_BTL_OFI_DEFAULT_MAX_CQE            128
#define MCA_BTL_OFI_DEFAULT_PROGRESS_THRESHOLD 64

/* Expands to a call to mca_btl_ofi_exit() with `args` pasted in as the
 * argument list.  mca_btl_ofi_exit() is declared in this header as taking
 * no arguments, so the macro has to be invoked with an empty argument:
 * MCA_BTL_OFI_ABORT(). */
#define MCA_BTL_OFI_ABORT(args) mca_btl_ofi_exit(args)

/* Shorthand for the component-wide two_sided_enabled flag. */
#define TWO_SIDED_ENABLED mca_btl_ofi_component.two_sided_enabled

/**
 * Operating modes of the OFI BTL.
 *
 * NOTE(review): presumably the value kept in mca_btl_ofi_component_t.mode
 * and handed to mca_btl_ofi_module_alloc() -- confirm against the callers.
 */
enum mca_btl_ofi_mode {
    MCA_BTL_OFI_MODE_ONE_SIDED = 0,
    MCA_BTL_OFI_MODE_TWO_SIDED,
    MCA_BTL_OFI_MODE_FULL_SUPPORT,
    /* Not a mode of its own: equals the number of modes listed above. */
    MCA_BTL_OFI_MODE_TOTAL
};

/**
 * Kinds of operation the OFI BTL tracks.
 *
 * NOTE(review): presumably the value stored in
 * mca_btl_ofi_base_completion_t.type so the progress code can tell which
 * kind of operation finished -- confirm against the completion handling.
 */
enum mca_btl_ofi_hdr_type {
    MCA_BTL_OFI_TYPE_PUT = 0,
    MCA_BTL_OFI_TYPE_GET,
    MCA_BTL_OFI_TYPE_AOP,
    MCA_BTL_OFI_TYPE_AFOP,
    MCA_BTL_OFI_TYPE_CSWAP,
    MCA_BTL_OFI_TYPE_SEND,
    MCA_BTL_OFI_TYPE_RECV,
    /* Not an operation of its own: equals the number of types listed above. */
    MCA_BTL_OFI_TYPE_TOTAL
};

/**
 * @brief Per-context state of an OFI BTL module
 *
 * A module points at an array of these through
 * mca_btl_ofi_module_t.contexts.  Each context carries its own endpoints,
 * completion queue, free lists and lock word.
 */
struct mca_btl_ofi_context_t {
    /* NOTE(review): presumably this context's index within the owning
     * module's contexts array -- confirm. */
    int32_t context_id;

    /* transmit and receive contexts (libfabric endpoint handles) */
    struct fid_ep *tx_ctx;
    struct fid_ep *rx_ctx;

    /* completion queue */
    struct fid_cq *cq;

    /* completion info freelist */
    /* We have it per context to reduce the thread contention
     * on the freelist. Things can get really slow. */
    /* NOTE(review): judging by the names these hold
     * mca_btl_ofi_rdma_completion_t, mca_btl_ofi_frag_completion_t and
     * mca_btl_ofi_base_frag_t items respectively -- confirm where the lists
     * are initialized. */
    opal_free_list_t rdma_comp_list;
    opal_free_list_t frag_comp_list;
    opal_free_list_t frag_list;

    /* for thread locking: lock word driven by
     * mca_btl_ofi_context_trylock/lock/unlock; those helpers write 1 to
     * take the lock and 0 to release it */
    opal_atomic_int32_t lock;
};
typedef struct mca_btl_ofi_context_t mca_btl_ofi_context_t;

/**
 * @brief OFI BTL module
 *
 * Embeds the generic BTL module as its first member and adds the libfabric
 * objects, the per-context array and the bookkeeping this BTL needs.
 */
struct mca_btl_ofi_module_t {
    /** base BTL interface */
    mca_btl_base_module_t super;

    /* libfabric components */
    struct fi_info *fabric_info;
    struct fid_fabric *fabric;
    struct fid_domain *domain;
    struct fid_ep *ofi_endpoint;
    struct fid_av *av;

    /** NOTE(review): presumably the number of entries in contexts[] --
     * confirm at the allocation site */
    int num_contexts;
    mca_btl_ofi_context_t *contexts;

    /* NOTE(review): ownership of this string (who allocates and frees it)
     * is not visible in this header -- confirm. */
    char *linux_device_name;

    /** whether the module has been fully initialized or not */
    bool initialized;
    /* NOTE(review): the next three flags look like provider capability or
     * configuration switches (virtual addressing for memory regions,
     * scalable endpoints, binding of memory regions) -- confirm where they
     * are set. */
    bool use_virt_addr;
    bool is_scalable_ep;
    bool use_fi_mr_bind;

    /* NOTE(review): presumably counts of in-flight RDMA and send operations
     * -- confirm where they are incremented and decremented. */
    opal_atomic_int64_t outstanding_rdma;
    opal_atomic_int64_t outstanding_send;

    /** linked list of BTL endpoints. this list is never searched so
     * there is no need for a complicated structure here at this time*/
    opal_list_t endpoints;

    /* NOTE(review): which members module_lock protects is not stated in
     * this header -- confirm against the users of the lock. */
    opal_mutex_t module_lock;
    /* NOTE(review): judging by the name, maps some peer identifier to its
     * endpoint; the key type is not visible here -- confirm. */
    opal_hash_table_t id_to_endpoint;

    /** registration cache */
    mca_rcache_base_module_t *rcache;
    /* If the underlying OFI provider has its own cache, we want to bypass
     * rcache registration */
    bool bypass_cache;
};
typedef struct mca_btl_ofi_module_t mca_btl_ofi_module_t;

/* Declared here, defined elsewhere.  NOTE(review): presumably the prototype
 * instance that newly allocated modules are copied from -- confirm in
 * mca_btl_ofi_module_alloc(). */
extern mca_btl_ofi_module_t mca_btl_ofi_module_template;

/**
 * @brief OFI BTL component
 *
 * Component-wide state: the generic BTL component, a handful of tunables
 * and the table of modules.
 */
struct mca_btl_ofi_component_t {
    mca_btl_base_component_3_0_0_t super; /**< base BTL component */

    /** number of TL modules */
    int module_count;
    int num_contexts_per_module;
    /* NOTE(review): num_cqe_read, progress_threshold and rd_num have
     * matching MCA_BTL_OFI_* constants in this header, which are presumably
     * their defaults -- confirm at the parameter registration site. */
    int num_cqe_read;
    int progress_threshold;
    /* NOTE(review): presumably one of enum mca_btl_ofi_mode -- confirm. */
    int mode;
    int rd_num;
    /* Read through the TWO_SIDED_ENABLED macro. */
    bool two_sided_enabled;

    /* NOTE(review): presumably the length of the endpoint name/address
     * exchanged between peers -- confirm where it is filled in. */
    size_t namelen;

    /** Maximum inject size */
    size_t max_inject_size;
    bool disable_inject;

    bool disable_hmem;

    /** All BTL OFI modules (1 per tl) */
    mca_btl_ofi_module_t *modules[MCA_BTL_OFI_MAX_MODULES];
};
typedef struct mca_btl_ofi_component_t mca_btl_ofi_component_t;

/* The single component instance; defined elsewhere. */
OPAL_DECLSPEC extern mca_btl_ofi_component_t mca_btl_ofi_component;

/**
 * OFI definition of the registration handle structure.
 *
 * The typedef name mca_btl_base_registration_handle_t is used in this header
 * but not created here; it presumably comes from one of the included BTL
 * headers -- confirm.
 */
struct mca_btl_base_registration_handle_t {
    /* NOTE(review): presumably the key a peer supplies to address this
     * memory region remotely -- confirm. */
    uint64_t rkey;
    /* NOTE(review): presumably the local memory descriptor passed to
     * libfabric data-transfer calls -- confirm. */
    void *desc;
    /* NOTE(review): presumably the start of the registered region --
     * confirm. */
    void *base_addr;
};

/**
 * Registration record of the OFI BTL: the generic rcache registration
 * extended with the libfabric memory region and the handle describing it.
 */
struct mca_btl_ofi_reg_t {
    mca_rcache_base_registration_t base;
    /* libfabric memory-region object backing this registration */
    struct fid_mr *ur_mr;

    /* remote handle */
    mca_btl_base_registration_handle_t handle;
};
typedef struct mca_btl_ofi_reg_t mca_btl_ofi_reg_t;

OBJ_CLASS_DECLARATION(mca_btl_ofi_reg_t);

/**
 * Header stored in every fragment (see mca_btl_ofi_base_frag_t.hdr).
 */
struct mca_btl_ofi_header_t {
    /* BTL tag of the message */
    mca_btl_base_tag_t tag;
    /* NOTE(review): presumably the payload length, excluding this header --
     * confirm against the send and receive paths. */
    size_t len;
};
typedef struct mca_btl_ofi_header_t mca_btl_ofi_header_t;

/**
 * Fragment descriptor of the OFI BTL: the generic BTL descriptor plus the
 * bookkeeping needed to route the fragment and hand it back afterwards.
 */
struct mca_btl_ofi_base_frag_t {
    mca_btl_base_descriptor_t base;
    /* NOTE(review): presumably the storage that base's segment pointers
     * refer to -- confirm where fragments are set up. */
    mca_btl_base_segment_t segments[2];

    /* NOTE(review): presumably identifies the mca_btl_ofi_context_t this
     * fragment belongs to -- confirm. */
    int context_id;
    struct mca_btl_ofi_module_t *btl;
    struct mca_btl_base_endpoint_t *endpoint;
    /* NOTE(review): presumably the list the fragment is returned to when
     * released -- confirm. */
    opal_free_list_t *free_list;
    mca_btl_ofi_header_t hdr;
};

typedef struct mca_btl_ofi_base_frag_t mca_btl_ofi_base_frag_t;

OBJ_CLASS_DECLARATION(mca_btl_ofi_base_frag_t);

/**
 * A libfabric operation context paired with a pointer to a completion
 * object.
 *
 * NOTE(review): presumably &ctx is what gets passed to libfabric as the
 * per-operation context, and comp lets the progress code get from a
 * completion-queue entry back to the owning completion structure --
 * confirm against the progress function.
 */
struct mca_btl_ofi_completion_context_t {
    struct fi_context2 ctx;
    void *comp;
};

typedef struct mca_btl_ofi_completion_context_t mca_btl_ofi_completion_context_t;

/* Completion structure: stores the information needed for RDMA callbacks.
 * It is the first member of both mca_btl_ofi_rdma_completion_t and
 * mca_btl_ofi_frag_completion_t. */
struct mca_btl_ofi_base_completion_t {
    /* free-list linkage */
    opal_free_list_item_t comp_list;

    /* NOTE(review): presumably the free list this item is returned to --
     * confirm. */
    opal_free_list_t *my_list;

    struct mca_btl_base_module_t *btl;
    struct mca_btl_base_endpoint_t *endpoint;
    struct mca_btl_ofi_context_t *my_context;
    /* NOTE(review): presumably one of enum mca_btl_ofi_hdr_type -- confirm. */
    int type;
};
typedef struct mca_btl_ofi_base_completion_t mca_btl_ofi_base_completion_t;

/**
 * Completion record for the one-sided operations.
 *
 * The members after comp_ctx carry the same names as the arguments of
 * mca_btl_ofi_put/get/afop/acswap declared in this header.
 * NOTE(review): presumably they hold the caller's arguments until the
 * completion callback runs -- confirm.
 */
struct mca_btl_ofi_rdma_completion_t {
    mca_btl_ofi_base_completion_t base;
    mca_btl_ofi_completion_context_t comp_ctx;
    void *local_address;
    mca_btl_base_registration_handle_t *local_handle;

    uint64_t operand;
    uint64_t compare;

    /* user callback and the two opaque values supplied with it */
    mca_btl_base_rdma_completion_fn_t cbfunc;
    void *cbcontext;
    void *cbdata;
};
typedef struct mca_btl_ofi_rdma_completion_t mca_btl_ofi_rdma_completion_t;

/**
 * Completion record for fragment-based operations: the common completion
 * fields plus the fragment the operation refers to.
 */
struct mca_btl_ofi_frag_completion_t {
    mca_btl_ofi_base_completion_t base;
    mca_btl_ofi_completion_context_t comp_ctx;
    mca_btl_ofi_base_frag_t *frag;
};
typedef struct mca_btl_ofi_frag_completion_t mca_btl_ofi_frag_completion_t;

/* OPAL class declarations for both completion record types. */
OBJ_CLASS_DECLARATION(mca_btl_ofi_rdma_completion_t);
OBJ_CLASS_DECLARATION(mca_btl_ofi_frag_completion_t);

/**
 * Initiate an asynchronous put.
 * Completion Semantics: if this function returns a 1 then the operation
 *                       is complete. A return of OPAL_SUCCESS indicates
 *                       the put operation has been queued with the
 *                       network. The local_handle cannot be deregistered
 *                       until all outstanding operations on that handle
 *                       have been completed.
 *
 * @param btl (IN)            BTL module
 * @param endpoint (IN)       BTL addressing information
 * @param local_address (IN)  Local address to put from (registered)
 * @param remote_address (IN) Remote address to put to (registered remotely)
 * @param local_handle (IN)   Registration handle for region containing
 *                            (local_address, local_address + size)
 * @param remote_handle (IN)  Remote registration handle for region containing
 *                            (remote_address, remote_address + size)
 * @param size (IN)           Number of bytes to put
 * @param flags (IN)          Flags for this put operation
 * @param order (IN)          Ordering
 * @param cbfunc (IN)         Function to call on completion (if queued)
 * @param cbcontext (IN)      Context for the callback
 * @param cbdata (IN)         Data for callback
 *
 * @retval OPAL_SUCCESS    The descriptor was successfully queued for a put
 * @retval OPAL_ERROR      The descriptor was NOT successfully queued for a put
 * @retval OPAL_ERR_OUT_OF_RESOURCE  Insufficient resources to queue the put
 *                         operation. Try again later.
 * @retval OPAL_ERR_NOT_AVAILABLE  Put cannot be performed due to size or
 *                         alignment restrictions.
 */
int mca_btl_ofi_put(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                    void *local_address, uint64_t remote_address,
                    struct mca_btl_base_registration_handle_t *local_handle,
                    struct mca_btl_base_registration_handle_t *remote_handle, size_t size,
                    int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext,
                    void *cbdata);

/**
 * Initiate an asynchronous get.
 * Completion Semantics: if this function returns a 1 then the operation
 *                       is complete. A return of OPAL_SUCCESS indicates
 *                       the get operation has been queued with the
 *                       network. The local_handle cannot be deregistered
 *                       until all outstanding operations on that handle
 *                       have been completed.
 *
 * @param btl (IN)            BTL module
 * @param endpoint (IN)       BTL addressing information
 * @param local_address (IN)  Local address to get into (registered)
 * @param remote_address (IN) Remote address to get from (registered remotely)
 * @param local_handle (IN)   Registration handle for region containing
 *                            (local_address, local_address + size)
 * @param remote_handle (IN)  Remote registration handle for region containing
 *                            (remote_address, remote_address + size)
 * @param size (IN)           Number of bytes to get
 * @param flags (IN)          Flags for this get operation
 * @param order (IN)          Ordering
 * @param cbfunc (IN)         Function to call on completion (if queued)
 * @param cbcontext (IN)      Context for the callback
 * @param cbdata (IN)         Data for callback
 *
 * @retval OPAL_SUCCESS    The descriptor was successfully queued for a get
 * @retval OPAL_ERROR      The descriptor was NOT successfully queued for a get
 * @retval OPAL_ERR_OUT_OF_RESOURCE  Insufficient resources to queue the get
 *                         operation. Try again later.
 * @retval OPAL_ERR_NOT_AVAILABLE  Get cannot be performed due to size or
 *                         alignment restrictions.
 */
int mca_btl_ofi_get(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                    void *local_address, uint64_t remote_address,
                    struct mca_btl_base_registration_handle_t *local_handle,
                    struct mca_btl_base_registration_handle_t *remote_handle, size_t size,
                    int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext,
                    void *cbdata);

/**
 * Initiate an atomic operation on remote memory.
 *
 * NOTE(review): the summary below is read off the parameter list; the
 * implementation is not visible in this header -- confirm the details
 * (return codes in particular) against it.
 *
 * @param btl (IN)            BTL module
 * @param endpoint (IN)       BTL addressing information
 * @param remote_address (IN) Remote address the operation targets
 * @param remote_handle (IN)  Remote registration handle for that address
 * @param op (IN)             Atomic operation to apply
 * @param operand (IN)        Operand of the operation
 * @param flags (IN)          Flags for this operation
 * @param order (IN)          Ordering
 * @param cbfunc (IN)         Completion callback
 * @param cbcontext (IN)      Context for the callback
 * @param cbdata (IN)         Data for callback
 */
int mca_btl_ofi_aop(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                    uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
                    mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
                    mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);

/**
 * Initiate an atomic operation on remote memory that also involves a local
 * buffer.
 *
 * NOTE(review): the summary below is read off the parameter list; the
 * implementation is not visible in this header.  Presumably this is the
 * fetching variant of mca_btl_ofi_aop(), with the previous remote value
 * delivered to local_address -- confirm.
 *
 * @param btl (IN)            BTL module
 * @param endpoint (IN)       BTL addressing information
 * @param local_address (IN)  Local buffer taking part in the operation
 * @param remote_address (IN) Remote address the operation targets
 * @param local_handle (IN)   Registration handle for local_address
 * @param remote_handle (IN)  Remote registration handle for remote_address
 * @param op (IN)             Atomic operation to apply
 * @param operand (IN)        Operand of the operation
 * @param flags (IN)          Flags for this operation
 * @param order (IN)          Ordering
 * @param cbfunc (IN)         Completion callback
 * @param cbcontext (IN)      Context for the callback
 * @param cbdata (IN)         Data for callback
 */
int mca_btl_ofi_afop(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                     void *local_address, uint64_t remote_address,
                     mca_btl_base_registration_handle_t *local_handle,
                     mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
                     uint64_t operand, int flags, int order,
                     mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);

/**
 * Initiate an atomic compare-and-swap on remote memory.
 *
 * NOTE(review): the summary below is read off the function name and
 * parameter list; the implementation is not visible in this header.
 * Presumably `value` is written to remote_address when the remote content
 * equals `compare`, with the previous remote value delivered to
 * local_address -- confirm.
 *
 * @param btl (IN)            BTL module
 * @param endpoint (IN)       BTL addressing information
 * @param local_address (IN)  Local buffer taking part in the operation
 * @param remote_address (IN) Remote address the operation targets
 * @param local_handle (IN)   Registration handle for local_address
 * @param remote_handle (IN)  Remote registration handle for remote_address
 * @param compare (IN)        Value to compare against
 * @param value (IN)          Replacement value
 * @param flags (IN)          Flags for this operation
 * @param order (IN)          Ordering
 * @param cbfunc (IN)         Completion callback
 * @param cbcontext (IN)      Context for the callback
 * @param cbdata (IN)         Data for callback
 */
int mca_btl_ofi_acswap(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                       void *local_address, uint64_t remote_address,
                       mca_btl_base_registration_handle_t *local_handle,
                       mca_btl_base_registration_handle_t *remote_handle, uint64_t compare,
                       uint64_t value, int flags, int order,
                       mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);

/* NOTE(review): presumably makes outstanding operations on the module (or
 * on the given endpoint) complete before returning -- confirm against the
 * implementation. */
int mca_btl_ofi_flush(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint);

/* NOTE(review): presumably tears the module down and releases its
 * resources -- confirm against the implementation. */
int mca_btl_ofi_finalize(mca_btl_base_module_t *btl);

/* Registration-cache hooks.  NOTE(review): reg_mem/dereg_mem have the shape
 * of register/deregister callbacks taking an opaque reg_data pointer;
 * presumably rcache_init installs them on the module's rcache -- confirm. */
void mca_btl_ofi_rcache_init(mca_btl_ofi_module_t *module);
int mca_btl_ofi_reg_mem(void *reg_data, void *base, size_t size,
                        mca_rcache_base_registration_t *reg);
int mca_btl_ofi_dereg_mem(void *reg_data, mca_rcache_base_registration_t *reg);

/* NOTE(review): presumably polls the context's completion queue; the
 * meaning of the int result is not visible here -- confirm. */
int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context);

/* NOTE(review): `mode` is presumably one of enum mca_btl_ofi_mode; whether
 * the caller owns the returned module is not visible here -- confirm. */
mca_btl_ofi_module_t *mca_btl_ofi_module_alloc(int mode);

/* NOTE(review): presumably posts `count` receive buffers on the given
 * context -- confirm against the implementation. */
int mca_btl_ofi_post_recvs(mca_btl_base_module_t *module, mca_btl_ofi_context_t *context,
                           int count);
/* Target of the MCA_BTL_OFI_ABORT macro; takes no arguments. */
void mca_btl_ofi_exit(void);

/* thread atomics */
/**
 * Make one non-blocking attempt to take the context lock.
 *
 * The lock word is inspected with a plain read first; the atomic swap is
 * only issued when that read found the word to be zero.
 *
 * @param context (IN)  Context whose lock word is to be claimed
 *
 * @retval false  The attempt succeeded (mca_btl_ofi_context_lock() stops
 *                spinning on this value)
 * @retval true   The lock word was already non-zero; nothing was acquired
 */
static inline bool mca_btl_ofi_context_trylock(mca_btl_ofi_context_t *context)
{
    /* Cheap test first: skip the atomic swap while the lock looks taken. */
    if (context->lock) {
        return true;
    }

    /* Write 1 into the lock word; a non-zero swap result means the attempt
     * failed (assumes the swap yields the previously stored value --
     * confirm against the definition of OPAL_ATOMIC_SWAP_32). */
    return 0 != OPAL_ATOMIC_SWAP_32(&context->lock, 1);
}

/**
 * Take the context lock, spinning until it becomes available.
 *
 * Calls mca_btl_ofi_context_trylock() in a tight loop with no back-off and
 * returns as soon as one attempt reports success (false).
 *
 * @param context (IN)  Context to lock
 */
static inline void mca_btl_ofi_context_lock(mca_btl_ofi_context_t *context)
{
    for (;;) {
        /* trylock yields false once the lock has been obtained */
        if (!mca_btl_ofi_context_trylock(context)) {
            return;
        }
    }
}

/**
 * Release the context lock.
 *
 * Issues opal_atomic_mb() and then stores 0 into the lock word, which is
 * the value mca_btl_ofi_context_trylock() treats as "free".  The store is
 * unconditional: nothing here checks that the caller holds the lock.
 *
 * @param context (IN)  Context whose lock is being released
 */
static inline void mca_btl_ofi_context_unlock(mca_btl_ofi_context_t *context)
{
    /* Barrier before the releasing store -- presumably so that accesses
     * made while the lock was held are ordered ahead of it (confirm
     * against the definition of opal_atomic_mb). */
    opal_atomic_mb();
    context->lock = 0;
}

END_C_DECLS
#endif