1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228
|
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2020-2024 Triad National Security, LLC. All rights
* reserved.
* Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OPAL_MCA_COMMON_OFI_H
#define OPAL_MCA_COMMON_OFI_H
#include "opal/util/proc.h"
#include "opal/memoryhooks/memory.h"
#include <rdma/fabric.h>
#include <rdma/fi_cm.h>
BEGIN_C_DECLS
typedef struct opal_common_ofi_module {
char **prov_include;
char **prov_exclude;
int output;
} opal_common_ofi_module_t;
/**
* When attempting to execute an OFI operation we need to handle
* resource overrun cases. When a call to an OFI OP fails with -FI_EAGAIN
* the OFI mtl/btl will attempt to progress any pending Completion Queue
* events that may prevent additional operations to be enqueued.
* If the call to ofi progress is successful, then the function call
* will be retried.
*/
#define OFI_RETRY_UNTIL_DONE(FUNC, RETURN) \
do { \
do { \
RETURN = FUNC; \
if (OPAL_LIKELY(0 == RETURN)) {break;} \
if (OPAL_LIKELY(RETURN == -FI_EAGAIN)) { \
opal_progress(); \
} \
} while (OPAL_LIKELY(-FI_EAGAIN == RETURN)); \
} while (0);
extern opal_common_ofi_module_t opal_common_ofi;
/**
* Common MCA registration
*
* Common MCA registration handlinge. After calling this function,
* \code opal_common_ofi.output will be properly initialized.
*
* @param component (IN) OFI component being initialized
*
* @returns OPAL_SUCCESS on success, OPAL error code on failure
*/
OPAL_DECLSPEC int opal_common_ofi_mca_register(const mca_base_component_t *component);
/**
* Initializes common objects for libfabric
*
* Initialize common libfabric interface. This should be called from
* any other OFI component's component_open() call.
*
* @note This function is not thread safe and must be called in a
* serial portion of the code.
*/
OPAL_DECLSPEC int opal_common_ofi_open(void);
/**
* Cleans up common objects for libfabric
*
* Clean up common libfabric interface. This should be called from
* any other OFI component's component_close() call. Resource cleanup
* is reference counted, so any successful call to
* opal_common_ofi_init().
*
* @note This function is not thread safe and must be called in a
* serial portion of the code.
*/
OPAL_DECLSPEC int opal_common_ofi_close(void);
/**
* Export our memory hooks into Libfabric monitor
*
* Use Open MPI's memory hooks to provide monitor notifications to
* Libfabric via the external mr_cache facility. This must be called
* before any domain is initialized (ie, before any Libfabric memory
* monitor is configured).
*
* @returns A libfabric error code is returned on error
*/
OPAL_DECLSPEC int opal_common_ofi_export_memory_monitor(void);
/**
* Search function for provider names
*
* This function will take a provider name string and a list of lower
* provider name strings as inputs. It will return true if the lower
* provider in the item string matches a lower provider in the list.
*
* @param list (IN) List of strings corresponding to lower providers.
* @param item (IN) Single string corresponding to a provider.
*
* @return 0 The lower provider of the item string is not in
* list or an input was NULL
* @return 1 The lower provider of the item string matches
* a string in the item list.
*
*/
OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item);
/**
* Get the number of providers whose names are included in a list
*
* This function takes a list of providers and a list of name strings
* as inputs, and return the number of providers whose names are included
* in the name strings.
*
* @param provider_list (IN) List of providers
* @param list (IN) List of name string
*
* @return Number of matched providers
*
*/
OPAL_DECLSPEC int opal_common_ofi_count_providers_in_list(struct fi_info *provider_list,
char **list);
/**
* Determine whether all providers are included in a list
*
* This function takes a list of providers and a list of name strings
* as inputs, and return whether all provider names are included in the name strings.
*
* @param provider_list (IN) List of providers
* @param list (IN) List of name string
*
* @return 0 At least one provider's name is not included in the name strings.
* @return 1 All provider names are included in the name strings.
*
*/
OPAL_DECLSPEC int opal_common_ofi_providers_subset_of_list(struct fi_info *provider_list,
char **list);
/**
* Selects NIC (provider) based on hardware locality
*
* The selection is based on the following priority:
*
* Single-NIC:
*
* If only 1 provider is available, always return that provider.
*
* Multi-NIC:
*
* 1. If the process is NOT bound, pick a NIC using (local rank % number
* of providers of the same type). This gives a fair chance to each
* qualified NIC and balances overall utilization.
*
* 2. If the process is bound, we compare providers in the list that have
* the same type as the first provider, and find the provider with the
* shortest distance to the current process.
*
* i. If the provider has PCI BDF data, we attempt to compute the
* distance between the NIC and the current process cpuset. The NIC
* with the shortest distance is returned.
*
* * For equidistant NICs, we select a NIC in round-robin fashion
* using the package rank of the current process, i.e. (package
* rank % number of providers with the same distance).
*
* ii. If we cannot compute the distance between the NIC and the
* current process, e.g. PCI BDF data is not available, a NIC will be
* selected in a round-robin fashion using package rank, i.e. (package
* rank % number of providers of the same type).
*
* @param[in] provider_list struct fi_info* An initially selected
* provider NIC. The provider name and
* attributes are used to restrict NIC
* selection. This provider is returned if the
* NIC selection fails.
*
* @param[in] process_info opal_process_info_t* The current process info
*
* @param[out] provider struct fi_info* object with the selected
* provider if the selection succeeds
* if the selection fails, returns the fi_info
* object that was initially provided.
*
* All errors should be recoverable and will return the initially provided
* provider. However, if an error occurs we can no longer guarantee
* that the provider returned is local to the process or that the processes will
* balance across available NICs.
*
*/
OPAL_DECLSPEC struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
opal_process_info_t *process_info);
/**
* Obtain EP endpoint name
*
* Obtain the EP endpoint name and length for the supplied endpoint fid.
*
* @param fid (IN) fid of (S)EP endpoint
* @param addr (OUT) buffer containing endpoint name
* @param addrlen (OUT) length of allocated buffer in bytes
*
* @return OPAL_SUCCESS or OPAL error code
*
* The caller is responsible for freeing the buffer allocated to
* contain the endpoint name.
*
*/
OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *addrlen);
END_C_DECLS
#endif /* OPAL_MCA_COMMON_OFI_H */
|