/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Matching Transport Layer
*
* The Matching Transport Layer (MTL) provides device-layer support
* for transfer of MPI point-to-point messages over devices that
* support hardware / library message matching. This layer is used
* with the MTL PML component to provide lowest latency and highest
* bandwidth on given architectures. Features found in other PML
* interfaces, such as message fragmenting, multi-device support, and
* NIC failover are not provided by the upper layers.
*
* In general, this interface should not be used for transport layer
* support. Instead, the BTL interface should be used. The BTL
* interface allows for multiplexing between multiple users
* (point-to-point, one-sided, etc.) and provides many features not
* found in this interface (RDMA from arbitrary buffers, active
* messaging, reasonable pinned memory caching, etc.)
*/
#ifndef OMPI_MTL_H
#define OMPI_MTL_H
#include "ompi_config.h"
#include "mpi.h" /* needed for MPI_ANY_TAG */
#include "ompi/mca/mca.h"
#include "ompi/mca/pml/pml_constants.h" /* for send_mode enum */
#include "ompi/request/request.h"
BEGIN_C_DECLS
/*
 * Forward declarations for every struct that the typedefs below use by
 * pointer only.  Declaring all of the tags at file scope keeps this header
 * independent of what the included headers happen to pull in: a struct tag
 * that is first seen inside a parameter list has prototype scope only and is
 * therefore a different, incompatible type from the real one.
 */
struct ompi_request_t;
struct opal_convertor_t;
struct mca_mtl_base_module_t;
struct ompi_communicator_t;
struct ompi_proc_t;
struct ompi_message_t;
struct ompi_status_public_t;
/**
* MTL request header.
*
* A pointer to this structure is passed to the isend / irecv / imrecv /
* cancel module functions declared in this header.
*/
struct mca_mtl_request_t {
/** pointer to associated ompi_request_t */
struct ompi_request_t *ompi_req;
/** callback that receives this mtl_request as its only argument.
* NOTE(review): presumably invoked when the request completes --
* confirm against the MTL components. */
void (*completion_callback)(struct mca_mtl_request_t* mtl_request);
};
typedef struct mca_mtl_request_t mca_mtl_request_t;
/**
* MTL module flags
*
* Each flag occupies its own bit, so the values can be OR'ed together.
* NOTE(review): presumably these are the bits stored in
* mca_mtl_base_module_t::mtl_flags; the meaning of the individual
* flags is not documented in this header -- confirm with the users of
* each flag.
*/
#define MCA_MTL_BASE_FLAG_REQUIRE_WORLD 0x00000001
#define MCA_MTL_BASE_FLAG_ACCELERATOR_INIT_DISABLE 0x00000002
#define MCA_MTL_BASE_FLAG_SUPPORTS_EXT_CID 0x00000004
/**
* Initialization routine for MTL component
*
* Initialization routine for MTL component. This function should
* allocate resources for communication and try to do all local setup.
* It should not attempt to contact its peers, as that should be
* done at add_procs time. Contact information should be published
* during this initialization function. It will be made available
* during add_procs().
*
* @param enable_progress_threads (IN) Progress threads have been
* enabled by the user and the component must be
* capable of making asynchronous progress (either
* with its own thread, with the kernel, or with
* the event library).
* @param enable_mpi_threads (IN) MPI threads have been enabled by the
* user and the component must be capable of coping
* with threads.
* @param accelerator_support (OUT) Component does / does not support
* direct transfers with an accelerator buffer.
*
* NOTE(review): this comment used to describe an
* enable_mpi_thread_multiple (OUT) argument through which the
* component reported whether it supports MPI_THREAD_MULTIPLE. The
* signature below has no such argument; how that capability is
* reported cannot be determined from this header.
*
* @retval NULL component can not operate on the current machine
* @retval non-NULL pointer to the MTL module to use
*/
typedef struct mca_mtl_base_module_t*
(*mca_mtl_base_component_init_fn_t)(bool enable_progress_threads,
bool enable_mpi_threads,
bool *accelerator_support);
/**
* MTL component descriptor, version 2.0.0 of the component interface.
*
* mca_mtl_base_component_t is an alias for this versioned structure.
*/
struct mca_mtl_base_component_2_0_0_t {
/** base MCA component structure */
mca_base_component_t mtl_version;
/** base MCA component data structure */
mca_base_component_data_t mtl_data;
/** component initialization function; returns the MTL module or NULL */
mca_mtl_base_component_init_fn_t mtl_init;
/** NOTE(review): presumably records the accelerator_support value
* reported through mtl_init -- confirm against the code that
* selects the component. */
bool accelerator_support;
};
typedef struct mca_mtl_base_component_2_0_0_t mca_mtl_base_component_2_0_0_t;
typedef struct mca_mtl_base_component_2_0_0_t mca_mtl_base_component_t;
/**
* MCA->MTL Clean up any resources held by MTL module
*
* Opposite of the component initialization function. Called when
* communication will no longer be necessary. Usually this is during
* MPI_FINALIZE, but it can be earlier if the component was not
* selected to run. Assuming the initialization function was called,
* finalize will always be called before the component_close function
* is called.
*
* @param mtl (IN) MTL module returned from call to initialize
*
* @retval OMPI_SUCCESS cleanup finished successfully
* @retval other failure during cleanup
*/
typedef int (*mca_mtl_base_module_finalize_fn_t)(struct mca_mtl_base_module_t* mtl);
/**
* PML->MTL notification of change in the process list.
*
* The mca_mtl_base_module_add_procs_fn_t() is used by the PML to
* notify the MTL that new processes are connected to the current
* process. Any addressing information exported by the peer via the
* ompi_modex_send() function should be available during this
* call via the corresponding ompi_modex_recv() function. The
* MTL may utilize this information to determine reachability of each
* peer process.
*
* It is an error for a proc to not be reachable by the given MTL, and
* an error should be returned if that case is detected. If an MTL
* requires per-endpoint data, it must handle storage, either using a
* static endpoint tag (MTL is the default tag that should generally
* be used) or a dynamic endpoint tag (although it should be noted
* that OMPI can be built without dynamic endpoint tag support).
*
* @param mtl (IN) MTL module
* @param nprocs (IN) Number of processes in procs
* @param procs (IN) Set of processes
*
* @retval OMPI_SUCCESS successfully connected to processes
* @retval other failure during setup
*/
typedef int (*mca_mtl_base_module_add_procs_fn_t)(
struct mca_mtl_base_module_t* mtl,
size_t nprocs,
struct ompi_proc_t** procs);
/**
* Notification of change to the process list.
*
* When the process list changes, the PML notifies the MTL of the
* change, to provide the opportunity to cleanup or release any
* resources associated with the peer. The MTL is responsible for
* releasing any memory associated with the endpoint data it may have
* stored during add_procs().
*
* @param mtl (IN) MTL module
* @param nprocs (IN) Number of processes in procs
* @param procs (IN) Set of processes
*
* @return Status indicating if cleanup was successful
*/
typedef int (*mca_mtl_base_module_del_procs_fn_t)(
struct mca_mtl_base_module_t* mtl,
size_t nprocs,
struct ompi_proc_t** procs);
/**
* Blocking send to peer
*
* Blocking send (Call should not return until the user buffer may be
* used again). Standard MPI semantics must be met by this call, as
* mandated in the mode argument. There is one special mode argument,
* MCA_PML_BASE_SEND_COMPLETE, which requires local completion before
* the function can return. This is an optimization for collective
* routines that can otherwise lead to degenerate performance for
* broadcast-based collectives.
*
* @param mtl (IN) MTL module
* @param comm (IN) Communicator used for operation
* @param dest (IN) Destination rank for send (relative to comm)
* @param tag (IN) MPI tag used for sending. See note below.
* @param convertor (IN) Datatype convertor describing send datatype.
* Already prepared for send.
* @param mode (IN) Mode for send operation
*
* @return OMPI_SUCCESS or error value
*
* \note Open MPI is built around non-blocking operations. This
* function is provided for networks where progressing events outside
* of point-to-point (for example, collectives, I/O, one-sided) can
* occur without a progress function regularly being triggered.
*
* \note While MPI does not allow users to specify negative tags, they
* are used internally in Open MPI to provide a unique channel for
* collective operations. Therefore, the MTL can *not* cause an error
* if a negative tag is used.
*/
typedef int (*mca_mtl_base_module_send_fn_t)(
struct mca_mtl_base_module_t* mtl,
struct ompi_communicator_t *comm,
int dest,
int tag,
struct opal_convertor_t *convertor,
mca_pml_base_send_mode_t mode);
/**
* Non-blocking send to peer
*
* Non-blocking send to peer. Standard MPI semantics must be met by
* this call, as mandated in the mode argument. There is one special
* mode argument, MCA_PML_BASE_SEND_COMPLETE, which requires local
* completion before the request is marked as complete.
*
* The PML will handle creation of the request, leaving the number of
* bytes requested in the module structure available for the MTL
* directly after the ompi_request_t structure. The PML will handle
* proper destruction of the request once it can safely be destroyed
* (it has been completed and freed by a call to REQUEST_FREE or
* TEST/WAIT). The MTL should remove all resources associated with
* the request when it is marked as completed.
*
* @param mtl (IN) MTL module
* @param comm (IN) Communicator used for operation
* @param dest (IN) Destination rank for send (relative to comm)
* @param tag (IN) MPI tag used for sending. See note below.
* @param convertor (IN) Datatype convertor describing send datatype.
* Already prepared for send.
* @param mode (IN) Mode for send operation (see pml.h)
* @param blocking (IN) True if the call originated from a blocking
* call, but the PML decided to use a
* non-blocking operation, likely for
* internal performance decisions. This is an
* optimization flag and is not needed for
* correctness.
* @param mtl_request (IN) Pointer to mtl_request. The ompi_req field
* will be populated with an initialized
* ompi_request_t before calling.
*
* @return OMPI_SUCCESS or error value
*
* \note While MPI does not allow users to specify negative tags, they
* are used internally in Open MPI to provide a unique channel for
* collective operations. Therefore, the MTL can *not* cause an error
* if a negative tag is used.
*/
typedef int (*mca_mtl_base_module_isend_fn_t)(
struct mca_mtl_base_module_t* mtl,
struct ompi_communicator_t *comm,
int dest,
int tag,
struct opal_convertor_t *convertor,
mca_pml_base_send_mode_t mode,
bool blocking,
mca_mtl_request_t *mtl_request);
/**
* Non-blocking receive
*
* Non-blocking receive function. Standard MPI semantics for
* MPI_Irecv must be implemented by this call.
*
* The PML will handle creation of the request, leaving the number of
* bytes requested in the module structure available for the MTL,
* directly after the ompi_request_t structure. The PML will handle
* proper destruction of the request once it can safely be destroyed
* (it has been completed and freed by a call to REQUEST_FREE or
* TEST/WAIT). The MTL should remove all resources associated with
* the request when it is marked as completed.
*
* @param mtl (IN) MTL module
* @param comm (IN) Communicator used for operation
* @param src (IN) Source rank for receive (relative to comm)
* @param tag (IN) MPI tag used for receiving. See note below.
* @param convertor (IN) Datatype convertor describing receive datatype.
* Already prepared for receive.
* @param mtl_request (IN) Pointer to mtl_request. The ompi_req field
* will be populated with an initialized
* ompi_request_t before calling.
*
* @return OMPI_SUCCESS or error value
*
* \note While MPI does not allow users to specify negative tags, they
* are used internally in Open MPI to provide a unique channel for
* collective operations. Therefore, the MTL can *not* cause an error
* if a negative tag is used. Further, MPI_ANY_TAG should *not* match
* against negative tags.
*/
typedef int (*mca_mtl_base_module_irecv_fn_t)(
struct mca_mtl_base_module_t* mtl,
struct ompi_communicator_t *comm,
int src,
int tag,
struct opal_convertor_t *convertor,
struct mca_mtl_request_t *mtl_request);
/**
* Non-blocking probe
*
* Non-blocking probe function. Standard MPI semantics for MPI_IPROBE
* must be implemented by this call.
*
* @param mtl (IN) MTL module
* @param comm (IN) Communicator used for operation
* @param src (IN) Source rank to probe for (relative to comm)
* @param tag (IN) MPI tag to probe for. See note below.
* @param flag (OUT) true if message available, false otherwise
* @param status (OUT) Status structure for information on
* available message
*
* @return NOTE(review): presumably OMPI_SUCCESS or error value, as for
* the other module functions -- the original comment did not
* say.
*
* \note While MPI does not allow users to specify negative tags, they
* are used internally in Open MPI to provide a unique channel for
* collective operations. Therefore, the MTL can *not* cause an error
* if a negative tag is used. Further, MPI_ANY_TAG should *not* match
* against negative tags.
*/
typedef int (*mca_mtl_base_module_iprobe_fn_t)(
struct mca_mtl_base_module_t* mtl,
struct ompi_communicator_t *comm,
int src,
int tag,
int *flag,
struct ompi_status_public_t *status);
/**
* Non-blocking receive of a previously matched message
*
* NOTE(review): this typedef had no documentation. Judging by its name
* and arguments it presumably implements MPI_Imrecv semantics for a
* message obtained through the improbe function -- confirm against the
* MTL components.
*
* @param mtl MTL module
* @param convertor Datatype convertor (presumably describing the receive
* datatype, as for irecv)
* @param message Pointer to the message handle to receive
* @param mtl_request Pointer to mtl_request
*
* @return presumably OMPI_SUCCESS or error value -- TODO confirm
*/
typedef int (*mca_mtl_base_module_imrecv_fn_t)(struct mca_mtl_base_module_t* mtl,
struct opal_convertor_t *convertor,
struct ompi_message_t **message,
struct mca_mtl_request_t *mtl_request);
/**
* Non-blocking matching probe
*
* NOTE(review): this typedef had no documentation. Judging by its name
* and arguments it presumably implements MPI_Improbe semantics --
* confirm against the MTL components.
*
* @param mtl MTL module
* @param comm Communicator used for operation
* @param src Source rank to probe for
* @param tag MPI tag to probe for
* @param matched Presumably set to non-zero when a message was matched
* @param message Presumably receives the handle of the matched message
* @param status Status structure for the matched message
*
* @return presumably OMPI_SUCCESS or error value -- TODO confirm
*/
typedef int (*mca_mtl_base_module_improbe_fn_t)(struct mca_mtl_base_module_t *mtl,
struct ompi_communicator_t *comm,
int src,
int tag,
int *matched,
struct ompi_message_t **message,
struct ompi_status_public_t *status);
/**
* Cancel an existing request
*
* Attempt to cancel an existing request. The (poorly defined)
* semantics for MPI_CANCEL must be implemented by this call. This,
* of course, allows the MTL module to do nothing at all.
* Implementations of the MTL should make a good faith effort to
* cancel receive requests that have not been started, as the "post a
* receive for control messages" paradigm is a common one in loosely
* coupled MPI applications.
*
* @param mtl (IN) MTL module
* @param mtl_request (IN) Request that should be cancelled
* @param flag Purpose unknown (the original author of this comment did
* not know either) -- TODO document once confirmed.
*
* @return NOTE(review): presumably OMPI_SUCCESS or error value, as for
* the other module functions.
*/
typedef int (*mca_mtl_base_module_cancel_fn_t)(
struct mca_mtl_base_module_t* mtl,
mca_mtl_request_t *mtl_request,
int flag);
/**
* Downcall from PML layer when a new communicator is created.
*
* Provides the MTL the opportunity to initialize/cache a data structure
* on the communicator.
*
* @param mtl MTL module
* @param comm Communicator that was created
* @return OMPI_SUCCESS or failure status.
*/
typedef int (*mca_mtl_base_module_add_comm_fn_t)(
struct mca_mtl_base_module_t* mtl,
struct ompi_communicator_t* comm);
/**
* Downcall from PML layer when a communicator is destroyed.
*
* Provides the MTL the opportunity to clean up any data structures
* associated with the communicator.
*
* @param mtl MTL module
* @param comm Communicator that is being destroyed
* @return OMPI_SUCCESS or failure status.
*/
typedef int (*mca_mtl_base_module_del_comm_fn_t)(
struct mca_mtl_base_module_t* mtl,
struct ompi_communicator_t* comm);
/**
* MTL module interface functions and attributes.
*
* An instance of this structure is what the component initialization
* function returns, and what the global ompi_mtl pointer refers to.
*/
struct mca_mtl_base_module_t {
int mtl_max_contextid; /**< maximum allowable contextid */
int mtl_max_tag; /**< maximum tag value. note that negative tags must be allowed */
size_t mtl_request_size; /**< number of bytes to reserve with request structure */
uint32_t mtl_flags; /**< module flags; NOTE(review): presumably a mask of MCA_MTL_BASE_FLAG_* bits -- confirm */
/* MTL function table */
mca_mtl_base_module_add_procs_fn_t mtl_add_procs;
mca_mtl_base_module_del_procs_fn_t mtl_del_procs;
mca_mtl_base_module_finalize_fn_t mtl_finalize;
mca_mtl_base_module_send_fn_t mtl_send;
mca_mtl_base_module_isend_fn_t mtl_isend;
mca_mtl_base_module_irecv_fn_t mtl_irecv;
mca_mtl_base_module_iprobe_fn_t mtl_iprobe;
mca_mtl_base_module_imrecv_fn_t mtl_imrecv;
mca_mtl_base_module_improbe_fn_t mtl_improbe;
/* Optional MTL functions */
mca_mtl_base_module_cancel_fn_t mtl_cancel;
mca_mtl_base_module_add_comm_fn_t mtl_add_comm;
mca_mtl_base_module_del_comm_fn_t mtl_del_comm;
};
typedef struct mca_mtl_base_module_t mca_mtl_base_module_t;
/*
* Macro for use in components that are of type mtl.
*
* Expands to OMPI_MCA_BASE_VERSION_2_1_0 for the "mtl" framework with
* framework version 2.0.0.
*/
#define MCA_MTL_BASE_VERSION_2_0_0 \
OMPI_MCA_BASE_VERSION_2_1_0("mtl", 2, 0, 0)
/*
* MTL module pointer (declared here, defined elsewhere). OMPI_MTL_CALL
* dispatches through it when direct calls are not compiled in.
*/
OMPI_DECLSPEC extern mca_mtl_base_module_t *ompi_mtl;
/*
 * OMPI_MTL_CALL(fn) names the MTL entry point "fn".
 *
 * Regular build: the call goes through the function table of the module
 * that ompi_mtl points to, i.e. ompi_mtl->mtl_<fn>.
 *
 * Direct-call build (MCA_ompi_mtl_DIRECT_CALL is non-zero): the call is
 * resolved at compile time to the symbol ompi_mtl_<component>_<fn>, where
 * <component> is the expansion of MCA_ompi_mtl_DIRECT_CALL_COMPONENT.  The
 * header named by MCA_ompi_mtl_DIRECT_CALL_HEADER is included as well.
 */
#if !MCA_ompi_mtl_DIRECT_CALL
#define OMPI_MTL_CALL(fn) ompi_mtl->mtl_ ## fn
#else
/* Two levels of macros so the component name is expanded before pasting. */
#define OMPI_MTL_CALL_STAMP(component, fn) ompi_mtl_ ## component ## _ ## fn
#define OMPI_MTL_CALL_EXPANDER(component, fn) OMPI_MTL_CALL_STAMP(component, fn)
#define OMPI_MTL_CALL(fn) OMPI_MTL_CALL_EXPANDER(MCA_ompi_mtl_DIRECT_CALL_COMPONENT, fn)
#include MCA_ompi_mtl_DIRECT_CALL_HEADER
#endif
END_C_DECLS
#endif /* OMPI_MTL_H */