File: ft-mpi

package info (click to toggle)
openmpi 5.0.8-3
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 201,692 kB
  • sloc: ansic: 613,078; makefile: 42,353; sh: 11,194; javascript: 9,244; f90: 7,052; java: 6,404; perl: 5,179; python: 1,859; lex: 740; fortran: 61; cpp: 20; tcl: 12
file content (89 lines) | stat: -rw-r--r-- 2,772 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#
# Copyright (c) 2020     The University of Tennessee and The University
#                         of Tennessee Research Foundation.  All rights
#                         reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# An Aggregate MCA Parameter Set that sets up an environment able to support
# User-Level Failure Mitigation (ULFM) fault tolerance. ULFM support must
# also have been compiled in (--with-ft=mpi).
#
# Usage:
#   shell$ mpirun --tune ft-mpi ./app
#

# Turn on ULFM fault tolerance at run time (the library must also have been
# built with --with-ft=mpi).
mpi_ft_enable=true

# Since failures are expected, reduce the verbosity of the transport errors
# by switching off the BTL peer-error warnings.
btl_base_warn_peer_error=false

#
# Performance tuning parameters (default values shown).
# By default the PRTE failure detector is used (see README.ULFM.md).
#mpi_ft_detector=false
#mpi_ft_detector_thread=false
#mpi_ft_detector_rdma_heartbeat=false
#mpi_ft_detector_period=3.
#mpi_ft_detector_timeout=10.
#


#
# Select only ULFM-ready components,
# disabling untested and known-broken components in FT-MPI builds.
#

#
# The following frameworks/components are TESTED.
# They handle faults and should be preferred when running with FT.
#   pml     ob1
#   btl     tcp, self, sm(+xpmem,+cma), ugni, uct
#   coll    base/basic, tuned, ftagree, libnbc
# Pin the PML to ob1 (the only pml in the tested list) and the threading
# component to pthreads. Selecting these explicitly also rules out the
# alternatives named in the commented-out exclusions near the end of this
# file (pml cm/crcpw/ucx, threads argobots/qthreads).
pml=ob1
threads=pthreads

#
# The following frameworks/components are UNTESTED, but **may** work.
# They should run in the absence of faults, and **may** work when faults occur.
# You may try and report if successful.
#   btl     ofi, portals4, smcuda, usnic, sm(+knem)
#   coll    inter, sm, sync, cuda, monitoring
#   pml     monitoring, v/vprotocol
# We disable only those components for which a known-good alternative exists.
# Exclude the usnic BTL. The leading '^' makes this an exclusion list, so
# every other BTL stays selectable.
btl=^usnic
# Older versions of xpmem generate bus errors when the other end is dead.
# If you hit this, uncomment the next line to have the sm BTL use cma as its
# single-copy mechanism (presumably instead of xpmem -- confirm).
#btl_sm_single_copy_mechanism=cma


#
# The following frameworks/components are UNTESTED, and probably won't work.
# They should run in the absence of faults, but will probably crash/deadlock after a fault.
# You may try at your own risk.
#   coll    hcoll, portals4
#   topo    (all)
#   osc     (all)
#   io      (all)
#   fcoll   (all)
#   fbtl    (all)
# We disable only those components for which a known-good alternative exists.
# Other untested components remain selectable but will issue a runtime warning
# at initialization if FT is enabled.
# Exclude the hcoll and portals4 collective components (the leading '^'
# negates the whole list); all other coll components stay selectable.
coll=^hcoll,portals4

#
# The following frameworks/components are NOT WORKING. Do not enable these with FT.
#   mtl     (all)
#   pml     cm, crcpw, ucx
# Exclude the ofi, portals4 and psm2 MTLs (the leading '^' negates the whole
# list).
mtl=^ofi,portals4,psm2
# Kept for reference only: already enforced by pml=ob1 above.
#pml=^cm,crcpw,ucx
# Kept for reference only: already enforced by threads=pthreads above.
#threads=^argobots,qthreads
# There is a bug in libevent with the "select" backend that causes an infinite loop
# when an unplanned disconnect happens. Use something else, or bail: the list
# below names the permitted event backends and deliberately omits "select".
opal_event_include=epoll,devpoll,kqueue,evport,poll