---
distributed:
  version: 2
  # logging:
  #   distributed: info
  #   distributed.client: warning
  #   distributed.gc: warning
  #   bokeh: error
  #   # http://stackoverflow.com/questions/21234772/python-tornado-disable-logging-to-stderr
  #   tornado: critical
  #   tornado.application: error
  scheduler:
    allowed-failures: 3     # number of retries before a task is considered bad
    bandwidth: 100000000    # 100 MB/s estimated worker-worker bandwidth
    blocked-handlers: []
    contact-address: null
    default-data-size: 1kiB
    # Number of seconds to wait until workers or clients are removed from the events log
    # after they have been removed from the scheduler
    events-cleanup-delay: 1h
    idle-timeout: null        # Shut down after this duration, like "1h" or "30 minutes"
    no-workers-timeout: null  # If a task remains unrunnable for longer than this, it fails.
    work-stealing: true       # workers should steal tasks from each other
    work-stealing-interval: 100ms  # Callback time for work stealing
    worker-saturation: 1.1    # Send this fraction of nthreads root tasks to workers
    rootish-taskgroup: 5      # number of dependencies of a rootish tg
    rootish-taskgroup-dependencies: 5  # number of dependencies of the dependencies of the rootish tg
    worker-ttl: "5 minutes"   # like '60s'. Time to live for workers. They must heartbeat faster than this
    preload: []               # Run custom modules with Scheduler
    preload-argv: []          # See https://docs.dask.org/en/latest/how-to/customize-initialization.html
    unknown-task-duration: 500ms  # Default duration for all tasks with unknown durations ("15m", "2h")
    default-task-durations:  # How long we expect function names to run ("1h", "1s") (helps for long tasks)
      rechunk-split: 1us
      split-shuffle: 1us
      split-taskshuffle: 1us
      split-stage: 1us
    validate: false  # Check scheduler state at every step for debugging
    dashboard:
      status:
        task-stream-length: 1000
      tasks:
        task-stream-length: 100000
      tls:
        ca-file: null
        key: null
        cert: null
      bokeh-application:  # keywords to pass to BokehTornado application
        allow_websocket_origin: ["*"]
        keep_alive_milliseconds: 500
        check_unused_sessions_milliseconds: 500
    locks:
      lease-validation-interval: 10s  # The interval in which the scheduler validates staleness of all acquired leases. Must always be smaller than the lease-timeout itself.
      lease-timeout: 30s  # Maximum interval to wait for a Client refresh before a lease is invalidated and released.
    http:
      routes:
        - distributed.http.scheduler.prometheus
        - distributed.http.scheduler.info
        - distributed.http.scheduler.json
        - distributed.http.health
        - distributed.http.proxy
        - distributed.http.statics
    allowed-imports:
      - dask
      - distributed
    active-memory-manager:
      # Set to true to auto-start the Active Memory Manager on Scheduler start; if false
      # you'll have to either manually start it with client.amm.start() or run it once
      # with client.amm.run_once().
      start: true
      # Once started, run the AMM cycle every <interval>
      interval: 2s
      # Memory measure to use. Must be one of the attributes of
      # distributed.scheduler.MemoryState.
      measure: optimistic
      # Policies that should be executed at every cycle. Any additional keys in each
      # object are passed as keyword arguments to the policy constructor.
      policies:
        - class: distributed.active_memory_manager.ReduceReplicas
  worker:
    blocked-handlers: []
    multiprocessing-method: spawn
    use-file-locking: true
    transfer:
      message-bytes-limit: 50MB
    connections:    # Maximum concurrent connections for data
      outgoing: 50  # This helps to control network saturation
      incoming: 10
    preload: []       # Run custom modules with Worker
    preload-argv: []  # See https://docs.dask.org/en/latest/how-to/customize-initialization.html
    daemon: true
    validate: false  # Check worker state at every step for debugging
    resources: {}    # Key: value pairs specifying worker resources.
    lifetime:
      duration: null      # Time after which to gracefully shutdown the worker
      stagger: 0 seconds  # Random amount by which to stagger lifetimes
      restart: false      # Do we resurrect the worker after the lifetime deadline?
    profile:
      enabled: true     # Whether or not to enable profiling
      interval: 10ms    # Time between statistical profiling queries
      cycle: 1000ms     # Time between starting new profile
      low-level: false  # Whether or not to include low-level functions
                        # Requires https://github.com/numba/stacktrace
    memory:
      # When there is an increase in process memory (as observed by the operating
      # system) that is not accounted for by the dask keys stored on the worker, ignore
      # it for this long before considering it in non-critical memory measures.
      # This should be set to be longer than the duration of most dask tasks.
      recent-to-old-time: 30s
      rebalance:
        # Memory measure to rebalance upon. Possible choices are:
        # process
        #   Total process memory, as measured by the OS.
        # optimistic
        #   Managed by dask (instantaneous) + unmanaged (without any increases
        #   happened in the last <distributed.worker.memory.recent-to-old-time>).
        #   Recommended for use on CPython with large (2MiB+) numpy-based data chunks.
        # managed
        #   Only consider the data allocated by dask in RAM. Recommended if RAM is not
        #   released in a timely fashion back to the OS after the Python objects are
        #   dereferenced, but remains available for reuse by PyMalloc.
        #
        #   If this is your problem on Linux, you should alternatively consider
        #   setting the MALLOC_TRIM_THRESHOLD_ environment variable (note the final
        #   underscore) to a low value; refer to the mallopt man page and to the
        #   comments about M_TRIM_THRESHOLD on
        #   https://sourceware.org/git/?p=glibc.git;a=blob;f=malloc/malloc.c
        # managed_total
        #   Only consider data allocated by dask, including that spilled to disk.
        #   Recommended if disk occupation of the spill file is an issue.
        measure: optimistic
        # Fraction of worker process memory at which we start potentially sending
        # data to other workers. Ignored when max_memory is not set.
        sender-min: 0.30
        # Fraction of worker process memory at which we stop potentially accepting
        # data from other workers. Ignored when max_memory is not set.
        recipient-max: 0.60
        # Fraction of worker process memory, around the cluster mean, where a worker is
        # neither a sender nor a recipient of data during a rebalance operation. E.g. if
        # the mean cluster occupation is 50%, sender-recipient-gap=0.10 means that only
        # nodes above 55% will donate data and only nodes below 45% will receive them.
        # This helps avoid data from bouncing around the cluster repeatedly.
        # Ignored when max_memory is not set.
        sender-recipient-gap: 0.10
      # Fractions of worker process memory at which we take action to avoid memory
      # blowup. Set any of the values to False to turn off the behavior entirely.
      # All fractions are relative to each worker's memory_limit.
      transfer: 0.10   # fractional size of incoming data transfers where we start
                       # throttling incoming data transfers
      target: 0.60     # fraction of managed memory where we start spilling to disk
      spill: 0.70      # fraction of process memory where we start spilling to disk
      pause: 0.80      # fraction of process memory at which we pause worker threads
      terminate: 0.95  # fraction of process memory at which we terminate the worker
      # Max size of the spill file on disk (e.g. "10 GB")
      # Set to false for no maximum.
      max-spill: false
      spill-compression: auto  # See also: distributed.comm.compression
      # Interval between checks for the spill, pause, and terminate thresholds.
      # The target threshold is checked every time new data is inserted.
      monitor-interval: 100ms
    http:
      routes:
        - distributed.http.worker.prometheus
        - distributed.http.health
        - distributed.http.statics
  nanny:
    preload: []       # Run custom modules with Nanny
    preload-argv: []  # See https://docs.dask.org/en/latest/how-to/customize-initialization.html
    # Override environment variables after spawning the Worker process.
    # Use whenever you are sure that nothing will read them before the end of the worker
    # initialization.
    environ: {}
    # Override environment variables *before* spawning the Worker initialization.
    # Use for variables that are parsed in or before the worker init.
    # Note that this leaks variables into the nanny process.
    # Read important caveats at
    # https://distributed.dask.org/en/stable/worker.html#nanny.
    pre-spawn-environ:
      # See https://distributed.dask.org/en/stable/worker-memory.html#automatically-trim-memory
      MALLOC_TRIM_THRESHOLD_: 65536
      # Numpy configuration
      OMP_NUM_THREADS: 1
      MKL_NUM_THREADS: 1
      OPENBLAS_NUM_THREADS: 1
  client:
    heartbeat: 5s  # Interval between client heartbeats
    scheduler-info-interval: 2s  # Interval between scheduler-info updates
    security-loader: null  # A callable to load security credentials if none are provided explicitly
    preload: []       # Run custom modules with Client
    preload-argv: []  # See https://docs.dask.org/en/latest/how-to/customize-initialization.html
  deploy:
    lost-worker-timeout: 15s      # Interval after which to hard-close a lost worker job
    cluster-repr-interval: 500ms  # Interval between calls to update cluster-repr for the widget
  adaptive:
    interval: 1s         # Interval between scaling evaluations
    target-duration: 5s  # Time an entire graph calculation is desired to take ("1m", "30m")
    minimum: 0           # Minimum number of workers
    maximum: .inf        # Maximum number of workers
    wait-count: 3        # Number of times a worker should be suggested for removal before removing it
  comm:
    retry:      # some operations (such as gathering data) are subject to re-tries with the below parameters
      count: 0  # the maximum retry attempts. 0 disables re-trying.
      delay:
        min: 1s   # the first non-zero delay between re-tries
        max: 20s  # the maximum delay between re-tries
    compression: false  # See also: distributed.worker.memory.spill-compression
    shard: 64MiB
    offload: 10MiB  # Size after which we choose to offload serialization to another thread
    default-scheme: tcp
    socket-backlog: 2048
    ucx:
      cuda-copy: null   # enable cuda-copy
      tcp: null         # enable tcp
      nvlink: null      # enable cuda_ipc
      infiniband: null  # enable Infiniband
      rdmacm: null      # enable RDMACM
      create-cuda-context: null  # create CUDA context before UCX initialization
      environment: {}  # Any other environment settings to
                       # be transferred to UCX. Name
                       # munging: key-name => UCX_KEY_NAME
    zstd:
      level: 3    # Compression level, between 1 and 22.
      threads: 0  # Threads to use. 0 for single-threaded, -1 to infer from cpu count.
    timeouts:
      connect: 30s  # time before connecting fails
      tcp: 30s      # time before calling an unresponsive connection dead
    require-encryption: null  # Whether to require encryption on non-local comms
    tls:
      ciphers: null      # Allowed ciphers, specified as an OpenSSL cipher string.
      min-version: 1.2   # The minimum TLS version supported.
      max-version: null  # The maximum TLS version supported.
      ca-file: null      # Path to a CA file, in pem format, optional
      scheduler:
        cert: null  # Path to certificate file for scheduler.
        key: null   # Path to key file for scheduler. Alternatively, the key
                    # can be appended to the cert file above, and this field
                    # left blank.
      worker:
        key: null
        cert: null
      client:
        key: null
        cert: null
    websockets:
      shard: 8MiB
  diagnostics:
    nvml: true
    cudf: false
    computations:
      max-history: 100
      nframes: 0
      ignore-modules:
        - asyncio
        - functools
        - threading
        - datashader
        - dask
        - debugpy
        - distributed
        - coiled
        - cudf
        - cuml
        - matplotlib
        - pluggy  # part of pytest
        - prefect
        - rechunker
        - xarray
        - xgboost
        - xdist
        - __channelexec__  # more xdist
        - execnet  # more xdist
      ignore-files:
        - runpy\.py  # `python -m pytest` (or other module) shell command
        - pytest     # `pytest` shell command
        - py\.test   # `py.test` shell command
        - pytest-script\.py  # `pytest` shell command in Windows
        - _pytest    # pytest implementation
        - pycharm    # Run pytest from PyCharm GUI
        - vscode_pytest
        - get_output_via_markers\.py
    erred-tasks:
      max-history: 100
  p2p:
    comm:
      buffer: 1 GiB
      concurrency: 10
      message-bytes-limit: 2 MiB
      retry:
        count: 10
        delay:
          min: 1s   # the first non-zero delay between re-tries
          max: 30s  # the maximum delay between re-tries
    storage:
      buffer: 100 MiB
      disk: true
    threads: null
  ###################
  # Bokeh dashboard #
  ###################
  dashboard:
    link: "{scheme}://{host}:{port}/status"
    export-tool: false
    graph-max-items: 5000  # maximum number of tasks to try to plot in graph view
    prometheus:
      namespace: "dask"
  ##################
  # Administrative #
  ##################
  admin:
    large-graph-warning-threshold: 10MB  # Threshold for warning on large graph
    tick:
      interval: 20ms  # time between event loop health checks
      limit: 3s       # time allowed before triggering a warning
      cycle: 1s       # time between checking event loop speed
    max-error-length: 10000  # Maximum size traceback after error to return
    log-length: 10000  # Maximum length of worker/scheduler logs to keep in memory
    log-format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    low-level-log-length: 1000  # Maximum length of various logs for developers
    pdb-on-err: false  # enter debug mode on scheduling error
    system-monitor:
      interval: 500ms
      log-length: 7200  # Maximum number of samples to keep in memory
      disk: true       # Monitor host-wide disk I/O
      host-cpu: false  # Monitor host-wide CPU usage, with very granular breakdown
      gil:
        enabled: true    # Monitor GIL contention
        interval: "1ms"  # Frequency to poll GIL
    event-loop: tornado
  rmm:
    pool-size: null