File: error_handlers.py

package info (click to toggle)
python-parsl 2025.01.13%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 12,072 kB
  • sloc: python: 23,817; makefile: 349; sh: 276; ansic: 45
file content (67 lines) | stat: -rw-r--r-- 2,319 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from __future__ import annotations

from typing import Dict, Tuple

import parsl.executors.status_handling as status_handling
from parsl.jobs.errors import TooManyJobFailuresError
from parsl.jobs.states import JobState, JobStatus


def noop_error_handler(executor: status_handling.BlockProviderExecutor, status: Dict[str, JobStatus], threshold: int = 3) -> None:
    """Error handler that deliberately does nothing.

    All arguments are accepted and ignored, so the signature matches the
    other handlers in this module.
    """
    return None


def simple_error_handler(executor: status_handling.BlockProviderExecutor, status: Dict[str, JobStatus], threshold: int = 3) -> None:
    """Fail the executor when every known job has failed.

    Args:
        executor: executor whose ``provider`` is inspected and on which
            ``set_bad_state_and_fail_all`` is called.
        status: mapping of job id to its ``JobStatus``.
        threshold: minimum number of jobs that must exist before the
            executor can be failed. When the provider has an
            ``init_blocks`` attribute this value is replaced by
            ``max(1, init_blocks)``.
    """
    job_count, failure_count = _count_jobs(status)

    provider = executor.provider
    if hasattr(provider, "init_blocks"):
        # The provider's initial block count takes precedence over the argument.
        threshold = max(1, provider.init_blocks)

    # Guard: too few jobs seen so far, or at least one job is not failed.
    if job_count < threshold or failure_count != job_count:
        return

    executor.set_bad_state_and_fail_all(_get_error(status))


def windowed_error_handler(executor: status_handling.BlockProviderExecutor, status: Dict[str, JobStatus], threshold: int = 3) -> None:
    """Fail the executor when the most recent ``threshold`` jobs all failed.

    Job ids are ordered by their integer value and the last ``threshold``
    of them form the window that is examined.

    Args:
        executor: executor on which ``set_bad_state_and_fail_all`` is called.
        status: mapping of job id to ``JobStatus``; every key must be
            convertible with ``int``.
        threshold: window size, and the number of failed jobs inside the
            window required to fail the executor.
    """
    recent_ids = sorted(status, key=int)[-threshold:]
    window = {job_id: status[job_id] for job_id in recent_ids}

    _, failed_in_window = _count_jobs(window)
    if failed_in_window != threshold:
        return

    # The reported error is built from the full status mapping, not only the window.
    executor.set_bad_state_and_fail_all(_get_error(status))


def _count_jobs(status: Dict[str, JobStatus]) -> Tuple[int, int]:
    total = 0
    failed = 0
    for js in status.values():
        total += 1
        if js.state == JobState.FAILED or js.state == JobState.MISSING:
            failed += 1
    return total, failed


def _get_error(status: Dict[str, JobStatus]) -> Exception:
    """Build one ``TooManyJobFailuresError`` describing every job in *status*.

    Each job contributes a numbered ``Error N:`` header followed by its
    message, exit code, stdout summary and stderr summary, each included
    only when present. An empty *status* yields a fixed placeholder message.
    """
    pieces = []
    for number, job in enumerate(status.values(), start=1):
        pieces.append(f"Error {number}:\n")

        if job.message is not None:
            pieces.append(f"\t{job.message}\n")

        if job.exit_code is not None:
            pieces.append(f"\tEXIT CODE: {job.exit_code}\n")

        out_summary = job.stdout_summary
        if out_summary:
            pieces.append("\tSTDOUT: {}\n".format(out_summary))

        err_summary = job.stderr_summary
        if err_summary:
            pieces.append("\tSTDERR: {}\n".format(err_summary))

    # An empty join means there were no jobs at all; fall back to the placeholder.
    text = "".join(pieces) or "No error messages received"
    # wrapping things in an exception here doesn't really help in providing more information
    # than the string itself
    return TooManyJobFailuresError(text)