File: usage.py

package info (click to toggle)
python-parsl 2025.01.13%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 12,072 kB
  • sloc: python: 23,817; makefile: 349; sh: 276; ansic: 45
file content (239 lines) | stat: -rw-r--r-- 8,918 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
import json
import logging
import platform
import socket
import sys
import time
import uuid

from parsl.dataflow.states import States
from parsl.errors import ConfigurationError
from parsl.multiprocessing import ForkProcess
from parsl.usage_tracking.api import get_parsl_usage
from parsl.usage_tracking.levels import DISABLED as USAGE_TRACKING_DISABLED
from parsl.usage_tracking.levels import LEVEL_3 as USAGE_TRACKING_LEVEL_3
from parsl.utils import setproctitle
from parsl.version import VERSION as PARSL_VERSION

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

from typing import Callable

from typing_extensions import ParamSpec

# Protocol version byte, prepended to every encoded usage message (see
# UsageTracker.encode_message). When (for example) compression parameters are
# changed in a way that cannot be inferred from the compressed message itself,
# this version ID needs to imply those parameters.

# Earlier protocol versions: b'{' - the original pure-JSON protocol pre-March 2024
PROTOCOL_VERSION = b'1'

# Parameter specification used to type the async_process decorator, so that the
# decorated callable is annotated with the same parameters as the wrapped one.
P = ParamSpec("P")


def async_process(fn: Callable[P, None]) -> Callable[P, None]:
    """Decorator that launches ``fn`` as a separate process.

    Calling the decorated function does not run ``fn`` in the calling
    process. Instead a ``ForkProcess`` named "Usage-Tracking" is created with
    ``fn`` as its target and the call's positional/keyword arguments, the
    process is started, and the call returns immediately without waiting for
    it to finish.

    Args:
          - fn (Callable) : Function to run in the separate process

    Returns:
          - The wrapper function. Note that although the annotation declares
            a ``None`` result, the wrapper returns the started process object
            so that the caller can later join/kill/close it.
    """
    # Imported locally so this decorator does not depend on a new
    # module-level import.
    from functools import wraps

    # wraps() keeps the wrapped function's name and docstring on the wrapper,
    # which would otherwise all be reported as "run".
    @wraps(fn)
    def run(*args, **kwargs):
        proc = ForkProcess(target=fn, args=args, kwargs=kwargs, name="Usage-Tracking")
        proc.start()
        return proc

    return run


@async_process
def udp_messenger(domain_name: str, UDP_PORT: int, sock_timeout: int, message: bytes) -> None:
    """Send UDP messages to usage tracker asynchronously

    This multiprocessing based messenger was written to overcome the limitations
    of signalling/terminating a thread that is blocked on a system call.

    Every failure is logged at debug level and otherwise ignored: usage
    tracking is best-effort and must not raise out of this function.

    Args:
          - domain_name (str) : Domain name string
          - UDP_PORT (int) : UDP port to send out on
          - sock_timeout (int) : Socket timeout
          - message (bytes) : Already-encoded payload to send as one datagram
    """
    setproctitle("parsl: Usage tracking")

    try:
        # Name resolution happens before the socket exists, so sock_timeout
        # does not apply to it.
        UDP_IP = socket.gethostbyname(domain_name)

        # The context manager closes the socket even when settimeout() or
        # sendto() raises, instead of only on the success path.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:  # UDP
            sock.settimeout(sock_timeout)
            sock.sendto(message, (UDP_IP, UDP_PORT))

    except socket.timeout:
        logger.debug("Failed to send usage tracking data: socket timeout")
    except OSError as e:
        logger.debug("Failed to send usage tracking data: OSError: {}".format(e))
    except Exception as e:
        logger.debug("Failed to send usage tracking data: Exception: {}".format(e))


class UsageTracker:
    """Usage Tracking for Parsl.

    The server for this is here: https://github.com/Parsl/parsl_tracking
    This issue captures the discussion that went into functionality
    implemented here: https://github.com/Parsl/parsl/issues/34

    Each message is sent from its own helper process (see
    ``send_UDP_message``). Those processes are remembered in ``self.procs``
    and are joined/killed/closed by ``close``.
    """

    def __init__(self, dfk, port=50077,
                 domain_name='tracking.parsl-project.org'):
        """Initialize usage tracking state from the DFK and its config.

        Nothing is sent from here. Resolving the hostname given in
        kwarg:domain_name and sending a message both happen later, in an
        asynchronous process, to avoid slowing down DFK initialization.

        Tracks usage stats by inspecting the internal state of the dfk.

        Args:
             - dfk (DFK object) : Data Flow Kernel object

        KWargs:
             - port (int) : Port number, Default:50077
             - domain_name (string) : Domain name of the tracking server
                  Default: tracking.parsl-project.org

        Raises:
             - ConfigurationError : if the configured usage_tracking value
                  is outside the supported range (see check_tracking_level)
        """

        self.domain_name = domain_name
        # The sock timeout will only apply to UDP send and not domain resolution
        self.sock_timeout = 5
        self.UDP_PORT = port
        # Helper processes started by send_UDP_message, reaped by close().
        self.procs = []
        self.dfk = dfk
        self.config = self.dfk.config
        # Random ID included in both start and end messages so they can be
        # matched up.
        self.correlator_uuid = str(uuid.uuid4())
        self.parsl_version = PARSL_VERSION
        self.python_version = "{}.{}.{}".format(sys.version_info.major,
                                                sys.version_info.minor,
                                                sys.version_info.micro)
        self.tracking_level = self.check_tracking_level()
        self.project_name = self.config.project_name
        # Set when the start message is sent/constructed; None until then.
        self.start_time = None
        logger.debug("Tracking level: {}".format(self.tracking_level))

    def check_tracking_level(self) -> int:
        """Check if tracking is enabled and return level.

        Checks usage_tracking in Config
            - Possible values: [True, False, 0, 1, 2, 3]

        True/False values are treated as Level 1/Level 0 respectively.
        The configured value is returned unchanged after the range check.

        Returns: int
            - 0 : Tracking is disabled
            - 1 : Tracking is enabled with level 1
                  Share info about Parsl version, Python version, platform
            - 2 : Tracking is enabled with level 2
                  Share info about config + level 1
            - 3 : Tracking is enabled with level 3
                  Share info about app count, app fails, execution time + level 2

        Raises:
            - ConfigurationError : if the value is outside the range
              DISABLED..LEVEL_3
        """
        if not USAGE_TRACKING_DISABLED <= self.config.usage_tracking <= USAGE_TRACKING_LEVEL_3:
            raise ConfigurationError(
                f"Usage Tracking values must be 0, 1, 2, or 3 and not {self.config.usage_tracking}"
            )

        return self.config.usage_tracking

    def construct_start_message(self) -> bytes:
        """Collect preliminary run info at the start of the DFK.

        Level 2 and above add the component report from get_parsl_usage.
        Level 3 additionally records the start time (whole seconds) in
        ``self.start_time`` and includes it in the message.

        Returns :
              - Message dict dumped as json and encoded to bytes with the
                protocol version byte prepended, ready for UDP
        """
        message = {'correlator': self.correlator_uuid,
                   'parsl_v': self.parsl_version,
                   'python_v': self.python_version,
                   'platform.system': platform.system(),
                   # int() so that a True/False config value is sent as 1/0
                   'tracking_level': int(self.tracking_level)}

        if self.project_name:
            message['project_name'] = self.project_name

        if self.tracking_level >= 2:
            message['components'] = get_parsl_usage(self.dfk._config)

        if self.tracking_level == 3:
            self.start_time = int(time.time())
            message['start'] = self.start_time

        logger.debug(f"Usage tracking start message: {message}")

        return self.encode_message(message)

    def construct_end_message(self) -> bytes:
        """Collect the final run information at the time of DFK cleanup.
        This is only called if tracking level is 3.

        If no start time was ever recorded (no start message was constructed
        or sent), the execution time is reported as 0 rather than failing.

        Returns:
             - Message dict dumped as json and encoded to bytes with the
               protocol version byte prepended, ready for UDP
        """
        end_time = int(time.time())

        # Without a recorded start, "end_time - None" would raise TypeError
        # in the middle of DFK cleanup; fall back to a zero-length run.
        start_time = end_time if self.start_time is None else self.start_time

        app_count = self.dfk.task_count

        app_fails = self.dfk.task_state_counts[States.failed] + self.dfk.task_state_counts[States.dep_fail]

        # the DFK is tangled into this code as a god-object, so it is
        # handled separately from the usual traversal code, but presenting
        # the same protocol-level report.
        dfk_component = {'c': type(self.dfk).__module__ + "." + type(self.dfk).__name__,
                         'app_count': app_count,
                         'app_fails': app_fails}

        message = {'correlator': self.correlator_uuid,
                   'end': end_time,
                   'execution_time': end_time - start_time,
                   'components': [dfk_component] + get_parsl_usage(self.dfk._config)}

        if self.project_name:
            message['project_name'] = self.project_name

        logger.debug(f"Usage tracking end message (unencoded): {message}")

        return self.encode_message(message)

    def encode_message(self, obj: object) -> bytes:
        """Serialize ``obj`` as JSON and prefix it with the protocol version.

        Args:
             - obj : JSON-serializable object (the message dict)

        Returns:
             - PROTOCOL_VERSION followed by the encoded JSON text
        """
        return PROTOCOL_VERSION + json.dumps(obj).encode()

    def send_UDP_message(self, message: bytes) -> None:
        """Send UDP message.

        The send happens in a separate process started by udp_messenger;
        that process is remembered so close() can reap it. Any failure to
        start it is logged at debug level and otherwise ignored.
        """
        try:
            proc = udp_messenger(self.domain_name, self.UDP_PORT, self.sock_timeout, message)
            self.procs.append(proc)
        except Exception as e:
            logger.debug("Usage tracking failed: {}".format(e))

    def send_start_message(self) -> None:
        """Send the start message, unless tracking is disabled (level 0)."""
        if self.tracking_level:
            # At level 3 construct_start_message replaces this with the
            # whole-second value that is also put into the message.
            self.start_time = time.time()
            message = self.construct_start_message()
            self.send_UDP_message(message)

    def send_end_message(self) -> None:
        """Send the end message; only done at tracking level 3."""
        if self.tracking_level == 3:
            message = self.construct_end_message()
            self.send_UDP_message(message)

    def close(self, timeout: float = 10.0) -> None:
        """First give each process one timeout period to finish what it is
        doing, then kill it (SIGKILL). There's no softer SIGTERM step,
        because that adds one join period of delay for what is almost
        definitely either: going to behave broadly the same as to SIGKILL,
        or won't respond to SIGTERM.

        Calling close again is a no-op for processes that were already
        handled by an earlier call.

        Args:
             - timeout (float) : Seconds to wait for each process to end
        """
        # Detach the list first: a closed process object raises ValueError
        # from join(), so it must never be visited by a later close() call.
        procs, self.procs = self.procs, []

        for proc in procs:
            logger.debug("Joining usage tracking process %s", proc)
            proc.join(timeout=timeout)
            if proc.is_alive():
                logger.warning("Usage tracking process did not end itself; sending SIGKILL")
                proc.kill()

            proc.close()