File: _service_manager.py

package info (click to toggle)
python-cotyledon 2.2.0-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 300 kB
  • sloc: python: 1,336; makefile: 16
file content (581 lines) | stat: -rw-r--r-- 20,615 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

from __future__ import annotations

import collections
import concurrent.futures
import contextlib
import logging
import multiprocessing
import os
import signal
import socket
import sys
import threading
import time
import typing
import uuid

from cotyledon import _service_worker
from cotyledon import _utils
from cotyledon import types as t


if typing.TYPE_CHECKING:
    import types

    from cotyledon import _service

# Module-level logger for the master-process service manager.
LOG = logging.getLogger(__name__)


class WorkerInfo(typing.NamedTuple):
    """Bookkeeping record for one spawned worker process."""

    # Index of the worker within its service (0 .. ServiceConfig.workers - 1).
    worker_id: t.WorkerId
    # Event associated with the child at spawn time; the master waits on it
    # during graceful shutdown. Presumably set by the child once it has
    # started — see _service_worker.ServiceWorker.create_and_wait.
    started_event: multiprocessing.synchronize.Event

# Per-service bookkeeping: maps each live child Process to its WorkerInfo.
ProcessInfo = typing.NewType("ProcessInfo", dict[multiprocessing.Process, WorkerInfo])
# Maps a service id to the set of processes currently running that service.
RunningServices = typing.NewType("RunningServices", dict[t.ServiceId, ProcessInfo])

# Hook called when the master receives SIGTERM.
OnTerminateHook: typing.TypeAlias = typing.Callable[[], None]
# Hook called when the master receives SIGHUP.
OnReloadHook: typing.TypeAlias = typing.Callable[[], None]
# Hook called when a child dies: (service_id, worker_id, exit_code) -> None.
OnDeadWorkerHook: typing.TypeAlias = typing.Callable[
    [t.ServiceId, t.WorkerId, int],
    None,
]


class Hooks(typing.TypedDict):
    """Registry of user callbacks, keyed by event name.

    Populated by :py:meth:`ServiceManager.register_hooks`; each key holds
    the list of hooks to run, in registration order, for that event.
    """

    terminate: list[OnTerminateHook]
    reload: list[OnReloadHook]
    new_worker: list[_service_worker.OnNewWorkerHook]
    dead_worker: list[OnDeadWorkerHook]


class ServiceManager(_utils.SignalManager):
    """Manage lifetimes of services

    :py:class:`ServiceManager` acts as a master process that controls the
    lifetime of children processes and restarts them if they die unexpectedly.
    It also propagates some signals (SIGTERM, SIGALRM, SIGINT and SIGHUP) to
    them.

    Each child process (:py:class:`ServiceWorker`) runs an instance of
    a :py:class:`Service`.

    An application must create only one :py:class:`ServiceManager` class and
    use :py:meth:`ServiceManager.run()` as main loop of the application.



    Usage::

        class MyService(Service):
            def __init__(self, worker_id, myconf):
                super(MyService, self).__init__(worker_id)
                preparing_my_job(myconf)
                self.running = True

            def run(self):
                while self.running:
                    do_my_job()

            def terminate(self):
                self.running = False
                gracefully_stop_my_jobs()

            def reload(self):
                restart_my_job()


        class MyManager(ServiceManager):
            def __init__(self):
                super(MyManager, self).__init__()
                self.register_hooks(on_reload=self.reload)

                conf = {'foobar': 2}
                self.service_id = self.add(MyService, 5, conf)

            def reload(self):
                self.reconfigure(self.service_id, 10)

        MyManager().run()

    This will create 5 children processes running the service MyService.

    """

    # Process-wide guard: the manager installs global signal handlers and
    # owns the process group, so only one instance may ever be created.
    _process_runner_already_created = False

    def __init__(
        self,
        wait_interval: float = 0.01,
        graceful_shutdown_timeout: int = 60,
        mp_context: multiprocessing.context.BaseContext | None = None,
    ) -> None:
        """Creates the ServiceManager object

        :param wait_interval: time between each new process spawn
        :type wait_interval: float
        :param graceful_shutdown_timeout: timeout for graceful shutdown
        :type graceful_shutdown_timeout: int
        :param mp_context: multiprocessing context to use (defaults to 'fork')
        :type mp_context: multiprocessing.context.BaseContext | None
        :raises RuntimeError: if a ServiceManager was already created in this
                              process

        By default, workers are spawned using the 'fork' start method.
        To use a different start method (e.g., 'spawn' for macOS compatibility),
        pass the context via the mp_context parameter.

        Example::

            import multiprocessing
            # Use 'spawn' start method (useful on macOS)
            manager = ServiceManager(mp_context=multiprocessing.get_context('spawn'))
            manager.add(MyService, workers=5)
            manager.run()

        """

        if self._process_runner_already_created:
            msg = "Only one instance of ServiceManager per application is allowed"
            raise RuntimeError(msg)
        ServiceManager._process_runner_already_created = True
        super().__init__()

        # We use OrderedDict to start services in adding order
        self._services: dict[
            t.ServiceId,
            _service_worker.ServiceConfig[typing.Any, typing.Any],
        ] = collections.OrderedDict()
        self._running_services: RunningServices = collections.defaultdict(dict)  # type: ignore[assignment]
        # Timestamps of recent forks, used to throttle respawn storms
        # (see _slowdown_respawn_if_needed).
        self._forktimes: list[float] = []
        self._graceful_shutdown_timeout: int = graceful_shutdown_timeout
        self._wait_interval: float = wait_interval

        # Set once shutdown begins; stops the child supervisor loop.
        self._dead = threading.Event()
        # NOTE(sileht): Set it on startup, so first iteration
        # will spawn initial workers
        self._got_sig_chld = threading.Event()
        self._got_sig_chld.set()

        self._child_supervisor: threading.Thread | None = None

        self._hooks: Hooks = {
            "terminate": [],
            "reload": [],
            "new_worker": [],
            "dead_worker": [],
        }

        _utils.set_process_title(
            "{}: master process [{}]".format(
                _utils.get_process_name(),
                " ".join(sys.argv),
            ),
        )

        # Try to create a session id if possible
        with contextlib.suppress(OSError, AttributeError):
            os.setsid()

        # Default to multiprocessing (fork-compatible) if not provided
        self.mp_context: multiprocessing.context.BaseContext = (
            mp_context or multiprocessing.get_context()
        )

        # Pipe handed to every child at spawn time; presumably lets children
        # detect the master's death — the mechanism lives in _service_worker.
        self._death_detection_pipe = self.mp_context.Pipe(duplex=False)

        if os.name == "posix":
            signal.signal(signal.SIGCHLD, self._signal_catcher)

    def register_hooks(
        self,
        on_terminate: OnTerminateHook | None = None,
        on_reload: OnReloadHook | None = None,
        on_new_worker: _service_worker.OnNewWorkerHook | None = None,
        on_dead_worker: OnDeadWorkerHook | None = None,
    ) -> None:
        """Register hook methods

        This can be called multiple times to add more hooks; hooks are
        executed in added order. If a hook raises an exception, subsequent
        hooks will not be executed.

        :param on_terminate: method called on SIGTERM
        :type on_terminate: callable()
        :param on_reload: method called on SIGHUP
        :type on_reload: callable()
        :param on_new_worker: method called in the child process when this one
                              is ready
        :type on_new_worker: callable(service_id, worker_id, service_obj)
        :param on_dead_worker: method called when a child died
        :type on_dead_worker: callable(service_id, worker_id, exit_code)

        If Windows support is planned, hook callables must be picklable.
        See the CPython multiprocessing module documentation for more detail.
        """

        if on_terminate is not None:
            _utils.check_callable(on_terminate, "on_terminate")
            self._hooks["terminate"].append(on_terminate)
        if on_reload is not None:
            _utils.check_callable(on_reload, "on_reload")
            self._hooks["reload"].append(on_reload)
        if on_new_worker is not None:
            _utils.check_callable(on_new_worker, "on_new_worker")
            self._hooks["new_worker"].append(on_new_worker)
        if on_dead_worker is not None:
            _utils.check_callable(on_dead_worker, "on_dead_worker")
            self._hooks["dead_worker"].append(on_dead_worker)

    @typing.overload
    def _run_hooks(
        self,
        name: typing.Literal["dead_worker"],
        service_id: t.ServiceId,
        worker_id: t.WorkerId,
        exit_code: int,
    ) -> None: ...

    @typing.overload
    def _run_hooks(
        self,
        name: typing.Literal["new_worker"],
        service_id: t.ServiceId,
        worker_id: t.WorkerId,
        service: _service.Service,
    ) -> None: ...

    @typing.overload
    def _run_hooks(
        self,
        name: typing.Literal["reload"],
    ) -> None: ...

    @typing.overload
    def _run_hooks(
        self,
        name: typing.Literal["terminate"],
    ) -> None: ...

    def _run_hooks(
        self,
        name: typing.Literal["terminate", "reload", "new_worker", "dead_worker"],
        *args: typing.Any,
        **kwargs: typing.Any,
    ) -> None:
        """Run all hooks registered for *name*, in registration order."""
        _utils.run_hooks(name, self._hooks[name], *args, **kwargs)

    def add(
        self,
        service: type[_service.Service],
        workers: int = 1,
        args: typing.Any = None,  # noqa: ANN401
        kwargs: typing.Any = None,  # noqa: ANN401
    ) -> t.ServiceId:
        """Add a new service to the ServiceManager

        :param service: callable that return an instance of :py:class:`Service`
        :type service: callable
        :param workers: number of processes/workers for this service
        :type workers: int
        :param args: additional positional arguments for this service
        :type args: tuple
        :param kwargs: additional keyword arguments for this service
        :type kwargs: dict

        :return: a service id
        :rtype: uuid.uuid4
        """
        _utils.check_callable(service, "service")
        _utils.check_workers(workers, 1)
        service_id = t.ServiceId(uuid.uuid4())
        self._services[service_id] = _service_worker.ServiceConfig(
            service_id,
            service,
            workers,
            args,
            kwargs,
        )
        return service_id

    def reconfigure(self, service_id: t.ServiceId, workers: int) -> None:
        """Reconfigure a service registered in ServiceManager

        :param service_id: the service id
        :type service_id: uuid.uuid4
        :param workers: number of processes/workers for this service
        :type workers: int
        :raises: ValueError
        """
        try:
            sc = self._services[service_id]
        except KeyError:
            msg = f"{service_id} service id doesn't exist"
            raise ValueError(msg) from None
        else:
            # NOTE(review): minimum=(1 - sc.workers) presumably allows the
            # pool to shrink relative to its current size — confirm against
            # _utils.check_workers.
            _utils.check_workers(workers, minimum=(1 - sc.workers))
            sc.workers = workers
            # Reset forktimes to respawn services quickly
            self._forktimes = []

    def run(self) -> None:
        """Start and supervise services workers

        This method will start and supervise all children processes
        until the master process asked to shutdown by a SIGTERM.

        All spawned processes are part of the same unix process group.
        """

        self._systemd_notify_once()
        # The supervisor thread spawns/respawns workers; this thread only
        # waits for signals.
        self._child_supervisor = _utils.spawn(
            self._child_supervisor_thread,
        )
        self._wait_forever()

    def _child_supervisor_thread(self) -> None:
        """Supervisor loop: respawn dead workers and adjust worker counts.

        Woken by SIGCHLD (via ``_got_sig_chld``) and runs until shutdown
        sets ``_dead``.
        """
        while not self._dead.is_set():
            self._got_sig_chld.wait()
            self._got_sig_chld.clear()

            # Drain all dead children, restarting each one as we go.
            info = self._get_last_worker_died()
            while info is not None:
                if self._dead.is_set():
                    return
                service_id, worker_id = info
                self._start_worker(service_id, worker_id)
                info = self._get_last_worker_died()

            # Apply any reconfigure() changes to the worker counts.
            self._adjust_workers()

    def _on_signal_received(self, sig: int) -> None:
        """Dispatch a signal caught by the SignalManager base class."""
        if sig == _utils.SIGALRM:
            self._alarm()
        elif sig == signal.SIGINT:
            self._fast_exit()
        elif sig == signal.SIGTERM:
            self._shutdown()
        elif sig == _utils.SIGHUP:
            self._reload()
        elif sig == _utils.SIGCHLD:
            self._got_sig_chld.set()
        else:
            LOG.debug("unhandled signal %s", sig)

    def _alarm(self) -> None:
        """Force exit: the graceful shutdown timeout has expired."""
        self._fast_exit(
            reason="Graceful shutdown timeout exceeded, "
            "instantaneous exiting of master process",
        )

    def _reload(self) -> None:
        """reload all children

        posix only
        """
        self._run_hooks("reload")

        # Reset forktimes to respawn services quickly
        self._forktimes = []
        # Ignore SIGHUP while broadcasting it to the process group so the
        # master doesn't re-enter this handler for its own signal.
        signal.signal(signal.SIGHUP, signal.SIG_IGN)
        os.killpg(0, signal.SIGHUP)
        signal.signal(signal.SIGHUP, self._signal_catcher)

    def shutdown(self) -> None:
        """Request a graceful shutdown and block until it has started."""
        LOG.info("Manager shutdown requested")
        # Deliver SIGTERM to ourselves so shutdown goes through the normal
        # signal path (_shutdown).
        os.kill(os.getpid(), signal.SIGTERM)
        self._dead.wait()

    def _shutdown(self) -> None:
        """Gracefully terminate all children, then exit the master process."""
        LOG.info("Caught SIGTERM signal, graceful exiting of master process")
        signal.signal(signal.SIGTERM, signal.SIG_IGN)

        # Arm a hard deadline: if graceful shutdown takes too long, _alarm
        # will force-exit the whole process group.
        if self._graceful_shutdown_timeout > 0:
            if os.name == "posix":
                signal.alarm(self._graceful_shutdown_timeout)
            else:
                threading.Timer(self._graceful_shutdown_timeout, self._alarm).start()

        # NOTE(sileht): Stop the child supervisor
        self._dead.set()
        self._got_sig_chld.set()

        if self._child_supervisor is not None:
            self._child_supervisor.join()

        # NOTE(sileht): During startup if we receive SIGTERM, python
        # multiprocess may fork the process after we send the killpg(0)
        # To workaround the issue we sleep a bit, so multiprocess can finish
        # its work.
        with concurrent.futures.ThreadPoolExecutor() as pool:
            futures = [
                pool.submit(worker_info.started_event.wait, timeout=1)
                for processes in self._running_services.values()
                for worker_info in processes.values()
            ]
            concurrent.futures.wait(futures)

        self._run_hooks("terminate")

        LOG.debug("Killing services with signal SIGTERM")
        # NOTE(sileht): we don't have killpg so we
        # kill all known processes instead
        # NOTE(sileht): We should use CTRL_BREAK_EVENT on windows
        # when CREATE_NEW_PROCESS_GROUP will be set on child
        # process
        for process in self._child_processes:
            process.terminate()

        LOG.debug("Waiting services to terminate")
        for process in self._child_processes:
            process.join()
            process.close()

        LOG.debug("Shutdown finish")
        sys.exit(0)

    @property
    def _child_processes(self) -> list[multiprocessing.Process]:
        """Flat list of all currently-tracked child processes."""
        return [
            process
            for processes in self._running_services.values()
            for process in processes
        ]

    def _adjust_workers(self) -> None:
        """Start or stop workers so each service matches its configured count."""
        for service_id, conf in self._services.items():
            running_workers = len(self._running_services[service_id])
            if running_workers < conf.workers:
                for worker_id in range(running_workers, conf.workers):
                    self._start_worker(service_id, t.WorkerId(worker_id))
            elif running_workers > conf.workers:
                for worker_id in range(conf.workers, running_workers):
                    self._stop_worker(service_id, t.WorkerId(worker_id))

    def _get_last_worker_died(self) -> tuple[t.ServiceId, t.WorkerId] | None:
        """Return the last died worker information or None

        Runs the ``dead_worker`` hooks for the reaped child and removes it
        from the running-services bookkeeping.

        :raises RuntimeError: if a dead process has no exit code
        """
        for service_id in list(self._running_services.keys()):
            # We copy the list to clean the original one
            processes = list(self._running_services[service_id].items())
            for process, worker_info in processes:
                if not process.is_alive():
                    if process.exitcode is None:
                        raise RuntimeError("Dead process without exitcode")  # noqa: TRY003, EM101
                    self._run_hooks(
                        "dead_worker",
                        service_id,
                        worker_info.worker_id,
                        process.exitcode,
                    )
                    # A negative exit code means the child was killed by a
                    # signal (multiprocessing convention).
                    if process.exitcode < 0:
                        sig = _utils.signal_to_name(process.exitcode)
                        LOG.info(
                            "Child %(pid)d killed by signal %(sig)s",
                            {"pid": process.pid, "sig": sig},
                        )
                    else:
                        LOG.info(
                            "Child %(pid)d exited with status %(code)d",
                            {"pid": process.pid, "code": process.exitcode},
                        )
                    del self._running_services[service_id][process]
                    return service_id, worker_info.worker_id
        return None

    @staticmethod
    def _fast_exit(
        signo: int | None = None,
        frame: types.FrameType | None = None,
        reason: str = "Caught SIGINT signal, instantaneous exiting",
    ) -> None:
        """Kill the whole process group and exit immediately (no cleanup)."""
        if os.name == "posix":
            signal.signal(signal.SIGINT, signal.SIG_IGN)
            signal.signal(signal.SIGALRM, signal.SIG_IGN)
            LOG.info(reason)
            os.killpg(0, signal.SIGINT)
            # Wait a bit as os._exit(1) will break the child pipe and the child
            # will end with SIGTERM instead of SIGINT
            time.sleep(0.1)
        else:
            # NOTE(sileht): On windows killing the master process
            # with SIGINT kill automatically children
            LOG.info(reason)
        os._exit(1)

    def _slowdown_respawn_if_needed(self) -> None:
        # Limit ourselves to one process a second (over the period of
        # number of workers * 1 second). This will allow workers to
        # start up quickly but ensure we don't fork off children that
        # die instantly too quickly.
        expected_children = sum(s.workers for s in self._services.values())
        if len(self._forktimes) > expected_children:
            if time.time() - self._forktimes[0] < expected_children:
                LOG.info("Forking too fast, sleeping")
                time.sleep(5)
            self._forktimes.pop(0)
        else:
            time.sleep(self._wait_interval)
        self._forktimes.append(time.time())

    def _start_worker(self, service_id: t.ServiceId, worker_id: t.WorkerId) -> None:
        """Spawn a new child process running the given worker of a service."""
        self._slowdown_respawn_if_needed()

        started_event = self.mp_context.Event()

        # Create and run a new service
        p = _utils.spawn_process(
            _service_worker.ServiceWorker.create_and_wait,
            self.mp_context,
            started_event,
            self._services[service_id],
            service_id,
            worker_id,
            self._death_detection_pipe,
            self._hooks["new_worker"],
            self._graceful_shutdown_timeout,
        )

        self._running_services[service_id][p] = WorkerInfo(worker_id, started_event)

    def _stop_worker(self, service_id: t.ServiceId, worker_id: t.WorkerId) -> None:
        """Terminate the process running the given worker of a service."""
        for process, worker_info in self._running_services[service_id].items():
            if worker_info.worker_id == worker_id:
                # NOTE(sileht): We should use CTRL_BREAK_EVENT on windows
                # when CREATE_NEW_PROCESS_GROUP will be set on child process
                process.terminate()

    @staticmethod
    def _systemd_notify_once() -> None:
        """Send notification once to Systemd that service is ready.

        Systemd sets NOTIFY_SOCKET environment variable with the name of the
        socket listening for notifications from services.
        This method removes the NOTIFY_SOCKET environment variable to ensure
        notification is sent only once.
        """

        notify_socket = os.getenv("NOTIFY_SOCKET")
        if notify_socket:
            if notify_socket.startswith("@"):
                # abstract namespace socket
                notify_socket = f"\0{notify_socket[1:]}"
            sock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
            with contextlib.closing(sock):
                try:
                    sock.connect(notify_socket)
                    sock.sendall(b"READY=1")
                    del os.environ["NOTIFY_SOCKET"]
                except OSError:
                    LOG.debug("Systemd notification failed", exc_info=True)