#!/usr/bin/env python3
# Owner(s): ["oncall: r2p"]
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import io
import os
import shutil
import sys
import tempfile
import time
import unittest
from concurrent.futures import ALL_COMPLETED, wait
from concurrent.futures.thread import ThreadPoolExecutor
from typing import Dict, Set
from unittest import mock
from torch.distributed.elastic.multiprocessing.tail_log import TailLog
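
# A minimal usage sketch (comments only, not executed), based on the behavior
# exercised by the tests below: TailLog polls each file in ``log_files`` every
# ``interval_sec`` seconds and copies new lines into ``dst``, prefixing each
# line with a "[<name><local_rank>]" header, or with the matching entry of
# ``log_line_prefixes`` when one is given.
#
#   dst = io.StringIO()
#   tail = TailLog("trainer", {0: "/tmp/0_stdout.log"}, dst, interval_sec=0.01).start()
#   ...          # lines written to 0_stdout.log appear in dst as they arrive
#   tail.stop()  # test_tail below relies on stop() draining all pending lines
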
def write(max: int, sleep: float, file: str):
    # writes the numbers [0, max) to ``file``, one number per line,
    # sleeping ``sleep`` seconds between writes
    with open(file, "w") as fp:
        for i in range(max):
            print(i, file=fp, flush=True)
            time.sleep(sleep)
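
# For example (a hedged illustration, not used by the tests):
#
#   write(max=3, sleep=0, file="0_stdout.log")
#
# produces a file whose lines are "0", "1", "2"; flushing after every line
# lets a concurrent TailLog observe each number as soon as it is printed.
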
class TailLogTest(unittest.TestCase):
    def setUp(self):
        self.test_dir = tempfile.mkdtemp(prefix=f"{self.__class__.__name__}_")
        self.threadpool = ThreadPoolExecutor()

    def tearDown(self):
        shutil.rmtree(self.test_dir)

    def test_tail(self):
        """
        write() writes the numbers [0, max) (one number per line) to a log file.
        Runs nprocs such writers, tails the log files into a StringIO,
        and validates that all lines are accounted for.
        """
        nprocs = 32
        max = 1000
        interval_sec = 0.0001

        log_files = {
            local_rank: os.path.join(self.test_dir, f"{local_rank}_stdout.log")
            for local_rank in range(nprocs)
        }

        dst = io.StringIO()
        tail = TailLog(
            name="writer", log_files=log_files, dst=dst, interval_sec=interval_sec
        ).start()
        # sleep here is intentional to ensure that the log tail
        # can gracefully handle and wait for non-existent log files
        time.sleep(interval_sec * 10)

        futs = []
        for local_rank, file in log_files.items():
            f = self.threadpool.submit(
                write, max=max, sleep=interval_sec * local_rank, file=file
            )
            futs.append(f)
        wait(futs, return_when=ALL_COMPLETED)

        self.assertFalse(tail.stopped())
        tail.stop()
        dst.seek(0)

        # each tailed line is expected to look like "[writer<local_rank>]:<num>"
        actual: Dict[str, Set[int]] = {}
        for line in dst.readlines():
            header, num = line.split(":")
            nums = actual.setdefault(header, set())
            nums.add(int(num))

        self.assertEqual(nprocs, len(actual))
        self.assertEqual(
            {f"[writer{i}]": set(range(max)) for i in range(nprocs)}, actual
        )
        self.assertTrue(tail.stopped())

    def test_tail_with_custom_prefix(self):
        """
        Same as test_tail, but passes custom log line prefixes:
        runs nprocs writers, tails the log files into a StringIO,
        and validates that every line carries its configured prefix.
        """
        nprocs = 3
        max = 10
        interval_sec = 0.0001

        log_files = {
            local_rank: os.path.join(self.test_dir, f"{local_rank}_stdout.log")
            for local_rank in range(nprocs)
        }

        dst = io.StringIO()
        log_line_prefixes = {n: f"[worker{n}][{n}]:" for n in range(nprocs)}
        tail = TailLog(
            "writer",
            log_files,
            dst,
            interval_sec=interval_sec,
            log_line_prefixes=log_line_prefixes,
        ).start()
        # sleep here is intentional to ensure that the log tail
        # can gracefully handle and wait for non-existent log files
        time.sleep(interval_sec * 10)

        futs = []
        for local_rank, file in log_files.items():
            f = self.threadpool.submit(
                write, max=max, sleep=interval_sec * local_rank, file=file
            )
            futs.append(f)
        wait(futs, return_when=ALL_COMPLETED)

        self.assertFalse(tail.stopped())
        tail.stop()
        dst.seek(0)

        headers: Set[str] = set()
        for line in dst.readlines():
            header, _ = line.split(":")
            headers.add(header)
        self.assertEqual(nprocs, len(headers))
        for i in range(nprocs):
            self.assertIn(f"[worker{i}][{i}]", headers)
        self.assertTrue(tail.stopped())
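
    # NOTE (inferred from the two tests above, not asserted by TailLog docs):
    # without log_line_prefixes each line gets the default "[<name><local_rank>]"
    # header; with it, the custom prefix is used for the matching local rank.
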
    def test_tail_no_files(self):
        """
        Ensures that the log tail can gracefully handle no log files,
        in which case it does nothing.
        """
        tail = TailLog("writer", log_files={}, dst=sys.stdout).start()
        self.assertFalse(tail.stopped())
        tail.stop()
        self.assertTrue(tail.stopped())

    def test_tail_logfile_never_generates(self):
        """
        Ensures that we properly shut down the threadpool
        even when the logfile is never generated.
        """
        tail = TailLog("writer", log_files={0: "foobar.log"}, dst=sys.stdout).start()
        tail.stop()
        self.assertTrue(tail.stopped())
        # peeks at private state: TailLog's internal executor must be shut down
        self.assertTrue(tail._threadpool._shutdown)

    @mock.patch("torch.distributed.elastic.multiprocessing.tail_log.logger")
    def test_tail_logfile_error_in_tail_fn(self, mock_logger):
        """
        Ensures that when there is an error in the tail_fn (the one that runs in the
        threadpool), it is dealt with and raised properly.
        """
        # try giving tail log a directory (should fail with an IsADirectoryError)
        tail = TailLog("writer", log_files={0: self.test_dir}, dst=sys.stdout).start()
        tail.stop()

        mock_logger.error.assert_called_once()
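

# standard unittest entry point, added so the file can be run directly;
# the upstream test suite may invoke these tests through its own runner
if __name__ == "__main__":
    unittest.main()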