1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
|
# Owner(s): ["oncall: r2p"]
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import subprocess
import threading
import time
from base64 import b64encode
from typing import cast, ClassVar
from unittest import TestCase
from etcd import EtcdKeyNotFound # type: ignore[import]
from rendezvous_backend_test import RendezvousBackendTestMixin
from torch.distributed.elastic.rendezvous import (
RendezvousConnectionError,
RendezvousParameters,
)
from torch.distributed.elastic.rendezvous.api import RendezvousStoreInfo
from torch.distributed.elastic.rendezvous.etcd_rendezvous_backend import (
create_backend,
EtcdRendezvousBackend,
)
from torch.distributed.elastic.rendezvous.etcd_server import EtcdServer
from torch.distributed.elastic.rendezvous.etcd_store import EtcdStore
class EtcdRendezvousBackendTest(TestCase, RendezvousBackendTestMixin):
    """Exercise ``EtcdRendezvousBackend`` against a live etcd server.

    Combines ``TestCase`` with ``RendezvousBackendTestMixin``; this class
    only provides the fixtures: a class-wide ``EtcdServer``, a per-test
    client and backend, and the ``_corrupt_state`` helper.
    """

    # One etcd server instance shared by every test in the class.
    _server: ClassVar[EtcdServer]

    @classmethod
    def setUpClass(cls) -> None:
        """Launch the shared etcd server, discarding its stderr output."""
        server = EtcdServer()
        cls._server = server
        server.start(stderr=subprocess.DEVNULL)

    @classmethod
    def tearDownClass(cls) -> None:
        """Shut the shared etcd server down."""
        cls._server.stop()

    def setUp(self) -> None:
        """Create a fresh client and backend on top of an empty key prefix."""
        client = self._server.get_client()
        self._client = client

        # Remove whatever an earlier test left under the prefix. A missing
        # key simply means there is nothing to clean up.
        try:
            client.delete("/dummy_prefix", recursive=True, dir=True)
        except EtcdKeyNotFound:
            pass

        self._backend = EtcdRendezvousBackend(client, "dummy_run_id", "/dummy_prefix")

    def _corrupt_state(self) -> None:
        """Overwrite the run's key with the literal string ``non_base64``."""
        # NOTE(review): presumably invoked by the mixin to test how the
        # backend reacts to an undecodable state - confirm against the mixin.
        self._client.write("/dummy_prefix/dummy_run_id", "non_base64")
class CreateBackendTest(TestCase):
    """Tests for ``create_backend`` run against a live, class-wide etcd server."""

    # One etcd server instance shared by every test in the class.
    _server: ClassVar[EtcdServer]

    @classmethod
    def setUpClass(cls) -> None:
        """Start the shared etcd server, discarding its stderr output."""
        cls._server = EtcdServer()
        cls._server.start(stderr=subprocess.DEVNULL)

    @classmethod
    def tearDownClass(cls) -> None:
        """Stop the shared etcd server."""
        cls._server.stop()

    def setUp(self) -> None:
        """Build the default rendezvous parameters used by every test."""
        self._params = RendezvousParameters(
            backend="dummy_backend",
            endpoint=self._server.get_endpoint(),
            run_id="dummy_run_id",
            min_nodes=1,
            max_nodes=1,
            # NOTE(review): the mixed case looks deliberate, presumably to
            # check that the protocol is matched case-insensitively - confirm.
            protocol="hTTp",
            read_timeout="10",
        )

        # Integer form of the ``read_timeout`` string configured above.
        self._expected_read_timeout = 10

    def test_create_backend_returns_backend(self) -> None:
        """Check the backend/store returned by ``create_backend`` end to end.

        Verifies the backend name, the store type, the client's read timeout,
        and that both the backend state and a store entry land in etcd under
        the expected keys as base64-encoded values.
        """
        backend, store = create_backend(self._params)

        self.assertEqual(backend.name, "etcd-v2")

        self.assertIsInstance(store, EtcdStore)

        etcd_store = cast(EtcdStore, store)

        self.assertEqual(etcd_store.client.read_timeout, self._expected_read_timeout)  # type: ignore[attr-defined]

        # Read back through an independent client to see what was persisted.
        client = self._server.get_client()

        backend.set_state(b"dummy_state")

        result = client.get("/torch/elastic/rendezvous/" + self._params.run_id)

        self.assertEqual(result.value, b64encode(b"dummy_state").decode())
        self.assertLessEqual(result.ttl, 7200)

        store.set("dummy_key", "dummy_value")

        result = client.get("/torch/elastic/store/" + b64encode(b"dummy_key").decode())

        self.assertEqual(result.value, b64encode(b"dummy_value").decode())

    def test_create_backend_returns_backend_if_protocol_is_not_specified(self) -> None:
        """``create_backend`` still works when no ``protocol`` is configured."""
        del self._params.config["protocol"]

        self.test_create_backend_returns_backend()

    def test_create_backend_returns_backend_if_read_timeout_is_not_specified(
        self,
    ) -> None:
        """Without ``read_timeout`` the client is expected to use 60."""
        del self._params.config["read_timeout"]

        self._expected_read_timeout = 60

        self.test_create_backend_returns_backend()

    def test_create_backend_raises_error_if_etcd_is_unreachable(self) -> None:
        """An unreachable endpoint surfaces as ``RendezvousConnectionError``."""
        self._params.endpoint = "dummy:1234"

        with self.assertRaisesRegex(
            RendezvousConnectionError,
            r"^The connection to etcd has failed. See inner exception for details.$",
        ):
            create_backend(self._params)

    def test_create_backend_raises_error_if_protocol_is_invalid(self) -> None:
        """An unknown protocol is rejected with ``ValueError``."""
        self._params.config["protocol"] = "dummy"

        with self.assertRaisesRegex(
            ValueError, r"^The protocol must be HTTP or HTTPS.$"
        ):
            create_backend(self._params)

    def test_create_backend_raises_error_if_read_timeout_is_invalid(self) -> None:
        """Zero and negative read timeouts are rejected with ``ValueError``."""
        for read_timeout in ["0", "-10"]:
            with self.subTest(read_timeout=read_timeout):
                self._params.config["read_timeout"] = read_timeout

                with self.assertRaisesRegex(
                    ValueError, r"^The read timeout must be a positive integer.$"
                ):
                    create_backend(self._params)

    def test_get_waits_for_store_prefix_key(self) -> None:
        """``store.get`` returns only once another thread has set the key.

        One thread calls ``get`` immediately while a second thread sleeps and
        then calls ``set``; the getter must observe the value and must have
        taken at least as long as the setter's delay.
        """
        # Seconds the setter sleeps before writing; also the lower bound
        # asserted on how long the getter took.
        set_delay = 2

        def store_get(store, result_dict) -> None:
            # Record both the fetched value and how long the call took.
            start_time = time.perf_counter()
            result_dict["get_result"] = store.get(
                RendezvousStoreInfo.MASTER_ADDR_KEY
            ).decode(encoding="UTF-8")
            end_time = time.perf_counter()
            result_dict["time"] = end_time - start_time

        def store_set(store) -> None:
            time.sleep(set_delay)
            store.set(RendezvousStoreInfo.MASTER_ADDR_KEY, b"foo")

        backend, store = create_backend(self._params)
        backend.set_state(b"dummy_state")

        result_dict = {}

        get_thread = threading.Thread(target=store_get, args=(store, result_dict))
        set_thread = threading.Thread(target=store_set, args=(store,))

        get_thread.start()
        set_thread.start()

        get_thread.join()
        set_thread.join()

        # Use TestCase assertions rather than bare ``assert``: they are not
        # stripped under ``python -O`` and report both operands on failure.
        self.assertEqual(result_dict["get_result"], "foo")
        self.assertGreaterEqual(result_dict["time"], set_delay)
|