1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
|
# Owner(s): ["oncall: r2p"]
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import subprocess
from base64 import b64encode
from typing import ClassVar, cast
from unittest import TestCase
from etcd import EtcdKeyNotFound # type: ignore[import]
from torch.distributed.elastic.rendezvous import RendezvousConnectionError, RendezvousParameters
from torch.distributed.elastic.rendezvous.etcd_rendezvous_backend import (
EtcdRendezvousBackend,
create_backend,
)
from torch.distributed.elastic.rendezvous.etcd_server import EtcdServer
from torch.distributed.elastic.rendezvous.etcd_store import EtcdStore
from rendezvous_backend_test import RendezvousBackendTestMixin
class EtcdRendezvousBackendTest(TestCase, RendezvousBackendTestMixin):
    """Exercise ``EtcdRendezvousBackend`` through ``RendezvousBackendTestMixin``.

    A single ``EtcdServer`` is started for the whole class. Every test gets a
    fresh client and a backend built with run id ``dummy_run_id`` and prefix
    ``/dummy_prefix``. The test methods themselves come from the mixin, which
    is defined in another module.
    """

    # Server instance shared by all tests in this class.
    _server: ClassVar[EtcdServer]

    @classmethod
    def setUpClass(cls) -> None:
        """Start the shared etcd server, discarding its stderr output."""
        server = EtcdServer()
        cls._server = server
        server.start(stderr=subprocess.DEVNULL)

    @classmethod
    def tearDownClass(cls) -> None:
        """Stop the shared etcd server."""
        cls._server.stop()

    def setUp(self) -> None:
        """Create a client, clear ``/dummy_prefix``, and build the backend."""
        etcd_client = self._client = self._server.get_client()

        # Make sure we have a clean slate. A missing key just means there is
        # nothing left over to remove, so that error is deliberately ignored.
        try:
            etcd_client.delete("/dummy_prefix", recursive=True, dir=True)
        except EtcdKeyNotFound:
            pass

        self._backend = EtcdRendezvousBackend(
            etcd_client,
            "dummy_run_id",
            "/dummy_prefix",
        )

    def _corrupt_state(self) -> None:
        """Write the string ``non_base64`` to ``/dummy_prefix/dummy_run_id``.

        NOTE(review): presumably this is the key the backend stores its state
        under and the hook is invoked by the mixin - confirm against
        ``RendezvousBackendTestMixin``.
        """
        self._client.write("/dummy_prefix/dummy_run_id", "non_base64")
class CreateBackendTest(TestCase):
    """Tests for ``create_backend`` run against a locally started etcd server.

    A single ``EtcdServer`` is started for the whole class. ``setUp`` builds a
    fresh ``RendezvousParameters`` for each test, which individual tests then
    tweak before calling ``create_backend``.
    """

    # Server instance shared by all tests in this class.
    _server: ClassVar[EtcdServer]

    @classmethod
    def setUpClass(cls) -> None:
        """Start the shared etcd server, discarding its stderr output."""
        cls._server = EtcdServer()
        cls._server.start(stderr=subprocess.DEVNULL)

    @classmethod
    def tearDownClass(cls) -> None:
        """Stop the shared etcd server."""
        cls._server.stop()

    def setUp(self) -> None:
        """Build the default parameters and the read timeout they should yield."""
        self._params = RendezvousParameters(
            backend="dummy_backend",
            endpoint=self._server.get_endpoint(),
            run_id="dummy_run_id",
            min_nodes=1,
            max_nodes=1,
            # NOTE(review): mixed case looks intentional, presumably to check
            # that the protocol is handled case-insensitively - confirm.
            protocol="hTTp",
            read_timeout="10",
        )

        # Tests that change ``read_timeout`` update this expectation as well.
        self._expected_read_timeout = 10

    def test_create_backend_returns_backend(self) -> None:
        # Happy path: the returned backend and store are of the expected kind
        # and both write to the etcd server under the expected keys.
        backend, store = create_backend(self._params)

        self.assertEqual(backend.name, "etcd-v2")

        self.assertIsInstance(store, EtcdStore)

        etcd_store = cast(EtcdStore, store)

        self.assertEqual(etcd_store.client.read_timeout, self._expected_read_timeout)  # type: ignore[attr-defined]

        # Read back through an independent client to observe what was stored.
        client = self._server.get_client()

        backend.set_state(b"dummy_state")

        result = client.get("/torch/elastic/rendezvous/" + self._params.run_id)

        self.assertEqual(result.value, b64encode(b"dummy_state").decode())
        # NOTE(review): 7200 is presumably the backend's default TTL - confirm.
        self.assertLessEqual(result.ttl, 7200)

        store.set("dummy_key", "dummy_value")

        result = client.get("/torch/elastic/store/" + b64encode(b"dummy_key").decode())

        self.assertEqual(result.value, b64encode(b"dummy_value").decode())

    def test_create_backend_returns_backend_if_protocol_is_not_specified(self) -> None:
        # Without an explicit protocol the happy path must still succeed.
        del self._params.config["protocol"]

        self.test_create_backend_returns_backend()

    def test_create_backend_returns_backend_if_read_timeout_is_not_specified(self) -> None:
        # Without an explicit read timeout the client is expected to use 60.
        del self._params.config["read_timeout"]

        self._expected_read_timeout = 60

        self.test_create_backend_returns_backend()

    def test_create_backend_raises_error_if_etcd_is_unreachable(self) -> None:
        self._params.endpoint = "dummy:1234"

        # The periods are escaped so they match only a literal "." and not
        # any character.
        with self.assertRaisesRegex(
            RendezvousConnectionError,
            r"^The connection to etcd has failed\. See inner exception for details\.$",
        ):
            create_backend(self._params)

    def test_create_backend_raises_error_if_protocol_is_invalid(self) -> None:
        self._params.config["protocol"] = "dummy"

        with self.assertRaisesRegex(ValueError, r"^The protocol must be HTTP or HTTPS\.$"):
            create_backend(self._params)

    def test_create_backend_raises_error_if_read_timeout_is_invalid(self) -> None:
        # Zero and negative values must both be rejected.
        for read_timeout in ["0", "-10"]:
            with self.subTest(read_timeout=read_timeout):
                self._params.config["read_timeout"] = read_timeout

                with self.assertRaisesRegex(
                    ValueError, r"^The read timeout must be a positive integer\.$"
                ):
                    create_backend(self._params)
|