File: test_efa_protocol_selection.py

package info (click to toggle)
libfabric 2.1.0-1.1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 26,108 kB
  • sloc: ansic: 387,262; python: 3,171; sh: 2,555; makefile: 1,313; cpp: 617; perl: 474; ruby: 123; asm: 27
file content (70 lines) | stat: -rw-r--r-- 3,388 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import pytest

from efa.efa_common import has_gdrcopy, has_rdma


# This test skips efa-direct because it does not have the read protocol
# TODO Expand this test to run on all memory types (and rename)
@pytest.mark.serial
@pytest.mark.functional
@pytest.mark.cuda_memory
@pytest.mark.parametrize(
    "fabtest_name,cntrl_env_var",
    [
        ("fi_rdm_tagged_bw", "FI_EFA_INTER_MIN_READ_MESSAGE_SIZE"),
        ("fi_rma_bw", "FI_EFA_INTER_MIN_READ_WRITE_SIZE"),
    ],
)
def test_transfer_with_read_protocol_cuda(cmdline_args, fabtest_name, cntrl_env_var):
    """
    Verify that the read protocol is used for a 1024 byte message when the env variable
    switches are set to force the read protocol at 1000 bytes.

    The fabtest named by ``fabtest_name`` is run once (one iteration, no warmup)
    between CUDA buffers on the "efa" fabric, with ``cntrl_env_var`` set to 1000.
    The server's ``rdma_read_wrs`` and ``rdma_read_bytes`` hardware counters are
    sampled before and after the run and their deltas are checked.

    The test is skipped when:
      - ``cntrl_env_var`` is FI_EFA_INTER_MIN_READ_WRITE_SIZE and the platform
        reports RDMA write support (the variable only affects emulated writes);
      - client and server are the same host;
      - either side lacks hmem support or a CUDA device;
      - the hardware counter cannot be read (helper returned ``None``).
    """
    import copy
    from common import has_cuda, has_hmem_support
    from efa.efa_common import efa_run_client_server_test, efa_retrieve_hw_counter_value

    if cntrl_env_var == "FI_EFA_INTER_MIN_READ_WRITE_SIZE" and has_rdma(cmdline_args, "write"):
        pytest.skip("FI_EFA_INTER_MIN_READ_WRITE_SIZE is only applied to emulated write protocols")

    if cmdline_args.server_id == cmdline_args.client_id:
        pytest.skip("No read for intra-node communication")

    if (not has_hmem_support(cmdline_args, cmdline_args.client_id) or
        not has_hmem_support(cmdline_args, cmdline_args.server_id)):
        pytest.skip("Client and server both need hmem support for RDMA Read test")

    if not has_cuda(cmdline_args.client_id) or not has_cuda(cmdline_args.server_id):
        pytest.skip("Client and server both need a Cuda device")

    # The message must be larger than the threshold so the read protocol is selected.
    message_size = 1024
    read_protocol_threshold = 1000

    # Work on a copy so the extra environment settings do not leak into the
    # cmdline_args object passed in by the caller.
    cmdline_args_copy = copy.copy(cmdline_args)
    cmdline_args_copy.append_environ("FI_EFA_USE_DEVICE_RDMA=1")
    cmdline_args_copy.append_environ(f"{cntrl_env_var}={read_protocol_threshold}")
    cmdline_args_copy.append_environ("FI_EFA_RUNT_SIZE=0")

    # wrs stands for work requests
    server_read_wrs_before_test = efa_retrieve_hw_counter_value(cmdline_args.server_id, "rdma_read_wrs")
    if server_read_wrs_before_test is None:
        # pytest.skip() raises, so no explicit return is needed after it.
        pytest.skip("No HW counter support")
    server_read_bytes_before_test = efa_retrieve_hw_counter_value(cmdline_args.server_id, "rdma_read_bytes")

    efa_run_client_server_test(cmdline_args_copy,
                               fabtest_name,
                               iteration_type="1",
                               completion_semantic="transmit_complete",
                               memory_type="cuda_to_cuda",
                               message_size=message_size,
                               warmup_iteration_type="0",
                               fabric="efa")

    server_read_wrs_after_test = efa_retrieve_hw_counter_value(cmdline_args.server_id, "rdma_read_wrs")
    server_read_bytes_after_test = efa_retrieve_hw_counter_value(cmdline_args.server_id, "rdma_read_bytes")

    # Counter deltas attributable to this single transfer.
    bytes_read = server_read_bytes_after_test - server_read_bytes_before_test
    read_wrs_posted = server_read_wrs_after_test - server_read_wrs_before_test

    # Check that the READ protocol was the only protocol used.
    # The hw counter should record:
    # - The exact message size if gdrcopy is enabled, or
    # - More if gdrcopy is disabled and localread is used.
    if has_gdrcopy(cmdline_args.server_id):
        assert bytes_read == message_size
        assert read_wrs_posted == 1
    else:
        assert bytes_read > message_size
        assert read_wrs_posted > 1