File: btl_usnic_hwloc.c

package info (click to toggle)
openmpi 5.0.8-4
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 201,684 kB
  • sloc: ansic: 613,078; makefile: 42,353; sh: 11,194; javascript: 9,244; f90: 7,052; java: 6,404; perl: 5,179; python: 1,859; lex: 740; fortran: 61; cpp: 20; tcl: 12
file content (269 lines) | stat: -rw-r--r-- 7,941 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
/*
 * Copyright (c) 2013-2019 Cisco Systems, Inc.  All rights reserved
 * Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "opal_config.h"

#include "opal/constants.h"
#include "opal/mca/hwloc/base/base.h"

#include "opal/mca/btl/base/base.h"

#include "btl_usnic_hwloc.h"

/*
 * Local variables
 */
/* NUMA node object this process is bound to; NULL until
   find_my_numa_node() succeeds. */
static hwloc_obj_t my_numa_node = NULL;
/* Number of NUMA node objects in the topology; filled in by
   find_my_numa_node() at the same time as my_numa_node. */
static int num_numa_nodes = 0;
/* NUMA distance matrix obtained by get_distance_matrix(); NULL until
   then. */
static struct hwloc_distances_s *matrix = NULL;
#if HWLOC_API_VERSION >= 0x20000
/* In/out count handed to hwloc_distances_get_by_type(): the number of
   matrix slots offered on input, the number of matrices reported on
   output. */
static unsigned int matrix_nr = 1;
#endif

/*
 * Get the hwloc distance matrix for the NUMA nodes (if we don't
 * already have it).
 *
 * The matrix is cached in the file-scope "matrix" pointer, so hwloc is
 * only queried the first time (or again after a failed attempt).
 *
 * Returns OPAL_SUCCESS when a matrix is available, OPAL_ERROR
 * otherwise.
 */
static int get_distance_matrix(void)
{
    /* Already obtained by a previous call: nothing to do.  With the
     * hwloc 2.x API, querying again would overwrite (and so lose) the
     * previously returned matrix. */
    if (NULL != matrix) {
        return OPAL_SUCCESS;
    }

#if HWLOC_API_VERSION < 0x20000
    /* Note that the matrix data structure belongs to hwloc; we are not
     * responsible for freeing it. */
    matrix = hwloc_get_whole_distance_matrix_by_type(opal_hwloc_topology, HWLOC_OBJ_NODE);

    return (NULL == matrix) ? OPAL_ERROR : OPAL_SUCCESS;
#else
    /* matrix_nr is an in/out argument: on input it is the number of
     * slots we provide (exactly one: &matrix); on output hwloc stores
     * how many matrices it found.  Reset it before every query so that
     * a previous output value (0, or more than 1) is never passed back
     * in as our capacity. */
    matrix_nr = 1;
    if (0
            != hwloc_distances_get_by_type(opal_hwloc_topology, HWLOC_OBJ_NODE, &matrix_nr, &matrix,
                                           HWLOC_DISTANCES_KIND_MEANS_LATENCY, 0)
        || 0 == matrix_nr || NULL == matrix) {
        return OPAL_ERROR;
    }

    /* NOTE(review): hwloc 2.x documents that returned matrices should
     * be handed to hwloc_distances_release(); nothing in this file
     * does so -- confirm whether that is intended. */
    return OPAL_SUCCESS;
#endif
}

/*
 * Find the NUMA node that covers a given cpuset.
 *
 * Starts from the first largest topology object inside the cpuset and
 * walks up through its parents looking for a NUMA node object.
 *
 * Returns the NUMA node object, or NULL if no object lies inside the
 * cpuset, if the upward walk does not end on a NUMA node, or if the
 * cpuset contains more than one NUMA node.
 */
static hwloc_obj_t find_numa_node(hwloc_bitmap_t cpuset)
{
    hwloc_obj_t obj;

    obj = hwloc_get_first_largest_obj_inside_cpuset(opal_hwloc_topology, cpuset);

    /* hwloc returns NULL when nothing in the topology lies inside the
       cpuset; there is nothing to walk up from in that case */
    if (NULL == obj) {
        opal_output_verbose(5, USNIC_OUT,
                            "btl:usnic:filter_numa: could not find NUMA node where this process is "
                            "bound; filtering by NUMA distance not possible");
        return NULL;
    }

    /* Go upwards until we hit the NUMA node or run out of parents.
       NOTE(review): this relies on the numeric ordering of hwloc
       object types; with the hwloc 2.x API NUMA nodes appear to be
       attached as memory children rather than as ancestors, so this
       walk may never reach one -- confirm against the hwloc version in
       use. */
    while (obj->type > HWLOC_OBJ_NODE && NULL != obj->parent) {
        obj = obj->parent;
    }

    /* Make sure we ended up on the NUMA node */
    if (obj->type != HWLOC_OBJ_NODE) {
        opal_output_verbose(5, USNIC_OUT,
                            "btl:usnic:filter_numa: could not find NUMA node where this process is "
                            "bound; filtering by NUMA distance not possible");
        return NULL;
    }

    /* Finally, make sure that our cpuset doesn't span more than 1
       NUMA node */
    if (hwloc_get_nbobjs_inside_cpuset_by_type(opal_hwloc_topology, cpuset, HWLOC_OBJ_NODE) > 1) {
        opal_output_verbose(5, USNIC_OUT,
                            "btl:usnic:filter_numa: this process is bound to more than 1 NUMA "
                            "node; filtering by NUMA distance not possible");
        return NULL;
    }

    return obj;
}

/*
 * Locate (and cache) the NUMA node that this process is bound to.
 * This is a Cisco UCS-specific BTL, so a NUMA node is expected to
 * always be present (i.e., this is not some unknown server type that
 * may not have or report a NUMA node).
 *
 * On success, my_numa_node and num_numa_nodes are filled in.  The
 * my_numa_node value is only a handle into the hwloc topology; we are
 * not responsible for freeing it.
 *
 * Returns OPAL_SUCCESS if the node is known (now or from an earlier
 * call), OPAL_ERR_OUT_OF_RESOURCE if the bitmap cannot be allocated,
 * and OPAL_ERR_NOT_AVAILABLE if the binding cannot be read or does not
 * map to a single NUMA node.
 */
static int find_my_numa_node(void)
{
    int rc = OPAL_ERR_NOT_AVAILABLE;
    hwloc_obj_t node = NULL;
    hwloc_bitmap_t binding;

    /* A previous call already did the work */
    if (NULL != my_numa_node) {
        return OPAL_SUCCESS;
    }

    binding = hwloc_bitmap_alloc();
    if (NULL == binding) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Read this process' binding and, if that worked, map it to a NUMA
       node; the bitmap is not needed after that either way */
    if (0 == hwloc_get_cpubind(opal_hwloc_topology, binding, 0)) {
        node = find_numa_node(binding);
    }
    hwloc_bitmap_free(binding);

    /* Happiness: remember the node and how many NUMA nodes exist */
    if (NULL != node) {
        my_numa_node = node;
        num_numa_nodes = hwloc_get_nbobjs_by_type(opal_hwloc_topology, HWLOC_OBJ_NODE);
        rc = OPAL_SUCCESS;
    }

    return rc;
}

/*
 * Find a NUMA node covering the device associated with this module.
 *
 * The device is located by comparing the module's IP interface name
 * against the names of the OS devices in the hwloc topology, then
 * walking up from that OS device through its parents.
 *
 * Returns the NUMA node object, or NULL if no OS device carries the
 * interface name or the upward walk does not end on a NUMA node.
 */
static hwloc_obj_t find_device_numa(opal_btl_usnic_module_t *module)
{
    struct fi_usnic_info *uip;
    hwloc_obj_t obj;

    /* Bozo checks */
    assert(NULL != matrix);
    assert(NULL != my_numa_node);

    uip = &module->usnic_info;

    /* Look for the IP device name in the hwloc topology (the usnic
       device is simply an alternate API to reach the same device, so
       if we find the IP device name, we've found the usNIC device) */
    obj = NULL;
    while (NULL != (obj = hwloc_get_next_osdev(opal_hwloc_topology, obj))) {
        assert(HWLOC_OBJ_OS_DEVICE == obj->type);
        /* An object's name is optional in hwloc; skip unnamed devices
           rather than handing NULL to strcmp() */
        if (NULL == obj->name) {
            continue;
        }
        if (0 == strcmp(obj->name, uip->ui.v1.ui_ifname)) {
            break;
        }
    }

    /* Did not find it */
    if (NULL == obj) {
        return NULL;
    }

    /* Search upwards to find the device's NUMA node: go upwards until
       we hit the NUMA node or run out of parents.
       NOTE(review): this relies on the numeric ordering of hwloc
       object types; with the hwloc 2.x API NUMA nodes appear to be
       attached as memory children rather than as ancestors, so this
       walk may never reach one -- confirm against the hwloc version in
       use. */
    while (obj->type > HWLOC_OBJ_NODE && NULL != obj->parent) {
        obj = obj->parent;
    }

    /* Make sure we ended up on the NUMA node */
    if (obj->type != HWLOC_OBJ_NODE) {
        opal_output_verbose(5, USNIC_OUT,
                            "btl:usnic:filter_numa: could not find NUMA node for %s; filtering by "
                            "NUMA distance not possible",
                            module->linux_device_name);
        return NULL;
    }

    return obj;
}

#if HWLOC_API_VERSION >= 0x20000
/*
 * Return the position of obj in the object list of the cached distance
 * matrix, or -1 if the matrix does not contain it.
 */
static int find_matrix_index(hwloc_obj_t obj)
{
    unsigned int i;

    /* The object list holds matrix->nbobjs entries (this is unrelated
       to matrix_nr, which counts distance matrices, not objects) */
    for (i = 0; i < matrix->nbobjs; i++) {
        if (obj == matrix->objs[i]) {
            return (int) i;
        }
    }

    return -1;
}
#endif

/*
 * Public entry point: find the hwloc NUMA distance from this process
 * to the usnic device in the specified module.
 *
 * When both this process' NUMA node and the device's NUMA node can be
 * determined, the distance is stored in module->numa_distance;
 * otherwise that field is left untouched.
 *
 * Returns OPAL_SUCCESS when the distance was stored, and also when the
 * lookup was skipped (process not bound, topology not available,
 * device or node not found in the matrix).  Returns the error code
 * from get_distance_matrix() or find_my_numa_node() if either of those
 * fails.
 */
int opal_btl_usnic_hwloc_distance(opal_btl_usnic_module_t *module)
{
    int ret;
    hwloc_obj_t dev_numa;

    /* Bozo check */
    assert(NULL != module);

    /* Is this process bound? */
    if (!proc_bound()) {
        opal_output_verbose(
            5, USNIC_OUT,
            "btl:usnic:filter_numa: not sorting devices by NUMA distance (process not bound)");
        return OPAL_SUCCESS;
    }

    opal_output_verbose(5, USNIC_OUT, "btl:usnic:filter_numa: filtering devices by NUMA distance");

    /* ensure we have the topology */
    if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
        opal_output_verbose(
            5, USNIC_OUT,
            "btl:usnic:filter_numa: not sorting devices by NUMA distance (topology not available)");
        return OPAL_SUCCESS;
    }

    /* Get the hwloc distance matrix for all NUMA nodes */
    if (OPAL_SUCCESS != (ret = get_distance_matrix())) {
        return ret;
    }

    /* Find my NUMA node */
    if (OPAL_SUCCESS != (ret = find_my_numa_node())) {
        return ret;
    }
    /* Defensive: without a NUMA node for this process there is
       nothing to measure from.  So... no sorting/pruning for you! */
    if (NULL == my_numa_node) {
        return OPAL_SUCCESS;
    }

    /* Find the NUMA node covering this module's device */
    dev_numa = find_device_numa(module);
    if (NULL == dev_numa) {
        return OPAL_SUCCESS;
    }

    /* Lookup the distance between my NUMA node and the NUMA node of
       the device */
#if HWLOC_API_VERSION < 0x20000
    module->numa_distance = matrix->latency[dev_numa->logical_index * num_numa_nodes
                                            + my_numa_node->logical_index];
#else
    {
        int myindex, devindex;

        myindex = find_matrix_index(my_numa_node);
        if (-1 == myindex) {
            return OPAL_SUCCESS;
        }
        devindex = find_matrix_index(dev_numa);
        if (-1 == devindex) {
            return OPAL_SUCCESS;
        }

        /* hwloc stores the distance from the i-th to the j-th object
           at values[i * nbobjs + j], so the row stride must be the
           matrix's own object count */
        module->numa_distance = matrix->values[((unsigned int) devindex * matrix->nbobjs)
                                               + (unsigned int) myindex];
    }
#endif

    opal_output_verbose(5, USNIC_OUT, "btl:usnic:filter_numa: %s is distance %d from me",
                        module->linux_device_name, module->numa_distance);

    return OPAL_SUCCESS;
}