File: topology-nvml.c

package info (click to toggle)
hwloc-contrib 2.12.0-3~bpo12%2B1
links: PTS, VCS
area: contrib
in suites: bookworm-backports
size: 23,516 kB
sloc: ansic: 60,875; xml: 13,559; sh: 7,332; makefile: 2,154; javascript: 879; cpp: 93; php: 8; sed: 5
file content (482 lines) | stat: -rw-r--r-- 15,519 bytes
parent folder | download | duplicates (7)
/*
 * Copyright © 2012-2025 Inria.  All rights reserved.
 * See COPYING in top-level directory.
 */

#include "private/autogen/config.h"
#include "hwloc.h"
#include "hwloc/plugins.h"

/* private headers allowed for convenience because this plugin is built within hwloc */
#include "private/misc.h"
#include "private/debug.h"

#include <nvml.h>


#ifdef NVML_NVLINK_MAX_LINKS

/* Search the array of known GPU PCI locations for the given NVLink peer.
 *
 * Only the PCI domain, bus and device numbers are compared.
 * Returns the index of the first matching entry in gpus[0..nb-1],
 * or (unsigned)-1 when the peer is not one of these GPUs.
 */
static unsigned
hwloc__nvml_get_peer_gpu_by_pci(nvmlPciInfo_t peer, unsigned nb, nvmlPciInfo_t *gpus)
{
  unsigned idx = 0;

  while (idx < nb) {
    const nvmlPciInfo_t *candidate = &gpus[idx];
    if (candidate->domain == peer.domain
        && candidate->bus == peer.bus
        && candidate->device == peer.device)
      return idx;
    idx++;
  }

  /* no GPU at this PCI location */
  return (unsigned)-1;
}

#if !HAVE_DECL_NVMLDEVICEGETNVLINKREMOTEDEVICETYPE
/* Fallback definitions used when HAVE_DECL_NVMLDEVICEGETNVLINKREMOTEDEVICETYPE is false
 * (presumably because the installed nvml.h does not declare
 * nvmlDeviceGetNvLinkRemoteDeviceType() -- confirm against the configure checks).
 * They let the code below use the same device-type name and constants in both cases;
 * in this configuration the discovery code always starts from the UNKNOWN type
 * and guesses the actual type from PCI information.
 */
typedef unsigned nvmlIntNvLinkDeviceType_t;
#define NVML_NVLINK_DEVICE_TYPE_IBMNPU  0x01
#define NVML_NVLINK_DEVICE_TYPE_SWITCH  0x02
#define NVML_NVLINK_DEVICE_TYPE_UNKNOWN 0xFF
#endif

/* Translate a non-GPU NVLink peer, given by its PCI location, into a topology object.
 *
 * topology: the topology being discovered.
 * gpu:      the object of the GPU that owns the NVLink.
 * peer_bdf: PCI location of the remote end of the link.
 * dtype:    remote device type, or NVML_NVLINK_DEVICE_TYPE_UNKNOWN when it must be
 *           guessed from the PCI class and vendor of the peer.
 *
 * Returns a CPU-side object for IBM NPU peers, the PCI object itself for NVSwitch peers
 * (tagged with subtype "NVSwitch" if it had none), or NULL when the peer cannot be
 * found or recognized. Each kind of failure is reported at most once.
 */
static hwloc_obj_t
hwloc__nvml_get_peer_obj_by_pci(struct hwloc_topology *topology, hwloc_obj_t gpu, nvmlPciInfo_t peer_bdf, nvmlIntNvLinkDeviceType_t dtype)
{
  /* independent one-shot flags, one per kind of message */
  static int warned_missing = 0;
  static int warned_filtered = 0;
  static int warned_unrecognized = 0;
  hwloc_obj_t peer;

  /* The exact PCI object is required because its PCI class is inspected below
   * (hwloc_pci_find_parent_by_busid() would only be enough for inserting an OSdev by locality).
   */
  peer = hwloc_pci_find_by_busid(topology, peer_bdf.domain, peer_bdf.bus, peer_bdf.device, 0);
  if (!peer) {
    enum hwloc_type_filter_e pci_filter;
    hwloc_topology_get_type_filter(topology, HWLOC_OBJ_PCI_DEVICE, &pci_filter);
    if (pci_filter == HWLOC_TYPE_FILTER_KEEP_NONE) {
      /* expected failure: PCI devices are not in the topology at all */
      if (!warned_filtered)
        hwloc_debug("hwloc failed to find NVLink peer %04x:%02x:%02x because PCI devices are filtered-out\n",
                    peer_bdf.domain, peer_bdf.bus, peer_bdf.device);
      warned_filtered = 1;
    } else {
      /* PCI devices are kept, the peer should have been there */
      if (!warned_missing && HWLOC_SHOW_ALL_ERRORS())
        fprintf(stderr, "hwloc/nvml: failed to find NVLink peer %04x:%02x:%02x\n",
                peer_bdf.domain, peer_bdf.bus, peer_bdf.device);
      warned_missing = 1;
    }
    return NULL;
  }

  if (dtype == NVML_NVLINK_DEVICE_TYPE_UNKNOWN) {
    /* Only accept a non-PCI bridge:
     * class 0680 vendor 1014 (IBM) model 04ea prog-if 00 on POWER8/9,
     * class 0680 prog-if 01 vendor 10de (NVIDIA) for NVSwitch.
     * Checking baseclass 0x06 is enough to reject GPUs (baseclass 0x03),
     * which matters because some GPUs may be hidden from us by cgroups.
     */
    if (peer->type != HWLOC_OBJ_PCI_DEVICE)
      return NULL;
    if ((peer->attr->pcidev.class_id >> 8) != 0x06)
      return NULL;

    /* guess the device type from the PCI vendor */
    if (peer->attr->pcidev.vendor_id == 0x1014)
      dtype = NVML_NVLINK_DEVICE_TYPE_IBMNPU;
    else if (peer->attr->pcidev.vendor_id == 0x10de)
      dtype = NVML_NVLINK_DEVICE_TYPE_SWITCH;
  }

  if (dtype == NVML_NVLINK_DEVICE_TYPE_IBMNPU) {
    /* IBM OpenCAPI port: return a CPU object.
     * The OpenCAPI bridge PCI locality is wrong on POWER8 (it covers the entire machine)
     * while the GPU locality is correct on both POWER8 and POWER9, so start from the GPU
     * unless the environment explicitly asks for the OpenCAPI locality.
     * This only breaks if PCI and NVLink are not connected to the same location, unlikely.
     */
    hwloc_obj_t cpu_side = getenv("HWLOC_NVML_USE_OPENCAPI_LOCALITY") ? peer : gpu;
    /* walk up to the first ancestor that has a cpuset */
    while (!cpu_side->cpuset)
      cpu_side = cpu_side->parent;
    return cpu_side;
  }

  if (dtype == NVML_NVLINK_DEVICE_TYPE_SWITCH) {
    /* NVIDIA NVSwitch: nothing better than the PCI object itself.
     * Give it the NVSwitch subtype so that the core doesn't remove it.
     */
    if (!peer->subtype)
      peer->subtype = strdup("NVSwitch");
    return peer;
  }

  /* any other device type is not supported */
  if (!warned_unrecognized && HWLOC_SHOW_ALL_ERRORS())
    fprintf(stderr, "hwloc/nvml: failed to recognize NVLink peer %04x:%02x:%02x class %04x vendor %04x device %04x\n",
            peer_bdf.domain, peer_bdf.bus, peer_bdf.device,
            peer->attr->pcidev.class_id, peer->attr->pcidev.vendor_id, peer->attr->pcidev.device_id);
  warned_unrecognized = 1;
  return NULL;
}

/* Return the index of a non-GPU peer object inside objs[], adding it if needed.
 *
 * Slots [0, nbgpus) hold the GPUs and are never searched; peers live in
 * [nbgpus, *nbobjs). If obj is not found there, it is stored at the end of
 * the array and *nbobjs is incremented. The caller must provide enough room in objs[].
 */
static unsigned
hwloc__nvml_store_peer_obj(hwloc_obj_t obj,
                           unsigned nbgpus, unsigned *nbobjs, hwloc_obj_t *objs)
{
  const unsigned count = *nbobjs;
  unsigned slot = nbgpus;

  /* reuse the existing slot if this peer was already seen */
  while (slot < count) {
    if (objs[slot] == obj)
      return slot;
    slot++;
  }

  /* new peer, take the next free slot */
  objs[count] = obj;
  *nbobjs = count + 1;
  return count;
}

/* Register the NVLink bandwidth matrix as a "NVLinkBandwidth" distances structure.
 *
 * objs is the array of nbobjs objects and bws the matrix of bandwidths between them.
 * This function always takes care of both arrays: they are freed here if the
 * distances structure cannot be created or filled, and they are attached to the
 * distances handle otherwise. The caller must not free them afterwards.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
hwloc__nvml_add_nvlink_bandwidth(hwloc_topology_t topology,
                                 unsigned nbobjs, hwloc_obj_t *objs, hwloc_uint64_t *bws)
{
  void *handle;

  handle = hwloc_backend_distances_add_create(topology, "NVLinkBandwidth",
                                              HWLOC_DISTANCES_KIND_FROM_OS|HWLOC_DISTANCES_KIND_MEANS_BANDWIDTH,
                                              0);
  if (!handle
      || hwloc_backend_distances_add_values(topology, handle, nbobjs, objs, bws, 0) < 0) {
    /* the arrays were not attached to any handle, release them here */
    free(objs);
    free(bws);
    return -1;
  }

  /* from now on the arrays belong to the handle, never free them here */
  if (hwloc_backend_distances_add_commit(topology, handle, 0 /* don't group GPUs */) < 0)
    return -1;

  return 0;
}
#endif /* NVML_NVLINK_MAX_LINKS */


static int
hwloc_nvml_discover(struct hwloc_backend *backend, struct hwloc_disc_status *dstatus)
{
  /*
   * This backend uses the underlying OS.
   * However we don't enforce topology->is_thissystem so that
   * we may still force use this backend when debugging with !thissystem.
   */

  struct hwloc_topology *topology = backend->topology;
  enum hwloc_type_filter_e filter;
  nvmlReturn_t ret;
  unsigned nb, i;
#ifdef NVML_NVLINK_MAX_LINKS
  unsigned nbobjs, j;
  hwloc_obj_t *objs;
  unsigned *peer_indexes;
  nvmlPciInfo_t *gpu_bdfs;
  hwloc_uint64_t *bws;
  int found_nvlinks = 0;
#endif

  assert(dstatus->phase == HWLOC_DISC_PHASE_IO);

  hwloc_topology_get_type_filter(topology, HWLOC_OBJ_OS_DEVICE, &filter);
  if (filter == HWLOC_TYPE_FILTER_KEEP_NONE)
    return 0;

  ret = nvmlInit();
  if (NVML_SUCCESS != ret) {
    if (HWLOC_SHOW_ALL_ERRORS()) {
      const char *error = nvmlErrorString(ret);
      hwloc_debug("hwloc/nvml: Failed to initialize with nvmlInit(): %s\n", error);
    }
    return -1;
  }
  ret = nvmlDeviceGetCount(&nb);
  if (NVML_SUCCESS != ret || !nb) {
    nvmlShutdown();
    return 0;
  }

#ifdef NVML_NVLINK_MAX_LINKS
  /* the PCI BDF of each GPU */
  gpu_bdfs = calloc(nb, sizeof(*gpu_bdfs));
  /* the nvlink matrix will require one slot per GPU and possibly additional slots for non-GPU endpoints,
   * usually one per CPU, but let's take an easy upper bound.
   */
  objs = calloc(nb * NVML_NVLINK_MAX_LINKS, sizeof(*objs));
  bws = calloc(nb * NVML_NVLINK_MAX_LINKS * nb * NVML_NVLINK_MAX_LINKS, sizeof(*bws));
  /* array to translate peer of i-th link of j-th GPU into an peer object index inside objs */
  peer_indexes = calloc(nb * NVML_NVLINK_MAX_LINKS, sizeof(*peer_indexes));
  if (!gpu_bdfs || !objs || !gpu_bdfs || !bws || !peer_indexes) {
    free(gpu_bdfs);
    free(objs);
    free(bws);
    free(peer_indexes);
    return -1;
  }
#endif

  for(i=0; i<nb; i++) {
    nvmlPciInfo_t pci;
    nvmlDevice_t device;
    hwloc_obj_t osdev, parent;
    char buffer[64];

    ret = nvmlDeviceGetHandleByIndex(i, &device);
    assert(ret == NVML_SUCCESS);

    osdev = hwloc_alloc_setup_object(topology, HWLOC_OBJ_OS_DEVICE, HWLOC_UNKNOWN_INDEX);
    snprintf(buffer, sizeof(buffer), "nvml%u", i);
    osdev->name = strdup(buffer);
    osdev->subtype = strdup("NVML");
    osdev->depth = HWLOC_TYPE_DEPTH_UNKNOWN;
    osdev->attr->osdev.type = HWLOC_OBJ_OSDEV_GPU;

    hwloc_obj_add_info(osdev, "Backend", "NVML");
    hwloc_obj_add_info(osdev, "GPUVendor", "NVIDIA Corporation");

    buffer[0] = '\0';
    ret = nvmlDeviceGetName(device, buffer, sizeof(buffer));
    hwloc_obj_add_info(osdev, "GPUModel", buffer);

    /* these may fail with NVML_ERROR_NOT_SUPPORTED on old devices */
    buffer[0] = '\0';
    ret = nvmlDeviceGetSerial(device, buffer, sizeof(buffer));
    if (buffer[0] != '\0')
      hwloc_obj_add_info(osdev, "NVIDIASerial", buffer);

    buffer[0] = '\0';
    ret = nvmlDeviceGetUUID(device, buffer, sizeof(buffer));
    if (buffer[0] != '\0')
      hwloc_obj_add_info(osdev, "NVIDIAUUID", buffer);

    parent = NULL;
    if (NVML_SUCCESS == nvmlDeviceGetPciInfo(device, &pci)) {
#ifdef NVML_NVLINK_MAX_LINKS
      gpu_bdfs[i] = pci;
#endif
      parent = hwloc_pci_find_parent_by_busid(topology, pci.domain, pci.bus, pci.device, 0);
#if HAVE_DECL_NVMLDEVICEGETCURRPCIELINKGENERATION
      if (parent && parent->type == HWLOC_OBJ_PCI_DEVICE) {
	unsigned maxwidth = 0, maxgen = 0;
	nvmlDeviceGetCurrPcieLinkWidth(device, &maxwidth);
	nvmlDeviceGetCurrPcieLinkGeneration(device, &maxgen);
        parent->attr->pcidev.linkspeed = hwloc__pci_link_speed(maxgen, maxwidth);
      }
#endif
    }
    if (!parent)
      parent = hwloc_get_root_obj(topology);

    hwloc_insert_object_by_parent(topology, parent, osdev);
#ifdef NVML_NVLINK_MAX_LINKS
    objs[i] = osdev;
#endif
  }

#ifdef NVML_NVLINK_MAX_LINKS
  nbobjs = nb;

  /* list peer objects */
  for(i=0; i<nb; i++) {
    /* look at nvlinks */
    nvmlDevice_t device;
    nvmlPciInfo_t pci;

    ret = nvmlDeviceGetHandleByIndex(i, &device);
    assert(ret == NVML_SUCCESS);

    hwloc_debug("looking at NVLinks for NVML GPU #%u...\n", i);
    for(j=0; j<NVML_NVLINK_MAX_LINKS; j++) {
      nvmlEnableState_t isActive;
      nvmlIntNvLinkDeviceType_t dtype;

      /* mark the peer as unknown for now */
      peer_indexes[i*NVML_NVLINK_MAX_LINKS+j] = (unsigned) -1;

      ret = nvmlDeviceGetNvLinkState(device, j, &isActive);
      if (ret != NVML_SUCCESS)
        break;
      if (isActive != NVML_FEATURE_ENABLED)
        continue;
      found_nvlinks++;
      hwloc_debug("  NVLink #%u is active\n", j);

#if HAVE_DECL_NVMLDEVICEGETNVLINKREMOTEDEVICETYPE
      ret = nvmlDeviceGetNvLinkRemoteDeviceType(device, j, &dtype);
      if (ret != NVML_SUCCESS)
        dtype = NVML_NVLINK_DEVICE_TYPE_UNKNOWN;
#else
      dtype = NVML_NVLINK_DEVICE_TYPE_UNKNOWN;
#endif

      ret = nvmlDeviceGetNvLinkRemotePciInfo(device, j, &pci);
      if (ret == NVML_SUCCESS) {
        unsigned peer_index;
        hwloc_debug("    goes to PCI %04x:%02x:%02x\n", pci.domain, pci.bus, pci.device);
        peer_index = hwloc__nvml_get_peer_gpu_by_pci(pci, nb, gpu_bdfs);
        if (peer_index == (unsigned)-1) {
          hwloc_obj_t peer_obj = hwloc__nvml_get_peer_obj_by_pci(topology, objs[i], pci, dtype);
          if (!peer_obj)
            continue;

          peer_index = hwloc__nvml_store_peer_obj(peer_obj, nb, &nbobjs, objs);
          hwloc_debug("    adding NVML peer index #%u\n", peer_index);
        } else {
          hwloc_debug("    reusing NVML peer index #%u\n", peer_index);
        }
        peer_indexes[i*NVML_NVLINK_MAX_LINKS+j] = peer_index;
      }
    }
  }
  hwloc_debug("NVML found %u GPUs within %u peers total, with %u nvlinks total\n", nb, nbobjs, found_nvlinks);

  if (hwloc_topology_get_flags(topology) & HWLOC_TOPOLOGY_FLAG_NO_DISTANCES)
    found_nvlinks = 0;

  if (found_nvlinks) {
    /* now build the matrix */
    found_nvlinks = 0; /* reset back in case the version is unknown below and the matrix remains empty */
    for(i=0; i<nb; i++) {
      nvmlDevice_t device;
      ret = nvmlDeviceGetHandleByIndex(i, &device);
      assert(ret == NVML_SUCCESS);

      for(j=0; j<NVML_NVLINK_MAX_LINKS; j++) {
        static int warned = 0;
        unsigned version;
        hwloc_uint64_t bw;
        unsigned peer_index = peer_indexes[i*NVML_NVLINK_MAX_LINKS+j];

        if (peer_index == (unsigned)-1)
          continue;

        /* For GPU-to-GPU link, we'll get info for both direction, while GPU-to-CPU info is unique.
         * Only store once on both sides of diagonal.
         */
        if (peer_index < i)
          continue;

        ret = nvmlDeviceGetNvLinkVersion(device, j, &version);
        if (ret != NVML_SUCCESS)
          continue;

        hwloc_debug("GPU #%u NVLink #%u has version %u\n", i, j, version);
        /* NVIDIA often shows bidirection bandwidths,
         * or even the bidirectional bandwidth of all links agregated for a single GPU.
         *
         * v1 = P100 = 160GB/s = 4 links * 20GB/s (link full duplex rate) * 2 (full duplex)
         * v2 = V100 = 300GB/s = 6 * 25 * 2
         * v3 = A100 = 600GB/s = 12 * 25 * 2 (twice bigger pairs but half number of pairs per link)
         * v4 = H100 = 900GB/s = 18 * 25 *2
         * v5 = B100 = 1800GB/s = 18 * 50 *2
         *
         * We want the unidirectional bandwidth of each link.
         * Multiple links may connect same GPUs.
         */
        switch (version) {
        /* enum nvmlNvlinkVersion_enum added only in CUDA 12.7 */
        case 1: /* NVML_NVLINK_VERSION_1_0 */
          bw = 20000;
          break;
        case 2: /* NVML_NVLINK_VERSION_2_0 */
        case 3: /* NVML_NVLINK_VERSION_2_2 */
        case 4: /* NVML_NVLINK_VERSION_3_0 */
        case 5: /* NVML_NVLINK_VERSION_3_1 */
        case 6: /* NVML_NVLINK_VERSION_4_0 */
          bw = 25000;
          break;
        case 7: /* NVML_NVLINK_VERSION_5_0 */
          bw = 50000;
          break;
        default:
          if (!warned && HWLOC_SHOW_ALL_ERRORS())
            fprintf(stderr, "hwloc/nvml: Failed to recognize NVLink version %u\n", version);
          warned = 1;
          continue;
        }

        bws[i*nbobjs+peer_index] += bw;
        bws[peer_index*nbobjs+i] += bw;
        found_nvlinks++;
      }
    }
    if (found_nvlinks) {
      /* add very high artifical values on the diagonal since local is faster than remote.
       * there are 6 link per GPU max for now, 150GB/s, use 1TB/s for local, it somehow matches the HBM.
       */
      for(i=0; i<nbobjs; i++)
        bws[i*nbobjs+i] = 1000000;

      hwloc__nvml_add_nvlink_bandwidth(topology, nbobjs, objs, bws);
      /* matrices don't need to be freed anymore */
      objs = NULL;
      bws = NULL;
    }
  }
  free(objs);
  free(bws);
  free(gpu_bdfs);
  free(peer_indexes);
#endif /* NVML_NVLINK_MAX_LINKS */

  nvmlShutdown();
  return 0;
}

/* Instantiate the NVML backend for the given topology and discovery component.
 *
 * The excluded phases and the three data arguments are ignored.
 * Returns the new backend with its discover callback set to hwloc_nvml_discover(),
 * or NULL if the backend could not be allocated.
 */
static struct hwloc_backend *
hwloc_nvml_component_instantiate(struct hwloc_topology *topology,
                                 struct hwloc_disc_component *component,
                                 unsigned excluded_phases __hwloc_attribute_unused,
                                 const void *_data1 __hwloc_attribute_unused,
                                 const void *_data2 __hwloc_attribute_unused,
                                 const void *_data3 __hwloc_attribute_unused)
{
  struct hwloc_backend *nvml_backend = hwloc_backend_alloc(topology, component);

  /* only hook the callback when the allocation succeeded */
  if (nvml_backend)
    nvml_backend->discover = hwloc_nvml_discover;

  return nvml_backend;
}

/* Discovery component descriptor of the NVML backend, referenced by hwloc_nvml_component.
 * NOTE(review): the initializer is positional; the per-field comments below follow the
 * apparent layout of struct hwloc_disc_component -- confirm against hwloc/plugins.h.
 */
static struct hwloc_disc_component hwloc_nvml_disc_component = {
  "nvml", /* component name */
  HWLOC_DISC_PHASE_IO, /* the phase that hwloc_nvml_discover() asserts it is called for */
  HWLOC_DISC_PHASE_GLOBAL, /* presumably the phases that exclude this component -- confirm */
  hwloc_nvml_component_instantiate, /* backend constructor */
  5, /* after pci, and after cuda since likely less useful */
  1, /* presumably "enabled by default" -- confirm */
  NULL
};

/* Initialization callback of the NVML component.
 *
 * No flag is supported: any non-zero flags value is rejected.
 * The component is also rejected when hwloc_plugin_check_namespace() fails for
 * the "hwloc_backend_alloc" symbol. Returns 0 on success, -1 on rejection.
 */
static int
hwloc_nvml_component_init(unsigned long flags)
{
  /* the namespace check is only performed when flags are acceptable */
  if (flags || hwloc_plugin_check_namespace("nvml", "hwloc_backend_alloc") < 0)
    return -1;

  return 0;
}

/* When built as a plugin, declare the component symbol with HWLOC_DECLSPEC before defining it
 * (presumably so that it is exported from the plugin -- confirm against the HWLOC_DECLSPEC definition).
 */
#ifdef HWLOC_INSIDE_PLUGIN
HWLOC_DECLSPEC extern const struct hwloc_component hwloc_nvml_component;
#endif

/* Top-level component descriptor of the NVML backend.
 * NOTE(review): the initializer is positional; the per-field comments below follow the
 * apparent layout of struct hwloc_component -- confirm against hwloc/plugins.h.
 */
const struct hwloc_component hwloc_nvml_component = {
  HWLOC_COMPONENT_ABI,
  hwloc_nvml_component_init, NULL, /* init callback, and presumably no finalize callback -- confirm */
  HWLOC_COMPONENT_TYPE_DISC, /* this is a discovery component */
  0,
  &hwloc_nvml_disc_component /* the discovery component descriptor defined above */
};