File: impl.cpp

package info (click to toggle)
llvm-toolchain-16 1%3A16.0.6-15~deb12u1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,634,792 kB
  • sloc: cpp: 6,179,261; ansic: 1,216,205; asm: 741,319; python: 196,614; objc: 75,325; f90: 49,640; lisp: 32,396; pascal: 12,286; sh: 9,394; perl: 7,442; ml: 5,494; awk: 3,523; makefile: 2,723; javascript: 1,206; xml: 886; fortran: 581; cs: 573
file content (182 lines) | stat: -rw-r--r-- 6,548 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
//===--- amdgpu/impl/impl.cpp ------------------------------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "rt.h"
#include <memory>

/*
 * Data
 */

hsa_status_t is_locked(void *ptr, void **agentBaseAddress) {
  hsa_status_t err = HSA_STATUS_SUCCESS;
  hsa_amd_pointer_info_t info;
  info.size = sizeof(hsa_amd_pointer_info_t);
  err = hsa_amd_pointer_info(ptr, &info, /*alloc=*/nullptr,
                             /*num_agents_accessible=*/nullptr,
                             /*accessible=*/nullptr);
  if (err != HSA_STATUS_SUCCESS) {
    DP("Error when getting pointer info\n");
    return err;
  }

  if (info.type == HSA_EXT_POINTER_TYPE_LOCKED) {
    // When user passes in a basePtr+offset we need to fix the
    // locked pointer to include the offset: ROCr always returns
    // the base locked address, not the shifted one.
    if ((char *)info.hostBaseAddress <= (char *)ptr &&
        (char *)ptr < (char *)info.hostBaseAddress + info.sizeInBytes)
      *agentBaseAddress =
          (void *)((uint64_t)info.agentBaseAddress + (uint64_t)ptr -
                   (uint64_t)info.hostBaseAddress);
    else // address is already device-agent accessible, no need to compute
         // offset
      *agentBaseAddress = ptr;
  } else
    *agentBaseAddress = nullptr;

  return HSA_STATUS_SUCCESS;
}

// host pointer (either src or dest) must be locked via hsa_amd_memory_lock
static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest,
                                    hsa_agent_t agent, const void *src,
                                    size_t size) {
  const hsa_signal_value_t init = 1;
  const hsa_signal_value_t success = 0;
  hsa_signal_store_screlease(signal, init);

  hsa_status_t err = hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0,
                                               nullptr, signal);
  if (err != HSA_STATUS_SUCCESS)
    return err;

  // async_copy reports success by decrementing and failure by setting to < 0
  hsa_signal_value_t got = init;
  while (got == init)
    got = hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_NE, init,
                                    UINT64_MAX, HSA_WAIT_STATE_BLOCKED);

  if (got != success)
    return HSA_STATUS_ERROR;

  return err;
}

struct implFreePtrDeletor {
  void operator()(void *p) {
    core::Runtime::Memfree(p); // ignore failure to free
  }
};

enum CopyDirection { H2D, D2H };

static hsa_status_t locking_async_memcpy(enum CopyDirection direction,
                                         hsa_signal_t signal, void *dest,
                                         hsa_agent_t agent, void *src,
                                         void *lockingPtr, size_t size) {
  void *lockedPtr = nullptr;
  hsa_status_t err = is_locked(lockingPtr, &lockedPtr);
  bool HostPtrIsLocked = true;
  if (err != HSA_STATUS_SUCCESS)
    return err;
  if (!lockedPtr) { // not locked
    HostPtrIsLocked = false;
    hsa_agent_t agents[1] = {agent};
    err = hsa_amd_memory_lock(lockingPtr, size, agents, /*num_agent=*/1,
                              (void **)&lockedPtr);
    if (err != HSA_STATUS_SUCCESS)
      return err;
    DP("locking_async_memcpy: lockingPtr=%p lockedPtr=%p Size = %lu\n",
       lockingPtr, lockedPtr, size);
  }

  switch (direction) {
  case H2D:
    err = invoke_hsa_copy(signal, dest, agent, lockedPtr, size);
    break;
  case D2H:
    err = invoke_hsa_copy(signal, lockedPtr, agent, src, size);
    break;
  }

  if (err != HSA_STATUS_SUCCESS && !HostPtrIsLocked) {
    // do not leak locked host pointers, but discard potential error message
    // because the initial error was in the copy function
    hsa_amd_memory_unlock(lockingPtr);
    return err;
  }

  // unlock only if not user locked
  if (!HostPtrIsLocked)
    err = hsa_amd_memory_unlock(lockingPtr);
  if (err != HSA_STATUS_SUCCESS)
    return err;

  return HSA_STATUS_SUCCESS;
}

hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
                             void *hostSrc, size_t size,
                             hsa_agent_t device_agent,
                             hsa_amd_memory_pool_t MemoryPool) {
  hsa_status_t err;

  err = locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
                             device_agent, hostSrc, hostSrc, size);

  if (err == HSA_STATUS_SUCCESS)
    return err;

  // async memcpy sometimes fails in situations where
  // allocate + copy succeeds. Looks like it might be related to
  // locking part of a read only segment. Fall back for now.
  void *tempHostPtr;
  hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
  if (ret != HSA_STATUS_SUCCESS) {
    DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size);
    return ret;
  }
  std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
  memcpy(tempHostPtr, hostSrc, size);

  return locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
                              device_agent, tempHostPtr, tempHostPtr, size);
}

hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *hostDest,
                             void *deviceSrc, size_t size,
                             hsa_agent_t deviceAgent,
                             hsa_amd_memory_pool_t MemoryPool) {
  hsa_status_t err;

  // device has always visibility over both pointers, so use that
  err = locking_async_memcpy(CopyDirection::D2H, signal, hostDest, deviceAgent,
                             deviceSrc, hostDest, size);

  if (err == HSA_STATUS_SUCCESS)
    return err;

  // hsa_memory_copy sometimes fails in situations where
  // allocate + copy succeeds. Looks like it might be related to
  // locking part of a read only segment. Fall back for now.
  void *tempHostPtr;
  hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
  if (ret != HSA_STATUS_SUCCESS) {
    DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size);
    return ret;
  }
  std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);

  err = locking_async_memcpy(CopyDirection::D2H, signal, tempHostPtr,
                             deviceAgent, deviceSrc, tempHostPtr, size);
  if (err != HSA_STATUS_SUCCESS)
    return HSA_STATUS_ERROR;

  memcpy(hostDest, tempHostPtr, size);
  return HSA_STATUS_SUCCESS;
}