//===-- Shared memory RPC client / server utilities -------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SHARED_RPC_UTIL_H
#define LLVM_LIBC_SHARED_RPC_UTIL_H
#include <stddef.h>
#include <stdint.h>
#if (defined(__NVPTX__) || defined(__AMDGPU__)) && \
!((defined(__CUDA__) && !defined(__CUDA_ARCH__)) || \
(defined(__HIP__) && !defined(__HIP_DEVICE_COMPILE__)))
#include <gpuintrin.h>
#define RPC_TARGET_IS_GPU
#endif
// Workaround for missing __has_builtin in GCC < 10.
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
#ifndef RPC_ATTRS
#if defined(__CUDA__) || defined(__HIP__)
#define RPC_ATTRS __attribute__((host, device)) inline
#else
#define RPC_ATTRS inline
#endif
#endif
namespace rpc {
template <typename T> struct type_identity {
using type = T;
};
template <class T, T v> struct type_constant {
static inline constexpr T value = v;
};
template <class T> struct remove_reference : type_identity<T> {};
template <class T> struct remove_reference<T &> : type_identity<T> {};
template <class T> struct remove_reference<T &&> : type_identity<T> {};
template <class T> struct is_const : type_constant<bool, false> {};
template <class T> struct is_const<const T> : type_constant<bool, true> {};
/// Freestanding implementation of std::move.
template <class T>
RPC_ATTRS constexpr typename remove_reference<T>::type &&move(T &&t) {
return static_cast<typename remove_reference<T>::type &&>(t);
}
/// Freestanding implementation of std::forward.
template <typename T>
RPC_ATTRS constexpr T &&forward(typename remove_reference<T>::type &value) {
return static_cast<T &&>(value);
}
template <typename T>
RPC_ATTRS constexpr T &&forward(typename remove_reference<T>::type &&value) {
return static_cast<T &&>(value);
}
struct in_place_t {
RPC_ATTRS explicit in_place_t() = default;
};
struct nullopt_t {
RPC_ATTRS constexpr explicit nullopt_t() = default;
};
constexpr inline in_place_t in_place{};
constexpr inline nullopt_t nullopt{};
/// Freestanding and minimal implementation of std::optional.
template <typename T> class optional {
template <typename U> struct OptionalStorage {
union {
char empty;
U stored_value;
};
bool in_use = false;
RPC_ATTRS ~OptionalStorage() { reset(); }
RPC_ATTRS constexpr OptionalStorage() : empty() {}
template <typename... Args>
RPC_ATTRS constexpr explicit OptionalStorage(in_place_t, Args &&...args)
: stored_value(forward<Args>(args)...) {}
RPC_ATTRS constexpr void reset() {
if (in_use)
stored_value.~U();
in_use = false;
}
};
OptionalStorage<T> storage;
public:
RPC_ATTRS constexpr optional() = default;
RPC_ATTRS constexpr optional(nullopt_t) {}
RPC_ATTRS constexpr optional(const T &t) : storage(in_place, t) {
storage.in_use = true;
}
RPC_ATTRS constexpr optional(const optional &) = default;
RPC_ATTRS constexpr optional(T &&t) : storage(in_place, move(t)) {
storage.in_use = true;
}
RPC_ATTRS constexpr optional(optional &&O) = default;
RPC_ATTRS constexpr optional &operator=(T &&t) {
// Construct a temporary and go through the move assignment operator so that
// the `in_use` flag is updated along with the stored value.
*this = optional(move(t));
return *this;
}
RPC_ATTRS constexpr optional &operator=(optional &&) = default;
RPC_ATTRS constexpr optional &operator=(const T &t) {
// Copy into a temporary and go through the move assignment operator so that
// the `in_use` flag is updated along with the stored value.
*this = optional(t);
return *this;
}
RPC_ATTRS constexpr optional &operator=(const optional &) = default;
RPC_ATTRS constexpr void reset() { storage.reset(); }
RPC_ATTRS constexpr const T &value() const & { return storage.stored_value; }
RPC_ATTRS constexpr T &value() & { return storage.stored_value; }
RPC_ATTRS constexpr explicit operator bool() const { return storage.in_use; }
RPC_ATTRS constexpr bool has_value() const { return storage.in_use; }
RPC_ATTRS constexpr const T *operator->() const {
return &storage.stored_value;
}
RPC_ATTRS constexpr T *operator->() { return &storage.stored_value; }
RPC_ATTRS constexpr const T &operator*() const & {
return storage.stored_value;
}
RPC_ATTRS constexpr T &operator*() & { return storage.stored_value; }
RPC_ATTRS constexpr T &&value() && { return move(storage.stored_value); }
RPC_ATTRS constexpr T &&operator*() && { return move(storage.stored_value); }
};
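// Illustrative usage sketch of `rpc::optional` (not part of the interface):
//   rpc::optional<uint32_t> slot;       // disengaged by default
//   slot = rpc::optional<uint32_t>(42); // engage with a value
//   if (slot.has_value())
//     uint32_t v = *slot;               // access the stored value
//   slot.reset();                       // destroy the stored value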
/// Suspend the thread briefly to assist the thread scheduler during busy loops.
RPC_ATTRS void sleep_briefly() {
#if __has_builtin(__nvvm_reflect)
if (__nvvm_reflect("__CUDA_ARCH") >= 700)
asm("nanosleep.u32 64;" ::: "memory");
#elif __has_builtin(__builtin_amdgcn_s_sleep)
__builtin_amdgcn_s_sleep(2);
#elif __has_builtin(__builtin_ia32_pause)
__builtin_ia32_pause();
#elif __has_builtin(__builtin_arm_isb)
__builtin_arm_isb(0xf);
#else
// Simply do nothing if sleeping isn't supported on this platform.
#endif
}
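// Illustrative spin-wait sketch (the `flag` variable is hypothetical):
//   while (!__atomic_load_n(&flag, __ATOMIC_ACQUIRE))
//     rpc::sleep_briefly(); // back off briefly between polls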
/// Conditional to indicate if this process is running on the GPU.
RPC_ATTRS constexpr bool is_process_gpu() {
#ifdef RPC_TARGET_IS_GPU
return true;
#else
return false;
#endif
}
/// Wait for all lanes in the group to complete.
RPC_ATTRS void sync_lane([[maybe_unused]] uint64_t lane_mask) {
#ifdef RPC_TARGET_IS_GPU
return __gpu_sync_lane(lane_mask);
#endif
}
/// Copies the value from the first active thread to the rest.
RPC_ATTRS uint32_t broadcast_value([[maybe_unused]] uint64_t lane_mask,
uint32_t x) {
#ifdef RPC_TARGET_IS_GPU
return __gpu_read_first_lane_u32(lane_mask, x);
#else
return x;
#endif
}
/// Returns the number of lanes that participate in the RPC interface.
RPC_ATTRS uint32_t get_num_lanes() {
#ifdef RPC_TARGET_IS_GPU
return __gpu_num_lanes();
#else
return 1;
#endif
}
/// Returns a bitmask of the lanes in the warp or wavefront currently
/// executing together.
RPC_ATTRS uint64_t get_lane_mask() {
#ifdef RPC_TARGET_IS_GPU
return __gpu_lane_mask();
#else
return 1;
#endif
}
/// Returns the id of the thread within the warp or wavefront executing
/// together.
RPC_ATTRS uint32_t get_lane_id() {
#ifdef RPC_TARGET_IS_GPU
return __gpu_lane_id();
#else
return 0;
#endif
}
/// Conditional that is only true for a single thread in the lane group.
RPC_ATTRS bool is_first_lane([[maybe_unused]] uint64_t lane_mask) {
#ifdef RPC_TARGET_IS_GPU
return __gpu_is_first_in_lane(lane_mask);
#else
return true;
#endif
}
/// Returns a bitmask of the lanes in the group for which \p x is true.
RPC_ATTRS uint64_t ballot([[maybe_unused]] uint64_t lane_mask, bool x) {
#ifdef RPC_TARGET_IS_GPU
return __gpu_ballot(lane_mask, x);
#else
return x;
#endif
}
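// Illustrative sketch combining the lane helpers above: one leader lane does
// the work and the result is broadcast to the rest of the group (`do_work` is
// a hypothetical per-group operation):
//   uint64_t mask = rpc::get_lane_mask();
//   uint32_t result = 0;
//   if (rpc::is_first_lane(mask))
//     result = do_work();                        // only the leader executes
//   result = rpc::broadcast_value(mask, result); // share with all lanes
//   rpc::sync_lane(mask);                        // re-converge the group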
/// Return \p val aligned "upwards" according to \p align.
template <typename V, typename A>
RPC_ATTRS constexpr V align_up(V val, A align) {
return ((val + V(align) - 1) / V(align)) * V(align);
}
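// For example, align_up(13, 8) == 16 and align_up(16, 8) == 16: the result is
// the smallest multiple of `align` that is greater than or equal to `val`,
// assuming `align` is nonzero and the addition does not overflow.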
/// Utility to provide a unified interface between the CPU and GPU's memory
/// model. On the GPU, stack variables are always private to a lane, so we can
/// simply use the variable passed in. On the CPU, we need to allocate enough
/// space for the whole lane group and index into it.
template <typename V> RPC_ATTRS V &lane_value(V *val, uint32_t id) {
if constexpr (is_process_gpu())
return *val;
return val[id];
}
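// Illustrative sketch (the array and its size are hypothetical): on the CPU
// the server allocates one slot per lane and indexes into it, while on the
// GPU each lane's private variable is used directly.
//   uint32_t sizes[64] = {0}; // CPU-side storage, one slot per lane
//   uint32_t &slot = rpc::lane_value(sizes, rpc::get_lane_id());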
/// Advance the pointer \p ptr by \p bytes.
template <typename T, typename U> RPC_ATTRS T *advance(T *ptr, U bytes) {
if constexpr (is_const<T>::value)
return reinterpret_cast<T *>(reinterpret_cast<const uint8_t *>(ptr) +
bytes);
else
return reinterpret_cast<T *>(reinterpret_cast<uint8_t *>(ptr) + bytes);
}
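// For example, advance(ptr, 8) returns a pointer eight bytes past `ptr`,
// preserving the const qualification of the pointee type.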
/// Wrapper around the optimal memory copy implementation for the target.
RPC_ATTRS void rpc_memcpy(void *dst, const void *src, size_t count) {
__builtin_memcpy(dst, src, count);
}
template <class T> RPC_ATTRS constexpr const T &max(const T &a, const T &b) {
return (a < b) ? b : a;
}
} // namespace rpc
#endif // LLVM_LIBC_SHARED_RPC_UTIL_H