1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228
|
//===---- omptargeti.h - OpenMP GPU initialization --------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
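// This file contains the inline definitions of the member functions of the
// task descriptor, thread private context, team descriptor and simple memory
// manager, plus the helper routines used to look up per-thread descriptors.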
//
//===----------------------------------------------------------------------===//
#include "common/target_atomic.h"
////////////////////////////////////////////////////////////////////////////////
// Task Descriptor
////////////////////////////////////////////////////////////////////////////////
// Return the runtime schedule kind recorded in the task flags.
//
// The bits selected by TaskDescr_SchedMask hold the schedule kind minus one
// (kinds 1..4 are stored as 0..3), so one is added back before the value is
// converted to omp_sched_t.
INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() const {
  const uint8_t encodedSched = items.flags & TaskDescr_SchedMask;
  const uint8_t decodedSched = encodedSched + 1;
  return static_cast<omp_sched_t>(decodedSched);
}
// Record the runtime schedule kind in the task flags.
//
// Schedule kinds 1..4 are stored as 0..3 in the bits selected by
// TaskDescr_SchedMask, which is the inverse of the decoding done in
// GetRuntimeSched().
//
// The encoded value is masked before it is merged into the flags so that an
// out-of-range `sched` (for example 0, which would encode to 0xFF) can only
// ever touch the schedule bits and never overwrites the other flag bits
// stored in items.flags.
INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) {
  // sched starts from 1..4; encode it as 0..3; so sub 1 here
  const uint8_t encodedSched =
      (static_cast<uint8_t>(sched) - 1) & TaskDescr_SchedMask;
  // clear current sched
  items.flags &= ~TaskDescr_SchedMask;
  // set new sched
  items.flags |= encodedSched;
}
// Initialize this descriptor as the level-zero (outermost) task descriptor.
//
// This is the slow initialization path: every field of `items` written here
// is set explicitly instead of being copied from a parent descriptor.
INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
  // Preferred chunking: static with a chunk size of 1.
  items.runtimeChunkSize = 1;
  // This descriptor belongs to the master thread.
  items.threadId = 0;
  // All flags cleared:
  //   - default schedule is static,
  //   - dyn is off (unused now anyway, but may need to sample from host ?),
  //   - not in a parallel region.
  items.flags = 0;
}
// Initialize this descriptor as a level-one task descriptor whose parent is
// `parentTaskDescr`.
//
// This is called when all threads are started together in SPMD mode.
// OMP directives include target parallel, target distribute parallel for, etc.
//
// This is the slow initialization path: the fields of `items` are set
// explicitly rather than copied from the parent.
INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
    omptarget_nvptx_TaskDescr *parentTaskDescr) {
  // Link back to the enclosing task.
  prev = parentTaskDescr;
  // Preferred chunking: static with a chunk size of 1.
  items.runtimeChunkSize = 1;
  // Thread ids come straight from cuda (only called for the 1st level).
  items.threadId = GetThreadIdInBlock();
  // Flags:
  //   - default schedule is static,
  //   - dyn is off (unused now anyway, but may need to sample from host ?),
  //   - in an L1 parallel region, created by a parallel construct.
  items.flags = TaskDescr_InPar | TaskDescr_IsParConstr;
}
// Copy only the `items` payload from `sourceTaskDescr` into this descriptor.
// The `prev` link of this descriptor is left untouched.
INLINE void omptarget_nvptx_TaskDescr::CopyData(
    omptarget_nvptx_TaskDescr *sourceTaskDescr) {
  omptarget_nvptx_TaskDescr &source = *sourceTaskDescr;
  items = source.items;
}
// Make this descriptor a full copy of `sourceTaskDescr`: it takes over both
// the source's `prev` link and the source's `items` payload.
INLINE void
omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) {
  // Share the same parent as the source ...
  prev = sourceTaskDescr->prev;
  // ... and duplicate its data items.
  CopyData(sourceTaskDescr);
}
// Initialize this descriptor as a child of `parentTaskDescr`: the parent's
// `items` payload is copied and `prev` is pointed at the parent itself.
INLINE void omptarget_nvptx_TaskDescr::CopyParent(
    omptarget_nvptx_TaskDescr *parentTaskDescr) {
  // Chain to the parent ...
  prev = parentTaskDescr;
  // ... and start from a copy of its data items.
  CopyData(parentTaskDescr);
}
// Initialize this descriptor for an explicit task created under
// `parentTaskDescr`.
//
// The parent's data is copied and linked as in CopyParent(); afterwards the
// TaskDescr_IsParConstr flag is cleared so the descriptor no longer reports
// itself as a parallel construct.
INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask(
    omptarget_nvptx_TaskDescr *parentTaskDescr) {
  CopyParent(parentTaskDescr);
  // Drop the "parallel construct" marker inherited from the parent.
  items.flags &= ~TaskDescr_IsParConstr;
  ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task");
}
// Initialize this (work) descriptor from the master's task descriptor.
//
// The master's data is copied and linked as in CopyParent(); the flags are
// then updated to mark the descriptor as being inside a parallel region that
// was created by a parallel construct.
INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr(
    omptarget_nvptx_TaskDescr *masterTaskDescr) {
  CopyParent(masterTaskDescr);
  // Overwrite specific items: set the flags to parallel.
  items.flags |= TaskDescr_InPar;
  items.flags |= TaskDescr_IsParConstr;
}
// Initialize this descriptor from the team's work descriptor.
//
// Everything (data items and `prev` link) is copied via Copy(); only the
// thread id is then replaced with the calling thread's id within the block.
INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(
    omptarget_nvptx_TaskDescr *workTaskDescr) {
  Copy(workTaskDescr);

  // Overwrite specific items.
  //
  // Strictly speaking the thread id should be
  //   GetThreadIdInBlock() % GetMasterThreadID()
  // so that the serial master (first lane in the master warp) ends up with
  // a threadId of 0. The modulo is not needed here, though: this function is
  // always called in a parallel region where only workers are active, and
  // the serial master thread never enters this region. When a parallel
  // region is executed serially, the threadId is set to 0 elsewhere and the
  // kmpc_serialized_* functions are called, which never activate this
  // region.
  //
  // The id is taken from cuda (only called for the 1st level).
  items.threadId = GetThreadIdInBlock();
}
// Initialize this descriptor as a child of `parentTaskDescr` for a nested
// (level 2 or deeper) parallel region.
//
// The parent's data is copied and linked as in CopyParent(); the thread id
// is then set to `tid` and the TaskDescr_InParL2P flag is raised.
// Note: `tnum` is accepted but not used by this function.
INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(
    omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) {
  CopyParent(parentTaskDescr);
  items.threadId = tid;
  // In L2+ parallelism.
  items.flags |= TaskDescr_InParL2P;
}
// Snapshot the loop state of this task's thread into `loopData`.
//
// The upper bound, next lower bound, schedule type, chunk and stride are read
// from the thread private context, indexed by items.threadId.
INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() {
  auto &threadCtx = *omptarget_nvptx_threadPrivateContext;
  const auto tid = items.threadId;

  loopData.loopUpperBound = threadCtx.LoopUpperBound(tid);
  loopData.nextLowerBound = threadCtx.NextLowerBound(tid);
  loopData.schedule = threadCtx.ScheduleType(tid);
  loopData.chunk = threadCtx.Chunk(tid);
  loopData.stride = threadCtx.Stride(tid);
}
// Write the loop state held in `loopData` back into the thread private
// context, indexed by items.threadId. This is the counterpart of
// SaveLoopData().
INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const {
  auto &threadCtx = *omptarget_nvptx_threadPrivateContext;
  const auto tid = items.threadId;

  threadCtx.Chunk(tid) = loopData.chunk;
  threadCtx.LoopUpperBound(tid) = loopData.loopUpperBound;
  threadCtx.NextLowerBound(tid) = loopData.nextLowerBound;
  threadCtx.Stride(tid) = loopData.stride;
  threadCtx.ScheduleType(tid) = loopData.schedule;
}
////////////////////////////////////////////////////////////////////////////////
// Thread Private Context
////////////////////////////////////////////////////////////////////////////////
// Return the top-level task descriptor currently recorded for thread `tid`.
//
// In fussy checking mode an assertion verifies that `tid` is below
// MAX_THREADS_PER_TEAM before the per-thread table is indexed.
INLINE omptarget_nvptx_TaskDescr *
omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) const {
  ASSERT0(
      LT_FUSSY, tid < MAX_THREADS_PER_TEAM,
      "Getting top level, tid is larger than allocated data structure size");
  omptarget_nvptx_TaskDescr *topDescr = topTaskDescr[tid];
  return topDescr;
}
// Reset the per-thread state of thread `tid` in this context.
//
// Only the pushed num-threads value and the top task descriptor pointer are
// reset here. The remaining per-thread fields (current_Event, events_Number,
// chunk, num_Iterations, schedule) don't need to be initialized here; they
// are initialized when dynamic scheduling is used.
INLINE void
omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) {
  // No num threads value has been pushed.
  nextRegion.tnum[tid] = 0;
  // The level-one task descriptor is initialized when starting the parallel
  // region, so the top task descriptor starts out as null (the team master
  // version will be fixed separately).
  topTaskDescr[tid] = nullptr;
}
////////////////////////////////////////////////////////////////////////////////
// Team Descriptor
////////////////////////////////////////////////////////////////////////////////
// Initialize the team descriptor by initializing its level-zero task
// descriptor.
INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() {
  auto &rootTaskDescr = levelZeroTaskDescr;
  rootTaskDescr.InitLevelZeroTaskDescr();
}
////////////////////////////////////////////////////////////////////////////////
// Get private data structure for thread
////////////////////////////////////////////////////////////////////////////////
// Utility routines for CUDA threads
// Return the team descriptor held by the thread private context.
INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() {
  omptarget_nvptx_TeamDescr &teamDescr =
      omptarget_nvptx_threadPrivateContext->TeamContext();
  return teamDescr;
}
// Return the work descriptor of the team descriptor obtained from
// getMyTeamDescriptor().
INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() {
  return getMyTeamDescriptor().WorkDescr();
}
// Return the top-level task descriptor recorded in the thread private context
// for the thread with id `threadId`.
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
  omptarget_nvptx_TaskDescr *topDescr =
      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
  return topDescr;
}
// Return the top-level task descriptor of the calling thread.
//
// The thread is identified by the id that GetLogicalThreadIdInBlock() returns
// for the given execution mode; the lookup itself is delegated to the
// overload taking an explicit thread id.
INLINE omptarget_nvptx_TaskDescr *
getMyTopTaskDescriptor(bool isSPMDExecutionMode) {
  const int logicalThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
  return getMyTopTaskDescriptor(logicalThreadId);
}
////////////////////////////////////////////////////////////////////////////////
// Memory management runtime functions.
////////////////////////////////////////////////////////////////////////////////
// Release the memory slot recorded in usedSlotIdx/usedMemIdx by atomically
// writing 0 to its key.
//
// In fussy checking mode both indices are asserted to be within bounds first.
INLINE void omptarget_nvptx_SimpleMemoryManager::Release() {
  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
          "SlotIdx is too big or uninitialized.");
  ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT,
          "MemIdx is too big or uninitialized.");
  MemDataTy &slotData = MemData[usedSlotIdx];
  unsigned *keyAddr = (unsigned *)&slotData.keys[usedMemIdx];
  // A key value of 0 is what Acquire() looks for when claiming an entry.
  __kmpc_atomic_exchange(keyAddr, 0u);
}
// Claim a free entry in the key table of the current slot and return the
// address of the matching chunk inside `buf`.
//
// Probing starts at hash(GetBlockIdInKernel()) and, as long as the atomic
// compare-and-swap of the key from 0 to 1 does not succeed, continues at
// hash(previous index + 1). The claimed indices are stored in
// usedSlotIdx/usedMemIdx so that Release() can free the entry again.
//
// \param buf  base address of the buffer that is carved into chunks.
// \param size size in bytes of one chunk.
// \return `buf` advanced by (slot * OMP_STATE_COUNT + entry) * size bytes.
INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf,
                                                                size_t size) {
  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
          "SlotIdx is too big or uninitialized.");
  const unsigned slotIdx = usedSlotIdx;
  MemDataTy &slotData = MemData[slotIdx];

  // Spin until a key is switched from 0 (free) to 1 (taken) by this thread.
  unsigned memIdx = hash(GetBlockIdInKernel());
  for (;;) {
    if (__kmpc_atomic_cas((unsigned *)&slotData.keys[memIdx], 0u, 1u) == 0)
      break;
    memIdx = hash(memIdx + 1);
  }

  usedSlotIdx = slotIdx;
  usedMemIdx = memIdx;

  const auto chunkIdx = slotIdx * OMP_STATE_COUNT + memIdx;
  return static_cast<const char *>(buf) + chunkIdx * size;
}
|