File: block_sync_atomic.cu

// Counter-based block synchronization. Only meant to be used for
// debugging and validating synchronization. This should eventually be
// replaced with cuda::barrier::arrive_and_wait, which should be more
// robust (see the sketch below).
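
// A minimal sketch of that replacement (an assumption based on the
// note above, not nvfuser's actual interface). It uses libcu++'s
// <cuda/barrier>, which requires compute capability 7.0 or higher and
// may not be available in every context this file is compiled in.
#include <cuda/barrier>

namespace block_sync_barrier {

__shared__ cuda::barrier<cuda::thread_scope_block> bar;

__device__ void init() {
  const unsigned int tid = threadIdx.x + threadIdx.y * blockDim.x +
      threadIdx.z * blockDim.x * blockDim.y;
  if (tid == 0) {
    // init() here resolves to the friend function of cuda::barrier
    // (found via argument-dependent lookup); the expected arrival
    // count is every thread of the block.
    init(&bar, blockDim.x * blockDim.y * blockDim.z);
  }
  __syncthreads();
}

__device__ void sync() {
  bar.arrive_and_wait();
}

} // namespace block_sync_barrier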

namespace block_sync {

using CounterType = unsigned int;
static constexpr CounterType COUNTER_TYPE_MAX = ~(CounterType)0;
__shared__ CounterType sync_counter;

// Initialize the counter. Must be called by every thread of the block
// before the first call to sync().
__device__ void init() {
  const unsigned int tid = threadIdx.x + threadIdx.y * blockDim.x +
      threadIdx.z * blockDim.x * blockDim.y;
  if (tid == 0) {
    sync_counter = 0;
  }
  __syncthreads();
}

// Emulate __syncthreads() with a synchronization counter
__device__ void sync() {
  unsigned int backoff = 8;
  const unsigned int backoff_max = 256;
  const unsigned int num_threads = blockDim.x * blockDim.y * blockDim.z;

  // Make this thread's prior memory writes visible to the rest of the
  // block before signaling arrival below.
  __threadfence_block();

  // Use the counter only up to a limit that is a multiple of
  // num_threads, so that wrap-around always falls on a round boundary
  // and `next` below cannot overflow. atomicInc wraps the counter back
  // to 0 once it reaches counter_max - 1, so sync_counter cycles
  // through [0, counter_max).
  const auto counter_max = (COUNTER_TYPE_MAX / num_threads) * num_threads;
  const auto old = atomicInc(&sync_counter, counter_max - 1);

  // The counter value at which all num_threads threads of the current
  // round will have arrived at this barrier.
  const auto next = (old / num_threads) * num_threads + num_threads;

  // Read through a volatile pointer so the compiler reloads the shared
  // counter on every iteration instead of caching it in a register.
  auto local_sync_counter = *(volatile CounterType*)(&sync_counter);

  // sync_counter may wrap around, which would make local_sync_counter
  // smaller than old. Since counter_max is a multiple of num_threads,
  // the wrap can only happen once all threads of this round have
  // incremented the counter, so it is safe to stop waiting.
  while (local_sync_counter < next && old < local_sync_counter) {
#if __CUDA_ARCH__ >= 700
    // __nanosleep only available on compute capability 7.0 or higher
    __nanosleep(backoff); // avoids busy waiting
#endif
    if (backoff < backoff_max) {
      backoff *= 2;
    }
    local_sync_counter = *(volatile CounterType*)(&sync_counter);
  }
}

} // namespace block_sync
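
// A minimal usage sketch (hypothetical kernel, not part of the
// original file). The calling convention mirrors __syncthreads():
// every thread of the block calls init() once at kernel entry, then
// calls sync() at each barrier point.
__global__ void block_sync_example(int* out) {
  __shared__ int stage[128]; // assumes blockDim == (128, 1, 1)
  block_sync::init();

  // Phase 1: each thread publishes a value.
  const unsigned int tid = threadIdx.x;
  stage[tid] = (int)tid;

  block_sync::sync(); // stands in for __syncthreads()

  // Phase 2: read the value published by the neighboring thread,
  // which is safe only because of the barrier above.
  out[tid] = stage[(tid + 1) % blockDim.x];
}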