File: shuffle.h

package info (click to toggle)
llvm-toolchain-14 1%3A14.0.6-12
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,496,180 kB
  • sloc: cpp: 5,593,972; ansic: 986,872; asm: 585,869; python: 184,223; objc: 72,530; lisp: 31,119; f90: 27,793; javascript: 9,780; pascal: 9,762; sh: 9,482; perl: 7,468; ml: 5,432; awk: 3,523; makefile: 2,538; xml: 953; cs: 573; fortran: 567
file content (102 lines) | stat: -rw-r--r-- 3,070 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
//===- shuffle.h - OpenMP variants of the shuffle idiom for all targets -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Shuffle function implementations for all supported targets.
//
// Note: We unify the mask type to uint64_t instead of __kmpc_impl_lanemask_t.
//
//===----------------------------------------------------------------------===//

#ifndef LIBOMPTARGET_DEVICERTL_SHUFFLE_H
#define LIBOMPTARGET_DEVICERTL_SHUFFLE_H

#include <stdint.h>

#pragma omp declare target

/// External shuffle API
///
///{

extern "C" {
int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
}

///}

/// Forward declarations
///
///{
extern "C" {
unsigned GetLaneId();
unsigned __kmpc_get_warp_size();
void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
}
///}

/// Fallback implementations of the shuffle sync idiom.
/// Unavailable at present (would error at link time if used).
///
///{

int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var, int32_t SrcLane);

int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var, uint32_t Delta,
                                   int32_t Width);

///}

/// AMDGCN implementations of the shuffle sync idiom.
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})

inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
                                     int32_t SrcLane) {
  int Width = __kmpc_get_warp_size();
  int Self = GetLaneId();
  int Index = SrcLane + (Self & ~(Width - 1));
  return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
}

inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
                                          uint32_t LaneDelta, int32_t Width) {
  int Self = GetLaneId();
  int Index = Self + LaneDelta;
  Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
  return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
}

#pragma omp end declare variant
///}

/// NVPTX implementations of the shuffle and shuffle sync idiom.
///
///{
#pragma omp begin declare variant match(                                       \
    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})

inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
                                     int32_t SrcLane) {
  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
}

inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
                                          uint32_t Delta, int32_t Width) {
  int32_t T = ((__kmpc_get_warp_size() - Width) << 8) | 0x1f;
  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
}

#pragma omp end declare variant
///}

#pragma omp end declare target

#endif