// Register-resident array whose alignment is widened to
// sizeof(scalar_t) * align_size bytes so it can be the source/target of
// vectorized load/store operations.
template <typename scalar_t, int size, int align_size>
struct alignas(sizeof(scalar_t) * align_size) Array {
  scalar_t array[size];

  // Broadcast `v` into every element of the array.
  __device__ void set(scalar_t v) {
#pragma unroll
    for (int idx = 0; idx < size; ++idx) {
      array[idx] = v;
    }
  }

  // Unchecked element access.
  __device__ scalar_t& operator[](const unsigned int i) {
    return array[i];
  }
};
// Fill buff[0..vec_size) with `val`. Used for vectorized allocations that do
// not live in registers (unlike Array::set above).
template <typename scalar_t, int vec_size>
__device__ void arraySet(scalar_t* buff, scalar_t val) {
#pragma unroll
  for (int idx = vec_size - 1; idx >= 0; --idx) {
    buff[idx] = val;
  }
}
// Copies vec_size elements from `from` to `to` as a single access of
// sizeof(scalar_t) * vec_size bytes, using a CUDA builtin vector type as the
// transfer type. Total sizes other than 1/2/4/8/12/16 bytes are silently
// skipped by the switch. Callers must ensure both pointers are sufficiently
// aligned for the vector type selected below.
template <typename scalar_t, int vec_size>
__device__ void loadGeneric(scalar_t* to, scalar_t* from) {
  // It would be really nice to use memcpy here, but one example was failing
  // with:
  //
  // memcpy(to, from, vec_size * sizeof(scalar_t));
  //
  // Yet passing with:
  //
  // for(int i = 0; i < vec_size; i++){
  //   to[i] = from[i];
  // }
  switch (sizeof(scalar_t) * vec_size) {
    case 1:
      *reinterpret_cast<uchar1*>(to) = *reinterpret_cast<uchar1*>(from);
      break;
    case 2:
      *reinterpret_cast<uchar2*>(to) = *reinterpret_cast<uchar2*>(from);
      break;
    case 4:
      *reinterpret_cast<uint1*>(to) = *reinterpret_cast<uint1*>(from);
      break;
    case 8:
      *reinterpret_cast<uint2*>(to) = *reinterpret_cast<uint2*>(from);
      break;
    // 12 bytes: uint3 has 4-byte alignment and lowers to three 32-bit moves.
    case 12:
      *reinterpret_cast<uint3*>(to) = *reinterpret_cast<uint3*>(from);
      break;
    case 16:
      *reinterpret_cast<uint4*>(to) = *reinterpret_cast<uint4*>(from);
      break;
  }
}
// Volatile version only works with c++ fundamental types.
//
// Copies vec_size elements from `from` to `to` as a single bitwise access,
// where either side may be volatile-qualified (selected by the template
// flags; MaybeVolatile adds/omits the qualifier). The transfer type is a
// fundamental type of matching width (1 -> unsigned char, 2 -> short,
// 4 -> unsigned int, 8 -> double); signedness/category is irrelevant since
// the assignment is a pure bit copy. Total sizes other than 1/2/4/8 bytes
// are silently skipped by the switch.
template <
    typename scalar_t,
    int vec_size,
    bool is_volatile_to,
    bool is_volatile_from>
__device__ void loadGenericVolatile(
    typename MaybeVolatile<scalar_t, is_volatile_to>::type* to,
    typename MaybeVolatile<scalar_t, is_volatile_from>::type* from) {
  switch (sizeof(scalar_t) * vec_size) {
    // Reinterpret cast like this with volatile types only works for C++
    // fundamental types otherwise the = operator is not defined
    case 1:
      *reinterpret_cast<
          typename MaybeVolatile<unsigned char, is_volatile_to>::type*>(to) =
          *reinterpret_cast<
              typename MaybeVolatile<unsigned char, is_volatile_from>::type*>(
              from);
      break;
    case 2:
      *reinterpret_cast<typename MaybeVolatile<short, is_volatile_to>::type*>(
          to) =
          *reinterpret_cast<
              typename MaybeVolatile<short, is_volatile_from>::type*>(from);
      break;
    case 4:
      *reinterpret_cast<
          typename MaybeVolatile<unsigned int, is_volatile_to>::type*>(to) =
          *reinterpret_cast<
              typename MaybeVolatile<unsigned int, is_volatile_from>::type*>(
              from);
      break;
    case 8:
      *reinterpret_cast<typename MaybeVolatile<double, is_volatile_to>::type*>(
          to) =
          *reinterpret_cast<
              typename MaybeVolatile<double, is_volatile_from>::type*>(from);
      break;
  }
}
// Stores vec_size elements (vec_size * sizeof(scalar_t) bytes total) from a
// private (register/local) buffer `from` to global memory `to`.
// - 1/2/4-byte totals: plain volatile-aware store via loadGenericVolatile.
// - 8/16-byte totals: inline PTX vector stores. `st.volatile.global` when
//   is_volatile, otherwise `st.global.cs` (cache-streaming hint). The `.s32`
//   operand type is a bitwise width specifier; signedness does not matter.
// NOTE(review): total sizes not listed in the switch (e.g. 12 bytes) fall
// through and store nothing — verify callers never instantiate such sizes.
template <typename scalar_t, int vec_size, bool is_volatile>
__device__ void loadLocalToGlobal(
    typename MaybeVolatile<scalar_t, is_volatile>::type* to,
    scalar_t* from) {
  switch (sizeof(scalar_t) * vec_size) {
    case 1:
    case 2:
    case 4:
      loadGenericVolatile<scalar_t, vec_size, is_volatile, false>(to, from);
      break;
    case 8: {
      // View the source buffer as a uint2 so its words feed the PTX operands.
      uint2 const& data = *reinterpret_cast<uint2*>(from);
      // is_volatile is a template parameter, so this branch is resolved at
      // compile time and the dead arm is pruned.
      if (is_volatile) {
        asm volatile(
            "st.volatile.global.v2.s32 [%0], {%1,%2};" ::"l"(
                (typename MaybeVolatile<uint2, is_volatile>::type*)to),
            "r"(data.x),
            "r"(data.y));
      } else {
        asm volatile(
            "st.global.cs.v2.s32 [%0], {%1,%2};" ::"l"(
                (typename MaybeVolatile<uint2, is_volatile>::type*)to),
            "r"(data.x),
            "r"(data.y));
      }
      break;
    }
    case 16: {
      uint4 const& data = *reinterpret_cast<uint4*>(from);
      if (is_volatile) {
        asm volatile(
            "st.volatile.global.v4.s32 [%0], {%1,%2,%3,%4};" ::"l"(
                (typename MaybeVolatile<uint4, is_volatile>::type*)to),
            "r"(data.x),
            "r"(data.y),
            "r"(data.z),
            "r"(data.w));
      } else {
        asm volatile(
            "st.global.cs.v4.s32 [%0], {%1,%2,%3,%4};" ::"l"(
                (typename MaybeVolatile<uint4, is_volatile>::type*)to),
            "r"(data.x),
            "r"(data.y),
            "r"(data.z),
            "r"(data.w));
      }
      break;
    }
  }
}
// Loads vec_size elements (vec_size * sizeof(scalar_t) bytes total) from
// global memory `from` into a private (register/local) buffer `to`.
// - 1/2/4-byte totals: plain volatile-aware load via loadGenericVolatile.
// - 8/16-byte totals: inline PTX vector loads. `ld.volatile.global` when
//   is_volatile, otherwise `ld.global.cs` (cache-streaming hint). The `.s32`
//   operand type is a bitwise width specifier; signedness does not matter.
// NOTE(review): total sizes not listed in the switch (e.g. 12 bytes) fall
// through and load nothing — verify callers never instantiate such sizes.
//
// Cleanup vs. previous version: the identical `data` binding was hoisted out
// of the is_volatile branches, and a redundant `break;` inside the volatile
// arm of case 8 (inconsistent with the else arm and with case 16) was
// removed. Generated code is unchanged.
template <typename scalar_t, int vec_size, bool is_volatile>
__device__ void loadGlobalToLocal(
    scalar_t* to,
    typename MaybeVolatile<scalar_t, is_volatile>::type* from) {
  switch (sizeof(scalar_t) * vec_size) {
    case 1:
    case 2:
    case 4:
      loadGenericVolatile<scalar_t, vec_size, false, is_volatile>(to, from);
      break;
    case 8: {
      // View the destination buffer as a uint2 so the PTX result registers
      // write straight into it. is_volatile is a template parameter, so the
      // branch below is resolved at compile time.
      uint2& data = *reinterpret_cast<uint2*>(to);
      if (is_volatile) {
        asm volatile("ld.volatile.global.v2.s32 {%0,%1}, [%2];"
                     : "=r"(data.x), "=r"(data.y)
                     : "l"((uint2*)from));
      } else {
        asm volatile("ld.global.cs.v2.s32 {%0,%1}, [%2];"
                     : "=r"(data.x), "=r"(data.y)
                     : "l"((uint2*)from));
      }
      break;
    }
    case 16: {
      uint4& data = *reinterpret_cast<uint4*>(to);
      if (is_volatile) {
        asm volatile("ld.volatile.global.v4.s32 {%0,%1,%2,%3}, [%4];"
                     : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w)
                     : "l"((uint4*)from));
      } else {
        asm volatile("ld.global.cs.v4.s32 {%0,%1,%2,%3}, [%4];"
                     : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w)
                     : "l"((uint4*)from));
      }
      break;
    }
  }
}
// Copies vec_size * sizeof(scalar_t) bytes from one global-memory location to
// another, honoring the (template-selected) volatility of each side.
// - 1/2/4/8-byte totals: single load + store via loadGenericVolatile.
// - 12-byte totals: three 4-byte volatile-aware transfers (see below).
// - 16-byte totals: staged through a register-resident uint4 using the
//   PTX-backed loadGlobalToLocal / loadLocalToGlobal fast paths.
// Other totals are silently skipped by the switch.
template <
    typename scalar_t,
    int vec_size,
    bool is_volatile_to,
    bool is_volatile_from>
__device__ void loadGlobalToGlobal(
    typename MaybeVolatile<scalar_t, is_volatile_to>::type* to,
    typename MaybeVolatile<scalar_t, is_volatile_from>::type* from) {
  switch (sizeof(scalar_t) * vec_size) {
    // Reinterpret cast like this with volatile types only works for C++
    // fundamental types otherwise the = operator is not defined
    case 1:
    case 2:
    case 4:
    case 8:
      loadGenericVolatile<scalar_t, vec_size, is_volatile_to, is_volatile_from>(
          to, from);
      break;
    case 12: {
      // BUGFIX: this size used to be staged through loadGlobalToLocal /
      // loadLocalToGlobal, but neither helper has a case for 12 bytes, so the
      // copy was silently skipped (nothing loaded, nothing stored). PTX also
      // has no 12-byte vector access, so move the data as three 4-byte words
      // instead — the same access pattern a uint3 copy lowers to — which
      // preserves the volatility of both sides.
      constexpr int elems_per_word = 4 / static_cast<int>(sizeof(scalar_t));
#pragma unroll
      for (int i = 0; i < 3; ++i) {
        loadGenericVolatile<
            scalar_t,
            elems_per_word,
            is_volatile_to,
            is_volatile_from>(
            to + i * elems_per_word, from + i * elems_per_word);
      }
      break;
    }
    case 16: {
      // Stage through registers: one vectorized global load, then one
      // vectorized global store.
      uint4 local_intermediate;
      loadGlobalToLocal<scalar_t, vec_size, is_volatile_from>(
          reinterpret_cast<scalar_t*>(&local_intermediate), from);
      loadLocalToGlobal<scalar_t, vec_size, is_volatile_to>(
          to, reinterpret_cast<scalar_t*>(&local_intermediate));
      break;
    }
  }
}