1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
|
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(embed_forward,Dtype)(const int_tp nthreads,
__global const Dtype* bottom_data,
__global const Dtype* weight,
const int_tp M, const int_tp N,
const int_tp K,
__global Dtype* top_data) {
for (int_tp top_index = get_global_id(0); top_index < nthreads;
top_index += get_global_size(0)) {
const int_tp n = top_index / N;
const int_tp d = top_index % N;
const int_tp index = (int_tp)(bottom_data[n]);
const int_tp weight_index = index * N + d;
top_data[top_index] = weight[weight_index];
}
}
// atomic_add from: http://suhorukov.blogspot.com/2011/12/opencl-11-atomic-operations-on-floating.html
#if (TYPE == TYPE_FLOAT)
inline void TEMPLATE(atomic_add,Dtype)(volatile __global Dtype *source, const Dtype operand) {
union {
uint_tp intVal;
Dtype floatVal;
} newVal;
union {
uint_tp intVal;
Dtype floatVal;
} prevVal;
do {
prevVal.floatVal = *source;
newVal.floatVal = prevVal.floatVal + operand;
} while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
}
__kernel void TEMPLATE(embed_backward,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data,
__global const Dtype* top_diff, const int_tp M, const int_tp N, const int_tp K,
__global Dtype* weight_diff) {
for (int_tp top_index = get_global_id(0); top_index < nthreads;
top_index += get_global_size(0)) {
const int_tp n = top_index / N;
const int_tp d = top_index % N;
const int_tp index = (int_tp)(bottom_data[n]);
const int_tp weight_index = index * N + d;
TEMPLATE(atomic_add,Dtype)((weight_diff + weight_index), *(top_diff + top_index));
}
}
#endif
#if (TYPE == TYPE_DOUBLE)
#ifdef ATOMICS_64_AVAILABLE
inline void TEMPLATE(atomic_add,Dtype)(volatile __global Dtype *source, const Dtype operand) {
union {
unsigned long intVal;
Dtype floatVal;
} newVal;
union {
unsigned long intVal;
Dtype floatVal;
} prevVal;
do {
prevVal.floatVal = *source;
newVal.floatVal = prevVal.floatVal + operand;
} while (atom_cmpxchg((volatile __global unsigned long *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
}
__kernel void TEMPLATE(embed_backward,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data,
__global const Dtype* top_diff, const int_tp M, const int_tp N, const int_tp K,
__global Dtype* weight_diff) {
for (int_tp top_index = get_global_id(0); top_index < nthreads;
top_index += get_global_size(0)) {
const int_tp n = top_index / N;
const int_tp d = top_index % N;
const int_tp index = (int_tp)(bottom_data[n]);
const int_tp weight_index = index * N + d;
TEMPLATE(atomic_add,Dtype)((weight_diff + weight_index), *(top_diff + top_index));
}
}
#endif
#endif
|