File: vector_kernel.cu

// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
//  $Revision: 5632 $
//  $Date: 2009-07-01 14:36:01 +1000 (Wed, 01 Jul 2009) $
// ------------------------------------------------------------- 
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// ------------------------------------------------------------- 

/**
 * @file
 * vector_kernel.cu
 * 
 * @brief CUDA kernel methods for basic operations on vectors.  
 * 
 * CUDA kernel methods for basic operations on vectors.  
 * 
 * Examples: 
 * - vectorAddConstant(): d_vector + constant 
 * - vectorAddUniform():  d_vector + uniform (per-block constants)
 * - vectorAddVector(): d_vector + d_vector
 */

// MJH: these functions assume there are 2N elements for N threads.
// Is this always going to be a good idea?  There may be cases where
// we have as many threads as elements, but for large problems we are
// probably limited by the maximum CTA size for simple kernels like
// this, so we should process multiple elements per thread.  We may
// want to extend these with looping versions that process many
// elements per thread (see the grid-stride sketch after
// vectorAddConstant below).

#include "cudpp_util.h"
#include "sharedmem.h"
#include "cudpp.h"

/** \addtogroup cudpp_kernel
  * @{
  */

/** @name Vector Functions
 * CUDA kernel methods for basic operations on vectors.  
 * @{
 */

/** @brief Adds a constant value to all values in the input d_vector
 *  
 * Each thread adds the constant to two elements.
 * @todo Test this function -- it is currently not yet used.
 *
 * @param[in,out] d_vector The array of elements to be modified
 * @param[in] constant The constant value to be added to elements of 
 * \a d_vector
 * @param[in] n The number of elements in the d_vector to be modified
 * @param[in] baseIndex An optional offset to the beginning of the
 * elements in the input array to be processed
 */
template <class T>
__global__  void vectorAddConstant(T   *d_vector, 
                                   T   constant, 
                                   int n, 
                                   int baseIndex)
{
    // Compute this thread's output address
    unsigned int address = baseIndex + threadIdx.x +
        __mul24(blockIdx.x, (blockDim.x << 1)); 

    // Two adds per thread: one in the first half of the block, one in the
    // second.  The second add is guarded so a partial final block does not
    // read or write out of bounds.
    d_vector[address] += constant;
    if (threadIdx.x + blockDim.x < n)
        d_vector[address + blockDim.x] += constant;
}
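
// A sketch of the looping variant suggested in the note at the top of this
// file (illustration only, not part of CUDPP): a grid-stride loop lets a
// single launch of a bounded grid cover any problem size, with each thread
// processing as many elements as needed.  The kernel name and structure are
// assumptions made for this sketch, not an existing CUDPP kernel.
template <class T>
__global__ void vectorAddConstantLoop(T *d_vector, T constant, int n)
{
    // Start at this thread's global index and stride by the total number of
    // threads in the grid until the end of the array is reached.
    for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < (unsigned int)n;
         i += blockDim.x * gridDim.x)
    {
        d_vector[i] += constant;
    }
}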

 /** @brief Add a uniform value to each data element of an array
  *
  * This function reads one value per CTA from \a d_uniforms into shared
  * memory and adds that value to all values "owned" by the CTA in \a
  * d_vector.  Each thread adds the uniform value to two elements.
  *
  * @param[in,out] d_vector The d_vector whose values will have the uniform added
  * @param[in] d_uniforms The array of uniform values (one per CTA)
  * @param[in] numElements The number of elements in \a d_vector to process
  * @param[in] blockOffset an optional offset to the beginning of this block's
  * data.
  * @param[in] baseIndex an optional offset to the beginning of the array 
  * within \a d_vector.
  */
template <class T>
__global__ void vectorAddUniform(T       *d_vector, 
                                 const T *d_uniforms, 
                                 int     numElements, 
                                 int     blockOffset, 
                                 int     baseIndex)
{
    __shared__ T uni;
    // Get this block's uniform value from the uniform array in device memory
    // We store it in shared memory so that the hardware's shared memory 
    // broadcast capability can be used to share among all threads in each warp
    // in a single cycle
    if (threadIdx.x == 0)
    {
        uni = d_uniforms[blockIdx.x + __mul24(gridDim.x, blockIdx.y) + blockOffset];
    }

    // Compute this thread's output address
    int width = __mul24(gridDim.x,(blockDim.x << 1));

    unsigned int address = baseIndex + __mul24(width, blockIdx.y)
        + threadIdx.x + __mul24(blockIdx.x, (blockDim.x << 1)); 

    __syncthreads();

    // note two adds per thread: one in first half of the block, one in last
    d_vector[address]              += uni;
    if (threadIdx.x + blockDim.x < numElements) d_vector[address + blockDim.x] += uni;
}
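
// Illustration only (not part of CUDPP's own dispatch code): a minimal
// host-side launch of vectorAddUniform for float data, e.g. to add one
// per-block value back onto the elements each block owns.  The block size of
// 256 and the assumption that each CTA owns 2*blockDim.x consecutive elements
// are choices made for this sketch, not requirements taken from CUDPP.
static void exampleLaunchVectorAddUniform(float *d_data, const float *d_blockValues,
                                          int numElements)
{
    const int blockSize     = 256;                  // threads per CTA (assumed)
    const int elemsPerBlock = 2 * blockSize;        // each thread handles two elements
    const int numBlocks     = (numElements + elemsPerBlock - 1) / elemsPerBlock;

    // One uniform value per CTA; no block offset or base index in this sketch.
    vectorAddUniform<float><<<numBlocks, blockSize>>>(d_data, d_blockValues,
                                                      numElements, 0, 0);
}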


/** @brief Apply a uniform value to each data element of an array (vec4 version)
  *
  * This function reads one value per CTA from \a d_uniforms into shared
  * memory and applies it, using the operator selected by the \a op template
  * parameter, to all values "owned" by the CTA in \a d_vector.
  * Each thread processes \a elementsPerThread values in \a d_vector.
  *
  * @param[in,out] d_vector The d_vector whose values will have the uniform applied
  * @param[in] d_uniforms The array of uniform values (one per CTA)
  * @param[in] numElements The number of elements in \a d_vector to process
  * @param[in] vectorRowPitch For 2D arrays, the pitch (in elements) of the 
  * rows of \a d_vector.
  * @param[in] uniformRowPitch For 2D arrays, the pitch (in elements) of the 
  * rows of \a d_uniforms.
  * @param[in] blockOffset an optional offset to the beginning of this block's
  * data.
  * @param[in] baseIndex an optional offset to the beginning of the array 
  * within \a d_vector.
  */
template <class T, CUDPPOperator op, int elementsPerThread>
__global__ void vectorAddUniform4(T       *d_vector, 
                                  const T *d_uniforms, 
                                  int      numElements,             
                                  int      vectorRowPitch,     // width of input array in elements
                                  int      uniformRowPitch,    // width of uniform array in elements
                                  int      blockOffset, 
                                  int      baseIndex)
{
    __shared__ T uni;
    // Get this block's uniform value from the uniform array in device memory
    // We store it in shared memory so that the hardware's shared memory 
    // broadcast capability can be used to share among all threads in each warp
    // in a single cycle
    if (threadIdx.x == 0)
    {
        uni = d_uniforms[blockIdx.x + __umul24(uniformRowPitch, blockIdx.y) + blockOffset];
    }

    // Compute this thread's output address
    //int width = __mul24(gridDim.x,(blockDim.x << 1));
   
    unsigned int address = baseIndex + __umul24(vectorRowPitch, blockIdx.y)
        + threadIdx.x + __umul24(blockIdx.x, (blockDim.x * elementsPerThread)); 
    numElements += __umul24(vectorRowPitch, blockIdx.y);

    __syncthreads();

    switch (op)
    {
    case CUDPP_ADD:
        for (int i = 0; i < elementsPerThread && address < numElements; i++)
        {
            d_vector[address] += uni;
            address += blockDim.x;
        }
        break;

    case CUDPP_MULTIPLY:
        for (int i = 0; i < elementsPerThread && address < numElements; i++)
        {
            d_vector[address] *= uni;
            address += blockDim.x;
        }
        break;

    case CUDPP_MAX:
        for (int i = 0; i < elementsPerThread && address < numElements; i++)
        {
            d_vector[address] = max(d_vector[address], uni);
            address += blockDim.x;
        }
        break;

    case CUDPP_MIN:
        for (int i = 0; i < elementsPerThread && address < numElements; i++)
        {
            d_vector[address] = min(d_vector[address], uni);
            address += blockDim.x;
        }
        break;
    default:
        break;
    }    
}
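
// Illustration only (not part of CUDPP's own dispatch code): instantiating
// vectorAddUniform4 for a 1D float array with the CUDPP_ADD operator and
// 8 elements per thread.  The block size and grid sizing are assumptions made
// for this sketch; with a 1D grid the row-pitch arguments contribute nothing
// to the computed addresses, so they are simply set to the array length and
// the grid width here.
static void exampleLaunchVectorAddUniform4(float *d_data, const float *d_blockValues,
                                           int numElements)
{
    const int blockSize      = 128;                 // threads per CTA (assumed)
    const int elemsPerThread = 8;                   // must match the template argument below
    const int elemsPerBlock  = blockSize * elemsPerThread;
    const int numBlocks      = (numElements + elemsPerBlock - 1) / elemsPerBlock;

    vectorAddUniform4<float, CUDPP_ADD, 8>
        <<<numBlocks, blockSize>>>(d_data, d_blockValues, numElements,
                                   numElements,   // vectorRowPitch (unused with a 1D grid)
                                   numBlocks,     // uniformRowPitch (unused with a 1D grid)
                                   0, 0);         // blockOffset, baseIndex
}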

/** @brief Adds together two vectors
 *  
 * Each thread adds two pairs of elements.
 * @todo Test this function -- it is currently not yet used.
 *
 * @param[in,out] d_vectorA The left operand array and the result
 * @param[in] d_vectorB The right operand array
 * @param[in] numElements The number of elements in the vectors to be added.
 * @param[in] baseIndex An optional offset to the beginning of the
 * elements in the input arrays to be processed
 */
template <class T>
__global__ void vectorAddVector(T       *d_vectorA,        // A += B
                                const T *d_vectorB,
                                int     numElements,
                                int     baseIndex)
{
    // Compute this thread's output address
    unsigned int address = baseIndex + threadIdx.x +
        __mul24(blockIdx.x, (blockDim.x << 1)); 

    // Two adds per thread: one in the first half of the block, one in the
    // second.  Each add reads the matching element of d_vectorB, and the
    // second add is guarded so a partial final block does not read or write
    // out of bounds.
    d_vectorA[address] += d_vectorB[address];
    if (threadIdx.x + blockDim.x < numElements)
        d_vectorA[address + blockDim.x] += d_vectorB[address + blockDim.x];
}

/** @brief Add a uniform value to data elements of an array, up to a per-CTA
  * maximum index (segmented vec4 version)
  *
  * This function reads one value per CTA from \a d_uniforms into shared
  * memory and adds that value to values "owned" by the CTA in \a d_vector.
  * The uniform value is added only to those values "owned" by the CTA whose
  * index is less than the CTA's entry in \a d_maxIndices. If that entry is
  * UINT_MAX, the uniform is added to all values "owned" by the CTA.
  * Each thread adds the uniform value to eight values in \a d_vector.
  *
  * @param[in,out] d_vector The d_vector whose values will have the uniform added
  * @param[in] d_uniforms The array of uniform values (one per CTA)
  * @param[in] d_maxIndices The array of maximum indices (one per CTA). This is
  *            the index up to which the uniform is added. If it is UINT_MAX,
  *            the uniform is added to all elements of the CTA. This index is
  *            1-based.
  * @param[in] numElements The number of elements in \a d_vector to process
  * @param[in] blockOffset an optional offset to the beginning of this block's
  * data.
  * @param[in] baseIndex an optional offset to the beginning of the array 
  * within \a d_vector.
  */
template <class T, CUDPPOperator oper, bool isLastBlockFull>
__global__ void vectorSegmentedAddUniform4(T                  *d_vector, 
                                           const T            *d_uniforms, 
                                           const unsigned int *d_maxIndices,
                                           unsigned int       numElements,
                                           int                blockOffset, 
                                           int                baseIndex)
{
    __shared__ T uni[2];

    unsigned int blockAddress = 
        blockIdx.x + __mul24(gridDim.x, blockIdx.y) + blockOffset;

    // Get this block's uniform value from the uniform array in device memory
    // We store it in shared memory so that the hardware's shared memory 
    // broadcast capability can be used to share among all threads in each warp
    // in a single cycle
    
    if (threadIdx.x == 0)
    {
        if (blockAddress > 0)
            uni[0] = d_uniforms[blockAddress-1];
        else
            uni[0] = Operator<T, oper>::identity(); 
        
        // Tacit assumption that T is four bytes wide
        uni[1] = (T)(d_maxIndices[blockAddress]);
    }

    // Compute this thread's output address
    int width = __mul24(gridDim.x,(blockDim.x << 1));

    unsigned int address = baseIndex + __mul24(width, blockIdx.y)
                           + threadIdx.x + __mul24(blockIdx.x, (blockDim.x << 3)); 

    __syncthreads();

    unsigned int maxIndex = (unsigned int)(uni[1]);

    bool isLastBlock = (blockIdx.x == (gridDim.x-1));
     
    if (maxIndex < UINT_MAX)
    {
        // Since maxIndex is a 1 based index
        --maxIndex;
        bool leftLess = address < maxIndex;
        bool rightLess = (address + 7 * blockDim.x) < maxIndex;

        if (leftLess)
        {
            if (rightLess)
            {
                for (unsigned int i = 0; i < 8; ++i)
                    d_vector[address + i * blockDim.x] = 
                        Operator<T, oper>::op(d_vector[address + i * blockDim.x], uni[0]);
            }
            else
            {
                for (unsigned int i=0; i < 8; ++i)
                {
                    if (address < maxIndex)
                        d_vector[address] = 
                            Operator<T, oper>::op(d_vector[address], uni[0]);

                    address += blockDim.x;
                }
            }
        }
    }
    else
    {
        if (!isLastBlockFull && isLastBlock)
        {
            for (unsigned int i = 0; i < 8; ++i)
            {
                if (address < numElements)
                    d_vector[address] = 
                        Operator<T, oper>::op(d_vector[address], uni[0]);
                
                address += blockDim.x;
            }
        }
        else
        {
            for (unsigned int i=0; i<8; ++i)
            {
                d_vector[address] = 
                    Operator<T, oper>::op(d_vector[address], uni[0]);
                
                address += blockDim.x;
            }            
        }
    }
}
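
// Illustration only (not part of CUDPP's own dispatch code): a minimal 1D
// launch of vectorSegmentedAddUniform4 for float data with the CUDPP_ADD
// operator.  Each thread handles eight elements, so each CTA owns
// 8*blockDim.x consecutive elements; the block size and the way the
// isLastBlockFull specialization is selected are assumptions made for this
// sketch.
static void exampleLaunchSegmentedAddUniform4(float              *d_data,
                                              const float        *d_blockValues,
                                              const unsigned int *d_maxIndices,
                                              unsigned int        numElements)
{
    const unsigned int blockSize     = 128;             // threads per CTA (assumed)
    const unsigned int elemsPerBlock = 8 * blockSize;   // eight elements per thread
    const unsigned int numBlocks     = (numElements + elemsPerBlock - 1) / elemsPerBlock;

    // Pick the specialization that matches whether the final CTA is full.
    if (numElements % elemsPerBlock == 0)
        vectorSegmentedAddUniform4<float, CUDPP_ADD, true>
            <<<numBlocks, blockSize>>>(d_data, d_blockValues, d_maxIndices,
                                       numElements, 0, 0);
    else
        vectorSegmentedAddUniform4<float, CUDPP_ADD, false>
            <<<numBlocks, blockSize>>>(d_data, d_blockValues, d_maxIndices,
                                       numElements, 0, 0);
}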

/** @brief Add a uniform value to data elements of an array to the right of a
  * per-CTA minimum index (segmented vec4 version)
  *
  * This function reads one value per CTA from \a d_uniforms into shared
  * memory and adds that value to values "owned" by the CTA in \a d_vector.
  * The uniform value is added only to those values "owned" by the CTA whose
  * index is greater than the CTA's entry in \a d_minIndices. If that entry is
  * 0, the uniform is added to all values "owned" by the CTA.
  * Each thread adds the uniform value to eight values in \a d_vector.
  *
  * @param[in,out] d_vector The d_vector whose values will have the uniform added
  * @param[in] d_uniforms The array of uniform values (one per CTA)
  * @param[in] d_minIndices The array of minimum indices (one per CTA). The
  *            uniform is added to the right of this index (that is, to every
  *            index greater than it). If this is 0, the uniform is added to
  *            all elements of the CTA. The index is 1-based so that 0 can
  *            unambiguously mean "no flag in this CTA": if the first element
  *            of a CTA carried a flag and indices were 0-based, its index
  *            would also be 0, so 1-based indexing makes that case 1 instead.
  * @param[in] numElements The number of elements in \a d_vector to process
  * @param[in] blockOffset an optional offset to the beginning of this block's
  * data.
  * @param[in] baseIndex an optional offset to the beginning of the array 
  * within \a d_vector.
  *
  */
template <class T, CUDPPOperator oper, bool isLastBlockFull>
__global__ void vectorSegmentedAddUniformToRight4(T                  *d_vector, 
                                                  const T            *d_uniforms, 
                                                  const unsigned int *d_minIndices,
                                                  unsigned int       numElements,
                                                  int                blockOffset, 
                                                  int                baseIndex)
{
    __shared__ T uni[2];

    unsigned int blockAddress = 
        blockIdx.x + __mul24(gridDim.x, blockIdx.y) + blockOffset;

    // Get this block's uniform value from the uniform array in device memory
    // We store it in shared memory so that the hardware's shared memory 
    // broadcast capability can be used to share among all threads in each warp
    // in a single cycle
    
    if (threadIdx.x == 0)
    {
        // FIXME - blockAddress test here is incompatible with how it is calculated
        // above
        if (blockAddress < (gridDim.x-1))
            uni[0] = d_uniforms[blockAddress+1];
        else
            uni[0] = Operator<T, oper>::identity(); 
        
        // Tacit assumption that T is four bytes wide
        uni[1] = (T)(d_minIndices[blockAddress]);
    }

    // Compute this thread's output address
    int width = __mul24(gridDim.x,(blockDim.x << 1));

    unsigned int address = baseIndex + __mul24(width, blockIdx.y)
                           + threadIdx.x + __mul24(blockIdx.x, (blockDim.x << 3)); 

    __syncthreads();

    unsigned int minIndex = (unsigned int)(uni[1]);

    bool isLastBlock = (blockIdx.x == (gridDim.x-1));
     
    if (minIndex > 0)
    {
        // Since minIndex is a 1 based index
        --minIndex;
        bool leftInRange = address > minIndex;
        bool rightInRange = (address + 7 * blockDim.x) > minIndex;

        if (rightInRange)
        {
            if (leftInRange)
            {
                for (unsigned int i = 0; i < 8; ++i)
                    d_vector[address + i * blockDim.x] = 
                        Operator<T, oper>::op(d_vector[address + i * blockDim.x], uni[0]);
            }
            else
            {
                for (unsigned int i=0; i < 8; ++i)
                {
                    if (address > minIndex)
                        d_vector[address] = 
                            Operator<T, oper>::op(d_vector[address], uni[0]);

                    address += blockDim.x;
                }
            }
        }
    }
    else
    {
        if (!isLastBlockFull && isLastBlock)
        {
            for (unsigned int i = 0; i < 8; ++i)
            {
                if (address < numElements)
                    d_vector[address] = 
                        Operator<T, oper>::op(d_vector[address], uni[0]);
                
                address += blockDim.x;
            }
        }
        else
        {
            for (unsigned int i=0; i<8; ++i)
            {
                d_vector[address] = 
                    Operator<T, oper>::op(d_vector[address], uni[0]);
                
                address += blockDim.x;
            }            
        }
    }
}

/** @} */ // end d_vector functions
/** @} */ // end cudpp_kernel