File: cudpp.cpp

// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5632 $
// $Date: 2009-07-01 14:36:01 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------

/**
 * @file
 * cudpp.cpp
 *
 * @brief Main library source file.  Implements wrappers for public
 * interface.
 *
 * Main library source file.  Implements wrappers for public
 * interface.  These wrappers call application-level operators.
 * As this grows we may decide to partition into multiple source
 * files.
 */

/**
 * \defgroup publicInterface CUDPP Public Interface
 * The CUDPP public interface comprises the functions, structs, and enums
 * defined in cudpp.h.  Public interface functions call functions in the
 * \link cudpp_app Application-Level\endlink interface. The public
 * interface functions include Plan Interface functions and Algorithm
 * Interface functions.  Plan Interface functions are used for creating
 * CUDPP Plan objects which contain configuration details, intermediate
 * storage space, and in the case of cudppSparseMatrix(), data.  The
 * Algorithm Interface is the set of functions that do the real work
 * of CUDPP, such as cudppScan() and cudppSparseMatrixVectorMultiply().
 *
 * @{
 */

/** @name Algorithm Interface
 * @{
 */

#include "cudpp.h"
#include "cudpp_plan_manager.h"
#include "cudpp_scan.h"
//#include "cudpp_segscan.h"
//#include "cudpp_compact.h"
//#include "cudpp_spmvmult.h"
#include "cudpp_radixsort.h"
//#include "cudpp_rand.h"

/**
 * @brief Performs a scan operation of numElements on its input in
 * GPU memory (d_in) and places the output in GPU memory
 * (d_out), with the scan parameters specified in the plan pointed to by
 * planHandle.
 *
 * The input to a scan operation is an input array, a binary associative
 * operator (like + or max), and an identity element for that operator
 * (+'s identity is 0). The output of scan is the same size as its input.
 * Informally, the output at each element is the result of operator
 * applied to each input that comes before it. For instance, the
 * output of sum-scan at each element is the sum of all the input
 * elements before that input.
 *
 * More formally, for associative operator
 * @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly,
 * <var>out<sub>i</sub></var> = <var>in<sub>0</sub></var>
 * @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly
 * <var>in<sub>1</sub></var>
 * @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly ...
 * @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly
 * <var>in<sub>i-1</sub></var>.
 *
 * CUDPP supports "exclusive" and "inclusive" scans. For the ADD operator,
 * an exclusive scan computes the sum of all input elements before the
 * current element, while an inclusive scan computes the sum of all input
 * elements up to and including the current element.
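 *
 * For instance, sum-scanning the input [3 1 7 0 4] gives:
 * \code
 * exclusive: [0 3  4 11 11]
 * inclusive: [3 4 11 11 15]
 * \endcode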
 *
 * Before calling scan, create an internal plan using cudppPlan().
 *
 * After you are finished with the scan plan, clean up with cudppDestroyPlan().
 *
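 * For example, a minimal sketch of that sequence (assuming a forward
 * exclusive sum-scan of unsigned ints; error checking omitted):
 * \code
 * // d_in/d_out: device arrays of numElements unsigned ints,
 * // allocated elsewhere
 * CUDPPConfiguration config;
 * config.algorithm = CUDPP_SCAN;
 * config.op        = CUDPP_ADD;
 * config.datatype  = CUDPP_UINT;
 * config.options   = CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE;
 *
 * CUDPPHandle scanPlan;
 * cudppPlan(&scanPlan, config, numElements, 1, 0);
 * cudppScan(scanPlan, d_out, d_in, numElements);
 * cudppDestroyPlan(scanPlan);
 * \endcode
 *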
 * @param[in] planHandle Handle to plan for this scan
 * @param[out] d_out output of scan, in GPU memory
 * @param[in] d_in input to scan, in GPU memory
 * @param[in] numElements number of elements to scan
 *
 * @see cudppPlan, cudppDestroyPlan
 */
CUDPP_DLL
CUDPPResult cudppScan(CUDPPHandle planHandle,
                      void        *d_out,
                      const void  *d_in,
                      size_t      numElements)
{
    CUDPPScanPlan *plan = (CUDPPScanPlan*)CUDPPPlanManager::GetPlan(planHandle);
    if (plan != nullptr)
    {
        cudppScanDispatch(d_out, d_in, numElements, 1, plan);
        return CUDPP_SUCCESS;
    }
    else
    {
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
    }
}

/**
 * @brief Performs a segmented scan operation of numElements on its input in
 * GPU memory (d_idata) and places the output in GPU memory
 * (d_out), with the scan parameters specified in the plan pointed to by
 * planHandle.
 *
 * The input to a segmented scan operation is an input array of data,
 * an input array of flags which demarcate segments, a binary associative
 * operator (like + or max), and an identity element for that operator
 * (+'s identity is 0). The array of flags is the same length as the input
 * with 1 marking the first element of a segment and 0 otherwise. The
 * output of segmented scan is the same size as its input. Informally, the
 * output at each element is the result of operator applied to each input
 * that comes before it in that segment. For instance, the output of
 * segmented sum-scan at each element is the sum of all the input elements
 * before that input in that segment.
 *
 * More formally, for associative operator
 * @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly,
 * <var>out<sub>i</sub></var> = <var>in<sub>k</sub></var>
 * @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly
 * <var>in<sub>k+1</sub></var>
 * @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly ...
 * @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly
 * <var>in<sub>i-1</sub></var>.
 * where <i>k</i> is the index of the first element of the segment in which <i>i</i> lies.
 *
 * We support both "exclusive" and "inclusive" variants. For a segmented sum-scan,
 * the exclusive variant computes the sum of all input elements before the
 * current element in that segment, while the inclusive variant computes the
 * sum of all input elements up to and including the current element, in
 * that segment.
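 *
 * For instance, for input [3 1 7 0 4] with flags [1 0 1 0 0] (two
 * segments, [3 1] and [7 0 4]), a segmented sum-scan gives:
 * \code
 * exclusive: [0 3 0 7 7]
 * inclusive: [3 4 7 7 11]
 * \endcode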
 *
 * Before calling segmented scan, create an internal plan using cudppPlan().
 *
 * After you are finished with the scan plan, clean up with cudppDestroyPlan().
 * @param[in] planHandle Handle to plan for this scan
 * @param[out] d_out output of segmented scan, in GPU memory
 * @param[in] d_idata input data to segmented scan, in GPU memory
 * @param[in] d_iflags input flags to segmented scan, in GPU memory
 * @param[in] numElements number of elements to perform segmented scan on
 *
 * @see cudppPlan, cudppDestroyPlan

CUDPP_DLL
CUDPPResult cudppSegmentedScan(CUDPPHandle        planHandle,
                               void               *d_out,
                               const void         *d_idata,
                               const unsigned int *d_iflags,
                               size_t             numElements)
{
    CUDPPSegmentedScanPlan *plan =
        (CUDPPSegmentedScanPlan*)CUDPPPlanManager::GetPlan(planHandle);
    if (plan != nullptr)
    {
        cudppSegmentedScanDispatch(d_out, d_idata, d_iflags, numElements, plan);
        return CUDPP_SUCCESS;
    }
    else
    {
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
    }
}
*/
/**
 * @brief Performs numRows parallel scan operations of numElements
 * each on its input (d_in) and places the output in d_out,
 * with the scan parameters set by config. Exactly like cudppScan
 * except that it runs on multiple rows in parallel.
 *
 * Note that to achieve good performance with cudppMultiScan one should
 * allocate the device arrays passed to it so that all rows are aligned
 * to the correct boundaries for the architecture the app is running on.
 * The easiest way to do this is to use cudaMallocPitch() to allocate a
 * 2D array on the device, and to pass the pitch it returns to cudppPlan()
 * via the \a rowPitch parameter.
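 *
 * A minimal sketch (assuming float data; note that cudaMallocPitch()
 * reports its pitch in bytes, while \a rowPitch is expected in elements,
 * hence the division below):
 * \code
 * // config: a CUDPPConfiguration set up as for cudppScan()
 * float *d_data;
 * size_t pitchInBytes;
 * cudaMallocPitch((void**)&d_data, &pitchInBytes,
 *                 numElements * sizeof(float), numRows);
 *
 * CUDPPHandle multiScanPlan;
 * cudppPlan(&multiScanPlan, config, numElements, numRows,
 *           pitchInBytes / sizeof(float));
 * cudppMultiScan(multiScanPlan, d_out, d_data, numElements, numRows);
 * \endcode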
 *
 * @param[in] planHandle handle to CUDPPScanPlan
 * @param[out] d_out output of scan, in GPU memory
 * @param[in] d_in input to scan, in GPU memory
 * @param[in] numElements number of elements (per row) to scan
 * @param[in] numRows number of rows to scan in parallel
 *
 * @see cudppScan, cudppPlan

CUDPP_DLL
CUDPPResult cudppMultiScan(CUDPPHandle planHandle,
                            void       *d_out,
                            const void *d_in,
                            size_t     numElements,
                            size_t     numRows)
{
    CUDPPScanPlan *plan = (CUDPPScanPlan*)CUDPPPlanManager::GetPlan(planHandle);
    if (plan != nullptr)
    {
        cudppScanDispatch(d_out, d_in, numElements, numRows, plan);
        return CUDPP_SUCCESS;
    }
    else
    {
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
    }
}
*/

/**
 * @brief Given an array \a d_in and an array of 1/0 flags in \a
 * d_isValid, returns a compacted array in \a d_out consisting of only
 * the "valid" values from \a d_in.
 *
 * Takes as input an array of elements in GPU memory
 * (\a d_in) and an equal-sized unsigned int array in GPU memory
 * (\a d_isValid) that indicates which of those input elements are
 * valid. The output is a packed array, in GPU memory, of only those
 * elements marked as valid.
 *
 * Internally, uses cudppScan.
 *
 * Example:
 * \code
 * d_in      = [ a b c d e f ]
 * d_isValid = [ 1 0 1 1 0 1 ]
 * d_out     = [ a c d f ]
 * \endcode
 *
 * @todo [MJH] We need to evaluate whether cudppCompact should be a core member
 * of the public interface.  It's not clear to me that what the user always
 * wants is a final compacted array.  Often one just wants the array of indices
 * to which each input element should go in the output. The split() routine used
 * in radix sort might make more sense to expose.
 *
 * @param[in] planHandle handle to CUDPPCompactPlan
 * @param[out] d_out compacted output
 * @param[out] d_numValidElements set during cudppCompact to the number of
 * elements whose flag is set in the d_isValid input array
 * @param[in] d_in input to compact
 * @param[in] d_isValid which elements in d_in are valid
 * @param[in] numElements number of elements in d_in

CUDPP_DLL
CUDPPResult cudppCompact(CUDPPHandle        planHandle,
                         void               *d_out,
                         size_t             *d_numValidElements,
                         const void         *d_in,
                         const unsigned int *d_isValid,
                         size_t             numElements)
{
    CUDPPCompactPlan *plan = (CUDPPCompactPlan*)CUDPPPlanManager::GetPlan(planHandle);
    if (plan != nullptr)
    {
        cudppCompactDispatch(d_out, d_numValidElements, d_in, d_isValid,
            numElements, plan);
        return CUDPP_SUCCESS;
    }
    else
    {
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors.
    }
}
*/
/**
 * @brief Sorts key-value pairs or keys only
 *
 * Takes as input an array of keys in GPU memory
 * (d_keys) and an optional array of corresponding values,
 * and outputs sorted arrays of keys and (optionally) values in place.
 * Key-value and key-only sort is selected through the configuration of
 * the plan, using the options CUDPP_OPTION_KEYS_ONLY and
 * CUDPP_OPTION_KEY_VALUE_PAIRS.
 *
 * Supported key types are CUDPP_FLOAT and CUDPP_UINT.  Values can be
 * any 32-bit type (internally, values are treated only as a payload
 * and cast to unsigned int).
 *
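 * Example (a minimal sketch of a key-value radix sort over all 32 key
 * bits; error checking omitted):
 * \code
 * // d_keys/d_values: device arrays of numElements 32-bit elements,
 * // allocated elsewhere
 * CUDPPConfiguration config;
 * config.algorithm = CUDPP_SORT_RADIX;
 * config.datatype  = CUDPP_UINT;
 * config.options   = CUDPP_OPTION_KEY_VALUE_PAIRS;
 *
 * CUDPPHandle sortPlan;
 * cudppPlan(&sortPlan, config, numElements, 1, 0);
 * cudppSort(sortPlan, d_keys, d_values, 32, numElements);
 * cudppDestroyPlan(sortPlan);
 * \endcode
 *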
 * @todo Determine if we need to provide an "out of place" sort interface.
 *
 * @param[in] planHandle handle to CUDPPSortPlan
 * @param[out] d_keys keys by which key-value pairs will be sorted
 * @param[in] d_values values to be sorted
 * @param[in] keyBits the number of least significant bits in each element
 *            of d_keys to sort by
 * @param[in] numElements number of elements in d_keys and d_values
 *
 * @see cudppPlan, CUDPPConfiguration, CUDPPAlgorithm
 */
CUDPP_DLL
CUDPPResult cudppSort(CUDPPHandle planHandle,
                      void        *d_keys,
                      void        *d_values,
                      int         keyBits,
                      size_t      numElements)
{
    CUDPPRadixSortPlan *plan = (CUDPPRadixSortPlan*)CUDPPPlanManager::GetPlan(planHandle);
    if (plan != nullptr)
    {
        cudppRadixSortDispatch(d_keys, d_values, numElements, keyBits, plan);
        return CUDPP_SUCCESS;
    }
    else
    {
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors.
    }
}

/** @brief Perform matrix-vector multiply y = A*x for arbitrary sparse matrix A and vector x
  *
  * Given a matrix object handle (which has been initialized using cudppSparseMatrix()),
  * this function multiplies the input vector \a d_x by the matrix referred to by
  * \a sparseMatrixHandle, returning the result in \a d_y.
  *
  * @param sparseMatrixHandle Handle to a sparse matrix object created with cudppSparseMatrix()
  * @param d_y The output vector, y
  * @param d_x The input vector, x
  *
  * @see cudppSparseMatrix, cudppDestroySparseMatrix

CUDPP_DLL
CUDPPResult cudppSparseMatrixVectorMultiply(CUDPPHandle        sparseMatrixHandle,
                                            void               *d_y,
                                            const void         *d_x)
{
    CUDPPSparseMatrixVectorMultiplyPlan *plan =
        (CUDPPSparseMatrixVectorMultiplyPlan*)CUDPPPlanManager::GetPlan(sparseMatrixHandle);

    if (plan != nullptr)
    {
        cudppSparseMatrixVectorMultiplyDispatch(d_y, d_x, plan);
        return CUDPP_SUCCESS;
    }
    else
    {
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors.
    }
}
*/
/**
 * @brief Rand puts \a numElements random 32-bit elements into \a d_out
 *
 * Outputs \a numElements random values to \a d_out. \a d_out must be of
 * type unsigned int, allocated in device memory.
 *
 * The algorithm used for the random number generation is stored in \a planHandle.
 * Depending on the specification of the pseudo-random number generator (PRNG),
 * the generator may have one or more seeds.  To set the seed, use cudppRandSeed().
 *
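 * Example (a minimal sketch using the MD5 PRNG; the seed value is
 * arbitrary and error checking is omitted):
 * \code
 * // d_out: device array of numElements unsigned ints, allocated elsewhere
 * CUDPPConfiguration config;
 * config.algorithm = CUDPP_RAND_MD5;
 * config.datatype  = CUDPP_UINT;
 *
 * CUDPPHandle randPlan;
 * cudppPlan(&randPlan, config, numElements, 1, 0);
 * cudppRandSeed(randPlan, 12345);
 * cudppRand(randPlan, d_out, numElements);
 * cudppDestroyPlan(randPlan);
 * \endcode
 *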
 * @todo Currently only MD5 PRNG is supported.  We may provide more rand routines in
 * the future.
 *
 * @param[in] planHandle Handle to plan for rand
 * @param[in] numElements number of elements in d_out.
 * @param[out] d_out output of rand, in GPU memory.  Should be an array of unsigned integers.
 *
 * @see cudppPlan, CUDPPConfiguration, CUDPPAlgorithm

CUDPP_DLL
CUDPPResult cudppRand(CUDPPHandle planHandle, void *d_out, size_t numElements)
{
    CUDPPRandPlan *plan = (CUDPPRandPlan*)CUDPPPlanManager::GetPlan(planHandle);
    if (plan != nullptr)
    {
        //dispatch the rand algorithm here
        cudppRandDispatch(d_out, numElements, plan);
        return CUDPP_SUCCESS;
    }
    else
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
}
*/

/** @brief Sets the seed used for rand
 *
 * The seed is crucial to any random number generator as it allows a
 * sequence of random numbers to be replicated.  Since there may be
 * multiple different rand algorithms in CUDPP, cudppRandSeed
 * uses \a planHandle to determine which seed to set.  Each rand
 * algorithm has its own unique set of seeds depending on what
 * the algorithm needs.
 *
 * @param[in] planHandle the handle to the plan which specifies which rand seed to set
 * @param[in] seed the value which the internal cudpp seed will be set to

CUDPP_DLL
CUDPPResult cudppRandSeed(const CUDPPHandle planHandle, unsigned int seed)
{
    CUDPPRandPlan *plan = (CUDPPRandPlan*)CUDPPPlanManager::GetPlan(planHandle);
    if (plan == nullptr)
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors

    // switch on the plan to figure out which seed to update
    switch (plan->m_config.algorithm)
    {
    case CUDPP_RAND_MD5:
        plan->m_seed = seed;
        break;
    default:
        break;
    }

    return CUDPP_SUCCESS;
}//end cudppRandSeed
*/
/** @} */ // end Algorithm Interface
/** @} */ // end of publicInterface group

// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End: