// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5632 $
// $Date: 2009-07-01 14:36:01 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* cudpp.cpp
*
* @brief Main library source file. Implements wrappers for public
* interface.
*
* Main library source file. Implements wrappers for public
* interface. These wrappers call application-level operators.
* As this grows we may decide to partition into multiple source
* files.
*/
/**
* \defgroup publicInterface CUDPP Public Interface
 * The CUDPP public interface comprises the functions, structs, and enums
* defined in cudpp.h. Public interface functions call functions in the
* \link cudpp_app Application-Level\endlink interface. The public
* interface functions include Plan Interface functions and Algorithm
* Interface functions. Plan Interface functions are used for creating
* CUDPP Plan objects which contain configuration details, intermediate
* storage space, and in the case of cudppSparseMatrix(), data. The
* Algorithm Interface is the set of functions that do the real work
 * of CUDPP, such as cudppScan() and cudppSparseMatrixVectorMultiply().
*
* @{
*/
/** @name Algorithm Interface
* @{
*/
#include "cudpp.h"
#include "cudpp_plan_manager.h"
#include "cudpp_scan.h"
//#include "cudpp_segscan.h"
//#include "cudpp_compact.h"
//#include "cudpp_spmvmult.h"
#include "cudpp_radixsort.h"
//#include "cudpp_rand.h"
/**
* @brief Performs a scan operation of numElements on its input in
* GPU memory (d_in) and places the output in GPU memory
* (d_out), with the scan parameters specified in the plan pointed to by
* planHandle.
* The input to a scan operation is an input array, a binary associative
* operator (like + or max), and an identity element for that operator
* (+'s identity is 0). The output of scan is the same size as its input.
* Informally, the output at each element is the result of operator
* applied to each input that comes before it. For instance, the
* output of sum-scan at each element is the sum of all the input
* elements before that input.
*
* More formally, for associative operator
* @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly,
* <var>out<sub>i</sub></var> = <var>in<sub>0</sub></var>
* @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly
* <var>in<sub>1</sub></var>
* @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly ...
* @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly
* <var>in<sub>i-1</sub></var>.
*
* CUDPP supports "exclusive" and "inclusive" scans. For the ADD operator,
* an exclusive scan computes the sum of all input elements before the
* current element, while an inclusive scan computes the sum of all input
* elements up to and including the current element.
*
* Before calling scan, create an internal plan using cudppPlan().
*
* After you are finished with the scan plan, clean up with cudppDestroyPlan().
*
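 * For illustration, a minimal usage sketch of a forward, exclusive float
 * sum-scan (assuming the cudppPlan() signature and CUDPPConfiguration
 * fields declared in cudpp.h; error checking omitted for brevity):
 * \code
 * CUDPPConfiguration config;
 * config.algorithm = CUDPP_SCAN;
 * config.op        = CUDPP_ADD;
 * config.datatype  = CUDPP_FLOAT;
 * config.options   = CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE;
 *
 * CUDPPHandle scanPlan;
 * cudppPlan(&scanPlan, config, numElements, 1, 0); // one row, no pitch
 * cudppScan(scanPlan, d_out, d_in, numElements);   // d_out[i] = d_in[0] + ... + d_in[i-1]
 * cudppDestroyPlan(scanPlan);
 * \endcode
 *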
* @param[in] planHandle Handle to plan for this scan
* @param[out] d_out output of scan, in GPU memory
* @param[in] d_in input to scan, in GPU memory
* @param[in] numElements number of elements to scan
*
* @see cudppPlan, cudppDestroyPlan
*/
CUDPP_DLL
CUDPPResult cudppScan(CUDPPHandle planHandle,
void *d_out,
const void *d_in,
size_t numElements)
{
CUDPPScanPlan *plan = (CUDPPScanPlan*)CUDPPPlanManager::GetPlan(planHandle);
if (plan != nullptr)
{
cudppScanDispatch(d_out, d_in, numElements, 1, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
}
}
/**
* @brief Performs a segmented scan operation of numElements on its input in
* GPU memory (d_idata) and places the output in GPU memory
* (d_out), with the scan parameters specified in the plan pointed to by
* planHandle.
* The input to a segmented scan operation is an input array of data,
* an input array of flags which demarcate segments, a binary associative
* operator (like + or max), and an identity element for that operator
* (+'s identity is 0). The array of flags is the same length as the input
 * with 1 marking the first element of a segment and 0 otherwise. The
* output of segmented scan is the same size as its input. Informally, the
* output at each element is the result of operator applied to each input
* that comes before it in that segment. For instance, the output of
* segmented sum-scan at each element is the sum of all the input elements
* before that input in that segment.
*
* More formally, for associative operator
* @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly,
* <var>out<sub>i</sub></var> = <var>in<sub>k</sub></var>
* @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly
* <var>in<sub>k+1</sub></var>
* @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly ...
* @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly
* <var>in<sub>i-1</sub></var>.
 * Here <i>k</i> is the index of the first element of the segment in which <i>i</i> lies.
*
* We support both "exclusive" and "inclusive" variants. For a segmented sum-scan,
* the exclusive variant computes the sum of all input elements before the
* current element in that segment, while the inclusive variant computes the
* sum of all input elements up to and including the current element, in
* that segment.
*
* Before calling segmented scan, create an internal plan using cudppPlan().
*
* After you are finished with the scan plan, clean up with cudppDestroyPlan().
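 *
 * A small worked example of an exclusive segmented sum-scan, where the
 * flags mark the start of each segment:
 * \code
 * d_idata  = [ 1 2 3 4 5 6 ]
 * d_iflags = [ 1 0 0 1 0 0 ]
 * d_out    = [ 0 1 3 0 4 9 ]
 * \endcode
 *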
* @param[in] planHandle Handle to plan for this scan
* @param[out] d_out output of segmented scan, in GPU memory
* @param[in] d_idata input data to segmented scan, in GPU memory
* @param[in] d_iflags input flags to segmented scan, in GPU memory
* @param[in] numElements number of elements to perform segmented scan on
*
* @see cudppPlan, cudppDestroyPlan
CUDPP_DLL
CUDPPResult cudppSegmentedScan(CUDPPHandle planHandle,
void *d_out,
const void *d_idata,
const unsigned int *d_iflags,
size_t numElements)
{
CUDPPSegmentedScanPlan *plan =
(CUDPPSegmentedScanPlan*)CUDPPPlanManager::GetPlan(planHandle);
if (plan != nullptr)
{
cudppSegmentedScanDispatch(d_out, d_idata, d_iflags, numElements, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
}
}
*/
/**
 * @brief Performs numRows parallel scan operations of numElements
 * each on its input (d_in) and places the output in d_out,
 * with the scan parameters specified in the plan pointed to by
 * planHandle. Exactly like cudppScan() except that it runs on
 * multiple rows in parallel.
*
* Note that to achieve good performance with cudppMultiScan one should
* allocate the device arrays passed to it so that all rows are aligned
* to the correct boundaries for the architecture the app is running on.
 * The easy way to do this is to use cudaMallocPitch() to allocate a
 * 2D array on the device, and to pass the pitch it returns to
 * cudppPlan() via the \a rowPitch parameter.
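 *
 * For example, a sketch of pitched allocation feeding \a rowPitch
 * (assuming \a rowPitch is expressed in elements, as described for
 * cudppPlan()):
 * \code
 * float *d_in;
 * size_t pitchBytes;
 * cudaMallocPitch((void**)&d_in, &pitchBytes,
 *                 numElements * sizeof(float), numRows);
 * size_t rowPitch = pitchBytes / sizeof(float); // convert bytes to elements
 * // pass rowPitch to cudppPlan() when creating the multiscan plan
 * \endcode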
*
* @param[in] planHandle handle to CUDPPScanPlan
* @param[out] d_out output of scan, in GPU memory
* @param[in] d_in input to scan, in GPU memory
* @param[in] numElements number of elements (per row) to scan
* @param[in] numRows number of rows to scan in parallel
*
* @see cudppScan, cudppPlan
CUDPP_DLL
CUDPPResult cudppMultiScan(CUDPPHandle planHandle,
void *d_out,
const void *d_in,
size_t numElements,
size_t numRows)
{
CUDPPScanPlan *plan = (CUDPPScanPlan*)CUDPPPlanManager::GetPlan(planHandle);
if (plan != nullptr)
{
cudppScanDispatch(d_out, d_in, numElements, numRows, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
}
}
*/
/**
 * @brief Given an array \a d_in and an array of 1/0 flags in \a
 * deviceValid, returns a compacted array in \a d_out containing
 * only the "valid" values from \a d_in.
*
* Takes as input an array of elements in GPU memory
* (\a d_in) and an equal-sized unsigned int array in GPU memory
* (\a deviceValid) that indicate which of those input elements are
* valid. The output is a packed array, in GPU memory, of only those
* elements marked as valid.
*
* Internally, uses cudppScan.
*
* Example:
* \code
* d_in = [ a b c d e f ]
* deviceValid = [ 1 0 1 1 0 1 ]
* d_out = [ a c d f ]
* \endcode
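 *
 * A minimal call sequence might look as follows (a sketch assuming the
 * CUDPP_COMPACT algorithm enum and cudppPlan() signature from cudpp.h;
 * error checking omitted):
 * \code
 * CUDPPConfiguration config;
 * config.algorithm = CUDPP_COMPACT;
 * config.datatype  = CUDPP_FLOAT;
 * config.options   = CUDPP_OPTION_FORWARD;
 *
 * CUDPPHandle compactPlan;
 * cudppPlan(&compactPlan, config, numElements, 1, 0);
 * // d_numValidElements receives the count of valid elements
 * cudppCompact(compactPlan, d_out, d_numValidElements,
 *              d_in, d_isValid, numElements);
 * cudppDestroyPlan(compactPlan);
 * \endcode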
*
* @todo [MJH] We need to evaluate whether cudppCompact should be a core member
* of the public interface. It's not clear to me that what the user always
* wants is a final compacted array. Often one just wants the array of indices
* to which each input element should go in the output. The split() routine used
* in radix sort might make more sense to expose.
*
* @param[in] planHandle handle to CUDPPCompactPlan
* @param[out] d_out compacted output
 * @param[out] d_numValidElements set during cudppCompact; receives the
 * number of elements in \a d_in marked valid by \a d_isValid
* @param[in] d_in input to compact
* @param[in] d_isValid which elements in d_in are valid
* @param[in] numElements number of elements in d_in
CUDPP_DLL
CUDPPResult cudppCompact(CUDPPHandle planHandle,
void *d_out,
size_t *d_numValidElements,
const void *d_in,
const unsigned int *d_isValid,
size_t numElements)
{
CUDPPCompactPlan *plan = (CUDPPCompactPlan*)CUDPPPlanManager::GetPlan(planHandle);
if (plan != nullptr)
{
cudppCompactDispatch(d_out, d_numValidElements, d_in, d_isValid,
numElements, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors.
}
}
*/
/**
* @brief Sorts key-value pairs or keys only
*
* Takes as input an array of keys in GPU memory
* (d_keys) and an optional array of corresponding values,
* and outputs sorted arrays of keys and (optionally) values in place.
* Key-value and key-only sort is selected through the configuration of
* the plan, using the options CUDPP_OPTION_KEYS_ONLY and
* CUDPP_OPTION_KEY_VALUE_PAIRS.
*
* Supported key types are CUDPP_FLOAT and CUDPP_UINT. Values can be
* any 32-bit type (internally, values are treated only as a payload
* and cast to unsigned int).
*
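 * A key-value radix sort might be set up as follows (a sketch assuming
 * the CUDPP_SORT_RADIX algorithm enum from cudpp.h; error checking
 * omitted):
 * \code
 * CUDPPConfiguration config;
 * config.algorithm = CUDPP_SORT_RADIX;
 * config.datatype  = CUDPP_UINT;
 * config.options   = CUDPP_OPTION_KEY_VALUE_PAIRS;
 *
 * CUDPPHandle sortPlan;
 * cudppPlan(&sortPlan, config, numElements, 1, 0);
 * cudppSort(sortPlan, d_keys, d_values, 32, numElements); // all 32 key bits
 * cudppDestroyPlan(sortPlan);
 * \endcode
 *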
* @todo Determine if we need to provide an "out of place" sort interface.
*
* @param[in] planHandle handle to CUDPPSortPlan
* @param[out] d_keys keys by which key-value pairs will be sorted
* @param[in] d_values values to be sorted
* @param[in] keyBits the number of least significant bits in each element
* of d_keys to sort by
* @param[in] numElements number of elements in d_keys and d_values
*
* @see cudppPlan, CUDPPConfiguration, CUDPPAlgorithm
*/
CUDPP_DLL
CUDPPResult cudppSort(CUDPPHandle planHandle,
void *d_keys,
void *d_values,
int keyBits,
size_t numElements)
{
CUDPPRadixSortPlan *plan = (CUDPPRadixSortPlan*)CUDPPPlanManager::GetPlan(planHandle);
if (plan != nullptr)
{
cudppRadixSortDispatch(d_keys, d_values, numElements, keyBits, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors.
}
}
/** @brief Performs the matrix-vector multiply y = A*x for an arbitrary
 * sparse matrix A and vector x
 *
 * Given a matrix object handle (which has been initialized using
 * cudppSparseMatrix()), this function multiplies the input vector \a d_x
 * by the matrix referred to by \a sparseMatrixHandle, returning the
 * result in \a d_y.
*
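 * A sketch of the intended call sequence (the matrix handle comes from
 * cudppSparseMatrix(); see that function's documentation for setup):
 * \code
 * // sparseMatrixHandle created earlier via cudppSparseMatrix()
 * cudppSparseMatrixVectorMultiply(sparseMatrixHandle, d_y, d_x);
 * cudppDestroySparseMatrix(sparseMatrixHandle);
 * \endcode
 *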
* @param sparseMatrixHandle Handle to a sparse matrix object created with cudppSparseMatrix()
* @param d_y The output vector, y
* @param d_x The input vector, x
*
* @see cudppSparseMatrix, cudppDestroySparseMatrix
CUDPP_DLL
CUDPPResult cudppSparseMatrixVectorMultiply(CUDPPHandle sparseMatrixHandle,
void *d_y,
const void *d_x)
{
CUDPPSparseMatrixVectorMultiplyPlan *plan =
(CUDPPSparseMatrixVectorMultiplyPlan*)CUDPPPlanManager::GetPlan(sparseMatrixHandle);
if (plan != nullptr)
{
cudppSparseMatrixVectorMultiplyDispatch(d_y, d_x, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors.
}
}
*/
/**
* @brief Rand puts \a numElements random 32-bit elements into \a d_out
*
* Outputs \a numElements random values to \a d_out. \a d_out must be of
* type unsigned int, allocated in device memory.
*
* The algorithm used for the random number generation is stored in \a planHandle.
 * Depending on the specification of the pseudo-random number generator (PRNG),
* the generator may have one or more seeds. To set the seed, use cudppRandSeed().
*
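 * A minimal sketch of generating random values with the MD5 PRNG
 * (assuming the CUDPP_RAND_MD5 algorithm enum from cudpp.h; the seed
 * value here is arbitrary):
 * \code
 * CUDPPConfiguration config;
 * config.algorithm = CUDPP_RAND_MD5;
 * config.datatype  = CUDPP_UINT;
 *
 * CUDPPHandle randPlan;
 * cudppPlan(&randPlan, config, numElements, 1, 0);
 * cudppRandSeed(randPlan, 12345);          // arbitrary seed
 * cudppRand(randPlan, d_out, numElements); // fill d_out with random uints
 * cudppDestroyPlan(randPlan);
 * \endcode
 *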
* @todo Currently only MD5 PRNG is supported. We may provide more rand routines in
* the future.
*
 * @param[in] planHandle Handle to plan for rand
 * @param[out] d_out output of rand, in GPU memory. Should be an array of unsigned integers.
 * @param[in] numElements number of elements in d_out.
*
* @see cudppPlan, CUDPPConfiguration, CUDPPAlgorithm
CUDPP_DLL
CUDPPResult cudppRand(CUDPPHandle planHandle, void *d_out, size_t numElements)
{
    CUDPPRandPlan *plan = (CUDPPRandPlan*)CUDPPPlanManager::GetPlan(planHandle);
    if (plan != nullptr)
    {
        // Dispatch the rand algorithm.
        cudppRandDispatch(d_out, numElements, plan);
        return CUDPP_SUCCESS;
    }
    else
    {
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
    }
}
*/
/** @brief Sets the seed used for rand
*
* The seed is crucial to any random number generator as it allows a
* sequence of random numbers to be replicated. Since there may be
* multiple different rand algorithms in CUDPP, cudppRandSeed
* uses \a planHandle to determine which seed to set. Each rand
* algorithm has its own unique set of seeds depending on what
* the algorithm needs.
*
* @param[in] planHandle the handle to the plan which specifies which rand seed to set
* @param[in] seed the value which the internal cudpp seed will be set to
CUDPP_DLL
CUDPPResult cudppRandSeed(const CUDPPHandle planHandle, unsigned int seed)
{
    CUDPPRandPlan *plan = (CUDPPRandPlan*)CUDPPPlanManager::GetPlan(planHandle);
    if (plan == nullptr)
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors

    // Switch on the plan's algorithm to figure out which seed to update.
    switch (plan->m_config.algorithm)
    {
    case CUDPP_RAND_MD5:
        plan->m_seed = seed;
        break;
    default:
        break;
    }
    return CUDPP_SUCCESS;
} // end cudppRandSeed
*/
/** @} */ // end Algorithm Interface
/** @} */ // end of publicInterface group
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End: