1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158
|
// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 3572$
// $Date: 2010-11-23 13:04:43 -0700 (Tue, 23 Nov 2010) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#ifndef __CUDPP_PLAN_H__
#define __CUDPP_PLAN_H__
/** @brief Untyped handle through which a kernel is handed to the helpers below */
typedef void* KernelPointer;

// Both helpers are declared with C linkage; only their declarations appear in
// this header, the definitions are provided elsewhere.
extern "C"
{
    /** @brief Query a size_t value for @a kernel
     *
     * NOTE(review): judging by the name this is the number of CTAs to launch
     * for the kernel, presumably the value produced by compNumCTAs() --
     * confirm against the definition.
     *
     * @param kernel Handle of the kernel being queried
     */
    size_t getNumCTAs(KernelPointer kernel);

    /** @brief Run the CTA-count computation for @a kernel
     *
     * NOTE(review): the result is not returned here; presumably it is stored
     * for a later getNumCTAs() call -- confirm against the definition.
     *
     * @param kernel                Handle of the kernel
     * @param bytesDynamicSharedMem Dynamic shared memory size in bytes
     * @param threadsPerBlock       Number of threads per block
     */
    void compNumCTAs(KernelPointer kernel,
                     size_t bytesDynamicSharedMem,
                     size_t threadsPerBlock);
}
template <typename T>
size_t numCTAs(T kernel)
{
return getNumCTAs((KernelPointer)kernel);
}
template <typename T>
void computeNumCTAs(T kernel, unsigned int bytesDynamicSharedMem, size_t threadsPerBlock)
{
compNumCTAs((KernelPointer)kernel, bytesDynamicSharedMem, threadsPerBlock);
}
/** @brief Common base class of all CUDPP plan data structures
 *
 * Plans are internal to the library (the library user never sees them) and
 * form the infrastructure used to plan the execution of an algorithm.
 * CUDPPPlan and the classes derived from it own the intermediate storage
 * needed by CUDPP algorithms and, in some cases, also hold information about
 * the optimal execution configuration for the present hardware.
 *
 */
class CUDPPPlan
{
public:
    CUDPPPlan(CUDPPConfiguration config,
              size_t numElements,
              size_t numRows,
              size_t rowPitch);

    //! Virtual so that derived plans can be destroyed through a CUDPPPlan pointer
    virtual ~CUDPPPlan() {}

    // The data members stay public on purpose: anything that is passed to
    // functions compiled by NVCC must be public.

    //! @internal Options structure
    CUDPPConfiguration m_config;

    //! @internal Maximum number of input elements
    size_t m_numElements;

    //! @internal Maximum number of input rows
    size_t m_numRows;

    //! @internal Pitch of input rows in elements
    size_t m_rowPitch;
};
/** @brief Plan for the scan algorithm
 *
 * Extends CUDPPPlan with the bookkeeping for the intermediate block sums and
 * for the per-row pitches used by cudppMultiScan().
 */
class CUDPPScanPlan : public CUDPPPlan
{
public:
    CUDPPScanPlan(CUDPPConfiguration config,
                  size_t numElements,
                  size_t numRows,
                  size_t rowPitch);

    virtual ~CUDPPScanPlan();

    //! @internal Intermediate block sums array
    void** m_blockSums;

    //! @internal Pitch of each row in elements (for cudppMultiScan())
    size_t* m_rowPitches;

    //! @internal Number of elements allocated (maximum scan size)
    size_t m_numEltsAllocated;

    //! @internal Number of rows allocated (for cudppMultiScan())
    size_t m_numRowsAllocated;

    //! @internal Number of levels allocated (in _scanBlockSums)
    size_t m_numLevelsAllocated;
};
/** @brief Plan for the segmented scan algorithm
 *
 * Extends CUDPPPlan with the intermediate per-block sums, flags and indices
 * arrays. Unlike CUDPPScanPlan, the constructor takes only a configuration
 * and an element count.
 */
class CUDPPSegmentedScanPlan : public CUDPPPlan
{
public:
    CUDPPSegmentedScanPlan(CUDPPConfiguration config, size_t numElements);

    virtual ~CUDPPSegmentedScanPlan();

    //! @internal Intermediate block sums array
    void** m_blockSums;

    //! @internal Intermediate block flags array
    unsigned int** m_blockFlags;

    //! @internal Intermediate block indices array
    unsigned int** m_blockIndices;

    //! @internal Number of elements allocated (maximum scan size)
    size_t m_numEltsAllocated;

    //! @internal Number of levels allocated (in _scanBlockSums)
    size_t m_numLevelsAllocated;
};
/** @brief Plan for the compact algorithm
 *
 * Compact is built on top of a scan of type unsigned int, so this plan holds
 * a pointer to a CUDPPScanPlan plus the buffer that receives the scan result.
 */
class CUDPPCompactPlan : public CUDPPPlan
{
public:
    CUDPPCompactPlan(CUDPPConfiguration config,
                     size_t numElements,
                     size_t numRows,
                     size_t rowPitch);

    virtual ~CUDPPCompactPlan();

    //! @internal Compact performs a scan of type unsigned int using this plan
    CUDPPScanPlan* m_scanPlan;

    //! @internal Output address of compacted elements; this is the result of scan
    unsigned int* m_d_outputIndices;
};
/** @brief Plan class for radix sort algorithm
 *
 * Sort performs a scan of type unsigned int, so this plan holds a pointer to
 * a CUDPPScanPlan in addition to its own temporary key/value storage and
 * radix counters.
 */
class CUDPPRadixSortPlan : public CUDPPPlan
{
public:
    CUDPPRadixSortPlan(CUDPPConfiguration config, size_t numElements);

    virtual ~CUDPPRadixSortPlan();

    // NOTE(review): the flags and thresholds below carry no documentation in
    // this header; the remarks are inferred from the names only -- confirm
    // against the implementation.

    //! @internal Presumably set when only keys (no values) are sorted
    bool m_bKeysOnly;

    //! @internal Presumably selects a manually coalesced code path
    bool m_bManualCoalesce;

    //! @internal Presumably enables the persistent-CTA code path
    bool m_bUsePersistentCTAs;

    //! @internal Two-entry threshold table (meaning of each slot not visible here)
    unsigned int m_persistentCTAThreshold[2];

    //! @internal Two-entry threshold table for full blocks (meaning of each slot not visible here)
    unsigned int m_persistentCTAThresholdFullBlocks[2];

    //! @internal Sort performs a scan of type unsigned int using this plan
    CUDPPScanPlan* m_scanPlan;

    //! @internal Presumably the number of key bits to sort on -- confirm
    unsigned int m_keyBits;

    //! @internal Intermediate storage for keys (mutable: may be changed through a const plan)
    mutable void* m_tempKeys;

    //! @internal Intermediate storage for values (mutable: may be changed through a const plan)
    mutable void* m_tempValues;

    //! @internal Counter for each radix
    unsigned int* m_counters;

    //! @internal Prefix sum of radix counters
    unsigned int* m_countersSum;

    //! @internal Global offsets of each radix in each block
    unsigned int* m_blockOffsets;
};
/** @brief Plan for sparse-matrix dense-vector multiply
 *
 * The multiply is built on a segmented scan, so this plan holds a pointer to
 * a CUDPPSegmentedScanPlan together with the matrix data, the row/column
 * index vectors and the buffers the segmented scan operates on.
 */
class CUDPPSparseMatrixVectorMultiplyPlan : public CUDPPPlan
{
public:
    CUDPPSparseMatrixVectorMultiplyPlan(CUDPPConfiguration config,
                                        size_t numNZElts,
                                        const void* A,
                                        const unsigned int* rowindx,
                                        const unsigned int* indx,
                                        size_t numRows);

    virtual ~CUDPPSparseMatrixVectorMultiplyPlan();

    //! @internal Performs a segmented scan of type T using this plan
    CUDPPSegmentedScanPlan* m_segmentedScanPlan;

    //! @internal Vector of products (of an element in A and its corresponding
    //! element in x, that is, the one belonging to the same row); this is
    //! the input and output of segmented scan
    void* m_d_prod;

    //! @internal Vector of flags where a flag is set if an element of A is
    //! the first element of its row; this is the flags vector for segmented
    //! scan
    unsigned int* m_d_flags;

    //! @internal Vector of row end indices, which for each row specifies an
    //! index in A which is the last element of that row. Resides in GPU
    //! memory.
    unsigned int* m_d_rowFinalIndex;

    //! @internal Vector of row start indices, which for each row specifies an
    //! index in A which is the first element of that row. Resides in GPU
    //! memory.
    unsigned int* m_d_rowIndex;

    //! @internal Vector of column numbers, one for each element in A
    unsigned int* m_d_index;

    //! @internal The A matrix
    void* m_d_A;

    //! @internal Vector of row end indices, which for each row specifies an
    //! index in A which is the last element of that row. Resides in CPU
    //! memory.
    unsigned int* m_rowFinalIndex;

    //! Number of rows.
    //! NOTE(review): this declaration hides the inherited
    //! CUDPPPlan::m_numRows; code that reaches the plan through a CUDPPPlan
    //! reference sees the base member instead -- confirm this is intended.
    size_t m_numRows;

    //! Number of non-zero elements
    size_t m_numNonZeroElements;
};
/** @brief Plan for the random number generator
 *
 * Adds only the generator seed to CUDPPPlan. No destructor is declared here,
 * so the virtual destructor inherited from CUDPPPlan applies.
 */
class CUDPPRandPlan : public CUDPPPlan
{
public:
    CUDPPRandPlan(CUDPPConfiguration config, size_t num_elements);

    //! @internal the seed for the random number generator
    unsigned int m_seed;
};
#endif // __CUDPP_PLAN_H__
|