File: cudpp_globals.h

package info (click to toggle)
lammps 20220106.git7586adbb6a%2Bds1-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 348,064 kB
  • sloc: cpp: 831,421; python: 24,896; xml: 14,949; f90: 10,845; ansic: 7,967; sh: 4,226; perl: 4,064; fortran: 2,424; makefile: 1,501; objc: 238; lisp: 163; csh: 16; awk: 14; tcl: 6
file content (66 lines) | stat: -rw-r--r-- 2,588 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5289 $
// $Date: 2010-11-23 13:04:43 -0700 (Tue, 23 Nov 2010) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------

/**
 * @file
 * cudpp_globals.h
 *
 * @brief Global declarations defining machine characteristics of GPU target
 * These are currently set for best performance on G8X GPUs.  The optimal
 * parameters may change on future GPUs. In the future, we hope to make
 * CUDPP a self-tuning library.
 */

#ifndef __CUDPP_GLOBALS_H__
#define __CUDPP_GLOBALS_H__

const int NUM_BANKS = 16;                        /**< Number of shared memory banks */
const int LOG_NUM_BANKS = 4;                     /**< log_2(NUM_BANKS) */
const int CTA_SIZE = 128;                        /**< Number of threads in a CTA */
const int WARP_SIZE = 32;                        /**< Number of threads in a warp */
const int LOG_CTA_SIZE = 7;                      /**< log_2(CTA_SIZE) */
const int LOG_WARP_SIZE = 5;                     /**< log_2(WARP_SIZE) */
const int LOG_SIZEOF_FLOAT = 2;                  /**< log_2(sizeof(float)) */
const int SCAN_ELTS_PER_THREAD = 8;              /**< Number of elements per scan thread */
const int SEGSCAN_ELTS_PER_THREAD = 8;     /**< Number of elements per segmented scan thread */

const int maxSharedMemoryPerBlock = 16384; /**< Number of bytes of shared
                                              memory in each block */
const int maxThreadsPerBlock = CTA_SIZE;   /**< Maximum number of
                                             * threads in a CTA */

/**
* @brief Macro to insert necessary __syncthreads() in device emulation mode
*/
#ifdef __DEVICE_EMULATION__
#define __EMUSYNC __syncthreads()
#else
#define __EMUSYNC
#endif


#define AVOID_BANK_CONFLICTS /**< Set if by default, we want our
                              * shared memory allocation to perform
                              * additional computation to avoid bank
                              * conflicts */

#ifdef AVOID_BANK_CONFLICTS
#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS)
#else
#define CONFLICT_FREE_OFFSET(index) (0)
#endif

#endif // __CUDPP_GLOBALS_H__

// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End: