File: cudpp_maximal_launch.cpp

package info (click to toggle)
lammps 0~20161109.git9806da6-7
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 248,460 kB
  • ctags: 80,635
  • sloc: cpp: 701,185; python: 33,420; fortran: 26,434; ansic: 11,340; sh: 6,108; perl: 4,104; makefile: 2,891; xml: 2,590; f90: 1,690; objc: 238; lisp: 169; tcl: 61; csh: 16; awk: 14
file content (94 lines) | stat: -rw-r--r-- 3,501 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// ------------------------------------------------------------- 
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// ------------------------------------------------------------- 
#include "cudpp_maximal_launch.h"
 
inline size_t min(size_t x, size_t y)
{
    return (x <= y) ? x : y;
}

inline size_t max(size_t x, size_t y)
{
    return (x >= y) ? x : y;
}

// computes next highest multiple of f from x
inline size_t multiple(size_t x, size_t f)
{
    return ((x + (f-1)) / f);
}


// MS Excel-style CEIL() function
// Rounds x up to nearest multiple of f
inline size_t ceiling(size_t x, size_t f)
{
    return multiple(x, f) * f;
}

extern "C"
size_t maxBlocks(cudaFuncAttributes &attribs, 
                 cudaDeviceProp &devprop, 
                 size_t bytesDynamicSharedMem,
                 size_t threadsPerBlock)
{
    
    // Determine the maximum number of CTAs that can be run simultaneously for each kernel
    // This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet
    const unsigned int regAllocationUnit = (devprop.major < 2 && devprop.minor < 2) ? 256 : 512; // in registers
    const unsigned int warpAllocationMultiple = 2;
    const unsigned int smemAllocationUnit = 512;                                                 // in bytes
    const unsigned int maxThreadsPerSM = (devprop.major < 2 && devprop.minor < 2) ? 768 : 1024;  // sm_12 GPUs increase threads/SM to 1024
    const unsigned int maxBlocksPerSM = 8;

    // Number of warps (round up to nearest whole multiple of warp size)
    size_t numWarps = multiple(threadsPerBlock, devprop.warpSize);
    // Round up to warp allocation multiple
    numWarps = ceiling(numWarps, warpAllocationMultiple);

    // Number of regs is regs per thread times number of warps times warp size
    size_t regsPerCTA = attribs.numRegs * devprop.warpSize * numWarps;
    // Round up to multiple of register allocation unit size
    regsPerCTA = ceiling(regsPerCTA, regAllocationUnit);
    
    size_t smemBytes  = attribs.sharedSizeBytes + bytesDynamicSharedMem;
    size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit);

    size_t ctaLimitRegs    = regsPerCTA > 0 ? devprop.regsPerBlock      / regsPerCTA : maxBlocksPerSM;
    size_t ctaLimitSMem    = smemPerCTA > 0 ? devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;
    size_t ctaLimitThreads =                  maxThreadsPerSM           / threadsPerBlock;

    return devprop.multiProcessorCount * min(ctaLimitRegs, min(ctaLimitSMem, min(ctaLimitThreads, maxBlocksPerSM)));
}

extern "C"
size_t maxBlocksFromPointer(void*  kernel, 
                            size_t bytesDynamicSharedMem,
                            size_t threadsPerBlock)
{
    cudaDeviceProp devprop;
    int deviceID = -1;
    cudaError_t err = cudaGetDevice(&deviceID);
    if (err == cudaSuccess)
    {
        err = cudaGetDeviceProperties(&devprop, deviceID);
        if (err != cudaSuccess)
            return -1;

        cudaFuncAttributes attr;
        err = cudaFuncGetAttributes(&attr, (const char*)kernel);
        if (err != cudaSuccess)
            return -1;

        return maxBlocks(attr, devprop, bytesDynamicSharedMem, threadsPerBlock);
    }

    return -1;    
}