File: GPUQREngine_Scheduler.hpp

package info (click to toggle)
suitesparse 1%3A5.8.1%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 152,716 kB
  • sloc: ansic: 774,385; cpp: 24,213; makefile: 6,310; fortran: 1,927; java: 1,826; csh: 1,686; ruby: 725; sh: 535; perl: 225; python: 209; sed: 164; awk: 60
file content (168 lines) | stat: -rw-r--r-- 5,091 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
// =============================================================================
// === GPUQREngine/Include/GPUQREngine_Scheduler.hpp ===========================
// =============================================================================
//
// The Scheduler is a principal class in the GPUQREngine.
//
// This class manages the input set of Fronts, creates BucketLists when
// necessary for factorization, and contains all logic required to coordinate
// the factorization and assembly tasks with the GPU.
//
// =============================================================================

#ifndef GPUQRENGINE_SCHEDULER_HPP
#define GPUQRENGINE_SCHEDULER_HPP

#include "GPUQREngine_Common.hpp"
#include "GPUQREngine_FrontState.hpp"
#include "GPUQREngine_TaskDescriptor.hpp"
#include "GPUQREngine_BucketList.hpp"
#include "GPUQREngine_LLBundle.hpp"
#include "GPUQREngine_Front.hpp"

// Granularity constant for grouping apply tasks.
// NOTE(review): presumably the default for Scheduler::minApplyGranularity
// (declared below with a matching description) — confirm in Scheduler.cpp.
#define SSGPU_MINAPPLYGRANULARITY 16

// Compute the capacity of the scheduler's task queue from the available
// GPU memory.  Declared here; defined elsewhere in the GPUQREngine library.
size_t ssgpu_maxQueueSize       // return size of scheduler queue
(
    size_t gpuMemorySize        // size of GPU memory, in bytes
) ;

// The Scheduler owns the list of fronts being factorized, the per-front
// BucketLists, the double-buffered GPU work queues, and the CUDA streams
// and events used to overlap kernel execution with host<->device transfers.
class Scheduler
{
private:
    /* Scheduler.cpp */
    // Allocate internal workspaces; returns false on failure (see memory_ok).
    bool initialize(size_t gpuMemorySize);

    /* Scheduler_Front.cpp */
    // Begin pulling front f's result data off the GPU; returns success.
    bool pullFrontData(Int f);

    /* Scheduler_FillWorkQueue.cpp */
    // Append front f's pending tasks to the CPU-side task queue.
    void fillTasks
    (
        Int f,                          // INPUT: Current front
        TaskDescriptor *queue,          // INPUT: CPU Task entries
        Int *queueIndex                 // IN/OUT: The index of the current entry
    );

public:
    bool memory_ok;                     // Flag for the creating function to
                                        // determine whether we had enough
                                        // memory to initialize the Scheduler.
    bool cuda_ok;                       // Flag for the creating function to
                                        // determine whether we could
                                        // successfully invoke the cuda
                                        // initialization calls.

    Front *frontList;                   // The input set of fronts (not owned).
    Int numFronts;                      // Total number of fronts in frontList.
    Int numFrontsCompleted;             // Count of fronts finished so far.

    int activeSet;                      // Index (0 or 1) of the work queue /
                                        // kernel stream currently in use.

    BucketList *bucketLists;            // One BucketList per front.

    Int *afPerm;                        // Permutation of "active" fronts
    Int *afPinv;                        // Inverse permutation of "active" fronts
    Int numActiveFronts;

    Int maxQueueSize;                   // Capacity of each work queue.
    Workspace *workQueues[2];           // Double-buffered GPU task queues.
    Int numTasks[2];                    // Number of tasks in each queue.
    Int minApplyGranularity;            // The minimum number of tiles for which
                                        // we will group apply tasks

    bool *FrontDataPulled;              // A set of flags indicating whether R has
                                        // been pulled off the GPU.
    cudaEvent_t *eventFrontDataReady;   // A list of cudaEvents that are used to
                                        // coordinate when the R factor is ready
                                        // to be pulled from the GPU.
    cudaEvent_t *eventFrontDataPulled;  // A list of cudaEvents that are used to
                                        // coordinate when the R factor is finally
                                        // finished transferring off the GPU.

    // Use multiple CUDA streams to coordinate kernel launches and asynchronous
    // memory transfers between the host and the device:
    //   kernelStreams : Launch kernels on alternating streams
    //   H2D           : Asynchronous memory transfer stream (Host-to-Device)
    //   D2H           : Asynchronous memory transfer stream (Device-to-Host)
    cudaStream_t kernelStreams[2];
    cudaStream_t memoryStreamH2D;
    cudaStream_t memoryStreamD2H;

    /* Scheduler.cpp */
    // Placement new: construct a Scheduler in caller-provided storage.
    // The size parameter is std::size_t, as required for a valid
    // placement-allocation signature on all platforms (the previous
    // "long unsigned int" spelling breaks on LLP64 targets).
    void *operator new(size_t, Scheduler* p){ return p; }
    Scheduler(Front *fronts, Int numFronts, size_t gpuMemorySize);
    ~Scheduler();

    /* Scheduler_Front.cpp */
    void activateFront
    (
        Int f                   // The index of the front to operate on
    );

    bool finishFront
    (
        Int f                   // The index of the front to operate on
    );

    // Initialize front f's BucketList, but only if it is flagged for use.
    void initializeBucketList
    (
        Int f                   // The index of the front to operate on
    )
    {
        // NOTE: tested by SPQR/Tcov, but not flagged as such in cov results
        BucketList *dlbl = (&bucketLists[f]);
        if(dlbl->useFlag) dlbl->Initialize();
    }

    /* Scheduler_TransferData.cpp */
    void transferData
    (
        void
    );

    /* Scheduler_FillWorkQueue.cpp */
    void fillWorkQueue
    (
        void
    );

    /* Scheduler_LaunchKernel.cpp */
    void launchKernel
    (
        void
    );

    /* Scheduler_PostProcess.cpp */
    bool postProcess
    (
        void
    );

    // Flip between the two work queues / kernel streams (0 <-> 1).
    void toggleQueue
    (
        void
    )
    {
        activeSet ^= 1;
    }

    /* Stats */
    float kernelTime;           // Accumulated kernel execution time.
    Int numKernelLaunches;      // Number of kernel launches performed.
    Int gpuFlops;               // Flop count attributed to the GPU.

#ifdef GPUQRENGINE_RENDER
    /* Debug stuff */
    const char *TaskNames[21];
    const char *StateNames[9];
    int renderCount;
    void render();
#endif

#if 1
    void debugDumpFront(Front *front);
#endif
};

#endif