File: spqrgpu_computeFrontStaging.cpp

// =============================================================================
// === spqrgpu_computeFrontStaging =============================================
// =============================================================================

// Returns a front staging and whether the staging is feasible.
// A front staging is infeasible if a front and its children do not fit on
// the GPU at the same time.

#ifdef GPU_BLAS
#include "spqr.hpp"
#include "GPUQREngine_Scheduler.hpp"

void spqrgpu_computeFrontStaging
(
    // inputs, not modified on output
    Long numFronts,     // total number of fronts (nf in caller)
    Long *Parent,       // size nf+1, assembly tree (f=nf is placeholder)
    Long *Childp,       // size nf+2, children of f are
                        //      Child [Childp [f] ... Childp [f+1]-1]
    Long *Child,        // size nf+1.

    Long *Fm,           // size nf+1, front f has Fm [f] rows
    Long *Cm,           // size nf+1, front f has Cm [f] rows in contrib
    Long *Rp,           // size nf+1, Rj[Rp[f]...Rp[f+1]-1] are the cols in f
    Long *Sp,           // size m+1, row pointers for sparse matrix S
    Long *Sleft,        // size n+2, see spqr_stranspose for description
    Long *Super,        // size nf+1, front f pivotal cols are
                        //      Super[f]..Super[f+1]-1
    Long *Post,         // size nf+1, front f is kth, if f = Post [k]

    Long RimapSize,     // scalar, size of Rimap on the GPU (# of int's)
    Long RjmapSize,     // scalar, size of Rjmap on the GPU (# of int's)

    // output, not defined on input:
    bool *feasible,     // scalar, true if feasible, false if GPU memory too low
    Long *numStages,    // scalar, number of stages
    Long *Stagingp,     // size nf+2, fronts are in the list
                        //      Post [Stagingp [stage]...Stagingp[stage+1]-1]
    Long *StageMap,     // size nf, front f is in stage StageMap [f]

    size_t *FSize,      // size nf+1, FSize[stage]: size in bytes of MongoF
    size_t *RSize,      // size nf+1, RSize[stage]: size in bytes of MongoR
    size_t *SSize,      // size nf+1, SSize[stage]: size in bytes of S
    Long *FOffsets,     // size nf, front f in MondoF [FOffsets[f]...] on GPU
    Long *ROffsets,     // size nf, R block in MondoR [ROffsets[f]...] on CPU
    Long *SOffsets,     // size nf, S entries for front f are in
                        //      wsS [SOffsets[f]...]

    // input/output:
    cholmod_common *cc
)
{

    // -------------------------------------------------------------------------
    // determine available GPU memory, required for all stages
    // -------------------------------------------------------------------------

    // gpuMemorySize = 0 is for testing only.  This value forces each front to
    // appear in its own stage.  gpuMemorySize = 1 is also for testing, to
    // check how this function handles a problem that is infeasible due to a
    // lack of GPU memory.  We must also guard the subtractions below so that
    // gpuMemorySize does not underflow (it is an unsigned size_t).

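    // cc->gpuMemorySize is the available GPU memory, in bytes.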
    size_t gpuMemorySize = cc->gpuMemorySize;

//  printf ("GPU mem starts %g MB\n", (double) gpuMemorySize / (1024*1024)) ;

    // account for two Scheduler work queues in the GPU memory
    if (gpuMemorySize > 1)
    {
        // The GPU must hold two workspace queues, each of size maxQueueSize,
        // and where each entry is of size sizeof(TaskDescriptor)
        size_t maxQueueSize = ssgpu_maxQueueSize (gpuMemorySize) ;
        size_t s = 2 * maxQueueSize * sizeof (TaskDescriptor) ;
        gpuMemorySize = (gpuMemorySize > s) ? (gpuMemorySize-s) : 0 ;
    }

    // account for Rimap in the GPU memory
    if (gpuMemorySize > 1)
    {
        size_t s = RimapSize * sizeof (int) ;
        gpuMemorySize = (gpuMemorySize > s) ? (gpuMemorySize-s) : 0 ;
    }

    // account for Rjmap in the GPU memory
    if (gpuMemorySize > 1)
    {
        size_t s = RjmapSize * sizeof (int) ;
        gpuMemorySize = (gpuMemorySize > s) ? (gpuMemorySize-s) : 0 ;
    }

    // account for cudaMalloc memory manager overhead in the GPU memory
    if (gpuMemorySize > 1)
    {
        size_t s = 1024 * 1024 ;        // just 1 MB for good measure
        gpuMemorySize = (gpuMemorySize > s) ? (gpuMemorySize-s) : 0 ;
    }
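
    // gpuMemorySize now reflects the memory left over for staging the fronts
    // themselves (front data, child C blocks, S entries, and VT blocks).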

//  printf ("GPU mem now    %g MB\n", (double) gpuMemorySize / (1024*1024)) ;

    // -------------------------------------------------------------------------
    // assign fronts to stages based on remaining GPU memory availability
    // -------------------------------------------------------------------------

    /* The memory requirement for a front is the summation of the memory
       requirements from its children plus its own memory requirement.
       If we use a postorder traversal, we only need to keep a rolling sum. */
    size_t ReqMem = 0;   // also used in FOffsets

    /* RMem is always < ReqMem.
       RMem is not just for R: it is an upper bound on the amount of front data
       we have to pull back from the GPU in the event that the front is staged.
       We would already have room to pull back the CBlock if we account for it,
       and the QREngine is intelligent enough to pull back only the FrontData
       it needs.  (The language is clearer in the QREngine as well, where "R"
       is renamed "FrontData".) */
    size_t RMem = 0;     // also used in ROffsets

    /* SMem is twice the number of S values (one for index, one for value). */
    size_t SMem = 0;     // also used in SOffsets

    /* VTMem is the amount of memory required for the VT blocks. */
    size_t VTMem = 0;

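    /* Visit the fronts in postorder.  A front is appended to the current
       stage when the accumulated stage memory still fits on the GPU;
       otherwise a new stage is started, or the schedule is declared
       infeasible if the front and its children cannot fit by themselves. */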
    Long stage = 0;
    Stagingp[0] = 0;
    for(Long p=0; p<numFronts; p++)
    {
        Long f = Post[p]; // The postordering ensures we visit children first

        Long fm = Fm[f];
        Long fn = Rp[f+1] - Rp[f];
        Long fp = Super[f+1] - Super[f];
        Long frank = MIN(fm, fp);
        // Long cn = fn - fp ;
        Long cm = Cm[f] ;
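        // frontMem is the full fm-by-fn front; rMem bounds the data pulled
        // back to the CPU: frank rows of R plus cm contribution rows, each
        // of length fn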
        size_t frontMem = fm * fn;          // F
        size_t rMem = (frank + cm) * fn;    // R + C

        // for sMem, "2 *" assumes sizeof (SEntry) is 2*sizeof(double)
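        // Rows Sleft [Super [f]] ... Sleft [Super [f+1]]-1 of S are assembled
        // into front f, so the difference of their Sp row pointers counts the
        // S entries belonging to f.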
        size_t sMem = 2 * (Sp[Sleft[Super[f+1]]] - Sp[Sleft[Super[f]]]);

        // CEIL is defined in GPUQREngine
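        // one (TILESIZE+1)-by-TILESIZE VT block for each row tile of the front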
        size_t vtMem = CEIL(fm, TILESIZE) * (TILESIZE+1) * TILESIZE ;

        size_t childMemOld = 0;
        size_t childMemNew = 0;

        /* Tally contribution block memory from the children, split by whether
           each child finished in an earlier stage or belongs to the current
           one. */
        for(Long cp=Childp[f]; cp<Childp[f+1]; cp++)
        {
            Long c = Child[cp];

            // Long cfm = Fm[c];
            Long cfn = Rp[c+1] - Rp[c];
            Long cfp = Super[c+1] - Super[c];
            // Long crank = MIN(cfm, cfp);
            Long ccn = cfn - cfp ;
            Long ccm = Cm[c];

            if(StageMap[c] < stage)
            {
                childMemOld += ccm * ccn;
            }
            else
            {
                childMemNew += ccm * ccn;
            }
        }
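
        /* Children finished in earlier stages have already left the GPU, so
           their C blocks (childMemOld) must be uploaded again for this front.
           Children in the current stage (childMemNew) are still resident, so
           their C blocks cost extra memory only if this front has to move to
           a new stage. */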

        /* determine which stage will contain this front */
        if((ReqMem + (frontMem + childMemOld + sMem + vtMem)) * sizeof(double)
            < gpuMemorySize)
        {
            /* If the front fits in the current stage, accumulate its memory. */
            FOffsets[f] = ReqMem;
            ROffsets[f] = RMem;
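            // SMem counts doubles (two per SEntry), but SOffsets indexes
            // SEntry's, hence the divide by 2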
            SOffsets[f] = SMem / 2; // correct for data width
            ReqMem += frontMem + childMemOld;
            RMem += rMem;
            SMem += sMem;
            VTMem += vtMem;
        }
        else if (gpuMemorySize == 0 ||
            ((frontMem + childMemOld + childMemNew + sMem + vtMem)
             * sizeof(double) < gpuMemorySize))
        {
            /* Else if the front and its children fit on the GPU, add it
               to the next stage and reset the mem accumulator. */
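            /* (gpuMemorySize == 0 is the testing mode described above: it
               forces every front into a stage of its own.) */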
            PR (("gpuMemorySize: move front to next stage\n")) ;
            FSize[stage] = ReqMem;  // save the sizes for the stage
            RSize[stage] = RMem;
            SSize[stage] = SMem / 2; // correct for data width
            Stagingp[++stage] = p;
            ReqMem = 0;             // move onto the next stage
            RMem = 0;
            SMem = 0;
            VTMem = 0;
            FOffsets[f] = ReqMem;
            ROffsets[f] = RMem;
            SOffsets[f] = SMem / 2; // correct for data width
            ReqMem += frontMem + childMemOld + childMemNew;
            RMem += rMem;
            SMem += sMem;
            VTMem += vtMem;
        }
        else
        {
            /* Else the front and its children can't fit on the GPU,
               so we have an infeasible schedule. */
            PR (("gpuMemorySize too small: schedule infeasible\n")) ;
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memory too small\n") ;
            *numStages = 0 ;
            *feasible = false ;
            return ;
        }
        StageMap[f] = stage;
    }

    /* Make sure the last stage is finalized, even if everything fits in a
       single stage. */
    FSize[stage] = ReqMem;
    RSize[stage] = RMem;
    SSize[stage] = SMem / 2;  // correct for data width
    Stagingp[++stage] = numFronts;
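    // if the final stage turned out to be empty, drop it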
    if(Stagingp[stage] == Stagingp[stage-1]) stage--;

    *numStages = stage;
    *feasible = true;
    return;
}
#endif