File: GB_AxB_dot3_slice.c

//------------------------------------------------------------------------------
// GB_AxB_dot3_slice: slice the entries and vectors for C<M>=A'*B
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// Constructs a set of tasks that slice the matrix C for C<M>=A'*B.  The matrix
// C has been allocated, and its pattern will be a copy of M, but with some
// entries possibly turned into zombies.  However, on input, the pattern C->i
// does not yet contain the row indices; instead, each entry of C->i holds
// the work required to compute the corresponding dot product
// C(i,j)<M(i,j)> = A(:,i)'*B(:,j).
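
// For example (illustrative values only): on input, C->i [p] = 4 means that
// entry p of C is expected to take about 4 units of work; only after the dot
// products are computed does C->i [p] hold the row index itself (or its
// zombie, a flipped index that marks a deleted entry).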

// The strategy for slicing C and M is like that of GB_ek_slice, but for
// coarse tasks only.  These coarse tasks differ from the tasks generated by
// GB_ewise_slice, since they may start in the middle of a vector.  If a
// single entry C(i,j) is costly to compute, it may be placed by itself in
// its own coarse task.
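
// For example, a coarse task owning the entry range pC:pC_end-1 may find that
// pC falls in the middle of vector C(:,kfirst).  That task computes only the
// tail of C(:,kfirst), all of C(:,kfirst+1:klast-1), and a leading portion of
// C(:,klast); the entries before pC belong to the prior task.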

// FUTURE:: Ultra-fine tasks could also be constructed, so that the computation
// of a single entry C(i,j) can be broken into multiple tasks.  The slice of
// A(:,i) and B(:,j) would use GB_slice_vector, where no mask would be used.

#define GB_FREE_WORKSPACE                       \
{                                               \
    GB_WERK_POP (Coarse, int64_t) ;             \
}

#define GB_FREE_ALL                             \
{                                               \
    GB_FREE_WORKSPACE ;                         \
    GB_FREE_WORK (&TaskList, TaskList_size) ;   \
}

#include "GB_mxm.h"
#include "GB_search_for_vector_template.c"

//------------------------------------------------------------------------------
// GB_AxB_dot3_slice
//------------------------------------------------------------------------------

GrB_Info GB_AxB_dot3_slice
(
    // output:
    GB_task_struct **p_TaskList,    // array of structs
    size_t *p_TaskList_size,        // size of TaskList
    int *p_ntasks,                  // # of tasks constructed
    int *p_nthreads,                // # of threads to use
    // input:
    const GrB_Matrix C,             // matrix to slice
    GB_Context Context
)
{

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    ASSERT (p_TaskList != NULL) ;
    ASSERT (p_TaskList_size != NULL) ;
    ASSERT (p_ntasks != NULL) ;
    ASSERT (p_nthreads != NULL) ;
    // ASSERT_MATRIX_OK (C, ...) cannot be done, since C->i holds the work
    // needed to compute each entry, not the row index itself.

    // C is always constructed as sparse or hypersparse, not full, since it
    // must accommodate zombies
    ASSERT (!GB_IS_FULL (C)) ;
    ASSERT (!GB_IS_BITMAP (C)) ;

    (*p_TaskList  ) = NULL ;
    (*p_TaskList_size) = 0 ;
    (*p_ntasks    ) = 0 ;
    (*p_nthreads  ) = 1 ;

    //--------------------------------------------------------------------------
    // determine # of threads to use
    //--------------------------------------------------------------------------

    GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;

    //--------------------------------------------------------------------------
    // get C
    //--------------------------------------------------------------------------

    const int64_t *restrict Cp = C->p ;
    int64_t *restrict Cwork = C->i ;
    const int64_t cnvec = C->nvec ;
    const int64_t cvlen = C->vlen ;
    const int64_t cnz = GB_nnz_held (C) ;

    //--------------------------------------------------------------------------
    // compute the cumulative sum of the work
    //--------------------------------------------------------------------------

    // FUTURE:: handle possible int64_t overflow

    int nthreads = GB_nthreads (cnz, chunk, nthreads_max) ;
    GB_cumsum (Cwork, cnz, NULL, nthreads, Context) ;
    double total_work = (double) Cwork [cnz] ;
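
    // Illustrative example (made-up values): if cnz = 4 and the work
    // estimates are Cwork = [3 1 8 2], then after GB_cumsum, Cwork holds
    // [0 3 4 12 14] and total_work = Cwork [cnz] = 14.  Entry p of C thus
    // owns the work interval [Cwork [p], Cwork [p+1]).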

    //--------------------------------------------------------------------------
    // allocate the initial TaskList
    //--------------------------------------------------------------------------

    GB_WERK_DECLARE (Coarse, int64_t) ;
    int ntasks1 = 0 ;
    nthreads = GB_nthreads (total_work, chunk, nthreads_max) ;
    GB_task_struct *restrict TaskList = NULL ; size_t TaskList_size = 0 ;
    int max_ntasks = 0 ;
    int ntasks = 0 ;
    int ntasks0 = (nthreads == 1) ? 1 : (32 * nthreads) ;
    GB_REALLOC_TASK_WORK (TaskList, ntasks0, max_ntasks) ;

    //--------------------------------------------------------------------------
    // check for quick return for a single task
    //--------------------------------------------------------------------------

    if (cnvec == 0 || ntasks0 == 1)
    { 
        // construct a single coarse task that does all the work
        TaskList [0].kfirst = 0 ;
        TaskList [0].klast  = cnvec-1 ;
        TaskList [0].pC = 0 ;
        TaskList [0].pC_end  = cnz ;
        (*p_TaskList  ) = TaskList ;
        (*p_TaskList_size) = TaskList_size ;
        (*p_ntasks    ) = (cnvec == 0) ? 0 : 1 ;
        (*p_nthreads  ) = 1 ;
        return (GrB_SUCCESS) ;
    }

    //--------------------------------------------------------------------------
    // determine # of threads and tasks
    //--------------------------------------------------------------------------

    double target_task_size = total_work / (double) (ntasks0) ;
    target_task_size = GB_IMAX (target_task_size, chunk) ;
    ntasks1 = (int) (total_work / target_task_size) ;
    ntasks1 = GB_IMIN (ntasks1, cnz) ;
    ntasks1 = GB_IMAX (ntasks1, 1) ;
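
    // Illustrative example (made-up values): with total_work = 1e6, a chunk
    // of 64K, and ntasks0 = 128, the target task size is
    // max (1e6/128, 64K) = 64K, so ntasks1 = floor (1e6/64K) = 15 coarse
    // tasks, assuming cnz >= 15.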

    //--------------------------------------------------------------------------
    // slice the work into coarse tasks
    //--------------------------------------------------------------------------

    GB_WERK_PUSH (Coarse, ntasks1 + 1, int64_t) ;
    if (Coarse == NULL)
    { 
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }
    GB_pslice (Coarse, Cwork, cnz, ntasks1, false) ;
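
    // Continuing the illustrative example above: with Cwork = [0 3 4 12 14]
    // and ntasks1 = 2, GB_pslice places the task boundary near half the
    // total work (14/2 = 7), giving (say) Coarse = [0 3 4]: task 0 owns
    // entries 0..2 and task 1 owns entry 3.  The exact boundary depends on
    // the binary search inside GB_pslice.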

    //--------------------------------------------------------------------------
    // construct all tasks, both coarse and fine
    //--------------------------------------------------------------------------

    for (int t = 0 ; t < ntasks1 ; t++)
    {

        //----------------------------------------------------------------------
        // coarse task t operates on C (:, kfirst:klast)
        //----------------------------------------------------------------------

        int64_t pfirst = Coarse [t] ;
        int64_t plast  = Coarse [t+1] - 1 ;

        if (pfirst <= plast)
        { 
            // find the first vector of the slice for coarse task t: the
            // vector that owns the entry Ci [pfirst] and Cx [pfirst].
            int64_t kfirst = GB_search_for_vector (pfirst, Cp, 0, cnvec,
                cvlen) ;

            // find the last vector of the slice for coarse task t: the
            // vector that owns the entry Ci [plast] and Cx [plast].
            int64_t klast = GB_search_for_vector (plast, Cp, kfirst, cnvec,
                cvlen) ;
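
            // Note: GB_search_for_vector (p, Cp, kleft, cnvec, cvlen)
            // returns the vector k, with kleft <= k < cnvec, that owns
            // position p, so that Cp [k] <= p < Cp [k+1].  The second search
            // starts at kfirst, since plast >= pfirst implies klast >=
            // kfirst.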

            // construct a coarse task that computes Ci,Cx [pfirst:plast].
            // These entries appear in C(:,kfirst:klast), but this task does
            // not compute all of C(:,kfirst), just the subset starting at
            // Ci,Cx [pfirst].  The task computes all of the vectors
            // C(:,kfirst+1:klast-1).  The task computes only part of the
            // last vector, ending at Ci,Cx [plast] (that is, pC_end-1).
            // This slice strategy is the same as GB_ek_slice.

            GB_REALLOC_TASK_WORK (TaskList, ntasks + 1, max_ntasks) ;
            TaskList [ntasks].kfirst = kfirst ;
            TaskList [ntasks].klast  = klast ;
            ASSERT (kfirst <= klast) ;
            TaskList [ntasks].pC     = pfirst ;
            TaskList [ntasks].pC_end = plast + 1 ;
            ntasks++ ;

        }
        else
        { 
            // This task is empty, which means the coarse task that computes
            // C(i,j) is doing too much work.
            // FUTURE:: Use ultra-fine tasks here instead, and split the work
            // for computing the single dot product C(i,j) amongst several
            // ultra-fine tasks.
            ;
        }
    }

    ASSERT (ntasks <= max_ntasks) ;

    //--------------------------------------------------------------------------
    // free workspace and return result
    //--------------------------------------------------------------------------

    GB_FREE_WORKSPACE ;
    (*p_TaskList  ) = TaskList ;
    (*p_TaskList_size) = TaskList_size ;
    (*p_ntasks    ) = ntasks ;
    (*p_nthreads  ) = nthreads ;
    return (GrB_SUCCESS) ;
}