//------------------------------------------------------------------------------
// GB_AxB_dot3_slice: slice the entries and vectors for C<M>=A'*B
//------------------------------------------------------------------------------
// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
//------------------------------------------------------------------------------
// Constructs a set of tasks that slice the matrix C for C<M>=A'*B. The matrix
// C has been allocated, and its pattern will be a copy of M, but with some
// entries possibly turned into zombies. However, on input, the pattern C->i
// does not yet contain the indices, but the work required to compute each
// entry in the dot product C(i,j)<M(i,j)> = A(:,i)'*B(:,j).
// The strategy for slicing of C and M is like GB_ek_slice, for coarse tasks.
// These coarse tasks differ from the tasks generated by GB_ewise_slice,
// since they may start in the middle of a vector. If a single entry C(i,j)
// is costly to compute, it is possible that it is placed by itself in a
// single coarse task.
// FUTURE:: Ultra-fine tasks could also be constructed, so that the computation
// of a single entry C(i,j) can be broken into multiple tasks. The slice of
// A(:,i) and B(:,j) would use GB_slice_vector, where no mask would be used.
#define GB_FREE_WORKSPACE \
{ \
GB_WERK_POP (Coarse, int64_t) ; \
}
#define GB_FREE_ALL \
{ \
GB_FREE_WORKSPACE ; \
GB_FREE_WORK (&TaskList, TaskList_size) ; \
}
#include "GB_mxm.h"
#include "GB_search_for_vector_template.c"
//------------------------------------------------------------------------------
// GB_AxB_dot3_slice
//------------------------------------------------------------------------------
GrB_Info GB_AxB_dot3_slice
(
// output:
GB_task_struct **p_TaskList, // array of structs
size_t *p_TaskList_size, // size of TaskList
int *p_ntasks, // # of tasks constructed
int *p_nthreads, // # of threads to use
// input:
const GrB_Matrix C, // matrix to slice
GB_Context Context
)
{
//--------------------------------------------------------------------------
// check inputs
//--------------------------------------------------------------------------
ASSERT (p_TaskList != NULL) ;
ASSERT (p_TaskList_size != NULL) ;
ASSERT (p_ntasks != NULL) ;
ASSERT (p_nthreads != NULL) ;
// ASSERT_MATRIX_OK (C, ...) cannot be done since C->i is the work need to
// compute the entry, not the row index itself.
// C is always constructed as sparse or hypersparse, not full, since it
// must accomodate zombies
ASSERT (!GB_IS_FULL (C)) ;
ASSERT (!GB_IS_BITMAP (C)) ;
(*p_TaskList ) = NULL ;
(*p_TaskList_size) = 0 ;
(*p_ntasks ) = 0 ;
(*p_nthreads ) = 1 ;
//--------------------------------------------------------------------------
// determine # of threads to use
//--------------------------------------------------------------------------
GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;
//--------------------------------------------------------------------------
// get C
//--------------------------------------------------------------------------
const int64_t *restrict Cp = C->p ;
int64_t *restrict Cwork = C->i ;
const int64_t cnvec = C->nvec ;
const int64_t cvlen = C->vlen ;
const int64_t cnz = GB_nnz_held (C) ;
//--------------------------------------------------------------------------
// compute the cumulative sum of the work
//--------------------------------------------------------------------------
// FUTURE:: handle possible int64_t overflow
int nthreads = GB_nthreads (cnz, chunk, nthreads_max) ;
GB_cumsum (Cwork, cnz, NULL, nthreads, Context) ;
double total_work = (double) Cwork [cnz] ;
//--------------------------------------------------------------------------
// allocate the initial TaskList
//--------------------------------------------------------------------------
GB_WERK_DECLARE (Coarse, int64_t) ;
int ntasks1 = 0 ;
nthreads = GB_nthreads (total_work, chunk, nthreads_max) ;
GB_task_struct *restrict TaskList = NULL ; size_t TaskList_size = 0 ;
int max_ntasks = 0 ;
int ntasks = 0 ;
int ntasks0 = (nthreads == 1) ? 1 : (32 * nthreads) ;
GB_REALLOC_TASK_WORK (TaskList, ntasks0, max_ntasks) ;
//--------------------------------------------------------------------------
// check for quick return for a single task
//--------------------------------------------------------------------------
if (cnvec == 0 || ntasks0 == 1)
{
// construct a single coarse task that does all the work
TaskList [0].kfirst = 0 ;
TaskList [0].klast = cnvec-1 ;
TaskList [0].pC = 0 ;
TaskList [0].pC_end = cnz ;
(*p_TaskList ) = TaskList ;
(*p_TaskList_size) = TaskList_size ;
(*p_ntasks ) = (cnvec == 0) ? 0 : 1 ;
(*p_nthreads ) = 1 ;
return (GrB_SUCCESS) ;
}
//--------------------------------------------------------------------------
// determine # of threads and tasks
//--------------------------------------------------------------------------
double target_task_size = total_work / (double) (ntasks0) ;
target_task_size = GB_IMAX (target_task_size, chunk) ;
ntasks1 = total_work / target_task_size ;
ntasks1 = GB_IMIN (ntasks1, cnz) ;
ntasks1 = GB_IMAX (ntasks1, 1) ;
//--------------------------------------------------------------------------
// slice the work into coarse tasks
//--------------------------------------------------------------------------
GB_WERK_PUSH (Coarse, ntasks1 + 1, int64_t) ;
if (Coarse == NULL)
{
// out of memory
GB_FREE_ALL ;
return (GrB_OUT_OF_MEMORY) ;
}
GB_pslice (Coarse, Cwork, cnz, ntasks1, false) ;
//--------------------------------------------------------------------------
// construct all tasks, both coarse and fine
//--------------------------------------------------------------------------
for (int t = 0 ; t < ntasks1 ; t++)
{
//----------------------------------------------------------------------
// coarse task operates on A (:, k:klast)
//----------------------------------------------------------------------
int64_t pfirst = Coarse [t] ;
int64_t plast = Coarse [t+1] - 1 ;
if (pfirst <= plast)
{
// find the first vector of the slice for task taskid: the
// vector that owns the entry Ci [pfirst] and Cx [pfirst].
int64_t kfirst = GB_search_for_vector (pfirst, Cp, 0, cnvec,
cvlen) ;
// find the last vector of the slice for task taskid: the
// vector that owns the entry Ci [plast] and Cx [plast].
int64_t klast = GB_search_for_vector (plast, Cp, kfirst, cnvec,
cvlen) ;
// construct a coarse task that computes Ci,Cx [pfirst:plast].
// These entries appear in C(:,kfirst:klast), but this task does
// not compute all of C(:,kfirst), but just the subset starting at
// Ci,Cx [pstart]. The task computes all of the vectors
// C(:,kfirst+1:klast-1). The task computes only part of the last
// vector, ending at Ci,Cx [pC_end-1] or Ci,Cx [plast]. This
// slice strategy is the same as GB_ek_slice.
GB_REALLOC_TASK_WORK (TaskList, ntasks + 1, max_ntasks) ;
TaskList [ntasks].kfirst = kfirst ;
TaskList [ntasks].klast = klast ;
ASSERT (kfirst <= klast) ;
TaskList [ntasks].pC = pfirst ;
TaskList [ntasks].pC_end = plast + 1 ;
ntasks++ ;
}
else
{
// This task is empty, which means the coarse task that computes
// C(i,j) is doing too much work.
// FUTURE:: Use ultra-fine tasks here instead, and split the work
// for computing the single dot product C(i,j) amongst several
// ultra-fine tasks.
;
}
}
ASSERT (ntasks <= max_ntasks) ;
//--------------------------------------------------------------------------
// free workspace and return result
//--------------------------------------------------------------------------
GB_FREE_WORKSPACE ;
(*p_TaskList ) = TaskList ;
(*p_TaskList_size) = TaskList_size ;
(*p_ntasks ) = ntasks ;
(*p_nthreads ) = nthreads ;
return (GrB_SUCCESS) ;
}