1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122
|
//------------------------------------------------------------------------------
// GB_AxB_saxpy4_tasks: construct tasks for saxpy4 and bitmap_saxpy
//------------------------------------------------------------------------------
// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
//------------------------------------------------------------------------------
// GB_AxB_saxpy4_tasks constructs the tasks for GB_AxB_saxpy4, and for
// GB_AxB_bitmap_saxpy when A is sparse/hyper and B is bitmap/full.
#include "GB_AxB_saxpy.h"
void GB_AxB_saxpy4_tasks
(
    // output
    int *p_ntasks, // # of tasks to use
    int *p_nthreads, // # of threads to use
    int *p_nfine_tasks_per_vector, // # of tasks per vector (fine case only)
    bool *p_use_coarse_tasks, // if true, use coarse tasks
    bool *p_use_atomics, // if true, use atomics
    // input
    int64_t anz, // # of entries in A (sparse or hyper)
    int64_t bnz, // # of entries held in B
    int64_t bvdim, // # of vectors of B (bitmap or full)
    int64_t cvlen, // # of vectors of C (bitmap or full)
    GB_Context Context
)
{

    //--------------------------------------------------------------------------
    // estimate the total work and pick the thread count
    //--------------------------------------------------------------------------

    GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;
    // flop-count estimate: each entry of A is applied to each vector of B
    double flops = ((double) anz) * (double) bvdim ;
    int nthreads = GB_nthreads (flops, chunk, nthreads_max) ;

    int ntasks ;
    int tasks_per_vector = 0 ;      // nonzero only in the fine-task case
    bool coarse ;                   // true: coarse tasks, false: fine tasks
    bool atomics = false ;          // fine tasks may or may not use atomics

    //--------------------------------------------------------------------------
    // choose the task decomposition
    //--------------------------------------------------------------------------

    if (bvdim == 0 || nthreads == 1)
    {

        //----------------------------------------------------------------------
        // trivial case: one coarse task does everything, sequentially
        //----------------------------------------------------------------------

        coarse = true ;
        ntasks = 1 ;
        GBURBLE ("(coarse, threads: 1) ") ;

    }
    else if (nthreads <= 2 * bvdim)
    {

        //----------------------------------------------------------------------
        // coarse tasks only: each task owns one or more whole vectors of B
        //----------------------------------------------------------------------

        coarse = true ;
        ntasks = GB_IMIN (bvdim, 2 * nthreads) ;
        // never use more threads than tasks
        nthreads = GB_IMIN (ntasks, nthreads) ;
        GBURBLE ("(coarse, threads: %d, tasks %d) ", nthreads, ntasks) ;

    }
    else
    {

        //----------------------------------------------------------------------
        // fine tasks: each task owns a slice of a single vector of B
        //----------------------------------------------------------------------

        // Every vector of B is split across the same number of fine tasks.
        // Two heuristics decide between the atomic and non-atomic variants:
        // arithmetic intensity (work per entry of C) and the relative size of
        // the per-thread workspace the non-atomic method would require.

        coarse = false ;
        double cnz = ((double) cvlen) * ((double) bvdim) ;
        double intensity = flops / fmax (cnz, 1) ;
        double workspace = ((double) cvlen) * ((double) nthreads) ;
        double relwspace = workspace / fmax (anz + bnz + cnz, 1) ;
        GBURBLE ("(threads: %d, relwspace: %0.3g, intensity: %0.3g",
            nthreads, relwspace, intensity) ;

        if (intensity > 2 && relwspace < 0.5)
        {
            // high intensity and modest workspace: non-atomic, with workspace
            ntasks = nthreads ;
            GBURBLE (": fine non-atomic, ") ;
        }
        else
        {
            // atomic method: no workspace, but oversubscribe with extra tasks
            atomics = true ;
            ntasks = 4 * nthreads ;
            GBURBLE (": fine atomic, ") ;
        }

        // round ntasks up to a whole number of tasks per vector
        tasks_per_vector = ceil ((double) ntasks / (double) bvdim) ;
        ntasks = bvdim * tasks_per_vector ;
        // this branch requires nthreads > 2*bvdim, so each vector gets >= 2
        ASSERT (tasks_per_vector > 1) ;
        GBURBLE ("tasks: %d, tasks per vector: %d) ", ntasks,
            tasks_per_vector) ;
    }

    //--------------------------------------------------------------------------
    // return result
    //--------------------------------------------------------------------------

    (*p_ntasks) = ntasks ;
    (*p_nthreads) = nthreads ;
    (*p_nfine_tasks_per_vector) = tasks_per_vector ;
    (*p_use_coarse_tasks) = coarse ;
    (*p_use_atomics) = atomics ;
}
|