1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166
|
//------------------------------------------------------------------------------
// GB_transpose_method: select method for GB_transpose
//------------------------------------------------------------------------------
// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
//------------------------------------------------------------------------------
#include "GB_transpose.h"
// GB_transpose can use choose between a merge-sort-based method that takes
// O(anz*log(anz)) time, or a bucket-sort method that takes O(anz+m+n) time.
// The bucket sort has 3 methods: sequential, atomic, and non-atomic.
bool GB_transpose_method // if true: use GB_builder, false: use bucket
(
const GrB_Matrix A, // matrix to transpose
int *nworkspaces_bucket, // # of slices of A for the bucket method
int *nthreads_bucket, // # of threads to use for the bucket method
GB_Context Context
)
{
//--------------------------------------------------------------------------
// get inputs
//--------------------------------------------------------------------------
// if available, A->nvec_nonempty is used to select the method
int64_t anvec = (A->nvec_nonempty < 0) ? A->nvec : A->nvec_nonempty ;
int64_t anz = GB_nnz (A) ;
int64_t avlen = A->vlen ;
// int64_t avdim = A->vdim ;
int anzlog = (anz == 0) ? 1 : (int) GB_CEIL_LOG2 (anz) ;
int mlog = (avlen == 0) ? 1 : (int) GB_CEIL_LOG2 (avlen) ;
double bucket_factor ;
// determine # of threads for bucket method
GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;
int nthreads = GB_nthreads (anz + avlen, chunk, nthreads_max) ;
//--------------------------------------------------------------------------
// select between the atomic and non-atomic bucket method
//--------------------------------------------------------------------------
bool atomics ;
if (nthreads <= 2)
{
// sequential bucket method: no atomics needed
// 2 threads: always use non-atomic method
atomics = false ;
}
else if ((double) nthreads * (double) avlen > (double) anz)
{
// non-atomic workspace is too high; use atomic method
atomics = true ;
}
else
{
// select between atomic and non-atomic methods. This rule is based on
// performance on a 4-core system with 4 threads with gcc 7.5. The icc
// compiler has much slower atomics than gcc and so atol should likely
// be smaller when using icc.
int atol ;
if (anzlog < 14)
{
atol = -4 ; // fewer than 16K entries in A
}
else
{
switch (anzlog)
{
case 14: atol = -4 ; break ; // 16K entries in A
case 15: atol = -3 ; break ; // 32K
case 16: atol = -2 ; break ; // 64K
case 17: atol = -1 ; break ; // 128K
case 18: atol = 0 ; break ; // 256K
case 19: atol = 1 ; break ; // 512K
case 20: atol = 2 ; break ; // 1M
case 21: atol = 3 ; break ; // 2M
case 22: atol = 4 ; break ; // 4M
case 23: atol = 5 ; break ; // 8M
case 24: atol = 6 ; break ; // 16M
case 25: atol = 8 ; break ; // 32M
case 26: atol = 9 ; break ; // 64M
case 27: atol = 9 ; break ; // 128M
case 28: atol = 10 ; break ; // 256M
default: atol = 10 ; break ; // > 256M
}
}
if (anzlog - mlog <= atol)
{
// use atomic method
// anzlog - mlog is the log2 of the average row degree, rounded.
// If the average row degree is <= 2^atol, use the atomic method.
// That is, the atomic method works better for sparser matrices,
// and the non-atomic works better or denser matrices. However,
// the threshold changes as the problem gets larger, in terms of #
// of entries in A, when the atomic method becomes more attractive
// relative to the non-atomic method. The atomic has the
// advantange of needing much less workspace, which becomes more
// important for larger problems.
atomics = true ;
}
else
{
// use non-atomic method
atomics = false ;
}
}
(*nworkspaces_bucket) = (atomics) ? 1 : nthreads ;
(*nthreads_bucket) = nthreads ;
//--------------------------------------------------------------------------
// select between GB_builder method and bucket method
//--------------------------------------------------------------------------
// As the problem gets larger, the GB_builder method gets faster relative
// to the bucket method, in terms of the "constants" in the O(a log a) work
// for GB_builder, or O (a+m+n) for the bucket method. Clearly, O (a log
// a) and O (a+m+n) do not fully model the performance of these two
// methods. Perhaps this is because of cache effects. The bucket method
// has more irregular memory accesses. The GB_builder method uses
// mergesort, which has good memory locality.
if (anzlog < 14)
{
bucket_factor = 0.5 ; // fewer than 2^14 = 16K entries
}
else
{
switch (anzlog)
{
case 14: bucket_factor = 0.6 ; break ; // 16K entries in A
case 15: bucket_factor = 0.7 ; break ; // 32K
case 16: bucket_factor = 1.0 ; break ; // 64K
case 17: bucket_factor = 1.7 ; break ; // 128K
case 18: bucket_factor = 3.0 ; break ; // 256K
case 19: bucket_factor = 4.0 ; break ; // 512K
case 20: bucket_factor = 6.0 ; break ; // 1M
case 21: bucket_factor = 7.0 ; break ; // 2M
case 22: bucket_factor = 8.0 ; break ; // 4M
case 23: bucket_factor = 5.0 ; break ; // 8M
case 24: bucket_factor = 5.0 ; break ; // 16M
case 25: bucket_factor = 5.0 ; break ; // 32M
case 26: bucket_factor = 5.0 ; break ; // 64M
case 27: bucket_factor = 5.0 ; break ; // 128M
case 28: bucket_factor = 5.0 ; break ; // 256M
default: bucket_factor = 5.0 ; break ; // > 256M
}
}
double bucket_work = (double) (anz + avlen + anvec) * bucket_factor ;
double builder_work = (log2 ((double) anz + 1) * (anz)) ;
//--------------------------------------------------------------------------
// select the method with the least amount of work
//--------------------------------------------------------------------------
bool use_builder = (builder_work < bucket_work) ;
return (use_builder) ;
}
|