File: GB_transpose_method.c

//------------------------------------------------------------------------------
// GB_transpose_method: select method for GB_transpose
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

#include "GB_transpose.h"

// GB_transpose can choose between a merge-sort-based method that takes
// O(anz*log(anz)) time, and a bucket-sort method that takes O(anz+m+n) time.
// The bucket sort has 3 methods: sequential, atomic, and non-atomic.
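//
// A rough worked example (hypothetical sizes, for illustration only): for a
// matrix with anz = 2^20 entries and m = n = 2^16, the builder method does on
// the order of anz*log2(anz), about 21 million units of work, while the
// bucket method does on the order of anz+m+n, about 1.2 million units.  The
// constants in front of these counts differ greatly in practice, so the
// heuristics below refine this comparison rather than relying on the
// asymptotic counts alone.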

bool GB_transpose_method        // if true: use GB_builder, false: use bucket
(
    const GrB_Matrix A,         // matrix to transpose
    int *nworkspaces_bucket,    // # of slices of A for the bucket method
    int *nthreads_bucket,       // # of threads to use for the bucket method
    GB_Context Context
)
{

    //--------------------------------------------------------------------------
    // get inputs
    //--------------------------------------------------------------------------

    // if available, A->nvec_nonempty is used to select the method
    int64_t anvec = (A->nvec_nonempty < 0) ? A->nvec : A->nvec_nonempty ;
    int64_t anz = GB_nnz (A) ;
    int64_t avlen = A->vlen ;
//  int64_t avdim = A->vdim ;
    int anzlog = (anz   == 0) ? 1 : (int) GB_CEIL_LOG2 (anz) ;
    int mlog   = (avlen == 0) ? 1 : (int) GB_CEIL_LOG2 (avlen) ;
    double bucket_factor ;

    // determine # of threads for bucket method
    GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;
    int nthreads = GB_nthreads (anz + avlen, chunk, nthreads_max) ;

    //--------------------------------------------------------------------------
    // select between the atomic and non-atomic bucket method
    //--------------------------------------------------------------------------

    bool atomics ;
    if (nthreads <= 2)
    { 
        // sequential bucket method: no atomics needed
        // 2 threads: always use non-atomic method
        atomics = false ;
    }
    else if ((double) nthreads * (double) avlen > (double) anz)
    { 
        // the non-atomic method requires too much workspace; use the atomic
        // method
        atomics = true ;
    }
    else
    {
        // select between atomic and non-atomic methods.  This rule is based on
        // performance on a 4-core system with 4 threads with gcc 7.5.  The icc
        // compiler has much slower atomics than gcc and so atol should likely
        // be smaller when using icc.

        int atol ;
        if (anzlog < 14)
        { 
            atol = -4 ;     // fewer than 16K entries in A
        }
        else
        { 
            switch (anzlog)
            {
                case 14: atol = -4 ; break ;        // 16K entries in A
                case 15: atol = -3 ; break ;        // 32K
                case 16: atol = -2 ; break ;        // 64K
                case 17: atol = -1 ; break ;        // 128K
                case 18: atol =  0 ; break ;        // 256K
                case 19: atol =  1 ; break ;        // 512K
                case 20: atol =  2 ; break ;        // 1M
                case 21: atol =  3 ; break ;        // 2M
                case 22: atol =  4 ; break ;        // 4M
                case 23: atol =  5 ; break ;        // 8M
                case 24: atol =  6 ; break ;        // 16M
                case 25: atol =  8 ; break ;        // 32M
                case 26: atol =  9 ; break ;        // 64M
                case 27: atol =  9 ; break ;        // 128M
                case 28: atol = 10 ; break ;        // 256M
                default: atol = 10 ; break ;        // > 256M
            }
        }

        if (anzlog - mlog <= atol)
        { 
            // use atomic method
            // anzlog - mlog is the log2 of the average row degree, rounded.
            // If the average row degree is <= 2^atol, use the atomic method.
            // That is, the atomic method works better for sparser matrices,
            // and the non-atomic method works better for denser matrices.
            // However, the threshold changes as the problem gets larger, in
            // terms of the # of entries in A, as the atomic method becomes
            // more attractive relative to the non-atomic method.  The atomic
            // method has the advantage of needing much less workspace, which
            // becomes more important for larger problems.
            // important for larger problems.
            atomics = true ;
        }
        else
        { 
            // use non-atomic method
            atomics = false ;
        }
    }
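
    // Worked example (hypothetical sizes, for illustration only): suppose the
    // tests above fall through to the atol rule, with anz = 2^20 entries and
    // avlen = 2^16 rows, so anzlog = 20, mlog = 16, and atol = 2 from the
    // table above.  Then anzlog - mlog = 4 > atol, the average row degree is
    // about 2^4 = 16 entries per row, and the non-atomic method is selected.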

    (*nworkspaces_bucket) = (atomics) ? 1 : nthreads ;
    (*nthreads_bucket) = nthreads ;

    //--------------------------------------------------------------------------
    // select between GB_builder method and bucket method
    //--------------------------------------------------------------------------

    // As the problem gets larger, the GB_builder method gets faster relative
    // to the bucket method, in terms of the "constants" in the O(a log a) work
    // for GB_builder, or O (a+m+n) for the bucket method.  Clearly, O (a log
    // a) and O (a+m+n) do not fully model the performance of these two
    // methods.  Perhaps this is because of cache effects.  The bucket method
    // has more irregular memory accesses.  The GB_builder method uses
    // mergesort, which has good memory locality.

    if (anzlog < 14)
    { 
        bucket_factor = 0.5 ;       // fewer than 2^14 = 16K entries
    }
    else
    { 
        switch (anzlog)
        {
            case 14: bucket_factor = 0.6 ; break ;      // 16K entries in A
            case 15: bucket_factor = 0.7 ; break ;      // 32K
            case 16: bucket_factor = 1.0 ; break ;      // 64K
            case 17: bucket_factor = 1.7 ; break ;      // 128K
            case 18: bucket_factor = 3.0 ; break ;      // 256K
            case 19: bucket_factor = 4.0 ; break ;      // 512K
            case 20: bucket_factor = 6.0 ; break ;      // 1M
            case 21: bucket_factor = 7.0 ; break ;      // 2M
            case 22: bucket_factor = 8.0 ; break ;      // 4M
            case 23: bucket_factor = 5.0 ; break ;      // 8M
            case 24: bucket_factor = 5.0 ; break ;      // 16M
            case 25: bucket_factor = 5.0 ; break ;      // 32M
            case 26: bucket_factor = 5.0 ; break ;      // 64M
            case 27: bucket_factor = 5.0 ; break ;      // 128M
            case 28: bucket_factor = 5.0 ; break ;      // 256M
            default: bucket_factor = 5.0 ; break ;      // > 256M
        }
    }

    double bucket_work  = (double) (anz + avlen + anvec) * bucket_factor ;
    double builder_work = (log2 ((double) anz + 1) * (anz)) ;
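
    // Worked example (hypothetical sizes, for illustration only): with
    // anz = 2^20 (so anzlog = 20 and bucket_factor = 6.0) and avlen = anvec =
    // 2^16, bucket_work is about (2^20 + 2*2^16) * 6 = 7.1 million, while
    // builder_work is about 20 * 2^20 = 21 million, so the bucket method
    // would win the comparison below.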

    //--------------------------------------------------------------------------
    // select the method with the least amount of work
    //--------------------------------------------------------------------------

    bool use_builder = (builder_work < bucket_work) ;
    return (use_builder) ;
}
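
// A minimal usage sketch (illustrative only; in the library this function is
// called from GB_transpose):
//
//      int nworkspaces, nthreads ;
//      bool use_builder = GB_transpose_method (A, &nworkspaces, &nthreads,
//          Context) ;
//      if (use_builder)
//      {
//          // transpose A via the GB_builder (mergesort-based) method
//      }
//      else
//      {
//          // transpose A via the bucket method, using nworkspaces workspace
//          // copies and nthreads threads
//      }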