File: GB_transpose_bucket.c

Package: suitesparse-graphblas 7.4.0+dfsg-2

//------------------------------------------------------------------------------
// GB_transpose_bucket: transpose and optionally typecast and/or apply operator
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// C = A' or op(A').  Optionally typecasts from A->type to the new type ctype,
// and/or optionally applies a unary operator.

// If an operator z=op(x) is provided, the type of z must be the same as the
// type of C.  The type of A must be compatible with the type of x (A is
// typecasted into the type of x).  These conditions must be checked in the
// caller.

// This function is agnostic to the CSR/CSC format of C and A.  C_is_csc is
// defined by the caller and assigned to C->is_csc, but otherwise unused.
// A->is_csc is ignored.

// The input can be hypersparse or non-hypersparse.  The output C is always
// non-hypersparse, and never shallow.  On input, C is a static header.

// If A is m-by-n in CSC format, with e nonzeros, the time and memory taken is
// O(m+n+e) if A is non-hypersparse, or O(m+e) if hypersparse.  This is fine if
// most rows and columns of A are non-empty, but can be very costly if A or A'
// is hypersparse.  In particular, if A is a non-hypersparse column vector with
// m >> e, the time and memory is O(m), which can be huge.  Thus, for
// hypersparse matrices, or for very sparse matrices, the qsort method should
// be used instead (see GB_transpose).
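
// A minimal sketch of the single-threaded bucket method, assuming a CSC
// matrix with pattern Ap,Ai and values Ax (the names w, q, Ci, and Cx are
// illustrative only, not the identifiers used below):
//
//      for (p = 0 ; p < e ; p++) w [Ai [p]]++ ;   // count entries per row
//      for (i = 0, s = 0 ; i < m ; i++)           // exclusive prefix sum:
//      { c = w [i] ; w [i] = s ; s += c ; }       // w [i] = start of bucket i
//      memcpy (Cp, w, ...) ;                      // w becomes C->p
//      for (j = 0 ; j < n ; j++)                  // scatter each A(i,j) into
//          for (p = Ap [j] ; p < Ap [j+1] ; p++)  // the next slot of bucket i
//          { q = w [Ai [p]]++ ; Ci [q] = j ; Cx [q] = Ax [p] ; }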

// This method is parallel, but not highly scalable.  At most O(e/m) threads
// are used; beyond that, each thread handles fewer than O(m) entries, so the
// O(m) cost of initializing and combining each per-thread workspace (in the
// non-atomic case) would dominate the run time.

#include "GB_transpose.h"

#define GB_FREE_WORKSPACE                                               \
{                                                                       \
    if (Workspaces != NULL && Workspaces_size != NULL)                  \
    {                                                                   \
        for (int tid = 0 ; tid < nworkspaces ; tid++)                   \
        {                                                               \
            GB_FREE_WORK (&(Workspaces [tid]), Workspaces_size [tid]) ; \
        }                                                               \
    }                                                                   \
    GB_WERK_POP (A_slice, int64_t) ;                                    \
    GB_WERK_POP (Workspaces_size, size_t) ;                             \
    GB_WERK_POP (Workspaces, int64_t *) ;                               \
}

#define GB_FREE_ALL                                                     \
{                                                                       \
    GB_phybix_free (C) ;                                                \
    GB_FREE_WORKSPACE ;                                                 \
}

GrB_Info GB_transpose_bucket    // bucket transpose; typecast and apply op
(
    GrB_Matrix C,               // output matrix (static header)
    const GB_iso_code C_code_iso,   // iso code for C
    const GrB_Type ctype,       // type of output matrix C
    const bool C_is_csc,        // format of output matrix C
    const GrB_Matrix A,         // input matrix
        // no operator is applied if op is NULL
        const GB_Operator op,       // unary/idxunop/binop to apply
        const GrB_Scalar scalar,    // scalar to bind to binary operator
        bool binop_bind1st,         // if true, binop(x,A) else binop(A,y)
    const int nworkspaces,      // # of workspaces to use (either 1 or nthreads)
    const int nthreads,         // # of threads to use
    GB_Context Context
)
{

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    ASSERT (C != NULL && (C->static_header || GBNSTATIC)) ;
    ASSERT_TYPE_OK (ctype, "ctype for transpose", GB0) ;
    ASSERT_MATRIX_OK (A, "A input for transpose_bucket", GB0) ;
    ASSERT (!GB_PENDING (A)) ;
    ASSERT (!GB_ZOMBIES (A)) ;
    ASSERT (GB_JUMBLED_OK (A)) ;

    // if op is NULL, then no operator is applied

    // This method is only used when A is sparse or hypersparse.
    // The full and bitmap cases are handled in GB_transpose.
    ASSERT (!GB_IS_FULL (A)) ;
    ASSERT (!GB_IS_BITMAP (A)) ;
    ASSERT (GB_IS_SPARSE (A) || GB_IS_HYPERSPARSE (A)) ;

    GB_WERK_DECLARE (A_slice, int64_t) ;            // size nthreads+1
    GB_WERK_DECLARE (Workspaces, int64_t *) ;       // size nworkspaces
    GB_WERK_DECLARE (Workspaces_size, size_t) ;     // size nworkspaces

    //--------------------------------------------------------------------------
    // get A
    //--------------------------------------------------------------------------

    int64_t anz = GB_nnz (A) ;
    int64_t vlen = A->vlen ;

    //--------------------------------------------------------------------------
    // determine the number of threads to use
    //--------------------------------------------------------------------------

    // # of threads to use in the O(vlen) loops below
    GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;
    int nth = GB_nthreads (vlen, chunk, nthreads_max) ;

    //--------------------------------------------------------------------------
    // allocate C: always sparse
    //--------------------------------------------------------------------------

    // The bucket transpose only works when C is sparse.
    // A can be sparse or hypersparse.

    // C->p is allocated but not initialized.
    GrB_Info info ;
    // GB_new_bix sets C->iso = C_iso
    bool C_iso = (C_code_iso != GB_NON_ISO) ;
    GB_OK (GB_new_bix (&C, // sparse, existing header
        ctype, A->vdim, vlen, GB_Ap_malloc, C_is_csc,
        GxB_SPARSE, true, A->hyper_switch, vlen, anz, true, C_iso, Context)) ;

    int64_t *restrict Cp = C->p ;
    C->nvals = anz ;

    //--------------------------------------------------------------------------
    // allocate workspace
    //--------------------------------------------------------------------------

    GB_WERK_PUSH (Workspaces, nworkspaces, int64_t *) ;
    GB_WERK_PUSH (Workspaces_size, nworkspaces, size_t) ;
    if (Workspaces == NULL || Workspaces_size == NULL)
    { 
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }

    bool ok = true ;
    for (int tid = 0 ; tid < nworkspaces ; tid++)
    { 
        Workspaces [tid] = GB_MALLOC_WORK (vlen + 1, int64_t,
            &Workspaces_size [tid]) ;
        ok = ok && (Workspaces [tid] != NULL) ;
    }

    if (!ok)
    { 
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }

    //==========================================================================
    // phase1: symbolic analysis
    //==========================================================================

    // slice the A matrix, perfectly balanced for one task per thread
    GB_WERK_PUSH (A_slice, nthreads + 1, int64_t) ;
    if (A_slice == NULL)
    { 
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }
    GB_pslice (A_slice, A->p, A->nvec, nthreads, true) ;
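
    // A_slice [tid] .. A_slice [tid+1]-1 is the range of vectors of A handled
    // by thread tid; the boundaries are chosen so that each slice contains
    // roughly anz/nthreads entries.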

    // sum up the row counts and find C->p
    if (nthreads == 1)
    {

        //----------------------------------------------------------------------
        // sequential method: A is not sliced
        //----------------------------------------------------------------------

        // Only requires a single int64 workspace of size vlen for a single
        // thread.  The resulting C matrix is not jumbled.
        GBURBLE ("(1-thread bucket transpose) ") ;

        // compute the row counts of A.  No need to scan the A->p pointers
        ASSERT (nworkspaces == 1) ;
        int64_t *restrict workspace = Workspaces [0] ;
        memset (workspace, 0, (vlen + 1) * sizeof (int64_t)) ;
        const int64_t *restrict Ai = A->i ;
        for (int64_t p = 0 ; p < anz ; p++)
        { 
            int64_t i = Ai [p] ;
            workspace [i]++ ;
        }

        // cumulative sum of the workspace, and copy back into C->p
        GB_cumsum (workspace, vlen, &(C->nvec_nonempty), 1, NULL) ;
        memcpy (Cp, workspace, (vlen + 1) * sizeof (int64_t)) ;

    }
    else if (nworkspaces == 1)
    {

        //----------------------------------------------------------------------
        // atomic method: A is sliced but workspace is shared
        //----------------------------------------------------------------------

        // Only requires a single int64 workspace of size vlen, shared by all
        // threads.  Scales well, but requires atomics.  If the # of rows is
        // very small and the average row degree is high, this can be very slow
        // because of contention on the atomic workspace.  Otherwise, it is
        // typically faster than the non-atomic method.  The resulting C matrix
        // is jumbled.
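
        // (GB_ATOMIC_UPDATE is expected to expand to an OpenMP "atomic update"
        // pragma when OpenMP is enabled; see GB_atomics.h.)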

        GBURBLE ("(%d-thread atomic bucket transpose) ", nthreads) ;

        // compute the row counts of A.  No need to scan the A->p pointers
        int64_t *restrict workspace = Workspaces [0] ;
        GB_memset (workspace, 0, (vlen + 1) * sizeof (int64_t), nth) ;
        const int64_t *restrict Ai = A->i ;
        int64_t p ;
        #pragma omp parallel for num_threads(nthreads) schedule(static)
        for (p = 0 ; p < anz ; p++)
        { 
            int64_t i = Ai [p] ;
            // update workspace [i]++ atomically:
            GB_ATOMIC_UPDATE
            workspace [i]++ ;
        }

        C->jumbled = true ; // atomic transpose leaves C jumbled

        // cumulative sum of the workspace, and copy back into C->p
        GB_cumsum (workspace, vlen, &(C->nvec_nonempty), nth, Context) ;
        GB_memcpy (Cp, workspace, (vlen + 1) * sizeof (int64_t), nth) ;

    }
    else
    {

        //----------------------------------------------------------------------
        // non-atomic method
        //----------------------------------------------------------------------

        // compute the row counts of A for each slice, one per thread.  This
        // method is parallel, but not highly scalable.  Each thread requires
        // int64 workspace of size vlen, but no atomics are required.  The
        // resulting C matrix is not jumbled, so this can save work if C needs
        // to be unjumbled later.
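        //
        // For example, with nthreads = 2 and vlen = 3, suppose slice 0 counts
        // [2 0 1] and slice 1 counts [1 2 0].  The first pass below turns each
        // workspace into a running sum across the slices (slice 0: [0 0 0],
        // slice 1: [2 0 1]) and leaves the per-row totals [3 2 1] in Cp.  The
        // cumulative sum then gives Cp = [0 3 5 6], and adding Cp back in the
        // second pass gives each slice its starting offset within each row
        // bucket: slice 0 gets [0 3 5] and slice 1 gets [2 3 6].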

        GBURBLE ("(%d-thread non-atomic bucket transpose) ", nthreads) ;

        ASSERT (nworkspaces == nthreads) ;
        const int64_t *restrict Ap = A->p ;
//      const int64_t *restrict Ah = A->h ;
        const int64_t *restrict Ai = A->i ;

        int tid ;
        #pragma omp parallel for num_threads(nthreads) schedule(static)
        for (tid = 0 ; tid < nthreads ; tid++)
        {
            // get the row counts for this slice, of size A->vlen
            int64_t *restrict workspace = Workspaces [tid] ;
            memset (workspace, 0, (vlen + 1) * sizeof (int64_t)) ;
            for (int64_t k = A_slice [tid] ; k < A_slice [tid+1] ; k++)
            {
                // iterate over the entries in A(:,j)
                // int64_t j = GBH (Ah, k) ;
                int64_t pA_start = Ap [k] ;
                int64_t pA_end = Ap [k+1] ;
                for (int64_t pA = pA_start ; pA < pA_end ; pA++)
                { 
                    // count one more entry in C(i,:) for this slice
                    int64_t i = Ai [pA] ;
                    workspace [i]++ ;
                }
            }
        }

        // cumulative sum of the workspaces across the slices
        int64_t i ;
        #pragma omp parallel for num_threads(nth) schedule(static)
        for (i = 0 ; i < vlen ; i++)
        {
            int64_t s = 0 ;
            for (int tid = 0 ; tid < nthreads ; tid++)
            { 
                int64_t *restrict workspace = Workspaces [tid] ;
                int64_t c = workspace [i] ;
                workspace [i] = s ;
                s += c ;
            }
            Cp [i] = s ;
        }
        Cp [vlen] = 0 ;
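
        // Cp [i] is now the total number of entries in row i of A, summed over
        // all slices; Cp [vlen] = 0 is a placeholder that the cumulative sum
        // below overwrites with the total entry count.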

        // compute the vector pointers for C
        GB_cumsum (Cp, vlen, &(C->nvec_nonempty), nth, Context) ;

        // add Cp back to all Workspaces
        #pragma omp parallel for num_threads(nth) schedule(static)
        for (i = 0 ; i < vlen ; i++)
        {
            int64_t s = Cp [i] ;
            int64_t *restrict workspace = Workspaces [0] ;
            workspace [i] = s ;
            for (int tid = 1 ; tid < nthreads ; tid++)
            { 
                int64_t *restrict workspace = Workspaces [tid] ;
                workspace [i] += s ;
            }
        }
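
        // Workspaces [tid][i] is now the position in C of the first entry that
        // slice tid contributes to row bucket i, so phase 2 below can proceed
        // without atomics.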
    }

    C->magic = GB_MAGIC ;

    //==========================================================================
    // phase2: transpose A into C
    //==========================================================================

    // transpose both the pattern and the values
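    // In outline, each entry A(i,j) in slice tid is placed in C at the
    // position given by Workspaces [tid][i]++ (a single shared workspace is
    // used in the 1-thread and atomic cases), with the value typecast and/or
    // the operator applied on the way.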
    if (op == NULL)
    { 
        // do not apply an operator; optional typecast to C->type
        GB_transpose_ix (C, A, Workspaces, A_slice, nworkspaces, nthreads) ;
    }
    else
    { 
        // apply an operator, C has type op->ztype
        GB_transpose_op (C, C_code_iso, op, scalar, binop_bind1st, A,
            Workspaces, A_slice, nworkspaces, nthreads) ;
    }

    //--------------------------------------------------------------------------
    // free workspace and return result
    //--------------------------------------------------------------------------

    GB_FREE_WORKSPACE ;
    ASSERT_MATRIX_OK (C, "C transpose of A", GB0) ;
    ASSERT (C->h == NULL) ;
    return (GrB_SUCCESS) ;
}