File: GB_AxB_saxpy3.c

package info (click to toggle)
suitesparse-graphblas 7.4.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 67,112 kB
  • sloc: ansic: 1,072,243; cpp: 8,081; sh: 512; makefile: 506; asm: 369; python: 125; awk: 10
file content (728 lines) | stat: -rw-r--r-- 27,166 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
//------------------------------------------------------------------------------
// GB_AxB_saxpy3: compute C=A*B, C<M>=A*B, or C<!M>=A*B in parallel
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// GB_AxB_saxpy3 computes C=A*B, C<M>=A*B, or C<!M>=A*B in parallel.  If the
// mask matrix M has too many entries compared to the work to compute A*B, then
// it is not applied.  Instead, M is ignored and C=A*B is computed.  The mask
// is applied later, in GB_mxm.

// C is sparse or hypersparse.  M, A, and B can have any format.
// The accum operator is not handled, and C is not modified in-place.  Instead,
// C is constructed in a static header.

// For simplicity, this discussion and all comments in this code assume that
// all matrices are in CSC format, but the algorithm is CSR/CSC agnostic.

// The matrix B is split into two kinds of tasks: coarse and fine.  A coarse
// task computes C(:,j1:j2) = A*B(:,j1:j2), for a unique set of vectors j1:j2.
// Those vectors are not shared with any other tasks.  A fine task works with a
// team of other fine tasks to compute C(:,j) for a single vector j.  Each fine
// task computes A*B(k1:k2,j) for a unique range k1:k2, and sums its results
// into C(:,j) via atomic operations.

// Each coarse or fine task uses either Gustavson's method [1] or the Hash
// method [2].  There are 4 kinds of tasks:

//      fine Gustavson task
//      fine hash task
//      coarse Gustason task
//      coarse hash task

// Each of the 4 kinds tasks are then subdivided into 3 variants, for C=A*B,
// C<M>=A*B, and C<!M>=A*B, giving a total of 12 different types of tasks.

// Fine tasks are used when there would otherwise be too much work for a single
// task to compute the single vector C(:,j).  Fine tasks share all of their
// workspace with the team of fine tasks computing C(:,j).  Coarse tasks are
// prefered since they require less synchronization, but fine tasks allow for
// better parallelization when B has only a few vectors.  If B consists of a
// single vector (for GrB_mxv if A is in CSC format and not transposed, or
// for GrB_vxm if A is in CSR format and not transpose), then the only way to
// get parallelism is via fine tasks.  If a single thread is used for this
// case, a single-vector coarse task is used.

// To select between the Hash method or Gustavson's method for each task, the
// hash table size is first found.  The hash table size for a hash task depends
// on the maximum flop count for any vector in that task (which is just one
// vector for the fine tasks).  It is set to twice the smallest power of 2 that
// is greater than the flop count to compute that vector (plus the # of entries
// in M(:,j) for tasks that compute C<M>=A*B or C<!M>=A*B).  This size ensures
// the results will fit in the hash table, and with ideally only a modest
// number of collisions.  If the hash table size exceeds a threshold (currently
// m/16 if C is m-by-n), then Gustavson's method is used instead, and the hash
// table size is set to m, to serve as the gather/scatter workspace for
// Gustavson's method.

// The workspace allocated depends on the type of task.  Let s be the hash
// table size for the task, and C is m-by-n (assuming all matrices are CSC; if
// CSR, then m is replaced with n).
//
//      fine Gustavson task (shared):   int8_t  Hf [m] ; ctype Hx [m] ;
//      fine hash task (shared):        int64_t Hf [s] ; ctype Hx [s] ;
//      coarse Gustavson task:          int64_t Hf [m] ; ctype Hx [m] ;
//      coarse hash task:               int64_t Hf [s] ; ctype Hx [s] ;
//                                      int64_t Hi [s] ; 
//
// Note that the Hi array is needed only for the coarse hash task.  Additional
// workspace is allocated to construct the list of tasks, but this is freed
// before C is constructed.

// References:

// [1] Fred G. Gustavson. 1978. Two Fast Algorithms for Sparse Matrices:
// Multiplication and Permuted Transposition. ACM Trans. Math. Softw.  4, 3
// (Sept. 1978), 250–269. DOI:https://doi.org/10.1145/355791.355796

// [2] Yusuke Nagasaka, Satoshi Matsuoka, Ariful Azad, and Aydin Buluc. 2018.
// High-Performance Sparse Matrix-Matrix Products on Intel KNL and Multicore
// Architectures. In Proc. 47th Intl. Conf. on Parallel Processing (ICPP '18).
// Association for Computing Machinery, New York, NY, USA, Article 34, 1–10.
// DOI:https://doi.org/10.1145/3229710.3229720

//------------------------------------------------------------------------------

#include "GB_mxm.h"
#include "GB_AxB_saxpy_generic.h"
#include "GB_control.h"
#include "GB_AxB__include1.h"
#ifndef GBCUDA_DEV
#include "GB_AxB__include2.h"
#endif
#include "GB_unused.h"

#define GB_FREE_WORKSPACE                           \
{                                                   \
    GB_FREE_WORK (&SaxpyTasks, SaxpyTasks_size) ;   \
    GB_FREE_WORK (&Hi_all, Hi_all_size) ;           \
    GB_FREE_WORK (&Hf_all, Hf_all_size) ;           \
    GB_FREE_WORK (&Hx_all, Hx_all_size) ;           \
}

#define GB_FREE_ALL             \
{                               \
    GB_FREE_WORKSPACE ;         \
    GB_phybix_free (C) ;        \
}

//------------------------------------------------------------------------------
// GB_AxB_saxpy3: compute C=A*B, C<M>=A*B, or C<!M>=A*B in parallel
//------------------------------------------------------------------------------

GrB_Info GB_AxB_saxpy3              // C = A*B using Gustavson+Hash
(
    GrB_Matrix C,                   // output, static header, not in-place
    const bool C_iso,               // true if C is iso
    const GB_void *cscalar,         // iso value of C
    int C_sparsity,                 // construct C as sparse or hypersparse
    const GrB_Matrix M_input,       // optional mask matrix
    const bool Mask_comp_input,     // if true, use !M
    const bool Mask_struct,         // if true, use the only structure of M
    const GrB_Matrix A,             // input matrix A
    const GrB_Matrix B,             // input matrix B
    const GrB_Semiring semiring,    // semiring that defines C=A*B
    const bool flipxy,              // if true, do z=fmult(b,a) vs fmult(a,b)
    bool *mask_applied,             // if true, then mask was applied
    GrB_Desc_Value AxB_method,      // Default, Gustavson, or Hash
    const int do_sort,              // if nonzero, try to sort in saxpy3
    GB_Context Context
)
{

    #ifdef GB_TIMING
    double ttt = omp_get_wtime ( ) ;
    #endif

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    GrB_Info info ;

    GrB_Matrix M = M_input ;        // use the mask M, until deciding otherwise
    bool Mask_comp = Mask_comp_input ;

    (*mask_applied) = false ;
    bool apply_mask = false ;

    ASSERT (C != NULL && (C->static_header || GBNSTATIC)) ;

    ASSERT_MATRIX_OK_OR_NULL (M, "M for saxpy3 A*B", GB0) ;
    ASSERT (!GB_PENDING (M)) ;
    ASSERT (GB_JUMBLED_OK (M)) ;
    ASSERT (!GB_ZOMBIES (M)) ;

    ASSERT_MATRIX_OK (A, "A for saxpy3 A*B", GB0) ;
    ASSERT (!GB_PENDING (A)) ;
    ASSERT (GB_JUMBLED_OK (A)) ;
    ASSERT (!GB_ZOMBIES (A)) ;

    ASSERT_MATRIX_OK (B, "B for saxpy3 A*B", GB0) ;
    ASSERT (!GB_PENDING (B)) ;
    ASSERT (GB_JUMBLED_OK (B)) ;
    ASSERT (!GB_ZOMBIES (B)) ;

    ASSERT_SEMIRING_OK (semiring, "semiring for saxpy3 A*B", GB0) ;
    ASSERT (A->vdim == B->vlen) ;

    ASSERT (C_sparsity == GxB_HYPERSPARSE || C_sparsity == GxB_SPARSE) ;

    //--------------------------------------------------------------------------
    // determine the # of threads to use
    //--------------------------------------------------------------------------

    GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;

    //--------------------------------------------------------------------------
    // define workspace
    //--------------------------------------------------------------------------

    int64_t *restrict Hi_all = NULL ; size_t Hi_all_size = 0 ;
    int64_t *restrict Hf_all = NULL ; size_t Hf_all_size = 0 ;
    GB_void *restrict Hx_all = NULL ; size_t Hx_all_size = 0 ;
    GB_saxpy3task_struct *SaxpyTasks = NULL ; size_t SaxpyTasks_size = 0 ;

    //--------------------------------------------------------------------------
    // construct the hyper hashes for M and A
    //--------------------------------------------------------------------------

    GB_OK (GB_hyper_hash_build (M, Context)) ;    // does nothing if M is NULL
    GB_OK (GB_hyper_hash_build (A, Context)) ;

    //--------------------------------------------------------------------------
    // get the semiring operators
    //--------------------------------------------------------------------------

    GrB_BinaryOp mult = semiring->multiply ;
    GrB_Monoid add = semiring->add ;
    ASSERT (mult->ztype == add->op->ztype) ;
    bool A_is_pattern, B_is_pattern ;
    GB_binop_pattern (&A_is_pattern, &B_is_pattern, flipxy, mult->opcode) ;

    GB_Opcode mult_binop_code, add_binop_code ;
    GB_Type_code xcode, ycode, zcode ;
    bool builtin_semiring = GB_AxB_semiring_builtin (A, A_is_pattern, B,
        B_is_pattern, semiring, flipxy, &mult_binop_code, &add_binop_code,
        &xcode, &ycode, &zcode) ;

    //--------------------------------------------------------------------------
    // get A, and B
    //--------------------------------------------------------------------------

    const int64_t *restrict Ap = A->p ;
    const int64_t *restrict Ah = A->h ;
    const int64_t avlen = A->vlen ;
    const int64_t anvec = A->nvec ;
    const bool A_is_hyper = GB_IS_HYPERSPARSE (A) ;

    const int64_t *restrict Bp = B->p ;
    const int64_t *restrict Bh = B->h ;
    const int8_t  *restrict Bb = B->b ;
    const int64_t *restrict Bi = B->i ;
    const int64_t bvdim = B->vdim ;
    const int64_t bnz = GB_nnz_held (B) ;
    const int64_t bnvec = B->nvec ;
    const int64_t bvlen = B->vlen ;
    const bool B_is_hyper = GB_IS_HYPERSPARSE (B) ;

    //--------------------------------------------------------------------------
    // allocate C (just C->p and C->h, but not C->i or C->x)
    //--------------------------------------------------------------------------

    GrB_Type ctype = add->op->ztype ;
    size_t csize = ctype->size ;
    int64_t cvlen = avlen ;
    int64_t cvdim = bvdim ;
    int64_t cnvec = bnvec ;

    info = GB_new (&C, // sparse or hyper, existing header
        ctype, cvlen, cvdim, GB_Ap_malloc, true,
        C_sparsity, B->hyper_switch, cnvec, Context) ;
    if (info != GrB_SUCCESS)
    { 
        // out of memory
        GB_FREE_ALL ;
        return (info) ;
    }

    C->iso = C_iso ;    // OK

    int64_t *restrict Cp = C->p ;
    int64_t *restrict Ch = C->h ;
    if (B_is_hyper)
    { 
        // B and C are both hypersparse
        ASSERT (C_sparsity == GxB_HYPERSPARSE) ;
        int nth = GB_nthreads (cnvec, chunk, nthreads_max) ;
        GB_memcpy (Ch, Bh, cnvec * sizeof (int64_t), nth) ;
        C->nvec = bnvec ;
    }
    else
    { 
        // B is sparse, bitmap, or full; C is sparse
        ASSERT (C_sparsity == GxB_SPARSE) ;
    }

    #ifdef GB_TIMING
    ttt = omp_get_wtime ( ) - ttt ;
    GB_Global_timing_add (3, ttt) ;
    ttt = omp_get_wtime ( ) ;
    #endif

    //==========================================================================
    // phase0: create parallel tasks and allocate workspace
    //==========================================================================

    int nthreads, ntasks, nfine ;
    bool M_in_place = false ;

    if (nthreads_max == 1 && M == NULL && (AxB_method != GxB_AxB_HASH) &&
        GB_IMIN (GB_nnz (A), GB_nnz (B)) > cvlen/16)
    { 
        // Skip the flopcount analysis if only a single thread is being used,
        // no mask is present, the Hash method is not explicitly selected, and
        // the problem is not extremely sparse.  In this case, use a single
        // coarse Gustavson task only.  In this case, the flop count analysis
        // is not needed.
        GBURBLE ("(single-threaded Gustavson) ") ;
        info = GB_AxB_saxpy3_slice_quick (C, A, B,
            &SaxpyTasks, &SaxpyTasks_size, &ntasks, &nfine, &nthreads,
            Context) ;
    }
    else
    { 
        // Do the flopcount analysis and create a set of well-balanced tasks in
        // the general case.  This may select a single task for a single thread
        // anyway, but this decision would be based on the analysis.
        info = GB_AxB_saxpy3_slice_balanced (C, M, Mask_comp, A, B, AxB_method,
            builtin_semiring,
            &SaxpyTasks, &SaxpyTasks_size, &apply_mask, &M_in_place,
            &ntasks, &nfine, &nthreads, Context) ;
    }

    #ifdef GB_TIMING
    ttt = omp_get_wtime ( ) - ttt ;
    GB_Global_timing_add (4, ttt) ;
    ttt = omp_get_wtime ( ) ;
    #endif

    if (info == GrB_NO_VALUE)
    { 
        // The mask is present but has been discarded; need to discard the
        // analysis so far and redo it without the mask.  This may result in
        // GB_bitmap_AxB_saxpy being called instead of GB_AxB_saxpy3.
        ASSERT (M != NULL && !apply_mask) ;
        GB_FREE_ALL ;
        return (GrB_NO_VALUE) ;
    }
    else if (info != GrB_SUCCESS)
    { 
        // out of memory
        GB_FREE_ALL ;
        return (info) ;
    }

    if (!apply_mask)
    { 
        // disable the mask, if present.
        M = NULL ;
        Mask_comp = false ;
    }

    if (do_sort) GBURBLE ("sort ") ;

    //--------------------------------------------------------------------------
    // allocate the hash tables
    //--------------------------------------------------------------------------

    // If Gustavson's method is used (coarse tasks):
    //
    //      hash_size is cvlen.
    //      Hi is not allocated.
    //      Hf and Hx are both of size hash_size.
    //
    //      (Hf [i] == mark) is true if i is in the hash table.
    //      Hx [i] is the value of C(i,j) during the numeric phase.
    //
    //      Gustavson's method is used if the hash_size for the Hash method
    //      is a significant fraction of cvlen. 
    //
    // If the Hash method is used (coarse tasks):
    //
    //      hash_size is 2 times the smallest power of 2 that is larger than
    //      the # of flops required for any column C(:,j) being computed.  This
    //      ensures that all entries have space in the hash table, and that the
    //      hash occupancy will never be more than 50%.  It is always smaller
    //      than cvlen (otherwise, Gustavson's method is used).
    //
    //      A hash function is used for the ith entry:
    //          hash = GB_HASHF (i, hash_bits) ; in range 0 to hash_size-1
    //      If a collision occurs, linear probing is used:
    //          GB_REHASH (hash, i, hash_bits)
    //      which is:
    //          hash = (hash + 1) & (hash_size-1)
    //      where hash_bits = hash_size - 1
    //
    //      (Hf [hash] == mark) is true if the position is occupied.
    //      i = Hi [hash] gives the row index i that occupies that position.
    //      Hx [hash] is the value of C(i,j) during the numeric phase.
    //
    // For both coarse methods:
    //
    //      Hf starts out all zero (via calloc), and mark starts out as 1.  To
    //      clear Hf, mark is incremented, so that all entries in Hf are not
    //      equal to mark.

    // add some padding to the end of each hash table, to avoid false
    // sharing of cache lines between the hash tables.  But only add the
    // padding if there is more than one team.
    size_t hx_pad = 0 ;
    size_t hi_pad = 0 ;
    for (int taskid = 1 ; taskid < ntasks ; taskid++)
    {
        if (taskid == SaxpyTasks [taskid].leader)
        { 
            hx_pad = GB_ICEIL (64, csize) ;
            hi_pad = 64 / sizeof (int64_t) ;
            break ;
        }
    }

    size_t Hi_size_total = 0 ;
    size_t Hf_size_total = 0 ;
    size_t Hx_size_total = 0 ;

    //--------------------------------------------------------------------------
    // determine the total size of all hash tables
    //--------------------------------------------------------------------------

    int nfine_hash = 0 ;
    int nfine_gus = 0 ;
    int ncoarse_hash = 0 ;
    int ncoarse_1hash = 0 ;
    int ncoarse_gus = 0 ;

    for (int taskid = 0 ; taskid < ntasks ; taskid++)
    {

        // get the task type and its hash size
        int64_t hash_size = SaxpyTasks [taskid].hsize ;
        int64_t k = SaxpyTasks [taskid].vector ;
        bool is_fine = (k >= 0) ;
        bool use_Gustavson = (hash_size == cvlen) ;

        if (is_fine)
        {
            // fine task
            if (use_Gustavson)
            { 
                // fine Gustavson task
                nfine_gus++ ;
            }
            else
            { 
                // fine hash task
                nfine_hash++ ;
            }
        }
        else
        {
            // coarse task
            if (use_Gustavson)
            { 
                // coarse Gustavson task
                ncoarse_gus++ ;
            }
            else
            { 
                // coarse hash task
                ncoarse_hash++ ;
            }
        }

        if (taskid != SaxpyTasks [taskid].leader)
        { 
            // allocate a single shared hash table for all fine
            // tasks that compute a single C(:,j)
            continue ;
        }

        int64_t hi_size = GB_IMAX (hash_size, 8) ;
        int64_t hx_size = hi_size ;
        if (!GB_IS_POWER_OF_TWO (hi_size))
        { 
            hi_size += hi_pad ;
            hx_size += hx_pad ;
        }
        if (is_fine && use_Gustavson)
        { 
            // Hf is int8_t for the fine Gustavson tasks, but round up
            // to the nearest number of int64_t values.
            int64_t hi_size2 = GB_IMAX (hi_size, 64) ;
            Hf_size_total += GB_ICEIL (hi_size2, sizeof (int64_t)) ;
        }
        else
        { 
            // Hf is int64_t for all other methods
            Hf_size_total += hi_size ;
        }
        if (!is_fine && !use_Gustavson)
        { 
            // only coarse hash tasks need Hi
            Hi_size_total += hi_size ;
        }
        // all tasks use an Hx array of size hash_size
        if (!C_iso)
        { 
            // except that the ANY_PAIR semiring does not use Hx
            Hx_size_total += hx_size ;
        }
    }

    GBURBLE ("(nthreads %d", nthreads) ;
    if (ncoarse_gus  > 0) GBURBLE (" coarse: %d",      ncoarse_gus) ;
    if (ncoarse_hash > 0) GBURBLE (" coarse hash: %d", ncoarse_hash) ;
    if (nfine_gus    > 0) GBURBLE (" fine: %d",        nfine_gus) ;
    if (nfine_hash   > 0) GBURBLE (" fine hash: %d",   nfine_hash) ;
    GBURBLE (") ") ;

    //--------------------------------------------------------------------------
    // allocate space for all hash tables
    //--------------------------------------------------------------------------

    if (Hi_size_total > 0)
    { 
        Hi_all = GB_MALLOC_WORK (Hi_size_total, int64_t, &Hi_all_size) ;
    }
    if (Hf_size_total > 0)
    { 
        // Hf must be calloc'd to initialize all entries as empty 
        Hf_all = GB_CALLOC_WORK (Hf_size_total, int64_t, &Hf_all_size) ;
    }
    if (Hx_size_total > 0)
    { 
        Hx_all = GB_MALLOC_WORK (Hx_size_total * csize, GB_void, &Hx_all_size) ;
    }

    if ((Hi_size_total > 0 && Hi_all == NULL) ||
        (Hf_size_total > 0 && Hf_all == NULL) || 
        (Hx_size_total > 0 && Hx_all == NULL))
    { 
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }

    //--------------------------------------------------------------------------
    // split the space into separate hash tables
    //--------------------------------------------------------------------------

    int64_t *restrict Hi_part = Hi_all ;
    int64_t *restrict Hf_part = Hf_all ;
    GB_void *restrict Hx_part = Hx_all ;

    for (int taskid = 0 ; taskid < ntasks ; taskid++)
    {

        if (taskid != SaxpyTasks [taskid].leader)
        { 
            // allocate a single hash table for all fine
            // tasks that compute a single C(:,j)
            continue ;
        }

        int64_t hash_size = SaxpyTasks [taskid].hsize ;
        int64_t k = SaxpyTasks [taskid].vector ;
        bool is_fine = (k >= 0) ;
        bool use_Gustavson = (hash_size == cvlen) ;

        SaxpyTasks [taskid].Hi = Hi_part ;
        SaxpyTasks [taskid].Hf = (GB_void *) Hf_part ;
        SaxpyTasks [taskid].Hx = Hx_part ;

        int64_t hi_size = GB_IMAX (hash_size, 8) ;
        int64_t hx_size = hi_size ;
        if (!GB_IS_POWER_OF_TWO (hi_size))
        { 
            hi_size += hi_pad ;
            hx_size += hx_pad ;
        }
        if (is_fine && use_Gustavson)
        { 
            // Hf is int8_t for the fine Gustavson tasks, but round up
            // to the nearest number of int64_t values.
            int64_t hi_size2 = GB_IMAX (hi_size, 64) ;
            Hf_part += GB_ICEIL (hi_size2, sizeof (int64_t)) ;
        }
        else
        { 
            // Hf is int64_t for all other methods
            Hf_part += hi_size ;
        }
        if (!is_fine && !use_Gustavson)
        { 
            // only coarse hash tasks need Hi
            Hi_part += hi_size ;
        }
        // all tasks use an Hx array of size hash_size
        if (!C_iso)
        { 
            // except that the ANY_PAIR iso semiring does not use Hx
            Hx_part += hx_size * csize ;
        }
    }

    // assign shared hash tables to fine task teams
    for (int taskid = 0 ; taskid < nfine ; taskid++)
    {
        int leader = SaxpyTasks [taskid].leader ;
        ASSERT (SaxpyTasks [leader].vector >= 0) ;
        if (taskid != leader)
        { 
            // this fine task (Gustavson or hash) shares its hash table
            // with all other tasks in its team, for a single vector C(:,j).
            ASSERT (SaxpyTasks [taskid].vector == SaxpyTasks [leader].vector) ;
            SaxpyTasks [taskid].Hf = SaxpyTasks [leader].Hf ;
            SaxpyTasks [taskid].Hx = SaxpyTasks [leader].Hx ;
        }
    }

    //==========================================================================
    // phase1: symbolic analysis
    //==========================================================================

    // TODO constructing the tasks (the work above) can take a lot of time.
    // See the web graph, where it takes a total of 3.03 sec for 64 trials, vs
    // a total of 5.9 second for phase 7 (the numerical work below).
    // Figure out a faster method.

    #ifdef GB_TIMING
    ttt = omp_get_wtime ( ) - ttt ;
    GB_Global_timing_add (5, ttt) ;
    ttt = omp_get_wtime ( ) ;
    #endif

    GB_AxB_saxpy3_symbolic (C, M, Mask_comp, Mask_struct, M_in_place,
        A, B, SaxpyTasks, ntasks, nfine, nthreads) ;

// the above phase takes 1.6 seconds for 64 trials of the web graph.

    #ifdef GB_TIMING
    ttt = omp_get_wtime ( ) - ttt ;
    GB_Global_timing_add (6, ttt) ;
    ttt = omp_get_wtime ( ) ;
    #endif

    //==========================================================================
    // C = A*B, via saxpy3 method, phases 2 to 5
    //==========================================================================

    if (C_iso)
    {

        //----------------------------------------------------------------------
        // C is iso; compute the pattern of C<#>=A*B with the any_pair semiring
        //----------------------------------------------------------------------

        GBURBLE ("(iso sparse saxpy) ") ;
        info = GB (_Asaxpy3B__any_pair_iso) (C, M, Mask_comp, Mask_struct,
            M_in_place, A, B, SaxpyTasks, ntasks, nfine,
            nthreads, do_sort, Context) ;
        if (info == GrB_SUCCESS)
        { 
            memcpy (C->x, cscalar, csize) ;
        }

    }
    else
    {

        //----------------------------------------------------------------------
        // C is non-iso
        //----------------------------------------------------------------------

        GBURBLE ("(sparse saxpy) ") ;
        bool done = false ;

        #ifndef GBCUDA_DEV

            //------------------------------------------------------------------
            // define the worker for the switch factory
            //------------------------------------------------------------------

            #define GB_Asaxpy3B(add,mult,xname) \
                GB (_Asaxpy3B_ ## add ## mult ## xname)

            #define GB_AxB_WORKER(add,mult,xname)                           \
            {                                                               \
                info = GB_Asaxpy3B (add,mult,xname) (C, M, Mask_comp,       \
                    Mask_struct, M_in_place, A, B,                          \
                    SaxpyTasks, ntasks, nfine, nthreads,                    \
                    do_sort, Context) ;                                     \
                done = (info != GrB_NO_VALUE) ;                             \
            }                                                               \
            break ;

            //------------------------------------------------------------------
            // launch the switch factory
            //------------------------------------------------------------------

            if (builtin_semiring)
            { 
                #include "GB_AxB_factory.c"
            }

        #endif

        //----------------------------------------------------------------------
        // generic saxpy3 method
        //----------------------------------------------------------------------

        if (!done)
        { 
            info = GB_AxB_saxpy_generic (C, M, Mask_comp, Mask_struct,
                M_in_place, A, A_is_pattern, B, B_is_pattern, semiring,
                flipxy, GB_SAXPY_METHOD_3,
                SaxpyTasks, ntasks, nfine, nthreads, do_sort,
                Context) ;
        }
    }

    if (info != GrB_SUCCESS)
    { 
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }

    //--------------------------------------------------------------------------
    // prune empty vectors, free workspace, and return result
    //--------------------------------------------------------------------------

    #ifdef GB_TIMING
    ttt = omp_get_wtime ( ) - ttt ;
    GB_Global_timing_add (7, ttt) ;
    ttt = omp_get_wtime ( ) ;
    #endif

    C->magic = GB_MAGIC ;
    GB_FREE_WORKSPACE ;
    GB_OK (GB_hypermatrix_prune (C, Context)) ;
    ASSERT_MATRIX_OK (C, "saxpy3: output", GB0) ;
    ASSERT (!GB_ZOMBIES (C)) ;
    ASSERT (!GB_PENDING (C)) ;
    (*mask_applied) = apply_mask ;

    #ifdef GB_TIMING
    ttt = omp_get_wtime ( ) - ttt ;
    GB_Global_timing_add (8, ttt) ;
    #endif

    return (info) ;
}