File: GB_AxB_saxpy3.c

package info (click to toggle)
suitesparse-graphblas 7.4.0%2Bdfsg-2
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 67,112 kB
sloc: ansic: 1,072,243; cpp: 8,081; sh: 512; makefile: 506; asm: 369; python: 125; awk: 10
file content (728 lines) | stat: -rw-r--r-- 27,166 bytes
parent folder | download | duplicates (2)
//------------------------------------------------------------------------------
// GB_AxB_saxpy3: compute C=A*B, C<M>=A*B, or C<!M>=A*B in parallel
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// GB_AxB_saxpy3 computes C=A*B, C<M>=A*B, or C<!M>=A*B in parallel.  If the
// mask matrix M has too many entries compared to the work to compute A*B, then
// it is not applied.  Instead, M is ignored and C=A*B is computed.  The mask
// is applied later, in GB_mxm.

// C is sparse or hypersparse.  M, A, and B can have any format.
// The accum operator is not handled, and C is not modified in-place.  Instead,
// C is constructed in a static header.

// For simplicity, this discussion and all comments in this code assume that
// all matrices are in CSC format, but the algorithm is CSR/CSC agnostic.

// The matrix B is split into two kinds of tasks: coarse and fine.  A coarse
// task computes C(:,j1:j2) = A*B(:,j1:j2), for a unique set of vectors j1:j2.
// Those vectors are not shared with any other tasks.  A fine task works with a
// team of other fine tasks to compute C(:,j) for a single vector j.  Each fine
// task computes A*B(k1:k2,j) for a unique range k1:k2, and sums its results
// into C(:,j) via atomic operations.

// Each coarse or fine task uses either Gustavson's method [1] or the Hash
// method [2].  There are 4 kinds of tasks:

//      fine Gustavson task
//      fine hash task
//      coarse Gustason task
//      coarse hash task

// Each of the 4 kinds tasks are then subdivided into 3 variants, for C=A*B,
// C<M>=A*B, and C<!M>=A*B, giving a total of 12 different types of tasks.

// Fine tasks are used when there would otherwise be too much work for a single
// task to compute the single vector C(:,j).  Fine tasks share all of their
// workspace with the team of fine tasks computing C(:,j).  Coarse tasks are
// prefered since they require less synchronization, but fine tasks allow for
// better parallelization when B has only a few vectors.  If B consists of a
// single vector (for GrB_mxv if A is in CSC format and not transposed, or
// for GrB_vxm if A is in CSR format and not transpose), then the only way to
// get parallelism is via fine tasks.  If a single thread is used for this
// case, a single-vector coarse task is used.

// To select between the Hash method or Gustavson's method for each task, the
// hash table size is first found.  The hash table size for a hash task depends
// on the maximum flop count for any vector in that task (which is just one
// vector for the fine tasks).  It is set to twice the smallest power of 2 that
// is greater than the flop count to compute that vector (plus the # of entries
// in M(:,j) for tasks that compute C<M>=A*B or C<!M>=A*B).  This size ensures
// the results will fit in the hash table, and with ideally only a modest
// number of collisions.  If the hash table size exceeds a threshold (currently
// m/16 if C is m-by-n), then Gustavson's method is used instead, and the hash
// table size is set to m, to serve as the gather/scatter workspace for
// Gustavson's method.

// The workspace allocated depends on the type of task.  Let s be the hash
// table size for the task, and C is m-by-n (assuming all matrices are CSC; if
// CSR, then m is replaced with n).
//
//      fine Gustavson task (shared):   int8_t  Hf [m] ; ctype Hx [m] ;
//      fine hash task (shared):        int64_t Hf [s] ; ctype Hx [s] ;
//      coarse Gustavson task:          int64_t Hf [m] ; ctype Hx [m] ;
//      coarse hash task:               int64_t Hf [s] ; ctype Hx [s] ;
//                                      int64_t Hi [s] ; 
//
// Note that the Hi array is needed only for the coarse hash task.  Additional
// workspace is allocated to construct the list of tasks, but this is freed
// before C is constructed.

// References:

// [1] Fred G. Gustavson. 1978. Two Fast Algorithms for Sparse Matrices:
// Multiplication and Permuted Transposition. ACM Trans. Math. Softw.  4, 3
// (Sept. 1978), 250–269. DOI:https://doi.org/10.1145/355791.355796

// [2] Yusuke Nagasaka, Satoshi Matsuoka, Ariful Azad, and Aydin Buluc. 2018.
// High-Performance Sparse Matrix-Matrix Products on Intel KNL and Multicore
// Architectures. In Proc. 47th Intl. Conf. on Parallel Processing (ICPP '18).
// Association for Computing Machinery, New York, NY, USA, Article 34, 1–10.
// DOI:https://doi.org/10.1145/3229710.3229720

//------------------------------------------------------------------------------

#include "GB_mxm.h"
#include "GB_AxB_saxpy_generic.h"
#include "GB_control.h"
#include "GB_AxB__include1.h"
#ifndef GBCUDA_DEV
#include "GB_AxB__include2.h"
#endif
#include "GB_unused.h"

#define GB_FREE_WORKSPACE                           \
{                                                   \
    GB_FREE_WORK (&SaxpyTasks, SaxpyTasks_size) ;   \
    GB_FREE_WORK (&Hi_all, Hi_all_size) ;           \
    GB_FREE_WORK (&Hf_all, Hf_all_size) ;           \
    GB_FREE_WORK (&Hx_all, Hx_all_size) ;           \
}

#define GB_FREE_ALL             \
{                               \
    GB_FREE_WORKSPACE ;         \
    GB_phybix_free (C) ;        \
}

//------------------------------------------------------------------------------
// GB_AxB_saxpy3: compute C=A*B, C<M>=A*B, or C<!M>=A*B in parallel
//------------------------------------------------------------------------------

GrB_Info GB_AxB_saxpy3              // C = A*B using Gustavson+Hash
(
    GrB_Matrix C,                   // output, static header, not in-place
    const bool C_iso,               // true if C is iso
    const GB_void *cscalar,         // iso value of C
    int C_sparsity,                 // construct C as sparse or hypersparse
    const GrB_Matrix M_input,       // optional mask matrix
    const bool Mask_comp_input,     // if true, use !M
    const bool Mask_struct,         // if true, use the only structure of M
    const GrB_Matrix A,             // input matrix A
    const GrB_Matrix B,             // input matrix B
    const GrB_Semiring semiring,    // semiring that defines C=A*B
    const bool flipxy,              // if true, do z=fmult(b,a) vs fmult(a,b)
    bool *mask_applied,             // if true, then mask was applied
    GrB_Desc_Value AxB_method,      // Default, Gustavson, or Hash
    const int do_sort,              // if nonzero, try to sort in saxpy3
    GB_Context Context
)
{

    #ifdef GB_TIMING
    double ttt = omp_get_wtime ( ) ;
    #endif

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    GrB_Info info ;

    GrB_Matrix M = M_input ;        // use the mask M, until deciding otherwise
    bool Mask_comp = Mask_comp_input ;

    (*mask_applied) = false ;
    bool apply_mask = false ;

    ASSERT (C != NULL && (C->static_header || GBNSTATIC)) ;

    ASSERT_MATRIX_OK_OR_NULL (M, "M for saxpy3 A*B", GB0) ;
    ASSERT (!GB_PENDING (M)) ;
    ASSERT (GB_JUMBLED_OK (M)) ;
    ASSERT (!GB_ZOMBIES (M)) ;

    ASSERT_MATRIX_OK (A, "A for saxpy3 A*B", GB0) ;
    ASSERT (!GB_PENDING (A)) ;
    ASSERT (GB_JUMBLED_OK (A)) ;
    ASSERT (!GB_ZOMBIES (A)) ;

    ASSERT_MATRIX_OK (B, "B for saxpy3 A*B", GB0) ;
    ASSERT (!GB_PENDING (B)) ;
    ASSERT (GB_JUMBLED_OK (B)) ;
    ASSERT (!GB_ZOMBIES (B)) ;

    ASSERT_SEMIRING_OK (semiring, "semiring for saxpy3 A*B", GB0) ;
    ASSERT (A->vdim == B->vlen) ;

    ASSERT (C_sparsity == GxB_HYPERSPARSE || C_sparsity == GxB_SPARSE) ;

    //--------------------------------------------------------------------------
    // determine the # of threads to use
    //--------------------------------------------------------------------------

    GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;

    //--------------------------------------------------------------------------
    // define workspace
    //--------------------------------------------------------------------------

    int64_t *restrict Hi_all = NULL ; size_t Hi_all_size = 0 ;
    int64_t *restrict Hf_all = NULL ; size_t Hf_all_size = 0 ;
    GB_void *restrict Hx_all = NULL ; size_t Hx_all_size = 0 ;
    GB_saxpy3task_struct *SaxpyTasks = NULL ; size_t SaxpyTasks_size = 0 ;

    //--------------------------------------------------------------------------
    // construct the hyper hashes for M and A
    //--------------------------------------------------------------------------

    GB_OK (GB_hyper_hash_build (M, Context)) ;    // does nothing if M is NULL
    GB_OK (GB_hyper_hash_build (A, Context)) ;

    //--------------------------------------------------------------------------
    // get the semiring operators
    //--------------------------------------------------------------------------

    GrB_BinaryOp mult = semiring->multiply ;
    GrB_Monoid add = semiring->add ;
    ASSERT (mult->ztype == add->op->ztype) ;
    bool A_is_pattern, B_is_pattern ;
    GB_binop_pattern (&A_is_pattern, &B_is_pattern, flipxy, mult->opcode) ;

    GB_Opcode mult_binop_code, add_binop_code ;
    GB_Type_code xcode, ycode, zcode ;
    bool builtin_semiring = GB_AxB_semiring_builtin (A, A_is_pattern, B,
        B_is_pattern, semiring, flipxy, &mult_binop_code, &add_binop_code,
        &xcode, &ycode, &zcode) ;

    //--------------------------------------------------------------------------
    // get A, and B
    //--------------------------------------------------------------------------

    const int64_t *restrict Ap = A->p ;
    const int64_t *restrict Ah = A->h ;
    const int64_t avlen = A->vlen ;
    const int64_t anvec = A->nvec ;
    const bool A_is_hyper = GB_IS_HYPERSPARSE (A) ;

    const int64_t *restrict Bp = B->p ;
    const int64_t *restrict Bh = B->h ;
    const int8_t  *restrict Bb = B->b ;
    const int64_t *restrict Bi = B->i ;
    const int64_t bvdim = B->vdim ;
    const int64_t bnz = GB_nnz_held (B) ;
    const int64_t bnvec = B->nvec ;
    const int64_t bvlen = B->vlen ;
    const bool B_is_hyper = GB_IS_HYPERSPARSE (B) ;

    //--------------------------------------------------------------------------
    // allocate C (just C->p and C->h, but not C->i or C->x)
    //--------------------------------------------------------------------------

    GrB_Type ctype = add->op->ztype ;
    size_t csize = ctype->size ;
    int64_t cvlen = avlen ;
    int64_t cvdim = bvdim ;
    int64_t cnvec = bnvec ;

    info = GB_new (&C, // sparse or hyper, existing header
        ctype, cvlen, cvdim, GB_Ap_malloc, true,
        C_sparsity, B->hyper_switch, cnvec, Context) ;
    if (info != GrB_SUCCESS)
    { 
        // out of memory
        GB_FREE_ALL ;
        return (info) ;
    }

    C->iso = C_iso ;    // OK

    int64_t *restrict Cp = C->p ;
    int64_t *restrict Ch = C->h ;
    if (B_is_hyper)
    { 
        // B and C are both hypersparse
        ASSERT (C_sparsity == GxB_HYPERSPARSE) ;
        int nth = GB_nthreads (cnvec, chunk, nthreads_max) ;
        GB_memcpy (Ch, Bh, cnvec * sizeof (int64_t), nth) ;
        C->nvec = bnvec ;
    }
    else
    { 
        // B is sparse, bitmap, or full; C is sparse
        ASSERT (C_sparsity == GxB_SPARSE) ;
    }

    #ifdef GB_TIMING
    ttt = omp_get_wtime ( ) - ttt ;
    GB_Global_timing_add (3, ttt) ;
    ttt = omp_get_wtime ( ) ;
    #endif

    //==========================================================================
    // phase0: create parallel tasks and allocate workspace
    //==========================================================================

    int nthreads, ntasks, nfine ;
    bool M_in_place = false ;

    if (nthreads_max == 1 && M == NULL && (AxB_method != GxB_AxB_HASH) &&
        GB_IMIN (GB_nnz (A), GB_nnz (B)) > cvlen/16)
    { 
        // Skip the flopcount analysis if only a single thread is being used,
        // no mask is present, the Hash method is not explicitly selected, and
        // the problem is not extremely sparse.  In this case, use a single
        // coarse Gustavson task only.  In this case, the flop count analysis
        // is not needed.
        GBURBLE ("(single-threaded Gustavson) ") ;
        info = GB_AxB_saxpy3_slice_quick (C, A, B,
            &SaxpyTasks, &SaxpyTasks_size, &ntasks, &nfine, &nthreads,
            Context) ;
    }
    else
    { 
        // Do the flopcount analysis and create a set of well-balanced tasks in
        // the general case.  This may select a single task for a single thread
        // anyway, but this decision would be based on the analysis.
        info = GB_AxB_saxpy3_slice_balanced (C, M, Mask_comp, A, B, AxB_method,
            builtin_semiring,
            &SaxpyTasks, &SaxpyTasks_size, &apply_mask, &M_in_place,
            &ntasks, &nfine, &nthreads, Context) ;
    }

    #ifdef GB_TIMING
    ttt = omp_get_wtime ( ) - ttt ;
    GB_Global_timing_add (4, ttt) ;
    ttt = omp_get_wtime ( ) ;
    #endif

    if (info == GrB_NO_VALUE)
    { 
        // The mask is present but has been discarded; need to discard the
        // analysis so far and redo it without the mask.  This may result in
        // GB_bitmap_AxB_saxpy being called instead of GB_AxB_saxpy3.
        ASSERT (M != NULL && !apply_mask) ;
        GB_FREE_ALL ;
        return (GrB_NO_VALUE) ;
    }
    else if (info != GrB_SUCCESS)
    { 
        // out of memory
        GB_FREE_ALL ;
        return (info) ;
    }

    if (!apply_mask)
    { 
        // disable the mask, if present.
        M = NULL ;
        Mask_comp = false ;
    }

    if (do_sort) GBURBLE ("sort ") ;

    //--------------------------------------------------------------------------
    // allocate the hash tables
    //--------------------------------------------------------------------------

    // If Gustavson's method is used (coarse tasks):
    //
    //      hash_size is cvlen.
    //      Hi is not allocated.
    //      Hf and Hx are both of size hash_size.
    //
    //      (Hf [i] == mark) is true if i is in the hash table.
    //      Hx [i] is the value of C(i,j) during the numeric phase.
    //
    //      Gustavson's method is used if the hash_size for the Hash method
    //      is a significant fraction of cvlen. 
    //
    // If the Hash method is used (coarse tasks):
    //
    //      hash_size is 2 times the smallest power of 2 that is larger than
    //      the # of flops required for any column C(:,j) being computed.  This
    //      ensures that all entries have space in the hash table, and that the
    //      hash occupancy will never be more than 50%.  It is always smaller
    //      than cvlen (otherwise, Gustavson's method is used).
    //
    //      A hash function is used for the ith entry:
    //          hash = GB_HASHF (i, hash_bits) ; in range 0 to hash_size-1
    //      If a collision occurs, linear probing is used:
    //          GB_REHASH (hash, i, hash_bits)
    //      which is:
    //          hash = (hash + 1) & (hash_size-1)
    //      where hash_bits = hash_size - 1
    //
    //      (Hf [hash] == mark) is true if the position is occupied.
    //      i = Hi [hash] gives the row index i that occupies that position.
    //      Hx [hash] is the value of C(i,j) during the numeric phase.
    //
    // For both coarse methods:
    //
    //      Hf starts out all zero (via calloc), and mark starts out as 1.  To
    //      clear Hf, mark is incremented, so that all entries in Hf are not
    //      equal to mark.

    // add some padding to the end of each hash table, to avoid false
    // sharing of cache lines between the hash tables.  But only add the
    // padding if there is more than one team.
    size_t hx_pad = 0 ;
    size_t hi_pad = 0 ;
    for (int taskid = 1 ; taskid < ntasks ; taskid++)
    {
        if (taskid == SaxpyTasks [taskid].leader)
        { 
            hx_pad = GB_ICEIL (64, csize) ;
            hi_pad = 64 / sizeof (int64_t) ;
            break ;
        }
    }

    size_t Hi_size_total = 0 ;
    size_t Hf_size_total = 0 ;
    size_t Hx_size_total = 0 ;

    //--------------------------------------------------------------------------
    // determine the total size of all hash tables
    //--------------------------------------------------------------------------

    int nfine_hash = 0 ;
    int nfine_gus = 0 ;
    int ncoarse_hash = 0 ;
    int ncoarse_1hash = 0 ;
    int ncoarse_gus = 0 ;

    for (int taskid = 0 ; taskid < ntasks ; taskid++)
    {

        // get the task type and its hash size
        int64_t hash_size = SaxpyTasks [taskid].hsize ;
        int64_t k = SaxpyTasks [taskid].vector ;
        bool is_fine = (k >= 0) ;
        bool use_Gustavson = (hash_size == cvlen) ;

        if (is_fine)
        {
            // fine task
            if (use_Gustavson)
            { 
                // fine Gustavson task
                nfine_gus++ ;
            }
            else
            { 
                // fine hash task
                nfine_hash++ ;
            }
        }
        else
        {
            // coarse task
            if (use_Gustavson)
            { 
                // coarse Gustavson task
                ncoarse_gus++ ;
            }
            else
            { 
                // coarse hash task
                ncoarse_hash++ ;
            }
        }

        if (taskid != SaxpyTasks [taskid].leader)
        { 
            // allocate a single shared hash table for all fine
            // tasks that compute a single C(:,j)
            continue ;
        }

        int64_t hi_size = GB_IMAX (hash_size, 8) ;
        int64_t hx_size = hi_size ;
        if (!GB_IS_POWER_OF_TWO (hi_size))
        { 
            hi_size += hi_pad ;
            hx_size += hx_pad ;
        }
        if (is_fine && use_Gustavson)
        { 
            // Hf is int8_t for the fine Gustavson tasks, but round up
            // to the nearest number of int64_t values.
            int64_t hi_size2 = GB_IMAX (hi_size, 64) ;
            Hf_size_total += GB_ICEIL (hi_size2, sizeof (int64_t)) ;
        }
        else
        { 
            // Hf is int64_t for all other methods
            Hf_size_total += hi_size ;
        }
        if (!is_fine && !use_Gustavson)
        { 
            // only coarse hash tasks need Hi
            Hi_size_total += hi_size ;
        }
        // all tasks use an Hx array of size hash_size
        if (!C_iso)
        { 
            // except that the ANY_PAIR semiring does not use Hx
            Hx_size_total += hx_size ;
        }
    }

    GBURBLE ("(nthreads %d", nthreads) ;
    if (ncoarse_gus  > 0) GBURBLE (" coarse: %d",      ncoarse_gus) ;
    if (ncoarse_hash > 0) GBURBLE (" coarse hash: %d", ncoarse_hash) ;
    if (nfine_gus    > 0) GBURBLE (" fine: %d",        nfine_gus) ;
    if (nfine_hash   > 0) GBURBLE (" fine hash: %d",   nfine_hash) ;
    GBURBLE (") ") ;

    //--------------------------------------------------------------------------
    // allocate space for all hash tables
    //--------------------------------------------------------------------------

    if (Hi_size_total > 0)
    { 
        Hi_all = GB_MALLOC_WORK (Hi_size_total, int64_t, &Hi_all_size) ;
    }
    if (Hf_size_total > 0)
    { 
        // Hf must be calloc'd to initialize all entries as empty 
        Hf_all = GB_CALLOC_WORK (Hf_size_total, int64_t, &Hf_all_size) ;
    }
    if (Hx_size_total > 0)
    { 
        Hx_all = GB_MALLOC_WORK (Hx_size_total * csize, GB_void, &Hx_all_size) ;
    }

    if ((Hi_size_total > 0 && Hi_all == NULL) ||
        (Hf_size_total > 0 && Hf_all == NULL) || 
        (Hx_size_total > 0 && Hx_all == NULL))
    { 
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }

    //--------------------------------------------------------------------------
    // split the space into separate hash tables
    //--------------------------------------------------------------------------

    int64_t *restrict Hi_part = Hi_all ;
    int64_t *restrict Hf_part = Hf_all ;
    GB_void *restrict Hx_part = Hx_all ;

    for (int taskid = 0 ; taskid < ntasks ; taskid++)
    {

        if (taskid != SaxpyTasks [taskid].leader)
        { 
            // allocate a single hash table for all fine
            // tasks that compute a single C(:,j)
            continue ;
        }

        int64_t hash_size = SaxpyTasks [taskid].hsize ;
        int64_t k = SaxpyTasks [taskid].vector ;
        bool is_fine = (k >= 0) ;
        bool use_Gustavson = (hash_size == cvlen) ;

        SaxpyTasks [taskid].Hi = Hi_part ;
        SaxpyTasks [taskid].Hf = (GB_void *) Hf_part ;
        SaxpyTasks [taskid].Hx = Hx_part ;

        int64_t hi_size = GB_IMAX (hash_size, 8) ;
        int64_t hx_size = hi_size ;
        if (!GB_IS_POWER_OF_TWO (hi_size))
        { 
            hi_size += hi_pad ;
            hx_size += hx_pad ;
        }
        if (is_fine && use_Gustavson)
        { 
            // Hf is int8_t for the fine Gustavson tasks, but round up
            // to the nearest number of int64_t values.
            int64_t hi_size2 = GB_IMAX (hi_size, 64) ;
            Hf_part += GB_ICEIL (hi_size2, sizeof (int64_t)) ;
        }
        else
        { 
            // Hf is int64_t for all other methods
            Hf_part += hi_size ;
        }
        if (!is_fine && !use_Gustavson)
        { 
            // only coarse hash tasks need Hi
            Hi_part += hi_size ;
        }
        // all tasks use an Hx array of size hash_size
        if (!C_iso)
        { 
            // except that the ANY_PAIR iso semiring does not use Hx
            Hx_part += hx_size * csize ;
        }
    }

    // assign shared hash tables to fine task teams
    for (int taskid = 0 ; taskid < nfine ; taskid++)
    {
        int leader = SaxpyTasks [taskid].leader ;
        ASSERT (SaxpyTasks [leader].vector >= 0) ;
        if (taskid != leader)
        { 
            // this fine task (Gustavson or hash) shares its hash table
            // with all other tasks in its team, for a single vector C(:,j).
            ASSERT (SaxpyTasks [taskid].vector == SaxpyTasks [leader].vector) ;
            SaxpyTasks [taskid].Hf = SaxpyTasks [leader].Hf ;
            SaxpyTasks [taskid].Hx = SaxpyTasks [leader].Hx ;
        }
    }

    //==========================================================================
    // phase1: symbolic analysis
    //==========================================================================

    // TODO constructing the tasks (the work above) can take a lot of time.
    // See the web graph, where it takes a total of 3.03 sec for 64 trials, vs
    // a total of 5.9 second for phase 7 (the numerical work below).
    // Figure out a faster method.

    #ifdef GB_TIMING
    ttt = omp_get_wtime ( ) - ttt ;
    GB_Global_timing_add (5, ttt) ;
    ttt = omp_get_wtime ( ) ;
    #endif

    GB_AxB_saxpy3_symbolic (C, M, Mask_comp, Mask_struct, M_in_place,
        A, B, SaxpyTasks, ntasks, nfine, nthreads) ;

// the above phase takes 1.6 seconds for 64 trials of the web graph.

    #ifdef GB_TIMING
    ttt = omp_get_wtime ( ) - ttt ;
    GB_Global_timing_add (6, ttt) ;
    ttt = omp_get_wtime ( ) ;
    #endif

    //==========================================================================
    // C = A*B, via saxpy3 method, phases 2 to 5
    //==========================================================================

    if (C_iso)
    {

        //----------------------------------------------------------------------
        // C is iso; compute the pattern of C<#>=A*B with the any_pair semiring
        //----------------------------------------------------------------------

        GBURBLE ("(iso sparse saxpy) ") ;
        info = GB (_Asaxpy3B__any_pair_iso) (C, M, Mask_comp, Mask_struct,
            M_in_place, A, B, SaxpyTasks, ntasks, nfine,
            nthreads, do_sort, Context) ;
        if (info == GrB_SUCCESS)
        { 
            memcpy (C->x, cscalar, csize) ;
        }

    }
    else
    {

        //----------------------------------------------------------------------
        // C is non-iso
        //----------------------------------------------------------------------

        GBURBLE ("(sparse saxpy) ") ;
        bool done = false ;

        #ifndef GBCUDA_DEV

            //------------------------------------------------------------------
            // define the worker for the switch factory
            //------------------------------------------------------------------

            #define GB_Asaxpy3B(add,mult,xname) \
                GB (_Asaxpy3B_ ## add ## mult ## xname)

            #define GB_AxB_WORKER(add,mult,xname)                           \
            {                                                               \
                info = GB_Asaxpy3B (add,mult,xname) (C, M, Mask_comp,       \
                    Mask_struct, M_in_place, A, B,                          \
                    SaxpyTasks, ntasks, nfine, nthreads,                    \
                    do_sort, Context) ;                                     \
                done = (info != GrB_NO_VALUE) ;                             \
            }                                                               \
            break ;

            //------------------------------------------------------------------
            // launch the switch factory
            //------------------------------------------------------------------

            if (builtin_semiring)
            { 
                #include "GB_AxB_factory.c"
            }

        #endif

        //----------------------------------------------------------------------
        // generic saxpy3 method
        //----------------------------------------------------------------------

        if (!done)
        { 
            info = GB_AxB_saxpy_generic (C, M, Mask_comp, Mask_struct,
                M_in_place, A, A_is_pattern, B, B_is_pattern, semiring,
                flipxy, GB_SAXPY_METHOD_3,
                SaxpyTasks, ntasks, nfine, nthreads, do_sort,
                Context) ;
        }
    }

    if (info != GrB_SUCCESS)
    { 
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }

    //--------------------------------------------------------------------------
    // prune empty vectors, free workspace, and return result
    //--------------------------------------------------------------------------

    #ifdef GB_TIMING
    ttt = omp_get_wtime ( ) - ttt ;
    GB_Global_timing_add (7, ttt) ;
    ttt = omp_get_wtime ( ) ;
    #endif

    C->magic = GB_MAGIC ;
    GB_FREE_WORKSPACE ;
    GB_OK (GB_hypermatrix_prune (C, Context)) ;
    ASSERT_MATRIX_OK (C, "saxpy3: output", GB0) ;
    ASSERT (!GB_ZOMBIES (C)) ;
    ASSERT (!GB_PENDING (C)) ;
    (*mask_applied) = apply_mask ;

    #ifdef GB_TIMING
    ttt = omp_get_wtime ( ) - ttt ;
    GB_Global_timing_add (8, ttt) ;
    #endif

    return (info) ;
}