File: GB_AxB_dot4_template.c

package info (click to toggle)
suitesparse-graphblas 7.4.0%2Bdfsg-1
links: PTS, VCS
area: main
in suites: bookworm
size: 67,112 kB
sloc: ansic: 1,072,243; cpp: 8,081; sh: 512; makefile: 503; asm: 369; python: 125; awk: 10
file content (1095 lines) | stat: -rw-r--r-- 45,424 bytes
parent folder | download | duplicates (3)
//------------------------------------------------------------------------------
// GB_AxB_dot4_template:  C+=A'*B via dot products, where C is full
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// C+=A'*B where C is full and computed in-place.  The monoid of the semiring
// matches the accum operator, and the type of C matches the ztype of accum.

// The PAIR and FIRSTJ multiplicative operators are important special cases.

// The matrix C is the user input matrix.  C is not iso on output, but might
// iso on input, in which case the input iso scalar is cinput, and C->x has
// been expanded to non-iso, and initialized if A and/or B are hypersparse.
// A and/or B can be iso.

// MIN_FIRSTJ or MIN_FIRSTJ1 semirings:
#define GB_IS_MIN_FIRSTJ_SEMIRING (GB_IS_IMIN_MONOID && GB_IS_FIRSTJ_MULTIPLIER)
// MAX_FIRSTJ or MAX_FIRSTJ1 semirings:
#define GB_IS_MAX_FIRSTJ_SEMIRING (GB_IS_IMAX_MONOID && GB_IS_FIRSTJ_MULTIPLIER)
// GB_OFFSET is 1 for the MIN/MAX_FIRSTJ1 semirings, and 0 otherwise.

#if GB_IS_ANY_MONOID
#error "dot4 is not used for the ANY monoid"
#endif

#undef  GB_GET4C
#define GB_GET4C(cij,p) cij = (C_in_iso) ? cinput : Cx [p]

#if ((GB_A_IS_BITMAP || GB_A_IS_FULL) && (GB_B_IS_BITMAP || GB_B_IS_FULL ))
{

    //--------------------------------------------------------------------------
    // C += A'*B where A and B are both bitmap/full
    //--------------------------------------------------------------------------

    // FUTURE: This method is not particularly efficient when both A and B are
    // bitmap/full.  A better method would use tiles to reduce memory traffic.

    int tid ;
    #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
    for (tid = 0 ; tid < ntasks ; tid++)
    {

        //----------------------------------------------------------------------
        // get the task descriptor
        //----------------------------------------------------------------------

        const int a_tid = tid / nbslice ;
        const int b_tid = tid % nbslice ;
        const int64_t kA_start = A_slice [a_tid] ;
        const int64_t kA_end   = A_slice [a_tid+1] ;
        const int64_t kB_start = B_slice [b_tid] ;
        const int64_t kB_end   = B_slice [b_tid+1] ;

        for (int64_t j = kB_start ; j < kB_end ; j++)
        {

            //------------------------------------------------------------------
            // get B(:,j) and C(:,j)
            //------------------------------------------------------------------

            const int64_t pC_start = j * cvlen ;
            const int64_t pB_start = j * vlen ;

            //------------------------------------------------------------------
            // C(:,j) += A'*B(:,j)
            //------------------------------------------------------------------

            for (int64_t i = kA_start ; i < kA_end ; i++)
            {

                //--------------------------------------------------------------
                // get A(:,i)
                //--------------------------------------------------------------

                const int64_t pA = i * vlen ;

                //--------------------------------------------------------------
                // get C(i,j)
                //--------------------------------------------------------------

                int64_t pC = i + pC_start ;     // C(i,j) is at Cx [pC]
                GB_CTYPE GB_GET4C (cij, pC) ;   // cij = Cx [pC]

                //--------------------------------------------------------------
                // C(i,j) += A (:,i)*B(:,j): a single dot product
                //--------------------------------------------------------------

                int64_t pB = pB_start ;

                #if ( GB_A_IS_FULL && GB_B_IS_FULL )
                {

                    //----------------------------------------------------------
                    // both A and B are full
                    //----------------------------------------------------------

                    #if GB_IS_PAIR_MULTIPLIER
                    { 
                        #if GB_IS_EQ_MONOID
                        // EQ_PAIR semiring
                        cij = (cij == 1) ;
                        #elif (GB_CTYPE_BITS > 0)
                        // PLUS, XOR monoids: A(:,i)'*B(:,j) is nnz(A(:,i)),
                        // for bool, 8-bit, 16-bit, or 32-bit integer
                        uint64_t t = ((uint64_t) cij) + vlen ;
                        cij = (GB_CTYPE) (t & GB_CTYPE_BITS) ;
                        #elif GB_IS_PLUS_FC32_MONOID
                        // PLUS monoid for float complex
                        cij = GxB_CMPLXF (crealf (cij) + (float) vlen, 0) ;
                        #elif GB_IS_PLUS_FC64_MONOID
                        // PLUS monoid for double complex
                        cij = GxB_CMPLX (creal (cij) + (double) vlen, 0) ;
                        #else
                        // PLUS monoid for float, double, or 64-bit integers 
                        cij += (GB_CTYPE) vlen ;
                        #endif
                    }
                    #elif GB_IS_MIN_FIRSTJ_SEMIRING
                    {
                        // MIN_FIRSTJ semiring: take the first entry
                        if (vlen > 0)
                        { 
                            int64_t k = GB_OFFSET ;
                            cij = GB_IMIN (cij, k) ;
                        }
                    }
                    #elif GB_IS_MAX_FIRSTJ_SEMIRING
                    {
                        // MAX_FIRSTJ semiring: take the last entry
                        if (vlen > 0)
                        { 
                            int64_t k = vlen-1 + GB_OFFSET ;
                            cij = GB_IMAX (cij, k) ;
                        }
                    }
                    #else
                    {
                        GB_PRAGMA_SIMD_DOT (cij)
                        for (int64_t k = 0 ; k < vlen ; k++)
                        { 
                            GB_DOT (k, pA+k, pB+k) ;    // cij += A(k,i)*B(k,j)
                        }
                    }
                    #endif

                }
                #elif ( GB_A_IS_FULL && GB_B_IS_BITMAP )
                {

                    //----------------------------------------------------------
                    // A is full and B is bitmap
                    //----------------------------------------------------------

                    #if GB_IS_MIN_FIRSTJ_SEMIRING
                    {
                        // MIN_FIRSTJ semiring: take the first entry in B(:,j)
                        for (int64_t k = 0 ; k < vlen ; k++)
                        {
                            if (Bb [pB+k])
                            { 
                                cij = GB_IMIN (cij, k + GB_OFFSET) ;
                                break ;
                            }
                        }
                    }
                    #elif GB_IS_MAX_FIRSTJ_SEMIRING
                    {
                        // MAX_FIRSTJ semiring: take the last entry in B(:,j)
                        for (int64_t k = vlen-1 ; k >= 0 ; k--)
                        {
                            if (Bb [pB+k])
                            { 
                                cij = GB_IMAX (cij, k + GB_OFFSET) ;
                                break ;
                            }
                        }
                    }
                    #else
                    {
                        GB_PRAGMA_SIMD_DOT (cij)
                        for (int64_t k = 0 ; k < vlen ; k++)
                        {
                            if (Bb [pB+k])
                            { 
                                GB_DOT (k, pA+k, pB+k) ; // cij += A(k,i)*B(k,j)
                            }
                        }
                    }
                    #endif

                }
                #elif ( GB_A_IS_BITMAP && GB_B_IS_FULL )
                {

                    //----------------------------------------------------------
                    // A is bitmap and B is full
                    //----------------------------------------------------------

                    #if GB_IS_MIN_FIRSTJ_SEMIRING
                    {
                        // MIN_FIRSTJ semiring: take the first entry in A(:,i)
                        for (int64_t k = 0 ; k < vlen ; k++)
                        {
                            if (Ab [pA+k])
                            { 
                                cij = GB_IMIN (cij, k + GB_OFFSET) ;
                                break ;
                            }
                        }
                    }
                    #elif GB_IS_MAX_FIRSTJ_SEMIRING
                    {
                        // MAX_FIRSTJ semiring: take the last entry in A(:,i)
                        for (int64_t k = vlen-1 ; k >= 0 ; k--)
                        {
                            if (Ab [pA+k])
                            { 
                                cij = GB_IMAX (cij, k + GB_OFFSET) ;
                                break ;
                            }
                        }
                    }
                    #else
                    {
                        GB_PRAGMA_SIMD_DOT (cij)
                        for (int64_t k = 0 ; k < vlen ; k++)
                        {
                            if (Ab [pA+k])
                            { 
                                GB_DOT (k, pA+k, pB+k) ; // cij += A(k,i)*B(k,j)
                            }
                        }
                    }
                    #endif

                }
                #elif ( GB_A_IS_BITMAP && GB_B_IS_BITMAP )
                {

                    //----------------------------------------------------------
                    // both A and B are bitmap
                    //----------------------------------------------------------

                    #if GB_IS_MIN_FIRSTJ_SEMIRING
                    {
                        // MIN_FIRSTJ semiring: take the first entry
                        for (int64_t k = 0 ; k < vlen ; k++)
                        {
                            if (Ab [pA+k] && Bb [pB+k])
                            { 
                                cij = GB_IMIN (cij, k + GB_OFFSET) ;
                                break ;
                            }
                        }
                    }
                    #elif GB_IS_MAX_FIRSTJ_SEMIRING
                    {
                        // MAX_FIRSTJ semiring: take the last entry
                        for (int64_t k = vlen-1 ; k >= 0 ; k--)
                        {
                            if (Ab [pA+k] && Bb [pB+k])
                            { 
                                cij = GB_IMAX (cij, k + GB_OFFSET) ;
                                break ;
                            }
                        }
                    }
                    #else
                    {
                        GB_PRAGMA_SIMD_DOT (cij)
                        for (int64_t k = 0 ; k < vlen ; k++)
                        {
                            if (Ab [pA+k] && Bb [pB+k])
                            { 
                                GB_DOT (k, pA+k, pB+k) ; // cij += A(k,i)*B(k,j)
                            }
                        }
                    }
                    #endif

                }
                #endif

                //--------------------------------------------------------------
                // save C(i,j)
                //--------------------------------------------------------------

                Cx [pC] = cij ;
            }
        }
    }
}
#elif ((GB_A_IS_SPARSE || GB_A_IS_HYPER) && (GB_B_IS_BITMAP || GB_B_IS_FULL ))
{

    //--------------------------------------------------------------------------
    // C += A'*B when A is sparse/hyper and B is bitmap/full
    //--------------------------------------------------------------------------

    // special cases: these methods are very fast, but cannot do not need
    // to be unrolled.
    #undef  GB_SPECIAL_CASE_OR_TERMINAL
    #define GB_SPECIAL_CASE_OR_TERMINAL \
       (   GB_IS_PAIR_MULTIPLIER        /* the multiply op is PAIR */       \
        || GB_IS_MIN_FIRSTJ_SEMIRING    /* min_firstj semiring */           \
        || GB_IS_MAX_FIRSTJ_SEMIRING    /* max_firstj semiring */           \
        || GB_MONOID_IS_TERMINAL        /* monoid has a terminal value */   \
        || GB_B_IS_PATTERN )            /* B is pattern-only */

    // Transpose B and unroll the innermost loop if this condition holds: A
    // must be sparse, B must be full, and no special semirings or operators
    // can be used.  The monoid must not be terminal.  These conditions are
    // known at compile time.
    #undef  GB_UNROLL
    #define GB_UNROLL \
        ( GB_A_IS_SPARSE && GB_B_IS_FULL && !( GB_SPECIAL_CASE_OR_TERMINAL ) )

    // If GB_UNROLL is true at compile-time, the simpler variant can still be
    // used, without unrolling, for any of these conditions:  (1) A is very
    // sparse (fewer entries than the size of the W workspace) or (2) B is iso.

    // The unrolled method does not allow B to be iso or pattern-only (such as
    // for the FIRST multiplicative operator.  If B is iso or pattern-only, the
    // dense matrix G = B' would be a single scalar, or its values would not be
    // accessed at all, so there is no benefit to computing G.

    #if GB_UNROLL
    const int64_t wp = (bvdim == 1) ? 0 : GB_IMIN (bvdim, 4) ;
    const int64_t anz = GB_nnz (A) ;
    if (anz < wp * vlen || B_iso)
    #endif
    {

        //----------------------------------------------------------------------
        // C += A'*B without workspace
        //----------------------------------------------------------------------

        int tid ;
        #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
        for (tid = 0 ; tid < ntasks ; tid++)
        {

            //------------------------------------------------------------------
            // get the task descriptor
            //------------------------------------------------------------------

            const int64_t kA_start = A_slice [tid] ;
            const int64_t kA_end   = A_slice [tid+1] ;

            //------------------------------------------------------------------
            // C+=A'*B where A is sparse/hyper and B is bitmap/full
            //------------------------------------------------------------------

            if (bvdim == 1)
            {

                //--------------------------------------------------------------
                // C += A'*B where C is a single vector
                //--------------------------------------------------------------

                #define pC_start 0
                #define pB 0
                #define j 0
                for (int64_t kA = kA_start ; kA < kA_end ; kA++)
                {
                    // get A(:,i)
                    #if GB_A_IS_HYPER
                    const int64_t i = Ah [kA] ;
                    #else
                    const int64_t i = kA ;
                    #endif
                    int64_t pA = Ap [kA] ;
                    const int64_t pA_end = Ap [kA+1] ;
                    const int64_t ainz = pA_end - pA ;
                    // C(i) += A(:,i)'*B(:,0)
                    #include "GB_AxB_dot4_cij.c"
                }
                #undef pC_start
                #undef pB
                #undef j

            }
            else
            {

                //--------------------------------------------------------------
                // C += A'*B where C is a matrix
                //--------------------------------------------------------------

                for (int64_t kA = kA_start ; kA < kA_end ; kA++)
                {
                    // get A(:,i)
                    #if GB_A_IS_HYPER
                    const int64_t i = Ah [kA] ;
                    #else
                    const int64_t i = kA ;
                    #endif
                    int64_t pA = Ap [kA] ;
                    const int64_t pA_end = Ap [kA+1] ;
                    const int64_t ainz = pA_end - pA ;
                    // C(i,:) += A(:,i)'*B
                    for (int64_t j = 0 ; j < bvdim ; j++)
                    {
                        // get B(:,j) and C(:,j)
                        const int64_t pC_start = j * cvlen ;
                        const int64_t pB = j * vlen ;
                        // C(i,j) += A(:,i)'*B(:,j)
                        #include "GB_AxB_dot4_cij.c"
                    }
                }
            }
        }
    }
    #if GB_UNROLL
    else
    {

        //----------------------------------------------------------------------
        // C += A'*B: with workspace W for transposing B, one panel at a time
        //----------------------------------------------------------------------

        size_t W_size = 0 ;
        GB_BTYPE *restrict W = NULL ;
        if (bvdim > 1)
        {
            W = GB_MALLOC_WORK (wp * vlen, GB_BTYPE, &W_size) ;
            if (W == NULL)
            { 
                // out of memory
                return (GrB_OUT_OF_MEMORY) ;
            }
        }

        for (int64_t j1 = 0 ; j1 < bvdim ; j1 += 4)
        {

            //------------------------------------------------------------------
            // C(:,j1:j2-1) += A * B (:,j1:j2-1) for a single panel
            //------------------------------------------------------------------

            const int64_t j2 = GB_IMIN (j1 + 4, bvdim) ;
            switch (j2 - j1)
            {

                default :
                case 1 :
                {

                    //----------------------------------------------------------
                    // C(:,j1:j2-1) is a single vector; use B(:,j1) in place
                    //----------------------------------------------------------

                    const GB_BTYPE *restrict G = Bx + j1 * vlen ;
                    int tid ;
                    #pragma omp parallel for num_threads(nthreads) \
                        schedule(dynamic,1)
                    for (tid = 0 ; tid < ntasks ; tid++)
                    {
                        // get the task descriptor
                        const int64_t kA_start = A_slice [tid] ;
                        const int64_t kA_end   = A_slice [tid+1] ;
                        for (int64_t i = kA_start ; i < kA_end ; i++)
                        {
                            // get A(:,i)
                            const int64_t pA = Ap [i] ;
                            const int64_t pA_end = Ap [i+1] ;
                            // cx [0] = C(i,j1)
                            GB_CTYPE cx [1] ;
                            GB_GET4C (cx [0], i + j1*cvlen) ;
                            // cx [0] += A (:,i)'*G
                            for (int64_t p = pA ; p < pA_end ; p++)
                            { 
                                // aki = A(k,i)
                                const int64_t k = Ai [p] ;
                                GB_GETA (aki, Ax, p, A_iso) ;
                                // cx [0] += A(k,i)*G(k,0)
                                GB_MULTADD (cx [0], aki, G [k], i, k, j1) ;
                            }
                            // C(i,j1) = cx [0]
                            Cx [i + j1*cvlen] = cx [0] ;
                        }
                    }
                }
                break ;

                case 2 :
                {

                    //----------------------------------------------------------
                    // G = B(:,j1:j1+1) and convert to row-form
                    //----------------------------------------------------------

                    GB_BTYPE *restrict G = W ;
                    int64_t k ;
                    #pragma omp parallel for num_threads(nthreads) \
                        schedule(static)
                    for (k = 0 ; k < vlen ; k++)
                    {
                        // G (k,0:1) = B (k,j1:j1+1)
                        const int64_t k2 = k << 1 ;
                        G [k2    ] = Bx [k + (j1    ) * vlen] ;
                        G [k2 + 1] = Bx [k + (j1 + 1) * vlen] ;
                    }

                    //----------------------------------------------------------
                    // C += A'*G where G is vlen-by-2 in row-form
                    //----------------------------------------------------------

                    int tid ;
                    #pragma omp parallel for num_threads(nthreads) \
                        schedule(dynamic,1)
                    for (tid = 0 ; tid < ntasks ; tid++)
                    {
                        // get the task descriptor
                        const int64_t kA_start = A_slice [tid] ;
                        const int64_t kA_end   = A_slice [tid+1] ;
                        for (int64_t i = kA_start ; i < kA_end ; i++)
                        {
                            // get A(:,i)
                            const int64_t pA = Ap [i] ;
                            const int64_t pA_end = Ap [i+1] ;
                            // cx [0:1] = C(i,j1:j1+1)
                            GB_CTYPE cx [2] ;
                            GB_GET4C (cx [0], i + (j1  )*cvlen) ;
                            GB_GET4C (cx [1], i + (j1+1)*cvlen) ;
                            // cx [0:1] += A (:,i)'*G
                            for (int64_t p = pA ; p < pA_end ; p++)
                            { 
                                // aki = A(k,i)
                                const int64_t k = Ai [p] ;
                                GB_GETA (aki, Ax, p, A_iso) ;
                                const int64_t k2 = k << 1 ;
                                // cx [0:1] += A(k,i)*G(k,0:1)
                                GB_MULTADD (cx [0], aki, G [k2],   i, k, j1) ;
                                GB_MULTADD (cx [1], aki, G [k2+1], i, k, j1+1) ;
                            }
                            // C(i,j1:j1+1) = cx [0:1]
                            Cx [i + (j1  )*cvlen] = cx [0] ;
                            Cx [i + (j1+1)*cvlen] = cx [1] ;
                        }
                    }
                }
                break ;

                case 3 :
                {

                    //----------------------------------------------------------
                    // G = B(:,j1:j1+2) and convert to row-form
                    //----------------------------------------------------------

                    GB_BTYPE *restrict G = W ;
                    int64_t k ;
                    #pragma omp parallel for num_threads(nthreads) \
                        schedule(static)
                    for (k = 0 ; k < vlen ; k++)
                    {
                        // G (k,0:2) = B (k,j1:j1+2)
                        const int64_t k3 = k * 3 ;
                        G [k3    ] = Bx [k + (j1    ) * vlen] ;
                        G [k3 + 1] = Bx [k + (j1 + 1) * vlen] ;
                        G [k3 + 2] = Bx [k + (j1 + 2) * vlen] ;
                    }

                    //----------------------------------------------------------
                    // C += A'*G where G is vlen-by-3 in row-form
                    //----------------------------------------------------------

                    int tid ;
                    #pragma omp parallel for num_threads(nthreads) \
                        schedule(dynamic,1)
                    for (tid = 0 ; tid < ntasks ; tid++)
                    {
                        // get the task descriptor
                        const int64_t kA_start = A_slice [tid] ;
                        const int64_t kA_end   = A_slice [tid+1] ;
                        for (int64_t i = kA_start ; i < kA_end ; i++)
                        {
                            // get A(:,i)
                            const int64_t pA = Ap [i] ;
                            const int64_t pA_end = Ap [i+1] ;
                            // cx [0:2] = C(i,j1:j1+2)
                            GB_CTYPE cx [3] ;
                            GB_GET4C (cx [0], i + (j1  )*cvlen) ;
                            GB_GET4C (cx [1], i + (j1+1)*cvlen) ;
                            GB_GET4C (cx [2], i + (j1+2)*cvlen) ;
                            // cx [0:2] += A (:,i)'*G
                            for (int64_t p = pA ; p < pA_end ; p++)
                            { 
                                // aki = A(k,i)
                                const int64_t k = Ai [p] ;
                                GB_GETA (aki, Ax, p, A_iso) ;
                                const int64_t k3 = k * 3 ;
                                // cx [0:2] += A(k,i)*G(k,0:2)
                                GB_MULTADD (cx [0], aki, G [k3  ], i, k, j1) ;
                                GB_MULTADD (cx [1], aki, G [k3+1], i, k, j1+1) ;
                                GB_MULTADD (cx [2], aki, G [k3+2], i, k, j1+2) ;
                            }
                            // C(i,j1:j1+2) = cx [0:2]
                            Cx [i + (j1  )*cvlen] = cx [0] ;
                            Cx [i + (j1+1)*cvlen] = cx [1] ;
                            Cx [i + (j1+2)*cvlen] = cx [2] ;
                        }
                    }
                }
                break ;

                case 4 :
                {

                    //----------------------------------------------------------
                    // G = B(:,j1:j1+3) and convert to row-form
                    //----------------------------------------------------------

                    GB_BTYPE *restrict G = W ;
                    int64_t k ;
                    #pragma omp parallel for num_threads(nthreads) \
                        schedule(static)
                    for (k = 0 ; k < vlen ; k++)
                    {
                        // G (k,0:3) = B (k,j1:j1+3)
                        const int64_t k4 = k << 2 ;
                        G [k4    ] = Bx [k + (j1    ) * vlen] ;
                        G [k4 + 1] = Bx [k + (j1 + 1) * vlen] ;
                        G [k4 + 2] = Bx [k + (j1 + 2) * vlen] ;
                        G [k4 + 3] = Bx [k + (j1 + 3) * vlen] ;
                    }

                    //----------------------------------------------------------
                    // C += A'*G where G is vlen-by-4 in row-form
                    //----------------------------------------------------------

                    int tid ;
                    #pragma omp parallel for num_threads(nthreads) \
                        schedule(dynamic,1)
                    for (tid = 0 ; tid < ntasks ; tid++)
                    {
                        // get the task descriptor
                        const int64_t kA_start = A_slice [tid] ;
                        const int64_t kA_end   = A_slice [tid+1] ;
                        for (int64_t i = kA_start ; i < kA_end ; i++)
                        {
                            // get A(:,i)
                            const int64_t pA = Ap [i] ;
                            const int64_t pA_end = Ap [i+1] ;
                            // cx [0:3] = C(i,j1:j1+3)
                            GB_CTYPE cx [4] ;
                            GB_GET4C (cx [0], i + (j1  )*cvlen) ;
                            GB_GET4C (cx [1], i + (j1+1)*cvlen) ;
                            GB_GET4C (cx [2], i + (j1+2)*cvlen) ;
                            GB_GET4C (cx [3], i + (j1+3)*cvlen) ;
                            // cx [0:3] += A (:,i)'*G
                            for (int64_t p = pA ; p < pA_end ; p++)
                            { 
                                // aki = A(k,i)
                                const int64_t k = Ai [p] ;
                                GB_GETA (aki, Ax, p, A_iso) ;
                                const int64_t k4 = k << 2 ;
                                // cx [0:3] += A(k,i)*G(k,0:3)
                                GB_MULTADD (cx [0], aki, G [k4  ], i, k, j1) ;
                                GB_MULTADD (cx [1], aki, G [k4+1], i, k, j1+1) ;
                                GB_MULTADD (cx [2], aki, G [k4+2], i, k, j1+2) ;
                                GB_MULTADD (cx [3], aki, G [k4+3], i, k, j1+3) ;
                            }
                            // C(i,j1:j1+3) = cx [0:3]
                            Cx [i + (j1  )*cvlen] = cx [0] ;
                            Cx [i + (j1+1)*cvlen] = cx [1] ;
                            Cx [i + (j1+2)*cvlen] = cx [2] ;
                            Cx [i + (j1+3)*cvlen] = cx [3] ;
                        }
                    }
                }
                break ;
            }
        }

        // free workspace
        GB_FREE_WORK (&W, W_size) ;
    }
    #endif

}
#elif ( (GB_A_IS_BITMAP || GB_A_IS_FULL) && (GB_B_IS_SPARSE || GB_B_IS_HYPER))
{

    //--------------------------------------------------------------------------
    // C += A'*B where A is bitmap/full and B is sparse/hyper
    //--------------------------------------------------------------------------

    // FUTURE: this can be unrolled, like the case above

    int tid ;
    #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
    for (tid = 0 ; tid < ntasks ; tid++)
    {

        //----------------------------------------------------------------------
        // get the task descriptor
        //----------------------------------------------------------------------

        const int64_t kB_start = B_slice [tid] ;
        const int64_t kB_end   = B_slice [tid+1] ;

        for (int64_t kB = kB_start ; kB < kB_end ; kB++)
        {

            //------------------------------------------------------------------
            // get B(:,j) and C(:,j)
            //------------------------------------------------------------------

            #if GB_B_IS_HYPER
            const int64_t j = Bh [kB] ;
            #else
            const int64_t j = kB ;
            #endif
            const int64_t pC_start = j * cvlen ;
            const int64_t pB_start = Bp [kB] ;
            const int64_t pB_end = Bp [kB+1] ;
            const int64_t bjnz = pB_end - pB_start ;

            //------------------------------------------------------------------
            // C(:,j) += A'*B(:,j)
            //------------------------------------------------------------------

            for (int64_t i = 0 ; i < avdim ; i++)
            {

                //--------------------------------------------------------------
                // get A(:,i)
                //--------------------------------------------------------------

                const int64_t pA = i * vlen ;

                //--------------------------------------------------------------
                // get C(i,j)
                //--------------------------------------------------------------

                int64_t pC = i + pC_start ;     // C(i,j) is at Cx [pC]
                GB_CTYPE GB_GET4C (cij, pC) ;   // cij = Cx [pC]

                //--------------------------------------------------------------
                // C(i,j) += A (:,i)*B(:,j): a single dot product
                //--------------------------------------------------------------

                int64_t pB = pB_start ;

                #if ( GB_A_IS_FULL )
                {

                    //----------------------------------------------------------
                    // A is full and B is sparse/hyper
                    //----------------------------------------------------------

                    #if GB_IS_PAIR_MULTIPLIER
                    { 
                        #if GB_IS_EQ_MONOID
                        // EQ_PAIR semiring
                        cij = (cij == 1) ;
                        #elif (GB_CTYPE_BITS > 0)
                        // PLUS, XOR monoids: A(:,i)'*B(:,j) is nnz(A(:,i)),
                        // for bool, 8-bit, 16-bit, or 32-bit integer
                        uint64_t t = ((uint64_t) cij) + bjnz ;
                        cij = (GB_CTYPE) (t & GB_CTYPE_BITS) ;
                        #elif GB_IS_PLUS_FC32_MONOID
                        // PLUS monoid for float complex
                        cij = GxB_CMPLXF (crealf (cij) + (float) bjnz, 0) ;
                        #elif GB_IS_PLUS_FC64_MONOID
                        // PLUS monoid for double complex
                        cij = GxB_CMPLX (creal (cij) + (double) bjnz, 0) ;
                        #else
                        // PLUS monoid for float, double, or 64-bit integers
                        cij += (GB_CTYPE) bjnz ;
                        #endif
                    }
                    #elif GB_IS_MIN_FIRSTJ_SEMIRING
                    {
                        // MIN_FIRSTJ semiring: take the first entry in B(:,j)
                        if (bjnz > 0)
                        { 
                            int64_t k = Bi [pB] + GB_OFFSET ;
                            cij = GB_IMIN (cij, k) ;
                        }
                    }
                    #elif GB_IS_MAX_FIRSTJ_SEMIRING
                    {
                        // MAX_FIRSTJ semiring: take the last entry in B(:,j)
                        if (bjnz > 0)
                        { 
                            int64_t k = Bi [pB_end-1] + GB_OFFSET ;
                            cij = GB_IMAX (cij, k) ;
                        }
                    }
                    #else
                    {
                        GB_PRAGMA_SIMD_DOT (cij)
                        for (int64_t p = pB ; p < pB_end ; p++)
                        { 
                            int64_t k = Bi [p] ;
                            GB_DOT (k, pA+k, p) ;   // cij += A(k,i)*B(k,j)
                        }
                    }
                    #endif

                }
                #else
                {

                    //----------------------------------------------------------
                    // A is bitmap and B is sparse/hyper
                    //----------------------------------------------------------

                    #if GB_IS_MIN_FIRSTJ_SEMIRING
                    {
                        // MIN_FIRSTJ semiring: take the first entry
                        for (int64_t p = pB ; p < pB_end ; p++)
                        {
                            int64_t k = Bi [p] ;
                            if (Ab [pA+k])
                            { 
                                cij = GB_IMIN (cij, k + GB_OFFSET) ;
                                break ;
                            }
                        }
                    }
                    #elif GB_IS_MAX_FIRSTJ_SEMIRING
                    {
                        // MAX_FIRSTJ semiring: take the last entry
                        for (int64_t p = pB_end-1 ; p >= pB ; p--)
                        {
                            int64_t k = Bi [p] ;
                            if (Ab [pA+k])
                            { 
                                cij = GB_IMAX (cij, k + GB_OFFSET) ;
                                break ;
                            }
                        }
                    }
                    #else
                    {
                        GB_PRAGMA_SIMD_DOT (cij)
                        for (int64_t p = pB ; p < pB_end ; p++)
                        {
                            int64_t k = Bi [p] ;
                            if (Ab [pA+k])
                            { 
                                GB_DOT (k, pA+k, p) ;   // cij += A(k,i)*B(k,j)
                            }
                        }
                    }
                    #endif

                }
                #endif

                //--------------------------------------------------------------
                // save C(i,j)
                //--------------------------------------------------------------

                Cx [pC] = cij ;
            }
        }
    }
}
#elif ( (GB_A_IS_SPARSE || GB_A_IS_HYPER) && (GB_B_IS_SPARSE || GB_B_IS_HYPER))
{

    //--------------------------------------------------------------------------
    // C+=A'*B where A and B are both sparse/hyper
    //--------------------------------------------------------------------------

    int tid ;
    #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
    for (tid = 0 ; tid < ntasks ; tid++)
    {

        //----------------------------------------------------------------------
        // get the task descriptor
        //----------------------------------------------------------------------

        const int a_tid = tid / nbslice ;
        const int b_tid = tid % nbslice ;
        const int64_t kA_start = A_slice [a_tid] ;
        const int64_t kA_end   = A_slice [a_tid+1] ;
        const int64_t kB_start = B_slice [b_tid] ;
        const int64_t kB_end   = B_slice [b_tid+1] ;

        //----------------------------------------------------------------------
        // C+=A'*B via dot products
        //----------------------------------------------------------------------

        for (int64_t kB = kB_start ; kB < kB_end ; kB++)
        {

            //------------------------------------------------------------------
            // get B(:,j) and C(:,j)
            //------------------------------------------------------------------

            #if GB_B_IS_HYPER
            const int64_t j = Bh [kB] ;
            #else
            const int64_t j = kB ;
            #endif
            const int64_t pC_start = j * cvlen ;
            const int64_t pB_start = Bp [kB] ;
            const int64_t pB_end = Bp [kB+1] ;
            const int64_t bjnz = pB_end - pB_start ;

            //------------------------------------------------------------------
            // C(:,j) += A'*B(:,j) where C is full
            //------------------------------------------------------------------

            for (int64_t kA = kA_start ; kA < kA_end ; kA++)
            {

                //--------------------------------------------------------------
                // get A(:,i)
                //--------------------------------------------------------------

                #if GB_A_IS_HYPER
                const int64_t i = Ah [kA] ;
                #else
                const int64_t i = kA ;
                #endif
                int64_t pA = Ap [kA] ;
                const int64_t pA_end = Ap [kA+1] ;
                const int64_t ainz = pA_end - pA ;

                //--------------------------------------------------------------
                // get C(i,j)
                //--------------------------------------------------------------

                int64_t pC = i + pC_start ;     // C(i,j) is at Cx [pC]
                GB_CTYPE GB_GET4C (cij, pC) ;   // cij = Cx [pC]

                //--------------------------------------------------------------
                // C(i,j) += A (:,i)*B(:,j): a single dot product
                //--------------------------------------------------------------

                int64_t pB = pB_start ;

                //----------------------------------------------------------
                // both A and B are sparse/hyper
                //----------------------------------------------------------

                // The MIN_FIRSTJ semirings are exploited, by terminating as
                // soon as any entry is found.  The MAX_FIRSTJ semirings are
                // not treated specially here.  They could be done with a
                // backwards traversal of the sparse vectors A(:,i) and
                // B(:,j).

                if (ainz == 0 || bjnz == 0 || 
                    Ai [pA_end-1] < Bi [pB_start] ||
                    Bi [pB_end-1] < Ai [pA])
                { 

                    //------------------------------------------------------
                    // A(:,i) and B(:,j) don't overlap, or are empty
                    //------------------------------------------------------

                }
                else if (ainz > 8 * bjnz)
                {

                    //------------------------------------------------------
                    // B(:,j) is very sparse compared to A(:,i)
                    //------------------------------------------------------

                    while (pA < pA_end && pB < pB_end)
                    {
                        int64_t ia = Ai [pA] ;
                        int64_t ib = Bi [pB] ;
                        if (ia < ib)
                        { 
                            // A(ia,i) appears before B(ib,j)
                            // discard all entries A(ia:ib-1,i)
                            int64_t pleft = pA + 1 ;
                            int64_t pright = pA_end - 1 ;
                            GB_TRIM_BINARY_SEARCH (ib, Ai, pleft, pright) ;
                            ASSERT (pleft > pA) ;
                            pA = pleft ;
                        }
                        else if (ib < ia)
                        { 
                            // B(ib,j) appears before A(ia,i)
                            pB++ ;
                        }
                        else // ia == ib == k
                        { 
                            // A(k,i) and B(k,j) are next entries to merge
                            GB_DOT (ia, pA, pB) ;   // cij += A(k,i)*B(k,j)
                            #if GB_IS_MIN_FIRSTJ_SEMIRING
                            break ;
                            #endif
                            pA++ ;
                            pB++ ;
                        }
                    }

                }
                else if (bjnz > 8 * ainz)
                {

                    //------------------------------------------------------
                    // A(:,i) is very sparse compared to B(:,j)
                    //------------------------------------------------------

                    while (pA < pA_end && pB < pB_end)
                    {
                        int64_t ia = Ai [pA] ;
                        int64_t ib = Bi [pB] ;
                        if (ia < ib)
                        { 
                            // A(ia,i) appears before B(ib,j)
                            pA++ ;
                        }
                        else if (ib < ia)
                        { 
                            // B(ib,j) appears before A(ia,i)
                            // discard all entries B(ib:ia-1,j)
                            int64_t pleft = pB + 1 ;
                            int64_t pright = pB_end - 1 ;
                            GB_TRIM_BINARY_SEARCH (ia, Bi, pleft, pright) ;
                            ASSERT (pleft > pB) ;
                            pB = pleft ;
                        }
                        else // ia == ib == k
                        { 
                            // A(k,i) and B(k,j) are next entries to merge
                            GB_DOT (ia, pA, pB) ;   // cij += A(k,i)*B(k,j)
                            #if GB_IS_MIN_FIRSTJ_SEMIRING
                            break ;
                            #endif
                            pA++ ;
                            pB++ ;
                        }
                    }

                }
                else
                {

                    //------------------------------------------------------
                    // A(:,i) and B(:,j) have about the same sparsity
                    //------------------------------------------------------

                    while (pA < pA_end && pB < pB_end)
                    {
                        int64_t ia = Ai [pA] ;
                        int64_t ib = Bi [pB] ;
                        if (ia < ib)
                        { 
                            // A(ia,i) appears before B(ib,j)
                            pA++ ;
                        }
                        else if (ib < ia)
                        { 
                            // B(ib,j) appears before A(ia,i)
                            pB++ ;
                        }
                        else // ia == ib == k
                        { 
                            // A(k,i) and B(k,j) are the entries to merge
                            GB_DOT (ia, pA, pB) ;   // cij += A(k,i)*B(k,j)
                            #if GB_IS_MIN_FIRSTJ_SEMIRING
                            break ;
                            #endif
                            pA++ ;
                            pB++ ;
                        }
                    }
                }

                //--------------------------------------------------------------
                // save C(i,j)
                //--------------------------------------------------------------

                Cx [pC] = cij ;
            }
        }
    }
}
#endif

#undef GB_IS_MIN_FIRSTJ_SEMIRING
#undef GB_IS_MAX_FIRSTJ_SEMIRING
#undef GB_GET4C
#undef GB_SPECIAL_CASE_OR_TERMINAL
#undef GB_UNROLL