File: GB_jit_kernel_AxB_saxpy5.c

package info (click to toggle)
suitesparse 1%3A7.10.1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, trixie
  • size: 254,920 kB
  • sloc: ansic: 1,134,743; cpp: 46,133; makefile: 4,875; fortran: 2,087; java: 1,826; sh: 996; ruby: 725; python: 495; asm: 371; sed: 166; awk: 44
file content (182 lines) | stat: -rw-r--r-- 5,613 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
//------------------------------------------------------------------------------
// GB_jit_kernel_AxB_saxpy5: C+=A*B, C is full, A bitmap/full, B sparse/hyper
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2025, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

#include "include/GB_AxB_saxpy3_template.h"

GB_JIT_GLOBAL GB_JIT_KERNEL_AXB_SAXPY5_PROTO (GB_jit_kernel) ;

#if !GB_A_IS_PATTERN && !GB_A_ISO && !GB_A_IS_BITMAP

    #if GB_SEMIRING_HAS_AVX_IMPLEMENTATION

        //----------------------------------------------------------------------
        // saxpy5 method with vectors of length 8 for double, 16 for single
        //----------------------------------------------------------------------

        // AVX512F: vector registers are 512 bits, or 64 bytes, which can hold
        // 16 floats or 8 doubles.

        // GB_Vk_512 is true if k entries of GB_Z_NBITS bits each fit within
        // 512 bits.  GB_Z_NBITS is not defined in this file.
        #define GB_V16_512 (16 * GB_Z_NBITS <= 512)
        #define GB_V8_512  ( 8 * GB_Z_NBITS <= 512)
        #define GB_V4_512  ( 4 * GB_Z_NBITS <= 512)

        // GB_V16, GB_V8, and GB_V4 are set just before the template is
        // included below; presumably the template tests them to decide which
        // vector lengths to use (the template is not shown here; confirm).
        #define GB_V16 GB_V16_512
        #define GB_V8  GB_V8_512
        #define GB_V4  GB_V4_512

        // The AVX512F variant is only compiled if the compiler supports it and
        // at least 4 entries fit in a 512-bit register.
        #if GB_COMPILER_SUPPORTS_AVX512F && GB_V4_512

            // GB_AxB_saxpy5_unrolled_avx512f: the unrolled saxpy5 template,
            // instantiated with the 512-bit settings of GB_V16, GB_V8, and
            // GB_V4.  GB_TARGET_AVX512F is defined elsewhere; presumably it is
            // a per-function target attribute that enables AVX512F code
            // generation for this function only (confirm).  The body is the
            // template itself, included textually, so the parameter names
            // below are presumably the names the template refers to and
            // should not be changed.
            GB_TARGET_AVX512F static inline void GB_AxB_saxpy5_unrolled_avx512f
            (
                GrB_Matrix C,
                const GrB_Matrix A,
                const GrB_Matrix B,
                const int ntasks,
                const int nthreads,
                const int64_t *B_slice
            )
            {
                #include "template/GB_AxB_saxpy5_unrolled.c"
            }

        #endif

        //----------------------------------------------------------------------
        // saxpy5 method with vectors of length 4 for double, 8 for single
        //----------------------------------------------------------------------

        // AVX2: vector registers are 256 bits, or 32 bytes, which can hold
        // 8 floats or 4 doubles.

        // GB_Vk_256 is true if k entries of GB_Z_NBITS bits each fit within
        // 256 bits.  GB_Z_NBITS is not defined in this file.
        #define GB_V16_256 (16 * GB_Z_NBITS <= 256)
        #define GB_V8_256  ( 8 * GB_Z_NBITS <= 256)
        #define GB_V4_256  ( 4 * GB_Z_NBITS <= 256)

        // discard the prior definitions of GB_V16, GB_V8, and GB_V4, so that
        // the template can be included again with the 256-bit settings
        #undef  GB_V16
        #undef  GB_V8
        #undef  GB_V4

        #define GB_V16 GB_V16_256
        #define GB_V8  GB_V8_256
        #define GB_V4  GB_V4_256

        // The AVX2 variant is only compiled if the compiler supports it and
        // at least 4 entries fit in a 256-bit register.
        #if GB_COMPILER_SUPPORTS_AVX2 && GB_V4_256

            // GB_AxB_saxpy5_unrolled_avx2: the unrolled saxpy5 template,
            // instantiated with the 256-bit settings of GB_V16, GB_V8, and
            // GB_V4.  GB_TARGET_AVX2 is defined elsewhere; presumably it is a
            // per-function target attribute that enables AVX2 code generation
            // for this function only (confirm).  The body is the template
            // itself, included textually, so the parameter names below are
            // presumably the names the template refers to and should not be
            // changed.
            GB_TARGET_AVX2 static inline void GB_AxB_saxpy5_unrolled_avx2
            (
                GrB_Matrix C,
                const GrB_Matrix A,
                const GrB_Matrix B,
                const int ntasks,
                const int nthreads,
                const int64_t *B_slice
            )
            {
                #include "template/GB_AxB_saxpy5_unrolled.c"
            }

        #endif
    
    #endif

    //--------------------------------------------------------------------------
    // saxpy5 method unrolled, with no vectors
    //--------------------------------------------------------------------------

    // Remove any prior definitions of GB_V16, GB_V8, and GB_V4 (they are
    // defined above only if GB_SEMIRING_HAS_AVX_IMPLEMENTATION is true), and
    // set all three to 0 before including the template one last time.
    #undef  GB_V16
    #undef  GB_V8
    #undef  GB_V4

    #define GB_V16 0
    #define GB_V8  0
    #define GB_V4  0

    // GB_AxB_saxpy5_unrolled_vanilla: the unrolled saxpy5 template with
    // GB_V16, GB_V8, and GB_V4 all zero.  It has no target-specific qualifier
    // and is compiled whenever A is not pattern, iso, or bitmap, regardless of
    // the semiring.  The body is the template itself, included textually, so
    // the parameter names below are presumably the names the template refers
    // to and should not be changed.
    static inline void GB_AxB_saxpy5_unrolled_vanilla
    (
        GrB_Matrix C,
        const GrB_Matrix A,
        const GrB_Matrix B,
        const int ntasks,
        const int nthreads,
        const int64_t *B_slice
    )
    {
        #include "template/GB_AxB_saxpy5_unrolled.c"
    }

#endif

//------------------------------------------------------------------------------
// GB_jit_kernel: for saxpy5 method
//------------------------------------------------------------------------------

// Computes C+=A*B with the saxpy5 method.  The variant is selected at compile
// time from the properties of A (GB_A_IS_PATTERN, GB_A_ISO, GB_A_IS_BITMAP).
// For the case where A is full with numerical values, the AVX512F or AVX2
// variant is used if it was compiled and the matching cpu_has_* flag is true
// at run time; otherwise the vanilla variant is used.
//
// The signature comes from GB_JIT_KERNEL_AXB_SAXPY5_PROTO, which is not
// defined in this file.  The names C, A, B, ntasks, nthreads, B_slice,
// cpu_has_avx512f, and cpu_has_avx2 used below are presumably its parameters
// (confirm against the macro definition).
//
// Returns GrB_SUCCESS on every path visible in this file.

GB_JIT_GLOBAL GB_JIT_KERNEL_AXB_SAXPY5_PROTO (GB_jit_kernel)
{
    // GB_GET_CALLBACKS is defined elsewhere; presumably it obtains the
    // callback functions needed by the included templates (confirm).
    GB_GET_CALLBACKS ;

    #if GB_A_IS_PATTERN || GB_A_ISO
    {

        //----------------------------------------------------------------------
        // saxpy5: C+=A*B where A is bitmap/full and iso or pattern
        //----------------------------------------------------------------------

        #include "template/GB_AxB_saxpy5_A_iso_or_pattern.c"

    }
    #elif GB_A_IS_BITMAP
    {

        //----------------------------------------------------------------------
        // saxpy5: C+=A*B where A is bitmap (but not iso or pattern)
        //----------------------------------------------------------------------

        #include "template/GB_AxB_saxpy5_A_bitmap.c"

    }
    #else
    {

        //----------------------------------------------------------------------
        // saxpy5: C+=A*B where A is full (and not iso or pattern)
        //----------------------------------------------------------------------

        // This condition is the same as the one that guards the definitions of
        // the three GB_AxB_saxpy5_unrolled_* functions above, so the functions
        // called here exist whenever this branch is compiled.

        #if GB_SEMIRING_HAS_AVX_IMPLEMENTATION
        {

            // The widest vector variant is tried first.  Each test below is
            // guarded by the same #if condition as the corresponding function
            // definition.

            #if GB_COMPILER_SUPPORTS_AVX512F && GB_V4_512
            if (cpu_has_avx512f)
            {
                // x86_64 with AVX512f
                GB_AxB_saxpy5_unrolled_avx512f (C, A, B, ntasks, nthreads,
                    B_slice) ;
                return (GrB_SUCCESS) ;
            }
            #endif

            #if GB_COMPILER_SUPPORTS_AVX2 && GB_V4_256
            if (cpu_has_avx2)
            {
                // x86_64 with AVX2
                GB_AxB_saxpy5_unrolled_avx2 (C, A, B, ntasks, nthreads,
                    B_slice) ;
                return (GrB_SUCCESS) ;
            }
            #endif
        }
        #endif

        // any architecture and any semiring
        // (reached only if neither AVX variant above was compiled and taken)
        GB_AxB_saxpy5_unrolled_vanilla (C, A, B, ntasks, nthreads, B_slice) ;

    }
    #endif
    return (GrB_SUCCESS) ;
}