1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182
|
//------------------------------------------------------------------------------
// GB_jit_kernel_AxB_saxpy5: C+=A*B, C is full, A bitmap/full, B sparse/hyper
//------------------------------------------------------------------------------
// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2025, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
//------------------------------------------------------------------------------
#include "include/GB_AxB_saxpy3_template.h"
// Forward declaration of the kernel entry point that is defined at the end of
// this file.  The return type and parameter list are supplied entirely by the
// GB_JIT_KERNEL_AXB_SAXPY5_PROTO macro.
GB_JIT_GLOBAL GB_JIT_KERNEL_AXB_SAXPY5_PROTO (GB_jit_kernel) ;
#if !GB_A_IS_PATTERN && !GB_A_ISO && !GB_A_IS_BITMAP
#if GB_SEMIRING_HAS_AVX_IMPLEMENTATION
//----------------------------------------------------------------------
// saxpy5 method with vectors of length 8 for double, 16 for single
//----------------------------------------------------------------------
// AVX512F: vector registers are 512 bits, or 64 bytes, which can hold
// 16 floats or 8 doubles.
// GB_Vk_512 is true when k values of GB_Z_NBITS bits each fit in 512 bits.
#define GB_V16_512 (16 * GB_Z_NBITS <= 512)
#define GB_V8_512 ( 8 * GB_Z_NBITS <= 512)
#define GB_V4_512 ( 4 * GB_Z_NBITS <= 512)
// GB_V16, GB_V8, and GB_V4 are set to the 512-bit variants before the template
// is included below.  NOTE(review): presumably the template tests these to
// decide which vector lengths to use -- confirm against the template.
#define GB_V16 GB_V16_512
#define GB_V8 GB_V8_512
#define GB_V4 GB_V4_512
// This variant is compiled only if the compiler supports AVX512F and at least
// 4 values of GB_Z_NBITS bits fit in a 512-bit register.
#if GB_COMPILER_SUPPORTS_AVX512F && GB_V4_512
// GB_AxB_saxpy5_unrolled_avx512f: saxpy5 worker whose body is the shared
// unrolled template, instantiated with the 512-bit GB_V* settings and declared
// with GB_TARGET_AVX512F.  It returns nothing; all six parameters are consumed
// by the included template.  The enclosing #if's restrict it to the case where
// A is not pattern-only, not iso, and not bitmap.
GB_TARGET_AVX512F static inline void GB_AxB_saxpy5_unrolled_avx512f
(
GrB_Matrix C,
const GrB_Matrix A,
const GrB_Matrix B,
const int ntasks,
const int nthreads,
const int64_t *B_slice
)
{
// the entire function body comes from the template
#include "template/GB_AxB_saxpy5_unrolled.c"
}
#endif
//----------------------------------------------------------------------
// saxpy5 method with vectors of length 4 for double, 8 for single
//----------------------------------------------------------------------
// AVX2: vector registers are 256 bits, or 32 bytes, which can hold
// 8 floats or 4 doubles.
// GB_Vk_256 is true when k values of GB_Z_NBITS bits each fit in 256 bits.
#define GB_V16_256 (16 * GB_Z_NBITS <= 256)
#define GB_V8_256 ( 8 * GB_Z_NBITS <= 256)
#define GB_V4_256 ( 4 * GB_Z_NBITS <= 256)
// Discard the previous GB_V* settings and redefine them as the 256-bit
// variants, so the template included below is instantiated differently.
#undef GB_V16
#undef GB_V8
#undef GB_V4
#define GB_V16 GB_V16_256
#define GB_V8 GB_V8_256
#define GB_V4 GB_V4_256
// This variant is compiled only if the compiler supports AVX2 and at least
// 4 values of GB_Z_NBITS bits fit in a 256-bit register.
#if GB_COMPILER_SUPPORTS_AVX2 && GB_V4_256
// GB_AxB_saxpy5_unrolled_avx2: saxpy5 worker whose body is the shared unrolled
// template, instantiated with the 256-bit GB_V* settings and declared with
// GB_TARGET_AVX2.  It returns nothing; all six parameters are consumed by the
// included template.
GB_TARGET_AVX2 static inline void GB_AxB_saxpy5_unrolled_avx2
(
GrB_Matrix C,
const GrB_Matrix A,
const GrB_Matrix B,
const int ntasks,
const int nthreads,
const int64_t *B_slice
)
{
// the entire function body comes from the template
#include "template/GB_AxB_saxpy5_unrolled.c"
}
#endif
#endif
//--------------------------------------------------------------------------
// saxpy5 method unrolled, with no vectors
//--------------------------------------------------------------------------
// Reset GB_V16, GB_V8, and GB_V4 to 0 so that the template below is
// instantiated with all three vector-length flags false.
#undef GB_V16
#undef GB_V8
#undef GB_V4
#define GB_V16 0
#define GB_V8 0
#define GB_V4 0
// GB_AxB_saxpy5_unrolled_vanilla: saxpy5 worker whose body is the shared
// unrolled template with every GB_V* flag set to 0.  It carries no target
// attribute and is not guarded by any compiler-support test, so it is
// available whenever A is not pattern-only, not iso, and not bitmap.  It
// returns nothing; all six parameters are consumed by the included template.
static inline void GB_AxB_saxpy5_unrolled_vanilla
(
GrB_Matrix C,
const GrB_Matrix A,
const GrB_Matrix B,
const int ntasks,
const int nthreads,
const int64_t *B_slice
)
{
// the entire function body comes from the template
#include "template/GB_AxB_saxpy5_unrolled.c"
}
#endif
//------------------------------------------------------------------------------
// GB_jit_kernel: for saxpy5 method
//------------------------------------------------------------------------------
// Kernel entry point for C+=A*B via the saxpy5 method.  Exactly one of three
// implementations is selected at compile time from the properties of A:
//   (1) A is iso or pattern-only:  the iso-or-pattern template;
//   (2) A is bitmap:               the bitmap template;
//   (3) otherwise (A is full):     one of the unrolled workers, choosing the
//       AVX512F or AVX2 variant at run time when compiled in and the CPU
//       supports it, and the vanilla variant otherwise.
// Returns GrB_SUCCESS on every path visible in this file.
// NOTE(review): the parameter list (including C, A, B, ntasks, nthreads,
// B_slice, cpu_has_avx512f, and cpu_has_avx2 used below) is presumably
// supplied by GB_JIT_KERNEL_AXB_SAXPY5_PROTO -- confirm against that macro.
GB_JIT_GLOBAL GB_JIT_KERNEL_AXB_SAXPY5_PROTO (GB_jit_kernel)
{
GB_GET_CALLBACKS ;
#if GB_A_IS_PATTERN || GB_A_ISO
{
//----------------------------------------------------------------------
// saxpy5: C+=A*B where A is bitmap/full and iso or pattern
//----------------------------------------------------------------------
#include "template/GB_AxB_saxpy5_A_iso_or_pattern.c"
}
#elif GB_A_IS_BITMAP
{
//----------------------------------------------------------------------
// saxpy5: C+=A*B where A is bitmap (but not iso or pattern)
//----------------------------------------------------------------------
#include "template/GB_AxB_saxpy5_A_bitmap.c"
}
#else
{
//----------------------------------------------------------------------
// saxpy5: C+=A*B where A is full (not bitmap, and not iso or pattern)
//----------------------------------------------------------------------
// The vectorized workers exist only under this same condition, so the
// compile-time guards below mirror the ones on their definitions.
#if GB_SEMIRING_HAS_AVX_IMPLEMENTATION
{
#if GB_COMPILER_SUPPORTS_AVX512F && GB_V4_512
// the widest variant is tried first; it returns early on success
if (cpu_has_avx512f)
{
// x86_64 with AVX512f
GB_AxB_saxpy5_unrolled_avx512f (C, A, B, ntasks, nthreads,
B_slice) ;
return (GrB_SUCCESS) ;
}
#endif
#if GB_COMPILER_SUPPORTS_AVX2 && GB_V4_256
// reached only if the AVX512F variant was not compiled or not taken
if (cpu_has_avx2)
{
// x86_64 with AVX2
GB_AxB_saxpy5_unrolled_avx2 (C, A, B, ntasks, nthreads,
B_slice) ;
return (GrB_SUCCESS) ;
}
#endif
}
#endif
// any architecture and any semiring: fallback when no vectorized variant
// was compiled in or selected at run time
GB_AxB_saxpy5_unrolled_vanilla (C, A, B, ntasks, nthreads, B_slice) ;
}
#endif
// common exit for all three compile-time cases
return (GrB_SUCCESS) ;
}
|