1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
|
/*
-- MAGMA (version 2.9.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
@date January 2025
*/
#ifndef BATCHED_KERNEL_PARAM_H
#define BATCHED_KERNEL_PARAM_H
#define MAX_NTHREADS 1024 // 1024 is max threads for 2.x cards
#define MAX_SHARED_ALLOWED 47
#define zamax 256
#define zamax_shuffle 256
#define DOTC_MAX_BS 512 // 512 is max threads for 1.x cards
#define POTRF_NB 128 // blocking in main algorithm 128 if using recursive panel or 32 if using standard panel
#define POTF2_NB 8 // blocking size in panel factorization
#define POTF2_NNB 8 // blocking size in panel factorization
#define POTF2_TILE_SIZE 32
#define MAX_POTF2_SM 128
#define VERSION20
#define BATRF_NB 128
#define BATRF_RECNB 32
#define BATF2_NB 8
#define BAQRF_NB 32
#define BATRI_NB 128 // ztrsm_nb should be >= BATRF_NB
#define TRI_NB 128 // ztrsm_nb should match the NB in BATRF_NB
#define TRI_BLOCK_SIZE 16
// tuning for zgetf2_panel
#define ZGETF2_PANEL_NB (8)
#define CGETF2_PANEL_NB (8)
#define DGETF2_PANEL_NB (16)
#define SGETF2_PANEL_NB (16)
// tuning zgetf2_panel_chain
#define ZGETF2_FUSED_NTH (512)
#define CGETF2_FUSED_NTH (512)
#define DGETF2_FUSED_NTH (512)
#define SGETF2_FUSED_NTH (512)
//
#define ZGETF2_FUSED_MAX_M (7168)
#define CGETF2_FUSED_MAX_M (10240)
#define DGETF2_FUSED_MAX_M (23552)
#define SGETF2_FUSED_MAX_M (40960)
// TRSM tuning parameters
#define STRTRI_BATCHED_NB (64)
#define STRTRI_BATCHED_BLOCK_SIZE (16)
#define DTRTRI_BATCHED_NB (64)
#define DTRTRI_BATCHED_BLOCK_SIZE (16)
#define CTRTRI_BATCHED_NB (32)
#define CTRTRI_BATCHED_BLOCK_SIZE (16)
#define ZTRTRI_BATCHED_NB (128)
#define ZTRTRI_BATCHED_BLOCK_SIZE (16)
// HEMM tuning
#define ZHEMM_BATCHED_LEFT 8, 16, 16, 1
#define ZHEMM_BATCHED_RIGHT 8, 16, 16, 1
#define CHEMM_BATCHED_LEFT 16, 32, 32, 1
#define CHEMM_BATCHED_RIGHT 16, 32, 32, 1
#define DSYMM_BATCHED_LEFT 16, 32, 32, 0
#define DSYMM_BATCHED_RIGHT 16, 32, 32, 0
#define SSYMM_BATCHED_LEFT 32, 64, 64, 0
#define SSYMM_BATCHED_RIGHT 32, 64, 64, 0
// Batch TRMM tuning
#define ZTRMM_BATCHED_NB (16)
#define CTRMM_BATCHED_NB (16)
#define DTRMM_BATCHED_NB (32)
#define STRMM_BATCHED_NB (32)
// TRMM Tuning
#define ZTRMM_NB (16)
#define CTRMM_NB (16)
#define DTRMM_NB (32)
#define STRMM_NB (32)
// TRMV Tuning
#define ZTRMV_NB (32)
#define CTRMV_NB (64)
#define DTRMV_NB (64)
#define STRMV_NB (64)
// HEMV tuning
#define ZHEMV_BATCHED_LOWER 16, 4
#define CHEMV_BATCHED_LOWER 16, 4
#define DSYMV_BATCHED_LOWER 16, 4
#define SSYMV_BATCHED_LOWER 32, 4
#define ZHEMV_BATCHED_UPPER 16, 4
#define CHEMV_BATCHED_UPPER 16, 4
#define DSYMV_BATCHED_UPPER 16, 4
#define SSYMV_BATCHED_UPPER 32, 4
// GETF2_FUSED_BATCHED maximum rows
#define ZGETF2_FUSED_BATCHED_MAX_ROWS (256)
#define CGETF2_FUSED_BATCHED_MAX_ROWS (384)
#define DGETF2_FUSED_BATCHED_MAX_ROWS (512)
#define SGETF2_FUSED_BATCHED_MAX_ROWS (512)
#define magma_ceilpow2(N) ( (N > 16)? 32 : \
(N > 8)? 16 : \
(N > 4)? 8 : \
(N > 2)? 4 : \
(N > 0)? 2 : 0 )
#endif // #ifndef BATCHED_KERNEL_PARAM_H
|