File: GB_AxB_saxpy5_lv.c

package info (click to toggle)
suitesparse 1%3A7.11.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 258,172 kB
  • sloc: ansic: 1,153,566; cpp: 48,145; makefile: 4,997; fortran: 2,087; java: 1,826; sh: 1,113; ruby: 725; python: 676; asm: 371; sed: 166; awk: 44
file content (100 lines) | stat: -rw-r--r-- 4,113 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
//------------------------------------------------------------------------------
// GB_AxB_saxpy5_lv.c: C+=A*B when C is full
//------------------------------------------------------------------------------

#if GB_Z_NBITS == 64
    #define VSETVL(x) __riscv_vsetvl_e64m8(x)
    #define VLE(x,y) __riscv_vle64_v_f64m8(x, y)
    #define VFMACC(x,y,z,w) __riscv_vfmacc_vf_f64m8(x, y, z, w)
    #define VSE(x,y,z) __riscv_vse64_v_f64m8(x, y, z)
    #define VECTORTYPE vfloat64m8_t
#else
    #define VSETVL(x) __riscv_vsetvl_e32m8(x)
    #define VLE(x,y) __riscv_vle32_v_f32m8(x, y)
    #define VFMACC(x,y,z,w) __riscv_vfmacc_vf_f32m8(x, y, z, w)
    #define VSE(x,y,z) __riscv_vse32_v_f32m8(x, y, z)
    #define VECTORTYPE vfloat32m8_t
#endif

{

    //--------------------------------------------------------------------------
    // get C, A, and B
    //--------------------------------------------------------------------------

    const int64_t m = C->vlen;     // # of rows of C and A
    GB_Bp_DECLARE (Bp, const) ; GB_Bp_PTR (Bp, B) ;
    GB_Bh_DECLARE (Bh, const) ; GB_Bh_PTR (Bh, B) ;
    GB_Bi_DECLARE (Bi, const) ; GB_Bi_PTR (Bi, B) ;
    const bool B_iso = B->iso ;
    const GB_A_TYPE *restrict Ax = (GB_A_TYPE *)A->x;
    const GB_B_TYPE *restrict Bx = (GB_B_TYPE *)B->x;
    // get the max number of elements that vector can store
    size_t vl = VSETVL(m);
    GB_C_TYPE *restrict Cx = (GB_C_TYPE *)C->x;

    //--------------------------------------------------------------------------
    // C += A*B where A is full (and not iso or pattern-only)
    //--------------------------------------------------------------------------

    #pragma omp parallel for num_threads(nthreads) schedule(dynamic, 1)
    for (int tid = 0; tid < ntasks; tid++)
    {
        // get the task descriptor
        const int64_t jB_start = B_slice[tid];
        const int64_t jB_end = B_slice[tid + 1];
        // C(:,jB_start:jB_end-1) += A * B(:,jB_start:jB_end-1)
        for (int64_t jB = jB_start; jB < jB_end; jB++)
        {
            // get B(:,j) and C(:,j)
            const int64_t j = GBh_B (Bh, jB) ;
            GB_C_TYPE *restrict Cxj = Cx + (j * m) ;
            const int64_t pB_start = GB_IGET (Bp, jB) ;
            const int64_t pB_end   = GB_IGET (Bp, jB+1) ;

            //------------------------------------------------------------------
            // C(:,j) += A*B(:,j), on sets of vl rows of C and A at a time
            //------------------------------------------------------------------

            for (int64_t i = 0; i < m && (m - i) >= vl; i += vl)
            {
                // get C(i:i+vl,j)
                VECTORTYPE vc = VLE(Cxj + i, vl);
                for (int64_t pB = pB_start; pB < pB_end; pB++)
                {
                    // bkj = B(k,j)
                    const int64_t k = GB_IGET (Bi, pB) ;
                    GB_DECLAREB (bkj) ;
                    GB_GETB (bkj, Bx, pB, B_iso) ;
                    // get A(i,k)
                    VECTORTYPE va = VLE(Ax + i + k * m, vl);
                    // C(i:i+15,j) += A(i:i+15,k)*B(k,j)
                    vc = VFMACC(vc, bkj, va, vl);
                }
                // save C(i:i+15,j)
                VSE(Cxj + i, vc, vl);
            }

            //------------------------------------------------------------------
            // lines 179-1036 from GB_AxB_saxpy5_unrolled.c
            //------------------------------------------------------------------

            int64_t remaining = m % vl;
            if (remaining > 0)
            {
                int64_t i = m - remaining;
                VECTORTYPE vc = VLE(Cxj + i, remaining);
                for (int64_t pB = pB_start; pB < pB_end; pB++)
                {
                    const int64_t k = GB_IGET (Bi, pB) ;
                    GB_DECLAREB (bkj) ;
                    GB_GETB (bkj, Bx, pB, B_iso) ;
                    VECTORTYPE va = VLE(Ax + i + k * m, remaining);
                    vc = VFMACC(vc, bkj, va, remaining);
                }

                VSE(Cxj + i, vc, remaining);
            }
        }
    }
}