File: autotune.cpp

package info (click to toggle)
librsb 1.3.0.2%2Bdfsg-7
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 32,792 kB
  • sloc: ansic: 274,405; f90: 108,468; cpp: 16,934; sh: 6,761; makefile: 1,679; objc: 692; awk: 22; sed: 1
file content (146 lines) | stat: -rw-r--r-- 4,790 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
/*

Copyright (C) 2020-2022 Michele Martone

This file is part of librsb.

librsb is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.

librsb is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public
License along with librsb; see the file COPYING.
If not, see <http://www.gnu.org/licenses/>.

*/
/*!
 \ingroup rsb_doc_examples
 @file
 @author Michele Martone

 @brief C++ example based on <rsb.hpp> for performance-benchmarking a matrix from file using RsbMatrix.tune_spmm(), for various right-hand side counts.

 \include autotune.cpp
*/
#include <algorithm>
#include <array>
#include <complex>
#include <cstddef>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>

#include <rsb.hpp>

using namespace ::rsb;

template <typename nt_t>
void bench(const std::string filename, rsb_flags_t order) {
	RsbLib rsblib;
	rsb_int_t rnt { rsblib.get_num_threads() };
	const rsb_int_t one {1};

	const rsb_trans_t transA = RSB_TRANSPOSITION_N;

	std::cout << "Librsb initialized with " << rnt << " threads." << std::endl;

	RsbMatrix<nt_t> mtx(filename.c_str());
	const rsb_flags_t flagsA {mtx.rsbflags()};
	const bool is_sym = flagsA & ( RSB_FLAG_SYMMETRIC|RSB_FLAG_HERMITIAN ) ? 1 : 0;
	const long long nnz_ops = ( is_sym ? 2 : 1 ) * ( mtx._is_complex() ? 8 : 2 );
	const rsb_real_t mtxocc = mtx._get_storage_bytes();

	std::cout << "Read matrix " << std::quoted(filename) << " : " << mtx._info() << std::endl << std::endl;
	std::cout << "Matrix occupies " << mtxocc << " bytes " << std::endl;

	for ( rsb_coo_idx_t nrhs : {1,2,4,50,100} )
	{
		const rsb_real_t opocc = sizeof(nt_t)*nrhs*(mtx.cols()+mtx.rows());
		std::cout << std::endl << "Operands occupy " << opocc << " bytes now ( with " << nrhs << " nrhs )" << std::endl;
		const rsb_nnz_idx_t nnzA {mtx.nnz()};
		const char oc = ( order == RSB_FLAG_WANT_COLUMN_MAJOR_ORDER ? 'C' : 'R' );
		const rsb_nnz_idx_t ldB {};
		const rsb_nnz_idx_t ldC {};
		const std::vector<nt_t> B(nrhs*mtx.cols(),1.);
		std::vector<nt_t> C(nrhs*mtx.rows(),0.);

		const nt_t alpha { 1.}, beta { 1.};
		rsb_time_t dt, tt;
		const rsb_blk_idx_t nsmA{mtx.blocks()};

		mtx.spmm(transA,&alpha,nrhs,order,B.data(),ldB,&beta,C.data(),ldC); // caches warmup

		dt = -rsb_time();
		mtx.spmm(transA,&alpha,nrhs,order,B.data(),ldB,&beta,C.data(),ldC);

		dt += rsb_time();

		const auto flops_u = (nnz_ops*nnzA*nrhs)/dt;
		std::cout << "rsb_spmm-" << nrhs << "-" << oc << " took " << dt << " s, for " << nnzA/dt << " nnz/s, " << flops_u << " flops/s\n";

		if(true)
		{
			rsb_real_t sf = 1.0;
			const rsb_int_t maxr = 10;
			const rsb_time_t tmax = 20;
			rsb_int_t tn {0};

			// std::cout << "Turning on autotuning before each nrhs count..." << std::endl;

			if(false)
				rsblib.set_opt(RSB_IO_WANT_VERBOSE_TUNING, &one);

			/*! [snip__tune_spmm] */
			tt = -rsb_time();
			mtx.tune_spmm(&sf,&tn,maxr,tmax,transA,&alpha,nrhs,order,B.data(),ldB,&beta,C.data(),ldC);
			tt += rsb_time();

			auto nnsmA {mtx.blocks()};
			std::cout << "Tuning took " << tt << " s ( " << tt / dt << " ops ) and changed " << nsmA << " to " << nnsmA << " blocks" << std::endl;

			mtx.spmm(transA,&alpha,nrhs,order,B.data(),ldB,&beta,C.data(),ldC); // caches warmup
			/*! [snip__tune_spmm] */
			dt = -rsb_time();
			mtx.spmm(transA,&alpha,nrhs,order,B.data(),ldB,&beta,C.data(),ldC);
			dt += rsb_time();

			const auto flops_o = (nnz_ops*nnzA*nrhs)/dt;
			std::cout << "rsb_spmm-" << nrhs << "-" << oc << " took " << dt << " s, for " << nnzA/dt << " nnz/s, " << flops_o << " flops/s\n";
			if ( sf > 1.0 && flops_o > flops_u  )
				std::cout << "Tuning brought a " << std::max(flops_o / flops_u,sf) << " x speedup" << std::endl;
			else
				std::cout << "Tuning brought no speedup" << std::endl;
		}
	}

	std::cout << "Done." << std::endl;
}

auto main(const int argc, char * argv[]) -> int
{
	const std::string filename{ argc > 1 ? argv[1] : "../A.mtx"};

	for ( rsb_flags_t order : { RSB_FLAG_WANT_COLUMN_MAJOR_ORDER,
#if defined(RSB_LIBRSB_VER_DATE) && (RSB_LIBRSB_VER_DATE) /* since 1.2.0.10 */
		RSB_FLAG_WANT_ROW_MAJOR_ORDER
#endif
	} )
	{
#ifdef RSB_NUMERICAL_TYPE_FLOAT
		bench<float>(filename,order);
#endif
#ifdef RSB_NUMERICAL_TYPE_DOUBLE
		bench<double>(filename,order);
#endif
#ifdef RSB_NUMERICAL_TYPE_FLOAT_COMPLEX
		bench<std::complex<float>>(filename,order);
#endif
#ifdef RSB_NUMERICAL_TYPE_DOUBLE_COMPLEX
		bench<std::complex<double>>(filename,order);
#endif
	}
}