File: generate_libsmm_acc_timer_multiply.py

package info (click to toggle)
dbcsr 2.8.0-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 39,836 kB
  • sloc: fortran: 54,534; ansic: 7,060; python: 3,482; cpp: 2,431; sh: 1,639; f90: 1,178; lisp: 689; makefile: 633
file content (142 lines) | stat: -rwxr-xr-x 5,272 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
####################################################################################################
# Copyright (C) by the DBCSR developers group - All rights reserved                                #
# This file is part of the DBCSR library.                                                          #
#                                                                                                  #
# For information on the license, see the LICENSE file.                                            #
# For further information please visit https://dbcsr.cp2k.org                                      #
# SPDX-License-Identifier: GPL-2.0+                                                                #
####################################################################################################

import os
import random
import json
import argparse


def format_to_cpp(kernels):
    """Render kernels as the rows of a C++ vector-of-vectors initializer list.

    Each kernel is a dictionary providing at least the keys "m", "n" and "k". The kernels are ordered by
    (m, n, k) and each one becomes a line of the form `{ m,  n,  k},` (values right-aligned to width 2,
    indented by 8 spaces, newline-terminated). Returns the concatenated lines, or an empty string if there
    are no kernels."""
    ordered = list(kernels)
    ordered.sort(key=lambda kernel: (kernel["m"], kernel["n"], kernel["k"]))
    row_template = "        {{{m:>2}, {n:>2}, {k:>2}}},\n"
    return "".join(
        row_template.format(m=kernel["m"], n=kernel["n"], k=kernel["k"])
        for kernel in ordered
    )


# ===============================================================================
def main(
    dbcsr_base_dir,
    libsmm_acc_base_dir,
    test_template_dir,
    test_output_dir,
    gpu_version,
    nsamples,
):
    """
    Generate the libsmm_acc performance test (a CUDA or HIP source file) by filling in
    libsmm_acc_timer_multiply.cpp.template

    The kernels are read from parameters/parameters_<gpu_version>.json below libsmm_acc_base_dir. All kernels
    whose "source" is "autotuned" are written to the test; of the remaining ones, at most nsamples are drawn
    at random. The result is written to libsmm_acc_timer_multiply.cpp in test_output_dir.
    dbcsr_base_dir is accepted but not used in this function.
    """

    # Load the kernel parameters for the requested GPU
    print("GPU version: {}".format(gpu_version))
    parameters_path = os.path.join(
        libsmm_acc_base_dir, "parameters", "parameters_{}.json".format(gpu_version)
    )
    with open(parameters_path) as params_file:
        kernel_params = json.load(params_file)

    # Autotuned kernels: every one of them goes into the test
    autotuned = [entry for entry in kernel_params if entry["source"] == "autotuned"]
    print("Found {:,} autotuned kernels".format(len(autotuned)))
    autotuned_rows = format_to_cpp(autotuned)

    # Remaining (predicted) kernels: only a random sample goes into the test
    predicted = [entry for entry in kernel_params if entry["source"] != "autotuned"]
    print("Found {:,} predicted kernels".format(len(predicted)))
    sampled_predicted = []
    predicted_rows = ""
    if predicted:
        # Cannot sample more kernels than are available
        sample_size = len(predicted) if nsamples >= len(predicted) else nsamples
        sampled_predicted = random.sample(predicted, sample_size)
        predicted_rows = format_to_cpp(sampled_predicted)

    # Substitute both placeholders in the template and write the generated test
    template_path = os.path.join(
        test_template_dir, "libsmm_acc_timer_multiply.cpp.template"
    )
    output_path = os.path.join(test_output_dir, "libsmm_acc_timer_multiply.cpp")
    with open(template_path) as template_file:
        source_text = template_file.read()
    source_text = source_text.replace(
        "[[AUTOTUNED_KERNELS_HERE]]", autotuned_rows.lstrip()
    ).replace("[[PREDICTED_KERNELS_HERE]]", predicted_rows.lstrip())
    with open(output_path, "w") as output_file:
        output_file.write(source_text)

    total_written = len(autotuned) + len(sampled_predicted)
    print("Wrote {:,} test kernels to {}".format(total_written, output_path))


# ===============================================================================
if __name__ == "__main__":
    # Command-line entry point: parse the options, derive the input/output directories from the DBCSR base
    # directory, and generate the test file.
    parser = argparse.ArgumentParser(
        description="""
        Generate a performance test of libsmm_acc in the form of a CUDA or HIP file, using
        libsmm_acc_timer_multiply.cpp.template as a template
        """,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "-f", "--base_dir", metavar="DBCSRHOME", default="", help="DBCSR base directory"
    )
    parser.add_argument(
        "-o",
        "--out_dir",
        metavar="OUTDIR",
        default="tests",
        help="Directory in which to write the generated test files. Expressed relatively to base_dir",
    )
    parser.add_argument(
        "-g",
        "--gpu_version",
        metavar="GPU_VERSION",
        default="P100",
        help="GPU card version, used to select the appropriate libsmm_acc parameters file",
    )
    parser.add_argument(
        "-n",
        "--nsamples",
        # Without an explicit type, a value given on the command line arrives as a string and main() fails
        # when comparing it with the number of predicted kernels; only the integer default would work.
        type=int,
        default=1000,
        help=(
            "Number of samples from the matrix sizes space 4 <= m,n,k <= 45 (except autotuned kernels)"
            " to sample for performance testing"
        ),
    )

    args = parser.parse_args()

    # Folders in/to which to read/write files
    libsmm_acc_base_dir = os.path.join(args.base_dir, "src/acc/libsmm_acc")
    test_template_dir = os.path.join(args.base_dir, "tests")
    test_output_dir = os.path.join(args.base_dir, args.out_dir)

    main(
        args.base_dir,
        libsmm_acc_base_dir,
        test_template_dir,
        test_output_dir,
        args.gpu_version,
        args.nsamples,
    )