File: FPInstructions.py

Package: nvidia-cuda-toolkit 11.8.0-5~deb12u1 (Debian bookworm, non-free)

# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import NvRules

def get_identifier():
    return "FPInstructions"

def get_name():
    return "FP32/64 Instructions"

def get_description():
    return "Floating-point instruction analysis."

def get_section_identifier():
    return "InstructionStats"


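# Entry point invoked by the rule system: 'handle' wraps the profiler context
# for the collected report. The getters above supply the rule's metadata and
# the report section ("InstructionStats") it is evaluated with.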
def apply(handle):
    ctx = NvRules.get_context(handle)
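    # the first action, i.e. the profiled kernel launch, in the first range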
    action = ctx.range_by_idx(0).action_by_idx(0)
    fe = ctx.frontend()

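    # SASS opcode mnemonics per precision: the non-fused add and multiply come
    # first, then the fused multiply-add (a single instruction computing a*b + c)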
    fp_types = {
        32: ["FADD", "FMUL", "FFMA"],
        64: ["DADD", "DMUL", "DFMA"],
    }

    # the correlation IDs of sass__inst_executed_per_opcode are the opcode mnemonics
    inst_per_opcode = action.metric_by_name("sass__inst_executed_per_opcode")
    num_opcodes = inst_per_opcode.num_instances()
    opcodes = inst_per_opcode.correlation_ids()
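    # the metric's instances and correlation ids line up: correlation id i is
    # an opcode mnemonic (e.g. "FFMA") and instance value i is its executed count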

    # analyze both the 32-bit and the 64-bit case
    for fp_type, fp_opcodes in fp_types.items():
        fp_insts = {}
        # collect the executed-instruction count for each relevant opcode
        for i in range(num_opcodes):
            op = opcodes.as_string(i).upper()
            if op in fp_opcodes:
                fp_insts[op] = inst_per_opcode.as_uint64(i)

        # sum the instruction counts: the non-fused (lower-throughput) add and
        # multiply are the first two opcodes per precision, the fused
        # multiply-add is the third
        non_fused = sum(fp_insts.get(op, 0) for op in fp_opcodes[:2])
        fused = fp_insts.get(fp_opcodes[2], 0)

        if non_fused > 0 or fused > 0:
            # high-throughput/fused instructions have twice the throughput of non-fused ones
            ratio = (non_fused / (non_fused + fused)) / 2
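            # sketch of the estimate: replacing each add/multiply pair with one
            # FMA turns non_fused instructions into non_fused/2, so up to
            # (non_fused/2) / (non_fused + fused) of the executed FP instructions
            # could be dropped; e.g. 60 non-fused + 20 fused gives ratio 0.375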
            if ratio > 0.1:
                message = "This kernel executes {} fused and {} non-fused FP{} instructions.".format(fused, non_fused, fp_type)
                message += " By converting pairs of non-fused instructions to their @url:fused:https://docs.nvidia.com/cuda/floating-point/#cuda-and-floating-point@, higher-throughput equivalent, the achieved FP{} performance could be increased by up to {:.0f}%"\
                    " (relative to its current performance)."\
                    " Check the Source page to identify where this kernel executes FP{} instructions.".format(fp_type, 100. * ratio, fp_type)

                fe.message(NvRules.IFrontend.MsgType_MSG_WARNING, message)