File: FPInstructions.py

Package: nvidia-cuda-toolkit 12.4.1-3
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
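#
# FPInstructions: an NvRules rule (run by NVIDIA Nsight Compute) that inspects
# the executed SASS opcodes of a kernel and flags cases where a notable share of
# non-fused FP32/FP64 add/multiply instructions could be replaced by fused FMA
# instructions. The speedup estimate is weighted by the FP pipeline utilization
# received from the parent rule "HighPipeUtilization", when available.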
import NvRules
from RequestedMetrics import MetricRequest, RequestedMetricsParser

requested_metrics = [
    MetricRequest("sass__inst_executed_per_opcode"),
]
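# sass__inst_executed_per_opcode is an instanced metric: each instance holds the
# number of executed SASS instructions for one opcode, with the opcode mnemonic
# as its correlation ID (see apply() below).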


def get_identifier():
    return "FPInstructions"

def get_name():
    return "FP32/64 Instructions"

def get_description():
    return "Floating-point instruction analysis."

def get_section_identifier():
    return "InstructionStats"

def get_parent_rules_identifiers():
    return ["HighPipeUtilization"]

def get_estimated_speedup(pipeline_utilization_pct, fused_instructions, non_fused_instructions):
    # To calculate the speedup, assume we can convert non-fused to fused instructions,
    # which have double the throughput.
    # To get a global estimate, weight this local improvement by the FP pipeline
    # utilization (in terms of active cycles).
    all_instructions = non_fused_instructions + fused_instructions
    improvement_local = 0.5 * (non_fused_instructions / all_instructions)

    if pipeline_utilization_pct is not None:
        speedup_type = NvRules.IFrontend.SpeedupType_GLOBAL
        improvement_percent = improvement_local * pipeline_utilization_pct
    else:
        speedup_type = NvRules.IFrontend.SpeedupType_LOCAL
        improvement_percent = improvement_local * 100

    return speedup_type, improvement_percent
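
# Worked example for get_estimated_speedup() (hypothetical numbers, for
# illustration only): with 600 non-fused and 400 fused instructions,
# improvement_local = 0.5 * (600 / 1000) = 0.3; with a parent-reported pipeline
# utilization of 50% the estimate is GLOBAL at 0.3 * 50 = 15 percent, and
# without a parent value it is LOCAL at 0.3 * 100 = 30 percent.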

def apply(handle):
    ctx = NvRules.get_context(handle)
    action = ctx.range_by_idx(0).action_by_idx(0)
    fe = ctx.frontend()
    metrics = RequestedMetricsParser(handle, action).parse(requested_metrics)
    parent_weights = fe.receive_dict_from_parent("HighPipeUtilization")

    fp_types = {
        32: ["FADD", "FMUL", "FFMA"],
        64: ["DADD", "DMUL", "DFMA"],
    }
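    # FFMA/DFMA compute a*b+c in a single fused multiply-add instruction, so one
    # FMA can replace an FADD/FMUL (or DADD/DMUL) pair; this is why fused
    # instructions are credited with twice the throughput below.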

    # the correlation IDs of sass__inst_executed_per_opcode are the opcode mnemonics
    inst_per_opcode = metrics["sass__inst_executed_per_opcode"]
    num_opcodes = inst_per_opcode.num_instances()
    opcodes = inst_per_opcode.correlation_ids()

    # analyze both 32-bit and 64-bit floating-point instructions
    for fp_type, fp_opcodes in fp_types.items():
        # get the number of executed instructions per relevant opcode
        fp_insts = {}
        for i in range(num_opcodes):
            op = opcodes.as_string(i).upper()
            if op in fp_opcodes:
                fp_insts[op] = inst_per_opcode.as_uint64(i)

        # sum up the low-throughput (non-fused add/mul) and the
        # high-throughput (fused FMA) instructions
        non_fused = sum(fp_insts.get(op, 0) for op in fp_opcodes[:2])
        fused = fp_insts.get(fp_opcodes[2], 0)

        if non_fused > 0 or fused > 0:
            # high-throughput/fused instructions have twice the throughput of non-fused ones
            ratio = (non_fused / (non_fused + fused)) / 2
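            # e.g. 600 non-fused and 400 fused instructions (hypothetical
            # numbers) give ratio = (600 / 1000) / 2 = 0.3, reported as
            # "up to 30%" in the message below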
            if ratio > 0.1:
                message = "This kernel executes {} fused and {} non-fused FP{} instructions.".format(fused, non_fused, fp_type)
                message += " By converting pairs of non-fused instructions to their @url:fused:https://docs.nvidia.com/cuda/floating-point/#cuda-and-floating-point@, higher-throughput equivalent, the achieved FP{} performance could be increased by up to {:.0f}%"\
                    " (relative to its current performance)."\
                    " Check the Source page to identify where this kernel executes FP{} instructions.".format(fp_type, 100. * ratio, fp_type)
                message_title = "FP{} Non-Fused Instructions".format(fp_type)
                msg_id = fe.message(NvRules.IFrontend.MsgType_MSG_OPTIMIZATION, message, message_title)

                pipeline_utilization_pct = None
                parent_weight_name = "fp{}_pipeline_utilization_pct".format(fp_type)
                if parent_weight_name in parent_weights:
                    pipeline_utilization_pct = parent_weights[parent_weight_name]
                speedup_type, speedup_value = get_estimated_speedup(pipeline_utilization_pct, fused, non_fused)
                fe.speedup(msg_id, speedup_type, speedup_value)

                fe.focus_metric(msg_id, "sass__inst_executed_per_opcode", non_fused, NvRules.IFrontend.Severity_SEVERITY_HIGH, "Decrease the number of non-fused floating-point instructions (FADD, FMUL, DADD, DMUL)")
                if pipeline_utilization_pct is not None:
                    if fp_type == 32:
                        metric_name = "sm__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active"
                    else:
                        metric_name = "sm__pipe_fp64_cycles_active.avg.pct_of_peak_sustained_active"
                    fe.focus_metric(msg_id, metric_name, pipeline_utilization_pct, NvRules.IFrontend.Severity_SEVERITY_LOW, "The higher the utilization of the pipeline, the more severe the issue becomes")