File: WorkloadImbalance.py

package info (click to toggle)
nvidia-cuda-toolkit 12.4.1-3
  • links: PTS, VCS
  • area: non-free
  • in suites: forky, sid
  • size: 18,505,836 kB
  • sloc: ansic: 203,477; cpp: 64,769; python: 34,699; javascript: 22,006; xml: 13,410; makefile: 3,085; sh: 2,343; perl: 352
file content (148 lines) | stat: -rw-r--r-- 8,758 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import NvRules
from RequestedMetrics import MetricRequest, RequestedMetricsParser, Importance

requested_metrics = [
    MetricRequest("dram__cycles_active.avg", None, Importance.OPTIONAL, 0),
    MetricRequest("dram__cycles_active.max", None, Importance.OPTIONAL, 0),
    MetricRequest("dram__cycles_active.min", None, Importance.OPTIONAL, 0),
    MetricRequest("dram__cycles_active.sum", None, Importance.OPTIONAL, 0),
    MetricRequest("dram__cycles_elapsed.sum", None, Importance.OPTIONAL, 0),
    MetricRequest("l1tex__cycles_active.avg", None, Importance.OPTIONAL, 0),
    MetricRequest("l1tex__cycles_active.max", None, Importance.OPTIONAL, 0),
    MetricRequest("l1tex__cycles_active.min", None, Importance.OPTIONAL, 0),
    MetricRequest("l1tex__cycles_active.sum", None, Importance.OPTIONAL, 0),
    MetricRequest("l1tex__cycles_elapsed.sum", None, Importance.OPTIONAL, 0),
    MetricRequest("lts__cycles_active.avg", None, Importance.OPTIONAL, 0),
    MetricRequest("lts__cycles_active.max", None, Importance.OPTIONAL, 0),
    MetricRequest("lts__cycles_active.min", None, Importance.OPTIONAL, 0),
    MetricRequest("lts__cycles_active.sum", None, Importance.OPTIONAL, 0),
    MetricRequest("lts__cycles_elapsed.sum", None, Importance.OPTIONAL, 0),
    MetricRequest("sm__cycles_active.avg", None, Importance.OPTIONAL, 0),
    MetricRequest("sm__cycles_active.max", None, Importance.OPTIONAL, 0),
    MetricRequest("sm__cycles_active.min", None, Importance.OPTIONAL, 0),
    MetricRequest("sm__cycles_active.sum", None, Importance.OPTIONAL, 0),
    MetricRequest("sm__cycles_elapsed.sum", None, Importance.OPTIONAL, 0),
    MetricRequest("smsp__cycles_active.avg", None, Importance.OPTIONAL, 0),
    MetricRequest("smsp__cycles_active.max", None, Importance.OPTIONAL, 0),
    MetricRequest("smsp__cycles_active.min", None, Importance.OPTIONAL, 0),
    MetricRequest("smsp__cycles_active.sum", None, Importance.OPTIONAL, 0),
    MetricRequest("smsp__cycles_elapsed.sum", None, Importance.OPTIONAL, 0),
]


def get_identifier():
    return "WorkloadImbalance"


def get_name():
    return "Workload Imbalance"


def get_description():
    return "Analysis of workload distribution in active cycles of SM, SMP, SMSP, L1 & L2 caches, and DRAM"


def get_section_identifier():
    return "WorkloadDistribution"


def get_parent_rules_identifiers():
    return ["Compute"]

def analyze_imbalance(fe, metrics, metric_base_name, id, min_speedup, recommendation=None):

    # Metrics where unavailable
    if (metrics[f'{metric_base_name}.max'].value() == 0 or metrics[f'{metric_base_name}.avg'].value() == 0):
        return

    max_distance_from_avg = (1 - (metrics[f'{metric_base_name}.avg'].value() / metrics[f'{metric_base_name}.max'].value())) * 100
    min_distance_from_avg = (1 - (metrics[f'{metric_base_name}.min'].value() / metrics[f'{metric_base_name}.avg'].value())) * 100

    # In the case, for example, where you have multiple SM's doing "much more work" than the rest of the SM,
    # that means that their load can distributed over the remaining SM's achieving that difference percentage as a speed up.
    #
    # For example, if we have 4 SMs, with the following active cycles [1:100, 2:100, 3:100, 4:180].
    # In this case SM4 is the bottleneck. The sum of all active cycles is 480. The average of this is: 120
    # That means we have a max difference of = (1 - (120/180)) * 100 = 33%.
    #
    # Distributing the load we would have: [1:120, 2:120, 3:120, 4: 120] which
    # lowers the number of active cycles of the bottle neck SM by 33% (i.e speeding up by 33%)

    total_elapsed = metrics[f'{metric_base_name.replace("active", "elapsed")}.sum'].value()
    total_active = metrics[f'{metric_base_name}.sum'].value()
    speedup = max_distance_from_avg * (total_active / total_elapsed)

    # Speedup is less than the minimum required for reporting
    if speedup < min_speedup:
        return

    if (max_distance_from_avg >= min_speedup
        and min_distance_from_avg >= min_speedup
        and abs(max_distance_from_avg - min_distance_from_avg) < 2):
        # If max and min distance are less than 2% away from  each other and both are above min speedup
        msg_id = fe.message(NvRules.IFrontend.MsgType_MSG_OPTIMIZATION,
                f"One or more {id}s have a much higher number of active cycles than the average number of active cycles. Additionally, "
                f"other {id}s have a much lower number of active cycles than the average number of active cycles. "
                f"Maximum instance value is {max_distance_from_avg:.2f}% above the average, while the minimum instance value is {min_distance_from_avg:.2f}% below the average.",
                f"{id}s Workload Imbalance")
    elif (max_distance_from_avg >= min_speedup
          and max_distance_from_avg > min_distance_from_avg):
        # Max load is the major contributor to the imbalance (i.e there is room to distrubute work to other parts)
        msg_id = fe.message(NvRules.IFrontend.MsgType_MSG_OPTIMIZATION,
                f"One or more {id}s have a much higher number of active cycles than the average number of active cycles. "
                f"Maximum instance value is {max_distance_from_avg:.2f}% above the average, while the minimum instance value is {min_distance_from_avg:.2f}% below the average.",
                f"{id}s Workload Imbalance")
    elif (min_distance_from_avg >= min_speedup
          and min_distance_from_avg > max_distance_from_avg):
        # Min load is the major contributor to the imbalance (i.e assign more to this part)
        msg_id = fe.message(NvRules.IFrontend.MsgType_MSG_OPTIMIZATION,
                f"One or more {id}s have a much lower number of active cycles than the average number of active cycles. "
                f"Maximum instance value is {max_distance_from_avg:.2f}% above the average, while the minimum instance value is {min_distance_from_avg:.2f}% below the average.",
                f"{id}s Workload Imbalance")
    else:
        return

    fe.speedup(msg_id, NvRules.IFrontend.SpeedupType_GLOBAL, speedup)
    fe.focus_metric(
        msg_id,
        metrics[f"{metric_base_name}.avg"].name(),
        metrics[f"{metric_base_name}.avg"].value(),
        NvRules.IFrontend.Severity_SEVERITY_HIGH,
        (recommendation if recommendation is not None else f"Balancing the number of active cycles across {id}s would result in a more optimized kernel")
    )

def apply(handle):
    ctx = NvRules.get_context(handle)
    action = ctx.range_by_idx(0).action_by_idx(0)
    fe = ctx.frontend()
    metrics = RequestedMetricsParser(handle, action).parse(requested_metrics)

    analyze_imbalance(fe=fe, metrics=metrics, metric_base_name="sm__cycles_active", id="SM", min_speedup=5)
    analyze_imbalance(fe=fe, metrics=metrics, metric_base_name="smsp__cycles_active", id="SMSP", min_speedup=5)
    analyze_imbalance(fe=fe, metrics=metrics, metric_base_name="l1tex__cycles_active", id="L1 Slice", min_speedup=5)
    analyze_imbalance(fe=fe, metrics=metrics, metric_base_name="lts__cycles_active", id="L2 Slice", min_speedup=5)
    analyze_imbalance(fe=fe, metrics=metrics, metric_base_name="dram__cycles_active", id="DRAM Slice", min_speedup=5)