File: KernelInstanceBoundsAnalysis.py

package info (click to toggle)
nvidia-cuda-toolkit 12.4.1-2
  • links: PTS, VCS
  • area: non-free
  • in suites: trixie
  • size: 18,505,836 kB
  • sloc: ansic: 203,477; cpp: 64,769; python: 34,699; javascript: 22,006; xml: 13,410; makefile: 3,085; sh: 2,343; perl: 352
file content (71 lines) | stat: -rw-r--r-- 3,000 bytes parent folder | download | duplicates (12)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import NvRules
import nvanalysis

def get_name():
    """Return the name string of this rule."""
    rule_name = "Kernel Bounds Analysis"
    return rule_name

def get_description():
    """Return the description string of this rule."""
    rule_description = "Kernel Bounds Analysis Rule"
    return rule_description

def get_identifier():
    """Return the identifier string of this rule."""
    rule_identifier = "kernel_instance_bounds_analysis"
    return rule_identifier

def get_section_identifier():
    """Return the identifier string of the section this rule is attached to."""
    section_identifier = "kernel_instance_bounds_analysis"
    return section_identifier

#def evaluate(handle):
     # List rules proposed by this rule here once they are ready
#    return NvRules.require_rules(handle, [ "kernel_instance_latency_analysis" ])

def apply(handle):
    """Determine the kernel bound for the first profiled action and report it.

    The function reads issue-slot and LSU/TEX pipe instruction metrics and the
    device attributes (compute capability, display name) from action 0 of
    range 0, asks ``nvanalysis`` for the function and memory units available
    on that compute capability, derives an SM-side utilization and a memory
    utilization, and posts the message built by
    ``nvanalysis.messages.kernel_bounds_msg`` to the rule frontend.

    Args:
        handle: Opaque rule handle; it is only passed to
            ``NvRules.get_context``.

    Returns:
        None. The result is delivered through ``fe.message``.
    """
    ctx = NvRules.get_context(handle)
    fe = ctx.frontend()
    action = ctx.range_by_idx(0).action_by_idx(0)

    def metric_as_double(name):
        # Small local shorthand for the repeated metric lookup.
        return action.metric_by_name(name).as_double()

    m_issue_slots = metric_as_double("smsp__inst_issued_slots")
    # Load/store work is counted as the sum of the LSU and TEX pipe
    # instruction counts.
    m_ldst_issued = (
        metric_as_double("smsp__inst_executed_lsu_pipe")
        + metric_as_double("smsp__inst_executed_tex_pipe")
    )
    m_issue_slot_utilization = metric_as_double("smsp__utilization_issue")

    m_cc_major = action.metric_by_name("device__attribute_compute_capability_major").as_uint64()
    m_cc_minor = action.metric_by_name("device__attribute_compute_capability_minor").as_uint64()
    m_device_name = action.metric_by_name("device__attribute_display_name").as_string()

    avail_mem_unit_enums = nvanalysis.units.get_memory_unit_enums(m_cc_major, m_cc_minor)
    avail_func_unit_enums = nvanalysis.units.get_function_unit_enums(m_cc_major, m_cc_minor)

    avail_mem_units = [
        nvanalysis.units.get_memory_unit(unit_enum)
        for unit_enum in avail_mem_unit_enums
    ]
    avail_func_units = [
        nvanalysis.units.get_function_unit(unit_enum)
        for unit_enum in avail_func_unit_enums
    ]

    # The issue utilization metric is rescaled by 1/100 before use.
    # NOTE(review): the scale of get_utilization_percent() is not visible from
    # this file -- confirm that both sides of the comparison below use the
    # same scale.
    slot_utilization = m_issue_slot_utilization / 100.0
    max_utilized_function_unit = nvanalysis.units.get_max_utilized_function_unit(avail_func_units, action)
    fu_utilization = nvanalysis.metrics.get_utilization_percent(max_utilized_function_unit.value(action))

    # TODO
    # if (memMap.containsKey(MemoryUnit.SYSMEM))  ...

    (mem_util, mem_bound) = nvanalysis.units.get_memory_utilization(avail_func_units, avail_mem_units, action)

    if fu_utilization > slot_utilization:
        bound = nvanalysis.bounds.get_kernel_bound(fu_utilization, mem_util)
    else:
        # Share of issue slots spent on load/store instructions. A kernel that
        # issued nothing has no load/store share; guarding here avoids a
        # ZeroDivisionError that would otherwise abort the rule.
        if m_issue_slots > 0:
            ldst_share = m_ldst_issued / m_issue_slots
        else:
            ldst_share = 0.0
        ldst_util = slot_utilization * ldst_share

        # SM-side utilization is the issue utilization minus its load/store
        # portion, which is accounted for on the memory side.
        sm_util = slot_utilization - ldst_util
        bound = nvanalysis.bounds.get_kernel_bound(sm_util, mem_util)

    fe.message(nvanalysis.messages.kernel_bounds_msg(bound, mem_bound, m_device_name))