# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
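# Nsight Compute rule script: flags inefficient memory access patterns in the
# L1TEX cache, the L2 cache, and DRAM. The get_* functions implement the rule
# interface the tool queries; apply() receives a context handle for the profiled
# kernel and reports findings as frontend messages with supporting focus metrics.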
import NvRules


def get_identifier():
    return "MemoryCacheAccessPattern"


def get_name():
    return "Memory Cache Access Pattern"


def get_description():
    return "Detection of inefficient memory access patterns in the L1TEX cache and L2 cache."


def get_section_identifier():
    return "MemoryWorkloadAnalysis_Tables"


def apply(handle):
    ctx = NvRules.get_context(handle)
    # The first action of the first range is the profiled kernel launch this rule inspects.
    action = ctx.range_by_idx(0).action_by_idx(0)
    fe = ctx.frontend()
    # Metrics ==========================================================================================================
    smsp__inst_executed_op_memory_8b = action.metric_by_name("smsp__sass_inst_executed_op_memory_8b.sum").as_double()
    smsp__inst_executed_op_memory_16b = action.metric_by_name("smsp__sass_inst_executed_op_memory_16b.sum").as_double()
    smsp__inst_executed_op_memory_32b = action.metric_by_name("smsp__sass_inst_executed_op_memory_32b.sum").as_double()
    smsp__inst_executed_op_memory_64b = action.metric_by_name("smsp__sass_inst_executed_op_memory_64b.sum").as_double()
    smsp__inst_executed_op_memory_128b = action.metric_by_name("smsp__sass_inst_executed_op_memory_128b.sum").as_double()
    cc_major = action.metric_by_name("device__attribute_compute_capability_major").as_uint64()
    cc_minor = action.metric_by_name("device__attribute_compute_capability_minor").as_uint64()
    cc = cc_major * 10 + cc_minor
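    # Example: a device of compute capability 8.6 yields cc = 8 * 10 + 6 = 86.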
    # Derived Metrics --------------------------------------------------------------------------------------------------
    smsp__inst_executed_op_memory_flat_sum = \
        smsp__inst_executed_op_memory_8b \
        + smsp__inst_executed_op_memory_16b \
        + smsp__inst_executed_op_memory_32b \
        + smsp__inst_executed_op_memory_64b \
        + smsp__inst_executed_op_memory_128b
    smsp__inst_executed_op_memory_weighted_sum = \
        8 * smsp__inst_executed_op_memory_8b \
        + 16 * smsp__inst_executed_op_memory_16b \
        + 32 * smsp__inst_executed_op_memory_32b \
        + 64 * smsp__inst_executed_op_memory_64b \
        + 128 * smsp__inst_executed_op_memory_128b
    # The bit-weighted sum divided by the instruction count gives bits per instruction;
    # dividing by 8 converts that to bytes accessed per thread per instruction.
    smspAvgMemoryBytesPerInst = \
        smsp__inst_executed_op_memory_weighted_sum / smsp__inst_executed_op_memory_flat_sum / 8 \
        if smsp__inst_executed_op_memory_flat_sum > 0 else 0
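    # Illustrative example: a kernel issuing only 32-bit loads has a weighted sum of
    # 32 * N over a flat sum of N instructions, giving 32 / 8 = 4 bytes per thread.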
    # L1TEX ============================================================================================================
    l1tex_access_types = {
        "mem_global_op_ld": "Global Load",
        "mem_global_op_st": "Global Store",
        "mem_local_op_ld": "Local Load",
        "mem_local_op_st": "Local Store",
    }
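    # Why sectors per request is compared against bytes per thread: a fully coalesced
    # warp-wide access of N bytes per thread moves 32 * N bytes, which is exactly
    # (32 * N) / 32 = N 32-byte sectors per request. A higher sector count means the
    # warp's addresses are spread over more sectors than its data requires.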
    for access_type, access_info in l1tex_access_types.items():
        sectors = action.metric_by_name("l1tex__t_sectors_pipe_lsu_{}.sum".format(access_type)).as_double()
        requests = action.metric_by_name("l1tex__t_requests_pipe_lsu_{}.sum".format(access_type)).as_double()
        sectors_per_request = sectors / requests if requests > 0 else 0
        if sectors > 0 and requests > 0 and sectors_per_request > smspAvgMemoryBytesPerInst:
            message = "The memory access pattern for {}s in L1TEX might not be optimal. ".format(access_info.lower())
            message += "On average, this kernel accesses {:.1f} bytes per thread per memory request; ".format(smspAvgMemoryBytesPerInst)
            message += "but the address pattern, possibly caused by the stride between threads, results in {:.1f} sectors per request, or {:.1f}*32 = {:.1f} bytes of cache data transfers per request. ".format(sectors_per_request, sectors_per_request, 32 * sectors_per_request)
            message += "The optimal thread address pattern for {:.1f} byte accesses would result in {:.1f}*32 = {:.1f} bytes of cache data transfers per request, to maximize L1TEX cache performance. ".format(smspAvgMemoryBytesPerInst, smspAvgMemoryBytesPerInst, 32 * smspAvgMemoryBytesPerInst)
            message += "Check the @section:SourceCounters:Source Counters@ section for uncoalesced {}s.".format(access_info.lower())
            msg_id = fe.message(NvRules.IFrontend.MsgType_MSG_WARNING, message, "L1TEX {} Access Pattern".format(access_info))
            fe.focus_metric(
                msg_id,
                "Sectors per L1TEX Request",
                sectors_per_request,
                NvRules.IFrontend.Severity_SEVERITY_HIGH if sectors_per_request > 2 * smspAvgMemoryBytesPerInst else NvRules.IFrontend.Severity_SEVERITY_LOW,
                "{:,.0f} / {:,.0f} > {:.1f}".format(sectors, requests, smspAvgMemoryBytesPerInst))
    # L2 ===============================================================================================================
    l2_access_types = {
        "tex_op_read": "Load",
        "tex_op_write": "Store",
    }
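    # Note: "srcunit_tex" selects L2 traffic originating from the L1TEX unit, so these
    # metrics cover the L1TEX-to-L2 path for loads and stores.
    # Illustrative example: 4-byte loads with a 128-byte stride between threads touch
    # only 1 of the 4 32-byte sectors in each 128-byte cache line, driving the
    # sectors-per-request ratio toward 1.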
    for access_type, access_info in l2_access_types.items():
        sectors = action.metric_by_name("lts__t_sectors_srcunit_{}.sum".format(access_type)).as_double()
        requests = action.metric_by_name("lts__t_requests_srcunit_{}.sum".format(access_type)).as_double()
        sectors_per_request = sectors / requests if requests > 0 else 0
        # Anything less than 4 is not ideal, but we don't want to show a warning if it's very close.
        if sectors > 0 and requests > 0 and sectors_per_request < 3.5:
            message = "The memory access pattern for {}s from L1TEX to L2 is not optimal. ".format(access_info.lower())
            message += "The granularity of an L1TEX request to L2 is a 128-byte cache line, i.e. 4 consecutive 32-byte sectors per L2 request. "
            message += "However, this kernel only accesses an average of {:.1f} of the possible 4 sectors per cache line. ".format(sectors_per_request)
            message += "Check the @section:SourceCounters:Source Counters@ section for uncoalesced {}s and try to minimize how many cache lines need to be accessed per memory request.".format(access_info.lower())
            msg_id = fe.message(NvRules.IFrontend.MsgType_MSG_WARNING, message, "L2 {} Access Pattern".format(access_info))
            fe.focus_metric(
                msg_id,
                "Sectors per L2 Request",
                sectors_per_request,
                NvRules.IFrontend.Severity_SEVERITY_HIGH if sectors_per_request <= 2 else NvRules.IFrontend.Severity_SEVERITY_LOW,
                "{:,.0f} / {:,.0f} < 4".format(sectors, requests))
    # DRAM =============================================================================================================
    # Compute capabilities 7.2 and 8.7 are integrated (Tegra) GPUs that share system
    # memory with the CPU; the discrete-DRAM analysis below does not apply to them.
    if cc not in (72, 87):
        dram__read_peak_pct = action.metric_by_name("dram__bytes_read.sum.pct_of_peak_sustained_elapsed").as_double()
        lts__read_sectors = action.metric_by_name("lts__t_sectors_srcunit_tex_op_read.sum").as_double()
        lts__read_sectors_hits = action.metric_by_name("lts__t_sectors_srcunit_tex_op_read_lookup_hit.sum").as_double()
        lts__read_sectors_misses = action.metric_by_name("lts__t_sectors_srcunit_tex_op_read_lookup_miss.sum").as_double()
        lts__read_sectors_not_hit = lts__read_sectors - lts__read_sectors_hits
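        # "not_hit" counts read sectors that L2 could not serve from its cache, while
        # "misses" counts sectors looked up as misses; misses exceeding not_hit points
        # to DRAM over-fetch, e.g. due to the 64-byte DRAM fetch granularity.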
        # The lower bound on not_hit guards the ratio computed in the message below.
        if dram__read_peak_pct > 50 and 0 < lts__read_sectors_not_hit < lts__read_sectors_misses:
            message = "The memory access pattern for loads from device memory causes {:,.0f} sectors to be read from DRAM, which is {:.1f}x of the {:,.0f} sectors causing a miss in the L2 cache. ".format(lts__read_sectors_misses, lts__read_sectors_misses / lts__read_sectors_not_hit, lts__read_sectors_not_hit)
            message += "The DRAM fetch granularity for read misses in L2 is 64 bytes, i.e. the lower or upper half of an L2 cache line. "
            message += "Try changing your access pattern to make use of both sectors returned by a DRAM read request, for optimal usage of the DRAM throughput. "
            message += "For strided memory reads, avoid strides of 64 bytes or larger, as they can move unused sectors from DRAM to L2. "
            msg_id = fe.message(NvRules.IFrontend.MsgType_MSG_WARNING, message, "DRAM Excessive Read Sectors")
            fe.focus_metric(
                msg_id,
                "DRAM Read Peak Utilization",
                dram__read_peak_pct,
                NvRules.IFrontend.Severity_SEVERITY_HIGH if dram__read_peak_pct > 75 else NvRules.IFrontend.Severity_SEVERITY_LOW,
                "{:.1f}% > 50%".format(dram__read_peak_pct))
fe.focus_metric(msg_id, "DRAM Excessive Read Sectors", lts__read_sectors_misses - lts__read_sectors, NvRules.IFrontend.Severity_SEVERITY_HIGH, "{:,.0f} > {:,.0f}".format(lts__read_sectors_misses, lts__read_sectors))