File: gpuWork.c

package info (click to toggle)
android-platform-tools 35.0.2-1~exp6
  • links: PTS, VCS
  • area: main
  • in suites: experimental
  • size: 211,716 kB
  • sloc: cpp: 995,749; java: 290,495; ansic: 145,647; xml: 58,531; python: 39,608; sh: 14,500; javascript: 5,198; asm: 4,866; makefile: 3,115; yacc: 769; awk: 368; ruby: 183; sql: 140; perl: 88; lex: 67
file content (288 lines) | stat: -rw-r--r-- 13,987 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
/*
 * Copyright 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "include/gpuwork/gpuWork.h"

#include <linux/bpf.h>
#include <stddef.h>
#include <stdint.h>

#ifdef MOCK_BPF
#include <test/mock_bpf_helpers.h>
#else
#include <bpf_helpers.h>
#endif

// One second expressed in nanoseconds; also the maximum allowed period
// duration per the tracepoint contract documented below.
#define S_IN_NS (1000000000)
// Gaps between consecutive periods for a |uid| that are at most this long
// (1 second) are treated as "small" and folded into the inactive time;
// longer gaps are assumed to separate unrelated bursts of GPU work.
#define SMALL_TIME_GAP_LIMIT_NS (S_IN_NS)

// A map from GpuIdUid (GPU ID and application UID) to |UidTrackingInfo|.
// Capacity is capped at kMaxTrackedGpuIdUids entries; the "GRW" variant
// presumably grants read/write access to the given group (AID_GRAPHICS) —
// exact permission semantics are defined by the macro in bpf_helpers.h.
DEFINE_BPF_MAP_GRW(gpu_work_map, HASH, GpuIdUid, UidTrackingInfo, kMaxTrackedGpuIdUids,
                   AID_GRAPHICS);

// A map containing a single entry of |GlobalData|. An ARRAY map of size 1 is
// the standard BPF idiom for a piece of global state: lookups by index 0
// cannot fail once the program is loaded.
DEFINE_BPF_MAP_GRW(gpu_work_global_data, ARRAY, uint32_t, GlobalData, 1, AID_GRAPHICS);

// Defines the structure of the kernel tracepoint:
//
//  /sys/kernel/tracing/events/power/gpu_work_period/
//
// Drivers must define an appropriate gpu_work_period kernel tracepoint (for
// example, using the DECLARE_EVENT_CLASS and DEFINE_EVENT macros) such that the
// arguments/fields match the fields of |GpuWorkPeriodEvent|, excluding the
// initial "common" field. Drivers must invoke the tracepoint (also referred to
// as emitting the event) as described below. Note that the description below
// assumes a single physical GPU and its driver; for devices with multiple GPUs,
// each GPU and its driver should emit events independently, using a different
// value for |gpu_id| per GPU.
//
// |GpuWorkPeriodEvent| defines a non-overlapping, non-zero period of time from
// |start_time_ns| (inclusive) until |end_time_ns| (exclusive) for a given
// |uid|, and includes details of how much work the GPU was performing for |uid|
// during the period. When GPU work for a given |uid| runs on the GPU, the
// driver must track one or more periods that cover the time where the work was
// running, and emit events soon after. The driver should try to emit the event
// for a period at most 1 second after |end_time_ns|, and must emit the event at
// most 2 seconds after |end_time_ns|. A period's duration (|end_time_ns| -
// |start_time_ns|) must be at most 1 second. Periods for different |uids| can
// overlap, but periods for the same |uid| must not overlap. The driver must
// emit events for the same |uid| in strictly increasing order of
// |start_time_ns|, such that it is guaranteed that the tracepoint call for a
// period for |uid| has returned before the tracepoint call for the next period
// for |uid| is made. Note that synchronization may be necessary if the driver
// emits events for the same |uid| from different threads/contexts. Note that
// |end_time_ns| for a period for a |uid| may equal the |start_time_ns| of the
// next period for |uid|. The driver should try to avoid emitting a large number
// of events in a short time period (e.g. 1000 events per second) for a given
// |uid|.
//
// The |total_active_duration_ns| must be set to the approximate total amount of
// time the GPU spent running work for |uid| within the period, without
// "double-counting" parallel GPU work on the same GPU for the same |uid|. Note
// that even if the parallel GPU work was submitted from several different
// processes (i.e. different PIDs) with the same UID, this overlapping work must
// not be double-counted, as it still came from a single |uid|. "GPU work"
// should correspond to the "GPU slices" shown in the AGI (Android GPU
// Inspector) tool, and so should include work such as fragment and non-fragment
// work/shaders running on the shader cores of the GPU. For example, given the
// following for a single |uid|:
//  - A period has:
//    - |start_time_ns|: 100,000,000 ns
//    - |end_time_ns|:   800,000,000 ns
//  - Some GPU vertex work (A):
//    - started at:      200,000,000 ns
//    - ended at:        400,000,000 ns
//  - Some GPU fragment work (B):
//    - started at:      300,000,000 ns
//    - ended at:        500,000,000 ns
//  - Some GPU fragment work (C):
//    - started at:      300,000,000 ns
//    - ended at:        400,000,000 ns
//  - Some GPU fragment work (D):
//    - started at:      600,000,000 ns
//    - ended at:        700,000,000 ns
//
// The |total_active_duration_ns| would be 400,000,000 ns, because GPU work for
// |uid| was executing:
//  - from 200,000,000 ns to 500,000,000 ns, giving a duration of 300,000,000 ns
//    (encompassing GPU work A, B, and C)
//  - from 600,000,000 ns to 700,000,000 ns, giving a duration of 100,000,000 ns
//    (GPU work D)
//
// Thus, the |total_active_duration_ns| is the sum of these two
// (non-overlapping) durations. Drivers may not have efficient access to the
// exact start and end times of all GPU work, as shown above, but drivers should
// try to approximate/aggregate the value of |total_active_duration_ns| as
// accurately as possible within the limitations of the hardware, without
// double-counting parallel GPU work for the same |uid|. The
// |total_active_duration_ns| value must be less than or equal to the period
// duration (|end_time_ns| - |start_time_ns|); if the aggregation approach might
// violate this requirement then the driver must clamp
// |total_active_duration_ns| to be at most the period duration.
//
// Protected mode: protected GPU work must not be reported. Periods must be
// emitted, and the |total_active_duration_ns| value set, as if the protected
// GPU work did not occur.
//
// Note that the above description allows for a certain amount of flexibility in
// how the driver tracks periods and emits the events. We list a few examples of
// how drivers might implement the above:
//
// - 1: The driver could track periods for all |uid| values at fixed intervals
//   of 1 second. Thus, every period duration would be exactly 1 second, and
//   periods from different |uid|s that overlap would have the same
//   |start_time_ns| and |end_time_ns| values.
//
// - 2: The driver could track periods with many different durations (up to 1
//   second), as needed in order to cover the GPU work for each |uid|.
//   Overlapping periods for different |uid|s may have very different durations,
//   as well as different |start_time_ns| and |end_time_ns| values.
//
// - 3: The driver could track fine-grained periods with different durations
//   that precisely cover the time where GPU work is running for each |uid|.
//   Thus, |total_active_duration_ns| would always equal the period duration.
//   For example, if a game was running at 60 frames per second, the driver
//   would most likely emit _at least_ 60 events per second (probably more, as
//   there would likely be multiple "chunks" of GPU work per frame, with gaps
//   between each chunk). However, the driver may sometimes need to resort to
//   more coarse-grained periods to avoid emitting thousands of events per
//   second for a |uid|, where |total_active_duration_ns| would then be less
//   than the period duration.
// In-memory mirror of the gpu_work_period tracepoint record. The field order,
// types, and offsets are an ABI contract with the kernel (see the
// _Static_assert below); do not reorder or resize fields.
typedef struct {
    // Actual fields start at offset 8.
    // Placeholder for the tracepoint's "common" header fields, which this
    // program never reads; it only exists to push the real fields to offset 8.
    uint64_t common;

    // A value that uniquely identifies the GPU within the system.
    uint32_t gpu_id;

    // The UID of the application (i.e. persistent, unique ID of the Android
    // app) that submitted work to the GPU.
    uint32_t uid;

    // The start time of the period in nanoseconds. The clock must be
    // CLOCK_MONOTONIC_RAW, as returned by the ktime_get_raw_ns(void) function.
    uint64_t start_time_ns;

    // The end time of the period in nanoseconds. The clock must be
    // CLOCK_MONOTONIC_RAW, as returned by the ktime_get_raw_ns(void) function.
    uint64_t end_time_ns;

    // The amount of time the GPU was running GPU work for |uid| during the
    // period, in nanoseconds, without double-counting parallel GPU work for the
    // same |uid|. For example, this might include the amount of time the GPU
    // spent performing shader work (vertex work, fragment work, etc.) for
    // |uid|.
    uint64_t total_active_duration_ns;

} GpuWorkPeriodEvent;

// Compile-time guard: if any field moves, this struct no longer matches the
// kernel's tracepoint layout and the program would silently read garbage.
_Static_assert(offsetof(GpuWorkPeriodEvent, gpu_id) == 8 &&
                       offsetof(GpuWorkPeriodEvent, uid) == 12 &&
                       offsetof(GpuWorkPeriodEvent, start_time_ns) == 16 &&
                       offsetof(GpuWorkPeriodEvent, end_time_ns) == 24 &&
                       offsetof(GpuWorkPeriodEvent, total_active_duration_ns) == 32,
               "Field offsets of struct GpuWorkPeriodEvent must not be changed because they "
               "must match the tracepoint field offsets found via adb shell cat "
               "/sys/kernel/tracing/events/power/gpu_work_period/format");

// BPF program invoked for every power/gpu_work_period tracepoint event. It
// accumulates, per (gpu_id, uid) key in |gpu_work_map|:
//   - total_active_duration_ns: sum of the reported active durations, and
//   - total_inactive_duration_ns: sum of within-period idle time plus "small"
//     (<= 1 s) gaps between consecutive periods,
// incrementing error_count instead when an event violates the tracepoint
// contract. It always returns 1 so other consumers of the tracepoint
// (e.g. simpleperf) still receive the event.
DEFINE_BPF_PROG("tracepoint/power/gpu_work_period", AID_ROOT, AID_GRAPHICS, tp_gpu_work_period)
(GpuWorkPeriodEvent* const period) {
    // Note: In eBPF programs, |__sync_fetch_and_add| is translated to an atomic
    // add.

    // Return 1 to avoid blocking simpleperf from receiving events.
    const int ALLOW = 1;

    // Build the map key. memset first so any struct padding is zeroed —
    // required for a HASH map key to compare equal reliably.
    GpuIdUid gpu_id_and_uid;
    __builtin_memset(&gpu_id_and_uid, 0, sizeof(gpu_id_and_uid));
    gpu_id_and_uid.gpu_id = period->gpu_id;
    gpu_id_and_uid.uid = period->uid;

    // Get |UidTrackingInfo|.
    UidTrackingInfo* uid_tracking_info = bpf_gpu_work_map_lookup_elem(&gpu_id_and_uid);
    if (!uid_tracking_info) {
        // There was no existing entry, so we add a new one.
        UidTrackingInfo initial_info;
        __builtin_memset(&initial_info, 0, sizeof(initial_info));
        // BPF_NOEXIST makes the insert fail if another CPU raced us and
        // inserted first, so |num_map_entries| is only incremented by the
        // single winner of the race.
        if (0 == bpf_gpu_work_map_update_elem(&gpu_id_and_uid, &initial_info, BPF_NOEXIST)) {
            // We added an entry to the map, so we increment our entry counter in
            // |GlobalData|.
            const uint32_t zero = 0;
            // Get the |GlobalData|.
            GlobalData* global_data = bpf_gpu_work_global_data_lookup_elem(&zero);
            // Getting the global data never fails because it is an |ARRAY| map,
            // but we need to keep the verifier happy.
            if (global_data) {
                __sync_fetch_and_add(&global_data->num_map_entries, 1);
            }
        }
        // Re-lookup rather than using |initial_info|: we need a pointer into
        // the map's own storage (and the winning entry, whichever CPU wrote it).
        uid_tracking_info = bpf_gpu_work_map_lookup_elem(&gpu_id_and_uid);
        if (!uid_tracking_info) {
            // This should never happen, unless entries are getting deleted at
            // this moment. If so, we just give up.
            return ALLOW;
        }
    }

    // Validate the period against the tracepoint contract before using it.
    if (
            // The period duration must be non-zero.
            period->start_time_ns >= period->end_time_ns ||
            // The period duration must be at most 1 second.
            (period->end_time_ns - period->start_time_ns) > S_IN_NS) {
        __sync_fetch_and_add(&uid_tracking_info->error_count, 1);
        return ALLOW;
    }

    // If |total_active_duration_ns| is 0 then no GPU work occurred and there is
    // nothing to do.
    if (period->total_active_duration_ns == 0) {
        return ALLOW;
    }

    // Update |uid_tracking_info->total_active_duration_ns|.
    __sync_fetch_and_add(&uid_tracking_info->total_active_duration_ns,
                         period->total_active_duration_ns);

    // |small_gap_time_ns| is the time gap between the current and previous
    // active period, which could be 0. If the gap is more than
    // |SMALL_TIME_GAP_LIMIT_NS| then |small_gap_time_ns| will be set to 0
    // because we want to estimate the small gaps between "continuous" GPU work.
    // Note: for the very first period of a |uid|, |previous_active_end_time_ns|
    // is 0, so the "gap" is |start_time_ns| itself; it is only counted if the
    // first period starts within 1 s of boot, which is effectively never.
    uint64_t small_gap_time_ns = 0;
    if (uid_tracking_info->previous_active_end_time_ns > period->start_time_ns) {
        // The current period appears to have occurred before the previous
        // active period, which must not happen because per-UID periods must not
        // overlap and must be emitted in strictly increasing order of
        // |start_time_ns|.
        __sync_fetch_and_add(&uid_tracking_info->error_count, 1);
    } else {
        // The current period appears to have been emitted after the previous
        // active period, as expected, so we can calculate the gap between the
        // current and previous active period.
        small_gap_time_ns = period->start_time_ns - uid_tracking_info->previous_active_end_time_ns;

        // Update |previous_active_end_time_ns|.
        uid_tracking_info->previous_active_end_time_ns = period->end_time_ns;

        // We want to estimate the small gaps between "continuous" GPU work; if
        // the gap is more than |SMALL_TIME_GAP_LIMIT_NS| then we don't consider
        // this "continuous" GPU work.
        if (small_gap_time_ns > SMALL_TIME_GAP_LIMIT_NS) {
            small_gap_time_ns = 0;
        }
    }

    uint64_t period_total_inactive_time_ns = 0;
    const uint64_t period_duration_ns = period->end_time_ns - period->start_time_ns;
    // |period->total_active_duration_ns| is the active time within the period duration, so
    // it must not be larger than |period_duration_ns|.
    if (period->total_active_duration_ns > period_duration_ns) {
        __sync_fetch_and_add(&uid_tracking_info->error_count, 1);
    } else {
        period_total_inactive_time_ns = period_duration_ns - period->total_active_duration_ns;
    }

    // Update |uid_tracking_info->total_inactive_duration_ns| by adding the
    // inactive time from this period, plus the small gap between the current
    // and previous active period. Either or both of these values could be 0.
    if (small_gap_time_ns > 0 || period_total_inactive_time_ns > 0) {
        __sync_fetch_and_add(&uid_tracking_info->total_inactive_duration_ns,
                             small_gap_time_ns + period_total_inactive_time_ns);
    }

    return ALLOW;
}

LICENSE("Apache 2.0");