File: Profiler.cc

package info (click to toggle)
squid 4.6-1%2Bdeb10u7
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 36,944 kB
  • sloc: cpp: 187,349; ansic: 20,249; makefile: 6,193; sh: 5,290; perl: 2,191; sql: 326; python: 258; awk: 141; sed: 1
file content (216 lines) | stat: -rw-r--r-- 7,000 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
/*
 * Copyright (C) 1996-2019 The Squid Software Foundation and contributors
 *
 * Squid software is distributed under GPLv2+ license and includes
 * contributions from numerous individuals and organizations.
 * Please see the COPYING and CONTRIBUTORS files for details.
 */

/* DEBUG: section 81    CPU Profiling Routines */

/**
 * CPU Profiling implementation.
 *
 * \par
 *  This library implements the Probes needed to gather stats.
 *  See src/ProfStats.c which implements historical recording and
 *  presentation in CacheMgr.cgi.
 *
 * \par
 *  For timing we prefer on-CPU ops that retrieve cpu ticks counter.
 *  For Intel, this is "rdtsc", which is 64-bit counter that virtually
 *  never wraps. For alpha, this is "rpcc" which is 32-bit counter and
 *  wraps every few seconds. Currently, no handling of wrapping counters
 *  is implemented. Other CPU's are also not covered. Potentially all
 *  modern CPU's has similar counters.
 *
 * Usage.
 *  Insert macro PROF_state(probename) in strategic places in code.
 *    PROF_start(probename);
 *     ...  section of code measured ...
 *    PROF_stop(probename);
 *
 *   probename must be added to the xprof_type.h enum list
 *   with prepended "XPROF_" string.
 *
 * \section description Description.
 * \par PROF
 *  gathers stats per probename into structures. It indexes these
 *  structures by enum type index in an array.
 *
 * \par PROF
 *  records best, best, average and worst values for delta time,
 *  also, if UNACCED is defined, it measures "empty" time during which
 *  no probes are in measuring state. This allows to see time "unaccounted"
 *  for. If OVERHEAD is defined, additional calculations are made at every
 *  probe to measure approximate overhead of the probe code itself.
 *
 * \par
 *  Probe data is stored in linked-list, so the more probes you define,
 *  the more overhead is added to find the deepest nested probe. To reduce
 *  average overhead, linked list is manipulated each time PR_start is
 *  called, so that probe just started is moved 1 position up in linkedlist.
 *  This way frequently used probes are moved closer to the head of list,
 *  reducing average overhead.
 *  Note that all overhead is on the scale of one hundred of CPU clock
 *  ticks, which on the scale of submicroseconds. Yet, to optimise really
 *  fast and frequent sections of code, we want to reduce this overhead
 *  to absolute minimum possible.
 *
 * \par
 *  For actual measurements, probe overhead cancels out mostly. Still,
 *  do not take the measured times as facts, they should be viewed in
 *  relative comparison to overall CPU time and on the same platform.
 *
 * \par
 *  Every 1 second, Event within squid is called that parses gathered
 *  statistics of every probe, and accumulates that into historical
 *  structures for last 1,5,30 secs, 1,5,30 mins, and 1,5 and 24 hours.
 *  Each second active probe stats are reset, and only historical data
 *  is presented in cachemgr output.
 *
 * \section reading Reading stats.
 * \par
 *  "Worst case" may be misleading. Anything can happen at any section
 *  of code that could delay reaching to probe stop. For eg. system may
 *  need to service interrupt routine, task switch could occur, or page
 *  fault needs to be handled. In this sense, this is quite meaningless
 *  metric. "Best case" shows fastest completion of probe section, and
 *  is also somewhat useless, unless you know that amount of work is
 *  constant. Best metric to watch is "average time" and total cumulated
 *  time in given timeframe, which really show percentage of time spent
 *  in given section of code, and its average completion time. This data
 *  could be used to detect bottlenecks withing squid and optimise them.
 *
 * \par
 *  TOTALS are quite off reality. Its there just to summarise cumulative
 *  times and percent column. Percent values over 100% shows that there
 *  have been some probes nested into each other.
 *
 */

#include "squid.h"
#include "profiler/Profiler.h"

#if USE_XPROF_STATS

#include <cassert>
#if HAVE_GNUMALLLOC_H
#include <gnumalloc.h>
#elif HAVE_MALLOC_H
#include <malloc.h>
#endif
#if HAVE_UNISTD_H
#include <unistd.h>
#endif

/* Exported Data */
TimersArray *xprof_Timers = NULL;

/* Private stuff */

/* new stuff */
#define MAXSTACKDEPTH   512

struct _callstack_entry {
    int timer;      /* index into timers array */
    const char *name;
    hrtime_t start, stop, accum;
};

struct _callstack_entry cstack[MAXSTACKDEPTH];
int cstack_head = 0;

#if defined(_MSC_VER) /* Microsoft C Compiler ONLY */
static __inline void
#else
static inline void
#endif
xprof_update(xprof_stats_data * head)
{
    if (head->delta < head->best)
        head->best = head->delta;
    if (head->worst < head->delta)
        head->worst = head->delta;
    head->summ += head->delta;
    ++head->count;
}

static xprof_stats_data *xp_UNACCOUNTED;
static int xprof_inited = 0;

static void
xprof_InitLib(void)
{
    if (xprof_inited)
        return;

    xprof_Timers = static_cast<TimersArray *>(calloc(XPROF_LAST + 2, sizeof(xprof_stats_node)));

    xprof_Timers[XPROF_PROF_UNACCOUNTED]->name = "PROF_UNACCOUNTED";
    xprof_Timers[XPROF_PROF_UNACCOUNTED]->accu.start = get_tick();
    xp_UNACCOUNTED = &xprof_Timers[XPROF_PROF_UNACCOUNTED]->accu;
    cstack_head = 0;
    xprof_inited = 1;
}

void
xprof_start(xprof_type type, const char *timer)
{
    hrtime_t tt = get_tick();
    if (!xprof_inited)
        xprof_InitLib();

    /* If nested, stop current stack frame */
    if (cstack_head > 0) {
        cstack[cstack_head - 1].accum += get_tick() - cstack[cstack_head - 1].start;
        cstack[cstack_head - 1].start = -1;
    }

    /* Are we at the first level? If so; stop the unaccounted timer */
    if (cstack_head == 0) {
        assert(xp_UNACCOUNTED->start != -1);
        xp_UNACCOUNTED->delta = tt - xp_UNACCOUNTED->start;
        xp_UNACCOUNTED->start = -1;
        xprof_update(xp_UNACCOUNTED);
    }

    /* Allocate new stack frame */
    cstack[cstack_head].start = tt;
    cstack[cstack_head].stop = -1;
    cstack[cstack_head].accum = 0;
    cstack[cstack_head].timer = type;
    cstack[cstack_head].name = timer;
    ++cstack_head;
    assert(cstack_head < MAXSTACKDEPTH);

}

void
xprof_stop(xprof_type type, const char *timer)
{
    hrtime_t tt = get_tick();
    assert(cstack_head > 0);
    --cstack_head;
    assert(cstack[cstack_head].timer == type);

    /* Record timer details */
    cstack[cstack_head].accum += tt - cstack[cstack_head].start;
    xprof_Timers[type]->accu.delta = cstack[cstack_head].accum;
    xprof_Timers[type]->name = timer;

    /* Update */
    xprof_update(&xprof_Timers[type]->accu);

    /* Restart previous timer if we're not at the top level */
    if (cstack_head > 0) {
        cstack[cstack_head - 1].start = tt;
        cstack[cstack_head - 1].stop = 0;
        return;
    }
    /* Get here? We're at the top level; unaccounted */
    xp_UNACCOUNTED->start = tt;
}

#endif /* USE_XPROF_STATS */