File: tsc_timer.h

package info (click to toggle)
highwayhash 0~git20200803.9490b14-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 924 kB
  • sloc: cpp: 7,804; ansic: 326; java: 271; makefile: 145; sh: 16
file content (208 lines) | stat: -rw-r--r-- 7,243 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HIGHWAYHASH_TSC_TIMER_H_
#define HIGHWAYHASH_TSC_TIMER_H_

// High-resolution (~10 ns) timestamps, using fences to prevent reordering and
// ensure exactly the desired regions are measured.

#include <stdint.h>

#include "highwayhash/arch_specific.h"
#include "highwayhash/compiler_specific.h"

#if HH_ARCH_X64 && HH_MSC_VERSION
#include <emmintrin.h>  // _mm_lfence
#include <intrin.h>
#endif

#include <ctime>

namespace highwayhash {

// Start/Stop return absolute timestamps and must be placed immediately before
// and after the region to measure. We provide separate Start/Stop functions
// because they use different fences.
//
// Background: RDTSC is not 'serializing'; earlier instructions may complete
// after it, and/or later instructions may complete before it. 'Fences' ensure
// regions' elapsed times are independent of such reordering. The only
// documented unprivileged serializing instruction is CPUID, which acts as a
// full fence (no reordering across it in either direction). Unfortunately
// the latency of CPUID varies wildly (perhaps made worse by not initializing
// its EAX input). Because it cannot reliably be deducted from the region's
// elapsed time, it must not be included in the region to measure (i.e.
// between the two RDTSC).
//
// The newer RDTSCP is sometimes described as serializing, but it actually
// only serves as a half-fence with release semantics. Although all
// instructions in the region will complete before the final timestamp is
// captured, subsequent instructions may leak into the region and increase the
// elapsed time. Inserting another fence after the final RDTSCP would prevent
// such reordering without affecting the measured region.
//
// Fortunately, such a fence exists. The LFENCE instruction is only documented
// to delay later loads until earlier loads are visible. However, Intel's
// reference manual says it acts as a full fence (waiting until all earlier
// instructions have completed, and delaying later instructions until it
// completes). AMD assigns the same behavior to MFENCE.
//
// We need a fence before the initial RDTSC to prevent earlier instructions
// from leaking into the region, and arguably another after RDTSC to avoid
// region instructions from completing before the timestamp is recorded.
// When surrounded by fences, the additional RDTSCP half-fence provides no
// benefit, so the initial timestamp can be recorded via RDTSC, which has
// lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
//
// Using Start+Start leads to higher variance and overhead than Stop+Stop.
// However, Stop+Stop includes an LFENCE in the region measurements, which
// adds a delay dependent on earlier loads. The combination of Start+Stop
// is faster than Start+Start and more consistent than Stop+Stop because
// the first LFENCE already delayed subsequent loads before the measured
// region. This combination seems not to have been considered in prior work:
// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
//
// Note: performance counters can measure 'exact' instructions-retired or
// (unhalted) cycle counts. The RDPMC instruction is not serializing and also
// requires fences. Unfortunately, it is not accessible on all OSes and we
// prefer to avoid kernel-mode drivers. Performance counters are also affected
// by several under/over-count errata, so we use the TSC instead.

// Primary templates; must use one of the specializations.
template <typename T>
inline T Start();

template <typename T>
inline T Stop();

// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
// divide by InvariantTicksPerSecond.
template <>
inline uint64_t Start<uint64_t>() {
  uint64_t t;
#if HH_ARCH_PPC
  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
#elif HH_ARCH_AARCH64
  asm volatile("mrs %0, cntvct_el0" : "=r"(t));
#elif HH_ARCH_X64 && HH_MSC_VERSION
  _mm_lfence();
  HH_COMPILER_FENCE;
  t = __rdtsc();
  _mm_lfence();
  HH_COMPILER_FENCE;
#elif HH_ARCH_X64 && (HH_CLANG_VERSION || HH_GCC_VERSION)
  asm volatile(
      "lfence\n\t"
      "rdtsc\n\t"
      "shl $32, %%rdx\n\t"
      "or %%rdx, %0\n\t"
      "lfence"
      : "=a"(t)
      :
      // "memory" avoids reordering. rdx = TSC >> 32.
      // "cc" = flags modified by SHL.
      : "rdx", "memory", "cc");
#else
  t = static_cast<uint64_t>(clock());
#endif
  return t;
}

template <>
inline uint64_t Stop<uint64_t>() {
  uint64_t t;
#if HH_ARCH_PPC
  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
#elif HH_ARCH_AARCH64
  asm volatile("mrs %0, cntvct_el0" : "=r"(t));
#elif HH_ARCH_X64 && HH_MSC_VERSION
  HH_COMPILER_FENCE;
  unsigned aux;
  t = __rdtscp(&aux);
  _mm_lfence();
  HH_COMPILER_FENCE;
#elif HH_ARCH_X64 && (HH_CLANG_VERSION || HH_GCC_VERSION)
  // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
  asm volatile(
      "rdtscp\n\t"
      "shl $32, %%rdx\n\t"
      "or %%rdx, %0\n\t"
      "lfence"
      : "=a"(t)
      :
      // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
      // "cc" = flags modified by SHL.
      : "rcx", "rdx", "memory", "cc");
#else
  t = static_cast<uint64_t>(clock());
#endif
  return t;
}

// Returns a 32-bit timestamp with about 4 cycles less overhead than
// Start<uint64_t>. Only suitable for measuring very short regions because the
// timestamp overflows about once a second.
template <>
inline uint32_t Start<uint32_t>() {
  uint32_t t;
#if HH_ARCH_X64 && HH_MSC_VERSION
  _mm_lfence();
  HH_COMPILER_FENCE;
  t = static_cast<uint32_t>(__rdtsc());
  _mm_lfence();
  HH_COMPILER_FENCE;
#elif HH_ARCH_X64 && (HH_CLANG_VERSION || HH_GCC_VERSION)
  asm volatile(
      "lfence\n\t"
      "rdtsc\n\t"
      "lfence"
      : "=a"(t)
      :
      // "memory" avoids reordering. rdx = TSC >> 32.
      : "rdx", "memory");
#else
  t = static_cast<uint32_t>(Start<uint64_t>());
#endif
  return t;
}

template <>
inline uint32_t Stop<uint32_t>() {
  uint32_t t;
#if HH_ARCH_X64 && HH_MSC_VERSION
  HH_COMPILER_FENCE;
  unsigned aux;
  t = static_cast<uint32_t>(__rdtscp(&aux));
  _mm_lfence();
  HH_COMPILER_FENCE;
#elif HH_ARCH_X64 && (HH_CLANG_VERSION || HH_GCC_VERSION)
  // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
  asm volatile(
      "rdtscp\n\t"
      "lfence"
      : "=a"(t)
      :
      // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
      : "rcx", "rdx", "memory");
#else
  t = static_cast<uint32_t>(Stop<uint64_t>());
#endif
  return t;
}

}  // namespace highwayhash

#endif  // HIGHWAYHASH_TSC_TIMER_H_