1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201
|
#ifndef __EVENT_COUNTER_H
#define __EVENT_COUNTER_H
#ifndef SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS
#ifdef __aarch64__
// on ARM, we use just cycles and instructions
#define SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS 1
#else
// elsewhere, we try to use five counters (cycles, instructions, branch misses, cache references, cache misses).
#define SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS 0
#endif
#endif
#include <cassert>
#include <cctype>
#ifndef _MSC_VER
#include <dirent.h>
#endif
#include <unistd.h>
#include <cinttypes>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <algorithm>
#include <chrono>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <vector>
#ifdef __linux__
#include "linux-perf-events.h"
#include <libgen.h>
#endif
#if __APPLE__ && __aarch64__
#include "apple/apple_arm_events.h"
#endif
#include "simdjson.h"
using std::string;
using std::vector;
using std::chrono::steady_clock;
using std::chrono::time_point;
using std::chrono::duration;
/**
 * A single measurement: elapsed wall-clock time plus raw counter values.
 *
 * `event_counts` is indexed by `event_counter_types`. A default-constructed
 * instance holds five zeroed slots, but the two-argument constructor accepts
 * a vector of any length, so every member below tolerates short vectors and
 * treats a missing slot as zero instead of indexing out of bounds.
 */
struct event_count {
  duration<double> elapsed;
  vector<unsigned long long> event_counts;

  /// Zero elapsed time and five zeroed counter slots.
  event_count() : elapsed(0), event_counts{0,0,0,0,0} {}
  /// Builds a measurement from an explicit duration and counter vector.
  event_count(const duration<double> _elapsed, const vector<unsigned long long> _event_counts) : elapsed(_elapsed), event_counts(_event_counts) {}
  event_count(const event_count& other): elapsed(other.elapsed), event_counts(other.event_counts) { }

  // The types of counters (so we can read the getter more easily)
#if SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS
  enum event_counter_types {
    CPU_CYCLES,
    INSTRUCTIONS
  };
#else
  enum event_counter_types {
    CPU_CYCLES,
    INSTRUCTIONS,
    BRANCH_MISSES,
    CACHE_REFERENCES,
    CACHE_MISSES
  };
#endif

  /// Value stored in slot `index`, or 0 when the vector has no such slot.
  unsigned long long counter_or_zero(std::size_t index) const {
    return index < event_counts.size() ? event_counts[index] : 0ULL;
  }

  /// Elapsed time in seconds.
  double elapsed_sec() const { return duration<double>(elapsed).count(); }
  /// Elapsed time in nanoseconds.
  double elapsed_ns() const { return duration<double, std::nano>(elapsed).count(); }
  double cycles() const { return static_cast<double>(counter_or_zero(CPU_CYCLES)); }
  double instructions() const { return static_cast<double>(counter_or_zero(INSTRUCTIONS)); }
#if !SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS
  double branch_misses() const { return static_cast<double>(counter_or_zero(BRANCH_MISSES)); }
  double cache_references() const { return static_cast<double>(counter_or_zero(CACHE_REFERENCES)); }
  double cache_misses() const { return static_cast<double>(counter_or_zero(CACHE_MISSES)); }
#endif

  event_count& operator=(const event_count& other) {
    this->elapsed = other.elapsed;
    this->event_counts = other.event_counts;
    return *this;
  }

  /// Element-wise sum of two measurements (elapsed time and every counter).
  event_count operator+(const event_count& other) const {
    event_count sum(*this);
    sum += other;
    return sum;
  }

  /**
   * Accumulates `other` into this measurement in place.
   * If `other` carries more counter slots than this object, the vector is
   * grown with zeros first so that no counter is dropped.
   */
  void operator+=(const event_count& other) {
    elapsed += other.elapsed;
    const std::size_t incoming = other.event_counts.size();
    if (event_counts.size() < incoming) {
      event_counts.resize(incoming, 0ULL);
    }
    for (std::size_t i = 0; i < incoming; ++i) {
      event_counts[i] += other.event_counts[i];
    }
  }
};
/**
 * Running statistics over a series of event_count samples.
 *
 * Keeps the sum of all samples together with the fastest and slowest sample,
 * ranked by elapsed time. The averaging getters divide by `iterations`, so
 * they are only meaningful once at least one sample has been recorded.
 */
struct event_aggregate {
  int iterations = 0;
  event_count total{};
  event_count best{};
  event_count worst{};

  event_aggregate() {}

  /// Records one more sample, updating best, worst, the total and the count.
  void operator<<(const event_count& other) {
    // The very first sample is both the best and the worst seen so far.
    const bool first_sample = (iterations == 0);
    if (first_sample || other.elapsed < best.elapsed) { best = other; }
    if (first_sample || other.elapsed > worst.elapsed) { worst = other; }
    total += other;
    ++iterations;
  }

  /// Mean elapsed time per sample, in seconds.
  double elapsed_sec() const {
    const double sum = total.elapsed_sec();
    return sum / iterations;
  }
  /// Elapsed time summed over all samples, in nanoseconds (not averaged).
  double total_elapsed_ns() const {
    return total.elapsed_ns();
  }
  /// Mean elapsed time per sample, in nanoseconds.
  double elapsed_ns() const {
    const double sum = total.elapsed_ns();
    return sum / iterations;
  }
  /// Mean cycle count per sample.
  double cycles() const {
    const double sum = total.cycles();
    return sum / iterations;
  }
  /// Mean instruction count per sample.
  double instructions() const {
    const double sum = total.instructions();
    return sum / iterations;
  }
#if !SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS
  /// Mean branch-miss count per sample.
  double branch_misses() const {
    const double sum = total.branch_misses();
    return sum / iterations;
  }
  /// Mean cache-reference count per sample.
  double cache_references() const {
    const double sum = total.cache_references();
    return sum / iterations;
  }
  /// Mean cache-miss count per sample.
  double cache_misses() const {
    const double sum = total.cache_misses();
    return sum / iterations;
  }
#endif
};
/**
 * Measures the region between start() and the matching end().
 *
 * Wall-clock time is always recorded with steady_clock. Counter values are
 * additionally gathered through LinuxEvents on Linux or AppleEvents on
 * Apple/aarch64; on any other platform only the elapsed time is filled in.
 *
 * The same `__linux__` macro guards the member declaration and its uses in
 * start()/end(), so the perf object is never constructed without also being
 * started and read.
 */
struct event_collector {
  event_count count{};
  time_point<steady_clock> start_clock{};

#if defined(__linux__)
  LinuxEvents<PERF_TYPE_HARDWARE> linux_events;
  /// Requests the hardware events in the order of event_count's counter slots.
  event_collector() : linux_events(vector<int>{
  #if SIMDJSON_SIMPLE_PERFORMANCE_COUNTERS
    PERF_COUNT_HW_CPU_CYCLES,
    PERF_COUNT_HW_INSTRUCTIONS,
  #else
    PERF_COUNT_HW_CPU_CYCLES,
    PERF_COUNT_HW_INSTRUCTIONS,
    PERF_COUNT_HW_BRANCH_MISSES,
    PERF_COUNT_HW_CACHE_REFERENCES,
    PERF_COUNT_HW_CACHE_MISSES
  #endif
  }) {}
  /// True when the underlying LinuxEvents object reports that it is working.
  bool has_events() {
    return linux_events.is_working();
  }
#elif __APPLE__ && __aarch64__
  AppleEvents apple_events;
  // Holds the counters sampled by start(); end() overwrites it with the delta.
  performance_counters diff;
  event_collector() : diff(0) {
    apple_events.setup_performance_counters();
  }
  /// Forwards the result of AppleEvents::setup_performance_counters().
  bool has_events() {
    return apple_events.setup_performance_counters();
  }
#else
  event_collector() {}
  /// No counter backend on this platform.
  bool has_events() {
    return false;
  }
#endif

  /// Begins a measurement: arms the counters, then timestamps.
  simdjson_inline void start() {
#if defined(__linux__)
    linux_events.start();
#elif __APPLE__ && __aarch64__
    if(has_events()) { diff = apple_events.get_counters(); }
#endif
    // The clock is read last so the timed interval lies inside the counted one.
    start_clock = steady_clock::now();
  }

  /**
   * Ends the measurement started by start().
   * @return reference to the internal `count`, which is overwritten by the
   *         next end() call.
   */
  simdjson_inline event_count& end() {
    // The clock is read first, before any counter bookkeeping.
    time_point<steady_clock> end_clock = steady_clock::now();
#if defined(__linux__)
    linux_events.end(count.event_counts);
#elif __APPLE__ && __aarch64__
    if(has_events()) {
      performance_counters end = apple_events.get_counters();
      diff = end - diff;
    }
    count.event_counts[0] = diff.cycles;
    count.event_counts[1] = diff.instructions;
    count.event_counts[2] = diff.missed_branches;
    // Slots 3 and 4 are not populated on this path.
    count.event_counts[3] = 0;
    count.event_counts[4] = 0;
#endif
    count.elapsed = end_clock - start_clock;
    return count;
  }
};
#endif
|