#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include "simdjson.h"
#define NB_ITERATION 20
#define MIN_BATCH_SIZE 10000
#define MAX_BATCH_SIZE 10000000
bool test_baseline = false;
bool test_per_batch = true;
bool test_best_batch = false;
bool compare(std::pair<size_t, double> i, std::pair<size_t, double> j) {
return i.second > j.second;
}
int main(int argc, char *argv[]) {
if (argc <= 1) {
std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
exit(1);
}
const char *filename = argv[1];
simdjson::padded_string p;
if (simdjson::padded_string::load(filename).get(p)) {
std::cerr << "Could not load the file " << filename << std::endl;
return EXIT_FAILURE;
}
if (test_baseline) {
std::wclog << "Baseline: Getline + normal parse... " << std::endl;
std::cout << "Gigabytes/second\t"
<< "Nb of documents parsed" << std::endl;
for (auto i = 0; i < 3; i++) {
// Actual test
simdjson::dom::parser parser;
simdjson::error_code alloc_error = parser.allocate(p.size());
if (alloc_error) {
std::cerr << alloc_error << std::endl;
return EXIT_FAILURE;
}
std::istringstream ss(std::string(p.data(), p.size()));
auto start = std::chrono::steady_clock::now();
int count = 0;
std::string line;
int parse_res = simdjson::SUCCESS;
while (getline(ss, line)) {
// TODO we're likely triggering simdjson's padding reallocation here. Is
// that intentional?
parser.parse(line);
count++;
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> secs = end - start;
double speedinGBs = static_cast<double>(p.size()) /
(static_cast<double>(secs.count()) * 1000000000.0);
std::cout << speedinGBs << "\t\t\t\t" << count << std::endl;
if (parse_res != simdjson::SUCCESS) {
std::cerr << "Parsing failed" << std::endl;
exit(1);
}
}
}
std::map<size_t, double> batch_size_res;
if (test_per_batch) {
std::wclog << "parse_many: Speed per batch_size... from " << MIN_BATCH_SIZE
<< " bytes to " << MAX_BATCH_SIZE << " bytes..." << std::endl;
std::cout << "Batch Size\t"
<< "Gigabytes/second\t"
<< "Nb of documents parsed" << std::endl;
for (size_t i = MIN_BATCH_SIZE; i <= MAX_BATCH_SIZE;
i += (MAX_BATCH_SIZE - MIN_BATCH_SIZE) / 100) {
batch_size_res.insert(std::pair<size_t, double>(i, 0));
int count;
for (size_t j = 0; j < 5; j++) {
// Actual test
simdjson::dom::parser parser;
simdjson::error_code error;
auto start = std::chrono::steady_clock::now();
count = 0;
simdjson::dom::document_stream docs;
if ((error = parser.parse_many(p, i).get(docs))) {
std::wcerr << "Parsing failed with: " << error << std::endl;
exit(1);
}
for (auto result : docs) {
error = result.error();
if (error) {
std::wcerr << "Parsing failed with: " << error << std::endl;
exit(1);
}
count++;
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> secs = end - start;
double speedinGBs = static_cast<double>(p.size()) /
(static_cast<double>(secs.count()) * 1000000000.0);
if (speedinGBs > batch_size_res.at(i))
batch_size_res[i] = speedinGBs;
}
std::cout << i << "\t\t" << std::fixed << std::setprecision(3)
<< batch_size_res.at(i) << "\t\t\t\t" << count << std::endl;
}
}
size_t optimal_batch_size{};
double best_speed{};
if (test_per_batch) {
std::pair<size_t, double> best_results;
best_results =
(*min_element(batch_size_res.begin(), batch_size_res.end(), compare));
optimal_batch_size = best_results.first;
best_speed = best_results.second;
} else {
optimal_batch_size = MIN_BATCH_SIZE;
}
std::wclog << "Seemingly optimal batch_size: " << optimal_batch_size << "..."
<< std::endl;
std::wclog << "Best speed: " << best_speed << "..." << std::endl;
if (test_best_batch) {
std::wclog << "Starting speed test... Best of " << NB_ITERATION
<< " iterations..." << std::endl;
std::vector<double> res;
for (int i = 0; i < NB_ITERATION; i++) {
// Actual test
simdjson::dom::parser parser;
simdjson::error_code error;
auto start = std::chrono::steady_clock::now();
// This includes allocation of the parser
simdjson::dom::document_stream docs;
if ((error = parser.parse_many(p, optimal_batch_size).get(docs))) {
std::wcerr << "Parsing failed with: " << error << std::endl;
exit(1);
}
for (auto result : docs) {
error = result.error();
if (error) {
std::wcerr << "Parsing failed with: " << error << std::endl;
exit(1);
}
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> secs = end - start;
res.push_back(secs.count());
}
double min_result = *min_element(res.begin(), res.end());
double speedinGBs =
static_cast<double>(p.size()) / (min_result * 1000000000.0);
std::cout << "Min: " << min_result << " bytes read: " << p.size()
<< " Gigabytes/second: " << speedinGBs << std::endl;
}
#ifdef SIMDJSON_THREADS_ENABLED
// Multithreading probably does not help matters for small files (less than 10
// MB).
if (p.size() < 10000000) {
std::cout << std::endl;
std::cout << "Warning: your file is small and the performance results are "
"probably meaningless"
<< std::endl;
std::cout << "as far as multithreaded performance goes." << std::endl;
std::cout << std::endl;
std::cout
<< "Try to concatenate the file with itself to generate a large one."
<< std::endl;
std::cout << "In bash: " << std::endl;
std::cout << "for i in {1..1000}; do cat '" << filename
<< "' >> bar.ndjson; done" << std::endl;
std::cout << argv[0] << " bar.ndjson" << std::endl;
}
#endif
return 0;
}