File: bench.cpp

Package: plocate 1.1.24-1
#include <algorithm>
#include <chrono>
#include <fcntl.h>
#include <memory>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <unistd.h>
#include <utility>
#include <vector>

#define dprintf(...)
//#define dprintf(...) fprintf(stderr, __VA_ARGS__);

#include "complete_pread.h"
#include "db.h"
#include "io_uring_engine.h"
#include "turbopfor-encode.h"
#include "turbopfor.h"
#include "vp4.h"

using namespace std;
using namespace std::chrono;

bool use_debug = false;

int main(void)
{
	int fd = open("plocate.db", O_RDONLY);
	if (fd == -1) {
		perror("plocate.db");
		exit(1);
	}

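	// Read the fixed-size database header; it tells us how large the trigram
	// hash table is and where it starts in the file.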
	Header hdr;
	complete_pread(fd, &hdr, sizeof(hdr), /*offset=*/0);

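	// Load the entire trigram hash table. There is one extra slot at the end,
	// so that the length of each posting list can be computed as the difference
	// between consecutive offsets.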
	unique_ptr<Trigram[]> ht(new Trigram[hdr.hashtable_size + hdr.extra_ht_slots + 1]);
	complete_pread(fd, ht.get(), (hdr.hashtable_size + hdr.extra_ht_slots + 1) * sizeof(Trigram), hdr.hash_table_offset_bytes);

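	// Read every non-empty posting list into memory up front, so that the
	// decode benchmarks below measure CPU time only and not disk I/O.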
	size_t posting_list_bytes = 0, own_posting_list_bytes = 0, total_elements = 0, most_bytes_pl = 0;
	uint32_t longest_pl = 0;
	vector<pair<string, unsigned>> posting_lists;
	for (unsigned i = 0; i < hdr.hashtable_size + hdr.extra_ht_slots; ++i) {
		if (ht[i].num_docids == 0) {
			continue;
		}
		size_t len = ht[i + 1].offset - ht[i].offset;
		string str;
		str.resize(len);
		complete_pread(fd, &str[0], len, ht[i].offset);
		posting_lists.emplace_back(move(str), ht[i].num_docids);
		longest_pl = std::max(ht[i].num_docids, longest_pl);
		most_bytes_pl = std::max(len, most_bytes_pl);
		posting_list_bytes += len;
		total_elements += ht[i].num_docids;
	}
	ht.reset();
	fprintf(stderr, "Read %zu posting lists.\n", posting_lists.size());

	string encoded_pl;
	encoded_pl.resize(longest_pl * 2 + 16384);  // Lots of margin.

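	// Correctness check: decode each posting list with both the TurboPFor
	// reference decoder (p4nd1dec128v32) and our own decode_pfor_delta1_128,
	// and compare the outputs element by element.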
	size_t num_decode_errors = 0, num_encode_errors = 0;
	for (auto &[pl, num_docids] : posting_lists) {
		//fprintf(stderr, "%zu bytes, %u docids\n", pl.size(), num_docids);
		vector<uint32_t> out1, out2;
		out1.resize(num_docids + 128);
		out2.resize(num_docids + 128);
		unsigned char *pldata = reinterpret_cast<unsigned char *>(&pl[0]);
		p4nd1dec128v32(pldata, num_docids, &out1[0]);
		decode_pfor_delta1_128(pldata, num_docids, /*interleaved=*/true, &out2[0]);
		for (unsigned i = 0; i < num_docids; ++i) {
			if (out1[i] != out2[i]) {
				if (++num_decode_errors < 10) {
					fprintf(stderr, "Decode error:\n");
					for (unsigned j = 0; j < num_docids; ++j) {
						fprintf(stderr, "%3u: reference=%u ours=%u  (diff=%d)\n", j, out1[j], out2[j], int(out1[j] - out2[j]));
					}
				}
				break;
			}
		}

		// Test encoding: encode with our own implementation, then check that
		// decoding with the reference implementation gives back the same result.
		// We do not measure performance here (our encoder is slow).
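		// Docids are strictly increasing, so the first value is written verbatim
		// (write_baseval) and each 128-element block then stores deltas minus one.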
		uint32_t deltas[128];
		unsigned char *ptr = reinterpret_cast<unsigned char *>(&encoded_pl[0]);
		ptr = write_baseval(out1[0], ptr);
		for (unsigned i = 1; i < num_docids; i += 128) {
			unsigned num_docids_this_block = std::min(num_docids - i, 128u);
			for (unsigned j = 0; j < num_docids_this_block; ++j) {
				deltas[j] = out1[i + j] - out1[i + j - 1] - 1;
			}
			bool interleaved = (num_docids_this_block == 128);
			ptr = encode_pfor_single_block<128>(deltas, num_docids_this_block, interleaved, ptr);
		}
		own_posting_list_bytes += ptr - reinterpret_cast<unsigned char *>(&encoded_pl[0]);

		pldata = reinterpret_cast<unsigned char *>(&encoded_pl[0]);
		p4nd1dec128v32(pldata, num_docids, &out2[0]);
		for (unsigned i = 0; i < num_docids; ++i) {
			if (out1[i] != out2[i]) {
				if (++num_encode_errors < 10) {
					fprintf(stderr, "Encode error:\n");
					for (unsigned j = 0; j < num_docids; ++j) {
						fprintf(stderr, "%3u: reference=%u ours=%u  (diff=%d)\n", j, out1[j], out2[j], int(out1[j] - out2[j]));
					}
				}
				break;
			}
		}
	}
	fprintf(stderr, "%zu/%zu posting lists had errors in decoding.\n", num_decode_errors, posting_lists.size());
	fprintf(stderr, "%zu/%zu posting lists had errors in encoding.\n", num_encode_errors, posting_lists.size());

	// Benchmark.
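	// Decode every posting list once with each implementation and compare wall-clock time.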
	vector<uint32_t> dummy;
	dummy.resize(longest_pl + 128);
	steady_clock::time_point start = steady_clock::now();
	for (auto &[pl, num_docids] : posting_lists) {
		unsigned char *pldata = reinterpret_cast<unsigned char *>(&pl[0]);
		p4nd1dec128v32(pldata, num_docids, &dummy[0]);
	}
	steady_clock::time_point end = steady_clock::now();
	double reference_sec = duration<double>(end - start).count();
	fprintf(stderr, "Decoding with reference implementation: %.1f ms\n", 1e3 * reference_sec);

	start = steady_clock::now();
	for (auto &[pl, num_docids] : posting_lists) {
		unsigned char *pldata = reinterpret_cast<unsigned char *>(&pl[0]);
		decode_pfor_delta1_128(pldata, num_docids, /*interleaved=*/true, &dummy[0]);
	}
	end = steady_clock::now();
	double own_sec = duration<double>(end - start).count();
	fprintf(stderr, "Decoding with own implementation: %.3f ms (%.2f%% speed)\n", 1e3 * own_sec, 100.0 * reference_sec / own_sec);
	fprintf(stderr, "Size with own implementation: %.1f MB (%.2f%% of reference, %+d bytes)\n", own_posting_list_bytes / 1048576.0, 100.0 * own_posting_list_bytes / posting_list_bytes, int(own_posting_list_bytes) - int(posting_list_bytes));

	// Three numbers giving rules of thumb for judging our own implementation:
	//
	// - Compressed speed is easy to compare to disk I/O, to see the relative importance
	// - Uncompressed speed is easy to compare to intersection speeds and memory bandwidth
	//   (also very roughly comparable to the benchmark numbers in the TurboPFor README)
	// - ns/element gives an absolute measure for plocate (e.g. if we can decompress at
	//   1 ns/element, a 10k-element posting list goes by in 0.01 ms, which is way beyond
	//   instantaneous in practice).
	fprintf(stderr, "%.1f MB/sec (compressed), %.1f MB/sec (uncompressed), %.1f ns/element\n", posting_list_bytes / own_sec / 1048576.0,
	        (total_elements * sizeof(uint32_t)) / own_sec / 1048576.0, 1e9 * own_sec / total_elements);
}