File: database-builder.h

package info (click to toggle)
plocate 1.1.8-2+deb11u1
  • links: PTS
  • area: main
  • in suites: bullseye
  • size: 468 kB
  • sloc: cpp: 5,341; sh: 106; makefile: 4
file content (93 lines) | stat: -rw-r--r-- 2,348 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#ifndef _DATABASE_BUILDER_H
#define _DATABASE_BUILDER_H 1

#include "db.h"

#include <chrono>
#include <fcntl.h>
#include <memory>
#include <random>
#include <stddef.h>
#include <string>
#include <utility>
#include <vector>
#include <zstd.h>

class PostingListBuilder;

// A (seconds, nanoseconds) timestamp pair. Two values are reserved:
// {0,0} means unknown or so current that it should never match.
// {-1,0} means it's not a directory.
struct dir_time {
	int64_t sec;
	int32_t nsec;

	// Strict ordering: seconds are compared first, nanoseconds break ties.
	bool operator<(const dir_time &other) const
	{
		if (sec != other.sec)
			return sec < other.sec;
		return nsec < other.nsec;
	}

	// True when *this is equal to or later than other. This has to be the
	// negation of (*this < other); negating (other < *this) instead would
	// compute "<=" and give the wrong answer for any two unequal values.
	bool operator>=(const dir_time &other) const
	{
		return !(*this < other);
	}
};
// Sentinel {0,0}: unknown, or so current that it should never match.
constexpr dir_time unknown_dir_time = { 0, 0 };
// Sentinel {-1,0}: the entry is not a directory.
constexpr dir_time not_a_dir = { -1, 0 };

// Abstract interface for objects that are fed files one at a time via
// add_file(), with flush_block() and finish() as additional hooks.
class DatabaseReceiver {
public:
	virtual ~DatabaseReceiver() = default;

	// Hands over one filename together with its dir_time value.
	virtual void add_file(std::string filename, dir_time dt) = 0;

	// Must be supplied by every implementation; the default finish()
	// below also calls it.
	virtual void flush_block() = 0;

	// Unless overridden, finishing is nothing more than a flush_block().
	virtual void finish()
	{
		flush_block();
	}

	// EncodingCorpus only. The base version returns size_t(-1), i.e. the
	// largest value a size_t can hold.
	virtual size_t num_files_seen() const
	{
		return static_cast<size_t>(-1);
	}
};

// DatabaseReceiver implementation that stores sampled blocks and offers
// train(), which returns a std::string.
// NOTE(review): the member names suggest reservoir sampling of blocks for
// dictionary training; the implementation is not in this header -- confirm.
class DictionaryBuilder : public DatabaseReceiver {
public:
	DictionaryBuilder(size_t blocks_to_keep, size_t block_size)
		: blocks_to_keep{ blocks_to_keep },
		  block_size{ block_size }
	{
	}

	void add_file(std::string filename, dir_time dt) override;
	void flush_block() override;
	std::string train(size_t buf_size);

private:
	// Set once by the constructor and immutable afterwards.
	const size_t blocks_to_keep;
	const size_t block_size;

	std::string current_block;
	uint64_t block_num = 0;
	size_t num_files_in_block = 0;

	// Fixed seed for reproducibility.
	std::mt19937 reservoir_rand{ 1234 };
	bool keep_current_block = true;
	int64_t slot_for_current_block = -1;

	std::vector<std::string> sampled_blocks;
	std::vector<size_t> lengths;
};

class EncodingCorpus;

// Builder object constructed from an output file name, owner gid, block
// size, dictionary and visibility flag. start_corpus() hands back a
// DatabaseReceiver pointer and finish_corpus() is its counterpart.
// NOTE(review): no destructor is declared here although the class holds raw
// pointers (outfp, corpus, cdict); who releases them is not visible in this
// header -- confirm against the implementation.
class DatabaseBuilder {
public:
	DatabaseBuilder(const char *outfile,
	                gid_t owner,
	                int block_size,
	                std::string dictionary,
	                bool check_visibility);

	DatabaseReceiver *start_corpus(bool store_dir_times);
	void set_next_dictionary(std::string next_dictionary);
	void set_conf_block(std::string conf_block);
	void finish_corpus();

private:
	FILE *outfp;
	std::string outfile;
	std::string temp_filename;
	Header hdr;
	const int block_size;
	std::chrono::steady_clock::time_point corpus_start;

	// Both start out null.
	EncodingCorpus *corpus = nullptr;
	ZSTD_CDict *cdict = nullptr;

	// Presumably filled in by set_next_dictionary() / set_conf_block().
	std::string next_dictionary;
	std::string conf_block;
};

#endif  // !defined(_DATABASE_BUILDER_H)