1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278
|
#include "database-builder.h"
#include "db.h"
#include "dprintf.h"
#include <algorithm>
#include <arpa/inet.h>
#include <assert.h>
#include <chrono>
#include <getopt.h>
#include <iosfwd>
#include <locale.h>
#include <math.h>
#include <memory>
#include <random>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include <sys/stat.h>
#include <utility>
#include <vector>
using namespace std;
using namespace std::chrono;
// Debug switch. main() sets it to true when the undocumented --debug / -D
// option is given; it is not read anywhere in the code visible here.
// NOTE(review): presumably consulted by dprintf() from dprintf.h -- confirm.
bool use_debug = false;
// Type tags that precede each entry inside a directory record.
// handle_directory() compares the byte returned by getc() against these,
// so the numeric values (0, 1, 2) are significant.
enum {
	DBE_NORMAL,     // 0: a non-directory file
	DBE_DIRECTORY,  // 1: a directory
	DBE_END         // 2: end of directory contents; contains no name
};
// From mlocate.
//
// Fixed-size header found at the very start of the input database.
// read_mlocate() fills it with a single fread() of sizeof(db_header) bytes,
// so the member order and types must not be changed.
struct db_header {
// Leading signature bytes. Not validated by the code visible in this file.
uint8_t magic[8];
// Size in bytes of the configuration block that read_mlocate() skips over.
// It is passed through ntohl() before use, i.e. treated as network byte order.
uint32_t conf_size;
// Not consulted by the code visible in this file.
uint8_t version;
// Not consulted by the code visible in this file.
// NOTE(review): presumably mlocate's own visibility flag -- confirm against
// the mlocate.db format description.
uint8_t check_visibility;
uint8_t pad[2];
};
// From mlocate.
//
// Fixed-size header that starts every directory record. handle_directory()
// reads one with fread() purely to advance past it; none of the members are
// inspected afterwards, but the struct size must stay unchanged so that the
// right number of bytes is consumed.
struct db_directory {
// NOTE(review): time_sec / time_nsec look like the directory's timestamp
// split into seconds and nanoseconds -- confirm against the mlocate.db
// format description; this file never reads them.
uint64_t time_sec;
uint32_t time_nsec;
uint8_t pad[4];
};
// Reads bytes from fp up to and including the next NUL byte and returns them
// as a string, without the terminator.
//
// Never returns a truncated string: if the stream ends or fails before a NUL
// byte is seen, a diagnostic is written to stderr and the process exits with
// status 1.
string read_cstr(FILE *fp)
{
	string ret;
	for (;;) {
		int ch = getc(fp);
		if (ch == EOF) {
			// getc() returns EOF both for a read error and for a plain
			// end-of-file, but only an error sets errno. Report the two
			// cases separately so that a truncated file does not produce
			// a misleading perror() message built from a stale errno.
			if (ferror(fp)) {
				perror("getc");
			} else {
				fprintf(stderr, "getc: unexpected end of file\n");
			}
			exit(1);
		}
		if (ch == 0) {
			return ret;
		}
		ret.push_back(static_cast<char>(ch));
	}
}
void handle_directory(FILE *fp, DatabaseReceiver *receiver)
{
db_directory dummy;
if (fread(&dummy, sizeof(dummy), 1, fp) != 1) {
if (feof(fp)) {
return;
} else {
perror("fread");
}
}
string dir_path = read_cstr(fp);
if (dir_path == "/") {
dir_path = "";
}
for (;;) {
int type = getc(fp);
if (type == DBE_NORMAL) {
string filename = read_cstr(fp);
receiver->add_file(dir_path + "/" + filename, unknown_dir_time);
} else if (type == DBE_DIRECTORY) {
string dirname = read_cstr(fp);
receiver->add_file(dir_path + "/" + dirname, unknown_dir_time);
} else {
return; // Probably end.
}
}
}
// Feeds a plaintext file list to receiver, one add_file() call per line.
//
// The stream is first rewound to the beginning (exit status 1 if that fails).
// Lines may be arbitrarily long; they are assembled from fixed-size fgets()
// reads. A trailing newline, if present, is removed before the line is
// handed over, and every entry is reported with unknown_dir_time.
void read_plaintext(FILE *fp, DatabaseReceiver *receiver)
{
	if (fseek(fp, 0, SEEK_SET) != 0) {
		perror("fseek");
		exit(1);
	}

	char chunk[1024];
	while (!feof(fp) && fgets(chunk, sizeof(chunk), fp) != nullptr) {
		string line = chunk;
		assert(!line.empty());

		// No newline yet means the line did not fit in one chunk (or the
		// file ends without one); keep appending until it is complete.
		while (line.back() != '\n' && !feof(fp) &&
		       fgets(chunk, sizeof(chunk), fp) != nullptr) {
			line.append(chunk);
		}

		if (!line.empty() && line.back() == '\n') {
			line.pop_back();
		}
		receiver->add_file(std::move(line), unknown_dir_time);
	}
}
void read_mlocate(FILE *fp, DatabaseReceiver *receiver)
{
if (fseek(fp, 0, SEEK_SET) != 0) {
perror("fseek");
exit(1);
}
db_header hdr;
if (fread(&hdr, sizeof(hdr), 1, fp) != 1) {
perror("short read");
exit(1);
}
// TODO: Care about the base path.
string path = read_cstr(fp);
if (fseek(fp, ntohl(hdr.conf_size), SEEK_CUR) != 0) {
perror("skip conf block");
exit(1);
}
while (!feof(fp)) {
handle_directory(fp, receiver);
}
}
void do_build(const char *infile, const char *outfile, int block_size, bool plaintext, bool check_visibility)
{
FILE *infp = fopen(infile, "rb");
if (infp == nullptr) {
perror(infile);
exit(1);
}
// Train the dictionary by sampling real blocks.
// The documentation for ZDICT_trainFromBuffer() claims that a reasonable
// dictionary size is ~100 kB, but 1 kB seems to actually compress better for us,
// and decompress just as fast.
DictionaryBuilder builder(/*blocks_to_keep=*/1000, block_size);
if (plaintext) {
read_plaintext(infp, &builder);
} else {
read_mlocate(infp, &builder);
}
string dictionary = builder.train(1024);
DatabaseBuilder db(outfile, /*owner=*/-1, block_size, dictionary, check_visibility);
DatabaseReceiver *corpus = db.start_corpus(/*store_dir_times=*/false);
if (plaintext) {
read_plaintext(infp, corpus);
} else {
read_mlocate(infp, corpus);
}
fclose(infp);
dprintf("Read %zu files from %s\n", corpus->num_files_seen(), infile);
db.finish_corpus();
}
// Writes the command-line help text to standard output.
void usage()
{
	// The text contains no format specifiers, so it is emitted verbatim.
	static const char help_text[] =
		"Usage: plocate-build MLOCATE_DB PLOCATE_DB\n"
		"\n"
		"Generate plocate index from mlocate.db, typically /var/lib/mlocate/mlocate.db.\n"
		"Normally, the destination should be /var/lib/mlocate/plocate.db.\n"
		"\n"
		" -b, --block-size SIZE number of filenames to store in each block (default 32)\n"
		" -p, --plaintext input is a plaintext file, not an mlocate database\n"
		" -l, --require-visibility FLAG check visibility before reporting files\n"
		" --help print this help\n"
		" --version print version information\n";
	fputs(help_text, stdout);
}
// Writes the program name with PACKAGE_VERSION, followed by the copyright
// and license notice, to standard output.
void version()
{
	printf("plocate-build %s\n", PACKAGE_VERSION);

	// The remaining lines are fixed text without format specifiers.
	fputs(
		"Copyright 2020 Steinar H. Gunderson\n"
		"License GPLv2+: GNU GPL version 2 or later <https://gnu.org/licenses/gpl.html>.\n"
		"This is free software: you are free to change and redistribute it.\n"
		"There is NO WARRANTY, to the extent permitted by law.\n",
		stdout);
}
// Interprets str as a boolean flag value.
//
// Accepts exactly "0" / "no" (false) and "1" / "yes" (true); matching is
// case-sensitive. On success the value is stored in *result and true is
// returned. On any other input *result is left untouched and false is
// returned.
bool parse_bool(const string &str, bool *result)
{
	const bool means_false = (str == "0" || str == "no");
	const bool means_true = (str == "1" || str == "yes");
	if (!means_false && !means_true) {
		return false;
	}
	*result = means_true;
	return true;
}
// Program entry point: parses the command line and runs do_build().
//
// Options:
//   -b, --block-size SIZE          positive integer, default 32
//   -p, --plaintext                input is a plaintext list
//   -l, --require-visibility FLAG  "0"/"no"/"1"/"yes", default true
//   -h, --help / -V, --version     print the text and exit with status 0
//   -D, --debug                    sets use_debug (not documented)
//
// Exactly two positional arguments (input and output path) are required;
// otherwise the usage text is printed and the process exits with status 1.
// An invalid option value exits with EXIT_FAILURE, an unrecognized option
// with status 1.
int main(int argc, char **argv)
{
	static const struct option long_options[] = {
		{ "block-size", required_argument, 0, 'b' },
		{ "plaintext", no_argument, 0, 'p' },
		{ "require-visibility", required_argument, 0, 'l' },
		{ "help", no_argument, 0, 'h' },
		{ "version", no_argument, 0, 'V' },
		{ "debug", no_argument, 0, 'D' },  // Not documented.
		{ 0, 0, 0, 0 }
	};

	int block_size = 32;
	bool plaintext = false;
	bool check_visibility = true;

	setlocale(LC_ALL, "");
	for (;;) {
		int option_index = 0;
		int c = getopt_long(argc, argv, "b:hpl:VD", long_options, &option_index);
		if (c == -1) {
			break;
		}
		switch (c) {
		case 'b': {
			// atoi() would silently turn garbage into 0 and accept negative
			// numbers, neither of which is a usable block size. Parse
			// strictly and reject anything that is not a positive integer
			// fitting in an int.
			char *end = nullptr;
			const long requested = strtol(optarg, &end, 10);
			const int narrowed = static_cast<int>(requested);
			if (end == optarg || *end != '\0' || requested <= 0 || narrowed != requested) {
				fprintf(stderr, "plocate-build: invalid value `%s' for --%s\n",
				        optarg, "block-size");
				exit(EXIT_FAILURE);
			}
			block_size = narrowed;
			break;
		}
		case 'p':
			plaintext = true;
			break;
		case 'l':
			if (!parse_bool(optarg, &check_visibility)) {
				fprintf(stderr, "plocate-build: invalid value `%s' for --%s\n",
				        optarg, "require-visibility");
				exit(EXIT_FAILURE);
			}
			break;
		case 'h':
			usage();
			exit(0);
		case 'V':
			version();
			exit(0);
		case 'D':
			use_debug = true;
			break;
		default:
			exit(1);
		}
	}

	if (argc - optind != 2) {
		usage();
		exit(1);
	}

	do_build(argv[optind], argv[optind + 1], block_size, plaintext, check_visibility);
	exit(EXIT_SUCCESS);
}
|