File: snappy.cpp

package info (click to toggle)
snappy-tools 1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 168 kB
  • sloc: cpp: 442; sh: 217; makefile: 14
file content (591 lines) | stat: -rw-r--r-- 23,315 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
// SPDX-License-Identifier: 0BSD


#include <algorithm>
#include <atomic>
#include <clocale>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <endian.h>
#include <err.h>
#include <pthread.h>
#include <snappy-sinksource.h>
#include <snappy.h>
#include <string_view>
#include <sys/mman.h>
#include <unistd.h>
#if __has_include(<libintl.h>)
#include <libintl.h>
#else
#define gettext(s) (s)
#define ngettext(s, p, n) (n == 1 ? s : p)
#endif

using namespace std::literals;


#if __i386__ || __x86_64__  // using function multiversioning (or manual cpuid check)
#include <nmmintrin.h>
#if(__linux__ && !__GLIBC__) || __OpenBSD__  // cpuid
#include <cpuid.h>
#define CRC32C_GENERIC_ATTR
#define CRC32C_GENERIC_NAME crc32c_generic
#define PARALLEL_COMPRESSION_CRC32C parallel_compression_crc32c_generic
#else  // multiversioning
#define MULTIVER 1
#define CRC32C_GENERIC_ATTR [[gnu::target("default")]]
#define CRC32C_GENERIC_NAME crc32c
#define PARALLEL_COMPRESSION_CRC32C parallel_compression_crc32c
#endif
#elif __aarch64__  // accelerated by default (CRC nominally optional on v8, required on v8.1)
#include <arm_acle.h>
#define CRC32C_GENERIC_ATTR [[maybe_unused]]
#define CRC32C_GENERIC_NAME crc32c_generic
#define PARALLEL_COMPRESSION_CRC32C parallel_compression_crc32c_generic
#elif __loongarch64  // manual cpucfg check in static init
#include <larchintrin.h>
#define CRC32C_GENERIC_ATTR
#define CRC32C_GENERIC_NAME crc32c_generic
#define PARALLEL_COMPRESSION_CRC32C parallel_compression_crc32c  // TODO: measure on hardware! defaults to true, which wins in QEMU
#else
#define CRC32C_GENERIC_ATTR
#define CRC32C_GENERIC_NAME crc32c
#define PARALLEL_COMPRESSION_CRC32C parallel_compression_crc32c
#endif


namespace {
	// Portable, table-driven CRC-32C update step.
	// Folds the data_len bytes starting at data into the running remainder cur, one byte per
	// iteration, and returns the new remainder. No inversion happens here: callers in this
	// file seed the remainder with ~0 and post-process the result with crc32c_finish().
	CRC32C_GENERIC_ATTR std::uint32_t CRC32C_GENERIC_NAME(std::uint32_t cur, const void * data, std::size_t data_len) {
		// One entry per possible byte value
		static const constexpr std::uint32_t crc_lut[] = {
		    0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB, 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B,
		    0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24, 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B, 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384,
		    0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B, 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A,
		    0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35, 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5, 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA,
		    0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45, 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A, 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A,
		    0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595, 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48, 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957,
		    0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198, 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927,
		    0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38, 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8, 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7,
		    0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096, 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789, 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859,
		    0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46, 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9, 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6,
		    0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829, 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C,
		    0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93, 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043, 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C,
		    0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3, 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC, 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C,
		    0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033, 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652, 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D,
		    0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982, 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D,
		    0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622, 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2, 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED,
		    0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F, 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF,
		    0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0, 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F, 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540,
		    0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90, 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F, 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE,
		    0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1, 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E,
		    0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E, 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E,
		    0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351};

		const auto * octets = static_cast<const std::uint8_t *>(data);
		for(std::size_t i = 0; i < data_len; ++i) {
			// The low byte of the remainder, mixed with the next input byte, picks the table slot
			const std::uint8_t slot = static_cast<std::uint8_t>(cur) ^ octets[i];
			cur = crc_lut[slot] ^ (cur >> 8);
		}
		return cur;
	}

	// Threading policy that accompanies the table-driven checksum: returns true.
	// main() reads the selected policy through parallel_compression_crc32c(); a true result makes
	// framed compression compute each chunk's checksum on a helper thread while the chunk is compressed.
	CRC32C_GENERIC_ATTR bool PARALLEL_COMPRESSION_CRC32C() {
		return true;
	}


#if __i386__ || __x86_64__
#if MULTIVER
#define CRC32Q(sym) sym
#else
#define CRC32Q(sym) sym##_sse42
#endif


	// CRC-32C update step built on the SSE4.2 CRC32 instruction.
	// Consumes the input in register-sized words (8 bytes on x86-64, 4 on i386) followed by the
	// remaining tail bytes, and returns the new running remainder (no inversion, like the generic version).
	//
	// data may have any alignment (callers pass e.g. buf + 4), so words are fetched with std::memcpy
	// rather than by dereferencing a casted pointer: the latter is undefined behaviour for
	// misaligned addresses, whereas the memcpy compiles down to the same plain load.
	[[gnu::target("sse4.2")]] std::uint32_t CRC32Q(crc32c)(std::uint32_t cur_r, const void * data, std::size_t data_len) {
#if __x86_64__  // this isn't a simple std::size_t+#if _LP64 because x32 is amd64 ILP32
		using sse42_t = std::uint64_t;
#define _mm_crc32_big _mm_crc32_u64
#else
		using sse42_t = std::uint32_t;
#define _mm_crc32_big _mm_crc32_u32
#endif

		sse42_t cur = cur_r;

		auto bytes = static_cast<const std::uint8_t *>(data);
		for(auto bulk = data_len / sizeof(sse42_t); bulk; --bulk, bytes += sizeof(sse42_t)) {
			sse42_t word;
			std::memcpy(&word, bytes, sizeof(word));
			cur = _mm_crc32_big(cur, word);
		}

		for(auto single = data_len % sizeof(sse42_t); single; --single, ++bytes)
			cur = _mm_crc32_u8(cur, *bytes);

		return cur;
	}

	// Threading policy that accompanies the SSE4.2 checksum: returns false, so when this variant
	// is the one selected main() computes each chunk's checksum inline instead of on a helper thread.
	[[gnu::target("sse4.2")]] bool CRC32Q(parallel_compression_crc32c)() {
		return false;
	}

#if !MULTIVER
	// Probed once during static initialisation: CPUID leaf 1 has to be available and bit 20 of
	// its third output register (ECX) has to be set for the accelerated checksum to be used.
	const bool have_crc32q = [] {
		constexpr std::uint32_t crc32_flag = std::uint32_t{1} << 20;
		std::uint32_t eax, ebx, ecx, edx;
		if(!__get_cpuid_count(1, 0, &eax, &ebx, &ecx, &edx))
			return false;
		return (ecx & crc32_flag) != 0;
	}();
	// Manual dispatch: both entry points are function pointers bound to either the SSE4.2 or the generic variant
	const auto crc32c                      = have_crc32q ? &CRC32Q(crc32c) : &CRC32C_GENERIC_NAME;
	const auto parallel_compression_crc32c = have_crc32q ? &CRC32Q(parallel_compression_crc32c) : &PARALLEL_COMPRESSION_CRC32C;
#endif
#elif __aarch64__
	// CRC-32C update step built on the AArch64 CRC32C instructions.
	// Consumes the input as 8-byte words followed by the remaining tail bytes and returns the
	// new running remainder (no inversion, like the generic version).
	//
	// data may have any alignment (callers pass e.g. buf + 4), so words are fetched with std::memcpy
	// rather than by dereferencing a casted std::uint64_t pointer, which is undefined behaviour
	// for misaligned addresses.
	[[gnu::target("+crc")]] std::uint32_t crc32c(std::uint32_t cur, const void * data, std::size_t data_len) {
		auto bytes = static_cast<const std::uint8_t *>(data);
		for(auto bulk = data_len / sizeof(std::uint64_t); bulk; --bulk, bytes += sizeof(std::uint64_t)) {
			std::uint64_t word;
			std::memcpy(&word, bytes, sizeof(word));
			cur = __crc32cd(cur, word);
		}

		for(auto single = data_len % sizeof(std::uint64_t); single; --single, ++bytes)
			cur = __crc32cb(cur, *bytes);

		return cur;
	}

	// Threading policy for the instruction-accelerated checksum: returns false, so main()
	// computes each chunk's checksum inline instead of on a helper thread.
	bool parallel_compression_crc32c() {
		return false;
	}
#elif __loongarch64
	// CRC-32C update step built on the LoongArch CRCC instructions.
	// Consumes the input as long-int-sized words followed by the remaining tail bytes and returns
	// the new running remainder (no inversion, like the generic version).
	//
	// The intrinsics work on signed int remainders, hence the memcpy round-trip for cur_r.
	// data may have any alignment (callers pass e.g. buf + 4), so words are likewise fetched with
	// std::memcpy rather than by dereferencing a casted long int pointer, which is undefined
	// behaviour for misaligned addresses.
	std::uint32_t crc32c_crcc(std::uint32_t cur_r, const void * data, std::size_t data_len) {
		int cur;
		std::memcpy(&cur, &cur_r, sizeof(std::uint32_t));

		auto bytes = static_cast<const char *>(data);
		for(auto bulk = data_len / sizeof(long int); bulk; --bulk, bytes += sizeof(long int)) {
			long int word;
			std::memcpy(&word, bytes, sizeof(word));
			cur = __crcc_w_d_w(word, cur);
		}

		for(auto single = data_len % sizeof(long int); single; --single, ++bytes)
			cur = __crcc_w_b_w(*bytes, cur);

		std::memcpy(&cur_r, &cur, sizeof(std::uint32_t));
		return cur_r;
	}

	// bool parallel_compression_crc32c() {
	// 	// TODO: measure on hardware! defaults to true, which wins in QEMU
	// }

	// As of 2024-01-14, the official upstream ISA manual (https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#_cpucfg) says:
	//   Table 3. The configuration information accessible by the CPUCFG instruction
	//   Word number	Bit number	Annotation	Implication
	//           0x1         25   IOCSR_BRD   1 indicates that the string of processor product information is recorded at address 0 of the IOCSR access space
	//
	//                                        That is, information such as “Loongson3A5000 @2.5GHz”
	//
	// This is corroborated by the manuals for the 3A5000/3B5000, 3C5000, and 3D5000
	//   https://github.com/loongson-community/docs/blob/master/3A5000/Loongson3A5000_3B5000%20user%20book_V1.3.pdf
	//   https://github.com/loongson-community/docs/blob/master/3C5000/龙芯3C5000寄存器及使用手册V1.0.pdf
	//   https://github.com/loongson-community/docs/blob/master/3D5000/2023061508513851030.龙芯3D5000处理器寄存器使用手册_V1.0.pdf
	// whereas the 3A4000 manual allocates cpucfg completely differently
	//   https://github.com/loongson-community/docs/blob/master/3A4000/3A4000_user_v1.5_20191220.pdf
	//
	// The 3A6000's allocation scheme agrees with the live doc and the 5000s except bit 25 is
	//   CRC32  为 1 表示支持 CRC32 加速指令。  1’b1
	// https://github.com/loongson-community/docs/blob/master/3A6000/Loongson3A6000%20user%20book_V1.1.pdf
	//
	// Linux checks bit 25 for CRC/CRCC presence since
	//   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df830336045db1246d3245d3737fee9939c5f731
	// saying
	//   LoongArch: Fix probing of the CRC32 feature
	//
	//   Not all LoongArch processors support CRC32 instructions. This feature
	//   is indicated by CPUCFG1.CRC32 (Bit25) but it is wrongly defined in the
	//   previous versions of the ISA manual (and so does in loongarch.h). The
	//   CRC32 feature is set unconditionally now, so fix it.
	// Bit 25 of CPUCFG word 1, treated here as the CRC32 capability flag (see the discussion above)
#define LOONG64_CPUCFG1_CRC32 (1 << 25)
	// Function pointer bound during static initialisation: the CRCC-instruction implementation
	// when the CPU reports the flag, the table-driven one otherwise.
	static const auto crc32c = __cpucfg(1) & LOONG64_CPUCFG1_CRC32 ? crc32c_crcc : crc32c_generic;
#endif

	// Converts a running CRC-32C remainder into the value stored in a framed chunk:
	// invert all bits, rotate right by 15, then add a fixed constant (wrapping modulo 2^32).
	std::uint32_t crc32c_finish(std::uint32_t cur) {
		// https://github.com/google/snappy/blob/f82bff66afe0de4c9ae22f8c4ef84e3c2233e799/framing_format.txt#L53-L55
		constexpr std::uint32_t mask_delta = 0xA282EAD8;

		const std::uint32_t crc     = ~cur;
		const std::uint32_t rotated = (crc << 17) | (crc >> 15);
		return rotated + mask_delta;
	}


	// Scratch storage shared by the whole program (file reads, chunk bodies, compression output).
	// buf holds 0xFFFFFF bytes, the largest value a 3-byte chunk length can encode, so the body of
	// any framed chunk fits. bufsum overlays the first four bytes of buf; uncompress_framed() reads
	// it (through le32toh) as the checksum stored at the start of a data chunk.
	static union {
		char buf[0xFFFFFF];
		std::uint32_t bufsum;
	};


	// A Source is an interface that yields a sequence of bytes
	struct fd_source : snappy::Source {
		int fd;
		const char * filename;
		off_t size, off;

		fd_source(int fd, const char * filename) : fd(fd), filename(filename) {
			if((this->off = (filename == "-"sv ? lseek(this->fd, 0, SEEK_CUR) : 0)) == -1)
				return;
			if((this->size = lseek(this->fd, 0, SEEK_END)) == -1)
				this->off = -1;
		}

		// Return the number of bytes left to read from the source
		virtual size_t Available() const { return this->size - this->off; }

		// Peek at the next flat region of the source.  Does not reposition
		// the source.  The returned region is empty iff Available()==0.
		//
		// Returns a pointer to the beginning of the region and store its
		// length in *len.
		//
		// The returned region is valid until the next call to Skip() or
		// until this object is destroyed, whichever occurs first.
		//
		// The returned region may be larger than Available() (for example
		// if this ByteSource is a view on a substring of a larger source).
		// The caller is responsible for ensuring that it only reads the
		// Available() bytes.
		virtual const char * Peek(size_t * len) {
			static_assert(snappy::kBlockSize <= sizeof(buf));
			ssize_t rd;
			while((rd = pread(this->fd, buf, std::min(static_cast<off_t>(snappy::kBlockSize), this->size), this->off)) == -1 && errno == EINTR)
				;
			if(rd == -1)
				err(2, "%s", this->filename);
			if(rd == 0)
				this->off = this->size;
			*len = rd;
			return buf;
		}

		// Skip the next n bytes.  Invalidates any buffer returned by
		// a previous call to Peek().
		// REQUIRES: Available() >= n
		virtual void Skip(size_t n) { this->off += n; }
	};

	struct FILE_sink : snappy::Sink {
		FILE * f;

		FILE_sink(FILE * f) : f(f) {}

		// Append "bytes[0,n-1]" to this.
		virtual void Append(const char * bytes, size_t n) {
			if(std::fwrite(bytes, 1, n, this->f) != n)
				err(2, gettext("write error"));
		}
	};

	struct FILE_crc32_sink : FILE_sink {
		std::uint32_t sum = ~0;

		using FILE_sink::FILE_sink;

		// Append "bytes[0,n-1]" to this.
		virtual void Append(const char * bytes, size_t n) {
			FILE_sink::Append(bytes, n);
			this->sum = crc32c(this->sum, bytes, n);
		}
	};


	bool uncompress_unframed(const void * data, std::size_t data_len, snappy::Sink * sink, auto filename, bool ignore_errors) {
		std::uint32_t len = -1;
		bool err{};
		{
			snappy::ByteArraySource src{reinterpret_cast<const char *>(data), data_len};
			if(!snappy::GetUncompressedLength(&src, &len)) {
				// not understood by the decompressor
				warnx(gettext("%s: compressed block of length %zu: invalid data"), filename, data_len), err = true;
				if(!ignore_errors)
					return err;
			}
		}
		{
			snappy::ByteArraySource src{reinterpret_cast<const char *>(data), data_len};
			auto un = snappy::UncompressAsMuchAsPossible(&src, sink);
			if(un != len)
				warnx(gettext("%s: compressed block of length %zu: expecting %zu bytes, got %zu"), filename, data_len, static_cast<std::size_t>(len), un), err = true;
		}
		return err;
	}

	// Decodes the chunks of a framing-format stream read from f and writes the payload to stdout.
	// The stream ends when the file ends -- there is no explicit end-of-file marker.
	//
	// filename only labels the diagnostics. With ignore_errors set, a damaged chunk is reported and
	// decoding carries on with the next one; otherwise the first problem ends decoding.
	// Returns nonzero if anything was wrong with the stream. A read error on f is fatal (exit status 2).
	//
	// Input is untrusted, so two situations are diagnosed explicitly:
	//   * a data chunk whose body is shorter than its 4-byte checksum ("chunk_len - 4" would wrap
	//     around and send the decoder far past the end of buf);
	//   * a stream that ends in the middle of a chunk (previously accepted silently).
	int uncompress_framed(FILE * f, const char * filename, bool ignore_errors) {
		bool err{};

		// Called after a short read: an I/O error is fatal, anything else means the stream was cut off
		const auto die_on_read_error = [&] {
			if(std::ferror(f))
				::err(2, "%s", filename);
		};

		for(;;) {
			const auto tp = getc(f);
			if(tp == EOF) {
				die_on_read_error();
				return err;  // end of input on a chunk boundary is the regular way out
			}

			// Chunk header: the type byte (above), then the body length as 3 little-endian bytes
			std::uint32_t chunk_len{};
			for(int i = 0; i < 3; ++i) {
				const auto l = getc(f);
				if(l == EOF) {
					die_on_read_error();
					warnx(gettext("%s: truncated chunk header"), filename);
					return true;
				}
				chunk_len |= l << (i * 8);
			}
			// chunk_len <= 0xFFFFFF == sizeof(buf), so the body always fits
			if(std::fread(buf, 1, chunk_len, f) != chunk_len) {
				die_on_read_error();
				warnx(gettext("%s: chunk of length %zu: truncated"), filename, static_cast<std::size_t>(chunk_len));
				return true;
			}

			FILE_crc32_sink sink{stdout};
			bool has_checksum{};  // set for data chunks, whose body starts with a checksum of the uncompressed data
			switch(tp) {
				case 0xFF:  // 4.1. Stream identifier (chunk type 0xff)
					if(chunk_len != std::strlen("sNaPpY"))
						warnx(gettext("%s: stream identifier chunk: length %zu != %zu"), filename, static_cast<std::size_t>(chunk_len), std::strlen("sNaPpY")), err = true;
					if(std::memcmp(buf, "sNaPpY", std::strlen("sNaPpY")))
						warnx(gettext("%s: stream identifier chunk: content %.*s != %s"), filename, static_cast<int>(chunk_len), buf, "sNaPpY"), err = true;
					break;
				case 0x00:  // 4.2. Compressed data (chunk type 0x00)
				case 0x01:  // 4.3. Uncompressed data (chunk type 0x01)
					if(chunk_len < 4) {
						warnx(gettext("%s: chunk of length %zu: too short for a checksum"), filename, static_cast<std::size_t>(chunk_len)), err = true;
						break;
					}
					has_checksum = true;
					if(tp == 0x00)
						err |= uncompress_unframed(buf + 4, chunk_len - 4, &sink, filename, ignore_errors);
					else
						sink.Append(buf + 4, chunk_len - 4);
					break;
				case 0xFE:  // 4.4. Padding (chunk type 0xfe)
					break;
				default:
					if(tp >= 0x02 && tp <= 0x7F)  // 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f)
						warnx(gettext("%s: chunk of length %zu: unknown type 0x%02X"), filename, static_cast<std::size_t>(chunk_len), tp), err = true;
					else  // 4.6. Reserved skippable chunks (chunk types 0x80-0xfd)
						;
			}
			if(err && !ignore_errors)
				return err;

			if(has_checksum) {
				// Compare what was actually written out against the checksum stored in the first 4 body bytes
				sink.sum = crc32c_finish(sink.sum);
				if(le32toh(bufsum) != sink.sum) {
					warnx(gettext("%s: chunk of length %zu: checksum 0x%08X != 0x%08X"), filename, static_cast<std::size_t>(chunk_len), sink.sum, le32toh(bufsum)),
					    err = true;
					if(!ignore_errors)
						return err;
				}
			}
		}
	}

	// Slurps the remainder of f into one contiguous heap buffer (built with open_memstream),
	// optionally preceded by prefix_len bytes from prefix -- used to put back bytes that were
	// already consumed from f. filename only labels read-error diagnostics.
	//
	// Returns {pointer, length} of the collected data; the buffer is never freed by this file
	// (the callers use it until the program exits). Any read, write or allocation failure is
	// fatal (exit status 2).
	std::pair<void *, std::size_t> ingest(FILE * f, const char * filename, const void * prefix = nullptr, std::size_t prefix_len = 0) {
		char * data;
		std::size_t data_size;
		const auto mf = open_memstream(&data, &data_size);
		if(!mf)
			err(2, nullptr);

		// A dropped prefix would silently corrupt the input, so this write is checked like the others.
		// Skipped entirely for an empty prefix, which also keeps a null pointer away from fwrite().
		if(prefix_len && std::fwrite(prefix, 1, prefix_len, mf) != prefix_len)
			err(2, nullptr);

		// Copy buf-sized pieces until a short read signals end of file
		for(;;) {
			const auto rd = std::fread(buf, 1, sizeof(buf), f);
			if(rd != sizeof(buf) && std::ferror(f))
				err(2, "%s", filename);

			if(std::fwrite(buf, 1, rd, mf) != rd)
				err(2, nullptr);

			if(rd != sizeof(buf))
				break;
		}
		// fclose() finalises data and data_size
		if(std::fclose(mf))
			err(2, nullptr);

		return {data, data_size};
	}

	// Scope guard: when destroyed, pushes out whatever is still buffered on stdout and
	// terminates the program with status 2 if that last flush does not succeed.
	struct stdout_flush {
		~stdout_flush() {
			const bool flushed = std::fflush(stdout) == 0;
			if(!flushed)
				err(2, gettext("write error"));
		}
	};
}

// Command-line entry point.
//
// Compresses (default) or decompresses (-d, or when invoked under a name starting with 'u') a single
// input -- the named file or standard input -- to standard output.
//   -f  compress into the framing format instead of one raw block
//   -i  when decompressing, report damaged data but keep going
//
// Exit status: 0 on success, 1 for usage errors or damaged input, 2 for I/O errors (via err()),
// 3 when 4GiB or more was compressed without -f.
int main(int argc, char * const * argv) {
	setlocale(LC_ALL, "");
#if __has_include(<libintl.h>)
	bindtextdomain("snappy-tools", TEXTDOMAIN_DIRNAME);
	textdomain("snappy-tools");
#endif


	// bn is either the last '/' of argv[0] and what follows, or all of argv[0] if there is no '/'
	auto bn               = std::strrchr(argv[0] ?: "", '/') ?: (argv[0] ?: "");
	auto uncompress_argv0 = *bn == 'u' || (*bn == '/' && *(bn + 1) == 'u');

	bool uncompress = uncompress_argv0;
	bool uncompress_ignore_errors{};
	bool compress_frame{};

	for(int arg; (arg = getopt(argc, argv, uncompress_argv0 ? "i" : "dif")) != -1;)
		switch(arg) {
			case 'd':
				uncompress = true;
				break;
			case 'i':
				uncompress_ignore_errors = true;
				break;
			case 'f':
				compress_frame = true;
				break;
			default:
			usage:
				// Also in README!
				return std::fprintf(stderr,
				                    uncompress_argv0 ? gettext("usage: %1$s [-i]   snappy.sn|.sz\n"
				                                               "       %1$s [-i] < snappy.sn|.sz\n")
				                                     : gettext("usage: %1$s    [-f]   data > snappy.sn|.sz\n"
				                                               "       %1$s    [-f] < data > snappy.sn|.sz\n"
				                                               "       %1$s -d [-i]          snappy.sn|.sz\n"
				                                               "       %1$s -d [-i] <        snappy.sn|.sz\n"),
				                    argv[0]),
				       1;
		}
	// At most one operand
	if(*(argv + optind) && *(argv + optind + 1))
		goto usage;

	// A named input replaces stdin, so everything below only ever deals with fd 0 / stdin
	auto filename = *(argv + optind) ?: "-";
	if(filename != "-"sv)
		if(!std::freopen(filename, "r", stdin))
			err(2, "%s", filename);

	const int fd   = 0;
	FILE * const f = stdin;
	stdout_flush _flusher{};  // checks the final flush of stdout when main() returns

	if(uncompress) {
		// Sniff the stream identifier chunk to tell framed from raw input
		auto hdrlen = std::fread(buf, 1, sizeof("\xFF\x06\x00\x00sNaPpY") - 1, f);
		if(hdrlen != sizeof("\xFF\x06\x00\x00sNaPpY") - 1) {
			if(std::ferror(f))
				err(2, "%s", filename);

			// Shorter than the identifier: can only be a (tiny) raw block, and all of it is in buf already
			FILE_sink sink{stdout};
			return uncompress_unframed(buf, hdrlen, &sink, filename, uncompress_ignore_errors);
		} else if(!std::memcmp(buf, "\xFF\x06\x00\x00sNaPpY", sizeof("\xFF\x06\x00\x00sNaPpY") - 1))
			return uncompress_framed(f, filename, uncompress_ignore_errors);

		// Ideally we'd just ungetc but realistically glibc/musl give us 8 bytes and we want 9 :/
		// while(hdrlen--)
		// 	assert(std::ungetc(buf[hdrlen], stdin) != EOF);

		// Raw block: map the whole file if it can be sized and mapped, otherwise read the rest
		// into memory with the sniffed bytes glued back in front.
		off_t orig_pos, len;
		const void * mapping;
		if((orig_pos = ftello(f)) == -1 || fseeko(f, 0, SEEK_END) == -1 || (len = ftello(f)) == -1 ||
		   ((mapping = mmap(nullptr, len, PROT_READ, MAP_PRIVATE, fd, 0)) == MAP_FAILED && (fseeko(f, orig_pos, SEEK_SET), true))) {
			auto dt = ingest(f, filename, buf, hdrlen);
			FILE_sink sink{stdout};
			return uncompress_unframed(dt.first, dt.second, &sink, filename, uncompress_ignore_errors);
		}

		// The block starts where reading started, i.e. hdrlen bytes before the position after sniffing
		FILE_sink sink{stdout};
		return uncompress_unframed(reinterpret_cast<const std::uint8_t *>(mapping) + (orig_pos - hdrlen), len - (orig_pos - hdrlen), &sink, filename,
		                           uncompress_ignore_errors);
	} else {
		unsigned long long read{}, written{};

		// No  thread: {read   → compress → cksum → write}...
		//
		// Yes thread: {read 🚧→ compress 🚧→ write}...
		//                    ↓ len        ↑ sum
		//     thread:      {🚧→ cksum    🚧→}...
		//
		// Checksums only exist in the framing format, so the helper thread is only considered with -f.
		bool crc_in_thread = compress_frame && parallel_compression_crc32c();

		// len_sum carries the chunk length to the thread and the finished checksum back;
		// the two barrier waits around the thread's work order those accesses.
		struct ipc {
			pthread_barrier_t barrier;
			std::atomic<std::uint32_t> len_sum;
		} ipc;
		pthread_t cksum_thread;
		if(crc_in_thread) {
			// The thread is purely an optimisation: if it cannot be set up, checksum inline instead of retrying forever
			if(pthread_barrier_init(&ipc.barrier, nullptr, 2))
				crc_in_thread = false;
			else if(pthread_create(
			            &cksum_thread, nullptr,
			            [](void * ipcp) -> void * {
				            auto & ipc = *reinterpret_cast<struct ipc *>(ipcp);
				            for(;;) {
					            pthread_barrier_wait(&ipc.barrier);
					            ipc.len_sum.store(htole32(crc32c_finish(crc32c(~0, buf, ipc.len_sum.load(std::memory_order::relaxed)))), std::memory_order::relaxed);
					            pthread_barrier_wait(&ipc.barrier);
				            }
				            __builtin_unreachable();
			            },
			            &ipc)) {
				pthread_barrier_destroy(&ipc.barrier);
				crc_in_thread = false;
			}
		}

		if(compress_frame) {
			if(std::fwrite("\xFF\x06\x00\x00sNaPpY", 1, sizeof("\xFF\x06\x00\x00sNaPpY") - 1, stdout) != sizeof("\xFF\x06\x00\x00sNaPpY") - 1)
			we:
				err(2, gettext("write error"));
			written = sizeof("\xFF\x06\x00\x00sNaPpY") - 1;

			// However, we place an additional restriction that the uncompressed data in a chunk must be no longer than 65536 bytes.
			for(std::size_t rd = 65536; rd == 65536;) {
				if((rd = std::fread(buf, 1, 65536, f)) != 65536 && std::ferror(f))
					err(2, "%s", filename);
				read += rd;
				if(!rd)
					break;

				// Release the checksum thread on buf[0, rd) while this thread compresses the same bytes
				if(crc_in_thread) {
					ipc.len_sum.store(rd, std::memory_order::relaxed);
					pthread_barrier_wait(&ipc.barrier);
				}

				// The compressed form goes to the part of buf behind the input chunk
				auto compbuf            = buf + 65536;
				std::size_t compbuf_len = -1;
				snappy::RawCompress(buf, rd, compbuf, &compbuf_len);

				// Both the uncompressed and the compressed chunks have the same final size: [1    + 3  ] + 4     + data
				//                                                                           [type + len] + cksum + data
				//                                                                           [header    ] + body
				// 4.2. Compressed data (chunk type 0x00)
				// 4.3. Uncompressed data (chunk type 0x01)
				// Data that did not shrink is stored as-is
				std::uint8_t tp = (compbuf_len < rd) ? 0x00 : 0x01;
				if(putc(tp, stdout) == EOF)
					goto we;
				auto outsize = 4 + (tp ? rd : compbuf_len);
				if(putc((outsize & 0x0000FF) >> 0, stdout) == EOF ||  //
				   putc((outsize & 0x00FF00) >> 8, stdout) == EOF ||  //
				   putc((outsize & 0xFF0000) >> 16, stdout) == EOF)
					goto we;

				// The checksum always covers the uncompressed data
				std::uint32_t crc;
				if(crc_in_thread) {
					pthread_barrier_wait(&ipc.barrier);
					crc = ipc.len_sum.load(std::memory_order::relaxed);
				} else
					crc = htole32(crc32c_finish(crc32c(~0, buf, rd)));
				if(std::fwrite(&crc, 1, 4, stdout) != 4)
					goto we;

				if(std::fwrite(tp ? buf : compbuf, 1, tp ? rd : compbuf_len, stdout) != (tp ? rd : compbuf_len))
					goto we;
				written += 1 + 3 + 4 + (tp ? rd : compbuf_len);
			}
		} else {
			FILE_sink sink{stdout};

			// Stream straight from the descriptor when it can be sized, otherwise collect the input in memory first
			if(fd_source source{fd, filename}; source.size && source.off != -1) {
				read = source.size - source.off;
				// Available() returns size_t: refuse inputs whose length does not fit
				if(read != source.Available())
					errno = EOVERFLOW, err(2, "%s", filename);

				written = Compress(&source, &sink);
			} else {
				auto dt = ingest(f, filename);
				read    = dt.second;

				snappy::ByteArraySource src{reinterpret_cast<const char *>(dt.first), dt.second};
				written = Compress(&src, &sink);
			}
		}

		// filename, byte count read, byte count written, ratio
		std::fprintf(stderr, gettext("%s: %llu -> %llu (%.2f%%)\n"), filename, read, written, 100. * (static_cast<double>(written) / read));

		if(!compress_frame && read >= 4ull * 1024 * 1024 * 1024)
			return std::fprintf(stderr, gettext("%s: %s: sized %lluB >= 4GiB w/o -f: output stream may be broken!\n"), argv[0], filename, read), 3;
	}
}