File: xfile.c

package info (click to toggle)
xfsprogs 6.17.0-2
  • links: PTS
  • area: main
  • in suites: forky, sid
  • size: 11,324 kB
  • sloc: ansic: 167,334; sh: 4,604; makefile: 1,336; python: 835; cpp: 5
file content (420 lines) | stat: -rw-r--r-- 9,250 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "libxfs_priv.h"
#include "libxfs.h"
#include "libxfs/xfile.h"
#include <linux/memfd.h>
#include <sys/mman.h>
#ifndef HAVE_MEMFD_CREATE
#include <sys/syscall.h>
#endif
#include <sys/types.h>
#include <sys/wait.h>

/*
 * Swappable Temporary Memory
 * ==========================
 *
 * Offline checking sometimes needs to be able to stage a large amount of data
 * in memory.  This information might not fit in the available memory and it
 * doesn't all need to be accessible at all times.  In other words, we want an
 * indexed data buffer to store data that can be paged out.
 *
 * memfd files meet those requirements.  Therefore, the xfile mechanism uses
 * one to store our staging data.  The xfile must be freed with xfile_destroy.
 *
 * xfiles assume that the caller will handle all required concurrency
 * management; file locks are not taken.
 */

/*
 * Starting with Linux 6.3, there's a new MFD_NOEXEC_SEAL flag that disables
 * the longstanding memfd behavior that files are created with the executable
 * bit set, and seals the file against it being turned back on.
 */
#ifndef MFD_NOEXEC_SEAL
# define MFD_NOEXEC_SEAL	(0x0008U)
#endif

/*
 * The memfd_create system call was added to kernel 3.17 (2014), but
 * its corresponding glibc wrapper was only added in glibc 2.27
 * (2018).  In case a libc is not providing the wrapper, we provide
 * one here.
 */
#ifndef HAVE_MEMFD_CREATE
static int memfd_create(const char *name, unsigned int flags)
{
	/* Invoke the raw system call since libc has no wrapper for us. */
	long	rc = syscall(SYS_memfd_create, name, flags);

	return (int)rc;
}
#endif

/*
 * Open a memory-backed fd to back an xfile.  We require close-on-exec here,
 * because these memfd files function as windowed RAM and hence should never
 * be shared with other processes.
 *
 * The following methods are tried in order until one works: memfd_create
 * with MFD_NOEXEC_SEAL, memfd_create without it, an O_TMPFILE file in
 * /dev/shm, an O_TMPFILE file in /tmp, and finally an unlinked mkostemp file
 * in the current directory.
 *
 * Returns the new file descriptor, or -1 with errno set if every method
 * failed.
 */
static int
xfile_create_fd(
	const char		*description)
{
	/*
	 * mkostemp rewrites the XXXXXX suffix in place, so the template has
	 * to live in writable storage; handing it a string literal is
	 * undefined behavior.
	 */
	char			tmpl[] = "libxfsXXXXXX";
	int			fd = -1;
	int			ret;

	/*
	 * memfd_create was added to kernel 3.17 (2014).  MFD_NOEXEC_SEAL
	 * causes -EINVAL on old kernels, so fall back to omitting it so that
	 * new xfs_repair can run on an older recovery cd kernel.
	 */
	fd = memfd_create(description, MFD_CLOEXEC | MFD_NOEXEC_SEAL);
	if (fd >= 0)
		goto got_fd;
	fd = memfd_create(description, MFD_CLOEXEC);
	if (fd >= 0)
		goto got_fd;

	/*
	 * O_TMPFILE exists as of kernel 3.11 (2013), which means that if we
	 * find it, we're pretty safe in assuming O_CLOEXEC exists too.
	 */
	fd = open("/dev/shm", O_TMPFILE | O_CLOEXEC | O_RDWR, 0600);
	if (fd >= 0)
		goto got_fd;

	fd = open("/tmp", O_TMPFILE | O_CLOEXEC | O_RDWR, 0600);
	if (fd >= 0)
		goto got_fd;

	/*
	 * mkostemp exists as of glibc 2.7 (2007) and O_CLOEXEC exists as of
	 * kernel 2.6.23 (2007).
	 */
	fd = mkostemp(tmpl, O_CLOEXEC);
	if (fd >= 0) {
		/*
		 * Drop the name right away so that this file is anonymous
		 * like the ones above and goes away when the fd is closed.
		 * A failed unlink only leaves a stray file behind, so
		 * complain but keep using the fd.
		 */
		if (unlink(tmpl))
			perror(tmpl);
		goto got_fd;
	}

	/* Make sure the caller always sees a nonzero errno on failure. */
	if (!errno)
		errno = EOPNOTSUPP;
	return -1;
got_fd:
	/*
	 * Turn off mode bits we don't want -- group members and others should
	 * not have access to the xfile, nor it be executable.  memfds are
	 * created with mode 0777, but we'll be careful just in case the other
	 * implementations fail to set 0600.
	 */
	ret = fchmod(fd, 0600);
	if (ret)
		perror("disabling xfile executable bit");

	return fd;
}

/*
 * Shared memfds that xfile_fcb_find carves up into partitions so that several
 * xfiles can sit in one file.  fcb_mutex is held while walking or changing
 * this list and while adjusting the refcount of an fcb that is on it.
 */
static LIST_HEAD(fcb_list);
static pthread_mutex_t fcb_mutex = PTHREAD_MUTEX_INITIALIZER;

/*
 * Create a new memfd and wrap it in a control block.  The new fcb starts out
 * with a single reference and is linked only to itself.  Returns zero and
 * sets *fcbp on success, or a negative errno.
 */
static inline int
xfile_fcb_create(
	const char		*description,
	struct xfile_fcb	**fcbp)
{
	struct xfile_fcb	*new_fcb;
	int			memfd = xfile_create_fd(description);

	if (memfd < 0)
		return -errno;

	new_fcb = malloc(sizeof(*new_fcb));
	if (new_fcb == NULL) {
		/* Don't leak the descriptor we just opened. */
		close(memfd);
		return -ENOMEM;
	}

	new_fcb->refcount = 1;
	new_fcb->fd = memfd;
	list_head_init(&new_fcb->fcb_list);

	*fcbp = new_fcb;
	return 0;
}

/*
 * Release a reference to an xfile control block.
 *
 * @pos and @len describe the partition of the memfd that the caller was
 * using; pass a @len of zero if there is no partition to give back.
 *
 * An fcb that is linked only to itself is closed and freed immediately.  An
 * fcb on the shared list is closed and freed when the last reference is
 * dropped; otherwise, if the caller's partition sits at the very end of the
 * memfd, the file is truncated to free that space.
 */
static void
xfile_fcb_irele(
	struct xfile_fcb	*fcb,
	loff_t			pos,
	uint64_t		len)
{
	/*
	 * If this memfd is linked only to itself, it's private, so we can
	 * close it without taking any locks.
	 */
	if (list_empty(&fcb->fcb_list)) {
		close(fcb->fd);
		free(fcb);
		return;
	}

	pthread_mutex_lock(&fcb_mutex);
	if (--fcb->refcount == 0) {
		/* If we're the last user of this memfd file, kill it fast. */
		list_del(&fcb->fcb_list);
		close(fcb->fd);
		free(fcb);
	} else if (len > 0) {
		struct stat	statbuf;
		int		ret;

		/*
		 * xfile_fcb_find hands out partitions whose length has been
		 * rounded up to PAGE_SIZE, so apply the same rounding here.
		 * Comparing the file size against the unrounded length would
		 * never match for a partition that isn't a whole number of
		 * pages, and its space would never be given back.
		 */
		len = roundup_64(len, PAGE_SIZE);

		/*
		 * If we were using the end of a partitioned file, free the
		 * address space.  IOWs, bonus points if you delete these in
		 * reverse-order of creation.
		 */
		ret = fstat(fcb->fd, &statbuf);
		if (!ret && statbuf.st_size == pos + len) {
			/* Best effort; a failure just leaves the space. */
			ret = ftruncate(fcb->fd, pos);
		}
	}
	pthread_mutex_unlock(&fcb_mutex);
}

/*
 * Find a memfd that can accommodate the given amount of address space.
 *
 * If @maxbytes is zero, the caller gets a brand new private memfd.  Otherwise
 * the request is rounded up to PAGE_SIZE and placed at the end of the first
 * shared memfd that can be extended far enough; if none can, a new memfd is
 * created and added to the shared list.
 *
 * On success, returns zero, sets *posp to the offset of the caller's range
 * within the memfd, and sets *fcbp to a referenced control block.  Returns a
 * negative errno on failure.
 */
static int
xfile_fcb_find(
	const char		*description,
	uint64_t		maxbytes,
	loff_t			*posp,
	struct xfile_fcb	**fcbp)
{
	struct xfile_fcb	*cur;
	struct xfile_fcb	*new_fcb;
	int			error = 0;

	/* No maximum range means that the caller gets a private memfd. */
	if (!maxbytes) {
		*posp = 0;
		return xfile_fcb_create(description, fcbp);
	}

	/* mmap needs page granularity, so round the request up. */
	maxbytes = roundup_64(maxbytes, PAGE_SIZE);

	pthread_mutex_lock(&fcb_mutex);

	/* Try to append the new range to one of the memfds we already have. */
	list_for_each_entry(cur, &fcb_list, fcb_list) {
		struct stat	sb;
		loff_t		start;

		if (fstat(cur->fd, &sb) != 0)
			continue;
		start = roundup_64(sb.st_size, PAGE_SIZE);

		/*
		 * Extend the file now so that we know the memfd can really
		 * take writes all the way to the end of the new range.
		 */
		if (ftruncate(cur->fd, start + maxbytes) != 0)
			continue;

		cur->refcount++;
		*posp = start;
		*fcbp = cur;
		goto out_unlock;
	}

	/* Nothing had room, so open a new memfd and add it to our list. */
	error = xfile_fcb_create(description, &new_fcb);
	if (error)
		goto out_unlock;

	if (ftruncate(new_fcb->fd, maxbytes) != 0) {
		error = -errno;
		/* Not on the list yet, so this frees it without locking. */
		xfile_fcb_irele(new_fcb, 0, maxbytes);
		goto out_unlock;
	}

	list_add_tail(&new_fcb->fcb_list, &fcb_list);
	*posp = 0;
	*fcbp = new_fcb;

out_unlock:
	pthread_mutex_unlock(&fcb_mutex);
	return error;
}

/*
 * Create an xfile of the given size.  The description will be used in the
 * trace output.
 */
int
xfile_create(
	const char		*description,
	unsigned long long	maxbytes,
	struct xfile		**xfilep)
{
	struct xfile		*xf;
	int			error;

	xf = kmalloc(sizeof(struct xfile), 0);
	if (!xf)
		return -ENOMEM;

	error = xfile_fcb_find(description, maxbytes, &xf->partition_pos,
			&xf->fcb);
	if (error) {
		kfree(xf);
		return error;
	}

	xf->maxbytes = maxbytes;
	*xfilep = xf;
	return 0;
}

/* Close the file and release all resources. */
void
xfile_destroy(
	struct xfile		*xf)
{
	xfile_fcb_irele(xf->fcb, xf->partition_pos, xf->maxbytes);
	kfree(xf);
}

/*
 * Return the largest offset that this xfile may address: the size limit it
 * was created with, or the largest value a loff_t can hold if it has none.
 */
static inline loff_t
xfile_maxbytes(
	struct xfile		*xf)
{
	const loff_t		unlimited =
			(sizeof(loff_t) == 8) ? LLONG_MAX : LONG_MAX;

	if (xf->maxbytes > 0)
		return xf->maxbytes;
	return unlimited;
}

/*
 * Load an object.  Since we're treating this file as "memory", any error or
 * short IO is treated as a failure to allocate memory.
 *
 * Reads @count bytes at offset @pos of the xfile into @buf.  @pos is relative
 * to the start of the xfile, not to the start of the backing memfd.  Returns
 * zero on success or a negative errno.
 */
ssize_t
xfile_load(
	struct xfile		*xf,
	void			*buf,
	size_t			count,
	loff_t			pos)
{
	loff_t			max = xfile_maxbytes(xf);
	ssize_t			ret;

	if (count > INT_MAX)
		return -ENOMEM;

	/*
	 * Check the offset by itself before looking at the room that is left.
	 * "max - pos" is signed, and comparing it directly against the
	 * unsigned count would turn a negative difference into a huge
	 * positive one, which would let an out of range access through and
	 * into whatever lives next to us in the memfd.
	 */
	if (pos < 0 || pos > max)
		return -ENOMEM;
	if ((unsigned long long)(max - pos) < count)
		return -ENOMEM;

	ret = pread(xf->fcb->fd, buf, count, pos + xf->partition_pos);
	if (ret < 0)
		return -errno;
	if ((size_t)ret != count)
		return -ENOMEM;
	return 0;
}

/*
 * Store an object.  Since we're treating this file as "memory", any error or
 * short IO is treated as a failure to allocate memory.
 *
 * Writes @count bytes from @buf at offset @pos of the xfile.  @pos is
 * relative to the start of the xfile, not to the start of the backing memfd.
 * Returns zero on success or a negative errno.
 */
ssize_t
xfile_store(
	struct xfile		*xf,
	const void		*buf,
	size_t			count,
	loff_t			pos)
{
	loff_t			max = xfile_maxbytes(xf);
	ssize_t			ret;

	if (count > INT_MAX)
		return -E2BIG;

	/*
	 * Check the offset by itself before looking at the room that is left.
	 * "max - pos" is signed, and comparing it directly against the
	 * unsigned count would turn a negative difference into a huge
	 * positive one, which would let an out of range write through and
	 * scribble on whatever lives next to us in the memfd.
	 */
	if (pos < 0 || pos > max)
		return -EFBIG;
	if ((unsigned long long)(max - pos) < count)
		return -EFBIG;

	ret = pwrite(xf->fcb->fd, buf, count, pos + xf->partition_pos);
	if (ret < 0)
		return -errno;
	if ((size_t)ret != count)
		return -ENOMEM;
	return 0;
}

/* Compute the number of bytes used by a partitioned xfile. */
static unsigned long long
xfile_partition_bytes(
	struct xfile		*xf)
{
	loff_t			data_pos = xf->partition_pos;
	loff_t			stop_pos = data_pos + xf->maxbytes;
	loff_t			hole_pos;
	unsigned long long	bytes = 0;

	data_pos = lseek(xf->fcb->fd, data_pos, SEEK_DATA);
	while (data_pos >= 0 && data_pos < stop_pos) {
		hole_pos = lseek(xf->fcb->fd, data_pos, SEEK_HOLE);
		if (hole_pos < 0) {
			/* save error, break */
			data_pos = hole_pos;
			break;
		}
		if (hole_pos >= stop_pos) {
			bytes += stop_pos - data_pos;
			return bytes;
		}
		bytes += hole_pos - data_pos;

		data_pos = lseek(xf->fcb->fd, hole_pos, SEEK_DATA);
	}
	if (data_pos < 0 && errno != ENXIO)
		return xf->maxbytes;

	return bytes;
}

/*
 * Compute the number of bytes used by an xfile.  An xfile with a size limit
 * is measured by walking its range of the memfd; one without is measured from
 * the block count that fstat reports, in 512-byte units.
 *
 * NOTE(review): if fstat fails, the negative errno is converted to the
 * unsigned return type, so the caller sees a very large byte count rather
 * than an error.
 */
unsigned long long
xfile_bytes(
	struct xfile		*xf)
{
	struct stat		sb;

	if (xf->maxbytes > 0)
		return xfile_partition_bytes(xf);

	if (fstat(xf->fcb->fd, &sb) != 0)
		return -errno;

	return (unsigned long long)sb.st_blocks * 512;
}

/*
 * Discard the pages backing a range of the xfile by punching a hole in the
 * memfd without changing its size.  The result of the fallocate call is not
 * checked.
 *
 * NOTE(review): unlike xfile_load and xfile_store, @pos is handed to
 * fallocate as-is, without xf->partition_pos added to it -- confirm that
 * callers pass an offset relative to the start of the memfd.
 */
void
xfile_discard(
	struct xfile		*xf,
	loff_t			pos,
	unsigned long long	count)
{
	const int		mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;

	fallocate(xf->fcb->fd, mode, pos, count);
}