// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (c) 2021-2024 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "libxfs_priv.h"
#include "libxfs.h"
#include "libxfs/xfile.h"
#include <linux/memfd.h>
#include <sys/mman.h>
#ifndef HAVE_MEMFD_CREATE
#include <sys/syscall.h>
#endif
#include <sys/types.h>
#include <sys/wait.h>
/*
* Swappable Temporary Memory
* ==========================
*
* Offline checking sometimes needs to be able to stage a large amount of data
* in memory. This information might not fit in the available memory and it
* doesn't all need to be accessible at all times. In other words, we want an
* indexed data buffer to store data that can be paged out.
*
* memfd files meet those requirements. Therefore, the xfile mechanism uses
* one to store our staging data. The xfile must be freed with xfile_destroy.
*
* xfiles assume that the caller will handle all required concurrency
* management; file locks are not taken.
*/
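/*
 * A minimal usage sketch, for illustration only -- the record type, count,
 * and index below are hypothetical:
 *
 *	struct xfile	*xf;
 *	struct rec	rec = { 0 };
 *	ssize_t		error;
 *
 *	error = xfile_create("record staging", NR_RECS * sizeof(rec), &xf);
 *	if (error)
 *		return error;
 *	error = xfile_store(xf, &rec, sizeof(rec), idx * sizeof(rec));
 *	if (!error)
 *		error = xfile_load(xf, &rec, sizeof(rec), idx * sizeof(rec));
 *	xfile_destroy(xf);
 */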
/*
* Starting with Linux 6.3, there's a new MFD_NOEXEC_SEAL flag that disables
* the longstanding memfd behavior that files are created with the executable
* bit set, and seals the file against it being turned back on.
*/
#ifndef MFD_NOEXEC_SEAL
# define MFD_NOEXEC_SEAL (0x0008U)
#endif
/*
* The memfd_create system call was added to kernel 3.17 (2014), but
* its corresponding glibc wrapper was only added in glibc 2.27
* (2018). In case a libc is not providing the wrapper, we provide
* one here.
*/
#ifndef HAVE_MEMFD_CREATE
static int memfd_create(const char *name, unsigned int flags)
{
return syscall(SYS_memfd_create, name, flags);
}
#endif
/*
* Open a memory-backed fd to back an xfile. We require close-on-exec here,
* because these memfd files function as windowed RAM and hence should never
* be shared with other processes.
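*
* Try memfd_create first; if the kernel is too old for that, fall back to
* an unlinked O_TMPFILE file in /dev/shm or /tmp, and finally to mkostemp.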
*/
static int
xfile_create_fd(
const char *description)
{
int fd = -1;
int ret;
char tmpname[] = "libxfsXXXXXX";
/*
* memfd_create was added to kernel 3.17 (2014). MFD_NOEXEC_SEAL
* causes -EINVAL on old kernels, so fall back to omitting it so that
* new xfs_repair can run on an older recovery cd kernel.
*/
fd = memfd_create(description, MFD_CLOEXEC | MFD_NOEXEC_SEAL);
if (fd >= 0)
goto got_fd;
fd = memfd_create(description, MFD_CLOEXEC);
if (fd >= 0)
goto got_fd;
/*
* O_TMPFILE exists as of kernel 3.11 (2013), which means that if we
* find it, we're pretty safe in assuming O_CLOEXEC exists too.
*/
fd = open("/dev/shm", O_TMPFILE | O_CLOEXEC | O_RDWR, 0600);
if (fd >= 0)
goto got_fd;
fd = open("/tmp", O_TMPFILE | O_CLOEXEC | O_RDWR, 0600);
if (fd >= 0)
goto got_fd;
/*
* mkostemp exists as of glibc 2.7 (2007) and O_CLOEXEC exists as of
* kernel 2.6.23 (2007).
*/
fd = mkostemp(tmpname, O_CLOEXEC);
if (fd >= 0)
goto got_fd;
if (!errno)
errno = EOPNOTSUPP;
return -1;
got_fd:
/*
* Turn off mode bits we don't want -- group members and others should
* not have access to the xfile, nor should it be executable.  memfds are
* created with mode 0777, but we'll be careful just in case the other
* implementations fail to set 0600.
*/
ret = fchmod(fd, 0600);
if (ret)
perror("disabling xfile executable bit");
return fd;
}
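/*
 * Global list of memfd file control blocks that back partitioned xfiles.
 * xfile_fcb_find() carves each new partition out of one of these files;
 * fcb_mutex serializes list walks and refcount updates.
 */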
static LIST_HEAD(fcb_list);
static pthread_mutex_t fcb_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Create a new memfd. */
static inline int
xfile_fcb_create(
const char *description,
struct xfile_fcb **fcbp)
{
struct xfile_fcb *fcb;
int fd;
fd = xfile_create_fd(description);
if (fd < 0)
return -errno;
fcb = malloc(sizeof(struct xfile_fcb));
if (!fcb) {
close(fd);
return -ENOMEM;
}
list_head_init(&fcb->fcb_list);
fcb->fd = fd;
fcb->refcount = 1;
*fcbp = fcb;
return 0;
}
/* Release an xfile control block */
static void
xfile_fcb_irele(
struct xfile_fcb *fcb,
loff_t pos,
uint64_t len)
{
/*
* If this memfd is linked only to itself, it's private, so we can
* close it without taking any locks.
*/
if (list_empty(&fcb->fcb_list)) {
close(fcb->fd);
free(fcb);
return;
}
pthread_mutex_lock(&fcb_mutex);
if (--fcb->refcount == 0) {
/* If we're the last user of this memfd file, kill it fast. */
list_del(&fcb->fcb_list);
close(fcb->fd);
free(fcb);
} else if (len > 0) {
struct stat statbuf;
int ret;
/*
* If we were using the end of a partitioned file, free the
* address space. IOWs, bonus points if you delete these in
* reverse-order of creation.
*/
ret = fstat(fcb->fd, &statbuf);
if (!ret && statbuf.st_size == pos + len) {
ret = ftruncate(fcb->fd, pos);
}
}
pthread_mutex_unlock(&fcb_mutex);
}
/*
* Find a memfd that can accommodate the given amount of address space.
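*
* For illustration (sizes hypothetical): if two xfiles are created with a
* maxbytes of 1MB, the first caller creates a memfd and is assigned
* partition_pos 0; the second finds that fcb on the list, the file is
* extended by another page-rounded 1MB, and it is assigned partition_pos 1MB.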
*/
static int
xfile_fcb_find(
const char *description,
uint64_t maxbytes,
loff_t *posp,
struct xfile_fcb **fcbp)
{
struct xfile_fcb *fcb;
int ret;
int error = 0;
/* No maximum range means that the caller gets a private memfd. */
if (maxbytes == 0) {
*posp = 0;
return xfile_fcb_create(description, fcbp);
}
/* round up to page granularity so we can do mmap */
maxbytes = roundup_64(maxbytes, PAGE_SIZE);
pthread_mutex_lock(&fcb_mutex);
/*
* If the caller asked for a bounded range of bytes, try to carve that
* range out of the end of an existing memfd.
*/
list_for_each_entry(fcb, &fcb_list, fcb_list) {
struct stat statbuf;
loff_t pos;
ret = fstat(fcb->fd, &statbuf);
if (ret)
continue;
pos = roundup_64(statbuf.st_size, PAGE_SIZE);
/*
* Truncate up to ensure that the memfd can actually handle
* writes to the end of the range.
*/
ret = ftruncate(fcb->fd, pos + maxbytes);
if (ret)
continue;
fcb->refcount++;
*posp = pos;
*fcbp = fcb;
goto out_unlock;
}
/* Otherwise, open a new memfd and add it to our list. */
error = xfile_fcb_create(description, &fcb);
if (error)
goto out_unlock;
ret = ftruncate(fcb->fd, maxbytes);
if (ret) {
error = -errno;
xfile_fcb_irele(fcb, 0, maxbytes);
goto out_unlock;
}
list_add_tail(&fcb->fcb_list, &fcb_list);
*posp = 0;
*fcbp = fcb;
out_unlock:
pthread_mutex_unlock(&fcb_mutex);
return error;
}
/*
* Create an xfile of the given size. The description will be used in the
* trace output.
*/
int
xfile_create(
const char *description,
unsigned long long maxbytes,
struct xfile **xfilep)
{
struct xfile *xf;
int error;
xf = kmalloc(sizeof(struct xfile), 0);
if (!xf)
return -ENOMEM;
error = xfile_fcb_find(description, maxbytes, &xf->partition_pos,
&xf->fcb);
if (error) {
kfree(xf);
return error;
}
xf->maxbytes = maxbytes;
*xfilep = xf;
return 0;
}
/* Close the file and release all resources. */
void
xfile_destroy(
struct xfile *xf)
{
xfile_fcb_irele(xf->fcb, xf->partition_pos, xf->maxbytes);
kfree(xf);
}
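/*
 * Upper bound on this xfile's size: the partition size if one was set at
 * creation time, otherwise the largest offset that a loff_t can express.
 */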
static inline loff_t
xfile_maxbytes(
struct xfile *xf)
{
if (xf->maxbytes > 0)
return xf->maxbytes;
if (sizeof(loff_t) == 8)
return LLONG_MAX;
return LONG_MAX;
}
/*
* Load an object. Since we're treating this file as "memory", any error or
* short IO is treated as a failure to allocate memory.
*/
ssize_t
xfile_load(
struct xfile *xf,
void *buf,
size_t count,
loff_t pos)
{
ssize_t ret;
if (count > INT_MAX)
return -ENOMEM;
if (xfile_maxbytes(xf) - pos < count)
return -ENOMEM;
ret = pread(xf->fcb->fd, buf, count, pos + xf->partition_pos);
if (ret < 0)
return -errno;
if (ret != count)
return -ENOMEM;
return 0;
}
/*
* Store an object. Since we're treating this file as "memory", any error or
* short IO is treated as a failure to allocate memory.
*/
ssize_t
xfile_store(
struct xfile *xf,
const void *buf,
size_t count,
loff_t pos)
{
ssize_t ret;
if (count > INT_MAX)
return -E2BIG;
if (xfile_maxbytes(xf) - pos < count)
return -EFBIG;
ret = pwrite(xf->fcb->fd, buf, count, pos + xf->partition_pos);
if (ret < 0)
return -errno;
if (ret != count)
return -ENOMEM;
return 0;
}
/* Compute the number of bytes used by a partitioned xfile. */
static unsigned long long
xfile_partition_bytes(
struct xfile *xf)
{
loff_t data_pos = xf->partition_pos;
loff_t stop_pos = data_pos + xf->maxbytes;
loff_t hole_pos;
unsigned long long bytes = 0;
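/*
 * Walk the written extents within this xfile's partition by alternating
 * SEEK_DATA and SEEK_HOLE seeks, summing the bytes of data found.
 */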
data_pos = lseek(xf->fcb->fd, data_pos, SEEK_DATA);
while (data_pos >= 0 && data_pos < stop_pos) {
hole_pos = lseek(xf->fcb->fd, data_pos, SEEK_HOLE);
if (hole_pos < 0) {
/* save error, break */
data_pos = hole_pos;
break;
}
if (hole_pos >= stop_pos) {
bytes += stop_pos - data_pos;
return bytes;
}
bytes += hole_pos - data_pos;
data_pos = lseek(xf->fcb->fd, hole_pos, SEEK_DATA);
}
if (data_pos < 0 && errno != ENXIO)
return xf->maxbytes;
return bytes;
}
/* Compute the number of bytes used by an xfile. */
unsigned long long
xfile_bytes(
struct xfile *xf)
{
struct stat statbuf;
int error;
if (xf->maxbytes > 0)
return xfile_partition_bytes(xf);
error = fstat(xf->fcb->fd, &statbuf);
if (error)
return -errno;
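/* st_blocks counts 512-byte units, so shift left by 9 to get bytes. */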
return (unsigned long long)statbuf.st_blocks << 9;
}
/* Discard pages backing a range of the xfile. */
void
xfile_discard(
struct xfile *xf,
loff_t pos,
unsigned long long count)
{
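/*
 * Punching a hole releases the pages backing the range while KEEP_SIZE
 * preserves the file size.  The return value is deliberately ignored;
 * failure here merely means the memory is not given back.
 */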
fallocate(xf->fcb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
pos, count);
}