File: buf_mem.c

package info (click to toggle)
xfsprogs 6.17.0-2
links: PTS
area: main
in suites: forky, sid
size: 11,324 kB
sloc: ansic: 167,334; sh: 4,604; makefile: 1,336; python: 835; cpp: 5
file content (408 lines) | stat: -rw-r--r-- 8,710 bytes
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2023-2024 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "libxfs_priv.h"
#include "libxfs.h"
#include "libxfs/xfile.h"
#include "libxfs/buf_mem.h"
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>

/*
 * Buffer Cache for In-Memory Files
 * ================================
 *
 * Offline fsck wants to create ephemeral ordered recordsets.  The existing
 * btree infrastructure can do this, but we need the buffer cache to target
 * memory instead of block devices.
 *
 * xfiles meet those requirements.  Therefore, the xmbuf mechanism uses a
 * partition on an xfile to store the staging data.
 *
 * xmbufs assume that the caller will handle all required concurrency
 * management.  The resulting xfs_buf objects are kept private to the xmbuf
 * (they are not recycled to the LRU) because b_addr is mapped directly to the
 * memfd file.
 *
 * The only supported block size is the system page size.
 */

/*
 * Figure out the xfile buffer cache block size here.  Both values are filled
 * in by xmbuf_libinit(): the block size is the system page size (or 4096 if
 * no power-of-two page size is found) and the shift is derived from it.
 */
unsigned int	XMBUF_BLOCKSIZE;
unsigned int	XMBUF_BLOCKSHIFT;

/* Upper bound on simultaneously mapped buffers; set by xmbuf_libinit(). */
long		xmbuf_max_mappings;
/* Count of live mappings; only maintained while xmbuf_unmap_early is false. */
static atomic_t	xmbuf_mappings;
/*
 * Set once mmap fails with ENOMEM or the mapping count exceeds
 * xmbuf_max_mappings.  From then on, buffers are unmapped when they are put
 * back into the cache.
 */
bool		xmbuf_unmap_early = false;

/*
 * Read the mapping limit from /proc/sys/vm/max_map_count.
 *
 * Returns half of the value found there so that the rest of the process can
 * still create mappings, or -1 if the file cannot be opened or does not
 * contain a parseable, non-negative number.
 */
static long
get_max_mmap_count(void)
{
	char	buffer[64];
	long	ret = -1;
	FILE	*file;

	file = fopen("/proc/sys/vm/max_map_count", "r");
	if (!file)
		return -1;

	while (fgets(buffer, sizeof(buffer), file)) {
		char	*p = NULL;
		long	val;

		/*
		 * Parse into a scratch variable so that a line that fails to
		 * convert cannot leak a junk value out through the return
		 * value when it happens to be the last line read.
		 */
		errno = 0;
		val = strtol(buffer, &p, 0);
		if (errno || p == buffer || val < 0)
			continue;

		/* only take half the maximum mmap count so others can use it */
		ret = val / 2;
		break;
	}
	fclose(file);
	return ret;
}

/*
 * Library setup for xmbufs: choose the block size and shift, and decide how
 * many buffers may be mapped at the same time.
 */
void
xmbuf_libinit(void)
{
	long		pagesize = sysconf(_SC_PAGESIZE);
	long		limit;

	/* Use the page size only if it is a power of two; otherwise 4k. */
	if (pagesize >= 0 && is_power_of_2(pagesize))
		XMBUF_BLOCKSIZE = pagesize;
	else
		XMBUF_BLOCKSIZE = 4096;
	XMBUF_BLOCKSHIFT = libxfs_highbit32(XMBUF_BLOCKSIZE);

	/*
	 * Ask procfs how many mappings we may hold at once.  A negative
	 * answer means the query failed, so settle for a low default.
	 */
	limit = get_max_mmap_count();
	xmbuf_max_mappings = (limit < 0) ? 1024 : limit;
}

/* Directly map a memfd page into the buffer cache. */
static int
xmbuf_map_page(
	struct xfs_buf		*bp)
{
	struct xfile		*xfile = bp->b_target->bt_xfile;
	void			*p;
	loff_t			pos;

	pos = xfile->partition_pos + BBTOB(xfs_buf_daddr(bp));
	p = mmap(NULL, BBTOB(bp->b_length), PROT_READ | PROT_WRITE, MAP_SHARED,
			xfile->fcb->fd, pos);
	if (p == MAP_FAILED) {
		if (errno == ENOMEM && !xmbuf_unmap_early) {
#ifdef DEBUG
			fprintf(stderr, "xmbuf could not make mappings!\n");
#endif
			xmbuf_unmap_early = true;
		}
		return errno;
	}

	if (!xmbuf_unmap_early &&
	    atomic_inc_return(&xmbuf_mappings) > xmbuf_max_mappings) {
#ifdef DEBUG
		fprintf(stderr, _("xmbuf hit too many mappings (%ld)!\n",
					xmbuf_max_mappings);
#endif
		xmbuf_unmap_early = true;
	}

	bp->b_addr = p;
	bp->b_flags |= LIBXFS_B_UPTODATE | LIBXFS_B_UNCHECKED;
	bp->b_error = 0;
	return 0;
}

/*
 * Tear down the mapping that xmbuf_map_page set up for this buffer and clear
 * b_addr.
 */
static void
xmbuf_unmap_page(
	struct xfs_buf		*bp)
{
	void			*mapping = bp->b_addr;

	/* The mapping counter is no longer tracked in unmap-early mode. */
	if (!xmbuf_unmap_early)
		atomic_dec(&xmbuf_mappings);

	munmap(mapping, BBTOB(bp->b_length));
	bp->b_addr = NULL;
}


/*
 * Allocate a new cache node (aka a xfs_buf).
 *
 * The buffer is zero-allocated, filled in from the xfs_bufkey passed as @key
 * (block number, length, buffer target, mount), given a single map, and then
 * mapped via xmbuf_map_page.
 *
 * Returns the cache node embedded in the new buffer, or NULL if the
 * allocation or the mapping fails.
 */
static struct cache_node *
xmbuf_cache_alloc(
	cache_key_t		key)
{
	struct xfs_bufkey	*bufkey = (struct xfs_bufkey *)key;
	struct xfs_buf		*bp;
	int			error;

	bp = kmem_cache_zalloc(xfs_buf_cache, 0);
	if (!bp)
		return NULL;

	bp->b_cache_key = bufkey->blkno;
	bp->b_length = bufkey->bblen;
	bp->b_target = bufkey->buftarg;
	bp->b_mount = bufkey->buftarg->bt_mount;

	pthread_mutex_init(&bp->b_lock, NULL);
	INIT_LIST_HEAD(&bp->b_li_list);
	bp->b_maps = &bp->__b_map;

	bp->b_nmaps = 1;
	bp->b_maps[0].bm_bn = bufkey->blkno;
	bp->b_maps[0].bm_len = bp->b_length;

	error = xmbuf_map_page(bp);
	if (error) {
		fprintf(stderr,
 _("%s: %s can't mmap %u bytes at xfile offset %llu: %s\n"),
				progname, __FUNCTION__, BBTOB(bp->b_length),
				(unsigned long long)BBTOB(bufkey->blkno),
				strerror(error));

		/* Undo the mutex init above before the memory goes away. */
		pthread_mutex_destroy(&bp->b_lock);
		kmem_cache_free(xfs_buf_cache, bp);
		return NULL;
	}

	return &bp->b_node;
}

/*
 * Cache "flush" callback, invoked before a cache node is purged.  There is
 * nothing to write back here, so this always returns 0 and never touches
 * @node.
 */
static int
xmbuf_cache_flush(
	struct cache_node	*node)
{
	/* direct mapped buffers do not need writing */
	return 0;
}

/* Tear down a cache node: drop any live mapping, then free the xfs_buf. */
static void
xmbuf_cache_relse(
	struct cache_node	*node)
{
	struct xfs_buf		*bp = container_of(node, struct xfs_buf, b_node);

	/* A buffer whose b_addr is already NULL has nothing to unmap. */
	if (bp->b_addr != NULL)
		xmbuf_unmap_page(bp);

	kmem_cache_free(xfs_buf_cache, bp);
}

/*
 * Release every cache node linked on @list through cn_mru.
 *
 * Returns the number of nodes that were released.  @cache is not used.
 */
static unsigned int
xmbuf_cache_bulkrelse(
	struct cache		*cache,
	struct list_head	*list)
{
	struct cache_node	*node, *next;
	int			nr_released = 0;

	if (!list_empty(list)) {
		/* Releasing a node frees it, hence the _safe iterator. */
		list_for_each_entry_safe(node, next, list, cn_mru) {
			xmbuf_cache_relse(node);
			nr_released++;
		}
	}

	return nr_released;
}

/*
 * Cache "get" callback: make sure the buffer has a mapping.
 *
 * A buffer whose b_addr is NULL is mapped again via xmbuf_map_page.
 *
 * Returns 0 if the buffer is (or already was) mapped, otherwise the positive
 * errno value reported by xmbuf_map_page.
 */
static int
xmbuf_cache_node_get(
	struct cache_node	*node)
{
	struct xfs_buf		*bp =
		container_of(node, struct xfs_buf, b_node);
	int			error;

	if (bp->b_addr != NULL)
		return 0;

	error = xmbuf_map_page(bp);
	if (error) {
		/*
		 * Report the offset in bytes, as xmbuf_cache_alloc does, not
		 * as a raw daddr.
		 */
		fprintf(stderr,
 _("%s: %s can't mmap %u bytes at xfile offset %llu: %s\n"),
				progname, __FUNCTION__, BBTOB(bp->b_length),
				(unsigned long long)BBTOB(xfs_buf_daddr(bp)),
				strerror(error));
		return error;
	}

	return 0;
}

/*
 * Cache "put" callback.  In unmap-early mode the buffer's mapping is torn
 * down when the buffer is put; otherwise the mapping is left in place.
 */
static void
xmbuf_cache_node_put(
	struct cache_node	*node)
{
	struct xfs_buf		*bp;

	if (!xmbuf_unmap_early)
		return;

	bp = container_of(node, struct xfs_buf, b_node);
	xmbuf_unmap_page(bp);
}

/* Cache callbacks for xmbuf targets; handed to cache_init() by xmbuf_alloc(). */
static struct cache_operations xmbuf_bcache_operations = {
	/* lookup: shared with the regular libxfs buffer cache */
	.hash		= libxfs_bhash,
	.compare	= libxfs_bcompare,

	/* node lifetime */
	.alloc		= xmbuf_cache_alloc,
	.flush		= xmbuf_cache_flush,
	.relse		= xmbuf_cache_relse,
	.bulkrelse	= xmbuf_cache_bulkrelse,

	/* (un)mapping as nodes are handed out and returned */
	.get		= xmbuf_cache_node_get,
	.put		= xmbuf_cache_node_put,
};

/*
 * Allocate a buffer cache target for a memory-backed file and set up the
 * buffer target.
 */
int
xmbuf_alloc(
	struct xfs_mount	*mp,
	const char		*descr,
	unsigned long long	maxpos,
	struct xfs_buftarg	**btpp)
{
	struct xfs_buftarg	*btp;
	struct xfile		*xfile;
	struct cache		*cache;
	int			error;

	btp = kzalloc(sizeof(*btp), GFP_KERNEL);
	if (!btp)
		return -ENOMEM;

	error = xfile_create(descr, maxpos, &xfile);
	if (error)
		goto out_btp;

	cache = cache_init(0, LIBXFS_BHASHSIZE(NULL), &xmbuf_bcache_operations);
	if (!cache) {
		error = -ENOMEM;
		goto out_xfile;
	}

	/* Initialize buffer target */
	btp->bt_mount = mp;
	btp->bt_bdev = (dev_t)-1;
	btp->bt_bdev_fd = -1;
	btp->bt_xfile = xfile;
	btp->bcache = cache;

	error = pthread_mutex_init(&btp->lock, NULL);
	if (error)
		goto out_cache;

	*btpp = btp;
	return 0;

out_cache:
	cache_destroy(cache);
out_xfile:
	xfile_destroy(xfile);
out_btp:
	kfree(btp);
	return error;
}

/*
 * Tear down a memory-backed buffer target: destroy its buffer cache, its
 * lock and its xfile, then free the target itself.
 */
void
xmbuf_free(
	struct xfs_buftarg	*btp)
{
	struct cache		*cache;
	struct xfile		*xfile;

	ASSERT(xfs_buftarg_is_mem(btp));

	cache = btp->bcache;
	xfile = btp->bt_xfile;

	cache_destroy(cache);
	pthread_mutex_destroy(&btp->lock);
	xfile_destroy(xfile);
	kfree(btp);
}

/*
 * Check a daddr against the size of the xfile backing this buftarg.
 *
 * Returns true if @daddr is below the xfile's maxbytes shifted down by
 * BBSHIFT.
 */
bool
xmbuf_verify_daddr(
	struct xfs_buftarg	*btp,
	xfs_daddr_t		daddr)
{
	ASSERT(xfs_buftarg_is_mem(btp));

	return daddr < (btp->bt_xfile->maxbytes >> BBSHIFT);
}

/* Discard the page backing this buffer. */
static void
xmbuf_stale(
	struct xfs_buf		*bp)
{
	struct xfile		*xf = bp->b_target->bt_xfile;
	loff_t			pos;

	ASSERT(xfs_buftarg_is_mem(bp->b_target));

	pos = BBTOB(xfs_buf_daddr(bp)) + xf->partition_pos;
	fallocate(xf->fcb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, pos,
			BBTOB(bp->b_length));
}

/*
 * Finalize a buffer.  A stale buffer has its backing page discarded;
 * any other buffer is run through its structure verifier.
 *
 * Returns 0 on success, or -EFSCORRUPTED if the verifier flags a problem.
 */
int
xmbuf_finalize(
	struct xfs_buf		*bp)
{
	xfs_failaddr_t		failaddr;

	if (bp->b_flags & LIBXFS_B_STALE) {
		xmbuf_stale(bp);
		return 0;
	}

	/*
	 * The btree is ephemeral, but checking the buffer structure still
	 * lets us detect memory corruption errors and software bugs.
	 */
	failaddr = bp->b_ops->verify_struct(bp);
	if (!failaddr)
		return 0;

	xfs_verifier_error(bp, -EFSCORRUPTED, failaddr);
	return -EFSCORRUPTED;
}

/*
 * Force an xmbuf buffer off of a transaction.  The dirty, ordered and stale
 * state is cleared from its log item first, and then the buffer is detached
 * repeatedly until no log item remains attached.  Direct-mapped buffers never
 * need bwrite, so nothing is written out.
 */
void
xmbuf_trans_bdetach(
	struct xfs_trans	*tp,
	struct xfs_buf		*bp)
{
	struct xfs_buf_log_item	*item = bp->b_log_item;

	ASSERT(item != NULL);

	item->bli_flags &= ~(XFS_BLI_DIRTY | XFS_BLI_ORDERED |
			     XFS_BLI_STALE);
	clear_bit(XFS_LI_DIRTY, &item->bli_item.li_flags);

	/* Re-read b_log_item each pass; stop once it has been cleared. */
	while (bp->b_log_item)
		xfs_trans_bdetach(tp, bp);
}