File: dm-io.c

/*
 * Copyright (C) 2003 Sistina Software
 *
 * This file is released under the GPL.
 */

#include "dm-io.h"

#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/bitops.h>

/* FIXME: can we shrink this? */
struct io_context {
	int rw;
	unsigned long error;	/* bitset of failed regions; a long, since set_bit() operates on longs */
	atomic_t count;
	struct task_struct *sleeper;
	io_notify_fn callback;
	void *context;
};

/*
 * We maintain a pool of buffer heads for dispatching the io.
 */
static unsigned int _num_bhs;
static mempool_t *_buffer_pool;

/*
 * io contexts are only dynamically allocated for asynchronous
 * io.  Since async io is likely to be the majority of io we'll
 * have the same number of io contexts as buffer heads!  (FIXME:
 * must reduce this.)
 */
static mempool_t *_io_pool;

static void *alloc_bh(int gfp_mask, void *pool_data)
{
	struct buffer_head *bh;

	bh = kmem_cache_alloc(bh_cachep, gfp_mask);
	if (bh) {
		bh->b_reqnext = NULL;
		init_waitqueue_head(&bh->b_wait);
		INIT_LIST_HEAD(&bh->b_inode_buffers);
	}

	return bh;
}

static void *alloc_io(int gfp_mask, void *pool_data)
{
	return kmalloc(sizeof(struct io_context), gfp_mask);
}

static void free_io(void *element, void *pool_data)
{
	kfree(element);
}

static unsigned int pages_to_buffers(unsigned int pages)
{
	return 4 * pages;	/* too many? */
}

static int resize_pool(unsigned int new_bhs)
{
	int r = 0;

	if (_buffer_pool) {
		if (new_bhs == 0) {
			/* free off the pools */
			mempool_destroy(_buffer_pool);
			mempool_destroy(_io_pool);
			_buffer_pool = _io_pool = NULL;
		} else {
			/* resize the pools */
			r = mempool_resize(_buffer_pool, new_bhs, GFP_KERNEL);
			if (!r)
				r = mempool_resize(_io_pool,
						   new_bhs, GFP_KERNEL);
		}
	} else {
		/* create new pools */
		_buffer_pool = mempool_create(new_bhs, alloc_bh,
					      mempool_free_slab, bh_cachep);
		if (!_buffer_pool)
			return -ENOMEM;

		_io_pool = mempool_create(new_bhs, alloc_io, free_io, NULL);
		if (!_io_pool) {
			mempool_destroy(_buffer_pool);
			_buffer_pool = NULL;
			return -ENOMEM;
		}
	}

	if (!r)
		_num_bhs = new_bhs;

	return r;
}

int dm_io_get(unsigned int num_pages)
{
	return resize_pool(_num_bhs + pages_to_buffers(num_pages));
}

void dm_io_put(unsigned int num_pages)
{
	resize_pool(_num_bhs - pages_to_buffers(num_pages));
}

/*-----------------------------------------------------------------
 * We need to keep track of which region a buffer is doing io
 * for.  In order to save a memory allocation we store this in an
 * unused field of the buffer head, and provide these access
 * functions.
 *
 * FIXME: add compile time check that an unsigned int can fit
 * into a pointer.
 *
 *---------------------------------------------------------------*/
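/*
 * A minimal sketch of that compile-time check (not part of the
 * original source; the typedef name is arbitrary): the array size
 * below goes negative, and so fails the build, whenever an
 * unsigned int cannot be round-tripped through the pointer field.
 */
typedef char __dm_io_region_fits_in_ptr[sizeof(unsigned int) <= sizeof(void *) ? 1 : -1];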
static inline void bh_set_region(struct buffer_head *bh, unsigned int region)
{
	bh->b_journal_head = (void *) region;
}

static inline unsigned int bh_get_region(struct buffer_head *bh)
{
	return (unsigned int) bh->b_journal_head;
}

/*-----------------------------------------------------------------
 * We need an io object to keep track of the number of bhs that
 * have been dispatched for a particular io.
 *---------------------------------------------------------------*/
static void dec_count(struct io_context *io, unsigned int region, int error)
{
	if (error)
		set_bit(region, &io->error);

	if (atomic_dec_and_test(&io->count)) {
		if (io->sleeper)
			wake_up_process(io->sleeper);

		else {
			int r = io->error;
			io_notify_fn fn = io->callback;
			void *context = io->context;

			mempool_free(io, _io_pool);
			fn(r, context);
		}
	}
}

static void endio(struct buffer_head *bh, int uptodate)
{
	struct io_context *io = (struct io_context *) bh->b_private;

	if (!uptodate && io->rw != WRITE) {
		/*
		 * We need to zero this region, otherwise people
		 * like kcopyd may write the arbitrary contents
		 * of the page.
		 */
		memset(bh->b_data, 0, bh->b_size);
	}

	dec_count(io, bh_get_region(bh), !uptodate);
	mempool_free(bh, _buffer_pool);
}

/*
 * Primitives for alignment calculations.
 */
int fls(unsigned n)
{
	return generic_fls32(n);
}

/* Position of the lowest set bit, i.e. the largest power-of-2 alignment of n. */
static inline int log2_floor(unsigned n)
{
	return ffs(n) - 1;
}

/* Position of the highest set bit, i.e. floor(log2(n)). */
static inline int log2_align(unsigned n)
{
	return fls(n) - 1;
}
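
/*
 * Worked example: for n = 24 (binary 11000), ffs(24) == 4 and
 * fls(24) == 5, so log2_floor(24) == 3 (24 is aligned to 2^3 = 8)
 * and log2_align(24) == 4 (the floor of log2(24)).
 */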

/*
 * Dispatches io for the blocks of one page.  Advances *block and
 * returns non-zero once end_block has been reached.
 */
static int do_page(kdev_t dev, sector_t *block, sector_t end_block,
		   unsigned int block_size,
		   struct page *p, unsigned int offset,
		   unsigned int region, struct io_context *io)
{
	struct buffer_head *bh;
	sector_t b = *block;
	sector_t blocks_per_page = PAGE_SIZE / block_size;
	unsigned int this_size; /* holds the size of the current io */
	sector_t len;

	if (!blocks_per_page) {
		DMERR("dm-io: PAGE_SIZE (%lu) < block_size (%u) unsupported",
		      PAGE_SIZE, block_size);
		return 0;
	}

	while ((offset < PAGE_SIZE) && (b != end_block)) {
		bh = mempool_alloc(_buffer_pool, GFP_NOIO);
		init_buffer(bh, endio, io);
		bh_set_region(bh, region);

		/*
		 * Block size must be a power of 2 and aligned
		 * correctly.
		 */

		len = min(end_block - b, blocks_per_page);
		len = min(len, blocks_per_page - offset / block_size);

		if (!len) {
			DMERR("dm-io: Invalid offset/block_size (%u/%u).",
			      offset, block_size);
			return 0;
		}

		this_size = 1 << log2_align(len);
		if (b)
			this_size = min(this_size,
					(unsigned) 1 << log2_floor(b));

		/*
		 * b_blocknr is expressed in units of this io's
		 * size, i.e. this_size blocks of block_size bytes.
		 */
		bh->b_blocknr = (b / this_size);
		bh->b_size = block_size * this_size;
		set_bh_page(bh, p, offset);
		bh->b_this_page = bh;

		bh->b_dev = dev;
		atomic_set(&bh->b_count, 1);

		bh->b_state = ((1 << BH_Uptodate) | (1 << BH_Mapped) |
			       (1 << BH_Lock));

		if (io->rw == WRITE)
			clear_bit(BH_Dirty, &bh->b_state);

		atomic_inc(&io->count);
		submit_bh(io->rw, bh);

		b += this_size;
		offset += block_size * this_size;
	}

	*block = b;
	return (b == end_block);
}

static void do_region(unsigned int region, struct io_region *where,
		      struct page *page, unsigned int offset,
		      struct io_context *io)
{
	unsigned int block_size = get_hardsect_size(where->dev);
	unsigned int sblock_size = block_size >> 9;
	sector_t block = where->sector / sblock_size;
	sector_t end_block = (where->sector + where->count) / sblock_size;

	while (1) {
		if (do_page(where->dev, &block, end_block, block_size,
			    page, offset, region, io))
			break;

		offset = 0;	/* only offset the first page */

		page = list_entry(page->list.next, struct page, list);
	}
}

static void dispatch_io(unsigned int num_regions, struct io_region *where,
			struct page *pages, unsigned int offset,
			struct io_context *io)
{
	unsigned int i;

	for (i = 0; i < num_regions; i++)
		if (where[i].count)
			do_region(i, where + i, pages, offset, io);

	/*
	 * Drop the extra reference that we were holding to avoid
	 * the io being completed too early.
	 */
	dec_count(io, 0, 0);
}
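
/*
 * Count lifecycle: callers seed io->count with 1 before calling
 * dispatch_io(), do_page() adds one reference per submitted bh and
 * endio() drops one per completion; the final dec_count() above
 * releases the caller's reference, so the io can only complete
 * after every bh has been both submitted and finished.
 */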

/*
 * Synchronous io
 */
int dm_io_sync(unsigned int num_regions, struct io_region *where,
	       int rw, struct page *pages, unsigned int offset,
	       unsigned int *error_bits)
{
	struct io_context io;

	BUG_ON(num_regions > 1 && rw != WRITE);

	io.rw = rw;
	io.error = 0;
	atomic_set(&io.count, 1); /* see dispatch_io() */
	io.sleeper = current;

	dispatch_io(num_regions, where, pages, offset, &io);
	run_task_queue(&tq_disk);

	while (1) {
		set_current_state(TASK_UNINTERRUPTIBLE);

		if (!atomic_read(&io.count))
			break;

		schedule();
	}
	set_current_state(TASK_RUNNING);

	*error_bits = io.error;
	return io.error ? -EIO : 0;
}
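
/*
 * Usage sketch (hypothetical values, assuming io_region's fields
 * are laid out as (dev, sector, count) in dm-io.h): synchronously
 * read one page from the start of `dev':
 *
 *	struct io_region where = { dev, 0, PAGE_SIZE >> 9 };
 *	unsigned int error_bits;
 *	int r = dm_io_sync(1, &where, READ, page, 0, &error_bits);
 *
 * Bit i of error_bits is set for each region i that failed.
 */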

/*
 * Asynchronous io
 */
int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
		struct page *pages, unsigned int offset,
		io_notify_fn fn, void *context)
{
	struct io_context *io = mempool_alloc(_io_pool, GFP_NOIO);

	io->rw = rw;
	io->error = 0;
	atomic_set(&io->count, 1); /* see dispatch_io() */
	io->sleeper = NULL;
	io->callback = fn;
	io->context = context;

	dispatch_io(num_regions, where, pages, offset, io);
	return 0;
}
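
/*
 * Usage sketch (hypothetical callback; see the fn(r, context) call
 * in dec_count() for the arguments it receives):
 *
 *	static void my_notify(int error_bits, void *context)
 *	{
 *		if (error_bits)
 *			DMERR("dm-io: async io failed (0x%x)", error_bits);
 *	}
 *	...
 *	dm_io_async(1, &where, WRITE, page, 0, my_notify, NULL);
 */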

EXPORT_SYMBOL(dm_io_get);
EXPORT_SYMBOL(dm_io_put);
EXPORT_SYMBOL(dm_io_sync);
EXPORT_SYMBOL(dm_io_async);