From b62c21b71f08b7a4bfd025616ff1da2913a82904 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 12 Mar 2015 23:56:02 -0400 Subject: [PATCH 1/9] blk-mq: add blk_mq_init_allocated_queue and export blk_mq_register_disk Add a variant of blk_mq_init_queue that allows a previously allocated queue to be initialized. blk_mq_init_allocated_queue models blk_init_allocated_queue -- which was also created for DM's use. DM's approach to device creation requires a placeholder request_queue be allocated for use with alloc_dev() but the decision about what type of request_queue will be ultimately created is deferred until all component devices referenced in the DM table are processed to determine the table type (request-based, blk-mq request-based, or bio-based). Also, because of DM's late finalization of the request_queue type the call to blk_mq_register_disk() doesn't happen during alloc_dev(). Must export blk_mq_register_disk() so that DM can backfill the 'mq' dir once the blk-mq queue is fully allocated. Signed-off-by: Mike Snitzer Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 1 + block/blk-mq.c | 30 ++++++++++++++++++++---------- include/linux/blk-mq.h | 2 ++ 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 1630a20d5dcf..b79685e06b70 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -436,6 +436,7 @@ int blk_mq_register_disk(struct gendisk *disk) return 0; } +EXPORT_SYMBOL_GPL(blk_mq_register_disk); void blk_mq_sysfs_unregister(struct request_queue *q) { diff --git a/block/blk-mq.c b/block/blk-mq.c index b7b8933ec241..3000121840bb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1890,10 +1890,26 @@ void blk_mq_release(struct request_queue *q) } struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) +{ + struct request_queue *uninit_q, *q; + + uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); + if (!uninit_q) + return ERR_PTR(-ENOMEM); + + q = blk_mq_init_allocated_queue(set, uninit_q); + if (IS_ERR(q)) + blk_cleanup_queue(uninit_q); + + return q; +} +EXPORT_SYMBOL(blk_mq_init_queue); + +struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, + struct request_queue *q) { struct blk_mq_hw_ctx **hctxs; struct blk_mq_ctx __percpu *ctx; - struct request_queue *q; unsigned int *map; int i; @@ -1928,17 +1944,13 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) hctxs[i]->queue_num = i; } - q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); - if (!q) - goto err_hctxs; - /* * Init percpu_ref in atomic mode so that it's faster to shutdown. * See blk_register_queue() for details. */ if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) - goto err_mq_usage; + goto err_hctxs; setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); blk_queue_rq_timeout(q, 30000); @@ -1981,7 +1993,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) blk_mq_init_cpu_queues(q, set->nr_hw_queues); if (blk_mq_init_hw_queues(q, set)) - goto err_mq_usage; + goto err_hctxs; mutex_lock(&all_q_mutex); list_add_tail(&q->all_q_node, &all_q_list); @@ -1993,8 +2005,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) return q; -err_mq_usage: - blk_cleanup_queue(q); err_hctxs: kfree(map); for (i = 0; i < set->nr_hw_queues; i++) { @@ -2009,7 +2019,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) free_percpu(ctx); return ERR_PTR(-ENOMEM); } -EXPORT_SYMBOL(blk_mq_init_queue); +EXPORT_SYMBOL(blk_mq_init_allocated_queue); void blk_mq_free_queue(struct request_queue *q) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7aec86127335..9a75c88e8908 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -164,6 +164,8 @@ enum { << BLK_MQ_F_ALLOC_POLICY_START_BIT) struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); +struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, + struct request_queue *q); void blk_mq_finish_init(struct request_queue *q); int blk_mq_register_disk(struct gendisk *); void blk_mq_unregister_disk(struct gendisk *); From b94ec296403e99d5ac9a8c48332cec4118d44b94 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 11 Mar 2015 23:56:38 -0400 Subject: [PATCH 2/9] blk-mq: export blk_mq_run_hw_queues Rename blk_mq_run_queues to blk_mq_run_hw_queues, add async argument, and export it. DM's suspend support must be able to run the queue without starting stopped hw queues. Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 ++++---- include/linux/blk-mq.h | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 3000121840bb..06614ce0f475 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -33,7 +33,6 @@ static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); -static void blk_mq_run_queues(struct request_queue *q); /* * Check if any of the ctx's have pending work in this hardware queue @@ -118,7 +117,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q) if (freeze) { percpu_ref_kill(&q->mq_usage_counter); - blk_mq_run_queues(q); + blk_mq_run_hw_queues(q, false); } } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); @@ -904,7 +903,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) &hctx->run_work, 0); } -static void blk_mq_run_queues(struct request_queue *q) +void blk_mq_run_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; int i; @@ -915,9 +914,10 @@ static void blk_mq_run_queues(struct request_queue *q) test_bit(BLK_MQ_S_STOPPED, &hctx->state)) continue; - blk_mq_run_hw_queue(hctx, false); + blk_mq_run_hw_queue(hctx, async); } } +EXPORT_SYMBOL(blk_mq_run_hw_queues); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9a75c88e8908..ebfe707cf722 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -220,6 +220,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_stop_hw_queues(struct request_queue *q); void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); +void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn, void *priv); From bfd343aa1718457d34b99ce6573085ac340da288 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 11 Mar 2015 23:56:39 -0400 Subject: [PATCH 3/9] blk-mq: don't wait in blk_mq_queue_enter() if __GFP_WAIT isn't set Return -EBUSY if we're unable to enter a queue immediately when allocating a blk-mq request without __GFP_WAIT. Signed-off-by: Keith Busch Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-mq.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 06614ce0f475..59fa23935a0f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -77,7 +77,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); } -static int blk_mq_queue_enter(struct request_queue *q) +static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp) { while (true) { int ret; @@ -85,6 +85,9 @@ static int blk_mq_queue_enter(struct request_queue *q) if (percpu_ref_tryget_live(&q->mq_usage_counter)) return 0; + if (!(gfp & __GFP_WAIT)) + return -EBUSY; + ret = wait_event_interruptible(q->mq_freeze_wq, !q->mq_freeze_depth || blk_queue_dying(q)); if (blk_queue_dying(q)) @@ -256,7 +259,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, struct blk_mq_alloc_data alloc_data; int ret; - ret = blk_mq_queue_enter(q); + ret = blk_mq_queue_enter(q, gfp); if (ret) return ERR_PTR(ret); @@ -1186,7 +1189,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, int rw = bio_data_dir(bio); struct blk_mq_alloc_data alloc_data; - if (unlikely(blk_mq_queue_enter(q))) { + if (unlikely(blk_mq_queue_enter(q, GFP_KERNEL))) { bio_endio(bio, -EIO); return NULL; } From 271508dba2c3fc307e7c44e2731a2ece70a4025e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 24 Mar 2015 16:21:16 -0700 Subject: [PATCH 4/9] block: allocate request memory local to request queue blk_init_rl() allocates a mempool using mempool_create_node() with node local memory. This only allocates the mempool and element list locally to the requeue queue node. What we really want to do is allocate the request itself local to the queue. To do this, we need our own alloc and free functions that will allocate from request_cachep and pass the request queue node in to prefer node local memory. Acked-by: Tejun Heo Signed-off-by: David Rientjes Signed-off-by: Jens Axboe --- block/blk-core.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 794c3e7f01cf..fd154b94447a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -557,6 +557,18 @@ void blk_cleanup_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_cleanup_queue); +/* Allocate memory local to the request queue */ +static void *alloc_request_struct(gfp_t gfp_mask, void *data) +{ + int nid = (int)(long)data; + return kmem_cache_alloc_node(request_cachep, gfp_mask, nid); +} + +static void free_request_struct(void *element, void *unused) +{ + kmem_cache_free(request_cachep, element); +} + int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask) { @@ -569,9 +581,10 @@ int blk_init_rl(struct request_list *rl, struct request_queue *q, init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); - rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, - mempool_free_slab, request_cachep, - gfp_mask, q->node); + rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, alloc_request_struct, + free_request_struct, + (void *)(long)q->node, gfp_mask, + q->node); if (!rl->rq_pool) return -ENOMEM; From f9018ac9308ea415e659cfbdda040504ef92597b Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Mon, 30 Mar 2015 13:19:14 +0800 Subject: [PATCH 5/9] block: remove redundant check about 'set->nr_hw_queues' in blk_mq_alloc_tag_set() At the beginning of blk_mq_alloc_tag_set(), we have already checked whether 'set->nr_hw_queues' is zero, so here remove this redundant check. Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 59fa23935a0f..37f14362aa15 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2174,7 +2174,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) return -EINVAL; - if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue) + if (!set->ops->queue_rq || !set->ops->map_queue) return -EINVAL; if (set->queue_depth > BLK_MQ_MAX_DEPTH) { From c76cbbcf404475f8885b2252049dac99b0614868 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Mon, 30 Mar 2015 09:07:00 -0600 Subject: [PATCH 6/9] blk-mq: put blk_queue_rq_timeout together in blk_mq_init_queue() Don't assign ->rq_timeout twice. Signed-off-by: Wei Fang Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 37f14362aa15..1192f85e5ff3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1956,7 +1956,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, goto err_hctxs; setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); - blk_queue_rq_timeout(q, 30000); + blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30000); q->nr_queues = nr_cpu_ids; q->nr_hw_queues = set->nr_hw_queues; @@ -1982,9 +1982,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, else blk_queue_make_request(q, blk_sq_make_request); - if (set->timeout) - blk_queue_rq_timeout(q, set->timeout); - /* * Do this after blk_queue_make_request() overrides it... */ From 2963e3f7e8e3465895897a175560210120b932ac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Apr 2015 15:54:05 -0600 Subject: [PATCH 7/9] blk-mq: cleanup blk_mq_rq_to_pdu() Casting to void and adding the size of the request is "shit code" and only a "crazy monkey on crack" would write that. So lets clean it up. Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ebfe707cf722..8210e8797c12 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -230,7 +230,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q); /* * Driver command data is immediately after the request. So subtract request - * size to get back to the original request. + * size to get back to the original request, add request size to get the PDU. */ static inline struct request *blk_mq_rq_from_pdu(void *pdu) { @@ -238,7 +238,7 @@ static inline struct request *blk_mq_rq_from_pdu(void *pdu) } static inline void *blk_mq_rq_to_pdu(struct request *rq) { - return (void *) rq + sizeof(*rq); + return rq + 1; } #define queue_for_each_hw_ctx(q, hctx, i) \ From dc48e56d761610da4ea1088d1bea0a030b8e3e43 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 15 Apr 2015 11:17:23 -0600 Subject: [PATCH 8/9] aio: fix serial draining in exit_aio() exit_aio() currently serializes killing io contexts. Each context killing ends up having to do percpu_ref_kill(), which in turns has to wait for an RCU grace period. This can take a long time, depending on the number of contexts. And there's no point in doing them serially, when we could be waiting for all of them in one fell swoop. This patches makes my fio thread offload test case exit 0.2s instead of almost 6s. Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- fs/aio.c | 45 ++++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index f8e52a1854c1..cabb5edd9bc1 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -77,6 +77,11 @@ struct kioctx_cpu { unsigned reqs_available; }; +struct ctx_rq_wait { + struct completion comp; + atomic_t count; +}; + struct kioctx { struct percpu_ref users; atomic_t dead; @@ -115,7 +120,7 @@ struct kioctx { /* * signals when all in-flight requests are done */ - struct completion *requests_done; + struct ctx_rq_wait *rq_wait; struct { /* @@ -535,8 +540,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref) struct kioctx *ctx = container_of(ref, struct kioctx, reqs); /* At this point we know that there are no any in-flight requests */ - if (ctx->requests_done) - complete(ctx->requests_done); + if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count)) + complete(&ctx->rq_wait->comp); INIT_WORK(&ctx->free_work, free_ioctx); schedule_work(&ctx->free_work); @@ -744,7 +749,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) * the rapid destruction of the kioctx. */ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, - struct completion *requests_done) + struct ctx_rq_wait *wait) { struct kioctx_table *table; @@ -773,7 +778,7 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, if (ctx->mmap_size) vm_munmap(ctx->mmap_base, ctx->mmap_size); - ctx->requests_done = requests_done; + ctx->rq_wait = wait; percpu_ref_kill(&ctx->users); return 0; } @@ -805,18 +810,24 @@ EXPORT_SYMBOL(wait_on_sync_kiocb); void exit_aio(struct mm_struct *mm) { struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table); - int i; + struct ctx_rq_wait wait; + int i, skipped; if (!table) return; + atomic_set(&wait.count, table->nr); + init_completion(&wait.comp); + + skipped = 0; for (i = 0; i < table->nr; ++i) { struct kioctx *ctx = table->table[i]; - struct completion requests_done = - COMPLETION_INITIALIZER_ONSTACK(requests_done); - if (!ctx) + if (!ctx) { + skipped++; continue; + } + /* * We don't need to bother with munmap() here - exit_mmap(mm) * is coming and it'll unmap everything. And we simply can't, @@ -825,10 +836,12 @@ void exit_aio(struct mm_struct *mm) * that it needs to unmap the area, just set it to 0. */ ctx->mmap_size = 0; - kill_ioctx(mm, ctx, &requests_done); + kill_ioctx(mm, ctx, &wait); + } + if (!atomic_sub_and_test(skipped, &wait.count)) { /* Wait until all IO for the context are done. */ - wait_for_completion(&requests_done); + wait_for_completion(&wait.comp); } RCU_INIT_POINTER(mm->ioctx_table, NULL); @@ -1313,15 +1326,17 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { - struct completion requests_done = - COMPLETION_INITIALIZER_ONSTACK(requests_done); + struct ctx_rq_wait wait; int ret; + init_completion(&wait.comp); + atomic_set(&wait.count, 1); + /* Pass requests_done to kill_ioctx() where it can be set * in a thread-safe way. If we try to set it here then we have * a race condition if two io_destroy() called simultaneously. */ - ret = kill_ioctx(current->mm, ioctx, &requests_done); + ret = kill_ioctx(current->mm, ioctx, &wait); percpu_ref_put(&ioctx->users); /* Wait until all IO for the context are done. Otherwise kernel @@ -1329,7 +1344,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) * is destroyed. */ if (!ret) - wait_for_completion(&requests_done); + wait_for_completion(&wait.comp); return ret; } From 889fa31f00b218a2cef96c32a6b3f57e6d3bf918 Mon Sep 17 00:00:00 2001 From: Chong Yuan Date: Wed, 15 Apr 2015 11:39:29 -0600 Subject: [PATCH 9/9] blk-mq: reduce unnecessary software queue looping In flush_busy_ctxs() and blk_mq_hctx_has_pending(), regardless of how many ctxs assigned to one hctx, they will all loop hctx->ctx_map.map_size times. Here hctx->ctx_map.map_size is a const ALIGN(nr_cpu_ids, 8) / 8. Especially, flush_busy_ctxs() is in hot code path. And it's unnecessary. Change ->map_size to contain the actually mapped software queues, so we only loop for as many iterations as we have to. And remove cpumask setting and nr_ctx count in blk_mq_init_cpu_queues() since they are all re-done in blk_mq_map_swqueue(). blk_mq_map_swqueue(). Signed-off-by: Chong Yuan Reviewed-by: Wenbo Wang Updated by me for formatting and commenting. Signed-off-by: Jens Axboe --- block/blk-mq.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 1192f85e5ff3..0b49e42e5310 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1522,8 +1522,6 @@ static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) if (!bitmap->map) return -ENOMEM; - bitmap->map_size = num_maps; - total = nr_cpu_ids; for (i = 0; i < num_maps; i++) { bitmap->map[i].depth = min(total, bitmap->bits_per_word); @@ -1764,8 +1762,6 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, continue; hctx = q->mq_ops->map_queue(q, i); - cpumask_set_cpu(i, hctx->cpumask); - hctx->nr_ctx++; /* * Set local node, IFF we have more than one hw queue. If @@ -1802,6 +1798,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) } queue_for_each_hw_ctx(q, hctx, i) { + struct blk_mq_ctxmap *map = &hctx->ctx_map; + /* * If no software queues are mapped to this hardware queue, * disable it and free the request entries. @@ -1817,6 +1815,13 @@ static void blk_mq_map_swqueue(struct request_queue *q) continue; } + /* + * Set the map size to the number of mapped software queues. + * This is more accurate and more efficient than looping + * over all possibly mapped software queues. + */ + map->map_size = hctx->nr_ctx / map->bits_per_word; + /* * Initialize batch roundrobin counts */