Merge branch 'for-4.1/core' of git://git.kernel.dk/linux-block
Pull block layer core bits from Jens Axboe:
 "This is the core pull request for 4.1. Not a lot of stuff in here for this round, mostly little fixes or optimizations. This pull request contains:

  - An optimization that speeds up queue runs on blk-mq, especially for the case where there's a large difference between nr_cpu_ids and the actual mapped software queues on a hardware queue. From Chong Yuan.

  - Honor node local allocations for requests on legacy devices. From David Rientjes.

  - Cleanup of blk_mq_rq_to_pdu() from me.

  - exit_aio() fixup from me, greatly speeding up exiting multiple IO contexts off exit_group(). For my particular test case, fio exit took ~6 seconds. A typical case of both exposing RCU grace periods to user space, and serializing exit of them.

  - Make blk_mq_queue_enter() honor the gfp mask passed in, so we only wait if __GFP_WAIT is set. From Keith Busch.

  - blk-mq exports and two added helpers from Mike Snitzer, which will be used by the dm-mq code.

  - Cleanups of blk-mq queue init from Wei Fang and Xiaoguang Wang"

* 'for-4.1/core' of git://git.kernel.dk/linux-block:
  blk-mq: reduce unnecessary software queue looping
  aio: fix serial draining in exit_aio()
  blk-mq: cleanup blk_mq_rq_to_pdu()
  blk-mq: put blk_queue_rq_timeout together in blk_mq_init_queue()
  block: remove redundant check about 'set->nr_hw_queues' in blk_mq_alloc_tag_set()
  block: allocate request memory local to request queue
  blk-mq: don't wait in blk_mq_queue_enter() if __GFP_WAIT isn't set
  blk-mq: export blk_mq_run_hw_queues
  blk-mq: add blk_mq_init_allocated_queue and export blk_mq_register_disk
commit d82312c808
@@ -557,6 +557,18 @@ void blk_cleanup_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_cleanup_queue);
 
+/* Allocate memory local to the request queue */
+static void *alloc_request_struct(gfp_t gfp_mask, void *data)
+{
+	int nid = (int)(long)data;
+	return kmem_cache_alloc_node(request_cachep, gfp_mask, nid);
+}
+
+static void free_request_struct(void *element, void *unused)
+{
+	kmem_cache_free(request_cachep, element);
+}
+
 int blk_init_rl(struct request_list *rl, struct request_queue *q,
 		gfp_t gfp_mask)
 {
@@ -569,9 +581,10 @@ int blk_init_rl(struct request_list *rl, struct request_queue *q,
 	init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
 	init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
 
-	rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
-					  mempool_free_slab, request_cachep,
-					  gfp_mask, q->node);
+	rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, alloc_request_struct,
+					  free_request_struct,
+					  (void *)(long)q->node, gfp_mask,
+					  q->node);
 	if (!rl->rq_pool)
 		return -ENOMEM;
 
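For reference, the callback contract the two helpers above implement comes from the mempool API: mempool_create_node() stores the pool_data pointer and hands it back to both callbacks, which is how the queue's NUMA node reaches kmem_cache_alloc_node(). The stock mempool_alloc_slab()/mempool_free_slab() pair used before takes the kmem_cache itself as pool_data and allocates with no node preference, so it could not honor q->node. The relevant typedefs (from include/linux/mempool.h) are:

	typedef void * (mempool_alloc_t)(gfp_t gfp_mask, void *pool_data);
	typedef void (mempool_free_t)(void *element, void *pool_data);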
@@ -436,6 +436,7 @@ int blk_mq_register_disk(struct gendisk *disk)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(blk_mq_register_disk);
 
 void blk_mq_sysfs_unregister(struct request_queue *q)
 {
@@ -33,7 +33,6 @@ static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
 
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
-static void blk_mq_run_queues(struct request_queue *q);
 
 /*
  * Check if any of the ctx's have pending work in this hardware queue
@@ -78,7 +77,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
 	clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
 }
 
-static int blk_mq_queue_enter(struct request_queue *q)
+static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp)
 {
 	while (true) {
 		int ret;
@@ -86,6 +85,9 @@ static int blk_mq_queue_enter(struct request_queue *q)
 		if (percpu_ref_tryget_live(&q->mq_usage_counter))
 			return 0;
 
+		if (!(gfp & __GFP_WAIT))
+			return -EBUSY;
+
 		ret = wait_event_interruptible(q->mq_freeze_wq,
 				!q->mq_freeze_depth || blk_queue_dying(q));
 		if (blk_queue_dying(q))
@@ -118,7 +120,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
 
 	if (freeze) {
 		percpu_ref_kill(&q->mq_usage_counter);
-		blk_mq_run_queues(q);
+		blk_mq_run_hw_queues(q, false);
 	}
 }
 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
@@ -257,7 +259,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
 	struct blk_mq_alloc_data alloc_data;
 	int ret;
 
-	ret = blk_mq_queue_enter(q);
+	ret = blk_mq_queue_enter(q, gfp);
 	if (ret)
 		return ERR_PTR(ret);
 
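With blk_mq_queue_enter() now honoring the gfp mask, a request allocation that must not sleep fails fast while a frozen queue is draining instead of blocking on mq_freeze_wq. A minimal caller-side sketch (the surrounding driver code is hypothetical; blk_mq_alloc_request() is the function changed above):

	struct request *rq;

	/* GFP_ATOMIC lacks __GFP_WAIT, so a frozen or dying queue now yields
	 * ERR_PTR(-EBUSY) immediately; GFP_KERNEL callers still wait. */
	rq = blk_mq_alloc_request(q, READ, GFP_ATOMIC, false);
	if (IS_ERR(rq))
		return PTR_ERR(rq);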
@@ -904,7 +906,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 			&hctx->run_work, 0);
 }
 
-static void blk_mq_run_queues(struct request_queue *q)
+void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 {
 	struct blk_mq_hw_ctx *hctx;
 	int i;
@@ -915,9 +917,10 @@ static void blk_mq_run_queues(struct request_queue *q)
 		    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
 			continue;
 
-		blk_mq_run_hw_queue(hctx, false);
+		blk_mq_run_hw_queue(hctx, async);
 	}
 }
+EXPORT_SYMBOL(blk_mq_run_hw_queues);
 
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
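blk_mq_run_hw_queues() is now usable from modules (the pull message names the upcoming dm-mq code as the consumer). An illustrative one-liner for an outside driver that wants to kick all hardware queues of a request_queue it manages:

	/* async = true defers the queue runs to kblockd instead of running
	 * them in the caller's context */
	blk_mq_run_hw_queues(q, true);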
@@ -1186,7 +1189,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 	int rw = bio_data_dir(bio);
 	struct blk_mq_alloc_data alloc_data;
 
-	if (unlikely(blk_mq_queue_enter(q))) {
+	if (unlikely(blk_mq_queue_enter(q, GFP_KERNEL))) {
 		bio_endio(bio, -EIO);
 		return NULL;
 	}
@@ -1517,8 +1520,6 @@ static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
 	if (!bitmap->map)
 		return -ENOMEM;
 
-	bitmap->map_size = num_maps;
-
 	total = nr_cpu_ids;
 	for (i = 0; i < num_maps; i++) {
 		bitmap->map[i].depth = min(total, bitmap->bits_per_word);
@@ -1759,8 +1760,6 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 			continue;
 
 		hctx = q->mq_ops->map_queue(q, i);
-		cpumask_set_cpu(i, hctx->cpumask);
-		hctx->nr_ctx++;
 
 		/*
 		 * Set local node, IFF we have more than one hw queue. If
@@ -1797,6 +1796,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	}
 
 	queue_for_each_hw_ctx(q, hctx, i) {
+		struct blk_mq_ctxmap *map = &hctx->ctx_map;
+
 		/*
 		 * If no software queues are mapped to this hardware queue,
 		 * disable it and free the request entries.
@@ -1812,6 +1813,13 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 			continue;
 		}
 
+		/*
+		 * Set the map size to the number of mapped software queues.
+		 * This is more accurate and more efficient than looping
+		 * over all possibly mapped software queues.
+		 */
+		map->map_size = hctx->nr_ctx / map->bits_per_word;
+
 		/*
 		 * Initialize batch roundrobin counts
 		 */
@@ -1888,10 +1896,26 @@ void blk_mq_release(struct request_queue *q)
 }
 
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
+{
+	struct request_queue *uninit_q, *q;
+
+	uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
+	if (!uninit_q)
+		return ERR_PTR(-ENOMEM);
+
+	q = blk_mq_init_allocated_queue(set, uninit_q);
+	if (IS_ERR(q))
+		blk_cleanup_queue(uninit_q);
+
+	return q;
+}
+EXPORT_SYMBOL(blk_mq_init_queue);
+
+struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
+						  struct request_queue *q)
 {
 	struct blk_mq_hw_ctx **hctxs;
 	struct blk_mq_ctx __percpu *ctx;
-	struct request_queue *q;
 	unsigned int *map;
 	int i;
 
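blk_mq_init_queue() above is itself the reference caller of the new helper; the point of the split is that a driver (dm-mq being the stated consumer) can allocate and pre-configure a request_queue first and attach the tag set later. A rough sketch of that pattern, with the driver-side flow being hypothetical:

	struct request_queue *q;

	q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
	if (!q)
		return -ENOMEM;

	/* driver-specific pre-configuration of q could happen here */

	/* on success the same q is returned, on failure an ERR_PTR */
	if (IS_ERR(blk_mq_init_allocated_queue(set, q))) {
		blk_cleanup_queue(q);
		return -ENOMEM;
	}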
@@ -1926,20 +1950,16 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 		hctxs[i]->queue_num = i;
 	}
 
-	q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
-	if (!q)
-		goto err_hctxs;
-
 	/*
 	 * Init percpu_ref in atomic mode so that it's faster to shutdown.
 	 * See blk_register_queue() for details.
 	 */
 	if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
 			    PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
-		goto err_mq_usage;
+		goto err_hctxs;
 
 	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
-	blk_queue_rq_timeout(q, 30000);
+	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30000);
 
 	q->nr_queues = nr_cpu_ids;
 	q->nr_hw_queues = set->nr_hw_queues;
@@ -1965,9 +1985,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	else
 		blk_queue_make_request(q, blk_sq_make_request);
 
-	if (set->timeout)
-		blk_queue_rq_timeout(q, set->timeout);
-
 	/*
 	 * Do this after blk_queue_make_request() overrides it...
 	 */
@@ -1979,7 +1996,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
 
 	if (blk_mq_init_hw_queues(q, set))
-		goto err_mq_usage;
+		goto err_hctxs;
 
 	mutex_lock(&all_q_mutex);
 	list_add_tail(&q->all_q_node, &all_q_list);
@@ -1991,8 +2008,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 
 	return q;
 
-err_mq_usage:
-	blk_cleanup_queue(q);
 err_hctxs:
 	kfree(map);
 	for (i = 0; i < set->nr_hw_queues; i++) {
@@ -2007,7 +2022,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	free_percpu(ctx);
 	return ERR_PTR(-ENOMEM);
 }
-EXPORT_SYMBOL(blk_mq_init_queue);
+EXPORT_SYMBOL(blk_mq_init_allocated_queue);
 
 void blk_mq_free_queue(struct request_queue *q)
 {
@@ -2159,7 +2174,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
 		return -EINVAL;
 
-	if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue)
+	if (!set->ops->queue_rq || !set->ops->map_queue)
 		return -EINVAL;
 
 	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
fs/aio.c (45 changed lines)
@@ -77,6 +77,11 @@ struct kioctx_cpu {
 	unsigned reqs_available;
 };
 
+struct ctx_rq_wait {
+	struct completion comp;
+	atomic_t count;
+};
+
 struct kioctx {
 	struct percpu_ref users;
 	atomic_t dead;
@@ -115,7 +120,7 @@ struct kioctx {
 	/*
 	 * signals when all in-flight requests are done
 	 */
-	struct completion *requests_done;
+	struct ctx_rq_wait *rq_wait;
 
 	struct {
 		/*
@@ -572,8 +577,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
 	struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
 
 	/* At this point we know that there are no any in-flight requests */
-	if (ctx->requests_done)
-		complete(ctx->requests_done);
+	if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
+		complete(&ctx->rq_wait->comp);
 
 	INIT_WORK(&ctx->free_work, free_ioctx);
 	schedule_work(&ctx->free_work);
@@ -783,7 +788,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
  * the rapid destruction of the kioctx.
  */
 static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
-		      struct completion *requests_done)
+		      struct ctx_rq_wait *wait)
 {
 	struct kioctx_table *table;
 
@@ -813,7 +818,7 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 		if (ctx->mmap_size)
 			vm_munmap(ctx->mmap_base, ctx->mmap_size);
 
-		ctx->requests_done = requests_done;
+		ctx->rq_wait = wait;
 		percpu_ref_kill(&ctx->users);
 		return 0;
 	}
@@ -829,18 +834,24 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 void exit_aio(struct mm_struct *mm)
 {
 	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
-	int i;
+	struct ctx_rq_wait wait;
+	int i, skipped;
 
 	if (!table)
 		return;
 
+	atomic_set(&wait.count, table->nr);
+	init_completion(&wait.comp);
+
+	skipped = 0;
 	for (i = 0; i < table->nr; ++i) {
 		struct kioctx *ctx = table->table[i];
-		struct completion requests_done =
-			COMPLETION_INITIALIZER_ONSTACK(requests_done);
 
-		if (!ctx)
+		if (!ctx) {
+			skipped++;
 			continue;
+		}
 
 		/*
 		 * We don't need to bother with munmap() here - exit_mmap(mm)
 		 * is coming and it'll unmap everything. And we simply can't,
@@ -849,10 +860,12 @@ void exit_aio(struct mm_struct *mm)
 		 * that it needs to unmap the area, just set it to 0.
 		 */
 		ctx->mmap_size = 0;
-		kill_ioctx(mm, ctx, &requests_done);
+		kill_ioctx(mm, ctx, &wait);
+	}
 
+	if (!atomic_sub_and_test(skipped, &wait.count)) {
 		/* Wait until all IO for the context are done. */
-		wait_for_completion(&requests_done);
+		wait_for_completion(&wait.comp);
 	}
 
 	RCU_INIT_POINTER(mm->ioctx_table, NULL);
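Taken together, the exit_aio() hunks replace one on-stack completion per context, each waited on in turn, with a single shared ctx_rq_wait: the count starts at table->nr, every context's final free_ioctx_reqs() decrements it, and table slots that never held a context are refunded as "skipped" before the one wait. A condensed sketch of the scheme (illustrative, not literal patch code):

	atomic_set(&wait.count, table->nr);	/* assume every slot will signal */
	init_completion(&wait.comp);

	/* each dying ctx eventually runs atomic_dec_and_test() + complete() */

	if (!atomic_sub_and_test(skipped, &wait.count))
		wait_for_completion(&wait.comp);	/* sleep once, not once per ctx */

With table->nr == 4 and one empty slot, for example, the three killed contexts take the count from 4 to 1 and the final atomic_sub_and_test(1, ...) takes it to 0; if every slot were empty, the subtraction alone reaches zero and exit_aio() never sleeps.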
@@ -1331,15 +1344,17 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 {
 	struct kioctx *ioctx = lookup_ioctx(ctx);
 	if (likely(NULL != ioctx)) {
-		struct completion requests_done =
-			COMPLETION_INITIALIZER_ONSTACK(requests_done);
+		struct ctx_rq_wait wait;
 		int ret;
 
+		init_completion(&wait.comp);
+		atomic_set(&wait.count, 1);
+
 		/* Pass requests_done to kill_ioctx() where it can be set
 		 * in a thread-safe way. If we try to set it here then we have
 		 * a race condition if two io_destroy() called simultaneously.
 		 */
-		ret = kill_ioctx(current->mm, ioctx, &requests_done);
+		ret = kill_ioctx(current->mm, ioctx, &wait);
 		percpu_ref_put(&ioctx->users);
 
 		/* Wait until all IO for the context are done. Otherwise kernel
@@ -1347,7 +1362,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 		 * is destroyed.
 		 */
 		if (!ret)
-			wait_for_completion(&requests_done);
+			wait_for_completion(&wait.comp);
 
 		return ret;
 	}
@@ -164,6 +164,8 @@ enum {
 		<< BLK_MQ_F_ALLOC_POLICY_START_BIT)
 
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
+struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
+						  struct request_queue *q);
 void blk_mq_finish_init(struct request_queue *q);
 int blk_mq_register_disk(struct gendisk *);
 void blk_mq_unregister_disk(struct gendisk *);
@@ -218,6 +220,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_stop_hw_queues(struct request_queue *q);
 void blk_mq_start_hw_queues(struct request_queue *q);
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
+void blk_mq_run_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
 		void *priv);
@@ -227,7 +230,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q);
 
 /*
  * Driver command data is immediately after the request. So subtract request
- * size to get back to the original request.
+ * size to get back to the original request, add request size to get the PDU.
  */
 static inline struct request *blk_mq_rq_from_pdu(void *pdu)
 {
@@ -235,7 +238,7 @@ static inline struct request *blk_mq_rq_from_pdu(void *pdu)
 }
 static inline void *blk_mq_rq_to_pdu(struct request *rq)
 {
-	return (void *) rq + sizeof(*rq);
+	return rq + 1;
 }
 
 #define queue_for_each_hw_ctx(q, hctx, i) \
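The blk_mq_rq_to_pdu() change is purely cosmetic: for a struct request pointer, rq + 1 and (void *)rq + sizeof(*rq) name the same address (the latter leaning on the kernel's GNU-C void-pointer arithmetic). A self-contained userspace check of that identity, using a stand-in struct rather than the real struct request:

	#include <assert.h>
	#include <stdio.h>

	struct request {		/* stand-in: only the size matters here */
		unsigned char opaque[192];
	};

	int main(void)
	{
		struct request rq;
		void *old_style = (char *)&rq + sizeof(rq);	/* (void *)rq + sizeof(*rq) */
		void *new_style = &rq + 1;			/* rq + 1 */

		assert(old_style == new_style);
		printf("PDU begins %zu bytes past the request\n", sizeof(rq));
		return 0;
	}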