blk-mq: fix issue with shared tag queue re-running

This patch attempts to make the case of hctx re-running on driver tag
failure more robust. Without this patch, it's pretty easy to trigger a
stall condition with shared tags. An example is using null_blk like
this:

modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1

which sets up 4 devices, sharing the same tag set with a depth of 1.
Running a fio job ala:

[global]
bs=4k
rw=randread
norandommap
direct=1
ioengine=libaio
iodepth=4

[nullb0]
filename=/dev/nullb0
[nullb1]
filename=/dev/nullb1
[nullb2]
filename=/dev/nullb2
[nullb3]
filename=/dev/nullb3

will inevitably end with one or more threads being stuck waiting for a
scheduler tag. That IO is then stuck forever, until someone else
triggers a run of the queue.

Ensure that we always re-run the hardware queue, if the driver tag we
were waiting for got freed before we added our leftover request entries
back on the dispatch list.

Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Tested-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
Jens Axboe 2017-11-09 08:32:43 -07:00
parent e454d122e2
commit eb619fdb2d
3 changed files with 51 additions and 42 deletions

View File

@ -179,7 +179,6 @@ static const char *const hctx_state_name[] = {
HCTX_STATE_NAME(STOPPED), HCTX_STATE_NAME(STOPPED),
HCTX_STATE_NAME(TAG_ACTIVE), HCTX_STATE_NAME(TAG_ACTIVE),
HCTX_STATE_NAME(SCHED_RESTART), HCTX_STATE_NAME(SCHED_RESTART),
HCTX_STATE_NAME(TAG_WAITING),
HCTX_STATE_NAME(START_ON_RUN), HCTX_STATE_NAME(START_ON_RUN),
}; };
#undef HCTX_STATE_NAME #undef HCTX_STATE_NAME

View File

@ -998,49 +998,64 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
return rq->tag != -1; return rq->tag != -1;
} }
static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
void *key) int flags, void *key)
{ {
struct blk_mq_hw_ctx *hctx; struct blk_mq_hw_ctx *hctx;
hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
list_del(&wait->entry); list_del_init(&wait->entry);
clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
blk_mq_run_hw_queue(hctx, true); blk_mq_run_hw_queue(hctx, true);
return 1; return 1;
} }
static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx,
struct request *rq)
{ {
struct blk_mq_hw_ctx *this_hctx = *hctx;
wait_queue_entry_t *wait = &this_hctx->dispatch_wait;
struct sbq_wait_state *ws; struct sbq_wait_state *ws;
/* if (!list_empty_careful(&wait->entry))
* The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
* The thread which wins the race to grab this bit adds the hardware
* queue to the wait queue.
*/
if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
return false; return false;
init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); spin_lock(&this_hctx->lock);
ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); if (!list_empty(&wait->entry)) {
spin_unlock(&this_hctx->lock);
return false;
}
ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
add_wait_queue(&ws->wait, wait);
/* /*
* As soon as this returns, it's no longer safe to fiddle with * It's possible that a tag was freed in the window between the
* hctx->dispatch_wait, since a completion can wake up the wait queue * allocation failure and adding the hardware queue to the wait
* and unlock the bit. * queue.
*/ */
add_wait_queue(&ws->wait, &hctx->dispatch_wait); if (!blk_mq_get_driver_tag(rq, hctx, false)) {
spin_unlock(&this_hctx->lock);
return false;
}
/*
* We got a tag, remove ourselves from the wait queue to ensure
* someone else gets the wakeup.
*/
spin_lock_irq(&ws->wait.lock);
list_del_init(&wait->entry);
spin_unlock_irq(&ws->wait.lock);
spin_unlock(&this_hctx->lock);
return true; return true;
} }
bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
bool got_budget) bool got_budget)
{ {
struct blk_mq_hw_ctx *hctx; struct blk_mq_hw_ctx *hctx;
struct request *rq, *nxt; struct request *rq, *nxt;
bool no_tag = false;
int errors, queued; int errors, queued;
if (list_empty(list)) if (list_empty(list))
@ -1060,22 +1075,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
if (!blk_mq_get_driver_tag(rq, &hctx, false)) { if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
/* /*
* The initial allocation attempt failed, so we need to * The initial allocation attempt failed, so we need to
* rerun the hardware queue when a tag is freed. * rerun the hardware queue when a tag is freed. The
* waitqueue takes care of that. If the queue is run
* before we add this entry back on the dispatch list,
* we'll re-run it below.
*/ */
if (!blk_mq_dispatch_wait_add(hctx)) { if (!blk_mq_dispatch_wait_add(&hctx, rq)) {
if (got_budget)
blk_mq_put_dispatch_budget(hctx);
break;
}
/*
* It's possible that a tag was freed in the window
* between the allocation failure and adding the
* hardware queue to the wait queue.
*/
if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
if (got_budget) if (got_budget)
blk_mq_put_dispatch_budget(hctx); blk_mq_put_dispatch_budget(hctx);
no_tag = true;
break; break;
} }
} }
@ -1140,10 +1148,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
* it is no longer set that means that it was cleared by another * it is no longer set that means that it was cleared by another
* thread and hence that a queue rerun is needed. * thread and hence that a queue rerun is needed.
* *
* If TAG_WAITING is set that means that an I/O scheduler has * If 'no_tag' is set, that means that we failed getting
* been configured and another thread is waiting for a driver * a driver tag with an I/O scheduler attached. If our dispatch
* tag. To guarantee fairness, do not rerun this hardware queue * waitqueue is no longer active, ensure that we run the queue
* but let the other thread grab the driver tag. * AFTER adding our entries back to the list.
* *
* If no I/O scheduler has been configured it is possible that * If no I/O scheduler has been configured it is possible that
* the hardware queue got stopped and restarted before requests * the hardware queue got stopped and restarted before requests
@ -1155,8 +1163,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
* returning BLK_STS_RESOURCE. Two exceptions are scsi-mq * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
* and dm-rq. * and dm-rq.
*/ */
if (!blk_mq_sched_needs_restart(hctx) && if (!blk_mq_sched_needs_restart(hctx) ||
!test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
blk_mq_run_hw_queue(hctx, true); blk_mq_run_hw_queue(hctx, true);
} }
@ -2020,6 +2028,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
hctx->nr_ctx = 0; hctx->nr_ctx = 0;
init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
if (set->ops->init_hctx && if (set->ops->init_hctx &&
set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
goto free_bitmap; goto free_bitmap;

View File

@ -35,7 +35,7 @@ struct blk_mq_hw_ctx {
struct blk_mq_ctx **ctxs; struct blk_mq_ctx **ctxs;
unsigned int nr_ctx; unsigned int nr_ctx;
wait_queue_entry_t dispatch_wait; wait_queue_entry_t dispatch_wait;
atomic_t wait_index; atomic_t wait_index;
struct blk_mq_tags *tags; struct blk_mq_tags *tags;
@ -181,8 +181,7 @@ enum {
BLK_MQ_S_STOPPED = 0, BLK_MQ_S_STOPPED = 0,
BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_S_TAG_ACTIVE = 1,
BLK_MQ_S_SCHED_RESTART = 2, BLK_MQ_S_SCHED_RESTART = 2,
BLK_MQ_S_TAG_WAITING = 3, BLK_MQ_S_START_ON_RUN = 3,
BLK_MQ_S_START_ON_RUN = 4,
BLK_MQ_MAX_DEPTH = 10240, BLK_MQ_MAX_DEPTH = 10240,