aio-posix: remove idle poll handlers to improve scalability
When there are many poll handlers it's likely that some of them are idle most of the time. Remove handlers that haven't had activity recently so that the polling loop scales better for guests with a large number of devices.

This feature only takes effect for the Linux io_uring fd monitoring implementation because it is capable of combining fd monitoring with userspace polling. The other implementations can't do that and risk starving fds in favor of poll handlers, so don't try this optimization when they are in use.

IOPS improves from 10k to 105k when the guest has 100 virtio-blk-pci,num-queues=32 devices and 1 virtio-blk-pci,num-queues=1 device for rw=randread,iodepth=1,bs=4k,ioengine=libaio on NVMe.

[Clarified aio_poll_handlers locking discipline explanation in comment after discussion with Paolo Bonzini <pbonzini@redhat.com>. --Stefan]

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200305170806.1313245-8-stefanha@redhat.com
Message-Id: <20200305170806.1313245-8-stefanha@redhat.com>
commit d37d0e365a
parent aa38e19f05
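In isolation, the policy the patch implements is small. The following standalone sketch shows the bookkeeping it adds; the names (PollHandler, on_poll_success, handler_is_idle) are illustrative, not QEMU APIs:

/*
 * Minimal sketch of the idle-removal policy. Each handler carries a
 * deadline that is pushed forward whenever polling succeeds; a handler
 * whose deadline expires is dropped from the poll list until its fd
 * fires again.
 */
#include <stdbool.h>
#include <stdint.h>

#define POLL_IDLE_INTERVAL_NS (7LL * 1000 * 1000 * 1000)  /* 7 seconds, as in the patch */

typedef struct {
    int64_t poll_idle_timeout;   /* deadline in ns; 0 means "not armed yet" */
} PollHandler;

/* A successful poll pushes the deadline forward, keeping the handler active. */
static void on_poll_success(PollHandler *h, int64_t now)
{
    h->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
}

/*
 * Checked after each polling round: arm new handlers on first sight and
 * report true once a handler has been idle past its deadline, so the
 * caller can remove it from the poll list.
 */
static bool handler_is_idle(PollHandler *h, int64_t now)
{
    if (h->poll_idle_timeout == 0) {
        h->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
        return false;
    }
    return now >= h->poll_idle_timeout;
}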
include/block/aio.h
@@ -227,6 +227,14 @@ struct AioContext {
     int64_t poll_grow;      /* polling time growth factor */
     int64_t poll_shrink;    /* polling time shrink factor */
 
+    /*
+     * List of handlers participating in userspace polling. Protected by
+     * ctx->list_lock. Iterated and modified mostly by the event loop thread
+     * from aio_poll() with ctx->list_lock incremented. aio_set_fd_handler()
+     * only touches the list to delete nodes if ctx->list_lock's count is zero.
+     */
+    AioHandlerList poll_aio_handlers;
+
     /* Are we in polling mode or monitoring file descriptors? */
     bool poll_started;
 
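The locking discipline described in that comment amounts to the following; this is a schematic sketch using QEMU's qemu_lockcnt API, not the literal patch code:

    AioHandler *node;

    /* Event loop thread: bump the counter so writers know the list is in use. */
    qemu_lockcnt_inc(&ctx->list_lock);
    QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
        /* ... call node->io_poll(node->opaque) ... */
    }
    qemu_lockcnt_dec(&ctx->list_lock);

    /* aio_set_fd_handler(): unlink and free nodes only while no one iterates. */
    qemu_lockcnt_lock(&ctx->list_lock);
    if (qemu_lockcnt_count(&ctx->list_lock) == 0) {
        QLIST_SAFE_REMOVE(node, node_poll);  /* count is zero: safe to unlink */
        /* node can be freed now */
    } else {
        /* readers active: defer; aio_free_deleted_handlers() cleans up later */
    }
    qemu_lockcnt_unlock(&ctx->list_lock);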
util/aio-posix.c
@@ -22,6 +22,9 @@
 #include "trace.h"
 #include "aio-posix.h"
 
+/* Stop userspace polling on a handler if it isn't active for some time */
+#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND)
+
 bool aio_poll_disabled(AioContext *ctx)
 {
     return atomic_read(&ctx->poll_disable_cnt);
@@ -78,6 +81,7 @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
      * deleted because deleted nodes are only cleaned up while
      * no one is walking the handlers list.
      */
+    QLIST_SAFE_REMOVE(node, node_poll);
     QLIST_REMOVE(node, node);
     return true;
 }
@@ -205,7 +209,7 @@ static bool poll_set_started(AioContext *ctx, bool started)
     ctx->poll_started = started;
 
     qemu_lockcnt_inc(&ctx->list_lock);
-    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+    QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
         IOHandler *fn;
 
         if (QLIST_IS_INSERTED(node, node_deleted)) {
@@ -286,6 +290,7 @@ static void aio_free_deleted_handlers(AioContext *ctx)
     while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) {
         QLIST_REMOVE(node, node);
         QLIST_REMOVE(node, node_deleted);
+        QLIST_SAFE_REMOVE(node, node_poll);
         g_free(node);
     }
 
@@ -300,6 +305,22 @@ static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
     revents = node->pfd.revents & node->pfd.events;
     node->pfd.revents = 0;
 
+    /*
+     * Start polling AioHandlers when they become ready because activity is
+     * likely to continue. Note that starvation is theoretically possible when
+     * fdmon_supports_polling(), but only until the fd fires for the first
+     * time.
+     */
+    if (!QLIST_IS_INSERTED(node, node_deleted) &&
+        !QLIST_IS_INSERTED(node, node_poll) &&
+        node->io_poll) {
+        trace_poll_add(ctx, node, node->pfd.fd, revents);
+        if (ctx->poll_started && node->io_poll_begin) {
+            node->io_poll_begin(node->opaque);
+        }
+        QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll);
+    }
+
     if (!QLIST_IS_INSERTED(node, node_deleted) &&
         (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
         aio_node_check(ctx, node->is_external) &&
@@ -364,15 +385,19 @@ void aio_dispatch(AioContext *ctx)
     timerlistgroup_run_timers(&ctx->tlg);
 }
 
-static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
+static bool run_poll_handlers_once(AioContext *ctx,
+                                   int64_t now,
+                                   int64_t *timeout)
 {
     bool progress = false;
     AioHandler *node;
+    AioHandler *tmp;
 
-    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
-        if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll &&
-            aio_node_check(ctx, node->is_external) &&
+    QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
+        if (aio_node_check(ctx, node->is_external) &&
             node->io_poll(node->opaque)) {
+            node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
+
             /*
              * Polling was successful, exit try_poll_mode immediately
              * to adjust the next polling time.
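Note that now is supplied by the caller rather than read inside the loop: as a later hunk shows, run_poll_handlers() passes down its cached start_time, so a successful poll refreshes poll_idle_timeout without an extra clock read per handler.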
@@ -389,6 +414,50 @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
     return progress;
 }
 
+static bool fdmon_supports_polling(AioContext *ctx)
+{
+    return ctx->fdmon_ops->need_wait != aio_poll_disabled;
+}
+
+static bool remove_idle_poll_handlers(AioContext *ctx, int64_t now)
+{
+    AioHandler *node;
+    AioHandler *tmp;
+    bool progress = false;
+
+    /*
+     * File descriptor monitoring implementations without userspace polling
+     * support suffer from starvation when a subset of handlers is polled
+     * because fds will not be processed in a timely fashion. Don't remove
+     * idle poll handlers.
+     */
+    if (!fdmon_supports_polling(ctx)) {
+        return false;
+    }
+
+    QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
+        if (node->poll_idle_timeout == 0LL) {
+            node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
+        } else if (now >= node->poll_idle_timeout) {
+            trace_poll_remove(ctx, node, node->pfd.fd);
+            node->poll_idle_timeout = 0LL;
+            QLIST_SAFE_REMOVE(node, node_poll);
+            if (ctx->poll_started && node->io_poll_end) {
+                node->io_poll_end(node->opaque);
+
+                /*
+                 * Final poll in case ->io_poll_end() races with an event.
+                 * Nevermind about re-adding the handler in the rare case where
+                 * this causes progress.
+                 */
+                progress = node->io_poll(node->opaque) || progress;
+            }
+        }
+    }
+
+    return progress;
+}
+
 /* run_poll_handlers:
  * @ctx: the AioContext
  * @max_ns: maximum time to poll for, in nanoseconds
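A note on fdmon_supports_polling(): aio_poll_disabled() is the need_wait callback used by the fd monitoring implementations that cannot combine fd monitoring with userspace polling (fdmon-poll and fdmon-epoll), while fdmon-io_uring installs its own callback, so the function pointer comparison is enough to single out the implementation that can.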
@@ -424,12 +493,17 @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
 
     start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
     do {
-        progress = run_poll_handlers_once(ctx, timeout);
+        progress = run_poll_handlers_once(ctx, start_time, timeout);
         elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
         max_ns = qemu_soonest_timeout(*timeout, max_ns);
         assert(!(max_ns && progress));
     } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));
 
+    if (remove_idle_poll_handlers(ctx, start_time + elapsed_time)) {
+        *timeout = 0;
+        progress = true;
+    }
+
     /* If time has passed with no successful polling, adjust *timeout to
      * keep the same ending time.
      */
@@ -454,8 +528,13 @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
  */
 static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
 {
-    int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
+    int64_t max_ns;
+
+    if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
+        return false;
+    }
 
+    max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
     if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
         poll_set_started(ctx, true);
 
util/aio-posix.h
@@ -30,10 +30,12 @@ struct AioHandler {
     QLIST_ENTRY(AioHandler) node;
     QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
     QLIST_ENTRY(AioHandler) node_deleted;
+    QLIST_ENTRY(AioHandler) node_poll;
 #ifdef CONFIG_LINUX_IO_URING
     QSLIST_ENTRY(AioHandler) node_submitted;
     unsigned flags; /* see fdmon-io_uring.c */
 #endif
+    int64_t poll_idle_timeout; /* when to stop userspace polling */
     bool is_external;
 };
 
util/trace-events
@@ -5,6 +5,8 @@ run_poll_handlers_begin(void *ctx, int64_t max_ns, int64_t timeout) "ctx %p max_ns %"PRId64" timeout %"PRId64
 run_poll_handlers_end(void *ctx, bool progress, int64_t timeout) "ctx %p progress %d new timeout %"PRId64
 poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
 poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
+poll_add(void *ctx, void *node, int fd, unsigned revents) "ctx %p node %p fd %d revents 0x%x"
+poll_remove(void *ctx, void *node, int fd) "ctx %p node %p fd %d"
 
 # async.c
 aio_co_schedule(void *ctx, void *co) "ctx %p co %p"
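These trace points can be enabled at runtime (for example with QEMU's -trace "poll_*" option) to watch handlers being added to and removed from the polling list.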