mirror of https://gitee.com/openkylin/qemu.git
aio-posix: move RCU_READ_LOCK() into run_poll_handlers()
Now that run_poll_handlers_once() is only called by run_poll_handlers(), we can improve the CPU time profile by moving the expensive RCU_READ_LOCK() out of the polling loop. This reduces run_poll_handlers() from 40% CPU to 10% CPU in perf's sampling profiler output.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200305170806.1313245-3-stefanha@redhat.com
Message-Id: <20200305170806.1313245-3-stefanha@redhat.com>
This commit is contained in:
parent
e4346192f1
commit
3aa221b382
|
@@ -583,16 +583,6 @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
|
||||||
bool progress = false;
|
bool progress = false;
|
||||||
AioHandler *node;
|
AioHandler *node;
|
||||||
|
|
||||||
/*
|
|
||||||
* Optimization: ->io_poll() handlers often contain RCU read critical
|
|
||||||
* sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
|
|
||||||
* -> rcu_read_lock() -> ... sequences with expensive memory
|
|
||||||
* synchronization primitives. Make the entire polling loop an RCU
|
|
||||||
* critical section because nested rcu_read_lock()/rcu_read_unlock() calls
|
|
||||||
* are cheap.
|
|
||||||
*/
|
|
||||||
RCU_READ_LOCK_GUARD();
|
|
||||||
|
|
||||||
QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
|
QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
|
||||||
if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll &&
|
if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll &&
|
||||||
aio_node_check(ctx, node->is_external) &&
|
aio_node_check(ctx, node->is_external) &&
|
||||||
|
@@ -636,6 +626,16 @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
|
||||||
|
|
||||||
trace_run_poll_handlers_begin(ctx, max_ns, *timeout);
|
trace_run_poll_handlers_begin(ctx, max_ns, *timeout);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Optimization: ->io_poll() handlers often contain RCU read critical
|
||||||
|
* sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
|
||||||
|
* -> rcu_read_lock() -> ... sequences with expensive memory
|
||||||
|
* synchronization primitives. Make the entire polling loop an RCU
|
||||||
|
* critical section because nested rcu_read_lock()/rcu_read_unlock() calls
|
||||||
|
* are cheap.
|
||||||
|
*/
|
||||||
|
RCU_READ_LOCK_GUARD();
|
||||||
|
|
||||||
start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
|
start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
|
||||||
do {
|
do {
|
||||||
progress = run_poll_handlers_once(ctx, timeout);
|
progress = run_poll_handlers_once(ctx, timeout);
|
||||||
|
|
Loading…
Reference in New Issue