sched/core: move IO scheduling accounting from io_schedule_timeout() into scheduler
For an interface to support blocking for IOs, it must call io_schedule() instead of schedule(). This makes it tedious to add IO blocking to existing interfaces as the switching between schedule() and io_schedule() is often buried deep. As we already have a way to mark the task as IO scheduling, this can be made easier by separating out io_schedule() into multiple steps so that IO schedule preparation can be performed before invoking a blocking interface and the actual accounting happens inside the scheduler. io_schedule_timeout() does the following three things prior to calling schedule_timeout(). 1. Mark the task as scheduling for IO. 2. Flush out plugged IOs. 3. Account the IO scheduling. #1 and #2 can be performed in the preparation step while #3 must be done close to the actual scheduling. This patch moves #3 into the scheduler so that later patches can separate out preparation and finish steps from io_schedule(). Patch-originally-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Mike Galbraith <efault@gmx.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: adilger.kernel@dilger.ca Cc: akpm@linux-foundation.org Cc: axboe@kernel.dk Cc: jack@suse.com Cc: kernel-team@fb.com Cc: mingbo@fb.com Cc: tytso@mit.edu Link: http://lkml.kernel.org/r/20161207204841.GA22296@htj.duckdns.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
parent
b8fd842369
commit
e33a9bba85
|
@ -2089,11 +2089,24 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
|
|||
p->sched_contributes_to_load = !!task_contributes_to_load(p);
|
||||
p->state = TASK_WAKING;
|
||||
|
||||
if (p->in_iowait) {
|
||||
delayacct_blkio_end();
|
||||
atomic_dec(&task_rq(p)->nr_iowait);
|
||||
}
|
||||
|
||||
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
|
||||
if (task_cpu(p) != cpu) {
|
||||
wake_flags |= WF_MIGRATED;
|
||||
set_task_cpu(p, cpu);
|
||||
}
|
||||
|
||||
#else /* CONFIG_SMP */
|
||||
|
||||
if (p->in_iowait) {
|
||||
delayacct_blkio_end();
|
||||
atomic_dec(&task_rq(p)->nr_iowait);
|
||||
}
|
||||
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
ttwu_queue(p, cpu, wake_flags);
|
||||
|
@ -2143,8 +2156,13 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
|
|||
|
||||
trace_sched_waking(p);
|
||||
|
||||
if (!task_on_rq_queued(p))
|
||||
if (!task_on_rq_queued(p)) {
|
||||
if (p->in_iowait) {
|
||||
delayacct_blkio_end();
|
||||
atomic_dec(&rq->nr_iowait);
|
||||
}
|
||||
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
|
||||
}
|
||||
|
||||
ttwu_do_wakeup(rq, p, 0, rf);
|
||||
ttwu_stat(p, smp_processor_id(), 0);
|
||||
|
@ -2956,6 +2974,36 @@ unsigned long long nr_context_switches(void)
|
|||
return sum;
|
||||
}
|
||||
|
||||
/*
|
||||
* IO-wait accounting, and how it's mostly bollocks (on SMP).
|
||||
*
|
||||
* The idea behind IO-wait accounting is to account the idle time that we could
|
||||
* have spent running if it were not for IO. That is, if we were to improve the
|
||||
* storage performance, we'd have a proportional reduction in IO-wait time.
|
||||
*
|
||||
* This all works nicely on UP, where, when a task blocks on IO, we account
|
||||
* idle time as IO-wait, because if the storage were faster, it could've been
|
||||
* running and we'd not be idle.
|
||||
*
|
||||
* This has been extended to SMP, by doing the same for each CPU. This however
|
||||
* is broken.
|
||||
*
|
||||
* Imagine for instance the case where two tasks block on one CPU, only the one
|
||||
* CPU will have IO-wait accounted, while the other has regular idle. Even
|
||||
* though, if the storage were faster, both could've run at the same time,
|
||||
* utilising both CPUs.
|
||||
*
|
||||
* This means, that when looking globally, the current IO-wait accounting on
|
||||
* SMP is a lower bound, by reason of under accounting.
|
||||
*
|
||||
* Worse, since the numbers are provided per CPU, they are sometimes
|
||||
* interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
|
||||
* associated with any one particular CPU, it can wake to another CPU than it
|
||||
* blocked on. This means the per CPU IO-wait number is meaningless.
|
||||
*
|
||||
* Task CPU affinities can make all that even more 'interesting'.
|
||||
*/
|
||||
|
||||
unsigned long nr_iowait(void)
|
||||
{
|
||||
unsigned long i, sum = 0;
|
||||
|
@ -2966,6 +3014,13 @@ unsigned long nr_iowait(void)
|
|||
return sum;
|
||||
}
|
||||
|
||||
/*
|
||||
* Consumers of these two interfaces, like for example the cpufreq menu
|
||||
* governor are using nonsensical data. Boosting frequency for a CPU that has
|
||||
* IO-wait which might not even end up running the task when it does become
|
||||
* runnable.
|
||||
*/
|
||||
|
||||
unsigned long nr_iowait_cpu(int cpu)
|
||||
{
|
||||
struct rq *this = cpu_rq(cpu);
|
||||
|
@ -3377,6 +3432,11 @@ static void __sched notrace __schedule(bool preempt)
|
|||
deactivate_task(rq, prev, DEQUEUE_SLEEP);
|
||||
prev->on_rq = 0;
|
||||
|
||||
if (prev->in_iowait) {
|
||||
atomic_inc(&rq->nr_iowait);
|
||||
delayacct_blkio_start();
|
||||
}
|
||||
|
||||
/*
|
||||
* If a worker went to sleep, notify and ask workqueue
|
||||
* whether it wants to wake up a task to maintain
|
||||
|
@ -5075,19 +5135,13 @@ EXPORT_SYMBOL_GPL(yield_to);
|
|||
long __sched io_schedule_timeout(long timeout)
|
||||
{
|
||||
int old_iowait = current->in_iowait;
|
||||
struct rq *rq;
|
||||
long ret;
|
||||
|
||||
current->in_iowait = 1;
|
||||
blk_schedule_flush_plug(current);
|
||||
|
||||
delayacct_blkio_start();
|
||||
rq = raw_rq();
|
||||
atomic_inc(&rq->nr_iowait);
|
||||
ret = schedule_timeout(timeout);
|
||||
current->in_iowait = old_iowait;
|
||||
atomic_dec(&rq->nr_iowait);
|
||||
delayacct_blkio_end();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue