psi: Fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim
We've noticed cases where tasks in a cgroup are stalled on memory but
there is little memory FULL pressure since tasks stay on the runqueue
in reclaim.

A simple example involves a single threaded program that keeps leaking
and touching large amounts of memory. It runs in a cgroup with swap
enabled, memory.high set at 10M and cpu.max ratio set at 5%. Though
there is significant CPU pressure and memory SOME, there is barely any
memory FULL since the task enters reclaim and stays on the runqueue.
However, this memory-bound task is effectively stalled on memory and
we expect memory FULL to match memory SOME in this scenario.

The code is confused about memstall && running, thinking there is a
stalled task and a productive task when there's only one task: a
reclaimer that's counted as both. To fix this, we redefine the
condition for PSI_MEM_FULL to check that all running tasks are in an
active memstall instead of checking that there are no running tasks.

	case PSI_MEM_FULL:
-		return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
+		return unlikely(tasks[NR_MEMSTALL] &&
+			tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);

This will capture reclaimers. It will also capture tasks that called
psi_memstall_enter() and are about to sleep, but this should be
negligible noise.

Signed-off-by: Brian Chen <brianchen118@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lore.kernel.org/r/20211110213312.310243-1-brianchen118@gmail.com
commit cb0e52b774
parent 4feee7d126
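The behavioral change is easy to see in isolation. Below is a minimal
userspace sketch, not kernel code: the enum mirrors psi_task_count after
this patch, and mem_full_old()/mem_full_new() are hypothetical helpers
standing in for the two versions of the PSI_MEM_FULL branch in
test_state(). It replays the commit's scenario of a single task that is
both reclaiming and on the runqueue:

#include <stdio.h>

/* Indices mirror enum psi_task_count after this patch. */
enum { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, NR_ONCPU, NR_MEMSTALL_RUNNING,
       NR_PSI_TASK_COUNTS };

/* Old predicate: any running task hides the stall. */
static int mem_full_old(const unsigned int *tasks)
{
	return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
}

/* New predicate: FULL as long as every running task is a reclaimer. */
static int mem_full_new(const unsigned int *tasks)
{
	return tasks[NR_MEMSTALL] &&
	       tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING];
}

int main(void)
{
	/* The commit's scenario: one task, reclaiming while on the runqueue. */
	unsigned int tasks[NR_PSI_TASK_COUNTS] = {0};

	tasks[NR_MEMSTALL] = 1;
	tasks[NR_RUNNING] = 1;
	tasks[NR_MEMSTALL_RUNNING] = 1;

	printf("old FULL: %d, new FULL: %d\n",
	       mem_full_old(tasks), mem_full_new(tasks));	/* 0 vs 1 */
	return 0;
}

Under the old predicate the lone reclaimer masks the stall; under the
new one it counts as FULL, matching the expectation that memory FULL
tracks memory SOME in this scenario.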
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -22,7 +22,17 @@ enum psi_task_count {
 	 * don't have to special case any state tracking for it.
 	 */
 	NR_ONCPU,
-	NR_PSI_TASK_COUNTS = 4,
+	/*
+	 * For IO and CPU stalls the presence of running/oncpu tasks
+	 * in the domain means a partial rather than a full stall.
+	 * For memory it's not so simple because of page reclaimers:
+	 * they are running/oncpu while representing a stall. To tell
+	 * whether a domain has productivity left or not, we need to
+	 * distinguish between regular running (i.e. productive)
+	 * threads and memstall ones.
+	 */
+	NR_MEMSTALL_RUNNING,
+	NR_PSI_TASK_COUNTS = 5,
 };
 
 /* Task state bitmasks */
@@ -30,6 +40,7 @@ enum psi_task_count {
 #define TSK_MEMSTALL	(1 << NR_MEMSTALL)
 #define TSK_RUNNING	(1 << NR_RUNNING)
 #define TSK_ONCPU	(1 << NR_ONCPU)
+#define TSK_MEMSTALL_RUNNING	(1 << NR_MEMSTALL_RUNNING)
 
 /* Resources that workloads could be stalled on */
 enum psi_res {
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -35,13 +35,19 @@
  * delayed on that resource such that nobody is advancing and the CPU
  * goes idle. This leaves both workload and CPU unproductive.
  *
- * Naturally, the FULL state doesn't exist for the CPU resource at the
- * system level, but exist at the cgroup level, means all non-idle tasks
- * in a cgroup are delayed on the CPU resource which used by others outside
- * of the cgroup or throttled by the cgroup cpu.max configuration.
- *
  *	SOME = nr_delayed_tasks != 0
- *	FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
+ *	FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
+ *
+ * What it means for a task to be productive is defined differently
+ * for each resource. For IO, productive means a running task. For
+ * memory, productive means a running task that isn't a reclaimer. For
+ * CPU, productive means an oncpu task.
+ *
+ * Naturally, the FULL state doesn't exist for the CPU resource at the
+ * system level, but exist at the cgroup level. At the cgroup level,
+ * FULL means all non-idle tasks in the cgroup are delayed on the CPU
+ * resource which is being used by others outside of the cgroup or
+ * throttled by the cgroup cpu.max configuration.
  *
  * The percentage of wallclock time spent in those compound stall
  * states gives pressure numbers between 0 and 100 for each resource,
@@ -82,13 +88,13 @@
  *
  *	threads = min(nr_nonidle_tasks, nr_cpus)
  *	   SOME = min(nr_delayed_tasks / threads, 1)
- *	   FULL = (threads - min(nr_running_tasks, threads)) / threads
+ *	   FULL = (threads - min(nr_productive_tasks, threads)) / threads
  *
  * For the 257 number crunchers on 256 CPUs, this yields:
  *
  *	threads = min(257, 256)
  *	   SOME = min(1 / 256, 1) = 0.4%
- *	   FULL = (256 - min(257, 256)) / 256 = 0%
+ *	   FULL = (256 - min(256, 256)) / 256 = 0%
  *
  * For the 1 out of 4 memory-delayed tasks, this yields:
  *
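The corrected example above is plain arithmetic, so it can be replayed
outside the kernel. This sketch only re-evaluates the documentation's
formulas with the comment's numbers (257 crunchers, 256 CPUs, one task
delayed at any moment, 256 oncpu and therefore productive for the CPU
resource); nothing here is measured:

#include <stdio.h>

int main(void)
{
	/* The comment's example: 257 number crunchers on 256 CPUs. At any
	 * moment one task waits for a CPU (delayed) and 256 are oncpu,
	 * i.e. productive for the CPU resource. */
	double nr_cpus = 256.0, nr_nonidle = 257.0;
	double nr_delayed = 1.0, nr_productive = 256.0;

	double threads = nr_nonidle < nr_cpus ? nr_nonidle : nr_cpus;
	double some = nr_delayed / threads;
	double prod = nr_productive < threads ? nr_productive : threads;
	double full = (threads - prod) / threads;

	if (some > 1.0)
		some = 1.0;

	printf("SOME = %.1f%%, FULL = %.1f%%\n", some * 100.0, full * 100.0);
	/* Prints: SOME = 0.4%, FULL = 0.0%, matching the comment. */
	return 0;
}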
@@ -113,7 +119,7 @@
  * For each runqueue, we track:
  *
  *	tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
- *	tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
+ *	tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
  *	tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
  *
  * and then periodically aggregate:
@@ -234,7 +240,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
 	case PSI_MEM_SOME:
 		return unlikely(tasks[NR_MEMSTALL]);
 	case PSI_MEM_FULL:
-		return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
+		return unlikely(tasks[NR_MEMSTALL] &&
+			tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
 	case PSI_CPU_SOME:
 		return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
 	case PSI_CPU_FULL:
@@ -711,10 +718,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
 		if (groupc->tasks[t]) {
 			groupc->tasks[t]--;
 		} else if (!psi_bug) {
-			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
+			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
 					cpu, t, groupc->tasks[0],
 					groupc->tasks[1], groupc->tasks[2],
-					groupc->tasks[3], clear, set);
+					groupc->tasks[3], groupc->tasks[4],
+					clear, set);
 			psi_bug = 1;
 		}
 	}
@@ -854,12 +862,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		int clear = TSK_ONCPU, set = 0;
 
 		/*
-		 * When we're going to sleep, psi_dequeue() lets us handle
-		 * TSK_RUNNING and TSK_IOWAIT here, where we can combine it
-		 * with TSK_ONCPU and save walking common ancestors twice.
+		 * When we're going to sleep, psi_dequeue() lets us
+		 * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
+		 * TSK_IOWAIT here, where we can combine it with
+		 * TSK_ONCPU and save walking common ancestors twice.
 		 */
 		if (sleep) {
 			clear |= TSK_RUNNING;
+			if (prev->in_memstall)
+				clear |= TSK_MEMSTALL_RUNNING;
 			if (prev->in_iowait)
 				set |= TSK_IOWAIT;
 		}
@@ -908,7 +919,7 @@ void psi_memstall_enter(unsigned long *flags)
 	rq = this_rq_lock_irq(&rf);
 
 	current->in_memstall = 1;
-	psi_task_change(current, 0, TSK_MEMSTALL);
+	psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
 
 	rq_unlock_irq(rq, &rf);
 }
@@ -937,7 +948,7 @@ void psi_memstall_leave(unsigned long *flags)
 	rq = this_rq_lock_irq(&rf);
 
 	current->in_memstall = 0;
-	psi_task_change(current, TSK_MEMSTALL, 0);
+	psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);
 
 	rq_unlock_irq(rq, &rf);
 }
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -118,6 +118,9 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
 	if (static_branch_likely(&psi_disabled))
 		return;
 
+	if (p->in_memstall)
+		set |= TSK_MEMSTALL_RUNNING;
+
 	if (!wakeup || p->sched_psi_wake_requeue) {
 		if (p->in_memstall)
 			set |= TSK_MEMSTALL;
@@ -148,7 +151,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
 		return;
 
 	if (p->in_memstall)
-		clear |= TSK_MEMSTALL;
+		clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
 
 	psi_task_change(p, clear, 0);
 }
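Taken together, psi_memstall_enter()/psi_memstall_leave(), the
psi_task_switch() sleep path, and psi_enqueue()/psi_dequeue() keep
TSK_MEMSTALL_RUNNING in step with a task's runnable state. The
following userspace sketch traces one task's state bits through a
reclaim-sleep-wake cycle; change() is a hypothetical stand-in for
psi_task_change() that flips bits on a single task instead of adjusting
per-cpu group counters:

#include <stdio.h>

/* Task state bitmasks as defined after this patch (TSK_ONCPU and
 * TSK_IOWAIT omitted for brevity). */
#define TSK_MEMSTALL		(1 << 1)
#define TSK_RUNNING		(1 << 2)
#define TSK_MEMSTALL_RUNNING	(1 << 4)

static unsigned int state;

/* Hypothetical stand-in for psi_task_change(). */
static void change(unsigned int clear, unsigned int set, const char *what)
{
	state = (state & ~clear) | set;
	printf("%-16s memstall=%d running=%d memstall_running=%d\n", what,
	       !!(state & TSK_MEMSTALL), !!(state & TSK_RUNNING),
	       !!(state & TSK_MEMSTALL_RUNNING));
}

int main(void)
{
	state = TSK_RUNNING;	/* task starts out runnable */

	change(0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, "memstall_enter");
	/* psi_task_switch() sleep path: stall persists, task not running */
	change(TSK_RUNNING | TSK_MEMSTALL_RUNNING, 0, "sleep");
	/* psi_enqueue() on wakeup: still in_memstall, so both bits return */
	change(0, TSK_RUNNING | TSK_MEMSTALL_RUNNING, "wakeup");
	change(TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0, "memstall_leave");
	return 0;
}

The trace shows TSK_MEMSTALL persisting across the sleep (so memory
SOME keeps accruing) while TSK_MEMSTALL_RUNNING comes and goes with
runnability, which is exactly what the new PSI_MEM_FULL test keys on.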