psi: Fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim
We've noticed cases where tasks in a cgroup are stalled on memory but there is little memory FULL pressure since tasks stay on the runqueue in reclaim. A simple example involves a single threaded program that keeps leaking and touching large amounts of memory. It runs in a cgroup with swap enabled, memory.high set at 10M and cpu.max ratio set at 5%. Though there is significant CPU pressure and memory SOME, there is barely any memory FULL since the task enters reclaim and stays on the runqueue. However, this memory-bound task is effectively stalled on memory and we expect memory FULL to match memory SOME in this scenario. The code is confused about memstall && running, thinking there is a stalled task and a productive task when there's only one task: a reclaimer that's counted as both. To fix this, we redefine the condition for PSI_MEM_FULL to check that all running tasks are in an active memstall instead of checking that there are no running tasks. case PSI_MEM_FULL: - return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]); + return unlikely(tasks[NR_MEMSTALL] && + tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]); This will capture reclaimers. It will also capture tasks that called psi_memstall_enter() and are about to sleep, but this should be negligible noise. Signed-off-by: Brian Chen <brianchen118@gmail.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Link: https://lore.kernel.org/r/20211110213312.310243-1-brianchen118@gmail.com
This commit is contained in:
parent
4feee7d126
commit
cb0e52b774
|
@ -22,7 +22,17 @@ enum psi_task_count {
|
|||
* don't have to special case any state tracking for it.
|
||||
*/
|
||||
NR_ONCPU,
|
||||
NR_PSI_TASK_COUNTS = 4,
|
||||
/*
|
||||
* For IO and CPU stalls the presence of running/oncpu tasks
|
||||
* in the domain means a partial rather than a full stall.
|
||||
* For memory it's not so simple because of page reclaimers:
|
||||
* they are running/oncpu while representing a stall. To tell
|
||||
* whether a domain has productivity left or not, we need to
|
||||
* distinguish between regular running (i.e. productive)
|
||||
* threads and memstall ones.
|
||||
*/
|
||||
NR_MEMSTALL_RUNNING,
|
||||
NR_PSI_TASK_COUNTS = 5,
|
||||
};
|
||||
|
||||
/* Task state bitmasks */
|
||||
|
@ -30,6 +40,7 @@ enum psi_task_count {
|
|||
#define TSK_MEMSTALL (1 << NR_MEMSTALL)
|
||||
#define TSK_RUNNING (1 << NR_RUNNING)
|
||||
#define TSK_ONCPU (1 << NR_ONCPU)
|
||||
#define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING)
|
||||
|
||||
/* Resources that workloads could be stalled on */
|
||||
enum psi_res {
|
||||
|
|
|
@ -35,13 +35,19 @@
|
|||
* delayed on that resource such that nobody is advancing and the CPU
|
||||
* goes idle. This leaves both workload and CPU unproductive.
|
||||
*
|
||||
* Naturally, the FULL state doesn't exist for the CPU resource at the
|
||||
* system level, but exist at the cgroup level, means all non-idle tasks
|
||||
* in a cgroup are delayed on the CPU resource which used by others outside
|
||||
* of the cgroup or throttled by the cgroup cpu.max configuration.
|
||||
*
|
||||
* SOME = nr_delayed_tasks != 0
|
||||
* FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
|
||||
* FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
|
||||
*
|
||||
* What it means for a task to be productive is defined differently
|
||||
* for each resource. For IO, productive means a running task. For
|
||||
* memory, productive means a running task that isn't a reclaimer. For
|
||||
* CPU, productive means an oncpu task.
|
||||
*
|
||||
* Naturally, the FULL state doesn't exist for the CPU resource at the
|
||||
* system level, but exist at the cgroup level. At the cgroup level,
|
||||
* FULL means all non-idle tasks in the cgroup are delayed on the CPU
|
||||
* resource which is being used by others outside of the cgroup or
|
||||
* throttled by the cgroup cpu.max configuration.
|
||||
*
|
||||
* The percentage of wallclock time spent in those compound stall
|
||||
* states gives pressure numbers between 0 and 100 for each resource,
|
||||
|
@ -82,13 +88,13 @@
|
|||
*
|
||||
* threads = min(nr_nonidle_tasks, nr_cpus)
|
||||
* SOME = min(nr_delayed_tasks / threads, 1)
|
||||
* FULL = (threads - min(nr_running_tasks, threads)) / threads
|
||||
* FULL = (threads - min(nr_productive_tasks, threads)) / threads
|
||||
*
|
||||
* For the 257 number crunchers on 256 CPUs, this yields:
|
||||
*
|
||||
* threads = min(257, 256)
|
||||
* SOME = min(1 / 256, 1) = 0.4%
|
||||
* FULL = (256 - min(257, 256)) / 256 = 0%
|
||||
* FULL = (256 - min(256, 256)) / 256 = 0%
|
||||
*
|
||||
* For the 1 out of 4 memory-delayed tasks, this yields:
|
||||
*
|
||||
|
@ -113,7 +119,7 @@
|
|||
* For each runqueue, we track:
|
||||
*
|
||||
* tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
|
||||
* tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
|
||||
* tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
|
||||
* tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
|
||||
*
|
||||
* and then periodically aggregate:
|
||||
|
@ -234,7 +240,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
|
|||
case PSI_MEM_SOME:
|
||||
return unlikely(tasks[NR_MEMSTALL]);
|
||||
case PSI_MEM_FULL:
|
||||
return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
|
||||
return unlikely(tasks[NR_MEMSTALL] &&
|
||||
tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
|
||||
case PSI_CPU_SOME:
|
||||
return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
|
||||
case PSI_CPU_FULL:
|
||||
|
@ -711,10 +718,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
|
|||
if (groupc->tasks[t]) {
|
||||
groupc->tasks[t]--;
|
||||
} else if (!psi_bug) {
|
||||
printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
|
||||
printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
|
||||
cpu, t, groupc->tasks[0],
|
||||
groupc->tasks[1], groupc->tasks[2],
|
||||
groupc->tasks[3], clear, set);
|
||||
groupc->tasks[3], groupc->tasks[4],
|
||||
clear, set);
|
||||
psi_bug = 1;
|
||||
}
|
||||
}
|
||||
|
@ -854,12 +862,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
|
|||
int clear = TSK_ONCPU, set = 0;
|
||||
|
||||
/*
|
||||
* When we're going to sleep, psi_dequeue() lets us handle
|
||||
* TSK_RUNNING and TSK_IOWAIT here, where we can combine it
|
||||
* with TSK_ONCPU and save walking common ancestors twice.
|
||||
* When we're going to sleep, psi_dequeue() lets us
|
||||
* handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
|
||||
* TSK_IOWAIT here, where we can combine it with
|
||||
* TSK_ONCPU and save walking common ancestors twice.
|
||||
*/
|
||||
if (sleep) {
|
||||
clear |= TSK_RUNNING;
|
||||
if (prev->in_memstall)
|
||||
clear |= TSK_MEMSTALL_RUNNING;
|
||||
if (prev->in_iowait)
|
||||
set |= TSK_IOWAIT;
|
||||
}
|
||||
|
@ -908,7 +919,7 @@ void psi_memstall_enter(unsigned long *flags)
|
|||
rq = this_rq_lock_irq(&rf);
|
||||
|
||||
current->in_memstall = 1;
|
||||
psi_task_change(current, 0, TSK_MEMSTALL);
|
||||
psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
|
||||
|
||||
rq_unlock_irq(rq, &rf);
|
||||
}
|
||||
|
@ -937,7 +948,7 @@ void psi_memstall_leave(unsigned long *flags)
|
|||
rq = this_rq_lock_irq(&rf);
|
||||
|
||||
current->in_memstall = 0;
|
||||
psi_task_change(current, TSK_MEMSTALL, 0);
|
||||
psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);
|
||||
|
||||
rq_unlock_irq(rq, &rf);
|
||||
}
|
||||
|
|
|
@ -118,6 +118,9 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
|
|||
if (static_branch_likely(&psi_disabled))
|
||||
return;
|
||||
|
||||
if (p->in_memstall)
|
||||
set |= TSK_MEMSTALL_RUNNING;
|
||||
|
||||
if (!wakeup || p->sched_psi_wake_requeue) {
|
||||
if (p->in_memstall)
|
||||
set |= TSK_MEMSTALL;
|
||||
|
@ -148,7 +151,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
|
|||
return;
|
||||
|
||||
if (p->in_memstall)
|
||||
clear |= TSK_MEMSTALL;
|
||||
clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
|
||||
|
||||
psi_task_change(p, clear, 0);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue