Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Thomas Gleixner:
 "This scheduler update provides:

   - The (hopefully) final fix for the vtime accounting issues which
     were around for quite some time

   - Use types known to user space in UAPI headers to unbreak user
     space builds

   - Make load balancing respect the current scheduling domain again
     instead of evaluating unrelated CPUs"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/headers/uapi: Fix linux/sched/types.h userspace compilation errors
  sched/fair: Fix load_balance() affinity redo path
  sched/cputime: Accumulate vtime on top of nsec clocksource
  sched/cputime: Move the vtime task fields to their own struct
  sched/cputime: Rename vtime fields
  sched/cputime: Always set tsk->vtime_snap_whence after accounting vtime
  vtime, sched/cputime: Remove vtime_account_user()
  Revert "sched/cputime: Refactor the cputime_adjust() code"
commit 4fde846ac0
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -170,9 +170,9 @@ extern struct cred init_cred;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 # define INIT_VTIME(tsk)				\
-	.vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount),	\
-	.vtime_snap = 0,				\
-	.vtime_snap_whence = VTIME_SYS,
+	.vtime.seqcount = SEQCNT_ZERO(tsk.vtime.seqcount),	\
+	.vtime.starttime = 0,				\
+	.vtime.state = VTIME_SYS,
 #else
 # define INIT_VTIME(tsk)
 #endif
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -223,6 +223,24 @@ struct task_cputime {
 #define prof_exp			stime
 #define sched_exp			sum_exec_runtime
 
+enum vtime_state {
+	/* Task is sleeping or running in a CPU with VTIME inactive: */
+	VTIME_INACTIVE = 0,
+	/* Task runs in userspace in a CPU with VTIME active: */
+	VTIME_USER,
+	/* Task runs in kernelspace in a CPU with VTIME active: */
+	VTIME_SYS,
+};
+
+struct vtime {
+	seqcount_t		seqcount;
+	unsigned long long	starttime;
+	enum vtime_state	state;
+	u64			utime;
+	u64			stime;
+	u64			gtime;
+};
+
 struct sched_info {
 #ifdef CONFIG_SCHED_INFO
 	/* Cumulative counters: */
@@ -688,16 +706,7 @@ struct task_struct {
 	u64				gtime;
 	struct prev_cputime		prev_cputime;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-	seqcount_t			vtime_seqcount;
-	unsigned long long		vtime_snap;
-	enum {
-		/* Task is sleeping or running in a CPU with VTIME inactive: */
-		VTIME_INACTIVE = 0,
-		/* Task runs in userspace in a CPU with VTIME active: */
-		VTIME_USER,
-		/* Task runs in kernelspace in a CPU with VTIME active: */
-		VTIME_SYS,
-	} vtime_snap_whence;
+	struct vtime			vtime;
 #endif
 
 #ifdef CONFIG_NO_HZ_FULL
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -67,19 +67,12 @@ static inline void vtime_account_system(struct task_struct *tsk) { }
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 extern void arch_vtime_task_switch(struct task_struct *tsk);
-extern void vtime_account_user(struct task_struct *tsk);
 extern void vtime_user_enter(struct task_struct *tsk);
-
-static inline void vtime_user_exit(struct task_struct *tsk)
-{
-	vtime_account_user(tsk);
-}
-
+extern void vtime_user_exit(struct task_struct *tsk);
 extern void vtime_guest_enter(struct task_struct *tsk);
 extern void vtime_guest_exit(struct task_struct *tsk);
 extern void vtime_init_idle(struct task_struct *tsk, int cpu);
 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */
-static inline void vtime_account_user(struct task_struct *tsk) { }
 static inline void vtime_user_enter(struct task_struct *tsk) { }
 static inline void vtime_user_exit(struct task_struct *tsk) { }
 static inline void vtime_guest_enter(struct task_struct *tsk) { }
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -54,21 +54,21 @@ struct sched_param {
  * available in the scheduling class file or in Documentation/.
  */
 struct sched_attr {
-	u32 size;
+	__u32 size;
 
-	u32 sched_policy;
-	u64 sched_flags;
+	__u32 sched_policy;
+	__u64 sched_flags;
 
 	/* SCHED_NORMAL, SCHED_BATCH */
-	s32 sched_nice;
+	__s32 sched_nice;
 
 	/* SCHED_FIFO, SCHED_RR */
-	u32 sched_priority;
+	__u32 sched_priority;
 
 	/* SCHED_DEADLINE */
-	u64 sched_runtime;
-	u64 sched_deadline;
-	u64 sched_period;
+	__u64 sched_runtime;
+	__u64 sched_deadline;
+	__u64 sched_period;
 };
 
 #endif /* _UAPI_LINUX_SCHED_TYPES_H */
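The hunk above is the userspace-compilation fix: plain u32/u64/s32 exist only inside the kernel, while the __u32/__u64/__s32 variants come from <linux/types.h> and are exported to userspace, so the UAPI header now builds outside the kernel tree. Below is a minimal userspace sketch of the kind of program this unbreaks; it assumes a Linux toolchain that installs <linux/sched/types.h> and defines SYS_sched_setattr, the file name and deadline parameters are illustrative only, and SCHED_DEADLINE additionally requires suitable privileges.

    /* dl_demo.c - build with: gcc -o dl_demo dl_demo.c */
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/sched.h>          /* SCHED_DEADLINE */
    #include <linux/sched/types.h>    /* struct sched_attr, now using __u32/__u64 */

    int main(void)
    {
        struct sched_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size           = sizeof(attr);
        attr.sched_policy   = SCHED_DEADLINE;
        attr.sched_runtime  = 10 * 1000 * 1000;  /* 10 ms, in nanoseconds */
        attr.sched_deadline = 30 * 1000 * 1000;  /* 30 ms */
        attr.sched_period   = 30 * 1000 * 1000;  /* 30 ms */

        /* glibc has no sched_setattr() wrapper; invoke the syscall directly. */
        if (syscall(SYS_sched_setattr, 0, &attr, 0) != 0) {
            perror("sched_setattr");
            return 1;
        }
        printf("running under SCHED_DEADLINE\n");
        return 0;
    }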
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1637,9 +1637,9 @@ static __latent_entropy struct task_struct *copy_process(
 	prev_cputime_init(&p->prev_cputime);
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-	seqcount_init(&p->vtime_seqcount);
-	p->vtime_snap = 0;
-	p->vtime_snap_whence = VTIME_INACTIVE;
+	seqcount_init(&p->vtime.seqcount);
+	p->vtime.starttime = 0;
+	p->vtime.state = VTIME_INACTIVE;
 #endif
 
 #if defined(SPLIT_RSS_COUNTING)
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -611,17 +611,23 @@ static void cputime_adjust(struct task_cputime *curr,
 	utime = curr->utime;
 
 	/*
-	 * If either stime or both stime and utime are 0, assume all runtime is
-	 * userspace. Once a task gets some ticks, the monotonicy code at
-	 * 'update' will ensure things converge to the observed ratio.
+	 * If either stime or utime are 0, assume all runtime is userspace.
+	 * Once a task gets some ticks, the monotonicy code at 'update:'
+	 * will ensure things converge to the observed ratio.
 	 */
-	if (stime != 0) {
-		if (utime == 0)
-			stime = rtime;
-		else
-			stime = scale_stime(stime, rtime, stime + utime);
+	if (stime == 0) {
+		utime = rtime;
+		goto update;
 	}
 
+	if (utime == 0) {
+		stime = rtime;
+		goto update;
+	}
+
+	stime = scale_stime(stime, rtime, stime + utime);
+
+update:
 	/*
 	 * Make sure stime doesn't go backwards; this preserves monotonicity
 	 * for utime because rtime is monotonic.
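Context for the revert above: cputime_adjust() splits the precisely measured runtime rtime between user and system time in the same proportion as the tick-sampled utime/stime pair, i.e. scale_stime(stime, rtime, stime + utime) = stime * rtime / (stime + utime). A standalone sketch of that arithmetic with made-up numbers follows; the function name is invented, and __int128 is used to sidestep the u64 overflow handling the kernel version needs (so it assumes a 64-bit GCC/Clang target).

    #include <stdio.h>
    #include <stdint.h>

    /* Proportional split: stime_scaled = rtime * stime / (stime + utime). */
    static uint64_t scale_stime_sketch(uint64_t stime, uint64_t rtime, uint64_t total)
    {
        return (uint64_t)((unsigned __int128)stime * rtime / total);
    }

    int main(void)
    {
        uint64_t stime = 2000000;   /* 2 ms of tick-sampled system time */
        uint64_t utime = 6000000;   /* 6 ms of tick-sampled user time   */
        uint64_t rtime = 10000000;  /* 10 ms of precise runtime         */

        uint64_t s = scale_stime_sketch(stime, rtime, stime + utime);
        /* Prints stime=2500000 ns, utime=7500000 ns: the 1:3 ratio is kept. */
        printf("stime=%llu ns, utime=%llu ns\n",
               (unsigned long long)s, (unsigned long long)(rtime - s));
        return 0;
    }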
@@ -673,20 +679,21 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static u64 vtime_delta(struct task_struct *tsk)
+static u64 vtime_delta(struct vtime *vtime)
 {
-	unsigned long now = READ_ONCE(jiffies);
+	unsigned long long clock;
 
-	if (time_before(now, (unsigned long)tsk->vtime_snap))
+	clock = sched_clock_cpu(smp_processor_id());
+	if (clock < vtime->starttime)
 		return 0;
 
-	return jiffies_to_nsecs(now - tsk->vtime_snap);
+	return clock - vtime->starttime;
 }
 
-static u64 get_vtime_delta(struct task_struct *tsk)
+static u64 get_vtime_delta(struct vtime *vtime)
 {
-	unsigned long now = READ_ONCE(jiffies);
-	u64 delta, other;
+	u64 delta = vtime_delta(vtime);
+	u64 other;
 
 	/*
 	 * Unlike tick based timing, vtime based timing never has lost
@@ -695,104 +702,138 @@ static u64 get_vtime_delta(struct task_struct *tsk)
 	 * elapsed time. Limit account_other_time to prevent rounding
 	 * errors from causing elapsed vtime to go negative.
 	 */
-	delta = jiffies_to_nsecs(now - tsk->vtime_snap);
 	other = account_other_time(delta);
-	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
-	tsk->vtime_snap = now;
+	WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
+	vtime->starttime += delta;
 
 	return delta - other;
 }
 
-static void __vtime_account_system(struct task_struct *tsk)
+static void __vtime_account_system(struct task_struct *tsk,
+				   struct vtime *vtime)
 {
-	account_system_time(tsk, irq_count(), get_vtime_delta(tsk));
+	vtime->stime += get_vtime_delta(vtime);
+	if (vtime->stime >= TICK_NSEC) {
+		account_system_time(tsk, irq_count(), vtime->stime);
+		vtime->stime = 0;
+	}
+}
+
+static void vtime_account_guest(struct task_struct *tsk,
+				struct vtime *vtime)
+{
+	vtime->gtime += get_vtime_delta(vtime);
+	if (vtime->gtime >= TICK_NSEC) {
+		account_guest_time(tsk, vtime->gtime);
+		vtime->gtime = 0;
+	}
 }
 
 void vtime_account_system(struct task_struct *tsk)
 {
-	if (!vtime_delta(tsk))
+	struct vtime *vtime = &tsk->vtime;
+
+	if (!vtime_delta(vtime))
 		return;
 
-	write_seqcount_begin(&tsk->vtime_seqcount);
-	__vtime_account_system(tsk);
-	write_seqcount_end(&tsk->vtime_seqcount);
-}
-
-void vtime_account_user(struct task_struct *tsk)
-{
-	write_seqcount_begin(&tsk->vtime_seqcount);
-	tsk->vtime_snap_whence = VTIME_SYS;
-	if (vtime_delta(tsk))
-		account_user_time(tsk, get_vtime_delta(tsk));
-	write_seqcount_end(&tsk->vtime_seqcount);
+	write_seqcount_begin(&vtime->seqcount);
+	/* We might have scheduled out from guest path */
+	if (current->flags & PF_VCPU)
+		vtime_account_guest(tsk, vtime);
+	else
+		__vtime_account_system(tsk, vtime);
+	write_seqcount_end(&vtime->seqcount);
 }
 
 void vtime_user_enter(struct task_struct *tsk)
 {
-	write_seqcount_begin(&tsk->vtime_seqcount);
-	if (vtime_delta(tsk))
-		__vtime_account_system(tsk);
-	tsk->vtime_snap_whence = VTIME_USER;
-	write_seqcount_end(&tsk->vtime_seqcount);
+	struct vtime *vtime = &tsk->vtime;
+
+	write_seqcount_begin(&vtime->seqcount);
+	__vtime_account_system(tsk, vtime);
+	vtime->state = VTIME_USER;
+	write_seqcount_end(&vtime->seqcount);
+}
+
+void vtime_user_exit(struct task_struct *tsk)
+{
+	struct vtime *vtime = &tsk->vtime;
+
+	write_seqcount_begin(&vtime->seqcount);
+	vtime->utime += get_vtime_delta(vtime);
+	if (vtime->utime >= TICK_NSEC) {
+		account_user_time(tsk, vtime->utime);
+		vtime->utime = 0;
+	}
+	vtime->state = VTIME_SYS;
+	write_seqcount_end(&vtime->seqcount);
 }
 
 void vtime_guest_enter(struct task_struct *tsk)
 {
+	struct vtime *vtime = &tsk->vtime;
 	/*
 	 * The flags must be updated under the lock with
-	 * the vtime_snap flush and update.
+	 * the vtime_starttime flush and update.
 	 * That enforces a right ordering and update sequence
 	 * synchronization against the reader (task_gtime())
 	 * that can thus safely catch up with a tickless delta.
 	 */
-	write_seqcount_begin(&tsk->vtime_seqcount);
-	if (vtime_delta(tsk))
-		__vtime_account_system(tsk);
+	write_seqcount_begin(&vtime->seqcount);
+	__vtime_account_system(tsk, vtime);
 	current->flags |= PF_VCPU;
-	write_seqcount_end(&tsk->vtime_seqcount);
+	write_seqcount_end(&vtime->seqcount);
 }
 EXPORT_SYMBOL_GPL(vtime_guest_enter);
 
 void vtime_guest_exit(struct task_struct *tsk)
 {
-	write_seqcount_begin(&tsk->vtime_seqcount);
-	__vtime_account_system(tsk);
+	struct vtime *vtime = &tsk->vtime;
+
+	write_seqcount_begin(&vtime->seqcount);
+	vtime_account_guest(tsk, vtime);
 	current->flags &= ~PF_VCPU;
-	write_seqcount_end(&tsk->vtime_seqcount);
+	write_seqcount_end(&vtime->seqcount);
 }
 EXPORT_SYMBOL_GPL(vtime_guest_exit);
 
 void vtime_account_idle(struct task_struct *tsk)
 {
-	account_idle_time(get_vtime_delta(tsk));
+	account_idle_time(get_vtime_delta(&tsk->vtime));
 }
 
 void arch_vtime_task_switch(struct task_struct *prev)
 {
-	write_seqcount_begin(&prev->vtime_seqcount);
-	prev->vtime_snap_whence = VTIME_INACTIVE;
-	write_seqcount_end(&prev->vtime_seqcount);
+	struct vtime *vtime = &prev->vtime;
 
-	write_seqcount_begin(&current->vtime_seqcount);
-	current->vtime_snap_whence = VTIME_SYS;
-	current->vtime_snap = jiffies;
-	write_seqcount_end(&current->vtime_seqcount);
+	write_seqcount_begin(&vtime->seqcount);
+	vtime->state = VTIME_INACTIVE;
+	write_seqcount_end(&vtime->seqcount);
+
+	vtime = &current->vtime;
+
+	write_seqcount_begin(&vtime->seqcount);
+	vtime->state = VTIME_SYS;
+	vtime->starttime = sched_clock_cpu(smp_processor_id());
+	write_seqcount_end(&vtime->seqcount);
 }
 
 void vtime_init_idle(struct task_struct *t, int cpu)
 {
+	struct vtime *vtime = &t->vtime;
 	unsigned long flags;
 
 	local_irq_save(flags);
-	write_seqcount_begin(&t->vtime_seqcount);
-	t->vtime_snap_whence = VTIME_SYS;
-	t->vtime_snap = jiffies;
-	write_seqcount_end(&t->vtime_seqcount);
+	write_seqcount_begin(&vtime->seqcount);
+	vtime->state = VTIME_SYS;
+	vtime->starttime = sched_clock_cpu(cpu);
+	write_seqcount_end(&vtime->seqcount);
 	local_irq_restore(flags);
 }
 
 u64 task_gtime(struct task_struct *t)
 {
+	struct vtime *vtime = &t->vtime;
 	unsigned int seq;
 	u64 gtime;
 
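The hunk above is the heart of "sched/cputime: Accumulate vtime on top of nsec clocksource": every transition takes a nanosecond delta from sched_clock_cpu(), adds it to the per-task vtime->utime/stime/gtime accumulators, and only forwards a chunk to account_user_time()/account_system_time()/account_guest_time() once at least TICK_NSEC has built up, so the generic cputime code keeps seeing tick-sized amounts while nothing is lost to jiffy granularity. A simplified userspace model of that accumulate-and-flush idea follows; all names and values here are invented for illustration and are not kernel API.

    #include <stdio.h>
    #include <stdint.h>

    #define TICK_NSEC_SKETCH 4000000ULL  /* 4 ms tick, e.g. HZ=250 */

    struct vtime_sketch {
        uint64_t starttime;  /* last snapshot of the nsec clock */
        uint64_t stime;      /* accumulated, not yet flushed    */
    };

    /* Flush target: stands in for account_system_time(). */
    static void flush_system_time(uint64_t nsec)
    {
        printf("accounting %llu ns of system time\n", (unsigned long long)nsec);
    }

    /* Called on every context transition with the current nsec clock. */
    static void account_system_sketch(struct vtime_sketch *vt, uint64_t now)
    {
        uint64_t delta = (now > vt->starttime) ? now - vt->starttime : 0;

        vt->starttime = now;    /* advance the snapshot      */
        vt->stime += delta;     /* accumulate the nsec delta */
        if (vt->stime >= TICK_NSEC_SKETCH) {   /* flush in tick-sized chunks */
            flush_system_time(vt->stime);
            vt->stime = 0;
        }
    }

    int main(void)
    {
        struct vtime_sketch vt = { .starttime = 0, .stime = 0 };
        /* Three short kernel stints of 1.5 ms each: only the third one,
         * which pushes the accumulator past TICK_NSEC, triggers a flush. */
        account_system_sketch(&vt, 1500000);
        account_system_sketch(&vt, 3000000);
        account_system_sketch(&vt, 4500000);
        return 0;
    }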
@@ -800,13 +841,13 @@ u64 task_gtime(struct task_struct *t)
 		return t->gtime;
 
 	do {
-		seq = read_seqcount_begin(&t->vtime_seqcount);
+		seq = read_seqcount_begin(&vtime->seqcount);
 
 		gtime = t->gtime;
-		if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
-			gtime += vtime_delta(t);
+		if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
+			gtime += vtime->gtime + vtime_delta(vtime);
 
-	} while (read_seqcount_retry(&t->vtime_seqcount, seq));
+	} while (read_seqcount_retry(&vtime->seqcount, seq));
 
 	return gtime;
 }
@@ -818,8 +859,9 @@ u64 task_gtime(struct task_struct *t)
  */
 void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 {
-	u64 delta;
+	struct vtime *vtime = &t->vtime;
 	unsigned int seq;
+	u64 delta;
 
 	if (!vtime_accounting_enabled()) {
 		*utime = t->utime;
@@ -828,25 +870,25 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 	}
 
 	do {
-		seq = read_seqcount_begin(&t->vtime_seqcount);
+		seq = read_seqcount_begin(&vtime->seqcount);
 
 		*utime = t->utime;
 		*stime = t->stime;
 
 		/* Task is sleeping, nothing to add */
-		if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t))
+		if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
 			continue;
 
-		delta = vtime_delta(t);
+		delta = vtime_delta(vtime);
 
 		/*
 		 * Task runs either in user or kernel space, add pending nohz time to
 		 * the right place.
 		 */
-		if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU)
-			*utime += delta;
-		else if (t->vtime_snap_whence == VTIME_SYS)
-			*stime += delta;
-	} while (read_seqcount_retry(&t->vtime_seqcount, seq));
+		if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
+			*utime += vtime->utime + delta;
+		else if (vtime->state == VTIME_SYS)
+			*stime += vtime->stime + delta;
+	} while (read_seqcount_retry(&vtime->seqcount, seq));
 }
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
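task_gtime() and task_cputime() above read the vtime fields inside a read_seqcount_begin()/read_seqcount_retry() loop, so a reader either sees a consistent snapshot (including the not-yet-flushed accumulators and the in-flight delta) or retries if the writer side ran concurrently. A rough userspace model of that retry pattern with C11 atomics follows; it is a deliberate simplification of the kernel's seqcount_t, not its implementation, and all names are invented.

    #include <stdatomic.h>
    #include <stdint.h>

    /* Rough model of the seqcount pattern used by task_cputime(): the writer
     * makes the counter odd while it updates, the reader retries until it sees
     * the same even value before and after its reads. */
    struct times_sketch {
        atomic_uint seq;
        _Atomic uint64_t utime;
        _Atomic uint64_t stime;
    };

    static void writer_update(struct times_sketch *t, uint64_t u, uint64_t s)
    {
        atomic_fetch_add(&t->seq, 1);   /* begin: seq becomes odd */
        atomic_store(&t->utime, u);
        atomic_store(&t->stime, s);
        atomic_fetch_add(&t->seq, 1);   /* end: seq even again    */
    }

    static void reader_snapshot(struct times_sketch *t, uint64_t *u, uint64_t *s)
    {
        unsigned int begin;

        do {
            begin = atomic_load(&t->seq);        /* remember the counter      */
            *u = atomic_load(&t->utime);
            *s = atomic_load(&t->stime);
        } while ((begin & 1) ||                  /* writer was mid-update     */
                 begin != atomic_load(&t->seq)); /* or finished one meanwhile */
    }

    int main(void)
    {
        struct times_sketch t = { 0 };
        uint64_t u, s;

        writer_update(&t, 1000, 2000);
        reader_snapshot(&t, &u, &s);
        return (u == 1000 && s == 2000) ? 0 : 1;
    }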
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6646,10 +6646,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 		 * our sched_group. We may want to revisit it if we couldn't
 		 * meet load balance goals by pulling other tasks on src_cpu.
 		 *
-		 * Also avoid computing new_dst_cpu if we have already computed
-		 * one in current iteration.
+		 * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
+		 * already computed one in current iteration.
 		 */
-		if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
+		if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
 			return 0;
 
 		/* Prevent to re-select dst_cpu via env's cpus */
@@ -8022,14 +8022,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.tasks		= LIST_HEAD_INIT(env.tasks),
 	};
 
-	/*
-	 * For NEWLY_IDLE load_balancing, we don't need to consider
-	 * other cpus in our group
-	 */
-	if (idle == CPU_NEWLY_IDLE)
-		env.dst_grpmask = NULL;
-
-	cpumask_copy(cpus, cpu_active_mask);
+	cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
 
 	schedstat_inc(sd->lb_count[idle]);
 
@@ -8151,7 +8144,15 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		/* All tasks on this runqueue were pinned by CPU affinity */
 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
 			cpumask_clear_cpu(cpu_of(busiest), cpus);
-			if (!cpumask_empty(cpus)) {
+			/*
+			 * Attempting to continue load balancing at the current
+			 * sched_domain level only makes sense if there are
+			 * active CPUs remaining as possible busiest CPUs to
+			 * pull load from which are not contained within the
+			 * destination group that is receiving any migrated
+			 * load.
+			 */
+			if (!cpumask_subset(cpus, env.dst_grpmask)) {
 				env.loop = 0;
 				env.loop_break = sched_nr_migrate_break;
 				goto redo;
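The redo logic above now (a) seeds the candidate mask with only the CPUs of the current sched_domain and (b) keeps retrying only while some candidate busiest CPU lies outside the destination group, since pulling from a CPU that is itself in the destination group cannot help. A tiny bitmask model of that subset test follows; plain uint64_t masks stand in for struct cpumask, and the function and values are purely illustrative.

    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>

    /* One bit per CPU; bit i set means CPU i is in the mask. */
    static bool should_redo(uint64_t candidates, uint64_t dst_group, int pinned_cpu)
    {
        candidates &= ~(1ULL << pinned_cpu);  /* drop the all-pinned busiest CPU */
        /* Continue only if some remaining candidate is NOT in the destination
         * group, i.e. the candidates are not a subset of dst_group. */
        return candidates != 0 && (candidates & ~dst_group) != 0;
    }

    int main(void)
    {
        uint64_t domain_span = 0x0F;  /* CPUs 0-3 form this sched_domain  */
        uint64_t dst_group   = 0x03;  /* CPUs 0-1: the group pulling load */

        /* CPU 2's tasks were all pinned: CPU 3 is still a valid source. */
        printf("%d\n", should_redo(domain_span, dst_group, 2));                /* 1 */
        /* If CPU 3 is gone too, only dst_group CPUs remain: stop.       */
        printf("%d\n", should_redo(domain_span & ~(1ULL << 3), dst_group, 2)); /* 0 */
        return 0;
    }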
@@ -8447,6 +8448,13 @@ static int active_load_balance_cpu_stop(void *data)
 		.src_cpu	= busiest_rq->cpu,
 		.src_rq		= busiest_rq,
 		.idle		= CPU_IDLE,
+		/*
+		 * can_migrate_task() doesn't need to compute new_dst_cpu
+		 * for active balancing. Since we have CPU_IDLE, but no
+		 * @dst_grpmask we need to make that test go away with lying
+		 * about DST_PINNED.
+		 */
+		.flags		= LBF_DST_PINNED,
 	};
 
 	schedstat_inc(sd->alb_count);