diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 7cd76f93a438..f7ea8e21656b 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -515,7 +515,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, dvcpu->arch.wait = 0; if (swq_has_sleeper(&dvcpu->wq)) - swake_up(&dvcpu->wq); + swake_up_one(&dvcpu->wq); return 0; } @@ -1204,7 +1204,7 @@ static void kvm_mips_comparecount_func(unsigned long data) vcpu->arch.wait = 0; if (swq_has_sleeper(&vcpu->wq)) - swake_up(&vcpu->wq); + swake_up_one(&vcpu->wq); } /* low level hrtimer wake routine */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index de686b340f4a..ee4a8854985e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -216,7 +216,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) wqp = kvm_arch_vcpu_wq(vcpu); if (swq_has_sleeper(wqp)) { - swake_up(wqp); + swake_up_one(wqp); ++vcpu->stat.halt_wakeup; } @@ -3188,7 +3188,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) } } - prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); + prepare_to_swait_exclusive(&vc->wq, &wait, TASK_INTERRUPTIBLE); if (kvmppc_vcore_check_block(vc)) { finish_swait(&vc->wq, &wait); @@ -3311,7 +3311,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvmppc_start_thread(vcpu, vc); trace_kvm_guest_enter(vcpu); } else if (vc->vcore_state == VCORE_SLEEPING) { - swake_up(&vc->wq); + swake_up_one(&vc->wq); } } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index daa09f89ca2d..fcb55b02990e 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -1145,7 +1145,7 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) * yield-candidate. */ vcpu->preempted = true; - swake_up(&vcpu->wq); + swake_up_one(&vcpu->wq); vcpu->stat.halt_wakeup++; } /* diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5b2300b818af..a37bda38d205 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -154,7 +154,7 @@ void kvm_async_pf_task_wait(u32 token, int interrupt_kernel) for (;;) { if (!n.halted) - prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); if (hlist_unhashed(&n.link)) break; @@ -188,7 +188,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n) if (n->halted) smp_send_reschedule(n->cpu); else if (swq_has_sleeper(&n->wq)) - swake_up(&n->wq); + swake_up_one(&n->wq); } static void apf_task_wake_all(void) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index b5cd8465d44f..d536d457517b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1379,7 +1379,7 @@ static void apic_timer_expired(struct kvm_lapic *apic) * using swait_active() is safe. */ if (swait_active(q)) - swake_up(q); + swake_up_one(q); if (apic_lvtt_tscdeadline(apic)) ktimer->expired_tscdeadline = ktimer->tscdeadline; diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 8796ba387152..4cf06a64bc02 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -164,6 +164,7 @@ enum cpuhp_state { CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE, + CPUHP_AP_WATCHDOG_ONLINE, CPUHP_AP_WORKQUEUE_ONLINE, CPUHP_AP_RCUTREE_ONLINE, CPUHP_AP_ONLINE_DYN, diff --git a/include/linux/nmi.h b/include/linux/nmi.h index b8d868d23e79..08f9247e9827 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -45,12 +45,18 @@ extern void touch_softlockup_watchdog(void); extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); extern unsigned int softlockup_panic; -#else + +extern int lockup_detector_online_cpu(unsigned int cpu); +extern int lockup_detector_offline_cpu(unsigned int cpu); +#else /* CONFIG_SOFTLOCKUP_DETECTOR */ static inline void touch_softlockup_watchdog_sched(void) { } static inline void touch_softlockup_watchdog(void) { } static inline void touch_softlockup_watchdog_sync(void) { } static inline void touch_all_softlockup_watchdogs(void) { } -#endif + +#define lockup_detector_online_cpu NULL +#define lockup_detector_offline_cpu NULL +#endif /* CONFIG_SOFTLOCKUP_DETECTOR */ #ifdef CONFIG_DETECT_HUNG_TASK void reset_hung_task_detector(void); diff --git a/include/linux/sched.h b/include/linux/sched.h index 43731fe51c97..e0f4f56c9310 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1017,7 +1017,6 @@ struct task_struct { u64 last_sum_exec_runtime; struct callback_head numa_work; - struct list_head numa_entry; struct numa_group *numa_group; /* diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 1c1a1512ec55..913488d828cb 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -40,7 +40,6 @@ extern unsigned int sysctl_numa_balancing_scan_size; #ifdef CONFIG_SCHED_DEBUG extern __read_mostly unsigned int sysctl_sched_migration_cost; extern __read_mostly unsigned int sysctl_sched_nr_migrate; -extern __read_mostly unsigned int sysctl_sched_time_avg; int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h index c174844cf663..d0884b525001 100644 --- a/include/linux/smpboot.h +++ b/include/linux/smpboot.h @@ -25,8 +25,6 @@ struct smpboot_thread_data; * parked (cpu offline) * @unpark: Optional unpark function, called when the thread is * unparked (cpu online) - * @cpumask: Internal state. To update which threads are unparked, - * call smpboot_update_cpumask_percpu_thread(). * @selfparking: Thread is not parked by the park function. * @thread_comm: The base name of the thread */ @@ -40,23 +38,12 @@ struct smp_hotplug_thread { void (*cleanup)(unsigned int cpu, bool online); void (*park)(unsigned int cpu); void (*unpark)(unsigned int cpu); - cpumask_var_t cpumask; bool selfparking; const char *thread_comm; }; -int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread, - const struct cpumask *cpumask); - -static inline int -smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) -{ - return smpboot_register_percpu_thread_cpumask(plug_thread, - cpu_possible_mask); -} +int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread); void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); -void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, - const struct cpumask *); #endif diff --git a/include/linux/swait.h b/include/linux/swait.h index bf8cb0dee23c..73e06e9986d4 100644 --- a/include/linux/swait.h +++ b/include/linux/swait.h @@ -16,7 +16,7 @@ * wait-queues, but the semantics are actually completely different, and * every single user we have ever had has been buggy (or pointless). * - * A "swake_up()" only wakes up _one_ waiter, which is not at all what + * A "swake_up_one()" only wakes up _one_ waiter, which is not at all what * "wake_up()" does, and has led to problems. In other cases, it has * been fine, because there's only ever one waiter (kvm), but in that * case gthe whole "simple" wait-queue is just pointless to begin with, @@ -38,8 +38,8 @@ * all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right * sleeper state. * - * - the exclusive mode; because this requires preserving the list order - * and this is hard. + * - the !exclusive mode; because that leads to O(n) wakeups, everything is + * exclusive. * * - custom wake callback functions; because you cannot give any guarantees * about random code. This also allows swait to be used in RT, such that @@ -115,7 +115,7 @@ extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name * CPU0 - waker CPU1 - waiter * * for (;;) { - * @cond = true; prepare_to_swait(&wq_head, &wait, state); + * @cond = true; prepare_to_swait_exclusive(&wq_head, &wait, state); * smp_mb(); // smp_mb() from set_current_state() * if (swait_active(wq_head)) if (@cond) * wake_up(wq_head); break; @@ -157,20 +157,20 @@ static inline bool swq_has_sleeper(struct swait_queue_head *wq) return swait_active(wq); } -extern void swake_up(struct swait_queue_head *q); +extern void swake_up_one(struct swait_queue_head *q); extern void swake_up_all(struct swait_queue_head *q); extern void swake_up_locked(struct swait_queue_head *q); -extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state); +extern void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state); extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state); extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait); extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait); -/* as per ___wait_event() but for swait, therefore "exclusive == 0" */ +/* as per ___wait_event() but for swait, therefore "exclusive == 1" */ #define ___swait_event(wq, condition, state, ret, cmd) \ ({ \ + __label__ __out; \ struct swait_queue __wait; \ long __ret = ret; \ \ @@ -183,20 +183,20 @@ extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait); \ if (___wait_is_interruptible(state) && __int) { \ __ret = __int; \ - break; \ + goto __out; \ } \ \ cmd; \ } \ finish_swait(&wq, &__wait); \ - __ret; \ +__out: __ret; \ }) #define __swait_event(wq, condition) \ (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \ schedule()) -#define swait_event(wq, condition) \ +#define swait_event_exclusive(wq, condition) \ do { \ if (condition) \ break; \ @@ -208,7 +208,7 @@ do { \ TASK_UNINTERRUPTIBLE, timeout, \ __ret = schedule_timeout(__ret)) -#define swait_event_timeout(wq, condition, timeout) \ +#define swait_event_timeout_exclusive(wq, condition, timeout) \ ({ \ long __ret = timeout; \ if (!___wait_cond_timeout(condition)) \ @@ -220,7 +220,7 @@ do { \ ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0, \ schedule()) -#define swait_event_interruptible(wq, condition) \ +#define swait_event_interruptible_exclusive(wq, condition) \ ({ \ int __ret = 0; \ if (!(condition)) \ @@ -233,7 +233,7 @@ do { \ TASK_INTERRUPTIBLE, timeout, \ __ret = schedule_timeout(__ret)) -#define swait_event_interruptible_timeout(wq, condition, timeout) \ +#define swait_event_interruptible_timeout_exclusive(wq, condition, timeout)\ ({ \ long __ret = timeout; \ if (!___wait_cond_timeout(condition)) \ @@ -246,7 +246,7 @@ do { \ (void)___swait_event(wq, condition, TASK_IDLE, 0, schedule()) /** - * swait_event_idle - wait without system load contribution + * swait_event_idle_exclusive - wait without system load contribution * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * @@ -257,7 +257,7 @@ do { \ * condition and doesn't want to contribute to system load. Signals are * ignored. */ -#define swait_event_idle(wq, condition) \ +#define swait_event_idle_exclusive(wq, condition) \ do { \ if (condition) \ break; \ @@ -270,7 +270,7 @@ do { \ __ret = schedule_timeout(__ret)) /** - * swait_event_idle_timeout - wait up to timeout without load contribution + * swait_event_idle_timeout_exclusive - wait up to timeout without load contribution * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout at which we'll give up in jiffies @@ -288,7 +288,7 @@ do { \ * or the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed. */ -#define swait_event_idle_timeout(wq, condition, timeout) \ +#define swait_event_idle_timeout_exclusive(wq, condition, timeout) \ ({ \ long __ret = timeout; \ if (!___wait_cond_timeout(condition)) \ diff --git a/kernel/cpu.c b/kernel/cpu.c index 2f8f338e77cf..15be70aae8ac 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1344,6 +1344,11 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = perf_event_init_cpu, .teardown.single = perf_event_exit_cpu, }, + [CPUHP_AP_WATCHDOG_ONLINE] = { + .name = "lockup_detector:online", + .startup.single = lockup_detector_online_cpu, + .teardown.single = lockup_detector_offline_cpu, + }, [CPUHP_AP_WORKQUEUE_ONLINE] = { .name = "workqueue:online", .startup.single = workqueue_online_cpu, diff --git a/kernel/kthread.c b/kernel/kthread.c index 486dedbd9af5..087d18d771b5 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -190,7 +190,7 @@ static void __kthread_parkme(struct kthread *self) if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; - complete_all(&self->parked); + complete(&self->parked); schedule(); } __set_current_state(TASK_RUNNING); @@ -471,7 +471,6 @@ void kthread_unpark(struct task_struct *k) if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); - reinit_completion(&kthread->parked); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. @@ -499,6 +498,9 @@ int kthread_park(struct task_struct *k) if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; + if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))) + return -EBUSY; + set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 87331565e505..70178f6ffdc4 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -92,7 +92,7 @@ static void s2idle_enter(void) /* Push all the CPUs into the idle loop. */ wake_up_all_idle_cpus(); /* Make the current CPU wait so it can enter the idle loop too. */ - swait_event(s2idle_wait_head, + swait_event_exclusive(s2idle_wait_head, s2idle_state == S2IDLE_STATE_WAKE); cpuidle_pause(); @@ -160,7 +160,7 @@ void s2idle_wake(void) raw_spin_lock_irqsave(&s2idle_lock, flags); if (s2idle_state > S2IDLE_STATE_NONE) { s2idle_state = S2IDLE_STATE_WAKE; - swake_up(&s2idle_wait_head); + swake_up_one(&s2idle_wait_head); } raw_spin_unlock_irqrestore(&s2idle_lock, flags); } diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 622792abe41a..04fc2ed71af8 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -110,7 +110,7 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx) WRITE_ONCE(sp->srcu_lock_nesting[idx], newval); if (!newval && READ_ONCE(sp->srcu_gp_waiting)) - swake_up(&sp->srcu_wq); + swake_up_one(&sp->srcu_wq); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -140,7 +140,7 @@ void srcu_drive_gp(struct work_struct *wp) idx = sp->srcu_idx; WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ - swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); + swait_event_exclusive(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ /* Invoke the callbacks we removed above. */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6930934e8b9f..0b760c1369f7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1701,7 +1701,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) !READ_ONCE(rsp->gp_flags) || !rsp->gp_kthread) return; - swake_up(&rsp->gp_wq); + swake_up_one(&rsp->gp_wq); } /* @@ -2015,7 +2015,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) } /* - * Helper function for swait_event_idle() wakeup at force-quiescent-state + * Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state * time. */ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) @@ -2163,7 +2163,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gp_seq), TPS("reqwait")); rsp->gp_state = RCU_GP_WAIT_GPS; - swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & + swait_event_idle_exclusive(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); rsp->gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. */ @@ -2191,7 +2191,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gp_seq), TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; - ret = swait_event_idle_timeout(rsp->gp_wq, + ret = swait_event_idle_timeout_exclusive(rsp->gp_wq, rcu_gp_fqs_check_wake(rsp, &gf), j); rsp->gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index b3df3b770afb..0b2c2ad69629 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -212,7 +212,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (wake) { smp_mb(); /* EGP done before wake_up(). */ - swake_up(&rsp->expedited_wq); + swake_up_one(&rsp->expedited_wq); } break; } @@ -526,7 +526,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) jiffies_start = jiffies; for (;;) { - ret = swait_event_timeout( + ret = swait_event_timeout_exclusive( rsp->expedited_wq, sync_rcu_preempt_exp_done_unlocked(rnp_root), jiffies_stall); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c1b17f5b9361..a97c20ea9bce 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1926,8 +1926,8 @@ static void __wake_nocb_leader(struct rcu_data *rdp, bool force, WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); del_timer(&rdp->nocb_timer); raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - smp_mb(); /* ->nocb_leader_sleep before swake_up(). */ - swake_up(&rdp_leader->nocb_wq); + smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */ + swake_up_one(&rdp_leader->nocb_wq); } else { raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } @@ -2159,7 +2159,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) */ trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { - swait_event_interruptible( + swait_event_interruptible_exclusive( rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1], (d = rcu_seq_done(&rnp->gp_seq, c))); if (likely(d)) @@ -2188,7 +2188,7 @@ static void nocb_leader_wait(struct rcu_data *my_rdp) /* Wait for callbacks to appear. */ if (!rcu_nocb_poll) { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep")); - swait_event_interruptible(my_rdp->nocb_wq, + swait_event_interruptible_exclusive(my_rdp->nocb_wq, !READ_ONCE(my_rdp->nocb_leader_sleep)); raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); my_rdp->nocb_leader_sleep = true; @@ -2253,7 +2253,7 @@ static void nocb_leader_wait(struct rcu_data *my_rdp) raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { /* List was empty, so wake up the follower. */ - swake_up(&rdp->nocb_wq); + swake_up_one(&rdp->nocb_wq); } } @@ -2270,7 +2270,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) { for (;;) { trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep")); - swait_event_interruptible(rdp->nocb_wq, + swait_event_interruptible_exclusive(rdp->nocb_wq, READ_ONCE(rdp->nocb_follower_head)); if (smp_load_acquire(&rdp->nocb_follower_head)) { /* ^^^ Ensure CB invocation follows _head test. */ diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index d9a02b318108..7fe183404c38 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -20,7 +20,7 @@ obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle.o fair.o rt.o deadline.o obj-y += wait.o wait_bit.o swait.o completion.o -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fe365c9a08e9..deafa9fe602b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -17,6 +17,8 @@ #include "../workqueue_internal.h" #include "../smpboot.h" +#include "pelt.h" + #define CREATE_TRACE_POINTS #include @@ -44,14 +46,6 @@ const_debug unsigned int sysctl_sched_features = */ const_debug unsigned int sysctl_sched_nr_migrate = 32; -/* - * period over which we average the RT time consumption, measured - * in ms. - * - * default: 1s - */ -const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; - /* * period over which we measure -rt task CPU usage in us. * default: 1s @@ -183,9 +177,9 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->clock_task += delta; -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef HAVE_SCHED_AVG_IRQ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) - sched_rt_avg_update(rq, irq_delta + steal); + update_irq_load_avg(rq, irq_delta + steal); #endif } @@ -649,23 +643,6 @@ bool sched_can_stop_tick(struct rq *rq) return true; } #endif /* CONFIG_NO_HZ_FULL */ - -void sched_avg_update(struct rq *rq) -{ - s64 period = sched_avg_period(); - - while ((s64)(rq_clock(rq) - rq->age_stamp) > period) { - /* - * Inline assembly required to prevent the compiler - * optimising this loop into a divmod call. - * See __iter_div_u64_rem() for another example of this. - */ - asm("" : "+rm" (rq->age_stamp)); - rq->age_stamp += period; - rq->rt_avg /= 2; - } -} - #endif /* CONFIG_SMP */ #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ @@ -1199,6 +1176,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) __set_task_cpu(p, new_cpu); } +#ifdef CONFIG_NUMA_BALANCING static void __migrate_swap_task(struct task_struct *p, int cpu) { if (task_on_rq_queued(p)) { @@ -1280,16 +1258,17 @@ static int migrate_swap_stop(void *data) /* * Cross migrate two tasks */ -int migrate_swap(struct task_struct *cur, struct task_struct *p) +int migrate_swap(struct task_struct *cur, struct task_struct *p, + int target_cpu, int curr_cpu) { struct migration_swap_arg arg; int ret = -EINVAL; arg = (struct migration_swap_arg){ .src_task = cur, - .src_cpu = task_cpu(cur), + .src_cpu = curr_cpu, .dst_task = p, - .dst_cpu = task_cpu(p), + .dst_cpu = target_cpu, }; if (arg.src_cpu == arg.dst_cpu) @@ -1314,6 +1293,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) out: return ret; } +#endif /* CONFIG_NUMA_BALANCING */ /* * wait_task_inactive - wait for a thread to unschedule. @@ -2317,7 +2297,6 @@ static inline void init_schedstats(void) {} int sched_fork(unsigned long clone_flags, struct task_struct *p) { unsigned long flags; - int cpu = get_cpu(); __sched_fork(clone_flags, p); /* @@ -2353,14 +2332,12 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_reset_on_fork = 0; } - if (dl_prio(p->prio)) { - put_cpu(); + if (dl_prio(p->prio)) return -EAGAIN; - } else if (rt_prio(p->prio)) { + else if (rt_prio(p->prio)) p->sched_class = &rt_sched_class; - } else { + else p->sched_class = &fair_sched_class; - } init_entity_runnable_average(&p->se); @@ -2376,7 +2353,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * We're setting the CPU for the first time, we don't migrate, * so use __set_task_cpu(). */ - __set_task_cpu(p, cpu); + __set_task_cpu(p, smp_processor_id()); if (p->sched_class->task_fork) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -2393,8 +2370,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); #endif - - put_cpu(); return 0; } @@ -5714,13 +5689,6 @@ void set_rq_offline(struct rq *rq) } } -static void set_cpu_rq_start_time(unsigned int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - rq->age_stamp = sched_clock_cpu(cpu); -} - /* * used to mark begin/end of suspend/resume: */ @@ -5838,7 +5806,6 @@ static void sched_rq_cpu_starting(unsigned int cpu) int sched_cpu_starting(unsigned int cpu) { - set_cpu_rq_start_time(cpu); sched_rq_cpu_starting(cpu); sched_tick_start(cpu); return 0; @@ -6106,7 +6073,6 @@ void __init sched_init(void) #ifdef CONFIG_SMP idle_thread_set_boot_cpu(); - set_cpu_rq_start_time(smp_processor_id()); #endif init_sched_fair_class(); @@ -6785,6 +6751,16 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v) seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); + if (schedstat_enabled() && tg != &root_task_group) { + u64 ws = 0; + int i; + + for_each_possible_cpu(i) + ws += schedstat_val(tg->se[i]->statistics.wait_sum); + + seq_printf(sf, "wait_sum %llu\n", ws); + } + return 0; } #endif /* CONFIG_CFS_BANDWIDTH */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index c907fde01eaa..3fffad3bc8a8 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -53,9 +53,7 @@ struct sugov_cpu { unsigned int iowait_boost_max; u64 last_update; - /* The fields below are only needed when sharing a policy: */ - unsigned long util_cfs; - unsigned long util_dl; + unsigned long bw_dl; unsigned long max; /* The field below is for single-CPU policies only: */ @@ -179,33 +177,90 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } -static void sugov_get_util(struct sugov_cpu *sg_cpu) +/* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. + * + * The scheduler tracks the following metrics: + * + * cpu_util_{cfs,rt,dl,irq}() + * cpu_bw_dl() + * + * Where the cfs,rt and dl util numbers are tracked with the same metric and + * synchronized windows and are thus directly comparable. + * + * The cfs,rt,dl utilization are the running times measured with rq->clock_task + * which excludes things like IRQ and steal-time. These latter are then accrued + * in the irq utilization. + * + * The DL bandwidth number otoh is not a measured metric but a value computed + * based on the task model parameters and gives the minimal utilization + * required to meet deadlines. + */ +static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); + unsigned long util, irq, max; - sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); - sg_cpu->util_cfs = cpu_util_cfs(rq); - sg_cpu->util_dl = cpu_util_dl(rq); -} - -static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) -{ - struct rq *rq = cpu_rq(sg_cpu->cpu); + sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); + sg_cpu->bw_dl = cpu_bw_dl(rq); if (rt_rq_is_runnable(&rq->rt)) - return sg_cpu->max; + return max; /* - * Utilization required by DEADLINE must always be granted while, for - * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to - * gracefully reduce the frequency when no tasks show up for longer + * Early check to see if IRQ/steal time saturates the CPU, can be + * because of inaccuracies in how we track these -- see + * update_irq_load_avg(). + */ + irq = cpu_util_irq(rq); + if (unlikely(irq >= max)) + return max; + + /* + * Because the time spend on RT/DL tasks is visible as 'lost' time to + * CFS tasks and we use the same metric to track the effective + * utilization (PELT windows are synchronized) we can directly add them + * to obtain the CPU's actual utilization. + */ + util = cpu_util_cfs(rq); + util += cpu_util_rt(rq); + + /* + * We do not make cpu_util_dl() a permanent part of this sum because we + * want to use cpu_bw_dl() later on, but we need to check if the + * CFS+RT+DL sum is saturated (ie. no idle time) such that we select + * f_max when there is no idle time. + * + * NOTE: numerical errors or stop class might cause us to not quite hit + * saturation when we should -- something for later. + */ + if ((util + cpu_util_dl(rq)) >= max) + return max; + + /* + * There is still idle time; further improve the number by using the + * irq metric. Because IRQ/steal time is hidden from the task clock we + * need to scale the task numbers: + * + * 1 - irq + * U' = irq + ------- * U + * max + */ + util = scale_irq_capacity(util, irq, max); + util += irq; + + /* + * Bandwidth required by DEADLINE must always be granted while, for + * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism + * to gracefully reduce the frequency when no tasks show up for longer * periods of time. * - * Ideally we would like to set util_dl as min/guaranteed freq and - * util_cfs + util_dl as requested freq. However, cpufreq is not yet - * ready for such an interface. So, we only do the latter for now. + * Ideally we would like to set bw_dl as min/guaranteed freq and util + + * bw_dl as requested freq. However, cpufreq is not yet ready for such + * an interface. So, we only do the latter for now. */ - return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs)); + return min(max, util + sg_cpu->bw_dl); } /** @@ -360,7 +415,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } */ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) { - if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl) + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) sg_policy->need_freq_update = true; } @@ -383,9 +438,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, busy = sugov_cpu_is_busy(sg_cpu); - sugov_get_util(sg_cpu); + util = sugov_get_util(sg_cpu); max = sg_cpu->max; - util = sugov_aggregate_util(sg_cpu); sugov_iowait_apply(sg_cpu, time, &util, &max); next_f = get_next_freq(sg_policy, util, max); /* @@ -424,9 +478,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); unsigned long j_util, j_max; - sugov_get_util(j_sg_cpu); + j_util = sugov_get_util(j_sg_cpu); j_max = j_sg_cpu->max; - j_util = sugov_aggregate_util(j_sg_cpu); sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max); if (j_util * max > j_max * util) { diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b5fbdde6afa9..997ea7b839fa 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -16,6 +16,7 @@ * Fabio Checconi */ #include "sched.h" +#include "pelt.h" struct dl_bandwidth def_dl_bandwidth; @@ -1179,8 +1180,6 @@ static void update_curr_dl(struct rq *rq) curr->se.exec_start = now; cgroup_account_cputime(curr, delta_exec); - sched_rt_avg_update(rq, delta_exec); - if (dl_entity_is_special(dl_se)) return; @@ -1761,6 +1760,9 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) deadline_queue_push_tasks(rq); + if (rq->curr->sched_class != &dl_sched_class) + update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + return p; } @@ -1768,6 +1770,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { update_curr_dl(rq); + update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -1784,6 +1787,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) { update_curr_dl(rq); + update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); /* * Even when we have runtime, update_curr_dl() might have resulted in us * not being the leftmost task anymore. In that case NEED_RESCHED will diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index e593b4118578..870d4f3da285 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -111,20 +111,19 @@ static int sched_feat_set(char *cmp) cmp += 3; } - for (i = 0; i < __SCHED_FEAT_NR; i++) { - if (strcmp(cmp, sched_feat_names[i]) == 0) { - if (neg) { - sysctl_sched_features &= ~(1UL << i); - sched_feat_disable(i); - } else { - sysctl_sched_features |= (1UL << i); - sched_feat_enable(i); - } - break; - } + i = match_string(sched_feat_names, __SCHED_FEAT_NR, cmp); + if (i < 0) + return i; + + if (neg) { + sysctl_sched_features &= ~(1UL << i); + sched_feat_disable(i); + } else { + sysctl_sched_features |= (1UL << i); + sched_feat_enable(i); } - return i; + return 0; } static ssize_t @@ -133,7 +132,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, { char buf[64]; char *cmp; - int i; + int ret; struct inode *inode; if (cnt > 63) @@ -148,10 +147,10 @@ sched_feat_write(struct file *filp, const char __user *ubuf, /* Ensure the static_key remains in a consistent state */ inode = file_inode(filp); inode_lock(inode); - i = sched_feat_set(cmp); + ret = sched_feat_set(cmp); inode_unlock(inode); - if (i == __SCHED_FEAT_NR) - return -EINVAL; + if (ret < 0) + return ret; *ppos += cnt; @@ -843,8 +842,8 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, unsigned long tpf, unsigned long gsf, unsigned long gpf) { SEQ_printf(m, "numa_faults node=%d ", node); - SEQ_printf(m, "task_private=%lu task_shared=%lu ", tsf, tpf); - SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gsf, gpf); + SEQ_printf(m, "task_private=%lu task_shared=%lu ", tpf, tsf); + SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gpf, gsf); } #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2f0a0be4d344..309c93fcc604 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -255,9 +255,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) return cfs_rq->rq; } -/* An entity is a task if it doesn't "own" a runqueue */ -#define entity_is_task(se) (!se->my_q) - static inline struct task_struct *task_of(struct sched_entity *se) { SCHED_WARN_ON(!entity_is_task(se)); @@ -419,7 +416,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) return container_of(cfs_rq, struct rq, cfs); } -#define entity_is_task(se) 1 #define for_each_sched_entity(se) \ for (; se; se = NULL) @@ -692,7 +688,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP - +#include "pelt.h" #include "sched-pelt.h" static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); @@ -735,11 +731,12 @@ static void attach_entity_cfs_rq(struct sched_entity *se); * To solve this problem, we also cap the util_avg of successive tasks to * only 1/2 of the left utilization budget: * - * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n + * util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n * - * where n denotes the nth task. + * where n denotes the nth task and cpu_scale the CPU capacity. * - * For example, a simplest series from the beginning would be like: + * For example, for a CPU with 1024 of capacity, a simplest series from + * the beginning would be like: * * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... @@ -751,7 +748,8 @@ void post_init_entity_util_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; - long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; + long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); + long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -1314,7 +1312,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * of each group. Skip other nodes. */ if (sched_numa_topology_type == NUMA_BACKPLANE && - dist > maxdist) + dist >= maxdist) continue; /* Add up the faults from nearby nodes. */ @@ -1452,15 +1450,12 @@ static unsigned long capacity_of(int cpu); /* Cached statistics for all CPUs within a node */ struct numa_stats { - unsigned long nr_running; unsigned long load; /* Total compute capacity of CPUs on a node */ unsigned long compute_capacity; - /* Approximate capacity in terms of runnable tasks on a node */ - unsigned long task_capacity; - int has_free_capacity; + unsigned int nr_running; }; /* @@ -1487,8 +1482,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) * the @ns structure is NULL'ed and task_numa_compare() will * not find this node attractive. * - * We'll either bail at !has_free_capacity, or we'll detect a huge - * imbalance and bail there. + * We'll detect a huge imbalance and bail there. */ if (!cpus) return; @@ -1497,9 +1491,8 @@ static void update_numa_stats(struct numa_stats *ns, int nid) smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); capacity = cpus / smt; /* cores */ - ns->task_capacity = min_t(unsigned, capacity, + capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); - ns->has_free_capacity = (ns->nr_running < ns->task_capacity); } struct task_numa_env { @@ -1548,28 +1541,12 @@ static bool load_too_imbalanced(long src_load, long dst_load, src_capacity = env->src_stats.compute_capacity; dst_capacity = env->dst_stats.compute_capacity; - /* We care about the slope of the imbalance, not the direction. */ - if (dst_load < src_load) - swap(dst_load, src_load); + imb = abs(dst_load * src_capacity - src_load * dst_capacity); - /* Is the difference below the threshold? */ - imb = dst_load * src_capacity * 100 - - src_load * dst_capacity * env->imbalance_pct; - if (imb <= 0) - return false; - - /* - * The imbalance is above the allowed threshold. - * Compare it with the old imbalance. - */ orig_src_load = env->src_stats.load; orig_dst_load = env->dst_stats.load; - if (orig_dst_load < orig_src_load) - swap(orig_dst_load, orig_src_load); - - old_imb = orig_dst_load * src_capacity * 100 - - orig_src_load * dst_capacity * env->imbalance_pct; + old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity); /* Would this change make things worse? */ return (imb > old_imb); @@ -1582,9 +1559,8 @@ static bool load_too_imbalanced(long src_load, long dst_load, * be exchanged with the source task */ static void task_numa_compare(struct task_numa_env *env, - long taskimp, long groupimp) + long taskimp, long groupimp, bool maymove) { - struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; long src_load, dst_load; @@ -1605,97 +1581,73 @@ static void task_numa_compare(struct task_numa_env *env, if (cur == env->p) goto unlock; + if (!cur) { + if (maymove || imp > env->best_imp) + goto assign; + else + goto unlock; + } + /* * "imp" is the fault differential for the source task between the * source and destination node. Calculate the total differential for * the source task and potential destination task. The more negative - * the value is, the more rmeote accesses that would be expected to + * the value is, the more remote accesses that would be expected to * be incurred if the tasks were swapped. */ - if (cur) { - /* Skip this swap candidate if cannot move to the source CPU: */ - if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) - goto unlock; - - /* - * If dst and source tasks are in the same NUMA group, or not - * in any group then look only at task weights. - */ - if (cur->numa_group == env->p->numa_group) { - imp = taskimp + task_weight(cur, env->src_nid, dist) - - task_weight(cur, env->dst_nid, dist); - /* - * Add some hysteresis to prevent swapping the - * tasks within a group over tiny differences. - */ - if (cur->numa_group) - imp -= imp/16; - } else { - /* - * Compare the group weights. If a task is all by - * itself (not part of a group), use the task weight - * instead. - */ - if (cur->numa_group) - imp += group_weight(cur, env->src_nid, dist) - - group_weight(cur, env->dst_nid, dist); - else - imp += task_weight(cur, env->src_nid, dist) - - task_weight(cur, env->dst_nid, dist); - } - } - - if (imp <= env->best_imp && moveimp <= env->best_imp) + /* Skip this swap candidate if cannot move to the source cpu */ + if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) goto unlock; - if (!cur) { - /* Is there capacity at our destination? */ - if (env->src_stats.nr_running <= env->src_stats.task_capacity && - !env->dst_stats.has_free_capacity) - goto unlock; - - goto balance; - } - - /* Balance doesn't matter much if we're running a task per CPU: */ - if (imp > env->best_imp && src_rq->nr_running == 1 && - dst_rq->nr_running == 1) - goto assign; - /* - * In the overloaded case, try and keep the load balanced. + * If dst and source tasks are in the same NUMA group, or not + * in any group then look only at task weights. */ -balance: - load = task_h_load(env->p); - dst_load = env->dst_stats.load + load; - src_load = env->src_stats.load - load; - - if (moveimp > imp && moveimp > env->best_imp) { + if (cur->numa_group == env->p->numa_group) { + imp = taskimp + task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); /* - * If the improvement from just moving env->p direction is - * better than swapping tasks around, check if a move is - * possible. Store a slightly smaller score than moveimp, - * so an actually idle CPU will win. + * Add some hysteresis to prevent swapping the + * tasks within a group over tiny differences. */ - if (!load_too_imbalanced(src_load, dst_load, env)) { - imp = moveimp - 1; - cur = NULL; - goto assign; - } + if (cur->numa_group) + imp -= imp / 16; + } else { + /* + * Compare the group weights. If a task is all by itself + * (not part of a group), use the task weight instead. + */ + if (cur->numa_group && env->p->numa_group) + imp += group_weight(cur, env->src_nid, dist) - + group_weight(cur, env->dst_nid, dist); + else + imp += task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); } if (imp <= env->best_imp) goto unlock; - if (cur) { - load = task_h_load(cur); - dst_load -= load; - src_load += load; + if (maymove && moveimp > imp && moveimp > env->best_imp) { + imp = moveimp - 1; + cur = NULL; + goto assign; } + /* + * In the overloaded case, try and keep the load balanced. + */ + load = task_h_load(env->p) - task_h_load(cur); + if (!load) + goto assign; + + dst_load = env->dst_stats.load + load; + src_load = env->src_stats.load - load; + if (load_too_imbalanced(src_load, dst_load, env)) goto unlock; +assign: /* * One idle CPU per node is evaluated for a task numa move. * Call select_idle_sibling to maybe find a better one. @@ -1711,7 +1663,6 @@ static void task_numa_compare(struct task_numa_env *env, local_irq_enable(); } -assign: task_numa_assign(env, cur, imp); unlock: rcu_read_unlock(); @@ -1720,43 +1671,30 @@ static void task_numa_compare(struct task_numa_env *env, static void task_numa_find_cpu(struct task_numa_env *env, long taskimp, long groupimp) { + long src_load, dst_load, load; + bool maymove = false; int cpu; + load = task_h_load(env->p); + dst_load = env->dst_stats.load + load; + src_load = env->src_stats.load - load; + + /* + * If the improvement from just moving env->p direction is better + * than swapping tasks around, check if a move is possible. + */ + maymove = !load_too_imbalanced(src_load, dst_load, env); + for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { /* Skip this CPU if the source task cannot migrate */ if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) continue; env->dst_cpu = cpu; - task_numa_compare(env, taskimp, groupimp); + task_numa_compare(env, taskimp, groupimp, maymove); } } -/* Only move tasks to a NUMA node less busy than the current node. */ -static bool numa_has_capacity(struct task_numa_env *env) -{ - struct numa_stats *src = &env->src_stats; - struct numa_stats *dst = &env->dst_stats; - - if (src->has_free_capacity && !dst->has_free_capacity) - return false; - - /* - * Only consider a task move if the source has a higher load - * than the destination, corrected for CPU capacity on each node. - * - * src->load dst->load - * --------------------- vs --------------------- - * src->compute_capacity dst->compute_capacity - */ - if (src->load * dst->compute_capacity * env->imbalance_pct > - - dst->load * src->compute_capacity * 100) - return true; - - return false; -} - static int task_numa_migrate(struct task_struct *p) { struct task_numa_env env = { @@ -1797,7 +1735,7 @@ static int task_numa_migrate(struct task_struct *p) * elsewhere, so there is no point in (re)trying. */ if (unlikely(!sd)) { - p->numa_preferred_nid = task_node(p); + sched_setnuma(p, task_node(p)); return -EINVAL; } @@ -1811,8 +1749,7 @@ static int task_numa_migrate(struct task_struct *p) update_numa_stats(&env.dst_stats, env.dst_nid); /* Try to find a spot on the preferred nid. */ - if (numa_has_capacity(&env)) - task_numa_find_cpu(&env, taskimp, groupimp); + task_numa_find_cpu(&env, taskimp, groupimp); /* * Look at other nodes in these cases: @@ -1842,8 +1779,7 @@ static int task_numa_migrate(struct task_struct *p) env.dist = dist; env.dst_nid = nid; update_numa_stats(&env.dst_stats, env.dst_nid); - if (numa_has_capacity(&env)) - task_numa_find_cpu(&env, taskimp, groupimp); + task_numa_find_cpu(&env, taskimp, groupimp); } } @@ -1856,15 +1792,13 @@ static int task_numa_migrate(struct task_struct *p) * trying for a better one later. Do not set the preferred node here. */ if (p->numa_group) { - struct numa_group *ng = p->numa_group; - if (env.best_cpu == -1) nid = env.src_nid; else - nid = env.dst_nid; + nid = cpu_to_node(env.best_cpu); - if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng)) - sched_setnuma(p, env.dst_nid); + if (nid != p->numa_preferred_nid) + sched_setnuma(p, nid); } /* No better CPU than the current one was found. */ @@ -1884,7 +1818,8 @@ static int task_numa_migrate(struct task_struct *p) return ret; } - ret = migrate_swap(p, env.best_task); + ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu); + if (ret != 0) trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); put_task_struct(env.best_task); @@ -2144,8 +2079,8 @@ static int preferred_group_nid(struct task_struct *p, int nid) static void task_numa_placement(struct task_struct *p) { - int seq, nid, max_nid = -1, max_group_nid = -1; - unsigned long max_faults = 0, max_group_faults = 0; + int seq, nid, max_nid = -1; + unsigned long max_faults = 0; unsigned long fault_types[2] = { 0, 0 }; unsigned long total_faults; u64 runtime, period; @@ -2224,33 +2159,30 @@ static void task_numa_placement(struct task_struct *p) } } - if (faults > max_faults) { - max_faults = faults; + if (!p->numa_group) { + if (faults > max_faults) { + max_faults = faults; + max_nid = nid; + } + } else if (group_faults > max_faults) { + max_faults = group_faults; max_nid = nid; } - - if (group_faults > max_group_faults) { - max_group_faults = group_faults; - max_group_nid = nid; - } } - update_task_scan_period(p, fault_types[0], fault_types[1]); - if (p->numa_group) { numa_group_count_active_nodes(p->numa_group); spin_unlock_irq(group_lock); - max_nid = preferred_group_nid(p, max_group_nid); + max_nid = preferred_group_nid(p, max_nid); } if (max_faults) { /* Set the new preferred node */ if (max_nid != p->numa_preferred_nid) sched_setnuma(p, max_nid); - - if (task_node(p) != p->numa_preferred_nid) - numa_migrate_preferred(p); } + + update_task_scan_period(p, fault_types[0], fault_types[1]); } static inline int get_numa_group(struct numa_group *grp) @@ -2450,14 +2382,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) numa_is_active_node(mem_node, ng)) local = 1; - task_numa_placement(p); - /* * Retry task to preferred node migration periodically, in case it * case it previously failed, or the scheduler moved us. */ - if (time_after(jiffies, p->numa_migrate_retry)) + if (time_after(jiffies, p->numa_migrate_retry)) { + task_numa_placement(p); numa_migrate_preferred(p); + } if (migrated) p->numa_pages_migrated += pages; @@ -2749,19 +2681,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } while (0) #ifdef CONFIG_SMP -/* - * XXX we want to get rid of these helpers and use the full load resolution. - */ -static inline long se_weight(struct sched_entity *se) -{ - return scale_load_down(se->load.weight); -} - -static inline long se_runnable(struct sched_entity *se) -{ - return scale_load_down(se->runnable_weight); -} - static inline void enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -3062,314 +2981,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) } #ifdef CONFIG_SMP -/* - * Approximate: - * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) - */ -static u64 decay_load(u64 val, u64 n) -{ - unsigned int local_n; - - if (unlikely(n > LOAD_AVG_PERIOD * 63)) - return 0; - - /* after bounds checking we can collapse to 32-bit */ - local_n = n; - - /* - * As y^PERIOD = 1/2, we can combine - * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD) - * With a look-up table which covers y^n (n= LOAD_AVG_PERIOD)) { - val >>= local_n / LOAD_AVG_PERIOD; - local_n %= LOAD_AVG_PERIOD; - } - - val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32); - return val; -} - -static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) -{ - u32 c1, c2, c3 = d3; /* y^0 == 1 */ - - /* - * c1 = d1 y^p - */ - c1 = decay_load((u64)d1, periods); - - /* - * p-1 - * c2 = 1024 \Sum y^n - * n=1 - * - * inf inf - * = 1024 ( \Sum y^n - \Sum y^n - y^0 ) - * n=0 n=p - */ - c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024; - - return c1 + c2 + c3; -} - -/* - * Accumulate the three separate parts of the sum; d1 the remainder - * of the last (incomplete) period, d2 the span of full periods and d3 - * the remainder of the (incomplete) current period. - * - * d1 d2 d3 - * ^ ^ ^ - * | | | - * |<->|<----------------->|<--->| - * ... |---x---|------| ... |------|-----x (now) - * - * p-1 - * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0 - * n=1 - * - * = u y^p + (Step 1) - * - * p-1 - * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2) - * n=1 - */ -static __always_inline u32 -accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, - unsigned long load, unsigned long runnable, int running) -{ - unsigned long scale_freq, scale_cpu; - u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ - u64 periods; - - scale_freq = arch_scale_freq_capacity(cpu); - scale_cpu = arch_scale_cpu_capacity(NULL, cpu); - - delta += sa->period_contrib; - periods = delta / 1024; /* A period is 1024us (~1ms) */ - - /* - * Step 1: decay old *_sum if we crossed period boundaries. - */ - if (periods) { - sa->load_sum = decay_load(sa->load_sum, periods); - sa->runnable_load_sum = - decay_load(sa->runnable_load_sum, periods); - sa->util_sum = decay_load((u64)(sa->util_sum), periods); - - /* - * Step 2 - */ - delta %= 1024; - contrib = __accumulate_pelt_segments(periods, - 1024 - sa->period_contrib, delta); - } - sa->period_contrib = delta; - - contrib = cap_scale(contrib, scale_freq); - if (load) - sa->load_sum += load * contrib; - if (runnable) - sa->runnable_load_sum += runnable * contrib; - if (running) - sa->util_sum += contrib * scale_cpu; - - return periods; -} - -/* - * We can represent the historical contribution to runnable average as the - * coefficients of a geometric series. To do this we sub-divide our runnable - * history into segments of approximately 1ms (1024us); label the segment that - * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. - * - * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... - * p0 p1 p2 - * (now) (~1ms ago) (~2ms ago) - * - * Let u_i denote the fraction of p_i that the entity was runnable. - * - * We then designate the fractions u_i as our co-efficients, yielding the - * following representation of historical load: - * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... - * - * We choose y based on the with of a reasonably scheduling period, fixing: - * y^32 = 0.5 - * - * This means that the contribution to load ~32ms ago (u_32) will be weighted - * approximately half as much as the contribution to load within the last ms - * (u_0). - * - * When a period "rolls over" and we have new u_0`, multiplying the previous - * sum again by y is sufficient to update: - * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) - * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] - */ -static __always_inline int -___update_load_sum(u64 now, int cpu, struct sched_avg *sa, - unsigned long load, unsigned long runnable, int running) -{ - u64 delta; - - delta = now - sa->last_update_time; - /* - * This should only happen when time goes backwards, which it - * unfortunately does during sched clock init when we swap over to TSC. - */ - if ((s64)delta < 0) { - sa->last_update_time = now; - return 0; - } - - /* - * Use 1024ns as the unit of measurement since it's a reasonable - * approximation of 1us and fast to compute. - */ - delta >>= 10; - if (!delta) - return 0; - - sa->last_update_time += delta << 10; - - /* - * running is a subset of runnable (weight) so running can't be set if - * runnable is clear. But there are some corner cases where the current - * se has been already dequeued but cfs_rq->curr still points to it. - * This means that weight will be 0 but not running for a sched_entity - * but also for a cfs_rq if the latter becomes idle. As an example, - * this happens during idle_balance() which calls - * update_blocked_averages() - */ - if (!load) - runnable = running = 0; - - /* - * Now we know we crossed measurement unit boundaries. The *_avg - * accrues by two steps: - * - * Step 1: accumulate *_sum since last_update_time. If we haven't - * crossed period boundaries, finish. - */ - if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) - return 0; - - return 1; -} - -static __always_inline void -___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) -{ - u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; - - /* - * Step 2: update *_avg. - */ - sa->load_avg = div_u64(load * sa->load_sum, divider); - sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); - sa->util_avg = sa->util_sum / divider; -} - -/* - * When a task is dequeued, its estimated utilization should not be update if - * its util_avg has not been updated at least once. - * This flag is used to synchronize util_avg updates with util_est updates. - * We map this information into the LSB bit of the utilization saved at - * dequeue time (i.e. util_est.dequeued). - */ -#define UTIL_AVG_UNCHANGED 0x1 - -static inline void cfs_se_util_change(struct sched_avg *avg) -{ - unsigned int enqueued; - - if (!sched_feat(UTIL_EST)) - return; - - /* Avoid store if the flag has been already set */ - enqueued = avg->util_est.enqueued; - if (!(enqueued & UTIL_AVG_UNCHANGED)) - return; - - /* Reset flag to report util_avg has been updated */ - enqueued &= ~UTIL_AVG_UNCHANGED; - WRITE_ONCE(avg->util_est.enqueued, enqueued); -} - -/* - * sched_entity: - * - * task: - * se_runnable() == se_weight() - * - * group: [ see update_cfs_group() ] - * se_weight() = tg->weight * grq->load_avg / tg->load_avg - * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg - * - * load_sum := runnable_sum - * load_avg = se_weight(se) * runnable_avg - * - * runnable_load_sum := runnable_sum - * runnable_load_avg = se_runnable(se) * runnable_avg - * - * XXX collapse load_sum and runnable_load_sum - * - * cfq_rs: - * - * load_sum = \Sum se_weight(se) * se->avg.load_sum - * load_avg = \Sum se->avg.load_avg - * - * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum - * runnable_load_avg = \Sum se->avg.runable_load_avg - */ - -static int -__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) -{ - if (entity_is_task(se)) - se->runnable_weight = se->load.weight; - - if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); - return 1; - } - - return 0; -} - -static int -__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - if (entity_is_task(se)) - se->runnable_weight = se->load.weight; - - if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, - cfs_rq->curr == se)) { - - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); - cfs_se_util_change(&se->avg); - return 1; - } - - return 0; -} - -static int -__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) -{ - if (___update_load_sum(now, cpu, &cfs_rq->avg, - scale_load_down(cfs_rq->load.weight), - scale_load_down(cfs_rq->runnable_weight), - cfs_rq->curr != NULL)) { - - ___update_load_avg(&cfs_rq->avg, 1, 1); - return 1; - } - - return 0; -} - #ifdef CONFIG_FAIR_GROUP_SCHED /** * update_tg_load_avg - update the tg's load avg @@ -4037,12 +3648,6 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) #else /* CONFIG_SMP */ -static inline int -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) -{ - return 0; -} - #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 #define DO_ATTACH 0x0 @@ -4726,7 +4331,6 @@ static inline int throttled_lb_pair(struct task_group *tg, throttled_hierarchy(dest_cfs_rq); } -/* updated child weight may affect parent so we have to do this bottom up */ static int tg_unthrottle_up(struct task_group *tg, void *data) { struct rq *rq = data; @@ -5653,8 +5257,6 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load, this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; } - - sched_avg_update(this_rq); } /* Used instead of source_load when we know the type == 0 */ @@ -7294,8 +6896,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env) static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { struct numa_group *numa_group = rcu_dereference(p->numa_group); - unsigned long src_faults, dst_faults; - int src_nid, dst_nid; + unsigned long src_weight, dst_weight; + int src_nid, dst_nid, dist; if (!static_branch_likely(&sched_numa_balancing)) return -1; @@ -7322,18 +6924,19 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) return 0; /* Leaving a core idle is often worse than degrading locality. */ - if (env->idle != CPU_NOT_IDLE) + if (env->idle == CPU_IDLE) return -1; + dist = node_distance(src_nid, dst_nid); if (numa_group) { - src_faults = group_faults(p, src_nid); - dst_faults = group_faults(p, dst_nid); + src_weight = group_weight(p, src_nid, dist); + dst_weight = group_weight(p, dst_nid, dist); } else { - src_faults = task_faults(p, src_nid); - dst_faults = task_faults(p, dst_nid); + src_weight = task_weight(p, src_nid, dist); + dst_weight = task_weight(p, dst_nid, dist); } - return dst_faults < src_faults; + return dst_weight < src_weight; } #else @@ -7620,6 +7223,22 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) return false; } +static inline bool others_have_blocked(struct rq *rq) +{ + if (READ_ONCE(rq->avg_rt.util_avg)) + return true; + + if (READ_ONCE(rq->avg_dl.util_avg)) + return true; + +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + if (READ_ONCE(rq->avg_irq.util_avg)) + return true; +#endif + + return false; +} + #ifdef CONFIG_FAIR_GROUP_SCHED static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) @@ -7679,6 +7298,12 @@ static void update_blocked_averages(int cpu) if (cfs_rq_has_blocked(cfs_rq)) done = false; } + update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); + update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + update_irq_load_avg(rq, 0); + /* Don't need periodic decay once load/util_avg are null */ + if (others_have_blocked(rq)) + done = false; #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; @@ -7744,9 +7369,12 @@ static inline void update_blocked_averages(int cpu) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); + update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + update_irq_load_avg(rq, 0); #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; - if (!cfs_rq_has_blocked(cfs_rq)) + if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq)) rq->has_blocked_load = 0; #endif rq_unlock_irqrestore(rq, &rf); @@ -7856,39 +7484,32 @@ static inline int get_sd_load_idx(struct sched_domain *sd, static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); - u64 total, used, age_stamp, avg; - s64 delta; + unsigned long max = arch_scale_cpu_capacity(NULL, cpu); + unsigned long used, free; + unsigned long irq; - /* - * Since we're reading these variables without serialization make sure - * we read them once before doing sanity checks on them. - */ - age_stamp = READ_ONCE(rq->age_stamp); - avg = READ_ONCE(rq->rt_avg); - delta = __rq_clock_broken(rq) - age_stamp; + irq = cpu_util_irq(rq); - if (unlikely(delta < 0)) - delta = 0; + if (unlikely(irq >= max)) + return 1; - total = sched_avg_period() + delta; + used = READ_ONCE(rq->avg_rt.util_avg); + used += READ_ONCE(rq->avg_dl.util_avg); - used = div_u64(avg, total); + if (unlikely(used >= max)) + return 1; - if (likely(used < SCHED_CAPACITY_SCALE)) - return SCHED_CAPACITY_SCALE - used; + free = max - used; - return 1; + return scale_irq_capacity(free, irq, max); } static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); + unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; - cpu_rq(cpu)->cpu_capacity_orig = capacity; - - capacity *= scale_rt_capacity(cpu); - capacity >>= SCHED_CAPACITY_SHIFT; + cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); if (!capacity) capacity = 1; diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c new file mode 100644 index 000000000000..35475c0c5419 --- /dev/null +++ b/kernel/sched/pelt.c @@ -0,0 +1,399 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Per Entity Load Tracking + * + * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar + * + * Interactivity improvements by Mike Galbraith + * (C) 2007 Mike Galbraith + * + * Various enhancements by Dmitry Adamushko. + * (C) 2007 Dmitry Adamushko + * + * Group scheduling enhancements by Srivatsa Vaddagiri + * Copyright IBM Corporation, 2007 + * Author: Srivatsa Vaddagiri + * + * Scaled math optimizations by Thomas Gleixner + * Copyright (C) 2007, Thomas Gleixner + * + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * + * Move PELT related code from fair.c into this pelt.c file + * Author: Vincent Guittot + */ + +#include +#include "sched.h" +#include "sched-pelt.h" +#include "pelt.h" + +/* + * Approximate: + * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) + */ +static u64 decay_load(u64 val, u64 n) +{ + unsigned int local_n; + + if (unlikely(n > LOAD_AVG_PERIOD * 63)) + return 0; + + /* after bounds checking we can collapse to 32-bit */ + local_n = n; + + /* + * As y^PERIOD = 1/2, we can combine + * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD) + * With a look-up table which covers y^n (n= LOAD_AVG_PERIOD)) { + val >>= local_n / LOAD_AVG_PERIOD; + local_n %= LOAD_AVG_PERIOD; + } + + val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32); + return val; +} + +static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) +{ + u32 c1, c2, c3 = d3; /* y^0 == 1 */ + + /* + * c1 = d1 y^p + */ + c1 = decay_load((u64)d1, periods); + + /* + * p-1 + * c2 = 1024 \Sum y^n + * n=1 + * + * inf inf + * = 1024 ( \Sum y^n - \Sum y^n - y^0 ) + * n=0 n=p + */ + c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024; + + return c1 + c2 + c3; +} + +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + +/* + * Accumulate the three separate parts of the sum; d1 the remainder + * of the last (incomplete) period, d2 the span of full periods and d3 + * the remainder of the (incomplete) current period. + * + * d1 d2 d3 + * ^ ^ ^ + * | | | + * |<->|<----------------->|<--->| + * ... |---x---|------| ... |------|-----x (now) + * + * p-1 + * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0 + * n=1 + * + * = u y^p + (Step 1) + * + * p-1 + * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2) + * n=1 + */ +static __always_inline u32 +accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, + unsigned long load, unsigned long runnable, int running) +{ + unsigned long scale_freq, scale_cpu; + u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ + u64 periods; + + scale_freq = arch_scale_freq_capacity(cpu); + scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + + delta += sa->period_contrib; + periods = delta / 1024; /* A period is 1024us (~1ms) */ + + /* + * Step 1: decay old *_sum if we crossed period boundaries. + */ + if (periods) { + sa->load_sum = decay_load(sa->load_sum, periods); + sa->runnable_load_sum = + decay_load(sa->runnable_load_sum, periods); + sa->util_sum = decay_load((u64)(sa->util_sum), periods); + + /* + * Step 2 + */ + delta %= 1024; + contrib = __accumulate_pelt_segments(periods, + 1024 - sa->period_contrib, delta); + } + sa->period_contrib = delta; + + contrib = cap_scale(contrib, scale_freq); + if (load) + sa->load_sum += load * contrib; + if (runnable) + sa->runnable_load_sum += runnable * contrib; + if (running) + sa->util_sum += contrib * scale_cpu; + + return periods; +} + +/* + * We can represent the historical contribution to runnable average as the + * coefficients of a geometric series. To do this we sub-divide our runnable + * history into segments of approximately 1ms (1024us); label the segment that + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. + * + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... + * p0 p1 p2 + * (now) (~1ms ago) (~2ms ago) + * + * Let u_i denote the fraction of p_i that the entity was runnable. + * + * We then designate the fractions u_i as our co-efficients, yielding the + * following representation of historical load: + * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... + * + * We choose y based on the with of a reasonably scheduling period, fixing: + * y^32 = 0.5 + * + * This means that the contribution to load ~32ms ago (u_32) will be weighted + * approximately half as much as the contribution to load within the last ms + * (u_0). + * + * When a period "rolls over" and we have new u_0`, multiplying the previous + * sum again by y is sufficient to update: + * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) + * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] + */ +static __always_inline int +___update_load_sum(u64 now, int cpu, struct sched_avg *sa, + unsigned long load, unsigned long runnable, int running) +{ + u64 delta; + + delta = now - sa->last_update_time; + /* + * This should only happen when time goes backwards, which it + * unfortunately does during sched clock init when we swap over to TSC. + */ + if ((s64)delta < 0) { + sa->last_update_time = now; + return 0; + } + + /* + * Use 1024ns as the unit of measurement since it's a reasonable + * approximation of 1us and fast to compute. + */ + delta >>= 10; + if (!delta) + return 0; + + sa->last_update_time += delta << 10; + + /* + * running is a subset of runnable (weight) so running can't be set if + * runnable is clear. But there are some corner cases where the current + * se has been already dequeued but cfs_rq->curr still points to it. + * This means that weight will be 0 but not running for a sched_entity + * but also for a cfs_rq if the latter becomes idle. As an example, + * this happens during idle_balance() which calls + * update_blocked_averages() + */ + if (!load) + runnable = running = 0; + + /* + * Now we know we crossed measurement unit boundaries. The *_avg + * accrues by two steps: + * + * Step 1: accumulate *_sum since last_update_time. If we haven't + * crossed period boundaries, finish. + */ + if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) + return 0; + + return 1; +} + +static __always_inline void +___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) +{ + u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + + /* + * Step 2: update *_avg. + */ + sa->load_avg = div_u64(load * sa->load_sum, divider); + sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); + WRITE_ONCE(sa->util_avg, sa->util_sum / divider); +} + +/* + * sched_entity: + * + * task: + * se_runnable() == se_weight() + * + * group: [ see update_cfs_group() ] + * se_weight() = tg->weight * grq->load_avg / tg->load_avg + * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg + * + * load_sum := runnable_sum + * load_avg = se_weight(se) * runnable_avg + * + * runnable_load_sum := runnable_sum + * runnable_load_avg = se_runnable(se) * runnable_avg + * + * XXX collapse load_sum and runnable_load_sum + * + * cfq_rq: + * + * load_sum = \Sum se_weight(se) * se->avg.load_sum + * load_avg = \Sum se->avg.load_avg + * + * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum + * runnable_load_avg = \Sum se->avg.runable_load_avg + */ + +int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) +{ + if (entity_is_task(se)) + se->runnable_weight = se->load.weight; + + if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + return 1; + } + + return 0; +} + +int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (entity_is_task(se)) + se->runnable_weight = se->load.weight; + + if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, + cfs_rq->curr == se)) { + + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + cfs_se_util_change(&se->avg); + return 1; + } + + return 0; +} + +int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) +{ + if (___update_load_sum(now, cpu, &cfs_rq->avg, + scale_load_down(cfs_rq->load.weight), + scale_load_down(cfs_rq->runnable_weight), + cfs_rq->curr != NULL)) { + + ___update_load_avg(&cfs_rq->avg, 1, 1); + return 1; + } + + return 0; +} + +/* + * rt_rq: + * + * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked + * util_sum = cpu_scale * load_sum + * runnable_load_sum = load_sum + * + * load_avg and runnable_load_avg are not supported and meaningless. + * + */ + +int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) +{ + if (___update_load_sum(now, rq->cpu, &rq->avg_rt, + running, + running, + running)) { + + ___update_load_avg(&rq->avg_rt, 1, 1); + return 1; + } + + return 0; +} + +/* + * dl_rq: + * + * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked + * util_sum = cpu_scale * load_sum + * runnable_load_sum = load_sum + * + */ + +int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) +{ + if (___update_load_sum(now, rq->cpu, &rq->avg_dl, + running, + running, + running)) { + + ___update_load_avg(&rq->avg_dl, 1, 1); + return 1; + } + + return 0; +} + +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +/* + * irq: + * + * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked + * util_sum = cpu_scale * load_sum + * runnable_load_sum = load_sum + * + */ + +int update_irq_load_avg(struct rq *rq, u64 running) +{ + int ret = 0; + /* + * We know the time that has been used by interrupt since last update + * but we don't when. Let be pessimistic and assume that interrupt has + * happened just before the update. This is not so far from reality + * because interrupt will most probably wake up task and trig an update + * of rq clock during which the metric si updated. + * We start to decay with normal context time and then we add the + * interrupt context time. + * We can safely remove running from rq->clock because + * rq->clock += delta with delta >= running + */ + ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq, + 0, + 0, + 0); + ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq, + 1, + 1, + 1); + + if (ret) + ___update_load_avg(&rq->avg_irq, 1, 1); + + return ret; +} +#endif diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h new file mode 100644 index 000000000000..d2894db28955 --- /dev/null +++ b/kernel/sched/pelt.h @@ -0,0 +1,72 @@ +#ifdef CONFIG_SMP + +int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se); +int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se); +int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); +int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); +int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); + +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +int update_irq_load_avg(struct rq *rq, u64 running); +#else +static inline int +update_irq_load_avg(struct rq *rq, u64 running) +{ + return 0; +} +#endif + +/* + * When a task is dequeued, its estimated utilization should not be update if + * its util_avg has not been updated at least once. + * This flag is used to synchronize util_avg updates with util_est updates. + * We map this information into the LSB bit of the utilization saved at + * dequeue time (i.e. util_est.dequeued). + */ +#define UTIL_AVG_UNCHANGED 0x1 + +static inline void cfs_se_util_change(struct sched_avg *avg) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Avoid store if the flag has been already set */ + enqueued = avg->util_est.enqueued; + if (!(enqueued & UTIL_AVG_UNCHANGED)) + return; + + /* Reset flag to report util_avg has been updated */ + enqueued &= ~UTIL_AVG_UNCHANGED; + WRITE_ONCE(avg->util_est.enqueued, enqueued); +} + +#else + +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) +{ + return 0; +} + +static inline int +update_rt_rq_load_avg(u64 now, struct rq *rq, int running) +{ + return 0; +} + +static inline int +update_dl_rq_load_avg(u64 now, struct rq *rq, int running) +{ + return 0; +} + +static inline int +update_irq_load_avg(struct rq *rq, u64 running) +{ + return 0; +} +#endif + + diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index eaaec8364f96..2e2955a8cf8f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -5,6 +5,8 @@ */ #include "sched.h" +#include "pelt.h" + int sched_rr_timeslice = RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; @@ -973,8 +975,6 @@ static void update_curr_rt(struct rq *rq) curr->se.exec_start = now; cgroup_account_cputime(curr, delta_exec); - sched_rt_avg_update(rq, delta_exec); - if (!rt_bandwidth_enabled()) return; @@ -1578,6 +1578,14 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) rt_queue_push_tasks(rq); + /* + * If prev task was rt, put_prev_task() has already updated the + * utilization. We only care of the case where we start to schedule a + * rt task + */ + if (rq->curr->sched_class != &rt_sched_class) + update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); + return p; } @@ -1585,6 +1593,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { update_curr_rt(rq); + update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); + /* * The previous task needs to be made eligible for pushing * if it is still active @@ -2314,6 +2324,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); + update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); watchdog(rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c7742dcc136c..4a2e8cae63c4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -594,6 +594,7 @@ struct rt_rq { unsigned long rt_nr_total; int overloaded; struct plist_head pushable_tasks; + #endif /* CONFIG_SMP */ int rt_queued; @@ -673,7 +674,26 @@ struct dl_rq { u64 bw_ratio; }; +#ifdef CONFIG_FAIR_GROUP_SCHED +/* An entity is a task if it doesn't "own" a runqueue */ +#define entity_is_task(se) (!se->my_q) +#else +#define entity_is_task(se) 1 +#endif + #ifdef CONFIG_SMP +/* + * XXX we want to get rid of these helpers and use the full load resolution. + */ +static inline long se_weight(struct sched_entity *se) +{ + return scale_load_down(se->load.weight); +} + +static inline long se_runnable(struct sched_entity *se) +{ + return scale_load_down(se->runnable_weight); +} static inline bool sched_asym_prefer(int a, int b) { @@ -833,8 +853,12 @@ struct rq { struct list_head cfs_tasks; - u64 rt_avg; - u64 age_stamp; + struct sched_avg avg_rt; + struct sched_avg avg_dl; +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#define HAVE_SCHED_AVG_IRQ + struct sched_avg avg_irq; +#endif u64 idle_stamp; u64 avg_idle; @@ -1075,7 +1099,8 @@ enum numa_faults_stats { }; extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); -extern int migrate_swap(struct task_struct *, struct task_struct *); +extern int migrate_swap(struct task_struct *p, struct task_struct *t, + int cpu, int scpu); extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); #else static inline void @@ -1690,15 +1715,9 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); -extern const_debug unsigned int sysctl_sched_time_avg; extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; -static inline u64 sched_avg_period(void) -{ - return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; -} - #ifdef CONFIG_SCHED_HRTICK /* @@ -1735,8 +1754,6 @@ unsigned long arch_scale_freq_capacity(int cpu) #endif #ifdef CONFIG_SMP -extern void sched_avg_update(struct rq *rq); - #ifndef arch_scale_cpu_capacity static __always_inline unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) @@ -1747,12 +1764,6 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) return SCHED_CAPACITY_SCALE; } #endif - -static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) -{ - rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq)); - sched_avg_update(rq); -} #else #ifndef arch_scale_cpu_capacity static __always_inline @@ -1761,8 +1772,6 @@ unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) return SCHED_CAPACITY_SCALE; } #endif -static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } -static inline void sched_avg_update(struct rq *rq) { } #endif struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) @@ -2177,11 +2186,16 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL -static inline unsigned long cpu_util_dl(struct rq *rq) +static inline unsigned long cpu_bw_dl(struct rq *rq) { return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; } +static inline unsigned long cpu_util_dl(struct rq *rq) +{ + return READ_ONCE(rq->avg_dl.util_avg); +} + static inline unsigned long cpu_util_cfs(struct rq *rq) { unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); @@ -2193,4 +2207,37 @@ static inline unsigned long cpu_util_cfs(struct rq *rq) return util; } + +static inline unsigned long cpu_util_rt(struct rq *rq) +{ + return READ_ONCE(rq->avg_rt.util_avg); +} +#endif + +#ifdef HAVE_SCHED_AVG_IRQ +static inline unsigned long cpu_util_irq(struct rq *rq) +{ + return rq->avg_irq.util_avg; +} + +static inline +unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) +{ + util *= (max - irq); + util /= max; + + return util; + +} +#else +static inline unsigned long cpu_util_irq(struct rq *rq) +{ + return 0; +} + +static inline +unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) +{ + return util; +} #endif diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index b6fb2c3b3ff7..66b59ac77c22 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -32,7 +32,7 @@ void swake_up_locked(struct swait_queue_head *q) } EXPORT_SYMBOL(swake_up_locked); -void swake_up(struct swait_queue_head *q) +void swake_up_one(struct swait_queue_head *q) { unsigned long flags; @@ -40,7 +40,7 @@ void swake_up(struct swait_queue_head *q) swake_up_locked(q); raw_spin_unlock_irqrestore(&q->lock, flags); } -EXPORT_SYMBOL(swake_up); +EXPORT_SYMBOL(swake_up_one); /* * Does not allow usage from IRQ disabled, since we must be able to @@ -69,14 +69,14 @@ void swake_up_all(struct swait_queue_head *q) } EXPORT_SYMBOL(swake_up_all); -void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) +static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) { wait->task = current; if (list_empty(&wait->task_list)) - list_add(&wait->task_list, &q->task_list); + list_add_tail(&wait->task_list, &q->task_list); } -void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state) +void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state) { unsigned long flags; @@ -85,16 +85,28 @@ void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int set_current_state(state); raw_spin_unlock_irqrestore(&q->lock, flags); } -EXPORT_SYMBOL(prepare_to_swait); +EXPORT_SYMBOL(prepare_to_swait_exclusive); long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) { - if (signal_pending_state(state, current)) - return -ERESTARTSYS; + unsigned long flags; + long ret = 0; - prepare_to_swait(q, wait, state); + raw_spin_lock_irqsave(&q->lock, flags); + if (unlikely(signal_pending_state(state, current))) { + /* + * See prepare_to_wait_event(). TL;DR, subsequent swake_up_one() + * must not see us. + */ + list_del_init(&wait->task_list); + ret = -ERESTARTSYS; + } else { + __prepare_to_swait(q, wait); + set_current_state(state); + } + raw_spin_unlock_irqrestore(&q->lock, flags); - return 0; + return ret; } EXPORT_SYMBOL(prepare_to_swait_event); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 5043e7433f4b..c230c2dd48e1 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -238,8 +238,7 @@ int smpboot_unpark_threads(unsigned int cpu) mutex_lock(&smpboot_threads_lock); list_for_each_entry(cur, &hotplug_threads, list) - if (cpumask_test_cpu(cpu, cur->cpumask)) - smpboot_unpark_thread(cur, cpu); + smpboot_unpark_thread(cur, cpu); mutex_unlock(&smpboot_threads_lock); return 0; } @@ -280,34 +279,26 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) } /** - * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related + * smpboot_register_percpu_thread - Register a per_cpu thread related * to hotplug * @plug_thread: Hotplug thread descriptor - * @cpumask: The cpumask where threads run * * Creates and starts the threads on all online cpus. */ -int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread, - const struct cpumask *cpumask) +int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) { unsigned int cpu; int ret = 0; - if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) - return -ENOMEM; - cpumask_copy(plug_thread->cpumask, cpumask); - get_online_cpus(); mutex_lock(&smpboot_threads_lock); for_each_online_cpu(cpu) { ret = __smpboot_create_thread(plug_thread, cpu); if (ret) { smpboot_destroy_threads(plug_thread); - free_cpumask_var(plug_thread->cpumask); goto out; } - if (cpumask_test_cpu(cpu, cpumask)) - smpboot_unpark_thread(plug_thread, cpu); + smpboot_unpark_thread(plug_thread, cpu); } list_add(&plug_thread->list, &hotplug_threads); out: @@ -315,7 +306,7 @@ int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_threa put_online_cpus(); return ret; } -EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask); +EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); /** * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug @@ -331,44 +322,9 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) smpboot_destroy_threads(plug_thread); mutex_unlock(&smpboot_threads_lock); put_online_cpus(); - free_cpumask_var(plug_thread->cpumask); } EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); -/** - * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked - * @plug_thread: Hotplug thread descriptor - * @new: Revised mask to use - * - * The cpumask field in the smp_hotplug_thread must not be updated directly - * by the client, but only by calling this function. - * This function can only be called on a registered smp_hotplug_thread. - */ -void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, - const struct cpumask *new) -{ - struct cpumask *old = plug_thread->cpumask; - static struct cpumask tmp; - unsigned int cpu; - - lockdep_assert_cpus_held(); - mutex_lock(&smpboot_threads_lock); - - /* Park threads that were exclusively enabled on the old mask. */ - cpumask_andnot(&tmp, old, new); - for_each_cpu_and(cpu, &tmp, cpu_online_mask) - smpboot_park_thread(plug_thread, cpu); - - /* Unpark threads that are exclusively enabled on the new mask. */ - cpumask_andnot(&tmp, new, old); - for_each_cpu_and(cpu, &tmp, cpu_online_mask) - smpboot_unpark_thread(plug_thread, cpu); - - cpumask_copy(old, new); - - mutex_unlock(&smpboot_threads_lock); -} - static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); /* diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 69eb76daed34..067cb83f37ea 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -238,13 +238,24 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); DEFINE_WAKE_Q(wakeq); int err; + retry: + /* + * The waking up of stopper threads has to happen in the same + * scheduling context as the queueing. Otherwise, there is a + * possibility of one of the above stoppers being woken up by another + * CPU, and preempting us. This will cause us to not wake up the other + * stopper forever. + */ + preempt_disable(); raw_spin_lock_irq(&stopper1->lock); raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); - err = -ENOENT; - if (!stopper1->enabled || !stopper2->enabled) + if (!stopper1->enabled || !stopper2->enabled) { + err = -ENOENT; goto unlock; + } + /* * Ensure that if we race with __stop_cpus() the stoppers won't get * queued up in reverse order leading to system deadlock. @@ -255,36 +266,30 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, * It can be falsely true but it is safe to spin until it is cleared, * queue_stop_cpus_work() does everything under preempt_disable(). */ - err = -EDEADLK; - if (unlikely(stop_cpus_in_progress)) - goto unlock; + if (unlikely(stop_cpus_in_progress)) { + err = -EDEADLK; + goto unlock; + } err = 0; __cpu_stop_queue_work(stopper1, work1, &wakeq); __cpu_stop_queue_work(stopper2, work2, &wakeq); - /* - * The waking up of stopper threads has to happen - * in the same scheduling context as the queueing. - * Otherwise, there is a possibility of one of the - * above stoppers being woken up by another CPU, - * and preempting us. This will cause us to n ot - * wake up the other stopper forever. - */ - preempt_disable(); + unlock: raw_spin_unlock(&stopper2->lock); raw_spin_unlock_irq(&stopper1->lock); if (unlikely(err == -EDEADLK)) { + preempt_enable(); + while (stop_cpus_in_progress) cpu_relax(); + goto retry; } - if (!err) { - wake_up_q(&wakeq); - preempt_enable(); - } + wake_up_q(&wakeq); + preempt_enable(); return err; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2d9837c0aff4..f22f76b7a138 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -368,14 +368,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "sched_time_avg_ms", - .data = &sysctl_sched_time_avg, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - }, #ifdef CONFIG_SCHEDSTATS { .procname = "sched_schedstats", diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 576d18045811..5470dce212c0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -18,18 +18,14 @@ #include #include #include -#include -#include -#include #include -#include #include #include #include +#include #include #include -#include static DEFINE_MUTEX(watchdog_mutex); @@ -169,11 +165,10 @@ static void lockup_detector_update_enable(void) unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; -static bool softlockup_threads_initialized __read_mostly; +static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); -static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); @@ -335,6 +330,27 @@ static void watchdog_interrupt_count(void) __this_cpu_inc(hrtimer_interrupts); } +static DEFINE_PER_CPU(struct completion, softlockup_completion); +static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); + +/* + * The watchdog thread function - touches the timestamp. + * + * It only runs once every sample_period seconds (4 seconds by + * default) to reset the softlockup timestamp. If this gets delayed + * for more than 2*watchdog_thresh seconds then the debug-printout + * triggers in watchdog_timer_fn(). + */ +static int softlockup_fn(void *data) +{ + __this_cpu_write(soft_lockup_hrtimer_cnt, + __this_cpu_read(hrtimer_interrupts)); + __touch_watchdog(); + complete(this_cpu_ptr(&softlockup_completion)); + + return 0; +} + /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { @@ -350,7 +366,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) watchdog_interrupt_count(); /* kick the softlockup detector */ - wake_up_process(__this_cpu_read(softlockup_watchdog)); + if (completion_done(this_cpu_ptr(&softlockup_completion))) { + reinit_completion(this_cpu_ptr(&softlockup_completion)); + stop_one_cpu_nowait(smp_processor_id(), + softlockup_fn, NULL, + this_cpu_ptr(&softlockup_stop_work)); + } /* .. and repeat */ hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); @@ -448,16 +469,15 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; } -static void watchdog_set_prio(unsigned int policy, unsigned int prio) -{ - struct sched_param param = { .sched_priority = prio }; - - sched_setscheduler(current, policy, ¶m); -} - static void watchdog_enable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); + struct completion *done = this_cpu_ptr(&softlockup_completion); + + WARN_ON_ONCE(cpu != smp_processor_id()); + + init_completion(done); + complete(done); /* * Start the timer first to prevent the NMI watchdog triggering @@ -473,15 +493,14 @@ static void watchdog_enable(unsigned int cpu) /* Enable the perf event */ if (watchdog_enabled & NMI_WATCHDOG_ENABLED) watchdog_nmi_enable(cpu); - - watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); } static void watchdog_disable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); - watchdog_set_prio(SCHED_NORMAL, 0); + WARN_ON_ONCE(cpu != smp_processor_id()); + /* * Disable the perf event first. That prevents that a large delay * between disabling the timer and disabling the perf event causes @@ -489,79 +508,66 @@ static void watchdog_disable(unsigned int cpu) */ watchdog_nmi_disable(cpu); hrtimer_cancel(hrtimer); + wait_for_completion(this_cpu_ptr(&softlockup_completion)); } -static void watchdog_cleanup(unsigned int cpu, bool online) +static int softlockup_stop_fn(void *data) { - watchdog_disable(cpu); + watchdog_disable(smp_processor_id()); + return 0; } -static int watchdog_should_run(unsigned int cpu) +static void softlockup_stop_all(void) { - return __this_cpu_read(hrtimer_interrupts) != - __this_cpu_read(soft_lockup_hrtimer_cnt); -} + int cpu; -/* - * The watchdog thread function - touches the timestamp. - * - * It only runs once every sample_period seconds (4 seconds by - * default) to reset the softlockup timestamp. If this gets delayed - * for more than 2*watchdog_thresh seconds then the debug-printout - * triggers in watchdog_timer_fn(). - */ -static void watchdog(unsigned int cpu) -{ - __this_cpu_write(soft_lockup_hrtimer_cnt, - __this_cpu_read(hrtimer_interrupts)); - __touch_watchdog(); -} - -static struct smp_hotplug_thread watchdog_threads = { - .store = &softlockup_watchdog, - .thread_should_run = watchdog_should_run, - .thread_fn = watchdog, - .thread_comm = "watchdog/%u", - .setup = watchdog_enable, - .cleanup = watchdog_cleanup, - .park = watchdog_disable, - .unpark = watchdog_enable, -}; - -static void softlockup_update_smpboot_threads(void) -{ - lockdep_assert_held(&watchdog_mutex); - - if (!softlockup_threads_initialized) + if (!softlockup_initialized) return; - smpboot_update_cpumask_percpu_thread(&watchdog_threads, - &watchdog_allowed_mask); -} + for_each_cpu(cpu, &watchdog_allowed_mask) + smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false); -/* Temporarily park all watchdog threads */ -static void softlockup_park_all_threads(void) -{ cpumask_clear(&watchdog_allowed_mask); - softlockup_update_smpboot_threads(); } -/* Unpark enabled threads */ -static void softlockup_unpark_threads(void) +static int softlockup_start_fn(void *data) { + watchdog_enable(smp_processor_id()); + return 0; +} + +static void softlockup_start_all(void) +{ + int cpu; + cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); - softlockup_update_smpboot_threads(); + for_each_cpu(cpu, &watchdog_allowed_mask) + smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false); +} + +int lockup_detector_online_cpu(unsigned int cpu) +{ + watchdog_enable(cpu); + return 0; +} + +int lockup_detector_offline_cpu(unsigned int cpu) +{ + watchdog_disable(cpu); + return 0; } static void lockup_detector_reconfigure(void) { cpus_read_lock(); watchdog_nmi_stop(); - softlockup_park_all_threads(); + + softlockup_stop_all(); set_sample_period(); lockup_detector_update_enable(); if (watchdog_enabled && watchdog_thresh) - softlockup_unpark_threads(); + softlockup_start_all(); + watchdog_nmi_start(); cpus_read_unlock(); /* @@ -580,8 +586,6 @@ static void lockup_detector_reconfigure(void) */ static __init void lockup_detector_setup(void) { - int ret; - /* * If sysctl is off and watchdog got disabled on the command line, * nothing to do here. @@ -592,24 +596,13 @@ static __init void lockup_detector_setup(void) !(watchdog_enabled && watchdog_thresh)) return; - ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads, - &watchdog_allowed_mask); - if (ret) { - pr_err("Failed to initialize soft lockup detector threads\n"); - return; - } - mutex_lock(&watchdog_mutex); - softlockup_threads_initialized = true; lockup_detector_reconfigure(); + softlockup_initialized = true; mutex_unlock(&watchdog_mutex); } #else /* CONFIG_SOFTLOCKUP_DETECTOR */ -static inline int watchdog_park_threads(void) { return 0; } -static inline void watchdog_unpark_threads(void) { } -static inline int watchdog_enable_all_cpus(void) { return 0; } -static inline void watchdog_disable_all_cpus(void) { } static void lockup_detector_reconfigure(void) { cpus_read_lock(); diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index e449a23e9d59..1f7020d65d0a 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -175,8 +175,8 @@ static int hardlockup_detector_event_create(void) evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); if (IS_ERR(evt)) { - pr_info("Perf event create on CPU %d failed with %ld\n", cpu, - PTR_ERR(evt)); + pr_debug("Perf event create on CPU %d failed with %ld\n", cpu, + PTR_ERR(evt)); return PTR_ERR(evt); } this_cpu_write(watchdog_ev, evt); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 04e554cae3a2..108250e4d376 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -604,7 +604,7 @@ void kvm_arm_resume_guest(struct kvm *kvm) kvm_for_each_vcpu(i, vcpu, kvm) { vcpu->arch.pause = false; - swake_up(kvm_arch_vcpu_wq(vcpu)); + swake_up_one(kvm_arch_vcpu_wq(vcpu)); } } @@ -612,7 +612,7 @@ static void vcpu_req_sleep(struct kvm_vcpu *vcpu) { struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); - swait_event_interruptible(*wq, ((!vcpu->arch.power_off) && + swait_event_interruptible_exclusive(*wq, ((!vcpu->arch.power_off) && (!vcpu->arch.pause))); if (vcpu->arch.power_off || vcpu->arch.pause) { diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c index c95ab4c5a475..9b73d3ad918a 100644 --- a/virt/kvm/arm/psci.c +++ b/virt/kvm/arm/psci.c @@ -155,7 +155,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) smp_mb(); /* Make sure the above is visible */ wq = kvm_arch_vcpu_wq(vcpu); - swake_up(wq); + swake_up_one(wq); return PSCI_RET_SUCCESS; } diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 57bcb27dcf30..23c2519c5b32 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -107,7 +107,7 @@ static void async_pf_execute(struct work_struct *work) trace_kvm_async_pf_completed(addr, gva); if (swq_has_sleeper(&vcpu->wq)) - swake_up(&vcpu->wq); + swake_up_one(&vcpu->wq); mmput(mm); kvm_put_kvm(vcpu->kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8b47507faab5..3d233ebfbee9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2172,7 +2172,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) kvm_arch_vcpu_blocking(vcpu); for (;;) { - prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); + prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); if (kvm_vcpu_check_block(vcpu) < 0) break; @@ -2214,7 +2214,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) wqp = kvm_arch_vcpu_wq(vcpu); if (swq_has_sleeper(wqp)) { - swake_up(wqp); + swake_up_one(wqp); ++vcpu->stat.halt_wakeup; return true; }