Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - membarrier updates (Mathieu Desnoyers)

 - SMP balancing optimizations (Mel Gorman)

 - stats update optimizations (Peter Zijlstra)

 - RT scheduler race fixes (Steven Rostedt)

 - misc fixes and updates

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Use a recently used CPU as an idle candidate and the basis for SIS
  sched/fair: Do not migrate if the prev_cpu is idle
  sched/fair: Restructure wake_affine*() to return a CPU id
  sched/fair: Remove unnecessary parameters from wake_affine_idle()
  sched/rt: Make update_curr_rt() more accurate
  sched/rt: Up the root domain ref count when passing it around via IPIs
  sched/rt: Use container_of() to get root domain in rto_push_irq_work_func()
  sched/core: Optimize update_stats_*()
  sched/core: Optimize ttwu_stat()
  membarrier/selftest: Test private expedited sync core command
  membarrier/arm64: Provide core serializing command
  membarrier/x86: Provide core serializing command
  membarrier: Provide core serializing command, *_SYNC_CORE
  lockin/x86: Implement sync_core_before_usermode()
  locking: Introduce sync_core_before_usermode()
  membarrier/selftest: Test global expedited command
  membarrier: Provide GLOBAL_EXPEDITED command
  membarrier: Document scheduler barrier requirements
  powerpc, membarrier: Skip memory barrier in switch_mm()
  membarrier/selftest: Test private expedited command
Commit ab2d92ad88 by Linus Torvalds, 2018-02-06 19:57:31 -08:00
25 changed files with 750 additions and 127 deletions


@@ -9025,6 +9025,7 @@ L:	linux-kernel@vger.kernel.org
 S:	Supported
 F:	kernel/sched/membarrier.c
 F:	include/uapi/linux/membarrier.h
+F:	arch/powerpc/include/asm/membarrier.h
 
 MEMORY MANAGEMENT
 L:	linux-mm@kvack.org


@@ -16,6 +16,7 @@ config ARM64
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
 	select ARCH_HAS_KCOV
+	select ARCH_HAS_MEMBARRIER_SYNC_CORE
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAS_STRICT_KERNEL_RWX


@@ -324,6 +324,10 @@ alternative_else_nop_endif
 	ldp	x28, x29, [sp, #16 * 14]
 	ldr	lr, [sp, #S_LR]
 	add	sp, sp, #S_FRAME_SIZE		// restore sp
+	/*
+	 * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on eret context synchronization
+	 * when returning from IPI handler, and when returning to user-space.
+	 */
 	.if	\el == 0
 alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0


@@ -141,6 +141,7 @@ config PPC
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_PHYS_TO_DMA
 	select ARCH_HAS_PMEM_API		if PPC64
+	select ARCH_HAS_MEMBARRIER_CALLBACKS
 	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAS_STRICT_KERNEL_RWX	if ((PPC_BOOK3S_64 || PPC32) && !RELOCATABLE && !HIBERNATION)


@@ -0,0 +1,27 @@
+#ifndef _ASM_POWERPC_MEMBARRIER_H
+#define _ASM_POWERPC_MEMBARRIER_H
+
+static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
+					     struct mm_struct *next,
+					     struct task_struct *tsk)
+{
+	/*
+	 * Only need the full barrier when switching between processes.
+	 * Barrier when switching from kernel to userspace is not
+	 * required here, given that it is implied by mmdrop(). Barrier
+	 * when switching from userspace to kernel is not needed after
+	 * store to rq->curr.
+	 */
+	if (likely(!(atomic_read(&next->membarrier_state) &
+		     (MEMBARRIER_STATE_PRIVATE_EXPEDITED |
+		      MEMBARRIER_STATE_GLOBAL_EXPEDITED)) || !prev))
+		return;
+
+	/*
+	 * The membarrier system call requires a full memory barrier
+	 * after storing to rq->curr, before going back to user-space.
+	 */
+	smp_mb();
+}
+
+#endif /* _ASM_POWERPC_MEMBARRIER_H */


@@ -12,6 +12,7 @@
 #include <linux/mm.h>
 #include <linux/cpu.h>
+#include <linux/sched/mm.h>
 #include <asm/mmu_context.h>
@@ -58,6 +59,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		 *
 		 * On the read side the barrier is in pte_xchg(), which orders
 		 * the store to the PTE vs the load of mm_cpumask.
+		 *
+		 * This full barrier is needed by membarrier when switching
+		 * between processes after store to rq->curr, before user-space
+		 * memory accesses.
 		 */
 		smp_mb();
@@ -80,6 +85,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	if (new_on_cpu)
 		radix_kvm_prefetch_workaround(next);
+	else
+		membarrier_arch_switch_mm(prev, next, tsk);
 	/*
 	 * The actual HW switching method differs between the various


@@ -55,6 +55,7 @@ config X86
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_KCOV			if X86_64
 	select ARCH_HAS_PHYS_TO_DMA
+	select ARCH_HAS_MEMBARRIER_SYNC_CORE
 	select ARCH_HAS_PMEM_API		if X86_64
 	select ARCH_HAS_REFCOUNT
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
@@ -62,6 +63,7 @@ config X86
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAS_STRICT_KERNEL_RWX
 	select ARCH_HAS_STRICT_MODULE_RWX
+	select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
 	select ARCH_HAS_ZONE_DEVICE		if X86_64
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG


@@ -566,6 +566,11 @@ restore_all:
 .Lrestore_nocheck:
 	RESTORE_REGS 4				# skip orig_eax/error_code
 .Lirq_return:
+	/*
+	 * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
+	 * when returning from IPI handler and when returning from
+	 * scheduler to user-space.
+	 */
 	INTERRUPT_RETURN
 
 .section .fixup, "ax"


@@ -691,6 +691,10 @@ GLOBAL(restore_regs_and_return_to_kernel)
 	POP_EXTRA_REGS
 	POP_C_REGS
 	addq	$8, %rsp	/* skip regs->orig_ax */
+	/*
+	 * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
+	 * when returning from IPI handler.
+	 */
 	INTERRUPT_RETURN
 
 ENTRY(native_iret)


@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SYNC_CORE_H
+#define _ASM_X86_SYNC_CORE_H
+
+#include <linux/preempt.h>
+#include <asm/processor.h>
+#include <asm/cpufeature.h>
+
+/*
+ * Ensure that a core serializing instruction is issued before returning
+ * to user-mode. x86 implements return to user-space through sysexit,
+ * sysrel, and sysretq, which are not core serializing.
+ */
+static inline void sync_core_before_usermode(void)
+{
+	/* With PTI, we unconditionally serialize before running user code. */
+	if (static_cpu_has(X86_FEATURE_PTI))
+		return;
+	/*
+	 * Return from interrupt and NMI is done through iret, which is core
+	 * serializing.
+	 */
+	if (in_irq() || in_nmi())
+		return;
+	sync_core();
+}
+
+#endif	/* _ASM_X86_SYNC_CORE_H */


@@ -229,6 +229,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 #endif
 	this_cpu_write(cpu_tlbstate.is_lazy, false);
 
+	/*
+	 * The membarrier system call requires a full memory barrier and
+	 * core serialization before returning to user-space, after
+	 * storing to rq->curr. Writing to CR3 provides that full
+	 * memory barrier and core serializing instruction.
+	 */
 	if (real_prev == next) {
 		VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
 			   next->context.ctx_id);


@@ -555,6 +555,14 @@ struct task_struct {
 	unsigned long			wakee_flip_decay_ts;
 	struct task_struct		*last_wakee;
 
+	/*
+	 * recent_used_cpu is initially set as the last CPU used by a task
+	 * that wakes affine another task. Waker/wakee relationships can
+	 * push tasks around a CPU where each wakeup moves to the next one.
+	 * Tracking a recently used CPU allows a quick search for a recently
+	 * used CPU that may be idle.
+	 */
+	int				recent_used_cpu;
 	int				wake_cpu;
 #endif
 	int				on_rq;


@@ -7,6 +7,7 @@
 #include <linux/sched.h>
 #include <linux/mm_types.h>
 #include <linux/gfp.h>
+#include <linux/sync_core.h>
 
 /*
  * Routines for handling mm_structs
@@ -194,18 +195,48 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)
 #ifdef CONFIG_MEMBARRIER
 enum {
 	MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY		= (1U << 0),
-	MEMBARRIER_STATE_SWITCH_MM				= (1U << 1),
+	MEMBARRIER_STATE_PRIVATE_EXPEDITED			= (1U << 1),
+	MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY			= (1U << 2),
+	MEMBARRIER_STATE_GLOBAL_EXPEDITED			= (1U << 3),
+	MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY	= (1U << 4),
+	MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE		= (1U << 5),
 };
 
+enum {
+	MEMBARRIER_FLAG_SYNC_CORE	= (1U << 0),
+};
+
+#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
+#include <asm/membarrier.h>
+#endif
+
+static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
+{
+	if (likely(!(atomic_read(&mm->membarrier_state) &
+		     MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE)))
+		return;
+	sync_core_before_usermode();
+}
+
 static inline void membarrier_execve(struct task_struct *t)
 {
 	atomic_set(&t->mm->membarrier_state, 0);
 }
 #else
+#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
+static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
+					     struct mm_struct *next,
+					     struct task_struct *tsk)
+{
+}
+#endif
 static inline void membarrier_execve(struct task_struct *t)
 {
 }
+static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
+{
+}
 #endif
 
 #endif /* _LINUX_SCHED_MM_H */

include/linux/sync_core.h (new file, 21 additions)

@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SYNC_CORE_H
+#define _LINUX_SYNC_CORE_H
+
+#ifdef CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
+#include <asm/sync_core.h>
+#else
+/*
+ * This is a dummy sync_core_before_usermode() implementation that can be used
+ * on all architectures which return to user-space through core serializing
+ * instructions.
+ * If your architecture returns to user-space through non-core-serializing
+ * instructions, you need to write your own functions.
+ */
+static inline void sync_core_before_usermode(void)
+{
+}
+#endif
+
+#endif /* _LINUX_SYNC_CORE_H */


@@ -31,7 +31,7 @@
 * enum membarrier_cmd - membarrier system call command
 * @MEMBARRIER_CMD_QUERY:   Query the set of supported commands. It returns
 *                          a bitmask of valid commands.
- * @MEMBARRIER_CMD_SHARED:  Execute a memory barrier on all running threads.
+ * @MEMBARRIER_CMD_GLOBAL:  Execute a memory barrier on all running threads.
 *                          Upon return from system call, the caller thread
 *                          is ensured that all running threads have passed
 *                          through a state where all memory accesses to
@@ -40,6 +40,28 @@
 *                          (non-running threads are de facto in such a
 *                          state). This covers threads from all processes
 *                          running on the system. This command returns 0.
+ * @MEMBARRIER_CMD_GLOBAL_EXPEDITED:
+ *                          Execute a memory barrier on all running threads
+ *                          of all processes which previously registered
+ *                          with MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED.
+ *                          Upon return from system call, the caller thread
+ *                          is ensured that all running threads have passed
+ *                          through a state where all memory accesses to
+ *                          user-space addresses match program order between
+ *                          entry to and return from the system call
+ *                          (non-running threads are de facto in such a
+ *                          state). This only covers threads from processes
+ *                          which registered with
+ *                          MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED.
+ *                          This command returns 0. Given that
+ *                          registration is about the intent to receive
+ *                          the barriers, it is valid to invoke
+ *                          MEMBARRIER_CMD_GLOBAL_EXPEDITED from a
+ *                          non-registered process.
+ * @MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
+ *                          Register the process intent to receive
+ *                          MEMBARRIER_CMD_GLOBAL_EXPEDITED memory
+ *                          barriers. Always returns 0.
 * @MEMBARRIER_CMD_PRIVATE_EXPEDITED:
 *                          Execute a memory barrier on each running
 *                          thread belonging to the same process as the current
@@ -51,7 +73,7 @@
 *                          to and return from the system call
 *                          (non-running threads are de facto in such a
 *                          state). This only covers threads from the
- *                          same processes as the caller thread. This
+ *                          same process as the caller thread. This
 *                          command returns 0 on success. The
 *                          "expedited" commands complete faster than
 *                          the non-expedited ones, they never block,
@@ -64,18 +86,54 @@
 *                          Register the process intent to use
 *                          MEMBARRIER_CMD_PRIVATE_EXPEDITED. Always
 *                          returns 0.
+ * @MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
+ *                          In addition to provide memory ordering
+ *                          guarantees described in
+ *                          MEMBARRIER_CMD_PRIVATE_EXPEDITED, ensure
+ *                          the caller thread, upon return from system
+ *                          call, that all its running threads siblings
+ *                          have executed a core serializing
+ *                          instruction. (architectures are required to
+ *                          guarantee that non-running threads issue
+ *                          core serializing instructions before they
+ *                          resume user-space execution). This only
+ *                          covers threads from the same process as the
+ *                          caller thread. This command returns 0 on
+ *                          success. The "expedited" commands complete
+ *                          faster than the non-expedited ones, they
+ *                          never block, but have the downside of
+ *                          causing extra overhead. If this command is
+ *                          not implemented by an architecture, -EINVAL
+ *                          is returned. A process needs to register its
+ *                          intent to use the private expedited sync
+ *                          core command prior to using it, otherwise
+ *                          this command returns -EPERM.
+ * @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
+ *                          Register the process intent to use
+ *                          MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE.
+ *                          If this command is not implemented by an
+ *                          architecture, -EINVAL is returned.
+ *                          Returns 0 on success.
+ * @MEMBARRIER_CMD_SHARED:
+ *                          Alias to MEMBARRIER_CMD_GLOBAL. Provided for
+ *                          header backward compatibility.
 *
 * Command to be passed to the membarrier system call. The commands need to
 * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
 * the value 0.
 */
 enum membarrier_cmd {
 	MEMBARRIER_CMD_QUERY					= 0,
-	MEMBARRIER_CMD_SHARED					= (1 << 0),
-	/* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */
-	/* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */
+	MEMBARRIER_CMD_GLOBAL					= (1 << 0),
+	MEMBARRIER_CMD_GLOBAL_EXPEDITED				= (1 << 1),
+	MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED		= (1 << 2),
 	MEMBARRIER_CMD_PRIVATE_EXPEDITED			= (1 << 3),
 	MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED		= (1 << 4),
+	MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE		= (1 << 5),
+	MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE	= (1 << 6),
+	/* Alias for header backward compatibility. */
+	MEMBARRIER_CMD_SHARED			= MEMBARRIER_CMD_GLOBAL,
 };
 
 #endif /* _UAPI_LINUX_MEMBARRIER_H */
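
The command documentation above maps onto a short userspace call sequence. The following is a minimal, illustrative sketch only (it is not part of this series); it assumes the <linux/membarrier.h> header from these patches and a libc without a membarrier() wrapper, so the raw syscall number is used:

/* Illustrative sketch -- not part of the series. */
#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

static int membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	/* Registration must precede PRIVATE_EXPEDITED, or it fails with EPERM. */
	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0))
		perror("MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED");

	/* Expedited barrier across all running threads of this process. */
	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0))
		perror("MEMBARRIER_CMD_PRIVATE_EXPEDITED");

	return 0;
}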


@@ -1412,6 +1412,12 @@ config USERFAULTFD
 	  Enable the userfaultfd() system call that allows to intercept and
 	  handle page faults in userland.
 
+config ARCH_HAS_MEMBARRIER_CALLBACKS
+	bool
+
+config ARCH_HAS_MEMBARRIER_SYNC_CORE
+	bool
+
 config EMBEDDED
 	bool "Embedded system"
 	option allnoconfig_y
@@ -1915,3 +1921,6 @@ config ASN1
 	  functions to call on what tags.
 
 source "kernel/Kconfig.locks"
+
+config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
+	bool


@@ -606,6 +606,11 @@ static void __mmdrop(struct mm_struct *mm)
 void mmdrop(struct mm_struct *mm)
 {
+	/*
+	 * The implicit full barrier implied by atomic_dec_and_test() is
+	 * required by the membarrier system call before returning to
+	 * user-space, after storing to rq->curr.
+	 */
 	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
 		__mmdrop(mm);
 }


@@ -1630,16 +1630,16 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 #ifdef CONFIG_SMP
 	if (cpu == rq->cpu) {
-		schedstat_inc(rq->ttwu_local);
-		schedstat_inc(p->se.statistics.nr_wakeups_local);
+		__schedstat_inc(rq->ttwu_local);
+		__schedstat_inc(p->se.statistics.nr_wakeups_local);
 	} else {
 		struct sched_domain *sd;
 
-		schedstat_inc(p->se.statistics.nr_wakeups_remote);
+		__schedstat_inc(p->se.statistics.nr_wakeups_remote);
 		rcu_read_lock();
 		for_each_domain(rq->cpu, sd) {
 			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-				schedstat_inc(sd->ttwu_wake_remote);
+				__schedstat_inc(sd->ttwu_wake_remote);
 				break;
 			}
 		}
@@ -1647,14 +1647,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 	}
 
 	if (wake_flags & WF_MIGRATED)
-		schedstat_inc(p->se.statistics.nr_wakeups_migrate);
+		__schedstat_inc(p->se.statistics.nr_wakeups_migrate);
 #endif /* CONFIG_SMP */
 
-	schedstat_inc(rq->ttwu_count);
-	schedstat_inc(p->se.statistics.nr_wakeups);
+	__schedstat_inc(rq->ttwu_count);
+	__schedstat_inc(p->se.statistics.nr_wakeups);
 
 	if (wake_flags & WF_SYNC)
-		schedstat_inc(p->se.statistics.nr_wakeups_sync);
+		__schedstat_inc(p->se.statistics.nr_wakeups_sync);
 }
 
 static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
@@ -2461,6 +2461,7 @@ void wake_up_new_task(struct task_struct *p)
 	 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
 	 * as we're not fully set-up yet.
 	 */
+	p->recent_used_cpu = task_cpu(p);
 	__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
 	rq = __task_rq_lock(p, &rf);
@@ -2698,23 +2699,27 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	prev_state = prev->state;
 	vtime_task_switch(prev);
 	perf_event_task_sched_in(prev, current);
-	/*
-	 * The membarrier system call requires a full memory barrier
-	 * after storing to rq->curr, before going back to user-space.
-	 *
-	 * TODO: This smp_mb__after_unlock_lock can go away if PPC end
-	 * up adding a full barrier to switch_mm(), or we should figure
-	 * out if a smp_mb__after_unlock_lock is really the proper API
-	 * to use.
-	 */
-	smp_mb__after_unlock_lock();
 	finish_task(prev);
 	finish_lock_switch(rq);
 	finish_arch_post_lock_switch();
 
 	fire_sched_in_preempt_notifiers(current);
-	if (mm)
+	/*
+	 * When switching through a kernel thread, the loop in
+	 * membarrier_{private,global}_expedited() may have observed that
+	 * kernel thread and not issued an IPI. It is therefore possible to
+	 * schedule between user->kernel->user threads without passing though
+	 * switch_mm(). Membarrier requires a barrier after storing to
+	 * rq->curr, before returning to userspace, so provide them here:
+	 *
+	 * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
+	 *   provided by mmdrop(),
+	 * - a sync_core for SYNC_CORE.
+	 */
+	if (mm) {
+		membarrier_mm_sync_core_before_usermode(mm);
 		mmdrop(mm);
+	}
 	if (unlikely(prev_state == TASK_DEAD)) {
 		if (prev->sched_class->task_dead)
 			prev->sched_class->task_dead(prev);
@@ -2818,6 +2823,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 */
 	arch_start_context_switch(prev);
 
+	/*
+	 * If mm is non-NULL, we pass through switch_mm(). If mm is
+	 * NULL, we will pass through mmdrop() in finish_task_switch().
+	 * Both of these contain the full memory barrier required by
+	 * membarrier after storing to rq->curr, before returning to
+	 * user-space.
+	 */
 	if (!mm) {
 		next->active_mm = oldmm;
 		mmgrab(oldmm);
@@ -3354,6 +3366,9 @@ static void __sched notrace __schedule(bool preempt)
 	 * Make sure that signal_pending_state()->signal_pending() below
 	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
 	 * done by the caller to avoid the race with signal_wake_up().
+	 *
+	 * The membarrier system call requires a full memory barrier
+	 * after coming from user-space, before storing to rq->curr.
 	 */
 	rq_lock(rq, &rf);
 	smp_mb__after_spinlock();
@@ -3401,17 +3416,16 @@ static void __sched notrace __schedule(bool preempt)
 		/*
 		 * The membarrier system call requires each architecture
 		 * to have a full memory barrier after updating
-		 * rq->curr, before returning to user-space. For TSO
-		 * (e.g. x86), the architecture must provide its own
-		 * barrier in switch_mm(). For weakly ordered machines
-		 * for which spin_unlock() acts as a full memory
-		 * barrier, finish_lock_switch() in common code takes
-		 * care of this barrier. For weakly ordered machines for
-		 * which spin_unlock() acts as a RELEASE barrier (only
-		 * arm64 and PowerPC), arm64 has a full barrier in
-		 * switch_to(), and PowerPC has
-		 * smp_mb__after_unlock_lock() before
-		 * finish_lock_switch().
+		 * rq->curr, before returning to user-space.
+		 *
+		 * Here are the schemes providing that barrier on the
+		 * various architectures:
+		 * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
+		 *   switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
+		 * - finish_lock_switch() for weakly-ordered
+		 *   architectures where spin_unlock is a full barrier,
+		 * - switch_to() for arm64 (weakly-ordered, spin_unlock
+		 *   is a RELEASE barrier),
 		 */
 		++*switch_count;


@@ -871,7 +871,7 @@ update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	    likely(wait_start > prev_wait_start))
 		wait_start -= prev_wait_start;
 
-	schedstat_set(se->statistics.wait_start, wait_start);
+	__schedstat_set(se->statistics.wait_start, wait_start);
 }
 
 static inline void
@@ -893,17 +893,17 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			 * time stamp can be adjusted to accumulate wait time
 			 * prior to migration.
 			 */
-			schedstat_set(se->statistics.wait_start, delta);
+			__schedstat_set(se->statistics.wait_start, delta);
 			return;
 		}
 		trace_sched_stat_wait(p, delta);
 	}
 
-	schedstat_set(se->statistics.wait_max,
-		      max(schedstat_val(se->statistics.wait_max), delta));
-	schedstat_inc(se->statistics.wait_count);
-	schedstat_add(se->statistics.wait_sum, delta);
-	schedstat_set(se->statistics.wait_start, 0);
+	__schedstat_set(se->statistics.wait_max,
+			max(schedstat_val(se->statistics.wait_max), delta));
+	__schedstat_inc(se->statistics.wait_count);
+	__schedstat_add(se->statistics.wait_sum, delta);
+	__schedstat_set(se->statistics.wait_start, 0);
 }
 
 static inline void
@@ -928,10 +928,10 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			delta = 0;
 
 		if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
-			schedstat_set(se->statistics.sleep_max, delta);
+			__schedstat_set(se->statistics.sleep_max, delta);
 
-		schedstat_set(se->statistics.sleep_start, 0);
-		schedstat_add(se->statistics.sum_sleep_runtime, delta);
+		__schedstat_set(se->statistics.sleep_start, 0);
+		__schedstat_add(se->statistics.sum_sleep_runtime, delta);
 
 		if (tsk) {
 			account_scheduler_latency(tsk, delta >> 10, 1);
@@ -945,15 +945,15 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			delta = 0;
 
 		if (unlikely(delta > schedstat_val(se->statistics.block_max)))
-			schedstat_set(se->statistics.block_max, delta);
+			__schedstat_set(se->statistics.block_max, delta);
 
-		schedstat_set(se->statistics.block_start, 0);
-		schedstat_add(se->statistics.sum_sleep_runtime, delta);
+		__schedstat_set(se->statistics.block_start, 0);
+		__schedstat_add(se->statistics.sum_sleep_runtime, delta);
 
 		if (tsk) {
 			if (tsk->in_iowait) {
-				schedstat_add(se->statistics.iowait_sum, delta);
-				schedstat_inc(se->statistics.iowait_count);
+				__schedstat_add(se->statistics.iowait_sum, delta);
+				__schedstat_inc(se->statistics.iowait_count);
 				trace_sched_stat_iowait(tsk, delta);
 			}
@@ -1012,10 +1012,10 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		struct task_struct *tsk = task_of(se);
 
 		if (tsk->state & TASK_INTERRUPTIBLE)
-			schedstat_set(se->statistics.sleep_start,
-				      rq_clock(rq_of(cfs_rq)));
+			__schedstat_set(se->statistics.sleep_start,
+					rq_clock(rq_of(cfs_rq)));
 		if (tsk->state & TASK_UNINTERRUPTIBLE)
-			schedstat_set(se->statistics.block_start,
-				      rq_clock(rq_of(cfs_rq)));
+			__schedstat_set(se->statistics.block_start,
+					rq_clock(rq_of(cfs_rq)));
 	}
 }
@@ -5692,27 +5692,31 @@ static int wake_wide(struct task_struct *p)
 * scheduling latency of the CPUs. This seems to work
 * for the overloaded case.
 */
-static bool
-wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
-		 int this_cpu, int prev_cpu, int sync)
+static int
+wake_affine_idle(int this_cpu, int prev_cpu, int sync)
 {
 	/*
 	 * If this_cpu is idle, it implies the wakeup is from interrupt
 	 * context. Only allow the move if cache is shared. Otherwise an
 	 * interrupt intensive workload could force all tasks onto one
 	 * node depending on the IO topology or IRQ affinity settings.
+	 *
+	 * If the prev_cpu is idle and cache affine then avoid a migration.
+	 * There is no guarantee that the cache hot data from an interrupt
+	 * is more important than cache hot data on the prev_cpu and from
+	 * a cpufreq perspective, it's better to have higher utilisation
+	 * on one CPU.
	 */
 	if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
-		return true;
+		return idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
 
 	if (sync && cpu_rq(this_cpu)->nr_running == 1)
-		return true;
+		return this_cpu;
 
-	return false;
+	return nr_cpumask_bits;
 }
 
-static bool
+static int
 wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
 		   int this_cpu, int prev_cpu, int sync)
 {
@@ -5726,7 +5730,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
 		unsigned long current_load = task_h_load(current);
 
 		if (current_load > this_eff_load)
-			return true;
+			return this_cpu;
 
 		this_eff_load -= current_load;
 	}
@@ -5743,28 +5747,28 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
 	prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
 	prev_eff_load *= capacity_of(this_cpu);
 
-	return this_eff_load <= prev_eff_load;
+	return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits;
 }
 
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int prev_cpu, int sync)
 {
 	int this_cpu = smp_processor_id();
-	bool affine = false;
+	int target = nr_cpumask_bits;
 
-	if (sched_feat(WA_IDLE) && !affine)
-		affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
-	if (sched_feat(WA_WEIGHT) && !affine)
-		affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
+	if (sched_feat(WA_IDLE))
+		target = wake_affine_idle(this_cpu, prev_cpu, sync);
+	if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
+		target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
 
 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
-	if (affine) {
-		schedstat_inc(sd->ttwu_move_affine);
-		schedstat_inc(p->se.statistics.nr_wakeups_affine);
-	}
+	if (target == nr_cpumask_bits)
+		return prev_cpu;
 
-	return affine;
+	schedstat_inc(sd->ttwu_move_affine);
+	schedstat_inc(p->se.statistics.nr_wakeups_affine);
+	return target;
 }
 
 static inline unsigned long task_util(struct task_struct *p);
@@ -6193,7 +6197,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 static int select_idle_sibling(struct task_struct *p, int prev, int target)
 {
 	struct sched_domain *sd;
-	int i;
+	int i, recent_used_cpu;
 
 	if (idle_cpu(target))
 		return target;
@@ -6204,6 +6208,21 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
 		return prev;
 
+	/* Check a recently used CPU as a potential idle candidate */
+	recent_used_cpu = p->recent_used_cpu;
+	if (recent_used_cpu != prev &&
+	    recent_used_cpu != target &&
+	    cpus_share_cache(recent_used_cpu, target) &&
+	    idle_cpu(recent_used_cpu) &&
+	    cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
+		/*
+		 * Replace recent_used_cpu with prev as it is a potential
+		 * candidate for the next wake.
+		 */
+		p->recent_used_cpu = prev;
+		return recent_used_cpu;
+	}
+
 	sd = rcu_dereference(per_cpu(sd_llc, target));
 	if (!sd)
 		return target;
@@ -6357,8 +6376,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 		if (cpu == prev_cpu)
 			goto pick_cpu;
 
-		if (wake_affine(affine_sd, p, prev_cpu, sync))
-			new_cpu = cpu;
+		new_cpu = wake_affine(affine_sd, p, prev_cpu, sync);
 	}
 
 	if (sd && !(sd_flag & SD_BALANCE_FORK)) {
@@ -6372,9 +6390,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 	if (!sd) {
 pick_cpu:
-		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+		if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
 			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
+			if (want_affine)
+				current->recent_used_cpu = cpu;
+		}
 	} else {
 		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
 	}


@@ -26,24 +26,110 @@
 * Bitmask made from a "or" of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
+#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
+#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK		\
+	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE		\
+	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
+#else
+#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
+#endif
+
 #define MEMBARRIER_CMD_BITMASK	\
-	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED	\
-	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
+	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
+	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
+	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\
+	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\
+	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
 
 static void ipi_mb(void *info)
 {
 	smp_mb();	/* IPIs should be serializing but paranoid. */
 }
 
-static int membarrier_private_expedited(void)
+static int membarrier_global_expedited(void)
 {
 	int cpu;
 	bool fallback = false;
 	cpumask_var_t tmpmask;
 
-	if (!(atomic_read(&current->mm->membarrier_state)
-			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
-		return -EPERM;
+	if (num_online_cpus() == 1)
+		return 0;
+
+	/*
+	 * Matches memory barriers around rq->curr modification in
+	 * scheduler.
+	 */
+	smp_mb();	/* system call entry is not a mb. */
+
+	/*
+	 * Expedited membarrier commands guarantee that they won't
+	 * block, hence the GFP_NOWAIT allocation flag and fallback
+	 * implementation.
+	 */
+	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
+		/* Fallback for OOM. */
+		fallback = true;
+	}
+
+	cpus_read_lock();
+	for_each_online_cpu(cpu) {
+		struct task_struct *p;
+
+		/*
+		 * Skipping the current CPU is OK even through we can be
+		 * migrated at any point. The current CPU, at the point
+		 * where we read raw_smp_processor_id(), is ensured to
+		 * be in program order with respect to the caller
+		 * thread. Therefore, we can skip this CPU from the
+		 * iteration.
+		 */
+		if (cpu == raw_smp_processor_id())
+			continue;
+		rcu_read_lock();
+		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
+		if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
+				   MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
+			if (!fallback)
+				__cpumask_set_cpu(cpu, tmpmask);
+			else
+				smp_call_function_single(cpu, ipi_mb, NULL, 1);
+		}
+		rcu_read_unlock();
+	}
+	if (!fallback) {
+		preempt_disable();
+		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+		preempt_enable();
+		free_cpumask_var(tmpmask);
+	}
+	cpus_read_unlock();
+
+	/*
+	 * Memory barrier on the caller thread _after_ we finished
+	 * waiting for the last IPI. Matches memory barriers around
+	 * rq->curr modification in scheduler.
+	 */
+	smp_mb();	/* exit from system call is not a mb */
+	return 0;
+}
+
+static int membarrier_private_expedited(int flags)
+{
+	int cpu;
+	bool fallback = false;
+	cpumask_var_t tmpmask;
+
+	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
+			return -EINVAL;
+		if (!(atomic_read(&current->mm->membarrier_state) &
+		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
+			return -EPERM;
+	} else {
+		if (!(atomic_read(&current->mm->membarrier_state) &
+		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
+			return -EPERM;
+	}
 
 	if (num_online_cpus() == 1)
 		return 0;
@@ -105,21 +191,69 @@ static int membarrier_private_expedited(void)
 	return 0;
 }
 
-static void membarrier_register_private_expedited(void)
+static int membarrier_register_global_expedited(void)
 {
 	struct task_struct *p = current;
 	struct mm_struct *mm = p->mm;
 
+	if (atomic_read(&mm->membarrier_state) &
+	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
+		return 0;
+	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
+	if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
+		/*
+		 * For single mm user, single threaded process, we can
+		 * simply issue a memory barrier after setting
+		 * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that
+		 * no memory access following registration is reordered
+		 * before registration.
+		 */
+		smp_mb();
+	} else {
+		/*
+		 * For multi-mm user threads, we need to ensure all
+		 * future scheduler executions will observe the new
+		 * thread flag state for this mm.
+		 */
+		synchronize_sched();
+	}
+	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
+		  &mm->membarrier_state);
+	return 0;
+}
+
+static int membarrier_register_private_expedited(int flags)
+{
+	struct task_struct *p = current;
+	struct mm_struct *mm = p->mm;
+	int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
+
+	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
+			return -EINVAL;
+		state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+	}
+
 	/*
 	 * We need to consider threads belonging to different thread
 	 * groups, which use the same mm. (CLONE_VM but not
 	 * CLONE_THREAD).
 	 */
-	if (atomic_read(&mm->membarrier_state)
-			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)
-		return;
-	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
-		  &mm->membarrier_state);
+	if (atomic_read(&mm->membarrier_state) & state)
+		return 0;
+	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
+	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
+		atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
+			  &mm->membarrier_state);
+	if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
+		/*
+		 * Ensure all future scheduler executions will observe the
+		 * new thread flag state for this process.
+		 */
+		synchronize_sched();
+	}
+	atomic_or(state, &mm->membarrier_state);
+	return 0;
 }
 
 /**
@@ -159,21 +293,28 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 		int cmd_mask = MEMBARRIER_CMD_BITMASK;
 
 		if (tick_nohz_full_enabled())
-			cmd_mask &= ~MEMBARRIER_CMD_SHARED;
+			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
 		return cmd_mask;
 	}
-	case MEMBARRIER_CMD_SHARED:
-		/* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
+	case MEMBARRIER_CMD_GLOBAL:
+		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
 		if (tick_nohz_full_enabled())
 			return -EINVAL;
 		if (num_online_cpus() > 1)
 			synchronize_sched();
 		return 0;
+	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
+		return membarrier_global_expedited();
+	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
+		return membarrier_register_global_expedited();
 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
-		return membarrier_private_expedited();
+		return membarrier_private_expedited(0);
 	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
-		membarrier_register_private_expedited();
-		return 0;
+		return membarrier_register_private_expedited(0);
+	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
+		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
+		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
 	default:
 		return -EINVAL;
 	}


@@ -950,12 +950,13 @@ static void update_curr_rt(struct rq *rq)
 {
 	struct task_struct *curr = rq->curr;
 	struct sched_rt_entity *rt_se = &curr->rt;
+	u64 now = rq_clock_task(rq);
 	u64 delta_exec;
 
 	if (curr->sched_class != &rt_sched_class)
 		return;
 
-	delta_exec = rq_clock_task(rq) - curr->se.exec_start;
+	delta_exec = now - curr->se.exec_start;
 	if (unlikely((s64)delta_exec <= 0))
 		return;
@@ -968,7 +969,7 @@ static void update_curr_rt(struct rq *rq)
 	curr->se.sum_exec_runtime += delta_exec;
 	account_group_exec_runtime(curr, delta_exec);
 
-	curr->se.exec_start = rq_clock_task(rq);
+	curr->se.exec_start = now;
 	cgroup_account_cputime(curr, delta_exec);
 
 	sched_rt_avg_update(rq, delta_exec);
@@ -1907,9 +1908,8 @@ static void push_rt_tasks(struct rq *rq)
 * the rt_loop_next will cause the iterator to perform another scan.
 *
 */
-static int rto_next_cpu(struct rq *rq)
+static int rto_next_cpu(struct root_domain *rd)
 {
-	struct root_domain *rd = rq->rd;
 	int next;
 	int cpu;
 
@@ -1985,19 +1985,24 @@ static void tell_cpu_to_push(struct rq *rq)
 	 * Otherwise it is finishing up and an ipi needs to be sent.
 	 */
 	if (rq->rd->rto_cpu < 0)
-		cpu = rto_next_cpu(rq);
+		cpu = rto_next_cpu(rq->rd);
 
 	raw_spin_unlock(&rq->rd->rto_lock);
 
 	rto_start_unlock(&rq->rd->rto_loop_start);
 
-	if (cpu >= 0)
+	if (cpu >= 0) {
+		/* Make sure the rd does not get freed while pushing */
+		sched_get_rd(rq->rd);
 		irq_work_queue_on(&rq->rd->rto_push_work, cpu);
+	}
 }
 
 /* Called from hardirq context */
 void rto_push_irq_work_func(struct irq_work *work)
 {
+	struct root_domain *rd =
+		container_of(work, struct root_domain, rto_push_work);
 	struct rq *rq;
 	int cpu;
 
@@ -2013,18 +2018,20 @@ void rto_push_irq_work_func(struct irq_work *work)
 		raw_spin_unlock(&rq->lock);
 	}
 
-	raw_spin_lock(&rq->rd->rto_lock);
+	raw_spin_lock(&rd->rto_lock);
 
 	/* Pass the IPI to the next rt overloaded queue */
-	cpu = rto_next_cpu(rq);
+	cpu = rto_next_cpu(rd);
 
-	raw_spin_unlock(&rq->rd->rto_lock);
+	raw_spin_unlock(&rd->rto_lock);
 
-	if (cpu < 0)
+	if (cpu < 0) {
+		sched_put_rd(rd);
 		return;
+	}
 
 	/* Try the next RT overloaded CPU */
-	irq_work_queue_on(&rq->rd->rto_push_work, cpu);
+	irq_work_queue_on(&rd->rto_push_work, cpu);
 }
 #endif /* HAVE_RT_PUSH_IPI */


@@ -691,6 +691,8 @@ extern struct mutex sched_domains_mutex;
 extern void init_defrootdomain(void);
 extern int sched_init_domains(const struct cpumask *cpu_map);
 extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
+extern void sched_get_rd(struct root_domain *rd);
+extern void sched_put_rd(struct root_domain *rd);
 
 #ifdef HAVE_RT_PUSH_IPI
 extern void rto_push_irq_work_func(struct irq_work *work);


@@ -31,8 +31,11 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
 		rq->rq_sched_info.run_delay += delta;
 }
 #define schedstat_enabled()		static_branch_unlikely(&sched_schedstats)
+#define __schedstat_inc(var)		do { var++; } while (0)
 #define schedstat_inc(var)		do { if (schedstat_enabled()) { var++; } } while (0)
+#define __schedstat_add(var, amt)	do { var += (amt); } while (0)
 #define schedstat_add(var, amt)		do { if (schedstat_enabled()) { var += (amt); } } while (0)
+#define __schedstat_set(var, val)	do { var = (val); } while (0)
 #define schedstat_set(var, val)		do { if (schedstat_enabled()) { var = (val); } } while (0)
 #define schedstat_val(var)		(var)
 #define schedstat_val_or_zero(var)	((schedstat_enabled()) ? (var) : 0)
@@ -48,8 +51,11 @@ static inline void
 rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 {}
 #define schedstat_enabled()		0
+#define __schedstat_inc(var)		do { } while (0)
 #define schedstat_inc(var)		do { } while (0)
+#define __schedstat_add(var, amt)	do { } while (0)
 #define schedstat_add(var, amt)		do { } while (0)
+#define __schedstat_set(var, val)	do { } while (0)
 #define schedstat_set(var, val)		do { } while (0)
 #define schedstat_val(var)		0
 #define schedstat_val_or_zero(var)	0
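
The plain schedstat_*() macros test the sched_schedstats static key on every invocation; the new double-underscore variants update the counter unconditionally, so a caller that has already branched on schedstat_enabled() once can perform several updates behind a single check. This is the pattern the ttwu_stat() and update_stats_*() changes in this pull rely on. A hedged sketch of that calling convention follows (update_foo_stats() and the rq fields in it are hypothetical, not taken from the patches):

/* Sketch of the calling pattern the __schedstat_*() helpers enable;
 * nr_foo_wakeups, foo_wait_sum and foo_wait_last are hypothetical fields.
 */
static void update_foo_stats(struct rq *rq, u64 delta)
{
	if (!schedstat_enabled())		/* single static-key test */
		return;

	__schedstat_inc(rq->nr_foo_wakeups);	/* unconditional raw updates */
	__schedstat_add(rq->foo_wait_sum, delta);
	__schedstat_set(rq->foo_wait_last, delta);
}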


@@ -259,6 +259,19 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
 		call_rcu_sched(&old_rd->rcu, free_rootdomain);
 }
 
+void sched_get_rd(struct root_domain *rd)
+{
+	atomic_inc(&rd->refcount);
+}
+
+void sched_put_rd(struct root_domain *rd)
+{
+	if (!atomic_dec_and_test(&rd->refcount))
+		return;
+
+	call_rcu_sched(&rd->rcu, free_rootdomain);
+}
+
 static int init_rootdomain(struct root_domain *rd)
 {
 	if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))


@@ -16,49 +16,210 @@ static int sys_membarrier(int cmd, int flags)
 static int test_membarrier_cmd_fail(void)
 {
 	int cmd = -1, flags = 0;
+	const char *test_name = "sys membarrier invalid command";
 
 	if (sys_membarrier(cmd, flags) != -1) {
 		ksft_exit_fail_msg(
-			"sys membarrier invalid command test: command = %d, flags = %d. Should fail, but passed\n",
-			cmd, flags);
+			"%s test: command = %d, flags = %d. Should fail, but passed\n",
+			test_name, cmd, flags);
+	}
+	if (errno != EINVAL) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+			test_name, flags, EINVAL, strerror(EINVAL),
+			errno, strerror(errno));
 	}
 
 	ksft_test_result_pass(
-		"sys membarrier invalid command test: command = %d, flags = %d. Failed as expected\n",
-		cmd, flags);
+		"%s test: command = %d, flags = %d, errno = %d. Failed as expected\n",
+		test_name, cmd, flags, errno);
 	return 0;
 }
 
 static int test_membarrier_flags_fail(void)
 {
 	int cmd = MEMBARRIER_CMD_QUERY, flags = 1;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_QUERY invalid flags";
 
 	if (sys_membarrier(cmd, flags) != -1) {
 		ksft_exit_fail_msg(
-			"sys membarrier MEMBARRIER_CMD_QUERY invalid flags test: flags = %d. Should fail, but passed\n",
-			flags);
+			"%s test: flags = %d. Should fail, but passed\n",
+			test_name, flags);
+	}
+	if (errno != EINVAL) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+			test_name, flags, EINVAL, strerror(EINVAL),
+			errno, strerror(errno));
 	}
 
 	ksft_test_result_pass(
-		"sys membarrier MEMBARRIER_CMD_QUERY invalid flags test: flags = %d. Failed as expected\n",
-		flags);
+		"%s test: flags = %d, errno = %d. Failed as expected\n",
+		test_name, flags, errno);
 	return 0;
 }
 
-static int test_membarrier_success(void)
+static int test_membarrier_global_success(void)
 {
-	int cmd = MEMBARRIER_CMD_SHARED, flags = 0;
-	const char *test_name = "sys membarrier MEMBARRIER_CMD_SHARED\n";
+	int cmd = MEMBARRIER_CMD_GLOBAL, flags = 0;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_GLOBAL";
 
 	if (sys_membarrier(cmd, flags) != 0) {
 		ksft_exit_fail_msg(
-			"sys membarrier MEMBARRIER_CMD_SHARED test: flags = %d\n",
-			flags);
+			"%s test: flags = %d, errno = %d\n",
+			test_name, flags, errno);
 	}
 
 	ksft_test_result_pass(
-		"sys membarrier MEMBARRIER_CMD_SHARED test: flags = %d\n",
-		flags);
+		"%s test: flags = %d\n", test_name, flags);
 	return 0;
 }
+
+static int test_membarrier_private_expedited_fail(void)
+{
+	int cmd = MEMBARRIER_CMD_PRIVATE_EXPEDITED, flags = 0;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED not registered failure";
+
+	if (sys_membarrier(cmd, flags) != -1) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d. Should fail, but passed\n",
+			test_name, flags);
+	}
+	if (errno != EPERM) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+			test_name, flags, EPERM, strerror(EPERM),
+			errno, strerror(errno));
+	}
+
+	ksft_test_result_pass(
+		"%s test: flags = %d, errno = %d\n",
+		test_name, flags, errno);
+	return 0;
+}
+
+static int test_membarrier_register_private_expedited_success(void)
+{
+	int cmd = MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, flags = 0;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED";
+
+	if (sys_membarrier(cmd, flags) != 0) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d, errno = %d\n",
+			test_name, flags, errno);
+	}
+
+	ksft_test_result_pass(
+		"%s test: flags = %d\n",
+		test_name, flags);
+	return 0;
+}
+
+static int test_membarrier_private_expedited_success(void)
+{
+	int cmd = MEMBARRIER_CMD_PRIVATE_EXPEDITED, flags = 0;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED";
+
+	if (sys_membarrier(cmd, flags) != 0) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d, errno = %d\n",
+			test_name, flags, errno);
+	}
+
+	ksft_test_result_pass(
+		"%s test: flags = %d\n",
+		test_name, flags);
+	return 0;
+}
+
+static int test_membarrier_private_expedited_sync_core_fail(void)
+{
+	int cmd = MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, flags = 0;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE not registered failure";
+
+	if (sys_membarrier(cmd, flags) != -1) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d. Should fail, but passed\n",
+			test_name, flags);
+	}
+	if (errno != EPERM) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+			test_name, flags, EPERM, strerror(EPERM),
+			errno, strerror(errno));
+	}
+
+	ksft_test_result_pass(
+		"%s test: flags = %d, errno = %d\n",
+		test_name, flags, errno);
+	return 0;
+}
+
+static int test_membarrier_register_private_expedited_sync_core_success(void)
+{
+	int cmd = MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, flags = 0;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE";
+
+	if (sys_membarrier(cmd, flags) != 0) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d, errno = %d\n",
+			test_name, flags, errno);
+	}
+
+	ksft_test_result_pass(
+		"%s test: flags = %d\n",
+		test_name, flags);
+	return 0;
+}
+
+static int test_membarrier_private_expedited_sync_core_success(void)
+{
+	int cmd = MEMBARRIER_CMD_PRIVATE_EXPEDITED, flags = 0;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE";
+
+	if (sys_membarrier(cmd, flags) != 0) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d, errno = %d\n",
+			test_name, flags, errno);
+	}
+
+	ksft_test_result_pass(
+		"%s test: flags = %d\n",
+		test_name, flags);
+	return 0;
+}
+
+static int test_membarrier_register_global_expedited_success(void)
+{
+	int cmd = MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, flags = 0;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED";
+
+	if (sys_membarrier(cmd, flags) != 0) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d, errno = %d\n",
+			test_name, flags, errno);
+	}
+
+	ksft_test_result_pass(
+		"%s test: flags = %d\n",
+		test_name, flags);
+	return 0;
+}
+
+static int test_membarrier_global_expedited_success(void)
+{
+	int cmd = MEMBARRIER_CMD_GLOBAL_EXPEDITED, flags = 0;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_GLOBAL_EXPEDITED";
+
+	if (sys_membarrier(cmd, flags) != 0) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d, errno = %d\n",
+			test_name, flags, errno);
+	}
+
+	ksft_test_result_pass(
+		"%s test: flags = %d\n",
+		test_name, flags);
+	return 0;
+}
@@ -72,7 +233,45 @@ static int test_membarrier(void)
 	status = test_membarrier_flags_fail();
 	if (status)
 		return status;
-	status = test_membarrier_success();
+	status = test_membarrier_global_success();
+	if (status)
+		return status;
+	status = test_membarrier_private_expedited_fail();
+	if (status)
+		return status;
+	status = test_membarrier_register_private_expedited_success();
+	if (status)
+		return status;
+	status = test_membarrier_private_expedited_success();
+	if (status)
+		return status;
+	status = sys_membarrier(MEMBARRIER_CMD_QUERY, 0);
+	if (status < 0) {
+		ksft_test_result_fail("sys_membarrier() failed\n");
+		return status;
+	}
+	if (status & MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE) {
+		status = test_membarrier_private_expedited_sync_core_fail();
+		if (status)
+			return status;
+		status = test_membarrier_register_private_expedited_sync_core_success();
+		if (status)
+			return status;
+		status = test_membarrier_private_expedited_sync_core_success();
+		if (status)
+			return status;
+	}
+	/*
+	 * It is valid to send a global membarrier from a non-registered
+	 * process.
+	 */
+	status = test_membarrier_global_expedited_success();
+	if (status)
+		return status;
+	status = test_membarrier_register_global_expedited_success();
+	if (status)
+		return status;
+	status = test_membarrier_global_expedited_success();
 	if (status)
 		return status;
 	return 0;
@@ -94,8 +293,10 @@ static int test_membarrier_query(void)
 		}
 		ksft_exit_fail_msg("sys_membarrier() failed\n");
 	}
-	if (!(ret & MEMBARRIER_CMD_SHARED))
+	if (!(ret & MEMBARRIER_CMD_GLOBAL)) {
+		ksft_test_result_fail("sys_membarrier() CMD_GLOBAL query failed\n");
 		ksft_exit_fail_msg("sys_membarrier is not supported.\n");
+	}
 
 	ksft_test_result_pass("sys_membarrier available\n");
 	return 0;
@@ -108,5 +309,5 @@ int main(int argc, char **argv)
 	test_membarrier_query();
 	test_membarrier();
 
-	ksft_exit_pass();
+	return ksft_exit_pass();
 }