Merge branch 'timers/nohz-v3' of git://git.kernel.org/pub/scm/linux/kernel/git/frederic/linux-dynticks into timers/nohz

Pull nohz improvements from Frederic Weisbecker:

 " It mostly contains fixes and full dynticks off-case optimizations. I believe that
   distros want to enable this feature so it seems important to optimize the case
   where the "nohz_full=" parameter is empty, i.e. I'm trying to remove any performance
   regression that comes with NO_HZ_FULL=y when the feature is not used.

   This patchset improves the current situation a lot (off-case appears to be around 11% faster
   with hackbench, although I guess it may vary depending on the configuration but it should be
   significantly faster in any case). There is still some work to do, though: I can still observe a
   remaining loss of 1.6% throughput seen with hackbench compared to CONFIG_NO_HZ_FULL=n. "

Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Ingo Molnar 2013-08-14 17:58:56 +02:00
commit 6f1d657668
22 changed files with 548 additions and 332 deletions

View File

@ -3,3 +3,4 @@ generic-y += clkdev.h
generic-y += exec.h
generic-y += kvm_para.h
generic-y += trace_clock.h
generic-y += vtime.h

View File

@ -3,7 +3,7 @@
#include <linux/types.h>
#ifdef CONFIG_MMU
#include <linux/hardirq.h>
#include <linux/preempt_mask.h>
#endif
#include <linux/preempt.h>
#include <asm/thread_info.h>

View File

@ -2,3 +2,4 @@
generic-y += clkdev.h
generic-y += rwsem.h
generic-y += trace_clock.h
generic-y += vtime.h

View File

@ -13,9 +13,6 @@
#include <asm/div64.h>
#define __ARCH_HAS_VTIME_ACCOUNT
#define __ARCH_HAS_VTIME_TASK_SWITCH
/* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */
typedef unsigned long long __nocast cputime_t;

View File

@ -0,0 +1,7 @@
/*
 * Arch opt-in switches for virtual CPU time accounting.
 *
 * __ARCH_HAS_VTIME_ACCOUNT:     <linux/vtime.h> declares
 *     vtime_account_irq_enter() as a plain extern instead of the generic
 *     inline wrapper around vtime_common_account_irq_enter().
 * __ARCH_HAS_VTIME_TASK_SWITCH: <linux/vtime.h> declares vtime_task_switch()
 *     as a plain extern instead of the generic inline wrapper around
 *     vtime_common_task_switch().
 *
 * NOTE(review): the extern implementations are presumably supplied by the
 * s390 arch code -- not visible here, confirm.
 */
#ifndef _S390_VTIME_H
#define _S390_VTIME_H
#define __ARCH_HAS_VTIME_ACCOUNT
#define __ARCH_HAS_VTIME_TASK_SWITCH
#endif /* _S390_VTIME_H */

View File

@ -19,6 +19,7 @@
#include <asm/irq_regs.h>
#include <asm/cputime.h>
#include <asm/vtimer.h>
#include <asm/vtime.h>
#include <asm/irq.h>
#include "entry.h"

View File

View File

@ -2,100 +2,110 @@
#define _LINUX_CONTEXT_TRACKING_H
#include <linux/sched.h>
#include <linux/percpu.h>
#include <linux/vtime.h>
#include <linux/context_tracking_state.h>
#include <asm/ptrace.h>
struct context_tracking {
/*
* When active is false, probes are unset in order
* to minimize overhead: TIF flags are cleared
* and calls to user_enter/exit are ignored. This
* may be further optimized using static keys.
*/
bool active;
enum ctx_state {
IN_KERNEL = 0,
IN_USER,
} state;
};
static inline void __guest_enter(void)
{
/*
* This is running in ioctl context so we can avoid
* the call to vtime_account() with its unnecessary idle check.
*/
vtime_account_system(current);
current->flags |= PF_VCPU;
}
static inline void __guest_exit(void)
{
/*
* This is running in ioctl context so we can avoid
* the call to vtime_account() with its unnecessary idle check.
*/
vtime_account_system(current);
current->flags &= ~PF_VCPU;
}
#ifdef CONFIG_CONTEXT_TRACKING
DECLARE_PER_CPU(struct context_tracking, context_tracking);
extern void context_tracking_cpu_set(int cpu);
static inline bool context_tracking_in_user(void)
extern void context_tracking_user_enter(void);
extern void context_tracking_user_exit(void);
extern void __context_tracking_task_switch(struct task_struct *prev,
struct task_struct *next);
static inline void user_enter(void)
{
return __this_cpu_read(context_tracking.state) == IN_USER;
}
if (static_key_false(&context_tracking_enabled))
context_tracking_user_enter();
static inline bool context_tracking_active(void)
}
static inline void user_exit(void)
{
return __this_cpu_read(context_tracking.active);
if (static_key_false(&context_tracking_enabled))
context_tracking_user_exit();
}
extern void user_enter(void);
extern void user_exit(void);
extern void guest_enter(void);
extern void guest_exit(void);
static inline enum ctx_state exception_enter(void)
{
enum ctx_state prev_ctx;
if (!static_key_false(&context_tracking_enabled))
return 0;
prev_ctx = this_cpu_read(context_tracking.state);
user_exit();
context_tracking_user_exit();
return prev_ctx;
}
static inline void exception_exit(enum ctx_state prev_ctx)
{
if (prev_ctx == IN_USER)
user_enter();
if (static_key_false(&context_tracking_enabled)) {
if (prev_ctx == IN_USER)
context_tracking_user_enter();
}
}
extern void context_tracking_task_switch(struct task_struct *prev,
struct task_struct *next);
/*
 * Task-switch hook for context tracking.
 *
 * Does nothing until the context_tracking_enabled static key has been
 * switched on; after that it forwards @prev and @next to
 * __context_tracking_task_switch().
 */
static inline void context_tracking_task_switch(struct task_struct *prev,
						struct task_struct *next)
{
	if (!static_key_false(&context_tracking_enabled))
		return;

	__context_tracking_task_switch(prev, next);
}
#else
static inline bool context_tracking_in_user(void) { return false; }
static inline void user_enter(void) { }
static inline void user_exit(void) { }
static inline void guest_enter(void)
{
__guest_enter();
}
static inline void guest_exit(void)
{
__guest_exit();
}
static inline enum ctx_state exception_enter(void) { return 0; }
static inline void exception_exit(enum ctx_state prev_ctx) { }
static inline void context_tracking_task_switch(struct task_struct *prev,
struct task_struct *next) { }
#endif /* !CONFIG_CONTEXT_TRACKING */
#ifdef CONFIG_CONTEXT_TRACKING_FORCE
extern void context_tracking_init(void);
#else
static inline void context_tracking_init(void) { }
#endif /* CONFIG_CONTEXT_TRACKING_FORCE */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
/*
 * Mark the current task as running guest code.
 *
 * With vtime accounting enabled the work is delegated to
 * vtime_guest_enter(); otherwise only PF_VCPU is set on current.
 */
static inline void guest_enter(void)
{
	if (!vtime_accounting_enabled()) {
		current->flags |= PF_VCPU;
		return;
	}

	vtime_guest_enter(current);
}
/*
 * Mark the current task as no longer running guest code.
 *
 * With vtime accounting enabled the work is delegated to
 * vtime_guest_exit(); otherwise only PF_VCPU is cleared on current.
 */
static inline void guest_exit(void)
{
	if (!vtime_accounting_enabled()) {
		current->flags &= ~PF_VCPU;
		return;
	}

	vtime_guest_exit(current);
}
#else
/*
 * Enter guest mode when CONFIG_VIRT_CPU_ACCOUNTING_GEN is not set:
 * account the pending system time of current first, then set PF_VCPU.
 */
static inline void guest_enter(void)
{
/*
* This is running in ioctl context so it's safe
* to assume that the pending cputime to flush
* is system time (stime).
*/
vtime_account_system(current);
current->flags |= PF_VCPU;
}
/*
 * Leave guest mode when CONFIG_VIRT_CPU_ACCOUNTING_GEN is not set:
 * the accounting call is made while PF_VCPU is still set, and only then
 * is the flag cleared.
 */
static inline void guest_exit(void)
{
/* Flush the cputime we spent in the guest */
vtime_account_system(current);
current->flags &= ~PF_VCPU;
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
#endif

View File

@ -0,0 +1,39 @@
#ifndef _LINUX_CONTEXT_TRACKING_STATE_H
#define _LINUX_CONTEXT_TRACKING_STATE_H
#include <linux/percpu.h>
#include <linux/static_key.h>
/*
 * Context tracking state. One instance exists per CPU (it is declared
 * with DECLARE_PER_CPU in this header).
 */
struct context_tracking {
/*
* When active is false, probes are unset in order
* to minimize overhead: TIF flags are cleared
* and calls to user_enter/exit are ignored. This
* may be further optimized using static keys.
*/
bool active;
/*
* Context last recorded for this CPU. IN_KERNEL is 0, so a
* zero-initialized instance starts out in kernel context.
*/
enum ctx_state {
IN_KERNEL = 0,
IN_USER,
} state;
};
#ifdef CONFIG_CONTEXT_TRACKING
extern struct static_key context_tracking_enabled;
DECLARE_PER_CPU(struct context_tracking, context_tracking);
/*
 * Report whether this CPU's recorded context is IN_USER.
 *
 * Returns true when the per-CPU context_tracking.state equals IN_USER,
 * false otherwise.
 */
static inline bool context_tracking_in_user(void)
{
	if (__this_cpu_read(context_tracking.state) != IN_USER)
		return false;

	return true;
}
/*
 * Report whether context tracking is active on this CPU, i.e. the value
 * of the per-CPU context_tracking.active flag.
 */
static inline bool context_tracking_active(void)
{
	if (__this_cpu_read(context_tracking.active))
		return true;

	return false;
}
#else
static inline bool context_tracking_in_user(void) { return false; }
static inline bool context_tracking_active(void) { return false; }
#endif /* CONFIG_CONTEXT_TRACKING */
#endif

View File

@ -1,126 +1,11 @@
#ifndef LINUX_HARDIRQ_H
#define LINUX_HARDIRQ_H
#include <linux/preempt.h>
#include <linux/preempt_mask.h>
#include <linux/lockdep.h>
#include <linux/ftrace_irq.h>
#include <linux/vtime.h>
#include <asm/hardirq.h>
/*
* We put the hardirq and softirq counter into the preemption
* counter. The bitmask has the following meaning:
*
* - bits 0-7 are the preemption count (max preemption depth: 256)
* - bits 8-15 are the softirq count (max # of softirqs: 256)
*
* The hardirq count can in theory reach the same as NR_IRQS.
* In reality, the number of nested IRQS is limited to the stack
* size as well. For archs with over 1000 IRQS it is not practical
* to expect that they will all nest. We give a max of 10 bits for
* hardirq nesting. An arch may choose to give less than 10 bits.
* m68k expects it to be 8.
*
* - bits 16-25 are the hardirq count (max # of nested hardirqs: 1024)
* - bit 26 is the NMI_MASK
* - bit 27 is the PREEMPT_ACTIVE flag
*
* PREEMPT_MASK: 0x000000ff
* SOFTIRQ_MASK: 0x0000ff00
* HARDIRQ_MASK: 0x03ff0000
* NMI_MASK: 0x04000000
*/
#define PREEMPT_BITS 8
#define SOFTIRQ_BITS 8
#define NMI_BITS 1
#define MAX_HARDIRQ_BITS 10
#ifndef HARDIRQ_BITS
# define HARDIRQ_BITS MAX_HARDIRQ_BITS
#endif
#if HARDIRQ_BITS > MAX_HARDIRQ_BITS
#error HARDIRQ_BITS too high!
#endif
#define PREEMPT_SHIFT 0
#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS)
#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS)
#define __IRQ_MASK(x) ((1UL << (x))-1)
#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT)
#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT)
#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT)
#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
#define NMI_OFFSET (1UL << NMI_SHIFT)
#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
#ifndef PREEMPT_ACTIVE
#define PREEMPT_ACTIVE_BITS 1
#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
#endif
#if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS))
#error PREEMPT_ACTIVE is too low!
#endif
#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
| NMI_MASK))
/*
* Are we doing bottom half or hardware interrupt processing?
* Are we in a softirq context? Interrupt context?
* in_softirq - Are we currently processing softirq or have bh disabled?
* in_serving_softirq - Are we currently processing softirq?
*/
#define in_irq() (hardirq_count())
#define in_softirq() (softirq_count())
#define in_interrupt() (irq_count())
#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
/*
* Are we in NMI context?
*/
#define in_nmi() (preempt_count() & NMI_MASK)
#if defined(CONFIG_PREEMPT_COUNT)
# define PREEMPT_CHECK_OFFSET 1
#else
# define PREEMPT_CHECK_OFFSET 0
#endif
/*
* Are we running in atomic context? WARNING: this macro cannot
* always detect atomic context; in particular, it cannot know about
* held spinlocks in non-preemptible kernels. Thus it should not be
* used in the general case to determine whether sleeping is possible.
* Do not use in_atomic() in driver code.
*/
#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0)
/*
* Check whether we were atomic before we did preempt_disable():
* (used by the scheduler, *after* releasing the kernel lock)
*/
#define in_atomic_preempt_off() \
((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
#ifdef CONFIG_PREEMPT_COUNT
# define preemptible() (preempt_count() == 0 && !irqs_disabled())
#else
# define preemptible() 0
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_GENERIC_HARDIRQS)
extern void synchronize_irq(unsigned int irq);

View File

@ -0,0 +1,122 @@
#ifndef LINUX_PREEMPT_MASK_H
#define LINUX_PREEMPT_MASK_H
#include <linux/preempt.h>
#include <asm/hardirq.h>
/*
* We put the hardirq and softirq counter into the preemption
* counter. The bitmask has the following meaning:
*
* - bits 0-7 are the preemption count (max preemption depth: 256)
* - bits 8-15 are the softirq count (max # of softirqs: 256)
*
* The hardirq count can in theory reach the same as NR_IRQS.
* In reality, the number of nested IRQS is limited to the stack
* size as well. For archs with over 1000 IRQS it is not practical
* to expect that they will all nest. We give a max of 10 bits for
* hardirq nesting. An arch may choose to give less than 10 bits.
* m68k expects it to be 8.
*
* - bits 16-25 are the hardirq count (max # of nested hardirqs: 1024)
* - bit 26 is the NMI_MASK
* - bit 27 is the PREEMPT_ACTIVE flag
*
* PREEMPT_MASK: 0x000000ff
* SOFTIRQ_MASK: 0x0000ff00
* HARDIRQ_MASK: 0x03ff0000
* NMI_MASK: 0x04000000
*/
#define PREEMPT_BITS 8
#define SOFTIRQ_BITS 8
#define NMI_BITS 1
#define MAX_HARDIRQ_BITS 10
#ifndef HARDIRQ_BITS
# define HARDIRQ_BITS MAX_HARDIRQ_BITS
#endif
#if HARDIRQ_BITS > MAX_HARDIRQ_BITS
#error HARDIRQ_BITS too high!
#endif
/*
 * Bit position of each field inside the preemption counter. The fields are
 * packed back to back: each shift is the previous shift plus the previous
 * field's width.
 */
#define PREEMPT_SHIFT 0
#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS)
#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS)
/* Mask with the low x bits set. */
#define __IRQ_MASK(x) ((1UL << (x))-1)
/* Per-field masks: the field-wide mask moved to the field's position. */
#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT)
/* Value of one count unit in each field (the field's lowest bit). */
#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT)
#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT)
#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
#define NMI_OFFSET (1UL << NMI_SHIFT)
/* Two softirq count units. */
#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
#ifndef PREEMPT_ACTIVE
#define PREEMPT_ACTIVE_BITS 1
#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
#endif
#if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS))
#error PREEMPT_ACTIVE is too low!
#endif
#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
| NMI_MASK))
/*
* Are we doing bottom half or hardware interrupt processing?
* Are we in a softirq context? Interrupt context?
* in_softirq - Are we currently processing softirq or have bh disabled?
* in_serving_softirq - Are we currently processing softirq?
*/
#define in_irq() (hardirq_count())
#define in_softirq() (softirq_count())
#define in_interrupt() (irq_count())
#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
/*
* Are we in NMI context?
*/
#define in_nmi() (preempt_count() & NMI_MASK)
#if defined(CONFIG_PREEMPT_COUNT)
# define PREEMPT_CHECK_OFFSET 1
#else
# define PREEMPT_CHECK_OFFSET 0
#endif
/*
* Are we running in atomic context? WARNING: this macro cannot
* always detect atomic context; in particular, it cannot know about
* held spinlocks in non-preemptible kernels. Thus it should not be
* used in the general case to determine whether sleeping is possible.
* Do not use in_atomic() in driver code.
*/
#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0)
/*
* Check whether we were atomic before we did preempt_disable():
* (used by the scheduler, *after* releasing the kernel lock)
*/
#define in_atomic_preempt_off() \
((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
#ifdef CONFIG_PREEMPT_COUNT
# define preemptible() (preempt_count() == 0 && !irqs_disabled())
#else
# define preemptible() 0
#endif
#endif /* LINUX_PREEMPT_MASK_H */

View File

@ -10,6 +10,8 @@
#include <linux/irqflags.h>
#include <linux/percpu.h>
#include <linux/hrtimer.h>
#include <linux/context_tracking_state.h>
#include <linux/cpumask.h>
#ifdef CONFIG_GENERIC_CLOCKEVENTS
@ -158,20 +160,51 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
# endif /* !CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
extern bool tick_nohz_full_running;
extern cpumask_var_t tick_nohz_full_mask;
/*
 * Report whether full dynticks is in use.
 *
 * Returns false while the context_tracking_enabled static key is off;
 * once the key is on, returns the value of tick_nohz_full_running.
 */
static inline bool tick_nohz_full_enabled(void)
{
	if (static_key_false(&context_tracking_enabled))
		return tick_nohz_full_running;

	return false;
}
/*
 * Report whether @cpu is set in tick_nohz_full_mask.
 *
 * Always false when tick_nohz_full_enabled() is false; the mask is only
 * consulted otherwise.
 */
static inline bool tick_nohz_full_cpu(int cpu)
{
	if (tick_nohz_full_enabled())
		return cpumask_test_cpu(cpu, tick_nohz_full_mask);

	return false;
}
extern void tick_nohz_init(void);
extern int tick_nohz_full_cpu(int cpu);
extern void tick_nohz_full_check(void);
extern void __tick_nohz_full_check(void);
extern void tick_nohz_full_kick(void);
extern void tick_nohz_full_kick_all(void);
extern void tick_nohz_task_switch(struct task_struct *tsk);
extern void __tick_nohz_task_switch(struct task_struct *tsk);
#else
static inline void tick_nohz_init(void) { }
static inline int tick_nohz_full_cpu(int cpu) { return 0; }
static inline void tick_nohz_full_check(void) { }
static inline bool tick_nohz_full_enabled(void) { return false; }
static inline bool tick_nohz_full_cpu(int cpu) { return false; }
static inline void __tick_nohz_full_check(void) { }
static inline void tick_nohz_full_kick(void) { }
static inline void tick_nohz_full_kick_all(void) { }
static inline void tick_nohz_task_switch(struct task_struct *tsk) { }
static inline void __tick_nohz_task_switch(struct task_struct *tsk) { }
#endif
/*
 * Inline front end for __tick_nohz_full_check(): the out-of-line function
 * is only called when tick_nohz_full_enabled() returns true.
 */
static inline void tick_nohz_full_check(void)
{
	if (!tick_nohz_full_enabled())
		return;

	__tick_nohz_full_check();
}
/*
 * Inline front end for __tick_nohz_task_switch(): @tsk is only passed on
 * to the out-of-line function when tick_nohz_full_enabled() returns true.
 */
static inline void tick_nohz_task_switch(struct task_struct *tsk)
{
	if (!tick_nohz_full_enabled())
		return;

	__tick_nohz_task_switch(tsk);
}
#endif

View File

@ -1,18 +1,68 @@
#ifndef _LINUX_KERNEL_VTIME_H
#define _LINUX_KERNEL_VTIME_H
#include <linux/context_tracking_state.h>
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
#include <asm/vtime.h>
#endif
struct task_struct;
/*
* vtime_accounting_enabled() definitions/declarations
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
static inline bool vtime_accounting_enabled(void) { return true; }
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
/*
 * Generic vtime accounting is enabled only when the
 * context_tracking_enabled static key is on AND context tracking is
 * active on this CPU.
 */
static inline bool vtime_accounting_enabled(void)
{
	if (!static_key_false(&context_tracking_enabled))
		return false;

	return context_tracking_active();
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
static inline bool vtime_accounting_enabled(void) { return false; }
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
/*
* Common vtime APIs
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
#ifdef __ARCH_HAS_VTIME_TASK_SWITCH
extern void vtime_task_switch(struct task_struct *prev);
#else
extern void vtime_common_task_switch(struct task_struct *prev);
/*
 * Generic task-switch accounting entry point: @prev is handed to
 * vtime_common_task_switch() only while vtime accounting is enabled.
 */
static inline void vtime_task_switch(struct task_struct *prev)
{
	if (!vtime_accounting_enabled())
		return;

	vtime_common_task_switch(prev);
}
#endif /* __ARCH_HAS_VTIME_TASK_SWITCH */
extern void vtime_account_system(struct task_struct *tsk);
extern void vtime_account_idle(struct task_struct *tsk);
extern void vtime_account_user(struct task_struct *tsk);
extern void vtime_account_irq_enter(struct task_struct *tsk);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
static inline bool vtime_accounting_enabled(void) { return true; }
#endif
#ifdef __ARCH_HAS_VTIME_ACCOUNT
extern void vtime_account_irq_enter(struct task_struct *tsk);
#else
extern void vtime_common_account_irq_enter(struct task_struct *tsk);
/*
 * Generic irq-entry accounting entry point: @tsk is handed to
 * vtime_common_account_irq_enter() only while vtime accounting is enabled.
 */
static inline void vtime_account_irq_enter(struct task_struct *tsk)
{
	if (!vtime_accounting_enabled())
		return;

	vtime_common_account_irq_enter(tsk);
}
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
@ -20,14 +70,20 @@ static inline void vtime_task_switch(struct task_struct *prev) { }
static inline void vtime_account_system(struct task_struct *tsk) { }
static inline void vtime_account_user(struct task_struct *tsk) { }
static inline void vtime_account_irq_enter(struct task_struct *tsk) { }
static inline bool vtime_accounting_enabled(void) { return false; }
#endif
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
extern void arch_vtime_task_switch(struct task_struct *tsk);
extern void vtime_account_irq_exit(struct task_struct *tsk);
extern bool vtime_accounting_enabled(void);
extern void vtime_gen_account_irq_exit(struct task_struct *tsk);
/*
 * Irq-exit accounting entry point for CONFIG_VIRT_CPU_ACCOUNTING_GEN:
 * @tsk is handed to vtime_gen_account_irq_exit() only while vtime
 * accounting is enabled.
 */
static inline void vtime_account_irq_exit(struct task_struct *tsk)
{
	if (!vtime_accounting_enabled())
		return;

	vtime_gen_account_irq_exit(tsk);
}
extern void vtime_user_enter(struct task_struct *tsk);
static inline void vtime_user_exit(struct task_struct *tsk)
{
vtime_account_user(tsk);
@ -35,7 +91,7 @@ static inline void vtime_user_exit(struct task_struct *tsk)
extern void vtime_guest_enter(struct task_struct *tsk);
extern void vtime_guest_exit(struct task_struct *tsk);
extern void vtime_init_idle(struct task_struct *tsk, int cpu);
#else
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */
static inline void vtime_account_irq_exit(struct task_struct *tsk)
{
/* On hard|softirq exit we always account to hard|softirq cputime */

View File

@ -0,0 +1,58 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM context_tracking
#if !defined(_TRACE_CONTEXT_TRACKING_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_CONTEXT_TRACKING_H
#include <linux/tracepoint.h>
/*
 * Event class shared by the user_enter and user_exit events defined below.
 * The only recorded field is a dummy int, and the print format emits an
 * empty string.
 */
DECLARE_EVENT_CLASS(context_tracking_user,
TP_PROTO(int dummy),
TP_ARGS(dummy),
TP_STRUCT__entry(
__field( int, dummy )
),
TP_fast_assign(
__entry->dummy = dummy;
),
TP_printk("%s", "")
);
/**
* user_enter - called when the kernel resumes to userspace
* @dummy: dummy arg to make trace event macro happy
*
* This event occurs when the kernel resumes to userspace after
* an exception or a syscall.
*/
DEFINE_EVENT(context_tracking_user, user_enter,
TP_PROTO(int dummy),
TP_ARGS(dummy)
);
/**
* user_exit - called when userspace enters the kernel
* @dummy: dummy arg to make trace event macro happy
*
* This event occurs when userspace enters the kernel through
* an exception or a syscall.
*/
DEFINE_EVENT(context_tracking_user, user_exit,
TP_PROTO(int dummy),
TP_ARGS(dummy)
);
#endif /* _TRACE_CONTEXT_TRACKING_H */
/* This part must be outside protection */
#include <trace/define_trace.h>

View File

@ -527,13 +527,29 @@ config RCU_USER_QS
config CONTEXT_TRACKING_FORCE
bool "Force context tracking"
depends on CONTEXT_TRACKING
default CONTEXT_TRACKING
default y if !NO_HZ_FULL
help
Probe on user/kernel boundaries by default in order to
test the features that rely on it such as userspace RCU extended
quiescent states.
This test is there for debugging until we have a real user like the
full dynticks mode.
The major pre-requirement for full dynticks to work is to
support the context tracking subsystem. But there are also
other dependencies to satisfy in order to make full
dynticks work.
This option is meant for testing when an arch implements the
context tracking backend but doesn't yet fulfill all the
requirements to make the full dynticks feature work.
Without the full dynticks, there is no way to test the support
for context tracking and the subsystems that rely on it: RCU
userspace extended quiescent state and tickless cputime
accounting. This option copes with the absence of the full
dynticks subsystem by forcing the context tracking on all
CPUs in the system.
Say Y only if you're working on the development of an
architecture backend for the context tracking.
Say N otherwise, this option brings an overhead that you
don't want in production.
config RCU_FANOUT
int "Tree-based hierarchical RCU fanout value"

View File

@ -75,6 +75,7 @@
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/sched_clock.h>
#include <linux/context_tracking.h>
#include <asm/io.h>
#include <asm/bugs.h>
@ -545,6 +546,7 @@ asmlinkage void __init start_kernel(void)
idr_init_cache();
rcu_init();
tick_nohz_init();
context_tracking_init();
radix_tree_init();
/* init some links before init_ISA_irqs() */
early_irq_init();

View File

@ -20,22 +20,33 @@
#include <linux/hardirq.h>
#include <linux/export.h>
DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
#ifdef CONFIG_CONTEXT_TRACKING_FORCE
.active = true,
#endif
};
#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>
struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(context_tracking_enabled);
DEFINE_PER_CPU(struct context_tracking, context_tracking);
EXPORT_SYMBOL_GPL(context_tracking);
/*
 * Turn context tracking on for @cpu.
 *
 * If the CPU's active flag is already set this is a no-op. Otherwise the
 * flag is set and the context_tracking_enabled static key is incremented
 * once for this CPU.
 */
void context_tracking_cpu_set(int cpu)
{
	if (per_cpu(context_tracking.active, cpu))
		return;

	per_cpu(context_tracking.active, cpu) = true;
	static_key_slow_inc(&context_tracking_enabled);
}
/**
* user_enter - Inform the context tracking that the CPU is going to
* enter userspace mode.
* context_tracking_user_enter - Inform the context tracking that the CPU is going to
* enter userspace mode.
*
* This function must be called right before we switch from the kernel
* to userspace, when it's guaranteed the remaining kernel instructions
* to execute won't use any RCU read side critical section because this
* function sets RCU in extended quiescent state.
*/
void user_enter(void)
void context_tracking_user_enter(void)
{
unsigned long flags;
@ -54,17 +65,32 @@ void user_enter(void)
WARN_ON_ONCE(!current->mm);
local_irq_save(flags);
if (__this_cpu_read(context_tracking.active) &&
__this_cpu_read(context_tracking.state) != IN_USER) {
if ( __this_cpu_read(context_tracking.state) != IN_USER) {
if (__this_cpu_read(context_tracking.active)) {
trace_user_enter(0);
/*
* At this stage, only low level arch entry code remains and
* then we'll run in userspace. We can assume there won't be
* any RCU read-side critical section until the next call to
* user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
* on the tick.
*/
vtime_user_enter(current);
rcu_user_enter();
}
/*
* At this stage, only low level arch entry code remains and
* then we'll run in userspace. We can assume there won't be
* any RCU read-side critical section until the next call to
* user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
* on the tick.
* Even if context tracking is disabled on this CPU, because it's outside
* the full dynticks mask for example, we still have to keep track of the
* context transitions and states to prevent inconsistency on those of
* other CPUs.
* If a task triggers an exception in userspace, sleeps in the exception
* handler and then migrates to another CPU, that new CPU must know where
* the exception returns to by the time we call exception_exit().
* This information can only be provided by the previous CPU when it called
* exception_enter().
* OTOH we can spare the calls to vtime and RCU when context_tracking.active
* is false because we know that CPU is not tickless.
*/
vtime_user_enter(current);
rcu_user_enter();
__this_cpu_write(context_tracking.state, IN_USER);
}
local_irq_restore(flags);
@ -87,10 +113,9 @@ void user_enter(void)
*/
void __sched notrace preempt_schedule_context(void)
{
struct thread_info *ti = current_thread_info();
enum ctx_state prev_ctx;
if (likely(ti->preempt_count || irqs_disabled()))
if (likely(!preemptible()))
return;
/*
@ -112,8 +137,8 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context);
#endif /* CONFIG_PREEMPT */
/**
* user_exit - Inform the context tracking that the CPU is
* exiting userspace mode and entering the kernel.
* context_tracking_user_exit - Inform the context tracking that the CPU is
* exiting userspace mode and entering the kernel.
*
* This function must be called after we entered the kernel from userspace
* before any use of RCU read side critical section. This potentially includes
@ -122,7 +147,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context);
* This call supports re-entrancy. This way it can be called from any exception
* handler without needing to know if we came from userspace or not.
*/
void user_exit(void)
void context_tracking_user_exit(void)
{
unsigned long flags;
@ -131,38 +156,22 @@ void user_exit(void)
local_irq_save(flags);
if (__this_cpu_read(context_tracking.state) == IN_USER) {
/*
* We are going to run code that may use RCU. Inform
* RCU core about that (ie: we may need the tick again).
*/
rcu_user_exit();
vtime_user_exit(current);
if (__this_cpu_read(context_tracking.active)) {
/*
* We are going to run code that may use RCU. Inform
* RCU core about that (ie: we may need the tick again).
*/
rcu_user_exit();
vtime_user_exit(current);
trace_user_exit(0);
}
__this_cpu_write(context_tracking.state, IN_KERNEL);
}
local_irq_restore(flags);
}
void guest_enter(void)
{
if (vtime_accounting_enabled())
vtime_guest_enter(current);
else
__guest_enter();
}
EXPORT_SYMBOL_GPL(guest_enter);
void guest_exit(void)
{
if (vtime_accounting_enabled())
vtime_guest_exit(current);
else
__guest_exit();
}
EXPORT_SYMBOL_GPL(guest_exit);
/**
* context_tracking_task_switch - context switch the syscall callbacks
* __context_tracking_task_switch - context switch the syscall callbacks
* @prev: the task that is being switched out
* @next: the task that is being switched in
*
@ -174,11 +183,19 @@ EXPORT_SYMBOL_GPL(guest_exit);
* migrate to some CPU that doesn't do the context tracking. As such the TIF
* flag may not be desired there.
*/
void context_tracking_task_switch(struct task_struct *prev,
struct task_struct *next)
void __context_tracking_task_switch(struct task_struct *prev,
struct task_struct *next)
{
if (__this_cpu_read(context_tracking.active)) {
clear_tsk_thread_flag(prev, TIF_NOHZ);
set_tsk_thread_flag(next, TIF_NOHZ);
}
clear_tsk_thread_flag(prev, TIF_NOHZ);
set_tsk_thread_flag(next, TIF_NOHZ);
}
#ifdef CONFIG_CONTEXT_TRACKING_FORCE
/*
 * Boot-time setup for CONFIG_CONTEXT_TRACKING_FORCE: enable context
 * tracking on every possible CPU via context_tracking_cpu_set().
 */
void __init context_tracking_init(void)
{
	int i;

	for_each_possible_cpu(i) {
		context_tracking_cpu_set(i);
	}
}
#endif

View File

@ -2510,13 +2510,11 @@ void __sched schedule_preempt_disabled(void)
*/
asmlinkage void __sched notrace preempt_schedule(void)
{
struct thread_info *ti = current_thread_info();
/*
* If there is a non-zero preempt_count or interrupts are disabled,
* we do not want to preempt the current task. Just return..
*/
if (likely(ti->preempt_count || irqs_disabled()))
if (likely(!preemptible()))
return;
do {

View File

@ -378,11 +378,8 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
void vtime_task_switch(struct task_struct *prev)
void vtime_common_task_switch(struct task_struct *prev)
{
if (!vtime_accounting_enabled())
return;
if (is_idle_task(prev))
vtime_account_idle(prev);
else
@ -404,11 +401,8 @@ void vtime_task_switch(struct task_struct *prev)
* vtime_account().
*/
#ifndef __ARCH_HAS_VTIME_ACCOUNT
void vtime_account_irq_enter(struct task_struct *tsk)
void vtime_common_account_irq_enter(struct task_struct *tsk)
{
if (!vtime_accounting_enabled())
return;
if (!in_interrupt()) {
/*
* If we interrupted user, context_tracking_in_user()
@ -428,7 +422,7 @@ void vtime_account_irq_enter(struct task_struct *tsk)
}
vtime_account_system(tsk);
}
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
@ -559,12 +553,6 @@ static void cputime_adjust(struct task_cputime *curr,
{
cputime_t rtime, stime, utime, total;
if (vtime_accounting_enabled()) {
*ut = curr->utime;
*st = curr->stime;
return;
}
stime = curr->stime;
total = stime + curr->utime;
@ -664,23 +652,17 @@ static void __vtime_account_system(struct task_struct *tsk)
void vtime_account_system(struct task_struct *tsk)
{
if (!vtime_accounting_enabled())
return;
write_seqlock(&tsk->vtime_seqlock);
__vtime_account_system(tsk);
write_sequnlock(&tsk->vtime_seqlock);
}
void vtime_account_irq_exit(struct task_struct *tsk)
void vtime_gen_account_irq_exit(struct task_struct *tsk)
{
if (!vtime_accounting_enabled())
return;
write_seqlock(&tsk->vtime_seqlock);
__vtime_account_system(tsk);
if (context_tracking_in_user())
tsk->vtime_snap_whence = VTIME_USER;
__vtime_account_system(tsk);
write_sequnlock(&tsk->vtime_seqlock);
}
@ -688,12 +670,8 @@ void vtime_account_user(struct task_struct *tsk)
{
cputime_t delta_cpu;
if (!vtime_accounting_enabled())
return;
delta_cpu = get_vtime_delta(tsk);
write_seqlock(&tsk->vtime_seqlock);
delta_cpu = get_vtime_delta(tsk);
tsk->vtime_snap_whence = VTIME_SYS;
account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
write_sequnlock(&tsk->vtime_seqlock);
@ -701,22 +679,27 @@ void vtime_account_user(struct task_struct *tsk)
void vtime_user_enter(struct task_struct *tsk)
{
if (!vtime_accounting_enabled())
return;
write_seqlock(&tsk->vtime_seqlock);
tsk->vtime_snap_whence = VTIME_USER;
__vtime_account_system(tsk);
tsk->vtime_snap_whence = VTIME_USER;
write_sequnlock(&tsk->vtime_seqlock);
}
/*
 * Enter guest context: run __vtime_account_system() for @tsk and then set
 * PF_VCPU, both inside the write side of tsk->vtime_seqlock.
 *
 * NOTE(review): the lock and the flush use @tsk while the flag is set on
 * current; presumably callers always pass current. Confirm.
 */
void vtime_guest_enter(struct task_struct *tsk)
{
/*
* The flags must be updated under the lock with
* the vtime_snap flush and update.
* That enforces a right ordering and update sequence
* synchronization against the reader (task_gtime())
* that can thus safely catch up with a tickless delta.
*/
write_seqlock(&tsk->vtime_seqlock);
__vtime_account_system(tsk);
/* Set only after the flush above, still within the write section. */
current->flags |= PF_VCPU;
write_sequnlock(&tsk->vtime_seqlock);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);
void vtime_guest_exit(struct task_struct *tsk)
{
@ -725,6 +708,7 @@ void vtime_guest_exit(struct task_struct *tsk)
current->flags &= ~PF_VCPU;
write_sequnlock(&tsk->vtime_seqlock);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
void vtime_account_idle(struct task_struct *tsk)
{
@ -733,11 +717,6 @@ void vtime_account_idle(struct task_struct *tsk)
account_idle_time(delta_cpu);
}
/*
 * Report whether vtime accounting is enabled. This is a plain forwarder:
 * the answer is whatever context_tracking_active() returns.
 */
bool vtime_accounting_enabled(void)
{
return context_tracking_active();
}
void arch_vtime_task_switch(struct task_struct *prev)
{
write_seqlock(&prev->vtime_seqlock);

View File

@ -105,7 +105,6 @@ config NO_HZ_FULL
select RCU_USER_QS
select RCU_NOCB_CPU
select VIRT_CPU_ACCOUNTING_GEN
select CONTEXT_TRACKING_FORCE
select IRQ_WORK
help
Adaptively try to shutdown the tick whenever possible, even when

View File

@ -121,7 +121,7 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
BUG_ON(bits > 32);
WARN_ON(!irqs_disabled());
read_sched_clock = read;
sched_clock_mask = (1 << bits) - 1;
sched_clock_mask = (1ULL << bits) - 1;
cd.rate = rate;
/* calculate the mult/shift to convert counter ticks to ns. */

View File

@ -23,6 +23,7 @@
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
#include <linux/perf_event.h>
#include <linux/context_tracking.h>
#include <asm/irq_regs.h>
@ -148,8 +149,8 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
}
#ifdef CONFIG_NO_HZ_FULL
static cpumask_var_t nohz_full_mask;
bool have_nohz_full_mask;
cpumask_var_t tick_nohz_full_mask;
bool tick_nohz_full_running;
static bool can_stop_full_tick(void)
{
@ -182,7 +183,8 @@ static bool can_stop_full_tick(void)
* Don't allow the user to think they can get
* full NO_HZ with this machine.
*/
WARN_ONCE(1, "NO_HZ FULL will not work with unstable sched clock");
WARN_ONCE(tick_nohz_full_running,
"NO_HZ FULL will not work with unstable sched clock");
return false;
}
#endif
@ -196,7 +198,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
* Re-evaluate the need for the tick on the current CPU
* and restart it if necessary.
*/
void tick_nohz_full_check(void)
void __tick_nohz_full_check(void)
{
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
@ -210,7 +212,7 @@ void tick_nohz_full_check(void)
/*
 * Handler taking a struct irq_work; @work itself is not used. Presumably
 * this is the callback of the per-CPU nohz_full_kick_work; verify against
 * its initializer.
 *
 * NOTE(review): both tick_nohz_full_check() and __tick_nohz_full_check()
 * are called back to back. This looks like the pre- and post-rename lines
 * of a diff both being kept; confirm which single call is intended.
 */
static void nohz_full_kick_work_func(struct irq_work *work)
{
tick_nohz_full_check();
__tick_nohz_full_check();
}
static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
@ -229,7 +231,7 @@ void tick_nohz_full_kick(void)
/*
 * Cross-CPU callback handed to smp_call_function_many() by
 * tick_nohz_full_kick_all(). @info is not used (that caller passes NULL).
 *
 * NOTE(review): both tick_nohz_full_check() and __tick_nohz_full_check()
 * are called back to back. This looks like the pre- and post-rename lines
 * of a diff both being kept; confirm which single call is intended.
 */
static void nohz_full_kick_ipi(void *info)
{
tick_nohz_full_check();
__tick_nohz_full_check();
}
/*
@ -238,11 +240,11 @@ static void nohz_full_kick_ipi(void *info)
*/
void tick_nohz_full_kick_all(void)
{
if (!have_nohz_full_mask)
if (!tick_nohz_full_running)
return;
preempt_disable();
smp_call_function_many(nohz_full_mask,
smp_call_function_many(tick_nohz_full_mask,
nohz_full_kick_ipi, NULL, false);
preempt_enable();
}
@ -252,7 +254,7 @@ void tick_nohz_full_kick_all(void)
* It might need the tick due to per task/process properties:
* perf events, posix cpu timers, ...
*/
void tick_nohz_task_switch(struct task_struct *tsk)
void __tick_nohz_task_switch(struct task_struct *tsk)
{
unsigned long flags;
@ -268,31 +270,23 @@ void tick_nohz_task_switch(struct task_struct *tsk)
local_irq_restore(flags);
}
/*
 * Tell whether @cpu is set in nohz_full_mask.
 *
 * Returns 0 without looking at the mask while have_nohz_full_mask is false;
 * otherwise returns the result of cpumask_test_cpu(). The flag check comes
 * first because the mask is only allocated by the setup paths that also set
 * have_nohz_full_mask.
 */
int tick_nohz_full_cpu(int cpu)
{
if (!have_nohz_full_mask)
return 0;
return cpumask_test_cpu(cpu, nohz_full_mask);
}
/* Parse the boot-time nohz CPU list from the kernel parameters. */
static int __init tick_nohz_full_setup(char *str)
{
int cpu;
alloc_bootmem_cpumask_var(&nohz_full_mask);
if (cpulist_parse(str, nohz_full_mask) < 0) {
alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
return 1;
}
cpu = smp_processor_id();
if (cpumask_test_cpu(cpu, nohz_full_mask)) {
if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
cpumask_clear_cpu(cpu, nohz_full_mask);
cpumask_clear_cpu(cpu, tick_nohz_full_mask);
}
have_nohz_full_mask = true;
tick_nohz_full_running = true;
return 1;
}
@ -310,7 +304,7 @@ static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
* If we handle the timekeeping duty for full dynticks CPUs,
* we can't safely shutdown that CPU.
*/
if (have_nohz_full_mask && tick_do_timer_cpu == cpu)
if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
return NOTIFY_BAD;
break;
}
@ -329,14 +323,14 @@ static int tick_nohz_init_all(void)
int err = -1;
#ifdef CONFIG_NO_HZ_FULL_ALL
if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) {
if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
return err;
}
err = 0;
cpumask_setall(nohz_full_mask);
cpumask_clear_cpu(smp_processor_id(), nohz_full_mask);
have_nohz_full_mask = true;
cpumask_setall(tick_nohz_full_mask);
cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
tick_nohz_full_running = true;
#endif
return err;
}
@ -345,17 +339,18 @@ void __init tick_nohz_init(void)
{
int cpu;
if (!have_nohz_full_mask) {
if (!tick_nohz_full_running) {
if (tick_nohz_init_all() < 0)
return;
}
for_each_cpu(cpu, tick_nohz_full_mask)
context_tracking_cpu_set(cpu);
cpu_notifier(tick_nohz_cpu_down_callback, 0);
cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask);
cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask);
pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
}
#else
#define have_nohz_full_mask (0)
#endif
/*
@ -733,7 +728,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
return false;
}
if (have_nohz_full_mask) {
if (tick_nohz_full_enabled()) {
/*
* Keep the tick alive to guarantee timekeeping progression
* if there are full dynticks CPUs around