2012-03-29 01:11:12 +08:00
|
|
|
#ifndef _ASM_X86_SWITCH_TO_H
|
|
|
|
#define _ASM_X86_SWITCH_TO_H
|
|
|
|
|
|
|
|
struct task_struct; /* one of the stranger aspects of C forward declarations */
|
2013-08-06 06:02:39 +08:00
|
|
|
/*
 * Entered from the switch_to() macros below with the outgoing task as
 * @prev and the incoming task as @next.  The pointer it returns is what
 * switch_to() stores in its "last" argument.
 */
__visible struct task_struct *__switch_to(struct task_struct *prev,
					   struct task_struct *next);
|
2012-03-29 01:11:12 +08:00
|
|
|
struct tss_struct;
|
|
|
|
/*
 * Not referenced elsewhere in this header.
 * NOTE(review): presumably the slow-path companion of __switch_to() for
 * extra per-task state; confirm against the definition.
 */
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
		      struct tss_struct *tss);
|
|
|
|
|
2016-08-11 17:35:23 +08:00
|
|
|
/*
 * prepare_switch_to - touch the next task's stack before switching to it.
 * @prev: task being switched out (not used by this helper)
 * @next: task being switched in; one byte at next->thread.sp is read
 *
 * This runs on the previous thread's stack.  Without CONFIG_VMAP_STACK the
 * function body is empty.
 */
static inline void prepare_switch_to(struct task_struct *prev,
				     struct task_struct *next)
{
#ifdef CONFIG_VMAP_STACK
	/*
	 * If we switch to a stack that has a top-level paging entry
	 * that is not present in the current mm, the resulting #PF
	 * will be promoted to a double-fault and we'll panic.  Probe
	 * the new stack now so that vmalloc_fault can fix up the page
	 * tables if needed.  This can only happen if we use a stack
	 * in vmap space.
	 *
	 * We assume that the stack is aligned so that it never spans
	 * more than one top-level paging entry.
	 *
	 * To minimize cache pollution, just follow the stack pointer.
	 */
	READ_ONCE(*(unsigned char *)next->thread.sp);
#endif
}
|
|
|
|
|
2016-08-14 00:38:18 +08:00
|
|
|
/*
 * Data that thread.sp points to while a task is switched out.  Both
 * switch_to() variants push the frame pointer immediately before they
 * store the stack pointer, so the saved frame pointer is the first word.
 */
struct inactive_task_frame {
	unsigned long bp;	/* frame pointer pushed by switch_to() */
};
|
|
|
|
|
2012-03-29 01:11:12 +08:00
|
|
|
#ifdef CONFIG_X86_32
|
|
|
|
|
|
|
|
/*
 * Stack-protector hooks spliced into the 32-bit switch_to() asm.
 *
 * With CONFIG_CC_STACKPROTECTOR, __switch_canary copies the incoming
 * task's stack_canary field into the per-CPU stack_canary.canary slot,
 * using EBX as scratch.  __switch_canary_oparam and
 * __switch_canary_iparam add the matching output and input operands.
 * Without the option all three expand to nothing.
 */
#ifdef CONFIG_CC_STACKPROTECTOR
#define __switch_canary							\
	"movl %P[task_canary](%[next]), %%ebx\n\t"			\
	"movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
#define __switch_canary_oparam						\
	, [stack_canary] "=m" (stack_canary.canary)
#define __switch_canary_iparam						\
	, [task_canary] "i" (offsetof(struct task_struct, stack_canary))
#else	/* CC_STACKPROTECTOR */
#define __switch_canary
#define __switch_canary_oparam
#define __switch_canary_iparam
#endif	/* CC_STACKPROTECTOR */
|
|
|
|
|
|
|
|
/*
 * switch_to - 32-bit context switch from @prev to @next.
 * @prev: task being switched out; its ESP and resume address (label 1)
 *        are stored in prev->thread.sp and prev->thread.ip
 * @next: task being switched in; ESP is loaded from next->thread.sp and
 *        next->thread.ip is pushed so that __switch_to() returns to it
 * @last: lvalue that receives the return value of __switch_to() (EAX)
 *
 * Original comment, kept for history: "Saving eflags is important. It
 * switches not only IOPL between tasks, it also protects other tasks
 * from NT leaking through sysenter etc."
 * NOTE(review): the asm below does not push or pop EFLAGS, so that
 * statement looks stale; confirm and drop it if so.
 */
#define switch_to(prev, next, last)						\
do {										\
	/*									\
	 * Context-switching clobbers all registers, so we clobber		\
	 * them explicitly, via unused output variables.			\
	 * (EAX and EBP are not listed because EBP is saved/restored		\
	 * explicitly for wchan access and EAX is the return value of		\
	 * __switch_to())							\
	 */									\
	unsigned long ebx, ecx, edx, esi, edi;					\
										\
	prepare_switch_to(prev, next);						\
										\
	asm volatile("pushl %%ebp\n\t"			/* save EBP */		\
		     "movl %%esp,%[prev_sp]\n\t"	/* save ESP */		\
		     "movl %[next_sp],%%esp\n\t"	/* restore ESP */	\
		     "movl $1f,%[prev_ip]\n\t"		/* save EIP */		\
		     "pushl %[next_ip]\n\t"		/* restore EIP */	\
		     __switch_canary						\
		     "jmp __switch_to\n"		/* regparm call */	\
		     "1:\t"							\
		     "popl %%ebp\n\t"			/* restore EBP */	\
										\
		     /* output parameters */					\
		     : [prev_sp] "=m" ((prev)->thread.sp),			\
		       [prev_ip] "=m" ((prev)->thread.ip),			\
		       "=a" (last),						\
										\
		       /* clobbered output registers: */			\
		       "=b" (ebx), "=c" (ecx), "=d" (edx),			\
		       "=S" (esi), "=D" (edi)					\
										\
		       __switch_canary_oparam					\
										\
		       /* input parameters: */					\
		     : [next_sp] "m" ((next)->thread.sp),			\
		       [next_ip] "m" ((next)->thread.ip),			\
										\
		       /* regparm parameters for __switch_to(): */		\
		       [prev] "a" (prev),					\
		       [next] "d" (next)					\
										\
		       __switch_canary_iparam					\
										\
		     : /* reloaded segment registers */				\
		       "memory");						\
} while (0)
|
|
|
|
|
|
|
|
#else /* CONFIG_X86_32 */
|
|
|
|
|
|
|
|
/* frame pointer must be last for get_wchan */
|
x86/sched/64: Don't save flags on context switch (reinstated)
This reinstates the following commit:
2c7577a75837 ("sched/x86_64: Don't save flags on context switch")
which was reverted in:
512255a2ad2c ("Revert 'sched/x86_64: Don't save flags on context switch'")
Historically, Linux has always saved and restored EFLAGS across
context switches. As far as I know, the only reason to do this
is because of the NT flag. In particular, if something calls
switch_to() with the NT flag set, then we don't want to leak the
NT flag into a different task that might try to IRET and fail
because NT is set.
Before this commit:
8c7aa698baca ("x86_64, entry: Filter RFLAGS.NT on entry from userspace")
we could run system call bodies with NT set. This would be a DoS or possibly
privilege escalation hole if scheduling in such a system call would leak
NT into a different task.
Importantly, we don't need to worry about NT being set while
preemptible or across page faults. The only way we can schedule
due to preemption or a page fault is in an interrupt entry that
nests inside the SYSENTER prologue. The CPU will clear NT when
entering through an interrupt gate, so we won't schedule with NT
set.
The only other interesting flags are IOPL and AC. Allowing
switch_to() to change IOPL has no effect, as the value loaded
during kernel execution doesn't matter at all except between a
SYSENTER entry and the subsequent PUSHF, and anything that
interrupts in that window will restore IOPL on return.
If we call __switch_to() with AC set, we have bigger problems.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/d4440fdc2a89247bffb7c003d2a9a2952bd46827.1441146105.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-09-02 06:41:06 +08:00
|
|
|
/*
 * Asm prologue/epilogue for the 64-bit switch_to().
 *
 * SAVE_CONTEXT pushes RBP and then copies RSI (the "next" operand of
 * switch_to()) into RBP.  RESTORE_CONTEXT undoes it in reverse order:
 * RBP is copied back into RSI, then the saved RBP is popped.
 */
#define SAVE_CONTEXT    "pushq %%rbp ; movq %%rsi,%%rbp\n\t"
#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\t"
|
2012-03-29 01:11:12 +08:00
|
|
|
|
|
|
|
/*
 * Extra clobber-list entries appended after "memory" and "cc" in the
 * 64-bit switch_to() asm.  The leading comma is part of the macro, so it
 * must follow an existing clobber entry.
 */
#define __EXTRA_CLOBBER							\
	, "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11",		\
	  "r12", "r13", "r14", "r15", "flags"
|
2012-03-29 01:11:12 +08:00
|
|
|
|
|
|
|
/*
 * Stack-protector hooks spliced into the 64-bit switch_to() asm.
 *
 * With CONFIG_CC_STACKPROTECTOR, __switch_canary loads the stack_canary
 * field of the task that RSI points to and stores it in the per-CPU
 * irq_stack_union.stack_canary slot, using R8 as scratch.
 * __switch_canary_oparam and __switch_canary_iparam add the matching
 * output and input operands.  Without the option all three expand to
 * nothing.
 */
#ifdef CONFIG_CC_STACKPROTECTOR
#define __switch_canary							\
	"movq %P[task_canary](%%rsi),%%r8\n\t"				\
	"movq %%r8,"__percpu_arg([gs_canary])"\n\t"
#define __switch_canary_oparam						\
	, [gs_canary] "=m" (irq_stack_union.stack_canary)
#define __switch_canary_iparam						\
	, [task_canary] "i" (offsetof(struct task_struct, stack_canary))
#else	/* CC_STACKPROTECTOR */
#define __switch_canary
#define __switch_canary_oparam
#define __switch_canary_iparam
#endif	/* CC_STACKPROTECTOR */
|
|
|
|
|
x86/sched/64: Don't save flags on context switch (reinstated)
This reinstates the following commit:
2c7577a75837 ("sched/x86_64: Don't save flags on context switch")
which was reverted in:
512255a2ad2c ("Revert 'sched/x86_64: Don't save flags on context switch'")
Historically, Linux has always saved and restored EFLAGS across
context switches. As far as I know, the only reason to do this
is because of the NT flag. In particular, if something calls
switch_to() with the NT flag set, then we don't want to leak the
NT flag into a different task that might try to IRET and fail
because NT is set.
Before this commit:
8c7aa698baca ("x86_64, entry: Filter RFLAGS.NT on entry from userspace")
we could run system call bodies with NT set. This would be a DoS or possibly
privilege escalation hole if scheduling in such a system call would leak
NT into a different task.
Importantly, we don't need to worry about NT being set while
preemptible or across page faults. The only way we can schedule
due to preemption or a page fault is in an interrupt entry that
nests inside the SYSENTER prologue. The CPU will clear NT when
entering through an interrupt gate, so we won't schedule with NT
set.
The only other interesting flags are IOPL and AC. Allowing
switch_to() to change IOPL has no effect, as the value loaded
during kernel execution doesn't matter at all except between a
SYSENTER entry and the subsequent PUSHF, and anything that
interrupts in that window will restore IOPL on return.
If we call __switch_to() with AC set, we have bigger problems.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/d4440fdc2a89247bffb7c003d2a9a2952bd46827.1441146105.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-09-02 06:41:06 +08:00
|
|
|
/*
 * switch_to - 64-bit context switch from @prev to @next.
 * @prev: task being switched out; RSP is stored in prev->thread.sp
 * @next: task being switched in; RSP is loaded from next->thread.sp
 * @last: lvalue that receives the return value of __switch_to() (RAX)
 *
 * There is no need to save or restore flags, because flags are always
 * clean in kernel mode, with the possible exception of IOPL.  Kernel IOPL
 * has no effect.
 *
 * After __switch_to() returns, the current task is reloaded from per-CPU
 * data, RAX is copied to RDI, and if _TIF_FORK is set in that task's
 * thread_info flags control jumps to ret_from_fork instead of falling
 * through to RESTORE_CONTEXT.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and stays correct in an unbraced if/else body.
 */
#define switch_to(prev, next, last)						\
do {										\
	prepare_switch_to(prev, next);						\
										\
	asm volatile(SAVE_CONTEXT						\
		     "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */	\
		     "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */	\
		     "call __switch_to\n\t"					\
		     "movq "__percpu_arg([current_task])",%%rsi\n\t"		\
		     __switch_canary						\
		     "movq %P[thread_info](%%rsi),%%r8\n\t"			\
		     "movq %%rax,%%rdi\n\t"					\
		     "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t"		\
		     "jnz ret_from_fork\n\t"					\
		     RESTORE_CONTEXT						\
	     : "=a" (last)							\
	       __switch_canary_oparam						\
	     : [next] "S" (next), [prev] "D" (prev),				\
	       [threadrsp] "i" (offsetof(struct task_struct, thread.sp)),	\
	       [ti_flags] "i" (offsetof(struct thread_info, flags)),		\
	       [_tif_fork] "i" (_TIF_FORK),					\
	       [thread_info] "i" (offsetof(struct task_struct, stack)),	\
	       [current_task] "m" (current_task)				\
	       __switch_canary_iparam						\
	     : "memory", "cc" __EXTRA_CLOBBER);				\
} while (0)
|
|
|
|
|
|
|
|
#endif /* CONFIG_X86_32 */
|
|
|
|
|
|
|
|
#endif /* _ASM_X86_SWITCH_TO_H */
|