Merge branch 'WIP.x86-pti.entry-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 syscall entry code changes for PTI from Ingo Molnar:
 "The main changes here are Andy Lutomirski's changes to switch the
  x86-64 entry code to use the 'per CPU entry trampoline stack'. This,
  besides helping fix KASLR leaks (the pending Page Table Isolation
  (PTI) work), also robustifies the x86 entry code"

* 'WIP.x86-pti.entry-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (26 commits)
  x86/cpufeatures: Make CPU bugs sticky
  x86/paravirt: Provide a way to check for hypervisors
  x86/paravirt: Dont patch flush_tlb_single
  x86/entry/64: Make cpu_entry_area.tss read-only
  x86/entry: Clean up the SYSENTER_stack code
  x86/entry/64: Remove the SYSENTER stack canary
  x86/entry/64: Move the IST stacks into struct cpu_entry_area
  x86/entry/64: Create a per-CPU SYSCALL entry trampoline
  x86/entry/64: Return to userspace from the trampoline stack
  x86/entry/64: Use a per-CPU trampoline stack for IDT entries
  x86/espfix/64: Stop assuming that pt_regs is on the entry stack
  x86/entry/64: Separate cpu_current_top_of_stack from TSS.sp0
  x86/entry: Remap the TSS into the CPU entry area
  x86/entry: Move SYSENTER_stack to the beginning of struct tss_struct
  x86/dumpstack: Handle stack overflow on all stacks
  x86/entry: Fix assumptions that the HW TSS is at the beginning of cpu_tss
  x86/kasan/64: Teach KASAN about the cpu_entry_area
  x86/mm/fixmap: Generalize the GDT fixmap mechanism, introduce struct cpu_entry_area
  x86/entry/gdt: Put per-CPU GDT remaps in ascending order
  x86/dumpstack: Add get_stack_info() support for the SYSENTER stack
  ...
This commit is contained in:
Linus Torvalds 2017-12-18 08:59:15 -08:00
commit 64a48099b3
40 changed files with 689 additions and 284 deletions

View File

@ -941,7 +941,8 @@ ENTRY(debug)
movl %esp, %eax # pt_regs pointer movl %esp, %eax # pt_regs pointer
/* Are we currently on the SYSENTER stack? */ /* Are we currently on the SYSENTER stack? */
PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) movl PER_CPU_VAR(cpu_entry_area), %ecx
addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
cmpl $SIZEOF_SYSENTER_stack, %ecx cmpl $SIZEOF_SYSENTER_stack, %ecx
jb .Ldebug_from_sysenter_stack jb .Ldebug_from_sysenter_stack
@ -984,7 +985,8 @@ ENTRY(nmi)
movl %esp, %eax # pt_regs pointer movl %esp, %eax # pt_regs pointer
/* Are we currently on the SYSENTER stack? */ /* Are we currently on the SYSENTER stack? */
PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) movl PER_CPU_VAR(cpu_entry_area), %ecx
addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
cmpl $SIZEOF_SYSENTER_stack, %ecx cmpl $SIZEOF_SYSENTER_stack, %ecx
jb .Lnmi_from_sysenter_stack jb .Lnmi_from_sysenter_stack

View File

@ -140,6 +140,64 @@ END(native_usergs_sysret64)
* with them due to bugs in both AMD and Intel CPUs. * with them due to bugs in both AMD and Intel CPUs.
*/ */
.pushsection .entry_trampoline, "ax"
/*
* The code in here gets remapped into cpu_entry_area's trampoline. This means
* that the assembler and linker have the wrong idea as to where this code
* lives (and, in fact, it's mapped more than once, so it's not even at a
* fixed address). So we can't reference any symbols outside the entry
* trampoline and expect it to work.
*
* Instead, we carefully abuse %rip-relative addressing.
* _entry_trampoline(%rip) refers to the start of the (remapped) entry
* trampoline. We can thus find cpu_entry_area with this macro:
*/
#define CPU_ENTRY_AREA \
_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
#define RSP_SCRATCH CPU_ENTRY_AREA_SYSENTER_stack + \
SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
/*
 * SYSCALL entry stub that lives in the .entry_trampoline section.
 *
 * It executes swapgs, parks the user RSP in RSP_SCRATCH, loads RSP from
 * the sp1 slot of the TSS inside cpu_entry_area, pushes a simulated IRET
 * frame (ss, sp, flags, cs, ip) followed by the user RDI, and finally
 * jumps to entry_SYSCALL_64_stage2 through a register.
 */
ENTRY(entry_SYSCALL_64_trampoline)
UNWIND_HINT_EMPTY
swapgs
/* Stash the user RSP. */
movq %rsp, RSP_SCRATCH
/*
 * Load the top of the task stack into RSP. The value is read from
 * tss.sp1 via the %rip-relative CPU_ENTRY_AREA expression, so no symbol
 * outside the trampoline is referenced.
 */
movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
/* Start building the simulated IRET frame. */
pushq $__USER_DS /* pt_regs->ss */
pushq RSP_SCRATCH /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
/*
* x86 lacks a near absolute jump, and we can't jump to the real
* entry text with a relative jump. We could push the target
* address and then use retq, but this destroys the pipeline on
* many CPUs (wasting over 20 cycles on Sandy Bridge). Instead,
* spill RDI and restore it in a second-stage trampoline.
*/
/* User RDI goes on top of the frame; stage2 pops it again. */
pushq %rdi
/* RDI now carries the absolute address of the second stage. */
movq $entry_SYSCALL_64_stage2, %rdi
jmp *%rdi
END(entry_SYSCALL_64_trampoline)
.popsection
/*
 * Second stage of the SYSCALL trampoline, placed after the
 * .entry_trampoline section has been popped. It is entered with the user
 * RDI on top of the stack (pushed by entry_SYSCALL_64_trampoline just
 * before its indirect jump).
 */
ENTRY(entry_SYSCALL_64_stage2)
UNWIND_HINT_EMPTY
/* Undo the RDI spill done by the first stage. */
popq %rdi
/* The IRET-style frame is already on the stack, so join the common path here. */
jmp entry_SYSCALL_64_after_hwframe
END(entry_SYSCALL_64_stage2)
ENTRY(entry_SYSCALL_64) ENTRY(entry_SYSCALL_64)
UNWIND_HINT_EMPTY UNWIND_HINT_EMPTY
/* /*
@ -330,8 +388,24 @@ syscall_return_via_sysret:
popq %rsi /* skip rcx */ popq %rsi /* skip rcx */
popq %rdx popq %rdx
popq %rsi popq %rsi
/*
* Now all regs are restored except RSP and RDI.
* Save old stack pointer and switch to trampoline stack.
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
pushq RSP-RDI(%rdi) /* RSP */
pushq (%rdi) /* RDI */
/*
* We are on the trampoline stack. All regs except RDI are live.
* We can do future final exit work right here.
*/
popq %rdi popq %rdi
movq RSP-ORIG_RAX(%rsp), %rsp popq %rsp
USERGS_SYSRET64 USERGS_SYSRET64
END(entry_SYSCALL_64) END(entry_SYSCALL_64)
@ -466,12 +540,13 @@ END(irq_entries_start)
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY #ifdef CONFIG_DEBUG_ENTRY
pushfq pushq %rax
testl $X86_EFLAGS_IF, (%rsp) SAVE_FLAGS(CLBR_RAX)
testl $X86_EFLAGS_IF, %eax
jz .Lokay_\@ jz .Lokay_\@
ud2 ud2
.Lokay_\@: .Lokay_\@:
addq $8, %rsp popq %rax
#endif #endif
.endm .endm
@ -563,6 +638,13 @@ END(irq_entries_start)
/* 0(%rsp): ~(interrupt number) */ /* 0(%rsp): ~(interrupt number) */
.macro interrupt func .macro interrupt func
cld cld
testb $3, CS-ORIG_RAX(%rsp)
jz 1f
SWAPGS
call switch_to_thread_stack
1:
ALLOC_PT_GPREGS_ON_STACK ALLOC_PT_GPREGS_ON_STACK
SAVE_C_REGS SAVE_C_REGS
SAVE_EXTRA_REGS SAVE_EXTRA_REGS
@ -572,12 +654,8 @@ END(irq_entries_start)
jz 1f jz 1f
/* /*
* IRQ from user mode. Switch to kernel gsbase and inform context * IRQ from user mode.
* tracking that we're in kernel mode. *
*/
SWAPGS
/*
* We need to tell lockdep that IRQs are off. We can't do this until * We need to tell lockdep that IRQs are off. We can't do this until
* we fix gsbase, and we should do it before enter_from_user_mode * we fix gsbase, and we should do it before enter_from_user_mode
* (which can take locks). Since TRACE_IRQS_OFF idempotent, * (which can take locks). Since TRACE_IRQS_OFF idempotent,
@ -630,10 +708,41 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
ud2 ud2
1: 1:
#endif #endif
SWAPGS
POP_EXTRA_REGS POP_EXTRA_REGS
POP_C_REGS popq %r11
addq $8, %rsp /* skip regs->orig_ax */ popq %r10
popq %r9
popq %r8
popq %rax
popq %rcx
popq %rdx
popq %rsi
/*
* The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
* Save old stack pointer and switch to trampoline stack.
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
/* Copy the IRET frame to the trampoline stack. */
pushq 6*8(%rdi) /* SS */
pushq 5*8(%rdi) /* RSP */
pushq 4*8(%rdi) /* EFLAGS */
pushq 3*8(%rdi) /* CS */
pushq 2*8(%rdi) /* RIP */
/* Push user RDI on the trampoline stack. */
pushq (%rdi)
/*
* We are on the trampoline stack. All regs except RDI are live.
* We can do future final exit work right here.
*/
/* Restore RDI. */
popq %rdi
SWAPGS
INTERRUPT_RETURN INTERRUPT_RETURN
@ -829,7 +938,33 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
/* /*
* Exception entry points. * Exception entry points.
*/ */
#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
/*
* Switch to the thread stack. This is called with the IRET frame and
* orig_ax on the stack. (That is, RDI..R12 are not on the stack and
* space has not been allocated for them.)
*/
ENTRY(switch_to_thread_stack)
UNWIND_HINT_FUNC
/* Spill RDI so it can be used as a pointer to the old stack. */
pushq %rdi
/* RDI = old stack: (%rdi) is the saved RDI, 8(%rdi) the return address. */
movq %rsp, %rdi
/* Continue on the stack named by cpu_current_top_of_stack. */
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
/* Copy the IRET frame, orig_ax and the return address, highest slot first. */
pushq 7*8(%rdi) /* regs->ss */
pushq 6*8(%rdi) /* regs->rsp */
pushq 5*8(%rdi) /* regs->eflags */
pushq 4*8(%rdi) /* regs->cs */
pushq 3*8(%rdi) /* regs->ip */
pushq 2*8(%rdi) /* regs->orig_ax */
pushq 8(%rdi) /* return address */
UNWIND_HINT_FUNC
/* Reload the caller's RDI from the old stack before returning. */
movq (%rdi), %rdi
/* Returns through the copy of the return address on the new stack. */
ret
END(switch_to_thread_stack)
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym) ENTRY(\sym)
@ -848,11 +983,12 @@ ENTRY(\sym)
ALLOC_PT_GPREGS_ON_STACK ALLOC_PT_GPREGS_ON_STACK
.if \paranoid .if \paranoid < 2
.if \paranoid == 1
testb $3, CS(%rsp) /* If coming from userspace, switch stacks */ testb $3, CS(%rsp) /* If coming from userspace, switch stacks */
jnz 1f jnz .Lfrom_usermode_switch_stack_\@
.endif .endif
.if \paranoid
call paranoid_entry call paranoid_entry
.else .else
call error_entry call error_entry
@ -894,20 +1030,15 @@ ENTRY(\sym)
jmp error_exit jmp error_exit
.endif .endif
.if \paranoid == 1 .if \paranoid < 2
/* /*
* Paranoid entry from userspace. Switch stacks and treat it * Entry from userspace. Switch stacks and treat it
* as a normal entry. This means that paranoid handlers * as a normal entry. This means that paranoid handlers
* run in real process context if user_mode(regs). * run in real process context if user_mode(regs).
*/ */
1: .Lfrom_usermode_switch_stack_\@:
call error_entry call error_entry
movq %rsp, %rdi /* pt_regs pointer */
call sync_regs
movq %rax, %rsp /* switch stack */
movq %rsp, %rdi /* pt_regs pointer */ movq %rsp, %rdi /* pt_regs pointer */
.if \has_error_code .if \has_error_code
@ -1170,6 +1301,14 @@ ENTRY(error_entry)
SWAPGS SWAPGS
.Lerror_entry_from_usermode_after_swapgs: .Lerror_entry_from_usermode_after_swapgs:
/* Put us onto the real thread stack. */
popq %r12 /* save return addr in %r12 */
movq %rsp, %rdi /* arg0 = pt_regs pointer */
call sync_regs
movq %rax, %rsp /* switch stack */
ENCODE_FRAME_POINTER
pushq %r12 /* put the saved return address back, now on the new stack */
/* /*
* We need to tell lockdep that IRQs are off. We can't do this until * We need to tell lockdep that IRQs are off. We can't do this until
* we fix gsbase, and we should do it before enter_from_user_mode * we fix gsbase, and we should do it before enter_from_user_mode

View File

@ -48,7 +48,7 @@
*/ */
ENTRY(entry_SYSENTER_compat) ENTRY(entry_SYSENTER_compat)
/* Interrupts are off on entry. */ /* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK SWAPGS
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/* /*
@ -306,8 +306,11 @@ ENTRY(entry_INT80_compat)
*/ */
movl %eax, %eax movl %eax, %eax
/* Construct struct pt_regs on stack (iret frame is already on stack) */
pushq %rax /* pt_regs->orig_ax */ pushq %rax /* pt_regs->orig_ax */
/* switch to thread stack expects orig_ax to be pushed */
call switch_to_thread_stack
pushq %rdi /* pt_regs->di */ pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */ pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */ pushq %rdx /* pt_regs->dx */

View File

@ -135,6 +135,8 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
set_bit(bit, (unsigned long *)cpu_caps_set); \ set_bit(bit, (unsigned long *)cpu_caps_set); \
} while (0) } while (0)
#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
/* /*
* Static testing of CPU features. Used the same as boot_cpu_has(). * Static testing of CPU features. Used the same as boot_cpu_has().

View File

@ -60,17 +60,10 @@ static inline struct desc_struct *get_current_gdt_rw(void)
return this_cpu_ptr(&gdt_page)->gdt; return this_cpu_ptr(&gdt_page)->gdt;
} }
/* Get the fixmap index for a specific processor */
static inline unsigned int get_cpu_gdt_ro_index(int cpu)
{
return FIX_GDT_REMAP_BEGIN + cpu;
}
/* Provide the fixmap address of the remapped GDT */ /* Provide the fixmap address of the remapped GDT */
static inline struct desc_struct *get_cpu_gdt_ro(int cpu) static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
{ {
unsigned int idx = get_cpu_gdt_ro_index(cpu); return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt;
return (struct desc_struct *)__fix_to_virt(idx);
} }
/* Provide the current read-only GDT */ /* Provide the current read-only GDT */
@ -185,7 +178,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr,
#endif #endif
} }
static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr)
{ {
struct desc_struct *d = get_cpu_gdt_rw(cpu); struct desc_struct *d = get_cpu_gdt_rw(cpu);
tss_desc tss; tss_desc tss;

View File

@ -44,6 +44,45 @@ extern unsigned long __FIXADDR_TOP;
PAGE_SIZE) PAGE_SIZE)
#endif #endif
/*
* cpu_entry_area is a percpu region in the fixmap that contains things
* needed by the CPU and early entry/exit code. Real types aren't used
* for all fields here to avoid circular header dependencies.
*
* Every field is a virtual alias of some other allocated backing store.
* There is no direct allocation of a struct cpu_entry_area.
*/
struct cpu_entry_area {
/* One page for the alias of this CPU's GDT. */
char gdt[PAGE_SIZE];
/*
* The GDT is just below SYSENTER_stack and thus serves (on x86_64) as
* a read-only guard page.
*/
struct SYSENTER_stack_page SYSENTER_stack_page;
/*
* On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
* we need task switches to work, and task switches write to the TSS.
*/
struct tss_struct tss;
/* One page for the remapped entry trampoline code. */
char entry_trampoline[PAGE_SIZE];
#ifdef CONFIG_X86_64
/*
* Exception stacks used for IST entries.
*
* In the future, this should have a separate slot for each stack
* with guard pages between them.
*/
char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
#endif
};
#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
extern void setup_cpu_entry_areas(void);
/* /*
* Here we define all the compile-time 'special' virtual * Here we define all the compile-time 'special' virtual
@ -101,8 +140,8 @@ enum fixed_addresses {
FIX_LNW_VRTC, FIX_LNW_VRTC,
#endif #endif
/* Fixmap entries to remap the GDTs, one per processor. */ /* Fixmap entries to remap the GDTs, one per processor. */
FIX_GDT_REMAP_BEGIN, FIX_CPU_ENTRY_AREA_TOP,
FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1, FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1,
#ifdef CONFIG_ACPI_APEI_GHES #ifdef CONFIG_ACPI_APEI_GHES
/* Used for GHES mapping from assorted contexts */ /* Used for GHES mapping from assorted contexts */
@ -191,5 +230,30 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
void __early_set_fixmap(enum fixed_addresses idx, void __early_set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t flags); phys_addr_t phys, pgprot_t flags);
/*
 * Fixmap index of page number 'page' inside the cpu_entry_area of 'cpu'.
 *
 * CPU 0, page 0 yields FIX_CPU_ENTRY_AREA_BOTTOM; the index gets smaller
 * as either 'cpu' or 'page' grows.
 * NOTE(review): presumably this mirrors fixmap indices growing toward
 * lower virtual addresses - confirm against __fix_to_virt().
 */
static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page)
{
/* Per-page indexing only works if the area is a whole number of pages. */
BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page;
}
/*
 * Fixmap index for the page that starts 'offset' bytes into the
 * cpu_entry_area of 'cpu'. The build fails unless 'offset' is a multiple
 * of PAGE_SIZE.
 */
#define __get_cpu_entry_area_offset_index(cpu, offset) ({ \
BUILD_BUG_ON(offset % PAGE_SIZE != 0); \
__get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE); \
})
/*
 * Fixmap index of the page at which member 'field' of struct
 * cpu_entry_area begins, for the given 'cpu'.
 */
#define get_cpu_entry_area_index(cpu, field) \
__get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field))
/*
 * Pointer to the cpu_entry_area of 'cpu', i.e. the fixmap virtual address
 * of that area's page 0.
 */
static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
{
return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0));
}
/* Pointer to the SYSENTER stack inside the cpu_entry_area of 'cpu'. */
static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu)
{
return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack;
}
#endif /* !__ASSEMBLY__ */ #endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_FIXMAP_H */ #endif /* _ASM_X86_FIXMAP_H */

View File

@ -20,16 +20,7 @@
#ifndef _ASM_X86_HYPERVISOR_H #ifndef _ASM_X86_HYPERVISOR_H
#define _ASM_X86_HYPERVISOR_H #define _ASM_X86_HYPERVISOR_H
#ifdef CONFIG_HYPERVISOR_GUEST /* x86 hypervisor types */
#include <asm/kvm_para.h>
#include <asm/x86_init.h>
#include <asm/xen/hypervisor.h>
/*
* x86 hypervisor information
*/
enum x86_hypervisor_type { enum x86_hypervisor_type {
X86_HYPER_NATIVE = 0, X86_HYPER_NATIVE = 0,
X86_HYPER_VMWARE, X86_HYPER_VMWARE,
@ -39,6 +30,12 @@ enum x86_hypervisor_type {
X86_HYPER_KVM, X86_HYPER_KVM,
}; };
#ifdef CONFIG_HYPERVISOR_GUEST
#include <asm/kvm_para.h>
#include <asm/x86_init.h>
#include <asm/xen/hypervisor.h>
struct hypervisor_x86 { struct hypervisor_x86 {
/* Hypervisor name */ /* Hypervisor name */
const char *name; const char *name;
@ -58,7 +55,15 @@ struct hypervisor_x86 {
extern enum x86_hypervisor_type x86_hyper_type; extern enum x86_hypervisor_type x86_hyper_type;
extern void init_hypervisor_platform(void); extern void init_hypervisor_platform(void);
/* Returns true when x86_hyper_type equals 'type'. */
static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
{
return x86_hyper_type == type;
}
#else #else
static inline void init_hypervisor_platform(void) { } static inline void init_hypervisor_platform(void) { }
/*
 * Variant used when CONFIG_HYPERVISOR_GUEST is not set: only
 * X86_HYPER_NATIVE is reported as matching.
 */
static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
{
return type == X86_HYPER_NATIVE;
}
#endif /* CONFIG_HYPERVISOR_GUEST */ #endif /* CONFIG_HYPERVISOR_GUEST */
#endif /* _ASM_X86_HYPERVISOR_H */ #endif /* _ASM_X86_HYPERVISOR_H */

View File

@ -142,6 +142,9 @@ static inline notrace unsigned long arch_local_irq_save(void)
swapgs; \ swapgs; \
sysretl sysretl
#ifdef CONFIG_DEBUG_ENTRY
/*
 * Copy the flags register into %rax by pushing it and popping it back.
 * The macro argument is accepted but not used by this definition.
 */
#define SAVE_FLAGS(x) pushfq; popq %rax
#endif
#else #else
#define INTERRUPT_RETURN iret #define INTERRUPT_RETURN iret
#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit #define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit

View File

@ -26,6 +26,7 @@ extern void die(const char *, struct pt_regs *,long);
extern int __must_check __die(const char *, struct pt_regs *, long); extern int __must_check __die(const char *, struct pt_regs *, long);
extern void show_stack_regs(struct pt_regs *regs); extern void show_stack_regs(struct pt_regs *regs);
extern void __show_regs(struct pt_regs *regs, int all); extern void __show_regs(struct pt_regs *regs, int all);
extern void show_iret_regs(struct pt_regs *regs);
extern unsigned long oops_begin(void); extern unsigned long oops_begin(void);
extern void oops_end(unsigned long, struct pt_regs *, int signr); extern void oops_end(unsigned long, struct pt_regs *, int signr);

View File

@ -927,6 +927,15 @@ extern void default_banner(void);
PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
CLBR_NONE, \ CLBR_NONE, \
jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
#ifdef CONFIG_DEBUG_ENTRY
/*
 * SAVE_FLAGS built on the paravirt ops: an indirect call to
 * pv_irq_ops.save_fl, bracketed by PV_SAVE_REGS/PV_RESTORE_REGS for
 * 'clobbers | CLBR_CALLEE_SAVE' and registered through
 * PARA_SITE/PARA_PATCH with the caller's 'clobbers'.
 * NOTE(review): the result is presumably returned in %rax, matching the
 * non-paravirt definition - confirm against the save_fl implementation.
 */
#define SAVE_FLAGS(clobbers) \
PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \
PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl); \
PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
#endif
#endif /* CONFIG_X86_32 */ #endif /* CONFIG_X86_32 */
#endif /* __ASSEMBLY__ */ #endif /* __ASSEMBLY__ */

View File

@ -163,9 +163,9 @@ enum cpuid_regs_idx {
extern struct cpuinfo_x86 boot_cpu_data; extern struct cpuinfo_x86 boot_cpu_data;
extern struct cpuinfo_x86 new_cpu_data; extern struct cpuinfo_x86 new_cpu_data;
extern struct tss_struct doublefault_tss; extern struct x86_hw_tss doublefault_tss;
extern __u32 cpu_caps_cleared[NCAPINTS]; extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
extern __u32 cpu_caps_set[NCAPINTS]; extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS];
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
@ -253,6 +253,11 @@ static inline void load_cr3(pgd_t *pgdir)
write_cr3(__sme_pa(pgdir)); write_cr3(__sme_pa(pgdir));
} }
/*
* Note that while the legacy 'TSS' name comes from 'Task State Segment',
* on modern x86 CPUs the TSS also holds information important to 64-bit mode,
* unrelated to the task-switch mechanism:
*/
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
/* This is the TSS defined by the hardware. */ /* This is the TSS defined by the hardware. */
struct x86_hw_tss { struct x86_hw_tss {
@ -305,7 +310,13 @@ struct x86_hw_tss {
struct x86_hw_tss { struct x86_hw_tss {
u32 reserved1; u32 reserved1;
u64 sp0; u64 sp0;
/*
* We store cpu_current_top_of_stack in sp1 so it's always accessible.
* Linux does not use ring 1, so sp1 is not otherwise needed.
*/
u64 sp1; u64 sp1;
u64 sp2; u64 sp2;
u64 reserved2; u64 reserved2;
u64 ist[7]; u64 ist[7];
@ -323,12 +334,22 @@ struct x86_hw_tss {
#define IO_BITMAP_BITS 65536 #define IO_BITMAP_BITS 65536
#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) #define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) #define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) #define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
#define INVALID_IO_BITMAP_OFFSET 0x8000 #define INVALID_IO_BITMAP_OFFSET 0x8000
/* Storage for the SYSENTER stack: 64 machine words. */
struct SYSENTER_stack {
unsigned long words[64];
};
/*
 * A SYSENTER stack with page alignment, so the structure can be placed
 * in (and mapped as) a page of its own.
 */
struct SYSENTER_stack_page {
struct SYSENTER_stack stack;
} __aligned(PAGE_SIZE);
struct tss_struct { struct tss_struct {
/* /*
* The hardware state: * The fixed hardware portion. This must not cross a page boundary
* at risk of violating the SDM's advice and potentially triggering
* errata.
*/ */
struct x86_hw_tss x86_tss; struct x86_hw_tss x86_tss;
@ -339,18 +360,9 @@ struct tss_struct {
* be within the limit. * be within the limit.
*/ */
unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
} __aligned(PAGE_SIZE);
#ifdef CONFIG_X86_32 DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
/*
* Space for the temporary SYSENTER stack.
*/
unsigned long SYSENTER_stack_canary;
unsigned long SYSENTER_stack[64];
#endif
} ____cacheline_aligned;
DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
/* /*
* sizeof(unsigned long) coming from an extra "long" at the end * sizeof(unsigned long) coming from an extra "long" at the end
@ -364,6 +376,9 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
#else
/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
#endif #endif
/* /*
@ -523,7 +538,7 @@ static inline void native_set_iopl_mask(unsigned mask)
static inline void static inline void
native_load_sp0(unsigned long sp0) native_load_sp0(unsigned long sp0)
{ {
this_cpu_write(cpu_tss.x86_tss.sp0, sp0); this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
} }
static inline void native_swapgs(void) static inline void native_swapgs(void)
@ -535,12 +550,12 @@ static inline void native_swapgs(void)
static inline unsigned long current_top_of_stack(void) static inline unsigned long current_top_of_stack(void)
{ {
#ifdef CONFIG_X86_64 /*
return this_cpu_read_stable(cpu_tss.x86_tss.sp0); * We can't read directly from tss.sp0: sp0 on x86_32 is special in
#else * and around vm86 mode and sp0 on x86_64 is special because of the
/* sp0 on x86_32 is special in and around vm86 mode. */ * entry trampoline.
*/
return this_cpu_read_stable(cpu_current_top_of_stack); return this_cpu_read_stable(cpu_current_top_of_stack);
#endif
} }
static inline bool on_thread_stack(void) static inline bool on_thread_stack(void)

View File

@ -16,6 +16,7 @@ enum stack_type {
STACK_TYPE_TASK, STACK_TYPE_TASK,
STACK_TYPE_IRQ, STACK_TYPE_IRQ,
STACK_TYPE_SOFTIRQ, STACK_TYPE_SOFTIRQ,
STACK_TYPE_SYSENTER,
STACK_TYPE_EXCEPTION, STACK_TYPE_EXCEPTION,
STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
}; };
@ -28,6 +29,8 @@ struct stack_info {
bool in_task_stack(unsigned long *stack, struct task_struct *task, bool in_task_stack(unsigned long *stack, struct task_struct *task,
struct stack_info *info); struct stack_info *info);
bool in_sysenter_stack(unsigned long *stack, struct stack_info *info);
int get_stack_info(unsigned long *stack, struct task_struct *task, int get_stack_info(unsigned long *stack, struct task_struct *task,
struct stack_info *info, unsigned long *visit_mask); struct stack_info *info, unsigned long *visit_mask);

View File

@ -79,10 +79,10 @@ do { \
static inline void refresh_sysenter_cs(struct thread_struct *thread) static inline void refresh_sysenter_cs(struct thread_struct *thread)
{ {
/* Only happens when SEP is enabled, no need to test "SEP"arately: */ /* Only happens when SEP is enabled, no need to test "SEP"arately: */
if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs)) if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs))
return; return;
this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs); this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs);
wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
} }
#endif #endif
@ -90,10 +90,12 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread)
/* This is used when switching tasks or entering/exiting vm86 mode. */ /* This is used when switching tasks or entering/exiting vm86 mode. */
static inline void update_sp0(struct task_struct *task) static inline void update_sp0(struct task_struct *task)
{ {
/* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
load_sp0(task->thread.sp0); load_sp0(task->thread.sp0);
#else #else
load_sp0(task_top_of_stack(task)); if (static_cpu_has(X86_FEATURE_XENPV))
load_sp0(task_top_of_stack(task));
#endif #endif
} }

View File

@ -207,7 +207,7 @@ static inline int arch_within_stack_frames(const void * const stack,
#else /* !__ASSEMBLY__ */ #else /* !__ASSEMBLY__ */
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
# define cpu_current_top_of_stack (cpu_tss + TSS_sp0) # define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
#endif #endif
#endif #endif

View File

@ -75,7 +75,6 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long);
dotraplinkage void do_stack_segment(struct pt_regs *, long); dotraplinkage void do_stack_segment(struct pt_regs *, long);
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
dotraplinkage void do_double_fault(struct pt_regs *, long); dotraplinkage void do_double_fault(struct pt_regs *, long);
asmlinkage struct pt_regs *sync_regs(struct pt_regs *);
#endif #endif
dotraplinkage void do_general_protection(struct pt_regs *, long); dotraplinkage void do_general_protection(struct pt_regs *, long);
dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);

View File

@ -7,6 +7,9 @@
#include <asm/ptrace.h> #include <asm/ptrace.h>
#include <asm/stacktrace.h> #include <asm/stacktrace.h>
#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip))
#define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET)
struct unwind_state { struct unwind_state {
struct stack_info stack_info; struct stack_info stack_info;
unsigned long stack_mask; unsigned long stack_mask;
@ -52,6 +55,10 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,
} }
#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
/*
* WARNING: The entire pt_regs may not be safe to dereference. In some cases,
* only the iret frame registers are accessible. Use with caution!
*/
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
{ {
if (unwind_done(state)) if (unwind_done(state))

View File

@ -93,4 +93,10 @@ void common(void) {
BLANK(); BLANK();
DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
/* Layout info for cpu_entry_area */
OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page);
DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
} }

View File

@ -47,13 +47,8 @@ void foo(void)
BLANK(); BLANK();
/* Offset from the sysenter stack to tss.sp0 */ /* Offset from the sysenter stack to tss.sp0 */
DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
offsetofend(struct tss_struct, SYSENTER_stack)); offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack));
/* Offset from cpu_tss to SYSENTER_stack */
OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
/* Size of SYSENTER_stack */
DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
#ifdef CONFIG_CC_STACKPROTECTOR #ifdef CONFIG_CC_STACKPROTECTOR
BLANK(); BLANK();

View File

@ -23,6 +23,9 @@ int main(void)
#ifdef CONFIG_PARAVIRT #ifdef CONFIG_PARAVIRT
OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
#ifdef CONFIG_DEBUG_ENTRY
OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl);
#endif
BLANK(); BLANK();
#endif #endif
@ -63,6 +66,7 @@ int main(void)
OFFSET(TSS_ist, tss_struct, x86_tss.ist); OFFSET(TSS_ist, tss_struct, x86_tss.ist);
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
BLANK(); BLANK();
#ifdef CONFIG_CC_STACKPROTECTOR #ifdef CONFIG_CC_STACKPROTECTOR

View File

@ -476,8 +476,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
return NULL; /* Not found */ return NULL; /* Not found */
} }
__u32 cpu_caps_cleared[NCAPINTS]; __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
__u32 cpu_caps_set[NCAPINTS]; __u32 cpu_caps_set[NCAPINTS + NBUGINTS];
void load_percpu_segment(int cpu) void load_percpu_segment(int cpu)
{ {
@ -490,27 +490,116 @@ void load_percpu_segment(int cpu)
load_stack_canary_segment(); load_stack_canary_segment();
} }
/* Setup the fixmap mapping only once per-processor */ #ifdef CONFIG_X86_32
static inline void setup_fixmap_gdt(int cpu) /* The 32-bit entry code needs to find cpu_entry_area. */
DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
#endif
#ifdef CONFIG_X86_64
/*
* Special IST stacks which the CPU switches to when it calls
* an IST-marked descriptor entry. Up to 7 stacks (hardware
* limit), all of them are 4K, except the debug stack which
* is 8K.
*/
static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
#endif
static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page,
SYSENTER_stack_storage);
/*
 * Install 'pages' consecutive pages, starting at per-CPU address 'ptr',
 * into the fixmap with protection 'prot'. The first page is placed at
 * fixmap index 'idx' and every following page at the next lower index.
 */
static void __init
set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
{
/* Each step advances one page in memory and moves one fixmap slot down. */
for ( ; pages; pages--, idx--, ptr += PAGE_SIZE)
__set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot);
}
/* Setup the fixmap mappings only once per-processor */
static void __init setup_cpu_entry_area(int cpu)
{ {
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
/* On 64-bit systems, we use a read-only fixmap GDT. */ extern char _entry_trampoline[];
pgprot_t prot = PAGE_KERNEL_RO;
/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
pgprot_t gdt_prot = PAGE_KERNEL_RO;
pgprot_t tss_prot = PAGE_KERNEL_RO;
#else #else
/* /*
* On native 32-bit systems, the GDT cannot be read-only because * On native 32-bit systems, the GDT cannot be read-only because
* our double fault handler uses a task gate, and entering through * our double fault handler uses a task gate, and entering through
* a task gate needs to change an available TSS to busy. If the GDT * a task gate needs to change an available TSS to busy. If the
* is read-only, that will triple fault. * GDT is read-only, that will triple fault. The TSS cannot be
* read-only because the CPU writes to it on task switches.
* *
* On Xen PV, the GDT must be read-only because the hypervisor requires * On Xen PV, the GDT must be read-only because the hypervisor
* it. * requires it.
*/ */
pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ? pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
PAGE_KERNEL_RO : PAGE_KERNEL; PAGE_KERNEL_RO : PAGE_KERNEL;
pgprot_t tss_prot = PAGE_KERNEL;
#endif #endif
__set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot); __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page),
per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1,
PAGE_KERNEL);
/*
* The Intel SDM says (Volume 3, 7.2.1):
*
* Avoid placing a page boundary in the part of the TSS that the
* processor reads during a task switch (the first 104 bytes). The
* processor may not correctly perform address translations if a
* boundary occurs in this area. During a task switch, the processor
* reads and writes into the first 104 bytes of each TSS (using
* contiguous physical addresses beginning with the physical address
* of the first byte of the TSS). So, after TSS access begins, if
* part of the 104 bytes is not physically contiguous, the processor
* will access incorrect information without generating a page-fault
* exception.
*
* There are also a lot of errata involving the TSS spanning a page
* boundary. Assert that we're not doing that.
*/
BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
&per_cpu(cpu_tss_rw, cpu),
sizeof(struct tss_struct) / PAGE_SIZE,
tss_prot);
#ifdef CONFIG_X86_32
per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
#endif
#ifdef CONFIG_X86_64
BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
BUILD_BUG_ON(sizeof(exception_stacks) !=
sizeof(((struct cpu_entry_area *)0)->exception_stacks));
set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks),
&per_cpu(exception_stacks, cpu),
sizeof(exception_stacks) / PAGE_SIZE,
PAGE_KERNEL);
__set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
__pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
#endif
}
void __init setup_cpu_entry_areas(void)
{
unsigned int cpu;
for_each_possible_cpu(cpu)
setup_cpu_entry_area(cpu);
} }
/* Load the original GDT from the per-cpu structure */ /* Load the original GDT from the per-cpu structure */
@ -747,7 +836,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
{ {
int i; int i;
for (i = 0; i < NCAPINTS; i++) { for (i = 0; i < NCAPINTS + NBUGINTS; i++) {
c->x86_capability[i] &= ~cpu_caps_cleared[i]; c->x86_capability[i] &= ~cpu_caps_cleared[i];
c->x86_capability[i] |= cpu_caps_set[i]; c->x86_capability[i] |= cpu_caps_set[i];
} }
@ -1250,7 +1339,7 @@ void enable_sep_cpu(void)
return; return;
cpu = get_cpu(); cpu = get_cpu();
tss = &per_cpu(cpu_tss, cpu); tss = &per_cpu(cpu_tss_rw, cpu);
/* /*
* We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
@ -1259,11 +1348,7 @@ void enable_sep_cpu(void)
tss->x86_tss.ss1 = __KERNEL_CS; tss->x86_tss.ss1 = __KERNEL_CS;
wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0);
wrmsr(MSR_IA32_SYSENTER_ESP,
(unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
0);
wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
put_cpu(); put_cpu();
@ -1357,25 +1442,19 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count); EXPORT_PER_CPU_SYMBOL(__preempt_count);
/*
* Special IST stacks which the CPU switches to when it calls
* an IST-marked descriptor entry. Up to 7 stacks (hardware
* limit), all of them are 4K, except the debug stack which
* is 8K.
*/
static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
/* May not be marked __init: used by software suspend */ /* May not be marked __init: used by software suspend */
void syscall_init(void) void syscall_init(void)
{ {
extern char _entry_trampoline[];
extern char entry_SYSCALL_64_trampoline[];
int cpu = smp_processor_id();
unsigned long SYSCALL64_entry_trampoline =
(unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
(entry_SYSCALL_64_trampoline - _entry_trampoline);
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
#ifdef CONFIG_IA32_EMULATION #ifdef CONFIG_IA32_EMULATION
wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
@ -1386,7 +1465,7 @@ void syscall_init(void)
* AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
*/ */
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else #else
wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
@ -1530,7 +1609,7 @@ void cpu_init(void)
if (cpu) if (cpu)
load_ucode_ap(); load_ucode_ap();
t = &per_cpu(cpu_tss, cpu); t = &per_cpu(cpu_tss_rw, cpu);
oist = &per_cpu(orig_ist, cpu); oist = &per_cpu(orig_ist, cpu);
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
@ -1569,7 +1648,7 @@ void cpu_init(void)
* set up and load the per-CPU TSS * set up and load the per-CPU TSS
*/ */
if (!oist->ist[0]) { if (!oist->ist[0]) {
char *estacks = per_cpu(exception_stacks, cpu); char *estacks = get_cpu_entry_area(cpu)->exception_stacks;
for (v = 0; v < N_EXCEPTION_STACKS; v++) { for (v = 0; v < N_EXCEPTION_STACKS; v++) {
estacks += exception_stack_sizes[v]; estacks += exception_stack_sizes[v];
@ -1580,7 +1659,7 @@ void cpu_init(void)
} }
} }
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
/* /*
* <= is required because the CPU will access up to * <= is required because the CPU will access up to
@ -1596,11 +1675,12 @@ void cpu_init(void)
enter_lazy_tlb(&init_mm, me); enter_lazy_tlb(&init_mm, me);
/* /*
* Initialize the TSS. Don't bother initializing sp0, as the initial * Initialize the TSS. sp0 points to the entry trampoline stack
* task never enters user mode. * regardless of what task is running.
*/ */
set_tss_desc(cpu, t); set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
load_TR_desc(); load_TR_desc();
load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
load_mm_ldt(&init_mm); load_mm_ldt(&init_mm);
@ -1612,7 +1692,6 @@ void cpu_init(void)
if (is_uv_system()) if (is_uv_system())
uv_cpu_init(); uv_cpu_init();
setup_fixmap_gdt(cpu);
load_fixmap_gdt(cpu); load_fixmap_gdt(cpu);
} }
@ -1622,7 +1701,7 @@ void cpu_init(void)
{ {
int cpu = smp_processor_id(); int cpu = smp_processor_id();
struct task_struct *curr = current; struct task_struct *curr = current;
struct tss_struct *t = &per_cpu(cpu_tss, cpu); struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
wait_for_master_cpu(cpu); wait_for_master_cpu(cpu);
@ -1657,12 +1736,12 @@ void cpu_init(void)
* Initialize the TSS. Don't bother initializing sp0, as the initial * Initialize the TSS. Don't bother initializing sp0, as the initial
* task never enters user mode. * task never enters user mode.
*/ */
set_tss_desc(cpu, t); set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
load_TR_desc(); load_TR_desc();
load_mm_ldt(&init_mm); load_mm_ldt(&init_mm);
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
#ifdef CONFIG_DOUBLEFAULT #ifdef CONFIG_DOUBLEFAULT
/* Set up doublefault TSS pointer in the GDT */ /* Set up doublefault TSS pointer in the GDT */
@ -1674,7 +1753,6 @@ void cpu_init(void)
fpu__init_cpu(); fpu__init_cpu();
setup_fixmap_gdt(cpu);
load_fixmap_gdt(cpu); load_fixmap_gdt(cpu);
} }
#endif #endif

View File

@ -50,25 +50,23 @@ static void doublefault_fn(void)
cpu_relax(); cpu_relax();
} }
struct tss_struct doublefault_tss __cacheline_aligned = { struct x86_hw_tss doublefault_tss __cacheline_aligned = {
.x86_tss = { .sp0 = STACK_START,
.sp0 = STACK_START, .ss0 = __KERNEL_DS,
.ss0 = __KERNEL_DS, .ldt = 0,
.ldt = 0, .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
.io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
.ip = (unsigned long) doublefault_fn, .ip = (unsigned long) doublefault_fn,
/* 0x2 bit is always set */ /* 0x2 bit is always set */
.flags = X86_EFLAGS_SF | 0x2, .flags = X86_EFLAGS_SF | 0x2,
.sp = STACK_START, .sp = STACK_START,
.es = __USER_DS, .es = __USER_DS,
.cs = __KERNEL_CS, .cs = __KERNEL_CS,
.ss = __KERNEL_DS, .ss = __KERNEL_DS,
.ds = __USER_DS, .ds = __USER_DS,
.fs = __KERNEL_PERCPU, .fs = __KERNEL_PERCPU,
.__cr3 = __pa_nodebug(swapper_pg_dir), .__cr3 = __pa_nodebug(swapper_pg_dir),
}
}; };
/* dummy for do_double_fault() call */ /* dummy for do_double_fault() call */

View File

@ -43,6 +43,24 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
return true; return true;
} }
/*
 * Test whether @stack points into the SYSENTER stack of the CPU this code
 * is currently running on (looked up with smp_processor_id()).
 *
 * Returns false and leaves @info untouched when @stack is outside the
 * half-open range [begin, end). Otherwise fills @info with
 * STACK_TYPE_SYSENTER and the stack bounds, and returns true.
 */
bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
{
struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id());
void *begin = ss;
/* One past the end of the struct, i.e. the exclusive upper bound. */
void *end = ss + 1;
if ((void *)stack < begin || (void *)stack >= end)
return false;
info->type = STACK_TYPE_SYSENTER;
info->begin = begin;
info->end = end;
/* No follow-on stack pointer is recorded for this stack type. */
info->next_sp = NULL;
return true;
}
static void printk_stack_address(unsigned long address, int reliable, static void printk_stack_address(unsigned long address, int reliable,
char *log_lvl) char *log_lvl)
{ {
@ -50,6 +68,28 @@ static void printk_stack_address(unsigned long address, int reliable,
printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
} }
/*
 * Print only the fields of @regs that make up the IRET frame:
 * CS:IP on one line, then SS:SP and EFLAGS on the next.
 *
 * The second printk() deliberately has no trailing newline: __show_regs()
 * calls this helper and then continues the same line with pr_cont() to
 * append ORIG_RAX.
 */
void show_iret_regs(struct pt_regs *regs)
{
printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip);
printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss,
regs->sp, regs->flags);
}
/*
 * Print as much of @regs as is known to lie on the stack described by
 * @info:
 *   - the whole pt_regs via __show_regs(regs, 0) when all of it is on the
 *     stack;
 *   - otherwise only the IRET frame, when the IRET_FRAME_SIZE bytes at
 *     regs + IRET_FRAME_OFFSET are on the stack;
 *   - otherwise nothing at all.
 */
static void show_regs_safe(struct stack_info *info, struct pt_regs *regs)
{
if (on_stack(info, regs, sizeof(*regs)))
__show_regs(regs, 0);
else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
IRET_FRAME_SIZE)) {
/*
* When an interrupt or exception occurs in entry code, the
* full pt_regs might not have been saved yet. In that case
* just print the iret frame.
*/
show_iret_regs(regs);
}
}
void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, char *log_lvl) unsigned long *stack, char *log_lvl)
{ {
@ -71,31 +111,35 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - task stack * - task stack
* - interrupt stack * - interrupt stack
* - HW exception stacks (double fault, nmi, debug, mce) * - HW exception stacks (double fault, nmi, debug, mce)
* - SYSENTER stack
* *
* x86-32 can have up to three stacks: * x86-32 can have up to four stacks:
* - task stack * - task stack
* - softirq stack * - softirq stack
* - hardirq stack * - hardirq stack
* - SYSENTER stack
*/ */
for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
const char *stack_name; const char *stack_name;
/* if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
* If we overflowed the task stack into a guard page, jump back /*
* to the bottom of the usable stack. * We weren't on a valid stack. It's possible that
*/ * we overflowed a valid stack into a guard page.
if (task_stack_page(task) - (void *)stack < PAGE_SIZE) * See if the next page up is valid so that we can
stack = task_stack_page(task); * generate some kind of backtrace if this happens.
*/
if (get_stack_info(stack, task, &stack_info, &visit_mask)) stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack);
break; if (get_stack_info(stack, task, &stack_info, &visit_mask))
break;
}
stack_name = stack_type_name(stack_info.type); stack_name = stack_type_name(stack_info.type);
if (stack_name) if (stack_name)
printk("%s <%s>\n", log_lvl, stack_name); printk("%s <%s>\n", log_lvl, stack_name);
if (regs && on_stack(&stack_info, regs, sizeof(*regs))) if (regs)
__show_regs(regs, 0); show_regs_safe(&stack_info, regs);
/* /*
* Scan the stack, printing any text addresses we find. At the * Scan the stack, printing any text addresses we find. At the
@ -119,7 +163,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
/* /*
* Don't print regs->ip again if it was already printed * Don't print regs->ip again if it was already printed
* by __show_regs() below. * by show_regs_safe() below.
*/ */
if (regs && stack == &regs->ip) if (regs && stack == &regs->ip)
goto next; goto next;
@ -155,8 +199,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
/* if the frame has entry regs, print them */ /* if the frame has entry regs, print them */
regs = unwind_get_entry_regs(&state); regs = unwind_get_entry_regs(&state);
if (regs && on_stack(&stack_info, regs, sizeof(*regs))) if (regs)
__show_regs(regs, 0); show_regs_safe(&stack_info, regs);
} }
if (stack_name) if (stack_name)

View File

@ -26,6 +26,9 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_SOFTIRQ) if (type == STACK_TYPE_SOFTIRQ)
return "SOFTIRQ"; return "SOFTIRQ";
if (type == STACK_TYPE_SYSENTER)
return "SYSENTER";
return NULL; return NULL;
} }
@ -93,6 +96,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (task != current) if (task != current)
goto unknown; goto unknown;
if (in_sysenter_stack(stack, info))
goto recursion_check;
if (in_hardirq_stack(stack, info)) if (in_hardirq_stack(stack, info))
goto recursion_check; goto recursion_check;

View File

@ -37,6 +37,9 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_IRQ) if (type == STACK_TYPE_IRQ)
return "IRQ"; return "IRQ";
if (type == STACK_TYPE_SYSENTER)
return "SYSENTER";
if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
return exception_stack_names[type - STACK_TYPE_EXCEPTION]; return exception_stack_names[type - STACK_TYPE_EXCEPTION];
@ -115,6 +118,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (in_irq_stack(stack, info)) if (in_irq_stack(stack, info))
goto recursion_check; goto recursion_check;
if (in_sysenter_stack(stack, info))
goto recursion_check;
goto unknown; goto unknown;
recursion_check: recursion_check:

View File

@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
* because the ->io_bitmap_max value must match the bitmap * because the ->io_bitmap_max value must match the bitmap
* contents: * contents:
*/ */
tss = &per_cpu(cpu_tss, get_cpu()); tss = &per_cpu(cpu_tss_rw, get_cpu());
if (turn_on) if (turn_on)
bitmap_clear(t->io_bitmap_ptr, from, num); bitmap_clear(t->io_bitmap_ptr, from, num);

View File

@ -219,18 +219,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
/* high bit used in ret_from_ code */ /* high bit used in ret_from_ code */
unsigned vector = ~regs->orig_ax; unsigned vector = ~regs->orig_ax;
/*
* NB: Unlike exception entries, IRQ entries do not reliably
* handle context tracking in the low-level entry code. This is
* because syscall entries execute briefly with IRQs on before
* updating context tracking state, so we can take an IRQ from
* kernel mode with CONTEXT_USER. The low-level entry code only
* updates the context if we came from user mode, so we won't
* switch to CONTEXT_KERNEL. We'll fix that once the syscall
* code is cleaned up enough that we can cleanly defer enabling
* IRQs.
*/
entering_irq(); entering_irq();
/* entering_irq() tells RCU that we're not quiescent. Check it. */ /* entering_irq() tells RCU that we're not quiescent. Check it. */

View File

@ -57,10 +57,10 @@ static inline void stack_overflow_check(struct pt_regs *regs)
if (regs->sp >= estack_top && regs->sp <= estack_bottom) if (regs->sp >= estack_top && regs->sp <= estack_bottom)
return; return;
WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n", WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n",
current->comm, curbase, regs->sp, current->comm, curbase, regs->sp,
irq_stack_top, irq_stack_bottom, irq_stack_top, irq_stack_bottom,
estack_top, estack_bottom); estack_top, estack_bottom, (void *)regs->ip);
if (sysctl_panic_on_stackoverflow) if (sysctl_panic_on_stackoverflow)
panic("low stack detected by irq handler - check messages\n"); panic("low stack detected by irq handler - check messages\n");

View File

@ -10,7 +10,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
@ -60,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr2);
PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3); PATCH_SITE(pv_mmu_ops, write_cr3);
PATCH_SITE(pv_mmu_ops, flush_tlb_single);
PATCH_SITE(pv_cpu_ops, wbinvd); PATCH_SITE(pv_cpu_ops, wbinvd);
#if defined(CONFIG_PARAVIRT_SPINLOCKS) #if defined(CONFIG_PARAVIRT_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):

View File

@ -47,7 +47,7 @@
* section. Since TSS's are completely CPU-local, we want them * section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong. * on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/ */
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = {
.x86_tss = { .x86_tss = {
/* /*
* .sp0 is only used when entering ring 0 from a lower * .sp0 is only used when entering ring 0 from a lower
@ -56,6 +56,16 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
* Poison it. * Poison it.
*/ */
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1, .sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
#ifdef CONFIG_X86_64
/*
* .sp1 is cpu_current_top_of_stack. The init task never
* runs user code, but cpu_current_top_of_stack should still
* be well defined before the first context switch.
*/
.sp1 = TOP_OF_INIT_STACK,
#endif
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
.ss0 = __KERNEL_DS, .ss0 = __KERNEL_DS,
.ss1 = __KERNEL_CS, .ss1 = __KERNEL_CS,
@ -71,11 +81,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
*/ */
.io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif #endif
#ifdef CONFIG_X86_32
.SYSENTER_stack_canary = STACK_END_MAGIC,
#endif
}; };
EXPORT_PER_CPU_SYMBOL(cpu_tss); EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
DEFINE_PER_CPU(bool, __tss_limit_invalid); DEFINE_PER_CPU(bool, __tss_limit_invalid);
EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
@ -104,7 +111,7 @@ void exit_thread(struct task_struct *tsk)
struct fpu *fpu = &t->fpu; struct fpu *fpu = &t->fpu;
if (bp) { if (bp) {
struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());
t->io_bitmap_ptr = NULL; t->io_bitmap_ptr = NULL;
clear_thread_flag(TIF_IO_BITMAP); clear_thread_flag(TIF_IO_BITMAP);

View File

@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *prev_fpu = &prev->fpu; struct fpu *prev_fpu = &prev->fpu;
struct fpu *next_fpu = &next->fpu; struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id(); int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu); struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */

View File

@ -69,9 +69,8 @@ void __show_regs(struct pt_regs *regs, int all)
unsigned int fsindex, gsindex; unsigned int fsindex, gsindex;
unsigned int ds, cs, es; unsigned int ds, cs, es;
printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip); show_iret_regs(regs);
printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
regs->sp, regs->flags);
if (regs->orig_ax != -1) if (regs->orig_ax != -1)
pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
else else
@ -88,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all)
printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
regs->r13, regs->r14, regs->r15); regs->r13, regs->r14, regs->r15);
if (!all)
return;
asm("movl %%ds,%0" : "=r" (ds)); asm("movl %%ds,%0" : "=r" (ds));
asm("movl %%cs,%0" : "=r" (cs)); asm("movl %%cs,%0" : "=r" (cs));
asm("movl %%es,%0" : "=r" (es)); asm("movl %%es,%0" : "=r" (es));
@ -98,9 +100,6 @@ void __show_regs(struct pt_regs *regs, int all)
rdmsrl(MSR_GS_BASE, gs); rdmsrl(MSR_GS_BASE, gs);
rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
if (!all)
return;
cr0 = read_cr0(); cr0 = read_cr0();
cr2 = read_cr2(); cr2 = read_cr2();
cr3 = __read_cr3(); cr3 = __read_cr3();
@ -400,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *prev_fpu = &prev->fpu; struct fpu *prev_fpu = &prev->fpu;
struct fpu *next_fpu = &next->fpu; struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id(); int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu); struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
this_cpu_read(irq_count) != -1); this_cpu_read(irq_count) != -1);
@ -462,6 +461,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* Switch the PDA and FPU contexts. * Switch the PDA and FPU contexts.
*/ */
this_cpu_write(current_task, next_p); this_cpu_write(current_task, next_p);
this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
/* Reload sp0. */ /* Reload sp0. */
update_sp0(next_p); update_sp0(next_p);

View File

@ -348,9 +348,15 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
/* /*
* If IRET takes a non-IST fault on the espfix64 stack, then we * If IRET takes a non-IST fault on the espfix64 stack, then we
* end up promoting it to a doublefault. In that case, modify * end up promoting it to a doublefault. In that case, take
* the stack to make it look like we just entered the #GP * advantage of the fact that we're not using the normal (TSS.sp0)
* handler from user space, similar to bad_iret. * stack right now. We can write a fake #GP(0) frame at TSS.sp0
* and then modify our own IRET frame so that, when we return,
* we land directly at the #GP(0) vector with the stack already
* set up according to its expectations.
*
* The net result is that our #GP handler will think that we
* entered from usermode with the bad user context.
* *
* No need for ist_enter here because we don't use RCU. * No need for ist_enter here because we don't use RCU.
*/ */
@ -358,13 +364,26 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
regs->cs == __KERNEL_CS && regs->cs == __KERNEL_CS &&
regs->ip == (unsigned long)native_irq_return_iret) regs->ip == (unsigned long)native_irq_return_iret)
{ {
struct pt_regs *normal_regs = task_pt_regs(current); struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
/* Fake a #GP(0) from userspace. */ /*
memmove(&normal_regs->ip, (void *)regs->sp, 5*8); * regs->sp points to the failing IRET frame on the
normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ * ESPFIX64 stack. Copy it to the entry stack. This fills
* in gpregs->ss through gpregs->ip.
*
*/
memmove(&gpregs->ip, (void *)regs->sp, 5*8);
gpregs->orig_ax = 0; /* Missing (lost) #GP error code */
/*
* Adjust our frame so that we return straight to the #GP
* vector with the expected RSP value. This is safe because
* we won't enable interrupts or schedule before we invoke
* general_protection, so nothing will clobber the stack
* frame we just set up.
*/
regs->ip = (unsigned long)general_protection; regs->ip = (unsigned long)general_protection;
regs->sp = (unsigned long)&normal_regs->orig_ax; regs->sp = (unsigned long)&gpregs->orig_ax;
return; return;
} }
@ -389,7 +408,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
* *
* Processors update CR2 whenever a page fault is detected. If a * Processors update CR2 whenever a page fault is detected. If a
* second page fault occurs while an earlier page fault is being * second page fault occurs while an earlier page fault is being
* deliv- ered, the faulting linear address of the second fault will * delivered, the faulting linear address of the second fault will
* overwrite the contents of CR2 (replacing the previous * overwrite the contents of CR2 (replacing the previous
* address). These updates to CR2 occur even if the page fault * address). These updates to CR2 occur even if the page fault
* results in a double fault or occurs during the delivery of a * results in a double fault or occurs during the delivery of a
@ -605,14 +624,15 @@ NOKPROBE_SYMBOL(do_int3);
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
/* /*
* Help handler running on IST stack to switch off the IST stack if the * Help handler running on a per-cpu (IST or entry trampoline) stack
* interrupted code was in user mode. The actual stack switch is done in * to switch to the normal thread stack if the interrupted code was in
* entry_64.S * user mode. The actual stack switch is done in entry_64.S
*/ */
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
{ {
struct pt_regs *regs = task_pt_regs(current); struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
*regs = *eregs; if (regs != eregs)
*regs = *eregs;
return regs; return regs;
} }
NOKPROBE_SYMBOL(sync_regs); NOKPROBE_SYMBOL(sync_regs);
@ -628,13 +648,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
/* /*
* This is called from entry_64.S early in handling a fault * This is called from entry_64.S early in handling a fault
* caused by a bad iret to user mode. To handle the fault * caused by a bad iret to user mode. To handle the fault
* correctly, we want move our stack frame to task_pt_regs * correctly, we want to move our stack frame to where it would
* and we want to pretend that the exception came from the * be had we entered directly on the entry stack (rather than
* iret target. * just below the IRET frame) and we want to pretend that the
* exception came from the IRET target.
*/ */
struct bad_iret_stack *new_stack = struct bad_iret_stack *new_stack =
container_of(task_pt_regs(current), (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
struct bad_iret_stack, regs);
/* Copy the IRET target to the new stack. */ /* Copy the IRET target to the new stack. */
memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
@ -795,14 +815,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
debug_stack_usage_dec(); debug_stack_usage_dec();
exit: exit:
#if defined(CONFIG_X86_32)
/*
* This is the most likely code path that involves non-trivial use
* of the SYSENTER stack. Check that we haven't overrun it.
*/
WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC,
"Overran or corrupted SYSENTER stack\n");
#endif
ist_exit(regs); ist_exit(regs);
} }
NOKPROBE_SYMBOL(do_debug); NOKPROBE_SYMBOL(do_debug);
@ -929,6 +941,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
void __init trap_init(void) void __init trap_init(void)
{ {
/* Init cpu_entry_area before IST entries are set up */
setup_cpu_entry_areas();
idt_setup_traps(); idt_setup_traps();
/* /*

View File

@ -253,22 +253,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
return NULL; return NULL;
} }
static bool stack_access_ok(struct unwind_state *state, unsigned long addr, static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
size_t len) size_t len)
{ {
struct stack_info *info = &state->stack_info; struct stack_info *info = &state->stack_info;
void *addr = (void *)_addr;
/* if (!on_stack(info, addr, len) &&
* If the address isn't on the current stack, switch to the next one. (get_stack_info(addr, state->task, info, &state->stack_mask)))
* return false;
* We may have to traverse multiple stacks to deal with the possibility
* that info->next_sp could point to an empty stack and the address
* could be on a subsequent stack.
*/
while (!on_stack(info, (void *)addr, len))
if (get_stack_info(info->next_sp, state->task, info,
&state->stack_mask))
return false;
return true; return true;
} }
@ -283,42 +276,32 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
return true; return true;
} }
#define REGS_SIZE (sizeof(struct pt_regs))
#define SP_OFFSET (offsetof(struct pt_regs, sp))
#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip))
#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip))
static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
unsigned long *ip, unsigned long *sp, bool full) unsigned long *ip, unsigned long *sp)
{ {
size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE; struct pt_regs *regs = (struct pt_regs *)addr;
size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET;
struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE);
if (IS_ENABLED(CONFIG_X86_64)) { /* x86-32 support will be more complicated due to the &regs->sp hack */
if (!stack_access_ok(state, addr, regs_size)) BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32));
return false;
*ip = regs->ip; if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
*sp = regs->sp;
return true;
}
if (!stack_access_ok(state, addr, sp_offset))
return false; return false;
*ip = regs->ip; *ip = regs->ip;
*sp = regs->sp;
return true;
}
if (user_mode(regs)) { static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr,
if (!stack_access_ok(state, addr + sp_offset, unsigned long *ip, unsigned long *sp)
REGS_SIZE - SP_OFFSET)) {
return false; struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET;
*sp = regs->sp; if (!stack_access_ok(state, addr, IRET_FRAME_SIZE))
} else return false;
*sp = (unsigned long)&regs->sp;
*ip = regs->ip;
*sp = regs->sp;
return true; return true;
} }
@ -327,7 +310,6 @@ bool unwind_next_frame(struct unwind_state *state)
unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; unsigned long ip_p, sp, orig_ip, prev_sp = state->sp;
enum stack_type prev_type = state->stack_info.type; enum stack_type prev_type = state->stack_info.type;
struct orc_entry *orc; struct orc_entry *orc;
struct pt_regs *ptregs;
bool indirect = false; bool indirect = false;
if (unwind_done(state)) if (unwind_done(state))
@ -435,7 +417,7 @@ bool unwind_next_frame(struct unwind_state *state)
break; break;
case ORC_TYPE_REGS: case ORC_TYPE_REGS:
if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) { if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
orc_warn("can't dereference registers at %p for ip %pB\n", orc_warn("can't dereference registers at %p for ip %pB\n",
(void *)sp, (void *)orig_ip); (void *)sp, (void *)orig_ip);
goto done; goto done;
@ -447,20 +429,14 @@ bool unwind_next_frame(struct unwind_state *state)
break; break;
case ORC_TYPE_REGS_IRET: case ORC_TYPE_REGS_IRET:
if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) { if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
orc_warn("can't dereference iret registers at %p for ip %pB\n", orc_warn("can't dereference iret registers at %p for ip %pB\n",
(void *)sp, (void *)orig_ip); (void *)sp, (void *)orig_ip);
goto done; goto done;
} }
ptregs = container_of((void *)sp, struct pt_regs, ip); state->regs = (void *)sp - IRET_FRAME_OFFSET;
if ((unsigned long)ptregs >= prev_sp && state->full_regs = false;
on_stack(&state->stack_info, ptregs, REGS_SIZE)) {
state->regs = ptregs;
state->full_regs = false;
} else
state->regs = NULL;
state->signal = true; state->signal = true;
break; break;
@ -553,8 +529,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
} }
if (get_stack_info((unsigned long *)state->sp, state->task, if (get_stack_info((unsigned long *)state->sp, state->task,
&state->stack_info, &state->stack_mask)) &state->stack_info, &state->stack_mask)) {
return; /*
* We weren't on a valid stack. It's possible that
* we overflowed a valid stack into a guard page.
* See if the next page up is valid so that we can
* generate some kind of backtrace if this happens.
*/
void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
if (get_stack_info(next_page, state->task, &state->stack_info,
&state->stack_mask))
return;
}
/* /*
* The caller can provide the address of the first frame directly * The caller can provide the address of the first frame directly

View File

@ -107,6 +107,15 @@ SECTIONS
SOFTIRQENTRY_TEXT SOFTIRQENTRY_TEXT
*(.fixup) *(.fixup)
*(.gnu.warning) *(.gnu.warning)
#ifdef CONFIG_X86_64
. = ALIGN(PAGE_SIZE);
_entry_trampoline = .;
*(.entry_trampoline)
. = ALIGN(PAGE_SIZE);
ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
#endif
/* End of text section */ /* End of text section */
_etext = .; _etext = .;
} :text = 0x9090 } :text = 0x9090

View File

@ -2302,7 +2302,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
* processors. See 22.2.4. * processors. See 22.2.4.
*/ */
vmcs_writel(HOST_TR_BASE, vmcs_writel(HOST_TR_BASE,
(unsigned long)this_cpu_ptr(&cpu_tss)); (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
/* /*

View File

@ -107,10 +107,10 @@ static void delay_mwaitx(unsigned long __loops)
delay = min_t(u64, MWAITX_MAX_LOOPS, loops); delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
/* /*
* Use cpu_tss as a cacheline-aligned, seldomly * Use cpu_tss_rw as a cacheline-aligned, seldomly
* accessed per-cpu variable as the monitor target. * accessed per-cpu variable as the monitor target.
*/ */
__monitorx(raw_cpu_ptr(&cpu_tss), 0, 0); __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
/* /*
* AMD, like Intel, supports the EAX hint and EAX=0xf * AMD, like Intel, supports the EAX hint and EAX=0xf

View File

@ -277,6 +277,7 @@ void __init kasan_early_init(void)
void __init kasan_init(void) void __init kasan_init(void)
{ {
int i; int i;
void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;
#ifdef CONFIG_KASAN_INLINE #ifdef CONFIG_KASAN_INLINE
register_die_notifier(&kasan_die_notifier); register_die_notifier(&kasan_die_notifier);
@ -329,8 +330,23 @@ void __init kasan_init(void)
(unsigned long)kasan_mem_to_shadow(_end), (unsigned long)kasan_mem_to_shadow(_end),
early_pfn_to_nid(__pa(_stext))); early_pfn_to_nid(__pa(_stext)));
shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM);
shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
PAGE_SIZE);
shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE);
shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
PAGE_SIZE);
kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
(void *)KASAN_SHADOW_END); shadow_cpu_entry_begin);
kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
(unsigned long)shadow_cpu_entry_end, 0);
kasan_populate_zero_shadow(shadow_cpu_entry_end, (void *)KASAN_SHADOW_END);
load_cr3(init_top_pgt); load_cr3(init_top_pgt);
__flush_tlb_all(); __flush_tlb_all();

View File

@ -152,17 +152,19 @@ static void do_fpu_end(void)
static void fix_processor_context(void) static void fix_processor_context(void)
{ {
int cpu = smp_processor_id(); int cpu = smp_processor_id();
struct tss_struct *t = &per_cpu(cpu_tss, cpu);
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
struct desc_struct *desc = get_cpu_gdt_rw(cpu); struct desc_struct *desc = get_cpu_gdt_rw(cpu);
tss_desc tss; tss_desc tss;
#endif #endif
set_tss_desc(cpu, t); /*
* This just modifies memory; should not be /*
* necessary. But... This is necessary, because * We need to reload TR, which requires that we change the
* 386 hardware has concept of busy TSS or some * GDT entry to indicate "available" first.
* similar stupidity. *
*/ * XXX: This could probably all be replaced by a call to
* force_reload_TR().
*/
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc)); memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));

View File

@ -826,7 +826,7 @@ static void xen_load_sp0(unsigned long sp0)
mcs = xen_mc_entry(0); mcs = xen_mc_entry(0);
MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
xen_mc_issue(PARAVIRT_LAZY_CPU); xen_mc_issue(PARAVIRT_LAZY_CPU);
this_cpu_write(cpu_tss.x86_tss.sp0, sp0); this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
} }
void xen_set_iopl_mask(unsigned mask) void xen_set_iopl_mask(unsigned mask)

View File

@ -2272,7 +2272,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif #endif
case FIX_TEXT_POKE0: case FIX_TEXT_POKE0:
case FIX_TEXT_POKE1: case FIX_TEXT_POKE1:
case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END: case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM:
/* All local page mappings */ /* All local page mappings */
pte = pfn_pte(phys, prot); pte = pfn_pte(phys, prot);
break; break;