From 4188f063e3694ccbf2a2044cf17cc325f91e458f Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Mon, 25 Jun 2018 14:38:08 +0200 Subject: [PATCH 1/9] x86/mm: Get rid of KERN_CONT in show_fault_oops() KERN_CONT leads to split lines in kernel output and complicates useful changes to printk like printing context before each line. Only acceptable use of continuations is basically boot-time testing. Get rid of it. Signed-off-by: Dmitry Vyukov Reviewed-by: Sergey Senozhatsky Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180625123808.227417-1-dvyukov@gmail.com [ Removed unnecessary parentheses and prettified the printk statement. ] Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9a84a0d08727..ee85766e6329 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -671,13 +671,9 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, printk(smep_warning, from_kuid(&init_user_ns, current_uid())); } - printk(KERN_ALERT "BUG: unable to handle kernel "); - if (address < PAGE_SIZE) - printk(KERN_CONT "NULL pointer dereference"); - else - printk(KERN_CONT "paging request"); - - printk(KERN_CONT " at %px\n", (void *) address); + pr_alert("BUG: unable to handle kernel %s at %px\n", + address < PAGE_SIZE ? "NULL pointer dereference" : "paging request", + (void *)address); dump_pagetable(address); } From 236f0cd286b67291796362feeac4f342900cfa82 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 25 Jun 2018 04:21:59 -0600 Subject: [PATCH 2/9] x86/entry/32: Add explicit 'l' instruction suffix Omitting suffixes from instructions in AT&T mode is bad practice when operand size cannot be determined by the assembler from register operands, and is likely going to be warned about by upstream GAS in the future (mine does already). Add the single missing 'l' suffix here. Signed-off-by: Jan Beulich Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5B30C24702000078001CD6A6@prv1-mh.provo.novell.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 2582881d19ce..c371bfee137a 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -477,7 +477,7 @@ ENTRY(entry_SYSENTER_32) * whereas POPF does not.) */ addl $PT_EFLAGS-PT_DS, %esp /* point esp at pt_regs->flags */ - btr $X86_EFLAGS_IF_BIT, (%esp) + btrl $X86_EFLAGS_IF_BIT, (%esp) popfl /* From 0e311d237d7f3022b7dafb639b42541bfb42fe94 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 25 Jun 2018 13:24:27 +0300 Subject: [PATCH 3/9] x86/mm: Don't free P4D table when it is folded at runtime When the P4D page table layer is folded at runtime, the p4d_free() should do nothing, the same as in . It seems this bug should cause double-free in efi_call_phys_epilog(), but I don't know how to trigger that code path, so I can't confirm that by testing. Signed-off-by: Andrey Ryabinin Reviewed-by: Kirill A. Shutemov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org # 4.17 Fixes: 98219dda2ab5 ("x86/mm: Fold p4d page table layer at runtime") Link: http://lkml.kernel.org/r/20180625102427.15015-1-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgalloc.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index ada6410fd2ec..fbd578daa66e 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -184,6 +184,9 @@ static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr) static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) { + if (!pgtable_l5_enabled()) + return; + BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); free_page((unsigned long)p4d); } From 22cd978e598618e82c3c3348d2069184f6884182 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 26 Jun 2018 22:45:52 -0700 Subject: [PATCH 4/9] x86/entry/64/compat: Fix "x86/entry/64/compat: Preserve r8-r11 in int $0x80" Commit: 8bb2610bc496 ("x86/entry/64/compat: Preserve r8-r11 in int $0x80") was busted: my original patch had a minor conflict with some of the nospec changes, but "git apply" is very clever and silently accepted the patch by making the same changes to a different function in the same file. There was obviously a huge offset, but "git apply" for some reason doesn't feel any need to say so. Move the changes to the correct function. Now the test_syscall_vdso_32 selftests passes. If anyone cares to observe the original problem, try applying the patch at: https://lore.kernel.org/lkml/d4c4d9985fbe64f8c9e19291886453914b48caee.1523975710.git.luto@kernel.org/raw to the kernel at 316d097c4cd4e7f2ef50c40cff2db266593c4ec4: - "git am" and "git apply" accept the patch without any complaints at all - "patch -p1" at least prints out a message about the huge offset. Reported-by: zhijianx.li@intel.com Signed-off-by: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dan Williams Cc: Dave Hansen Cc: David Woodhouse Cc: Greg Kroah-Hartman Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org #v4.17+ Fixes: 8bb2610bc496 ("x86/entry/64/compat: Preserve r8-r11 in int $0x80") Link: http://lkml.kernel.org/r/6012b922485401bc42676e804171ded262fc2ef2.1530078306.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 9de7f1e1dede..7d0df78db727 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -84,13 +84,13 @@ ENTRY(entry_SYSENTER_compat) pushq %rdx /* pt_regs->dx */ pushq %rcx /* pt_regs->cx */ pushq $-ENOSYS /* pt_regs->ax */ - pushq %r8 /* pt_regs->r8 */ + pushq $0 /* pt_regs->r8 = 0 */ xorl %r8d, %r8d /* nospec r8 */ - pushq %r9 /* pt_regs->r9 */ + pushq $0 /* pt_regs->r9 = 0 */ xorl %r9d, %r9d /* nospec r9 */ - pushq %r10 /* pt_regs->r10 */ + pushq $0 /* pt_regs->r10 = 0 */ xorl %r10d, %r10d /* nospec r10 */ - pushq %r11 /* pt_regs->r11 */ + pushq $0 /* pt_regs->r11 = 0 */ xorl %r11d, %r11d /* nospec r11 */ pushq %rbx /* pt_regs->rbx */ xorl %ebx, %ebx /* nospec rbx */ @@ -374,13 +374,13 @@ ENTRY(entry_INT80_compat) pushq %rcx /* pt_regs->cx */ xorl %ecx, %ecx /* nospec cx */ pushq $-ENOSYS /* pt_regs->ax */ - pushq $0 /* pt_regs->r8 = 0 */ + pushq %r8 /* pt_regs->r8 */ xorl %r8d, %r8d /* nospec r8 */ - pushq $0 /* pt_regs->r9 = 0 */ + pushq %r9 /* pt_regs->r9 */ xorl %r9d, %r9d /* nospec r9 */ - pushq $0 /* pt_regs->r10 = 0 */ + pushq %r10 /* pt_regs->r10*/ xorl %r10d, %r10d /* nospec r10 */ - pushq $0 /* pt_regs->r11 = 0 */ + pushq %r11 /* pt_regs->r11 */ xorl %r11d, %r11d /* nospec r11 */ pushq %rbx /* pt_regs->rbx */ xorl %ebx, %ebx /* nospec rbx */ From ec348020566009d3da9b99f07c05814d13969c78 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 26 Jun 2018 22:17:17 -0700 Subject: [PATCH 5/9] selftests/x86/sigreturn/64: Fix spurious failures on AMD CPUs When I wrote the sigreturn test, I didn't realize that AMD's busted IRET behavior was different from Intel's busted IRET behavior: On AMD CPUs, the CPU leaks the high 32 bits of the kernel stack pointer to certain userspace contexts. Gee, thanks. There's very little the kernel can do about it. Modify the test so it passes. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/86e7fd3564497f657de30a36da4505799eebef01.1530076529.git.luto@kernel.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/sigreturn.c | 46 ++++++++++++++++--------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/x86/sigreturn.c b/tools/testing/selftests/x86/sigreturn.c index 246145b84a12..2559e2c01793 100644 --- a/tools/testing/selftests/x86/sigreturn.c +++ b/tools/testing/selftests/x86/sigreturn.c @@ -612,19 +612,38 @@ static int test_valid_sigreturn(int cs_bits, bool use_16bit_ss, int force_ss) greg_t req = requested_regs[i], res = resulting_regs[i]; if (i == REG_TRAPNO || i == REG_IP) continue; /* don't care */ - if (i == REG_SP) { - printf("\tSP: %llx -> %llx\n", (unsigned long long)req, - (unsigned long long)res); + if (i == REG_SP) { /* - * In many circumstances, the high 32 bits of rsp - * are zeroed. For example, we could be a real - * 32-bit program, or we could hit any of a number - * of poorly-documented IRET or segmented ESP - * oddities. If this happens, it's okay. + * If we were using a 16-bit stack segment, then + * the kernel is a bit stuck: IRET only restores + * the low 16 bits of ESP/RSP if SS is 16-bit. + * The kernel uses a hack to restore bits 31:16, + * but that hack doesn't help with bits 63:32. + * On Intel CPUs, bits 63:32 end up zeroed, and, on + * AMD CPUs, they leak the high bits of the kernel + * espfix64 stack pointer. There's very little that + * the kernel can do about it. + * + * Similarly, if we are returning to a 32-bit context, + * the CPU will often lose the high 32 bits of RSP. */ - if (res == (req & 0xFFFFFFFF)) - continue; /* OK; not expected to work */ + + if (res == req) + continue; + + if (cs_bits != 64 && ((res ^ req) & 0xFFFFFFFF) == 0) { + printf("[NOTE]\tSP: %llx -> %llx\n", + (unsigned long long)req, + (unsigned long long)res); + continue; + } + + printf("[FAIL]\tSP mismatch: requested 0x%llx; got 0x%llx\n", + (unsigned long long)requested_regs[i], + (unsigned long long)resulting_regs[i]); + nerrs++; + continue; } bool ignore_reg = false; @@ -663,13 +682,6 @@ static int test_valid_sigreturn(int cs_bits, bool use_16bit_ss, int force_ss) } if (requested_regs[i] != resulting_regs[i] && !ignore_reg) { - /* - * SP is particularly interesting here. The - * usual cause of failures is that we hit the - * nasty IRET case of returning to a 16-bit SS, - * in which case bits 16:31 of the *kernel* - * stack pointer persist in ESP. - */ printf("[FAIL]\tReg %d mismatch: requested 0x%llx; got 0x%llx\n", i, (unsigned long long)requested_regs[i], (unsigned long long)resulting_regs[i]); From e8a445dea219c32727016af14f847d2e8f7ebec8 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 26 Jun 2018 22:17:18 -0700 Subject: [PATCH 6/9] selftests/x86/sigreturn: Do minor cleanups We have short names for the requested and resulting register values. Use them instead of spelling out the whole register entry for each case. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/bb3bc1f923a2f6fe7912d22a1068fe29d6033d38.1530076529.git.luto@kernel.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/sigreturn.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/x86/sigreturn.c b/tools/testing/selftests/x86/sigreturn.c index 2559e2c01793..4d9dc3f2fd70 100644 --- a/tools/testing/selftests/x86/sigreturn.c +++ b/tools/testing/selftests/x86/sigreturn.c @@ -610,6 +610,7 @@ static int test_valid_sigreturn(int cs_bits, bool use_16bit_ss, int force_ss) */ for (int i = 0; i < NGREG; i++) { greg_t req = requested_regs[i], res = resulting_regs[i]; + if (i == REG_TRAPNO || i == REG_IP) continue; /* don't care */ @@ -673,18 +674,18 @@ static int test_valid_sigreturn(int cs_bits, bool use_16bit_ss, int force_ss) #endif /* Sanity check on the kernel */ - if (i == REG_CX && requested_regs[i] != resulting_regs[i]) { + if (i == REG_CX && req != res) { printf("[FAIL]\tCX (saved SP) mismatch: requested 0x%llx; got 0x%llx\n", - (unsigned long long)requested_regs[i], - (unsigned long long)resulting_regs[i]); + (unsigned long long)req, + (unsigned long long)res); nerrs++; continue; } - if (requested_regs[i] != resulting_regs[i] && !ignore_reg) { + if (req != res && !ignore_reg) { printf("[FAIL]\tReg %d mismatch: requested 0x%llx; got 0x%llx\n", - i, (unsigned long long)requested_regs[i], - (unsigned long long)resulting_regs[i]); + i, (unsigned long long)req, + (unsigned long long)res); nerrs++; } } From cfe19577047e74cdac5826adbdc2337d8437f8fb Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 25 Jun 2018 15:08:52 +0300 Subject: [PATCH 7/9] x86/efi: Fix efi_call_phys_epilog() with CONFIG_X86_5LEVEL=y Open-coded page table entry checks don't work correctly when we fold the page table level at runtime. pgd_present() on 4-level paging machine always returns true, but open-coded version of the check may return false-negative result and we silently skip the rest of the loop body in efi_call_phys_epilog(). Replace open-coded checks with proper helpers. Signed-off-by: Kirill A. Shutemov Acked-by: Ard Biesheuvel Cc: Andrey Ryabinin Cc: Baoquan He Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org # v4.12+ Fixes: 94133e46a0f5 ("x86/efi: Correct EFI identity mapping under 'efi=old_map' when KASLR is enabled") Link: http://lkml.kernel.org/r/20180625120852.18300-1-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index e01f7ceb9e7a..77873ce700ae 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -166,14 +166,14 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd) pgd = pgd_offset_k(pgd_idx * PGDIR_SIZE); set_pgd(pgd_offset_k(pgd_idx * PGDIR_SIZE), save_pgd[pgd_idx]); - if (!(pgd_val(*pgd) & _PAGE_PRESENT)) + if (!pgd_present(*pgd)) continue; for (i = 0; i < PTRS_PER_P4D; i++) { p4d = p4d_offset(pgd, pgd_idx * PGDIR_SIZE + i * P4D_SIZE); - if (!(p4d_val(*p4d) & _PAGE_PRESENT)) + if (!p4d_present(*p4d)) continue; pud = (pud_t *)p4d_page_vaddr(*p4d); From b8c1e4293a5d1dfd19ab7b0984bfce8191940500 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 26 Jun 2018 13:03:41 +0300 Subject: [PATCH 8/9] x86/mm: Drop unneeded __always_inline for p4d page table helpers This reverts the following commits: 1ea66554d3b0 ("x86/mm: Mark p4d_offset() __always_inline") 046c0dbec023 ("x86: Mark native_set_p4d() as __always_inline") p4d_offset(), native_set_p4d() and native_p4d_clear() were marked __always_inline in attempt to move __pgtable_l5_enabled into __initdata section. It was required as KASAN initialization code is a user of USE_EARLY_PGTABLE_L5, so all pgtable_l5_enabled() translated to __pgtable_l5_enabled there. This includes pgtable_l5_enabled() called from inline p4d helpers. If compiler would decided to not inline these p4d helpers, but leave them standalone, we end up with section mismatch. We don't need __always_inline here anymore. __pgtable_l5_enabled moved back to be __ro_after_init. See the following commit: 51be13351517 ("Revert "x86/mm: Mark __pgtable_l5_enabled __initdata"") Signed-off-by: Kirill A. Shutemov Cc: Arnd Bergmann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180626100341.49910-1-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable_64.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 99ecde23c3ec..5715647fc4fe 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -898,7 +898,7 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) #define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd)) /* to find an entry in a page-table-directory. */ -static __always_inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) +static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) { if (!pgtable_l5_enabled()) return (p4d_t *)pgd; diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 0fdcd21dadbd..3c5385f9a88f 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -216,7 +216,7 @@ static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) } #endif -static __always_inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) +static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) { pgd_t pgd; @@ -230,7 +230,7 @@ static __always_inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) *p4dp = native_make_p4d(native_pgd_val(pgd)); } -static __always_inline void native_p4d_clear(p4d_t *p4d) +static inline void native_p4d_clear(p4d_t *p4d) { native_set_p4d(p4d, native_make_p4d(0)); } From d79d0d8ad0cb3d782b41631dfeac8eb05e414bcd Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Wed, 27 Jun 2018 11:07:15 +0200 Subject: [PATCH 9/9] x86/mm: Clean up the printk()s in show_fault_oops() - Remove 'nx_warning' and 'smep_warning', which are just pointless obfuscation. - Also convert to pr_crit(). Suggested-by: Joe Perches Signed-off-by: Dmitry Vyukov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180627090715.28076-1-dvyukov@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ee85766e6329..2aafa6ab6103 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -641,11 +641,6 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) return 0; } -static const char nx_warning[] = KERN_CRIT -"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; -static const char smep_warning[] = KERN_CRIT -"unable to execute userspace code (SMEP?) (uid: %d)\n"; - static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) @@ -664,11 +659,13 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, pte = lookup_address_in_pgd(pgd, address, &level); if (pte && pte_present(*pte) && !pte_exec(*pte)) - printk(nx_warning, from_kuid(&init_user_ns, current_uid())); + pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", + from_kuid(&init_user_ns, current_uid())); if (pte && pte_present(*pte) && pte_exec(*pte) && (pgd_flags(*pgd) & _PAGE_USER) && (__read_cr4() & X86_CR4_SMEP)) - printk(smep_warning, from_kuid(&init_user_ns, current_uid())); + pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n", + from_kuid(&init_user_ns, current_uid())); } pr_alert("BUG: unable to handle kernel %s at %px\n",