From a70b487b07cf4201bc6702e7f646fa593b23009f Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 17 Jul 2017 21:19:01 +1000 Subject: [PATCH 1/6] powerpc/powernv: Fix boot on Power8 bare metal due to opal_configure_cores() In commit 1c0eaf0f56d6 ("powerpc/powernv: Tell OPAL about our MMU mode on POWER9"), we added additional flags to the OPAL call to configure CPUs at boot. These flags only work on Power9 firmwares, and worse can cause boot failures on Power8 machines, so we check for CPU_FTR_ARCH_300 (aka POWER9) before adding the extra flags. Unfortunately we forgot that opal_configure_cores() is called before the CPU feature checks are dynamically patched, meaning the check always returns true. We definitely need to do something to make the CPU feature checks less prone to bugs like this, but for now the minimal fix is to use early_cpu_has_feature(). Reported-and-tested-by: Abdul Haleem Fixes: 1c0eaf0f56d6 ("powerpc/powernv: Tell OPAL about our MMU mode on POWER9") Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 9b87abb178f0..cad6b57ce494 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -78,7 +78,7 @@ void opal_configure_cores(void) * ie. Host hash supports hash guests * Host radix supports hash/radix guests */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (early_cpu_has_feature(CPU_FTR_ARCH_300)) { reinit_flags |= OPAL_REINIT_CPUS_MMU_HASH; if (early_radix_enabled()) reinit_flags |= OPAL_REINIT_CPUS_MMU_RADIX; From 101dd590a7fa37954540cf3149a1c502c0acc524 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 10 Jul 2017 16:19:38 +1000 Subject: [PATCH 2/6] powerpc/perf: Avoid spurious PMU interrupts after idle POWER9 DD2 can see spurious PMU interrupts after state-loss idle in some conditions. 
A solution is to save and reload MMCR0 over state-loss idle. Signed-off-by: Nicholas Piggin Acked-by: Madhavan Srinivasan Tested-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/idle_book3s.S | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 5adb390e773b..516ebef905c0 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -30,6 +30,7 @@ * Use unused space in the interrupt stack to save and restore * registers for winkle support. */ +#define _MMCR0 GPR0 #define _SDR1 GPR3 #define _PTCR GPR3 #define _RPR GPR4 @@ -272,6 +273,14 @@ power_enter_stop: b pnv_wakeup_noloss .Lhandle_esl_ec_set: + /* + * POWER9 DD2 can incorrectly set PMAO when waking up after a + * state-loss idle. Saving and restoring MMCR0 over idle is a + * workaround. + */ + mfspr r4,SPRN_MMCR0 + std r4,_MMCR0(r1) + /* * Check if the requested state is a deep idle state. */ @@ -450,10 +459,14 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) pnv_restore_hyp_resource_arch300: /* * Workaround for POWER9, if we lost resources, the ERAT - * might have been mixed up and needs flushing. + * might have been mixed up and needs flushing. We also need + * to reload MMCR0 (see comment above). */ blt cr3,1f PPC_INVALIDATE_ERAT + ld r1,PACAR1(r13) + ld r4,_MMCR0(r1) + mtspr SPRN_MMCR0,r4 1: /* * POWER ISA 3. Use PSSCR to determine if we From 76fc0cfcc5b0f520062ca6d7225b224d4a8aa828 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 18 Jul 2017 15:32:44 +1000 Subject: [PATCH 3/6] powerpc/64s: Fix hypercall entry clobbering r12 input A previous optimisation incorrectly assumed the PAPR hcall does not use r12, and clobbers it upon entry. In fact it is used as an input. This can result in KVM guests crashing (observed with PR KVM). Instead of using r12 to save r13, this patch saves r13 in ctr. This is more costly, but not as slow as using the SPRG. 
Fixes: acd7d8cef0153 ("powerpc/64s: Optimize hypercall/syscall entry") Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 4c18a5fbb4bb..124091d306ff 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -824,7 +824,7 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) * r3 volatile parameter and return value for status * r4-r10 volatile input and output value * r11 volatile hypercall number and output value - * r12 volatile + * r12 volatile input and output value * r13-r31 nonvolatile * LR nonvolatile * CTR volatile @@ -834,25 +834,26 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) * Other registers nonvolatile * * The intersection of volatile registers that don't contain possible - * inputs is: r12, cr0, xer, ctr. We may use these as scratch regs - * upon entry without saving. + * inputs is: cr0, xer, ctr. We may use these as scratch regs upon entry + * without saving, though xer is not a good idea to use, as hardware may + * interpret some bits so it may be costly to change them. */ #ifdef CONFIG_KVM_BOOK3S_64_HANDLER /* * There is a little bit of juggling to get syscall and hcall - * working well. Save r10 in ctr to be restored in case it is a - * hcall. + * working well. Save r13 in ctr to avoid using SPRG scratch + * register. * * Userspace syscalls have already saved the PPR, hcalls must save * it before setting HMT_MEDIUM. 
*/ #define SYSCALL_KVMTEST \ - mr r12,r13; \ + mtctr r13; \ GET_PACA(r13); \ - mtctr r10; \ + std r10,PACA_EXGEN+EX_R10(r13); \ KVMTEST_PR(0xc00); /* uses r10, branch to do_kvm_0xc00_system_call */ \ HMT_MEDIUM; \ - mr r9,r12; \ + mfctr r9; #else #define SYSCALL_KVMTEST \ @@ -935,8 +936,8 @@ EXC_VIRT_END(system_call, 0x4c00, 0x100) * This is a hcall, so register convention is as above, with these * differences: * r13 = PACA - * r12 = orig r13 - * ctr = orig r10 + * ctr = orig r13 + * orig r10 saved in PACA */ TRAMP_KVM_BEGIN(do_kvm_0xc00) /* @@ -944,14 +945,13 @@ TRAMP_KVM_BEGIN(do_kvm_0xc00) * HMT_MEDIUM. That allows the KVM code to save that value into the * guest state (it is the guest's PPR value). */ - OPT_GET_SPR(r0, SPRN_PPR, CPU_FTR_HAS_PPR) + OPT_GET_SPR(r10, SPRN_PPR, CPU_FTR_HAS_PPR) HMT_MEDIUM - OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r0, CPU_FTR_HAS_PPR) + OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r10, CPU_FTR_HAS_PPR) mfctr r10 - SET_SCRATCH0(r12) + SET_SCRATCH0(r10) std r9,PACA_EXGEN+EX_R9(r13) mfcr r9 - std r10,PACA_EXGEN+EX_R10(r13) KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00) #endif From b134bd90286dc9f2952c35a91ab405474ca9374c Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 14 Jul 2017 16:51:21 +1000 Subject: [PATCH 4/6] powerpc/mm/radix: Refactor radix__mark_rodata_ro() Move the core logic into a helper, so we can use it for changing permissions other than _PAGE_WRITE. 
Signed-off-by: Michael Ellerman Reviewed-by: Balbir Singh Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable-radix.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 8c13e4282308..336e52ec652c 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -112,10 +112,9 @@ int radix__map_kernel_page(unsigned long ea, unsigned long pa, } #ifdef CONFIG_STRICT_KERNEL_RWX -void radix__mark_rodata_ro(void) +void radix__change_memory_range(unsigned long start, unsigned long end, + unsigned long clear) { - unsigned long start = (unsigned long)_stext; - unsigned long end = (unsigned long)__init_begin; unsigned long idx; pgd_t *pgdp; pud_t *pudp; @@ -125,7 +124,8 @@ void radix__mark_rodata_ro(void) start = ALIGN_DOWN(start, PAGE_SIZE); end = PAGE_ALIGN(end); // aligns up - pr_devel("marking ro start %lx, end %lx\n", start, end); + pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n", + start, end, clear); for (idx = start; idx < end; idx += PAGE_SIZE) { pgdp = pgd_offset_k(idx); @@ -147,11 +147,21 @@ void radix__mark_rodata_ro(void) if (!ptep) continue; update_the_pte: - radix__pte_update(&init_mm, idx, ptep, _PAGE_WRITE, 0, 0); + radix__pte_update(&init_mm, idx, ptep, clear, 0, 0); } radix__flush_tlb_kernel_range(start, end); } + +void radix__mark_rodata_ro(void) +{ + unsigned long start, end; + + start = (unsigned long)_stext; + end = (unsigned long)__init_begin; + + radix__change_memory_range(start, end, _PAGE_WRITE); +} #endif /* CONFIG_STRICT_KERNEL_RWX */ static inline void __meminit print_mapping(unsigned long start, From fa7f9189e017213bad63b93a76de5c715cd62a96 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 14 Jul 2017 16:51:22 +1000 Subject: [PATCH 5/6] powerpc/mm/hash: Refactor hash__mark_rodata_ro() Move the core logic into a helper, so we can use it for changing other permissions. 
We also change the logic to align start down, and end up. This means calling the function with a range will expand that range to be at least 1 mmu_linear_psize page in size. We need that so we can use it on __init_begin ... __init_end which is not a full page in size. This should always work for _stext/__init_begin, because we align __init_begin to _stext + 16M in the linker script. Signed-off-by: Michael Ellerman Reviewed-by: Balbir Singh Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable-hash64.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index 188b4107584d..73019c52141f 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -425,33 +425,39 @@ int hash__has_transparent_hugepage(void) #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_STRICT_KERNEL_RWX -void hash__mark_rodata_ro(void) +static bool hash__change_memory_range(unsigned long start, unsigned long end, + unsigned long newpp) { - unsigned long start = (unsigned long)_stext; - unsigned long end = (unsigned long)__init_begin; unsigned long idx; unsigned int step, shift; - unsigned long newpp = PP_RXXX; shift = mmu_psize_defs[mmu_linear_psize].shift; step = 1 << shift; - start = ((start + step - 1) >> shift) << shift; - end = (end >> shift) << shift; + start = ALIGN_DOWN(start, step); + end = ALIGN(end, step); // aligns up - pr_devel("marking ro start %lx, end %lx, step %x\n", - start, end, step); + if (start >= end) + return false; - if (start == end) { - pr_warn("could not set rodata ro, relocate the start" - " of the kernel to a 0x%x boundary\n", step); - return; - } + pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n", + start, end, newpp, step); for (idx = start; idx < end; idx += step) /* Not sure if we can do much with the return value */ mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize, 
mmu_kernel_ssize); + return true; +} + +void hash__mark_rodata_ro(void) +{ + unsigned long start, end; + + start = (unsigned long)_stext; + end = (unsigned long)__init_begin; + + WARN_ON(!hash__change_memory_range(start, end, PP_RXXX)); } #endif From 029d9252b116fa52a95150819e62af1f6e420fe5 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 14 Jul 2017 16:51:23 +1000 Subject: [PATCH 6/6] powerpc/mm: Mark __init memory no-execute when STRICT_KERNEL_RWX=y Currently even with STRICT_KERNEL_RWX we leave the __init text marked executable after init, which is bad. Add a hook to mark it NX (no-execute) before we free it, and implement it for radix and hash. Note that we use __init_end as the end address, not _einittext, because overlaps_kernel_text() uses __init_end, because there are additional executable sections other than .init.text between __init_begin and __init_end. Tested on radix and hash with: 0:mon> p $__init_begin *** 400 exception occurred Fixes: 1e0fc9d1eb2b ("powerpc/Kconfig: Enable STRICT_KERNEL_RWX for some configs") Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hash.h | 1 + arch/powerpc/include/asm/book3s/64/pgtable.h | 1 + arch/powerpc/include/asm/book3s/64/radix.h | 1 + arch/powerpc/include/asm/pgtable.h | 7 +++++++ arch/powerpc/mm/mem.c | 1 + arch/powerpc/mm/pgtable-hash64.c | 12 ++++++++++++ arch/powerpc/mm/pgtable-radix.c | 8 ++++++++ arch/powerpc/mm/pgtable_64.c | 8 ++++++++ 8 files changed, 39 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 0ce513f2926f..36fc7bfe9e11 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -91,6 +91,7 @@ static inline int hash__pgd_bad(pgd_t pgd) } #ifdef CONFIG_STRICT_KERNEL_RWX extern void hash__mark_rodata_ro(void); +extern void hash__mark_initmem_nx(void); #endif extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr, diff --git 
a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index c0737c86a362..d1da415e283c 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1192,5 +1192,6 @@ static inline const int pud_pfn(pud_t pud) BUILD_BUG(); return 0; } + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 487709ff6875..544440b5aff3 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -118,6 +118,7 @@ #ifdef CONFIG_STRICT_KERNEL_RWX extern void radix__mark_rodata_ro(void); +extern void radix__mark_initmem_nx(void); #endif static inline unsigned long __radix_pte_update(pte_t *ptep, unsigned long clr, diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index dd01212935ac..afae9a336136 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -80,6 +80,13 @@ unsigned long vmalloc_to_phys(void *vmalloc_addr); void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); void pgtable_cache_init(void); + +#ifdef CONFIG_STRICT_KERNEL_RWX +void mark_initmem_nx(void); +#else +static inline void mark_initmem_nx(void) { } +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_H */ diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 8541f18694a4..46b4e67d2372 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -402,6 +402,7 @@ void __init mem_init(void) void free_initmem(void) { ppc_md.progress = ppc_printk_progress; + mark_initmem_nx(); free_initmem_default(POISON_FREE_INITMEM); } diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index 73019c52141f..443a2c66a304 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -460,4 +460,16 @@ void 
hash__mark_rodata_ro(void) WARN_ON(!hash__change_memory_range(start, end, PP_RXXX)); } + +void hash__mark_initmem_nx(void) +{ + unsigned long start, end, pp; + + start = (unsigned long)__init_begin; + end = (unsigned long)__init_end; + + pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL)); + + WARN_ON(!hash__change_memory_range(start, end, pp)); +} #endif diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 336e52ec652c..5cc50d47ce3f 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -162,6 +162,14 @@ void radix__mark_rodata_ro(void) radix__change_memory_range(start, end, _PAGE_WRITE); } + +void radix__mark_initmem_nx(void) +{ + unsigned long start = (unsigned long)__init_begin; + unsigned long end = (unsigned long)__init_end; + + radix__change_memory_range(start, end, _PAGE_EXEC); +} #endif /* CONFIG_STRICT_KERNEL_RWX */ static inline void __meminit print_mapping(unsigned long start, diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 5c0b795d656c..0736e94c7615 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -505,4 +505,12 @@ void mark_rodata_ro(void) else hash__mark_rodata_ro(); } + +void mark_initmem_nx(void) +{ + if (radix_enabled()) + radix__mark_initmem_nx(); + else + hash__mark_initmem_nx(); +} #endif