From 69a8405999aa1c489de4b8d349468f0c2b83f093 Mon Sep 17 00:00:00 2001 From: Michael Jeanson Date: Thu, 14 Jun 2018 11:27:42 -0400 Subject: [PATCH 1/9] powerpc/e500mc: Set assembler machine type to e500mc In binutils 2.26 a new opcode for the "wait" instruction was added for the POWER9 and has precedence over the one specific to the e500mc. Commit ebf714ff3756 ("powerpc/e500mc: Add support for the wait instruction in e500_idle") uses this instruction specifically on the e500mc to work around an erratum. This results in an invalid instruction in idle_e500 when we build for the e500mc on bintutils >= 2.26 with the default assembler machine type. Since multiplatform between e500 and non-e500 is not supported, set the assembler machine type globaly when CONFIG_PPC_E500MC=y. Signed-off-by: Michael Jeanson Reviewed-by: Mathieu Desnoyers CC: Benjamin Herrenschmidt CC: Paul Mackerras CC: Michael Ellerman CC: Kumar Gala CC: Vakul Garg CC: Scott Wood CC: Mathieu Desnoyers CC: linuxppc-dev@lists.ozlabs.org CC: linux-kernel@vger.kernel.org CC: stable@vger.kernel.org Signed-off-by: Michael Ellerman --- arch/powerpc/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index bd06a3ccda31..2ea575cb3401 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -244,6 +244,7 @@ cpu-as-$(CONFIG_4xx) += -Wa,-m405 cpu-as-$(CONFIG_ALTIVEC) += $(call as-option,-Wa$(comma)-maltivec) cpu-as-$(CONFIG_E200) += -Wa,-me200 cpu-as-$(CONFIG_PPC_BOOK3S_64) += -Wa,-mpower4 +cpu-as-$(CONFIG_PPC_E500MC) += $(call as-option,-Wa$(comma)-me500mc) KBUILD_AFLAGS += $(cpu-as-y) KBUILD_CFLAGS += $(cpu-as-y) From 02390f66bd2362df114a0a0770d80ec33061f6d1 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 15 Jun 2018 11:38:37 +1000 Subject: [PATCH 2/9] powerpc/64s/radix: Fix MADV_[FREE|DONTNEED] TLB flush miss problem with THP The patch 99baac21e4 ("mm: fix MADV_[FREE|DONTNEED] TLB flush miss problem") added a force flush mode to the mmu_gather flush, which unconditionally flushes the entire address range being invalidated (even if actual ptes only covered a smaller range), to solve a problem with concurrent threads invalidating the same PTEs causing them to miss TLBs that need flushing. This does not work with powerpc that invalidates mmu_gather batches according to page size. Have powerpc flush all possible page sizes in the range if it encounters this concurrency condition. Patch 4647706ebe ("mm: always flush VMA ranges affected by zap_page_range") does add a TLB flush for all page sizes on powerpc for the zap_page_range case, but that is to be removed and replaced with the mmu_gather flush to avoid redundant flushing. It is also thought to not cover other obscure race conditions: https://lkml.kernel.org/r/BD3A0EBE-ECF4-41D4-87FA-C755EA9AB6BD@gmail.com Hash does not have a problem because it invalidates TLBs inside the page table locks. Reported-by: Aneesh Kumar K.V Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/mm/tlb-radix.c | 96 +++++++++++++++++++++++++++++-------- 1 file changed, 75 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 67a6e86d3e7e..a734e486664d 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -689,22 +689,17 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range); static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; -void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) +static inline void __radix__flush_tlb_range(struct mm_struct *mm, + unsigned long start, unsigned long end, + bool flush_all_sizes) { - struct mm_struct *mm = vma->vm_mm; unsigned long pid; unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift; unsigned long page_size = 1UL << page_shift; unsigned long nr_pages = (end - start) >> page_shift; bool local, full; -#ifdef CONFIG_HUGETLB_PAGE - if (is_vm_hugetlb_page(vma)) - return radix__flush_hugetlb_tlb_range(vma, start, end); -#endif - pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) return; @@ -738,37 +733,64 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, _tlbie_pid(pid, RIC_FLUSH_TLB); } } else { - bool hflush = false; + bool hflush = flush_all_sizes; + bool gflush = flush_all_sizes; unsigned long hstart, hend; + unsigned long gstart, gend; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - hstart = (start + HPAGE_PMD_SIZE - 1) >> HPAGE_PMD_SHIFT; - hend = end >> HPAGE_PMD_SHIFT; - if (hstart < hend) { - hstart <<= HPAGE_PMD_SHIFT; - hend <<= HPAGE_PMD_SHIFT; + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) hflush = true; + + if (hflush) { + hstart = (start + PMD_SIZE - 1) & PMD_MASK; + hend = end & PMD_MASK; + if (hstart == hend) + hflush = false; + } + + if (gflush) { + gstart = (start + PUD_SIZE - 1) & PUD_MASK; + gend = end & PUD_MASK; + if (gstart == gend) + gflush = false; } -#endif asm volatile("ptesync": : :"memory"); if (local) { __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize); if (hflush) __tlbiel_va_range(hstart, hend, pid, - HPAGE_PMD_SIZE, MMU_PAGE_2M); + PMD_SIZE, MMU_PAGE_2M); + if (gflush) + __tlbiel_va_range(gstart, gend, pid, + PUD_SIZE, MMU_PAGE_1G); asm volatile("ptesync": : :"memory"); } else { __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize); if (hflush) __tlbie_va_range(hstart, hend, pid, - HPAGE_PMD_SIZE, MMU_PAGE_2M); + PMD_SIZE, MMU_PAGE_2M); + if (gflush) + __tlbie_va_range(gstart, gend, pid, + PUD_SIZE, MMU_PAGE_1G); fixup_tlbie(); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } } preempt_enable(); } + +void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) + +{ +#ifdef CONFIG_HUGETLB_PAGE + if (is_vm_hugetlb_page(vma)) + return radix__flush_hugetlb_tlb_range(vma, start, end); +#endif + + __radix__flush_tlb_range(vma->vm_mm, start, end, false); +} EXPORT_SYMBOL(radix__flush_tlb_range); static int radix_get_mmu_psize(int page_size) @@ -837,6 +859,8 @@ void radix__tlb_flush(struct mmu_gather *tlb) int psize = 0; struct mm_struct *mm = tlb->mm; int page_size = tlb->page_size; + unsigned long start = tlb->start; + unsigned long end = tlb->end; /* * if page size is not something we understand, do a full mm flush @@ -847,15 +871,45 @@ void radix__tlb_flush(struct mmu_gather *tlb) */ if (tlb->fullmm) { __flush_all_mm(mm, true); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) + } else if (mm_tlb_flush_nested(mm)) { + /* + * If there is a concurrent invalidation that is clearing ptes, + * then it's possible this invalidation will miss one of those + * cleared ptes and miss flushing the TLB. If this invalidate + * returns before the other one flushes TLBs, that can result + * in it returning while there are still valid TLBs inside the + * range to be invalidated. + * + * See mm/memory.c:tlb_finish_mmu() for more details. + * + * The solution to this is ensure the entire range is always + * flushed here. The problem for powerpc is that the flushes + * are page size specific, so this "forced flush" would not + * do the right thing if there are a mix of page sizes in + * the range to be invalidated. So use __flush_tlb_range + * which invalidates all possible page sizes in the range. + * + * PWC flush probably is not be required because the core code + * shouldn't free page tables in this path, but accounting + * for the possibility makes us a bit more robust. + * + * need_flush_all is an uncommon case because page table + * teardown should be done with exclusive locks held (but + * after locks are dropped another invalidate could come + * in), it could be optimized further if necessary. + */ + if (!tlb->need_flush_all) + __radix__flush_tlb_range(mm, start, end, true); + else + radix__flush_all_mm(mm); +#endif } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) { if (!tlb->need_flush_all) radix__flush_tlb_mm(mm); else radix__flush_all_mm(mm); } else { - unsigned long start = tlb->start; - unsigned long end = tlb->end; - if (!tlb->need_flush_all) radix__flush_tlb_range_psize(mm, start, end, psize); else From 749a0278c2177b2d16da5d8b135ba7f940bb4199 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 13 Jun 2018 23:23:56 +1000 Subject: [PATCH 3/9] powerpc/64s: Fix DT CPU features Power9 DD2.1 logic In the device tree CPU features quirk code we want to set CPU_FTR_POWER9_DD2_1 on all Power9s that aren't DD2.0 or earlier. But we got the logic wrong and instead set it on all CPUs that aren't Power9 DD2.0 or earlier, ie. including Power8. Fix it by making sure we're on a Power9. This isn't a bug in practice because the only code that checks the feature is Power9 only to begin with. But we'll backport it anyway to avoid confusion. Fixes: 9e9626ed3a4a ("powerpc/64s: Fix POWER9 DD2.2 and above in DT CPU features") Cc: stable@vger.kernel.org # v4.17+ Reported-by: Paul Mackerras Signed-off-by: Michael Ellerman Acked-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/dt_cpu_ftrs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 4be1c0de9406..96dd3d871986 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -711,7 +711,8 @@ static __init void cpufeatures_cpu_quirks(void) cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_HV_ASSIST; cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_XER_SO_BUG; cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1; - } else /* DD2.1 and up have DD2_1 */ + } else if ((version & 0xffff0000) == 0x004e0000) + /* DD2.1 and up have DD2_1 */ cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1; if ((version & 0xffff0000) == 0x004e0000) { From 8c1aef6a682f87a059f10ab606cc1e2cdd663d5a Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 19 May 2018 14:35:52 +1000 Subject: [PATCH 4/9] powerpc/64: hard disable irqs in panic_smp_self_stop Similarly to commit 855bfe0de1 ("powerpc: hard disable irqs in smp_send_stop loop"), irqs should be hard disabled by panic_smp_self_stop. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup_64.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 7a7ce8ad455e..225bc5f91049 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -387,6 +387,14 @@ void early_setup_secondary(void) #endif /* CONFIG_SMP */ +void panic_smp_self_stop(void) +{ + hard_irq_disable(); + spin_begin(); + while (1) + spin_cpu_relax(); +} + #if defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE) static bool use_spinloop(void) { From de6e5d38417e6cdb005843db420a2974993d36ff Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 19 May 2018 14:35:53 +1000 Subject: [PATCH 5/9] powerpc: smp_send_stop do not offline stopped CPUs Marking CPUs stopped by smp_send_stop as offline can cause warnings due to cross-CPU wakeups. This trace was noticed on a busy system running a sysrq+c crash test, after the injected crash: WARNING: CPU: 51 PID: 1546 at kernel/sched/core.c:1179 set_task_cpu+0x22c/0x240 CPU: 51 PID: 1546 Comm: kworker/u352:1 Tainted: G D Workqueue: mlx5e mlx5e_update_stats_work [mlx5_core] [...] NIP [c00000000017c21c] set_task_cpu+0x22c/0x240 LR [c00000000017d580] try_to_wake_up+0x230/0x720 Call Trace: [c000000001017700] runqueues+0x0/0xb00 (unreliable) [c00000000017d580] try_to_wake_up+0x230/0x720 [c00000000015a214] insert_work+0x104/0x140 [c00000000015adb0] __queue_work+0x230/0x690 [c000003fc5007910] [c00000000015b26c] queue_work_on+0x5c/0x90 [c0080000135fc8f8] mlx5_cmd_exec+0x538/0xcb0 [mlx5_core] [c008000013608fd0] mlx5_core_access_reg+0x140/0x1d0 [mlx5_core] [c00800001362777c] mlx5e_update_pport_counters.constprop.59+0x6c/0x90 [mlx5_core] [c008000013628868] mlx5e_update_ndo_stats+0x28/0x90 [mlx5_core] [c008000013625558] mlx5e_update_stats_work+0x68/0xb0 [mlx5_core] [c00000000015bcec] process_one_work+0x1bc/0x5f0 [c00000000015ecac] worker_thread+0xac/0x6b0 [c000000000168338] kthread+0x168/0x1b0 [c00000000000b628] ret_from_kernel_thread+0x5c/0xb4 This happens because firstly the CPU is not really offline in the usual sense, processes and interrupts have not been migrated away. Secondly smp_send_stop does not happen atomically on all CPUs, so one CPU can have marked itself offline, while another CPU is still running processes or interrupts which can affect the first CPU. Fix this by just not marking the CPU as offline. It's more like frozen in time, so offline does not really reflect its state properly anyway. There should be nothing in the crash/panic path that walks online CPUs and synchronously waits for them, so this change should not introduce new hangs. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/smp.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 5eadfffabe35..4794d6b4f4d2 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -600,9 +600,6 @@ static void nmi_stop_this_cpu(struct pt_regs *regs) nmi_ipi_busy_count--; nmi_ipi_unlock(); - /* Remove this CPU */ - set_cpu_online(smp_processor_id(), false); - spin_begin(); while (1) spin_cpu_relax(); @@ -617,9 +614,6 @@ void smp_send_stop(void) static void stop_this_cpu(void *dummy) { - /* Remove this CPU */ - set_cpu_online(smp_processor_id(), false); - hard_irq_disable(); spin_begin(); while (1) From 855b6232dda2b6941ecd22979893e8a1d25642db Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 19 May 2018 14:35:54 +1000 Subject: [PATCH 6/9] powerpc/64: hard disable irqs on the panic()ing CPU Similar to previous patches, hard disable interrupts when a CPU is in panic. This reduces the chance the watchdog has to interfere with the panic, and avoids any other type of masked interrupt being executed when crashing which minimises the length of the crash path. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 62b1a40d8957..40b44bb53a4e 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -700,12 +700,19 @@ EXPORT_SYMBOL(check_legacy_ioport); static int ppc_panic_event(struct notifier_block *this, unsigned long event, void *ptr) { + /* + * panic does a local_irq_disable, but we really + * want interrupts to be hard disabled. + */ + hard_irq_disable(); + /* * If firmware-assisted dump has been registered then trigger * firmware-assisted dump and let firmware handle everything else. */ crash_fadump(NULL, ptr); - ppc_md.panic(ptr); /* May not return */ + if (ppc_md.panic) + ppc_md.panic(ptr); /* May not return */ return NOTIFY_DONE; } @@ -716,7 +723,8 @@ static struct notifier_block ppc_panic_block = { void __init setup_panic(void) { - if (!ppc_md.panic) + /* PPC64 always does a hard irq disable in its panic handler */ + if (!IS_ENABLED(CONFIG_PPC64) && !ppc_md.panic) return; atomic_notifier_chain_register(&panic_notifier_list, &ppc_panic_block); } From e08ecba17b72aeb01859601bc242a5bc48620109 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 19 Jun 2018 21:51:55 +1000 Subject: [PATCH 7/9] powerpc/64s: Fix build failures with CONFIG_NMI_IPI=n I broke the build when CONFIG_NMI_IPI=n with my recent commit to add arch_trigger_cpumask_backtrace(), eg: stacktrace.c:(.text+0x1b0): undefined reference to `.smp_send_safe_nmi_ipi' We should rework the CONFIG symbols here in future to avoid these double barrelled ifdefs but for now they fix the build. Fixes: 5cc05910f26e ("powerpc/64s: Wire up arch_trigger_cpumask_backtrace()") Reported-by: Christophe LEROY Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nmi.h | 2 +- arch/powerpc/kernel/stacktrace.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h index 0f571e0ebca1..bd9ba8defd72 100644 --- a/arch/powerpc/include/asm/nmi.h +++ b/arch/powerpc/include/asm/nmi.h @@ -8,7 +8,7 @@ extern void arch_touch_nmi_watchdog(void); static inline void arch_touch_nmi_watchdog(void) {} #endif -#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_STACKTRACE) +#if defined(CONFIG_NMI_IPI) && defined(CONFIG_STACKTRACE) extern void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self); #define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index 07e97f289c52..e2c50b55138f 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -196,7 +196,7 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk, EXPORT_SYMBOL_GPL(save_stack_trace_tsk_reliable); #endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */ -#ifdef CONFIG_PPC_BOOK3S_64 +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_NMI_IPI) static void handle_backtrace_ipi(struct pt_regs *regs) { nmi_cpu_backtrace(regs); @@ -242,4 +242,4 @@ void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self) { nmi_trigger_cpumask_backtrace(mask, exclude_self, raise_backtrace_ipi); } -#endif /* CONFIG_PPC64 */ +#endif /* defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_NMI_IPI) */ From 758380b8155f69b4e2f77f27562f8a7a466749d6 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 12 Jun 2018 19:38:08 +1000 Subject: [PATCH 8/9] powerpc/64s/radix: Fix radix_kvm_prefetch_workaround paca access of not possible CPU If possible CPUs are limited (e.g., by kexec), then the kvm prefetch workaround function can access the paca pointer for a !possible CPU. Fixes: d2e60075a3d44 ("powerpc/64: Use array of paca pointers and allocate pacas individually") Cc: stable@kernel.org Reported-by: Pridhiviraj Paidipeddi Tested-by: Pridhiviraj Paidipeddi Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/mm/tlb-radix.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index a734e486664d..1135b43a597c 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -1097,6 +1097,8 @@ extern void radix_kvm_prefetch_workaround(struct mm_struct *mm) for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) { if (sib == cpu) continue; + if (!cpu_possible(sib)) + continue; if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu) flush = true; } From fadd03c615922d8521a2e76d4ba2335891cb2790 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 14 Jun 2018 16:01:52 +0530 Subject: [PATCH 9/9] powerpc/mm/hash/4k: Free hugetlb page table caches correctly. With 4k page size for hugetlb we allocate hugepage directories from its on slab cache. With patch 0c4d26802 ("powerpc/book3s64/mm: Simplify the rcu callback for page table free") we missed to free these allocated hugepd tables. Update pgtable_free to handle hugetlb hugepd directory table. Fixes: 0c4d268029bf ("powerpc/book3s64/mm: Simplify the rcu callback for page table free") Signed-off-by: Aneesh Kumar K.V [mpe: Add CONFIG_HUGETLB_PAGE guard to fix build break] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/pgalloc.h | 1 + .../include/asm/book3s/64/pgtable-4k.h | 21 +++++++++++++++++++ .../include/asm/book3s/64/pgtable-64k.h | 9 ++++++++ arch/powerpc/include/asm/book3s/64/pgtable.h | 5 +++++ arch/powerpc/include/asm/nohash/32/pgalloc.h | 1 + arch/powerpc/include/asm/nohash/64/pgalloc.h | 1 + arch/powerpc/mm/hugetlbpage.c | 3 ++- arch/powerpc/mm/pgtable-book3s64.c | 12 +++++++++++ 8 files changed, 52 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h index 6a6673907e45..e4633803fe43 100644 --- a/arch/powerpc/include/asm/book3s/32/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -108,6 +108,7 @@ static inline void pgtable_free(void *table, unsigned index_size) } #define check_pgt_cache() do { } while (0) +#define get_hugepd_cache_index(x) (x) #ifdef CONFIG_SMP static inline void pgtable_free_tlb(struct mmu_gather *tlb, diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h index af5f2baac80f..a069dfcac9a9 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h @@ -49,6 +49,27 @@ static inline int hugepd_ok(hugepd_t hpd) } #define is_hugepd(hpd) (hugepd_ok(hpd)) +/* + * 16M and 16G huge page directory tables are allocated from slab cache + * + */ +#define H_16M_CACHE_INDEX (PAGE_SHIFT + H_PTE_INDEX_SIZE + H_PMD_INDEX_SIZE - 24) +#define H_16G_CACHE_INDEX \ + (PAGE_SHIFT + H_PTE_INDEX_SIZE + H_PMD_INDEX_SIZE + H_PUD_INDEX_SIZE - 34) + +static inline int get_hugepd_cache_index(int index) +{ + switch (index) { + case H_16M_CACHE_INDEX: + return HTLB_16M_INDEX; + case H_16G_CACHE_INDEX: + return HTLB_16G_INDEX; + default: + BUG(); + } + /* should not reach */ +} + #else /* !CONFIG_HUGETLB_PAGE */ static inline int pmd_huge(pmd_t pmd) { return 0; } static inline int pud_huge(pud_t pud) { return 0; } diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h index fb4b3ba52339..d7ee249d6890 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h @@ -45,8 +45,17 @@ static inline int hugepd_ok(hugepd_t hpd) { return 0; } + #define is_hugepd(pdep) 0 +/* + * This should never get called + */ +static inline int get_hugepd_cache_index(int index) +{ + BUG(); +} + #else /* !CONFIG_HUGETLB_PAGE */ static inline int pmd_huge(pmd_t pmd) { return 0; } static inline int pud_huge(pud_t pud) { return 0; } diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 63cee159022b..42aafba7a308 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -287,6 +287,11 @@ enum pgtable_index { PMD_INDEX, PUD_INDEX, PGD_INDEX, + /* + * Below are used with 4k page size and hugetlb + */ + HTLB_16M_INDEX, + HTLB_16G_INDEX, }; extern unsigned long __vmalloc_start; diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h index 1707781d2f20..9de40eb614da 100644 --- a/arch/powerpc/include/asm/nohash/32/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h @@ -109,6 +109,7 @@ static inline void pgtable_free(void *table, unsigned index_size) } #define check_pgt_cache() do { } while (0) +#define get_hugepd_cache_index(x) (x) #ifdef CONFIG_SMP static inline void pgtable_free_tlb(struct mmu_gather *tlb, diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index 0e693f322cb2..e2d62d033708 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -141,6 +141,7 @@ static inline void pgtable_free(void *table, int shift) } } +#define get_hugepd_cache_index(x) (x) #ifdef CONFIG_SMP static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) { diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 7c5f479c5c00..8a9a49c13865 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -337,7 +337,8 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif if (shift >= pdshift) hugepd_free(tlb, hugepte); else - pgtable_free_tlb(tlb, hugepte, pdshift - shift); + pgtable_free_tlb(tlb, hugepte, + get_hugepd_cache_index(pdshift - shift)); } static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index c1f4ca45c93a..4afbfbb64bfd 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -409,6 +409,18 @@ static inline void pgtable_free(void *table, int index) case PUD_INDEX: kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table); break; +#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE) + /* 16M hugepd directory at pud level */ + case HTLB_16M_INDEX: + BUILD_BUG_ON(H_16M_CACHE_INDEX <= 0); + kmem_cache_free(PGT_CACHE(H_16M_CACHE_INDEX), table); + break; + /* 16G hugepd directory at the pgd level */ + case HTLB_16G_INDEX: + BUILD_BUG_ON(H_16G_CACHE_INDEX <= 0); + kmem_cache_free(PGT_CACHE(H_16G_CACHE_INDEX), table); + break; +#endif /* We don't free pgd table via RCU callback */ default: BUG();