From 2bf1071a8d50928a4ae366bb3108833166c2b70c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 5 Jul 2018 18:47:00 +1000 Subject: [PATCH 1/7] powerpc/64s: Remove POWER9 DD1 support POWER9 DD1 was never a product. It is no longer supported by upstream firmware, and it is not effectively supported in Linux due to lack of testing. Signed-off-by: Nicholas Piggin Reviewed-by: Michael Ellerman [mpe: Remove arch_make_huge_pte() entirely] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hugetlb.h | 20 ------- arch/powerpc/include/asm/book3s/64/pgtable.h | 5 +- arch/powerpc/include/asm/book3s/64/radix.h | 35 ++--------- .../include/asm/book3s/64/tlbflush-radix.h | 2 - arch/powerpc/include/asm/cputable.h | 13 ++-- arch/powerpc/include/asm/paca.h | 5 -- arch/powerpc/kernel/asm-offsets.c | 1 - arch/powerpc/kernel/cputable.c | 19 ------ arch/powerpc/kernel/dt_cpu_ftrs.c | 4 +- arch/powerpc/kernel/exceptions-64s.S | 4 +- arch/powerpc/kernel/idle_book3s.S | 50 ---------------- arch/powerpc/kernel/process.c | 10 +--- arch/powerpc/kvm/book3s_64_mmu_radix.c | 15 +---- arch/powerpc/kvm/book3s_hv.c | 10 ---- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 16 +---- arch/powerpc/kvm/book3s_xive_template.c | 37 ++++-------- arch/powerpc/mm/hash_utils_64.c | 30 ---------- arch/powerpc/mm/hugetlbpage.c | 9 +-- arch/powerpc/mm/mmu_context_book3s64.c | 12 +--- arch/powerpc/mm/pgtable-radix.c | 60 +------------------ arch/powerpc/mm/tlb-radix.c | 18 ------ arch/powerpc/perf/core-book3s.c | 34 ----------- arch/powerpc/perf/isa207-common.c | 12 ++-- arch/powerpc/perf/isa207-common.h | 5 -- arch/powerpc/perf/power9-pmu.c | 54 +---------------- arch/powerpc/platforms/powernv/idle.c | 28 --------- arch/powerpc/platforms/powernv/smp.c | 27 +-------- arch/powerpc/sysdev/xive/common.c | 8 +-- arch/powerpc/xmon/xmon.c | 1 - drivers/misc/cxl/cxl.h | 8 --- drivers/misc/cxl/cxllib.c | 4 -- drivers/misc/cxl/pci.c | 41 +++++-------- 32 files changed, 65 insertions(+), 532 
deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index c459f937d484..50888388a359 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -32,26 +32,6 @@ static inline int hstate_get_psize(struct hstate *hstate) } } -#define arch_make_huge_pte arch_make_huge_pte -static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, - struct page *page, int writable) -{ - unsigned long page_shift; - - if (!cpu_has_feature(CPU_FTR_POWER9_DD1)) - return entry; - - page_shift = huge_page_shift(hstate_vma(vma)); - /* - * We don't support 1G hugetlb pages yet. - */ - VM_WARN_ON(page_shift == mmu_psize_defs[MMU_PAGE_1G].shift); - if (page_shift == mmu_psize_defs[MMU_PAGE_2M].shift) - return __pte(pte_val(entry) | R_PAGE_LARGE); - else - return entry; -} - #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE static inline bool gigantic_page_supported(void) { diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 63cee159022b..d334e6b9a46d 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -474,9 +474,8 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, { if (full && radix_enabled()) { /* - * Let's skip the DD1 style pte update here. We know that - * this is a full mm pte clear and hence can be sure there is - * no parallel set_pte. + * We know that this is a full mm pte clear and + * hence can be sure there is no parallel set_pte. 
*/ return radix__ptep_get_and_clear_full(mm, addr, ptep, full); } diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index ef9f96742ce1..3ab3f7aef022 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -12,12 +12,6 @@ #include #endif -/* - * For P9 DD1 only, we need to track whether the pte's huge. - */ -#define R_PAGE_LARGE _RPAGE_RSV1 - - #ifndef __ASSEMBLY__ #include #include @@ -154,20 +148,7 @@ static inline unsigned long radix__pte_update(struct mm_struct *mm, { unsigned long old_pte; - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { - - unsigned long new_pte; - - old_pte = __radix_pte_update(ptep, ~0ul, 0); - /* - * new value of pte - */ - new_pte = (old_pte | set) & ~clr; - radix__flush_tlb_pte_p9_dd1(old_pte, mm, addr); - if (new_pte) - __radix_pte_update(ptep, 0, new_pte); - } else - old_pte = __radix_pte_update(ptep, clr, set); + old_pte = __radix_pte_update(ptep, clr, set); if (!huge) assert_pte_locked(mm, addr); @@ -253,8 +234,6 @@ static inline int radix__pmd_trans_huge(pmd_t pmd) static inline pmd_t radix__pmd_mkhuge(pmd_t pmd) { - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) - return __pmd(pmd_val(pmd) | _PAGE_PTE | R_PAGE_LARGE); return __pmd(pmd_val(pmd) | _PAGE_PTE); } @@ -285,18 +264,14 @@ static inline unsigned long radix__get_tree_size(void) unsigned long rts_field; /* * We support 52 bits, hence: - * DD1 52-28 = 24, 0b11000 - * Others 52-31 = 21, 0b10101 + * bits 52 - 31 = 21, 0b10101 * RTS encoding details * bits 0 - 3 of rts -> bits 6 - 8 unsigned long * bits 4 - 5 of rts -> bits 62 - 63 of unsigned long */ - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) - rts_field = (0x3UL << 61); - else { - rts_field = (0x5UL << 5); /* 6 - 8 bits */ - rts_field |= (0x2UL << 61); - } + rts_field = (0x5UL << 5); /* 6 - 8 bits */ + rts_field |= (0x2UL << 61); + return rts_field; } diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h 
b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h index ef5c3f2994c9..1154a6dc6d26 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h @@ -48,8 +48,6 @@ extern void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmad extern void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr); extern void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr); extern void radix__flush_tlb_all(void); -extern void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct *mm, - unsigned long address); extern void radix__flush_tlb_lpid_page(unsigned int lpid, unsigned long addr, diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index 9c0a3083571b..f980f91cad8a 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -210,7 +210,6 @@ static inline void cpu_feature_keys_init(void) { } #define CPU_FTR_DAWR LONG_ASM_CONST(0x0000008000000000) #define CPU_FTR_DABRX LONG_ASM_CONST(0x0000010000000000) #define CPU_FTR_PMAO_BUG LONG_ASM_CONST(0x0000020000000000) -#define CPU_FTR_POWER9_DD1 LONG_ASM_CONST(0x0000040000000000) #define CPU_FTR_POWER9_DD2_1 LONG_ASM_CONST(0x0000080000000000) #define CPU_FTR_P9_TM_HV_ASSIST LONG_ASM_CONST(0x0000100000000000) #define CPU_FTR_P9_TM_XER_SO_BUG LONG_ASM_CONST(0x0000200000000000) @@ -464,8 +463,6 @@ static inline void cpu_feature_keys_init(void) { } CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \ CPU_FTR_TM_COMP | CPU_FTR_ARCH_300 | CPU_FTR_PKEY | \ CPU_FTR_P9_TLBIE_BUG | CPU_FTR_P9_TIDR) -#define CPU_FTRS_POWER9_DD1 ((CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1) & \ - (~CPU_FTR_SAO)) #define CPU_FTRS_POWER9_DD2_0 CPU_FTRS_POWER9 #define CPU_FTRS_POWER9_DD2_1 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1) #define CPU_FTRS_POWER9_DD2_2 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1 | \ @@ -489,16 +486,14 @@ static inline void cpu_feature_keys_init(void) { } 
#define CPU_FTRS_POSSIBLE \ (CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | CPU_FTRS_POWER8 | \ CPU_FTRS_POWER8_DD1 | CPU_FTR_ALTIVEC_COMP | CPU_FTR_VSX_COMP | \ - CPU_FTRS_POWER9 | CPU_FTRS_POWER9_DD1 | CPU_FTRS_POWER9_DD2_1 | \ - CPU_FTRS_POWER9_DD2_2) + CPU_FTRS_POWER9 | CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2) #else #define CPU_FTRS_POSSIBLE \ (CPU_FTRS_PPC970 | CPU_FTRS_POWER5 | \ CPU_FTRS_POWER6 | CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | \ CPU_FTRS_POWER8 | CPU_FTRS_POWER8_DD1 | CPU_FTRS_CELL | \ CPU_FTRS_PA6T | CPU_FTR_VSX_COMP | CPU_FTR_ALTIVEC_COMP | \ - CPU_FTRS_POWER9 | CPU_FTRS_POWER9_DD1 | CPU_FTRS_POWER9_DD2_1 | \ - CPU_FTRS_POWER9_DD2_2) + CPU_FTRS_POWER9 | CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2) #endif /* CONFIG_CPU_LITTLE_ENDIAN */ #endif #else @@ -567,7 +562,7 @@ enum { #define CPU_FTRS_ALWAYS \ (CPU_FTRS_POSSIBLE & ~CPU_FTR_HVMODE & CPU_FTRS_POWER7 & \ CPU_FTRS_POWER8E & CPU_FTRS_POWER8 & CPU_FTRS_POWER8_DD1 & \ - CPU_FTRS_POWER9 & CPU_FTRS_POWER9_DD1 & CPU_FTRS_POWER9_DD2_1 & \ + CPU_FTRS_POWER9 & CPU_FTRS_POWER9_DD2_1 & \ CPU_FTRS_DT_CPU_BASE) #else #define CPU_FTRS_ALWAYS \ @@ -575,7 +570,7 @@ enum { CPU_FTRS_POWER6 & CPU_FTRS_POWER7 & CPU_FTRS_CELL & \ CPU_FTRS_PA6T & CPU_FTRS_POWER8 & CPU_FTRS_POWER8E & \ CPU_FTRS_POWER8_DD1 & ~CPU_FTR_HVMODE & CPU_FTRS_POSSIBLE & \ - CPU_FTRS_POWER9 & CPU_FTRS_POWER9_DD1 & CPU_FTRS_POWER9_DD2_1 & \ + CPU_FTRS_POWER9 & CPU_FTRS_POWER9_DD2_1 & \ CPU_FTRS_DT_CPU_BASE) #endif /* CONFIG_CPU_LITTLE_ENDIAN */ #endif diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 6d34bd71139d..4e9cede5a7e7 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -187,11 +187,6 @@ struct paca_struct { u8 subcore_sibling_mask; /* Flag to request this thread not to stop */ atomic_t dont_stop; - /* - * Pointer to an array which contains pointer - * to the sibling threads' paca. 
- */ - struct paca_struct **thread_sibling_pacas; /* The PSSCR value that the kernel requested before going to stop */ u64 requested_psscr; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 0a0544335950..89cf15566c4e 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -766,7 +766,6 @@ int main(void) OFFSET(PACA_THREAD_IDLE_STATE, paca_struct, thread_idle_state); OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask); OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask); - OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas); OFFSET(PACA_REQ_PSSCR, paca_struct, requested_psscr); OFFSET(PACA_DONT_STOP, paca_struct, dont_stop); #define STOP_SPR(x, f) OFFSET(x, paca_struct, stop_sprs.f) diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index c8fc9691f8c7..bc75a2908a7e 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -485,25 +485,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check_early = __machine_check_early_realmode_p8, .platform = "power8", }, - { /* Power9 DD1*/ - .pvr_mask = 0xffffff00, - .pvr_value = 0x004e0100, - .cpu_name = "POWER9 (raw)", - .cpu_features = CPU_FTRS_POWER9_DD1, - .cpu_user_features = COMMON_USER_POWER9, - .cpu_user_features2 = COMMON_USER2_POWER9, - .mmu_features = MMU_FTRS_POWER9, - .icache_bsize = 128, - .dcache_bsize = 128, - .num_pmcs = 6, - .pmc_type = PPC_PMC_IBM, - .oprofile_cpu_type = "ppc64/power9", - .oprofile_type = PPC_OPROFILE_INVALID, - .cpu_setup = __setup_cpu_power9, - .cpu_restore = __restore_cpu_power9, - .machine_check_early = __machine_check_early_realmode_p9, - .platform = "power9", - }, { /* Power9 DD2.0 */ .pvr_mask = 0xffffefff, .pvr_value = 0x004e0200, diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 4be1c0de9406..98c373a4c1cf 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ 
b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -701,9 +701,7 @@ static __init void cpufeatures_cpu_quirks(void) /* * Not all quirks can be derived from the cpufeatures device tree. */ - if ((version & 0xffffff00) == 0x004e0100) - cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD1; - else if ((version & 0xffffefff) == 0x004e0200) + if ((version & 0xffffefff) == 0x004e0200) ; /* DD2.0 has no feature flag */ else if ((version & 0xffffefff) == 0x004e0201) cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1; diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 285c6465324a..76a14702cb9c 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -276,9 +276,7 @@ BEGIN_FTR_SECTION * * This interrupt can wake directly from idle. If that is the case, * the machine check is handled then the idle wakeup code is called - * to restore state. In that case, the POWER9 DD1 idle PACA workaround - * is not applied in the early machine check code, which will cause - * bugs. + * to restore state. */ mr r11,r1 /* Save r1 */ lhz r10,PACA_IN_MCE(r13) diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index e734f6e45abc..d85d5515a091 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -466,43 +466,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG) blr /* return 0 for wakeup cause / SRR1 value */ #endif -/* - * On waking up from stop 0,1,2 with ESL=1 on POWER9 DD1, - * HSPRG0 will be set to the HSPRG0 value of one of the - * threads in this core. Thus the value we have in r13 - * may not be this thread's paca pointer. - * - * Fortunately, the TIR remains invariant. Since this thread's - * paca pointer is recorded in all its sibling's paca, we can - * correctly recover this thread's paca pointer if we - * know the index of this thread in the core. - * - * This index can be obtained from the TIR. - * - * i.e, thread's position in the core = TIR. 
- * If this value is i, then this thread's paca is - * paca->thread_sibling_pacas[i]. - */ -power9_dd1_recover_paca: - mfspr r4, SPRN_TIR - /* - * Since each entry in thread_sibling_pacas is 8 bytes - * we need to left-shift by 3 bits. Thus r4 = i * 8 - */ - sldi r4, r4, 3 - /* Get &paca->thread_sibling_pacas[0] in r5 */ - ld r5, PACA_SIBLING_PACA_PTRS(r13) - /* Load paca->thread_sibling_pacas[i] into r13 */ - ldx r13, r4, r5 - SET_PACA(r13) - /* - * Indicate that we have lost NVGPR state - * which needs to be restored from the stack. - */ - li r3, 1 - stb r3,PACA_NAPSTATELOST(r13) - blr - /* * Called from machine check handler for powersave wakeups. * Low level machine check processing has already been done. Now just @@ -537,9 +500,6 @@ pnv_powersave_wakeup: ld r2, PACATOC(r13) BEGIN_FTR_SECTION -BEGIN_FTR_SECTION_NESTED(70) - bl power9_dd1_recover_paca -END_FTR_SECTION_NESTED_IFSET(CPU_FTR_POWER9_DD1, 70) bl pnv_restore_hyp_resource_arch300 FTR_SECTION_ELSE bl pnv_restore_hyp_resource_arch207 @@ -602,22 +562,12 @@ END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1) LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state) ld r4,ADDROFF(pnv_first_deep_stop_state)(r5) -BEGIN_FTR_SECTION_NESTED(71) - /* - * Assume that we are waking up from the state - * same as the Requested Level (RL) in the PSSCR - * which are Bits 60-63 - */ - ld r5,PACA_REQ_PSSCR(r13) - rldicl r5,r5,0,60 -FTR_SECTION_ELSE_NESTED(71) /* * 0-3 bits correspond to Power-Saving Level Status * which indicates the idle state we are waking up from */ mfspr r5, SPRN_PSSCR rldicl r5,r5,4,60 -ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_POWER9_DD1, 71) li r0, 0 /* clear requested_psscr to say we're awake */ std r0, PACA_REQ_PSSCR(r13) cmpd cr4,r5,r4 diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 9ef4aea9fffe..27f0caee55ea 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1250,17 +1250,9 @@ struct task_struct *__switch_to(struct task_struct *prev, * mappings. 
If the new process has the foreign real address * mappings, we must issue a cp_abort to clear any state and * prevent snooping, corruption or a covert channel. - * - * DD1 allows paste into normal system memory so we do an - * unpaired copy, rather than cp_abort, to clear the buffer, - * since cp_abort is quite expensive. */ - if (current_thread_info()->task->thread.used_vas) { + if (current_thread_info()->task->thread.used_vas) asm volatile(PPC_CP_ABORT); - } else if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { - asm volatile(PPC_COPY(%0, %1) - : : "r"(dummy_copy_buffer), "r"(0)); - } } #endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 176f911ee983..0af1c0aea1fe 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -66,10 +66,7 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, bits = root & RPDS_MASK; root = root & RPDB_MASK; - /* P9 DD1 interprets RTS (radix tree size) differently */ offset = rts + 31; - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) - offset -= 3; /* current implementations only support 52-bit space */ if (offset != 52) @@ -160,17 +157,7 @@ static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, unsigned long clr, unsigned long set, unsigned long addr, unsigned int shift) { - unsigned long old = 0; - - if (!(clr & _PAGE_PRESENT) && cpu_has_feature(CPU_FTR_POWER9_DD1) && - pte_present(*ptep)) { - /* have to invalidate it first */ - old = __radix_pte_update(ptep, _PAGE_PRESENT, 0); - kvmppc_radix_tlbie_page(kvm, addr, shift); - set |= _PAGE_PRESENT; - old &= _PAGE_PRESENT; - } - return __radix_pte_update(ptep, clr, set) | old; + return __radix_pte_update(ptep, clr, set); } void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr, diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index de686b340f4a..b568582120a3 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ 
b/arch/powerpc/kvm/book3s_hv.c @@ -1693,14 +1693,6 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len); break; case KVM_REG_PPC_TB_OFFSET: - /* - * POWER9 DD1 has an erratum where writing TBU40 causes - * the timebase to lose ticks. So we don't let the - * timebase offset be changed on P9 DD1. (It is - * initialized to zero.) - */ - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) - break; /* round up to multiple of 2^24 */ vcpu->arch.vcore->tb_offset = ALIGN(set_reg_val(id, *val), 1UL << 24); @@ -2026,8 +2018,6 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, /* * Set the default HFSCR for the guest from the host value. * This value is only used on POWER9. - * On POWER9 DD1, TM doesn't work, so we make sure to - * prevent the guest from using it. * On POWER9, we want to virtualize the doorbell facility, so we * turn off the HFSCR bit, which causes those instructions to trap. */ diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 153988d878e8..6e4554b273f1 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -916,9 +916,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_DAWR) mtspr SPRN_BESCR, r6 mtspr SPRN_PID, r7 mtspr SPRN_WORT, r8 -BEGIN_FTR_SECTION - PPC_INVALIDATE_ERAT -END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1) BEGIN_FTR_SECTION /* POWER8-only registers */ ld r5, VCPU_TCSCR(r4) @@ -1912,7 +1909,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) ld r5, VCPU_KVM(r9) lbz r0, KVM_RADIX(r5) cmpwi cr2, r0, 0 - beq cr2, 4f + beq cr2, 2f /* * Radix: do eieio; tlbsync; ptesync sequence in case we @@ -1952,11 +1949,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) bdnz 1b ptesync -2: /* Flush the ERAT on radix P9 DD1 guest exit */ -BEGIN_FTR_SECTION - PPC_INVALIDATE_ERAT -END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1) -4: +2: #endif /* CONFIG_PPC_RADIX_MMU */ /* @@ -3367,11 +3360,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) mtspr 
SPRN_CIABR, r0 mtspr SPRN_DAWRX, r0 - /* Flush the ERAT on radix P9 DD1 guest exit */ -BEGIN_FTR_SECTION - PPC_INVALIDATE_ERAT -END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1) - BEGIN_MMU_FTR_SECTION b 4f END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c index 6e41ba7ec8f4..4171ede8722b 100644 --- a/arch/powerpc/kvm/book3s_xive_template.c +++ b/arch/powerpc/kvm/book3s_xive_template.c @@ -25,18 +25,6 @@ static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc) */ eieio(); - /* - * DD1 bug workaround: If PIPR is less favored than CPPR - * ignore the interrupt or we might incorrectly lose an IPB - * bit. - */ - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { - __be64 qw1 = __x_readq(__x_tima + TM_QW1_OS); - u8 pipr = be64_to_cpu(qw1) & 0xff; - if (pipr >= xc->hw_cppr) - return; - } - /* Perform the acknowledge OS to register cycle. */ ack = be16_to_cpu(__x_readw(__x_tima + TM_SPC_ACK_OS_REG)); @@ -89,8 +77,15 @@ static void GLUE(X_PFX,source_eoi)(u32 hw_irq, struct xive_irq_data *xd) /* If the XIVE supports the new "store EOI facility, use it */ if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) __x_writeq(0, __x_eoi_page(xd) + XIVE_ESB_STORE_EOI); - else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) { + else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) opal_int_eoi(hw_irq); + else if (xd->flags & XIVE_IRQ_FLAG_LSI) { + /* + * For LSIs the HW EOI cycle is used rather than PQ bits, + * as they are automatically re-triggered in HW when still + * pending. + */ + __x_readq(__x_eoi_page(xd) + XIVE_ESB_LOAD_EOI); } else { uint64_t eoi_val; @@ -102,20 +97,12 @@ static void GLUE(X_PFX,source_eoi)(u32 hw_irq, struct xive_irq_data *xd) * * This allows us to then do a re-trigger if Q was set * rather than synthetizing an interrupt in software - * - * For LSIs, using the HW EOI cycle works around a problem - * on P9 DD1 PHBs where the other ESB accesses don't work - * properly.
*/ - if (xd->flags & XIVE_IRQ_FLAG_LSI) - __x_readq(__x_eoi_page(xd) + XIVE_ESB_LOAD_EOI); - else { - eoi_val = GLUE(X_PFX,esb_load)(xd, XIVE_ESB_SET_PQ_00); + eoi_val = GLUE(X_PFX,esb_load)(xd, XIVE_ESB_SET_PQ_00); - /* Re-trigger if needed */ - if ((eoi_val & 1) && __x_trig_page(xd)) - __x_writeq(0, __x_trig_page(xd)); - } + /* Re-trigger if needed */ + if ((eoi_val & 1) && __x_trig_page(xd)) + __x_writeq(0, __x_trig_page(xd)); } } diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 8318716e5075..5a72e980e25a 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -808,31 +808,6 @@ int hash__remove_section_mapping(unsigned long start, unsigned long end) } #endif /* CONFIG_MEMORY_HOTPLUG */ -static void update_hid_for_hash(void) -{ - unsigned long hid0; - unsigned long rb = 3UL << PPC_BITLSHIFT(53); /* IS = 3 */ - - asm volatile("ptesync": : :"memory"); - /* prs = 0, ric = 2, rs = 0, r = 1 is = 3 */ - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(0), "i"(0), "i"(2), "r"(0) : "memory"); - asm volatile("eieio; tlbsync; ptesync; isync; slbia": : :"memory"); - trace_tlbie(0, 0, rb, 0, 2, 0, 0); - - /* - * now switch the HID - */ - hid0 = mfspr(SPRN_HID0); - hid0 &= ~HID0_POWER9_RADIX; - mtspr(SPRN_HID0, hid0); - asm volatile("isync": : :"memory"); - - /* Wait for it to happen */ - while ((mfspr(SPRN_HID0) & HID0_POWER9_RADIX)) - cpu_relax(); -} - static void __init hash_init_partition_table(phys_addr_t hash_table, unsigned long htab_size) { @@ -845,8 +820,6 @@ static void __init hash_init_partition_table(phys_addr_t hash_table, htab_size = __ilog2(htab_size) - 18; mmu_partition_table_set_entry(0, hash_table | htab_size, 0); pr_info("Partition table %p\n", partition_tb); - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) - update_hid_for_hash(); } static void __init htab_initialize(void) @@ -1077,9 +1050,6 @@ void hash__early_init_mmu_secondary(void) /* Initialize hash table for that CPU */ if 
(!firmware_has_feature(FW_FEATURE_LPAR)) { - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) - update_hid_for_hash(); - if (!cpu_has_feature(CPU_FTR_ARCH_300)) mtspr(SPRN_SDR1, _SDR1); else diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 7c5f479c5c00..ec7538a802f9 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -620,15 +620,12 @@ static int __init add_huge_page_size(unsigned long long size) * firmware we only add hugetlb support for page sizes that can be * supported by linux page table layout. * For now we have - * Radix: 2M + * Radix: 2M and 1G * Hash: 16M and 16G */ if (radix_enabled()) { - if (mmu_psize != MMU_PAGE_2M) { - if (cpu_has_feature(CPU_FTR_POWER9_DD1) || - (mmu_psize != MMU_PAGE_1G)) - return -EINVAL; - } + if (mmu_psize != MMU_PAGE_2M && mmu_psize != MMU_PAGE_1G) + return -EINVAL; } else { if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G) return -EINVAL; diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index f3d4b4a0e561..39e9ef0eb78b 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -273,15 +273,7 @@ void arch_exit_mmap(struct mm_struct *mm) #ifdef CONFIG_PPC_RADIX_MMU void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) { - - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { - isync(); - mtspr(SPRN_PID, next->context.id); - isync(); - asm volatile(PPC_INVALIDATE_ERAT : : :"memory"); - } else { - mtspr(SPRN_PID, next->context.id); - isync(); - } + mtspr(SPRN_PID, next->context.id); + isync(); } #endif diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 96f68c5aa1f5..bba168d02235 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -226,16 +226,6 @@ void radix__mark_rodata_ro(void) { unsigned long start, end; - /* - * mark_rodata_ro() will mark itself as !writable at some point. 
- * Due to DD1 workaround in radix__pte_update(), we'll end up with - * an invalid pte and the system will crash quite severly. - */ - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { - pr_warn("Warning: Unable to mark rodata read only on P9 DD1\n"); - return; - } - start = (unsigned long)_stext; end = (unsigned long)__init_begin; @@ -533,35 +523,6 @@ void __init radix__early_init_devtree(void) return; } -static void update_hid_for_radix(void) -{ - unsigned long hid0; - unsigned long rb = 3UL << PPC_BITLSHIFT(53); /* IS = 3 */ - - asm volatile("ptesync": : :"memory"); - /* prs = 0, ric = 2, rs = 0, r = 1 is = 3 */ - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(1), "i"(0), "i"(2), "r"(0) : "memory"); - /* prs = 1, ric = 2, rs = 0, r = 1 is = 3 */ - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(1), "i"(1), "i"(2), "r"(0) : "memory"); - asm volatile("eieio; tlbsync; ptesync; isync; slbia": : :"memory"); - trace_tlbie(0, 0, rb, 0, 2, 0, 1); - trace_tlbie(0, 0, rb, 0, 2, 1, 1); - - /* - * now switch the HID - */ - hid0 = mfspr(SPRN_HID0); - hid0 |= HID0_POWER9_RADIX; - mtspr(SPRN_HID0, hid0); - asm volatile("isync": : :"memory"); - - /* Wait for it to happen */ - while (!(mfspr(SPRN_HID0) & HID0_POWER9_RADIX)) - cpu_relax(); -} - static void radix_init_amor(void) { /* @@ -576,22 +537,12 @@ static void radix_init_amor(void) static void radix_init_iamr(void) { - unsigned long iamr; - - /* - * The IAMR should set to 0 on DD1. - */ - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) - iamr = 0; - else - iamr = (1ul << 62); - /* * Radix always uses key0 of the IAMR to determine if an access is * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction * fetch. 
*/ - mtspr(SPRN_IAMR, iamr); + mtspr(SPRN_IAMR, (1ul << 62)); } void __init radix__early_init_mmu(void) @@ -644,8 +595,6 @@ void __init radix__early_init_mmu(void) if (!firmware_has_feature(FW_FEATURE_LPAR)) { radix_init_native(); - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) - update_hid_for_radix(); lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); radix_init_partition_table(); @@ -671,10 +620,6 @@ void radix__early_init_mmu_secondary(void) * update partition table control register and UPRT */ if (!firmware_has_feature(FW_FEATURE_LPAR)) { - - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) - update_hid_for_radix(); - lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); @@ -1095,8 +1040,7 @@ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, * To avoid NMMU hang while relaxing access, we need mark * the pte invalid in between. */ - if (cpu_has_feature(CPU_FTR_POWER9_DD1) || - atomic_read(&mm->context.copros) > 0) { + if (atomic_read(&mm->context.copros) > 0) { unsigned long old_pte, new_pte; old_pte = __radix_pte_update(ptep, ~0, 0); diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 67a6e86d3e7e..902767b8a9c1 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -994,24 +994,6 @@ void radix__flush_tlb_all(void) asm volatile("eieio; tlbsync; ptesync": : :"memory"); } -void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct *mm, - unsigned long address) -{ - /* - * We track page size in pte only for DD1, So we can - * call this only on DD1. 
- */ - if (!cpu_has_feature(CPU_FTR_POWER9_DD1)) { - VM_WARN_ON(1); - return; - } - - if (old_pte & R_PAGE_LARGE) - radix__flush_tlb_page_psize(mm, address, MMU_PAGE_2M); - else - radix__flush_tlb_page_psize(mm, address, mmu_virtual_psize); -} - #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE extern void radix_kvm_prefetch_workaround(struct mm_struct *mm) { diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 3f66fcf8ad99..01f92c4a9f02 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -128,10 +128,6 @@ static inline void power_pmu_bhrb_disable(struct perf_event *event) {} static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) {} static inline void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) {} static void pmao_restore_workaround(bool ebb) { } -static bool use_ic(u64 event) -{ - return false; -} #endif /* CONFIG_PPC32 */ static bool regs_use_siar(struct pt_regs *regs) @@ -714,14 +710,6 @@ static void pmao_restore_workaround(bool ebb) mtspr(SPRN_PMC6, pmcs[5]); } -static bool use_ic(u64 event) -{ - if (cpu_has_feature(CPU_FTR_POWER9_DD1) && - (event == 0x200f2 || event == 0x300f2)) - return true; - - return false; -} #endif /* CONFIG_PPC64 */ static void perf_event_interrupt(struct pt_regs *regs); @@ -1046,7 +1034,6 @@ static u64 check_and_compute_delta(u64 prev, u64 val) static void power_pmu_read(struct perf_event *event) { s64 val, delta, prev; - struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); if (event->hw.state & PERF_HES_STOPPED) return; @@ -1056,13 +1043,6 @@ static void power_pmu_read(struct perf_event *event) if (is_ebb_event(event)) { val = read_pmc(event->hw.idx); - if (use_ic(event->attr.config)) { - val = mfspr(SPRN_IC); - if (val > cpuhw->ic_init) - val = val - cpuhw->ic_init; - else - val = val + (0 - cpuhw->ic_init); - } local64_set(&event->hw.prev_count, val); return; } @@ -1076,13 +1056,6 @@ static void power_pmu_read(struct perf_event *event) prev = 
local64_read(&event->hw.prev_count); barrier(); val = read_pmc(event->hw.idx); - if (use_ic(event->attr.config)) { - val = mfspr(SPRN_IC); - if (val > cpuhw->ic_init) - val = val - cpuhw->ic_init; - else - val = val + (0 - cpuhw->ic_init); - } delta = check_and_compute_delta(prev, val); if (!delta) return; @@ -1535,13 +1508,6 @@ static int power_pmu_add(struct perf_event *event, int ef_flags) event->attr.branch_sample_type); } - /* - * Workaround for POWER9 DD1 to use the Instruction Counter - * register value for instruction counting - */ - if (use_ic(event->attr.config)) - cpuhw->ic_init = mfspr(SPRN_IC); - perf_pmu_enable(event->pmu); local_irq_restore(flags); return ret; diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c index 2efee3f196f5..177de814286f 100644 --- a/arch/powerpc/perf/isa207-common.c +++ b/arch/powerpc/perf/isa207-common.c @@ -59,7 +59,7 @@ static bool is_event_valid(u64 event) { u64 valid_mask = EVENT_VALID_MASK; - if (cpu_has_feature(CPU_FTR_ARCH_300) && !cpu_has_feature(CPU_FTR_POWER9_DD1)) + if (cpu_has_feature(CPU_FTR_ARCH_300)) valid_mask = p9_EVENT_VALID_MASK; return !(event & ~valid_mask); @@ -86,8 +86,6 @@ static void mmcra_sdar_mode(u64 event, unsigned long *mmcra) * Incase of Power9: * Marked event: MMCRA[SDAR_MODE] will be set to 0b00 ('No Updates'), * or if group already have any marked events. - * Non-Marked events (for DD1): - * MMCRA[SDAR_MODE] will be set to 0b01 * For rest * MMCRA[SDAR_MODE] will be set from event code. * If sdar_mode from event is zero, default to 0b01. 
Hardware @@ -96,7 +94,7 @@ static void mmcra_sdar_mode(u64 event, unsigned long *mmcra) if (cpu_has_feature(CPU_FTR_ARCH_300)) { if (is_event_marked(event) || (*mmcra & MMCRA_SAMPLE_ENABLE)) *mmcra &= MMCRA_SDAR_MODE_NO_UPDATES; - else if (!cpu_has_feature(CPU_FTR_POWER9_DD1) && p9_SDAR_MODE(event)) + else if (p9_SDAR_MODE(event)) *mmcra |= p9_SDAR_MODE(event) << MMCRA_SDAR_MODE_SHIFT; else *mmcra |= MMCRA_SDAR_MODE_DCACHE; @@ -106,7 +104,7 @@ static void mmcra_sdar_mode(u64 event, unsigned long *mmcra) static u64 thresh_cmp_val(u64 value) { - if (cpu_has_feature(CPU_FTR_ARCH_300) && !cpu_has_feature(CPU_FTR_POWER9_DD1)) + if (cpu_has_feature(CPU_FTR_ARCH_300)) return value << p9_MMCRA_THR_CMP_SHIFT; return value << MMCRA_THR_CMP_SHIFT; @@ -114,7 +112,7 @@ static u64 thresh_cmp_val(u64 value) static unsigned long combine_from_event(u64 event) { - if (cpu_has_feature(CPU_FTR_ARCH_300) && !cpu_has_feature(CPU_FTR_POWER9_DD1)) + if (cpu_has_feature(CPU_FTR_ARCH_300)) return p9_EVENT_COMBINE(event); return EVENT_COMBINE(event); @@ -122,7 +120,7 @@ static unsigned long combine_from_event(u64 event) static unsigned long combine_shift(unsigned long pmc) { - if (cpu_has_feature(CPU_FTR_ARCH_300) && !cpu_has_feature(CPU_FTR_POWER9_DD1)) + if (cpu_has_feature(CPU_FTR_ARCH_300)) return p9_MMCR1_COMBINE_SHIFT(pmc); return MMCR1_COMBINE_SHIFT(pmc); diff --git a/arch/powerpc/perf/isa207-common.h b/arch/powerpc/perf/isa207-common.h index 6a0b586c935a..0028f4b9490d 100644 --- a/arch/powerpc/perf/isa207-common.h +++ b/arch/powerpc/perf/isa207-common.h @@ -158,11 +158,6 @@ CNST_PMC_VAL(1) | CNST_PMC_VAL(2) | CNST_PMC_VAL(3) | \ CNST_PMC_VAL(4) | CNST_PMC_VAL(5) | CNST_PMC_VAL(6) | CNST_NC_VAL -/* - * Lets restrict use of PMC5 for instruction counting. 
- */ -#define P9_DD1_TEST_ADDER (ISA207_TEST_ADDER | CNST_PMC_VAL(5)) - /* Bits in MMCR1 for PowerISA v2.07 */ #define MMCR1_UNIT_SHIFT(pmc) (60 - (4 * ((pmc) - 1))) #define MMCR1_COMBINE_SHIFT(pmc) (35 - ((pmc) - 1)) diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c index 2ca0b33b4efb..e012b1030a5b 100644 --- a/arch/powerpc/perf/power9-pmu.c +++ b/arch/powerpc/perf/power9-pmu.c @@ -219,12 +219,6 @@ static struct attribute_group power9_pmu_events_group = { .attrs = power9_events_attr, }; -static const struct attribute_group *power9_isa207_pmu_attr_groups[] = { - &isa207_pmu_format_group, - &power9_pmu_events_group, - NULL, -}; - PMU_FORMAT_ATTR(event, "config:0-51"); PMU_FORMAT_ATTR(pmcxsel, "config:0-7"); PMU_FORMAT_ATTR(mark, "config:8"); @@ -267,17 +261,6 @@ static const struct attribute_group *power9_pmu_attr_groups[] = { NULL, }; -static int power9_generic_events_dd1[] = { - [PERF_COUNT_HW_CPU_CYCLES] = PM_CYC, - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = PM_ICT_NOSLOT_CYC, - [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = PM_CMPLU_STALL, - [PERF_COUNT_HW_INSTRUCTIONS] = PM_INST_DISP, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = PM_BR_CMPL_ALT, - [PERF_COUNT_HW_BRANCH_MISSES] = PM_BR_MPRED_CMPL, - [PERF_COUNT_HW_CACHE_REFERENCES] = PM_LD_REF_L1, - [PERF_COUNT_HW_CACHE_MISSES] = PM_LD_MISS_L1_FIN, -}; - static int power9_generic_events[] = { [PERF_COUNT_HW_CPU_CYCLES] = PM_CYC, [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = PM_ICT_NOSLOT_CYC, @@ -439,25 +422,6 @@ static int power9_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { #undef C -static struct power_pmu power9_isa207_pmu = { - .name = "POWER9", - .n_counter = MAX_PMU_COUNTERS, - .add_fields = ISA207_ADD_FIELDS, - .test_adder = P9_DD1_TEST_ADDER, - .compute_mmcr = isa207_compute_mmcr, - .config_bhrb = power9_config_bhrb, - .bhrb_filter_map = power9_bhrb_filter_map, - .get_constraint = isa207_get_constraint, - .get_alternatives = power9_get_alternatives, - .disable_pmc = isa207_disable_pmc, - 
.flags = PPMU_NO_SIAR | PPMU_ARCH_207S, - .n_generic = ARRAY_SIZE(power9_generic_events_dd1), - .generic_events = power9_generic_events_dd1, - .cache_events = &power9_cache_events, - .attr_groups = power9_isa207_pmu_attr_groups, - .bhrb_nr = 32, -}; - static struct power_pmu power9_pmu = { .name = "POWER9", .n_counter = MAX_PMU_COUNTERS, @@ -500,23 +464,7 @@ static int __init init_power9_pmu(void) } } - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { - /* - * Since PM_INST_CMPL may not provide right counts in all - * sampling scenarios in power9 DD1, instead use PM_INST_DISP. - */ - EVENT_VAR(PM_INST_CMPL, _g).id = PM_INST_DISP; - /* - * Power9 DD1 should use PM_BR_CMPL_ALT event code for - * "branches" to provide correct counter value. - */ - EVENT_VAR(PM_BR_CMPL, _g).id = PM_BR_CMPL_ALT; - EVENT_VAR(PM_BR_CMPL, _c).id = PM_BR_CMPL_ALT; - rc = register_power_pmu(&power9_isa207_pmu); - } else { - rc = register_power_pmu(&power9_pmu); - } - + rc = register_power_pmu(&power9_pmu); if (rc) return rc; diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 1c5d0675b43c..12f13acee1f6 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -177,11 +177,6 @@ static void pnv_alloc_idle_core_states(void) paca_ptrs[cpu]->core_idle_state_ptr = core_idle_state; paca_ptrs[cpu]->thread_idle_state = PNV_THREAD_RUNNING; paca_ptrs[cpu]->thread_mask = 1 << j; - if (!cpu_has_feature(CPU_FTR_POWER9_DD1)) - continue; - paca_ptrs[cpu]->thread_sibling_pacas = - kmalloc_node(paca_ptr_array_size, - GFP_KERNEL, node); } } @@ -805,29 +800,6 @@ static int __init pnv_init_idle_states(void) pnv_alloc_idle_core_states(); - /* - * For each CPU, record its PACA address in each of it's - * sibling thread's PACA at the slot corresponding to this - * CPU's index in the core. 
- */ - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { - int cpu; - - pr_info("powernv: idle: Saving PACA pointers of all CPUs in their thread sibling PACA\n"); - for_each_present_cpu(cpu) { - int base_cpu = cpu_first_thread_sibling(cpu); - int idx = cpu_thread_in_core(cpu); - int i; - - for (i = 0; i < threads_per_core; i++) { - int j = base_cpu + i; - - paca_ptrs[j]->thread_sibling_pacas[idx] = - paca_ptrs[cpu]; - } - } - } - if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED) ppc_md.power_save = power7_idle; diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index b80909957792..0d354e19ef92 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -283,23 +283,6 @@ static void pnv_cause_ipi(int cpu) ic_cause_ipi(cpu); } -static void pnv_p9_dd1_cause_ipi(int cpu) -{ - int this_cpu = get_cpu(); - - /* - * POWER9 DD1 has a global addressed msgsnd, but for now we restrict - * IPIs to same core, because it requires additional synchronization - * for inter-core doorbells which we do not implement. 
- */ - if (cpumask_test_cpu(cpu, cpu_sibling_mask(this_cpu))) - doorbell_global_ipi(cpu); - else - ic_cause_ipi(cpu); - - put_cpu(); -} - static void __init pnv_smp_probe(void) { if (xive_enabled()) @@ -311,14 +294,10 @@ static void __init pnv_smp_probe(void) ic_cause_ipi = smp_ops->cause_ipi; WARN_ON(!ic_cause_ipi); - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) - smp_ops->cause_ipi = pnv_p9_dd1_cause_ipi; - else - smp_ops->cause_ipi = doorbell_global_ipi; - } else { + if (cpu_has_feature(CPU_FTR_ARCH_300)) + smp_ops->cause_ipi = doorbell_global_ipi; + else smp_ops->cause_ipi = pnv_cause_ipi; - } } } diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 3459015092fa..4758173df426 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -319,7 +319,7 @@ void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd) * The FW told us to call it. This happens for some * interrupt sources that need additional HW whacking * beyond the ESB manipulation. For example LPC interrupts - * on P9 DD1.0 need a latch to be clared in the LPC bridge + * on P9 DD1.0 needed a latch to be cleared in the LPC bridge * itself. The Firmware will take care of it. */ if (WARN_ON_ONCE(!xive_ops->eoi)) @@ -337,9 +337,9 @@ void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd) * This allows us to then do a re-trigger if Q was set * rather than synthesizing an interrupt in software * - * For LSIs, using the HW EOI cycle works around a problem - * on P9 DD1 PHBs where the other ESB accesses don't work - * properly. + * For LSIs the HW EOI cycle is used rather than PQ bits, + * as they are automatically re-triggered in HW when still + * pending.
*/ if (xd->flags & XIVE_IRQ_FLAG_LSI) xive_esb_read(xd, XIVE_ESB_LOAD_EOI); diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 47166ad2a669..21119cfe8474 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2429,7 +2429,6 @@ static void dump_one_paca(int cpu) DUMP(p, thread_idle_state, "%#-*x"); DUMP(p, thread_mask, "%#-*x"); DUMP(p, subcore_sibling_mask, "%#-*x"); - DUMP(p, thread_sibling_pacas, "%-*px"); DUMP(p, requested_psscr, "%#-*llx"); DUMP(p, stop_sprs.pid, "%#-*llx"); DUMP(p, stop_sprs.ldbar, "%#-*llx"); diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index 918d4fb742d1..505f973e13f3 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -865,14 +865,6 @@ static inline bool cxl_is_power9(void) return false; } -static inline bool cxl_is_power9_dd1(void) -{ - if ((pvr_version_is(PVR_POWER9)) && - cpu_has_feature(CPU_FTR_POWER9_DD1)) - return true; - return false; -} - ssize_t cxl_pci_afu_read_err_buffer(struct cxl_afu *afu, char *buf, loff_t off, size_t count); diff --git a/drivers/misc/cxl/cxllib.c b/drivers/misc/cxl/cxllib.c index 0bc7c31cf739..5a3f91255258 100644 --- a/drivers/misc/cxl/cxllib.c +++ b/drivers/misc/cxl/cxllib.c @@ -102,10 +102,6 @@ int cxllib_get_xsl_config(struct pci_dev *dev, struct cxllib_xsl_config *cfg) rc = cxl_get_xsl9_dsnctl(dev, capp_unit_id, &cfg->dsnctl); if (rc) return rc; - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { - /* workaround for DD1 - nbwind = capiind */ - cfg->dsnctl |= ((u64)0x02 << (63-47)); - } cfg->version = CXL_XSL_CONFIG_CURRENT_VERSION; cfg->log_bar_size = CXL_CAPI_WINDOW_LOG_SIZE; diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index 429d6de1dde7..2af0d4c47b76 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -465,23 +465,21 @@ int cxl_get_xsl9_dsnctl(struct pci_dev *dev, u64 capp_unit_id, u64 *reg) /* nMMU_ID Defaults to: b’000001001’*/ xsl_dsnctl |= ((u64)0x09 << (63-28)); - if (!(cxl_is_power9_dd1())) { - /* - * 
Used to identify CAPI packets which should be sorted into - * the Non-Blocking queues by the PHB. This field should match - * the PHB PBL_NBW_CMPM register - * nbwind=0x03, bits [57:58], must include capi indicator. - * Not supported on P9 DD1. - */ - xsl_dsnctl |= (nbwind << (63-55)); + /* + * Used to identify CAPI packets which should be sorted into + * the Non-Blocking queues by the PHB. This field should match + * the PHB PBL_NBW_CMPM register + * nbwind=0x03, bits [57:58], must include capi indicator. + * Not supported on P9 DD1. + */ + xsl_dsnctl |= (nbwind << (63-55)); - /* - * Upper 16b address bits of ASB_Notify messages sent to the - * system. Need to match the PHB’s ASN Compare/Mask Register. - * Not supported on P9 DD1. - */ - xsl_dsnctl |= asnind; - } + /* + * Upper 16b address bits of ASB_Notify messages sent to the + * system. Need to match the PHB’s ASN Compare/Mask Register. + * Not supported on P9 DD1. + */ + xsl_dsnctl |= asnind; *reg = xsl_dsnctl; return 0; @@ -539,15 +537,8 @@ static int init_implementation_adapter_regs_psl9(struct cxl *adapter, /* Snoop machines */ cxl_p1_write(adapter, CXL_PSL9_APCDEDALLOC, 0x800F000200000000ULL); - if (cxl_is_power9_dd1()) { - /* Disabling deadlock counter CAR */ - cxl_p1_write(adapter, CXL_PSL9_GP_CT, 0x0020000000000001ULL); - /* Enable NORST */ - cxl_p1_write(adapter, CXL_PSL9_DEBUG, 0x8000000000000000ULL); - } else { - /* Enable NORST and DD2 features */ - cxl_p1_write(adapter, CXL_PSL9_DEBUG, 0xC000000000000000ULL); - } + /* Enable NORST and DD2 features */ + cxl_p1_write(adapter, CXL_PSL9_DEBUG, 0xC000000000000000ULL); /* * Check if PSL has data-cache. We need to flush adapter datacache From da2bb0da730c5c427f66ce501aa4367f6921779e Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 4 Jul 2018 16:13:44 +1000 Subject: [PATCH 2/7] powerpc/powernv: Remove useless wrapper This gets rid of a useless wrapper around pnv_pci_ioda2_table_free_pages(). 
Reviewed-by: David Gibson Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 5bd0eb6681bc..d453f88c3c3e 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2199,11 +2199,6 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false); } -static void pnv_ioda2_table_free(struct iommu_table *tbl) -{ - pnv_pci_ioda2_table_free_pages(tbl); -} - static struct iommu_table_ops pnv_ioda2_iommu_ops = { .set = pnv_ioda2_tce_build, #ifdef CONFIG_IOMMU_API @@ -2212,7 +2207,7 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = { #endif .clear = pnv_ioda2_tce_free, .get = pnv_tce_get, - .free = pnv_ioda2_table_free, + .free = pnv_pci_ioda2_table_free_pages, }; static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data) From 191c22879fbcfd98a7fe9a51786ef41253b1549b Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 4 Jul 2018 16:13:45 +1000 Subject: [PATCH 3/7] powerpc/powernv: Move TCE manipulation code to its own file Right now we have allocation code in pci-ioda.c and traversing code in pci.c, let's keep them together. However both files are big enough already so let's move this business to a new file. While we are at it, move the code which links IOMMU table groups to IOMMU tables as it is not specific to any PNV PHB model. This puts exported symbols from the new file together. This fixes several warnings from checkpatch.pl like this: "WARNING: Prefer 'unsigned int' to bare use of 'unsigned'". As this is almost cut-n-paste, there should be no behavioral change.
Reviewed-by: David Gibson Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/Makefile | 2 +- arch/powerpc/platforms/powernv/pci-ioda-tce.c | 313 ++++++++++++++++++ arch/powerpc/platforms/powernv/pci-ioda.c | 146 -------- arch/powerpc/platforms/powernv/pci.c | 158 --------- arch/powerpc/platforms/powernv/pci.h | 41 ++- 5 files changed, 340 insertions(+), 320 deletions(-) create mode 100644 arch/powerpc/platforms/powernv/pci-ioda-tce.c diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 703a350a7f4e..b540ce8eec55 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -6,7 +6,7 @@ obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o -obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o +obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o obj-$(CONFIG_CXL_BASE) += pci-cxl.o obj-$(CONFIG_EEH) += eeh-powernv.o obj-$(CONFIG_PPC_SCOM) += opal-xscom.o diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c new file mode 100644 index 000000000000..726b8693f5ae --- /dev/null +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -0,0 +1,313 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * TCE helpers for IODA PCI/PCIe on PowerNV platforms + * + * Copyright 2018 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include + +#include +#include +#include "pci.h" + +void pnv_pci_setup_iommu_table(struct iommu_table *tbl, + void *tce_mem, u64 tce_size, + u64 dma_offset, unsigned int page_shift) +{ + tbl->it_blocksize = 16; + tbl->it_base = (unsigned long)tce_mem; + tbl->it_page_shift = page_shift; + tbl->it_offset = dma_offset >> tbl->it_page_shift; + tbl->it_index = 0; + tbl->it_size = tce_size >> 3; + tbl->it_busno = 0; + tbl->it_type = TCE_PCI; +} + +static __be64 *pnv_tce(struct iommu_table *tbl, long idx) +{ + __be64 *tmp = ((__be64 *)tbl->it_base); + int level = tbl->it_indirect_levels; + const long shift = ilog2(tbl->it_level_size); + unsigned long mask = (tbl->it_level_size - 1) << (level * shift); + + while (level) { + int n = (idx & mask) >> (level * shift); + unsigned long tce = be64_to_cpu(tmp[n]); + + tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE)); + idx &= ~mask; + mask >>= shift; + --level; + } + + return tmp + idx; +} + +int pnv_tce_build(struct iommu_table *tbl, long index, long npages, + unsigned long uaddr, enum dma_data_direction direction, + unsigned long attrs) +{ + u64 proto_tce = iommu_direction_to_tce_perm(direction); + u64 rpn = __pa(uaddr) >> tbl->it_page_shift; + long i; + + if (proto_tce & TCE_PCI_WRITE) + proto_tce |= TCE_PCI_READ; + + for (i = 0; i < npages; i++) { + unsigned long newtce = proto_tce | + ((rpn + i) << tbl->it_page_shift); + unsigned long idx = index - tbl->it_offset + i; + + *(pnv_tce(tbl, idx)) = cpu_to_be64(newtce); + } + + return 0; +} + +#ifdef CONFIG_IOMMU_API +int pnv_tce_xchg(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction) +{ + u64 proto_tce = iommu_direction_to_tce_perm(*direction); + unsigned long newtce = *hpa | proto_tce, oldtce; + unsigned long idx = index - tbl->it_offset; + + BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl)); + + if (newtce & TCE_PCI_WRITE) + newtce |= TCE_PCI_READ; + + oldtce = be64_to_cpu(xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce))); + *hpa = 
oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE); + *direction = iommu_tce_direction(oldtce); + + return 0; +} +#endif + +void pnv_tce_free(struct iommu_table *tbl, long index, long npages) +{ + long i; + + for (i = 0; i < npages; i++) { + unsigned long idx = index - tbl->it_offset + i; + + *(pnv_tce(tbl, idx)) = cpu_to_be64(0); + } +} + +unsigned long pnv_tce_get(struct iommu_table *tbl, long index) +{ + return be64_to_cpu(*(pnv_tce(tbl, index - tbl->it_offset))); +} + +static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr, + unsigned long size, unsigned int levels) +{ + const unsigned long addr_ul = (unsigned long) addr & + ~(TCE_PCI_READ | TCE_PCI_WRITE); + + if (levels) { + long i; + u64 *tmp = (u64 *) addr_ul; + + for (i = 0; i < size; ++i) { + unsigned long hpa = be64_to_cpu(tmp[i]); + + if (!(hpa & (TCE_PCI_READ | TCE_PCI_WRITE))) + continue; + + pnv_pci_ioda2_table_do_free_pages(__va(hpa), size, + levels - 1); + } + } + + free_pages(addr_ul, get_order(size << 3)); +} + +void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl) +{ + const unsigned long size = tbl->it_indirect_levels ? 
+ tbl->it_level_size : tbl->it_size; + + if (!tbl->it_size) + return; + + pnv_pci_ioda2_table_do_free_pages((__be64 *)tbl->it_base, size, + tbl->it_indirect_levels); +} + +static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned int shift, + unsigned int levels, unsigned long limit, + unsigned long *current_offset, unsigned long *total_allocated) +{ + struct page *tce_mem = NULL; + __be64 *addr, *tmp; + unsigned int order = max_t(unsigned int, shift, PAGE_SHIFT) - + PAGE_SHIFT; + unsigned long allocated = 1UL << (order + PAGE_SHIFT); + unsigned int entries = 1UL << (shift - 3); + long i; + + tce_mem = alloc_pages_node(nid, GFP_KERNEL, order); + if (!tce_mem) { + pr_err("Failed to allocate a TCE memory, order=%d\n", order); + return NULL; + } + addr = page_address(tce_mem); + memset(addr, 0, allocated); + *total_allocated += allocated; + + --levels; + if (!levels) { + *current_offset += allocated; + return addr; + } + + for (i = 0; i < entries; ++i) { + tmp = pnv_pci_ioda2_table_do_alloc_pages(nid, shift, + levels, limit, current_offset, total_allocated); + if (!tmp) + break; + + addr[i] = cpu_to_be64(__pa(tmp) | + TCE_PCI_READ | TCE_PCI_WRITE); + + if (*current_offset >= limit) + break; + } + + return addr; +} + +long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, + __u32 page_shift, __u64 window_size, __u32 levels, + struct iommu_table *tbl) +{ + void *addr; + unsigned long offset = 0, level_shift, total_allocated = 0; + const unsigned int window_shift = ilog2(window_size); + unsigned int entries_shift = window_shift - page_shift; + unsigned int table_shift = max_t(unsigned int, entries_shift + 3, + PAGE_SHIFT); + const unsigned long tce_table_size = 1UL << table_shift; + + if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS)) + return -EINVAL; + + if (!is_power_of_2(window_size)) + return -EINVAL; + + /* Adjust direct table size from window_size and levels */ + entries_shift = (entries_shift + levels - 1) / levels; + level_shift = 
entries_shift + 3; + level_shift = max_t(unsigned int, level_shift, PAGE_SHIFT); + + if ((level_shift - 3) * levels + page_shift >= 60) + return -EINVAL; + + /* Allocate TCE table */ + addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, + levels, tce_table_size, &offset, &total_allocated); + + /* addr==NULL means that the first level allocation failed */ + if (!addr) + return -ENOMEM; + + /* + * First level was allocated but some lower level failed as + * we did not allocate as much as we wanted, + * release partially allocated table. + */ + if (offset < tce_table_size) { + pnv_pci_ioda2_table_do_free_pages(addr, + 1ULL << (level_shift - 3), levels - 1); + return -ENOMEM; + } + + /* Setup linux iommu table */ + pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset, + page_shift); + tbl->it_level_size = 1ULL << (level_shift - 3); + tbl->it_indirect_levels = levels - 1; + tbl->it_allocated_size = total_allocated; + + pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n", + window_size, tce_table_size, bus_offset); + + return 0; +} + +static void pnv_iommu_table_group_link_free(struct rcu_head *head) +{ + struct iommu_table_group_link *tgl = container_of(head, + struct iommu_table_group_link, rcu); + + kfree(tgl); +} + +void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, + struct iommu_table_group *table_group) +{ + long i; + bool found; + struct iommu_table_group_link *tgl; + + if (!tbl || !table_group) + return; + + /* Remove link to a group from table's list of attached groups */ + found = false; + list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { + if (tgl->table_group == table_group) { + list_del_rcu(&tgl->next); + call_rcu(&tgl->rcu, pnv_iommu_table_group_link_free); + found = true; + break; + } + } + if (WARN_ON(!found)) + return; + + /* Clean a pointer to iommu_table in iommu_table_group::tables[] */ + found = false; + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + if (table_group->tables[i] == tbl) { + 
table_group->tables[i] = NULL; + found = true; + break; + } + } + WARN_ON(!found); +} + +long pnv_pci_link_table_and_group(int node, int num, + struct iommu_table *tbl, + struct iommu_table_group *table_group) +{ + struct iommu_table_group_link *tgl = NULL; + + if (WARN_ON(!tbl || !table_group)) + return -EINVAL; + + tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL, + node); + if (!tgl) + return -ENOMEM; + + tgl->table_group = table_group; + list_add_rcu(&tgl->next, &tbl->it_group_list); + + table_group->tables[num] = tbl; + + return 0; +} diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index d453f88c3c3e..4abf1175626e 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -51,12 +51,8 @@ #define PNV_IODA1_M64_SEGS 8 /* Segments per M64 BAR */ #define PNV_IODA1_DMA32_SEGSIZE 0x10000000 -#define POWERNV_IOMMU_DEFAULT_LEVELS 1 -#define POWERNV_IOMMU_MAX_LEVELS 5 - static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK", "NPU_OCAPI" }; -static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl); void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...) 
@@ -2457,10 +2453,6 @@ void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) pe->tce_bypass_enabled = enable; } -static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, - __u32 page_shift, __u64 window_size, __u32 levels, - struct iommu_table *tbl); - static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group, int num, __u32 page_shift, __u64 window_size, __u32 levels, struct iommu_table **ptbl) @@ -2768,144 +2760,6 @@ static void pnv_pci_ioda_setup_iommu_api(void) static void pnv_pci_ioda_setup_iommu_api(void) { }; #endif -static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift, - unsigned levels, unsigned long limit, - unsigned long *current_offset, unsigned long *total_allocated) -{ - struct page *tce_mem = NULL; - __be64 *addr, *tmp; - unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT; - unsigned long allocated = 1UL << (order + PAGE_SHIFT); - unsigned entries = 1UL << (shift - 3); - long i; - - tce_mem = alloc_pages_node(nid, GFP_KERNEL, order); - if (!tce_mem) { - pr_err("Failed to allocate a TCE memory, order=%d\n", order); - return NULL; - } - addr = page_address(tce_mem); - memset(addr, 0, allocated); - *total_allocated += allocated; - - --levels; - if (!levels) { - *current_offset += allocated; - return addr; - } - - for (i = 0; i < entries; ++i) { - tmp = pnv_pci_ioda2_table_do_alloc_pages(nid, shift, - levels, limit, current_offset, total_allocated); - if (!tmp) - break; - - addr[i] = cpu_to_be64(__pa(tmp) | - TCE_PCI_READ | TCE_PCI_WRITE); - - if (*current_offset >= limit) - break; - } - - return addr; -} - -static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr, - unsigned long size, unsigned level); - -static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, - __u32 page_shift, __u64 window_size, __u32 levels, - struct iommu_table *tbl) -{ - void *addr; - unsigned long offset = 0, level_shift, total_allocated = 0; - const unsigned window_shift = 
ilog2(window_size); - unsigned entries_shift = window_shift - page_shift; - unsigned table_shift = max_t(unsigned, entries_shift + 3, PAGE_SHIFT); - const unsigned long tce_table_size = 1UL << table_shift; - - if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS)) - return -EINVAL; - - if (!is_power_of_2(window_size)) - return -EINVAL; - - /* Adjust direct table size from window_size and levels */ - entries_shift = (entries_shift + levels - 1) / levels; - level_shift = entries_shift + 3; - level_shift = max_t(unsigned, level_shift, PAGE_SHIFT); - - if ((level_shift - 3) * levels + page_shift >= 60) - return -EINVAL; - - /* Allocate TCE table */ - addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, - levels, tce_table_size, &offset, &total_allocated); - - /* addr==NULL means that the first level allocation failed */ - if (!addr) - return -ENOMEM; - - /* - * First level was allocated but some lower level failed as - * we did not allocate as much as we wanted, - * release partially allocated table. 
- */ - if (offset < tce_table_size) { - pnv_pci_ioda2_table_do_free_pages(addr, - 1ULL << (level_shift - 3), levels - 1); - return -ENOMEM; - } - - /* Setup linux iommu table */ - pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset, - page_shift); - tbl->it_level_size = 1ULL << (level_shift - 3); - tbl->it_indirect_levels = levels - 1; - tbl->it_allocated_size = total_allocated; - - pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n", - window_size, tce_table_size, bus_offset); - - return 0; -} - -static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr, - unsigned long size, unsigned level) -{ - const unsigned long addr_ul = (unsigned long) addr & - ~(TCE_PCI_READ | TCE_PCI_WRITE); - - if (level) { - long i; - u64 *tmp = (u64 *) addr_ul; - - for (i = 0; i < size; ++i) { - unsigned long hpa = be64_to_cpu(tmp[i]); - - if (!(hpa & (TCE_PCI_READ | TCE_PCI_WRITE))) - continue; - - pnv_pci_ioda2_table_do_free_pages(__va(hpa), size, - level - 1); - } - } - - free_pages(addr_ul, get_order(size << 3)); -} - -static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl) -{ - const unsigned long size = tbl->it_indirect_levels ? 
- tbl->it_level_size : tbl->it_size; - - if (!tbl->it_size) - return; - - pnv_pci_ioda2_table_do_free_pages((__be64 *)tbl->it_base, size, - tbl->it_indirect_levels); -} - static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb) { struct pci_controller *hose = phb->hose; diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index b265ecc0836a..13aef2323bbc 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -802,85 +802,6 @@ struct pci_ops pnv_pci_ops = { .write = pnv_pci_write_config, }; -static __be64 *pnv_tce(struct iommu_table *tbl, long idx) -{ - __be64 *tmp = ((__be64 *)tbl->it_base); - int level = tbl->it_indirect_levels; - const long shift = ilog2(tbl->it_level_size); - unsigned long mask = (tbl->it_level_size - 1) << (level * shift); - - while (level) { - int n = (idx & mask) >> (level * shift); - unsigned long tce = be64_to_cpu(tmp[n]); - - tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE)); - idx &= ~mask; - mask >>= shift; - --level; - } - - return tmp + idx; -} - -int pnv_tce_build(struct iommu_table *tbl, long index, long npages, - unsigned long uaddr, enum dma_data_direction direction, - unsigned long attrs) -{ - u64 proto_tce = iommu_direction_to_tce_perm(direction); - u64 rpn = __pa(uaddr) >> tbl->it_page_shift; - long i; - - if (proto_tce & TCE_PCI_WRITE) - proto_tce |= TCE_PCI_READ; - - for (i = 0; i < npages; i++) { - unsigned long newtce = proto_tce | - ((rpn + i) << tbl->it_page_shift); - unsigned long idx = index - tbl->it_offset + i; - - *(pnv_tce(tbl, idx)) = cpu_to_be64(newtce); - } - - return 0; -} - -#ifdef CONFIG_IOMMU_API -int pnv_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction) -{ - u64 proto_tce = iommu_direction_to_tce_perm(*direction); - unsigned long newtce = *hpa | proto_tce, oldtce; - unsigned long idx = index - tbl->it_offset; - - BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl)); - - if (newtce & 
TCE_PCI_WRITE) - newtce |= TCE_PCI_READ; - - oldtce = be64_to_cpu(xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce))); - *hpa = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE); - *direction = iommu_tce_direction(oldtce); - - return 0; -} -#endif - -void pnv_tce_free(struct iommu_table *tbl, long index, long npages) -{ - long i; - - for (i = 0; i < npages; i++) { - unsigned long idx = index - tbl->it_offset + i; - - *(pnv_tce(tbl, idx)) = cpu_to_be64(0); - } -} - -unsigned long pnv_tce_get(struct iommu_table *tbl, long index) -{ - return be64_to_cpu(*(pnv_tce(tbl, index - tbl->it_offset))); -} - struct iommu_table *pnv_pci_table_alloc(int nid) { struct iommu_table *tbl; @@ -895,85 +816,6 @@ struct iommu_table *pnv_pci_table_alloc(int nid) return tbl; } -long pnv_pci_link_table_and_group(int node, int num, - struct iommu_table *tbl, - struct iommu_table_group *table_group) -{ - struct iommu_table_group_link *tgl = NULL; - - if (WARN_ON(!tbl || !table_group)) - return -EINVAL; - - tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL, - node); - if (!tgl) - return -ENOMEM; - - tgl->table_group = table_group; - list_add_rcu(&tgl->next, &tbl->it_group_list); - - table_group->tables[num] = tbl; - - return 0; -} - -static void pnv_iommu_table_group_link_free(struct rcu_head *head) -{ - struct iommu_table_group_link *tgl = container_of(head, - struct iommu_table_group_link, rcu); - - kfree(tgl); -} - -void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, - struct iommu_table_group *table_group) -{ - long i; - bool found; - struct iommu_table_group_link *tgl; - - if (!tbl || !table_group) - return; - - /* Remove link to a group from table's list of attached groups */ - found = false; - list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { - if (tgl->table_group == table_group) { - list_del_rcu(&tgl->next); - call_rcu(&tgl->rcu, pnv_iommu_table_group_link_free); - found = true; - break; - } - } - if (WARN_ON(!found)) - return; - - /* Clean a pointer to 
iommu_table in iommu_table_group::tables[] */ - found = false; - for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { - if (table_group->tables[i] == tbl) { - table_group->tables[i] = NULL; - found = true; - break; - } - } - WARN_ON(!found); -} - -void pnv_pci_setup_iommu_table(struct iommu_table *tbl, - void *tce_mem, u64 tce_size, - u64 dma_offset, unsigned page_shift) -{ - tbl->it_blocksize = 16; - tbl->it_base = (unsigned long)tce_mem; - tbl->it_page_shift = page_shift; - tbl->it_offset = dma_offset >> tbl->it_page_shift; - tbl->it_index = 0; - tbl->it_size = tce_size >> 3; - tbl->it_busno = 0; - tbl->it_type = TCE_PCI; -} - void pnv_pci_dma_dev_setup(struct pci_dev *pdev) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index eada4b6068cb..fa90f60e89ce 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -201,13 +201,6 @@ struct pnv_phb { }; extern struct pci_ops pnv_pci_ops; -extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, - unsigned long uaddr, enum dma_data_direction direction, - unsigned long attrs); -extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); -extern int pnv_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction); -extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index); void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, unsigned char *log_buff); @@ -217,14 +210,6 @@ int pnv_pci_cfg_write(struct pci_dn *pdn, int where, int size, u32 val); extern struct iommu_table *pnv_pci_table_alloc(int nid); -extern long pnv_pci_link_table_and_group(int node, int num, - struct iommu_table *tbl, - struct iommu_table_group *table_group); -extern void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, - struct iommu_table_group *table_group); -extern void pnv_pci_setup_iommu_table(struct 
iommu_table *tbl, - void *tce_mem, u64 tce_size, - u64 dma_offset, unsigned page_shift); extern void pnv_pci_init_ioda_hub(struct device_node *np); extern void pnv_pci_init_ioda2_phb(struct device_node *np); extern void pnv_pci_init_npu_phb(struct device_node *np); @@ -272,4 +257,30 @@ extern void pnv_cxl_cx4_teardown_msi_irqs(struct pci_dev *pdev); /* phb ops (cxl switches these when enabling the kernel api on the phb) */ extern const struct pci_controller_ops pnv_cxl_cx4_ioda_controller_ops; +/* pci-ioda-tce.c */ +#define POWERNV_IOMMU_DEFAULT_LEVELS 1 +#define POWERNV_IOMMU_MAX_LEVELS 5 + +extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, + unsigned long uaddr, enum dma_data_direction direction, + unsigned long attrs); +extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); +extern int pnv_tce_xchg(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction); +extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index); + +extern long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, + __u32 page_shift, __u64 window_size, __u32 levels, + struct iommu_table *tbl); +extern void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl); + +extern long pnv_pci_link_table_and_group(int node, int num, + struct iommu_table *tbl, + struct iommu_table_group *table_group); +extern void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, + struct iommu_table_group *table_group); +extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, + void *tce_mem, u64 tce_size, + u64 dma_offset, unsigned int page_shift); + #endif /* __POWERNV_PCI_H */ From 00a5c58d9499bd0c290b57205f43a70f2e69d3f6 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 4 Jul 2018 16:13:46 +1000 Subject: [PATCH 4/7] KVM: PPC: Make iommu_table::it_userspace big endian We are going to reuse multilevel TCE code for the userspace copy of the TCE table and since it is big endian, let's make the 
copy big endian too. Reviewed-by: David Gibson Signed-off-by: Alexey Kardashevskiy Acked-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 2 +- arch/powerpc/kvm/book3s_64_vio.c | 11 ++++++----- arch/powerpc/kvm/book3s_64_vio_hv.c | 10 +++++----- drivers/vfio/vfio_iommu_spapr_tce.c | 19 +++++++++---------- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 20febe0b7f32..803ac70ecedf 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -117,7 +117,7 @@ struct iommu_table { unsigned long *it_map; /* A simple allocation bitmap for now */ unsigned long it_page_shift;/* table iommu page size */ struct list_head it_group_list;/* List of iommu_table_group_link */ - unsigned long *it_userspace; /* userspace view of the table */ + __be64 *it_userspace; /* userspace view of the table */ struct iommu_table_ops *it_ops; struct kref it_kref; }; diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index d066e37551ec..8b9aaf24b0a2 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -378,19 +378,19 @@ static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm, { struct mm_iommu_table_group_mem_t *mem = NULL; const unsigned long pgsize = 1ULL << tbl->it_page_shift; - unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); if (!pua) /* it_userspace allocation might be delayed */ return H_TOO_HARD; - mem = mm_iommu_lookup(kvm->mm, *pua, pgsize); + mem = mm_iommu_lookup(kvm->mm, be64_to_cpu(*pua), pgsize); if (!mem) return H_TOO_HARD; mm_iommu_mapped_dec(mem); - *pua = 0; + *pua = cpu_to_be64(0); return H_SUCCESS; } @@ -437,7 +437,8 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, enum dma_data_direction dir) { long ret; - unsigned long hpa, *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, 
entry); + unsigned long hpa; + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); struct mm_iommu_table_group_mem_t *mem; if (!pua) @@ -464,7 +465,7 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, if (dir != DMA_NONE) kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry); - *pua = ua; + *pua = cpu_to_be64(ua); return 0; } diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 925fc316a104..236f74b210a7 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -200,7 +200,7 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, { struct mm_iommu_table_group_mem_t *mem = NULL; const unsigned long pgsize = 1ULL << tbl->it_page_shift; - unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); if (!pua) /* it_userspace allocation might be delayed */ @@ -210,13 +210,13 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, if (WARN_ON_ONCE_RM(!pua)) return H_HARDWARE; - mem = mm_iommu_lookup_rm(kvm->mm, *pua, pgsize); + mem = mm_iommu_lookup_rm(kvm->mm, be64_to_cpu(*pua), pgsize); if (!mem) return H_TOO_HARD; mm_iommu_mapped_dec(mem); - *pua = 0; + *pua = cpu_to_be64(0); return H_SUCCESS; } @@ -268,7 +268,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, { long ret; unsigned long hpa = 0; - unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); struct mm_iommu_table_group_mem_t *mem; if (!pua) @@ -302,7 +302,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, if (dir != DMA_NONE) kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); - *pua = ua; + *pua = cpu_to_be64(ua); return 0; } diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 759a5bdd40e1..8ab124a67311 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ 
b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -230,7 +230,7 @@ static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl, decrement_locked_vm(mm, cb >> PAGE_SHIFT); return -ENOMEM; } - tbl->it_userspace = uas; + tbl->it_userspace = (__be64 *) uas; return 0; } @@ -482,20 +482,20 @@ static void tce_iommu_unuse_page_v2(struct tce_container *container, struct mm_iommu_table_group_mem_t *mem = NULL; int ret; unsigned long hpa = 0; - unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); if (!pua) return; - ret = tce_iommu_prereg_ua_to_hpa(container, *pua, IOMMU_PAGE_SIZE(tbl), - &hpa, &mem); + ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua), + IOMMU_PAGE_SIZE(tbl), &hpa, &mem); if (ret) - pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n", - __func__, *pua, entry, ret); + pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n", + __func__, be64_to_cpu(*pua), entry, ret); if (mem) mm_iommu_mapped_dec(mem); - *pua = 0; + *pua = cpu_to_be64(0); } static int tce_iommu_clear(struct tce_container *container, @@ -607,8 +607,7 @@ static long tce_iommu_build_v2(struct tce_container *container, for (i = 0; i < pages; ++i) { struct mm_iommu_table_group_mem_t *mem = NULL; - unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, - entry + i); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i); ret = tce_iommu_prereg_ua_to_hpa(container, tce, IOMMU_PAGE_SIZE(tbl), &hpa, &mem); @@ -642,7 +641,7 @@ static long tce_iommu_build_v2(struct tce_container *container, if (dirtmp != DMA_NONE) tce_iommu_unuse_page_v2(container, tbl, entry + i); - *pua = tce; + *pua = cpu_to_be64(tce); tce += IOMMU_PAGE_SIZE(tbl); } From 090bad39b237aad92d8e01baa033699cf0c81cbe Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 4 Jul 2018 16:13:47 +1000 Subject: [PATCH 5/7] powerpc/powernv: Add indirect levels to it_userspace We want to support sparse memory and therefore huge chunks of DMA windows do not 
need to be mapped. If a DMA window is big enough to require 2 or more indirect levels, and a DMA window is used to map all RAM (which is a default case for 64bit window), we can actually save some memory by not allocating TCEs for regions which we are not going to map anyway. The hardware tables already support indirect levels but we also keep a host-physical-to-userspace translation array which is allocated by vmalloc() and is a flat array which might use quite some memory. This converts it_userspace from vmalloc'ed array to a multi level table. As the format becomes platform dependent, this replaces the direct access to it_userspace with an iommu_table_ops::useraddrptr hook which returns a pointer to the userspace copy of a TCE; future extension will return NULL if the level was not allocated. This should not change non-KVM handling of TCE tables and it_userspace will not be allocated for non-KVM tables. Reviewed-by: David Gibson Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 6 +- arch/powerpc/kvm/book3s_64_vio_hv.c | 8 --- arch/powerpc/platforms/powernv/pci-ioda-tce.c | 65 +++++++++++++++---- arch/powerpc/platforms/powernv/pci-ioda.c | 23 +++++-- arch/powerpc/platforms/powernv/pci.h | 3 +- drivers/vfio/vfio_iommu_spapr_tce.c | 46 ------------- 6 files changed, 73 insertions(+), 78 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 803ac70ecedf..4bdcf22509e6 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -69,6 +69,8 @@ struct iommu_table_ops { long index, unsigned long *hpa, enum dma_data_direction *direction); + + __be64 *(*useraddrptr)(struct iommu_table *tbl, long index); #endif void (*clear)(struct iommu_table *tbl, long index, long npages); @@ -123,9 +125,7 @@ struct iommu_table { }; #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \ - ((tbl)->it_userspace ?
\ - &((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \ - NULL) + ((tbl)->it_ops->useraddrptr((tbl), (entry))) /* Pure 2^n version of get_order */ static inline __attribute_const__ diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 236f74b210a7..ee98cf6180d7 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -206,10 +206,6 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, /* it_userspace allocation might be delayed */ return H_TOO_HARD; - pua = (void *) vmalloc_to_phys(pua); - if (WARN_ON_ONCE_RM(!pua)) - return H_HARDWARE; - mem = mm_iommu_lookup_rm(kvm->mm, be64_to_cpu(*pua), pgsize); if (!mem) return H_TOO_HARD; @@ -282,10 +278,6 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, &hpa))) return H_HARDWARE; - pua = (void *) vmalloc_to_phys(pua); - if (WARN_ON_ONCE_RM(!pua)) - return H_HARDWARE; - if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) return H_CLOSED; diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index 726b8693f5ae..88cecc1815d9 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -31,9 +31,9 @@ void pnv_pci_setup_iommu_table(struct iommu_table *tbl, tbl->it_type = TCE_PCI; } -static __be64 *pnv_tce(struct iommu_table *tbl, long idx) +static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx) { - __be64 *tmp = ((__be64 *)tbl->it_base); + __be64 *tmp = user ? 
tbl->it_userspace : (__be64 *) tbl->it_base; int level = tbl->it_indirect_levels; const long shift = ilog2(tbl->it_level_size); unsigned long mask = (tbl->it_level_size - 1) << (level * shift); @@ -67,7 +67,7 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages, ((rpn + i) << tbl->it_page_shift); unsigned long idx = index - tbl->it_offset + i; - *(pnv_tce(tbl, idx)) = cpu_to_be64(newtce); + *(pnv_tce(tbl, false, idx)) = cpu_to_be64(newtce); } return 0; @@ -86,12 +86,21 @@ int pnv_tce_xchg(struct iommu_table *tbl, long index, if (newtce & TCE_PCI_WRITE) newtce |= TCE_PCI_READ; - oldtce = be64_to_cpu(xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce))); + oldtce = be64_to_cpu(xchg(pnv_tce(tbl, false, idx), + cpu_to_be64(newtce))); *hpa = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE); *direction = iommu_tce_direction(oldtce); return 0; } + +__be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index) +{ + if (WARN_ON_ONCE(!tbl->it_userspace)) + return NULL; + + return pnv_tce(tbl, true, index - tbl->it_offset); +} #endif void pnv_tce_free(struct iommu_table *tbl, long index, long npages) @@ -101,13 +110,15 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages) for (i = 0; i < npages; i++) { unsigned long idx = index - tbl->it_offset + i; - *(pnv_tce(tbl, idx)) = cpu_to_be64(0); + *(pnv_tce(tbl, false, idx)) = cpu_to_be64(0); } } unsigned long pnv_tce_get(struct iommu_table *tbl, long index) { - return be64_to_cpu(*(pnv_tce(tbl, index - tbl->it_offset))); + __be64 *ptce = pnv_tce(tbl, false, index - tbl->it_offset); + + return be64_to_cpu(*ptce); } static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr, @@ -144,6 +155,10 @@ void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl) pnv_pci_ioda2_table_do_free_pages((__be64 *)tbl->it_base, size, tbl->it_indirect_levels); + if (tbl->it_userspace) { + pnv_pci_ioda2_table_do_free_pages(tbl->it_userspace, size, + tbl->it_indirect_levels); + } } static __be64 
*pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned int shift, @@ -191,10 +206,11 @@ static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned int shift, long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, __u32 page_shift, __u64 window_size, __u32 levels, - struct iommu_table *tbl) + bool alloc_userspace_copy, struct iommu_table *tbl) { - void *addr; + void *addr, *uas = NULL; unsigned long offset = 0, level_shift, total_allocated = 0; + unsigned long total_allocated_uas = 0; const unsigned int window_shift = ilog2(window_size); unsigned int entries_shift = window_shift - page_shift; unsigned int table_shift = max_t(unsigned int, entries_shift + 3, @@ -228,10 +244,20 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, * we did not allocate as much as we wanted, * release partially allocated table. */ - if (offset < tce_table_size) { - pnv_pci_ioda2_table_do_free_pages(addr, - 1ULL << (level_shift - 3), levels - 1); - return -ENOMEM; + if (offset < tce_table_size) + goto free_tces_exit; + + /* Allocate userspace view of the TCE table */ + if (alloc_userspace_copy) { + offset = 0; + uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, + levels, tce_table_size, &offset, + &total_allocated_uas); + if (!uas) + goto free_tces_exit; + if (offset < tce_table_size || + total_allocated_uas != total_allocated) + goto free_uas_exit; } /* Setup linux iommu table */ @@ -240,11 +266,22 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, tbl->it_level_size = 1ULL << (level_shift - 3); tbl->it_indirect_levels = levels - 1; tbl->it_allocated_size = total_allocated; + tbl->it_userspace = uas; - pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n", - window_size, tce_table_size, bus_offset); + pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d\n", + window_size, tce_table_size, bus_offset, tbl->it_base, + tbl->it_userspace, levels); return 0; + +free_uas_exit: + 
pnv_pci_ioda2_table_do_free_pages(uas, + 1ULL << (level_shift - 3), levels - 1); +free_tces_exit: + pnv_pci_ioda2_table_do_free_pages(addr, + 1ULL << (level_shift - 3), levels - 1); + + return -ENOMEM; } static void pnv_iommu_table_group_link_free(struct rcu_head *head) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 4abf1175626e..fc38f06ee41d 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2036,6 +2036,7 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = { #ifdef CONFIG_IOMMU_API .exchange = pnv_ioda1_tce_xchg, .exchange_rm = pnv_ioda1_tce_xchg_rm, + .useraddrptr = pnv_tce_useraddrptr, #endif .clear = pnv_ioda1_tce_free, .get = pnv_tce_get, @@ -2200,6 +2201,7 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = { #ifdef CONFIG_IOMMU_API .exchange = pnv_ioda2_tce_xchg, .exchange_rm = pnv_ioda2_tce_xchg_rm, + .useraddrptr = pnv_tce_useraddrptr, #endif .clear = pnv_ioda2_tce_free, .get = pnv_tce_get, @@ -2455,7 +2457,7 @@ void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group, int num, __u32 page_shift, __u64 window_size, __u32 levels, - struct iommu_table **ptbl) + bool alloc_userspace_copy, struct iommu_table **ptbl) { struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, table_group); @@ -2472,7 +2474,7 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group, ret = pnv_pci_ioda2_table_alloc_pages(nid, bus_offset, page_shift, window_size, - levels, tbl); + levels, alloc_userspace_copy, tbl); if (ret) { iommu_tce_table_put(tbl); return ret; @@ -2505,7 +2507,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, IOMMU_PAGE_SHIFT_4K, window_size, - POWERNV_IOMMU_DEFAULT_LEVELS, &tbl); + POWERNV_IOMMU_DEFAULT_LEVELS, false, &tbl); if (rc) { 
pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc); @@ -2592,7 +2594,16 @@ static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, tce_table_size, direct_table_size); } - return bytes; + return bytes + bytes; /* one for HW table, one for userspace copy */ +} + +static long pnv_pci_ioda2_create_table_userspace( + struct iommu_table_group *table_group, + int num, __u32 page_shift, __u64 window_size, __u32 levels, + struct iommu_table **ptbl) +{ + return pnv_pci_ioda2_create_table(table_group, + num, page_shift, window_size, levels, true, ptbl); } static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) @@ -2621,7 +2632,7 @@ static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) static struct iommu_table_group_ops pnv_pci_ioda2_ops = { .get_table_size = pnv_pci_ioda2_get_table_size, - .create_table = pnv_pci_ioda2_create_table, + .create_table = pnv_pci_ioda2_create_table_userspace, .set_window = pnv_pci_ioda2_set_window, .unset_window = pnv_pci_ioda2_unset_window, .take_ownership = pnv_ioda2_take_ownership, @@ -2726,7 +2737,7 @@ static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group) static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = { .get_table_size = pnv_pci_ioda2_get_table_size, - .create_table = pnv_pci_ioda2_create_table, + .create_table = pnv_pci_ioda2_create_table_userspace, .set_window = pnv_pci_ioda2_npu_set_window, .unset_window = pnv_pci_ioda2_npu_unset_window, .take_ownership = pnv_ioda2_npu_take_ownership, diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index fa90f60e89ce..2962f6ddb2a8 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -267,11 +267,12 @@ extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); extern int pnv_tce_xchg(struct iommu_table *tbl, long index, unsigned 
long *hpa, enum dma_data_direction *direction); +extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index); extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index); extern long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, __u32 page_shift, __u64 window_size, __u32 levels, - struct iommu_table *tbl); + bool alloc_userspace_copy, struct iommu_table *tbl); extern void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl); extern long pnv_pci_link_table_and_group(int node, int num, diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 8ab124a67311..54ae6c2be1b7 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -211,44 +211,6 @@ static long tce_iommu_register_pages(struct tce_container *container, return 0; } -static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl, - struct mm_struct *mm) -{ - unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) * - tbl->it_size, PAGE_SIZE); - unsigned long *uas; - long ret; - - BUG_ON(tbl->it_userspace); - - ret = try_increment_locked_vm(mm, cb >> PAGE_SHIFT); - if (ret) - return ret; - - uas = vzalloc(cb); - if (!uas) { - decrement_locked_vm(mm, cb >> PAGE_SHIFT); - return -ENOMEM; - } - tbl->it_userspace = (__be64 *) uas; - - return 0; -} - -static void tce_iommu_userspace_view_free(struct iommu_table *tbl, - struct mm_struct *mm) -{ - unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) * - tbl->it_size, PAGE_SIZE); - - if (!tbl->it_userspace) - return; - - vfree(tbl->it_userspace); - tbl->it_userspace = NULL; - decrement_locked_vm(mm, cb >> PAGE_SHIFT); -} - static bool tce_page_is_contained(struct page *page, unsigned page_shift) { /* @@ -599,12 +561,6 @@ static long tce_iommu_build_v2(struct tce_container *container, unsigned long hpa; enum dma_data_direction dirtmp; - if (!tbl->it_userspace) { - ret = tce_iommu_userspace_view_alloc(tbl, container->mm); - if (ret) - return ret; - } - 
for (i = 0; i < pages; ++i) { struct mm_iommu_table_group_mem_t *mem = NULL; __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i); @@ -685,7 +641,6 @@ static void tce_iommu_free_table(struct tce_container *container, { unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT; - tce_iommu_userspace_view_free(tbl, container->mm); iommu_tce_table_put(tbl); decrement_locked_vm(container->mm, pages); } @@ -1200,7 +1155,6 @@ static void tce_iommu_release_ownership(struct tce_container *container, continue; tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); - tce_iommu_userspace_view_free(tbl, container->mm); if (tbl->it_map) iommu_release_ownership(tbl); From 9bc98c8a43c4900ee63b160f805c65051e35d917 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 4 Jul 2018 16:13:48 +1000 Subject: [PATCH 6/7] powerpc/powernv: Rework TCE level allocation This moves actual pages allocation to a separate function which is going to be reused later in on-demand TCE allocation. While we are at it, remove unnecessary level size round up as the caller does this already. 
Reviewed-by: David Gibson Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda-tce.c | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index 88cecc1815d9..123c49925b46 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -31,6 +31,23 @@ void pnv_pci_setup_iommu_table(struct iommu_table *tbl, tbl->it_type = TCE_PCI; } +static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift) +{ + struct page *tce_mem = NULL; + __be64 *addr; + + tce_mem = alloc_pages_node(nid, GFP_KERNEL, shift - PAGE_SHIFT); + if (!tce_mem) { + pr_err("Failed to allocate a TCE memory, level shift=%d\n", + shift); + return NULL; + } + addr = page_address(tce_mem); + memset(addr, 0, 1UL << shift); + + return addr; +} + static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx) { __be64 *tmp = user ? 
tbl->it_userspace : (__be64 *) tbl->it_base; @@ -165,21 +182,12 @@ static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned int shift, unsigned int levels, unsigned long limit, unsigned long *current_offset, unsigned long *total_allocated) { - struct page *tce_mem = NULL; __be64 *addr, *tmp; - unsigned int order = max_t(unsigned int, shift, PAGE_SHIFT) - - PAGE_SHIFT; - unsigned long allocated = 1UL << (order + PAGE_SHIFT); + unsigned long allocated = 1UL << shift; unsigned int entries = 1UL << (shift - 3); long i; - tce_mem = alloc_pages_node(nid, GFP_KERNEL, order); - if (!tce_mem) { - pr_err("Failed to allocate a TCE memory, order=%d\n", order); - return NULL; - } - addr = page_address(tce_mem); - memset(addr, 0, allocated); + addr = pnv_alloc_tce_level(nid, shift); *total_allocated += allocated; --levels; From a68bd1267b7286b1687905651b404e765046de25 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 4 Jul 2018 16:13:49 +1000 Subject: [PATCH 7/7] powerpc/powernv/ioda: Allocate indirect TCE levels on demand At the moment we allocate the entire TCE table, twice (hardware part and userspace translation cache). This normally works as we normally have contiguous memory and the guest will map the entire RAM for 64bit DMA. However if we have sparse RAM (one example is a memory device), then we will allocate TCEs which will never be used as the guest only maps actual memory for DMA. If it is a single level TCE table, there is nothing we can really do but if it is a multilevel table, we can skip allocating TCEs we know we won't need. This adds the ability to allocate only the first level, saving memory. This changes iommu_table::free() to avoid allocating an extra level; iommu_table::set() will do this when needed. This adds @alloc parameter to iommu_table::exchange() to tell the callback if it can allocate an extra level; the flag is set to "false" for the realmode KVM handlers of H_PUT_TCE hcalls and the callback returns H_TOO_HARD.
This still requires the entire table to be counted in mm::locked_vm. To be conservative, this only does on-demand allocation when the userspace cache table is requested, which is the case for VFIO. The example math for a system replicating a powernv setup with NVLink2 in a guest: 16GB RAM mapped at 0x0 128GB GPU RAM window (16GB of actual RAM) mapped at 0x244000000000 the table to cover that all with 64K pages takes: (((0x244000000000 + 0x2000000000) >> 16)*8)>>20 = 4556MB If we allocate only necessary TCE levels, we will only need: (((0x400000000 + 0x400000000) >> 16)*8)>>20 = 4MB (plus some for indirect levels). Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 7 +- arch/powerpc/kvm/book3s_64_vio_hv.c | 4 +- arch/powerpc/platforms/powernv/pci-ioda-tce.c | 73 +++++++++++++++---- arch/powerpc/platforms/powernv/pci-ioda.c | 8 +- arch/powerpc/platforms/powernv/pci.h | 6 +- drivers/vfio/vfio_iommu_spapr_tce.c | 2 +- 6 files changed, 73 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 4bdcf22509e6..daa3ee5d7ad2 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -70,7 +70,7 @@ struct iommu_table_ops { unsigned long *hpa, enum dma_data_direction *direction); - __be64 *(*useraddrptr)(struct iommu_table *tbl, long index); + __be64 *(*useraddrptr)(struct iommu_table *tbl, long index, bool alloc); #endif void (*clear)(struct iommu_table *tbl, long index, long npages); @@ -122,10 +122,13 @@ struct iommu_table { __be64 *it_userspace; /* userspace view of the table */ struct iommu_table_ops *it_ops; struct kref it_kref; + int it_nid; }; +#define IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry) \ + ((tbl)->it_ops->useraddrptr((tbl), (entry), false)) #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \ - ((tbl)->it_ops->useraddrptr((tbl), (entry), true)) /*
Pure 2^n version of get_order */ static inline __attribute_const__ diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index ee98cf6180d7..d4bcd1b17b09 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -200,7 +200,7 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, { struct mm_iommu_table_group_mem_t *mem = NULL; const unsigned long pgsize = 1ULL << tbl->it_page_shift; - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry); if (!pua) /* it_userspace allocation might be delayed */ @@ -264,7 +264,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, { long ret; unsigned long hpa = 0; - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry); struct mm_iommu_table_group_mem_t *mem; if (!pua) diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index 123c49925b46..6c5db1acbe8d 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -48,7 +48,7 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift) return addr; } -static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx) +static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc) { __be64 *tmp = user ? 
tbl->it_userspace : (__be64 *) tbl->it_base; int level = tbl->it_indirect_levels; @@ -57,7 +57,23 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx) while (level) { int n = (idx & mask) >> (level * shift); - unsigned long tce = be64_to_cpu(tmp[n]); + unsigned long tce; + + if (tmp[n] == 0) { + __be64 *tmp2; + + if (!alloc) + return NULL; + + tmp2 = pnv_alloc_tce_level(tbl->it_nid, + ilog2(tbl->it_level_size) + 3); + if (!tmp2) + return NULL; + + tmp[n] = cpu_to_be64(__pa(tmp2) | + TCE_PCI_READ | TCE_PCI_WRITE); + } + tce = be64_to_cpu(tmp[n]); tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE)); idx &= ~mask; @@ -84,7 +100,7 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages, ((rpn + i) << tbl->it_page_shift); unsigned long idx = index - tbl->it_offset + i; - *(pnv_tce(tbl, false, idx)) = cpu_to_be64(newtce); + *(pnv_tce(tbl, false, idx, true)) = cpu_to_be64(newtce); } return 0; @@ -92,31 +108,46 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages, #ifdef CONFIG_IOMMU_API int pnv_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction) + unsigned long *hpa, enum dma_data_direction *direction, + bool alloc) { u64 proto_tce = iommu_direction_to_tce_perm(*direction); unsigned long newtce = *hpa | proto_tce, oldtce; unsigned long idx = index - tbl->it_offset; + __be64 *ptce = NULL; BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl)); + if (*direction == DMA_NONE) { + ptce = pnv_tce(tbl, false, idx, false); + if (!ptce) { + *hpa = 0; + return 0; + } + } + + if (!ptce) { + ptce = pnv_tce(tbl, false, idx, alloc); + if (!ptce) + return alloc ? 
H_HARDWARE : H_TOO_HARD; + } + if (newtce & TCE_PCI_WRITE) newtce |= TCE_PCI_READ; - oldtce = be64_to_cpu(xchg(pnv_tce(tbl, false, idx), - cpu_to_be64(newtce))); + oldtce = be64_to_cpu(xchg(ptce, cpu_to_be64(newtce))); *hpa = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE); *direction = iommu_tce_direction(oldtce); return 0; } -__be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index) +__be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index, bool alloc) { if (WARN_ON_ONCE(!tbl->it_userspace)) return NULL; - return pnv_tce(tbl, true, index - tbl->it_offset); + return pnv_tce(tbl, true, index - tbl->it_offset, alloc); } #endif @@ -126,14 +157,19 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages) for (i = 0; i < npages; i++) { unsigned long idx = index - tbl->it_offset + i; + __be64 *ptce = pnv_tce(tbl, false, idx, false); - *(pnv_tce(tbl, false, idx)) = cpu_to_be64(0); + if (ptce) + *ptce = cpu_to_be64(0); } } unsigned long pnv_tce_get(struct iommu_table *tbl, long index) { - __be64 *ptce = pnv_tce(tbl, false, index - tbl->it_offset); + __be64 *ptce = pnv_tce(tbl, false, index - tbl->it_offset, false); + + if (!ptce) + return 0; return be64_to_cpu(*ptce); } @@ -224,6 +260,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, unsigned int table_shift = max_t(unsigned int, entries_shift + 3, PAGE_SHIFT); const unsigned long tce_table_size = 1UL << table_shift; + unsigned int tmplevels = levels; if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS)) return -EINVAL; @@ -231,6 +268,9 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, if (!is_power_of_2(window_size)) return -EINVAL; + if (alloc_userspace_copy && (window_size > (1ULL << 32))) + tmplevels = 1; + /* Adjust direct table size from window_size and levels */ entries_shift = (entries_shift + levels - 1) / levels; level_shift = entries_shift + 3; @@ -241,7 +281,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, /* Allocate TCE table */ addr = 
pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, - levels, tce_table_size, &offset, &total_allocated); + tmplevels, tce_table_size, &offset, &total_allocated); /* addr==NULL means that the first level allocation failed */ if (!addr) @@ -252,7 +292,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, * we did not allocate as much as we wanted, * release partially allocated table. */ - if (offset < tce_table_size) + if (tmplevels == levels && offset < tce_table_size) goto free_tces_exit; /* Allocate userspace view of the TCE table */ @@ -263,8 +303,8 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, &total_allocated_uas); if (!uas) goto free_tces_exit; - if (offset < tce_table_size || - total_allocated_uas != total_allocated) + if (tmplevels == levels && (offset < tce_table_size || + total_allocated_uas != total_allocated)) goto free_uas_exit; } @@ -275,10 +315,11 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, tbl->it_indirect_levels = levels - 1; tbl->it_allocated_size = total_allocated; tbl->it_userspace = uas; + tbl->it_nid = nid; - pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d\n", + pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d/%d\n", window_size, tce_table_size, bus_offset, tbl->it_base, - tbl->it_userspace, levels); + tbl->it_userspace, tmplevels, levels); return 0; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index fc38f06ee41d..b4475f71a0b4 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2003,7 +2003,7 @@ static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index, static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index, unsigned long *hpa, enum dma_data_direction *direction) { - long ret = pnv_tce_xchg(tbl, index, hpa, direction); + long ret = pnv_tce_xchg(tbl, index, hpa, direction, true); if (!ret) 
pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false); @@ -2014,7 +2014,7 @@ static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index, static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index, unsigned long *hpa, enum dma_data_direction *direction) { - long ret = pnv_tce_xchg(tbl, index, hpa, direction); + long ret = pnv_tce_xchg(tbl, index, hpa, direction, false); if (!ret) pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true); @@ -2168,7 +2168,7 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index, unsigned long *hpa, enum dma_data_direction *direction) { - long ret = pnv_tce_xchg(tbl, index, hpa, direction); + long ret = pnv_tce_xchg(tbl, index, hpa, direction, true); if (!ret) pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false); @@ -2179,7 +2179,7 @@ static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index, static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index, unsigned long *hpa, enum dma_data_direction *direction) { - long ret = pnv_tce_xchg(tbl, index, hpa, direction); + long ret = pnv_tce_xchg(tbl, index, hpa, direction, false); if (!ret) pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 2962f6ddb2a8..0020937fc694 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -266,8 +266,10 @@ extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long attrs); extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); extern int pnv_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction); -extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index); + unsigned long *hpa, enum dma_data_direction *direction, + bool alloc); +extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index, + bool alloc); 
extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index); extern long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 54ae6c2be1b7..11a4c194d6e3 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -631,7 +631,7 @@ static long tce_iommu_create_table(struct tce_container *container, page_shift, window_size, levels, ptbl); WARN_ON(!ret && !(*ptbl)->it_ops->free); - WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size)); + WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size)); return ret; }