From ff083a2d972f56bebfd82409ca62e5dfce950961 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:22 +0000 Subject: [PATCH 01/17] perf: Protect perf_guest_cbs with RCU Protect perf_guest_cbs with RCU to fix multiple possible errors. Luckily, all paths that read perf_guest_cbs already require RCU protection, e.g. to protect the callback chains, so only the direct perf_guest_cbs touchpoints need to be modified. Bug #1 is a simple lack of WRITE_ONCE/READ_ONCE behavior to ensure perf_guest_cbs isn't reloaded between a !NULL check and a dereference. Fixed via the READ_ONCE() in rcu_dereference(). Bug #2 is that on weakly-ordered architectures, updates to the callbacks themselves are not guaranteed to be visible before the pointer is made visible to readers. Fixed by the smp_store_release() in rcu_assign_pointer() when the new pointer is non-NULL. Bug #3 is that, because the callbacks are global, it's possible for readers to run in parallel with an unregisters, and thus a module implementing the callbacks can be unloaded while readers are in flight, resulting in a use-after-free. Fixed by a synchronize_rcu() call when unregistering callbacks. Bug #1 escaped notice because it's extremely unlikely a compiler will reload perf_guest_cbs in this sequence. perf_guest_cbs does get reloaded for future derefs, e.g. for ->is_user_mode(), but the ->is_in_guest() guard all but guarantees the consumer will win the race, e.g. to nullify perf_guest_cbs, KVM has to completely exit the guest and teardown down all VMs before KVM start its module unload / unregister sequence. This also makes it all but impossible to encounter bug #3. Bug #2 has not been a problem because all architectures that register callbacks are strongly ordered and/or have a static set of callbacks. But with help, unloading kvm_intel can trigger bug #1 e.g. wrapping perf_guest_cbs with READ_ONCE in perf_misc_flags() while spamming kvm_intel module load/unload leads to: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP CPU: 6 PID: 1825 Comm: stress Not tainted 5.14.0-rc2+ #459 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:perf_misc_flags+0x1c/0x70 Call Trace: perf_prepare_sample+0x53/0x6b0 perf_event_output_forward+0x67/0x160 __perf_event_overflow+0x52/0xf0 handle_pmi_common+0x207/0x300 intel_pmu_handle_irq+0xcf/0x410 perf_event_nmi_handler+0x28/0x50 nmi_handle+0xc7/0x260 default_do_nmi+0x6b/0x170 exc_nmi+0x103/0x130 asm_exc_nmi+0x76/0xbf Fixes: 39447b386c84 ("perf: Enhance perf to allow for guest statistic collection from host") Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211111020738.2512932-2-seanjc@google.com --- arch/arm/kernel/perf_callchain.c | 17 +++++++++++------ arch/arm64/kernel/perf_callchain.c | 18 ++++++++++++------ arch/csky/kernel/perf_callchain.c | 6 ++++-- arch/nds32/kernel/perf_event_cpu.c | 17 +++++++++++------ arch/riscv/kernel/perf_callchain.c | 7 +++++-- arch/x86/events/core.c | 17 +++++++++++------ arch/x86/events/intel/core.c | 9 ++++++--- include/linux/perf_event.h | 13 ++++++++++++- kernel/events/core.c | 13 ++++++++++--- 9 files changed, 82 insertions(+), 35 deletions(-) diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c index 3b69a76d341e..1626dfc6f6ce 100644 --- a/arch/arm/kernel/perf_callchain.c +++ b/arch/arm/kernel/perf_callchain.c @@ -62,9 +62,10 @@ user_backtrace(struct frame_tail __user *tail, void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct frame_tail __user *tail; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (guest_cbs && guest_cbs->is_in_guest()) { /* We don't support guest os callchain now */ return; } @@ -98,9 +99,10 @@ callchain_trace(struct stackframe *fr, void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct stackframe fr; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (guest_cbs && guest_cbs->is_in_guest()) { /* We don't support guest os callchain now */ return; } @@ -111,18 +113,21 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re unsigned long perf_instruction_pointer(struct pt_regs *regs) { - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) - return perf_guest_cbs->get_guest_ip(); + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + + if (guest_cbs && guest_cbs->is_in_guest()) + return guest_cbs->get_guest_ip(); return instruction_pointer(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); int misc = 0; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - if (perf_guest_cbs->is_user_mode()) + if (guest_cbs && guest_cbs->is_in_guest()) { + if (guest_cbs->is_user_mode()) misc |= PERF_RECORD_MISC_GUEST_USER; else misc |= PERF_RECORD_MISC_GUEST_KERNEL; diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 4a72c2727309..86d9f2013172 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -102,7 +102,9 @@ compat_user_backtrace(struct compat_frame_tail __user *tail, void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + + if (guest_cbs && guest_cbs->is_in_guest()) { /* We don't support guest os callchain now */ return; } @@ -147,9 +149,10 @@ static bool callchain_trace(void *data, unsigned long pc) void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct stackframe frame; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (guest_cbs && guest_cbs->is_in_guest()) { /* We don't support guest os callchain now */ return; } @@ -160,18 +163,21 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, unsigned long perf_instruction_pointer(struct pt_regs *regs) { - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) - return perf_guest_cbs->get_guest_ip(); + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + + if (guest_cbs && guest_cbs->is_in_guest()) + return guest_cbs->get_guest_ip(); return instruction_pointer(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); int misc = 0; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - if (perf_guest_cbs->is_user_mode()) + if (guest_cbs && guest_cbs->is_in_guest()) { + if (guest_cbs->is_user_mode()) misc |= PERF_RECORD_MISC_GUEST_USER; else misc |= PERF_RECORD_MISC_GUEST_KERNEL; diff --git a/arch/csky/kernel/perf_callchain.c b/arch/csky/kernel/perf_callchain.c index ab55e98ee8f6..35318a635a5f 100644 --- a/arch/csky/kernel/perf_callchain.c +++ b/arch/csky/kernel/perf_callchain.c @@ -86,10 +86,11 @@ static unsigned long user_backtrace(struct perf_callchain_entry_ctx *entry, void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); unsigned long fp = 0; /* C-SKY does not support virtualization. */ - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) + if (guest_cbs && guest_cbs->is_in_guest()) return; fp = regs->regs[4]; @@ -110,10 +111,11 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct stackframe fr; /* C-SKY does not support virtualization. */ - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (guest_cbs && guest_cbs->is_in_guest()) { pr_warn("C-SKY does not support perf in guest mode!"); return; } diff --git a/arch/nds32/kernel/perf_event_cpu.c b/arch/nds32/kernel/perf_event_cpu.c index 0ce6f9f307e6..f38791960781 100644 --- a/arch/nds32/kernel/perf_event_cpu.c +++ b/arch/nds32/kernel/perf_event_cpu.c @@ -1363,6 +1363,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); unsigned long fp = 0; unsigned long gp = 0; unsigned long lp = 0; @@ -1371,7 +1372,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, leaf_fp = 0; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (guest_cbs && guest_cbs->is_in_guest()) { /* We don't support guest os callchain now */ return; } @@ -1479,9 +1480,10 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct stackframe fr; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (guest_cbs && guest_cbs->is_in_guest()) { /* We don't support guest os callchain now */ return; } @@ -1493,20 +1495,23 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, unsigned long perf_instruction_pointer(struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + /* However, NDS32 does not support virtualization */ - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) - return perf_guest_cbs->get_guest_ip(); + if (guest_cbs && guest_cbs->is_in_guest()) + return guest_cbs->get_guest_ip(); return instruction_pointer(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); int misc = 0; /* However, NDS32 does not support virtualization */ - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - if (perf_guest_cbs->is_user_mode()) + if (guest_cbs && guest_cbs->is_in_guest()) { + if (guest_cbs->is_user_mode()) misc |= PERF_RECORD_MISC_GUEST_USER; else misc |= PERF_RECORD_MISC_GUEST_KERNEL; diff --git a/arch/riscv/kernel/perf_callchain.c b/arch/riscv/kernel/perf_callchain.c index 0bb1854dce83..8ecfc4c128bc 100644 --- a/arch/riscv/kernel/perf_callchain.c +++ b/arch/riscv/kernel/perf_callchain.c @@ -56,10 +56,11 @@ static unsigned long user_backtrace(struct perf_callchain_entry_ctx *entry, void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); unsigned long fp = 0; /* RISC-V does not support perf in guest mode. */ - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) + if (guest_cbs && guest_cbs->is_in_guest()) return; fp = regs->s0; @@ -78,8 +79,10 @@ static bool fill_callchain(void *entry, unsigned long pc) void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + /* RISC-V does not support perf in guest mode. */ - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (guest_cbs && guest_cbs->is_in_guest()) { pr_warn("RISC-V does not support perf in guest mode!"); return; } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 38b2c779146f..32cec290d3ad 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2768,10 +2768,11 @@ static bool perf_hw_regs(struct pt_regs *regs) void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct unwind_state state; unsigned long addr; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (guest_cbs && guest_cbs->is_in_guest()) { /* TODO: We don't support guest os callchain now */ return; } @@ -2871,10 +2872,11 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct stack_frame frame; const struct stack_frame __user *fp; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (guest_cbs && guest_cbs->is_in_guest()) { /* TODO: We don't support guest os callchain now */ return; } @@ -2951,18 +2953,21 @@ static unsigned long code_segment_base(struct pt_regs *regs) unsigned long perf_instruction_pointer(struct pt_regs *regs) { - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) - return perf_guest_cbs->get_guest_ip(); + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + + if (guest_cbs && guest_cbs->is_in_guest()) + return guest_cbs->get_guest_ip(); return regs->ip + code_segment_base(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); int misc = 0; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - if (perf_guest_cbs->is_user_mode()) + if (guest_cbs && guest_cbs->is_in_guest()) { + if (guest_cbs->is_user_mode()) misc |= PERF_RECORD_MISC_GUEST_USER; else misc |= PERF_RECORD_MISC_GUEST_KERNEL; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 42cf01ecdd13..2258e02ca350 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2837,6 +2837,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) { struct perf_sample_data data; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_guest_info_callbacks *guest_cbs; int bit; int handled = 0; u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); @@ -2903,9 +2904,11 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) */ if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned long *)&status)) { handled++; - if (unlikely(perf_guest_cbs && perf_guest_cbs->is_in_guest() && - perf_guest_cbs->handle_intel_pt_intr)) - perf_guest_cbs->handle_intel_pt_intr(); + + guest_cbs = perf_get_guest_cbs(); + if (unlikely(guest_cbs && guest_cbs->is_in_guest() && + guest_cbs->handle_intel_pt_intr)) + guest_cbs->handle_intel_pt_intr(); else intel_pt_interrupt(); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0dcfd265beed..318c489b735b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1240,7 +1240,18 @@ extern void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags); -extern struct perf_guest_info_callbacks *perf_guest_cbs; +extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; +static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void) +{ + /* + * Callbacks are RCU-protected and must be READ_ONCE to avoid reloading + * the callbacks between a !NULL check and dereferences, to ensure + * pending stores/changes to the callback pointers are visible before a + * non-NULL perf_guest_cbs is visible to readers, and to prevent a + * module from unloading callbacks while readers are active. + */ + return rcu_dereference(perf_guest_cbs); +} extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); diff --git a/kernel/events/core.c b/kernel/events/core.c index 523106a506ee..c552e1bfcaea 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6526,18 +6526,25 @@ static void perf_pending_event(struct irq_work *entry) * Later on, we might change it to a list if there is * another virtualization implementation supporting the callbacks. */ -struct perf_guest_info_callbacks *perf_guest_cbs; +struct perf_guest_info_callbacks __rcu *perf_guest_cbs; int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { - perf_guest_cbs = cbs; + if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) + return -EBUSY; + + rcu_assign_pointer(perf_guest_cbs, cbs); return 0; } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { - perf_guest_cbs = NULL; + if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs)) + return -EINVAL; + + rcu_assign_pointer(perf_guest_cbs, NULL); + synchronize_rcu(); return 0; } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); From 5c7df80e2ce4c954c80eb4ecf5fa002a5ff5d2d6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:23 +0000 Subject: [PATCH 02/17] KVM: x86: Register perf callbacks after calling vendor's hardware_setup() Wait to register perf callbacks until after doing vendor hardaware setup. VMX's hardware_setup() configures Intel Processor Trace (PT) mode, and a future fix to register the Intel PT guest interrupt hook if and only if Intel PT is exposed to the guest will consume the configured PT mode. Delaying registration to hardware setup is effectively a nop as KVM's perf hooks all pivot on the per-CPU current_vcpu, which is non-NULL only when KVM is handling an IRQ/NMI in a VM-Exit path. I.e. current_vcpu will be NULL throughout both kvm_arch_init() and kvm_arch_hardware_setup(). Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Acked-by: Paolo Bonzini Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211111020738.2512932-3-seanjc@google.com --- arch/x86/kvm/x86.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dc7eb5fddfd3..50f0cd16f2d4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8626,8 +8626,6 @@ int kvm_arch_init(void *opaque) kvm_timer_init(); - perf_register_guest_info_callbacks(&kvm_guest_cbs); - if (boot_cpu_has(X86_FEATURE_XSAVE)) { host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; @@ -8659,7 +8657,6 @@ void kvm_arch_exit(void) clear_hv_tscchange_cb(); #endif kvm_lapic_exit(); - perf_unregister_guest_info_callbacks(&kvm_guest_cbs); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, @@ -11225,6 +11222,8 @@ int kvm_arch_hardware_setup(void *opaque) memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); kvm_ops_static_call_update(); + perf_register_guest_info_callbacks(&kvm_guest_cbs); + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) supported_xss = 0; @@ -11252,6 +11251,8 @@ int kvm_arch_hardware_setup(void *opaque) void kvm_arch_hardware_unsetup(void) { + perf_unregister_guest_info_callbacks(&kvm_guest_cbs); + static_call(kvm_x86_hardware_unsetup)(); } From f4b027c5c8199abd4fb6f00d67d380548dbfdfa8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:24 +0000 Subject: [PATCH 03/17] KVM: x86: Register Processor Trace interrupt hook iff PT enabled in guest Override the Processor Trace (PT) interrupt handler for guest mode if and only if PT is configured for host+guest mode, i.e. is being used independently by both host and guest. If PT is configured for system mode, the host fully controls PT and must handle all events. Fixes: 8479e04e7d6b ("KVM: x86: Inject PMI for KVM guest") Reported-by: Alexander Shishkin Reported-by: Artem Kashkanov Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Acked-by: Paolo Bonzini Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211111020738.2512932-4-seanjc@google.com --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/vmx.c | 1 + arch/x86/kvm/x86.c | 5 ++++- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e5d8700319cc..41e858df5795 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1516,6 +1516,7 @@ struct kvm_x86_init_ops { int (*disabled_by_bios)(void); int (*check_processor_compatibility)(void); int (*hardware_setup)(void); + bool (*intel_pt_intr_in_guest)(void); struct kvm_x86_ops *runtime_ops; }; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ba66c171d951..7d90c8d443ac 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7865,6 +7865,7 @@ static struct kvm_x86_init_ops vmx_init_ops __initdata = { .disabled_by_bios = vmx_disabled_by_bios, .check_processor_compatibility = vmx_check_processor_compat, .hardware_setup = hardware_setup, + .intel_pt_intr_in_guest = vmx_pt_mode_is_host_guest, .runtime_ops = &vmx_x86_ops, }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 50f0cd16f2d4..760c4e3a8326 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8510,7 +8510,7 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { .is_in_guest = kvm_is_in_guest, .is_user_mode = kvm_is_user_mode, .get_guest_ip = kvm_get_guest_ip, - .handle_intel_pt_intr = kvm_handle_intel_pt_intr, + .handle_intel_pt_intr = NULL, }; #ifdef CONFIG_X86_64 @@ -11222,6 +11222,8 @@ int kvm_arch_hardware_setup(void *opaque) memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); kvm_ops_static_call_update(); + if (ops->intel_pt_intr_in_guest && ops->intel_pt_intr_in_guest()) + kvm_guest_cbs.handle_intel_pt_intr = kvm_handle_intel_pt_intr; perf_register_guest_info_callbacks(&kvm_guest_cbs); if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) @@ -11252,6 +11254,7 @@ int kvm_arch_hardware_setup(void *opaque) void kvm_arch_hardware_unsetup(void) { perf_unregister_guest_info_callbacks(&kvm_guest_cbs); + kvm_guest_cbs.handle_intel_pt_intr = NULL; static_call(kvm_x86_hardware_unsetup)(); } From 2934e3d09350c1a7ca2433fbeabfcd831e48a575 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:25 +0000 Subject: [PATCH 04/17] perf: Stop pretending that perf can handle multiple guest callbacks Drop the 'int' return value from the perf (un)register callbacks helpers and stop pretending perf can support multiple callbacks. The 'int' returns are not future proofing anything as none of the callers take action on an error. It's also not obvious that there will ever be co-tenant hypervisors, and if there are, that allowing multiple callbacks to be registered is desirable or even correct. Opportunistically rename callbacks=>cbs in the affected declarations to match their definitions. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20211111020738.2512932-5-seanjc@google.com --- arch/arm64/include/asm/kvm_host.h | 4 ++-- arch/arm64/kvm/perf.c | 8 ++++---- include/linux/perf_event.h | 12 ++++++------ kernel/events/core.c | 15 ++++----------- 4 files changed, 16 insertions(+), 23 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 2a5f7f38006f..f680f303ba7c 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -675,8 +675,8 @@ unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len); int kvm_handle_mmio_return(struct kvm_vcpu *vcpu); int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa); -int kvm_perf_init(void); -int kvm_perf_teardown(void); +void kvm_perf_init(void); +void kvm_perf_teardown(void); long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu); gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c index c84fe24b2ea1..a0d660cf889e 100644 --- a/arch/arm64/kvm/perf.c +++ b/arch/arm64/kvm/perf.c @@ -48,12 +48,12 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { .get_guest_ip = kvm_get_guest_ip, }; -int kvm_perf_init(void) +void kvm_perf_init(void) { - return perf_register_guest_info_callbacks(&kvm_guest_cbs); + perf_register_guest_info_callbacks(&kvm_guest_cbs); } -int kvm_perf_teardown(void) +void kvm_perf_teardown(void) { - return perf_unregister_guest_info_callbacks(&kvm_guest_cbs); + perf_unregister_guest_info_callbacks(&kvm_guest_cbs); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 318c489b735b..98c204488496 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1252,8 +1252,8 @@ static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void) */ return rcu_dereference(perf_guest_cbs); } -extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); -extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); +extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); +extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); @@ -1497,10 +1497,10 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } static inline void perf_bp_event(struct perf_event *event, void *data) { } -static inline int perf_register_guest_info_callbacks -(struct perf_guest_info_callbacks *callbacks) { return 0; } -static inline int perf_unregister_guest_info_callbacks -(struct perf_guest_info_callbacks *callbacks) { return 0; } +static inline void perf_register_guest_info_callbacks +(struct perf_guest_info_callbacks *cbs) { } +static inline void perf_unregister_guest_info_callbacks +(struct perf_guest_info_callbacks *cbs) { } static inline void perf_event_mmap(struct vm_area_struct *vma) { } diff --git a/kernel/events/core.c b/kernel/events/core.c index c552e1bfcaea..17e5b20762c5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6521,31 +6521,24 @@ static void perf_pending_event(struct irq_work *entry) perf_swevent_put_recursion_context(rctx); } -/* - * We assume there is only KVM supporting the callbacks. - * Later on, we might change it to a list if there is - * another virtualization implementation supporting the callbacks. - */ struct perf_guest_info_callbacks __rcu *perf_guest_cbs; -int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) - return -EBUSY; + return; rcu_assign_pointer(perf_guest_cbs, cbs); - return 0; } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); -int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs)) - return -EINVAL; + return; rcu_assign_pointer(perf_guest_cbs, NULL); synchronize_rcu(); - return 0; } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); From 84af21d850ee1ccc990df37dd47c13fdfe93be75 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:26 +0000 Subject: [PATCH 05/17] perf: Drop dead and useless guest "support" from arm, csky, nds32 and riscv Drop "support" for guest callbacks from architectures that don't implement the guest callbacks. Future patches will convert the callbacks to static_call; rather than churn a bunch of arch code (that was presumably copy+pasted from x86), remove it wholesale as it's useless and at best wasting cycles. A future patch will also add a Kconfig to force architcture to opt into the callbacks to make it more difficult for uses "support" to sneak in in the future. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20211111020738.2512932-6-seanjc@google.com --- arch/arm/kernel/perf_callchain.c | 33 ++++------------------------- arch/csky/kernel/perf_callchain.c | 12 ----------- arch/nds32/kernel/perf_event_cpu.c | 34 ++++-------------------------- arch/riscv/kernel/perf_callchain.c | 13 ------------ 4 files changed, 8 insertions(+), 84 deletions(-) diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c index 1626dfc6f6ce..bc6b246ab55e 100644 --- a/arch/arm/kernel/perf_callchain.c +++ b/arch/arm/kernel/perf_callchain.c @@ -62,14 +62,8 @@ user_backtrace(struct frame_tail __user *tail, void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct frame_tail __user *tail; - if (guest_cbs && guest_cbs->is_in_guest()) { - /* We don't support guest os callchain now */ - return; - } - perf_callchain_store(entry, regs->ARM_pc); if (!current->mm) @@ -99,44 +93,25 @@ callchain_trace(struct stackframe *fr, void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct stackframe fr; - if (guest_cbs && guest_cbs->is_in_guest()) { - /* We don't support guest os callchain now */ - return; - } - arm_get_current_stackframe(regs, &fr); walk_stackframe(&fr, callchain_trace, entry); } unsigned long perf_instruction_pointer(struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - if (guest_cbs && guest_cbs->is_in_guest()) - return guest_cbs->get_guest_ip(); - return instruction_pointer(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); int misc = 0; - if (guest_cbs && guest_cbs->is_in_guest()) { - if (guest_cbs->is_user_mode()) - misc |= PERF_RECORD_MISC_GUEST_USER; - else - misc |= PERF_RECORD_MISC_GUEST_KERNEL; - } else { - if (user_mode(regs)) - misc |= PERF_RECORD_MISC_USER; - else - misc |= PERF_RECORD_MISC_KERNEL; - } + if (user_mode(regs)) + misc |= PERF_RECORD_MISC_USER; + else + misc |= PERF_RECORD_MISC_KERNEL; return misc; } diff --git a/arch/csky/kernel/perf_callchain.c b/arch/csky/kernel/perf_callchain.c index 35318a635a5f..92057de08f4f 100644 --- a/arch/csky/kernel/perf_callchain.c +++ b/arch/csky/kernel/perf_callchain.c @@ -86,13 +86,8 @@ static unsigned long user_backtrace(struct perf_callchain_entry_ctx *entry, void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); unsigned long fp = 0; - /* C-SKY does not support virtualization. */ - if (guest_cbs && guest_cbs->is_in_guest()) - return; - fp = regs->regs[4]; perf_callchain_store(entry, regs->pc); @@ -111,15 +106,8 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct stackframe fr; - /* C-SKY does not support virtualization. */ - if (guest_cbs && guest_cbs->is_in_guest()) { - pr_warn("C-SKY does not support perf in guest mode!"); - return; - } - fr.fp = regs->regs[4]; fr.lr = regs->lr; walk_stackframe(&fr, entry); diff --git a/arch/nds32/kernel/perf_event_cpu.c b/arch/nds32/kernel/perf_event_cpu.c index f38791960781..a78a879e7ef1 100644 --- a/arch/nds32/kernel/perf_event_cpu.c +++ b/arch/nds32/kernel/perf_event_cpu.c @@ -1363,7 +1363,6 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); unsigned long fp = 0; unsigned long gp = 0; unsigned long lp = 0; @@ -1372,11 +1371,6 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, leaf_fp = 0; - if (guest_cbs && guest_cbs->is_in_guest()) { - /* We don't support guest os callchain now */ - return; - } - perf_callchain_store(entry, regs->ipc); fp = regs->fp; gp = regs->gp; @@ -1480,13 +1474,8 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct stackframe fr; - if (guest_cbs && guest_cbs->is_in_guest()) { - /* We don't support guest os callchain now */ - return; - } fr.fp = regs->fp; fr.lp = regs->lp; fr.sp = regs->sp; @@ -1495,32 +1484,17 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, unsigned long perf_instruction_pointer(struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - /* However, NDS32 does not support virtualization */ - if (guest_cbs && guest_cbs->is_in_guest()) - return guest_cbs->get_guest_ip(); - return instruction_pointer(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); int misc = 0; - /* However, NDS32 does not support virtualization */ - if (guest_cbs && guest_cbs->is_in_guest()) { - if (guest_cbs->is_user_mode()) - misc |= PERF_RECORD_MISC_GUEST_USER; - else - misc |= PERF_RECORD_MISC_GUEST_KERNEL; - } else { - if (user_mode(regs)) - misc |= PERF_RECORD_MISC_USER; - else - misc |= PERF_RECORD_MISC_KERNEL; - } + if (user_mode(regs)) + misc |= PERF_RECORD_MISC_USER; + else + misc |= PERF_RECORD_MISC_KERNEL; return misc; } diff --git a/arch/riscv/kernel/perf_callchain.c b/arch/riscv/kernel/perf_callchain.c index 8ecfc4c128bc..1fc075b8f764 100644 --- a/arch/riscv/kernel/perf_callchain.c +++ b/arch/riscv/kernel/perf_callchain.c @@ -56,13 +56,8 @@ static unsigned long user_backtrace(struct perf_callchain_entry_ctx *entry, void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); unsigned long fp = 0; - /* RISC-V does not support perf in guest mode. */ - if (guest_cbs && guest_cbs->is_in_guest()) - return; - fp = regs->s0; perf_callchain_store(entry, regs->epc); @@ -79,13 +74,5 @@ static bool fill_callchain(void *entry, unsigned long pc) void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - /* RISC-V does not support perf in guest mode. */ - if (guest_cbs && guest_cbs->is_in_guest()) { - pr_warn("RISC-V does not support perf in guest mode!"); - return; - } - walk_stackframe(NULL, regs, fill_callchain, entry); } From b9f5621c9547dd787900f005a9e1c3d5712de512 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Thu, 11 Nov 2021 02:07:27 +0000 Subject: [PATCH 06/17] perf/core: Rework guest callbacks to prepare for static_call support To prepare for using static_calls to optimize perf's guest callbacks, replace ->is_in_guest and ->is_user_mode with a new multiplexed hook ->state, tweak ->handle_intel_pt_intr to play nice with being called when there is no active guest, and drop "guest" from ->get_guest_ip. Return '0' from ->state and ->handle_intel_pt_intr to indicate "not in guest" so that DEFINE_STATIC_CALL_RET0 can be used to define the static calls, i.e. no callback == !guest. [sean: extracted from static_call patch, fixed get_ip() bug, wrote changelog] Suggested-by: Peter Zijlstra (Intel) Originally-by: Peter Zijlstra (Intel) Signed-off-by: Like Xu Signed-off-by: Zhu Lingshan Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Boris Ostrovsky Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20211111020738.2512932-7-seanjc@google.com --- arch/arm64/kernel/perf_callchain.c | 13 ++++--- arch/arm64/kvm/perf.c | 35 ++++++++--------- arch/x86/events/core.c | 13 ++++--- arch/x86/events/intel/core.c | 5 +-- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/pmu.c | 2 +- arch/x86/kvm/x86.c | 62 ++++++++++++++++-------------- arch/x86/xen/pmu.c | 32 +++++++-------- include/linux/perf_event.h | 10 +++-- 9 files changed, 84 insertions(+), 90 deletions(-) diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 86d9f2013172..274dc3e11b6d 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -104,7 +104,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, { struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - if (guest_cbs && guest_cbs->is_in_guest()) { + if (guest_cbs && guest_cbs->state()) { /* We don't support guest os callchain now */ return; } @@ -152,7 +152,7 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct stackframe frame; - if (guest_cbs && guest_cbs->is_in_guest()) { + if (guest_cbs && guest_cbs->state()) { /* We don't support guest os callchain now */ return; } @@ -165,8 +165,8 @@ unsigned long perf_instruction_pointer(struct pt_regs *regs) { struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - if (guest_cbs && guest_cbs->is_in_guest()) - return guest_cbs->get_guest_ip(); + if (guest_cbs && guest_cbs->state()) + return guest_cbs->get_ip(); return instruction_pointer(regs); } @@ -174,10 +174,11 @@ unsigned long perf_instruction_pointer(struct pt_regs *regs) unsigned long perf_misc_flags(struct pt_regs *regs) { struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + unsigned int guest_state = guest_cbs ? guest_cbs->state() : 0; int misc = 0; - if (guest_cbs && guest_cbs->is_in_guest()) { - if (guest_cbs->is_user_mode()) + if (guest_state) { + if (guest_state & PERF_GUEST_USER) misc |= PERF_RECORD_MISC_GUEST_USER; else misc |= PERF_RECORD_MISC_GUEST_KERNEL; diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c index a0d660cf889e..dfa9bce8559e 100644 --- a/arch/arm64/kvm/perf.c +++ b/arch/arm64/kvm/perf.c @@ -13,39 +13,34 @@ DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available); -static int kvm_is_in_guest(void) +static unsigned int kvm_guest_state(void) { - return kvm_get_running_vcpu() != NULL; -} + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + unsigned int state; -static int kvm_is_user_mode(void) -{ - struct kvm_vcpu *vcpu; + if (!vcpu) + return 0; - vcpu = kvm_get_running_vcpu(); + state = PERF_GUEST_ACTIVE; + if (!vcpu_mode_priv(vcpu)) + state |= PERF_GUEST_USER; - if (vcpu) - return !vcpu_mode_priv(vcpu); - - return 0; + return state; } static unsigned long kvm_get_guest_ip(void) { - struct kvm_vcpu *vcpu; + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); - vcpu = kvm_get_running_vcpu(); + if (WARN_ON_ONCE(!vcpu)) + return 0; - if (vcpu) - return *vcpu_pc(vcpu); - - return 0; + return *vcpu_pc(vcpu); } static struct perf_guest_info_callbacks kvm_guest_cbs = { - .is_in_guest = kvm_is_in_guest, - .is_user_mode = kvm_is_user_mode, - .get_guest_ip = kvm_get_guest_ip, + .state = kvm_guest_state, + .get_ip = kvm_get_guest_ip, }; void kvm_perf_init(void) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 32cec290d3ad..e29312a1003a 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2772,7 +2772,7 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re struct unwind_state state; unsigned long addr; - if (guest_cbs && guest_cbs->is_in_guest()) { + if (guest_cbs && guest_cbs->state()) { /* TODO: We don't support guest os callchain now */ return; } @@ -2876,7 +2876,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs struct stack_frame frame; const struct stack_frame __user *fp; - if (guest_cbs && guest_cbs->is_in_guest()) { + if (guest_cbs && guest_cbs->state()) { /* TODO: We don't support guest os callchain now */ return; } @@ -2955,8 +2955,8 @@ unsigned long perf_instruction_pointer(struct pt_regs *regs) { struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - if (guest_cbs && guest_cbs->is_in_guest()) - return guest_cbs->get_guest_ip(); + if (guest_cbs && guest_cbs->state()) + return guest_cbs->get_ip(); return regs->ip + code_segment_base(regs); } @@ -2964,10 +2964,11 @@ unsigned long perf_instruction_pointer(struct pt_regs *regs) unsigned long perf_misc_flags(struct pt_regs *regs) { struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + unsigned int guest_state = guest_cbs ? guest_cbs->state() : 0; int misc = 0; - if (guest_cbs && guest_cbs->is_in_guest()) { - if (guest_cbs->is_user_mode()) + if (guest_state) { + if (guest_state & PERF_GUEST_USER) misc |= PERF_RECORD_MISC_GUEST_USER; else misc |= PERF_RECORD_MISC_GUEST_KERNEL; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 2258e02ca350..7ff24d1ecdb7 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2906,10 +2906,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) handled++; guest_cbs = perf_get_guest_cbs(); - if (unlikely(guest_cbs && guest_cbs->is_in_guest() && - guest_cbs->handle_intel_pt_intr)) - guest_cbs->handle_intel_pt_intr(); - else + if (likely(!guest_cbs || !guest_cbs->handle_intel_pt_intr())) intel_pt_interrupt(); } diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 41e858df5795..fa1b1a209945 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1895,7 +1895,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); -int kvm_is_in_guest(void); +unsigned int kvm_guest_state(void); void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 09873f6488f7..b2520b3e9e89 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -87,7 +87,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event, * woken up. So we should wake it, but this is impossible from * NMI context. Do it from irq work instead. */ - if (!kvm_is_in_guest()) + if (!kvm_guest_state()) irq_work_queue(&pmc_to_pmu(pmc)->irq_work); else kvm_make_request(KVM_REQ_PMI, pmc->vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 760c4e3a8326..2011a1cfb42d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8472,44 +8472,48 @@ static void kvm_timer_init(void) DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu); -int kvm_is_in_guest(void) -{ - return __this_cpu_read(current_vcpu) != NULL; -} - -static int kvm_is_user_mode(void) -{ - int user_mode = 3; - - if (__this_cpu_read(current_vcpu)) - user_mode = static_call(kvm_x86_get_cpl)(__this_cpu_read(current_vcpu)); - - return user_mode != 0; -} - -static unsigned long kvm_get_guest_ip(void) -{ - unsigned long ip = 0; - - if (__this_cpu_read(current_vcpu)) - ip = kvm_rip_read(__this_cpu_read(current_vcpu)); - - return ip; -} - -static void kvm_handle_intel_pt_intr(void) +unsigned int kvm_guest_state(void) { struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu); + unsigned int state; + + if (!vcpu) + return 0; + + state = PERF_GUEST_ACTIVE; + if (static_call(kvm_x86_get_cpl)(vcpu)) + state |= PERF_GUEST_USER; + + return state; +} + +static unsigned long kvm_guest_get_ip(void) +{ + struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu); + + if (WARN_ON_ONCE(!vcpu)) + return 0; + + return kvm_rip_read(vcpu); +} + +static unsigned int kvm_handle_intel_pt_intr(void) +{ + struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu); + + /* '0' on failure so that the !PT case can use a RET0 static call. */ + if (!vcpu) + return 0; kvm_make_request(KVM_REQ_PMI, vcpu); __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, (unsigned long *)&vcpu->arch.pmu.global_status); + return 1; } static struct perf_guest_info_callbacks kvm_guest_cbs = { - .is_in_guest = kvm_is_in_guest, - .is_user_mode = kvm_is_user_mode, - .get_guest_ip = kvm_get_guest_ip, + .state = kvm_guest_state, + .get_ip = kvm_guest_get_ip, .handle_intel_pt_intr = NULL, }; diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index e13b0b49fcdf..89dd6b1708b0 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -413,34 +413,29 @@ int pmu_apic_update(uint32_t val) } /* perf callbacks */ -static int xen_is_in_guest(void) +static unsigned int xen_guest_state(void) { const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + unsigned int state = 0; if (!xenpmu_data) { pr_warn_once("%s: pmudata not initialized\n", __func__); - return 0; + return state; } if (!xen_initial_domain() || (xenpmu_data->domain_id >= DOMID_SELF)) - return 0; + return state; - return 1; -} + state |= PERF_GUEST_ACTIVE; -static int xen_is_user_mode(void) -{ - const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); - - if (!xenpmu_data) { - pr_warn_once("%s: pmudata not initialized\n", __func__); - return 0; + if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV) { + if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER) + state |= PERF_GUEST_USER; + } else if (xenpmu_data->pmu.r.regs.cpl & 3) { + state |= PERF_GUEST_USER; } - if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV) - return (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER); - else - return !!(xenpmu_data->pmu.r.regs.cpl & 3); + return state; } static unsigned long xen_get_guest_ip(void) @@ -456,9 +451,8 @@ static unsigned long xen_get_guest_ip(void) } static struct perf_guest_info_callbacks xen_guest_cbs = { - .is_in_guest = xen_is_in_guest, - .is_user_mode = xen_is_user_mode, - .get_guest_ip = xen_get_guest_ip, + .state = xen_guest_state, + .get_ip = xen_get_guest_ip, }; /* Convert registers from Xen's format to Linux' */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 98c204488496..5e6b346d62a7 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -26,11 +26,13 @@ # include #endif +#define PERF_GUEST_ACTIVE 0x01 +#define PERF_GUEST_USER 0x02 + struct perf_guest_info_callbacks { - int (*is_in_guest)(void); - int (*is_user_mode)(void); - unsigned long (*get_guest_ip)(void); - void (*handle_intel_pt_intr)(void); + unsigned int (*state)(void); + unsigned long (*get_ip)(void); + unsigned int (*handle_intel_pt_intr)(void); }; #ifdef CONFIG_HAVE_HW_BREAKPOINT From 1c3430516b0732d923de9fd3bfb3e2e537eeb235 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:28 +0000 Subject: [PATCH 07/17] perf: Add wrappers for invoking guest callbacks Add helpers for the guest callbacks to prepare for burying the callbacks behind a Kconfig (it's a lot easier to provide a few stubs than to #ifdef piles of code), and also to prepare for converting the callbacks to static_call(). perf_instruction_pointer() in particular will have subtle semantics with static_call(), as the "no callbacks" case will return 0 if the callbacks are unregistered between querying guest state and getting the IP. Implement the change now to avoid a functional change when adding static_call() support, and because the new helper needs to return _something_ in this case. Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20211111020738.2512932-8-seanjc@google.com --- arch/arm64/kernel/perf_callchain.c | 16 +++++----------- arch/x86/events/core.c | 15 +++++---------- arch/x86/events/intel/core.c | 5 +---- include/linux/perf_event.h | 24 ++++++++++++++++++++++++ 4 files changed, 35 insertions(+), 25 deletions(-) diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 274dc3e11b6d..db04a55cee7e 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -102,9 +102,7 @@ compat_user_backtrace(struct compat_frame_tail __user *tail, void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - if (guest_cbs && guest_cbs->state()) { + if (perf_guest_state()) { /* We don't support guest os callchain now */ return; } @@ -149,10 +147,9 @@ static bool callchain_trace(void *data, unsigned long pc) void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct stackframe frame; - if (guest_cbs && guest_cbs->state()) { + if (perf_guest_state()) { /* We don't support guest os callchain now */ return; } @@ -163,18 +160,15 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, unsigned long perf_instruction_pointer(struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - if (guest_cbs && guest_cbs->state()) - return guest_cbs->get_ip(); + if (perf_guest_state()) + return perf_guest_get_ip(); return instruction_pointer(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - unsigned int guest_state = guest_cbs ? guest_cbs->state() : 0; + unsigned int guest_state = perf_guest_state(); int misc = 0; if (guest_state) { diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index e29312a1003a..620347398027 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2768,11 +2768,10 @@ static bool perf_hw_regs(struct pt_regs *regs) void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct unwind_state state; unsigned long addr; - if (guest_cbs && guest_cbs->state()) { + if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ return; } @@ -2872,11 +2871,10 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct stack_frame frame; const struct stack_frame __user *fp; - if (guest_cbs && guest_cbs->state()) { + if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ return; } @@ -2953,18 +2951,15 @@ static unsigned long code_segment_base(struct pt_regs *regs) unsigned long perf_instruction_pointer(struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - if (guest_cbs && guest_cbs->state()) - return guest_cbs->get_ip(); + if (perf_guest_state()) + return perf_guest_get_ip(); return regs->ip + code_segment_base(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - unsigned int guest_state = guest_cbs ? guest_cbs->state() : 0; + unsigned int guest_state = perf_guest_state(); int misc = 0; if (guest_state) { diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7ff24d1ecdb7..f7af8029664f 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2837,7 +2837,6 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) { struct perf_sample_data data; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct perf_guest_info_callbacks *guest_cbs; int bit; int handled = 0; u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); @@ -2904,9 +2903,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) */ if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned long *)&status)) { handled++; - - guest_cbs = perf_get_guest_cbs(); - if (likely(!guest_cbs || !guest_cbs->handle_intel_pt_intr())) + if (!perf_guest_handle_intel_pt_intr()) intel_pt_interrupt(); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5e6b346d62a7..346d5aff5804 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1254,6 +1254,30 @@ static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void) */ return rcu_dereference(perf_guest_cbs); } +static inline unsigned int perf_guest_state(void) +{ + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + + return guest_cbs ? guest_cbs->state() : 0; +} +static inline unsigned long perf_guest_get_ip(void) +{ + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + + /* + * Arbitrarily return '0' in the unlikely scenario that the callbacks + * are unregistered between checking guest state and getting the IP. + */ + return guest_cbs ? guest_cbs->get_ip() : 0; +} +static inline unsigned int perf_guest_handle_intel_pt_intr(void) +{ + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + + if (guest_cbs && guest_cbs->handle_intel_pt_intr) + return guest_cbs->handle_intel_pt_intr(); + return 0; +} extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); From 2aef6f306b39bbe74e2287d6e2ee07c4867d87d0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:29 +0000 Subject: [PATCH 08/17] perf: Force architectures to opt-in to guest callbacks Introduce GUEST_PERF_EVENTS and require architectures to select it to allow registering and using guest callbacks in perf. This will hopefully make it more difficult for new architectures to add useless "support" for guest callbacks, e.g. via copy+paste. Stubbing out the helpers has the happy bonus of avoiding a load of perf_guest_cbs when GUEST_PERF_EVENTS=n on arm64/x86. Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20211111020738.2512932-9-seanjc@google.com --- arch/arm64/kvm/Kconfig | 1 + arch/x86/kvm/Kconfig | 1 + arch/x86/xen/Kconfig | 1 + include/linux/perf_event.h | 6 ++++++ init/Kconfig | 4 ++++ kernel/events/core.c | 2 ++ 6 files changed, 15 insertions(+) diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 8ffcbe29395e..e9761d84f982 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -39,6 +39,7 @@ menuconfig KVM select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_VCPU_RUN_PID_CHANGE select SCHED_INFO + select GUEST_PERF_EVENTS if PERF_EVENTS help Support hosting virtualized guest machines. diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 619186138176..47bdbe705a76 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -36,6 +36,7 @@ config KVM select KVM_MMIO select SCHED_INFO select PERF_EVENTS + select GUEST_PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_NO_POLL diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 6bcd3d8ca6ac..85246dd9faa1 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -23,6 +23,7 @@ config XEN_PV select PARAVIRT_XXL select XEN_HAVE_PVMMU select XEN_HAVE_VPMU + select GUEST_PERF_EVENTS help Support running as a Xen PV guest. diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 346d5aff5804..ea47ef616ee0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1242,6 +1242,7 @@ extern void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags); +#ifdef CONFIG_GUEST_PERF_EVENTS extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void) { @@ -1280,6 +1281,11 @@ static inline unsigned int perf_guest_handle_intel_pt_intr(void) } extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); +#else +static inline unsigned int perf_guest_state(void) { return 0; } +static inline unsigned long perf_guest_get_ip(void) { return 0; } +static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; } +#endif /* CONFIG_GUEST_PERF_EVENTS */ extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); diff --git a/init/Kconfig b/init/Kconfig index 036b750e8d8a..72d40b3b5805 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1804,6 +1804,10 @@ config HAVE_PERF_EVENTS help See tools/perf/design.txt for details. +config GUEST_PERF_EVENTS + bool + depends on HAVE_PERF_EVENTS + config PERF_USE_VMALLOC bool help diff --git a/kernel/events/core.c b/kernel/events/core.c index 17e5b20762c5..5a3502cd5362 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6521,6 +6521,7 @@ static void perf_pending_event(struct irq_work *entry) perf_swevent_put_recursion_context(rctx); } +#ifdef CONFIG_GUEST_PERF_EVENTS struct perf_guest_info_callbacks __rcu *perf_guest_cbs; void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) @@ -6541,6 +6542,7 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) synchronize_rcu(); } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); +#endif static void perf_output_sample_regs(struct perf_output_handle *handle, From 87b940a0675e25261f022ac3e53e0dfff9cdb995 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:30 +0000 Subject: [PATCH 09/17] perf/core: Use static_call to optimize perf_guest_info_callbacks Use static_call to optimize perf's guest callbacks on arm64 and x86, which are now the only architectures that define the callbacks. Use DEFINE_STATIC_CALL_RET0 as the default/NULL for all guest callbacks, as the callback semantics are that a return value '0' means "not in guest". static_call obviously avoids the overhead of CONFIG_RETPOLINE=y, but is also advantageous versus other solutions, e.g. per-cpu callbacks, in that a per-cpu memory load is not needed to detect the !guest case. Based on code from Peter and Like. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20211111020738.2512932-10-seanjc@google.com --- include/linux/perf_event.h | 34 ++++++++-------------------------- kernel/events/core.c | 15 +++++++++++++++ 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ea47ef616ee0..0ac7d867ca0c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1244,40 +1244,22 @@ extern void perf_event_bpf_event(struct bpf_prog *prog, #ifdef CONFIG_GUEST_PERF_EVENTS extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; -static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void) -{ - /* - * Callbacks are RCU-protected and must be READ_ONCE to avoid reloading - * the callbacks between a !NULL check and dereferences, to ensure - * pending stores/changes to the callback pointers are visible before a - * non-NULL perf_guest_cbs is visible to readers, and to prevent a - * module from unloading callbacks while readers are active. - */ - return rcu_dereference(perf_guest_cbs); -} + +DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state); +DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip); +DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); + static inline unsigned int perf_guest_state(void) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - return guest_cbs ? guest_cbs->state() : 0; + return static_call(__perf_guest_state)(); } static inline unsigned long perf_guest_get_ip(void) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - /* - * Arbitrarily return '0' in the unlikely scenario that the callbacks - * are unregistered between checking guest state and getting the IP. - */ - return guest_cbs ? guest_cbs->get_ip() : 0; + return static_call(__perf_guest_get_ip)(); } static inline unsigned int perf_guest_handle_intel_pt_intr(void) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - if (guest_cbs && guest_cbs->handle_intel_pt_intr) - return guest_cbs->handle_intel_pt_intr(); - return 0; + return static_call(__perf_guest_handle_intel_pt_intr)(); } extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); diff --git a/kernel/events/core.c b/kernel/events/core.c index 5a3502cd5362..3b3297a57228 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6524,12 +6524,23 @@ static void perf_pending_event(struct irq_work *entry) #ifdef CONFIG_GUEST_PERF_EVENTS struct perf_guest_info_callbacks __rcu *perf_guest_cbs; +DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state); +DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip); +DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); + void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) return; rcu_assign_pointer(perf_guest_cbs, cbs); + static_call_update(__perf_guest_state, cbs->state); + static_call_update(__perf_guest_get_ip, cbs->get_ip); + + /* Implementing ->handle_intel_pt_intr is optional. */ + if (cbs->handle_intel_pt_intr) + static_call_update(__perf_guest_handle_intel_pt_intr, + cbs->handle_intel_pt_intr); } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); @@ -6539,6 +6550,10 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) return; rcu_assign_pointer(perf_guest_cbs, NULL); + static_call_update(__perf_guest_state, (void *)&__static_call_return0); + static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0); + static_call_update(__perf_guest_handle_intel_pt_intr, + (void *)&__static_call_return0); synchronize_rcu(); } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); From 73cd107b9685c5308e864061772e4a78a629e4a0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:31 +0000 Subject: [PATCH 10/17] KVM: x86: Drop current_vcpu for kvm_running_vcpu + kvm_arch_vcpu variable Use the generic kvm_running_vcpu plus a new 'handling_intr_from_guest' variable in kvm_arch_vcpu instead of the semi-redundant current_vcpu. kvm_before/after_interrupt() must be called while the vCPU is loaded, (which protects against preemption), thus kvm_running_vcpu is guaranteed to be non-NULL when handling_intr_from_guest is non-zero. Switching to kvm_get_running_vcpu() will allows moving KVM's perf callbacks to generic code, and the new flag will be used in a future patch to more precisely identify the "NMI from guest" case. Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20211111020738.2512932-11-seanjc@google.com --- arch/x86/include/asm/kvm_host.h | 3 +-- arch/x86/kvm/pmu.c | 2 +- arch/x86/kvm/x86.c | 23 +++++++++++++---------- arch/x86/kvm/x86.h | 10 ++++++---- 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fa1b1a209945..38f01b00d82a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -773,6 +773,7 @@ struct kvm_vcpu_arch { unsigned nmi_pending; /* NMI queued after currently running handler */ bool nmi_injected; /* Trying to inject an NMI this entry */ bool smi_pending; /* SMI queued after currently running handler */ + u8 handling_intr_from_guest; struct kvm_mtrr mtrr_state; u64 pat; @@ -1895,8 +1896,6 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); -unsigned int kvm_guest_state(void); - void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b2520b3e9e89..0c2133eb4cf6 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -87,7 +87,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event, * woken up. So we should wake it, but this is impossible from * NMI context. Do it from irq work instead. */ - if (!kvm_guest_state()) + if (!kvm_handling_nmi_from_guest(pmc->vcpu)) irq_work_queue(&pmc_to_pmu(pmc)->irq_work); else kvm_make_request(KVM_REQ_PMI, pmc->vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2011a1cfb42d..bb71e10fdb6a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8469,15 +8469,17 @@ static void kvm_timer_init(void) kvmclock_cpu_online, kvmclock_cpu_down_prep); } -DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); -EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu); - -unsigned int kvm_guest_state(void) +static inline bool kvm_pmi_in_guest(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu); + return vcpu && vcpu->arch.handling_intr_from_guest; +} + +static unsigned int kvm_guest_state(void) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); unsigned int state; - if (!vcpu) + if (!kvm_pmi_in_guest(vcpu)) return 0; state = PERF_GUEST_ACTIVE; @@ -8489,9 +8491,10 @@ unsigned int kvm_guest_state(void) static unsigned long kvm_guest_get_ip(void) { - struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu); + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); - if (WARN_ON_ONCE(!vcpu)) + /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */ + if (WARN_ON_ONCE(!kvm_pmi_in_guest(vcpu))) return 0; return kvm_rip_read(vcpu); @@ -8499,10 +8502,10 @@ static unsigned long kvm_guest_get_ip(void) static unsigned int kvm_handle_intel_pt_intr(void) { - struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu); + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); /* '0' on failure so that the !PT case can use a RET0 static call. */ - if (!vcpu) + if (!kvm_pmi_in_guest(vcpu)) return 0; kvm_make_request(KVM_REQ_PMI, vcpu); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index ea264c4502e4..d070043fd2e8 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -385,18 +385,20 @@ static inline bool kvm_cstate_in_guest(struct kvm *kvm) return kvm->arch.cstate_in_guest; } -DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu); - static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu) { - __this_cpu_write(current_vcpu, vcpu); + WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 1); } static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu) { - __this_cpu_write(current_vcpu, NULL); + WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 0); } +static inline bool kvm_handling_nmi_from_guest(struct kvm_vcpu *vcpu) +{ + return !!vcpu->arch.handling_intr_from_guest; +} static inline bool kvm_pat_valid(u64 data) { From db215756ae5970aec8ad50257d2eb1678b552b91 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:32 +0000 Subject: [PATCH 11/17] KVM: x86: More precisely identify NMI from guest when handling PMI Differentiate between IRQ and NMI for KVM's PMC overflow callback, which was originally invoked in response to an NMI that arrived while the guest was running, but was inadvertantly changed to fire on IRQs as well when support for perf without PMU/NMI was added to KVM. In practice, this should be a nop as the PMC overflow callback shouldn't be reached, but it's a cheap and easy fix that also better documents the situation. Note, this also doesn't completely prevent false positives if perf somehow ends up calling into KVM, e.g. an NMI can arrive in host after KVM sets its flag. Fixes: dd60d217062f ("KVM: x86: Fix perf timer mode IP reporting") Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20211111020738.2512932-12-seanjc@google.com --- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/vmx/vmx.c | 4 +++- arch/x86/kvm/x86.c | 2 +- arch/x86/kvm/x86.h | 13 ++++++++++--- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 5630c241d5f6..b2f0c6c40802 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3931,7 +3931,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) } if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) - kvm_before_interrupt(vcpu); + kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); kvm_load_host_xsave_state(vcpu); stgi(); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 7d90c8d443ac..a0c24976e6bb 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6317,7 +6317,9 @@ void vmx_do_interrupt_nmi_irqoff(unsigned long entry); static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, unsigned long entry) { - kvm_before_interrupt(vcpu); + bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist; + + kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ); vmx_do_interrupt_nmi_irqoff(entry); kvm_after_interrupt(vcpu); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bb71e10fdb6a..ab032ef7879f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9896,7 +9896,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * interrupts on processors that implement an interrupt shadow, the * stat.exits increment will do nicely. */ - kvm_before_interrupt(vcpu); + kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); local_irq_enable(); ++vcpu->stat.exits; local_irq_disable(); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index d070043fd2e8..f8d2c58feadc 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -385,9 +385,16 @@ static inline bool kvm_cstate_in_guest(struct kvm *kvm) return kvm->arch.cstate_in_guest; } -static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu) +enum kvm_intr_type { + /* Values are arbitrary, but must be non-zero. */ + KVM_HANDLING_IRQ = 1, + KVM_HANDLING_NMI, +}; + +static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu, + enum kvm_intr_type intr) { - WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 1); + WRITE_ONCE(vcpu->arch.handling_intr_from_guest, (u8)intr); } static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu) @@ -397,7 +404,7 @@ static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu) static inline bool kvm_handling_nmi_from_guest(struct kvm_vcpu *vcpu) { - return !!vcpu->arch.handling_intr_from_guest; + return vcpu->arch.handling_intr_from_guest == KVM_HANDLING_NMI; } static inline bool kvm_pat_valid(u64 data) From e1bfc24577cc65c95dc519d7621a9c985b97e567 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:33 +0000 Subject: [PATCH 12/17] KVM: Move x86's perf guest info callbacks to generic KVM Move x86's perf guest callbacks into common KVM, as they are semantically identical to arm64's callbacks (the only other such KVM callbacks). arm64 will convert to the common versions in a future patch. Implement the necessary arm64 arch hooks now to avoid having to provide stubs or a temporary #define (from x86) to avoid arm64 compilation errors when CONFIG_GUEST_PERF_EVENTS=y. Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20211111020738.2512932-13-seanjc@google.com --- arch/arm64/include/asm/kvm_host.h | 10 ++++++ arch/arm64/kvm/arm.c | 5 +++ arch/x86/include/asm/kvm_host.h | 3 ++ arch/x86/kvm/x86.c | 53 +++++++------------------------ include/linux/kvm_host.h | 10 ++++++ virt/kvm/kvm_main.c | 44 +++++++++++++++++++++++++ 6 files changed, 83 insertions(+), 42 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index f680f303ba7c..aa28b8e0e5d0 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -678,6 +678,16 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa); void kvm_perf_init(void); void kvm_perf_teardown(void); +/* + * Returns true if a Performance Monitoring Interrupt (PMI), a.k.a. perf event, + * arrived in guest context. For arm64, any event that arrives while a vCPU is + * loaded is considered to be "in guest". + */ +static inline bool kvm_arch_pmi_in_guest(struct kvm_vcpu *vcpu) +{ + return IS_ENABLED(CONFIG_GUEST_PERF_EVENTS) && !!vcpu; +} + long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu); gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu); void kvm_update_stolen_time(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 2f03cbfefe67..b400be996040 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -496,6 +496,11 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) return vcpu_mode_priv(vcpu); } +unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) +{ + return *vcpu_pc(vcpu); +} + /* Just ensure a guest exit from a particular CPU */ static void exit_vm_noop(void *info) { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 38f01b00d82a..89576549b1d5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1567,6 +1567,9 @@ static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) return -ENOTSUPP; } +#define kvm_arch_pmi_in_guest(vcpu) \ + ((vcpu) && (vcpu)->arch.handling_intr_from_guest) + int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ab032ef7879f..32cb6f9ca077 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8469,43 +8469,12 @@ static void kvm_timer_init(void) kvmclock_cpu_online, kvmclock_cpu_down_prep); } -static inline bool kvm_pmi_in_guest(struct kvm_vcpu *vcpu) -{ - return vcpu && vcpu->arch.handling_intr_from_guest; -} - -static unsigned int kvm_guest_state(void) -{ - struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); - unsigned int state; - - if (!kvm_pmi_in_guest(vcpu)) - return 0; - - state = PERF_GUEST_ACTIVE; - if (static_call(kvm_x86_get_cpl)(vcpu)) - state |= PERF_GUEST_USER; - - return state; -} - -static unsigned long kvm_guest_get_ip(void) -{ - struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); - - /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */ - if (WARN_ON_ONCE(!kvm_pmi_in_guest(vcpu))) - return 0; - - return kvm_rip_read(vcpu); -} - static unsigned int kvm_handle_intel_pt_intr(void) { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); /* '0' on failure so that the !PT case can use a RET0 static call. */ - if (!kvm_pmi_in_guest(vcpu)) + if (!kvm_arch_pmi_in_guest(vcpu)) return 0; kvm_make_request(KVM_REQ_PMI, vcpu); @@ -8514,12 +8483,6 @@ static unsigned int kvm_handle_intel_pt_intr(void) return 1; } -static struct perf_guest_info_callbacks kvm_guest_cbs = { - .state = kvm_guest_state, - .get_ip = kvm_guest_get_ip, - .handle_intel_pt_intr = NULL, -}; - #ifdef CONFIG_X86_64 static void pvclock_gtod_update_fn(struct work_struct *work) { @@ -11229,9 +11192,11 @@ int kvm_arch_hardware_setup(void *opaque) memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); kvm_ops_static_call_update(); + /* Temporary ugliness. */ if (ops->intel_pt_intr_in_guest && ops->intel_pt_intr_in_guest()) - kvm_guest_cbs.handle_intel_pt_intr = kvm_handle_intel_pt_intr; - perf_register_guest_info_callbacks(&kvm_guest_cbs); + kvm_register_perf_callbacks(kvm_handle_intel_pt_intr); + else + kvm_register_perf_callbacks(NULL); if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) supported_xss = 0; @@ -11260,8 +11225,7 @@ int kvm_arch_hardware_setup(void *opaque) void kvm_arch_hardware_unsetup(void) { - perf_unregister_guest_info_callbacks(&kvm_guest_cbs); - kvm_guest_cbs.handle_intel_pt_intr = NULL; + kvm_unregister_perf_callbacks(); static_call(kvm_x86_hardware_unsetup)(); } @@ -11852,6 +11816,11 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) return vcpu->arch.preempted_in_kernel; } +unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) +{ + return kvm_rip_read(vcpu); +} + int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9e0667e3723e..9df7ab2d7530 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1170,6 +1170,16 @@ static inline bool kvm_arch_intc_initialized(struct kvm *kvm) } #endif +#ifdef CONFIG_GUEST_PERF_EVENTS +unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu); + +void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)); +void kvm_unregister_perf_callbacks(void); +#else +static inline void kvm_register_perf_callbacks(void *ign) {} +static inline void kvm_unregister_perf_callbacks(void) {} +#endif /* CONFIG_GUEST_PERF_EVENTS */ + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type); void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_arch_sync_events(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d31724500501..76778dd2351f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5479,6 +5479,50 @@ struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void) return &kvm_running_vcpu; } +#ifdef CONFIG_GUEST_PERF_EVENTS +static unsigned int kvm_guest_state(void) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + unsigned int state; + + if (!kvm_arch_pmi_in_guest(vcpu)) + return 0; + + state = PERF_GUEST_ACTIVE; + if (!kvm_arch_vcpu_in_kernel(vcpu)) + state |= PERF_GUEST_USER; + + return state; +} + +static unsigned long kvm_guest_get_ip(void) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + + /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */ + if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu))) + return 0; + + return kvm_arch_vcpu_get_ip(vcpu); +} + +static struct perf_guest_info_callbacks kvm_guest_cbs = { + .state = kvm_guest_state, + .get_ip = kvm_guest_get_ip, + .handle_intel_pt_intr = NULL, +}; + +void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)) +{ + kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler; + perf_register_guest_info_callbacks(&kvm_guest_cbs); +} +void kvm_unregister_perf_callbacks(void) +{ + perf_unregister_guest_info_callbacks(&kvm_guest_cbs); +} +#endif + struct kvm_cpu_compat_check { void *opaque; int *ret; From 33271a9e2b52e07e278a67c900d2d2afb5c55bd5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:34 +0000 Subject: [PATCH 13/17] KVM: x86: Move Intel Processor Trace interrupt handler to vmx.c Now that all state needed for VMX's PT interrupt handler is exposed to vmx.c (specifically the currently running vCPU), move the handler into vmx.c where it belongs. Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20211111020738.2512932-14-seanjc@google.com --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/vmx/vmx.c | 22 +++++++++++++++++++++- arch/x86/kvm/x86.c | 20 +------------------- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 89576549b1d5..298224fd60ad 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1517,7 +1517,7 @@ struct kvm_x86_init_ops { int (*disabled_by_bios)(void); int (*check_processor_compatibility)(void); int (*hardware_setup)(void); - bool (*intel_pt_intr_in_guest)(void); + unsigned int (*handle_intel_pt_intr)(void); struct kvm_x86_ops *runtime_ops; }; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a0c24976e6bb..b258f90a23b9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7655,6 +7655,20 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, }; +static unsigned int vmx_handle_intel_pt_intr(void) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + + /* '0' on failure so that the !PT case can use a RET0 static call. */ + if (!kvm_arch_pmi_in_guest(vcpu)) + return 0; + + kvm_make_request(KVM_REQ_PMI, vcpu); + __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, + (unsigned long *)&vcpu->arch.pmu.global_status); + return 1; +} + static __init void vmx_setup_user_return_msrs(void) { @@ -7681,6 +7695,8 @@ static __init void vmx_setup_user_return_msrs(void) kvm_add_user_return_msr(vmx_uret_msrs_list[i]); } +static struct kvm_x86_init_ops vmx_init_ops __initdata; + static __init int hardware_setup(void) { unsigned long host_bndcfgs; @@ -7839,6 +7855,10 @@ static __init int hardware_setup(void) return -EINVAL; if (!enable_ept || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; + if (pt_mode == PT_MODE_HOST_GUEST) + vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; + else + vmx_init_ops.handle_intel_pt_intr = NULL; setup_default_sgx_lepubkeyhash(); @@ -7867,7 +7887,7 @@ static struct kvm_x86_init_ops vmx_init_ops __initdata = { .disabled_by_bios = vmx_disabled_by_bios, .check_processor_compatibility = vmx_check_processor_compat, .hardware_setup = hardware_setup, - .intel_pt_intr_in_guest = vmx_pt_mode_is_host_guest, + .handle_intel_pt_intr = NULL, .runtime_ops = &vmx_x86_ops, }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 32cb6f9ca077..4e0ed2fdc2e1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8469,20 +8469,6 @@ static void kvm_timer_init(void) kvmclock_cpu_online, kvmclock_cpu_down_prep); } -static unsigned int kvm_handle_intel_pt_intr(void) -{ - struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); - - /* '0' on failure so that the !PT case can use a RET0 static call. */ - if (!kvm_arch_pmi_in_guest(vcpu)) - return 0; - - kvm_make_request(KVM_REQ_PMI, vcpu); - __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, - (unsigned long *)&vcpu->arch.pmu.global_status); - return 1; -} - #ifdef CONFIG_X86_64 static void pvclock_gtod_update_fn(struct work_struct *work) { @@ -11192,11 +11178,7 @@ int kvm_arch_hardware_setup(void *opaque) memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); kvm_ops_static_call_update(); - /* Temporary ugliness. */ - if (ops->intel_pt_intr_in_guest && ops->intel_pt_intr_in_guest()) - kvm_register_perf_callbacks(kvm_handle_intel_pt_intr); - else - kvm_register_perf_callbacks(NULL); + kvm_register_perf_callbacks(ops->handle_intel_pt_intr); if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) supported_xss = 0; From 7b517831a1c6aceb0821860edb9c7bc7d4f803a2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:35 +0000 Subject: [PATCH 14/17] KVM: arm64: Convert to the generic perf callbacks Drop arm64's version of the callbacks in favor of the callbacks provided by generic KVM, which are semantically identical. Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20211111020738.2512932-15-seanjc@google.com --- arch/arm64/kvm/perf.c | 34 ++-------------------------------- 1 file changed, 2 insertions(+), 32 deletions(-) diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c index dfa9bce8559e..374c496a3f1d 100644 --- a/arch/arm64/kvm/perf.c +++ b/arch/arm64/kvm/perf.c @@ -13,42 +13,12 @@ DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available); -static unsigned int kvm_guest_state(void) -{ - struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); - unsigned int state; - - if (!vcpu) - return 0; - - state = PERF_GUEST_ACTIVE; - if (!vcpu_mode_priv(vcpu)) - state |= PERF_GUEST_USER; - - return state; -} - -static unsigned long kvm_get_guest_ip(void) -{ - struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); - - if (WARN_ON_ONCE(!vcpu)) - return 0; - - return *vcpu_pc(vcpu); -} - -static struct perf_guest_info_callbacks kvm_guest_cbs = { - .state = kvm_guest_state, - .get_ip = kvm_get_guest_ip, -}; - void kvm_perf_init(void) { - perf_register_guest_info_callbacks(&kvm_guest_cbs); + kvm_register_perf_callbacks(NULL); } void kvm_perf_teardown(void) { - perf_unregister_guest_info_callbacks(&kvm_guest_cbs); + kvm_unregister_perf_callbacks(); } From be399d824b432a85f8df86b566d2e5994fdf58b0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:36 +0000 Subject: [PATCH 15/17] KVM: arm64: Hide kvm_arm_pmu_available behind CONFIG_HW_PERF_EVENTS=y Move the definition of kvm_arm_pmu_available to pmu-emul.c and, out of "necessity", hide it behind CONFIG_HW_PERF_EVENTS. Provide a stub for the key's wrapper, kvm_arm_support_pmu_v3(). Moving the key's definition out of perf.c will allow a future commit to delete perf.c entirely. Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20211111020738.2512932-16-seanjc@google.com --- arch/arm64/kernel/image-vars.h | 2 ++ arch/arm64/kvm/perf.c | 2 -- arch/arm64/kvm/pmu-emul.c | 2 ++ include/kvm/arm_pmu.h | 19 ++++++++++++------- 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index c96a9a0043bf..7eaf1f7c4168 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -102,7 +102,9 @@ KVM_NVHE_ALIAS(__stop___kvm_ex_table); KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base); /* PMU available static key */ +#ifdef CONFIG_HW_PERF_EVENTS KVM_NVHE_ALIAS(kvm_arm_pmu_available); +#endif /* Position-independent library routines */ KVM_NVHE_ALIAS_HYP(clear_page, __pi_clear_page); diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c index 374c496a3f1d..52cfab253c65 100644 --- a/arch/arm64/kvm/perf.c +++ b/arch/arm64/kvm/perf.c @@ -11,8 +11,6 @@ #include -DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available); - void kvm_perf_init(void) { kvm_register_perf_callbacks(NULL); diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index a5e4bbf5e68f..3308ceefa129 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -14,6 +14,8 @@ #include #include +DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available); + static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx); static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx); static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc); diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 90f21898aad8..f9ed4c171d7b 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -13,13 +13,6 @@ #define ARMV8_PMU_CYCLE_IDX (ARMV8_PMU_MAX_COUNTERS - 1) #define ARMV8_PMU_MAX_COUNTER_PAIRS ((ARMV8_PMU_MAX_COUNTERS + 1) >> 1) -DECLARE_STATIC_KEY_FALSE(kvm_arm_pmu_available); - -static __always_inline bool kvm_arm_support_pmu_v3(void) -{ - return static_branch_likely(&kvm_arm_pmu_available); -} - #ifdef CONFIG_HW_PERF_EVENTS struct kvm_pmc { @@ -36,6 +29,13 @@ struct kvm_pmu { struct irq_work overflow_work; }; +DECLARE_STATIC_KEY_FALSE(kvm_arm_pmu_available); + +static __always_inline bool kvm_arm_support_pmu_v3(void) +{ + return static_branch_likely(&kvm_arm_pmu_available); +} + #define kvm_arm_pmu_irq_initialized(v) ((v)->arch.pmu.irq_num >= VGIC_NR_SGIS) u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx); void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val); @@ -65,6 +65,11 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu); struct kvm_pmu { }; +static inline bool kvm_arm_support_pmu_v3(void) +{ + return false; +} + #define kvm_arm_pmu_irq_initialized(v) (false) static inline u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) From 17ed14eba22b3a86e82fb6df28af00fb4cadfd77 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:37 +0000 Subject: [PATCH 16/17] KVM: arm64: Drop perf.c and fold its tiny bits of code into arm.c Call KVM's (un)register perf callbacks helpers directly from arm.c and delete perf.c No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20211111020738.2512932-17-seanjc@google.com --- arch/arm64/include/asm/kvm_host.h | 3 --- arch/arm64/kvm/Makefile | 2 +- arch/arm64/kvm/arm.c | 5 +++-- arch/arm64/kvm/perf.c | 22 ---------------------- 4 files changed, 4 insertions(+), 28 deletions(-) delete mode 100644 arch/arm64/kvm/perf.c diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index aa28b8e0e5d0..541e7a813eb8 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -675,9 +675,6 @@ unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len); int kvm_handle_mmio_return(struct kvm_vcpu *vcpu); int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa); -void kvm_perf_init(void); -void kvm_perf_teardown(void); - /* * Returns true if a Performance Monitoring Interrupt (PMI), a.k.a. perf event, * arrived in guest context. For arm64, any event that arrives while a vCPU is diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 989bb5dad2c8..0bcc378b7961 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -12,7 +12,7 @@ obj-$(CONFIG_KVM) += hyp/ kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \ $(KVM)/vfio.o $(KVM)/irqchip.o $(KVM)/binary_stats.o \ - arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \ + arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ inject_fault.o va_layout.o handle_exit.o \ guest.o debug.o reset.o sys_regs.o \ vgic-sys-reg-v3.o fpsimd.o pmu.o \ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index b400be996040..8129ee1ed3a4 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1773,7 +1773,8 @@ static int init_subsystems(void) if (err) goto out; - kvm_perf_init(); + kvm_register_perf_callbacks(NULL); + kvm_sys_reg_table_init(); out: @@ -2161,7 +2162,7 @@ int kvm_arch_init(void *opaque) /* NOP: Compiling as a module not supported */ void kvm_arch_exit(void) { - kvm_perf_teardown(); + kvm_unregister_perf_callbacks(); } static int __init early_kvm_mode_cfg(char *arg) diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c deleted file mode 100644 index 52cfab253c65..000000000000 --- a/arch/arm64/kvm/perf.c +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Based on the x86 implementation. - * - * Copyright (C) 2012 ARM Ltd. - * Author: Marc Zyngier - */ - -#include -#include - -#include - -void kvm_perf_init(void) -{ - kvm_register_perf_callbacks(NULL); -} - -void kvm_perf_teardown(void) -{ - kvm_unregister_perf_callbacks(); -} From a9f4a6e92b3b319296fb078da2615f618f6cd80c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:38 +0000 Subject: [PATCH 17/17] perf: Drop guest callback (un)register stubs Drop perf's stubs for (un)registering guest callbacks now that KVM registration of callbacks is hidden behind GUEST_PERF_EVENTS=y. The only other user is x86 XEN_PV, and x86 unconditionally selects PERF_EVENTS. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20211111020738.2512932-18-seanjc@google.com --- include/linux/perf_event.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0ac7d867ca0c..7b7525e9155f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1511,11 +1511,6 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } static inline void perf_bp_event(struct perf_event *event, void *data) { } -static inline void perf_register_guest_info_callbacks -(struct perf_guest_info_callbacks *cbs) { } -static inline void perf_unregister_guest_info_callbacks -(struct perf_guest_info_callbacks *cbs) { } - static inline void perf_event_mmap(struct vm_area_struct *vma) { } typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);