From 9699f970de84292a766709029e5135ea0b6c9aa9 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 24 Jan 2019 15:27:09 +0100 Subject: [PATCH 01/18] x86/kvm/hyper-v: don't announce GUEST IDLE MSR support HV_X64_MSR_GUEST_IDLE_AVAILABLE appeared in kvm_vcpu_ioctl_get_hv_cpuid() by mistake: it announces support for HV_X64_MSR_GUEST_IDLE (0x400000F0) which we don't support in KVM (yet). Fixes: 2bc39970e932 ("x86/kvm/hyper-v: Introduce KVM_GET_SUPPORTED_HV_CPUID") Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index c90a5352d158..ac44a681f065 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1832,7 +1832,6 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_X64_MSR_VP_INDEX_AVAILABLE; ent->eax |= HV_X64_MSR_RESET_AVAILABLE; ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE; - ent->eax |= HV_X64_MSR_GUEST_IDLE_AVAILABLE; ent->eax |= HV_X64_ACCESS_FREQUENCY_MSRS; ent->eax |= HV_X64_ACCESS_REENLIGHTENMENT; From 5cc244a20b86090c087073c124284381cdf47234 Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Mon, 21 Jan 2019 15:48:40 +0300 Subject: [PATCH 02/18] KVM: x86: Fix single-step debugging The single-step debugging of KVM guests on x86 is broken: if we run gdb 'stepi' command at the breakpoint when the guest interrupts are enabled, RIP always jumps to native_apic_mem_write(). Then other nasty effects follow. Long investigation showed that on Jun 7, 2017 the commit c8401dda2f0a00cd25c0 ("KVM: x86: fix singlestepping over syscall") introduced the kvm_run.debug corruption: kvm_vcpu_do_singlestep() can be called without X86_EFLAGS_TF set. Let's fix it. Please consider that for -stable. Signed-off-by: Alexander Popov Cc: stable@vger.kernel.org Fixes: c8401dda2f0a00cd25c0 ("KVM: x86: fix singlestepping over syscall") Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 02c8e095a239..f14bb806aeed 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6480,8 +6480,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; kvm_rip_write(vcpu, ctxt->eip); - if (r == EMULATE_DONE && - (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) + if (r == EMULATE_DONE && ctxt->tf) kvm_vcpu_do_singlestep(vcpu, &r); if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) From 85ba2b165d11029c0c57a58640d4cf41f9d9fa0d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 14 Jan 2019 12:12:02 -0800 Subject: [PATCH 03/18] KVM: VMX: Use the correct field var when clearing VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL Fix a recently introduced bug that results in the wrong VMCS control field being updated when applying a IA32_PERF_GLOBAL_CTRL errata. Fixes: c73da3fcab43 ("KVM: VMX: Properly handle dynamic VM Entry/Exit controls") Reported-by: Harald Arnesen Tested-by: Harald Arnesen Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f6915f10e584..0762fcab8fc9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2344,7 +2344,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, case 37: /* AAT100 */ case 44: /* BC86,AAY89,BD102 */ case 46: /* BA97 */ - _vmexit_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " "does not work properly. Using workaround\n"); From 3a33d030daaa7c507e1c12d5adcf828248429593 Mon Sep 17 00:00:00 2001 From: Tom Roeder Date: Thu, 24 Jan 2019 13:48:20 -0800 Subject: [PATCH 04/18] kvm: x86/vmx: Use kzalloc for cached_vmcs12 This changes the allocation of cached_vmcs12 to use kzalloc instead of kmalloc. This removes the information leak found by Syzkaller (see Reported-by) in this case and prevents similar leaks from happening based on cached_vmcs12. It also changes vmx_get_nested_state to copy out the full 4k VMCS12_SIZE in copy_to_user rather than only the size of the struct. Tested: rebuilt against head, booted, and ran the syszkaller repro https://syzkaller.appspot.com/text?tag=ReproC&x=174efca3400000 without observing any problems. Reported-by: syzbot+ded1696f6b50b615b630@syzkaller.appspotmail.com Fixes: 8fcc4b5923af5de58b80b53a069453b135693304 Cc: stable@vger.kernel.org Signed-off-by: Tom Roeder Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 2616bd2c7f2c..ce8153923854 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4140,11 +4140,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (r < 0) goto out_vmcs02; - vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); + vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL); if (!vmx->nested.cached_vmcs12) goto out_cached_vmcs12; - vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); + vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL); if (!vmx->nested.cached_shadow_vmcs12) goto out_cached_shadow_vmcs12; @@ -5263,13 +5263,17 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, copy_shadow_to_vmcs12(vmx); } - if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12))) + /* + * Copy over the full allocated size of vmcs12 rather than just the size + * of the struct. + */ + if (copy_to_user(user_kvm_nested_state->data, vmcs12, VMCS12_SIZE)) return -EFAULT; if (nested_cpu_has_shadow_vmcs(vmcs12) && vmcs12->vmcs_link_pointer != -1ull) { if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE, - get_shadow_vmcs12(vcpu), sizeof(*vmcs12))) + get_shadow_vmcs12(vcpu), VMCS12_SIZE)) return -EFAULT; } From 1998fd32aa62fbf22cd1d8258e6a9deffd6bc466 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 25 Jan 2019 12:19:33 +0100 Subject: [PATCH 05/18] x86/kvm/hyper-v: don't recommend doing reset via synthetic MSR System reset through synthetic MSR is not recommended neither by genuine Hyper-V nor my QEMU. Fixes: 2bc39970e932 ("x86/kvm/hyper-v: Introduce KVM_GET_SUPPORTED_HV_CPUID") Signed-off-by: Vitaly Kuznetsov Reviewed-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index ac44a681f065..4840f5b3c88f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1847,7 +1847,6 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, case HYPERV_CPUID_ENLIGHTMENT_INFO: ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED; - ent->eax |= HV_X64_SYSTEM_RESET_RECOMMENDED; ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED; ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; From f1adceaf01f0446e69c15b32f24ce98e3c3623f1 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 25 Jan 2019 12:19:34 +0100 Subject: [PATCH 06/18] x86/kvm/hyper-v: recommend using eVMCS only when it is enabled We shouldn't probably be suggesting using Enlightened VMCS when it's not enabled (not supported from guest's point of view). Hyper-V on KVM seems to be fine either way but let's be consistent. Fixes: 2bc39970e932 ("x86/kvm/hyper-v: Introduce KVM_GET_SUPPORTED_HV_CPUID") Reviewed-by: Liran Alon Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 4840f5b3c88f..4730fcaa70cf 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1850,7 +1850,8 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED; ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; - ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; + if (evmcs_ver) + ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; /* * Default number of spinlock retry attempts, matches From 1ed199a41c70ad7bfaee8b14f78e791fcf43b278 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Jan 2019 09:22:39 -0800 Subject: [PATCH 07/18] KVM: x86: Fix PV IPIs for 32-bit KVM host The recognition of the KVM_HC_SEND_IPI hypercall was unintentionally wrapped in "#ifdef CONFIG_X86_64", causing 32-bit KVM hosts to reject any and all PV IPI requests despite advertising the feature. This results in all KVM paravirtualized guests hanging during SMP boot due to IPIs never being delivered. Fixes: 4180bf1b655a ("KVM: X86: Implement "send IPI" hypercall") Cc: stable@vger.kernel.org Cc: Wanpeng Li Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f14bb806aeed..d21dcad397e4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7092,10 +7092,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) case KVM_HC_CLOCK_PAIRING: ret = kvm_pv_clock_pairing(vcpu, a0, a1); break; +#endif case KVM_HC_SEND_IPI: ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); break; -#endif default: ret = -KVM_ENOSYS; break; From de81c2f912ef57917bdc6d63b410c534c3e07982 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Jan 2019 09:22:40 -0800 Subject: [PATCH 08/18] KVM: x86: WARN_ONCE if sending a PV IPI returns a fatal error KVM hypercalls return a negative value error code in case of a fatal error, e.g. when the hypercall isn't supported or was made with invalid parameters. WARN_ONCE on fatal errors when sending PV IPIs as any such error all but guarantees an SMP system will hang due to a missing IPI. Fixes: aaffcfd1e82d ("KVM: X86: Implement PV IPIs in linux guest") Cc: stable@vger.kernel.org Cc: Wanpeng Li Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index ba4bfb7f6a36..5c93a65ee1e5 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -457,6 +457,7 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector) #else u64 ipi_bitmap = 0; #endif + long ret; if (cpumask_empty(mask)) return; @@ -482,8 +483,9 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector) } else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) { max = apic_id < max ? max : apic_id; } else { - kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, + ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); + WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret); min = max = apic_id; ipi_bitmap = 0; } @@ -491,8 +493,9 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector) } if (ipi_bitmap) { - kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, + ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); + WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret); } local_irq_restore(flags); From 37ef0c4414c9743ba7f1af4392f0a27a99649f2a Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 22 Jan 2019 10:24:19 +0000 Subject: [PATCH 09/18] svm: Add warning message for AVIC IPI invalid target Print warning message when IPI target ID is invalid due to one of the following reasons: * In logical mode: cluster > max_cluster (64) * In physical mode: target > max_physical (512) * Address is not present in the physical or logical ID tables Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a157ca5b6869..2aff835a65ed 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4526,6 +4526,8 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) break; } case AVIC_IPI_FAILURE_INVALID_TARGET: + WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n", + index, svm->vcpu.vcpu_id, icrh, icrl); break; case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: WARN_ONCE(1, "Invalid backing page\n"); From bb218fbcfaaa3b115d4cd7a43c0ca164f3a96e57 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 22 Jan 2019 10:25:13 +0000 Subject: [PATCH 10/18] svm: Fix AVIC incomplete IPI emulation In case of incomplete IPI with invalid interrupt type, the current SVM driver does not properly emulate the IPI, and fails to boot FreeBSD guests with multiple vcpus when enabling AVIC. Fix this by update APIC ICR high/low registers, which also emulate sending the IPI. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2aff835a65ed..8a0c9a1f6ac8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4504,25 +4504,14 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) kvm_lapic_reg_write(apic, APIC_ICR, icrl); break; case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: { - int i; - struct kvm_vcpu *vcpu; - struct kvm *kvm = svm->vcpu.kvm; struct kvm_lapic *apic = svm->vcpu.arch.apic; /* - * At this point, we expect that the AVIC HW has already - * set the appropriate IRR bits on the valid target - * vcpus. So, we just need to kick the appropriate vcpu. + * Update ICR high and low, then emulate sending IPI, + * which is handled when writing APIC_ICR. */ - kvm_for_each_vcpu(i, vcpu, kvm) { - bool m = kvm_apic_match_dest(vcpu, apic, - icrl & KVM_APIC_SHORT_MASK, - GET_APIC_DEST_FIELD(icrh), - icrl & KVM_APIC_DEST_MASK); - - if (m && !avic_vcpu_is_running(vcpu)) - kvm_vcpu_wake_up(vcpu); - } + kvm_lapic_reg_write(apic, APIC_ICR2, icrh); + kvm_lapic_reg_write(apic, APIC_ICR, icrl); break; } case AVIC_IPI_FAILURE_INVALID_TARGET: From 619ad846fc3452adaf71ca246c5aa711e2055398 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 7 Jan 2019 19:44:51 +0100 Subject: [PATCH 11/18] KVM: nSVM: clear events pending from svm_complete_interrupts() when exiting to L1 kvm-unit-tests' eventinj "NMI failing on IDT" test results in NMI being delivered to the host (L1) when it's running nested. The problem seems to be: svm_complete_interrupts() raises 'nmi_injected' flag but later we decide to reflect EXIT_NPF to L1. The flag remains pending and we do NMI injection upon entry so it got delivered to L1 instead of L2. It seems that VMX code solves the same issue in prepare_vmcs12(), this was introduced with code refactoring in commit 5f3d5799974b ("KVM: nVMX: Rework event injection and recovery"). Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8a0c9a1f6ac8..9caf1252c64a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3414,6 +3414,14 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) kvm_mmu_reset_context(&svm->vcpu); kvm_mmu_load(&svm->vcpu); + /* + * Drop what we picked up for L2 via svm_complete_interrupts() so it + * doesn't end up in L1. + */ + svm->vcpu.arch.nmi_injected = false; + kvm_clear_exception_queue(&svm->vcpu); + kvm_clear_interrupt_queue(&svm->vcpu); + return 0; } From 8997f657001d1ac5042d368a936987c87251c5ec Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 21 Jan 2019 15:27:05 +0800 Subject: [PATCH 12/18] kvm: vmx: fix some -Wmissing-prototypes warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We get some warnings when building kernel with W=1: arch/x86/kvm/vmx/vmx.c:426:5: warning: no previous prototype for ‘kvm_fill_hv_flush_list_func’ [-Wmissing-prototypes] arch/x86/kvm/vmx/nested.c:58:6: warning: no previous prototype for ‘init_vmcs_shadow_fields’ [-Wmissing-prototypes] Make them static to fix this. Signed-off-by: Yi Wang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ce8153923854..8ff20523661b 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -55,7 +55,7 @@ static u16 shadow_read_write_fields[] = { static int max_shadow_read_write_fields = ARRAY_SIZE(shadow_read_write_fields); -void init_vmcs_shadow_fields(void) +static void init_vmcs_shadow_fields(void) { int i, j; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0762fcab8fc9..8be2abbdf63f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -423,7 +423,7 @@ static void check_ept_pointer_match(struct kvm *kvm) to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH; } -int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush, +static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush, void *data) { struct kvm_tlb_range *range = data; From 94a980c39c8e3f8abaff5d3b5bbcd4ccf1c02c4f Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 16 Jan 2019 09:41:15 -0800 Subject: [PATCH 13/18] kvm: selftests: Fix region overlap check in kvm_util Fix a call to userspace_mem_region_find to conform to its spec of taking an inclusive, inclusive range. It was previously being called with an inclusive, exclusive range. Also remove a redundant region bounds check in vm_userspace_mem_region_add. Region overlap checking is already performed by the call to userspace_mem_region_find. Tested: Compiled tools/testing/selftests/kvm with -static Ran all resulting test binaries on an Intel Haswell test machine All tests passed Signed-off-by: Ben Gardon Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/kvm_util.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 23022e9d32eb..b52cfdefecbf 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -571,7 +571,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, * already exist. */ region = (struct userspace_mem_region *) userspace_mem_region_find( - vm, guest_paddr, guest_paddr + npages * vm->page_size); + vm, guest_paddr, (guest_paddr + npages * vm->page_size) - 1); if (region != NULL) TEST_ASSERT(false, "overlapping userspace_mem_region already " "exists\n" @@ -587,15 +587,10 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, region = region->next) { if (region->region.slot == slot) break; - if ((guest_paddr <= (region->region.guest_phys_addr - + region->region.memory_size)) - && ((guest_paddr + npages * vm->page_size) - >= region->region.guest_phys_addr)) - break; } if (region != NULL) TEST_ASSERT(false, "A mem region with the requested slot " - "or overlapping physical memory range already exists.\n" + "already exists.\n" " requested slot: %u paddr: 0x%lx npages: 0x%lx\n" " existing slot: %u paddr: 0x%lx size: 0x%lx", slot, guest_paddr, npages, From 5ad6ece869d46c834976ce383ef200f9116881f8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Jan 2019 17:10:53 -0800 Subject: [PATCH 14/18] KVM: VMX: Move vmx_vcpu_run()'s VM-Enter asm blob to a helper function ...along with the function's STACK_FRAME_NON_STANDARD tag. Moving the asm blob results in a significantly smaller amount of code that is marked with STACK_FRAME_NON_STANDARD, which makes it far less likely that gcc will split the function and trigger a spurious objtool warning. As a bonus, removing STACK_FRAME_NON_STANDARD from vmx_vcpu_run() allows the bulk of code to be properly checked by objtool. Because %rbp is not loaded via VMCS fields, vmx_vcpu_run() must manually save/restore the host's RBP and load the guest's RBP prior to calling vmx_vmenter(). Modifying %rbp triggers objtool's stack validation code, and so vmx_vcpu_run() is tagged with STACK_FRAME_NON_STANDARD since it's impossible to avoid modifying %rbp. Unfortunately, vmx_vcpu_run() is also a gigantic function that gcc will split into separate functions, e.g. so that pieces of the function can be inlined. Splitting the function means that the compiled Elf file will contain one or more vmx_vcpu_run.part.* functions in addition to a vmx_vcpu_run function. Depending on where the function is split, objtool may warn about a "call without frame pointer save/setup" in vmx_vcpu_run.part.* since objtool's stack validation looks for exact names when whitelisting functions tagged with STACK_FRAME_NON_STANDARD. Up until recently, the undesirable function splitting was effectively blocked because vmx_vcpu_run() was tagged with __noclone. At the time, __noclone had an unintended side effect that put vmx_vcpu_run() into a separate optimization unit, which in turn prevented gcc from inlining the function (or any of its own function calls) and thus eliminated gcc's motivation to split the function. Removing the __noclone attribute allowed gcc to optimize vmx_vcpu_run(), exposing the objtool warning. Kudos to Qian Cai for root causing that the fnsplit optimization is what caused objtool to complain. Fixes: 453eafbe65f7 ("KVM: VMX: Move VM-Enter + VM-Exit handling to non-inline sub-routines") Tested-by: Qian Cai Cc: Josh Poimboeuf Reported-by: kbuild test robot Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 139 ++++++++++++++++++++++------------------- 1 file changed, 73 insertions(+), 66 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 8be2abbdf63f..99c898523c5e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6362,72 +6362,9 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->hv_timer_armed = false; } -static void vmx_vcpu_run(struct kvm_vcpu *vcpu) +static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long cr3, cr4, evmcs_rsp; - - /* Record the guest's net vcpu time for enforced NMI injections. */ - if (unlikely(!enable_vnmi && - vmx->loaded_vmcs->soft_vnmi_blocked)) - vmx->loaded_vmcs->entry_time = ktime_get(); - - /* Don't enter VMX if guest state is invalid, let the exit handler - start emulation until we arrive back to a valid state */ - if (vmx->emulation_required) - return; - - if (vmx->ple_window_dirty) { - vmx->ple_window_dirty = false; - vmcs_write32(PLE_WINDOW, vmx->ple_window); - } - - if (vmx->nested.need_vmcs12_sync) - nested_sync_from_vmcs12(vcpu); - - if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) - vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); - if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) - vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); - - cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { - vmcs_writel(HOST_CR3, cr3); - vmx->loaded_vmcs->host_state.cr3 = cr3; - } - - cr4 = cr4_read_shadow(); - if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { - vmcs_writel(HOST_CR4, cr4); - vmx->loaded_vmcs->host_state.cr4 = cr4; - } - - /* When single-stepping over STI and MOV SS, we must clear the - * corresponding interruptibility bits in the guest state. Otherwise - * vmentry fails as it then expects bit 14 (BS) in pending debug - * exceptions being set, but that's not correct for the guest debugging - * case. */ - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - vmx_set_interrupt_shadow(vcpu, 0); - - if (static_cpu_has(X86_FEATURE_PKU) && - kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && - vcpu->arch.pkru != vmx->host_pkru) - __write_pkru(vcpu->arch.pkru); - - pt_guest_enter(vmx); - - atomic_switch_perf_msrs(vmx); - - vmx_update_hv_timer(vcpu); - - /* - * If this vCPU has touched SPEC_CTRL, restore the guest's value if - * it's non-zero. Since vmentry is serialising on affected CPUs, there - * is no need to worry about the conditional branch over the wrmsr - * being speculatively taken. - */ - x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); + unsigned long evmcs_rsp; vmx->__launched = vmx->loaded_vmcs->launched; @@ -6567,6 +6504,77 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) , "eax", "ebx", "edi" #endif ); +} +STACK_FRAME_NON_STANDARD(__vmx_vcpu_run); + +static void vmx_vcpu_run(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long cr3, cr4; + + /* Record the guest's net vcpu time for enforced NMI injections. */ + if (unlikely(!enable_vnmi && + vmx->loaded_vmcs->soft_vnmi_blocked)) + vmx->loaded_vmcs->entry_time = ktime_get(); + + /* Don't enter VMX if guest state is invalid, let the exit handler + start emulation until we arrive back to a valid state */ + if (vmx->emulation_required) + return; + + if (vmx->ple_window_dirty) { + vmx->ple_window_dirty = false; + vmcs_write32(PLE_WINDOW, vmx->ple_window); + } + + if (vmx->nested.need_vmcs12_sync) + nested_sync_from_vmcs12(vcpu); + + if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); + if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { + vmcs_writel(HOST_CR3, cr3); + vmx->loaded_vmcs->host_state.cr3 = cr3; + } + + cr4 = cr4_read_shadow(); + if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { + vmcs_writel(HOST_CR4, cr4); + vmx->loaded_vmcs->host_state.cr4 = cr4; + } + + /* When single-stepping over STI and MOV SS, we must clear the + * corresponding interruptibility bits in the guest state. Otherwise + * vmentry fails as it then expects bit 14 (BS) in pending debug + * exceptions being set, but that's not correct for the guest debugging + * case. */ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + vmx_set_interrupt_shadow(vcpu, 0); + + if (static_cpu_has(X86_FEATURE_PKU) && + kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && + vcpu->arch.pkru != vmx->host_pkru) + __write_pkru(vcpu->arch.pkru); + + pt_guest_enter(vmx); + + atomic_switch_perf_msrs(vmx); + + vmx_update_hv_timer(vcpu); + + /* + * If this vCPU has touched SPEC_CTRL, restore the guest's value if + * it's non-zero. Since vmentry is serialising on affected CPUs, there + * is no need to worry about the conditional branch over the wrmsr + * being speculatively taken. + */ + x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); + + __vmx_vcpu_run(vcpu, vmx); /* * We do not use IBRS in the kernel. If this vCPU has used the @@ -6648,7 +6656,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); } -STACK_FRAME_NON_STANDARD(vmx_vcpu_run); static struct kvm *vmx_vm_alloc(void) { From 3a2f5773baab34a9943be4c77e1ff2ac79d16c75 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 17 Jan 2019 18:12:09 +0100 Subject: [PATCH 15/18] x86/kvm/hyper-v: nested_enable_evmcs() sets vmcs_version incorrectly Commit e2e871ab2f02 ("x86/kvm/hyper-v: Introduce nested_get_evmcs_version() helper") broke EVMCS enablement: to set vmcs_version we now call nested_get_evmcs_version() but this function checks enlightened_vmcs_enabled flag which is not yet set so we end up returning zero. Fix the issue by re-arranging things in nested_enable_evmcs(). Fixes: e2e871ab2f02 ("x86/kvm/hyper-v: Introduce nested_get_evmcs_version() helper") Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/evmcs.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 95bc2247478d..5466c6d85cf3 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -332,16 +332,17 @@ int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version) { struct vcpu_vmx *vmx = to_vmx(vcpu); + bool evmcs_already_enabled = vmx->nested.enlightened_vmcs_enabled; + + vmx->nested.enlightened_vmcs_enabled = true; if (vmcs_version) *vmcs_version = nested_get_evmcs_version(vcpu); /* We don't support disabling the feature for simplicity. */ - if (vmx->nested.enlightened_vmcs_enabled) + if (evmcs_already_enabled) return 0; - vmx->nested.enlightened_vmcs_enabled = true; - vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; From 35b531a1e7fc30ac8c62e5ac1794eb1460da614e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 17 Jan 2019 18:12:10 +0100 Subject: [PATCH 16/18] KVM: selftests: check returned evmcs version range Check that KVM_CAP_HYPERV_ENLIGHTENED_VMCS returns correct version range. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/evmcs_test.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index ea3c73e8f4f6..c49c2a28b0eb 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -103,6 +103,12 @@ int main(int argc, char *argv[]) vcpu_ioctl(vm, VCPU_ID, KVM_ENABLE_CAP, &enable_evmcs_cap); + /* KVM should return supported EVMCS version range */ + TEST_ASSERT(((evmcs_ver >> 8) >= (evmcs_ver & 0xff)) && + (evmcs_ver & 0xff) > 0, + "Incorrect EVMCS version range: %x:%x\n", + evmcs_ver & 0xff, evmcs_ver >> 8); + run = vcpu_state(vm, VCPU_ID); vcpu_regs_get(vm, VCPU_ID, ®s1); From 5cd5548ff439b916cf72448109994394c2bf4b3c Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 25 Jan 2019 16:32:46 +0900 Subject: [PATCH 17/18] KVM: x86: fix TRACE_INCLUDE_PATH and remove -I. header search paths The header search path -I. in kernel Makefiles is very suspicious; it allows the compiler to search for headers in the top of $(srctree), where obviously no header file exists. The reason of having -I. here is to make the incorrectly set TRACE_INCLUDE_PATH working. As the comment block in include/trace/define_trace.h says, TRACE_INCLUDE_PATH should be a relative path to the define_trace.h Fix the TRACE_INCLUDE_PATH, and remove the iffy include paths. Signed-off-by: Masahiro Yamada Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Makefile | 4 ---- arch/x86/kvm/trace.h | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 69b3a7c30013..31ecf7a76d5a 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -2,10 +2,6 @@ ccflags-y += -Iarch/x86/kvm -CFLAGS_x86.o := -I. -CFLAGS_svm.o := -I. -CFLAGS_vmx.o := -I. - KVM := ../../../virt/kvm kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 705f40ae2532..6432d08c7de7 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1465,7 +1465,7 @@ TRACE_EVENT(kvm_hv_send_ipi_ex, #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH arch/x86/kvm +#define TRACE_INCLUDE_PATH ../../arch/x86/kvm #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace From b2869f28e1476cd705bb28c58fd01b0bd661bb99 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 25 Jan 2019 12:23:17 -0600 Subject: [PATCH 18/18] KVM: x86: Mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. This patch fixes the following warnings: arch/x86/kvm/lapic.c:1037:27: warning: this statement may fall through [-Wimplicit-fallthrough=] arch/x86/kvm/lapic.c:1876:3: warning: this statement may fall through [-Wimplicit-fallthrough=] arch/x86/kvm/hyperv.c:1637:6: warning: this statement may fall through [-Wimplicit-fallthrough=] arch/x86/kvm/svm.c:4396:6: warning: this statement may fall through [-Wimplicit-fallthrough=] arch/x86/kvm/mmu.c:4372:36: warning: this statement may fall through [-Wimplicit-fallthrough=] arch/x86/kvm/x86.c:3835:6: warning: this statement may fall through [-Wimplicit-fallthrough=] arch/x86/kvm/x86.c:7938:23: warning: this statement may fall through [-Wimplicit-fallthrough=] arch/x86/kvm/vmx/vmx.c:2015:6: warning: this statement may fall through [-Wimplicit-fallthrough=] arch/x86/kvm/vmx/vmx.c:1773:6: warning: this statement may fall through [-Wimplicit-fallthrough=] Warning level 3 was used: -Wimplicit-fallthrough=3 This patch is part of the ongoing efforts to enabling -Wimplicit-fallthrough. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 2 +- arch/x86/kvm/lapic.c | 2 ++ arch/x86/kvm/mmu.c | 1 + arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 3 +++ 6 files changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 4730fcaa70cf..89d20ed1d2e8 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1636,7 +1636,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) ret = kvm_hvcall_signal_event(vcpu, fast, ingpa); if (ret != HV_STATUS_INVALID_PORT_ID) break; - /* maybe userspace knows this conn_id: fall through */ + /* fall through - maybe userspace knows this conn_id. */ case HVCALL_POST_MESSAGE: /* don't bother userspace if it has no way to handle it */ if (unlikely(rep || !vcpu_to_synic(vcpu)->active)) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9f089e2e09d0..4b6c2da7265c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1035,6 +1035,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, switch (delivery_mode) { case APIC_DM_LOWEST: vcpu->arch.apic_arb_prio++; + /* fall through */ case APIC_DM_FIXED: if (unlikely(trig_mode && !level)) break; @@ -1874,6 +1875,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LVT0: apic_manage_nmi_watchdog(apic, val); + /* fall through */ case APIC_LVTTHMR: case APIC_LVTPC: case APIC_LVT1: diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ce770b446238..da9c42349b1f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4371,6 +4371,7 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, rsvd_bits(maxphyaddr, 51); rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; + /* fall through */ case PT64_ROOT_4LEVEL: rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd | nonleaf_bit8_rsvd | rsvd_bits(7, 7) | diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9caf1252c64a..f13a3a24d360 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4403,7 +4403,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_IA32_APICBASE: if (kvm_vcpu_apicv_active(vcpu)) avic_update_vapic_bar(to_svm(vcpu), data); - /* Follow through */ + /* Fall through */ default: return kvm_set_msr_common(vcpu, msr); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 99c898523c5e..4341175339f3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1773,7 +1773,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) return 1; - /* Otherwise falls through */ + /* Else, falls through */ default: msr = find_msr_entry(vmx, msr_info->index); if (msr) { @@ -2014,7 +2014,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* Check reserved bit, higher 32 bits should be zero */ if ((data >> 32) != 0) return 1; - /* Otherwise falls through */ + /* Else, falls through */ default: msr = find_msr_entry(vmx, msr_index); if (msr) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d21dcad397e4..3d27206f6c01 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3834,6 +3834,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, case KVM_CAP_HYPERV_SYNIC2: if (cap->args[0]) return -EINVAL; + /* fall through */ + case KVM_CAP_HYPERV_SYNIC: if (!irqchip_in_kernel(vcpu->kvm)) return -EINVAL; @@ -7936,6 +7938,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) vcpu->arch.pv.pv_unhalted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + /* fall through */ case KVM_MP_STATE_RUNNABLE: vcpu->arch.apf.halted = false; break;