From dc9be0fac70a2ad86e31a81372bb0bdfb6945353 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 5 Mar 2015 11:54:46 +0100 Subject: [PATCH 1/8] kvm: move advertising of KVM_CAP_IRQFD to common code POWER supports irqfds but forgot to advertise them. Some userspace does not check for the capability, but others check it---thus they work on x86 and s390 but not POWER. To avoid that other architectures in the future make the same mistake, let common code handle KVM_CAP_IRQFD the same way as KVM_CAP_IRQFD_RESAMPLE. Reported-and-tested-by: Greg Kurz Cc: stable@vger.kernel.org Fixes: 297e21053a52f060944e9f0de4c64fad9bcd72fc Signed-off-by: Paolo Bonzini Signed-off-by: Marcelo Tosatti --- arch/s390/kvm/kvm-s390.c | 1 - arch/x86/kvm/x86.c | 1 - virt/kvm/kvm_main.c | 1 + 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index f6579cfde2df..19e17bd7aec0 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -165,7 +165,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ONE_REG: case KVM_CAP_ENABLE_CAP: case KVM_CAP_S390_CSS_SUPPORT: - case KVM_CAP_IRQFD: case KVM_CAP_IOEVENTFD: case KVM_CAP_DEVICE_CTRL: case KVM_CAP_ENABLE_CAP_VM: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bd7a70be41b3..32bf19ef3115 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2744,7 +2744,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_USER_NMI: case KVM_CAP_REINJECT_CONTROL: case KVM_CAP_IRQ_INJECT_STATUS: - case KVM_CAP_IRQFD: case KVM_CAP_IOEVENTFD: case KVM_CAP_IOEVENTFD_NO_LENGTH: case KVM_CAP_PIT2: diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a1093700f3a4..a2214d9609bd 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2492,6 +2492,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_SIGNAL_MSI: #endif #ifdef CONFIG_HAVE_KVM_IRQFD + case KVM_CAP_IRQFD: case KVM_CAP_IRQFD_RESAMPLE: #endif case KVM_CAP_CHECK_EXTENSION_VM: From a987370f8e7a1677ae385042644326d9cd145a20 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 10 Mar 2015 19:06:59 +0000 Subject: [PATCH 2/8] arm64: KVM: Fix stage-2 PGD allocation to have per-page refcounting We're using __get_free_pages with to allocate the guest's stage-2 PGD. The standard behaviour of this function is to return a set of pages where only the head page has a valid refcount. This behaviour gets us into trouble when we're trying to increment the refcount on a non-head page: page:ffff7c00cfb693c0 count:0 mapcount:0 mapping: (null) index:0x0 flags: 0x4000000000000000() page dumped because: VM_BUG_ON_PAGE((*({ __attribute__((unused)) typeof((&page->_count)->counter) __var = ( typeof((&page->_count)->counter)) 0; (volatile typeof((&page->_count)->counter) *)&((&page->_count)->counter); })) <= 0) BUG: failure at include/linux/mm.h:548/get_page()! Kernel panic - not syncing: BUG! CPU: 1 PID: 1695 Comm: kvm-vcpu-0 Not tainted 4.0.0-rc1+ #3825 Hardware name: APM X-Gene Mustang board (DT) Call trace: [] dump_backtrace+0x0/0x13c [] show_stack+0x10/0x1c [] dump_stack+0x74/0x94 [] panic+0x100/0x240 [] stage2_get_pmd+0x17c/0x2bc [] kvm_handle_guest_abort+0x4b4/0x6b0 [] handle_exit+0x58/0x180 [] kvm_arch_vcpu_ioctl_run+0x114/0x45c [] kvm_vcpu_ioctl+0x2e0/0x754 [] do_vfs_ioctl+0x424/0x5c8 [] SyS_ioctl+0x40/0x78 CPU0: stopping A possible approach for this is to split the compound page using split_page() at allocation time, and change the teardown path to free one page at a time. It turns out that alloc_pages_exact() and free_pages_exact() does exactly that. While we're at it, the PGD allocation code is reworked to reduce duplication. This has been tested on an X-Gene platform with a 4kB/48bit-VA host kernel, and kvmtool hacked to place memory in the second page of the hardware PGD (PUD for the host kernel). Also regression-tested on a Cubietruck (Cortex-A7). [ Reworked to use alloc_pages_exact() and free_pages_exact() and to return pointers directly instead of by reference as arguments - Christoffer ] Reported-by: Mark Rutland Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_mmu.h | 12 +++--- arch/arm/kvm/mmu.c | 67 +++++++++++++++++++++++--------- arch/arm64/include/asm/kvm_mmu.h | 46 ++-------------------- 3 files changed, 58 insertions(+), 67 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index bf0fe99e8ca9..c57c41dc7e87 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -162,18 +162,16 @@ static inline bool kvm_page_empty(void *ptr) #define KVM_PREALLOC_LEVEL 0 -static inline int kvm_prealloc_hwpgd(struct kvm *kvm, pgd_t *pgd) -{ - return 0; -} - -static inline void kvm_free_hwpgd(struct kvm *kvm) { } - static inline void *kvm_get_hwpgd(struct kvm *kvm) { return kvm->arch.pgd; } +static inline unsigned int kvm_get_hwpgd_size(void) +{ + return PTRS_PER_S2_PGD * sizeof(pgd_t); +} + struct kvm; #define kvm_flush_dcache_to_poc(a,l) __cpuc_flush_dcache_area((a), (l)) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 3e6859bc3e11..a48a73c6b866 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -632,6 +632,20 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr) __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE); } +/* Free the HW pgd, one page at a time */ +static void kvm_free_hwpgd(void *hwpgd) +{ + free_pages_exact(hwpgd, kvm_get_hwpgd_size()); +} + +/* Allocate the HW PGD, making sure that each page gets its own refcount */ +static void *kvm_alloc_hwpgd(void) +{ + unsigned int size = kvm_get_hwpgd_size(); + + return alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO); +} + /** * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation. * @kvm: The KVM struct pointer for the VM. @@ -645,15 +659,31 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr) */ int kvm_alloc_stage2_pgd(struct kvm *kvm) { - int ret; pgd_t *pgd; + void *hwpgd; if (kvm->arch.pgd != NULL) { kvm_err("kvm_arch already initialized?\n"); return -EINVAL; } + hwpgd = kvm_alloc_hwpgd(); + if (!hwpgd) + return -ENOMEM; + + /* When the kernel uses more levels of page tables than the + * guest, we allocate a fake PGD and pre-populate it to point + * to the next-level page table, which will be the real + * initial page table pointed to by the VTTBR. + * + * When KVM_PREALLOC_LEVEL==2, we allocate a single page for + * the PMD and the kernel will use folded pud. + * When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD + * pages. + */ if (KVM_PREALLOC_LEVEL > 0) { + int i; + /* * Allocate fake pgd for the page table manipulation macros to * work. This is not used by the hardware and we have no @@ -661,30 +691,32 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm) */ pgd = (pgd_t *)kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t), GFP_KERNEL | __GFP_ZERO); + + if (!pgd) { + kvm_free_hwpgd(hwpgd); + return -ENOMEM; + } + + /* Plug the HW PGD into the fake one. */ + for (i = 0; i < PTRS_PER_S2_PGD; i++) { + if (KVM_PREALLOC_LEVEL == 1) + pgd_populate(NULL, pgd + i, + (pud_t *)hwpgd + i * PTRS_PER_PUD); + else if (KVM_PREALLOC_LEVEL == 2) + pud_populate(NULL, pud_offset(pgd, 0) + i, + (pmd_t *)hwpgd + i * PTRS_PER_PMD); + } } else { /* * Allocate actual first-level Stage-2 page table used by the * hardware for Stage-2 page table walks. */ - pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, S2_PGD_ORDER); + pgd = (pgd_t *)hwpgd; } - if (!pgd) - return -ENOMEM; - - ret = kvm_prealloc_hwpgd(kvm, pgd); - if (ret) - goto out_err; - kvm_clean_pgd(pgd); kvm->arch.pgd = pgd; return 0; -out_err: - if (KVM_PREALLOC_LEVEL > 0) - kfree(pgd); - else - free_pages((unsigned long)pgd, S2_PGD_ORDER); - return ret; } /** @@ -785,11 +817,10 @@ void kvm_free_stage2_pgd(struct kvm *kvm) return; unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); - kvm_free_hwpgd(kvm); + kvm_free_hwpgd(kvm_get_hwpgd(kvm)); if (KVM_PREALLOC_LEVEL > 0) kfree(kvm->arch.pgd); - else - free_pages((unsigned long)kvm->arch.pgd, S2_PGD_ORDER); + kvm->arch.pgd = NULL; } diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 6458b5373142..a099cd9cdef8 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -171,43 +171,6 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd) #define KVM_PREALLOC_LEVEL (0) #endif -/** - * kvm_prealloc_hwpgd - allocate inital table for VTTBR - * @kvm: The KVM struct pointer for the VM. - * @pgd: The kernel pseudo pgd - * - * When the kernel uses more levels of page tables than the guest, we allocate - * a fake PGD and pre-populate it to point to the next-level page table, which - * will be the real initial page table pointed to by the VTTBR. - * - * When KVM_PREALLOC_LEVEL==2, we allocate a single page for the PMD and - * the kernel will use folded pud. When KVM_PREALLOC_LEVEL==1, we - * allocate 2 consecutive PUD pages. - */ -static inline int kvm_prealloc_hwpgd(struct kvm *kvm, pgd_t *pgd) -{ - unsigned int i; - unsigned long hwpgd; - - if (KVM_PREALLOC_LEVEL == 0) - return 0; - - hwpgd = __get_free_pages(GFP_KERNEL | __GFP_ZERO, PTRS_PER_S2_PGD_SHIFT); - if (!hwpgd) - return -ENOMEM; - - for (i = 0; i < PTRS_PER_S2_PGD; i++) { - if (KVM_PREALLOC_LEVEL == 1) - pgd_populate(NULL, pgd + i, - (pud_t *)hwpgd + i * PTRS_PER_PUD); - else if (KVM_PREALLOC_LEVEL == 2) - pud_populate(NULL, pud_offset(pgd, 0) + i, - (pmd_t *)hwpgd + i * PTRS_PER_PMD); - } - - return 0; -} - static inline void *kvm_get_hwpgd(struct kvm *kvm) { pgd_t *pgd = kvm->arch.pgd; @@ -224,12 +187,11 @@ static inline void *kvm_get_hwpgd(struct kvm *kvm) return pmd_offset(pud, 0); } -static inline void kvm_free_hwpgd(struct kvm *kvm) +static inline unsigned int kvm_get_hwpgd_size(void) { - if (KVM_PREALLOC_LEVEL > 0) { - unsigned long hwpgd = (unsigned long)kvm_get_hwpgd(kvm); - free_pages(hwpgd, PTRS_PER_S2_PGD_SHIFT); - } + if (KVM_PREALLOC_LEVEL > 0) + return PTRS_PER_S2_PGD * PAGE_SIZE; + return PTRS_PER_S2_PGD * sizeof(pgd_t); } static inline bool kvm_page_empty(void *ptr) From 04b8dc85bf4a64517e3cf20e409eeaa503b15cc1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 10 Mar 2015 19:07:00 +0000 Subject: [PATCH 3/8] arm64: KVM: Do not use pgd_index to index stage-2 pgd The kernel's pgd_index macro is designed to index a normal, page sized array. KVM is a bit diffferent, as we can use concatenated pages to have a bigger address space (for example 40bit IPA with 4kB pages gives us an 8kB PGD. In the above case, the use of pgd_index will always return an index inside the first 4kB, which makes a guest that has memory above 0x8000000000 rather unhappy, as it spins forever in a page fault, whist the host happilly corrupts the lower pgd. The obvious fix is to get our own kvm_pgd_index that does the right thing(tm). Tested on X-Gene with a hacked kvmtool that put memory at a stupidly high address. Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_mmu.h | 3 ++- arch/arm/kvm/mmu.c | 8 ++++---- arch/arm64/include/asm/kvm_mmu.h | 2 ++ 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index c57c41dc7e87..4cf48c3aca13 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -149,13 +149,14 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd) (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) +#define kvm_pgd_index(addr) pgd_index(addr) + static inline bool kvm_page_empty(void *ptr) { struct page *ptr_page = virt_to_page(ptr); return page_count(ptr_page) == 1; } - #define kvm_pte_table_empty(kvm, ptep) kvm_page_empty(ptep) #define kvm_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) #define kvm_pud_table_empty(kvm, pudp) (0) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index a48a73c6b866..5656d79c5a44 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -290,7 +290,7 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, phys_addr_t addr = start, end = start + size; phys_addr_t next; - pgd = pgdp + pgd_index(addr); + pgd = pgdp + kvm_pgd_index(addr); do { next = kvm_pgd_addr_end(addr, end); if (!pgd_none(*pgd)) @@ -355,7 +355,7 @@ static void stage2_flush_memslot(struct kvm *kvm, phys_addr_t next; pgd_t *pgd; - pgd = kvm->arch.pgd + pgd_index(addr); + pgd = kvm->arch.pgd + kvm_pgd_index(addr); do { next = kvm_pgd_addr_end(addr, end); stage2_flush_puds(kvm, pgd, addr, next); @@ -830,7 +830,7 @@ static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache pgd_t *pgd; pud_t *pud; - pgd = kvm->arch.pgd + pgd_index(addr); + pgd = kvm->arch.pgd + kvm_pgd_index(addr); if (WARN_ON(pgd_none(*pgd))) { if (!cache) return NULL; @@ -1120,7 +1120,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) pgd_t *pgd; phys_addr_t next; - pgd = kvm->arch.pgd + pgd_index(addr); + pgd = kvm->arch.pgd + kvm_pgd_index(addr); do { /* * Release kvm_mmu_lock periodically if the memory region is diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index a099cd9cdef8..bbfb600fa822 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -158,6 +158,8 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd) #define PTRS_PER_S2_PGD (1 << PTRS_PER_S2_PGD_SHIFT) #define S2_PGD_ORDER get_order(PTRS_PER_S2_PGD * sizeof(pgd_t)) +#define kvm_pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)) + /* * If we are concatenating first level stage-2 page tables, we would have less * than or equal to 16 pointers in the fake PGD, because that's what the From 84ed7412b5eee1011579b3db7454b9cb6d26fa65 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 10 Mar 2015 19:07:01 +0000 Subject: [PATCH 4/8] arm64: KVM: Fix outdated comment about VTCR_EL2.PS Commit 87366d8cf7b3 ("arm64: Add boot time configuration of Intermediate Physical Address size") removed the hardcoded setting of VTCR_EL2.PS to use ID_AA64MMFR0_EL1.PARange instead, but didn't remove the (now rather misleading) comment. Fix the comments to match reality (at least for the next few minutes). Acked-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/kvm_arm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 94674eb7e7bb..54bb4ba97441 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -129,6 +129,9 @@ * 40 bits wide (T0SZ = 24). Systems with a PARange smaller than 40 bits are * not known to exist and will break with this configuration. * + * VTCR_EL2.PS is extracted from ID_AA64MMFR0_EL1.PARange at boot time + * (see hyp-init.S). + * * Note that when using 4K pages, we concatenate two first level page tables * together. * @@ -138,7 +141,6 @@ #ifdef CONFIG_ARM64_64K_PAGES /* * Stage2 translation configuration: - * 40bits output (PS = 2) * 40bits input (T0SZ = 24) * 64kB pages (TG0 = 1) * 2 level page tables (SL = 1) @@ -150,7 +152,6 @@ #else /* * Stage2 translation configuration: - * 40bits output (PS = 2) * 40bits input (T0SZ = 24) * 4kB pages (TG0 = 0) * 3 level page tables (SL = 1) From c1a6bff28cbff796bf6e7db5cf42ec9244911be2 Mon Sep 17 00:00:00 2001 From: Petr Matousek Date: Wed, 11 Mar 2015 12:16:09 +0100 Subject: [PATCH 5/8] kvm: x86: i8259: return initialized data on invalid-size read If data is read from PIC with invalid access size, the return data stays uninitialized even though success is returned. Fix this by always initializing the data. Signed-off-by: Petr Matousek Reported-by: Nadav Amit Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8259.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index cc31f7c06d3d..9541ba34126b 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -507,6 +507,7 @@ static int picdev_read(struct kvm_pic *s, return -EOPNOTSUPP; if (len != 1) { + memset(val, 0, len); pr_pic_unimpl("non byte read\n"); return 0; } From b52104e509479c4709eb9d81642df77c5ef2716b Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 27 Feb 2015 19:41:45 +0800 Subject: [PATCH 6/8] arm/arm64: KVM: fix missing unlock on error in kvm_vgic_create() Add the missing unlock before return from function kvm_vgic_create() in the error handling case. Signed-off-by: Wei Yongjun Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 0cc6ab6005a0..4b2c2e7856a3 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -1583,8 +1583,10 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) * emulation. So check this here again. KVM_CREATE_DEVICE does * the proper checks already. */ - if (type == KVM_DEV_TYPE_ARM_VGIC_V2 && !vgic->can_emulate_gicv2) - return -ENODEV; + if (type == KVM_DEV_TYPE_ARM_VGIC_V2 && !vgic->can_emulate_gicv2) { + ret = -ENODEV; + goto out; + } /* * Any time a vcpu is run, vcpu_load is called which tries to grab the From 670125bda1d86edfadf81dc56a87582ac7fbd47b Mon Sep 17 00:00:00 2001 From: Wincy Van Date: Wed, 4 Mar 2015 14:31:56 +0800 Subject: [PATCH 7/8] KVM: VMX: Set msr bitmap correctly if vcpu is in guest mode In commit 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap"), we are setting MSR_BITMAP in prepare_vmcs02 if we should use hardware. This is not enough since the field will be modified by following vmx_set_efer. Fix this by setting vmx_msr_bitmap_nested in vmx_set_msr_bitmap if vcpu is in guest mode. Signed-off-by: Wincy Van Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f7b20b417a3a..10a481b7674d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2168,7 +2168,10 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) { unsigned long *msr_bitmap; - if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) { + if (is_guest_mode(vcpu)) + msr_bitmap = vmx_msr_bitmap_nested; + else if (irqchip_in_kernel(vcpu->kvm) && + apic_x2apic_mode(vcpu->arch.apic)) { if (is_long_mode(vcpu)) msr_bitmap = vmx_msr_bitmap_longmode_x2apic; else @@ -9218,9 +9221,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) } if (cpu_has_vmx_msr_bitmap() && - exec_control & CPU_BASED_USE_MSR_BITMAPS && - nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) { - vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested)); + exec_control & CPU_BASED_USE_MSR_BITMAPS) { + nested_vmx_merge_msr_bitmap(vcpu, vmcs12); + /* MSR_BITMAP will be set by following vmx_set_efer. */ } else exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; From ae705930fca6322600690df9dc1c7d0516145a93 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 13 Mar 2015 17:02:56 +0000 Subject: [PATCH 8/8] arm/arm64: KVM: Keep elrsr/aisr in sync with software model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is an interesting bug in the vgic code, which manifests itself when the KVM run loop has a signal pending or needs a vmid generation rollover after having disabled interrupts but before actually switching to the guest. In this case, we flush the vgic as usual, but we sync back the vgic state and exit to userspace before entering the guest. The consequence is that we will be syncing the list registers back to the software model using the GICH_ELRSR and GICH_EISR from the last execution of the guest, potentially overwriting a list register containing an interrupt. This showed up during migration testing where we would capture a state where the VM has masked the arch timer but there were no interrupts, resulting in a hung test. Cc: Marc Zyngier Reported-by: Alex Bennee Signed-off-by: Christoffer Dall Signed-off-by: Alex Bennée Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- include/kvm/arm_vgic.h | 1 + virt/kvm/arm/vgic-v2.c | 8 ++++++++ virt/kvm/arm/vgic-v3.c | 8 ++++++++ virt/kvm/arm/vgic.c | 16 ++++++++++++++++ 4 files changed, 33 insertions(+) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 7c55dd5dd2c9..66203b268984 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -114,6 +114,7 @@ struct vgic_ops { void (*sync_lr_elrsr)(struct kvm_vcpu *, int, struct vgic_lr); u64 (*get_elrsr)(const struct kvm_vcpu *vcpu); u64 (*get_eisr)(const struct kvm_vcpu *vcpu); + void (*clear_eisr)(struct kvm_vcpu *vcpu); u32 (*get_interrupt_status)(const struct kvm_vcpu *vcpu); void (*enable_underflow)(struct kvm_vcpu *vcpu); void (*disable_underflow)(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c index a0a7b5d1a070..f9b9c7c51372 100644 --- a/virt/kvm/arm/vgic-v2.c +++ b/virt/kvm/arm/vgic-v2.c @@ -72,6 +72,8 @@ static void vgic_v2_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr, { if (!(lr_desc.state & LR_STATE_MASK)) vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr |= (1ULL << lr); + else + vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr &= ~(1ULL << lr); } static u64 vgic_v2_get_elrsr(const struct kvm_vcpu *vcpu) @@ -84,6 +86,11 @@ static u64 vgic_v2_get_eisr(const struct kvm_vcpu *vcpu) return vcpu->arch.vgic_cpu.vgic_v2.vgic_eisr; } +static void vgic_v2_clear_eisr(struct kvm_vcpu *vcpu) +{ + vcpu->arch.vgic_cpu.vgic_v2.vgic_eisr = 0; +} + static u32 vgic_v2_get_interrupt_status(const struct kvm_vcpu *vcpu) { u32 misr = vcpu->arch.vgic_cpu.vgic_v2.vgic_misr; @@ -148,6 +155,7 @@ static const struct vgic_ops vgic_v2_ops = { .sync_lr_elrsr = vgic_v2_sync_lr_elrsr, .get_elrsr = vgic_v2_get_elrsr, .get_eisr = vgic_v2_get_eisr, + .clear_eisr = vgic_v2_clear_eisr, .get_interrupt_status = vgic_v2_get_interrupt_status, .enable_underflow = vgic_v2_enable_underflow, .disable_underflow = vgic_v2_disable_underflow, diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c index 3a62d8a9a2c6..dff06021e748 100644 --- a/virt/kvm/arm/vgic-v3.c +++ b/virt/kvm/arm/vgic-v3.c @@ -104,6 +104,8 @@ static void vgic_v3_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr, { if (!(lr_desc.state & LR_STATE_MASK)) vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr); + else + vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr &= ~(1U << lr); } static u64 vgic_v3_get_elrsr(const struct kvm_vcpu *vcpu) @@ -116,6 +118,11 @@ static u64 vgic_v3_get_eisr(const struct kvm_vcpu *vcpu) return vcpu->arch.vgic_cpu.vgic_v3.vgic_eisr; } +static void vgic_v3_clear_eisr(struct kvm_vcpu *vcpu) +{ + vcpu->arch.vgic_cpu.vgic_v3.vgic_eisr = 0; +} + static u32 vgic_v3_get_interrupt_status(const struct kvm_vcpu *vcpu) { u32 misr = vcpu->arch.vgic_cpu.vgic_v3.vgic_misr; @@ -192,6 +199,7 @@ static const struct vgic_ops vgic_v3_ops = { .sync_lr_elrsr = vgic_v3_sync_lr_elrsr, .get_elrsr = vgic_v3_get_elrsr, .get_eisr = vgic_v3_get_eisr, + .clear_eisr = vgic_v3_clear_eisr, .get_interrupt_status = vgic_v3_get_interrupt_status, .enable_underflow = vgic_v3_enable_underflow, .disable_underflow = vgic_v3_disable_underflow, diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 4b2c2e7856a3..c9f60f524588 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -883,6 +883,11 @@ static inline u64 vgic_get_eisr(struct kvm_vcpu *vcpu) return vgic_ops->get_eisr(vcpu); } +static inline void vgic_clear_eisr(struct kvm_vcpu *vcpu) +{ + vgic_ops->clear_eisr(vcpu); +} + static inline u32 vgic_get_interrupt_status(struct kvm_vcpu *vcpu) { return vgic_ops->get_interrupt_status(vcpu); @@ -922,6 +927,7 @@ static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu) vgic_set_lr(vcpu, lr_nr, vlr); clear_bit(lr_nr, vgic_cpu->lr_used); vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY; + vgic_sync_lr_elrsr(vcpu, lr_nr, vlr); } /* @@ -978,6 +984,7 @@ bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq) BUG_ON(!test_bit(lr, vgic_cpu->lr_used)); vlr.state |= LR_STATE_PENDING; vgic_set_lr(vcpu, lr, vlr); + vgic_sync_lr_elrsr(vcpu, lr, vlr); return true; } } @@ -999,6 +1006,7 @@ bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq) vlr.state |= LR_EOI_INT; vgic_set_lr(vcpu, lr, vlr); + vgic_sync_lr_elrsr(vcpu, lr, vlr); return true; } @@ -1136,6 +1144,14 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu) if (status & INT_STATUS_UNDERFLOW) vgic_disable_underflow(vcpu); + /* + * In the next iterations of the vcpu loop, if we sync the vgic state + * after flushing it, but before entering the guest (this happens for + * pending signals and vmid rollovers), then make sure we don't pick + * up any old maintenance interrupts here. + */ + vgic_clear_eisr(vcpu); + return level_pending; }