mirror of https://gitee.com/openkylin/linux.git
x86:
- take into account HVA before retrying on MMU notifier race - fixes for nested AMD guests without NPT - allow INVPCID in guest without PCID - disable PML in hardware when not in use - MMU code cleanups -----BEGIN PGP SIGNATURE----- iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmA3eMQUHHBib256aW5p QHJlZGhhdC5jb20ACgkQv/vSX3jHroP6TQf5ARpUyq3oo+13albwg+zNca6hzR8i Vl7dpoR3bSJCN3sTYFnlL9eXw5TxgeUL2nqKqma6ddZDNDEBLT2Bq8rcFkbi4pUf n7av76EEq74HW/jlUhKVug7Q5Dm5DiKC6BOH3RVuKHbr6iZseyF3jXZSX0Ppf0yF gvoy6cGyMW60NVLN5tuGeOjVQ1fxziE0SqB90fXuiWgZ5rzIBfbqJV7EOOZsGO67 /LHSaEpvKutsc2a+Hx76yQNJjAbb2/O+4Bo5/RqfdqS5tRLGBzYggdJjLvAPvd6P pTNtDCnErvBZQfMedEQyHYuBL2Ca59fOp6i/ekOM2I+m7816+kSkdTMt2g== =iMHY -----END PGP SIGNATURE----- Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm Pull more KVM updates from Paolo Bonzini: "x86: - take into account HVA before retrying on MMU notifier race - fixes for nested AMD guests without NPT - allow INVPCID in guest without PCID - disable PML in hardware when not in use - MMU code cleanups: * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (28 commits) KVM: SVM: Fix nested VM-Exit on #GP interception handling KVM: vmx/pmu: Fix dummy check if lbr_desc->event is created KVM: x86/mmu: Consider the hva in mmu_notifier retry KVM: x86/mmu: Skip mmu_notifier check when handling MMIO page fault KVM: Documentation: rectify rst markup in KVM_GET_SUPPORTED_HV_CPUID KVM: nSVM: prepare guest save area while is_guest_mode is true KVM: x86/mmu: Remove a variety of unnecessary exports KVM: x86: Fold "write-protect large" use case into generic write-protect KVM: x86/mmu: Don't set dirty bits when disabling dirty logging w/ PML KVM: VMX: Dynamically enable/disable PML based on memslot dirty logging KVM: x86: Further clarify the logic and comments for toggling log dirty KVM: x86: Move MMU's PML logic to common code KVM: x86/mmu: Make dirty log size hook (PML) a value, not a function KVM: x86/mmu: Expand on the comment in kvm_vcpu_ad_need_write_protect() KVM: nVMX: Disable PML in hardware when running L2 KVM: x86/mmu: Consult max mapping level when zapping collapsible SPTEs KVM: x86/mmu: Pass the memslot to the rmap callbacks KVM: x86/mmu: Split out max mapping level calculation to helper KVM: x86/mmu: Expand collapsible SPTE zap for TDP MMU to ZONE_DEVICE and HugeTLB pages KVM: nVMX: no need to undo inject_page_fault change on nested vmexit ...
This commit is contained in:
commit
d94d14008e
|
@ -4519,6 +4519,7 @@ KVM_GET_SUPPORTED_CPUID ioctl because some of them intersect with KVM feature
|
|||
leaves (0x40000000, 0x40000001).
|
||||
|
||||
Currently, the following list of CPUID leaves are returned:
|
||||
|
||||
- HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS
|
||||
- HYPERV_CPUID_INTERFACE
|
||||
- HYPERV_CPUID_VERSION
|
||||
|
@ -4543,6 +4544,7 @@ userspace should not expect to get any particular value there.
|
|||
Note, vcpu version of KVM_GET_SUPPORTED_HV_CPUID is currently deprecated. Unlike
|
||||
system ioctl which exposes all supported feature bits unconditionally, vcpu
|
||||
version has the following quirks:
|
||||
|
||||
- HYPERV_CPUID_NESTED_FEATURES leaf and HV_X64_ENLIGHTENED_VMCS_RECOMMENDED
|
||||
feature bit are only exposed when Enlightened VMCS was previously enabled
|
||||
on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS).
|
||||
|
|
|
@ -591,7 +591,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
|
|||
} else {
|
||||
/* Call KVM generic code to do the slow-path check */
|
||||
pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
|
||||
writing, &write_ok);
|
||||
writing, &write_ok, NULL);
|
||||
if (is_error_noslot_pfn(pfn))
|
||||
return -EFAULT;
|
||||
page = NULL;
|
||||
|
|
|
@ -822,7 +822,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
|
|||
|
||||
/* Call KVM generic code to do the slow-path check */
|
||||
pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
|
||||
writing, upgrade_p);
|
||||
writing, upgrade_p, NULL);
|
||||
if (is_error_noslot_pfn(pfn))
|
||||
return -EFAULT;
|
||||
page = NULL;
|
||||
|
|
|
@ -93,11 +93,7 @@ KVM_X86_OP(check_intercept)
|
|||
KVM_X86_OP(handle_exit_irqoff)
|
||||
KVM_X86_OP_NULL(request_immediate_exit)
|
||||
KVM_X86_OP(sched_in)
|
||||
KVM_X86_OP_NULL(slot_enable_log_dirty)
|
||||
KVM_X86_OP_NULL(slot_disable_log_dirty)
|
||||
KVM_X86_OP_NULL(flush_log_dirty)
|
||||
KVM_X86_OP_NULL(enable_log_dirty_pt_masked)
|
||||
KVM_X86_OP_NULL(cpu_dirty_log_size)
|
||||
KVM_X86_OP_NULL(update_cpu_dirty_logging)
|
||||
KVM_X86_OP_NULL(pre_block)
|
||||
KVM_X86_OP_NULL(post_block)
|
||||
KVM_X86_OP_NULL(vcpu_blocking)
|
||||
|
|
|
@ -89,6 +89,8 @@
|
|||
KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
|
||||
#define KVM_REQ_APF_READY KVM_ARCH_REQ(28)
|
||||
#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29)
|
||||
#define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \
|
||||
KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
|
||||
|
||||
#define CR0_RESERVED_BITS \
|
||||
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
|
||||
|
@ -1007,6 +1009,7 @@ struct kvm_arch {
|
|||
u32 bsp_vcpu_id;
|
||||
|
||||
u64 disabled_quirks;
|
||||
int cpu_dirty_logging_count;
|
||||
|
||||
enum kvm_irqchip_mode irqchip_mode;
|
||||
u8 nr_reserved_ioapic_pins;
|
||||
|
@ -1271,30 +1274,11 @@ struct kvm_x86_ops {
|
|||
void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
|
||||
|
||||
/*
|
||||
* Arch-specific dirty logging hooks. These hooks are only supposed to
|
||||
* be valid if the specific arch has hardware-accelerated dirty logging
|
||||
* mechanism. Currently only for PML on VMX.
|
||||
*
|
||||
* - slot_enable_log_dirty:
|
||||
* called when enabling log dirty mode for the slot.
|
||||
* - slot_disable_log_dirty:
|
||||
* called when disabling log dirty mode for the slot.
|
||||
* also called when slot is created with log dirty disabled.
|
||||
* - flush_log_dirty:
|
||||
* called before reporting dirty_bitmap to userspace.
|
||||
* - enable_log_dirty_pt_masked:
|
||||
* called when reenabling log dirty for the GFNs in the mask after
|
||||
* corresponding bits are cleared in slot->dirty_bitmap.
|
||||
* Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero
|
||||
* value indicates CPU dirty logging is unsupported or disabled.
|
||||
*/
|
||||
void (*slot_enable_log_dirty)(struct kvm *kvm,
|
||||
struct kvm_memory_slot *slot);
|
||||
void (*slot_disable_log_dirty)(struct kvm *kvm,
|
||||
struct kvm_memory_slot *slot);
|
||||
void (*flush_log_dirty)(struct kvm *kvm);
|
||||
void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
|
||||
struct kvm_memory_slot *slot,
|
||||
gfn_t offset, unsigned long mask);
|
||||
int (*cpu_dirty_log_size)(void);
|
||||
int cpu_dirty_log_size;
|
||||
void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu);
|
||||
|
||||
/* pmu operations of sub-arch */
|
||||
const struct kvm_pmu_ops *pmu_ops;
|
||||
|
@ -1437,11 +1421,6 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
|
|||
struct kvm_memory_slot *memslot);
|
||||
void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
|
||||
struct kvm_memory_slot *memslot);
|
||||
void kvm_mmu_slot_set_dirty(struct kvm *kvm,
|
||||
struct kvm_memory_slot *memslot);
|
||||
void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
|
||||
struct kvm_memory_slot *slot,
|
||||
gfn_t gfn_offset, unsigned long mask);
|
||||
void kvm_mmu_zap_all(struct kvm *kvm);
|
||||
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
|
||||
unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
|
||||
|
@ -1613,7 +1592,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu);
|
|||
void kvm_update_dr7(struct kvm_vcpu *vcpu);
|
||||
|
||||
int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
|
||||
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
|
||||
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
|
||||
int kvm_mmu_load(struct kvm_vcpu *vcpu);
|
||||
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
|
||||
|
|
|
@ -408,7 +408,7 @@ void kvm_set_cpu_caps(void)
|
|||
|
||||
kvm_cpu_cap_mask(CPUID_7_0_EBX,
|
||||
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
|
||||
F(BMI2) | F(ERMS) | 0 /*INVPCID*/ | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
|
||||
F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
|
||||
F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
|
||||
F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
|
||||
F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/
|
||||
|
|
|
@ -1165,7 +1165,8 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep)
|
|||
* - W bit on ad-disabled SPTEs.
|
||||
* Returns true iff any D or W bits were cleared.
|
||||
*/
|
||||
static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
|
||||
static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
|
||||
struct kvm_memory_slot *slot)
|
||||
{
|
||||
u64 *sptep;
|
||||
struct rmap_iterator iter;
|
||||
|
@ -1180,35 +1181,6 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
|
|||
return flush;
|
||||
}
|
||||
|
||||
static bool spte_set_dirty(u64 *sptep)
|
||||
{
|
||||
u64 spte = *sptep;
|
||||
|
||||
rmap_printk("spte %p %llx\n", sptep, *sptep);
|
||||
|
||||
/*
|
||||
* Similar to the !kvm_x86_ops.slot_disable_log_dirty case,
|
||||
* do not bother adding back write access to pages marked
|
||||
* SPTE_AD_WRPROT_ONLY_MASK.
|
||||
*/
|
||||
spte |= shadow_dirty_mask;
|
||||
|
||||
return mmu_spte_update(sptep, spte);
|
||||
}
|
||||
|
||||
static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
|
||||
{
|
||||
u64 *sptep;
|
||||
struct rmap_iterator iter;
|
||||
bool flush = false;
|
||||
|
||||
for_each_rmap_spte(rmap_head, &iter, sptep)
|
||||
if (spte_ad_enabled(*sptep))
|
||||
flush |= spte_set_dirty(sptep);
|
||||
|
||||
return flush;
|
||||
}
|
||||
|
||||
/**
|
||||
* kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
|
||||
* @kvm: kvm instance
|
||||
|
@ -1248,9 +1220,9 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
|
|||
*
|
||||
* Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
|
||||
*/
|
||||
void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
|
||||
struct kvm_memory_slot *slot,
|
||||
gfn_t gfn_offset, unsigned long mask)
|
||||
static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
|
||||
struct kvm_memory_slot *slot,
|
||||
gfn_t gfn_offset, unsigned long mask)
|
||||
{
|
||||
struct kvm_rmap_head *rmap_head;
|
||||
|
||||
|
@ -1260,13 +1232,12 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
|
|||
while (mask) {
|
||||
rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
|
||||
PG_LEVEL_4K, slot);
|
||||
__rmap_clear_dirty(kvm, rmap_head);
|
||||
__rmap_clear_dirty(kvm, rmap_head, slot);
|
||||
|
||||
/* clear the first set bit */
|
||||
mask &= mask - 1;
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
|
||||
|
||||
/**
|
||||
* kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
|
||||
|
@ -1282,20 +1253,15 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
|
|||
struct kvm_memory_slot *slot,
|
||||
gfn_t gfn_offset, unsigned long mask)
|
||||
{
|
||||
if (kvm_x86_ops.enable_log_dirty_pt_masked)
|
||||
static_call(kvm_x86_enable_log_dirty_pt_masked)(kvm, slot,
|
||||
gfn_offset,
|
||||
mask);
|
||||
if (kvm_x86_ops.cpu_dirty_log_size)
|
||||
kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
|
||||
else
|
||||
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
|
||||
}
|
||||
|
||||
int kvm_cpu_dirty_log_size(void)
|
||||
{
|
||||
if (kvm_x86_ops.cpu_dirty_log_size)
|
||||
return static_call(kvm_x86_cpu_dirty_log_size)();
|
||||
|
||||
return 0;
|
||||
return kvm_x86_ops.cpu_dirty_log_size;
|
||||
}
|
||||
|
||||
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
|
||||
|
@ -1325,7 +1291,8 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
|
|||
return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
|
||||
}
|
||||
|
||||
static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
|
||||
static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
|
||||
struct kvm_memory_slot *slot)
|
||||
{
|
||||
u64 *sptep;
|
||||
struct rmap_iterator iter;
|
||||
|
@ -1345,7 +1312,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
|
|||
struct kvm_memory_slot *slot, gfn_t gfn, int level,
|
||||
unsigned long data)
|
||||
{
|
||||
return kvm_zap_rmapp(kvm, rmap_head);
|
||||
return kvm_zap_rmapp(kvm, rmap_head, slot);
|
||||
}
|
||||
|
||||
static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
|
||||
|
@ -2499,7 +2466,21 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
|
|||
|
||||
return r;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
|
||||
|
||||
static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
|
||||
{
|
||||
gpa_t gpa;
|
||||
int r;
|
||||
|
||||
if (vcpu->arch.mmu->direct_map)
|
||||
return 0;
|
||||
|
||||
gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
|
||||
|
||||
r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
|
||||
{
|
||||
|
@ -2753,11 +2734,18 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
|
|||
if (sp->role.level > PG_LEVEL_4K)
|
||||
return;
|
||||
|
||||
/*
|
||||
* If addresses are being invalidated, skip prefetching to avoid
|
||||
* accidentally prefetching those addresses.
|
||||
*/
|
||||
if (unlikely(vcpu->kvm->mmu_notifier_count))
|
||||
return;
|
||||
|
||||
__direct_pte_prefetch(vcpu, sp, sptep);
|
||||
}
|
||||
|
||||
static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn,
|
||||
kvm_pfn_t pfn, struct kvm_memory_slot *slot)
|
||||
static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
|
||||
struct kvm_memory_slot *slot)
|
||||
{
|
||||
unsigned long hva;
|
||||
pte_t *pte;
|
||||
|
@ -2776,19 +2764,36 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn,
|
|||
*/
|
||||
hva = __gfn_to_hva_memslot(slot, gfn);
|
||||
|
||||
pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level);
|
||||
pte = lookup_address_in_mm(kvm->mm, hva, &level);
|
||||
if (unlikely(!pte))
|
||||
return PG_LEVEL_4K;
|
||||
|
||||
return level;
|
||||
}
|
||||
|
||||
int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
|
||||
gfn_t gfn, kvm_pfn_t pfn, int max_level)
|
||||
{
|
||||
struct kvm_lpage_info *linfo;
|
||||
|
||||
max_level = min(max_level, max_huge_page_level);
|
||||
for ( ; max_level > PG_LEVEL_4K; max_level--) {
|
||||
linfo = lpage_info_slot(gfn, slot, max_level);
|
||||
if (!linfo->disallow_lpage)
|
||||
break;
|
||||
}
|
||||
|
||||
if (max_level == PG_LEVEL_4K)
|
||||
return PG_LEVEL_4K;
|
||||
|
||||
return host_pfn_mapping_level(kvm, gfn, pfn, slot);
|
||||
}
|
||||
|
||||
int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
|
||||
int max_level, kvm_pfn_t *pfnp,
|
||||
bool huge_page_disallowed, int *req_level)
|
||||
{
|
||||
struct kvm_memory_slot *slot;
|
||||
struct kvm_lpage_info *linfo;
|
||||
kvm_pfn_t pfn = *pfnp;
|
||||
kvm_pfn_t mask;
|
||||
int level;
|
||||
|
@ -2805,17 +2810,7 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
|
|||
if (!slot)
|
||||
return PG_LEVEL_4K;
|
||||
|
||||
max_level = min(max_level, max_huge_page_level);
|
||||
for ( ; max_level > PG_LEVEL_4K; max_level--) {
|
||||
linfo = lpage_info_slot(gfn, slot, max_level);
|
||||
if (!linfo->disallow_lpage)
|
||||
break;
|
||||
}
|
||||
|
||||
if (max_level == PG_LEVEL_4K)
|
||||
return PG_LEVEL_4K;
|
||||
|
||||
level = host_pfn_mapping_level(vcpu, gfn, pfn, slot);
|
||||
level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level);
|
||||
if (level == PG_LEVEL_4K)
|
||||
return level;
|
||||
|
||||
|
@ -3437,7 +3432,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
|
|||
kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
|
||||
write_unlock(&vcpu->kvm->mmu_lock);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
|
||||
|
||||
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
|
||||
u32 access, struct x86_exception *exception)
|
||||
|
@ -3653,8 +3647,8 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
|
|||
}
|
||||
|
||||
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
|
||||
gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
|
||||
bool *writable)
|
||||
gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
|
||||
bool write, bool *writable)
|
||||
{
|
||||
struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
|
||||
bool async;
|
||||
|
@ -3667,7 +3661,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
|
|||
}
|
||||
|
||||
async = false;
|
||||
*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
|
||||
*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async,
|
||||
write, writable, hva);
|
||||
if (!async)
|
||||
return false; /* *pfn has correct page already */
|
||||
|
||||
|
@ -3681,7 +3676,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
|
|||
return true;
|
||||
}
|
||||
|
||||
*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
|
||||
*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
|
||||
write, writable, hva);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -3694,6 +3690,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
|
|||
gfn_t gfn = gpa >> PAGE_SHIFT;
|
||||
unsigned long mmu_seq;
|
||||
kvm_pfn_t pfn;
|
||||
hva_t hva;
|
||||
int r;
|
||||
|
||||
if (page_fault_handle_page_track(vcpu, error_code, gfn))
|
||||
|
@ -3712,7 +3709,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
|
|||
mmu_seq = vcpu->kvm->mmu_notifier_seq;
|
||||
smp_rmb();
|
||||
|
||||
if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
|
||||
if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva,
|
||||
write, &map_writable))
|
||||
return RET_PF_RETRY;
|
||||
|
||||
if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
|
||||
|
@ -3725,7 +3723,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
|
|||
else
|
||||
write_lock(&vcpu->kvm->mmu_lock);
|
||||
|
||||
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
|
||||
if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
|
||||
goto out_unlock;
|
||||
r = make_mmu_pages_available(vcpu);
|
||||
if (r)
|
||||
|
@ -5003,22 +5001,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
|
|||
write_unlock(&vcpu->kvm->mmu_lock);
|
||||
}
|
||||
|
||||
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
|
||||
{
|
||||
gpa_t gpa;
|
||||
int r;
|
||||
|
||||
if (vcpu->arch.mmu->direct_map)
|
||||
return 0;
|
||||
|
||||
gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
|
||||
|
||||
r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
|
||||
|
||||
return r;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
|
||||
|
||||
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
|
||||
void *insn, int insn_len)
|
||||
{
|
||||
|
@ -5117,7 +5099,6 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
|
|||
mmu->invlpg(vcpu, gva, root_hpa);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_gva);
|
||||
|
||||
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
|
||||
{
|
||||
|
@ -5157,7 +5138,6 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
|
|||
* for them.
|
||||
*/
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
|
||||
|
||||
void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
|
||||
int tdp_huge_page_level)
|
||||
|
@ -5182,7 +5162,8 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
|
|||
EXPORT_SYMBOL_GPL(kvm_configure_mmu);
|
||||
|
||||
/* The return value indicates if tlb flush on all vcpus is needed. */
|
||||
typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
|
||||
typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head,
|
||||
struct kvm_memory_slot *slot);
|
||||
|
||||
/* The caller should hold mmu-lock before calling this function. */
|
||||
static __always_inline bool
|
||||
|
@ -5196,7 +5177,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
|
|||
for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
|
||||
end_gfn, &iterator) {
|
||||
if (iterator.rmap)
|
||||
flush |= fn(kvm, iterator.rmap);
|
||||
flush |= fn(kvm, iterator.rmap, memslot);
|
||||
|
||||
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
|
||||
if (flush && lock_flush_tlb) {
|
||||
|
@ -5229,22 +5210,6 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
|
|||
lock_flush_tlb);
|
||||
}
|
||||
|
||||
static __always_inline bool
|
||||
slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
|
||||
slot_level_handler fn, bool lock_flush_tlb)
|
||||
{
|
||||
return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
|
||||
KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
|
||||
}
|
||||
|
||||
static __always_inline bool
|
||||
slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
|
||||
slot_level_handler fn, bool lock_flush_tlb)
|
||||
{
|
||||
return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K + 1,
|
||||
KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
|
||||
}
|
||||
|
||||
static __always_inline bool
|
||||
slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
|
||||
slot_level_handler fn, bool lock_flush_tlb)
|
||||
|
@ -5485,7 +5450,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
|
|||
}
|
||||
|
||||
static bool slot_rmap_write_protect(struct kvm *kvm,
|
||||
struct kvm_rmap_head *rmap_head)
|
||||
struct kvm_rmap_head *rmap_head,
|
||||
struct kvm_memory_slot *slot)
|
||||
{
|
||||
return __rmap_write_protect(kvm, rmap_head, false);
|
||||
}
|
||||
|
@ -5519,7 +5485,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
|
|||
}
|
||||
|
||||
static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
|
||||
struct kvm_rmap_head *rmap_head)
|
||||
struct kvm_rmap_head *rmap_head,
|
||||
struct kvm_memory_slot *slot)
|
||||
{
|
||||
u64 *sptep;
|
||||
struct rmap_iterator iter;
|
||||
|
@ -5540,8 +5507,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
|
|||
* mapping if the indirect sp has level = 1.
|
||||
*/
|
||||
if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
|
||||
(kvm_is_zone_device_pfn(pfn) ||
|
||||
PageCompound(pfn_to_page(pfn)))) {
|
||||
sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
|
||||
pfn, PG_LEVEL_NUM)) {
|
||||
pte_list_remove(rmap_head, sptep);
|
||||
|
||||
if (kvm_available_flush_tlb_with_range())
|
||||
|
@ -5561,12 +5528,13 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
|
|||
const struct kvm_memory_slot *memslot)
|
||||
{
|
||||
/* FIXME: const-ify all uses of struct kvm_memory_slot. */
|
||||
struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot;
|
||||
|
||||
write_lock(&kvm->mmu_lock);
|
||||
slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
|
||||
kvm_mmu_zap_collapsible_spte, true);
|
||||
slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
|
||||
|
||||
if (is_tdp_mmu_enabled(kvm))
|
||||
kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot);
|
||||
kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
|
||||
write_unlock(&kvm->mmu_lock);
|
||||
}
|
||||
|
||||
|
@ -5605,40 +5573,6 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
|
|||
if (flush)
|
||||
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
|
||||
|
||||
void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
|
||||
struct kvm_memory_slot *memslot)
|
||||
{
|
||||
bool flush;
|
||||
|
||||
write_lock(&kvm->mmu_lock);
|
||||
flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
|
||||
false);
|
||||
if (is_tdp_mmu_enabled(kvm))
|
||||
flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M);
|
||||
write_unlock(&kvm->mmu_lock);
|
||||
|
||||
if (flush)
|
||||
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
|
||||
|
||||
void kvm_mmu_slot_set_dirty(struct kvm *kvm,
|
||||
struct kvm_memory_slot *memslot)
|
||||
{
|
||||
bool flush;
|
||||
|
||||
write_lock(&kvm->mmu_lock);
|
||||
flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
|
||||
if (is_tdp_mmu_enabled(kvm))
|
||||
flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot);
|
||||
write_unlock(&kvm->mmu_lock);
|
||||
|
||||
if (flush)
|
||||
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
|
||||
|
||||
void kvm_mmu_zap_all(struct kvm *kvm)
|
||||
{
|
||||
|
|
|
@ -84,7 +84,10 @@ static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
|
|||
* When using the EPT page-modification log, the GPAs in the log
|
||||
* would come from L2 rather than L1. Therefore, we need to rely
|
||||
* on write protection to record dirty pages. This also bypasses
|
||||
* PML, since writes now result in a vmexit.
|
||||
* PML, since writes now result in a vmexit. Note, this helper will
|
||||
* tag SPTEs as needing write-protection even if PML is disabled or
|
||||
* unsupported, but that's ok because the tag is consumed if and only
|
||||
* if PML is enabled. Omit the PML check to save a few uops.
|
||||
*/
|
||||
return vcpu->arch.mmu == &vcpu->arch.guest_mmu;
|
||||
}
|
||||
|
@ -138,6 +141,8 @@ enum {
|
|||
#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
|
||||
#define SET_SPTE_SPURIOUS BIT(2)
|
||||
|
||||
int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
|
||||
gfn_t gfn, kvm_pfn_t pfn, int max_level);
|
||||
int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
|
||||
int max_level, kvm_pfn_t *pfnp,
|
||||
bool huge_page_disallowed, int *req_level);
|
||||
|
|
|
@ -601,6 +601,13 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
|
|||
if (sp->role.level > PG_LEVEL_4K)
|
||||
return;
|
||||
|
||||
/*
|
||||
* If addresses are being invalidated, skip prefetching to avoid
|
||||
* accidentally prefetching those addresses.
|
||||
*/
|
||||
if (unlikely(vcpu->kvm->mmu_notifier_count))
|
||||
return;
|
||||
|
||||
if (sp->role.direct)
|
||||
return __direct_pte_prefetch(vcpu, sp, sptep);
|
||||
|
||||
|
@ -790,6 +797,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
|
|||
struct guest_walker walker;
|
||||
int r;
|
||||
kvm_pfn_t pfn;
|
||||
hva_t hva;
|
||||
unsigned long mmu_seq;
|
||||
bool map_writable, is_self_change_mapping;
|
||||
int max_level;
|
||||
|
@ -840,8 +848,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
|
|||
mmu_seq = vcpu->kvm->mmu_notifier_seq;
|
||||
smp_rmb();
|
||||
|
||||
if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
|
||||
&map_writable))
|
||||
if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
|
||||
write_fault, &map_writable))
|
||||
return RET_PF_RETRY;
|
||||
|
||||
if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
|
||||
|
@ -869,7 +877,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
|
|||
|
||||
r = RET_PF_RETRY;
|
||||
write_lock(&vcpu->kvm->mmu_lock);
|
||||
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
|
||||
if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
|
||||
goto out_unlock;
|
||||
|
||||
kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
|
||||
|
|
|
@ -1268,68 +1268,16 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
|
||||
* only used for PML, and so will involve setting the dirty bit on each SPTE.
|
||||
* Returns true if an SPTE has been changed and the TLBs need to be flushed.
|
||||
*/
|
||||
static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
|
||||
gfn_t start, gfn_t end)
|
||||
{
|
||||
struct tdp_iter iter;
|
||||
u64 new_spte;
|
||||
bool spte_set = false;
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
tdp_root_for_each_pte(iter, root, start, end) {
|
||||
if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
|
||||
continue;
|
||||
|
||||
if (!is_shadow_present_pte(iter.old_spte) ||
|
||||
iter.old_spte & shadow_dirty_mask)
|
||||
continue;
|
||||
|
||||
new_spte = iter.old_spte | shadow_dirty_mask;
|
||||
|
||||
tdp_mmu_set_spte(kvm, &iter, new_spte);
|
||||
spte_set = true;
|
||||
}
|
||||
|
||||
rcu_read_unlock();
|
||||
return spte_set;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
|
||||
* only used for PML, and so will involve setting the dirty bit on each SPTE.
|
||||
* Returns true if an SPTE has been changed and the TLBs need to be flushed.
|
||||
*/
|
||||
bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot)
|
||||
{
|
||||
struct kvm_mmu_page *root;
|
||||
int root_as_id;
|
||||
bool spte_set = false;
|
||||
|
||||
for_each_tdp_mmu_root_yield_safe(kvm, root) {
|
||||
root_as_id = kvm_mmu_page_as_id(root);
|
||||
if (root_as_id != slot->as_id)
|
||||
continue;
|
||||
|
||||
spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn,
|
||||
slot->base_gfn + slot->npages);
|
||||
}
|
||||
return spte_set;
|
||||
}
|
||||
|
||||
/*
|
||||
* Clear leaf entries which could be replaced by large mappings, for
|
||||
* GFNs within the slot.
|
||||
*/
|
||||
static void zap_collapsible_spte_range(struct kvm *kvm,
|
||||
struct kvm_mmu_page *root,
|
||||
gfn_t start, gfn_t end)
|
||||
struct kvm_memory_slot *slot)
|
||||
{
|
||||
gfn_t start = slot->base_gfn;
|
||||
gfn_t end = start + slot->npages;
|
||||
struct tdp_iter iter;
|
||||
kvm_pfn_t pfn;
|
||||
bool spte_set = false;
|
||||
|
@ -1348,7 +1296,8 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
|
|||
|
||||
pfn = spte_to_pfn(iter.old_spte);
|
||||
if (kvm_is_reserved_pfn(pfn) ||
|
||||
!PageTransCompoundMap(pfn_to_page(pfn)))
|
||||
iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
|
||||
pfn, PG_LEVEL_NUM))
|
||||
continue;
|
||||
|
||||
tdp_mmu_set_spte(kvm, &iter, 0);
|
||||
|
@ -1366,7 +1315,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
|
|||
* be replaced by large mappings, for GFNs within the slot.
|
||||
*/
|
||||
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
|
||||
const struct kvm_memory_slot *slot)
|
||||
struct kvm_memory_slot *slot)
|
||||
{
|
||||
struct kvm_mmu_page *root;
|
||||
int root_as_id;
|
||||
|
@ -1376,8 +1325,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
|
|||
if (root_as_id != slot->as_id)
|
||||
continue;
|
||||
|
||||
zap_collapsible_spte_range(kvm, root, slot->base_gfn,
|
||||
slot->base_gfn + slot->npages);
|
||||
zap_collapsible_spte_range(kvm, root, slot);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -33,9 +33,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
|
|||
struct kvm_memory_slot *slot,
|
||||
gfn_t gfn, unsigned long mask,
|
||||
bool wrprot);
|
||||
bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot);
|
||||
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
|
||||
const struct kvm_memory_slot *slot);
|
||||
struct kvm_memory_slot *slot);
|
||||
|
||||
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
|
||||
struct kvm_memory_slot *slot, gfn_t gfn);
|
||||
|
|
|
@ -51,6 +51,23 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
|
|||
nested_svm_vmexit(svm);
|
||||
}
|
||||
|
||||
static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault)
|
||||
{
|
||||
struct vcpu_svm *svm = to_svm(vcpu);
|
||||
WARN_ON(!is_guest_mode(vcpu));
|
||||
|
||||
if (vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) &&
|
||||
!svm->nested.nested_run_pending) {
|
||||
svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR;
|
||||
svm->vmcb->control.exit_code_hi = 0;
|
||||
svm->vmcb->control.exit_info_1 = fault->error_code;
|
||||
svm->vmcb->control.exit_info_2 = fault->address;
|
||||
nested_svm_vmexit(svm);
|
||||
} else {
|
||||
kvm_inject_page_fault(vcpu, fault);
|
||||
}
|
||||
}
|
||||
|
||||
static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
|
||||
{
|
||||
struct vcpu_svm *svm = to_svm(vcpu);
|
||||
|
@ -436,16 +453,33 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
|
|||
{
|
||||
int ret;
|
||||
|
||||
trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
|
||||
vmcb12->save.rip,
|
||||
vmcb12->control.int_ctl,
|
||||
vmcb12->control.event_inj,
|
||||
vmcb12->control.nested_ctl);
|
||||
|
||||
trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
|
||||
vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
|
||||
vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
|
||||
vmcb12->control.intercepts[INTERCEPT_WORD3],
|
||||
vmcb12->control.intercepts[INTERCEPT_WORD4],
|
||||
vmcb12->control.intercepts[INTERCEPT_WORD5]);
|
||||
|
||||
|
||||
svm->nested.vmcb12_gpa = vmcb12_gpa;
|
||||
load_nested_vmcb_control(svm, &vmcb12->control);
|
||||
nested_prepare_vmcb_save(svm, vmcb12);
|
||||
nested_prepare_vmcb_control(svm);
|
||||
nested_prepare_vmcb_save(svm, vmcb12);
|
||||
|
||||
ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
|
||||
nested_npt_enabled(svm));
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (!npt_enabled)
|
||||
svm->vcpu.arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
|
||||
|
||||
svm_set_gif(svm, true);
|
||||
|
||||
return 0;
|
||||
|
@ -489,18 +523,6 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
|
|||
goto out;
|
||||
}
|
||||
|
||||
trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
|
||||
vmcb12->save.rip,
|
||||
vmcb12->control.int_ctl,
|
||||
vmcb12->control.event_inj,
|
||||
vmcb12->control.nested_ctl);
|
||||
|
||||
trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
|
||||
vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
|
||||
vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
|
||||
vmcb12->control.intercepts[INTERCEPT_WORD3],
|
||||
vmcb12->control.intercepts[INTERCEPT_WORD4],
|
||||
vmcb12->control.intercepts[INTERCEPT_WORD5]);
|
||||
|
||||
/* Clear internal status */
|
||||
kvm_clear_exception_queue(&svm->vcpu);
|
||||
|
|
|
@ -926,9 +926,6 @@ static __init void svm_set_cpu_caps(void)
|
|||
if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
|
||||
boot_cpu_has(X86_FEATURE_AMD_SSBD))
|
||||
kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
|
||||
|
||||
/* Enable INVPCID feature */
|
||||
kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
|
||||
}
|
||||
|
||||
static __init int svm_hardware_setup(void)
|
||||
|
@ -1103,12 +1100,12 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
|
|||
static void svm_check_invpcid(struct vcpu_svm *svm)
|
||||
{
|
||||
/*
|
||||
* Intercept INVPCID instruction only if shadow page table is
|
||||
* enabled. Interception is not required with nested page table
|
||||
* enabled.
|
||||
* Intercept INVPCID if shadow paging is enabled to sync/free shadow
|
||||
* roots, or if INVPCID is disabled in the guest to inject #UD.
|
||||
*/
|
||||
if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
|
||||
if (!npt_enabled)
|
||||
if (!npt_enabled ||
|
||||
!guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
|
||||
svm_set_intercept(svm, INTERCEPT_INVPCID);
|
||||
else
|
||||
svm_clr_intercept(svm, INTERCEPT_INVPCID);
|
||||
|
@ -2214,15 +2211,20 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
|
|||
[SVM_INSTR_VMSAVE] = vmsave_interception,
|
||||
};
|
||||
struct vcpu_svm *svm = to_svm(vcpu);
|
||||
int ret;
|
||||
|
||||
if (is_guest_mode(vcpu)) {
|
||||
svm->vmcb->control.exit_code = guest_mode_exit_codes[opcode];
|
||||
svm->vmcb->control.exit_info_1 = 0;
|
||||
svm->vmcb->control.exit_info_2 = 0;
|
||||
|
||||
return nested_svm_vmexit(svm);
|
||||
} else
|
||||
return svm_instr_handlers[opcode](svm);
|
||||
/* Returns '1' or -errno on failure, '0' on success. */
|
||||
ret = nested_svm_vmexit(svm);
|
||||
if (ret)
|
||||
return ret;
|
||||
return 1;
|
||||
}
|
||||
return svm_instr_handlers[opcode](svm);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -2167,15 +2167,13 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
|
|||
vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
|
||||
|
||||
/*
|
||||
* The PML address never changes, so it is constant in vmcs02.
|
||||
* Conceptually we want to copy the PML index from vmcs01 here,
|
||||
* and then back to vmcs01 on nested vmexit. But since we flush
|
||||
* the log and reset GUEST_PML_INDEX on each vmexit, the PML
|
||||
* index is also effectively constant in vmcs02.
|
||||
* PML is emulated for L2, but never enabled in hardware as the MMU
|
||||
* handles A/D emulation. Disabling PML for L2 also avoids having to
|
||||
* deal with filtering out L2 GPAs from the buffer.
|
||||
*/
|
||||
if (enable_pml) {
|
||||
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
|
||||
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
|
||||
vmcs_write64(PML_ADDRESS, 0);
|
||||
vmcs_write16(GUEST_PML_INDEX, -1);
|
||||
}
|
||||
|
||||
if (cpu_has_vmx_encls_vmexit())
|
||||
|
@ -2210,7 +2208,7 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
|
|||
|
||||
static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
|
||||
{
|
||||
u32 exec_control, vmcs12_exec_ctrl;
|
||||
u32 exec_control;
|
||||
u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
|
||||
|
||||
if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
|
||||
|
@ -2284,11 +2282,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
|
|||
SECONDARY_EXEC_APIC_REGISTER_VIRT |
|
||||
SECONDARY_EXEC_ENABLE_VMFUNC);
|
||||
if (nested_cpu_has(vmcs12,
|
||||
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
|
||||
vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
|
||||
~SECONDARY_EXEC_ENABLE_PML;
|
||||
exec_control |= vmcs12_exec_ctrl;
|
||||
}
|
||||
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
|
||||
exec_control |= vmcs12->secondary_vm_exec_control;
|
||||
|
||||
/* PML is emulated and never enabled in hardware for L2. */
|
||||
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
|
||||
|
||||
/* VMCS shadowing for L2 is emulated for now */
|
||||
exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
|
||||
|
@ -4200,9 +4198,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
|
|||
if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored))
|
||||
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
|
||||
|
||||
if (!enable_ept)
|
||||
vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
|
||||
|
||||
nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
|
||||
|
||||
vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
|
||||
|
@ -4495,6 +4490,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
|
|||
vmx_set_virtual_apic_mode(vcpu);
|
||||
}
|
||||
|
||||
if (vmx->nested.update_vmcs01_cpu_dirty_logging) {
|
||||
vmx->nested.update_vmcs01_cpu_dirty_logging = false;
|
||||
vmx_update_cpu_dirty_logging(vcpu);
|
||||
}
|
||||
|
||||
/* Unpin physical memory we referred to in vmcs02 */
|
||||
if (vmx->nested.apic_access_page) {
|
||||
kvm_release_page_clean(vmx->nested.apic_access_page);
|
||||
|
@ -5793,7 +5793,10 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
|
|||
case EXIT_REASON_PREEMPTION_TIMER:
|
||||
return true;
|
||||
case EXIT_REASON_PML_FULL:
|
||||
/* We emulate PML support to L1. */
|
||||
/*
|
||||
* PML is emulated for an L1 VMM and should never be enabled in
|
||||
* vmcs02, always "handle" PML_FULL by exiting to userspace.
|
||||
*/
|
||||
return true;
|
||||
case EXIT_REASON_VMFUNC:
|
||||
/* VM functions are emulated through L2->L0 vmexits. */
|
||||
|
|
|
@ -298,7 +298,7 @@ int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu)
|
|||
if (IS_ERR(event)) {
|
||||
pr_debug_ratelimited("%s: failed %ld\n",
|
||||
__func__, PTR_ERR(event));
|
||||
return -ENOENT;
|
||||
return PTR_ERR(event);
|
||||
}
|
||||
lbr_desc->event = event;
|
||||
pmu->event_count++;
|
||||
|
@ -320,7 +320,7 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu,
|
|||
if (!intel_pmu_is_valid_lbr_msr(vcpu, index))
|
||||
return false;
|
||||
|
||||
if (!lbr_desc->event && !intel_pmu_create_guest_lbr_event(vcpu))
|
||||
if (!lbr_desc->event && intel_pmu_create_guest_lbr_event(vcpu) < 0)
|
||||
goto dummy;
|
||||
|
||||
/*
|
||||
|
|
|
@ -4277,7 +4277,12 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
|
|||
*/
|
||||
exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
|
||||
|
||||
if (!enable_pml)
|
||||
/*
|
||||
* PML is enabled/disabled when dirty logging of memsmlots changes, but
|
||||
* it needs to be set here when dirty logging is already active, e.g.
|
||||
* if this vCPU was created after dirty logging was enabled.
|
||||
*/
|
||||
if (!vcpu->kvm->arch.cpu_dirty_logging_count)
|
||||
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
|
||||
|
||||
if (cpu_has_vmx_xsaves()) {
|
||||
|
@ -4295,18 +4300,8 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
|
|||
}
|
||||
|
||||
vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP);
|
||||
|
||||
/*
|
||||
* Expose INVPCID if and only if PCID is also exposed to the guest.
|
||||
* INVPCID takes a #UD when it's disabled in the VMCS, but a #GP or #PF
|
||||
* if CR4.PCIDE=0. Enumerating CPUID.INVPCID=1 would lead to incorrect
|
||||
* behavior from the guest perspective (it would expect #GP or #PF).
|
||||
*/
|
||||
if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
|
||||
guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
|
||||
vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
|
||||
|
||||
|
||||
vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
|
||||
vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
|
||||
|
||||
|
@ -5776,24 +5771,6 @@ static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
|
|||
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
|
||||
* Called before reporting dirty_bitmap to userspace.
|
||||
*/
|
||||
static void kvm_flush_pml_buffers(struct kvm *kvm)
|
||||
{
|
||||
int i;
|
||||
struct kvm_vcpu *vcpu;
|
||||
/*
|
||||
* We only need to kick vcpu out of guest mode here, as PML buffer
|
||||
* is flushed at beginning of all VMEXITs, and it's obvious that only
|
||||
* vcpus running in guest are possible to have unflushed GPAs in PML
|
||||
* buffer.
|
||||
*/
|
||||
kvm_for_each_vcpu(i, vcpu, kvm)
|
||||
kvm_vcpu_kick(vcpu);
|
||||
}
|
||||
|
||||
static void vmx_dump_sel(char *name, uint32_t sel)
|
||||
{
|
||||
pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
|
||||
|
@ -5976,9 +5953,10 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
|
|||
* updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
|
||||
* querying dirty_bitmap, we only need to kick all vcpus out of guest
|
||||
* mode as if vcpus is in root mode, the PML buffer must has been
|
||||
* flushed already.
|
||||
* flushed already. Note, PML is never enabled in hardware while
|
||||
* running L2.
|
||||
*/
|
||||
if (enable_pml)
|
||||
if (enable_pml && !is_guest_mode(vcpu))
|
||||
vmx_flush_pml_buffer(vcpu);
|
||||
|
||||
/*
|
||||
|
@ -5994,6 +5972,13 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
|
|||
return handle_invalid_guest_state(vcpu);
|
||||
|
||||
if (is_guest_mode(vcpu)) {
|
||||
/*
|
||||
* PML is never enabled when running L2, bail immediately if a
|
||||
* PML full exit occurs as something is horribly wrong.
|
||||
*/
|
||||
if (exit_reason.basic == EXIT_REASON_PML_FULL)
|
||||
goto unexpected_vmexit;
|
||||
|
||||
/*
|
||||
* The host physical addresses of some pages of guest memory
|
||||
* are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
|
||||
|
@ -6851,13 +6836,15 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
|
|||
if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
|
||||
kvm_machine_check();
|
||||
|
||||
if (likely(!vmx->exit_reason.failed_vmentry))
|
||||
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
|
||||
|
||||
trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX);
|
||||
|
||||
if (unlikely(vmx->exit_reason.failed_vmentry))
|
||||
return EXIT_FASTPATH_NONE;
|
||||
|
||||
vmx->loaded_vmcs->launched = 1;
|
||||
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
|
||||
|
||||
vmx_recover_nmi_blocking(vmx);
|
||||
vmx_complete_interrupts(vmx);
|
||||
|
@ -7330,8 +7317,8 @@ static __init void vmx_set_cpu_caps(void)
|
|||
/* CPUID 0x7 */
|
||||
if (kvm_mpx_supported())
|
||||
kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
|
||||
if (cpu_has_vmx_invpcid())
|
||||
kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
|
||||
if (!cpu_has_vmx_invpcid())
|
||||
kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
|
||||
if (vmx_pt_mode_is_host_guest())
|
||||
kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
|
||||
|
||||
|
@ -7509,30 +7496,24 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
|
|||
shrink_ple_window(vcpu);
|
||||
}
|
||||
|
||||
static void vmx_slot_enable_log_dirty(struct kvm *kvm,
|
||||
struct kvm_memory_slot *slot)
|
||||
void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
|
||||
kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
|
||||
kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
|
||||
}
|
||||
struct vcpu_vmx *vmx = to_vmx(vcpu);
|
||||
|
||||
static void vmx_slot_disable_log_dirty(struct kvm *kvm,
|
||||
struct kvm_memory_slot *slot)
|
||||
{
|
||||
kvm_mmu_slot_set_dirty(kvm, slot);
|
||||
}
|
||||
if (is_guest_mode(vcpu)) {
|
||||
vmx->nested.update_vmcs01_cpu_dirty_logging = true;
|
||||
return;
|
||||
}
|
||||
|
||||
static void vmx_flush_log_dirty(struct kvm *kvm)
|
||||
{
|
||||
kvm_flush_pml_buffers(kvm);
|
||||
}
|
||||
|
||||
static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
|
||||
struct kvm_memory_slot *memslot,
|
||||
gfn_t offset, unsigned long mask)
|
||||
{
|
||||
kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
|
||||
/*
|
||||
* Note, cpu_dirty_logging_count can be changed concurrent with this
|
||||
* code, but in that case another update request will be made and so
|
||||
* the guest will never run with a stale PML value.
|
||||
*/
|
||||
if (vcpu->kvm->arch.cpu_dirty_logging_count)
|
||||
secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
|
||||
else
|
||||
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
|
||||
}
|
||||
|
||||
static int vmx_pre_block(struct kvm_vcpu *vcpu)
|
||||
|
@ -7642,11 +7623,6 @@ static bool vmx_check_apicv_inhibit_reasons(ulong bit)
|
|||
return supported & BIT(bit);
|
||||
}
|
||||
|
||||
static int vmx_cpu_dirty_log_size(void)
|
||||
{
|
||||
return enable_pml ? PML_ENTITY_NUM : 0;
|
||||
}
|
||||
|
||||
static struct kvm_x86_ops vmx_x86_ops __initdata = {
|
||||
.hardware_unsetup = hardware_unsetup,
|
||||
|
||||
|
@ -7746,10 +7722,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
|
|||
|
||||
.sched_in = vmx_sched_in,
|
||||
|
||||
.slot_enable_log_dirty = vmx_slot_enable_log_dirty,
|
||||
.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
|
||||
.flush_log_dirty = vmx_flush_log_dirty,
|
||||
.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
|
||||
.cpu_dirty_log_size = PML_ENTITY_NUM,
|
||||
.update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
|
||||
|
||||
.pre_block = vmx_pre_block,
|
||||
.post_block = vmx_post_block,
|
||||
|
@ -7777,7 +7751,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
|
|||
|
||||
.msr_filter_changed = vmx_msr_filter_changed,
|
||||
.complete_emulated_msr = kvm_complete_insn_gp,
|
||||
.cpu_dirty_log_size = vmx_cpu_dirty_log_size,
|
||||
|
||||
.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
|
||||
};
|
||||
|
@ -7894,13 +7867,8 @@ static __init int hardware_setup(void)
|
|||
if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
|
||||
enable_pml = 0;
|
||||
|
||||
if (!enable_pml) {
|
||||
vmx_x86_ops.slot_enable_log_dirty = NULL;
|
||||
vmx_x86_ops.slot_disable_log_dirty = NULL;
|
||||
vmx_x86_ops.flush_log_dirty = NULL;
|
||||
vmx_x86_ops.enable_log_dirty_pt_masked = NULL;
|
||||
vmx_x86_ops.cpu_dirty_log_size = NULL;
|
||||
}
|
||||
if (!enable_pml)
|
||||
vmx_x86_ops.cpu_dirty_log_size = 0;
|
||||
|
||||
if (!cpu_has_vmx_preemption_timer())
|
||||
enable_preemption_timer = false;
|
||||
|
|
|
@ -165,6 +165,7 @@ struct nested_vmx {
|
|||
|
||||
bool change_vmcs01_virtual_apic_mode;
|
||||
bool reload_vmcs01_apic_access_page;
|
||||
bool update_vmcs01_cpu_dirty_logging;
|
||||
|
||||
/*
|
||||
* Enlightened VMCS has been enabled. It does not mean that L1 has to
|
||||
|
@ -393,6 +394,7 @@ int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr);
|
|||
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
|
||||
void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
|
||||
u32 msr, int type, bool value);
|
||||
void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
|
||||
|
||||
static inline u8 vmx_get_rvi(void)
|
||||
{
|
||||
|
|
|
@ -5215,10 +5215,18 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
|
|||
|
||||
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
|
||||
{
|
||||
|
||||
/*
|
||||
* Flush potentially hardware-cached dirty pages to dirty_bitmap.
|
||||
* Flush all CPUs' dirty log buffers to the dirty_bitmap. Called
|
||||
* before reporting dirty_bitmap to userspace. KVM flushes the buffers
|
||||
* on all VM-Exits, thus we only need to kick running vCPUs to force a
|
||||
* VM-Exit.
|
||||
*/
|
||||
static_call_cond(kvm_x86_flush_log_dirty)(kvm);
|
||||
struct kvm_vcpu *vcpu;
|
||||
int i;
|
||||
|
||||
kvm_for_each_vcpu(i, vcpu, kvm)
|
||||
kvm_vcpu_kick(vcpu);
|
||||
}
|
||||
|
||||
int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
|
||||
|
@ -8980,6 +8988,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
|
|||
kvm_check_async_pf_completion(vcpu);
|
||||
if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
|
||||
static_call(kvm_x86_msr_filter_changed)(vcpu);
|
||||
|
||||
if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
|
||||
static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
|
||||
}
|
||||
|
||||
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
|
||||
|
@ -10748,75 +10759,96 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
|
|||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
|
||||
{
|
||||
struct kvm_arch *ka = &kvm->arch;
|
||||
|
||||
if (!kvm_x86_ops.cpu_dirty_log_size)
|
||||
return;
|
||||
|
||||
if ((enable && ++ka->cpu_dirty_logging_count == 1) ||
|
||||
(!enable && --ka->cpu_dirty_logging_count == 0))
|
||||
kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING);
|
||||
|
||||
WARN_ON_ONCE(ka->cpu_dirty_logging_count < 0);
|
||||
}
|
||||
|
||||
static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
|
||||
struct kvm_memory_slot *old,
|
||||
struct kvm_memory_slot *new,
|
||||
enum kvm_mr_change change)
|
||||
{
|
||||
bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES;
|
||||
|
||||
/*
|
||||
* Nothing to do for RO slots or CREATE/MOVE/DELETE of a slot.
|
||||
* See comments below.
|
||||
* Update CPU dirty logging if dirty logging is being toggled. This
|
||||
* applies to all operations.
|
||||
*/
|
||||
if ((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES)
|
||||
kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages);
|
||||
|
||||
/*
|
||||
* Nothing more to do for RO slots (which can't be dirtied and can't be
|
||||
* made writable) or CREATE/MOVE/DELETE of a slot.
|
||||
*
|
||||
* For a memslot with dirty logging disabled:
|
||||
* CREATE: No dirty mappings will already exist.
|
||||
* MOVE/DELETE: The old mappings will already have been cleaned up by
|
||||
* kvm_arch_flush_shadow_memslot()
|
||||
*
|
||||
* For a memslot with dirty logging enabled:
|
||||
* CREATE: No shadow pages exist, thus nothing to write-protect
|
||||
* and no dirty bits to clear.
|
||||
* MOVE/DELETE: The old mappings will already have been cleaned up by
|
||||
* kvm_arch_flush_shadow_memslot().
|
||||
*/
|
||||
if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Dirty logging tracks sptes in 4k granularity, meaning that large
|
||||
* sptes have to be split. If live migration is successful, the guest
* in the source machine will be destroyed and large sptes will be
* created in the destination. However, if the guest continues to run
* in the source machine (for example if live migration fails), small
* sptes will remain around and cause bad performance.
*
* Scan sptes if dirty logging has been stopped, dropping those
* which can be collapsed into a single large-page spte. Later
* page faults will create the large-page sptes.
*
* There is no need to do this in any of the following cases:
* CREATE: No dirty mappings will already exist.
* MOVE/DELETE: The old mappings will already have been cleaned up by
* kvm_arch_flush_shadow_memslot()
* READONLY and non-flags changes were filtered out above, and the only
* other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty
* logging isn't being toggled on or off.
*/
if ((old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
if (WARN_ON_ONCE(!((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES)))
return;

if (!log_dirty_pages) {
/*
* Dirty logging tracks sptes in 4k granularity, meaning that
* large sptes have to be split. If live migration succeeds,
* the guest in the source machine will be destroyed and large
* sptes will be created in the destination. However, if the
* guest continues to run in the source machine (for example if
* live migration fails), small sptes will remain around and
* cause bad performance.
*
* Scan sptes if dirty logging has been stopped, dropping those
* which can be collapsed into a single large-page spte. Later
* page faults will create the large-page sptes.
*/
kvm_mmu_zap_collapsible_sptes(kvm, new);
} else {
/* By default, write-protect everything to log writes. */
int level = PG_LEVEL_4K;

/*
* Enable or disable dirty logging for the slot.
*
* For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of the old
* slot have been zapped so no dirty logging updates are needed for
* the old slot.
* For KVM_MR_CREATE and KVM_MR_MOVE, once the new slot is visible
* any mappings that might be created in it will consume the
* properties of the new slot and do not need to be updated here.
*
* When PML is enabled, the kvm_x86_ops dirty logging hooks are
* called to enable/disable dirty logging.
*
* When disabling dirty logging with PML enabled, the D-bit is set
* for sptes in the slot in order to prevent unnecessary GPA
* logging in the PML buffer (and potential PML buffer full VMEXIT).
* This guarantees leaving PML enabled for the guest's lifetime
* won't have any additional overhead from PML when the guest is
* running with dirty logging disabled.
*
* When enabling dirty logging, large sptes are write-protected
* so they can be split on first write. New large sptes cannot
* be created for this slot until the end of the logging.
* See the comments in fast_page_fault().
* For small sptes, nothing is done if the dirty log is in the
* initial-all-set state. Otherwise, depending on whether pml
* is enabled the D-bit or the W-bit will be cleared.
*/
if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
if (kvm_x86_ops.slot_enable_log_dirty) {
static_call(kvm_x86_slot_enable_log_dirty)(kvm, new);
} else {
int level =
kvm_dirty_log_manual_protect_and_init_set(kvm) ?
PG_LEVEL_2M : PG_LEVEL_4K;
if (kvm_x86_ops.cpu_dirty_log_size) {
/*
* Clear all dirty bits, unless pages are treated as
* dirty from the get-go.
*/
if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
kvm_mmu_slot_leaf_clear_dirty(kvm, new);

/*
* Write-protect large pages on write so that dirty
* logging happens at 4k granularity. No need to
* write-protect small SPTEs since write accesses are
* logged by the CPU via dirty bits.
*/
level = PG_LEVEL_2M;
} else if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
/*
* If we're with initial-all-set, we don't need
* to write protect any small page because

@@ -10825,10 +10857,9 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
* so that the page split can happen lazily on
* the first write to the huge page.
*/
kvm_mmu_slot_remove_write_access(kvm, new, level);
level = PG_LEVEL_2M;
}
} else {
static_call_cond(kvm_x86_slot_disable_log_dirty)(kvm, new);
kvm_mmu_slot_remove_write_access(kvm, new, level);
}
}
}
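For orientation, the reworked kvm_mmu_slot_apply_flags() above only acts when KVM_MEM_LOG_DIRTY_PAGES actually flips between the old and new memslot flags; the WARN_ON_ONCE() asserts exactly that with an XOR, and the two branches of log_dirty_pages then either zap collapsible sptes (disable) or write-protect / clear D-bits (enable). A minimal, self-contained sketch of just that toggle check follows; the flag bit value here is illustrative, not taken from kernel headers.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define KVM_MEM_LOG_DIRTY_PAGES (1u << 0)   /* illustrative bit value */

/* True when LOG_DIRTY_PAGES differs between the old and new flags. */
static bool dirty_logging_toggled(unsigned int old_flags, unsigned int new_flags)
{
	return (old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES;
}

int main(void)
{
	/* on -> off: toggled, so the "disable" path (zap collapsible sptes) runs */
	assert(dirty_logging_toggled(KVM_MEM_LOG_DIRTY_PAGES, 0));
	/* off -> on: toggled, so the "enable" path (write-protect) runs */
	assert(dirty_logging_toggled(0, KVM_MEM_LOG_DIRTY_PAGES));
	/* unchanged: the WARN_ON_ONCE in the hunk above would fire */
	assert(!dirty_logging_toggled(0, 0));
	printf("toggle checks passed\n");
	return 0;
}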
@@ -11,6 +11,7 @@
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/bug.h>
#include <linux/minmax.h>
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/preempt.h>

@@ -506,6 +507,8 @@ struct kvm {
struct mmu_notifier mmu_notifier;
unsigned long mmu_notifier_seq;
long mmu_notifier_count;
unsigned long mmu_notifier_range_start;
unsigned long mmu_notifier_range_end;
#endif
long tlbs_dirty;
struct list_head devices;
@@ -733,7 +736,7 @@ kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
bool atomic, bool *async, bool write_fault,
bool *writable);
bool *writable, hva_t *hva);

void kvm_release_pfn_clean(kvm_pfn_t pfn);
void kvm_release_pfn_dirty(kvm_pfn_t pfn);
@@ -1207,6 +1210,26 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
return 1;
return 0;
}

static inline int mmu_notifier_retry_hva(struct kvm *kvm,
unsigned long mmu_seq,
unsigned long hva)
{
lockdep_assert_held(&kvm->mmu_lock);
/*
* If mmu_notifier_count is non-zero, then the range maintained by
* kvm_mmu_notifier_invalidate_range_start contains all addresses that
* might be being invalidated. Note that it may include some false
* positives, due to shortcuts when handing concurrent invalidations.
*/
if (unlikely(kvm->mmu_notifier_count) &&
hva >= kvm->mmu_notifier_range_start &&
hva < kvm->mmu_notifier_range_end)
return 1;
if (kvm->mmu_notifier_seq != mmu_seq)
return 1;
return 0;
}
#endif

#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
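To see what the new mmu_notifier_retry_hva() helper buys over the plain sequence check, a small userspace model of its decision may help: retry when the faulting hva falls inside the invalidation window currently in flight, or when the notifier sequence has moved since the fault began. Only the field names mirror the header hunk above; the struct and the demo values are illustrative.

#include <stdio.h>

/* Simplified stand-in for the relevant struct kvm fields. */
struct fake_kvm {
	unsigned long mmu_notifier_seq;
	long mmu_notifier_count;
	unsigned long mmu_notifier_range_start;
	unsigned long mmu_notifier_range_end;
};

/* Mirrors the logic of mmu_notifier_retry_hva(): 1 = retry the fault. */
static int retry_hva(const struct fake_kvm *kvm, unsigned long mmu_seq,
		     unsigned long hva)
{
	if (kvm->mmu_notifier_count &&
	    hva >= kvm->mmu_notifier_range_start &&
	    hva < kvm->mmu_notifier_range_end)
		return 1;
	if (kvm->mmu_notifier_seq != mmu_seq)
		return 1;
	return 0;
}

int main(void)
{
	struct fake_kvm kvm = {
		.mmu_notifier_seq = 7,
		.mmu_notifier_count = 1,
		.mmu_notifier_range_start = 0x1000,
		.mmu_notifier_range_end = 0x2000,
	};

	/* hva inside the in-flight invalidation range: must retry */
	printf("%d\n", retry_hva(&kvm, 7, 0x1800));  /* prints 1 */
	/* hva outside the range and seq unchanged: safe to proceed */
	printf("%d\n", retry_hva(&kvm, 7, 0x3000));  /* prints 0 */
	/* seq changed since the fault began: retry regardless of hva */
	printf("%d\n", retry_hva(&kvm, 6, 0x3000));  /* prints 1 */
	return 0;
}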
@@ -33,6 +33,7 @@
/demand_paging_test
/dirty_log_test
/dirty_log_perf_test
/hardware_disable_test
/kvm_create_max_vcpus
/memslot_modification_stress_test
/set_memory_region_test
@@ -67,6 +67,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test
TEST_GEN_PROGS_x86_64 += demand_paging_test
TEST_GEN_PROGS_x86_64 += dirty_log_test
TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
TEST_GEN_PROGS_x86_64 += hardware_disable_test
TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test
TEST_GEN_PROGS_x86_64 += set_memory_region_test
@@ -0,0 +1,165 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
 * This test is intended to reproduce a crash that happens when
 * kvm_arch_hardware_disable is called and it attempts to unregister the user
 * return notifiers.
 */

#define _GNU_SOURCE

#include <fcntl.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>

#include <test_util.h>

#include "kvm_util.h"

#define VCPU_NUM 4
#define SLEEPING_THREAD_NUM (1 << 4)
#define FORK_NUM (1ULL << 9)
#define DELAY_US_MAX 2000
#define GUEST_CODE_PIO_PORT 4

sem_t *sem;

/* Arguments for the pthreads */
struct payload {
	struct kvm_vm *vm;
	uint32_t index;
};

static void guest_code(void)
{
	for (;;)
		; /* Some busy work */
	printf("Should not be reached.\n");
}

static void *run_vcpu(void *arg)
{
	struct payload *payload = (struct payload *)arg;
	struct kvm_run *state = vcpu_state(payload->vm, payload->index);

	vcpu_run(payload->vm, payload->index);

	TEST_ASSERT(false, "%s: exited with reason %d: %s\n",
		    __func__, state->exit_reason,
		    exit_reason_str(state->exit_reason));
	pthread_exit(NULL);
}

static void *sleeping_thread(void *arg)
{
	int fd;

	while (true) {
		fd = open("/dev/null", O_RDWR);
		close(fd);
	}
	TEST_ASSERT(false, "%s: exited\n", __func__);
	pthread_exit(NULL);
}

static inline void check_create_thread(pthread_t *thread, pthread_attr_t *attr,
				       void *(*f)(void *), void *arg)
{
	int r;

	r = pthread_create(thread, attr, f, arg);
	TEST_ASSERT(r == 0, "%s: failed to create thread", __func__);
}

static inline void check_set_affinity(pthread_t thread, cpu_set_t *cpu_set)
{
	int r;

	r = pthread_setaffinity_np(thread, sizeof(cpu_set_t), cpu_set);
	TEST_ASSERT(r == 0, "%s: failed set affinity", __func__);
}

static inline void check_join(pthread_t thread, void **retval)
{
	int r;

	r = pthread_join(thread, retval);
	TEST_ASSERT(r == 0, "%s: failed to join thread", __func__);
}

static void run_test(uint32_t run)
{
	struct kvm_vm *vm;
	cpu_set_t cpu_set;
	pthread_t threads[VCPU_NUM];
	pthread_t throw_away;
	struct payload payloads[VCPU_NUM];
	void *b;
	uint32_t i, j;

	CPU_ZERO(&cpu_set);
	for (i = 0; i < VCPU_NUM; i++)
		CPU_SET(i, &cpu_set);

	vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR);
	kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
	vm_create_irqchip(vm);

	fprintf(stderr, "%s: [%d] start vcpus\n", __func__, run);
	for (i = 0; i < VCPU_NUM; ++i) {
		vm_vcpu_add_default(vm, i, guest_code);
		payloads[i].vm = vm;
		payloads[i].index = i;

		check_create_thread(&threads[i], NULL, run_vcpu,
				    (void *)&payloads[i]);
		check_set_affinity(threads[i], &cpu_set);

		for (j = 0; j < SLEEPING_THREAD_NUM; ++j) {
			check_create_thread(&throw_away, NULL, sleeping_thread,
					    (void *)NULL);
			check_set_affinity(throw_away, &cpu_set);
		}
	}
	fprintf(stderr, "%s: [%d] all threads launched\n", __func__, run);
	sem_post(sem);
	for (i = 0; i < VCPU_NUM; ++i)
		check_join(threads[i], &b);
	/* Should not be reached */
	TEST_ASSERT(false, "%s: [%d] child escaped the ninja\n", __func__, run);
}

int main(int argc, char **argv)
{
	uint32_t i;
	int s, r;
	pid_t pid;

	sem = sem_open("vm_sem", O_CREAT | O_EXCL, 0644, 0);
	sem_unlink("vm_sem");

	for (i = 0; i < FORK_NUM; ++i) {
		pid = fork();
		TEST_ASSERT(pid >= 0, "%s: unable to fork", __func__);
		if (pid == 0)
			run_test(i); /* This function always exits */

		fprintf(stderr, "%s: [%d] waiting semaphore\n", __func__, i);
		sem_wait(sem);
		r = (rand() % DELAY_US_MAX) + 1;
		fprintf(stderr, "%s: [%d] waiting %dus\n", __func__, i, r);
		usleep(r);
		r = waitpid(pid, &s, WNOHANG);
		TEST_ASSERT(r != pid,
			    "%s: [%d] child exited unexpectedly status: [%d]",
			    __func__, i, s);
		fprintf(stderr, "%s: [%d] killing child\n", __func__, i);
		kill(pid, SIGKILL);
	}

	sem_destroy(sem);
	exit(0);
}
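Stripped of the KVM setup, the orchestration in the new test's main() is: fork a child that signals readiness through a process-shared semaphore once its vCPU threads are running, sleep a random amount below DELAY_US_MAX, then SIGKILL the child so hardware disable races with the user-return notifier teardown. A self-contained sketch of just that synchronization pattern (no KVM involved, names are illustrative):

#include <fcntl.h>
#include <semaphore.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	/* O_EXCL plus an immediate unlink gives a private, fork-shared semaphore. */
	sem_t *sem = sem_open("/demo_sem", O_CREAT | O_EXCL, 0644, 0);

	if (sem == SEM_FAILED) {
		perror("sem_open");
		return 1;
	}
	sem_unlink("/demo_sem");

	pid_t pid = fork();
	if (pid == 0) {
		/* Child: tell the parent we are up, then spin until killed. */
		sem_post(sem);
		for (;;)
			;
	}

	sem_wait(sem);               /* wait for the child to be running      */
	usleep((rand() % 2000) + 1); /* random sub-2ms delay, as in the test  */
	kill(pid, SIGKILL);          /* kill it mid-flight                    */
	waitpid(pid, NULL, 0);
	printf("child killed and reaped\n");
	return 0;
}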
@@ -720,7 +720,8 @@ struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid)
{
struct vcpu *vcpu = vcpu_find(vm, vcpuid);
struct kvm_cpuid2 *cpuid;
int rc, max_ent;
int max_ent;
int rc = -1;

TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
@@ -486,6 +486,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
* count is also read inside the mmu_lock critical section.
*/
kvm->mmu_notifier_count++;
if (likely(kvm->mmu_notifier_count == 1)) {
kvm->mmu_notifier_range_start = range->start;
kvm->mmu_notifier_range_end = range->end;
} else {
/*
* Fully tracking multiple concurrent ranges has dimishing
* returns. Keep things simple and just find the minimal range
* which includes the current and new ranges. As there won't be
* enough information to subtract a range after its invalidate
* completes, any ranges invalidated concurrently will
* accumulate and persist until all outstanding invalidates
* complete.
*/
kvm->mmu_notifier_range_start =
min(kvm->mmu_notifier_range_start, range->start);
kvm->mmu_notifier_range_end =
max(kvm->mmu_notifier_range_end, range->end);
}
need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
range->flags);
/* we've to flush the tlb before the pages can be freed */
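The comment in this hunk spells out the trade-off: rather than tracking each concurrent invalidation precisely, KVM keeps one [start, end) window that only grows while invalidations are outstanding, and the window is only consulted while mmu_notifier_count is non-zero. A standalone model of that accumulate-only behaviour (struct, helper names, and demo values are illustrative):

#include <stdio.h>

struct range_tracker {
	long count;               /* outstanding invalidations */
	unsigned long start, end; /* union of their ranges     */
};

static void invalidate_start(struct range_tracker *t,
			     unsigned long start, unsigned long end)
{
	if (++t->count == 1) {
		t->start = start;
		t->end = end;
	} else {
		/* Union with whatever is already being invalidated. */
		t->start = start < t->start ? start : t->start;
		t->end = end > t->end ? end : t->end;
	}
}

static void invalidate_end(struct range_tracker *t)
{
	/* The window is only meaningful while count > 0; it is never shrunk. */
	t->count--;
}

int main(void)
{
	struct range_tracker t = { 0 };

	invalidate_start(&t, 0x1000, 0x2000);
	invalidate_start(&t, 0x8000, 0x9000);
	/* Two concurrent invalidations: the tracked window now spans both. */
	printf("count=%ld range=[%#lx, %#lx)\n", t.count, t.start, t.end);

	invalidate_end(&t);
	invalidate_end(&t);
	printf("count=%ld (window ignored once count drops to zero)\n", t.count);
	return 0;
}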
@@ -2023,10 +2041,13 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,

kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
bool atomic, bool *async, bool write_fault,
bool *writable)
bool *writable, hva_t *hva)
{
unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);

if (hva)
*hva = addr;

if (addr == KVM_HVA_ERR_RO_BAD) {
if (writable)
*writable = false;

@@ -2054,19 +2075,19 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
bool *writable)
{
return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
write_fault, writable);
write_fault, writable, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);

kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
{
return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);

kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
{
return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
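Note that the existing callers above simply pass NULL for the new hva argument; only a fault path that wants to feed mmu_notifier_retry_hva() asks for it, so nobody else pays for the extra output. As a general illustration of that optional out-parameter pattern (purely a sketch, not KVM code):

#include <stdio.h>

/* Returns a result and, only if the caller asked for it, a side value. */
static int lookup(int key, long *out_extra)
{
	long extra = key * 0x1000L; /* stand-in for the returned hva */

	if (out_extra)
		*out_extra = extra; /* filled in only when requested */
	return key + 1;
}

int main(void)
{
	long extra;

	/* Caller that wants the extra value. */
	int a = lookup(3, &extra);
	printf("a=%d extra=%#lx\n", a, extra);

	/* Caller that does not care passes NULL, like the hunks above. */
	int b = lookup(4, NULL);
	printf("b=%d\n", b);
	return 0;
}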