Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM fixes from Paolo Bonzini:
 "s390:
   - optimization for the exitless interrupt support that was merged in 4.16-rc1
   - improve the branch prediction blocking for nested KVM
   - replace some jump tables with switch statements to improve expoline performance
   - fixes for multiple epoch facility

  ARM:
   - fix the interaction of userspace irqchip VMs with in-kernel irqchip VMs
   - make sure we can build 32-bit KVM/ARM with gcc-8.

  x86:
   - fixes for AMD SEV
   - fixes for Intel nested VMX, emulated UMIP and a dump_stack() on VM startup
   - fixes for async page fault migration
   - small optimization to PV TLB flush (new in 4.16-rc1)
   - syzkaller fixes

  Generic:
   - compiler warning fixes
   - syzkaller fixes
   - more improvements to the kvm_stat tool

  Two more small Spectre fixes are going to reach you via Ingo"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (40 commits)
  KVM: SVM: Fix SEV LAUNCH_SECRET command
  KVM: SVM: install RSM intercept
  KVM: SVM: no need to call access_ok() in LAUNCH_MEASURE command
  include: psp-sev: Capitalize invalid length enum
  crypto: ccp: Fix sparse, use plain integer as NULL pointer
  KVM: X86: Avoid traversing all the cpus for pv tlb flush when steal time is disabled
  x86/kvm: Make parse_no_xxx __init for kvm
  KVM: x86: fix backward migration with async_PF
  kvm: fix warning for non-x86 builds
  kvm: fix warning for CONFIG_HAVE_KVM_EVENTFD builds
  tools/kvm_stat: print 'Total' line for multiple events only
  tools/kvm_stat: group child events indented after parent
  tools/kvm_stat: separate drilldown and fields filtering
  tools/kvm_stat: eliminate extra guest/pid selection dialog
  tools/kvm_stat: mark private methods as such
  tools/kvm_stat: fix debugfs handling
  tools/kvm_stat: print error on invalid regex
  tools/kvm_stat: fix crash when filtering out all non-child trace events
  tools/kvm_stat: avoid 'is' for equality checks
  tools/kvm_stat: use a more pythonic way to iterate over dictionaries
  ...
commit d4858aaf6b
Author: Linus Torvalds
Date:   2018-02-26 09:28:35 -08:00

26 changed files with 699 additions and 517 deletions

diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt
@@ -58,6 +58,10 @@ KVM_FEATURE_PV_TLB_FLUSH || 9 || guest checks this feature bit
 || || before enabling paravirtualized
 || || tlb flush.
 ------------------------------------------------------------------------------
+KVM_FEATURE_ASYNC_PF_VMEXIT || 10 || paravirtualized async PF VM exit
+|| || can be enabled by setting bit 2
+|| || when writing to msr 0x4b564d02
+------------------------------------------------------------------------------
 KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side
 || || per-cpu warps are expected in
 || || kvmclock.

diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt
@@ -170,7 +170,8 @@ MSR_KVM_ASYNC_PF_EN: 0x4b564d02
 	when asynchronous page faults are enabled on the vcpu 0 when
 	disabled. Bit 1 is 1 if asynchronous page faults can be injected
 	when vcpu is in cpl == 0. Bit 2 is 1 if asynchronous page faults
-	are delivered to L1 as #PF vmexits.
+	are delivered to L1 as #PF vmexits. Bit 2 can be set only if
+	KVM_FEATURE_ASYNC_PF_VMEXIT is present in CPUID.

 	First 4 byte of 64 byte memory location will be written to by
 	the hypervisor at the time of asynchronous page fault (APF)
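
As an aside for readers wiring this up in a guest: the value written to MSR 0x4b564d02 packs the APF-area address and the flag bits into one u64. A minimal sketch, assuming the KVM_ASYNC_PF_* bit positions from the x86 kvm_para.h shown later in this diff (bits 0-2, as documented above); the helper name is hypothetical:

#include <stdint.h>

#define KVM_ASYNC_PF_ENABLED			(1ULL << 0)
#define KVM_ASYNC_PF_SEND_ALWAYS		(1ULL << 1)
#define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT	(1ULL << 2)

/* Hypothetical helper: compose the MSR_KVM_ASYNC_PF_EN value from the
 * guest-physical address of the 64-byte APF reason area plus flags.
 * The area is 64-byte aligned, so the low bits are free for flags. */
static uint64_t async_pf_msr_value(uint64_t apf_area_pa, int want_vmexit)
{
    uint64_t val = apf_area_pa | KVM_ASYNC_PF_ENABLED;

    /* Bit 2 may only be set when KVM_FEATURE_ASYNC_PF_VMEXIT is
     * advertised in CPUID, per the documentation change above. */
    if (want_vmexit)
        val |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
    return val;
}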

diff --git a/arch/arm/kvm/hyp/Makefile b/arch/arm/kvm/hyp/Makefile
@@ -7,6 +7,8 @@ ccflags-y += -fno-stack-protector -DDISABLE_BRANCH_PROFILING
 KVM=../../../../virt/kvm

+CFLAGS_ARMV7VE		   :=$(call cc-option, -march=armv7ve)
+
 obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o
@@ -15,7 +17,10 @@ obj-$(CONFIG_KVM_ARM_HOST) += tlb.o
 obj-$(CONFIG_KVM_ARM_HOST) += cp15-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += vfp.o
 obj-$(CONFIG_KVM_ARM_HOST) += banked-sr.o
+CFLAGS_banked-sr.o	   += $(CFLAGS_ARMV7VE)
+
 obj-$(CONFIG_KVM_ARM_HOST) += entry.o
 obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o
 obj-$(CONFIG_KVM_ARM_HOST) += switch.o
+CFLAGS_switch.o		   += $(CFLAGS_ARMV7VE)
 obj-$(CONFIG_KVM_ARM_HOST) += s2-setup.o

diff --git a/arch/arm/kvm/hyp/banked-sr.c b/arch/arm/kvm/hyp/banked-sr.c
@@ -20,6 +20,10 @@
 #include <asm/kvm_hyp.h>

+/*
+ * gcc before 4.9 doesn't understand -march=armv7ve, so we have to
+ * trick the assembler.
+ */
 __asm__(".arch_extension virt");

 void __hyp_text __banked_save_state(struct kvm_cpu_context *ctxt)

diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
@@ -22,22 +22,6 @@
 #include "trace.h"
 #include "trace-s390.h"

-static const intercept_handler_t instruction_handlers[256] = {
-	[0x01] = kvm_s390_handle_01,
-	[0x82] = kvm_s390_handle_lpsw,
-	[0x83] = kvm_s390_handle_diag,
-	[0xaa] = kvm_s390_handle_aa,
-	[0xae] = kvm_s390_handle_sigp,
-	[0xb2] = kvm_s390_handle_b2,
-	[0xb6] = kvm_s390_handle_stctl,
-	[0xb7] = kvm_s390_handle_lctl,
-	[0xb9] = kvm_s390_handle_b9,
-	[0xe3] = kvm_s390_handle_e3,
-	[0xe5] = kvm_s390_handle_e5,
-	[0xeb] = kvm_s390_handle_eb,
-};
-
 u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu)
 {
 	struct kvm_s390_sie_block *sie_block = vcpu->arch.sie_block;
@@ -129,16 +113,39 @@ static int handle_validity(struct kvm_vcpu *vcpu)
 static int handle_instruction(struct kvm_vcpu *vcpu)
 {
-	intercept_handler_t handler;
-
 	vcpu->stat.exit_instruction++;
 	trace_kvm_s390_intercept_instruction(vcpu,
 					     vcpu->arch.sie_block->ipa,
 					     vcpu->arch.sie_block->ipb);
-	handler = instruction_handlers[vcpu->arch.sie_block->ipa >> 8];
-	if (handler)
-		return handler(vcpu);
-	return -EOPNOTSUPP;
+
+	switch (vcpu->arch.sie_block->ipa >> 8) {
+	case 0x01:
+		return kvm_s390_handle_01(vcpu);
+	case 0x82:
+		return kvm_s390_handle_lpsw(vcpu);
+	case 0x83:
+		return kvm_s390_handle_diag(vcpu);
+	case 0xaa:
+		return kvm_s390_handle_aa(vcpu);
+	case 0xae:
+		return kvm_s390_handle_sigp(vcpu);
+	case 0xb2:
+		return kvm_s390_handle_b2(vcpu);
+	case 0xb6:
+		return kvm_s390_handle_stctl(vcpu);
+	case 0xb7:
+		return kvm_s390_handle_lctl(vcpu);
+	case 0xb9:
+		return kvm_s390_handle_b9(vcpu);
+	case 0xe3:
+		return kvm_s390_handle_e3(vcpu);
+	case 0xe5:
+		return kvm_s390_handle_e5(vcpu);
+	case 0xeb:
+		return kvm_s390_handle_eb(vcpu);
+	default:
+		return -EOPNOTSUPP;
+	}
 }

 static int inject_prog_on_prog_intercept(struct kvm_vcpu *vcpu)
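
A note on the expoline rationale behind this conversion (and the matching ones in priv.c below): with CONFIG_EXPOLINE, every indirect branch on s390 is routed through a retpoline-style thunk, so a handler fetched from a function-pointer table pays for a memory load plus a thunked indirect call on every intercept, while a switch lets the compiler emit direct compare-and-branch code (and -fno-jump-tables can forbid the indirect-jump fallback). A stand-alone sketch of the two dispatch shapes; all names are hypothetical, not kernel code:

#include <stdio.h>

typedef int (*handler_t)(int);

static int handle_add(int x) { return x + 1; }
static int handle_neg(int x) { return -x; }

/* Table-driven dispatch: the call target is loaded from memory, so it
 * is an indirect branch; under expolines/retpolines it detours through
 * a deliberately mispredicted thunk on every call. */
static const handler_t handlers[2] = { handle_add, handle_neg };

static int dispatch_table(int op, int x)
{
    return handlers[op] ? handlers[op](x) : -1;
}

/* Switch-driven dispatch: each case is a direct call, so the compiler
 * can emit compare-and-branch sequences with no thunk involved. */
static int dispatch_switch(int op, int x)
{
    switch (op) {
    case 0: return handle_add(x);
    case 1: return handle_neg(x);
    default: return -1;
    }
}

int main(void)
{
    printf("%d %d\n", dispatch_table(0, 41), dispatch_switch(1, 42));
    return 0;
}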

diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
@@ -169,8 +169,15 @@ static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu)
 static int ckc_irq_pending(struct kvm_vcpu *vcpu)
 {
-	if (vcpu->arch.sie_block->ckc >= kvm_s390_get_tod_clock_fast(vcpu->kvm))
+	const u64 now = kvm_s390_get_tod_clock_fast(vcpu->kvm);
+	const u64 ckc = vcpu->arch.sie_block->ckc;
+
+	if (vcpu->arch.sie_block->gcr[0] & 0x0020000000000000ul) {
+		if ((s64)ckc >= (s64)now)
+			return 0;
+	} else if (ckc >= now) {
 		return 0;
+	}
 	return ckc_interrupts_enabled(vcpu);
 }
@@ -187,12 +194,6 @@ static int cpu_timer_irq_pending(struct kvm_vcpu *vcpu)
 	return kvm_s390_get_cpu_timer(vcpu) >> 63;
 }

-static inline int is_ioirq(unsigned long irq_type)
-{
-	return ((irq_type >= IRQ_PEND_IO_ISC_7) &&
-		(irq_type <= IRQ_PEND_IO_ISC_0));
-}
-
 static uint64_t isc_to_isc_bits(int isc)
 {
 	return (0x80 >> isc) << 24;
@@ -236,10 +237,15 @@ static inline int kvm_s390_gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc)
 	return test_and_clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa);
 }

-static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu)
+static inline unsigned long pending_irqs_no_gisa(struct kvm_vcpu *vcpu)
 {
 	return vcpu->kvm->arch.float_int.pending_irqs |
-		vcpu->arch.local_int.pending_irqs |
+		vcpu->arch.local_int.pending_irqs;
+}
+
+static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu)
+{
+	return pending_irqs_no_gisa(vcpu) |
 		kvm_s390_gisa_get_ipm(vcpu->kvm->arch.gisa) << IRQ_PEND_IO_ISC_7;
 }
@@ -337,7 +343,7 @@ static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
 static void set_intercept_indicators_io(struct kvm_vcpu *vcpu)
 {
-	if (!(pending_irqs(vcpu) & IRQ_PEND_IO_MASK))
+	if (!(pending_irqs_no_gisa(vcpu) & IRQ_PEND_IO_MASK))
 		return;
 	else if (psw_ioint_disabled(vcpu))
 		kvm_s390_set_cpuflags(vcpu, CPUSTAT_IO_INT);
@@ -1011,24 +1017,6 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu,
 	return rc;
 }

-typedef int (*deliver_irq_t)(struct kvm_vcpu *vcpu);
-
-static const deliver_irq_t deliver_irq_funcs[] = {
-	[IRQ_PEND_MCHK_EX]        = __deliver_machine_check,
-	[IRQ_PEND_MCHK_REP]       = __deliver_machine_check,
-	[IRQ_PEND_PROG]           = __deliver_prog,
-	[IRQ_PEND_EXT_EMERGENCY]  = __deliver_emergency_signal,
-	[IRQ_PEND_EXT_EXTERNAL]   = __deliver_external_call,
-	[IRQ_PEND_EXT_CLOCK_COMP] = __deliver_ckc,
-	[IRQ_PEND_EXT_CPU_TIMER]  = __deliver_cpu_timer,
-	[IRQ_PEND_RESTART]        = __deliver_restart,
-	[IRQ_PEND_SET_PREFIX]     = __deliver_set_prefix,
-	[IRQ_PEND_PFAULT_INIT]    = __deliver_pfault_init,
-	[IRQ_PEND_EXT_SERVICE]    = __deliver_service,
-	[IRQ_PEND_PFAULT_DONE]    = __deliver_pfault_done,
-	[IRQ_PEND_VIRTIO]         = __deliver_virtio,
-};
-
 /* Check whether an external call is pending (deliverable or not) */
 int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu)
 {
@@ -1066,13 +1054,19 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 static u64 __calculate_sltime(struct kvm_vcpu *vcpu)
 {
-	u64 now, cputm, sltime = 0;
+	const u64 now = kvm_s390_get_tod_clock_fast(vcpu->kvm);
+	const u64 ckc = vcpu->arch.sie_block->ckc;
+	u64 cputm, sltime = 0;

 	if (ckc_interrupts_enabled(vcpu)) {
-		now = kvm_s390_get_tod_clock_fast(vcpu->kvm);
-		sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
-		/* already expired or overflow? */
-		if (!sltime || vcpu->arch.sie_block->ckc <= now)
+		if (vcpu->arch.sie_block->gcr[0] & 0x0020000000000000ul) {
+			if ((s64)now < (s64)ckc)
+				sltime = tod_to_ns((s64)ckc - (s64)now);
+		} else if (now < ckc) {
+			sltime = tod_to_ns(ckc - now);
+		}
+		/* already expired */
+		if (!sltime)
 			return 0;
 		if (cpu_timer_interrupts_enabled(vcpu)) {
 			cputm = kvm_s390_get_cpu_timer(vcpu);
@@ -1192,7 +1186,6 @@ void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu)
 int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
 {
 	struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
-	deliver_irq_t func;
 	int rc = 0;
 	unsigned long irq_type;
 	unsigned long irqs;
@@ -1212,16 +1205,57 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
 	while ((irqs = deliverable_irqs(vcpu)) && !rc) {
 		/* bits are in the reverse order of interrupt priority */
 		irq_type = find_last_bit(&irqs, IRQ_PEND_COUNT);
-		if (is_ioirq(irq_type)) {
+		switch (irq_type) {
+		case IRQ_PEND_IO_ISC_0:
+		case IRQ_PEND_IO_ISC_1:
+		case IRQ_PEND_IO_ISC_2:
+		case IRQ_PEND_IO_ISC_3:
+		case IRQ_PEND_IO_ISC_4:
+		case IRQ_PEND_IO_ISC_5:
+		case IRQ_PEND_IO_ISC_6:
+		case IRQ_PEND_IO_ISC_7:
 			rc = __deliver_io(vcpu, irq_type);
-		} else {
-			func = deliver_irq_funcs[irq_type];
-			if (!func) {
-				WARN_ON_ONCE(func == NULL);
-				clear_bit(irq_type, &li->pending_irqs);
-				continue;
-			}
-			rc = func(vcpu);
+			break;
+		case IRQ_PEND_MCHK_EX:
+		case IRQ_PEND_MCHK_REP:
+			rc = __deliver_machine_check(vcpu);
+			break;
+		case IRQ_PEND_PROG:
+			rc = __deliver_prog(vcpu);
+			break;
+		case IRQ_PEND_EXT_EMERGENCY:
+			rc = __deliver_emergency_signal(vcpu);
+			break;
+		case IRQ_PEND_EXT_EXTERNAL:
+			rc = __deliver_external_call(vcpu);
+			break;
+		case IRQ_PEND_EXT_CLOCK_COMP:
+			rc = __deliver_ckc(vcpu);
+			break;
+		case IRQ_PEND_EXT_CPU_TIMER:
+			rc = __deliver_cpu_timer(vcpu);
+			break;
+		case IRQ_PEND_RESTART:
+			rc = __deliver_restart(vcpu);
+			break;
+		case IRQ_PEND_SET_PREFIX:
+			rc = __deliver_set_prefix(vcpu);
+			break;
+		case IRQ_PEND_PFAULT_INIT:
+			rc = __deliver_pfault_init(vcpu);
+			break;
+		case IRQ_PEND_EXT_SERVICE:
+			rc = __deliver_service(vcpu);
+			break;
+		case IRQ_PEND_PFAULT_DONE:
+			rc = __deliver_pfault_done(vcpu);
+			break;
+		case IRQ_PEND_VIRTIO:
+			rc = __deliver_virtio(vcpu);
+			break;
+		default:
+			WARN_ONCE(1, "Unknown pending irq type %ld", irq_type);
+			clear_bit(irq_type, &li->pending_irqs);
 		}
 	}
@@ -1701,7 +1735,8 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type)
 		kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_STOP_INT);
 		break;
 	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
-		kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_IO_INT);
+		if (!(type & KVM_S390_INT_IO_AI_MASK && kvm->arch.gisa))
+			kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_IO_INT);
 		break;
 	default:
 		kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_EXT_INT);
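
The ckc hunks above all follow from one detail: with the multiple-epoch facility, a control-register bit (the 0x0020000000000000ul test) switches the clock comparator to signed comparison, so a plain unsigned u64 compare gives the wrong answer once the TOD clock passes 2^63. A small stand-alone sketch of the difference (hypothetical helper, not kernel code):

#include <stdint.h>
#include <stdio.h>

/* Sketch: is a clock-comparator interrupt pending, mirroring the two
 * comparison modes in ckc_irq_pending()? 'sign_control' stands in for
 * the CR0 bit tested via 0x0020000000000000ul in the patch. */
static int ckc_pending(uint64_t now, uint64_t ckc, int sign_control)
{
    if (sign_control)
        return (int64_t)ckc < (int64_t)now;  /* signed mode */
    return ckc < now;                        /* classic mode */
}

int main(void)
{
    /* A ckc just past the 2^63 wrap: the unsigned compare calls it
     * "far in the future", while the signed compare correctly treats
     * it as negative, i.e. already expired relative to a small now. */
    uint64_t now = 42, ckc = 0x8000000000000001ULL;

    printf("unsigned: %d, signed: %d\n",
           ckc_pending(now, ckc, 0), ckc_pending(now, ckc, 1));
    return 0;
}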

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
@@ -179,6 +179,28 @@ int kvm_arch_hardware_enable(void)
 static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
 			      unsigned long end);

+static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta)
+{
+	u8 delta_idx = 0;
+
+	/*
+	 * The TOD jumps by delta, we have to compensate this by adding
+	 * -delta to the epoch.
+	 */
+	delta = -delta;
+
+	/* sign-extension - we're adding to signed values below */
+	if ((s64)delta < 0)
+		delta_idx = -1;
+
+	scb->epoch += delta;
+	if (scb->ecd & ECD_MEF) {
+		scb->epdx += delta_idx;
+		if (scb->epoch < delta)
+			scb->epdx += 1;
+	}
+}
+
 /*
  * This callback is executed during stop_machine(). All CPUs are therefore
  * temporarily stopped. In order not to change guest behavior, we have to
@@ -194,13 +216,17 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val,
 	unsigned long long *delta = v;

 	list_for_each_entry(kvm, &vm_list, vm_list) {
-		kvm->arch.epoch -= *delta;
 		kvm_for_each_vcpu(i, vcpu, kvm) {
-			vcpu->arch.sie_block->epoch -= *delta;
+			kvm_clock_sync_scb(vcpu->arch.sie_block, *delta);
+			if (i == 0) {
+				kvm->arch.epoch = vcpu->arch.sie_block->epoch;
+				kvm->arch.epdx = vcpu->arch.sie_block->epdx;
+			}
 			if (vcpu->arch.cputm_enabled)
 				vcpu->arch.cputm_start += *delta;
 			if (vcpu->arch.vsie_block)
-				vcpu->arch.vsie_block->epoch -= *delta;
+				kvm_clock_sync_scb(vcpu->arch.vsie_block,
+						   *delta);
 		}
 	}
 	return NOTIFY_OK;
@@ -902,12 +928,9 @@ static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr)
 	if (copy_from_user(&gtod, (void __user *)attr->addr, sizeof(gtod)))
 		return -EFAULT;

-	if (test_kvm_facility(kvm, 139))
-		kvm_s390_set_tod_clock_ext(kvm, &gtod);
-	else if (gtod.epoch_idx == 0)
-		kvm_s390_set_tod_clock(kvm, gtod.tod);
-	else
+	if (!test_kvm_facility(kvm, 139) && gtod.epoch_idx)
 		return -EINVAL;
+	kvm_s390_set_tod_clock(kvm, &gtod);

 	VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x, TOD base: 0x%llx",
 		 gtod.epoch_idx, gtod.tod);
@@ -932,13 +955,14 @@ static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
 static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
 {
-	u64 gtod;
+	struct kvm_s390_vm_tod_clock gtod = { 0 };

-	if (copy_from_user(&gtod, (void __user *)attr->addr, sizeof(gtod)))
+	if (copy_from_user(&gtod.tod, (void __user *)attr->addr,
+			   sizeof(gtod.tod)))
 		return -EFAULT;

-	kvm_s390_set_tod_clock(kvm, gtod);
-	VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx", gtod);
+	kvm_s390_set_tod_clock(kvm, &gtod);
+	VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx", gtod.tod);
 	return 0;
 }
@@ -2389,6 +2413,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 	mutex_lock(&vcpu->kvm->lock);
 	preempt_disable();
 	vcpu->arch.sie_block->epoch = vcpu->kvm->arch.epoch;
+	vcpu->arch.sie_block->epdx = vcpu->kvm->arch.epdx;
 	preempt_enable();
 	mutex_unlock(&vcpu->kvm->lock);
 	if (!kvm_is_ucontrol(vcpu->kvm)) {
@@ -3021,8 +3046,8 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
 	return 0;
 }

-void kvm_s390_set_tod_clock_ext(struct kvm *kvm,
-				const struct kvm_s390_vm_tod_clock *gtod)
+void kvm_s390_set_tod_clock(struct kvm *kvm,
+			    const struct kvm_s390_vm_tod_clock *gtod)
 {
 	struct kvm_vcpu *vcpu;
 	struct kvm_s390_tod_clock_ext htod;
@@ -3034,10 +3059,12 @@ void kvm_s390_set_tod_clock_ext(struct kvm *kvm,
 	get_tod_clock_ext((char *)&htod);

 	kvm->arch.epoch = gtod->tod - htod.tod;
-	kvm->arch.epdx = gtod->epoch_idx - htod.epoch_idx;
-
-	if (kvm->arch.epoch > gtod->tod)
-		kvm->arch.epdx -= 1;
+	kvm->arch.epdx = 0;
+	if (test_kvm_facility(kvm, 139)) {
+		kvm->arch.epdx = gtod->epoch_idx - htod.epoch_idx;
+		if (kvm->arch.epoch > gtod->tod)
+			kvm->arch.epdx -= 1;
+	}

 	kvm_s390_vcpu_block_all(kvm);
 	kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -3050,22 +3077,6 @@ void kvm_s390_set_tod_clock_ext(struct kvm *kvm,
 	mutex_unlock(&kvm->lock);
 }

-void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod)
-{
-	struct kvm_vcpu *vcpu;
-	int i;
-
-	mutex_lock(&kvm->lock);
-	preempt_disable();
-	kvm->arch.epoch = tod - get_tod_clock();
-	kvm_s390_vcpu_block_all(kvm);
-	kvm_for_each_vcpu(i, vcpu, kvm)
-		vcpu->arch.sie_block->epoch = kvm->arch.epoch;
-	kvm_s390_vcpu_unblock_all(kvm);
-	preempt_enable();
-	mutex_unlock(&kvm->lock);
-}
-
 /**
  * kvm_arch_fault_in_page - fault-in guest page if necessary
  * @vcpu: The corresponding virtual cpu
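
kvm_clock_sync_scb() above is hand-rolled 128-bit arithmetic: the multiple-epoch facility extends the 64-bit epoch with an epoch index (epdx), so adding a signed 64-bit delta needs sign extension into the high part plus a carry when the low half wraps. A stand-alone sketch of the same carry logic (simplified types, hypothetical names):

#include <stdint.h>
#include <stdio.h>

/* Sketch of kvm_clock_sync_scb()'s carry handling: add a signed
 * 64-bit quantity to a wider value split into (epdx:epoch). */
static void epoch_add(uint8_t *epdx, uint64_t *epoch, uint64_t delta)
{
    uint8_t delta_idx = 0;

    /* Sign-extend the delta into the high byte: -1 if negative. */
    if ((int64_t)delta < 0)
        delta_idx = -1;

    *epoch += delta;
    *epdx += delta_idx;
    /* Carry out of the low 64 bits propagates into the index. */
    if (*epoch < delta)
        *epdx += 1;
}

int main(void)
{
    uint8_t epdx = 0;
    uint64_t epoch = 0xffffffffffffffffULL;

    epoch_add(&epdx, &epoch, 1);  /* low half wraps, carry into epdx */
    printf("epdx=%u epoch=%llu\n", epdx, (unsigned long long)epoch);
    return 0;
}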

diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
@@ -19,8 +19,6 @@
 #include <asm/processor.h>
 #include <asm/sclp.h>

-typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);
-
 /* Transactional Memory Execution related macros */
 #define IS_TE_ENABLED(vcpu)	((vcpu->arch.sie_block->ecb & ECB_TE))
 #define TDB_FORMAT1		1
@@ -283,9 +281,8 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);

 /* implemented in kvm-s390.c */
-void kvm_s390_set_tod_clock_ext(struct kvm *kvm,
-				const struct kvm_s390_vm_tod_clock *gtod);
-void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod);
+void kvm_s390_set_tod_clock(struct kvm *kvm,
+			    const struct kvm_s390_vm_tod_clock *gtod);
 long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
 int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
 int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr);

diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
@@ -85,9 +85,10 @@ int kvm_s390_handle_e3(struct kvm_vcpu *vcpu)
 /* Handle SCK (SET CLOCK) interception */
 static int handle_set_clock(struct kvm_vcpu *vcpu)
 {
+	struct kvm_s390_vm_tod_clock gtod = { 0 };
 	int rc;
 	u8 ar;
-	u64 op2, val;
+	u64 op2;

 	vcpu->stat.instruction_sck++;
@@ -97,12 +98,12 @@ static int handle_set_clock(struct kvm_vcpu *vcpu)
 	op2 = kvm_s390_get_base_disp_s(vcpu, &ar);
 	if (op2 & 7)	/* Operand must be on a doubleword boundary */
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-	rc = read_guest(vcpu, op2, ar, &val, sizeof(val));
+	rc = read_guest(vcpu, op2, ar, &gtod.tod, sizeof(gtod.tod));
 	if (rc)
 		return kvm_s390_inject_prog_cond(vcpu, rc);

-	VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", val);
-	kvm_s390_set_tod_clock(vcpu->kvm, val);
+	VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", gtod.tod);
+	kvm_s390_set_tod_clock(vcpu->kvm, &gtod);

 	kvm_s390_set_psw_cc(vcpu, 0);
 	return 0;
@@ -795,55 +796,60 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
 	return rc;
 }

-static const intercept_handler_t b2_handlers[256] = {
-	[0x02] = handle_stidp,
-	[0x04] = handle_set_clock,
-	[0x10] = handle_set_prefix,
-	[0x11] = handle_store_prefix,
-	[0x12] = handle_store_cpu_address,
-	[0x14] = kvm_s390_handle_vsie,
-	[0x21] = handle_ipte_interlock,
-	[0x29] = handle_iske,
-	[0x2a] = handle_rrbe,
-	[0x2b] = handle_sske,
-	[0x2c] = handle_test_block,
-	[0x30] = handle_io_inst,
-	[0x31] = handle_io_inst,
-	[0x32] = handle_io_inst,
-	[0x33] = handle_io_inst,
-	[0x34] = handle_io_inst,
-	[0x35] = handle_io_inst,
-	[0x36] = handle_io_inst,
-	[0x37] = handle_io_inst,
-	[0x38] = handle_io_inst,
-	[0x39] = handle_io_inst,
-	[0x3a] = handle_io_inst,
-	[0x3b] = handle_io_inst,
-	[0x3c] = handle_io_inst,
-	[0x50] = handle_ipte_interlock,
-	[0x56] = handle_sthyi,
-	[0x5f] = handle_io_inst,
-	[0x74] = handle_io_inst,
-	[0x76] = handle_io_inst,
-	[0x7d] = handle_stsi,
-	[0xb1] = handle_stfl,
-	[0xb2] = handle_lpswe,
-};
-
 int kvm_s390_handle_b2(struct kvm_vcpu *vcpu)
 {
-	intercept_handler_t handler;
-
-	/*
-	 * A lot of B2 instructions are priviledged. Here we check for
-	 * the privileged ones, that we can handle in the kernel.
-	 * Anything else goes to userspace.
-	 */
-	handler = b2_handlers[vcpu->arch.sie_block->ipa & 0x00ff];
-	if (handler)
-		return handler(vcpu);
-
-	return -EOPNOTSUPP;
+	switch (vcpu->arch.sie_block->ipa & 0x00ff) {
+	case 0x02:
+		return handle_stidp(vcpu);
+	case 0x04:
+		return handle_set_clock(vcpu);
+	case 0x10:
+		return handle_set_prefix(vcpu);
+	case 0x11:
+		return handle_store_prefix(vcpu);
+	case 0x12:
+		return handle_store_cpu_address(vcpu);
+	case 0x14:
+		return kvm_s390_handle_vsie(vcpu);
+	case 0x21:
+	case 0x50:
+		return handle_ipte_interlock(vcpu);
+	case 0x29:
+		return handle_iske(vcpu);
+	case 0x2a:
+		return handle_rrbe(vcpu);
+	case 0x2b:
+		return handle_sske(vcpu);
+	case 0x2c:
+		return handle_test_block(vcpu);
+	case 0x30:
+	case 0x31:
+	case 0x32:
+	case 0x33:
+	case 0x34:
+	case 0x35:
+	case 0x36:
+	case 0x37:
+	case 0x38:
+	case 0x39:
+	case 0x3a:
+	case 0x3b:
+	case 0x3c:
+	case 0x5f:
+	case 0x74:
+	case 0x76:
+		return handle_io_inst(vcpu);
+	case 0x56:
+		return handle_sthyi(vcpu);
+	case 0x7d:
+		return handle_stsi(vcpu);
+	case 0xb1:
+		return handle_stfl(vcpu);
+	case 0xb2:
+		return handle_lpswe(vcpu);
+	default:
+		return -EOPNOTSUPP;
+	}
 }

 static int handle_epsw(struct kvm_vcpu *vcpu)
@@ -1105,25 +1111,22 @@ static int handle_essa(struct kvm_vcpu *vcpu)
 	return 0;
 }

-static const intercept_handler_t b9_handlers[256] = {
-	[0x8a] = handle_ipte_interlock,
-	[0x8d] = handle_epsw,
-	[0x8e] = handle_ipte_interlock,
-	[0x8f] = handle_ipte_interlock,
-	[0xab] = handle_essa,
-	[0xaf] = handle_pfmf,
-};
-
 int kvm_s390_handle_b9(struct kvm_vcpu *vcpu)
 {
-	intercept_handler_t handler;
-
-	/* This is handled just as for the B2 instructions. */
-	handler = b9_handlers[vcpu->arch.sie_block->ipa & 0x00ff];
-	if (handler)
-		return handler(vcpu);
-
-	return -EOPNOTSUPP;
+	switch (vcpu->arch.sie_block->ipa & 0x00ff) {
+	case 0x8a:
+	case 0x8e:
+	case 0x8f:
+		return handle_ipte_interlock(vcpu);
+	case 0x8d:
+		return handle_epsw(vcpu);
+	case 0xab:
+		return handle_essa(vcpu);
+	case 0xaf:
+		return handle_pfmf(vcpu);
+	default:
+		return -EOPNOTSUPP;
+	}
 }

 int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu)
@@ -1271,22 +1274,20 @@ static int handle_stctg(struct kvm_vcpu *vcpu)
 	return rc ? kvm_s390_inject_prog_cond(vcpu, rc) : 0;
 }

-static const intercept_handler_t eb_handlers[256] = {
-	[0x2f] = handle_lctlg,
-	[0x25] = handle_stctg,
-	[0x60] = handle_ri,
-	[0x61] = handle_ri,
-	[0x62] = handle_ri,
-};
-
 int kvm_s390_handle_eb(struct kvm_vcpu *vcpu)
 {
-	intercept_handler_t handler;
-
-	handler = eb_handlers[vcpu->arch.sie_block->ipb & 0xff];
-	if (handler)
-		return handler(vcpu);
-	return -EOPNOTSUPP;
+	switch (vcpu->arch.sie_block->ipb & 0x000000ff) {
+	case 0x25:
+		return handle_stctg(vcpu);
+	case 0x2f:
+		return handle_lctlg(vcpu);
+	case 0x60:
+	case 0x61:
+	case 0x62:
+		return handle_ri(vcpu);
+	default:
+		return -EOPNOTSUPP;
+	}
 }

 static int handle_tprot(struct kvm_vcpu *vcpu)
@@ -1346,10 +1347,12 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
 int kvm_s390_handle_e5(struct kvm_vcpu *vcpu)
 {
-	/* For e5xx... instructions we only handle TPROT */
-	if ((vcpu->arch.sie_block->ipa & 0x00ff) == 0x01)
+	switch (vcpu->arch.sie_block->ipa & 0x00ff) {
+	case 0x01:
 		return handle_tprot(vcpu);
-	return -EOPNOTSUPP;
+	default:
+		return -EOPNOTSUPP;
+	}
 }

 static int handle_sckpf(struct kvm_vcpu *vcpu)
@@ -1380,17 +1383,14 @@ static int handle_ptff(struct kvm_vcpu *vcpu)
 	return 0;
 }

-static const intercept_handler_t x01_handlers[256] = {
-	[0x04] = handle_ptff,
-	[0x07] = handle_sckpf,
-};
-
 int kvm_s390_handle_01(struct kvm_vcpu *vcpu)
 {
-	intercept_handler_t handler;
-
-	handler = x01_handlers[vcpu->arch.sie_block->ipa & 0x00ff];
-	if (handler)
-		return handler(vcpu);
-	return -EOPNOTSUPP;
+	switch (vcpu->arch.sie_block->ipa & 0x00ff) {
+	case 0x04:
+		return handle_ptff(vcpu);
+	case 0x07:
+		return handle_sckpf(vcpu);
+	default:
+		return -EOPNOTSUPP;
+	}
 }

diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
@@ -821,6 +821,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 {
 	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+	int guest_bp_isolation;
 	int rc;

 	handle_last_fault(vcpu, vsie_page);
@@ -831,6 +832,20 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		s390_handle_mcck();

 	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+
+	/* save current guest state of bp isolation override */
+	guest_bp_isolation = test_thread_flag(TIF_ISOLATE_BP_GUEST);
+
+	/*
+	 * The guest is running with BPBC, so we have to force it on for our
+	 * nested guest. This is done by enabling BPBC globally, so the BPBC
+	 * control in the SCB (which the nested guest can modify) is simply
+	 * ignored.
+	 */
+	if (test_kvm_facility(vcpu->kvm, 82) &&
+	    vcpu->arch.sie_block->fpf & FPF_BPBC)
+		set_thread_flag(TIF_ISOLATE_BP_GUEST);
+
 	local_irq_disable();
 	guest_enter_irqoff();
 	local_irq_enable();
@@ -840,6 +855,11 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	local_irq_disable();
 	guest_exit_irqoff();
 	local_irq_enable();
+
+	/* restore guest state for bp isolation override */
+	if (!guest_bp_isolation)
+		clear_thread_flag(TIF_ISOLATE_BP_GUEST);
+
 	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);

 	if (rc == -EINTR) {
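
The TIF_ISOLATE_BP_GUEST handling above is a save/force/restore pattern: remember whether the flag was already set, force it for the nested-SIE window, and afterwards clear it only if it was not set before, so an outer user of the flag is never disturbed. A generic sketch of the pattern (hypothetical flag API, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical per-thread flag word standing in for thread_info flags. */
static unsigned long thread_flags;

static bool test_flag(unsigned long f)  { return thread_flags & f; }
static void set_flag(unsigned long f)   { thread_flags |= f; }
static void clear_flag(unsigned long f) { thread_flags &= ~f; }

#define FLAG_ISOLATE_BP_GUEST (1UL << 0)

static void run_nested_guest(bool guest_uses_bpbc)
{
    /* Save whether branch-prediction isolation was already forced. */
    bool was_set = test_flag(FLAG_ISOLATE_BP_GUEST);

    /* Force it on for the nested run; the nested guest's own BPBC
     * control is then simply ignored, as in do_vsie_run() above. */
    if (guest_uses_bpbc)
        set_flag(FLAG_ISOLATE_BP_GUEST);

    /* ... enter SIE / run the nested guest here ... */

    /* Restore: clear only if we were the ones who set it. */
    if (!was_set)
        clear_flag(FLAG_ISOLATE_BP_GUEST);
}

int main(void)
{
    run_nested_guest(true);
    printf("flag after run: %lu\n", thread_flags);
    return 0;
}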

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
@@ -1464,7 +1464,4 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
 #define put_smstate(type, buf, offset, val) \
 	*(type *)((buf) + (offset) - 0x7e00) = val

-void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
-		unsigned long start, unsigned long end);
-
 #endif /* _ASM_X86_KVM_HOST_H */

diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
@@ -26,6 +26,7 @@
 #define KVM_FEATURE_PV_EOI		6
 #define KVM_FEATURE_PV_UNHALT		7
 #define KVM_FEATURE_PV_TLB_FLUSH	9
+#define KVM_FEATURE_ASYNC_PF_VMEXIT	10

 /* The last 8 bits are used to indicate how to interpret the flags field
  * in pvclock structure. If no bits are set, all flags are ignored.

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
@@ -49,7 +49,7 @@
 static int kvmapf = 1;

-static int parse_no_kvmapf(char *arg)
+static int __init parse_no_kvmapf(char *arg)
 {
 	kvmapf = 0;
 	return 0;
@@ -58,7 +58,7 @@ static int parse_no_kvmapf(char *arg)
 early_param("no-kvmapf", parse_no_kvmapf);

 static int steal_acc = 1;
-static int parse_no_stealacc(char *arg)
+static int __init parse_no_stealacc(char *arg)
 {
 	steal_acc = 0;
 	return 0;
@@ -67,7 +67,7 @@ static int parse_no_stealacc(char *arg)
 early_param("no-steal-acc", parse_no_stealacc);

 static int kvmclock_vsyscall = 1;
-static int parse_no_kvmclock_vsyscall(char *arg)
+static int __init parse_no_kvmclock_vsyscall(char *arg)
 {
 	kvmclock_vsyscall = 0;
 	return 0;
@@ -341,10 +341,10 @@ static void kvm_guest_cpu_init(void)
 #endif
 		pa |= KVM_ASYNC_PF_ENABLED;

-		/* Async page fault support for L1 hypervisor is optional */
-		if (wrmsr_safe(MSR_KVM_ASYNC_PF_EN,
-			(pa | KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT) & 0xffffffff, pa >> 32) < 0)
-			wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
+		if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
+			pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
+
+		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
 		__this_cpu_write(apf_reason.enabled, 1);
 		printk(KERN_INFO"KVM setup async PF for cpu %d\n",
 		       smp_processor_id());
@@ -545,7 +545,8 @@ static void __init kvm_guest_init(void)
 		pv_time_ops.steal_clock = kvm_steal_clock;
 	}

-	if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH))
+	if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
+	    !kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))
 		pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;

 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
@@ -633,7 +634,8 @@ static __init int kvm_setup_pv_tlb_flush(void)
 {
 	int cpu;

-	if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH)) {
+	if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
+	    !kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
 		for_each_possible_cpu(cpu) {
 			zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu),
 				GFP_KERNEL, cpu_to_node(cpu));

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
@@ -607,7 +607,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			     (1 << KVM_FEATURE_PV_EOI) |
 			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
 			     (1 << KVM_FEATURE_PV_UNHALT) |
-			     (1 << KVM_FEATURE_PV_TLB_FLUSH);
+			     (1 << KVM_FEATURE_PV_TLB_FLUSH) |
+			     (1 << KVM_FEATURE_ASYNC_PF_VMEXIT);

 		if (sched_info_on())
 			entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
@@ -2165,7 +2165,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
 	 */
 	vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
 	static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
-	kvm_lapic_reset(vcpu, false);
 	kvm_iodevice_init(&apic->dev, &apic_mmio_ops);

 	return 0;

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
@@ -3029,7 +3029,7 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
 		return RET_PF_RETRY;
 	}

-	return -EFAULT;
+	return RET_PF_EMULATE;
 }

 static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
@@ -300,6 +300,8 @@ module_param(vgif, int, 0444);
 static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
 module_param(sev, int, 0444);

+static u8 rsm_ins_bytes[] = "\x0f\xaa";
+
 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
@@ -1383,6 +1385,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 	set_intercept(svm, INTERCEPT_SKINIT);
 	set_intercept(svm, INTERCEPT_WBINVD);
 	set_intercept(svm, INTERCEPT_XSETBV);
+	set_intercept(svm, INTERCEPT_RSM);

 	if (!kvm_mwait_in_guest()) {
 		set_intercept(svm, INTERCEPT_MONITOR);
@@ -3699,6 +3702,12 @@ static int emulate_on_interception(struct vcpu_svm *svm)
 	return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
 }

+static int rsm_interception(struct vcpu_svm *svm)
+{
+	return x86_emulate_instruction(&svm->vcpu, 0, 0,
+				       rsm_ins_bytes, 2) == EMULATE_DONE;
+}
+
 static int rdpmc_interception(struct vcpu_svm *svm)
 {
 	int err;
@@ -4541,7 +4550,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_MWAIT]	= mwait_interception,
 	[SVM_EXIT_XSETBV]	= xsetbv_interception,
 	[SVM_EXIT_NPF]		= npf_interception,
-	[SVM_EXIT_RSM]		= emulate_on_interception,
+	[SVM_EXIT_RSM]		= rsm_interception,
 	[SVM_EXIT_AVIC_INCOMPLETE_IPI]		= avic_incomplete_ipi_interception,
 	[SVM_EXIT_AVIC_UNACCELERATED_ACCESS]	= avic_unaccelerated_access_interception,
 };
@@ -6236,16 +6245,18 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
 static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
+	void __user *measure = (void __user *)(uintptr_t)argp->data;
 	struct kvm_sev_info *sev = &kvm->arch.sev_info;
 	struct sev_data_launch_measure *data;
 	struct kvm_sev_launch_measure params;
+	void __user *p = NULL;
 	void *blob = NULL;
 	int ret;

 	if (!sev_guest(kvm))
 		return -ENOTTY;

-	if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+	if (copy_from_user(&params, measure, sizeof(params)))
 		return -EFAULT;

 	data = kzalloc(sizeof(*data), GFP_KERNEL);
@@ -6256,17 +6267,13 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	if (!params.len)
 		goto cmd;

-	if (params.uaddr) {
+	p = (void __user *)(uintptr_t)params.uaddr;
+	if (p) {
 		if (params.len > SEV_FW_BLOB_MAX_SIZE) {
 			ret = -EINVAL;
 			goto e_free;
 		}

-		if (!access_ok(VERIFY_WRITE, params.uaddr, params.len)) {
-			ret = -EFAULT;
-			goto e_free;
-		}
-
 		ret = -ENOMEM;
 		blob = kmalloc(params.len, GFP_KERNEL);
 		if (!blob)
@@ -6290,13 +6297,13 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
 		goto e_free_blob;

 	if (blob) {
-		if (copy_to_user((void __user *)(uintptr_t)params.uaddr, blob, params.len))
+		if (copy_to_user(p, blob, params.len))
 			ret = -EFAULT;
 	}

 done:
 	params.len = data->len;
-	if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
+	if (copy_to_user(measure, &params, sizeof(params)))
 		ret = -EFAULT;
 e_free_blob:
 	kfree(blob);
@@ -6597,7 +6604,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	struct page **pages;
 	void *blob, *hdr;
 	unsigned long n;
-	int ret;
+	int ret, offset;

 	if (!sev_guest(kvm))
 		return -ENOTTY;
@@ -6623,6 +6630,10 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	if (!data)
 		goto e_unpin_memory;

+	offset = params.guest_uaddr & (PAGE_SIZE - 1);
+	data->guest_address = __sme_page_pa(pages[0]) + offset;
+	data->guest_len = params.guest_len;
+
 	blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
 	if (IS_ERR(blob)) {
 		ret = PTR_ERR(blob);
@@ -6637,8 +6648,8 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
 		ret = PTR_ERR(hdr);
 		goto e_free_blob;
 	}
-	data->trans_address = __psp_pa(blob);
-	data->trans_len = params.trans_len;
+	data->hdr_address = __psp_pa(hdr);
+	data->hdr_len = params.hdr_len;

 	data->handle = sev->handle;
 	ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error);
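
One part of the LAUNCH_SECRET fix above is plain address math: the pinned page yields a page-aligned system-physical address, and the sub-page offset of the user's guest address must be added back before handing it to the firmware. A user-space sketch of that computation (the 4 KiB page size and the function names are assumptions):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Sketch of the sev_launch_secret() fix: combine the physical address
 * of the pinned page with the byte offset of guest_uaddr inside it. */
static uint64_t guest_phys_address(uint64_t page_pa, uint64_t guest_uaddr)
{
    uint64_t offset = guest_uaddr & (PAGE_SIZE - 1);

    /* page_pa plays the role of __sme_page_pa(pages[0]); in the
     * kernel that value also carries the SME encryption bit. */
    return page_pa + offset;
}

int main(void)
{
    printf("0x%llx\n",
           (unsigned long long)guest_phys_address(0x100000, 0x7f00abc123));
    return 0;
}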

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
@@ -4485,7 +4485,8 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
 			      SECONDARY_EXEC_DESC);
 		hw_cr4 &= ~X86_CR4_UMIP;
-	} else
+	} else if (!is_guest_mode(vcpu) ||
+	           !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC))
 		vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
 				SECONDARY_EXEC_DESC);
@@ -11199,7 +11200,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	if (ret)
 		return ret;

-	if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
+	/*
+	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
+	 * by event injection, halt vcpu.
+	 */
+	if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
+	    !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK))
 		return kvm_vcpu_halt(vcpu);

 	vmx->nested.nested_run_pending = 1;

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
@@ -7975,6 +7975,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	kvm_vcpu_mtrr_init(vcpu);
 	vcpu_load(vcpu);
 	kvm_vcpu_reset(vcpu, false);
+	kvm_lapic_reset(vcpu, false);
 	kvm_mmu_setup(vcpu);
 	vcpu_put(vcpu);
 	return 0;
@@ -8460,10 +8461,8 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
 			return r;
 	}

-	if (!size) {
-		r = vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE);
-		WARN_ON(r < 0);
-	}
+	if (!size)
+		vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE);

 	return 0;
 }

diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c
@@ -211,7 +211,7 @@ static int __sev_platform_shutdown_locked(int *error)
 {
 	int ret;

-	ret = __sev_do_cmd_locked(SEV_CMD_SHUTDOWN, 0, error);
+	ret = __sev_do_cmd_locked(SEV_CMD_SHUTDOWN, NULL, error);
 	if (ret)
 		return ret;
@@ -271,7 +271,7 @@ static int sev_ioctl_do_reset(struct sev_issue_cmd *argp)
 			return rc;
 	}

-	return __sev_do_cmd_locked(SEV_CMD_FACTORY_RESET, 0, &argp->error);
+	return __sev_do_cmd_locked(SEV_CMD_FACTORY_RESET, NULL, &argp->error);
 }

 static int sev_ioctl_do_platform_status(struct sev_issue_cmd *argp)
@@ -299,7 +299,7 @@ static int sev_ioctl_do_pek_pdh_gen(int cmd, struct sev_issue_cmd *argp)
 			return rc;
 	}

-	return __sev_do_cmd_locked(cmd, 0, &argp->error);
+	return __sev_do_cmd_locked(cmd, NULL, &argp->error);
 }

 static int sev_ioctl_do_pek_csr(struct sev_issue_cmd *argp)
@@ -624,7 +624,7 @@ EXPORT_SYMBOL_GPL(sev_guest_decommission);
 int sev_guest_df_flush(int *error)
 {
-	return sev_do_cmd(SEV_CMD_DF_FLUSH, 0, error);
+	return sev_do_cmd(SEV_CMD_DF_FLUSH, NULL, error);
 }
 EXPORT_SYMBOL_GPL(sev_guest_df_flush);

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
@@ -1105,7 +1105,6 @@ static inline void kvm_irq_routing_update(struct kvm *kvm)
 {
 }
 #endif
-void kvm_arch_irq_routing_update(struct kvm *kvm);

 static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 {
@@ -1114,6 +1113,8 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 #endif /* CONFIG_HAVE_KVM_EVENTFD */

+void kvm_arch_irq_routing_update(struct kvm *kvm);
+
 static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
 {
 	/*
@@ -1272,4 +1273,7 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp,
 }
 #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */

+void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+		unsigned long start, unsigned long end);
+
 #endif

diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h
@@ -42,7 +42,7 @@ typedef enum {
 	SEV_RET_INVALID_PLATFORM_STATE,
 	SEV_RET_INVALID_GUEST_STATE,
 	SEV_RET_INAVLID_CONFIG,
-	SEV_RET_INVALID_len,
+	SEV_RET_INVALID_LEN,
 	SEV_RET_ALREADY_OWNED,
 	SEV_RET_INVALID_CERTIFICATE,
 	SEV_RET_POLICY_FAILURE,

View File

@ -33,7 +33,7 @@ import resource
import struct import struct
import re import re
import subprocess import subprocess
from collections import defaultdict from collections import defaultdict, namedtuple
VMX_EXIT_REASONS = { VMX_EXIT_REASONS = {
'EXCEPTION_NMI': 0, 'EXCEPTION_NMI': 0,
@ -228,6 +228,7 @@ IOCTL_NUMBERS = {
} }
ENCODING = locale.getpreferredencoding(False) ENCODING = locale.getpreferredencoding(False)
TRACE_FILTER = re.compile(r'^[^\(]*$')
class Arch(object): class Arch(object):
@ -260,6 +261,11 @@ class Arch(object):
return ArchX86(SVM_EXIT_REASONS) return ArchX86(SVM_EXIT_REASONS)
return return
def tracepoint_is_child(self, field):
if (TRACE_FILTER.match(field)):
return None
return field.split('(', 1)[0]
class ArchX86(Arch): class ArchX86(Arch):
def __init__(self, exit_reasons): def __init__(self, exit_reasons):
@ -267,6 +273,10 @@ class ArchX86(Arch):
self.ioctl_numbers = IOCTL_NUMBERS self.ioctl_numbers = IOCTL_NUMBERS
self.exit_reasons = exit_reasons self.exit_reasons = exit_reasons
def debugfs_is_child(self, field):
""" Returns name of parent if 'field' is a child, None otherwise """
return None
class ArchPPC(Arch): class ArchPPC(Arch):
def __init__(self): def __init__(self):
@ -282,6 +292,10 @@ class ArchPPC(Arch):
self.ioctl_numbers['SET_FILTER'] = 0x80002406 | char_ptr_size << 16 self.ioctl_numbers['SET_FILTER'] = 0x80002406 | char_ptr_size << 16
self.exit_reasons = {} self.exit_reasons = {}
def debugfs_is_child(self, field):
""" Returns name of parent if 'field' is a child, None otherwise """
return None
class ArchA64(Arch): class ArchA64(Arch):
def __init__(self): def __init__(self):
@ -289,6 +303,10 @@ class ArchA64(Arch):
self.ioctl_numbers = IOCTL_NUMBERS self.ioctl_numbers = IOCTL_NUMBERS
self.exit_reasons = AARCH64_EXIT_REASONS self.exit_reasons = AARCH64_EXIT_REASONS
def debugfs_is_child(self, field):
""" Returns name of parent if 'field' is a child, None otherwise """
return None
class ArchS390(Arch): class ArchS390(Arch):
def __init__(self): def __init__(self):
@ -296,6 +314,12 @@ class ArchS390(Arch):
self.ioctl_numbers = IOCTL_NUMBERS self.ioctl_numbers = IOCTL_NUMBERS
self.exit_reasons = None self.exit_reasons = None
def debugfs_is_child(self, field):
""" Returns name of parent if 'field' is a child, None otherwise """
if field.startswith('instruction_'):
return 'exit_instruction'
ARCH = Arch.get_arch() ARCH = Arch.get_arch()
@ -331,9 +355,6 @@ class perf_event_attr(ctypes.Structure):
PERF_TYPE_TRACEPOINT = 2 PERF_TYPE_TRACEPOINT = 2
PERF_FORMAT_GROUP = 1 << 3 PERF_FORMAT_GROUP = 1 << 3
PATH_DEBUGFS_TRACING = '/sys/kernel/debug/tracing'
PATH_DEBUGFS_KVM = '/sys/kernel/debug/kvm'
class Group(object): class Group(object):
"""Represents a perf event group.""" """Represents a perf event group."""
@ -376,8 +397,8 @@ class Event(object):
self.syscall = self.libc.syscall self.syscall = self.libc.syscall
self.name = name self.name = name
self.fd = None self.fd = None
self.setup_event(group, trace_cpu, trace_pid, trace_point, self._setup_event(group, trace_cpu, trace_pid, trace_point,
trace_filter, trace_set) trace_filter, trace_set)
def __del__(self): def __del__(self):
"""Closes the event's file descriptor. """Closes the event's file descriptor.
@ -390,7 +411,7 @@ class Event(object):
if self.fd: if self.fd:
os.close(self.fd) os.close(self.fd)
def perf_event_open(self, attr, pid, cpu, group_fd, flags): def _perf_event_open(self, attr, pid, cpu, group_fd, flags):
"""Wrapper for the sys_perf_evt_open() syscall. """Wrapper for the sys_perf_evt_open() syscall.
Used to set up performance events, returns a file descriptor or -1 Used to set up performance events, returns a file descriptor or -1
@ -409,7 +430,7 @@ class Event(object):
ctypes.c_int(pid), ctypes.c_int(cpu), ctypes.c_int(pid), ctypes.c_int(cpu),
ctypes.c_int(group_fd), ctypes.c_long(flags)) ctypes.c_int(group_fd), ctypes.c_long(flags))
def setup_event_attribute(self, trace_set, trace_point): def _setup_event_attribute(self, trace_set, trace_point):
"""Returns an initialized ctype perf_event_attr struct.""" """Returns an initialized ctype perf_event_attr struct."""
id_path = os.path.join(PATH_DEBUGFS_TRACING, 'events', trace_set, id_path = os.path.join(PATH_DEBUGFS_TRACING, 'events', trace_set,
@ -419,8 +440,8 @@ class Event(object):
event_attr.config = int(open(id_path).read()) event_attr.config = int(open(id_path).read())
return event_attr return event_attr
def setup_event(self, group, trace_cpu, trace_pid, trace_point, def _setup_event(self, group, trace_cpu, trace_pid, trace_point,
trace_filter, trace_set): trace_filter, trace_set):
"""Sets up the perf event in Linux. """Sets up the perf event in Linux.
Issues the syscall to register the event in the kernel and Issues the syscall to register the event in the kernel and
@ -428,7 +449,7 @@ class Event(object):
""" """
event_attr = self.setup_event_attribute(trace_set, trace_point) event_attr = self._setup_event_attribute(trace_set, trace_point)
# First event will be group leader. # First event will be group leader.
group_leader = -1 group_leader = -1
@ -437,8 +458,8 @@ class Event(object):
if group.events: if group.events:
group_leader = group.events[0].fd group_leader = group.events[0].fd
fd = self.perf_event_open(event_attr, trace_pid, fd = self._perf_event_open(event_attr, trace_pid,
trace_cpu, group_leader, 0) trace_cpu, group_leader, 0)
if fd == -1: if fd == -1:
err = ctypes.get_errno() err = ctypes.get_errno()
raise OSError(err, os.strerror(err), raise OSError(err, os.strerror(err),
@ -475,6 +496,10 @@ class Event(object):
class Provider(object): class Provider(object):
"""Encapsulates functionalities used by all providers.""" """Encapsulates functionalities used by all providers."""
def __init__(self, pid):
self.child_events = False
self.pid = pid
@staticmethod @staticmethod
def is_field_wanted(fields_filter, field): def is_field_wanted(fields_filter, field):
"""Indicate whether field is valid according to fields_filter.""" """Indicate whether field is valid according to fields_filter."""
@ -500,12 +525,12 @@ class TracepointProvider(Provider):
""" """
def __init__(self, pid, fields_filter): def __init__(self, pid, fields_filter):
self.group_leaders = [] self.group_leaders = []
self.filters = self.get_filters() self.filters = self._get_filters()
self.update_fields(fields_filter) self.update_fields(fields_filter)
self.pid = pid super(TracepointProvider, self).__init__(pid)
@staticmethod @staticmethod
def get_filters(): def _get_filters():
"""Returns a dict of trace events, their filter ids and """Returns a dict of trace events, their filter ids and
the values that can be filtered. the values that can be filtered.
@ -521,8 +546,8 @@ class TracepointProvider(Provider):
filters['kvm_exit'] = ('exit_reason', ARCH.exit_reasons) filters['kvm_exit'] = ('exit_reason', ARCH.exit_reasons)
return filters return filters
def get_available_fields(self): def _get_available_fields(self):
"""Returns a list of available event's of format 'event name(filter """Returns a list of available events of format 'event name(filter
name)'. name)'.
All available events have directories under All available events have directories under
@ -549,11 +574,12 @@ class TracepointProvider(Provider):
def update_fields(self, fields_filter): def update_fields(self, fields_filter):
"""Refresh fields, applying fields_filter""" """Refresh fields, applying fields_filter"""
self.fields = [field for field in self.get_available_fields() self.fields = [field for field in self._get_available_fields()
if self.is_field_wanted(fields_filter, field)] if self.is_field_wanted(fields_filter, field) or
ARCH.tracepoint_is_child(field)]
@staticmethod @staticmethod
def get_online_cpus(): def _get_online_cpus():
"""Returns a list of cpu id integers.""" """Returns a list of cpu id integers."""
def parse_int_list(list_string): def parse_int_list(list_string):
"""Returns an int list from a string of comma separated integers and """Returns an int list from a string of comma separated integers and
@ -575,17 +601,17 @@ class TracepointProvider(Provider):
cpu_string = cpu_list.readline() cpu_string = cpu_list.readline()
return parse_int_list(cpu_string) return parse_int_list(cpu_string)
def setup_traces(self): def _setup_traces(self):
"""Creates all event and group objects needed to be able to retrieve """Creates all event and group objects needed to be able to retrieve
data.""" data."""
fields = self.get_available_fields() fields = self._get_available_fields()
if self._pid > 0: if self._pid > 0:
# Fetch list of all threads of the monitored pid, as qemu # Fetch list of all threads of the monitored pid, as qemu
# starts a thread for each vcpu. # starts a thread for each vcpu.
path = os.path.join('/proc', str(self._pid), 'task') path = os.path.join('/proc', str(self._pid), 'task')
groupids = self.walkdir(path)[1] groupids = self.walkdir(path)[1]
else: else:
groupids = self.get_online_cpus() groupids = self._get_online_cpus()
# The constant is needed as a buffer for python libs, std # The constant is needed as a buffer for python libs, std
# streams and other files that the script opens. # streams and other files that the script opens.
@ -663,7 +689,7 @@ class TracepointProvider(Provider):
# The garbage collector will get rid of all Event/Group # The garbage collector will get rid of all Event/Group
# objects and open files after removing the references. # objects and open files after removing the references.
self.group_leaders = [] self.group_leaders = []
self.setup_traces() self._setup_traces()
self.fields = self._fields self.fields = self._fields
def read(self, by_guest=0): def read(self, by_guest=0):
@ -671,8 +697,12 @@ class TracepointProvider(Provider):
ret = defaultdict(int) ret = defaultdict(int)
for group in self.group_leaders: for group in self.group_leaders:
for name, val in group.read().items(): for name, val in group.read().items():
if name in self._fields: if name not in self._fields:
ret[name] += val continue
parent = ARCH.tracepoint_is_child(name)
if parent:
name += ' ' + parent
ret[name] += val
return ret return ret
def reset(self): def reset(self):
@@ -690,11 +720,11 @@ class DebugfsProvider(Provider):
         self._baseline = {}
         self.do_read = True
         self.paths = []
-        self.pid = pid
+        super(DebugfsProvider, self).__init__(pid)
         if include_past:
-            self.restore()
+            self._restore()

-    def get_available_fields(self):
+    def _get_available_fields(self):
         """"Returns a list of available fields.

         The fields are all available KVM debugfs files
@@ -704,8 +734,9 @@ class DebugfsProvider(Provider):

     def update_fields(self, fields_filter):
         """Refresh fields, applying fields_filter"""
-        self._fields = [field for field in self.get_available_fields()
-                        if self.is_field_wanted(fields_filter, field)]
+        self._fields = [field for field in self._get_available_fields()
+                        if self.is_field_wanted(fields_filter, field) or
+                        ARCH.debugfs_is_child(field)]

     @property
     def fields(self):
@@ -758,7 +789,7 @@ class DebugfsProvider(Provider):
                     paths.append(dir)
             for path in paths:
                 for field in self._fields:
-                    value = self.read_field(field, path)
+                    value = self._read_field(field, path)
                     key = path + field
                     if reset == 1:
                         self._baseline[key] = value
@@ -766,20 +797,21 @@ class DebugfsProvider(Provider):
                         self._baseline[key] = 0
                     if self._baseline.get(key, -1) == -1:
                         self._baseline[key] = value
-                    increment = (results.get(field, 0) + value -
-                                 self._baseline.get(key, 0))
-                    if by_guest:
-                        pid = key.split('-')[0]
-                        if pid in results:
-                            results[pid] += increment
-                        else:
-                            results[pid] = increment
-                    else:
+                    parent = ARCH.debugfs_is_child(field)
+                    if parent:
+                        field = field + ' ' + parent
+                    else:
+                        if by_guest:
+                            field = key.split('-')[0]  # set 'field' to 'pid'
+                    increment = value - self._baseline.get(key, 0)
+                    if field in results:
+                        results[field] += increment
+                    else:
                         results[field] = increment

         return results

-    def read_field(self, field, path):
+    def _read_field(self, field, path):
         """Returns the value of a single field from a specific VM."""
         try:
             return int(open(os.path.join(PATH_DEBUGFS_KVM,
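(Aside: the baseline bookkeeping in DebugfsProvider.read() is easy to lose inside the diff. A reduced sketch, with the reset semantics as assumed from the code above: reset=1 rebases on the current value, reset=2 rebases on 0, and a first-seen field is rebased on its current reading.)

    baseline = {}

    def read_value(key, value, reset=0):
        if reset == 1:
            baseline[key] = value      # 'r' key: count from now on
        elif reset == 2:
            baseline[key] = 0          # restore: include past values
        if key not in baseline:
            baseline[key] = value      # first sighting reports 0
        return value - baseline[key]

    print(read_value('kvm_exit', 100))     # 0
    print(read_value('kvm_exit', 130))     # 30
    print(read_value('kvm_exit', 130, 1))  # 0 again after reset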
@@ -794,12 +826,15 @@ class DebugfsProvider(Provider):
         self._baseline = {}
         self.read(1)

-    def restore(self):
+    def _restore(self):
         """Reset field counters"""
         self._baseline = {}
         self.read(2)


+EventStat = namedtuple('EventStat', ['value', 'delta'])
+
+
 class Stats(object):
     """Manages the data providers and the data they provide.
@@ -808,13 +843,13 @@ class Stats(object):
     """

     def __init__(self, options):
-        self.providers = self.get_providers(options)
+        self.providers = self._get_providers(options)
         self._pid_filter = options.pid
         self._fields_filter = options.fields
         self.values = {}
+        self._child_events = False

-    @staticmethod
-    def get_providers(options):
+    def _get_providers(self, options):
         """Returns a list of data providers depending on the passed options."""
         providers = []

@@ -826,7 +861,7 @@ class Stats(object):

         return providers

-    def update_provider_filters(self):
+    def _update_provider_filters(self):
         """Propagates fields filters to providers."""
         # As we reset the counters when updating the fields we can
         # also clear the cache of old values.
@@ -847,7 +882,7 @@ class Stats(object):
     def fields_filter(self, fields_filter):
         if fields_filter != self._fields_filter:
             self._fields_filter = fields_filter
-            self.update_provider_filters()
+            self._update_provider_filters()

     @property
     def pid_filter(self):
@@ -861,16 +896,33 @@ class Stats(object):
             for provider in self.providers:
                 provider.pid = self._pid_filter

+    @property
+    def child_events(self):
+        return self._child_events
+
+    @child_events.setter
+    def child_events(self, val):
+        self._child_events = val
+        for provider in self.providers:
+            provider.child_events = val
+
     def get(self, by_guest=0):
         """Returns a dict with field -> (value, delta to last value) of all
-        provider data."""
+        provider data.
+
+        Key formats:
+          * plain: 'key' is event name
+          * child-parent: 'key' is in format '<child> <parent>'
+          * pid: 'key' is the pid of the guest, and the record contains the
+                 aggregated event data
+        These formats are generated by the providers, and handled in class TUI.
+        """
         for provider in self.providers:
             new = provider.read(by_guest=by_guest)
-            for key in new if by_guest else provider.fields:
-                oldval = self.values.get(key, (0, 0))[0]
+            for key in new:
+                oldval = self.values.get(key, EventStat(0, 0)).value
                 newval = new.get(key, 0)
                 newdelta = newval - oldval
-                self.values[key] = (newval, newdelta)
+                self.values[key] = EventStat(newval, newdelta)
         return self.values

     def toggle_display_guests(self, to_pid):
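(Aside: a condensed sketch of the value/delta bookkeeping in Stats.get() above; each refresh records the latest reading together with its change since the previous refresh:)

    from collections import namedtuple

    EventStat = namedtuple('EventStat', ['value', 'delta'])
    values = {}

    def update(new):
        for key, newval in new.items():
            oldval = values.get(key, EventStat(0, 0)).value
            values[key] = EventStat(newval, newval - oldval)

    update({'kvm_exit': 100})
    update({'kvm_exit': 130})
    print(values['kvm_exit'])   # EventStat(value=130, delta=30)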
@@ -899,10 +951,10 @@ class Stats(object):
             self.get(to_pid)
         return 0

 DELAY_DEFAULT = 3.0
 MAX_GUEST_NAME_LEN = 48
 MAX_REGEX_LEN = 44
-DEFAULT_REGEX = r'^[^\(]*$'
 SORT_DEFAULT = 0

@@ -969,7 +1021,7 @@ class Tui(object):
         return res

-    def print_all_gnames(self, row):
+    def _print_all_gnames(self, row):
         """Print a list of all running guests along with their pids."""
         self.screen.addstr(row, 2, '%8s  %-60s' %
                            ('Pid', 'Guest Name (fuzzy list, might be '
@@ -1032,19 +1084,13 @@ class Tui(object):

         return name

-    def update_drilldown(self):
-        """Sets or removes a filter that only allows fields without braces."""
-        if not self.stats.fields_filter:
-            self.stats.fields_filter = DEFAULT_REGEX
-        elif self.stats.fields_filter == DEFAULT_REGEX:
-            self.stats.fields_filter = None
-
-    def update_pid(self, pid):
+    def _update_pid(self, pid):
         """Propagates pid selection to stats object."""
-        self.screen.addstr(4, 1, 'Updating pid filter...')
-        self.screen.refresh()
         self.stats.pid_filter = pid

-    def refresh_header(self, pid=None):
+    def _refresh_header(self, pid=None):
         """Refreshes the header."""
         if pid is None:
             pid = self.stats.pid_filter
@@ -1059,8 +1105,7 @@ class Tui(object):
                                .format(pid, gname), curses.A_BOLD)
         else:
             self.screen.addstr(0, 0, 'kvm statistics - summary', curses.A_BOLD)
-        if self.stats.fields_filter and self.stats.fields_filter \
-           != DEFAULT_REGEX:
+        if self.stats.fields_filter:
             regex = self.stats.fields_filter
             if len(regex) > MAX_REGEX_LEN:
                 regex = regex[:MAX_REGEX_LEN] + '...'
@@ -1075,56 +1120,99 @@ class Tui(object):
         self.screen.addstr(4, 1, 'Collecting data...')
         self.screen.refresh()

-    def refresh_body(self, sleeptime):
+    def _refresh_body(self, sleeptime):
+        def is_child_field(field):
+            return field.find('(') != -1
+
+        def insert_child(sorted_items, child, values, parent):
+            num = len(sorted_items)
+            for i in range(0, num):
+                # only add child if parent is present
+                if parent.startswith(sorted_items[i][0]):
+                    sorted_items.insert(i + 1, ('  ' + child, values))
+
+        def get_sorted_events(self, stats):
+            """ separate parent and child events """
+            if self._sorting == SORT_DEFAULT:
+                def sortkey((_k, v)):
+                    # sort by (delta value, overall value)
+                    return (v.delta, v.value)
+            else:
+                def sortkey((_k, v)):
+                    # sort by overall value
+                    return v.value
+            childs = []
+            sorted_items = []
+            # we can't rule out child events to appear prior to parents even
+            # when sorted - separate out all children first, and add in later
+            for key, values in sorted(stats.items(), key=sortkey,
+                                      reverse=True):
+                if values == (0, 0):
+                    continue
+                if key.find(' ') != -1:
+                    if not self.stats.child_events:
+                        continue
+                    childs.insert(0, (key, values))
+                else:
+                    sorted_items.append((key, values))
+            if self.stats.child_events:
+                for key, values in childs:
+                    (child, parent) = key.split(' ')
+                    insert_child(sorted_items, child, values, parent)
+            return sorted_items
+
         row = 3
         self.screen.move(row, 0)
         self.screen.clrtobot()
         stats = self.stats.get(self._display_guests)
-
-        def sortCurAvg(x):
-            # sort by current events if available
-            if stats[x][1]:
-                return (-stats[x][1], -stats[x][0])
-            else:
-                return (0, -stats[x][0])
-
-        def sortTotal(x):
-            # sort by totals
-            return (0, -stats[x][0])
         total = 0.
-        for key in stats.keys():
-            if key.find('(') is -1:
-                total += stats[key][0]
-        if self._sorting == SORT_DEFAULT:
-            sortkey = sortCurAvg
-        else:
-            sortkey = sortTotal
+        ctotal = 0.
+        for key, values in stats.items():
+            if self._display_guests:
+                if self.get_gname_from_pid(key):
+                    total += values.value
+                continue
+            if not key.find(' ') != -1:
+                total += values.value
+            else:
+                ctotal += values.value
+        if total == 0.:
+            # we don't have any fields, or all non-child events are filtered
+            total = ctotal
+
+        # print events
         tavg = 0
-        for key in sorted(stats.keys(), key=sortkey):
-            if row >= self.screen.getmaxyx()[0] - 1:
+        tcur = 0
+        for key, values in get_sorted_events(self, stats):
+            if row >= self.screen.getmaxyx()[0] - 1 or values == (0, 0):
                 break
-            values = stats[key]
-            if not values[0] and not values[1]:
-                break
-            if values[0] is not None:
-                cur = int(round(values[1] / sleeptime)) if values[1] else ''
-                if self._display_guests:
-                    key = self.get_gname_from_pid(key)
-                self.screen.addstr(row, 1, '%-40s %10d%7.1f %8s' %
-                                   (key, values[0], values[0] * 100 / total,
-                                    cur))
-                if cur is not '' and key.find('(') is -1:
-                    tavg += cur
+            if self._display_guests:
+                key = self.get_gname_from_pid(key)
+                if not key:
+                    continue
+            cur = int(round(values.delta / sleeptime)) if values.delta else ''
+            if key[0] != ' ':
+                if values.delta:
+                    tcur += values.delta
+                ptotal = values.value
+                ltotal = total
+            else:
+                ltotal = ptotal
+            self.screen.addstr(row, 1, '%-40s %10d%7.1f %8s' % (key,
+                               values.value,
+                               values.value * 100 / float(ltotal), cur))
             row += 1
         if row == 3:
             self.screen.addstr(4, 1, 'No matching events reported yet')
-        else:
+        if row > 4:
+            tavg = int(round(tcur / sleeptime)) if tcur > 0 else ''
             self.screen.addstr(row, 1, '%-40s %10d %8s' %
-                               ('Total', total, tavg if tavg else ''),
-                               curses.A_BOLD)
+                               ('Total', total, tavg), curses.A_BOLD)
         self.screen.refresh()
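(Aside: the sortkey helpers added above use Python 2 tuple parameters ('def sortkey((_k, v)):'), which are a syntax error on Python 3. An equivalent, version-neutral sketch of the two sort orders:)

    from collections import namedtuple

    EventStat = namedtuple('EventStat', ['value', 'delta'])
    stats = {'kvm_exit': EventStat(130, 30), 'kvm_entry': EventStat(90, 50)}

    # SORT_DEFAULT: most active events first, by (delta, value)
    print(sorted(stats.items(), key=lambda kv: (kv[1].delta, kv[1].value),
                 reverse=True))
    # alternative order ('o' key): by overall value only
    print(sorted(stats.items(), key=lambda kv: kv[1].value, reverse=True))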
-    def show_msg(self, text):
+    def _show_msg(self, text):
         """Display message centered text and exit on key press"""
         hint = 'Press any key to continue'
         curses.cbreak()
@@ -1139,16 +1227,16 @@ class Tui(object):
                            curses.A_STANDOUT)
         self.screen.getkey()

-    def show_help_interactive(self):
+    def _show_help_interactive(self):
         """Display help with list of interactive commands"""
         msg = ('   b     toggle events by guests (debugfs only, honors'
                ' filters)',
                '   c     clear filter',
                '   f     filter by regular expression',
-               '   g     filter by guest name',
+               '   g     filter by guest name/PID',
                '   h     display interactive commands reference',
                '   o     toggle sorting order (Total vs CurAvg/s)',
-               '   p     filter by PID',
+               '   p     filter by guest name/PID',
                '   q     quit',
                '   r     reset stats',
                '   s     set update interval',
@@ -1165,14 +1253,15 @@ class Tui(object):
             self.screen.addstr(row, 0, line)
             row += 1
         self.screen.getkey()
-        self.refresh_header()
+        self._refresh_header()

-    def show_filter_selection(self):
+    def _show_filter_selection(self):
         """Draws filter selection mask.

         Asks for a valid regex and sets the fields filter accordingly.
         """
+        msg = ''
         while True:
             self.screen.erase()
             self.screen.addstr(0, 0,
@@ -1181,61 +1270,25 @@ class Tui(object):
             self.screen.addstr(2, 0,
                                "Current regex: {0}"
                                .format(self.stats.fields_filter))
+            self.screen.addstr(5, 0, msg)
             self.screen.addstr(3, 0, "New regex: ")
             curses.echo()
             regex = self.screen.getstr().decode(ENCODING)
             curses.noecho()
             if len(regex) == 0:
-                self.stats.fields_filter = DEFAULT_REGEX
-                self.refresh_header()
+                self.stats.fields_filter = ''
+                self._refresh_header()
                 return
             try:
                 re.compile(regex)
                 self.stats.fields_filter = regex
-                self.refresh_header()
+                self._refresh_header()
                 return
             except re.error:
+                msg = '"' + regex + '": Not a valid regular expression'
                 continue

-    def show_vm_selection_by_pid(self):
-        """Draws PID selection mask.
-
-        Asks for a pid until a valid pid or 0 has been entered.
-        """
-        msg = ''
-        while True:
-            self.screen.erase()
-            self.screen.addstr(0, 0,
-                               'Show statistics for specific pid.',
-                               curses.A_BOLD)
-            self.screen.addstr(1, 0,
-                               'This might limit the shown data to the trace '
-                               'statistics.')
-            self.screen.addstr(5, 0, msg)
-            self.print_all_gnames(7)
-
-            curses.echo()
-            self.screen.addstr(3, 0, "Pid [0 or pid]: ")
-            pid = self.screen.getstr().decode(ENCODING)
-            curses.noecho()
-
-            try:
-                if len(pid) > 0:
-                    pid = int(pid)
-                    if pid != 0 and not os.path.isdir(os.path.join('/proc/',
-                                                                   str(pid))):
-                        msg = '"' + str(pid) + '": Not a running process'
-                        continue
-                else:
-                    pid = 0
-                self.refresh_header(pid)
-                self.update_pid(pid)
-                break
-            except ValueError:
-                msg = '"' + str(pid) + '": Not a valid pid'
-
-    def show_set_update_interval(self):
+    def _show_set_update_interval(self):
         """Draws update interval selection mask."""
         msg = ''
         while True:
@@ -1265,60 +1318,67 @@ class Tui(object):
             except ValueError:
                 msg = '"' + str(val) + '": Invalid value'
-        self.refresh_header()
+        self._refresh_header()

-    def show_vm_selection_by_guest_name(self):
+    def _show_vm_selection_by_guest(self):
         """Draws guest selection mask.

-        Asks for a guest name until a valid guest name or '' is entered.
+        Asks for a guest name or pid until a valid guest name or '' is entered.
         """
         msg = ''
         while True:
             self.screen.erase()
             self.screen.addstr(0, 0,
-                               'Show statistics for specific guest.',
+                               'Show statistics for specific guest or pid.',
                                curses.A_BOLD)
             self.screen.addstr(1, 0,
                                'This might limit the shown data to the trace '
                                'statistics.')
             self.screen.addstr(5, 0, msg)
-            self.print_all_gnames(7)
+            self._print_all_gnames(7)
             curses.echo()
-            self.screen.addstr(3, 0, "Guest [ENTER or guest]: ")
-            gname = self.screen.getstr().decode(ENCODING)
+            curses.curs_set(1)
+            self.screen.addstr(3, 0, "Guest or pid [ENTER exits]: ")
+            guest = self.screen.getstr().decode(ENCODING)
             curses.noecho()

-            if not gname:
-                self.refresh_header(0)
-                self.update_pid(0)
+            pid = 0
+            if not guest or guest == '0':
                 break
-            else:
-                pids = []
-                try:
-                    pids = self.get_pid_from_gname(gname)
-                except:
-                    msg = '"' + gname + '": Internal error while searching, ' \
-                          'use pid filter instead'
-                    continue
-                if len(pids) == 0:
-                    msg = '"' + gname + '": Not an active guest'
-                    continue
-                if len(pids) > 1:
-                    msg = '"' + gname + '": Multiple matches found, use pid ' \
-                          'filter instead'
-                    continue
-                self.refresh_header(pids[0])
-                self.update_pid(pids[0])
-                break
+            if guest.isdigit():
+                if not os.path.isdir(os.path.join('/proc/', guest)):
+                    msg = '"' + guest + '": Not a running process'
+                    continue
+                pid = int(guest)
+                break
+            pids = []
+            try:
+                pids = self.get_pid_from_gname(guest)
+            except:
+                msg = '"' + guest + '": Internal error while searching, ' \
+                      'use pid filter instead'
+                continue
+            if len(pids) == 0:
+                msg = '"' + guest + '": Not an active guest'
+                continue
+            if len(pids) > 1:
+                msg = '"' + guest + '": Multiple matches found, use pid ' \
+                      'filter instead'
+                continue
+            pid = pids[0]
+            break
+        curses.curs_set(0)
+        self._refresh_header(pid)
+        self._update_pid(pid)

     def show_stats(self):
         """Refreshes the screen and processes user input."""
         sleeptime = self._delay_initial
-        self.refresh_header()
+        self._refresh_header()
         start = 0.0  # result based on init value never appears on screen
         while True:
-            self.refresh_body(time.time() - start)
+            self._refresh_body(time.time() - start)
             curses.halfdelay(int(sleeptime * 10))
             start = time.time()
             sleeptime = self._delay_regular
@@ -1327,47 +1387,39 @@ class Tui(object):
                 if char == 'b':
                     self._display_guests = not self._display_guests
                     if self.stats.toggle_display_guests(self._display_guests):
-                        self.show_msg(['Command not available with tracepoints'
-                                       ' enabled', 'Restart with debugfs only '
-                                       '(see option \'-d\') and try again!'])
+                        self._show_msg(['Command not available with '
+                                        'tracepoints enabled', 'Restart with '
+                                        'debugfs only (see option \'-d\') and '
+                                        'try again!'])
                         self._display_guests = not self._display_guests
-                    self.refresh_header()
+                    self._refresh_header()
                 if char == 'c':
-                    self.stats.fields_filter = DEFAULT_REGEX
-                    self.refresh_header(0)
-                    self.update_pid(0)
+                    self.stats.fields_filter = ''
+                    self._refresh_header(0)
+                    self._update_pid(0)
                 if char == 'f':
                     curses.curs_set(1)
-                    self.show_filter_selection()
+                    self._show_filter_selection()
                     curses.curs_set(0)
                     sleeptime = self._delay_initial
-                if char == 'g':
-                    curses.curs_set(1)
-                    self.show_vm_selection_by_guest_name()
-                    curses.curs_set(0)
+                if char == 'g' or char == 'p':
+                    self._show_vm_selection_by_guest()
                     sleeptime = self._delay_initial
                 if char == 'h':
-                    self.show_help_interactive()
+                    self._show_help_interactive()
                 if char == 'o':
                     self._sorting = not self._sorting
-                if char == 'p':
-                    curses.curs_set(1)
-                    self.show_vm_selection_by_pid()
-                    curses.curs_set(0)
-                    sleeptime = self._delay_initial
                 if char == 'q':
                     break
                 if char == 'r':
                     self.stats.reset()
                 if char == 's':
                     curses.curs_set(1)
-                    self.show_set_update_interval()
+                    self._show_set_update_interval()
                     curses.curs_set(0)
                     sleeptime = self._delay_initial
                 if char == 'x':
-                    self.update_drilldown()
+                    self.stats.child_events = not self.stats.child_events
+                    # prevents display of current values on next refresh
+                    self.stats.get(self._display_guests)
             except KeyboardInterrupt:
                 break
             except curses.error:
@@ -1380,9 +1432,9 @@ def batch(stats):
         s = stats.get()
         time.sleep(1)
         s = stats.get()
-        for key in sorted(s.keys()):
-            values = s[key]
-            print('%-42s%10d%10d' % (key, values[0], values[1]))
+        for key, values in sorted(s.items()):
+            print('%-42s%10d%10d' % (key.split(' ')[0], values.value,
+                  values.delta))
     except KeyboardInterrupt:
         pass

@@ -1392,14 +1444,14 @@ def log(stats):
     keys = sorted(stats.get().keys())

     def banner():
-        for k in keys:
-            print(k, end=' ')
+        for key in keys:
+            print(key.split(' ')[0], end=' ')
         print()

     def statline():
         s = stats.get()
-        for k in keys:
-            print(' %9d' % s[k][1], end=' ')
+        for key in keys:
+            print(' %9d' % s[key].delta, end=' ')
         print()
     line = 0
     banner_repeat = 20
@@ -1504,7 +1556,7 @@ Press any other key to refresh statistics immediately.
                          )
     optparser.add_option('-f', '--fields',
                          action='store',
-                         default=DEFAULT_REGEX,
+                         default='',
                          dest='fields',
                          help='''fields to display (regex)
                                  "-f help" for a list of available events''',
@@ -1539,17 +1591,6 @@ Press any other key to refresh statistics immediately.

 def check_access(options):
     """Exits if the current user can't access all needed directories."""
-    if not os.path.exists('/sys/kernel/debug'):
-        sys.stderr.write('Please enable CONFIG_DEBUG_FS in your kernel.')
-        sys.exit(1)
-
-    if not os.path.exists(PATH_DEBUGFS_KVM):
-        sys.stderr.write("Please make sure, that debugfs is mounted and "
-                         "readable by the current user:\n"
-                         "('mount -t debugfs debugfs /sys/kernel/debug')\n"
-                         "Also ensure, that the kvm modules are loaded.\n")
-        sys.exit(1)
-
     if not os.path.exists(PATH_DEBUGFS_TRACING) and (options.tracepoints or
                                                      not options.debugfs):
         sys.stderr.write("Please enable CONFIG_TRACING in your kernel "
@@ -1567,7 +1608,33 @@ def check_access(options):
     return options


+def assign_globals():
+    global PATH_DEBUGFS_KVM
+    global PATH_DEBUGFS_TRACING
+
+    debugfs = ''
+    for line in file('/proc/mounts'):
+        if line.split(' ')[0] == 'debugfs':
+            debugfs = line.split(' ')[1]
+            break
+    if debugfs == '':
+        sys.stderr.write("Please make sure that CONFIG_DEBUG_FS is enabled in "
+                         "your kernel, mounted and\nreadable by the current "
+                         "user:\n"
+                         "('mount -t debugfs debugfs /sys/kernel/debug')\n")
+        sys.exit(1)
+
+    PATH_DEBUGFS_KVM = os.path.join(debugfs, 'kvm')
+    PATH_DEBUGFS_TRACING = os.path.join(debugfs, 'tracing')
+
+    if not os.path.exists(PATH_DEBUGFS_KVM):
+        sys.stderr.write("Please make sure that CONFIG_KVM is enabled in "
+                         "your kernel and that the modules are loaded.\n")
+        sys.exit(1)
+
+
 def main():
+    assign_globals()
     options = get_options()
     options = check_access(options)
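(Aside: the new assign_globals() relies on the Python 2-only file() builtin. A version-neutral sketch of the same /proc/mounts scan, assuming a Linux host:)

    import os
    import sys

    def find_debugfs():
        with open('/proc/mounts') as mounts:
            for line in mounts:
                fields = line.split(' ')
                if fields[0] == 'debugfs':
                    return fields[1]    # mount point, e.g. /sys/kernel/debug
        return ''

    debugfs = find_debugfs()
    if not debugfs:
        sys.exit('debugfs not mounted; try: '
                 'mount -t debugfs debugfs /sys/kernel/debug')
    print(os.path.join(debugfs, 'kvm'))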

View File

@@ -35,13 +35,13 @@ INTERACTIVE COMMANDS
 *f*::	filter by regular expression

-*g*::	filter by guest name
+*g*::	filter by guest name/PID

 *h*::	display interactive commands reference

 *o*::	toggle sorting order (Total vs CurAvg/s)

-*p*::	filter by PID
+*p*::	filter by guest name/PID

 *q*::	quit

View File

@@ -36,6 +36,8 @@ static struct timecounter *timecounter;
 static unsigned int host_vtimer_irq;
 static u32 host_vtimer_irq_flags;

+static DEFINE_STATIC_KEY_FALSE(has_gic_active_state);
+
 static const struct kvm_irq_level default_ptimer_irq = {
 	.irq	= 30,
 	.level	= 1,
@@ -56,6 +58,12 @@ u64 kvm_phys_timer_read(void)
 	return timecounter->cc->read(timecounter->cc);
 }

+static inline bool userspace_irqchip(struct kvm *kvm)
+{
+	return static_branch_unlikely(&userspace_irqchip_in_use) &&
+		unlikely(!irqchip_in_kernel(kvm));
+}
+
 static void soft_timer_start(struct hrtimer *hrt, u64 ns)
 {
 	hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns),
@@ -69,25 +77,6 @@ static void soft_timer_cancel(struct hrtimer *hrt, struct work_struct *work)
 		cancel_work_sync(work);
 }

-static void kvm_vtimer_update_mask_user(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-
-	/*
-	 * When using a userspace irqchip with the architected timers, we must
-	 * prevent continuously exiting from the guest, and therefore mask the
-	 * physical interrupt by disabling it on the host interrupt controller
-	 * when the virtual level is high, such that the guest can make
-	 * forward progress.  Once we detect the output level being
-	 * de-asserted, we unmask the interrupt again so that we exit from the
-	 * guest when the timer fires.
-	 */
-	if (vtimer->irq.level)
-		disable_percpu_irq(host_vtimer_irq);
-	else
-		enable_percpu_irq(host_vtimer_irq, 0);
-}
-
 static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
 {
 	struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
@@ -106,9 +95,9 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
 	if (kvm_timer_should_fire(vtimer))
 		kvm_timer_update_irq(vcpu, true, vtimer);

-	if (static_branch_unlikely(&userspace_irqchip_in_use) &&
-	    unlikely(!irqchip_in_kernel(vcpu->kvm)))
-		kvm_vtimer_update_mask_user(vcpu);
+	if (userspace_irqchip(vcpu->kvm) &&
+	    !static_branch_unlikely(&has_gic_active_state))
+		disable_percpu_irq(host_vtimer_irq);

 	return IRQ_HANDLED;
 }
@@ -290,8 +279,7 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
 	trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq,
 				   timer_ctx->irq.level);

-	if (!static_branch_unlikely(&userspace_irqchip_in_use) ||
-	    likely(irqchip_in_kernel(vcpu->kvm))) {
+	if (!userspace_irqchip(vcpu->kvm)) {
 		ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
 					  timer_ctx->irq.irq,
 					  timer_ctx->irq.level,
@@ -350,12 +338,6 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
 	phys_timer_emulate(vcpu);
 }

-static void __timer_snapshot_state(struct arch_timer_context *timer)
-{
-	timer->cnt_ctl = read_sysreg_el0(cntv_ctl);
-	timer->cnt_cval = read_sysreg_el0(cntv_cval);
-}
-
 static void vtimer_save_state(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
@@ -367,8 +349,10 @@ static void vtimer_save_state(struct kvm_vcpu *vcpu)
 	if (!vtimer->loaded)
 		goto out;

-	if (timer->enabled)
-		__timer_snapshot_state(vtimer);
+	if (timer->enabled) {
+		vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl);
+		vtimer->cnt_cval = read_sysreg_el0(cntv_cval);
+	}

 	/* Disable the virtual timer */
 	write_sysreg_el0(0, cntv_ctl);
@@ -460,23 +444,43 @@ static void set_cntvoff(u64 cntvoff)
 	kvm_call_hyp(__kvm_timer_set_cntvoff, low, high);
 }

-static void kvm_timer_vcpu_load_vgic(struct kvm_vcpu *vcpu)
+static inline void set_vtimer_irq_phys_active(struct kvm_vcpu *vcpu, bool active)
+{
+	int r;
+	r = irq_set_irqchip_state(host_vtimer_irq, IRQCHIP_STATE_ACTIVE, active);
+	WARN_ON(r);
+}
+
+static void kvm_timer_vcpu_load_gic(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
 	bool phys_active;
-	int ret;

-	phys_active = kvm_vgic_map_is_active(vcpu, vtimer->irq.irq);
-
-	ret = irq_set_irqchip_state(host_vtimer_irq,
-				    IRQCHIP_STATE_ACTIVE,
-				    phys_active);
-	WARN_ON(ret);
+	if (irqchip_in_kernel(vcpu->kvm))
+		phys_active = kvm_vgic_map_is_active(vcpu, vtimer->irq.irq);
+	else
+		phys_active = vtimer->irq.level;
+	set_vtimer_irq_phys_active(vcpu, phys_active);
 }

-static void kvm_timer_vcpu_load_user(struct kvm_vcpu *vcpu)
+static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
 {
-	kvm_vtimer_update_mask_user(vcpu);
+	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+
+	/*
+	 * When using a userspace irqchip with the architected timers and a
+	 * host interrupt controller that doesn't support an active state, we
+	 * must still prevent continuously exiting from the guest, and
+	 * therefore mask the physical interrupt by disabling it on the host
+	 * interrupt controller when the virtual level is high, such that the
+	 * guest can make forward progress.  Once we detect the output level
+	 * being de-asserted, we unmask the interrupt again so that we exit
+	 * from the guest when the timer fires.
+	 */
+	if (vtimer->irq.level)
+		disable_percpu_irq(host_vtimer_irq);
+	else
+		enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
 }

 void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
@@ -487,10 +491,10 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
 	if (unlikely(!timer->enabled))
 		return;

-	if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
-		kvm_timer_vcpu_load_user(vcpu);
+	if (static_branch_likely(&has_gic_active_state))
+		kvm_timer_vcpu_load_gic(vcpu);
 	else
-		kvm_timer_vcpu_load_vgic(vcpu);
+		kvm_timer_vcpu_load_nogic(vcpu);

 	set_cntvoff(vtimer->cntvoff);

@@ -555,18 +559,24 @@ static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);

-	if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
-		__timer_snapshot_state(vtimer);
-		if (!kvm_timer_should_fire(vtimer)) {
-			kvm_timer_update_irq(vcpu, false, vtimer);
-			kvm_vtimer_update_mask_user(vcpu);
-		}
+	if (!kvm_timer_should_fire(vtimer)) {
+		kvm_timer_update_irq(vcpu, false, vtimer);
+		if (static_branch_likely(&has_gic_active_state))
+			set_vtimer_irq_phys_active(vcpu, false);
+		else
+			enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
 	}
 }

 void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
 {
-	unmask_vtimer_irq_user(vcpu);
+	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+
+	if (unlikely(!timer->enabled))
+		return;
+
+	if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
+		unmask_vtimer_irq_user(vcpu);
 }

 int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
@@ -753,6 +763,8 @@ int kvm_timer_hyp_init(bool has_gic)
 			kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
 			goto out_free_irq;
 		}
+
+		static_branch_enable(&has_gic_active_state);
 	}

 	kvm_info("virtual timer IRQ%d\n", host_vtimer_irq);

View File

@@ -969,8 +969,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	/* Check for overlaps */
 	r = -EEXIST;
 	kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) {
-		if ((slot->id >= KVM_USER_MEM_SLOTS) ||
-		    (slot->id == id))
+		if (slot->id == id)
 			continue;
 		if (!((base_gfn + npages <= slot->base_gfn) ||
 		      (base_gfn >= slot->base_gfn + slot->npages)))