Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Paolo Bonzini:
 "First batch of KVM changes for 4.4.

  s390:
     A bunch of fixes and optimizations for interrupt and time handling.

  PPC:
     Mostly bug fixes.

  ARM:
     No big features, but many small fixes and prerequisites including:

      - a number of fixes for the arch-timer

      - introducing proper level-triggered semantics for the arch-timers

      - a series of patches to synchronously halt a guest (prerequisite
        for IRQ forwarding)

      - some tracepoint improvements

      - a tweak for the EL2 panic handlers

      - some more VGIC cleanups getting rid of redundant state

  x86:
     Quite a few changes:

      - support for VT-d posted interrupts (i.e. PCI devices can inject
        interrupts directly into vCPUs).  This introduces a new
        component (in virt/lib/) that connects VFIO and KVM together.
        The same infrastructure will be used for ARM interrupt
        forwarding as well.

      - more Hyper-V features, though the main one Hyper-V synthetic
        interrupt controller will have to wait for 4.5.  These will let
        KVM expose Hyper-V devices.

      - nested virtualization now supports VPID (same as PCID but for
        vCPUs) which makes it quite a bit faster

      - for future hardware that supports NVDIMM, there is support for
        clflushopt, clwb, pcommit

      - support for "split irqchip", i.e.  LAPIC in kernel +
        IOAPIC/PIC/PIT in userspace, which reduces the attack surface of
        the hypervisor

      - obligatory smattering of SMM fixes

      - on the guest side, stable scheduler clock support was rewritten
        to not require help from the hypervisor"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (123 commits)
  KVM: VMX: Fix commit which broke PML
  KVM: x86: obey KVM_X86_QUIRK_CD_NW_CLEARED in kvm_set_cr0()
  KVM: x86: allow RSM from 64-bit mode
  KVM: VMX: fix SMEP and SMAP without EPT
  KVM: x86: move kvm_set_irq_inatomic to legacy device assignment
  KVM: device assignment: remove pointless #ifdefs
  KVM: x86: merge kvm_arch_set_irq with kvm_set_msi_inatomic
  KVM: x86: zero apic_arb_prio on reset
  drivers/hv: share Hyper-V SynIC constants with userspace
  KVM: x86: handle SMBASE as physical address in RSM
  KVM: x86: add read_phys to x86_emulate_ops
  KVM: x86: removing unused variable
  KVM: don't pointlessly leave KVM_COMPAT=y in non-KVM configs
  KVM: arm/arm64: Merge vgic_set_lr() and vgic_sync_lr_elrsr()
  KVM: arm/arm64: Clean up vgic_retire_lr() and surroundings
  KVM: arm/arm64: Optimize away redundant LR tracking
  KVM: s390: use simple switch statement as multiplexer
  KVM: s390: drop useless newline in debugging data
  KVM: s390: SCA must not cross page boundaries
  KVM: arm: Do not indent the arguments of DECLARE_BITMAP
  ...
commit 933425fb00
Linus Torvalds, 2015-11-05 16:26:26 -08:00
89 changed files with 2956 additions and 1029 deletions


@@ -1585,6 +1585,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 		nosid	disable Source ID checking
 		no_x2apic_optout
 			BIOS x2APIC opt-out request will be ignored
+		nopost	disable Interrupt Posting

 	iomem=		Disable strict checking of access to MMIO memory
 		strict	regions from userspace.


@@ -401,10 +401,9 @@ Capability: basic
 Architectures: x86, ppc, mips
 Type: vcpu ioctl
 Parameters: struct kvm_interrupt (in)
-Returns: 0 on success, -1 on error
+Returns: 0 on success, negative on failure.

-Queues a hardware interrupt vector to be injected. This is only
-useful if in-kernel local APIC or equivalent is not used.
+Queues a hardware interrupt vector to be injected.

 /* for KVM_INTERRUPT */
 struct kvm_interrupt {
@@ -414,7 +413,14 @@ struct kvm_interrupt {
 X86:

-Note 'irq' is an interrupt vector, not an interrupt pin or line.
+Returns: 0 on success,
+	 -EEXIST if an interrupt is already enqueued
+	 -EINVAL if the irq number is invalid
+	 -ENXIO if the PIC is in the kernel
+	 -EFAULT if the pointer is invalid
+
+Note 'irq' is an interrupt vector, not an interrupt pin or line. This
+ioctl is useful if the in-kernel PIC is not used.

 PPC:
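As a rough illustration of the x86 KVM_INTERRUPT semantics documented above (a minimal sketch, not part of the patch; vcpu_fd is assumed to come from a prior KVM_CREATE_VCPU call, and from userspace the failure shows up in errno):

    #include <errno.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Inject external interrupt 'vector' into a vCPU that runs without an
     * in-kernel PIC/LAPIC.  Failures map to the errors listed above
     * (EEXIST, EINVAL, ENXIO, EFAULT). */
    static int inject_vector(int vcpu_fd, unsigned int vector)
    {
            struct kvm_interrupt irq = { .irq = vector };

            if (ioctl(vcpu_fd, KVM_INTERRUPT, &irq) < 0) {
                    perror("KVM_INTERRUPT");
                    return -errno;
            }
            return 0;
    }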
@@ -1598,7 +1604,7 @@ provided event instead of triggering an exit.
 struct kvm_ioeventfd {
 	__u64 datamatch;
 	__u64 addr;        /* legal pio/mmio address */
-	__u32 len;         /* 1, 2, 4, or 8 bytes */
+	__u32 len;         /* 0, 1, 2, 4, or 8 bytes */
 	__s32 fd;
 	__u32 flags;
 	__u8  pad[36];
@@ -1621,6 +1627,10 @@ to the registered address is equal to datamatch in struct kvm_ioeventfd.
 For virtio-ccw devices, addr contains the subchannel id and datamatch the
 virtqueue index.

+With KVM_CAP_IOEVENTFD_ANY_LENGTH, a zero length ioeventfd is allowed, and
+the kernel will ignore the length of guest write and may get a faster vmexit.
+The speedup may only apply to specific architectures, but the ioeventfd will
+work anyway.
+
 4.60 KVM_DIRTY_TLB
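To make the zero-length case described above concrete, a userspace VMM could register a length-independent ioeventfd roughly as follows (a sketch only; the doorbell address is an arbitrary example, and KVM_CAP_IOEVENTFD_ANY_LENGTH should be probed with KVM_CHECK_EXTENSION first):

    #include <sys/eventfd.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Fire an eventfd on a guest write of any size to MMIO address 'addr'.
     * With len == 0 and no DATAMATCH flag, the kernel ignores both the
     * width and the value of the write. */
    static int register_any_length_ioeventfd(int vm_fd, __u64 addr)
    {
            int efd = eventfd(0, EFD_NONBLOCK);
            struct kvm_ioeventfd ioev = {
                    .addr = addr,
                    .len  = 0,
                    .fd   = efd,
            };

            if (efd < 0)
                    return -1;
            return ioctl(vm_fd, KVM_IOEVENTFD, &ioev);
    }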
@@ -3309,6 +3319,18 @@ Valid values for 'type' are:
 		 to ignore the request, or to gather VM memory core dump and/or
 		 reset/shutdown of the VM.

+		/* KVM_EXIT_IOAPIC_EOI */
+		struct {
+			__u8 vector;
+		} eoi;
+
+Indicates that the VCPU's in-kernel local APIC received an EOI for a
+level-triggered IOAPIC interrupt.  This exit only triggers when the
+IOAPIC is implemented in userspace (i.e. KVM_CAP_SPLIT_IRQCHIP is enabled);
+the userspace IOAPIC should process the EOI and retrigger the interrupt if
+it is still asserted.  Vector is the LAPIC interrupt vector for which the
+EOI was received.
+
 		/* Fix the size of the union. */
 		char padding[256];
 	};
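A userspace IOAPIC would consume the KVM_EXIT_IOAPIC_EOI exit described above roughly like this (a sketch; userspace_ioapic_eoi() is a hypothetical stand-in for the VMM's own IOAPIC model):

    #include <linux/kvm.h>

    void userspace_ioapic_eoi(unsigned int vector);   /* hypothetical helper */

    /* Fragment of a vCPU run loop; 'run' is the mmap()ed struct kvm_run. */
    static void handle_ioapic_eoi(struct kvm_run *run)
    {
            if (run->exit_reason == KVM_EXIT_IOAPIC_EOI)
                    /* Clear remote IRR in the userspace IOAPIC and re-inject
                     * the interrupt if the line is still asserted. */
                    userspace_ioapic_eoi(run->eoi.vector);
    }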
@@ -3627,6 +3649,26 @@ struct {
 KVM handlers should exit to userspace with rc = -EREMOTE.

+7.5 KVM_CAP_SPLIT_IRQCHIP
+
+Architectures: x86
+Parameters: args[0] - number of routes reserved for userspace IOAPICs
+Returns: 0 on success, -1 on error
+
+Create a local apic for each processor in the kernel.  This can be used
+instead of KVM_CREATE_IRQCHIP if the userspace VMM wishes to emulate the
+IOAPIC and PIC (and also the PIT, even though this has to be enabled
+separately).
+
+This capability also enables in kernel routing of interrupt requests;
+when KVM_CAP_SPLIT_IRQCHIP is enabled, only routes of KVM_IRQ_ROUTING_MSI
+type are used in the IRQ routing table.  The first args[0] MSI routes are
+reserved for the IOAPIC pins.  Whenever the LAPIC receives an EOI for these
+routes, a KVM_EXIT_IOAPIC_EOI vmexit will be reported to userspace.
+
+Fails if VCPU has already been created, or if the irqchip is already in the
+kernel (i.e. KVM_CREATE_IRQCHIP has already been called).
+
 8. Other capabilities.
 ----------------------
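Enabling the KVM_CAP_SPLIT_IRQCHIP capability documented above looks roughly like the following from userspace (a sketch, not from the patch; vm_fd comes from KVM_CREATE_VM, and 24 IOAPIC pins is just a typical choice):

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Create only the in-kernel LAPICs; IOAPIC/PIC/PIT stay in userspace.
     * Must be called before any vCPU is created, and instead of
     * KVM_CREATE_IRQCHIP. */
    static int enable_split_irqchip(int vm_fd, unsigned int ioapic_pins)
    {
            struct kvm_enable_cap cap = {
                    .cap  = KVM_CAP_SPLIT_IRQCHIP,
                    .args = { ioapic_pins },  /* MSI routes reserved for IOAPIC pins */
            };

            return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
    }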


@ -0,0 +1,187 @@
KVM/ARM VGIC Forwarded Physical Interrupts
==========================================
The KVM/ARM code implements software support for the ARM Generic
Interrupt Controller's (GIC's) hardware support for virtualization by
allowing software to inject virtual interrupts to a VM, which the guest
OS sees as regular interrupts. The code is famously known as the VGIC.
Some of these virtual interrupts, however, correspond to physical
interrupts from real physical devices. One example could be the
architected timer, which itself supports virtualization, and therefore
lets a guest OS program the hardware device directly to raise an
interrupt at some point in time. When such an interrupt is raised, the
host OS initially handles the interrupt and must somehow signal this
event as a virtual interrupt to the guest. Another example could be a
passthrough device, where the physical interrupts are initially handled
by the host, but the device driver for the device lives in the guest OS
and KVM must therefore somehow inject a virtual interrupt on behalf of
the physical one to the guest OS.
These virtual interrupts corresponding to a physical interrupt on the
host are called forwarded physical interrupts, but are also sometimes
referred to as 'virtualized physical interrupts' and 'mapped interrupts'.
Forwarded physical interrupts are handled slightly differently compared
to virtual interrupts generated purely by a software emulated device.
The HW bit
----------
Virtual interrupts are signalled to the guest by programming the List
Registers (LRs) on the GIC before running a VCPU. The LR is programmed
with the virtual IRQ number and the state of the interrupt (Pending,
Active, or Pending+Active). When the guest ACKs and EOIs a virtual
interrupt, the LR state moves from Pending to Active, and finally to
inactive.
The LRs include an extra bit, called the HW bit. When this bit is set,
KVM must also program an additional field in the LR, the physical IRQ
number, to link the virtual with the physical IRQ.
When the HW bit is set, KVM must EITHER set the Pending OR the Active
bit, never both at the same time.
Setting the HW bit causes the hardware to deactivate the physical
interrupt on the physical distributor when the guest deactivates the
corresponding virtual interrupt.
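For reference, on GICv2 the fields discussed here live in the GICH_LR<n> registers; the helper below shows one plausible way to build such an LR value (an illustrative sketch based on the GICv2 architecture specification, with locally defined macros, not code from this series):

    #include <stdint.h>

    #define LR_VIRTUALID(v)    ((uint32_t)(v) & 0x3ff)          /* bits  9:0  */
    #define LR_PHYSID(p)       (((uint32_t)(p) & 0x3ff) << 10)  /* bits 19:10 */
    #define LR_STATE_PENDING   (1u << 28)
    #define LR_STATE_ACTIVE    (1u << 29)
    #define LR_HW              (1u << 31)

    /* Build an LR for a forwarded (mapped) interrupt: HW bit set, physical
     * IRQ linked, and exactly one of Pending or Active, as required above. */
    static uint32_t make_forwarded_lr(uint32_t virq, uint32_t pirq, int pending)
    {
            return LR_VIRTUALID(virq) | LR_PHYSID(pirq) | LR_HW |
                   (pending ? LR_STATE_PENDING : LR_STATE_ACTIVE);
    }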
Forwarded Physical Interrupts Life Cycle
----------------------------------------
The state of forwarded physical interrupts is managed in the following way:
- The physical interrupt is acked by the host, and becomes active on
the physical distributor (*).
- KVM sets the LR.Pending bit, because this is the only way the GICV
interface is going to present it to the guest.
- LR.Pending will stay set as long as the guest has not acked the interrupt.
- LR.Pending transitions to LR.Active on the guest read of the IAR, as
expected.
- On guest EOI, the *physical distributor* active bit gets cleared,
but the LR.Active is left untouched (set).
- KVM clears the LR on VM exits when the physical distributor
active state has been cleared.
(*): The host handling is slightly more complicated. For some forwarded
interrupts (shared), KVM directly sets the active state on the physical
distributor before entering the guest, because the interrupt is never actually
handled on the host (see details on the timer as an example below). For other
forwarded interrupts (non-shared) the host does not deactivate the interrupt
when the host ISR completes, but leaves the interrupt active until the guest
deactivates it. Leaving the interrupt active is allowed, because Linux
configures the physical GIC with EOIMode=1, which causes EOI operations to
perform a priority drop allowing the GIC to receive other interrupts of the
default priority.
Forwarded Edge and Level Triggered PPIs and SPIs
------------------------------------------------
Forwarded physical interrupts injected should always be active on the
physical distributor when injected to a guest.
Level-triggered interrupts will keep the interrupt line to the GIC
asserted, typically until the guest programs the device to deassert the
line. This means that the interrupt will remain pending on the physical
distributor until the guest has reprogrammed the device. Since we
always run the VM with interrupts enabled on the CPU, a pending
interrupt will exit the guest as soon as we switch into the guest,
preventing the guest from ever making progress as the process repeats
over and over. Therefore, the active state on the physical distributor
must be set when entering the guest, preventing the GIC from forwarding
the pending interrupt to the CPU. As soon as the guest deactivates the
interrupt, the physical line is sampled by the hardware again and the host
takes a new interrupt if and only if the physical line is still asserted.
Edge-triggered interrupts do not exhibit the same problem with
preventing guest execution that level-triggered interrupts do. One
option is to not use HW bit at all, and inject edge-triggered interrupts
from a physical device as pure virtual interrupts. But that would
potentially slow down handling of the interrupt in the guest, because a
physical interrupt occurring in the middle of the guest ISR would
preempt the guest for the host to handle the interrupt. Additionally,
if you configure the system to handle interrupts on a separate physical
core from that running your VCPU, you still have to interrupt the VCPU
to queue the pending state onto the LR, even though the guest won't use
this information until the guest ISR completes. Therefore, the HW
bit should always be set for forwarded edge-triggered interrupts. With
the HW bit set, the virtual interrupt is injected and additional
physical interrupts occurring before the guest deactivates the interrupt
simply mark the state on the physical distributor as Pending+Active. As
soon as the guest deactivates the interrupt, the host takes another
interrupt if and only if there was a physical interrupt between injecting
the forwarded interrupt to the guest and the guest deactivating the
interrupt.
Consequently, whenever we schedule a VCPU with one or more LRs with the
HW bit set, the interrupt must also be active on the physical
distributor.
Forwarded LPIs
--------------
LPIs, introduced in GICv3, are always edge-triggered and do not have an
active state. They become pending when a device signals them, and as
soon as they are acked by the CPU, they are inactive again.
It therefore doesn't make sense, and is not supported, to set the HW bit
for physical LPIs that are forwarded to a VM as virtual interrupts,
typically virtual SPIs.
For LPIs, there is no other choice than to preempt the VCPU thread if
necessary, and queue the pending state onto the LR.
Putting It Together: The Architected Timer
------------------------------------------
The architected timer is a device that signals interrupts with level
triggered semantics. The timer hardware is directly accessed by VCPUs
which program the timer to fire at some point in time. Each VCPU on a
system programs the timer to fire at different times, and therefore the
hardware is multiplexed between multiple VCPUs. This is implemented by
context-switching the timer state along with each VCPU thread.
However, this means that a scenario like the following is entirely
possible, and in fact, typical:
1. KVM runs the VCPU
2. The guest programs the time to fire in T+100
3. The guest is idle and calls WFI (wait-for-interrupts)
4. The hardware traps to the host
5. KVM stores the timer state to memory and disables the hardware timer
6. KVM schedules a soft timer to fire in T+(100 - time since step 2)
7. KVM puts the VCPU thread to sleep (on a waitqueue)
8. The soft timer fires, waking up the VCPU thread
9. KVM reprograms the timer hardware with the VCPU's values
10. KVM marks the timer interrupt as active on the physical distributor
11. KVM injects a forwarded physical interrupt to the guest
12. KVM runs the VCPU
Notice that KVM injects a forwarded physical interrupt in step 11 without
the corresponding interrupt having actually fired on the host. That is
exactly why we mark the timer interrupt as active in step 10, because
the active state on the physical distributor is part of the state
belonging to the timer hardware, which is context-switched along with
the VCPU thread.
If the guest does not idle because it is busy, the flow looks like this
instead:
1. KVM runs the VCPU
2. The guest programs the time to fire in T+100
3. At T+100 the timer fires and a physical IRQ causes the VM to exit
(note that this initially only traps to EL2 and does not run the host ISR
until KVM has returned to the host).
4. With interrupts still disabled on the CPU coming back from the guest, KVM
stores the virtual timer state to memory and disables the virtual hw timer.
5. KVM looks at the timer state (in memory) and injects a forwarded physical
interrupt because it concludes the timer has expired.
6. KVM marks the timer interrupt as active on the physical distributor
7. KVM enables the timer, enables interrupts, and runs the VCPU
Notice that again the forwarded physical interrupt is injected to the
guest without having actually been handled on the host. In this case it
is because the physical interrupt is never actually seen by the host because the
timer is disabled upon guest return, and the virtual forwarded interrupt is
injected on the KVM guest entry path.


@@ -44,28 +44,29 @@ Groups:
   Attributes:
     The attr field of kvm_device_attr encodes two values:
     bits:     | 63   ....  40 | 39 ..  32  |  31   ....    0 |
-    values:   |    reserved   |   cpu id   |      offset     |
+    values:   |    reserved   | vcpu_index |      offset     |

     All distributor regs are (rw, 32-bit)

     The offset is relative to the "Distributor base address" as defined in the
     GICv2 specs.  Getting or setting such a register has the same effect as
-    reading or writing the register on the actual hardware from the cpu
-    specified with cpu id field.  Note that most distributor fields are not
-    banked, but return the same value regardless of the cpu id used to access
-    the register.
+    reading or writing the register on the actual hardware from the cpu whose
+    index is specified with the vcpu_index field.  Note that most distributor
+    fields are not banked, but return the same value regardless of the
+    vcpu_index used to access the register.
   Limitations:
     - Priorities are not implemented, and registers are RAZ/WI
     - Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2.
   Errors:
-    -ENODEV: Getting or setting this register is not yet supported
+    -ENXIO: Getting or setting this register is not yet supported
     -EBUSY: One or more VCPUs are running
+    -EINVAL: Invalid vcpu_index supplied

   KVM_DEV_ARM_VGIC_GRP_CPU_REGS
   Attributes:
     The attr field of kvm_device_attr encodes two values:
     bits:     | 63   ....  40 | 39 ..  32  |  31   ....    0 |
-    values:   |    reserved   |   cpu id   |      offset     |
+    values:   |    reserved   | vcpu_index |      offset     |

     All CPU interface regs are (rw, 32-bit)

@@ -91,8 +92,9 @@ Groups:
     - Priorities are not implemented, and registers are RAZ/WI
     - Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2.
   Errors:
-    -ENODEV: Getting or setting this register is not yet supported
+    -ENXIO: Getting or setting this register is not yet supported
     -EBUSY: One or more VCPUs are running
+    -EINVAL: Invalid vcpu_index supplied

   KVM_DEV_ARM_VGIC_GRP_NR_IRQS
   Attributes:
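The attr encoding shown above (vcpu_index in bits 39:32, register offset in bits 31:0) can be driven from userspace roughly like this (a sketch; device_fd is assumed to come from KVM_CREATE_DEVICE with KVM_DEV_TYPE_ARM_VGIC_V2, and the group constant comes from the ARM asm/kvm.h UAPI header):

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Read one 32-bit distributor register as seen from vCPU 'vcpu_index'. */
    static int vgic_dist_read(int device_fd, unsigned int vcpu_index,
                              unsigned int offset, __u32 *val)
    {
            struct kvm_device_attr attr = {
                    .group = KVM_DEV_ARM_VGIC_GRP_DIST_REGS,
                    .attr  = ((__u64)vcpu_index << 32) | offset,
                    .addr  = (__u64)(unsigned long)val,
            };

            return ioctl(device_fd, KVM_GET_DEVICE_ATTR, &attr);
    }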


@@ -166,3 +166,15 @@ Comment: The srcu read lock must be held while accessing memslots (e.g.
 		MMIO/PIO address->device structure mapping (kvm->buses).
 		The srcu index can be stored in kvm_vcpu->srcu_idx per vcpu
 		if it is needed by multiple functions.
+
+Name:		blocked_vcpu_on_cpu_lock
+Type:		spinlock_t
+Arch:		x86
+Protects:	blocked_vcpu_on_cpu
+Comment:	This is a per-CPU lock and it is used for VT-d posted-interrupts.
+		When VT-d posted-interrupts is supported and the VM has assigned
+		devices, blocked vCPUs are put on the blocked_vcpu_on_cpu list,
+		protected by blocked_vcpu_on_cpu_lock.  When the VT-d hardware
+		issues a wakeup notification event (because an external interrupt
+		from an assigned device arrived), the vCPU is found on this list
+		and woken up.


@@ -11348,6 +11348,13 @@ L:	netdev@vger.kernel.org
 S:	Maintained
 F:	drivers/net/ethernet/via/via-velocity.*

+VIRT LIB
+M:	Alex Williamson <alex.williamson@redhat.com>
+M:	Paolo Bonzini <pbonzini@redhat.com>
+L:	kvm@vger.kernel.org
+S:	Supported
+F:	virt/lib/
+
 VIVID VIRTUAL VIDEO DRIVER
 M:	Hans Verkuil <hverkuil@xs4all.nl>
 L:	linux-media@vger.kernel.org


@@ -550,6 +550,7 @@ drivers-y	:= drivers/ sound/ firmware/
 net-y		:= net/
 libs-y		:= lib/
 core-y		:= usr/
+virt-y		:= virt/
 endif # KBUILD_EXTMOD

 ifeq ($(dot-config),1)
@@ -882,10 +883,10 @@ core-y		+= kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/

 vmlinux-dirs	:= $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
 		     $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
-		     $(net-y) $(net-m) $(libs-y) $(libs-m)))
+		     $(net-y) $(net-m) $(libs-y) $(libs-m) $(virt-y)))

 vmlinux-alldirs	:= $(sort $(vmlinux-dirs) $(patsubst %/,%,$(filter %/, \
-		     $(init-) $(core-) $(drivers-) $(net-) $(libs-))))
+		     $(init-) $(core-) $(drivers-) $(net-) $(libs-) $(virt-))))

 init-y		:= $(patsubst %/, %/built-in.o, $(init-y))
 core-y		:= $(patsubst %/, %/built-in.o, $(core-y))
@@ -894,14 +895,15 @@ net-y		:= $(patsubst %/, %/built-in.o, $(net-y))
 libs-y1		:= $(patsubst %/, %/lib.a, $(libs-y))
 libs-y2		:= $(patsubst %/, %/built-in.o, $(libs-y))
 libs-y		:= $(libs-y1) $(libs-y2)
+virt-y		:= $(patsubst %/, %/built-in.o, $(virt-y))

 # Externally visible symbols (used by link-vmlinux.sh)
 export KBUILD_VMLINUX_INIT := $(head-y) $(init-y)
-export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y) $(drivers-y) $(net-y)
+export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y) $(drivers-y) $(net-y) $(virt-y)
 export KBUILD_LDS          := arch/$(SRCARCH)/kernel/vmlinux.lds
 export LDFLAGS_vmlinux
 # used by scripts/pacmage/Makefile
-export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Documentation include samples scripts tools virt)
+export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Documentation include samples scripts tools)

 vmlinux-deps := $(KBUILD_LDS) $(KBUILD_VMLINUX_INIT) $(KBUILD_VMLINUX_MAIN)


@ -218,4 +218,24 @@
#define HSR_DABT_CM (1U << 8) #define HSR_DABT_CM (1U << 8)
#define HSR_DABT_EA (1U << 9) #define HSR_DABT_EA (1U << 9)
#define kvm_arm_exception_type \
{0, "RESET" }, \
{1, "UNDEFINED" }, \
{2, "SOFTWARE" }, \
{3, "PREF_ABORT" }, \
{4, "DATA_ABORT" }, \
{5, "IRQ" }, \
{6, "FIQ" }, \
{7, "HVC" }
#define HSRECN(x) { HSR_EC_##x, #x }
#define kvm_arm_exception_class \
HSRECN(UNKNOWN), HSRECN(WFI), HSRECN(CP15_32), HSRECN(CP15_64), \
HSRECN(CP14_MR), HSRECN(CP14_LS), HSRECN(CP_0_13), HSRECN(CP10_ID), \
HSRECN(JAZELLE), HSRECN(BXJ), HSRECN(CP14_64), HSRECN(SVC_HYP), \
HSRECN(HVC), HSRECN(SMC), HSRECN(IABT), HSRECN(IABT_HYP), \
HSRECN(DABT), HSRECN(DABT_HYP)
#endif /* __ARM_KVM_ARM_H__ */ #endif /* __ARM_KVM_ARM_H__ */


@ -126,7 +126,10 @@ struct kvm_vcpu_arch {
* here. * here.
*/ */
/* Don't run the guest on this vcpu */ /* vcpu power-off state */
bool power_off;
/* Don't run the guest (internal implementation need) */
bool pause; bool pause;
/* IO related fields */ /* IO related fields */


@ -46,4 +46,6 @@ config KVM_ARM_HOST
---help--- ---help---
Provides host support for ARM processors. Provides host support for ARM processors.
source drivers/vhost/Kconfig
endif # VIRTUALIZATION endif # VIRTUALIZATION


@ -271,6 +271,16 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
return kvm_timer_should_fire(vcpu); return kvm_timer_should_fire(vcpu);
} }
void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
{
kvm_timer_schedule(vcpu);
}
void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
kvm_timer_unschedule(vcpu);
}
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{ {
/* Force users to call KVM_ARM_VCPU_INIT */ /* Force users to call KVM_ARM_VCPU_INIT */
@ -308,7 +318,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state) struct kvm_mp_state *mp_state)
{ {
if (vcpu->arch.pause) if (vcpu->arch.power_off)
mp_state->mp_state = KVM_MP_STATE_STOPPED; mp_state->mp_state = KVM_MP_STATE_STOPPED;
else else
mp_state->mp_state = KVM_MP_STATE_RUNNABLE; mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
@ -321,10 +331,10 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
{ {
switch (mp_state->mp_state) { switch (mp_state->mp_state) {
case KVM_MP_STATE_RUNNABLE: case KVM_MP_STATE_RUNNABLE:
vcpu->arch.pause = false; vcpu->arch.power_off = false;
break; break;
case KVM_MP_STATE_STOPPED: case KVM_MP_STATE_STOPPED:
vcpu->arch.pause = true; vcpu->arch.power_off = true;
break; break;
default: default:
return -EINVAL; return -EINVAL;
@ -342,7 +352,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
*/ */
int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
{ {
return !!v->arch.irq_lines || kvm_vgic_vcpu_pending_irq(v); return ((!!v->arch.irq_lines || kvm_vgic_vcpu_pending_irq(v))
&& !v->arch.power_off && !v->arch.pause);
} }
/* Just ensure a guest exit from a particular CPU */ /* Just ensure a guest exit from a particular CPU */
@ -468,11 +479,38 @@ bool kvm_arch_intc_initialized(struct kvm *kvm)
return vgic_initialized(kvm); return vgic_initialized(kvm);
} }
static void vcpu_pause(struct kvm_vcpu *vcpu) static void kvm_arm_halt_guest(struct kvm *kvm) __maybe_unused;
static void kvm_arm_resume_guest(struct kvm *kvm) __maybe_unused;
static void kvm_arm_halt_guest(struct kvm *kvm)
{
int i;
struct kvm_vcpu *vcpu;
kvm_for_each_vcpu(i, vcpu, kvm)
vcpu->arch.pause = true;
force_vm_exit(cpu_all_mask);
}
static void kvm_arm_resume_guest(struct kvm *kvm)
{
int i;
struct kvm_vcpu *vcpu;
kvm_for_each_vcpu(i, vcpu, kvm) {
wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
vcpu->arch.pause = false;
wake_up_interruptible(wq);
}
}
static void vcpu_sleep(struct kvm_vcpu *vcpu)
{ {
wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
wait_event_interruptible(*wq, !vcpu->arch.pause); wait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
(!vcpu->arch.pause)));
} }
static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu) static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
@ -522,8 +560,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
update_vttbr(vcpu->kvm); update_vttbr(vcpu->kvm);
if (vcpu->arch.pause) if (vcpu->arch.power_off || vcpu->arch.pause)
vcpu_pause(vcpu); vcpu_sleep(vcpu);
/* /*
* Disarming the background timer must be done in a * Disarming the background timer must be done in a
@ -549,11 +587,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
run->exit_reason = KVM_EXIT_INTR; run->exit_reason = KVM_EXIT_INTR;
} }
if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) { if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) ||
vcpu->arch.power_off || vcpu->arch.pause) {
local_irq_enable(); local_irq_enable();
kvm_timer_sync_hwstate(vcpu);
kvm_vgic_sync_hwstate(vcpu); kvm_vgic_sync_hwstate(vcpu);
preempt_enable(); preempt_enable();
kvm_timer_sync_hwstate(vcpu);
continue; continue;
} }
@ -596,14 +635,19 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
* guest time. * guest time.
*/ */
kvm_guest_exit(); kvm_guest_exit();
trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
/*
* We must sync the timer state before the vgic state so that
* the vgic can properly sample the updated state of the
* interrupt line.
*/
kvm_timer_sync_hwstate(vcpu);
kvm_vgic_sync_hwstate(vcpu); kvm_vgic_sync_hwstate(vcpu);
preempt_enable(); preempt_enable();
kvm_timer_sync_hwstate(vcpu);
ret = handle_exit(vcpu, run, ret); ret = handle_exit(vcpu, run, ret);
} }
@ -765,12 +809,12 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
vcpu_reset_hcr(vcpu); vcpu_reset_hcr(vcpu);
/* /*
* Handle the "start in power-off" case by marking the VCPU as paused. * Handle the "start in power-off" case.
*/ */
if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features)) if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
vcpu->arch.pause = true; vcpu->arch.power_off = true;
else else
vcpu->arch.pause = false; vcpu->arch.power_off = false;
return 0; return 0;
} }


@ -63,7 +63,7 @@ static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu)
static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
{ {
vcpu->arch.pause = true; vcpu->arch.power_off = true;
} }
static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
@ -87,7 +87,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
*/ */
if (!vcpu) if (!vcpu)
return PSCI_RET_INVALID_PARAMS; return PSCI_RET_INVALID_PARAMS;
if (!vcpu->arch.pause) { if (!vcpu->arch.power_off) {
if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1) if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1)
return PSCI_RET_ALREADY_ON; return PSCI_RET_ALREADY_ON;
else else
@ -115,7 +115,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
* the general puspose registers are undefined upon CPU_ON. * the general puspose registers are undefined upon CPU_ON.
*/ */
*vcpu_reg(vcpu, 0) = context_id; *vcpu_reg(vcpu, 0) = context_id;
vcpu->arch.pause = false; vcpu->arch.power_off = false;
smp_mb(); /* Make sure the above is visible */ smp_mb(); /* Make sure the above is visible */
wq = kvm_arch_vcpu_wq(vcpu); wq = kvm_arch_vcpu_wq(vcpu);
@ -153,7 +153,7 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
mpidr = kvm_vcpu_get_mpidr_aff(tmp); mpidr = kvm_vcpu_get_mpidr_aff(tmp);
if ((mpidr & target_affinity_mask) == target_affinity) { if ((mpidr & target_affinity_mask) == target_affinity) {
matching_cpus++; matching_cpus++;
if (!tmp->arch.pause) if (!tmp->arch.power_off)
return PSCI_0_2_AFFINITY_LEVEL_ON; return PSCI_0_2_AFFINITY_LEVEL_ON;
} }
} }
@ -179,7 +179,7 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type)
* re-initialized. * re-initialized.
*/ */
kvm_for_each_vcpu(i, tmp, vcpu->kvm) { kvm_for_each_vcpu(i, tmp, vcpu->kvm) {
tmp->arch.pause = true; tmp->arch.power_off = true;
kvm_vcpu_kick(tmp); kvm_vcpu_kick(tmp);
} }


@ -25,21 +25,25 @@ TRACE_EVENT(kvm_entry,
); );
TRACE_EVENT(kvm_exit, TRACE_EVENT(kvm_exit,
TP_PROTO(unsigned int exit_reason, unsigned long vcpu_pc), TP_PROTO(int idx, unsigned int exit_reason, unsigned long vcpu_pc),
TP_ARGS(exit_reason, vcpu_pc), TP_ARGS(idx, exit_reason, vcpu_pc),
TP_STRUCT__entry( TP_STRUCT__entry(
__field( int, idx )
__field( unsigned int, exit_reason ) __field( unsigned int, exit_reason )
__field( unsigned long, vcpu_pc ) __field( unsigned long, vcpu_pc )
), ),
TP_fast_assign( TP_fast_assign(
__entry->idx = idx;
__entry->exit_reason = exit_reason; __entry->exit_reason = exit_reason;
__entry->vcpu_pc = vcpu_pc; __entry->vcpu_pc = vcpu_pc;
), ),
TP_printk("HSR_EC: 0x%04x, PC: 0x%08lx", TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx",
__print_symbolic(__entry->idx, kvm_arm_exception_type),
__entry->exit_reason, __entry->exit_reason,
__print_symbolic(__entry->exit_reason, kvm_arm_exception_class),
__entry->vcpu_pc) __entry->vcpu_pc)
); );


@ -200,4 +200,20 @@
/* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
#define HPFAR_MASK (~UL(0xf)) #define HPFAR_MASK (~UL(0xf))
#define kvm_arm_exception_type \
{0, "IRQ" }, \
{1, "TRAP" }
#define ECN(x) { ESR_ELx_EC_##x, #x }
#define kvm_arm_exception_class \
ECN(UNKNOWN), ECN(WFx), ECN(CP15_32), ECN(CP15_64), ECN(CP14_MR), \
ECN(CP14_LS), ECN(FP_ASIMD), ECN(CP10_ID), ECN(CP14_64), ECN(SVC64), \
ECN(HVC64), ECN(SMC64), ECN(SYS64), ECN(IMP_DEF), ECN(IABT_LOW), \
ECN(IABT_CUR), ECN(PC_ALIGN), ECN(DABT_LOW), ECN(DABT_CUR), \
ECN(SP_ALIGN), ECN(FP_EXC32), ECN(FP_EXC64), ECN(SERROR), \
ECN(BREAKPT_LOW), ECN(BREAKPT_CUR), ECN(SOFTSTP_LOW), \
ECN(SOFTSTP_CUR), ECN(WATCHPT_LOW), ECN(WATCHPT_CUR), \
ECN(BKPT32), ECN(VECTOR32), ECN(BRK64)
#endif /* __ARM64_KVM_ARM_H__ */ #endif /* __ARM64_KVM_ARM_H__ */


@ -149,7 +149,10 @@ struct kvm_vcpu_arch {
u32 mdscr_el1; u32 mdscr_el1;
} guest_debug_preserved; } guest_debug_preserved;
/* Don't run the guest */ /* vcpu power-off state */
bool power_off;
/* Don't run the guest (internal implementation need) */
bool pause; bool pause;
/* IO related fields */ /* IO related fields */


@ -48,4 +48,6 @@ config KVM_ARM_HOST
---help--- ---help---
Provides host support for ARM processors. Provides host support for ARM processors.
source drivers/vhost/Kconfig
endif # VIRTUALIZATION endif # VIRTUALIZATION


@ -880,6 +880,14 @@ __kvm_hyp_panic:
bl __restore_sysregs bl __restore_sysregs
/*
* Make sure we have a valid host stack, and don't leave junk in the
* frame pointer that will give us a misleading host stack unwinding.
*/
ldr x22, [x2, #CPU_GP_REG_OFFSET(CPU_SP_EL1)]
msr sp_el1, x22
mov x29, xzr
1: adr x0, __hyp_panic_str 1: adr x0, __hyp_panic_str
adr x1, 2f adr x1, 2f
ldp x2, x3, [x1] ldp x2, x3, [x1]


@ -847,5 +847,7 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot) {} struct kvm_memory_slot *slot) {}
static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
#endif /* __MIPS_KVM_HOST_H__ */ #endif /* __MIPS_KVM_HOST_H__ */


@ -42,6 +42,11 @@ static inline unsigned int get_dcrn(u32 inst)
return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0); return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);
} }
static inline unsigned int get_tmrn(u32 inst)
{
return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);
}
static inline unsigned int get_rt(u32 inst) static inline unsigned int get_rt(u32 inst)
{ {
return (inst >> 21) & 0x1f; return (inst >> 21) & 0x1f;


@ -716,5 +716,7 @@ static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslot
static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
static inline void kvm_arch_exit(void) {} static inline void kvm_arch_exit(void) {}
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
#endif /* __POWERPC_KVM_HOST_H__ */ #endif /* __POWERPC_KVM_HOST_H__ */


@ -742,6 +742,12 @@
#define MMUBE1_VBE4 0x00000002 #define MMUBE1_VBE4 0x00000002
#define MMUBE1_VBE5 0x00000001 #define MMUBE1_VBE5 0x00000001
#define TMRN_TMCFG0 16 /* Thread Management Configuration Register 0 */
#define TMRN_TMCFG0_NPRIBITS 0x003f0000 /* Bits of thread priority */
#define TMRN_TMCFG0_NPRIBITS_SHIFT 16
#define TMRN_TMCFG0_NATHRD 0x00003f00 /* Number of active threads */
#define TMRN_TMCFG0_NATHRD_SHIFT 8
#define TMRN_TMCFG0_NTHRD 0x0000003f /* Number of threads */
#define TMRN_IMSR0 0x120 /* Initial MSR Register 0 (e6500) */ #define TMRN_IMSR0 0x120 /* Initial MSR Register 0 (e6500) */
#define TMRN_IMSR1 0x121 /* Initial MSR Register 1 (e6500) */ #define TMRN_IMSR1 0x121 /* Initial MSR Register 1 (e6500) */
#define TMRN_INIA0 0x140 /* Next Instruction Address Register 0 */ #define TMRN_INIA0 0x140 /* Next Instruction Address Register 0 */


@ -70,7 +70,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
} }
/* Lastly try successively smaller sizes from the page allocator */ /* Lastly try successively smaller sizes from the page allocator */
while (!hpt && order > PPC_MIN_HPT_ORDER) { /* Only do this if userspace didn't specify a size via ioctl */
while (!hpt && order > PPC_MIN_HPT_ORDER && !htab_orderp) {
hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
__GFP_NOWARN, order - PAGE_SHIFT); __GFP_NOWARN, order - PAGE_SHIFT);
if (!hpt) if (!hpt)


@ -470,6 +470,8 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
note_hpte_modification(kvm, rev); note_hpte_modification(kvm, rev);
unlock_hpte(hpte, 0); unlock_hpte(hpte, 0);
if (v & HPTE_V_ABSENT)
v = (v & ~HPTE_V_ABSENT) | HPTE_V_VALID;
hpret[0] = v; hpret[0] = v;
hpret[1] = r; hpret[1] = r;
return H_SUCCESS; return H_SUCCESS;


@ -150,6 +150,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK
cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
beq 11f beq 11f
cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL
beq 15f /* Invoke the H_DOORBELL handler */
cmpwi cr2, r12, BOOK3S_INTERRUPT_HMI cmpwi cr2, r12, BOOK3S_INTERRUPT_HMI
beq cr2, 14f /* HMI check */ beq cr2, 14f /* HMI check */
@ -174,6 +176,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mtspr SPRN_HSRR1, r7 mtspr SPRN_HSRR1, r7
b hmi_exception_after_realmode b hmi_exception_after_realmode
15: mtspr SPRN_HSRR0, r8
mtspr SPRN_HSRR1, r7
ba 0xe80
kvmppc_primary_no_guest: kvmppc_primary_no_guest:
/* We handle this much like a ceded vcpu */ /* We handle this much like a ceded vcpu */
/* put the HDEC into the DEC, since HDEC interrupts don't wake us */ /* put the HDEC into the DEC, since HDEC interrupts don't wake us */
@ -2377,7 +2383,6 @@ machine_check_realmode:
mr r3, r9 /* get vcpu pointer */ mr r3, r9 /* get vcpu pointer */
bl kvmppc_realmode_machine_check bl kvmppc_realmode_machine_check
nop nop
cmpdi r3, 0 /* Did we handle MCE ? */
ld r9, HSTATE_KVM_VCPU(r13) ld r9, HSTATE_KVM_VCPU(r13)
li r12, BOOK3S_INTERRUPT_MACHINE_CHECK li r12, BOOK3S_INTERRUPT_MACHINE_CHECK
/* /*
@ -2390,13 +2395,18 @@ machine_check_realmode:
* The old code used to return to host for unhandled errors which * The old code used to return to host for unhandled errors which
* was causing guest to hang with soft lockups inside guest and * was causing guest to hang with soft lockups inside guest and
* makes it difficult to recover guest instance. * makes it difficult to recover guest instance.
*
* if we receive machine check with MSR(RI=0) then deliver it to
* guest as machine check causing guest to crash.
*/ */
ld r10, VCPU_PC(r9)
ld r11, VCPU_MSR(r9) ld r11, VCPU_MSR(r9)
andi. r10, r11, MSR_RI /* check for unrecoverable exception */
beq 1f /* Deliver a machine check to guest */
ld r10, VCPU_PC(r9)
cmpdi r3, 0 /* Did we handle MCE ? */
bne 2f /* Continue guest execution. */ bne 2f /* Continue guest execution. */
/* If not, deliver a machine check. SRR0/1 are already set */ /* If not, deliver a machine check. SRR0/1 are already set */
li r10, BOOK3S_INTERRUPT_MACHINE_CHECK 1: li r10, BOOK3S_INTERRUPT_MACHINE_CHECK
ld r11, VCPU_MSR(r9)
bl kvmppc_msr_interrupt bl kvmppc_msr_interrupt
2: b fast_interrupt_c_return 2: b fast_interrupt_c_return
@ -2436,14 +2446,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
/* hypervisor doorbell */ /* hypervisor doorbell */
3: li r12, BOOK3S_INTERRUPT_H_DOORBELL 3: li r12, BOOK3S_INTERRUPT_H_DOORBELL
/*
* Clear the doorbell as we will invoke the handler
* explicitly in the guest exit path.
*/
lis r6, (PPC_DBELL_SERVER << (63-36))@h
PPC_MSGCLR(6)
/* see if it's a host IPI */ /* see if it's a host IPI */
li r3, 1 li r3, 1
lbz r0, HSTATE_HOST_IPI(r13) lbz r0, HSTATE_HOST_IPI(r13)
cmpwi r0, 0 cmpwi r0, 0
bnelr bnelr
/* if not, clear it and return -1 */ /* if not, return -1 */
lis r6, (PPC_DBELL_SERVER << (63-36))@h
PPC_MSGCLR(6)
li r3, -1 li r3, -1
blr blr


@ -237,7 +237,8 @@ void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500,
struct kvm_book3e_206_tlb_entry *gtlbe) struct kvm_book3e_206_tlb_entry *gtlbe)
{ {
struct vcpu_id_table *idt = vcpu_e500->idt; struct vcpu_id_table *idt = vcpu_e500->idt;
unsigned int pr, tid, ts, pid; unsigned int pr, tid, ts;
int pid;
u32 val, eaddr; u32 val, eaddr;
unsigned long flags; unsigned long flags;


@ -15,6 +15,7 @@
#include <asm/kvm_ppc.h> #include <asm/kvm_ppc.h>
#include <asm/disassemble.h> #include <asm/disassemble.h>
#include <asm/dbell.h> #include <asm/dbell.h>
#include <asm/reg_booke.h>
#include "booke.h" #include "booke.h"
#include "e500.h" #include "e500.h"
@ -22,6 +23,7 @@
#define XOP_DCBTLS 166 #define XOP_DCBTLS 166
#define XOP_MSGSND 206 #define XOP_MSGSND 206
#define XOP_MSGCLR 238 #define XOP_MSGCLR 238
#define XOP_MFTMR 366
#define XOP_TLBIVAX 786 #define XOP_TLBIVAX 786
#define XOP_TLBSX 914 #define XOP_TLBSX 914
#define XOP_TLBRE 946 #define XOP_TLBRE 946
@ -113,6 +115,19 @@ static int kvmppc_e500_emul_dcbtls(struct kvm_vcpu *vcpu)
return EMULATE_DONE; return EMULATE_DONE;
} }
static int kvmppc_e500_emul_mftmr(struct kvm_vcpu *vcpu, unsigned int inst,
int rt)
{
/* Expose one thread per vcpu */
if (get_tmrn(inst) == TMRN_TMCFG0) {
kvmppc_set_gpr(vcpu, rt,
1 | (1 << TMRN_TMCFG0_NATHRD_SHIFT));
return EMULATE_DONE;
}
return EMULATE_FAIL;
}
int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu, int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned int inst, int *advance) unsigned int inst, int *advance)
{ {
@ -165,6 +180,10 @@ int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu,
emulated = kvmppc_e500_emul_tlbivax(vcpu, ea); emulated = kvmppc_e500_emul_tlbivax(vcpu, ea);
break; break;
case XOP_MFTMR:
emulated = kvmppc_e500_emul_mftmr(vcpu, inst, rt);
break;
case XOP_EHPRIV: case XOP_EHPRIV:
emulated = kvmppc_e500_emul_ehpriv(run, vcpu, inst, emulated = kvmppc_e500_emul_ehpriv(run, vcpu, inst,
advance); advance);


@ -406,7 +406,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) { for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) {
unsigned long gfn_start, gfn_end; unsigned long gfn_start, gfn_end;
tsize_pages = 1 << (tsize - 2); tsize_pages = 1UL << (tsize - 2);
gfn_start = gfn & ~(tsize_pages - 1); gfn_start = gfn & ~(tsize_pages - 1);
gfn_end = gfn_start + tsize_pages; gfn_end = gfn_start + tsize_pages;
@ -447,7 +447,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
} }
if (likely(!pfnmap)) { if (likely(!pfnmap)) {
tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); tsize_pages = 1UL << (tsize + 10 - PAGE_SHIFT);
pfn = gfn_to_pfn_memslot(slot, gfn); pfn = gfn_to_pfn_memslot(slot, gfn);
if (is_error_noslot_pfn(pfn)) { if (is_error_noslot_pfn(pfn)) {
if (printk_ratelimit()) if (printk_ratelimit())


@ -559,6 +559,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
else else
r = num_online_cpus(); r = num_online_cpus();
break; break;
case KVM_CAP_NR_MEMSLOTS:
r = KVM_USER_MEM_SLOTS;
break;
case KVM_CAP_MAX_VCPUS: case KVM_CAP_MAX_VCPUS:
r = KVM_MAX_VCPUS; r = KVM_MAX_VCPUS;
break; break;


@ -644,5 +644,7 @@ static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslot
static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot) {} struct kvm_memory_slot *slot) {}
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
#endif #endif

View File

@ -336,28 +336,28 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu)
return -EOPNOTSUPP; return -EOPNOTSUPP;
} }
static const intercept_handler_t intercept_funcs[] = {
[0x00 >> 2] = handle_noop,
[0x04 >> 2] = handle_instruction,
[0x08 >> 2] = handle_prog,
[0x10 >> 2] = handle_noop,
[0x14 >> 2] = handle_external_interrupt,
[0x18 >> 2] = handle_noop,
[0x1C >> 2] = kvm_s390_handle_wait,
[0x20 >> 2] = handle_validity,
[0x28 >> 2] = handle_stop,
[0x38 >> 2] = handle_partial_execution,
};
int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
{ {
intercept_handler_t func; switch (vcpu->arch.sie_block->icptcode) {
u8 code = vcpu->arch.sie_block->icptcode; case 0x00:
case 0x10:
if (code & 3 || (code >> 2) >= ARRAY_SIZE(intercept_funcs)) case 0x18:
return handle_noop(vcpu);
case 0x04:
return handle_instruction(vcpu);
case 0x08:
return handle_prog(vcpu);
case 0x14:
return handle_external_interrupt(vcpu);
case 0x1c:
return kvm_s390_handle_wait(vcpu);
case 0x20:
return handle_validity(vcpu);
case 0x28:
return handle_stop(vcpu);
case 0x38:
return handle_partial_execution(vcpu);
default:
return -EOPNOTSUPP; return -EOPNOTSUPP;
func = intercept_funcs[code >> 2]; }
if (func)
return func(vcpu);
return -EOPNOTSUPP;
} }


@ -51,11 +51,9 @@ static int psw_mchk_disabled(struct kvm_vcpu *vcpu)
static int psw_interrupts_disabled(struct kvm_vcpu *vcpu) static int psw_interrupts_disabled(struct kvm_vcpu *vcpu)
{ {
if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) || return psw_extint_disabled(vcpu) &&
(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_IO) || psw_ioint_disabled(vcpu) &&
(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT)) psw_mchk_disabled(vcpu);
return 0;
return 1;
} }
static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu) static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu)
@ -71,13 +69,8 @@ static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu)
static int ckc_irq_pending(struct kvm_vcpu *vcpu) static int ckc_irq_pending(struct kvm_vcpu *vcpu)
{ {
preempt_disable(); if (vcpu->arch.sie_block->ckc >= kvm_s390_get_tod_clock_fast(vcpu->kvm))
if (!(vcpu->arch.sie_block->ckc <
get_tod_clock_fast() + vcpu->arch.sie_block->epoch)) {
preempt_enable();
return 0; return 0;
}
preempt_enable();
return ckc_interrupts_enabled(vcpu); return ckc_interrupts_enabled(vcpu);
} }
@ -109,14 +102,10 @@ static inline u8 int_word_to_isc(u32 int_word)
return (int_word & 0x38000000) >> 27; return (int_word & 0x38000000) >> 27;
} }
static inline unsigned long pending_floating_irqs(struct kvm_vcpu *vcpu) static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu)
{ {
return vcpu->kvm->arch.float_int.pending_irqs; return vcpu->kvm->arch.float_int.pending_irqs |
} vcpu->arch.local_int.pending_irqs;
static inline unsigned long pending_local_irqs(struct kvm_vcpu *vcpu)
{
return vcpu->arch.local_int.pending_irqs;
} }
static unsigned long disable_iscs(struct kvm_vcpu *vcpu, static unsigned long disable_iscs(struct kvm_vcpu *vcpu,
@ -135,8 +124,7 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
{ {
unsigned long active_mask; unsigned long active_mask;
active_mask = pending_local_irqs(vcpu); active_mask = pending_irqs(vcpu);
active_mask |= pending_floating_irqs(vcpu);
if (!active_mask) if (!active_mask)
return 0; return 0;
@ -204,7 +192,7 @@ static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag)
static void set_intercept_indicators_io(struct kvm_vcpu *vcpu) static void set_intercept_indicators_io(struct kvm_vcpu *vcpu)
{ {
if (!(pending_floating_irqs(vcpu) & IRQ_PEND_IO_MASK)) if (!(pending_irqs(vcpu) & IRQ_PEND_IO_MASK))
return; return;
else if (psw_ioint_disabled(vcpu)) else if (psw_ioint_disabled(vcpu))
__set_cpuflag(vcpu, CPUSTAT_IO_INT); __set_cpuflag(vcpu, CPUSTAT_IO_INT);
@ -214,7 +202,7 @@ static void set_intercept_indicators_io(struct kvm_vcpu *vcpu)
static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu) static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu)
{ {
if (!(pending_local_irqs(vcpu) & IRQ_PEND_EXT_MASK)) if (!(pending_irqs(vcpu) & IRQ_PEND_EXT_MASK))
return; return;
if (psw_extint_disabled(vcpu)) if (psw_extint_disabled(vcpu))
__set_cpuflag(vcpu, CPUSTAT_EXT_INT); __set_cpuflag(vcpu, CPUSTAT_EXT_INT);
@ -224,7 +212,7 @@ static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu)
static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu) static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu)
{ {
if (!(pending_local_irqs(vcpu) & IRQ_PEND_MCHK_MASK)) if (!(pending_irqs(vcpu) & IRQ_PEND_MCHK_MASK))
return; return;
if (psw_mchk_disabled(vcpu)) if (psw_mchk_disabled(vcpu))
vcpu->arch.sie_block->ictl |= ICTL_LPSW; vcpu->arch.sie_block->ictl |= ICTL_LPSW;
@ -815,23 +803,21 @@ int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu)
int kvm_s390_vcpu_has_irq(struct kvm_vcpu *vcpu, int exclude_stop) int kvm_s390_vcpu_has_irq(struct kvm_vcpu *vcpu, int exclude_stop)
{ {
int rc; if (deliverable_irqs(vcpu))
return 1;
rc = !!deliverable_irqs(vcpu); if (kvm_cpu_has_pending_timer(vcpu))
return 1;
if (!rc && kvm_cpu_has_pending_timer(vcpu))
rc = 1;
/* external call pending and deliverable */ /* external call pending and deliverable */
if (!rc && kvm_s390_ext_call_pending(vcpu) && if (kvm_s390_ext_call_pending(vcpu) &&
!psw_extint_disabled(vcpu) && !psw_extint_disabled(vcpu) &&
(vcpu->arch.sie_block->gcr[0] & 0x2000ul)) (vcpu->arch.sie_block->gcr[0] & 0x2000ul))
rc = 1; return 1;
if (!rc && !exclude_stop && kvm_s390_is_stop_irq_pending(vcpu)) if (!exclude_stop && kvm_s390_is_stop_irq_pending(vcpu))
rc = 1; return 1;
return 0;
return rc;
} }
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
@ -846,7 +832,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
vcpu->stat.exit_wait_state++; vcpu->stat.exit_wait_state++;
/* fast path */ /* fast path */
if (kvm_cpu_has_pending_timer(vcpu) || kvm_arch_vcpu_runnable(vcpu)) if (kvm_arch_vcpu_runnable(vcpu))
return 0; return 0;
if (psw_interrupts_disabled(vcpu)) { if (psw_interrupts_disabled(vcpu)) {
@ -860,9 +846,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
goto no_timer; goto no_timer;
} }
preempt_disable(); now = kvm_s390_get_tod_clock_fast(vcpu->kvm);
now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch;
preempt_enable();
sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now); sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
/* underflow */ /* underflow */
@ -901,9 +885,7 @@ enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
u64 now, sltime; u64 now, sltime;
vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer); vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer);
preempt_disable(); now = kvm_s390_get_tod_clock_fast(vcpu->kvm);
now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch;
preempt_enable();
sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now); sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
/* /*
@ -981,39 +963,30 @@ static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
irq->u.pgm.code, 0); irq->u.pgm.code, 0);
li->irq.pgm = irq->u.pgm; if (irq->u.pgm.code == PGM_PER) {
li->irq.pgm.code |= PGM_PER;
/* only modify PER related information */
li->irq.pgm.per_address = irq->u.pgm.per_address;
li->irq.pgm.per_code = irq->u.pgm.per_code;
li->irq.pgm.per_atmid = irq->u.pgm.per_atmid;
li->irq.pgm.per_access_id = irq->u.pgm.per_access_id;
} else if (!(irq->u.pgm.code & PGM_PER)) {
li->irq.pgm.code = (li->irq.pgm.code & PGM_PER) |
irq->u.pgm.code;
/* only modify non-PER information */
li->irq.pgm.trans_exc_code = irq->u.pgm.trans_exc_code;
li->irq.pgm.mon_code = irq->u.pgm.mon_code;
li->irq.pgm.data_exc_code = irq->u.pgm.data_exc_code;
li->irq.pgm.mon_class_nr = irq->u.pgm.mon_class_nr;
li->irq.pgm.exc_access_id = irq->u.pgm.exc_access_id;
li->irq.pgm.op_access_id = irq->u.pgm.op_access_id;
} else {
li->irq.pgm = irq->u.pgm;
}
set_bit(IRQ_PEND_PROG, &li->pending_irqs); set_bit(IRQ_PEND_PROG, &li->pending_irqs);
return 0; return 0;
} }
int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
struct kvm_s390_irq irq;
spin_lock(&li->lock);
irq.u.pgm.code = code;
__inject_prog(vcpu, &irq);
BUG_ON(waitqueue_active(li->wq));
spin_unlock(&li->lock);
return 0;
}
int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu,
struct kvm_s390_pgm_info *pgm_info)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
struct kvm_s390_irq irq;
int rc;
spin_lock(&li->lock);
irq.u.pgm = *pgm_info;
rc = __inject_prog(vcpu, &irq);
BUG_ON(waitqueue_active(li->wq));
spin_unlock(&li->lock);
return rc;
}
static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
{ {
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@ -1390,12 +1363,9 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type)
static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
{ {
struct kvm_s390_float_interrupt *fi;
u64 type = READ_ONCE(inti->type); u64 type = READ_ONCE(inti->type);
int rc; int rc;
fi = &kvm->arch.float_int;
switch (type) { switch (type) {
case KVM_S390_MCHK: case KVM_S390_MCHK:
rc = __inject_float_mchk(kvm, inti); rc = __inject_float_mchk(kvm, inti);


@@ -514,35 +514,20 @@ static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
 	if (gtod_high != 0)
 		return -EINVAL;
-	VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x\n", gtod_high);
+	VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x", gtod_high);
 	return 0;
 }
 static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
 {
-	struct kvm_vcpu *cur_vcpu;
-	unsigned int vcpu_idx;
-	u64 host_tod, gtod;
-	int r;
+	u64 gtod;
 	if (copy_from_user(&gtod, (void __user *)attr->addr, sizeof(gtod)))
 		return -EFAULT;
-	r = store_tod_clock(&host_tod);
-	if (r)
-		return r;
-	mutex_lock(&kvm->lock);
-	preempt_disable();
-	kvm->arch.epoch = gtod - host_tod;
-	kvm_s390_vcpu_block_all(kvm);
-	kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm)
-		cur_vcpu->arch.sie_block->epoch = kvm->arch.epoch;
-	kvm_s390_vcpu_unblock_all(kvm);
-	preempt_enable();
-	mutex_unlock(&kvm->lock);
-	VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx\n", gtod);
+	kvm_s390_set_tod_clock(kvm, gtod);
+	VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx", gtod);
 	return 0;
 }
@@ -574,26 +559,19 @@ static int kvm_s390_get_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
 	if (copy_to_user((void __user *)attr->addr, &gtod_high,
 			 sizeof(gtod_high)))
 		return -EFAULT;
-	VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x\n", gtod_high);
+	VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x", gtod_high);
 	return 0;
 }
 static int kvm_s390_get_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
 {
-	u64 host_tod, gtod;
-	int r;
-
-	r = store_tod_clock(&host_tod);
-	if (r)
-		return r;
-	preempt_disable();
-	gtod = host_tod + kvm->arch.epoch;
-	preempt_enable();
+	u64 gtod;
+
+	gtod = kvm_s390_get_tod_clock_fast(kvm);
 	if (copy_to_user((void __user *)attr->addr, &gtod, sizeof(gtod)))
 		return -EFAULT;
-	VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx\n", gtod);
+	VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx", gtod);
 	return 0;
 }
@@ -1120,7 +1098,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	if (!kvm->arch.sca)
 		goto out_err;
 	spin_lock(&kvm_lock);
-	sca_offset = (sca_offset + 16) & 0x7f0;
+	sca_offset += 16;
+	if (sca_offset + sizeof(struct sca_block) > PAGE_SIZE)
+		sca_offset = 0;
 	kvm->arch.sca = (struct sca_block *) ((char *) kvm->arch.sca + sca_offset);
 	spin_unlock(&kvm_lock);
@@ -1911,6 +1891,22 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
 	return 0;
 }
+void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod)
+{
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	mutex_lock(&kvm->lock);
+	preempt_disable();
+	kvm->arch.epoch = tod - get_tod_clock();
+	kvm_s390_vcpu_block_all(kvm);
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		vcpu->arch.sie_block->epoch = kvm->arch.epoch;
+	kvm_s390_vcpu_unblock_all(kvm);
+	preempt_enable();
+	mutex_unlock(&kvm->lock);
+}
 /**
  * kvm_arch_fault_in_page - fault-in guest page if necessary
  * @vcpu: The corresponding virtual cpu


@@ -175,6 +175,7 @@ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm)
 	return kvm->arch.user_cpu_state_ctrl != 0;
 }
+/* implemented in interrupt.c */
 int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
 void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu);
 enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
@@ -185,7 +186,25 @@ int __must_check kvm_s390_inject_vm(struct kvm *kvm,
 				    struct kvm_s390_interrupt *s390int);
 int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
 				      struct kvm_s390_irq *irq);
-int __must_check kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code);
+static inline int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu,
+					   struct kvm_s390_pgm_info *pgm_info)
+{
+	struct kvm_s390_irq irq = {
+		.type = KVM_S390_PROGRAM_INT,
+		.u.pgm = *pgm_info,
+	};
+
+	return kvm_s390_inject_vcpu(vcpu, &irq);
+}
+static inline int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
+{
+	struct kvm_s390_irq irq = {
+		.type = KVM_S390_PROGRAM_INT,
+		.u.pgm.code = code,
+	};
+
+	return kvm_s390_inject_vcpu(vcpu, &irq);
+}
 struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
 						    u64 isc_mask, u32 schid);
 int kvm_s390_reinject_io_int(struct kvm *kvm,
@@ -212,6 +231,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
 /* implemented in kvm-s390.c */
+void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod);
 long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
 int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
 int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu,
@@ -231,9 +251,6 @@ extern unsigned long kvm_s390_fac_list_mask[];
 /* implemented in diag.c */
 int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
-/* implemented in interrupt.c */
-int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu,
-			     struct kvm_s390_pgm_info *pgm_info);
 static inline void kvm_s390_vcpu_block_all(struct kvm *kvm)
 {
@@ -254,6 +271,16 @@ static inline void kvm_s390_vcpu_unblock_all(struct kvm *kvm)
 		kvm_s390_vcpu_unblock(vcpu);
 }
+static inline u64 kvm_s390_get_tod_clock_fast(struct kvm *kvm)
+{
+	u64 rc;
+
+	preempt_disable();
+	rc = get_tod_clock_fast() + kvm->arch.epoch;
+	preempt_enable();
+	return rc;
+}
 /**
  * kvm_s390_inject_prog_cond - conditionally inject a program check
  * @vcpu: virtual cpu


@@ -33,11 +33,9 @@
 /* Handle SCK (SET CLOCK) interception */
 static int handle_set_clock(struct kvm_vcpu *vcpu)
 {
-	struct kvm_vcpu *cpup;
-	s64 hostclk, val;
-	int i, rc;
+	int rc;
 	ar_t ar;
-	u64 op2;
+	u64 op2, val;
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
 		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
@@ -49,19 +47,8 @@ static int handle_set_clock(struct kvm_vcpu *vcpu)
 	if (rc)
 		return kvm_s390_inject_prog_cond(vcpu, rc);
-	if (store_tod_clock(&hostclk)) {
-		kvm_s390_set_psw_cc(vcpu, 3);
-		return 0;
-	}
 	VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", val);
-	val = (val - hostclk) & ~0x3fUL;
-	mutex_lock(&vcpu->kvm->lock);
-	preempt_disable();
-	kvm_for_each_vcpu(i, cpup, vcpu->kvm)
-		cpup->arch.sie_block->epoch = val;
-	preempt_enable();
-	mutex_unlock(&vcpu->kvm->lock);
+	kvm_s390_set_tod_clock(vcpu->kvm, val);
 	kvm_s390_set_psw_cc(vcpu, 0);
 	return 0;
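All of the s390 TOD handling above converges on one idea: the guest clock is stored as an epoch delta against the host TOD, so a read is host TOD plus epoch and SET CLOCK just recomputes the epoch for every vCPU. A minimal userspace sketch of that arithmetic follows; host_tod() and struct vm_clock are illustrative stand-ins, not kernel API.

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in; the real value comes from STCK/STCKF. */
static uint64_t host_tod(void) { return 0x123456789abcdef0ULL; }

struct vm_clock { uint64_t epoch; }; /* conceptually kvm->arch.epoch */

/* SET CLOCK: remember the requested guest time as an offset from the host TOD. */
static void set_guest_tod(struct vm_clock *vm, uint64_t guest_tod)
{
    vm->epoch = guest_tod - host_tod(); /* unsigned wraparound is intentional */
}

/* Query: guest time is simply host TOD plus the stored epoch. */
static uint64_t get_guest_tod(const struct vm_clock *vm)
{
    return host_tod() + vm->epoch;
}

int main(void)
{
    struct vm_clock vm = { 0 };
    set_guest_tod(&vm, 0x2000000000000000ULL);
    printf("guest TOD: 0x%llx\n", (unsigned long long)get_guest_tod(&vm));
    return 0;
}

The same offset survives host clock progress automatically, which is why the kernel only has to touch it on SET CLOCK and on the TOD device attribute writes shown above.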


@@ -33,6 +33,11 @@ enum irq_remap_cap {
 	IRQ_POSTING_CAP = 0,
 };
+struct vcpu_data {
+	u64 pi_desc_addr;	/* Physical address of PI Descriptor */
+	u32 vector;		/* Guest vector of the interrupt */
+};
 #ifdef CONFIG_IRQ_REMAP
 extern bool irq_remapping_cap(enum irq_remap_cap cap);
@@ -58,11 +63,6 @@ static inline struct irq_domain *arch_get_ir_parent_domain(void)
 	return x86_vector_domain;
 }
-struct vcpu_data {
-	u64 pi_desc_addr;	/* Physical address of PI Descriptor */
-	u32 vector;		/* Guest vector of the interrupt */
-};
 #else /* CONFIG_IRQ_REMAP */
 static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; }


@@ -111,6 +111,16 @@ struct x86_emulate_ops {
 			unsigned int bytes,
 			struct x86_exception *fault);
+	/*
+	 * read_phys: Read bytes of standard (non-emulated/special) memory.
+	 *            Used for descriptor reading.
+	 *  @addr:  [IN ] Physical address from which to read.
+	 *  @val:   [OUT] Value read from memory.
+	 *  @bytes: [IN ] Number of bytes to read from memory.
+	 */
+	int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr,
+			 void *val, unsigned int bytes);
 	/*
 	 * write_std: Write bytes of standard (non-emulated/special) memory.
 	 *            Used for descriptor writing.


@@ -24,6 +24,7 @@
 #include <linux/perf_event.h>
 #include <linux/pvclock_gtod.h>
 #include <linux/clocksource.h>
+#include <linux/irqbypass.h>
 #include <asm/pvclock-abi.h>
 #include <asm/desc.h>
@@ -176,6 +177,8 @@ enum {
  */
 #define KVM_APIC_PV_EOI_PENDING	1
+struct kvm_kernel_irq_routing_entry;
+
 /*
  * We don't want allocation failures within the mmu code, so we preallocate
  * enough memory for a single page fault in a cache.
@@ -374,6 +377,7 @@ struct kvm_mtrr {
 /* Hyper-V per vcpu emulation context */
 struct kvm_vcpu_hv {
 	u64 hv_vapic;
+	s64 runtime_offset;
 };
 struct kvm_vcpu_arch {
@@ -396,6 +400,7 @@ struct kvm_vcpu_arch {
 	u64 efer;
 	u64 apic_base;
 	struct kvm_lapic *apic;    /* kernel irqchip context */
+	u64 eoi_exit_bitmap[4];
 	unsigned long apic_attention;
 	int32_t apic_arb_prio;
 	int mp_state;
@@ -573,6 +578,9 @@ struct kvm_vcpu_arch {
 	struct {
 		bool pv_unhalted;
 	} pv;
+
+	int pending_ioapic_eoi;
+	int pending_external_vector;
 };
 struct kvm_lpage_info {
@@ -683,6 +691,9 @@ struct kvm_arch {
 	u32 bsp_vcpu_id;
 	u64 disabled_quirks;
+
+	bool irqchip_split;
+	u8 nr_reserved_ioapic_pins;
 };
 struct kvm_vm_stat {
@@ -819,10 +830,10 @@ struct kvm_x86_ops {
 	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
 	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
 	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
-	int (*vm_has_apicv)(struct kvm *kvm);
+	int (*cpu_uses_apicv)(struct kvm_vcpu *vcpu);
 	void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
 	void (*hwapic_isr_update)(struct kvm *kvm, int isr);
-	void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+	void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu);
 	void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
 	void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
 	void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
@@ -887,6 +898,20 @@ struct kvm_x86_ops {
 					   gfn_t offset, unsigned long mask);
 	/* pmu operations of sub-arch */
 	const struct kvm_pmu_ops *pmu_ops;
+
+	/*
+	 * Architecture specific hooks for vCPU blocking due to
+	 * HLT instruction.
+	 * Returns for .pre_block():
+	 *    - 0 means continue to block the vCPU.
+	 *    - 1 means we cannot block the vCPU since some event
+	 *        happens during this period, such as, 'ON' bit in
+	 *        posted-interrupts descriptor is set.
+	 */
+	int (*pre_block)(struct kvm_vcpu *vcpu);
+	void (*post_block)(struct kvm_vcpu *vcpu);
+	int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
+			      uint32_t guest_irq, bool set);
 };
 struct kvm_arch_async_pf {
@@ -1231,4 +1256,13 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size);
 bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
 bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+			     struct kvm_vcpu **dest_vcpu);
+
+void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+		     struct kvm_lapic_irq *irq);
+
+static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
+static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
+
 #endif /* _ASM_X86_KVM_HOST_H */
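The new pre_block/post_block hooks wrap the generic halt path: per the comment above, a non-zero pre_block return means some event (for example the ON bit in the posted-interrupt descriptor) already fired and the vCPU must not go to sleep. A rough, standalone sketch of that control flow, with hypothetical vcpu and blocking_ops types standing in for the real KVM structures:

#include <stdio.h>

struct vcpu;

struct blocking_ops {
    /* Returns 0 to continue blocking, non-zero if an event is already pending. */
    int  (*pre_block)(struct vcpu *v);
    void (*post_block)(struct vcpu *v);
};

struct vcpu {
    const struct blocking_ops *ops;
    int halted;
};

/* Sketch of the halt path: only block when the architecture hook agrees. */
static void vcpu_block(struct vcpu *v)
{
    if (v->ops && v->ops->pre_block && v->ops->pre_block(v)) {
        printf("event pending, not blocking\n");
        return;
    }

    v->halted = 1;   /* stand-in for the real wait loop */
    printf("vcpu blocked\n");
    v->halted = 0;

    if (v->ops && v->ops->post_block)
        v->ops->post_block(v);
}

static int  demo_pre(struct vcpu *v)  { (void)v; return 0; }
static void demo_post(struct vcpu *v) { (void)v; printf("post_block ran\n"); }

int main(void)
{
    const struct blocking_ops ops = { demo_pre, demo_post };
    struct vcpu v = { &ops, 0 };
    vcpu_block(&v);
    return 0;
}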


@@ -72,7 +72,7 @@
 #define SECONDARY_EXEC_SHADOW_VMCS              0x00004000
 #define SECONDARY_EXEC_ENABLE_PML               0x00020000
 #define SECONDARY_EXEC_XSAVES                   0x00100000
+#define SECONDARY_EXEC_PCOMMIT                  0x00200000
 #define PIN_BASED_EXT_INTR_MASK                 0x00000001
 #define PIN_BASED_NMI_EXITING                   0x00000008
@@ -416,6 +416,7 @@ enum vmcs_field {
 #define VMX_EPT_EXTENT_CONTEXT_BIT              (1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT               (1ull << 26)
+#define VMX_VPID_INVVPID_BIT                    (1ull << 0) /* (32 - 32) */
 #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT      (1ull << 9) /* (41 - 32) */
 #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT      (1ull << 10) /* (42 - 32) */


@@ -153,6 +153,12 @@
 /* MSR used to provide vcpu index */
 #define HV_X64_MSR_VP_INDEX		0x40000002
+/* MSR used to reset the guest OS. */
+#define HV_X64_MSR_RESET		0x40000003
+
+/* MSR used to provide vcpu runtime in 100ns units */
+#define HV_X64_MSR_VP_RUNTIME		0x40000010
 /* MSR used to read the per-partition time reference counter */
 #define HV_X64_MSR_TIME_REF_COUNT	0x40000020
@@ -251,4 +257,16 @@ typedef struct _HV_REFERENCE_TSC_PAGE {
 	__s64 tsc_offset;
 } HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE;
+/* Define the number of synthetic interrupt sources. */
+#define HV_SYNIC_SINT_COUNT		(16)
+/* Define the expected SynIC version. */
+#define HV_SYNIC_VERSION_1		(0x1)
+
+#define HV_SYNIC_CONTROL_ENABLE		(1ULL << 0)
+#define HV_SYNIC_SIMP_ENABLE		(1ULL << 0)
+#define HV_SYNIC_SIEFP_ENABLE		(1ULL << 0)
+#define HV_SYNIC_SINT_MASKED		(1ULL << 16)
+#define HV_SYNIC_SINT_AUTO_EOI		(1ULL << 17)
+#define HV_SYNIC_SINT_VECTOR_MASK	(0xFF)
 #endif


@@ -78,6 +78,7 @@
 #define EXIT_REASON_PML_FULL            62
 #define EXIT_REASON_XSAVES              63
 #define EXIT_REASON_XRSTORS             64
+#define EXIT_REASON_PCOMMIT             65
 #define VMX_EXIT_REASONS \
 	{ EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \
@@ -126,7 +127,8 @@
 	{ EXIT_REASON_INVVPID,               "INVVPID" }, \
 	{ EXIT_REASON_INVPCID,               "INVPCID" }, \
 	{ EXIT_REASON_XSAVES,                "XSAVES" }, \
-	{ EXIT_REASON_XRSTORS,               "XRSTORS" }
+	{ EXIT_REASON_XRSTORS,               "XRSTORS" }, \
+	{ EXIT_REASON_PCOMMIT,               "PCOMMIT" }
 #define VMX_ABORT_SAVE_GUEST_MSR_FAIL        1
 #define VMX_ABORT_LOAD_HOST_MSR_FAIL         4


@@ -32,6 +32,7 @@
 static int kvmclock = 1;
 static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
 static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
+static cycle_t kvm_sched_clock_offset;
 static int parse_no_kvmclock(char *arg)
 {
@@ -92,6 +93,29 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
 	return kvm_clock_read();
 }
+static cycle_t kvm_sched_clock_read(void)
+{
+	return kvm_clock_read() - kvm_sched_clock_offset;
+}
+
+static inline void kvm_sched_clock_init(bool stable)
+{
+	if (!stable) {
+		pv_time_ops.sched_clock = kvm_clock_read;
+		return;
+	}
+
+	kvm_sched_clock_offset = kvm_clock_read();
+	pv_time_ops.sched_clock = kvm_sched_clock_read;
+	set_sched_clock_stable();
+	printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n",
+			kvm_sched_clock_offset);
+
+	BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
+		sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
+}
+
 /*
  * If we don't do that, there is the possibility that the guest
  * will calibrate under heavy load - thus, getting a lower lpj -
@@ -248,7 +272,17 @@ void __init kvmclock_init(void)
 		memblock_free(mem, size);
 		return;
 	}
-	pv_time_ops.sched_clock = kvm_clock_read;
+
+	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
+		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+
+	cpu = get_cpu();
+	vcpu_time = &hv_clock[cpu].pvti;
+	flags = pvclock_read_flags(vcpu_time);
+
+	kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
+	put_cpu();
+
 	x86_platform.calibrate_tsc = kvm_get_tsc_khz;
 	x86_platform.get_wallclock = kvm_get_wallclock;
 	x86_platform.set_wallclock = kvm_set_wallclock;
@@ -265,16 +299,6 @@ void __init kvmclock_init(void)
 	kvm_get_preset_lpj();
 	clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
 	pv_info.name = "KVM";
-
-	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
-		pvclock_set_flags(~0);
-
-	cpu = get_cpu();
-	vcpu_time = &hv_clock[cpu].pvti;
-	flags = pvclock_read_flags(vcpu_time);
-	if (flags & PVCLOCK_COUNTS_FROM_ZERO)
-		set_sched_clock_stable();
-	put_cpu();
 }
 int __init kvm_setup_vsyscall_timeinfo(void)
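kvm_sched_clock_init() above records the kvmclock reading once at init time and then reports sched_clock relative to that value, so the scheduler clock starts near zero and no longer needs the hypervisor's PVCLOCK_COUNTS_FROM_ZERO help. A small sketch of the same offset trick, with clock_now() as an illustrative placeholder for kvm_clock_read():

#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Placeholder for kvm_clock_read(): any monotonic nanosecond counter will do. */
static uint64_t clock_now(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

static uint64_t sched_clock_offset;

/* Capture the current reading once; later reads are reported relative to it. */
static void sched_clock_init(void)
{
    sched_clock_offset = clock_now();
}

static uint64_t sched_clock_read(void)
{
    return clock_now() - sched_clock_offset;
}

int main(void)
{
    sched_clock_init();
    printf("sched clock starts near zero: %llu ns\n",
           (unsigned long long)sched_clock_read());
    return 0;
}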


@@ -28,6 +28,8 @@ config KVM
 	select ANON_INODES
 	select HAVE_KVM_IRQCHIP
 	select HAVE_KVM_IRQFD
+	select IRQ_BYPASS_MANAGER
+	select HAVE_KVM_IRQ_BYPASS
 	select HAVE_KVM_IRQ_ROUTING
 	select HAVE_KVM_EVENTFD
 	select KVM_APIC_ARCHITECTURE


@@ -21,6 +21,7 @@
 #include <linux/fs.h>
 #include "irq.h"
 #include "assigned-dev.h"
+#include "trace/events/kvm.h"
 struct kvm_assigned_dev_kernel {
 	struct kvm_irq_ack_notifier ack_notifier;
@@ -131,7 +132,42 @@ static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
-#ifdef __KVM_HAVE_MSI
+/*
+ * Deliver an IRQ in an atomic context if we can, or return a failure,
+ * user can retry in a process context.
+ * Return value:
+ *  -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
+ *  Other values - No need to retry.
+ */
+static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq,
+				int level)
+{
+	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
+	struct kvm_kernel_irq_routing_entry *e;
+	int ret = -EINVAL;
+	int idx;
+
+	trace_kvm_set_irq(irq, level, irq_source_id);
+
+	/*
+	 * Injection into either PIC or IOAPIC might need to scan all CPUs,
+	 * which would need to be retried from thread context; when same GSI
+	 * is connected to both PIC and IOAPIC, we'd have to report a
+	 * partial failure here.
+	 * Since there's no easy way to do this, we only support injecting MSI
+	 * which is limited to 1:1 GSI mapping.
+	 */
+	idx = srcu_read_lock(&kvm->irq_srcu);
+	if (kvm_irq_map_gsi(kvm, entries, irq) > 0) {
+		e = &entries[0];
+		ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id,
+						irq, level);
+	}
+	srcu_read_unlock(&kvm->irq_srcu, idx);
+	return ret;
+}
+
 static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
 {
 	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
@@ -150,9 +186,7 @@ static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
-#endif
-#ifdef __KVM_HAVE_MSIX
 static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
 {
 	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
@@ -183,7 +217,6 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
-#endif
 /* Ack the irq line for an assigned device */
 static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
@@ -386,7 +419,6 @@ static int assigned_device_enable_host_intx(struct kvm *kvm,
 	return 0;
 }
-#ifdef __KVM_HAVE_MSI
 static int assigned_device_enable_host_msi(struct kvm *kvm,
 					   struct kvm_assigned_dev_kernel *dev)
 {
@@ -408,9 +440,7 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
 	return 0;
 }
-#endif
-#ifdef __KVM_HAVE_MSIX
 static int assigned_device_enable_host_msix(struct kvm *kvm,
 					    struct kvm_assigned_dev_kernel *dev)
 {
@@ -443,8 +473,6 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,
 	return r;
 }
-#endif
 static int assigned_device_enable_guest_intx(struct kvm *kvm,
 				struct kvm_assigned_dev_kernel *dev,
 				struct kvm_assigned_irq *irq)
@@ -454,7 +482,6 @@ static int assigned_device_enable_guest_intx(struct kvm *kvm,
 	return 0;
 }
-#ifdef __KVM_HAVE_MSI
 static int assigned_device_enable_guest_msi(struct kvm *kvm,
 				struct kvm_assigned_dev_kernel *dev,
 				struct kvm_assigned_irq *irq)
@@ -463,9 +490,7 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
 	dev->ack_notifier.gsi = -1;
 	return 0;
 }
-#endif
-#ifdef __KVM_HAVE_MSIX
 static int assigned_device_enable_guest_msix(struct kvm *kvm,
 				struct kvm_assigned_dev_kernel *dev,
 				struct kvm_assigned_irq *irq)
@@ -474,7 +499,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
 	dev->ack_notifier.gsi = -1;
 	return 0;
 }
-#endif
 static int assign_host_irq(struct kvm *kvm,
 			   struct kvm_assigned_dev_kernel *dev,
@@ -492,16 +516,12 @@ static int assign_host_irq(struct kvm *kvm,
 	case KVM_DEV_IRQ_HOST_INTX:
 		r = assigned_device_enable_host_intx(kvm, dev);
 		break;
-#ifdef __KVM_HAVE_MSI
 	case KVM_DEV_IRQ_HOST_MSI:
 		r = assigned_device_enable_host_msi(kvm, dev);
 		break;
-#endif
-#ifdef __KVM_HAVE_MSIX
 	case KVM_DEV_IRQ_HOST_MSIX:
 		r = assigned_device_enable_host_msix(kvm, dev);
 		break;
-#endif
 	default:
 		r = -EINVAL;
 	}
@@ -534,16 +554,12 @@ static int assign_guest_irq(struct kvm *kvm,
 	case KVM_DEV_IRQ_GUEST_INTX:
 		r = assigned_device_enable_guest_intx(kvm, dev, irq);
 		break;
-#ifdef __KVM_HAVE_MSI
 	case KVM_DEV_IRQ_GUEST_MSI:
 		r = assigned_device_enable_guest_msi(kvm, dev, irq);
 		break;
-#endif
-#ifdef __KVM_HAVE_MSIX
 	case KVM_DEV_IRQ_GUEST_MSIX:
 		r = assigned_device_enable_guest_msix(kvm, dev, irq);
 		break;
-#endif
 	default:
 		r = -EINVAL;
 	}
@@ -826,7 +842,6 @@ static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
 }
-#ifdef __KVM_HAVE_MSIX
 static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
 				    struct kvm_assigned_msix_nr *entry_nr)
 {
@@ -906,7 +921,6 @@ static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
 	return r;
 }
-#endif
 static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
 				struct kvm_assigned_pci_dev *assigned_dev)
@@ -1012,7 +1026,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
 			goto out;
 		break;
 	}
-#ifdef __KVM_HAVE_MSIX
 	case KVM_ASSIGN_SET_MSIX_NR: {
 		struct kvm_assigned_msix_nr entry_nr;
 		r = -EFAULT;
@@ -1033,7 +1046,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
 			goto out;
 		break;
 	}
-#endif
 	case KVM_ASSIGN_SET_INTX_MASK: {
 		struct kvm_assigned_pci_dev assigned_dev;


@@ -348,7 +348,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
 		F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
 		F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
-		F(AVX512CD);
+		F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT);
 	/* cpuid 0xD.1.eax */
 	const u32 kvm_supported_word10_x86_features =


@@ -133,4 +133,41 @@ static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu)
 	best = kvm_find_cpuid_entry(vcpu, 7, 0);
 	return best && (best->ebx & bit(X86_FEATURE_MPX));
 }
+
+static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 7, 0);
+	return best && (best->ebx & bit(X86_FEATURE_PCOMMIT));
+}
+
+static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+	return best && (best->edx & bit(X86_FEATURE_RDTSCP));
+}
+
+/*
+ * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3
+ */
+#define BIT_NRIPS	3
+
+static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 0x8000000a, 0);
+
+	/*
+	 * NRIPS is a scattered cpuid feature, so we can't use
+	 * X86_FEATURE_NRIPS here (X86_FEATURE_NRIPS would be bit
+	 * position 8, not 3).
+	 */
+	return best && (best->edx & bit(BIT_NRIPS));
+}
+#undef BIT_NRIPS
 #endif
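The guest_cpuid_has_*() helpers above all follow the same pattern: look up the cached CPUID leaf and test one feature bit in the right output register. A standalone sketch of that pattern against the host's own CPUID (leaf 7, subleaf 0, EBX, as used for the CLWB/PCOMMIT bits above); the cpuid() wrapper and the chosen bit number are assumptions of this example, not code from the patch:

#include <stdint.h>
#include <stdio.h>

/* Minimal CPUID wrapper (x86 only); stands in for the cached kvm_cpuid_entry2. */
static void cpuid(uint32_t leaf, uint32_t subleaf,
                  uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
    __asm__ volatile("cpuid"
                     : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
                     : "a"(leaf), "c"(subleaf));
}

/* Same shape as guest_cpuid_has_pcommit(): one leaf, one bit in EBX. */
static int has_feature_bit(uint32_t leaf, uint32_t subleaf, int ebx_bit)
{
    uint32_t a, b, c, d;
    cpuid(leaf, subleaf, &a, &b, &c, &d);
    return (b >> ebx_bit) & 1;
}

int main(void)
{
    /* CLWB is CPUID.(EAX=07H,ECX=0):EBX bit 24. */
    printf("CLWB supported on this host: %d\n", has_feature_bit(7, 0, 24));
    return 0;
}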


@@ -2272,8 +2272,8 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
 #define GET_SMSTATE(type, smbase, offset)				  \
 	({								  \
	 type __val;							  \
-	 int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val,	  \
-				     sizeof(__val), NULL);		  \
+	 int r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val,	  \
+				      sizeof(__val));			  \
	 if (r != X86EMUL_CONTINUE)					  \
		 return X86EMUL_UNHANDLEABLE;				  \
	 __val;								  \
@@ -2484,17 +2484,36 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
 	/*
 	 * Get back to real mode, to prepare a safe state in which to load
-	 * CR0/CR3/CR4/EFER.  Also this will ensure that addresses passed
-	 * to read_std/write_std are not virtual.
-	 *
-	 * CR4.PCIDE must be zero, because it is a 64-bit mode only feature.
+	 * CR0/CR3/CR4/EFER.  It's all a bit more complicated if the vCPU
+	 * supports long mode.
 	 */
+	cr4 = ctxt->ops->get_cr(ctxt, 4);
+	if (emulator_has_longmode(ctxt)) {
+		struct desc_struct cs_desc;
+
+		/* Zero CR4.PCIDE before CR0.PG. */
+		if (cr4 & X86_CR4_PCIDE) {
+			ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
+			cr4 &= ~X86_CR4_PCIDE;
+		}
+
+		/* A 32-bit code segment is required to clear EFER.LMA. */
+		memset(&cs_desc, 0, sizeof(cs_desc));
+		cs_desc.type = 0xb;
+		cs_desc.s = cs_desc.g = cs_desc.p = 1;
+		ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS);
+	}
+
+	/* For the 64-bit case, this will clear EFER.LMA. */
 	cr0 = ctxt->ops->get_cr(ctxt, 0);
 	if (cr0 & X86_CR0_PE)
 		ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
-	cr4 = ctxt->ops->get_cr(ctxt, 4);
+
+	/* Now clear CR4.PAE (which must be done before clearing EFER.LME). */
 	if (cr4 & X86_CR4_PAE)
 		ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
+
+	/* And finally go back to 32-bit mode. */
 	efer = 0;
 	ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
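The em_rsm() change above is mostly about ordering: PCIDE has to go before paging is disabled, a 32-bit code segment is loaded so LMA can drop, PAE is cleared before LME, and only then is EFER zeroed. A toy model of that sequence, with the architectural bit values spelled out and assert() documenting the first constraint; the struct regs type is invented for this sketch and is not the emulator context:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define CR0_PE    (1u << 0)
#define CR0_PG    (1u << 31)
#define CR4_PAE   (1u << 5)
#define CR4_PCIDE (1u << 17)
#define EFER_LME  (1u << 8)
#define EFER_LMA  (1u << 10)

/* Toy register file standing in for the emulator context. */
struct regs { uint32_t cr0, cr4, efer; };

/*
 * Same ordering as the em_rsm() hunk above: PCIDE off before PG,
 * PG/PE off (which drops LMA), PAE off, and only then EFER cleared.
 */
static void leave_long_mode(struct regs *r)
{
    /* CR0.PG may not be cleared while CR4.PCIDE is still set. */
    r->cr4 &= ~CR4_PCIDE;
    assert(!(r->cr4 & CR4_PCIDE));

    r->cr0 &= ~(CR0_PG | CR0_PE);
    r->efer &= ~EFER_LMA;          /* hardware drops LMA when paging goes away */

    /* PAE has to go before LME is cleared. */
    r->cr4 &= ~CR4_PAE;
    r->efer = 0;
}

int main(void)
{
    struct regs r = { CR0_PE | CR0_PG, CR4_PAE | CR4_PCIDE, EFER_LME | EFER_LMA };
    leave_long_mode(&r);
    printf("cr0=%#x cr4=%#x efer=%#x\n", r.cr0, r.cr4, r.efer);
    return 0;
}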
@@ -4455,7 +4474,7 @@ static const struct opcode twobyte_table[256] = {
 	F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
 	/* 0xA8 - 0xAF */
 	I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
-	II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm),
+	II(EmulateOnUD | ImplicitOps, em_rsm, rsm),
 	F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
 	F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
 	F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),


@@ -41,6 +41,7 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
 	case HV_X64_MSR_TIME_REF_COUNT:
 	case HV_X64_MSR_CRASH_CTL:
 	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+	case HV_X64_MSR_RESET:
 		r = true;
 		break;
 	}
@@ -163,6 +164,12 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 					 data);
 	case HV_X64_MSR_CRASH_CTL:
 		return kvm_hv_msr_set_crash_ctl(vcpu, data, host);
+	case HV_X64_MSR_RESET:
+		if (data == 1) {
+			vcpu_debug(vcpu, "hyper-v reset requested\n");
+			kvm_make_request(KVM_REQ_HV_RESET, vcpu);
+		}
+		break;
 	default:
 		vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
 			    msr, data);
@@ -171,7 +178,16 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 	return 0;
 }
-static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+/* Calculate cpu time spent by current task in 100ns units */
+static u64 current_task_runtime_100ns(void)
+{
+	cputime_t utime, stime;
+
+	task_cputime_adjusted(current, &utime, &stime);
+	return div_u64(cputime_to_nsecs(utime + stime), 100);
+}
+
+static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 {
 	struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
@@ -205,6 +221,11 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
 	case HV_X64_MSR_TPR:
 		return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
+	case HV_X64_MSR_VP_RUNTIME:
+		if (!host)
+			return 1;
+		hv->runtime_offset = data - current_task_runtime_100ns();
+		break;
 	default:
 		vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
 			    msr, data);
@@ -241,6 +262,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 						 pdata);
 	case HV_X64_MSR_CRASH_CTL:
 		return kvm_hv_msr_get_crash_ctl(vcpu, pdata);
+	case HV_X64_MSR_RESET:
+		data = 0;
+		break;
 	default:
 		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
 		return 1;
@@ -277,6 +301,9 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case HV_X64_MSR_APIC_ASSIST_PAGE:
 		data = hv->hv_vapic;
 		break;
+	case HV_X64_MSR_VP_RUNTIME:
+		data = current_task_runtime_100ns() + hv->runtime_offset;
+		break;
 	default:
 		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
 		return 1;
@@ -295,7 +322,7 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 		mutex_unlock(&vcpu->kvm->lock);
 		return r;
 	} else
-		return kvm_hv_set_msr(vcpu, msr, data);
+		return kvm_hv_set_msr(vcpu, msr, data, host);
 }
 int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
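HV_X64_MSR_VP_RUNTIME above reports accumulated task CPU time in 100 ns units, with a host-writable offset (hv->runtime_offset) folded in so migration can preserve the value. The conversion itself is a plain division; a self-contained illustration with made-up utime/stime inputs:

#include <stdint.h>
#include <stdio.h>

/* Convert nanoseconds of consumed CPU time into Hyper-V's 100 ns units. */
static uint64_t runtime_100ns(uint64_t consumed_ns)
{
    return consumed_ns / 100;
}

int main(void)
{
    int64_t runtime_offset = 0;            /* host-written bias, like hv->runtime_offset */
    uint64_t utime_ns = 1500000000ULL;     /* 1.5 s of user time */
    uint64_t stime_ns = 500000000ULL;      /* 0.5 s of system time */

    uint64_t msr_value = runtime_100ns(utime_ns + stime_ns) + runtime_offset;
    printf("VP_RUNTIME reads as %llu (x 100 ns)\n", (unsigned long long)msr_value);
    return 0;
}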


@@ -35,6 +35,7 @@
 #include <linux/kvm_host.h>
 #include <linux/slab.h>
+#include "ioapic.h"
 #include "irq.h"
 #include "i8254.h"
 #include "x86.h"
@@ -333,7 +334,8 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
 	struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
 	s64 interval;
-	if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
+	if (!ioapic_in_kernel(kvm) ||
+	    ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
 		return;
 	interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);


@@ -233,21 +233,7 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
 }
-static void update_handled_vectors(struct kvm_ioapic *ioapic)
-{
-	DECLARE_BITMAP(handled_vectors, 256);
-	int i;
-
-	memset(handled_vectors, 0, sizeof(handled_vectors));
-	for (i = 0; i < IOAPIC_NUM_PINS; ++i)
-		__set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors);
-	memcpy(ioapic->handled_vectors, handled_vectors,
-	       sizeof(handled_vectors));
-	smp_wmb();
-}
-
-void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
-			   u32 *tmr)
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
 {
 	struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
 	union kvm_ioapic_redirect_entry *e;
@@ -260,13 +246,11 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
 		    kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) ||
 		    index == RTC_GSI) {
 			if (kvm_apic_match_dest(vcpu, NULL, 0,
-				e->fields.dest_id, e->fields.dest_mode)) {
+				e->fields.dest_id, e->fields.dest_mode) ||
+			    (e->fields.trig_mode == IOAPIC_EDGE_TRIG &&
+			     kvm_apic_pending_eoi(vcpu, e->fields.vector)))
 				__set_bit(e->fields.vector,
 					  (unsigned long *)eoi_exit_bitmap);
-				if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG)
-					__set_bit(e->fields.vector,
-						  (unsigned long *)tmr);
-			}
 		}
 	}
 	spin_unlock(&ioapic->lock);
@@ -315,7 +299,6 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
 		e->bits |= (u32) val;
 		e->fields.remote_irr = 0;
 	}
-	update_handled_vectors(ioapic);
 	mask_after = e->fields.mask;
 	if (mask_before != mask_after)
 		kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
@@ -599,7 +582,6 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
 	ioapic->id = 0;
 	memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
 	rtc_irq_eoi_tracking_reset(ioapic);
-	update_handled_vectors(ioapic);
 }
 static const struct kvm_io_device_ops ioapic_mmio_ops = {
@@ -628,8 +610,10 @@ int kvm_ioapic_init(struct kvm *kvm)
 	if (ret < 0) {
 		kvm->arch.vioapic = NULL;
 		kfree(ioapic);
+		return ret;
 	}
+	kvm_vcpu_request_scan_ioapic(kvm);
 	return ret;
 }
@@ -666,7 +650,6 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
 	memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
 	ioapic->irr = 0;
 	ioapic->irr_delivered = 0;
-	update_handled_vectors(ioapic);
 	kvm_vcpu_request_scan_ioapic(kvm);
 	kvm_ioapic_inject_all(ioapic, state->irr);
 	spin_unlock(&ioapic->lock);


@@ -9,6 +9,7 @@ struct kvm;
 struct kvm_vcpu;
 #define IOAPIC_NUM_PINS  KVM_IOAPIC_NUM_PINS
+#define MAX_NR_RESERVED_IOAPIC_PINS  KVM_MAX_IRQ_ROUTES
 #define IOAPIC_VERSION_ID 0x11	/* IOAPIC version */
 #define IOAPIC_EDGE_TRIG  0
 #define IOAPIC_LEVEL_TRIG 1
@@ -73,7 +74,6 @@ struct kvm_ioapic {
 	struct kvm *kvm;
 	void (*ack_notifier)(void *opaque, int irq);
 	spinlock_t lock;
-	DECLARE_BITMAP(handled_vectors, 256);
 	struct rtc_status rtc_status;
 	struct delayed_work eoi_inject;
 	u32 irq_eoi[IOAPIC_NUM_PINS];
@@ -98,11 +98,12 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
 	return kvm->arch.vioapic;
 }
-static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
+static inline int ioapic_in_kernel(struct kvm *kvm)
 {
-	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
-	smp_rmb();
-	return test_bit(vector, ioapic->handled_vectors);
+	int ret;
+
+	ret = (ioapic_irqchip(kvm) != NULL);
+	return ret;
 }
 void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
@@ -120,7 +121,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 		struct kvm_lapic_irq *irq, unsigned long *dest_map);
 int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
 int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
-void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
-			   u32 *tmr);
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
 #endif


@@ -37,15 +37,28 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
+/*
+ * check if there is a pending userspace external interrupt
+ */
+static int pending_userspace_extint(struct kvm_vcpu *v)
+{
+	return v->arch.pending_external_vector != -1;
+}
+
 /*
  * check if there is pending interrupt from
  * non-APIC source without intack.
  */
 static int kvm_cpu_has_extint(struct kvm_vcpu *v)
 {
-	if (kvm_apic_accept_pic_intr(v))
-		return pic_irqchip(v->kvm)->output;	/* PIC */
-	else
+	u8 accept = kvm_apic_accept_pic_intr(v);
+
+	if (accept) {
+		if (irqchip_split(v->kvm))
+			return pending_userspace_extint(v);
+		else
+			return pic_irqchip(v->kvm)->output;
+	} else
 		return 0;
 }
@@ -57,13 +70,13 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v)
 */
 int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
 {
-	if (!irqchip_in_kernel(v->kvm))
+	if (!lapic_in_kernel(v))
 		return v->arch.interrupt.pending;
 	if (kvm_cpu_has_extint(v))
 		return 1;
-	if (kvm_apic_vid_enabled(v->kvm))
+	if (kvm_vcpu_apic_vid_enabled(v))
 		return 0;
 	return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
@@ -75,7 +88,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
 */
 int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 {
-	if (!irqchip_in_kernel(v->kvm))
+	if (!lapic_in_kernel(v))
 		return v->arch.interrupt.pending;
 	if (kvm_cpu_has_extint(v))
@@ -91,9 +104,16 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
 */
 static int kvm_cpu_get_extint(struct kvm_vcpu *v)
 {
-	if (kvm_cpu_has_extint(v))
-		return kvm_pic_read_irq(v->kvm); /* PIC */
-	return -1;
+	if (kvm_cpu_has_extint(v)) {
+		if (irqchip_split(v->kvm)) {
+			int vector = v->arch.pending_external_vector;
+
+			v->arch.pending_external_vector = -1;
+			return vector;
+		} else
+			return kvm_pic_read_irq(v->kvm); /* PIC */
+	} else
+		return -1;
 }
 /*
@@ -103,7 +123,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 {
 	int vector;
-	if (!irqchip_in_kernel(v->kvm))
+	if (!lapic_in_kernel(v))
 		return v->arch.interrupt.nr;
 	vector = kvm_cpu_get_extint(v);


@@ -83,13 +83,38 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
 	return kvm->arch.vpic;
 }
+static inline int pic_in_kernel(struct kvm *kvm)
+{
+	int ret;
+
+	ret = (pic_irqchip(kvm) != NULL);
+	return ret;
+}
+
+static inline int irqchip_split(struct kvm *kvm)
+{
+	return kvm->arch.irqchip_split;
+}
+
 static inline int irqchip_in_kernel(struct kvm *kvm)
 {
 	struct kvm_pic *vpic = pic_irqchip(kvm);
+	bool ret;
+
+	ret = (vpic != NULL);
+	ret |= irqchip_split(kvm);
 	/* Read vpic before kvm->irq_routing. */
 	smp_rmb();
-	return vpic != NULL;
+	return ret;
 }
+
+static inline int lapic_in_kernel(struct kvm_vcpu *vcpu)
+{
+	/* Same as irqchip_in_kernel(vcpu->kvm), but with less
+	 * pointer chasing and no unnecessary memory barriers.
+	 */
+	return vcpu->arch.apic != NULL;
+}
 void kvm_pic_reset(struct kvm_kpic_state *s);
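With the split-irqchip option there are now three configurations: no in-kernel irqchip, LAPIC-only in the kernel (split), and the full PIC+IOAPIC+LAPIC model. The predicates above distinguish them; the following is a compact standalone model of the same relationships, not the kernel code itself (the enum and struct vm are illustrative):

#include <stdbool.h>
#include <stdio.h>

enum irqchip_mode { IRQCHIP_NONE, IRQCHIP_SPLIT, IRQCHIP_FULL };

struct vm { enum irqchip_mode mode; };

/* Like irqchip_split(): only the local APICs live in the kernel. */
static bool vm_irqchip_split(const struct vm *vm)    { return vm->mode == IRQCHIP_SPLIT; }

/* Like pic_in_kernel()/ioapic_in_kernel(): only true for the full model. */
static bool vm_pic_in_kernel(const struct vm *vm)    { return vm->mode == IRQCHIP_FULL; }
static bool vm_ioapic_in_kernel(const struct vm *vm) { return vm->mode == IRQCHIP_FULL; }

/* Like irqchip_in_kernel(): true for both split and full. */
static bool vm_irqchip_in_kernel(const struct vm *vm)
{
    return vm->mode != IRQCHIP_NONE;
}

int main(void)
{
    struct vm split = { IRQCHIP_SPLIT };
    printf("split: irqchip_in_kernel=%d ioapic_in_kernel=%d pic_in_kernel=%d split=%d\n",
           vm_irqchip_in_kernel(&split), vm_ioapic_in_kernel(&split),
           vm_pic_in_kernel(&split), vm_irqchip_split(&split));
    return 0;
}

The practical consequence, visible throughout the x86 hunks, is that call sites which really only care about the LAPIC now test lapic_in_kernel(), while PIT/IOAPIC paths test ioapic_in_kernel() instead of the catch-all irqchip_in_kernel().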


@@ -91,8 +91,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 	return r;
 }
-static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
-				   struct kvm_lapic_irq *irq)
+void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+		     struct kvm_lapic_irq *irq)
 {
 	trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
@@ -108,6 +108,7 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
 	irq->level = 1;
 	irq->shorthand = 0;
 }
+EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
 int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 		struct kvm *kvm, int irq_source_id, int level, bool line_status)
@@ -123,12 +124,16 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 }
-static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e,
-				struct kvm *kvm)
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
+			      struct kvm *kvm, int irq_source_id, int level,
+			      bool line_status)
 {
 	struct kvm_lapic_irq irq;
 	int r;
+	if (unlikely(e->type != KVM_IRQ_ROUTING_MSI))
+		return -EWOULDBLOCK;
+
 	kvm_set_msi_irq(e, &irq);
 	if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
@@ -137,42 +142,6 @@ static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e,
 	return -EWOULDBLOCK;
 }
-/*
- * Deliver an IRQ in an atomic context if we can, or return a failure,
- * user can retry in a process context.
- * Return value:
- *  -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
- *  Other values - No need to retry.
- */
-int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
-{
-	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
-	struct kvm_kernel_irq_routing_entry *e;
-	int ret = -EINVAL;
-	int idx;
-
-	trace_kvm_set_irq(irq, level, irq_source_id);
-
-	/*
-	 * Injection into either PIC or IOAPIC might need to scan all CPUs,
-	 * which would need to be retried from thread context; when same GSI
-	 * is connected to both PIC and IOAPIC, we'd have to report a
-	 * partial failure here.
-	 * Since there's no easy way to do this, we only support injecting MSI
-	 * which is limited to 1:1 GSI mapping.
-	 */
-	idx = srcu_read_lock(&kvm->irq_srcu);
-	if (kvm_irq_map_gsi(kvm, entries, irq) > 0) {
-		e = &entries[0];
-		if (likely(e->type == KVM_IRQ_ROUTING_MSI))
-			ret = kvm_set_msi_inatomic(e, kvm);
-		else
-			ret = -EWOULDBLOCK;
-	}
-	srcu_read_unlock(&kvm->irq_srcu, idx);
-	return ret;
-}
-
 int kvm_request_irq_source_id(struct kvm *kvm)
 {
 	unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
@@ -208,7 +177,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
 		goto unlock;
 	}
 	clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
-	if (!irqchip_in_kernel(kvm))
+	if (!ioapic_in_kernel(kvm))
 		goto unlock;
 	kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
@@ -297,6 +266,33 @@ int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
 	return r;
 }
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+			     struct kvm_vcpu **dest_vcpu)
+{
+	int i, r = 0;
+	struct kvm_vcpu *vcpu;
+
+	if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
+		return true;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (!kvm_apic_present(vcpu))
+			continue;
+
+		if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
+					irq->dest_id, irq->dest_mode))
+			continue;
+
+		if (++r == 2)
+			return false;
+
+		*dest_vcpu = vcpu;
+	}
+
+	return r == 1;
+}
+EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
 #define IOAPIC_ROUTING_ENTRY(irq) \
 	{ .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
 	  .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
@@ -328,3 +324,54 @@ int kvm_setup_default_irq_routing(struct kvm *kvm)
 	return kvm_set_irq_routing(kvm, default_routing,
 				   ARRAY_SIZE(default_routing), 0);
 }
+
+static const struct kvm_irq_routing_entry empty_routing[] = {};
+
+int kvm_setup_empty_irq_routing(struct kvm *kvm)
+{
+	return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
+}
+
+void kvm_arch_irq_routing_update(struct kvm *kvm)
+{
+	if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm))
+		return;
+	kvm_make_scan_ioapic_request(kvm);
+}
+
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_kernel_irq_routing_entry *entry;
+	struct kvm_irq_routing_table *table;
+	u32 i, nr_ioapic_pins;
+	int idx;
+
+	/* kvm->irq_routing must be read after clearing
+	 * KVM_SCAN_IOAPIC. */
+	smp_mb();
+	idx = srcu_read_lock(&kvm->irq_srcu);
+	table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+	nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
+			       kvm->arch.nr_reserved_ioapic_pins);
+	for (i = 0; i < nr_ioapic_pins; ++i) {
+		hlist_for_each_entry(entry, &table->map[i], link) {
+			u32 dest_id, dest_mode;
+			bool level;
+
+			if (entry->type != KVM_IRQ_ROUTING_MSI)
+				continue;
+			dest_id = (entry->msi.address_lo >> 12) & 0xff;
+			dest_mode = (entry->msi.address_lo >> 2) & 0x1;
+			level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL;
+			if (level && kvm_apic_match_dest(vcpu, NULL, 0,
+						dest_id, dest_mode)) {
+				u32 vector = entry->msi.data & 0xff;
+
+				__set_bit(vector,
+					  (unsigned long *) eoi_exit_bitmap);
+			}
+		}
+	}
+	srcu_read_unlock(&kvm->irq_srcu, idx);
+}
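kvm_scan_ioapic_routes() above pulls the destination ID, destination mode, trigger mode and vector directly out of the MSI address and data words; the bit positions it uses follow the standard MSI layout. A self-contained decoder over the same fields, useful for checking routing entries by hand (struct msi_fields is invented for this sketch):

#include <stdint.h>
#include <stdio.h>

struct msi_fields {
    uint32_t dest_id;    /* address_lo bits 19:12 */
    uint32_t dest_mode;  /* address_lo bit 2: 0 = physical, 1 = logical */
    uint32_t level;      /* data bit 15: trigger mode */
    uint32_t vector;     /* data bits 7:0 */
};

static struct msi_fields decode_msi(uint32_t address_lo, uint32_t data)
{
    struct msi_fields f;
    f.dest_id   = (address_lo >> 12) & 0xff;
    f.dest_mode = (address_lo >> 2) & 0x1;
    f.level     = (data >> 15) & 0x1;
    f.vector    = data & 0xff;
    return f;
}

int main(void)
{
    /* 0xfee01000: dest_id 0x01, physical; data 0x8031: level-triggered, vector 0x31. */
    struct msi_fields f = decode_msi(0xfee01000u, 0x8031u);
    printf("dest_id=%#x dest_mode=%u level=%u vector=%#x\n",
           f.dest_id, f.dest_mode, f.level, f.vector);
    return 0;
}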


@ -209,7 +209,7 @@ static void recalculate_apic_map(struct kvm *kvm)
if (old) if (old)
kfree_rcu(old, rcu); kfree_rcu(old, rcu);
kvm_vcpu_request_scan_ioapic(kvm); kvm_make_scan_ioapic_request(kvm);
} }
static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
@ -348,6 +348,8 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_lapic *apic = vcpu->arch.apic;
__kvm_apic_update_irr(pir, apic->regs); __kvm_apic_update_irr(pir, apic->regs);
kvm_make_request(KVM_REQ_EVENT, vcpu);
} }
EXPORT_SYMBOL_GPL(kvm_apic_update_irr); EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
@ -390,7 +392,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
vcpu = apic->vcpu; vcpu = apic->vcpu;
if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) { if (unlikely(kvm_vcpu_apic_vid_enabled(vcpu))) {
/* try to update RVI */ /* try to update RVI */
apic_clear_vector(vec, apic->regs + APIC_IRR); apic_clear_vector(vec, apic->regs + APIC_IRR);
kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_make_request(KVM_REQ_EVENT, vcpu);
@ -551,15 +553,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
} }
void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr)
{
struct kvm_lapic *apic = vcpu->arch.apic;
int i;
for (i = 0; i < 8; i++)
apic_set_reg(apic, APIC_TMR + 0x10 * i, tmr[i]);
}
static void apic_update_ppr(struct kvm_lapic *apic) static void apic_update_ppr(struct kvm_lapic *apic)
{ {
u32 tpr, isrv, ppr, old_ppr; u32 tpr, isrv, ppr, old_ppr;
@ -764,6 +757,65 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
return ret; return ret;
} }
bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu)
{
struct kvm_apic_map *map;
bool ret = false;
struct kvm_lapic *dst = NULL;
if (irq->shorthand)
return false;
rcu_read_lock();
map = rcu_dereference(kvm->arch.apic_map);
if (!map)
goto out;
if (irq->dest_mode == APIC_DEST_PHYSICAL) {
if (irq->dest_id == 0xFF)
goto out;
if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
goto out;
dst = map->phys_map[irq->dest_id];
if (dst && kvm_apic_present(dst->vcpu))
*dest_vcpu = dst->vcpu;
else
goto out;
} else {
u16 cid;
unsigned long bitmap = 1;
int i, r = 0;
if (!kvm_apic_logical_map_valid(map))
goto out;
apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
if (cid >= ARRAY_SIZE(map->logical_map))
goto out;
for_each_set_bit(i, &bitmap, 16) {
dst = map->logical_map[cid][i];
if (++r == 2)
goto out;
}
if (dst && kvm_apic_present(dst->vcpu))
*dest_vcpu = dst->vcpu;
else
goto out;
}
ret = true;
out:
rcu_read_unlock();
return ret;
}
/* /*
* Add a pending IRQ into lapic. * Add a pending IRQ into lapic.
* Return 1 if successfully added and 0 if discarded. * Return 1 if successfully added and 0 if discarded.
@ -781,6 +833,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
case APIC_DM_LOWEST: case APIC_DM_LOWEST:
vcpu->arch.apic_arb_prio++; vcpu->arch.apic_arb_prio++;
case APIC_DM_FIXED: case APIC_DM_FIXED:
if (unlikely(trig_mode && !level))
break;
/* FIXME add logic for vcpu on reset */ /* FIXME add logic for vcpu on reset */
if (unlikely(!apic_enabled(apic))) if (unlikely(!apic_enabled(apic)))
break; break;
@ -790,6 +845,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
if (dest_map) if (dest_map)
__set_bit(vcpu->vcpu_id, dest_map); __set_bit(vcpu->vcpu_id, dest_map);
if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
if (trig_mode)
apic_set_vector(vector, apic->regs + APIC_TMR);
else
apic_clear_vector(vector, apic->regs + APIC_TMR);
}
if (kvm_x86_ops->deliver_posted_interrupt) if (kvm_x86_ops->deliver_posted_interrupt)
kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
else { else {
@ -868,16 +930,32 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
} }
static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
{
return test_bit(vector, (ulong *)apic->vcpu->arch.eoi_exit_bitmap);
}
static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
{ {
if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { int trigger_mode;
int trigger_mode;
if (apic_test_vector(vector, apic->regs + APIC_TMR)) /* Eoi the ioapic only if the ioapic doesn't own the vector. */
trigger_mode = IOAPIC_LEVEL_TRIG; if (!kvm_ioapic_handles_vector(apic, vector))
else return;
trigger_mode = IOAPIC_EDGE_TRIG;
kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); /* Request a KVM exit to inform the userspace IOAPIC. */
if (irqchip_split(apic->vcpu->kvm)) {
apic->vcpu->arch.pending_ioapic_eoi = vector;
kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu);
return;
} }
if (apic_test_vector(vector, apic->regs + APIC_TMR))
trigger_mode = IOAPIC_LEVEL_TRIG;
else
trigger_mode = IOAPIC_EDGE_TRIG;
kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
} }
static int apic_set_eoi(struct kvm_lapic *apic)
@@ -1615,7 +1693,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
apic->irr_pending = kvm_vcpu_apic_vid_enabled(vcpu);
apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0;
apic->highest_isr_cache = -1;
update_divide_count(apic);
@@ -1838,7 +1916,10 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
apic_find_highest_isr(apic));
kvm_make_request(KVM_REQ_EVENT, vcpu);
if (ioapic_in_kernel(vcpu->kvm))
kvm_rtc_eoi_tracking_restore_one(vcpu);
vcpu->arch.apic_arb_prio = 0;
}
void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
@@ -1922,7 +2003,7 @@ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
/* Cache not set: could be safe but we don't bother. */
apic->highest_isr_cache == -1 ||
/* Need EOI to update ioapic. */
kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) {
/*
* PV EOI was disabled by apic_sync_pv_eoi_from_guest
* so we need not do anything here.
@@ -1978,7 +2059,7 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
struct kvm_lapic *apic = vcpu->arch.apic;
u32 reg = (msr - APIC_BASE_MSR) << 4;
if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
return 1;
if (reg == APIC_ICR2)
@@ -1995,7 +2076,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
struct kvm_lapic *apic = vcpu->arch.apic;
u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
return 1;
if (reg == APIC_DFR || reg == APIC_ICR2) {


@@ -57,7 +57,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
void kvm_apic_set_version(struct kvm_vcpu *vcpu);
void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
void __kvm_apic_update_irr(u32 *pir, void *regs);
void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
@@ -144,9 +143,9 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic)
return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
}
static inline bool kvm_vcpu_apic_vid_enabled(struct kvm_vcpu *vcpu)
{
return kvm_x86_ops->cpu_uses_apicv(vcpu);
}
static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
@@ -169,4 +168,6 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
void wait_lapic_expire(struct kvm_vcpu *vcpu);
bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);
#endif


@@ -818,14 +818,11 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm->arch.indirect_shadow_pages--;
}
static int __has_wrprotected_page(gfn_t gfn, int level,
struct kvm_memory_slot *slot)
{
struct kvm_lpage_info *linfo;
if (slot) {
linfo = lpage_info_slot(gfn, slot, level);
return linfo->write_count;
@@ -834,6 +831,14 @@ static int has_wrprotected_page(struct kvm_vcpu *vcpu,
return 1;
}
static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
{
struct kvm_memory_slot *slot;
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
return __has_wrprotected_page(gfn, level, slot);
}
static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
{
unsigned long page_size;
@@ -851,6 +856,17 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
return ret;
}
static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
bool no_dirty_log)
{
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
return false;
if (no_dirty_log && slot->dirty_bitmap)
return false;
return true;
}
static struct kvm_memory_slot *
gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
bool no_dirty_log)
@@ -858,21 +874,25 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
struct kvm_memory_slot *slot;
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
if (!memslot_valid_for_gpte(slot, no_dirty_log))
slot = NULL;
return slot;
}
static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
}
static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
bool *force_pt_level)
{
int host_level, level, max_level;
struct kvm_memory_slot *slot;
if (unlikely(*force_pt_level))
return PT_PAGE_TABLE_LEVEL;
slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
*force_pt_level = !memslot_valid_for_gpte(slot, true);
if (unlikely(*force_pt_level))
return PT_PAGE_TABLE_LEVEL;
host_level = host_mapping_level(vcpu->kvm, large_gfn);
@@ -882,7 +902,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
if (__has_wrprotected_page(large_gfn, level, slot))
break;
return level - 1;
@@ -2962,14 +2982,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
{
int r;
int level;
bool force_pt_level = false;
pfn_t pfn;
unsigned long mmu_seq;
bool map_writable, write = error_code & PFERR_WRITE_MASK;
level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
/*
* This path builds a PAE pagetable - so we can map
* 2mb pages at maximum. Therefore check if the level
@@ -2979,8 +2998,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
level = PT_DIRECTORY_LEVEL;
gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
}
if (fast_page_fault(vcpu, v, level, error_code))
return 0;
@@ -3427,7 +3445,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
static bool can_do_async_pf(struct kvm_vcpu *vcpu)
{
if (unlikely(!lapic_in_kernel(vcpu) ||
kvm_event_needs_reinjection(vcpu)))
return false;
@@ -3476,7 +3494,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
pfn_t pfn;
int r;
int level;
bool force_pt_level;
gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
int write = error_code & PFERR_WRITE_MASK;
@@ -3495,20 +3513,15 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
if (r)
return r;
force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
PT_DIRECTORY_LEVEL);
level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
if (level > PT_DIRECTORY_LEVEL &&
!check_hugepage_cache_consistency(vcpu, gfn, level))
level = PT_DIRECTORY_LEVEL;
gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
}
if (fast_page_fault(vcpu, gpa, level, error_code))
return 0;
@@ -3706,7 +3719,7 @@ static void
__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
int maxphyaddr, bool execonly)
{
u64 bad_mt_xwr;
rsvd_check->rsvd_bits_mask[0][3] =
rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
@@ -3724,14 +3737,16 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
bad_mt_xwr = 0xFFull << (2 * 8);	/* bits 3..5 must not be 2 */
bad_mt_xwr |= 0xFFull << (3 * 8);	/* bits 3..5 must not be 3 */
bad_mt_xwr |= 0xFFull << (7 * 8);	/* bits 3..5 must not be 7 */
bad_mt_xwr |= REPEAT_BYTE(1ull << 2);	/* bits 0..2 must not be 010 */
bad_mt_xwr |= REPEAT_BYTE(1ull << 6);	/* bits 0..2 must not be 110 */
if (!execonly) {
/* bits 0..2 must not be 100 unless VMX capabilities allow it */
bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
}
rsvd_check->bad_mt_xwr = bad_mt_xwr;
}
static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,


@@ -698,7 +698,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
int r;
pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
bool force_pt_level = false;
unsigned long mmu_seq;
bool map_writable, is_self_change_mapping;
@@ -743,15 +743,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
&walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) {
level = mapping_level(vcpu, walker.gfn, &force_pt_level);
if (likely(!force_pt_level)) {
level = min(walker.level, level);
walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
}
} else
force_pt_level = true;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();


@@ -159,6 +159,9 @@ struct vcpu_svm {
u32 apf_reason;
u64 tsc_ratio;
/* cached guest cpuid flags for faster access */
bool nrips_enabled : 1;
};
static DEFINE_PER_CPU(u64, current_tsc_ratio);
@@ -1086,7 +1089,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
return target_tsc - tsc;
}
static void init_vmcb(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
@@ -1157,8 +1160,7 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event)
init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
svm_set_efer(&svm->vcpu, 0);
save->dr6 = 0xffff0ff0;
kvm_set_rflags(&svm->vcpu, 2);
save->rip = 0x0000fff0;
@@ -1212,7 +1214,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
}
init_vmcb(svm);
kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
@@ -1268,7 +1270,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
clear_page(svm->vmcb);
svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
svm->asid_generation = 0;
init_vmcb(svm);
svm_init_osvw(&svm->vcpu);
@@ -1890,7 +1892,7 @@ static int shutdown_interception(struct vcpu_svm *svm)
* so reinitialize it.
*/
clear_page(svm->vmcb);
init_vmcb(svm);
kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
return 0;
@@ -2365,7 +2367,9 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
if (svm->nrips_enabled)
nested_vmcb->control.next_rip = vmcb->control.next_rip;
/*
* If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
@@ -3060,7 +3064,7 @@ static int cr8_write_interception(struct vcpu_svm *svm)
u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
/* instruction emulation calls kvm_set_cr8() */
r = cr_interception(svm);
if (lapic_in_kernel(&svm->vcpu))
return r;
if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
return r;
@@ -3294,24 +3298,11 @@ static int msr_interception(struct vcpu_svm *svm)
static int interrupt_window_interception(struct vcpu_svm *svm)
{
struct kvm_run *kvm_run = svm->vcpu.run;
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
svm_clear_vintr(svm);
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
mark_dirty(svm->vmcb, VMCB_INTR);
++svm->vcpu.stat.irq_window_exits;
/*
* If the user space waits to inject interrupts, exit as soon as
* possible
*/
if (!irqchip_in_kernel(svm->vcpu.kvm) &&
kvm_run->request_interrupt_window &&
!kvm_cpu_has_interrupt(&svm->vcpu)) {
kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
return 0;
}
return 1;
}
@@ -3659,12 +3650,12 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
return;
}
static int svm_cpu_uses_apicv(struct kvm_vcpu *vcpu)
{
return 0;
}
static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu)
{
return;
}
@@ -4098,6 +4089,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
static void svm_cpuid_update(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
/* Update nrips enabled cache */
svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu);
}
static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -4425,7 +4420,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
.cpu_uses_apicv = svm_cpu_uses_apicv,
.load_eoi_exitmap = svm_load_eoi_exitmap,
.sync_pir_to_irr = svm_sync_pir_to_irr,


@@ -128,6 +128,24 @@ TRACE_EVENT(kvm_pio,
__entry->count > 1 ? "(...)" : "")
);
/*
* Tracepoint for fast mmio.
*/
TRACE_EVENT(kvm_fast_mmio,
TP_PROTO(u64 gpa),
TP_ARGS(gpa),
TP_STRUCT__entry(
__field(u64, gpa)
),
TP_fast_assign(
__entry->gpa = gpa;
),
TP_printk("fast mmio at gpa 0x%llx", __entry->gpa)
);
/*
* Tracepoint for cpuid.
*/
@@ -974,6 +992,39 @@ TRACE_EVENT(kvm_enter_smm,
__entry->smbase)
);
/*
* Tracepoint for VT-d posted-interrupts.
*/
TRACE_EVENT(kvm_pi_irte_update,
TP_PROTO(unsigned int vcpu_id, unsigned int gsi,
unsigned int gvec, u64 pi_desc_addr, bool set),
TP_ARGS(vcpu_id, gsi, gvec, pi_desc_addr, set),
TP_STRUCT__entry(
__field( unsigned int, vcpu_id )
__field( unsigned int, gsi )
__field( unsigned int, gvec )
__field( u64, pi_desc_addr )
__field( bool, set )
),
TP_fast_assign(
__entry->vcpu_id = vcpu_id;
__entry->gsi = gsi;
__entry->gvec = gvec;
__entry->pi_desc_addr = pi_desc_addr;
__entry->set = set;
),
TP_printk("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, "
"gvec: 0x%x, pi_desc_addr: 0x%llx",
__entry->set ? "enabled and being updated" : "disabled",
__entry->vcpu_id,
__entry->gsi,
__entry->gvec,
__entry->pi_desc_addr)
);
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH

File diff suppressed because it is too large.


@@ -51,6 +51,8 @@
#include <linux/pci.h>
#include <linux/timekeeper_internal.h>
#include <linux/pvclock_gtod.h>
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
#include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS
@@ -64,6 +66,7 @@
#include <asm/fpu/internal.h> /* Ugh! */
#include <asm/pvclock.h>
#include <asm/div64.h>
#include <asm/irq_remapping.h>
#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32
@@ -622,7 +625,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if ((cr0 ^ old_cr0) & update_bits)
kvm_mmu_reset_context(vcpu);
if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
return 0;
@@ -789,7 +794,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
if (cr8 & CR8_RESERVED_BITS)
return 1;
if (lapic_in_kernel(vcpu))
kvm_lapic_set_tpr(vcpu, cr8);
else
vcpu->arch.cr8 = cr8;
@@ -799,7 +804,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr8);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
if (lapic_in_kernel(vcpu))
return kvm_lapic_get_cr8(vcpu);
else
return vcpu->arch.cr8;
@@ -953,6 +958,9 @@ static u32 emulated_msrs[] = {
HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
HV_X64_MSR_RESET,
HV_X64_MSR_VP_INDEX,
HV_X64_MSR_VP_RUNTIME,
HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
MSR_KVM_PV_EOI_EN,
@@ -1898,6 +1906,8 @@ static void accumulate_steal_time(struct kvm_vcpu *vcpu)
static void record_steal_time(struct kvm_vcpu *vcpu)
{
accumulate_steal_time(vcpu);
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
@@ -2048,12 +2058,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!(data & KVM_MSR_ENABLED))
break;
vcpu->arch.st.last_steal = current->sched_info.run_delay;
preempt_disable();
accumulate_steal_time(vcpu);
preempt_enable();
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
break;
@@ -2449,6 +2453,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ENABLE_CAP_VM:
case KVM_CAP_DISABLE_QUIRKS:
case KVM_CAP_SET_BOOT_CPU_ID:
case KVM_CAP_SPLIT_IRQCHIP:
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
@@ -2628,7 +2633,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
vcpu->cpu = cpu;
}
accumulate_steal_time(vcpu);
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
}
@@ -2662,12 +2666,24 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
{
if (irq->irq >= KVM_NR_INTERRUPTS)
return -EINVAL;

if (!irqchip_in_kernel(vcpu->kvm)) {
kvm_queue_interrupt(vcpu, irq->irq, false);
kvm_make_request(KVM_REQ_EVENT, vcpu);
return 0;
}

/*
* With in-kernel LAPIC, we only use this to inject EXTINT, so
* fail for in-kernel 8259.
*/
if (pic_in_kernel(vcpu->kvm))
return -ENXIO;

if (vcpu->arch.pending_external_vector != -1)
return -EEXIST;

vcpu->arch.pending_external_vector = irq->irq;
return 0;
}
@@ -3176,7 +3192,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_vapic_addr va;
r = -EINVAL;
if (!lapic_in_kernel(vcpu))
goto out;
r = -EFAULT;
if (copy_from_user(&va, argp, sizeof va))
@@ -3425,41 +3441,35 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return 0;
}
static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return 0;
}
static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
{
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
sizeof(ps->channels));
ps->flags = kvm->arch.vpit->pit_state.flags;
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
memset(&ps->reserved, 0, sizeof(ps->reserved));
return 0;
}
static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
{
int start = 0;
u32 prev_legacy, cur_legacy;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
@@ -3471,7 +3481,7 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
kvm->arch.vpit->pit_state.flags = ps->flags;
kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return 0;
}
static int kvm_vm_ioctl_reinject(struct kvm *kvm,
@@ -3556,6 +3566,28 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
kvm->arch.disabled_quirks = cap->args[0];
r = 0;
break;
case KVM_CAP_SPLIT_IRQCHIP: {
mutex_lock(&kvm->lock);
r = -EINVAL;
if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
goto split_irqchip_unlock;
r = -EEXIST;
if (irqchip_in_kernel(kvm))
goto split_irqchip_unlock;
if (atomic_read(&kvm->online_vcpus))
goto split_irqchip_unlock;
r = kvm_setup_empty_irq_routing(kvm);
if (r)
goto split_irqchip_unlock;
/* Pairs with irqchip_in_kernel. */
smp_wmb();
kvm->arch.irqchip_split = true;
kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
r = 0;
split_irqchip_unlock:
mutex_unlock(&kvm->lock);
break;
}
default:
r = -EINVAL;
break;
@@ -3669,7 +3701,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
r = -ENXIO;
if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
goto get_irqchip_out;
r = kvm_vm_ioctl_get_irqchip(kvm, chip);
if (r)
@@ -3693,7 +3725,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
r = -ENXIO;
if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
goto set_irqchip_out;
r = kvm_vm_ioctl_set_irqchip(kvm, chip);
if (r)
@@ -4060,6 +4092,15 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
}
static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
unsigned long addr, void *val, unsigned int bytes)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
}
int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
gva_t addr, void *val,
unsigned int bytes,
@@ -4795,6 +4836,7 @@ static const struct x86_emulate_ops emulate_ops = {
.write_gpr = emulator_write_gpr,
.read_std = kvm_read_guest_virt_system,
.write_std = kvm_write_guest_virt_system,
.read_phys = kvm_read_guest_phys_system,
.fetch = kvm_fetch_guest_virt,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
@@ -5667,7 +5709,7 @@ void kvm_arch_exit(void)
int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
{
++vcpu->stat.halt_exits;
if (lapic_in_kernel(vcpu)) {
vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
return 1;
} else {
@@ -5774,9 +5816,15 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
*/
static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
{
if (!vcpu->run->request_interrupt_window || pic_in_kernel(vcpu->kvm))
return false;

if (kvm_cpu_has_interrupt(vcpu))
return false;

return (irqchip_split(vcpu->kvm)
? kvm_apic_accept_pic_intr(vcpu)
: kvm_arch_interrupt_allowed(vcpu));
}
static void post_kvm_run_save(struct kvm_vcpu *vcpu)
@@ -5787,13 +5835,17 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
if (!irqchip_in_kernel(vcpu->kvm))
kvm_run->ready_for_interrupt_injection =
kvm_arch_interrupt_allowed(vcpu) &&
!kvm_cpu_has_interrupt(vcpu) &&
!kvm_event_needs_reinjection(vcpu);
else if (!pic_in_kernel(vcpu->kvm))
kvm_run->ready_for_interrupt_injection =
kvm_apic_accept_pic_intr(vcpu) &&
!kvm_cpu_has_interrupt(vcpu);
else
kvm_run->ready_for_interrupt_injection = 1;
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -6144,18 +6196,18 @@ static void process_smi(struct kvm_vcpu *vcpu)
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
if (!kvm_apic_hw_enabled(vcpu->arch.apic))
return;

memset(vcpu->arch.eoi_exit_bitmap, 0, 256 / 8);

if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap);
else {
kvm_x86_ops->sync_pir_to_irr(vcpu);
kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap);
}
kvm_x86_ops->load_eoi_exitmap(vcpu);
}
static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
@@ -6168,7 +6220,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
{
struct page *page = NULL;
if (!lapic_in_kernel(vcpu))
return;
if (!kvm_x86_ops->set_apic_access_page_addr)
@@ -6206,7 +6258,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
bool req_int_win = !lapic_in_kernel(vcpu) &&
vcpu->run->request_interrupt_window;
bool req_immediate_exit = false;
@@ -6258,6 +6310,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_pmu_handle_event(vcpu);
if (kvm_check_request(KVM_REQ_PMI, vcpu))
kvm_pmu_deliver_pmi(vcpu);
if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
if (test_bit(vcpu->arch.pending_ioapic_eoi,
(void *) vcpu->arch.eoi_exit_bitmap)) {
vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
vcpu->run->eoi.vector =
vcpu->arch.pending_ioapic_eoi;
r = 0;
goto out;
}
}
if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
vcpu_scan_ioapic(vcpu);
if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
@@ -6268,6 +6331,26 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
r = 0;
goto out;
}
if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
r = 0;
goto out;
}
}
/*
* KVM_REQ_EVENT is not set when posted interrupts are set by
* VT-d hardware, so we have to update RVI unconditionally.
*/
if (kvm_lapic_enabled(vcpu)) {
/*
* Update architecture specific hints for APIC
* virtual interrupt delivery.
*/
if (kvm_x86_ops->hwapic_irr_update)
kvm_x86_ops->hwapic_irr_update(vcpu,
kvm_lapic_find_highest_irr(vcpu));
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -6286,13 +6369,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_x86_ops->enable_irq_window(vcpu);
if (kvm_lapic_enabled(vcpu)) {
/*
* Update architecture specific hints for APIC
* virtual interrupt delivery.
*/
if (kvm_x86_ops->hwapic_irr_update)
kvm_x86_ops->hwapic_irr_update(vcpu,
kvm_lapic_find_highest_irr(vcpu));
update_cr8_intercept(vcpu);
kvm_lapic_sync_to_vapic(vcpu);
}
@@ -6428,10 +6504,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
if (!kvm_arch_vcpu_runnable(vcpu) &&
(!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);

if (kvm_x86_ops->post_block)
kvm_x86_ops->post_block(vcpu);

if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
return 1;
}
@@ -6468,10 +6549,12 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
for (;;) {
if (kvm_vcpu_running(vcpu)) {
r = vcpu_enter_guest(vcpu);
} else {
r = vcpu_block(kvm, vcpu);
}
if (r <= 0)
break;
@@ -6480,8 +6563,8 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
kvm_inject_pending_timer_irqs(vcpu);
if (dm_request_for_irq_injection(vcpu)) {
r = 0;
vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
++vcpu->stat.request_irq_exits;
break;
}
@@ -6608,7 +6691,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
/* re-sync apic's tpr */
if (!lapic_in_kernel(vcpu)) {
if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
r = -EINVAL;
goto out;
@@ -7308,7 +7391,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
{
return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu);
}
struct static_key kvm_no_apic_vcpu __read_mostly;
@@ -7377,6 +7460,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
kvm_async_pf_hash_reset(vcpu);
kvm_pmu_init(vcpu);
vcpu->arch.pending_external_vector = -1;
return 0;
fail_free_mce_banks:
@@ -7402,7 +7487,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
kvm_mmu_destroy(vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
if (!lapic_in_kernel(vcpu))
static_key_slow_dec(&kvm_no_apic_vcpu);
}
@@ -8029,7 +8114,59 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
struct irq_bypass_producer *prod)
{
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
if (kvm_x86_ops->update_pi_irte) {
irqfd->producer = prod;
return kvm_x86_ops->update_pi_irte(irqfd->kvm,
prod->irq, irqfd->gsi, 1);
}
return -EINVAL;
}
void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
struct irq_bypass_producer *prod)
{
int ret;
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
if (!kvm_x86_ops->update_pi_irte) {
WARN_ON(irqfd->producer != NULL);
return;
}
WARN_ON(irqfd->producer != prod);
irqfd->producer = NULL;
/*
* When the producer or the consumer is unregistered, we change back to
* remapped mode, so we can re-use the current implementation
* when the irq is masked/disabled or the consumer side (KVM
* in this case) doesn't want to receive the interrupts.
*/
ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
if (ret)
printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
" fails: %d\n", irqfd->consumer.token, ret);
}
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set)
{
if (!kvm_x86_ops->update_pi_irte)
return -EINVAL;
return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
}
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
@@ -8044,3 +8181,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);


@@ -63,9 +63,6 @@ enum hv_cpuid_function {
/* Define version of the synthetic interrupt controller. */
#define HV_SYNIC_VERSION (1)
/* Define the expected SynIC version. */
#define HV_SYNIC_VERSION_1 (0x1)
/* Define synthetic interrupt controller message constants. */
#define HV_MESSAGE_SIZE (256)
#define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240)
@@ -105,8 +102,6 @@ enum hv_message_type {
HVMSG_X64_LEGACY_FP_ERROR = 0x80010005
};
/* Define the number of synthetic interrupt sources. */
#define HV_SYNIC_SINT_COUNT (16)
#define HV_SYNIC_STIMER_COUNT (4)
/* Define invalid partition identifier. */


@@ -22,7 +22,7 @@ int irq_remap_broken;
int disable_sourceid_checking;
int no_x2apic_optout;
int disable_irq_post = 0;
static int disable_irq_remap;
static struct irq_remap_ops *remap_ops;
@@ -58,14 +58,18 @@ static __init int setup_irqremap(char *str)
return -EINVAL;
while (*str) {
if (!strncmp(str, "on", 2)) {
disable_irq_remap = 0;
disable_irq_post = 0;
} else if (!strncmp(str, "off", 3)) {
disable_irq_remap = 1;
disable_irq_post = 1;
} else if (!strncmp(str, "nosid", 5))
disable_sourceid_checking = 1;
else if (!strncmp(str, "no_x2apic_optout", 16))
no_x2apic_optout = 1;
else if (!strncmp(str, "nopost", 6))
disable_irq_post = 1;
str += strcspn(str, ",");
while (*str == ',')


@@ -33,3 +33,4 @@ menuconfig VFIO
source "drivers/vfio/pci/Kconfig"
source "drivers/vfio/platform/Kconfig"
source "virt/lib/Kconfig"


@@ -2,6 +2,7 @@ config VFIO_PCI
tristate "VFIO support for PCI devices"
depends on VFIO && PCI && EVENTFD
select VFIO_VIRQFD
select IRQ_BYPASS_MANAGER
help
Support for the PCI VFIO bus driver. This is required to make
use of PCI drivers using the VFIO framework.


@@ -319,6 +319,7 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
if (vdev->ctx[vector].trigger) {
free_irq(irq, vdev->ctx[vector].trigger);
irq_bypass_unregister_producer(&vdev->ctx[vector].producer);
kfree(vdev->ctx[vector].name);
eventfd_ctx_put(vdev->ctx[vector].trigger);
vdev->ctx[vector].trigger = NULL;
@@ -360,6 +361,14 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
return ret;
}
vdev->ctx[vector].producer.token = trigger;
vdev->ctx[vector].producer.irq = irq;
ret = irq_bypass_register_producer(&vdev->ctx[vector].producer);
if (unlikely(ret))
dev_info(&pdev->dev,
"irq bypass producer (token %p) registration fails: %d\n",
vdev->ctx[vector].producer.token, ret);
vdev->ctx[vector].trigger = trigger;
return 0;


@@ -13,6 +13,7 @@
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/irqbypass.h>
#ifndef VFIO_PCI_PRIVATE_H
#define VFIO_PCI_PRIVATE_H
@@ -29,6 +30,7 @@ struct vfio_pci_irq_ctx {
struct virqfd *mask;
char *name;
bool masked;
struct irq_bypass_producer producer;
};
struct vfio_pci_device {


@@ -51,7 +51,7 @@ struct arch_timer_cpu {
bool armed;
/* Timer IRQ */
struct kvm_irq_level irq;
/* VGIC mapping */
struct irq_phys_map *map;
@@ -71,5 +71,7 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
bool kvm_timer_should_fire(struct kvm_vcpu *vcpu);
void kvm_timer_schedule(struct kvm_vcpu *vcpu);
void kvm_timer_unschedule(struct kvm_vcpu *vcpu);
#endif


@@ -112,7 +112,6 @@ struct vgic_vmcr {
struct vgic_ops {
struct vgic_lr (*get_lr)(const struct kvm_vcpu *, int);
void (*set_lr)(struct kvm_vcpu *, int, struct vgic_lr);
void (*sync_lr_elrsr)(struct kvm_vcpu *, int, struct vgic_lr);
u64 (*get_elrsr)(const struct kvm_vcpu *vcpu);
u64 (*get_eisr)(const struct kvm_vcpu *vcpu);
void (*clear_eisr)(struct kvm_vcpu *vcpu);
@@ -159,7 +158,6 @@ struct irq_phys_map {
u32 virt_irq;
u32 phys_irq;
u32 irq;
bool active;
};
struct irq_phys_map_entry {
@@ -296,22 +294,16 @@ struct vgic_v3_cpu_if {
};
struct vgic_cpu {
/* per IRQ to LR mapping */
u8 *vgic_irq_lr_map;
/* Pending/active/both interrupts on this VCPU */
DECLARE_BITMAP(pending_percpu, VGIC_NR_PRIVATE_IRQS);
DECLARE_BITMAP(active_percpu, VGIC_NR_PRIVATE_IRQS);
DECLARE_BITMAP(pend_act_percpu, VGIC_NR_PRIVATE_IRQS);
/* Pending/active/both shared interrupts, dynamically sized */
unsigned long *pending_shared;
unsigned long *active_shared;
unsigned long *pend_act_shared;
/* Bitmap of used/free list registers */
DECLARE_BITMAP( lr_used, VGIC_V2_MAX_LRS);
/* Number of list registers on this CPU */
int nr_lr;
@@ -354,8 +346,6 @@ int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu,
int virt_irq, int irq);
int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map);
bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map);
void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active);
#define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel))
#define vgic_initialized(k) (!!((k)->arch.vgic.nr_cpus))


@@ -26,6 +26,7 @@
#define _HYPERV_H
#include <uapi/linux/hyperv.h>
#include <uapi/asm/hyperv.h>
#include <linux/types.h>
#include <linux/scatterlist.h>

include/linux/irqbypass.h (new file)

@@ -0,0 +1,90 @@
/*
* IRQ offload/bypass manager
*
* Copyright (C) 2015 Red Hat, Inc.
* Copyright (c) 2015 Linaro Ltd.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#ifndef IRQBYPASS_H
#define IRQBYPASS_H
#include <linux/list.h>
struct irq_bypass_consumer;
/*
* Theory of operation
*
* The IRQ bypass manager is a simple set of lists and callbacks that allows
* IRQ producers (ex. physical interrupt sources) to be matched to IRQ
* consumers (ex. virtualization hardware that allows IRQ bypass or offload)
* via a shared token (ex. eventfd_ctx). Producers and consumers register
* independently. When a token match is found, the optional @stop callback
* will be called for each participant. The pair will then be connected via
* the @add_* callbacks, and finally the optional @start callback will allow
* any final coordination. When either participant is unregistered, the
* process is repeated using the @del_* callbacks in place of the @add_*
* callbacks. Match tokens must be unique per producer/consumer, 1:N pairings
* are not supported.
*/
/**
* struct irq_bypass_producer - IRQ bypass producer definition
* @node: IRQ bypass manager private list management
* @token: opaque token to match between producer and consumer
* @irq: Linux IRQ number for the producer device
* @add_consumer: Connect the IRQ producer to an IRQ consumer (optional)
* @del_consumer: Disconnect the IRQ producer from an IRQ consumer (optional)
* @stop: Perform any quiesce operations necessary prior to add/del (optional)
* @start: Perform any startup operations necessary after add/del (optional)
*
* The IRQ bypass producer structure represents an interrupt source for
* participation in possible host bypass, for instance an interrupt vector
* for a physical device assigned to a VM.
*/
struct irq_bypass_producer {
struct list_head node;
void *token;
int irq;
int (*add_consumer)(struct irq_bypass_producer *,
struct irq_bypass_consumer *);
void (*del_consumer)(struct irq_bypass_producer *,
struct irq_bypass_consumer *);
void (*stop)(struct irq_bypass_producer *);
void (*start)(struct irq_bypass_producer *);
};
/**
* struct irq_bypass_consumer - IRQ bypass consumer definition
* @node: IRQ bypass manager private list management
* @token: opaque token to match between producer and consumer
* @add_producer: Connect the IRQ consumer to an IRQ producer
* @del_producer: Disconnect the IRQ consumer from an IRQ producer
* @stop: Perform any quiesce operations necessary prior to add/del (optional)
* @start: Perform any startup operations necessary after add/del (optional)
*
* The IRQ bypass consumer structure represents an interrupt sink for
* participation in possible host bypass, for instance a hypervisor may
* support offloads to allow bypassing the host entirely or offload
* portions of the interrupt handling to the VM.
*/
struct irq_bypass_consumer {
struct list_head node;
void *token;
int (*add_producer)(struct irq_bypass_consumer *,
struct irq_bypass_producer *);
void (*del_producer)(struct irq_bypass_consumer *,
struct irq_bypass_producer *);
void (*stop)(struct irq_bypass_consumer *);
void (*start)(struct irq_bypass_consumer *);
};
int irq_bypass_register_producer(struct irq_bypass_producer *);
void irq_bypass_unregister_producer(struct irq_bypass_producer *);
int irq_bypass_register_consumer(struct irq_bypass_consumer *);
void irq_bypass_unregister_consumer(struct irq_bypass_consumer *);
#endif /* IRQBYPASS_H */
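The header above only declares the registration API; as a quick illustration of the token-matching flow described in its "Theory of operation" comment, here is a minimal, hypothetical sketch (the demo_* names and the callback bodies are illustrative and not part of this series) of a producer and a consumer that get paired because they register with the same opaque token:

#include <linux/irqbypass.h>

/* Called by the bypass manager once a producer with a matching token registers. */
static int demo_add_producer(struct irq_bypass_consumer *cons,
			     struct irq_bypass_producer *prod)
{
	/* Point consumer-side hardware (e.g. posted interrupts) at prod->irq. */
	return 0;
}

static void demo_del_producer(struct irq_bypass_consumer *cons,
			      struct irq_bypass_producer *prod)
{
	/* Fall back to ordinary, non-bypass interrupt delivery. */
}

static struct irq_bypass_producer demo_producer;
static struct irq_bypass_consumer demo_consumer = {
	.add_producer = demo_add_producer,
	.del_producer = demo_del_producer,
};

static int demo_connect(void *shared_token, int hwirq)
{
	int ret;

	demo_producer.token = shared_token;	/* e.g. an eventfd_ctx pointer */
	demo_producer.irq = hwirq;
	ret = irq_bypass_register_producer(&demo_producer);
	if (ret)
		return ret;

	demo_consumer.token = shared_token;	/* same token, so the two are paired */
	ret = irq_bypass_register_consumer(&demo_consumer);
	if (ret)
		irq_bypass_unregister_producer(&demo_producer);
	return ret;
}

In the actual series the producer side is filled in by VFIO (one per MSI vector, see vfio_pci_intrs.c above) and the consumer side by KVM through the kvm_arch_irq_bypass_* hooks added in x86.c.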


@@ -24,6 +24,7 @@
#include <linux/err.h>
#include <linux/irqflags.h>
#include <linux/context_tracking.h>
#include <linux/irqbypass.h>
#include <asm/signal.h>
#include <linux/kvm.h>
@@ -140,6 +141,8 @@ static inline bool is_error_page(struct page *page)
#define KVM_REQ_APIC_PAGE_RELOAD 25
#define KVM_REQ_SMI 26
#define KVM_REQ_HV_CRASH 27
#define KVM_REQ_IOAPIC_EOI_EXIT 28
#define KVM_REQ_HV_RESET 29
#define KVM_USERSPACE_IRQ_SOURCE_ID 0
#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1
@@ -231,6 +234,9 @@ struct kvm_vcpu {
unsigned long requests;
unsigned long guest_debug;
int pre_pcpu;
struct list_head blocked_vcpu_list;
struct mutex mutex;
struct kvm_run *run;
@@ -329,6 +335,18 @@ struct kvm_kernel_irq_routing_entry {
struct hlist_node link;
};
#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
struct kvm_irq_routing_table {
int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
u32 nr_rt_entries;
/*
* Array indexed by gsi. Each entry contains list of irq chips
* the gsi is connected to.
*/
struct hlist_head map[0];
};
#endif
#ifndef KVM_PRIVATE_MEM_SLOTS
#define KVM_PRIVATE_MEM_SLOTS 0
#endif
@@ -455,10 +473,14 @@ void vcpu_put(struct kvm_vcpu *vcpu);
#ifdef __KVM_HAVE_IOAPIC
void kvm_vcpu_request_scan_ioapic(struct kvm *kvm);
void kvm_arch_irq_routing_update(struct kvm *kvm);
#else
static inline void kvm_vcpu_request_scan_ioapic(struct kvm *kvm)
{
}
static inline void kvm_arch_irq_routing_update(struct kvm *kvm)
{
}
#endif
#ifdef CONFIG_HAVE_KVM_IRQFD
@@ -625,6 +647,8 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
void kvm_vcpu_block(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu);
void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
int kvm_vcpu_yield_to(struct kvm_vcpu *target);
void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
@@ -803,10 +827,13 @@ int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin);
int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
bool line_status);
int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level);
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
int irq_source_id, int level, bool line_status);
int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id,
int level, bool line_status);
bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin); bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin);
void kvm_notify_acked_gsi(struct kvm *kvm, int gsi);
void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
void kvm_register_irq_ack_notifier(struct kvm *kvm, void kvm_register_irq_ack_notifier(struct kvm *kvm,
struct kvm_irq_ack_notifier *kian); struct kvm_irq_ack_notifier *kian);
@ -1002,6 +1029,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
#endif #endif
int kvm_setup_default_irq_routing(struct kvm *kvm); int kvm_setup_default_irq_routing(struct kvm *kvm);
int kvm_setup_empty_irq_routing(struct kvm *kvm);
int kvm_set_irq_routing(struct kvm *kvm, int kvm_set_irq_routing(struct kvm *kvm,
const struct kvm_irq_routing_entry *entries, const struct kvm_irq_routing_entry *entries,
unsigned nr, unsigned nr,
@ -1144,5 +1172,15 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
{ {
} }
#endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
#endif
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *,
struct irq_bypass_producer *);
void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *,
struct irq_bypass_producer *);
void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *);
void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *);
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
#endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */
#endif

include/linux/kvm_irqfd.h (new file, 71 lines)

@ -0,0 +1,71 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* irqfd: Allows an fd to be used to inject an interrupt to the guest
* Credit goes to Avi Kivity for the original idea.
*/
#ifndef __LINUX_KVM_IRQFD_H
#define __LINUX_KVM_IRQFD_H
#include <linux/kvm_host.h>
#include <linux/poll.h>
/*
* Resampling irqfds are a special variety of irqfds used to emulate
* level triggered interrupts. The interrupt is asserted on eventfd
* trigger. On acknowledgment through the irq ack notifier, the
* interrupt is de-asserted and userspace is notified through the
* resamplefd. All resamplers on the same gsi are de-asserted
* together, so we don't need to track the state of each individual
* user. We can also therefore share the same irq source ID.
*/
struct kvm_kernel_irqfd_resampler {
struct kvm *kvm;
/*
* List of resampling struct _irqfd objects sharing this gsi.
* RCU list modified under kvm->irqfds.resampler_lock
*/
struct list_head list;
struct kvm_irq_ack_notifier notifier;
/*
* Entry in list of kvm->irqfd.resampler_list. Use for sharing
* resamplers among irqfds on the same gsi.
* Accessed and modified under kvm->irqfds.resampler_lock
*/
struct list_head link;
};
struct kvm_kernel_irqfd {
/* Used for MSI fast-path */
struct kvm *kvm;
wait_queue_t wait;
/* Update side is protected by irqfds.lock */
struct kvm_kernel_irq_routing_entry irq_entry;
seqcount_t irq_entry_sc;
/* Used for level IRQ fast-path */
int gsi;
struct work_struct inject;
/* The resampler used by this irqfd (resampler-only) */
struct kvm_kernel_irqfd_resampler *resampler;
/* Eventfd notified on resample (resampler-only) */
struct eventfd_ctx *resamplefd;
/* Entry in list of irqfds for a resampler (resampler-only) */
struct list_head resampler_link;
/* Used for setup/shutdown */
struct eventfd_ctx *eventfd;
struct list_head list;
poll_table pt;
struct work_struct shutdown;
struct irq_bypass_consumer consumer;
struct irq_bypass_producer *producer;
};
#endif /* __LINUX_KVM_IRQFD_H */
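The resampling behaviour described in the comment above is driven from userspace through the existing KVM_IRQFD ioctl. A hypothetical userspace sketch (assuming vm_fd is an already-created KVM VM file descriptor):

#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int attach_resample_irqfd(int vm_fd, unsigned int gsi)
{
        struct kvm_irqfd irqfd = { 0 };
        int trigger, resample;

        trigger = eventfd(0, 0);        /* written by the VMM to assert the line */
        if (trigger < 0)
                return -1;
        resample = eventfd(0, 0);       /* becomes readable when the guest EOIs */
        if (resample < 0) {
                close(trigger);
                return -1;
        }

        irqfd.fd = trigger;
        irqfd.gsi = gsi;
        irqfd.flags = KVM_IRQFD_FLAG_RESAMPLE;
        irqfd.resamplefd = resample;
        return ioctl(vm_fd, KVM_IRQFD, &irqfd);
}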


@ -183,6 +183,7 @@ struct kvm_s390_skeys {
#define KVM_EXIT_EPR 23 #define KVM_EXIT_EPR 23
#define KVM_EXIT_SYSTEM_EVENT 24 #define KVM_EXIT_SYSTEM_EVENT 24
#define KVM_EXIT_S390_STSI 25 #define KVM_EXIT_S390_STSI 25
#define KVM_EXIT_IOAPIC_EOI 26
/* For KVM_EXIT_INTERNAL_ERROR */ /* For KVM_EXIT_INTERNAL_ERROR */
/* Emulate instruction failed. */ /* Emulate instruction failed. */
@ -333,6 +334,10 @@ struct kvm_run {
__u8 sel1; __u8 sel1;
__u16 sel2; __u16 sel2;
} s390_stsi; } s390_stsi;
/* KVM_EXIT_IOAPIC_EOI */
struct {
__u8 vector;
} eoi;
/* Fix the size of the union. */ /* Fix the size of the union. */
char padding[256]; char padding[256];
}; };
@ -824,6 +829,8 @@ struct kvm_ppc_smmu_info {
#define KVM_CAP_MULTI_ADDRESS_SPACE 118 #define KVM_CAP_MULTI_ADDRESS_SPACE 118
#define KVM_CAP_GUEST_DEBUG_HW_BPS 119 #define KVM_CAP_GUEST_DEBUG_HW_BPS 119
#define KVM_CAP_GUEST_DEBUG_HW_WPS 120 #define KVM_CAP_GUEST_DEBUG_HW_WPS 120
#define KVM_CAP_SPLIT_IRQCHIP 121
#define KVM_CAP_IOEVENTFD_ANY_LENGTH 122
#ifdef KVM_CAP_IRQ_ROUTING #ifdef KVM_CAP_IRQ_ROUTING
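Taken together, KVM_EXIT_IOAPIC_EOI and KVM_CAP_SPLIT_IRQCHIP let a VMM keep only the LAPIC in the kernel and model the IOAPIC/PIC/PIT itself. A rough, hypothetical userspace sketch of both ends (vm_fd is an existing VM fd, and handle_ioapic_eoi() stands in for the VMM's own IOAPIC emulation):

#include <sys/ioctl.h>
#include <linux/kvm.h>

void handle_ioapic_eoi(unsigned int vector);    /* provided by the VMM (hypothetical) */

/* args[0] reserves routing entries for the userspace IOAPIC (24 pins here). */
int enable_split_irqchip(int vm_fd)
{
        struct kvm_enable_cap cap = {
                .cap = KVM_CAP_SPLIT_IRQCHIP,
                .args = { 24 },
        };
        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}

/* In the vcpu run loop: */
void handle_exit(struct kvm_run *run)
{
        switch (run->exit_reason) {
        case KVM_EXIT_IOAPIC_EOI:
                /* the guest EOIed this vector; update the userspace IOAPIC */
                handle_ioapic_eoi(run->eoi.vector);
                break;
        /* ... other exit reasons ... */
        }
}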


@ -444,6 +444,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
*ut = p->utime; *ut = p->utime;
*st = p->stime; *st = p->stime;
} }
EXPORT_SYMBOL_GPL(task_cputime_adjusted);
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{ {
@ -652,6 +653,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
task_cputime(p, &cputime.utime, &cputime.stime); task_cputime(p, &cputime.utime, &cputime.stime);
cputime_adjust(&cputime, &p->prev_cputime, ut, st); cputime_adjust(&cputime, &p->prev_cputime, ut, st);
} }
EXPORT_SYMBOL_GPL(task_cputime_adjusted);
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{ {

virt/Makefile (new file, 1 line)

@ -0,0 +1 @@
obj-y += lib/


@ -46,4 +46,7 @@ config KVM_GENERIC_DIRTYLOG_READ_PROTECT
config KVM_COMPAT config KVM_COMPAT
def_bool y def_bool y
depends on COMPAT && !S390 depends on KVM && COMPAT && !S390
config HAVE_KVM_IRQ_BYPASS
bool


@ -28,6 +28,8 @@
#include <kvm/arm_vgic.h> #include <kvm/arm_vgic.h>
#include <kvm/arm_arch_timer.h> #include <kvm/arm_arch_timer.h>
#include "trace.h"
static struct timecounter *timecounter; static struct timecounter *timecounter;
static struct workqueue_struct *wqueue; static struct workqueue_struct *wqueue;
static unsigned int host_vtimer_irq; static unsigned int host_vtimer_irq;
@ -59,18 +61,6 @@ static void timer_disarm(struct arch_timer_cpu *timer)
} }
} }
static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu)
{
int ret;
struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
kvm_vgic_set_phys_irq_active(timer->map, true);
ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id,
timer->map,
timer->irq->level);
WARN_ON(ret);
}
static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
{ {
struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
@ -111,14 +101,20 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt)
return HRTIMER_NORESTART; return HRTIMER_NORESTART;
} }
static bool kvm_timer_irq_can_fire(struct kvm_vcpu *vcpu)
{
struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
return !(timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) &&
(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE);
}
bool kvm_timer_should_fire(struct kvm_vcpu *vcpu) bool kvm_timer_should_fire(struct kvm_vcpu *vcpu)
{ {
struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
cycle_t cval, now; cycle_t cval, now;
if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) || if (!kvm_timer_irq_can_fire(vcpu))
!(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE) ||
kvm_vgic_get_phys_irq_active(timer->map))
return false; return false;
cval = timer->cntv_cval; cval = timer->cntv_cval;
@ -127,12 +123,94 @@ bool kvm_timer_should_fire(struct kvm_vcpu *vcpu)
return cval <= now; return cval <= now;
} }
static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level)
{
int ret;
struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
BUG_ON(!vgic_initialized(vcpu->kvm));
timer->irq.level = new_level;
trace_kvm_timer_update_irq(vcpu->vcpu_id, timer->map->virt_irq,
timer->irq.level);
ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id,
timer->map,
timer->irq.level);
WARN_ON(ret);
}
/*
* Check if there was a change in the timer state (should we raise or lower
* the line level to the GIC).
*/
static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
{
struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
/*
* If userspace modified the timer registers via SET_ONE_REG before
* the vgic was initialized, we mustn't set the timer->irq.level value
* because the guest would never see the interrupt. Instead wait
* until we call this function from kvm_timer_flush_hwstate.
*/
if (!vgic_initialized(vcpu->kvm))
return;
if (kvm_timer_should_fire(vcpu) != timer->irq.level)
kvm_timer_update_irq(vcpu, !timer->irq.level);
}
/*
* Schedule the background timer before calling kvm_vcpu_block, so that this
* thread is removed from its waitqueue and made runnable when there's a timer
* interrupt to handle.
*/
void kvm_timer_schedule(struct kvm_vcpu *vcpu)
{
struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
u64 ns;
cycle_t cval, now;
BUG_ON(timer_is_armed(timer));
/*
* No need to schedule a background timer if the guest timer has
* already expired, because kvm_vcpu_block will return before putting
* the thread to sleep.
*/
if (kvm_timer_should_fire(vcpu))
return;
/*
* If the timer is not capable of raising interrupts (disabled or
* masked), then there's no more work for us to do.
*/
if (!kvm_timer_irq_can_fire(vcpu))
return;
/* The timer has not yet expired, schedule a background timer */
cval = timer->cntv_cval;
now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff;
ns = cyclecounter_cyc2ns(timecounter->cc,
cval - now,
timecounter->mask,
&timecounter->frac);
timer_arm(timer, ns);
}
void kvm_timer_unschedule(struct kvm_vcpu *vcpu)
{
struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
timer_disarm(timer);
}
/** /**
* kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu * kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu
* @vcpu: The vcpu pointer * @vcpu: The vcpu pointer
* *
* Disarm any pending soft timers, since the world-switch code will write the * Check if the virtual timer has expired while we were running in the host,
* virtual timer state back to the physical CPU. * and inject an interrupt if that was the case.
*/ */
void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
{ {
@ -140,28 +218,20 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
bool phys_active; bool phys_active;
int ret; int ret;
/* kvm_timer_update_state(vcpu);
* We're about to run this vcpu again, so there is no need to
* keep the background timer running, as we're about to
* populate the CPU timer again.
*/
timer_disarm(timer);
/* /*
* If the timer expired while we were not scheduled, now is the time * If we enter the guest with the virtual input level to the VGIC
* to inject it. * asserted, then we have already told the VGIC what we need to, and
* we don't need to exit from the guest until the guest deactivates
* the already injected interrupt, so therefore we should set the
* hardware active state to prevent unnecessary exits from the guest.
*
* Conversely, if the virtual input level is deasserted, then always
* clear the hardware active state to ensure that hardware interrupts
* from the timer triggers a guest exit.
*/ */
if (kvm_timer_should_fire(vcpu)) if (timer->irq.level)
kvm_timer_inject_irq(vcpu);
/*
* We keep track of whether the edge-triggered interrupt has been
* signalled to the vgic/guest, and if so, we mask the interrupt and
* the physical distributor to prevent the timer from raising a
* physical interrupt whenever we run a guest, preventing forward
* VCPU progress.
*/
if (kvm_vgic_get_phys_irq_active(timer->map))
phys_active = true; phys_active = true;
else else
phys_active = false; phys_active = false;
@ -176,32 +246,20 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
* kvm_timer_sync_hwstate - sync timer state from cpu * kvm_timer_sync_hwstate - sync timer state from cpu
* @vcpu: The vcpu pointer * @vcpu: The vcpu pointer
* *
* Check if the virtual timer was armed and either schedule a corresponding * Check if the virtual timer has expired while we were running in the guest,
* soft timer or inject directly if already expired. * and inject an interrupt if that was the case.
*/ */
void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
{ {
struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
cycle_t cval, now;
u64 ns;
BUG_ON(timer_is_armed(timer)); BUG_ON(timer_is_armed(timer));
if (kvm_timer_should_fire(vcpu)) { /*
/* * The guest could have modified the timer registers or the timer
* Timer has already expired while we were not * could have expired, update the timer state.
* looking. Inject the interrupt and carry on. */
*/ kvm_timer_update_state(vcpu);
kvm_timer_inject_irq(vcpu);
return;
}
cval = timer->cntv_cval;
now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff;
ns = cyclecounter_cyc2ns(timecounter->cc, cval - now, timecounter->mask,
&timecounter->frac);
timer_arm(timer, ns);
} }
int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
@ -216,7 +274,7 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
* kvm_vcpu_set_target(). To handle this, we determine * kvm_vcpu_set_target(). To handle this, we determine
* vcpu timer irq number when the vcpu is reset. * vcpu timer irq number when the vcpu is reset.
*/ */
timer->irq = irq; timer->irq.irq = irq->irq;
/* /*
* The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8 * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8
@ -225,6 +283,7 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
* the ARMv7 architecture. * the ARMv7 architecture.
*/ */
timer->cntv_ctl = 0; timer->cntv_ctl = 0;
kvm_timer_update_state(vcpu);
/* /*
* Tell the VGIC that the virtual interrupt is tied to a * Tell the VGIC that the virtual interrupt is tied to a
@ -269,6 +328,8 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
default: default:
return -1; return -1;
} }
kvm_timer_update_state(vcpu);
return 0; return 0;
} }

virt/kvm/arm/trace.h (new file, 63 lines)

@ -0,0 +1,63 @@
#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_KVM_H
#include <linux/tracepoint.h>
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm
/*
* Tracepoints for vgic
*/
TRACE_EVENT(vgic_update_irq_pending,
TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level),
TP_ARGS(vcpu_id, irq, level),
TP_STRUCT__entry(
__field( unsigned long, vcpu_id )
__field( __u32, irq )
__field( bool, level )
),
TP_fast_assign(
__entry->vcpu_id = vcpu_id;
__entry->irq = irq;
__entry->level = level;
),
TP_printk("VCPU: %ld, IRQ %d, level: %d",
__entry->vcpu_id, __entry->irq, __entry->level)
);
/*
* Tracepoints for arch_timer
*/
TRACE_EVENT(kvm_timer_update_irq,
TP_PROTO(unsigned long vcpu_id, __u32 irq, int level),
TP_ARGS(vcpu_id, irq, level),
TP_STRUCT__entry(
__field( unsigned long, vcpu_id )
__field( __u32, irq )
__field( int, level )
),
TP_fast_assign(
__entry->vcpu_id = vcpu_id;
__entry->irq = irq;
__entry->level = level;
),
TP_printk("VCPU: %ld, IRQ %d, level %d",
__entry->vcpu_id, __entry->irq, __entry->level)
);
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH ../../../virt/kvm/arm
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace
/* This part must be outside protection */
#include <trace/define_trace.h>


@ -79,11 +79,7 @@ static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
lr_val |= (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT); lr_val |= (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT);
vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val; vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val;
}
static void vgic_v2_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr,
struct vgic_lr lr_desc)
{
if (!(lr_desc.state & LR_STATE_MASK)) if (!(lr_desc.state & LR_STATE_MASK))
vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr |= (1ULL << lr); vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr |= (1ULL << lr);
else else
@ -158,6 +154,7 @@ static void vgic_v2_enable(struct kvm_vcpu *vcpu)
* anyway. * anyway.
*/ */
vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0; vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr = ~0;
/* Get the show on the road... */ /* Get the show on the road... */
vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN; vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
@ -166,7 +163,6 @@ static void vgic_v2_enable(struct kvm_vcpu *vcpu)
static const struct vgic_ops vgic_v2_ops = { static const struct vgic_ops vgic_v2_ops = {
.get_lr = vgic_v2_get_lr, .get_lr = vgic_v2_get_lr,
.set_lr = vgic_v2_set_lr, .set_lr = vgic_v2_set_lr,
.sync_lr_elrsr = vgic_v2_sync_lr_elrsr,
.get_elrsr = vgic_v2_get_elrsr, .get_elrsr = vgic_v2_get_elrsr,
.get_eisr = vgic_v2_get_eisr, .get_eisr = vgic_v2_get_eisr,
.clear_eisr = vgic_v2_clear_eisr, .clear_eisr = vgic_v2_clear_eisr,


@ -112,11 +112,7 @@ static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
} }
vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)] = lr_val; vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)] = lr_val;
}
static void vgic_v3_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr,
struct vgic_lr lr_desc)
{
if (!(lr_desc.state & LR_STATE_MASK)) if (!(lr_desc.state & LR_STATE_MASK))
vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr); vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr);
else else
@ -193,6 +189,7 @@ static void vgic_v3_enable(struct kvm_vcpu *vcpu)
* anyway. * anyway.
*/ */
vgic_v3->vgic_vmcr = 0; vgic_v3->vgic_vmcr = 0;
vgic_v3->vgic_elrsr = ~0;
/* /*
* If we are emulating a GICv3, we do it in an non-GICv2-compatible * If we are emulating a GICv3, we do it in an non-GICv2-compatible
@ -211,7 +208,6 @@ static void vgic_v3_enable(struct kvm_vcpu *vcpu)
static const struct vgic_ops vgic_v3_ops = { static const struct vgic_ops vgic_v3_ops = {
.get_lr = vgic_v3_get_lr, .get_lr = vgic_v3_get_lr,
.set_lr = vgic_v3_set_lr, .set_lr = vgic_v3_set_lr,
.sync_lr_elrsr = vgic_v3_sync_lr_elrsr,
.get_elrsr = vgic_v3_get_elrsr, .get_elrsr = vgic_v3_get_elrsr,
.get_eisr = vgic_v3_get_eisr, .get_eisr = vgic_v3_get_eisr,
.clear_eisr = vgic_v3_clear_eisr, .clear_eisr = vgic_v3_clear_eisr,


@ -34,6 +34,9 @@
#include <asm/kvm.h> #include <asm/kvm.h>
#include <kvm/iodev.h> #include <kvm/iodev.h>
#define CREATE_TRACE_POINTS
#include "trace.h"
/* /*
* How the whole thing works (courtesy of Christoffer Dall): * How the whole thing works (courtesy of Christoffer Dall):
* *
@ -102,11 +105,13 @@
#include "vgic.h" #include "vgic.h"
static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu); static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu);
static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu); static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu);
static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr); static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr);
static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc); static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc);
static u64 vgic_get_elrsr(struct kvm_vcpu *vcpu);
static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu, static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
int virt_irq); int virt_irq);
static int compute_pending_for_cpu(struct kvm_vcpu *vcpu);
static const struct vgic_ops *vgic_ops; static const struct vgic_ops *vgic_ops;
static const struct vgic_params *vgic; static const struct vgic_params *vgic;
@ -357,6 +362,11 @@ static void vgic_dist_irq_clear_soft_pend(struct kvm_vcpu *vcpu, int irq)
struct vgic_dist *dist = &vcpu->kvm->arch.vgic; struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
vgic_bitmap_set_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq, 0); vgic_bitmap_set_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq, 0);
if (!vgic_dist_irq_get_level(vcpu, irq)) {
vgic_dist_irq_clear_pending(vcpu, irq);
if (!compute_pending_for_cpu(vcpu))
clear_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
}
} }
static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq) static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq)
@ -531,34 +541,6 @@ bool vgic_handle_set_pending_reg(struct kvm *kvm,
return false; return false;
} }
/*
* If a mapped interrupt's state has been modified by the guest such that it
* is no longer active or pending, without it having gone through the sync path,
* then the map->active field must be cleared so the interrupt can be taken
* again.
*/
static void vgic_handle_clear_mapped_irq(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct list_head *root;
struct irq_phys_map_entry *entry;
struct irq_phys_map *map;
rcu_read_lock();
/* Check for PPIs */
root = &vgic_cpu->irq_phys_map_list;
list_for_each_entry_rcu(entry, root, entry) {
map = &entry->map;
if (!vgic_dist_irq_is_pending(vcpu, map->virt_irq) &&
!vgic_irq_is_active(vcpu, map->virt_irq))
map->active = false;
}
rcu_read_unlock();
}
bool vgic_handle_clear_pending_reg(struct kvm *kvm, bool vgic_handle_clear_pending_reg(struct kvm *kvm,
struct kvm_exit_mmio *mmio, struct kvm_exit_mmio *mmio,
phys_addr_t offset, int vcpu_id) phys_addr_t offset, int vcpu_id)
@ -589,7 +571,6 @@ bool vgic_handle_clear_pending_reg(struct kvm *kvm,
vcpu_id, offset); vcpu_id, offset);
vgic_reg_access(mmio, reg, offset, mode); vgic_reg_access(mmio, reg, offset, mode);
vgic_handle_clear_mapped_irq(kvm_get_vcpu(kvm, vcpu_id));
vgic_update_state(kvm); vgic_update_state(kvm);
return true; return true;
} }
@ -627,7 +608,6 @@ bool vgic_handle_clear_active_reg(struct kvm *kvm,
ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT); ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
if (mmio->is_write) { if (mmio->is_write) {
vgic_handle_clear_mapped_irq(kvm_get_vcpu(kvm, vcpu_id));
vgic_update_state(kvm); vgic_update_state(kvm);
return true; return true;
} }
@ -684,10 +664,9 @@ bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
vgic_reg_access(mmio, &val, offset, vgic_reg_access(mmio, &val, offset,
ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
if (mmio->is_write) { if (mmio->is_write) {
if (offset < 8) { /* Ignore writes to read-only SGI and PPI bits */
*reg = ~0U; /* Force PPIs/SGIs to 1 */ if (offset < 8)
return false; return false;
}
val = vgic_cfg_compress(val); val = vgic_cfg_compress(val);
if (offset & 4) { if (offset & 4) {
@ -713,9 +692,11 @@ bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
void vgic_unqueue_irqs(struct kvm_vcpu *vcpu) void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
{ {
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
u64 elrsr = vgic_get_elrsr(vcpu);
unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
int i; int i;
for_each_set_bit(i, vgic_cpu->lr_used, vgic_cpu->nr_lr) { for_each_clear_bit(i, elrsr_ptr, vgic_cpu->nr_lr) {
struct vgic_lr lr = vgic_get_lr(vcpu, i); struct vgic_lr lr = vgic_get_lr(vcpu, i);
/* /*
@ -736,30 +717,14 @@ void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
* interrupt then move the active state to the * interrupt then move the active state to the
* distributor tracking bit. * distributor tracking bit.
*/ */
if (lr.state & LR_STATE_ACTIVE) { if (lr.state & LR_STATE_ACTIVE)
vgic_irq_set_active(vcpu, lr.irq); vgic_irq_set_active(vcpu, lr.irq);
lr.state &= ~LR_STATE_ACTIVE;
}
/* /*
* Reestablish the pending state on the distributor and the * Reestablish the pending state on the distributor and the
* CPU interface. It may have already been pending, but that * CPU interface and mark the LR as free for other use.
* is fine, then we are only setting a few bits that were
* already set.
*/ */
if (lr.state & LR_STATE_PENDING) { vgic_retire_lr(i, vcpu);
vgic_dist_irq_set_pending(vcpu, lr.irq);
lr.state &= ~LR_STATE_PENDING;
}
vgic_set_lr(vcpu, i, lr);
/*
* Mark the LR as free for other use.
*/
BUG_ON(lr.state & LR_STATE_MASK);
vgic_retire_lr(i, lr.irq, vcpu);
vgic_irq_clear_queued(vcpu, lr.irq);
/* Finally update the VGIC state. */ /* Finally update the VGIC state. */
vgic_update_state(vcpu->kvm); vgic_update_state(vcpu->kvm);
@ -1067,12 +1032,6 @@ static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr,
vgic_ops->set_lr(vcpu, lr, vlr); vgic_ops->set_lr(vcpu, lr, vlr);
} }
static void vgic_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr,
struct vgic_lr vlr)
{
vgic_ops->sync_lr_elrsr(vcpu, lr, vlr);
}
static inline u64 vgic_get_elrsr(struct kvm_vcpu *vcpu) static inline u64 vgic_get_elrsr(struct kvm_vcpu *vcpu)
{ {
return vgic_ops->get_elrsr(vcpu); return vgic_ops->get_elrsr(vcpu);
@ -1118,25 +1077,23 @@ static inline void vgic_enable(struct kvm_vcpu *vcpu)
vgic_ops->enable(vcpu); vgic_ops->enable(vcpu);
} }
static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu) static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu)
{ {
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_lr vlr = vgic_get_lr(vcpu, lr_nr); struct vgic_lr vlr = vgic_get_lr(vcpu, lr_nr);
vgic_irq_clear_queued(vcpu, vlr.irq);
/* /*
* We must transfer the pending state back to the distributor before * We must transfer the pending state back to the distributor before
* retiring the LR, otherwise we may loose edge-triggered interrupts. * retiring the LR, otherwise we may loose edge-triggered interrupts.
*/ */
if (vlr.state & LR_STATE_PENDING) { if (vlr.state & LR_STATE_PENDING) {
vgic_dist_irq_set_pending(vcpu, irq); vgic_dist_irq_set_pending(vcpu, vlr.irq);
vlr.hwirq = 0; vlr.hwirq = 0;
} }
vlr.state = 0; vlr.state = 0;
vgic_set_lr(vcpu, lr_nr, vlr); vgic_set_lr(vcpu, lr_nr, vlr);
clear_bit(lr_nr, vgic_cpu->lr_used);
vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY;
vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
} }
/* /*
@ -1150,17 +1107,15 @@ static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu)
*/ */
static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu) static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu)
{ {
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; u64 elrsr = vgic_get_elrsr(vcpu);
unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
int lr; int lr;
for_each_set_bit(lr, vgic_cpu->lr_used, vgic->nr_lr) { for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) {
struct vgic_lr vlr = vgic_get_lr(vcpu, lr); struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
if (!vgic_irq_is_enabled(vcpu, vlr.irq)) { if (!vgic_irq_is_enabled(vcpu, vlr.irq))
vgic_retire_lr(lr, vlr.irq, vcpu); vgic_retire_lr(lr, vcpu);
if (vgic_irq_is_queued(vcpu, vlr.irq))
vgic_irq_clear_queued(vcpu, vlr.irq);
}
} }
} }
@ -1200,7 +1155,6 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
} }
vgic_set_lr(vcpu, lr_nr, vlr); vgic_set_lr(vcpu, lr_nr, vlr);
vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
} }
/* /*
@ -1210,8 +1164,9 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
*/ */
bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq) bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
{ {
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_dist *dist = &vcpu->kvm->arch.vgic; struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
u64 elrsr = vgic_get_elrsr(vcpu);
unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
struct vgic_lr vlr; struct vgic_lr vlr;
int lr; int lr;
@ -1222,28 +1177,22 @@ bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
kvm_debug("Queue IRQ%d\n", irq); kvm_debug("Queue IRQ%d\n", irq);
lr = vgic_cpu->vgic_irq_lr_map[irq];
/* Do we have an active interrupt for the same CPUID? */ /* Do we have an active interrupt for the same CPUID? */
if (lr != LR_EMPTY) { for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) {
vlr = vgic_get_lr(vcpu, lr); vlr = vgic_get_lr(vcpu, lr);
if (vlr.source == sgi_source_id) { if (vlr.irq == irq && vlr.source == sgi_source_id) {
kvm_debug("LR%d piggyback for IRQ%d\n", lr, vlr.irq); kvm_debug("LR%d piggyback for IRQ%d\n", lr, vlr.irq);
BUG_ON(!test_bit(lr, vgic_cpu->lr_used));
vgic_queue_irq_to_lr(vcpu, irq, lr, vlr); vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
return true; return true;
} }
} }
/* Try to use another LR for this interrupt */ /* Try to use another LR for this interrupt */
lr = find_first_zero_bit((unsigned long *)vgic_cpu->lr_used, lr = find_first_bit(elrsr_ptr, vgic->nr_lr);
vgic->nr_lr);
if (lr >= vgic->nr_lr) if (lr >= vgic->nr_lr)
return false; return false;
kvm_debug("LR%d allocated for IRQ%d %x\n", lr, irq, sgi_source_id); kvm_debug("LR%d allocated for IRQ%d %x\n", lr, irq, sgi_source_id);
vgic_cpu->vgic_irq_lr_map[irq] = lr;
set_bit(lr, vgic_cpu->lr_used);
vlr.irq = irq; vlr.irq = irq;
vlr.source = sgi_source_id; vlr.source = sgi_source_id;
@ -1338,12 +1287,60 @@ static void __kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
} }
} }
static int process_queued_irq(struct kvm_vcpu *vcpu,
int lr, struct vgic_lr vlr)
{
int pending = 0;
/*
* If the IRQ was EOIed (called from vgic_process_maintenance) or it
* went from active to non-active (called from vgic_sync_hwirq) it was
* also ACKed and we therefore assume we can clear the soft pending
* state (should it had been set) for this interrupt.
*
* Note: if the IRQ soft pending state was set after the IRQ was
* acked, it actually shouldn't be cleared, but we have no way of
* knowing that unless we start trapping ACKs when the soft-pending
* state is set.
*/
vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq);
/*
* Tell the gic to start sampling this interrupt again.
*/
vgic_irq_clear_queued(vcpu, vlr.irq);
/* Any additional pending interrupt? */
if (vgic_irq_is_edge(vcpu, vlr.irq)) {
BUG_ON(!(vlr.state & LR_HW));
pending = vgic_dist_irq_is_pending(vcpu, vlr.irq);
} else {
if (vgic_dist_irq_get_level(vcpu, vlr.irq)) {
vgic_cpu_irq_set(vcpu, vlr.irq);
pending = 1;
} else {
vgic_dist_irq_clear_pending(vcpu, vlr.irq);
vgic_cpu_irq_clear(vcpu, vlr.irq);
}
}
/*
* Despite being EOIed, the LR may not have
* been marked as empty.
*/
vlr.state = 0;
vlr.hwirq = 0;
vgic_set_lr(vcpu, lr, vlr);
return pending;
}
static bool vgic_process_maintenance(struct kvm_vcpu *vcpu) static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
{ {
u32 status = vgic_get_interrupt_status(vcpu); u32 status = vgic_get_interrupt_status(vcpu);
struct vgic_dist *dist = &vcpu->kvm->arch.vgic; struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
bool level_pending = false;
struct kvm *kvm = vcpu->kvm; struct kvm *kvm = vcpu->kvm;
int level_pending = 0;
kvm_debug("STATUS = %08x\n", status); kvm_debug("STATUS = %08x\n", status);
@ -1358,54 +1355,22 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
for_each_set_bit(lr, eisr_ptr, vgic->nr_lr) { for_each_set_bit(lr, eisr_ptr, vgic->nr_lr) {
struct vgic_lr vlr = vgic_get_lr(vcpu, lr); struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq)); WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq));
spin_lock(&dist->lock);
vgic_irq_clear_queued(vcpu, vlr.irq);
WARN_ON(vlr.state & LR_STATE_MASK); WARN_ON(vlr.state & LR_STATE_MASK);
vlr.state = 0;
vgic_set_lr(vcpu, lr, vlr);
/*
* If the IRQ was EOIed it was also ACKed and we we
* therefore assume we can clear the soft pending
* state (should it had been set) for this interrupt.
*
* Note: if the IRQ soft pending state was set after
* the IRQ was acked, it actually shouldn't be
* cleared, but we have no way of knowing that unless
* we start trapping ACKs when the soft-pending state
* is set.
*/
vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq);
/* /*
* kvm_notify_acked_irq calls kvm_set_irq() * kvm_notify_acked_irq calls kvm_set_irq()
* to reset the IRQ level. Need to release the * to reset the IRQ level, which grabs the dist->lock
* lock for kvm_set_irq to grab it. * so we call this before taking the dist->lock.
*/ */
spin_unlock(&dist->lock);
kvm_notify_acked_irq(kvm, 0, kvm_notify_acked_irq(kvm, 0,
vlr.irq - VGIC_NR_PRIVATE_IRQS); vlr.irq - VGIC_NR_PRIVATE_IRQS);
spin_lock(&dist->lock); spin_lock(&dist->lock);
level_pending |= process_queued_irq(vcpu, lr, vlr);
/* Any additional pending interrupt? */
if (vgic_dist_irq_get_level(vcpu, vlr.irq)) {
vgic_cpu_irq_set(vcpu, vlr.irq);
level_pending = true;
} else {
vgic_dist_irq_clear_pending(vcpu, vlr.irq);
vgic_cpu_irq_clear(vcpu, vlr.irq);
}
spin_unlock(&dist->lock); spin_unlock(&dist->lock);
/*
* Despite being EOIed, the LR may not have
* been marked as empty.
*/
vgic_sync_lr_elrsr(vcpu, lr, vlr);
} }
} }
@ -1426,35 +1391,40 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
/* /*
* Save the physical active state, and reset it to inactive. * Save the physical active state, and reset it to inactive.
* *
* Return 1 if HW interrupt went from active to inactive, and 0 otherwise. * Return true if there's a pending forwarded interrupt to queue.
*/ */
static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr) static bool vgic_sync_hwirq(struct kvm_vcpu *vcpu, int lr, struct vgic_lr vlr)
{ {
struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
struct irq_phys_map *map; struct irq_phys_map *map;
bool phys_active;
bool level_pending;
int ret; int ret;
if (!(vlr.state & LR_HW)) if (!(vlr.state & LR_HW))
return 0; return false;
map = vgic_irq_map_search(vcpu, vlr.irq); map = vgic_irq_map_search(vcpu, vlr.irq);
BUG_ON(!map); BUG_ON(!map);
ret = irq_get_irqchip_state(map->irq, ret = irq_get_irqchip_state(map->irq,
IRQCHIP_STATE_ACTIVE, IRQCHIP_STATE_ACTIVE,
&map->active); &phys_active);
WARN_ON(ret); WARN_ON(ret);
if (map->active) if (phys_active)
return 0; return 0;
return 1; spin_lock(&dist->lock);
level_pending = process_queued_irq(vcpu, lr, vlr);
spin_unlock(&dist->lock);
return level_pending;
} }
/* Sync back the VGIC state after a guest run */ /* Sync back the VGIC state after a guest run */
static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
{ {
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_dist *dist = &vcpu->kvm->arch.vgic; struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
u64 elrsr; u64 elrsr;
unsigned long *elrsr_ptr; unsigned long *elrsr_ptr;
@ -1462,40 +1432,18 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
bool level_pending; bool level_pending;
level_pending = vgic_process_maintenance(vcpu); level_pending = vgic_process_maintenance(vcpu);
elrsr = vgic_get_elrsr(vcpu);
elrsr_ptr = u64_to_bitmask(&elrsr);
/* Deal with HW interrupts, and clear mappings for empty LRs */ /* Deal with HW interrupts, and clear mappings for empty LRs */
for (lr = 0; lr < vgic->nr_lr; lr++) { for (lr = 0; lr < vgic->nr_lr; lr++) {
struct vgic_lr vlr; struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
if (!test_bit(lr, vgic_cpu->lr_used))
continue;
vlr = vgic_get_lr(vcpu, lr);
if (vgic_sync_hwirq(vcpu, vlr)) {
/*
* So this is a HW interrupt that the guest
* EOI-ed. Clean the LR state and allow the
* interrupt to be sampled again.
*/
vlr.state = 0;
vlr.hwirq = 0;
vgic_set_lr(vcpu, lr, vlr);
vgic_irq_clear_queued(vcpu, vlr.irq);
set_bit(lr, elrsr_ptr);
}
if (!test_bit(lr, elrsr_ptr))
continue;
clear_bit(lr, vgic_cpu->lr_used);
level_pending |= vgic_sync_hwirq(vcpu, lr, vlr);
BUG_ON(vlr.irq >= dist->nr_irqs); BUG_ON(vlr.irq >= dist->nr_irqs);
vgic_cpu->vgic_irq_lr_map[vlr.irq] = LR_EMPTY;
} }
/* Check if we still have something up our sleeve... */ /* Check if we still have something up our sleeve... */
elrsr = vgic_get_elrsr(vcpu);
elrsr_ptr = u64_to_bitmask(&elrsr);
pending = find_first_zero_bit(elrsr_ptr, vgic->nr_lr); pending = find_first_zero_bit(elrsr_ptr, vgic->nr_lr);
if (level_pending || pending < vgic->nr_lr) if (level_pending || pending < vgic->nr_lr)
set_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu); set_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
@ -1585,6 +1533,8 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
int enabled; int enabled;
bool ret = true, can_inject = true; bool ret = true, can_inject = true;
trace_vgic_update_irq_pending(cpuid, irq_num, level);
if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020)) if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020))
return -EINVAL; return -EINVAL;
@ -1863,30 +1813,6 @@ static void vgic_free_phys_irq_map_rcu(struct rcu_head *rcu)
kfree(entry); kfree(entry);
} }
/**
* kvm_vgic_get_phys_irq_active - Return the active state of a mapped IRQ
*
* Return the logical active state of a mapped interrupt. This doesn't
* necessarily reflects the current HW state.
*/
bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map)
{
BUG_ON(!map);
return map->active;
}
/**
* kvm_vgic_set_phys_irq_active - Set the active state of a mapped IRQ
*
* Set the logical active state of a mapped interrupt. This doesn't
* immediately affects the HW state.
*/
void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active)
{
BUG_ON(!map);
map->active = active;
}
/** /**
* kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping * kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping
* @vcpu: The VCPU pointer * @vcpu: The VCPU pointer
@ -1942,12 +1868,10 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
kfree(vgic_cpu->pending_shared); kfree(vgic_cpu->pending_shared);
kfree(vgic_cpu->active_shared); kfree(vgic_cpu->active_shared);
kfree(vgic_cpu->pend_act_shared); kfree(vgic_cpu->pend_act_shared);
kfree(vgic_cpu->vgic_irq_lr_map);
vgic_destroy_irq_phys_map(vcpu->kvm, &vgic_cpu->irq_phys_map_list); vgic_destroy_irq_phys_map(vcpu->kvm, &vgic_cpu->irq_phys_map_list);
vgic_cpu->pending_shared = NULL; vgic_cpu->pending_shared = NULL;
vgic_cpu->active_shared = NULL; vgic_cpu->active_shared = NULL;
vgic_cpu->pend_act_shared = NULL; vgic_cpu->pend_act_shared = NULL;
vgic_cpu->vgic_irq_lr_map = NULL;
} }
static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs) static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
@ -1958,18 +1882,14 @@ static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL); vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL);
vgic_cpu->active_shared = kzalloc(sz, GFP_KERNEL); vgic_cpu->active_shared = kzalloc(sz, GFP_KERNEL);
vgic_cpu->pend_act_shared = kzalloc(sz, GFP_KERNEL); vgic_cpu->pend_act_shared = kzalloc(sz, GFP_KERNEL);
vgic_cpu->vgic_irq_lr_map = kmalloc(nr_irqs, GFP_KERNEL);
if (!vgic_cpu->pending_shared if (!vgic_cpu->pending_shared
|| !vgic_cpu->active_shared || !vgic_cpu->active_shared
|| !vgic_cpu->pend_act_shared || !vgic_cpu->pend_act_shared) {
|| !vgic_cpu->vgic_irq_lr_map) {
kvm_vgic_vcpu_destroy(vcpu); kvm_vgic_vcpu_destroy(vcpu);
return -ENOMEM; return -ENOMEM;
} }
memset(vgic_cpu->vgic_irq_lr_map, LR_EMPTY, nr_irqs);
/* /*
* Store the number of LRs per vcpu, so we don't have to go * Store the number of LRs per vcpu, so we don't have to go
* all the way to the distributor structure to find out. Only * all the way to the distributor structure to find out. Only
@ -2111,14 +2031,24 @@ int vgic_init(struct kvm *kvm)
break; break;
} }
for (i = 0; i < dist->nr_irqs; i++) { /*
if (i < VGIC_NR_PPIS) * Enable and configure all SGIs to be edge-triggered and
* configure all PPIs as level-triggered.
*/
for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
if (i < VGIC_NR_SGIS) {
/* SGIs */
vgic_bitmap_set_irq_val(&dist->irq_enabled, vgic_bitmap_set_irq_val(&dist->irq_enabled,
vcpu->vcpu_id, i, 1); vcpu->vcpu_id, i, 1);
if (i < VGIC_NR_PRIVATE_IRQS)
vgic_bitmap_set_irq_val(&dist->irq_cfg, vgic_bitmap_set_irq_val(&dist->irq_cfg,
vcpu->vcpu_id, i, vcpu->vcpu_id, i,
VGIC_CFG_EDGE); VGIC_CFG_EDGE);
} else if (i < VGIC_NR_PRIVATE_IRQS) {
/* PPIs */
vgic_bitmap_set_irq_val(&dist->irq_cfg,
vcpu->vcpu_id, i,
VGIC_CFG_LEVEL);
}
} }
vgic_enable(vcpu); vgic_enable(vcpu);


@ -94,6 +94,10 @@ static void async_pf_execute(struct work_struct *work)
trace_kvm_async_pf_completed(addr, gva); trace_kvm_async_pf_completed(addr, gva);
/*
* This memory barrier pairs with prepare_to_wait's set_current_state()
*/
smp_mb();
if (waitqueue_active(&vcpu->wq)) if (waitqueue_active(&vcpu->wq))
wake_up_interruptible(&vcpu->wq); wake_up_interruptible(&vcpu->wq);


@ -23,6 +23,7 @@
#include <linux/kvm_host.h> #include <linux/kvm_host.h>
#include <linux/kvm.h> #include <linux/kvm.h>
#include <linux/kvm_irqfd.h>
#include <linux/workqueue.h> #include <linux/workqueue.h>
#include <linux/syscalls.h> #include <linux/syscalls.h>
#include <linux/wait.h> #include <linux/wait.h>
@ -34,73 +35,20 @@
#include <linux/srcu.h> #include <linux/srcu.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/seqlock.h> #include <linux/seqlock.h>
#include <linux/irqbypass.h>
#include <trace/events/kvm.h> #include <trace/events/kvm.h>
#include <kvm/iodev.h> #include <kvm/iodev.h>
#ifdef CONFIG_HAVE_KVM_IRQFD #ifdef CONFIG_HAVE_KVM_IRQFD
/*
* --------------------------------------------------------------------
* irqfd: Allows an fd to be used to inject an interrupt to the guest
*
* Credit goes to Avi Kivity for the original idea.
* --------------------------------------------------------------------
*/
/*
* Resampling irqfds are a special variety of irqfds used to emulate
* level triggered interrupts. The interrupt is asserted on eventfd
* trigger. On acknowledgement through the irq ack notifier, the
* interrupt is de-asserted and userspace is notified through the
* resamplefd. All resamplers on the same gsi are de-asserted
* together, so we don't need to track the state of each individual
* user. We can also therefore share the same irq source ID.
*/
struct _irqfd_resampler {
struct kvm *kvm;
/*
* List of resampling struct _irqfd objects sharing this gsi.
* RCU list modified under kvm->irqfds.resampler_lock
*/
struct list_head list;
struct kvm_irq_ack_notifier notifier;
/*
* Entry in list of kvm->irqfd.resampler_list. Use for sharing
* resamplers among irqfds on the same gsi.
* Accessed and modified under kvm->irqfds.resampler_lock
*/
struct list_head link;
};
struct _irqfd {
/* Used for MSI fast-path */
struct kvm *kvm;
wait_queue_t wait;
/* Update side is protected by irqfds.lock */
struct kvm_kernel_irq_routing_entry irq_entry;
seqcount_t irq_entry_sc;
/* Used for level IRQ fast-path */
int gsi;
struct work_struct inject;
/* The resampler used by this irqfd (resampler-only) */
struct _irqfd_resampler *resampler;
/* Eventfd notified on resample (resampler-only) */
struct eventfd_ctx *resamplefd;
/* Entry in list of irqfds for a resampler (resampler-only) */
struct list_head resampler_link;
/* Used for setup/shutdown */
struct eventfd_ctx *eventfd;
struct list_head list;
poll_table pt;
struct work_struct shutdown;
};
static struct workqueue_struct *irqfd_cleanup_wq; static struct workqueue_struct *irqfd_cleanup_wq;
static void static void
irqfd_inject(struct work_struct *work) irqfd_inject(struct work_struct *work)
{ {
struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); struct kvm_kernel_irqfd *irqfd =
container_of(work, struct kvm_kernel_irqfd, inject);
struct kvm *kvm = irqfd->kvm; struct kvm *kvm = irqfd->kvm;
if (!irqfd->resampler) { if (!irqfd->resampler) {
@ -121,12 +69,13 @@ irqfd_inject(struct work_struct *work)
static void static void
irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian) irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
{ {
struct _irqfd_resampler *resampler; struct kvm_kernel_irqfd_resampler *resampler;
struct kvm *kvm; struct kvm *kvm;
struct _irqfd *irqfd; struct kvm_kernel_irqfd *irqfd;
int idx; int idx;
resampler = container_of(kian, struct _irqfd_resampler, notifier); resampler = container_of(kian,
struct kvm_kernel_irqfd_resampler, notifier);
kvm = resampler->kvm; kvm = resampler->kvm;
kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
@ -141,9 +90,9 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
} }
static void static void
irqfd_resampler_shutdown(struct _irqfd *irqfd) irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
{ {
struct _irqfd_resampler *resampler = irqfd->resampler; struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler;
struct kvm *kvm = resampler->kvm; struct kvm *kvm = resampler->kvm;
mutex_lock(&kvm->irqfds.resampler_lock); mutex_lock(&kvm->irqfds.resampler_lock);
@ -168,7 +117,8 @@ irqfd_resampler_shutdown(struct _irqfd *irqfd)
static void static void
irqfd_shutdown(struct work_struct *work) irqfd_shutdown(struct work_struct *work)
{ {
struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown); struct kvm_kernel_irqfd *irqfd =
container_of(work, struct kvm_kernel_irqfd, shutdown);
u64 cnt; u64 cnt;
/* /*
@ -191,6 +141,9 @@ irqfd_shutdown(struct work_struct *work)
/* /*
* It is now safe to release the object's resources * It is now safe to release the object's resources
*/ */
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
irq_bypass_unregister_consumer(&irqfd->consumer);
#endif
eventfd_ctx_put(irqfd->eventfd); eventfd_ctx_put(irqfd->eventfd);
kfree(irqfd); kfree(irqfd);
} }
@ -198,7 +151,7 @@ irqfd_shutdown(struct work_struct *work)
/* assumes kvm->irqfds.lock is held */ /* assumes kvm->irqfds.lock is held */
static bool static bool
irqfd_is_active(struct _irqfd *irqfd) irqfd_is_active(struct kvm_kernel_irqfd *irqfd)
{ {
return list_empty(&irqfd->list) ? false : true; return list_empty(&irqfd->list) ? false : true;
} }
@ -209,7 +162,7 @@ irqfd_is_active(struct _irqfd *irqfd)
* assumes kvm->irqfds.lock is held * assumes kvm->irqfds.lock is held
*/ */
static void static void
irqfd_deactivate(struct _irqfd *irqfd) irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
{ {
BUG_ON(!irqfd_is_active(irqfd)); BUG_ON(!irqfd_is_active(irqfd));
@ -218,13 +171,23 @@ irqfd_deactivate(struct _irqfd *irqfd)
queue_work(irqfd_cleanup_wq, &irqfd->shutdown); queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
} }
int __attribute__((weak)) kvm_arch_set_irq_inatomic(
struct kvm_kernel_irq_routing_entry *irq,
struct kvm *kvm, int irq_source_id,
int level,
bool line_status)
{
return -EWOULDBLOCK;
}
/* /*
* Called with wqh->lock held and interrupts disabled * Called with wqh->lock held and interrupts disabled
*/ */
static int static int
irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
{ {
struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); struct kvm_kernel_irqfd *irqfd =
container_of(wait, struct kvm_kernel_irqfd, wait);
unsigned long flags = (unsigned long)key; unsigned long flags = (unsigned long)key;
struct kvm_kernel_irq_routing_entry irq; struct kvm_kernel_irq_routing_entry irq;
struct kvm *kvm = irqfd->kvm; struct kvm *kvm = irqfd->kvm;
@ -238,10 +201,9 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
irq = irqfd->irq_entry; irq = irqfd->irq_entry;
} while (read_seqcount_retry(&irqfd->irq_entry_sc, seq)); } while (read_seqcount_retry(&irqfd->irq_entry_sc, seq));
/* An event has been signaled, inject an interrupt */ /* An event has been signaled, inject an interrupt */
if (irq.type == KVM_IRQ_ROUTING_MSI) if (kvm_arch_set_irq_inatomic(&irq, kvm,
kvm_set_msi(&irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, KVM_USERSPACE_IRQ_SOURCE_ID, 1,
false); false) == -EWOULDBLOCK)
else
schedule_work(&irqfd->inject); schedule_work(&irqfd->inject);
srcu_read_unlock(&kvm->irq_srcu, idx); srcu_read_unlock(&kvm->irq_srcu, idx);
} }
@ -274,37 +236,54 @@ static void
irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
poll_table *pt) poll_table *pt)
{ {
struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt); struct kvm_kernel_irqfd *irqfd =
container_of(pt, struct kvm_kernel_irqfd, pt);
add_wait_queue(wqh, &irqfd->wait); add_wait_queue(wqh, &irqfd->wait);
} }
/* Must be called under irqfds.lock */ /* Must be called under irqfds.lock */
static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd) static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
{ {
struct kvm_kernel_irq_routing_entry *e; struct kvm_kernel_irq_routing_entry *e;
struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
int i, n_entries; int n_entries;
n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi); n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);
write_seqcount_begin(&irqfd->irq_entry_sc); write_seqcount_begin(&irqfd->irq_entry_sc);
irqfd->irq_entry.type = 0;
e = entries; e = entries;
for (i = 0; i < n_entries; ++i, ++e) { if (n_entries == 1)
/* Only fast-path MSI. */ irqfd->irq_entry = *e;
if (e->type == KVM_IRQ_ROUTING_MSI) else
irqfd->irq_entry = *e; irqfd->irq_entry.type = 0;
}
write_seqcount_end(&irqfd->irq_entry_sc); write_seqcount_end(&irqfd->irq_entry_sc);
} }
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
void __attribute__((weak)) kvm_arch_irq_bypass_stop(
struct irq_bypass_consumer *cons)
{
}
void __attribute__((weak)) kvm_arch_irq_bypass_start(
struct irq_bypass_consumer *cons)
{
}
int __attribute__((weak)) kvm_arch_update_irqfd_routing(
struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set)
{
return 0;
}
#endif
static int static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{ {
struct _irqfd *irqfd, *tmp; struct kvm_kernel_irqfd *irqfd, *tmp;
struct fd f; struct fd f;
struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
int ret; int ret;
@ -340,7 +319,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
irqfd->eventfd = eventfd; irqfd->eventfd = eventfd;
if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) { if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
struct _irqfd_resampler *resampler; struct kvm_kernel_irqfd_resampler *resampler;
resamplefd = eventfd_ctx_fdget(args->resamplefd); resamplefd = eventfd_ctx_fdget(args->resamplefd);
if (IS_ERR(resamplefd)) { if (IS_ERR(resamplefd)) {
@ -428,6 +407,17 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
* we might race against the POLLHUP * we might race against the POLLHUP
*/ */
fdput(f); fdput(f);
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
irqfd->consumer.token = (void *)irqfd->eventfd;
irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
irqfd->consumer.start = kvm_arch_irq_bypass_start;
ret = irq_bypass_register_consumer(&irqfd->consumer);
if (ret)
pr_info("irq bypass consumer (token %p) registration fails: %d\n",
irqfd->consumer.token, ret);
#endif
return 0; return 0;
@ -469,9 +459,18 @@ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
} }
EXPORT_SYMBOL_GPL(kvm_irq_has_notifier); EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
{ {
struct kvm_irq_ack_notifier *kian; struct kvm_irq_ack_notifier *kian;
hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
link)
if (kian->gsi == gsi)
kian->irq_acked(kian);
}
void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
int gsi, idx; int gsi, idx;
trace_kvm_ack_irq(irqchip, pin); trace_kvm_ack_irq(irqchip, pin);
@ -479,10 +478,7 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
idx = srcu_read_lock(&kvm->irq_srcu); idx = srcu_read_lock(&kvm->irq_srcu);
gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
if (gsi != -1) if (gsi != -1)
hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, kvm_notify_acked_gsi(kvm, gsi);
link)
if (kian->gsi == gsi)
kian->irq_acked(kian);
srcu_read_unlock(&kvm->irq_srcu, idx); srcu_read_unlock(&kvm->irq_srcu, idx);
} }
@ -525,7 +521,7 @@ kvm_eventfd_init(struct kvm *kvm)
static int static int
kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args) kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
{ {
struct _irqfd *irqfd, *tmp; struct kvm_kernel_irqfd *irqfd, *tmp;
struct eventfd_ctx *eventfd; struct eventfd_ctx *eventfd;
eventfd = eventfd_ctx_fdget(args->fd); eventfd = eventfd_ctx_fdget(args->fd);
@ -581,7 +577,7 @@ kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
void void
kvm_irqfd_release(struct kvm *kvm) kvm_irqfd_release(struct kvm *kvm)
{ {
struct _irqfd *irqfd, *tmp; struct kvm_kernel_irqfd *irqfd, *tmp;
spin_lock_irq(&kvm->irqfds.lock); spin_lock_irq(&kvm->irqfds.lock);
@ -604,13 +600,23 @@ kvm_irqfd_release(struct kvm *kvm)
*/ */
void kvm_irq_routing_update(struct kvm *kvm) void kvm_irq_routing_update(struct kvm *kvm)
{ {
struct _irqfd *irqfd; struct kvm_kernel_irqfd *irqfd;
spin_lock_irq(&kvm->irqfds.lock); spin_lock_irq(&kvm->irqfds.lock);
list_for_each_entry(irqfd, &kvm->irqfds.items, list) list_for_each_entry(irqfd, &kvm->irqfds.items, list) {
irqfd_update(kvm, irqfd); irqfd_update(kvm, irqfd);
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
if (irqfd->producer) {
int ret = kvm_arch_update_irqfd_routing(
irqfd->kvm, irqfd->producer->irq,
irqfd->gsi, 1);
WARN_ON(ret);
}
#endif
}
spin_unlock_irq(&kvm->irqfds.lock); spin_unlock_irq(&kvm->irqfds.lock);
} }
@ -914,9 +920,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
return -EINVAL; return -EINVAL;
/* ioeventfd with no length can't be combined with DATAMATCH */ /* ioeventfd with no length can't be combined with DATAMATCH */
if (!args->len && if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH))
args->flags & (KVM_IOEVENTFD_FLAG_PIO |
KVM_IOEVENTFD_FLAG_DATAMATCH))
return -EINVAL; return -EINVAL;
ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args); ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args);
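The relaxed check above is what permits a zero-length ("any length") ioeventfd, advertised to userspace as KVM_CAP_IOEVENTFD_ANY_LENGTH. A hypothetical userspace sketch (vm_fd is an existing VM fd; note DATAMATCH cannot be combined with a zero length):

#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int attach_any_length_ioeventfd(int vm_fd, unsigned long long mmio_addr)
{
        struct kvm_ioeventfd ioev = { 0 };
        int efd = eventfd(0, 0);

        if (efd < 0)
                return -1;

        ioev.addr = mmio_addr;
        ioev.len = 0;           /* 0 == match guest writes of any width */
        ioev.fd = efd;

        if (ioctl(vm_fd, KVM_IOEVENTFD, &ioev) < 0) {
                close(efd);
                return -1;
        }
        return efd;             /* read/poll this fd to observe guest writes */
}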


@ -31,16 +31,6 @@
#include <trace/events/kvm.h> #include <trace/events/kvm.h>
#include "irq.h" #include "irq.h"
struct kvm_irq_routing_table {
int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
u32 nr_rt_entries;
/*
* Array indexed by gsi. Each entry contains list of irq chips
* the gsi is connected to.
*/
struct hlist_head map[0];
};
int kvm_irq_map_gsi(struct kvm *kvm, int kvm_irq_map_gsi(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *entries, int gsi) struct kvm_kernel_irq_routing_entry *entries, int gsi)
{ {
@ -154,11 +144,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
/* /*
* Do not allow GSI to be mapped to the same irqchip more than once. * Do not allow GSI to be mapped to the same irqchip more than once.
* Allow only one to one mapping between GSI and MSI. * Allow only one to one mapping between GSI and non-irqchip routing.
*/ */
hlist_for_each_entry(ei, &rt->map[ue->gsi], link) hlist_for_each_entry(ei, &rt->map[ue->gsi], link)
if (ei->type == KVM_IRQ_ROUTING_MSI || if (ei->type != KVM_IRQ_ROUTING_IRQCHIP ||
ue->type == KVM_IRQ_ROUTING_MSI || ue->type != KVM_IRQ_ROUTING_IRQCHIP ||
ue->u.irqchip.irqchip == ei->irqchip.irqchip) ue->u.irqchip.irqchip == ei->irqchip.irqchip)
return r; return r;
@@ -231,6 +221,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
 	kvm_irq_routing_update(kvm);
 	mutex_unlock(&kvm->irq_lock);

+	kvm_arch_irq_routing_update(kvm);
+
 	synchronize_srcu_expedited(&kvm->irq_srcu);

 	new = old;

virt/kvm/kvm_main.c

@@ -230,6 +230,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	init_waitqueue_head(&vcpu->wq);
 	kvm_async_pf_vcpu_init(vcpu);

+	vcpu->pre_pcpu = -1;
+	INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
+
 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	if (!page) {
 		r = -ENOMEM;
@@ -2018,6 +2021,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 		} while (single_task_running() && ktime_before(cur, stop));
 	}

+	kvm_arch_vcpu_blocking(vcpu);
+
 	for (;;) {
 		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
@@ -2031,6 +2036,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 	finish_wait(&vcpu->wq, &wait);
 	cur = ktime_get();

+	kvm_arch_vcpu_unblocking(vcpu);
 out:
 	block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
@@ -2718,6 +2724,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_IRQFD:
 	case KVM_CAP_IRQFD_RESAMPLE:
 #endif
+	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
 	case KVM_CAP_CHECK_EXTENSION_VM:
 		return 1;
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
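
The new KVM_CAP_IOEVENTFD_ANY_LENGTH capability advertises that an ioeventfd may now be registered with len == 0, meaning the eventfd fires on a guest write of any width to the registered address; as the DATAMATCH check earlier in the series shows, data matching is still refused for zero-length registrations. A hypothetical userspace sketch follows (vm_fd and the doorbell address are made-up placeholders, not part of this series):

/* Register a wildcard-length ioeventfd on a hypothetical MMIO doorbell. */
#include <linux/kvm.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <unistd.h>

static int register_any_length_ioeventfd(int vm_fd)
{
	struct kvm_ioeventfd args;
	int efd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);

	if (efd < 0)
		return -1;

	memset(&args, 0, sizeof(args));
	args.addr  = 0xfe000000ULL;	/* hypothetical guest-physical doorbell */
	args.len   = 0;			/* 0 = match a write of any width */
	args.fd    = efd;
	args.flags = 0;			/* DATAMATCH is rejected when len == 0 */

	/* Only attempt this if KVM_CAP_IOEVENTFD_ANY_LENGTH is reported. */
	if (ioctl(vm_fd, KVM_IOEVENTFD, &args) < 0) {
		close(efd);
		return -1;
	}
	return efd;	/* poll/read this fd to observe guest writes */
}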
@@ -3341,7 +3348,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
 		return -ENOSPC;

-	new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) *
+	new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) *
 			  sizeof(struct kvm_io_range)), GFP_KERNEL);
 	if (!new_bus)
 		return -ENOMEM;
@@ -3373,7 +3380,7 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 	if (r)
 		return r;

-	new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) *
+	new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
 			  sizeof(struct kvm_io_range)), GFP_KERNEL);
 	if (!new_bus)
 		return -ENOMEM;

virt/lib/Kconfig (new file)

@@ -0,0 +1,2 @@
config IRQ_BYPASS_MANAGER
	tristate

virt/lib/Makefile (new file)

@@ -0,0 +1 @@
obj-$(CONFIG_IRQ_BYPASS_MANAGER) += irqbypass.o

virt/lib/irqbypass.c (new file)

@@ -0,0 +1,257 @@
/*
 * IRQ offload/bypass manager
 *
 * Copyright (C) 2015 Red Hat, Inc.
 * Copyright (c) 2015 Linaro Ltd.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Various virtualization hardware acceleration techniques allow bypassing or
 * offloading interrupts received from devices around the host kernel. Posted
 * Interrupts on Intel VT-d systems can allow interrupts to be received
 * directly by a virtual machine. ARM IRQ Forwarding allows forwarded physical
 * interrupts to be directly deactivated by the guest. This manager allows
 * interrupt producers and consumers to find each other to enable this sort of
 * bypass.
 */

#include <linux/irqbypass.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/mutex.h>

MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("IRQ bypass manager utility module");

static LIST_HEAD(producers);
static LIST_HEAD(consumers);
static DEFINE_MUTEX(lock);

/* @lock must be held when calling connect */
static int __connect(struct irq_bypass_producer *prod,
		     struct irq_bypass_consumer *cons)
{
	int ret = 0;

	if (prod->stop)
		prod->stop(prod);
	if (cons->stop)
		cons->stop(cons);

	if (prod->add_consumer)
		ret = prod->add_consumer(prod, cons);

	if (!ret) {
		ret = cons->add_producer(cons, prod);
		if (ret && prod->del_consumer)
			prod->del_consumer(prod, cons);
	}

	if (cons->start)
		cons->start(cons);
	if (prod->start)
		prod->start(prod);

	return ret;
}

/* @lock must be held when calling disconnect */
static void __disconnect(struct irq_bypass_producer *prod,
			 struct irq_bypass_consumer *cons)
{
	if (prod->stop)
		prod->stop(prod);
	if (cons->stop)
		cons->stop(cons);

	cons->del_producer(cons, prod);

	if (prod->del_consumer)
		prod->del_consumer(prod, cons);

	if (cons->start)
		cons->start(cons);
	if (prod->start)
		prod->start(prod);
}

/**
 * irq_bypass_register_producer - register IRQ bypass producer
 * @producer: pointer to producer structure
 *
 * Add the provided IRQ producer to the list of producers and connect
 * with any matching token found on the IRQ consumers list.
 */
int irq_bypass_register_producer(struct irq_bypass_producer *producer)
{
	struct irq_bypass_producer *tmp;
	struct irq_bypass_consumer *consumer;

	might_sleep();

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	mutex_lock(&lock);

	list_for_each_entry(tmp, &producers, node) {
		if (tmp->token == producer->token) {
			mutex_unlock(&lock);
			module_put(THIS_MODULE);
			return -EBUSY;
		}
	}

	list_for_each_entry(consumer, &consumers, node) {
		if (consumer->token == producer->token) {
			int ret = __connect(producer, consumer);
			if (ret) {
				mutex_unlock(&lock);
				module_put(THIS_MODULE);
				return ret;
			}
			break;
		}
	}

	list_add(&producer->node, &producers);

	mutex_unlock(&lock);

	return 0;
}
EXPORT_SYMBOL_GPL(irq_bypass_register_producer);

/**
 * irq_bypass_unregister_producer - unregister IRQ bypass producer
 * @producer: pointer to producer structure
 *
 * Remove a previously registered IRQ producer from the list of producers
 * and disconnect it from any connected IRQ consumer.
 */
void irq_bypass_unregister_producer(struct irq_bypass_producer *producer)
{
	struct irq_bypass_producer *tmp;
	struct irq_bypass_consumer *consumer;

	might_sleep();

	if (!try_module_get(THIS_MODULE))
		return; /* nothing in the list anyway */

	mutex_lock(&lock);

	list_for_each_entry(tmp, &producers, node) {
		if (tmp->token != producer->token)
			continue;

		list_for_each_entry(consumer, &consumers, node) {
			if (consumer->token == producer->token) {
				__disconnect(producer, consumer);
				break;
			}
		}

		list_del(&producer->node);
		module_put(THIS_MODULE);
		break;
	}

	mutex_unlock(&lock);

	module_put(THIS_MODULE);
}
EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer);

/**
 * irq_bypass_register_consumer - register IRQ bypass consumer
 * @consumer: pointer to consumer structure
 *
 * Add the provided IRQ consumer to the list of consumers and connect
 * with any matching token found on the IRQ producer list.
 */
int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer)
{
	struct irq_bypass_consumer *tmp;
	struct irq_bypass_producer *producer;

	if (!consumer->add_producer || !consumer->del_producer)
		return -EINVAL;

	might_sleep();

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	mutex_lock(&lock);

	list_for_each_entry(tmp, &consumers, node) {
		if (tmp->token == consumer->token) {
			mutex_unlock(&lock);
			module_put(THIS_MODULE);
			return -EBUSY;
		}
	}

	list_for_each_entry(producer, &producers, node) {
		if (producer->token == consumer->token) {
			int ret = __connect(producer, consumer);
			if (ret) {
				mutex_unlock(&lock);
				module_put(THIS_MODULE);
				return ret;
			}
			break;
		}
	}

	list_add(&consumer->node, &consumers);

	mutex_unlock(&lock);

	return 0;
}
EXPORT_SYMBOL_GPL(irq_bypass_register_consumer);

/**
 * irq_bypass_unregister_consumer - unregister IRQ bypass consumer
 * @consumer: pointer to consumer structure
 *
 * Remove a previously registered IRQ consumer from the list of consumers
 * and disconnect it from any connected IRQ producer.
 */
void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer)
{
	struct irq_bypass_consumer *tmp;
	struct irq_bypass_producer *producer;

	might_sleep();

	if (!try_module_get(THIS_MODULE))
		return; /* nothing in the list anyway */

	mutex_lock(&lock);

	list_for_each_entry(tmp, &consumers, node) {
		if (tmp->token != consumer->token)
			continue;

		list_for_each_entry(producer, &producers, node) {
			if (producer->token == consumer->token) {
				__disconnect(producer, consumer);
				break;
			}
		}

		list_del(&consumer->node);
		module_put(THIS_MODULE);
		break;
	}

	mutex_unlock(&lock);

	module_put(THIS_MODULE);
}
EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer);
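
To make the pairing model above concrete, here is a minimal, hypothetical sketch of a client module (not part of this series; the module, callbacks, token and IRQ number are all made up). A producer and a consumer register with the same opaque token, and the manager invokes the consumer's add_producer() callback once both ends exist; in the actual series the producer side is VFIO and the consumer side is a KVM irqfd, with the interrupt's eventfd context serving as the shared token, and the callback is where the architecture code would set up VT-d posted interrupts or ARM forwarding for producer->irq.

/* Toy irqbypass client: pairs a dummy producer and consumer by token. */
#include <linux/irqbypass.h>
#include <linux/kernel.h>
#include <linux/module.h>

static int demo_add_producer(struct irq_bypass_consumer *cons,
			     struct irq_bypass_producer *prod)
{
	/* Called under the manager's lock once the tokens match. */
	pr_info("irqbypass demo: connected to host irq %d\n", prod->irq);
	return 0;	/* a non-zero return aborts the connection */
}

static void demo_del_producer(struct irq_bypass_consumer *cons,
			      struct irq_bypass_producer *prod)
{
	pr_info("irqbypass demo: disconnected from host irq %d\n", prod->irq);
}

/* Any kernel pointer known to both ends can serve as the token. */
static int demo_token;

static struct irq_bypass_producer demo_producer = {
	.token	= &demo_token,
	.irq	= 42,			/* hypothetical host interrupt */
};

static struct irq_bypass_consumer demo_consumer = {
	.token		= &demo_token,
	.add_producer	= demo_add_producer,
	.del_producer	= demo_del_producer,
};

static int __init demo_init(void)
{
	int ret = irq_bypass_register_consumer(&demo_consumer);

	if (ret)
		return ret;

	/* Matching token: the manager calls demo_add_producer() here. */
	ret = irq_bypass_register_producer(&demo_producer);
	if (ret)
		irq_bypass_unregister_consumer(&demo_consumer);
	return ret;
}

static void __exit demo_exit(void)
{
	irq_bypass_unregister_producer(&demo_producer);
	irq_bypass_unregister_consumer(&demo_consumer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL v2");

Unregistering either side runs __disconnect(), so del_producer()/del_consumer() are called and the bypass is torn down symmetrically.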