diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 05866391f406..7d64f99ea3b8 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -35,6 +35,7 @@ #include #include #include +#include #define KVM_MAX_VCPUS NR_CPUS #define KVM_MAX_VCORES NR_CPUS @@ -727,6 +728,7 @@ struct kvm_vcpu_arch { int prev_cpu; bool timer_running; wait_queue_head_t cpu_run; + struct machine_check_event mce_evt; /* Valid if trap == 0x200 */ struct kvm_vcpu_arch_shared *shared; #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE) diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 07fbeb927834..8cf8f0c96906 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -60,6 +60,12 @@ struct kvm_regs { #define KVM_SREGS_E_FSL_PIDn (1 << 0) /* PID1/PID2 */ +/* flags for kvm_run.flags */ +#define KVM_RUN_PPC_NMI_DISP_MASK (3 << 0) +#define KVM_RUN_PPC_NMI_DISP_FULLY_RECOV (1 << 0) +#define KVM_RUN_PPC_NMI_DISP_LIMITED_RECOV (2 << 0) +#define KVM_RUN_PPC_NMI_DISP_NOT_RECOV (3 << 0) + /* * Feature bits indicate which sections of the sregs struct are valid, * both in KVM_GET_SREGS and KVM_SET_SREGS. On KVM_SET_SREGS, registers diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index f6a846c4f984..c4ada89be658 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1088,15 +1088,20 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_MACHINE_CHECK: - /* - * Deliver a machine check interrupt to the guest. - * We have to do this, even if the host has handled the - * machine check, because machine checks use SRR0/1 and - * the interrupt might have trashed guest state in them. 
- */ - kvmppc_book3s_queue_irqprio(vcpu, - BOOK3S_INTERRUPT_MACHINE_CHECK); - r = RESUME_GUEST; + /* Exit to host userspace with KVM_EXIT_NMI as exit reason */ + run->exit_reason = KVM_EXIT_NMI; + run->hw.hardware_exit_reason = vcpu->arch.trap; + /* Clear out the old NMI status from run->flags */ + run->flags &= ~KVM_RUN_PPC_NMI_DISP_MASK; + /* Now set the NMI status */ + if (vcpu->arch.mce_evt.disposition == MCE_DISPOSITION_RECOVERED) + run->flags |= KVM_RUN_PPC_NMI_DISP_FULLY_RECOV; + else + run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV; + + r = RESUME_HOST; + /* Print the MCE event to host console. */ + machine_check_print_event_info(&vcpu->arch.mce_evt, false); break; case BOOK3S_INTERRUPT_PROGRAM: { diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c index 7ef0993214f3..c356f9a40b24 100644 --- a/arch/powerpc/kvm/book3s_hv_ras.c +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -130,12 +130,28 @@ static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) out: /* + * For guest that supports FWNMI capability, hook the MCE event into + * vcpu structure. We are going to exit the guest with KVM_EXIT_NMI + * exit reason. On our way to exit we will pull this event from vcpu + * structure and print it from thread 0 of the core/subcore. + * + * For guest that does not support FWNMI capability (old QEMU): * We are now going enter guest either through machine check * interrupt (for unhandled errors) or will continue from * current HSRR0 (for handled errors) in guest. Hence * queue up the event so that we can log it from host console later. */ - machine_check_queue_event(); + if (vcpu->kvm->arch.fwnmi_enabled) { + /* + * Hook up the mce event on to vcpu structure. + * First clear the old event. 
+ */ + memset(&vcpu->arch.mce_evt, 0, sizeof(vcpu->arch.mce_evt)); + if (get_mce_event(&mce_evt, MCE_EVENT_RELEASE)) { + vcpu->arch.mce_evt = mce_evt; + } + } else + machine_check_queue_event(); return handled; } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index ae6d93ee99d4..e3793bd510fe 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -153,15 +153,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) stb r0, HSTATE_HWTHREAD_REQ(r13) /* - * For external and machine check interrupts, we need - * to call the Linux handler to process the interrupt. - * We do that by jumping to absolute address 0x500 for - * external interrupts, or the machine_check_fwnmi label - * for machine checks (since firmware might have patched - * the vector area at 0x200). The [h]rfid at the end of the - * handler will return to the book3s_hv_interrupts.S code. - * For other interrupts we do the rfid to get back - * to the book3s_hv_interrupts.S code here. + * For external interrupts we need to call the Linux + * handler to process the interrupt. We do that by jumping + * to absolute address 0x500 for external interrupts. + * The [h]rfid at the end of the handler will return to + * the book3s_hv_interrupts.S code. For other interrupts + * we do the rfid to get back to the book3s_hv_interrupts.S + * code here. */ ld r8, 112+PPC_LR_STKOFF(r1) addi r1, r1, 112 @@ -176,7 +174,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) andi. r0, r0, MSR_IR /* in real mode? 
*/ bne .Lvirt_return - cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL beq 11f cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL @@ -191,7 +188,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtmsrd r6, 1 /* Clear RI in MSR */ mtsrr0 r8 mtsrr1 r7 - beq cr1, 13f /* machine check */ + /* + * BOOK3S_INTERRUPT_MACHINE_CHECK is handled at the + * time of guest exit + */ RFI /* On POWER7, we have external interrupts set to use HSRR0/1 */ @@ -199,8 +199,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtspr SPRN_HSRR1, r7 ba 0x500 -13: b machine_check_fwnmi - 14: mtspr SPRN_HSRR0, r8 mtspr SPRN_HSRR1, r7 b hmi_exception_after_realmode @@ -2640,22 +2638,32 @@ machine_check_realmode: ld r9, HSTATE_KVM_VCPU(r13) li r12, BOOK3S_INTERRUPT_MACHINE_CHECK /* - * Deliver unhandled/fatal (e.g. UE) MCE errors to guest through - * machine check interrupt (set HSRR0 to 0x200). And for handled - * errors (no-fatal), just go back to guest execution with current - * HSRR0 instead of exiting guest. This new approach will inject - * machine check to guest for fatal error causing guest to crash. - * - * The old code used to return to host for unhandled errors which - * was causing guest to hang with soft lockups inside guest and - * makes it difficult to recover guest instance. + * For the guest that is FWNMI capable, deliver all the MCE errors + * (handled/unhandled) by exiting the guest with KVM_EXIT_NMI exit + * reason. This new approach injects machine check errors in guest + * address space to guest with additional information in the form + * of RTAS event, thus enabling guest kernel to suitably handle + * such errors. * + * For the guest that is not FWNMI capable (old QEMU) fallback + * to old behaviour for backward compatibility: + * Deliver unhandled/fatal (e.g. UE) MCE errors to guest + * through machine check interrupt (set HSRR0 to 0x200). + * For handled errors (non-fatal), just go back to guest execution + * with current HSRR0. 
* if we receive machine check with MSR(RI=0) then deliver it to * guest as machine check causing guest to crash. */ ld r11, VCPU_MSR(r9) rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */ bne mc_cont /* if so, exit to host */ + /* Check if guest is capable of handling NMI exit */ + ld r10, VCPU_KVM(r9) + lbz r10, KVM_FWNMI(r10) + cmpdi r10, 1 /* FWNMI capable? */ + beq mc_cont /* if so, exit with KVM_EXIT_NMI. */ + + /* if not, fall through for backward compatibility. */ andi. r10, r11, MSR_RI /* check for unrecoverable exception */ beq 1f /* Deliver a machine check to guest */ ld r10, VCPU_PC(r9)