KVM: PPC: Book3S HV: Improve handling of debug-trigger HMIs on POWER9

Hypervisor maintenance interrupts (HMIs) are generated by various
causes, signalled by bits in the hypervisor maintenance exception
register (HMER).  In most cases calling OPAL to handle the interrupt
is the correct thing to do, but the "debug trigger" HMIs signalled by
PPC bit 17 (bit 46 counting from the least-significant bit) of HMER are
used to invoke software workarounds for hardware bugs, and OPAL does
not have any code to handle this
cause.  The debug trigger HMI is used in POWER9 DD2.0 and DD2.1 chips
to work around a hardware bug in executing vector load instructions to
cache inhibited memory.  In POWER9 DD2.2 chips, it is generated when
conditions are detected relating to threads being in TM (transactional
memory) suspended mode when the core SMT configuration needs to be
reconfigured.

The kernel currently has code to detect the vector CI load condition,
but only when the HMI occurs in the host, not when it occurs in a
guest.  If an HMI occurs in the guest, it is always passed to OPAL, and
then we always re-sync the timebase, because the HMI cause might have
been a timebase error, for which OPAL would re-sync the timebase, thus
removing the timebase offset which KVM applied for the guest.  Since
we don't know what OPAL did, we don't know whether to subtract the
timebase offset from the timebase, so instead we re-sync the timebase.

This adds code to determine explicitly what the cause of a debug
trigger HMI will be.  This is based on a new device-tree property
under the CPU nodes called ibm,hmi-special-triggers, if it is
present, or otherwise based on the PVR (processor version register).
The handling of debug trigger HMIs is pulled out into a separate
function which can be called from the KVM guest exit code.  If this
function handles and clears the HMI, and no other HMI causes remain,
then we skip calling OPAL and we proceed to subtract the guest
timebase offset from the timebase.

The overall handling for HMIs that occur in the host (i.e. not in a
KVM guest) is largely unchanged, except that we no longer set the flag
for the vector CI load workaround on DD2.2 processors.

This also removes a BUG_ON in the KVM code.  BUG_ON is generally not
useful in KVM guest entry/exit code since it is difficult to handle
the resulting trap gracefully.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
This commit is contained in:
Paul Mackerras 2018-01-17 20:51:13 +11:00 committed by Michael Ellerman
parent 7f1c410da5
commit d075745d89
5 changed files with 131 additions and 37 deletions

View File

@ -42,4 +42,8 @@ extern void wait_for_tb_resync(void);
static inline void wait_for_subcore_guest_exit(void) { }
static inline void wait_for_tb_resync(void) { }
#endif
struct pt_regs;
extern long hmi_handle_debugtrig(struct pt_regs *regs);
#endif /* __ASM_PPC64_HMI_H__ */

View File

@ -432,8 +432,9 @@
#define SPRN_LPID 0x13F /* Logical Partition Identifier */
#endif
#define LPID_RSVD 0x3ff /* Reserved LPID for partn switching */
#define SPRN_HMER 0x150 /* Hardware m? error recovery */
#define SPRN_HMEER 0x151 /* Hardware m? enable error recovery */
#define SPRN_HMER 0x150 /* Hypervisor maintenance exception reg */
#define HMER_DEBUG_TRIG (1ul << (63 - 17)) /* Debug trigger */
#define SPRN_HMEER 0x151 /* Hyp maintenance exception enable reg */
#define SPRN_PCR 0x152 /* Processor compatibility register */
#define PCR_VEC_DIS (1ul << (63-0)) /* Vec. disable (bit NA since POWER8) */
#define PCR_VSX_DIS (1ul << (63-1)) /* VSX disable (bit NA since POWER8) */

View File

@ -495,37 +495,123 @@ long machine_check_early(struct pt_regs *regs)
return handled;
}
long hmi_exception_realmode(struct pt_regs *regs)
/*
 * Possible meanings for HMER_DEBUG_TRIG bit being set on POWER9.
 *
 * The variable has static storage duration, so it starts out as
 * DTRIG_UNKNOWN (the first enumerator).  init_debug_trig_function()
 * assigns it, and hmi_handle_debugtrig() reads it to decide how to
 * treat a debug-trigger HMI.
 */
static enum {
DTRIG_UNKNOWN, /* meaning not determined; hmi_handle_debugtrig() returns -1 */
DTRIG_VECTOR_CI, /* need to emulate vector CI load instr */
DTRIG_SUSPEND_ESCAPE, /* need to escape from TM suspend mode */
} hmer_debug_trig_function;
static int init_debug_trig_function(void)
{
__this_cpu_inc(irq_stat.hmi_exceptions);
int pvr;
struct device_node *cpun;
struct property *prop = NULL;
const char *str;
#ifdef CONFIG_PPC_BOOK3S_64
/* Workaround for P9 vector CI loads (see p9_hmi_special_emu) */
if (pvr_version_is(PVR_POWER9)) {
/* First look in the device tree */
preempt_disable();
cpun = of_get_cpu_node(smp_processor_id(), NULL);
if (cpun) {
of_property_for_each_string(cpun, "ibm,hmi-special-triggers",
prop, str) {
if (strcmp(str, "bit17-vector-ci-load") == 0)
hmer_debug_trig_function = DTRIG_VECTOR_CI;
else if (strcmp(str, "bit17-tm-suspend-escape") == 0)
hmer_debug_trig_function = DTRIG_SUSPEND_ESCAPE;
}
of_node_put(cpun);
}
preempt_enable();
/* If we found the property, don't look at PVR */
if (prop)
goto out;
pvr = mfspr(SPRN_PVR);
/* Check for POWER9 Nimbus (scale-out) */
if ((PVR_VER(pvr) == PVR_POWER9) && (pvr & 0xe000) == 0) {
/* DD2.2 and later */
if ((pvr & 0xfff) >= 0x202)
hmer_debug_trig_function = DTRIG_SUSPEND_ESCAPE;
/* DD2.0 and DD2.1 - used for vector CI load emulation */
else if ((pvr & 0xfff) >= 0x200)
hmer_debug_trig_function = DTRIG_VECTOR_CI;
}
out:
switch (hmer_debug_trig_function) {
case DTRIG_VECTOR_CI:
pr_debug("HMI debug trigger used for vector CI load\n");
break;
case DTRIG_SUSPEND_ESCAPE:
pr_debug("HMI debug trigger used for TM suspend escape\n");
break;
default:
break;
}
return 0;
}
__initcall(init_debug_trig_function);
/*
* Handle HMIs that occur as a result of a debug trigger.
* Return values:
* -1 means this is not an HMI cause that we know about, or that
*    other enabled HMI causes remain to be handled
* 0 means no further handling is required
* 1 means further handling is required
*/
long hmi_handle_debugtrig(struct pt_regs *regs)
{
unsigned long hmer = mfspr(SPRN_HMER);
long ret = 0;
/* Do we have the debug bit set */
if (hmer & PPC_BIT(17)) {
hmer &= ~PPC_BIT(17);
mtspr(SPRN_HMER, hmer);
/* HMER_DEBUG_TRIG bit is used for various workarounds on P9 */
if (!((hmer & HMER_DEBUG_TRIG)
&& hmer_debug_trig_function != DTRIG_UNKNOWN))
return -1;
hmer &= ~HMER_DEBUG_TRIG;
/* HMER is a write-AND register */
mtspr(SPRN_HMER, ~HMER_DEBUG_TRIG);
switch (hmer_debug_trig_function) {
case DTRIG_VECTOR_CI:
/*
* Now to avoid problems with soft-disable we
* only do the emulation if we are coming from
* user space
* host user space
*/
if (user_mode(regs))
local_paca->hmi_p9_special_emu = 1;
if (regs && user_mode(regs))
ret = local_paca->hmi_p9_special_emu = 1;
break;
default:
break;
}
/*
* Don't bother going to OPAL if that's the
* only relevant bit.
* See if any other HMI causes remain to be handled
*/
if (!(hmer & mfspr(SPRN_HMEER)))
return local_paca->hmi_p9_special_emu;
}
}
#endif /* CONFIG_PPC_BOOK3S_64 */
if (hmer & mfspr(SPRN_HMEER))
return -1;
return ret;
}
/*
* Return values:
*/
long hmi_exception_realmode(struct pt_regs *regs)
{
int ret;
__this_cpu_inc(irq_stat.hmi_exceptions);
ret = hmi_handle_debugtrig(regs);
if (ret >= 0)
return ret;
wait_for_subcore_guest_exit();

View File

@ -268,17 +268,19 @@ static void kvmppc_tb_resync_done(void)
* secondary threads to proceed.
* - All secondary threads will eventually call opal hmi handler on
* their exit path.
*
* Returns 1 if the timebase offset should be applied, 0 if not.
*/
long kvmppc_realmode_hmi_handler(void)
{
int ptid = local_paca->kvm_hstate.ptid;
bool resync_req;
/* This is only called on primary thread. */
BUG_ON(ptid != 0);
__this_cpu_inc(irq_stat.hmi_exceptions);
if (hmi_handle_debugtrig(NULL) >= 0)
return 1;
/*
* By now primary thread has already completed guest->host
* partition switch but haven't signaled secondaries yet.

View File

@ -1909,16 +1909,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
bne 27f
bl kvmppc_realmode_hmi_handler
nop
cmpdi r3, 0
li r12, BOOK3S_INTERRUPT_HMI
/*
* At this point kvmppc_realmode_hmi_handler would have resync-ed
* the TB. Hence it is not required to subtract guest timebase
* offset from timebase. So, skip it.
* At this point kvmppc_realmode_hmi_handler may have resync-ed
* the TB, and if it has, we must not subtract the guest timebase
* offset from the timebase. So, skip it.
*
* Also, do not call kvmppc_subcore_exit_guest() because it has
* been invoked as part of kvmppc_realmode_hmi_handler().
*/
b 30f
beq 30f
27:
/* Subtract timebase offset from timebase */