2007-08-22 11:46:44 +08:00
|
|
|
#ifndef _ASM_POWERPC_EXCEPTION_H
|
|
|
|
#define _ASM_POWERPC_EXCEPTION_H
|
|
|
|
/*
|
|
|
|
* Extracted from head_64.S
|
|
|
|
*
|
|
|
|
* PowerPC version
|
|
|
|
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
|
|
|
|
*
|
|
|
|
* Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP
|
|
|
|
* Copyright (C) 1996 Cort Dougan <cort@cs.nmt.edu>
|
|
|
|
* Adapted for Power Macintosh by Paul Mackerras.
|
|
|
|
* Low-level exception handlers and MMU support
|
|
|
|
* rewritten by Paul Mackerras.
|
|
|
|
* Copyright (C) 1996 Paul Mackerras.
|
|
|
|
*
|
|
|
|
* Adapted for 64bit PowerPC by Dave Engebretsen, Peter Bergner, and
|
|
|
|
* Mike Corrigan {engebret|bergner|mikejc}@us.ibm.com
|
|
|
|
*
|
|
|
|
* This file contains the low-level support and setup for the
|
|
|
|
* PowerPC-64 platform, including trap and interrupt dispatch.
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
* as published by the Free Software Foundation; either version
|
|
|
|
* 2 of the License, or (at your option) any later version.
|
|
|
|
*/
|
|
|
|
/*
|
|
|
|
* The following macros define the code that appears as
|
|
|
|
* the prologue to each of the exception handlers. They
|
|
|
|
* are split into two parts to allow a single kernel binary
|
|
|
|
* to be used for pSeries and iSeries.
|
|
|
|
*
|
|
|
|
* We make as much of the exception code common between native
|
|
|
|
* exception handlers (including pSeries LPAR) and iSeries LPAR
|
|
|
|
* implementations as possible.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
 * Byte offsets of the fields of one exception save area.  The macros
 * in this file address them as area+EX_xxx(r13).  Fields are 8 bytes
 * apart except EX_DSISR (56) and EX_CCR (60), which are 4 bytes apart,
 * so each of those two has room for a 32-bit value only.
 */
#define EX_R9 0
|
|
|
|
#define EX_R10 8
|
|
|
|
#define EX_R11 16
|
|
|
|
#define EX_R12 24
|
|
|
|
#define EX_R13 32
|
|
|
|
#define EX_SRR0 40
|
|
|
|
#define EX_DAR 48
|
|
|
|
#define EX_DSISR 56
|
|
|
|
#define EX_CCR 60
|
|
|
|
#define EX_R3 64
|
|
|
|
#define EX_LR 72
|
2011-05-02 03:48:20 +08:00
|
|
|
#define EX_CFAR 80
|
2007-08-22 11:46:44 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We're short on space and time in the exception prolog, so we can't
|
|
|
|
* use the normal SET_REG_IMMEDIATE macro. Normally we just need the
|
|
|
|
* low halfword of the address, but for Kdump we need the whole low
|
|
|
|
* word.
|
|
|
|
*/
|
|
|
|
/*
 * LOAD_HANDLER(reg, label): add the link-time distance (label) - _stext
 * to reg using a single addi.  reg must already hold the base address
 * the offset is to be applied to (the pSeries prolog in this file loads
 * it from PACAKBASE(r13)).  addi takes a signed 16-bit immediate, so
 * label has to lie less than 32KB after _stext.  The expansion ends in
 * ';', so the caller need not add one.
 */
#define LOAD_HANDLER(reg, label) \
|
2008-08-30 09:40:24 +08:00
|
|
|
addi reg,reg,(label)-_stext; /* virt addr of handler ... */
|
2007-08-22 11:46:44 +08:00
|
|
|
|
2011-04-05 12:20:31 +08:00
|
|
|
/* Exception register prefixes */
|
|
|
|
/*
 * Values for the 'h' argument of the prolog macros.  The argument is
 * token-pasted into SPR, instruction and label names, so EXC_HV (H)
 * selects SPRN_HSRR0/SPRN_HSRR1/Hrfid and EXC_STD (empty) selects
 * SPRN_SRR0/SPRN_SRR1/rfid.
 */
#define EXC_HV H
|
|
|
|
#define EXC_STD
|
|
|
|
|
2011-06-29 08:18:26 +08:00
|
|
|
/*
 * First stage of the exception prolog.
 *   area  - offset (from r13) of the save area to use
 *   extra - name of a macro invoked as extra(vec) once r9 and r10 have
 *           been saved and CR has been copied into r9
 *   vec   - argument handed to extra
 * Saves r9-r12 into the area and, inside the CPU_FTR_CFAR feature
 * section, SPRN_CFAR as well.  The value returned by GET_SCRATCH0 is
 * stored in the EX_R13 slot; the vector macros in this file execute
 * SET_SCRATCH0(r13) first, so that is presumably the interrupted
 * context's r13 -- confirm against the SET_SCRATCH0/GET_SCRATCH0
 * definitions.
 * On exit: r13 is whatever GET_PACA loaded, r9 holds the CR image from
 * mfcr, r10 is clobbered, and r11/r12 still hold their entry values
 * unless extra changed them.
 */
#define __EXCEPTION_PROLOG_1(area, extra, vec) \
|
2011-01-20 14:50:21 +08:00
|
|
|
GET_PACA(r13); \
|
2007-08-22 11:48:37 +08:00
|
|
|
std r9,area+EX_R9(r13); /* save r9 - r12 */ \
|
|
|
|
std r10,area+EX_R10(r13); \
|
2011-05-02 03:48:20 +08:00
|
|
|
BEGIN_FTR_SECTION_NESTED(66); \
|
|
|
|
mfspr r10,SPRN_CFAR; \
|
|
|
|
std r10,area+EX_CFAR(r13); \
|
|
|
|
END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66); \
|
2011-06-29 08:18:26 +08:00
|
|
|
mfcr r9; \
|
|
|
|
extra(vec); \
|
|
|
|
std r11,area+EX_R11(r13); \
|
|
|
|
std r12,area+EX_R12(r13); \
|
|
|
|
GET_SCRATCH0(r10); \
|
|
|
|
std r10,area+EX_R13(r13)
|
|
|
|
/*
 * Forwards its arguments unchanged to __EXCEPTION_PROLOG_1.
 * NOTE(review): presumably the extra macro level exists so that the
 * arguments are fully macro-expanded before they are used -- confirm.
 */
#define EXCEPTION_PROLOG_1(area, extra, vec) \
|
|
|
|
__EXCEPTION_PROLOG_1(area, extra, vec)
|
2007-08-22 11:48:37 +08:00
|
|
|
|
2011-04-05 12:20:31 +08:00
|
|
|
/*
 * Second stage of the pSeries prolog: leave the vector by "returning"
 * into the handler.
 *   label - handler to continue at
 *   h     - empty or H; pasted into the SPR names and the rfid mnemonic
 * Puts the handler address (value loaded from PACAKBASE(r13) plus the
 * offset added by LOAD_HANDLER) into (H)SRR0 and the value loaded from
 * PACAKMSR(r13) into (H)SRR1, then executes (h)rfid.  The previous
 * (H)SRR0 and (H)SRR1 are left in r11 and r12 for the handler; r10 is
 * clobbered.
 * LOAD_HANDLER is used without a trailing ';' because its expansion
 * already ends in one.
 */
#define __EXCEPTION_PROLOG_PSERIES_1(label, h) \
|
2008-08-30 09:40:24 +08:00
|
|
|
ld r12,PACAKBASE(r13); /* get high part of &label */ \
|
|
|
|
ld r10,PACAKMSR(r13); /* get MSR value for kernel */ \
|
2011-04-05 12:20:31 +08:00
|
|
|
mfspr r11,SPRN_##h##SRR0; /* save SRR0 */ \
|
2007-08-22 11:46:44 +08:00
|
|
|
LOAD_HANDLER(r12,label) \
|
2011-04-05 12:20:31 +08:00
|
|
|
mtspr SPRN_##h##SRR0,r12; \
|
|
|
|
mfspr r12,SPRN_##h##SRR1; /* and SRR1 */ \
|
|
|
|
mtspr SPRN_##h##SRR1,r10; \
|
|
|
|
h##rfid; \
|
2007-08-22 11:46:44 +08:00
|
|
|
b . /* prevent speculative execution */
|
2011-06-29 08:18:26 +08:00
|
|
|
/*
 * Wrapper that lets 'h' be macro-expanded (EXC_HV -> H, EXC_STD ->
 * empty) before __EXCEPTION_PROLOG_PSERIES_1 pastes it with ##; the
 * operands of ## are not expanded inside the macro doing the pasting.
 */
#define EXCEPTION_PROLOG_PSERIES_1(label, h) \
|
2011-04-05 12:20:31 +08:00
|
|
|
__EXCEPTION_PROLOG_PSERIES_1(label, h)
|
2007-08-22 11:46:44 +08:00
|
|
|
|
2011-06-29 08:18:26 +08:00
|
|
|
/*
 * Complete vector-side prolog: EXCEPTION_PROLOG_1 saves the scratch
 * registers into 'area' and runs extra(vec), then
 * EXCEPTION_PROLOG_PSERIES_1 transfers control to 'label' using the
 * SRR or HSRR register pair selected by 'h'.
 * Note that the expansion itself ends with ';'.
 */
#define EXCEPTION_PROLOG_PSERIES(area, label, h, extra, vec) \
|
|
|
|
EXCEPTION_PROLOG_1(area, extra, vec); \
|
2011-04-05 12:20:31 +08:00
|
|
|
EXCEPTION_PROLOG_PSERIES_1(label, h);
|
2009-07-17 03:36:57 +08:00
|
|
|
|
2011-06-29 08:18:26 +08:00
|
|
|
/*
 * Branch to do_kvm_<n> if the byte at HSTATE_IN_GUEST(r13) is non-zero.
 * Clobbers r10 and cr0.  When the branch is taken r10 still holds the
 * byte that was loaded, which __KVM_HANDLER_SKIP goes on to compare
 * with KVM_GUEST_MODE_SKIP.
 */
#define __KVMTEST(n) \
|
2011-06-29 08:20:58 +08:00
|
|
|
lbz r10,HSTATE_IN_GUEST(r13); \
|
2011-06-29 08:18:26 +08:00
|
|
|
cmpwi r10,0; \
|
|
|
|
bne do_kvm_##n
|
|
|
|
|
|
|
|
/*
 * Emits the do_kvm_<n> label that __KVMTEST(n) branches to.
 * Reloads r10 from the save area, stores the low word of r9 to
 * HSTATE_SCRATCH1(r13) (when reached from __EXCEPTION_PROLOG_1, r9 is
 * the CR image taken by mfcr), reloads r9 from the save area, stores
 * r12 to HSTATE_SCRATCH0(r13), sets r12 = n and branches to
 * kvmppc_interrupt.
 * The 'h' argument is not used by this variant.
 */
#define __KVM_HANDLER(area, h, n) \
|
|
|
|
do_kvm_##n: \
|
|
|
|
ld r10,area+EX_R10(r13); \
|
2011-06-29 08:20:58 +08:00
|
|
|
stw r9,HSTATE_SCRATCH1(r13); \
|
2011-06-29 08:18:26 +08:00
|
|
|
ld r9,area+EX_R9(r13); \
|
2011-06-29 08:20:58 +08:00
|
|
|
std r12,HSTATE_SCRATCH0(r13); \
|
2011-06-29 08:18:26 +08:00
|
|
|
li r12,n; \
|
|
|
|
b kvmppc_interrupt
|
|
|
|
|
|
|
|
/*
 * Like __KVM_HANDLER, but first compares r10 (still the byte loaded by
 * __KVMTEST) with KVM_GUEST_MODE_SKIP.  The ld that follows the compare
 * does not alter CR, so the beq still sees the compare result.
 *   not equal: same sequence as __KVM_HANDLER, ending in a branch to
 *              kvmppc_interrupt with r12 = n.
 *   equal (89:): CR field 0 is restored from r9 (mtocrf mask 0x80),
 *              r9 is reloaded from the save area and control goes to
 *              kvmppc_skip_<h>interrupt.
 * In both paths r10 has already been reloaded from the save area.
 */
#define __KVM_HANDLER_SKIP(area, h, n) \
|
|
|
|
do_kvm_##n: \
|
|
|
|
cmpwi r10,KVM_GUEST_MODE_SKIP; \
|
|
|
|
ld r10,area+EX_R10(r13); \
|
|
|
|
beq 89f; \
|
2011-06-29 08:20:58 +08:00
|
|
|
stw r9,HSTATE_SCRATCH1(r13); \
|
2011-06-29 08:18:26 +08:00
|
|
|
ld r9,area+EX_R9(r13); \
|
2011-06-29 08:20:58 +08:00
|
|
|
std r12,HSTATE_SCRATCH0(r13); \
|
2011-06-29 08:18:26 +08:00
|
|
|
li r12,n; \
|
|
|
|
b kvmppc_interrupt; \
|
|
|
|
89: mtocrf 0x80,r9; \
|
|
|
|
ld r9,area+EX_R9(r13); \
|
|
|
|
b kvmppc_skip_##h##interrupt
|
|
|
|
|
|
|
|
/*
 * KVM hooks: forward to the __KVM* implementations when
 * CONFIG_KVM_BOOK3S_64_HANDLER is defined, otherwise expand to nothing.
 */
#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
|
|
|
|
#define KVMTEST(n) __KVMTEST(n)
|
|
|
|
#define KVM_HANDLER(area, h, n) __KVM_HANDLER(area, h, n)
|
|
|
|
#define KVM_HANDLER_SKIP(area, h, n) __KVM_HANDLER_SKIP(area, h, n)
|
|
|
|
|
|
|
|
#else
|
|
|
|
#define KVMTEST(n)
|
|
|
|
#define KVM_HANDLER(area, h, n)
|
|
|
|
#define KVM_HANDLER_SKIP(area, h, n)
|
|
|
|
#endif
|
|
|
|
|
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 08:21:34 +08:00
|
|
|
/*
 * _PR flavour of the KVM hooks: forward to the same __KVM*
 * implementations, but only when CONFIG_KVM_BOOK3S_PR is defined;
 * otherwise they expand to nothing.
 */
#ifdef CONFIG_KVM_BOOK3S_PR
|
|
|
|
#define KVMTEST_PR(n) __KVMTEST(n)
|
|
|
|
#define KVM_HANDLER_PR(area, h, n) __KVM_HANDLER(area, h, n)
|
|
|
|
#define KVM_HANDLER_PR_SKIP(area, h, n) __KVM_HANDLER_SKIP(area, h, n)
|
|
|
|
|
|
|
|
#else
|
|
|
|
#define KVMTEST_PR(n)
|
|
|
|
#define KVM_HANDLER_PR(area, h, n)
|
|
|
|
#define KVM_HANDLER_PR_SKIP(area, h, n)
|
|
|
|
#endif
|
|
|
|
|
2011-06-29 08:18:26 +08:00
|
|
|
/*
 * Expands to nothing.  Presumably meant to be passed as the 'extra'
 * argument of the prolog macros when no test is wanted -- confirm
 * against the users of this header.
 */
#define NOTEST(n)
|
|
|
|
|
2007-08-22 11:46:44 +08:00
|
|
|
/*
|
|
|
|
* The common exception prolog is used for all except a few exceptions
|
|
|
|
* such as a segment miss on a kernel address. We have to be prepared
|
|
|
|
* to take another exception from the point where we first touch the
|
|
|
|
* kernel stack onwards.
|
|
|
|
*
|
|
|
|
* On entry r13 points to the paca, r9-r13 are saved in the paca,
|
|
|
|
* r9 contains the saved CR, r11 and r12 contain the saved SRR0 and
|
|
|
|
* SRR1, and relocation is on.
|
|
|
|
*/
|
|
|
|
/*
 * Build the exception stack frame.
 *   n    - trap/vector number; (n)+1 is the value stored in _TRAP
 *   area - save area (offset from r13) filled in by the first-stage
 *          prolog
 * Stack selection: if MSR_PR is set in r12 (the saved SRR1) r1 is
 * loaded from PACAKSAVE(r13); otherwise the frame is carved
 * INT_FRAME_SIZE below the current r1.  The resulting r1 is accepted
 * only if it is negative as a signed 64-bit value (blt+ cr1,3f).  If it
 * is not, n is stored in PACA_TRAP_SAVE, r3 is saved in the area and
 * then pointed at the area, and control goes to bad_stack.
 * From 3: on, r10 holds the interrupted r1 until
 * ACCOUNT_CPU_USER_ENTRY, which is handed r9 and r10 (presumably as
 * scratch registers -- confirm); both are reloaded from the area
 * afterwards.
 * When the CPU_FTR_CFAR section is active the saved CFAR is copied
 * into the ORIG_GPR3 slot of the frame.
 * NOTE(review): the reason for the "+1" on the trap number is not
 * visible here; check how the consumers of _TRAP use its low bit.
 * On exit r1 points to the new frame and r2 holds the value loaded
 * from PACATOC(r13); r9-r11 are clobbered.
 */
#define EXCEPTION_PROLOG_COMMON(n, area) \
|
|
|
|
andi. r10,r12,MSR_PR; /* See if coming from user */ \
|
|
|
|
mr r10,r1; /* Save r1 */ \
|
|
|
|
subi r1,r1,INT_FRAME_SIZE; /* alloc frame on kernel stack */ \
|
|
|
|
beq- 1f; \
|
|
|
|
ld r1,PACAKSAVE(r13); /* kernel stack to use */ \
|
|
|
|
1: cmpdi cr1,r1,0; /* check if r1 is in userspace */ \
|
2011-05-02 03:46:44 +08:00
|
|
|
blt+ cr1,3f; /* abort if it is */ \
|
|
|
|
li r1,(n); /* will be reloaded later */ \
|
2007-08-22 11:46:44 +08:00
|
|
|
sth r1,PACA_TRAP_SAVE(r13); \
|
2011-05-02 03:46:44 +08:00
|
|
|
std r3,area+EX_R3(r13); \
|
|
|
|
addi r3,r13,area; /* r3 -> where regs are saved*/ \
|
2007-08-22 11:46:44 +08:00
|
|
|
b bad_stack; \
|
|
|
|
3: std r9,_CCR(r1); /* save CR in stackframe */ \
|
|
|
|
std r11,_NIP(r1); /* save SRR0 in stackframe */ \
|
|
|
|
std r12,_MSR(r1); /* save SRR1 in stackframe */ \
|
|
|
|
std r10,0(r1); /* make stack chain pointer */ \
|
|
|
|
std r0,GPR0(r1); /* save r0 in stackframe */ \
|
|
|
|
std r10,GPR1(r1); /* save r1 in stackframe */ \
|
|
|
|
ACCOUNT_CPU_USER_ENTRY(r9, r10); \
|
|
|
|
std r2,GPR2(r1); /* save r2 in stackframe */ \
|
|
|
|
SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \
|
|
|
|
SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe */ \
|
|
|
|
ld r9,area+EX_R9(r13); /* move r9, r10 to stackframe */ \
|
|
|
|
ld r10,area+EX_R10(r13); \
|
|
|
|
std r9,GPR9(r1); \
|
|
|
|
std r10,GPR10(r1); \
|
|
|
|
ld r9,area+EX_R11(r13); /* move r11 - r13 to stackframe */ \
|
|
|
|
ld r10,area+EX_R12(r13); \
|
|
|
|
ld r11,area+EX_R13(r13); \
|
|
|
|
std r9,GPR11(r1); \
|
|
|
|
std r10,GPR12(r1); \
|
|
|
|
std r11,GPR13(r1); \
|
2011-05-02 03:48:20 +08:00
|
|
|
BEGIN_FTR_SECTION_NESTED(66); \
|
|
|
|
ld r10,area+EX_CFAR(r13); \
|
|
|
|
std r10,ORIG_GPR3(r1); \
|
|
|
|
END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66); \
|
2007-08-22 11:46:44 +08:00
|
|
|
ld r2,PACATOC(r13); /* get kernel TOC into r2 */ \
|
|
|
|
mflr r9; /* save LR in stackframe */ \
|
|
|
|
std r9,_LINK(r1); \
|
|
|
|
mfctr r10; /* save CTR in stackframe */ \
|
|
|
|
std r10,_CTR(r1); \
|
|
|
|
lbz r10,PACASOFTIRQEN(r13); \
|
|
|
|
mfspr r11,SPRN_XER; /* save XER in stackframe */ \
|
|
|
|
std r10,SOFTE(r1); \
|
|
|
|
std r11,_XER(r1); \
|
|
|
|
li r9,(n)+1; \
|
|
|
|
std r9,_TRAP(r1); /* set trap number */ \
|
|
|
|
li r10,0; \
|
|
|
|
ld r11,exception_marker@toc(r2); \
|
|
|
|
std r10,RESULT(r1); /* clear regs->result */ \
|
powerpc: Account time using timebase rather than PURR
Currently, when CONFIG_VIRT_CPU_ACCOUNTING is enabled, we use the
PURR register for measuring the user and system time used by
processes, as well as other related times such as hardirq and
softirq times. This turns out to be quite confusing for users
because it means that a program will often be measured as taking
less time when run on a multi-threaded processor (SMT2 or SMT4 mode)
than it does when run on a single-threaded processor (ST mode), even
though the program takes longer to finish. The discrepancy is
accounted for as stolen time, which is also confusing, particularly
when there are no other partitions running.
This changes the accounting to use the timebase instead, meaning that
the reported user and system times are the actual number of real-time
seconds that the program was executing on the processor thread,
regardless of which SMT mode the processor is in. Thus a program will
generally show greater user and system times when run on a
multi-threaded processor than on a single-threaded processor.
On pSeries systems on POWER5 or later processors, we measure the
stolen time (time when this partition wasn't running) using the
hypervisor dispatch trace log. We check for new entries in the
log on every entry from user mode and on every transition from
kernel process context to soft or hard IRQ context (i.e. when
account_system_vtime() gets called). So that we can correctly
distinguish time stolen from user time and time stolen from system
time, without having to check the log on every exit to user mode,
we store separate timestamps for exit to user mode and entry from
user mode.
On systems that have a SPURR (POWER6 and POWER7), we read the SPURR
in account_system_vtime() (as before), and then apportion the SPURR
ticks since the last time we read it between scaled user time and
scaled system time according to the relative proportions of user
time and system time over the same interval. This avoids having to
read the SPURR on every kernel entry and exit. On systems that have
PURR but not SPURR (i.e., POWER5), we do the same using the PURR
rather than the SPURR.
This disables the DTL user interface in /sys/debug/kernel/powerpc/dtl
for now since it conflicts with the use of the dispatch trace log
by the time accounting code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2010-08-27 03:56:43 +08:00
|
|
|
std r11,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */ \
|
|
|
|
ACCOUNT_STOLEN_TIME
|
2007-08-22 11:46:44 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Exception vectors.
|
|
|
|
*/
|
2011-04-05 12:27:11 +08:00
|
|
|
/*
 * Emit a standard exception vector.
 *   loc   - value assigned to the location counter ('.')
 *   vec   - vector number, handed to the KVMTEST_PR hook
 *   label - base name: the entry point is the global label##_pSeries
 *           and the prolog continues at label##_common
 * Stores r13 with SET_SCRATCH0, then runs EXCEPTION_PROLOG_PSERIES on
 * the PACA_EXGEN save area with the EXC_STD (SRR0/SRR1/rfid) flavour.
 */
#define STD_EXCEPTION_PSERIES(loc, vec, label) \
|
|
|
|
. = loc; \
|
2007-08-22 11:46:44 +08:00
|
|
|
.globl label##_pSeries; \
|
|
|
|
label##_pSeries: \
|
|
|
|
HMT_MEDIUM; \
|
2011-04-05 11:59:58 +08:00
|
|
|
SET_SCRATCH0(r13); /* save r13 */ \
|
2011-06-29 08:18:26 +08:00
|
|
|
EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common, \
|
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 08:21:34 +08:00
|
|
|
EXC_STD, KVMTEST_PR, vec)
|
2007-08-22 11:46:44 +08:00
|
|
|
|
2011-04-05 12:27:11 +08:00
|
|
|
/*
 * Hypervisor flavour of STD_EXCEPTION_PSERIES.
 *   loc   - value assigned to the location counter ('.')
 *   vec   - vector number, handed to the KVMTEST hook
 *   label - base name: the entry point is the global label##_hv and
 *           the prolog continues at label##_common
 * Uses the PACA_EXGEN save area with EXC_HV, i.e. the
 * HSRR0/HSRR1/hrfid register pair and return instruction.
 */
#define STD_EXCEPTION_HV(loc, vec, label) \
|
|
|
|
. = loc; \
|
|
|
|
.globl label##_hv; \
|
|
|
|
label##_hv: \
|
2007-08-22 11:46:44 +08:00
|
|
|
HMT_MEDIUM; \
|
2011-06-29 08:18:26 +08:00
|
|
|
SET_SCRATCH0(r13); /* save r13 */ \
|
|
|
|
EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common, \
|
|
|
|
EXC_HV, KVMTEST, vec)
|
2007-08-22 11:46:44 +08:00
|
|
|
|
powerpc: Rework lazy-interrupt handling
The current implementation of lazy interrupts handling has some
issues that this tries to address.
We don't do the various workarounds we need to do when re-enabling
interrupts in some cases such as when returning from an interrupt
and thus we may still lose or get delayed decrementer or doorbell
interrupts.
The current scheme also makes it much harder to handle the external
"edge" interrupts provided by some BookE processors when using the
EPR facility (External Proxy) and the Freescale Hypervisor.
Additionally, we tend to keep interrupts hard disabled in a number
of cases, such as decrementer interrupts, external interrupts, or
when a masked decrementer interrupt is pending. This is sub-optimal.
This is an attempt at fixing it all in one go by reworking the way
we do the lazy interrupt disabling from the ground up.
The base idea is to replace the "hard_enabled" field with a
"irq_happened" field in which we store a bit mask of what interrupt
occurred while soft-disabled.
When re-enabling, either via arch_local_irq_restore() or when returning
from an interrupt, we can now decide what to do by testing bits in that
field.
We then implement replaying of the missed interrupts either by
re-using the existing exception frame (in exception exit case) or via
the creation of a new one from an assembly trampoline (in the
arch_local_irq_enable case).
This removes the need to play with the decrementer to try to create
fake interrupts, among others.
In addition, this adds a few refinements:
- We no longer hard disable decrementer interrupts that occur
while soft-disabled. We now simply bump the decrementer back to max
(on BookS) or leave it stopped (on BookE) and continue with hard interrupts
enabled, which means that we'll potentially get better sample quality from
performance monitor interrupts.
- Timer, decrementer and doorbell interrupts now hard-enable
shortly after removing the source of the interrupt, which means
they no longer run entirely hard disabled. Again, this will improve
perf sample quality.
- On Book3E 64-bit, we now make the performance monitor interrupt
act as an NMI like Book3S (the necessary C code for that to work
appear to already be present in the FSL perf code, notably calling
nmi_enter instead of irq_enter). (This also fixes a bug where BookE
perfmon interrupts could clobber r14 ... oops)
- We could make "masked" decrementer interrupts act as NMIs when doing
timer-based perf sampling to improve the sample quality.
Signed-off-by-yet: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
v2:
- Add hard-enable to decrementer, timer and doorbells
- Fix CR clobber in masked irq handling on BookE
- Make embedded perf interrupt act as an NMI
- Add a PACA_HAPPENED_EE_EDGE for use by FSL if they want
to retrigger an interrupt without preventing hard-enable
v3:
- Fix or vs. ori bug on Book3E
- Fix enabling of interrupts for some exceptions on Book3E
v4:
- Fix resend of doorbells on return from interrupt on Book3E
v5:
- Rebased on top of my latest series, which involves some significant
rework of some aspects of the patch.
v6:
- 32-bit compile fix
- more compile fixes with various .config combos
- factor out the asm code to soft-disable interrupts
- remove the C wrapper around preempt_schedule_irq
v7:
- Fix a bug with hard irq state tracking on native power7
2012-03-06 15:27:59 +08:00
|
|
|
/*
 * These associate vector numbers with bits in paca->irq_happened.
 * __SOFTEN_TEST looks them up by pasting SOFTEN_VALUE_ with its 'vec'
 * argument, so 'vec' must be spelled exactly as in one of these names.
 */
|
|
|
|
#define SOFTEN_VALUE_0x500 PACA_IRQ_EE
|
|
|
|
#define SOFTEN_VALUE_0x502 PACA_IRQ_EE
|
|
|
|
#define SOFTEN_VALUE_0x900 PACA_IRQ_DEC
|
|
|
|
#define SOFTEN_VALUE_0x982 PACA_IRQ_DEC
|
|
|
|
|
|
|
|
/*
 * Divert to masked_<h>interrupt when the byte at PACASOFTIRQEN(r13) is
 * zero.
 *   h   - empty or H; pasted into the branch target name
 *   vec - vector number; selects SOFTEN_VALUE_<vec>
 * r10 is loaded with SOFTEN_VALUE_<vec> between the compare and the
 * branch (li does not alter CR), so r10 holds that value on both the
 * taken and the fall-through path.  Clobbers r10 and cr0.
 */
#define __SOFTEN_TEST(h, vec) \
|
2007-08-22 11:46:44 +08:00
|
|
|
lbz r10,PACASOFTIRQEN(r13); \
|
|
|
|
cmpwi r10,0; \
|
powerpc: Rework lazy-interrupt handling
The current implementation of lazy interrupts handling has some
issues that this tries to address.
We don't do the various workarounds we need to do when re-enabling
interrupts in some cases such as when returning from an interrupt
and thus we may still lose or get delayed decrementer or doorbell
interrupts.
The current scheme also makes it much harder to handle the external
"edge" interrupts provided by some BookE processors when using the
EPR facility (External Proxy) and the Freescale Hypervisor.
Additionally, we tend to keep interrupts hard disabled in a number
of cases, such as decrementer interrupts, external interrupts, or
when a masked decrementer interrupt is pending. This is sub-optimal.
This is an attempt at fixing it all in one go by reworking the way
we do the lazy interrupt disabling from the ground up.
The base idea is to replace the "hard_enabled" field with a
"irq_happened" field in which we store a bit mask of what interrupt
occurred while soft-disabled.
When re-enabling, either via arch_local_irq_restore() or when returning
from an interrupt, we can now decide what to do by testing bits in that
field.
We then implement replaying of the missed interrupts either by
re-using the existing exception frame (in exception exit case) or via
the creation of a new one from an assembly trampoline (in the
arch_local_irq_enable case).
This removes the need to play with the decrementer to try to create
fake interrupts, among others.
In addition, this adds a few refinements:
- We no longer hard disable decrementer interrupts that occur
while soft-disabled. We now simply bump the decrementer back to max
(on BookS) or leave it stopped (on BookE) and continue with hard interrupts
enabled, which means that we'll potentially get better sample quality from
performance monitor interrupts.
- Timer, decrementer and doorbell interrupts now hard-enable
shortly after removing the source of the interrupt, which means
they no longer run entirely hard disabled. Again, this will improve
perf sample quality.
- On Book3E 64-bit, we now make the performance monitor interrupt
act as an NMI like Book3S (the necessary C code for that to work
appear to already be present in the FSL perf code, notably calling
nmi_enter instead of irq_enter). (This also fixes a bug where BookE
perfmon interrupts could clobber r14 ... oops)
- We could make "masked" decrementer interrupts act as NMIs when doing
timer-based perf sampling to improve the sample quality.
Signed-off-by-yet: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
v2:
- Add hard-enable to decrementer, timer and doorbells
- Fix CR clobber in masked irq handling on BookE
- Make embedded perf interrupt act as an NMI
- Add a PACA_HAPPENED_EE_EDGE for use by FSL if they want
to retrigger an interrupt without preventing hard-enable
v3:
- Fix or vs. ori bug on Book3E
- Fix enabling of interrupts for some exceptions on Book3E
v4:
- Fix resend of doorbells on return from interrupt on Book3E
v5:
- Rebased on top of my latest series, which involves some significant
rework of some aspects of the patch.
v6:
- 32-bit compile fix
- more compile fixes with various .config combos
- factor out the asm code to soft-disable interrupts
- remove the C wrapper around preempt_schedule_irq
v7:
- Fix a bug with hard irq state tracking on native power7
2012-03-06 15:27:59 +08:00
|
|
|
li r10,SOFTEN_VALUE_##vec; \
|
2011-06-29 08:18:26 +08:00
|
|
|
beq masked_##h##interrupt
|
powerpc: Rework lazy-interrupt handling
The current implementation of lazy interrupts handling has some
issues that this tries to address.
We don't do the various workarounds we need to do when re-enabling
interrupts in some cases such as when returning from an interrupt
and thus we may still lose or get delayed decrementer or doorbell
interrupts.
The current scheme also makes it much harder to handle the external
"edge" interrupts provided by some BookE processors when using the
EPR facility (External Proxy) and the Freescale Hypervisor.
Additionally, we tend to keep interrupts hard disabled in a number
of cases, such as decrementer interrupts, external interrupts, or
when a masked decrementer interrupt is pending. This is sub-optimal.
This is an attempt at fixing it all in one go by reworking the way
we do the lazy interrupt disabling from the ground up.
The base idea is to replace the "hard_enabled" field with a
"irq_happened" field in which we store a bit mask of what interrupt
occurred while soft-disabled.
When re-enabling, either via arch_local_irq_restore() or when returning
from an interrupt, we can now decide what to do by testing bits in that
field.
We then implement replaying of the missed interrupts either by
re-using the existing exception frame (in exception exit case) or via
the creation of a new one from an assembly trampoline (in the
arch_local_irq_enable case).
This removes the need to play with the decrementer to try to create
fake interrupts, among others.
In addition, this adds a few refinements:
- We no longer hard disable decrementer interrupts that occur
while soft-disabled. We now simply bump the decrementer back to max
(on BookS) or leave it stopped (on BookE) and continue with hard interrupts
enabled, which means that we'll potentially get better sample quality from
performance monitor interrupts.
- Timer, decrementer and doorbell interrupts now hard-enable
shortly after removing the source of the interrupt, which means
they no longer run entirely hard disabled. Again, this will improve
perf sample quality.
- On Book3E 64-bit, we now make the performance monitor interrupt
act as an NMI like Book3S (the necessary C code for that to work
appear to already be present in the FSL perf code, notably calling
nmi_enter instead of irq_enter). (This also fixes a bug where BookE
perfmon interrupts could clobber r14 ... oops)
- We could make "masked" decrementer interrupts act as NMIs when doing
timer-based perf sampling to improve the sample quality.
Signed-off-by-yet: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
v2:
- Add hard-enable to decrementer, timer and doorbells
- Fix CR clobber in masked irq handling on BookE
- Make embedded perf interrupt act as an NMI
- Add a PACA_HAPPENED_EE_EDGE for use by FSL if they want
to retrigger an interrupt without preventing hard-enable
v3:
- Fix or vs. ori bug on Book3E
- Fix enabling of interrupts for some exceptions on Book3E
v4:
- Fix resend of doorbells on return from interrupt on Book3E
v5:
- Rebased on top of my latest series, which involves some significant
rework of some aspects of the patch.
v6:
- 32-bit compile fix
- more compile fixes with various .config combos
- factor out the asm code to soft-disable interrupts
- remove the C wrapper around preempt_schedule_irq
v7:
- Fix a bug with hard irq state tracking on native power7
2012-03-06 15:27:59 +08:00
|
|
|
/*
 * Expansion wrapper: 'h' and 'vec' are macro-expanded here before
 * __SOFTEN_TEST pastes them with ##.
 */
#define _SOFTEN_TEST(h, vec) __SOFTEN_TEST(h, vec)
|
2011-06-29 08:18:26 +08:00
|
|
|
|
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 08:21:34 +08:00
|
|
|
/*
 * SOFTEN_TEST_PR(vec): expands to KVMTEST_PR(vec) followed by
 * _SOFTEN_TEST(EXC_STD, vec).
 * NOTE(review): KVMTEST_PR and __SOFTEN_TEST are defined outside this
 * view; presumably the first checks for a KVM PR guest and the second
 * checks the soft-enable state for vector `vec` - confirm.
 */
#define SOFTEN_TEST_PR(vec) \
|
|
|
|
KVMTEST_PR(vec); \
|
powerpc: Rework lazy-interrupt handling
The current implementation of lazy interrupts handling has some
issues that this tries to address.
We don't do the various workarounds we need to do when re-enabling
interrupts in some cases such as when returning from an interrupt
and thus we may still lose or get delayed decrementer or doorbell
interrupts.
The current scheme also makes it much harder to handle the external
"edge" interrupts provided by some BookE processors when using the
EPR facility (External Proxy) and the Freescale Hypervisor.
Additionally, we tend to keep interrupts hard disabled in a number
of cases, such as decrementer interrupts, external interrupts, or
when a masked decrementer interrupt is pending. This is sub-optimal.
This is an attempt at fixing it all in one go by reworking the way
we do the lazy interrupt disabling from the ground up.
The base idea is to replace the "hard_enabled" field with a
"irq_happened" field in which we store a bit mask of what interrupt
occurred while soft-disabled.
When re-enabling, either via arch_local_irq_restore() or when returning
from an interrupt, we can now decide what to do by testing bits in that
field.
We then implement replaying of the missed interrupts either by
re-using the existing exception frame (in exception exit case) or via
the creation of a new one from an assembly trampoline (in the
arch_local_irq_enable case).
This removes the need to play with the decrementer to try to create
fake interrupts, among others.
In addition, this adds a few refinements:
- We no longer hard disable decrementer interrupts that occur
while soft-disabled. We now simply bump the decrementer back to max
(on BookS) or leave it stopped (on BookE) and continue with hard interrupts
enabled, which means that we'll potentially get better sample quality from
performance monitor interrupts.
- Timer, decrementer and doorbell interrupts now hard-enable
shortly after removing the source of the interrupt, which means
they no longer run entirely hard disabled. Again, this will improve
perf sample quality.
- On Book3E 64-bit, we now make the performance monitor interrupt
act as an NMI like Book3S (the necessary C code for that to work
appear to already be present in the FSL perf code, notably calling
nmi_enter instead of irq_enter). (This also fixes a bug where BookE
perfmon interrupts could clobber r14 ... oops)
- We could make "masked" decrementer interrupts act as NMIs when doing
timer-based perf sampling to improve the sample quality.
Signed-off-by-yet: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
v2:
- Add hard-enable to decrementer, timer and doorbells
- Fix CR clobber in masked irq handling on BookE
- Make embedded perf interrupt act as an NMI
- Add a PACA_HAPPENED_EE_EDGE for use by FSL if they want
to retrigger an interrupt without preventing hard-enable
v3:
- Fix or vs. ori bug on Book3E
- Fix enabling of interrupts for some exceptions on Book3E
v4:
- Fix resend of doorbells on return from interrupt on Book3E
v5:
- Rebased on top of my latest series, which involves some significant
rework of some aspects of the patch.
v6:
- 32-bit compile fix
- more compile fixes with various .config combos
- factor out the asm code to soft-disable interrupts
- remove the C wrapper around preempt_schedule_irq
v7:
- Fix a bug with hard irq state tracking on native power7
2012-03-06 15:27:59 +08:00
|
|
|
_SOFTEN_TEST(EXC_STD, vec)
|
2011-06-29 08:18:26 +08:00
|
|
|
|
|
|
|
/*
 * SOFTEN_TEST_HV(vec): expands to KVMTEST(vec) followed by
 * _SOFTEN_TEST(EXC_HV, vec).
 * NOTE(review): KVMTEST and __SOFTEN_TEST are defined outside this
 * view; EXC_HV presumably selects the hypervisor (HSRR) register
 * flavour - confirm.
 */
#define SOFTEN_TEST_HV(vec) \
|
|
|
|
KVMTEST(vec); \
|
powerpc: Rework lazy-interrupt handling
The current implementation of lazy interrupts handling has some
issues that this tries to address.
We don't do the various workarounds we need to do when re-enabling
interrupts in some cases such as when returning from an interrupt
and thus we may still lose or get delayed decrementer or doorbell
interrupts.
The current scheme also makes it much harder to handle the external
"edge" interrupts provided by some BookE processors when using the
EPR facility (External Proxy) and the Freescale Hypervisor.
Additionally, we tend to keep interrupts hard disabled in a number
of cases, such as decrementer interrupts, external interrupts, or
when a masked decrementer interrupt is pending. This is sub-optimal.
This is an attempt at fixing it all in one go by reworking the way
we do the lazy interrupt disabling from the ground up.
The base idea is to replace the "hard_enabled" field with a
"irq_happened" field in which we store a bit mask of what interrupt
occurred while soft-disabled.
When re-enabling, either via arch_local_irq_restore() or when returning
from an interrupt, we can now decide what to do by testing bits in that
field.
We then implement replaying of the missed interrupts either by
re-using the existing exception frame (in exception exit case) or via
the creation of a new one from an assembly trampoline (in the
arch_local_irq_enable case).
This removes the need to play with the decrementer to try to create
fake interrupts, among others.
In addition, this adds a few refinements:
- We no longer hard disable decrementer interrupts that occur
while soft-disabled. We now simply bump the decrementer back to max
(on BookS) or leave it stopped (on BookE) and continue with hard interrupts
enabled, which means that we'll potentially get better sample quality from
performance monitor interrupts.
- Timer, decrementer and doorbell interrupts now hard-enable
shortly after removing the source of the interrupt, which means
they no longer run entirely hard disabled. Again, this will improve
perf sample quality.
- On Book3E 64-bit, we now make the performance monitor interrupt
act as an NMI like Book3S (the necessary C code for that to work
appear to already be present in the FSL perf code, notably calling
nmi_enter instead of irq_enter). (This also fixes a bug where BookE
perfmon interrupts could clobber r14 ... oops)
- We could make "masked" decrementer interrupts act as NMIs when doing
timer-based perf sampling to improve the sample quality.
Signed-off-by-yet: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
v2:
- Add hard-enable to decrementer, timer and doorbells
- Fix CR clobber in masked irq handling on BookE
- Make embedded perf interrupt act as an NMI
- Add a PACA_HAPPENED_EE_EDGE for use by FSL if they want
to retrigger an interrupt without preventing hard-enable
v3:
- Fix or vs. ori bug on Book3E
- Fix enabling of interrupts for some exceptions on Book3E
v4:
- Fix resend of doorbells on return from interrupt on Book3E
v5:
- Rebased on top of my latest series, which involves some significant
rework of some aspects of the patch.
v6:
- 32-bit compile fix
- more compile fixes with various .config combos
- factor out the asm code to soft-disable interrupts
- remove the C wrapper around preempt_schedule_irq
v7:
- Fix a bug with hard irq state tracking on native power7
2012-03-06 15:27:59 +08:00
|
|
|
_SOFTEN_TEST(EXC_HV, vec)
|
2011-06-29 08:18:26 +08:00
|
|
|
|
KVM: PPC: book3s_hv: Add support for PPC970-family processors
This adds support for running KVM guests in supervisor mode on those
PPC970 processors that have a usable hypervisor mode. Unfortunately,
Apple G5 machines have supervisor mode disabled (MSR[HV] is forced to
1), but the YDL PowerStation does have a usable hypervisor mode.
There are several differences between the PPC970 and POWER7 in how
guests are managed. These differences are accommodated using the
CPU_FTR_ARCH_201 (PPC970) and CPU_FTR_ARCH_206 (POWER7) CPU feature
bits. Notably, on PPC970:
* The LPCR, LPID or RMOR registers don't exist, and the functions of
those registers are provided by bits in HID4 and one bit in HID0.
* External interrupts can be directed to the hypervisor, but unlike
POWER7 they are masked by MSR[EE] in non-hypervisor modes and use
SRR0/1 not HSRR0/1.
* There is no virtual RMA (VRMA) mode; the guest must use an RMO
(real mode offset) area.
* The TLB entries are not tagged with the LPID, so it is necessary to
flush the whole TLB on partition switch. Furthermore, when switching
partitions we have to ensure that no other CPU is executing the tlbie
or tlbsync instructions in either the old or the new partition,
otherwise undefined behaviour can occur.
* The PMU has 8 counters (PMC registers) rather than 6.
* The DSCR, PURR, SPURR, AMR, AMOR, UAMOR registers don't exist.
* The SLB has 64 entries rather than 32.
* There is no mediated external interrupt facility, so if we switch to
a guest that has a virtual external interrupt pending but the guest
has MSR[EE] = 0, we have to arrange to have an interrupt pending for
it so that we can get control back once it re-enables interrupts. We
do that by sending ourselves an IPI with smp_send_reschedule after
hard-disabling interrupts.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 08:40:08 +08:00
|
|
|
/*
 * SOFTEN_TEST_HV_201(vec): expands to KVMTEST(vec) followed by
 * _SOFTEN_TEST(EXC_STD, vec), i.e. the same KVM test as
 * SOFTEN_TEST_HV but with EXC_STD instead of EXC_HV.
 * NOTE(review): presumably intended for PPC970 (CPU_FTR_ARCH_201),
 * where the change log in this file says external interrupts use
 * SRR0/1 rather than HSRR0/1 - confirm against the callers.
 */
#define SOFTEN_TEST_HV_201(vec) \
|
|
|
|
KVMTEST(vec); \
|
powerpc: Rework lazy-interrupt handling
The current implementation of lazy interrupts handling has some
issues that this tries to address.
We don't do the various workarounds we need to do when re-enabling
interrupts in some cases such as when returning from an interrupt
and thus we may still lose or get delayed decrementer or doorbell
interrupts.
The current scheme also makes it much harder to handle the external
"edge" interrupts provided by some BookE processors when using the
EPR facility (External Proxy) and the Freescale Hypervisor.
Additionally, we tend to keep interrupts hard disabled in a number
of cases, such as decrementer interrupts, external interrupts, or
when a masked decrementer interrupt is pending. This is sub-optimal.
This is an attempt at fixing it all in one go by reworking the way
we do the lazy interrupt disabling from the ground up.
The base idea is to replace the "hard_enabled" field with a
"irq_happened" field in which we store a bit mask of what interrupt
occurred while soft-disabled.
When re-enabling, either via arch_local_irq_restore() or when returning
from an interrupt, we can now decide what to do by testing bits in that
field.
We then implement replaying of the missed interrupts either by
re-using the existing exception frame (in exception exit case) or via
the creation of a new one from an assembly trampoline (in the
arch_local_irq_enable case).
This removes the need to play with the decrementer to try to create
fake interrupts, among others.
In addition, this adds a few refinements:
- We no longer hard disable decrementer interrupts that occur
while soft-disabled. We now simply bump the decrementer back to max
(on BookS) or leave it stopped (on BookE) and continue with hard interrupts
enabled, which means that we'll potentially get better sample quality from
performance monitor interrupts.
- Timer, decrementer and doorbell interrupts now hard-enable
shortly after removing the source of the interrupt, which means
they no longer run entirely hard disabled. Again, this will improve
perf sample quality.
- On Book3E 64-bit, we now make the performance monitor interrupt
act as an NMI like Book3S (the necessary C code for that to work
appear to already be present in the FSL perf code, notably calling
nmi_enter instead of irq_enter). (This also fixes a bug where BookE
perfmon interrupts could clobber r14 ... oops)
- We could make "masked" decrementer interrupts act as NMIs when doing
timer-based perf sampling to improve the sample quality.
Signed-off-by-yet: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
v2:
- Add hard-enable to decrementer, timer and doorbells
- Fix CR clobber in masked irq handling on BookE
- Make embedded perf interrupt act as an NMI
- Add a PACA_HAPPENED_EE_EDGE for use by FSL if they want
to retrigger an interrupt without preventing hard-enable
v3:
- Fix or vs. ori bug on Book3E
- Fix enabling of interrupts for some exceptions on Book3E
v4:
- Fix resend of doorbells on return from interrupt on Book3E
v5:
- Rebased on top of my latest series, which involves some significant
rework of some aspects of the patch.
v6:
- 32-bit compile fix
- more compile fixes with various .config combos
- factor out the asm code to soft-disable interrupts
- remove the C wrapper around preempt_schedule_irq
v7:
- Fix a bug with hard irq state tracking on native power7
2012-03-06 15:27:59 +08:00
|
|
|
_SOFTEN_TEST(EXC_STD, vec)
|
KVM: PPC: book3s_hv: Add support for PPC970-family processors
This adds support for running KVM guests in supervisor mode on those
PPC970 processors that have a usable hypervisor mode. Unfortunately,
Apple G5 machines have supervisor mode disabled (MSR[HV] is forced to
1), but the YDL PowerStation does have a usable hypervisor mode.
There are several differences between the PPC970 and POWER7 in how
guests are managed. These differences are accommodated using the
CPU_FTR_ARCH_201 (PPC970) and CPU_FTR_ARCH_206 (POWER7) CPU feature
bits. Notably, on PPC970:
* The LPCR, LPID or RMOR registers don't exist, and the functions of
those registers are provided by bits in HID4 and one bit in HID0.
* External interrupts can be directed to the hypervisor, but unlike
POWER7 they are masked by MSR[EE] in non-hypervisor modes and use
SRR0/1 not HSRR0/1.
* There is no virtual RMA (VRMA) mode; the guest must use an RMO
(real mode offset) area.
* The TLB entries are not tagged with the LPID, so it is necessary to
flush the whole TLB on partition switch. Furthermore, when switching
partitions we have to ensure that no other CPU is executing the tlbie
or tlbsync instructions in either the old or the new partition,
otherwise undefined behaviour can occur.
* The PMU has 8 counters (PMC registers) rather than 6.
* The DSCR, PURR, SPURR, AMR, AMOR, UAMOR registers don't exist.
* The SLB has 64 entries rather than 32.
* There is no mediated external interrupt facility, so if we switch to
a guest that has a virtual external interrupt pending but the guest
has MSR[EE] = 0, we have to arrange to have an interrupt pending for
it so that we can get control back once it re-enables interrupts. We
do that by sending ourselves an IPI with smp_send_reschedule after
hard-disabling interrupts.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 08:40:08 +08:00
|
|
|
|
2011-06-29 08:18:26 +08:00
|
|
|
/*
 * __MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra): entry sequence
 * for a maskable exception.  In order it emits:
 *   HMT_MEDIUM
 *   SET_SCRATCH0(r13)                      - saves r13
 *   __EXCEPTION_PROLOG_1(PACA_EXGEN, extra, vec)
 *   EXCEPTION_PROLOG_PSERIES_1(label##_common, h)
 * `extra` is handed to __EXCEPTION_PROLOG_1 (the callers visible in
 * this file pass a SOFTEN_TEST_* macro name); `h` is handed to
 * EXCEPTION_PROLOG_PSERIES_1 together with the label##_common symbol.
 * NOTE(review): the helper macros are defined outside this view;
 * presumably the last one transfers control to label##_common - confirm.
 */
#define __MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra) \
|
|
|
|
HMT_MEDIUM; \
|
|
|
|
SET_SCRATCH0(r13); /* save r13 */ \
|
|
|
|
__EXCEPTION_PROLOG_1(PACA_EXGEN, extra, vec); \
|
|
|
|
EXCEPTION_PROLOG_PSERIES_1(label##_common, h);
|
|
|
|
/*
 * _MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra): forwards all four
 * arguments unchanged to __MASKABLE_EXCEPTION_PSERIES.
 * NOTE(review): presumably exists so that macro arguments are expanded
 * before the inner macro pastes tokens with ## - confirm.
 */
#define _MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra) \
|
|
|
|
__MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra)
|
2011-04-05 12:27:11 +08:00
|
|
|
|
|
|
|
/*
 * MASKABLE_EXCEPTION_PSERIES(loc, vec, label): sets the location
 * counter to `loc`, declares and defines the global symbol
 * label##_pSeries there, then emits
 * _MASKABLE_EXCEPTION_PSERIES(vec, label, EXC_STD, SOFTEN_TEST_PR).
 */
#define MASKABLE_EXCEPTION_PSERIES(loc, vec, label) \
|
|
|
|
. = loc; \
|
|
|
|
.globl label##_pSeries; \
|
|
|
|
label##_pSeries: \
|
2011-06-29 08:18:26 +08:00
|
|
|
_MASKABLE_EXCEPTION_PSERIES(vec, label, \
|
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 08:21:34 +08:00
|
|
|
EXC_STD, SOFTEN_TEST_PR)
|
2011-04-05 12:27:11 +08:00
|
|
|
|
|
|
|
/*
 * MASKABLE_EXCEPTION_HV(loc, vec, label): sets the location counter to
 * `loc`, declares and defines the global symbol label##_hv there, then
 * emits _MASKABLE_EXCEPTION_PSERIES(vec, label, EXC_HV, SOFTEN_TEST_HV).
 * Differs from MASKABLE_EXCEPTION_PSERIES in the symbol suffix (_hv)
 * and in passing EXC_HV / SOFTEN_TEST_HV.
 */
#define MASKABLE_EXCEPTION_HV(loc, vec, label) \
|
|
|
|
. = loc; \
|
|
|
|
.globl label##_hv; \
|
|
|
|
label##_hv: \
|
2011-06-29 08:18:26 +08:00
|
|
|
_MASKABLE_EXCEPTION_PSERIES(vec, label, \
|
|
|
|
EXC_HV, SOFTEN_TEST_HV)
|
2007-08-22 11:46:44 +08:00
|
|
|
|
2012-03-01 12:42:56 +08:00
|
|
|
/*
|
|
|
|
* Our exception common code can be passed various "additions"
|
|
|
|
* to specify the behaviour of interrupts, whether to kick the
|
|
|
|
* runlatch, etc...
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Exception addition: Hard disable interrupts */
|
powerpc: Rework lazy-interrupt handling
The current implementation of lazy interrupts handling has some
issues that this tries to address.
We don't do the various workarounds we need to do when re-enabling
interrupts in some cases such as when returning from an interrupt
and thus we may still lose or get delayed decrementer or doorbell
interrupts.
The current scheme also makes it much harder to handle the external
"edge" interrupts provided by some BookE processors when using the
EPR facility (External Proxy) and the Freescale Hypervisor.
Additionally, we tend to keep interrupts hard disabled in a number
of cases, such as decrementer interrupts, external interrupts, or
when a masked decrementer interrupt is pending. This is sub-optimal.
This is an attempt at fixing it all in one go by reworking the way
we do the lazy interrupt disabling from the ground up.
The base idea is to replace the "hard_enabled" field with a
"irq_happened" field in which we store a bit mask of what interrupt
occurred while soft-disabled.
When re-enabling, either via arch_local_irq_restore() or when returning
from an interrupt, we can now decide what to do by testing bits in that
field.
We then implement replaying of the missed interrupts either by
re-using the existing exception frame (in exception exit case) or via
the creation of a new one from an assembly trampoline (in the
arch_local_irq_enable case).
This removes the need to play with the decrementer to try to create
fake interrupts, among others.
In addition, this adds a few refinements:
- We no longer hard disable decrementer interrupts that occur
while soft-disabled. We now simply bump the decrementer back to max
(on BookS) or leave it stopped (on BookE) and continue with hard interrupts
enabled, which means that we'll potentially get better sample quality from
performance monitor interrupts.
- Timer, decrementer and doorbell interrupts now hard-enable
shortly after removing the source of the interrupt, which means
they no longer run entirely hard disabled. Again, this will improve
perf sample quality.
- On Book3E 64-bit, we now make the performance monitor interrupt
act as an NMI like Book3S (the necessary C code for that to work
appears to already be present in the FSL perf code, notably calling
nmi_enter instead of irq_enter). (This also fixes a bug where BookE
perfmon interrupts could clobber r14 ... oops)
- We could make "masked" decrementer interrupts act as NMIs when doing
timer-based perf sampling to improve the sample quality.
Signed-off-by-yet: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
v2:
- Add hard-enable to decrementer, timer and doorbells
- Fix CR clobber in masked irq handling on BookE
- Make embedded perf interrupt act as an NMI
- Add a PACA_HAPPENED_EE_EDGE for use by FSL if they want
to retrigger an interrupt without preventing hard-enable
v3:
- Fix or vs. ori bug on Book3E
- Fix enabling of interrupts for some exceptions on Book3E
v4:
- Fix resend of doorbells on return from interrupt on Book3E
v5:
- Rebased on top of my latest series, which involves some significant
rework of some aspects of the patch.
v6:
- 32-bit compile fix
- more compile fixes with various .config combos
- factor out the asm code to soft-disable interrupts
- remove the C wrapper around preempt_schedule_irq
v7:
- Fix a bug with hard irq state tracking on native power7
2012-03-06 15:27:59 +08:00
|
|
|
/*
 * DISABLE_INTS: expands to SOFT_DISABLE_INTS(r10,r11).
 * NOTE(review): r10 and r11 are presumably scratch registers for the
 * helper; the "Hard disable interrupts" wording in the comment that
 * precedes this define does not match the SOFT_DISABLE_INTS name -
 * confirm which behaviour is intended.
 */
#define DISABLE_INTS SOFT_DISABLE_INTS(r10,r11)
|
2007-08-22 11:46:44 +08:00
|
|
|
|
2012-03-01 09:45:27 +08:00
|
|
|
/*
 * ADD_NVGPRS: exception "addition" that calls .save_nvgprs with a
 * branch-and-link (bl), so LR is overwritten.
 * NOTE(review): .save_nvgprs is defined elsewhere; presumably it stores
 * the non-volatile GPRs into the exception frame - confirm.
 */
#define ADD_NVGPRS \
|
|
|
|
bl .save_nvgprs
|
|
|
|
|
|
|
|
/*
 * RUNLATCH_ON: exception "addition", emitted inside a feature section
 * that is kept when CPU_FTR_CTRL is set.
 * Sequence: CURRENT_THREAD_INFO(r3, r1); load TI_LOCAL_FLAGS(r3) into
 * r4; test the _TLF_RUNLATCH bit with andi. (result in r0, sets cr0);
 * beql calls ppc64_runlatch_on_trampoline when the bit is clear.
 * Registers written here: r0, r3, r4, cr0 and LR.
 */
#define RUNLATCH_ON \
|
|
|
|
BEGIN_FTR_SECTION \
|
2012-07-05 12:41:35 +08:00
|
|
|
CURRENT_THREAD_INFO(r3, r1); \
|
2012-03-01 09:45:27 +08:00
|
|
|
ld r4,TI_LOCAL_FLAGS(r3); \
|
|
|
|
andi. r0,r4,_TLF_RUNLATCH; \
|
|
|
|
beql ppc64_runlatch_on_trampoline; \
|
|
|
|
END_FTR_SECTION_IFSET(CPU_FTR_CTRL)
|
|
|
|
|
|
|
|
/*
 * EXCEPTION_COMMON(trap, label, hdlr, ret, additions): emits the common
 * (second-stage) handler for an exception.
 *   trap      - passed to EXCEPTION_PROLOG_COMMON with PACA_EXGEN
 *   label     - the global symbol label##_common is defined here
 *   hdlr      - routine called with bl
 *   ret       - branch target taken after hdlr returns
 *   additions - extra statements inserted after the prolog and before
 *               the call (e.g. ADD_NVGPRS, RUNLATCH_ON, DISABLE_INTS)
 * Before the call r3 is set to r1 + STACK_FRAME_OVERHEAD.
 * NOTE(review): r3 presumably ends up pointing at the saved register
 * frame passed as the handler's first argument, and ".align 7"
 * presumably means a 128-byte boundary with this assembler - confirm.
 */
#define EXCEPTION_COMMON(trap, label, hdlr, ret, additions) \
|
|
|
|
.align 7; \
|
|
|
|
.globl label##_common; \
|
|
|
|
label##_common: \
|
|
|
|
EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN); \
|
|
|
|
additions; \
|
|
|
|
addi r3,r1,STACK_FRAME_OVERHEAD; \
|
|
|
|
bl hdlr; \
|
|
|
|
b ret
|
|
|
|
|
|
|
|
/*
 * STD_EXCEPTION_COMMON(trap, label, hdlr): EXCEPTION_COMMON with
 * ret_from_except as the return path and ADD_NVGPRS;DISABLE_INTS as
 * the additions.
 */
#define STD_EXCEPTION_COMMON(trap, label, hdlr) \
|
|
|
|
EXCEPTION_COMMON(trap, label, hdlr, ret_from_except, \
|
|
|
|
ADD_NVGPRS;DISABLE_INTS)
|
2007-08-22 11:46:44 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Like STD_EXCEPTION_COMMON, but for exceptions that can occur
|
2012-03-01 07:52:01 +08:00
|
|
|
* in the idle task and therefore need the special idle handling
|
|
|
|
* (finish nap and runlatch)
|
2007-08-22 11:46:44 +08:00
|
|
|
*/
|
2012-03-01 09:45:27 +08:00
|
|
|
/*
 * STD_EXCEPTION_COMMON_ASYNC(trap, label, hdlr): EXCEPTION_COMMON with
 * ret_from_except_lite as the return path and
 * FINISH_NAP;RUNLATCH_ON;DISABLE_INTS as the additions.  Unlike
 * STD_EXCEPTION_COMMON it does not include ADD_NVGPRS.
 */
#define STD_EXCEPTION_COMMON_ASYNC(trap, label, hdlr) \
|
|
|
|
EXCEPTION_COMMON(trap, label, hdlr, ret_from_except_lite, \
|
|
|
|
FINISH_NAP;RUNLATCH_ON;DISABLE_INTS)
|
2007-08-22 11:46:44 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* When the idle code in power4_idle puts the CPU into NAP mode,
|
|
|
|
* it has to do so in a loop, and relies on the external interrupt
|
|
|
|
* and decrementer interrupt entry code to get it out of the loop.
|
|
|
|
* It sets the _TLF_NAPPING bit in current_thread_info()->local_flags
|
|
|
|
* to signal that it is in the loop and needs help to get out.
|
|
|
|
*/
|
|
|
|
/*
 * FINISH_NAP: exception "addition".
 * With CONFIG_PPC_970_NAP, inside a feature section kept when
 * CPU_FTR_CAN_NAP is set: CURRENT_THREAD_INFO(r11, r1); load
 * TI_LOCAL_FLAGS(r11) into r9; test _TLF_NAPPING with andi. (result in
 * r10, sets cr0); bnel calls power4_fixup_nap when the bit is set.
 * Registers written here: r9, r10, r11, cr0 and LR.
 * Without CONFIG_PPC_970_NAP the macro expands to nothing.
 */
#ifdef CONFIG_PPC_970_NAP
|
|
|
|
#define FINISH_NAP \
|
|
|
|
BEGIN_FTR_SECTION \
|
2012-07-05 12:41:35 +08:00
|
|
|
CURRENT_THREAD_INFO(r11, r1); \
|
2007-08-22 11:46:44 +08:00
|
|
|
ld r9,TI_LOCAL_FLAGS(r11); \
|
|
|
|
andi. r10,r9,_TLF_NAPPING; \
|
|
|
|
bnel power4_fixup_nap; \
|
|
|
|
END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
|
|
|
|
#else
|
|
|
|
#define FINISH_NAP
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* _ASM_POWERPC_EXCEPTION_H */
|