2010-04-16 06:11:37 +08:00
|
|
|
/*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License, version 2, as
|
|
|
|
* published by the Free Software Foundation.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program; if not, write to the Free Software
|
|
|
|
* Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
*
|
|
|
|
* Copyright SUSE Linux Products GmbH 2010
|
|
|
|
*
|
|
|
|
* Authors: Alexander Graf <agraf@suse.de>
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef __ASM_KVM_BOOK3S_64_H__
|
|
|
|
#define __ASM_KVM_BOOK3S_64_H__
|
|
|
|
|
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 08:21:34 +08:00
|
|
|
#ifdef CONFIG_KVM_BOOK3S_PR
|
2011-12-09 21:44:13 +08:00
|
|
|
static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu)
|
2010-04-16 06:11:37 +08:00
|
|
|
{
|
2011-12-09 21:44:13 +08:00
|
|
|
preempt_disable();
|
2010-04-16 06:11:37 +08:00
|
|
|
return &get_paca()->shadow_vcpu;
|
|
|
|
}
|
2011-12-09 21:44:13 +08:00
|
|
|
|
|
|
|
/*
 * Counterpart of svcpu_get(): re-enable preemption.
 *
 * The svcpu argument is not dereferenced or otherwise used here.
 */
static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
{
	preempt_enable();
}
|
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 08:21:34 +08:00
|
|
|
#endif
|
2010-04-16 06:11:37 +08:00
|
|
|
|
2011-06-29 08:22:41 +08:00
|
|
|
/* NOTE(review): presumably log2 of the sPAPR TCE page size (1 << 12 = 4kB) -- confirm against users. */
#define SPAPR_TCE_SHIFT 12
|
|
|
|
|
2011-12-12 20:27:39 +08:00
|
|
|
#ifdef CONFIG_KVM_BOOK3S_64_HV
|
|
|
|
/* For now use fixed-size 16MB page table */
|
|
|
|
/* log2 of the hashed page table size in bytes (2^24 = 16MB) */
#define HPT_ORDER	24
/* Number of PTE groups: each PTEG occupies 128 (2^7) bytes */
#define HPT_NPTEG	(1UL << (HPT_ORDER - 7))
/* Number of PTEs: each PTEG holds 8 of them */
#define HPT_NPTE	(HPT_NPTEG * 8)
/* All-ones mask below HPT_NPTEG (which is a power of two) */
#define HPT_HASH_MASK	(HPT_NPTEG - 1)
|
|
|
|
#endif
|
|
|
|
|
2011-12-12 20:30:16 +08:00
|
|
|
/*
|
|
|
|
* We use a lock bit in HPTE dword 0 to synchronize updates and
|
|
|
|
* accesses to each HPTE, and another bit to indicate non-present
|
|
|
|
* HPTEs.
|
|
|
|
*/
|
|
|
|
#define HPTE_V_HVLOCK 0x40UL /* per-HPTE lock bit; set by try_lock_hpte() */
|
|
|
|
|
|
|
|
static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
|
|
|
|
{
|
|
|
|
unsigned long tmp, old;
|
|
|
|
|
|
|
|
asm volatile(" ldarx %0,0,%2\n"
|
|
|
|
" and. %1,%0,%3\n"
|
|
|
|
" bne 2f\n"
|
|
|
|
" ori %0,%0,%4\n"
|
|
|
|
" stdcx. %0,0,%2\n"
|
|
|
|
" beq+ 2f\n"
|
|
|
|
" li %1,%3\n"
|
|
|
|
"2: isync"
|
|
|
|
: "=&r" (tmp), "=&r" (old)
|
|
|
|
: "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK)
|
|
|
|
: "cc", "memory");
|
|
|
|
return old == 0;
|
|
|
|
}
|
|
|
|
|
2011-11-08 15:08:52 +08:00
|
|
|
static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
|
|
|
|
unsigned long pte_index)
|
|
|
|
{
|
|
|
|
unsigned long rb, va_low;
|
|
|
|
|
|
|
|
rb = (v & ~0x7fUL) << 16; /* AVA field */
|
|
|
|
va_low = pte_index >> 3;
|
|
|
|
if (v & HPTE_V_SECONDARY)
|
|
|
|
va_low = ~va_low;
|
|
|
|
/* xor vsid from AVA */
|
|
|
|
if (!(v & HPTE_V_1TB_SEG))
|
|
|
|
va_low ^= v >> 12;
|
|
|
|
else
|
|
|
|
va_low ^= v >> 24;
|
|
|
|
va_low &= 0x7ff;
|
|
|
|
if (v & HPTE_V_LARGE) {
|
|
|
|
rb |= 1; /* L field */
|
|
|
|
if (cpu_has_feature(CPU_FTR_ARCH_206) &&
|
|
|
|
(r & 0xff000)) {
|
|
|
|
/* non-16MB large page, must be 64k */
|
|
|
|
/* (masks depend on page size) */
|
|
|
|
rb |= 0x1000; /* page encoding in LP field */
|
|
|
|
rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
|
|
|
|
rb |= (va_low & 0xfe); /* AVAL field (P7 doesn't seem to care) */
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/* 4kB page */
|
|
|
|
rb |= (va_low & 0x7ff) << 12; /* remaining 11b of VA */
|
|
|
|
}
|
|
|
|
rb |= (v >> 54) & 0x300; /* B field */
|
|
|
|
return rb;
|
|
|
|
}
|
|
|
|
|
KVM: PPC: Only get pages when actually needed, not in prepare_memory_region()
This removes the code from kvmppc_core_prepare_memory_region() that
looked up the VMA for the region being added and called hva_to_page
to get the pfns for the memory. We have no guarantee that there will
be anything mapped there at the time of the KVM_SET_USER_MEMORY_REGION
ioctl call; userspace can do that ioctl and then map memory into the
region later.
Instead we defer looking up the pfn for each memory page until it is
needed, which generally means when the guest does an H_ENTER hcall on
the page. Since we can't call get_user_pages in real mode, if we don't
already have the pfn for the page, kvmppc_h_enter() will return
H_TOO_HARD and we then call kvmppc_virtmode_h_enter() once we get back
to kernel context. That calls kvmppc_get_guest_page() to get the pfn
for the page, and then calls back to kvmppc_h_enter() to redo the HPTE
insertion.
When the first vcpu starts executing, we need to have the RMO or VRMA
region mapped so that the guest's real mode accesses will work. Thus
we now have a check in kvmppc_vcpu_run() to see if the RMO/VRMA is set
up and if not, call kvmppc_hv_setup_rma(). It checks if the memslot
starting at guest physical 0 now has RMO memory mapped there; if so it
sets it up for the guest, otherwise on POWER7 it sets up the VRMA.
The function that does that, kvmppc_map_vrma, is now a bit simpler,
as it calls kvmppc_virtmode_h_enter instead of creating the HPTE itself.
Since we are now potentially updating entries in the slot_phys[]
arrays from multiple vcpu threads, we now have a spinlock protecting
those updates to ensure that we don't lose track of any references
to pages.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>
2011-12-12 20:31:00 +08:00
|
|
|
static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
|
|
|
|
{
|
|
|
|
/* only handle 4k, 64k and 16M pages for now */
|
|
|
|
if (!(h & HPTE_V_LARGE))
|
|
|
|
return 1ul << 12; /* 4k page */
|
|
|
|
if ((l & 0xf000) == 0x1000 && cpu_has_feature(CPU_FTR_ARCH_206))
|
|
|
|
return 1ul << 16; /* 64k page */
|
|
|
|
if ((l & 0xff000) == 0)
|
|
|
|
return 1ul << 24; /* 16M page */
|
|
|
|
return 0; /* error */
|
|
|
|
}
|
|
|
|
|
2011-12-12 20:32:27 +08:00
|
|
|
static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type)
|
|
|
|
{
|
|
|
|
unsigned int wimg = ptel & HPTE_R_WIMG;
|
|
|
|
|
|
|
|
/* Handle SAO */
|
|
|
|
if (wimg == (HPTE_R_W | HPTE_R_I | HPTE_R_M) &&
|
|
|
|
cpu_has_feature(CPU_FTR_ARCH_206))
|
|
|
|
wimg = HPTE_R_M;
|
|
|
|
|
|
|
|
if (!io_type)
|
|
|
|
return wimg == HPTE_R_M;
|
|
|
|
|
|
|
|
return (wimg & (HPTE_R_W | HPTE_R_I)) == io_type;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Return HPTE cache control bits corresponding to Linux pte bits */
|
|
|
|
static inline unsigned long hpte_cache_bits(unsigned long pte_val)
|
|
|
|
{
|
|
|
|
#if _PAGE_NO_CACHE == HPTE_R_I && _PAGE_WRITETHRU == HPTE_R_W
|
|
|
|
return pte_val & (HPTE_R_W | HPTE_R_I);
|
|
|
|
#else
|
|
|
|
return ((pte_val & _PAGE_NO_CACHE) ? HPTE_R_I : 0) +
|
|
|
|
((pte_val & _PAGE_WRITETHRU) ? HPTE_R_W : 0);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2011-12-12 20:31:41 +08:00
|
|
|
static inline bool slot_is_aligned(struct kvm_memory_slot *memslot,
|
|
|
|
unsigned long pagesize)
|
|
|
|
{
|
|
|
|
unsigned long mask = (pagesize >> PAGE_SHIFT) - 1;
|
|
|
|
|
|
|
|
if (pagesize <= PAGE_SIZE)
|
|
|
|
return 1;
|
|
|
|
return !(memslot->base_gfn & mask) && !(memslot->npages & mask);
|
|
|
|
}
|
|
|
|
|
2010-04-16 06:11:37 +08:00
|
|
|
#endif /* __ASM_KVM_BOOK3S_64_H__ */
|