2009-10-30 13:47:05 +08:00
|
|
|
/*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License, version 2, as
|
|
|
|
* published by the Free Software Foundation.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program; if not, write to the Free Software
|
|
|
|
* Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
*
|
|
|
|
* Copyright SUSE Linux Products GmbH 2009
|
|
|
|
*
|
|
|
|
* Authors: Alexander Graf <agraf@suse.de>
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef __ASM_KVM_BOOK3S_H__
|
|
|
|
#define __ASM_KVM_BOOK3S_H__
|
|
|
|
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/kvm_host.h>
|
2010-04-16 06:11:32 +08:00
|
|
|
#include <asm/kvm_book3s_asm.h>
|
2009-10-30 13:47:05 +08:00
|
|
|
|
|
|
|
/*
 * One BAT entry.  struct kvmppc_vcpu_book3s holds eight of these in each of
 * its ibat[] and dbat[] arrays, and kvmppc_set_bat() is declared to take a
 * pointer to one together with an "upper" flag and a 32-bit value.
 *
 * NOTE(review): the per-field notes below are inferred from the field names
 * only -- confirm against the PowerPC BAT register layout before relying on
 * them.
 */
struct kvmppc_bat {
|
2009-11-30 11:02:02 +08:00
|
|
|
u64 raw; /* presumably the unparsed register image -- TODO confirm */
|
2009-10-30 13:47:05 +08:00
|
|
|
u32 bepi; /* presumably the block effective page index -- TODO confirm */
|
|
|
|
u32 bepi_mask; /* presumably the mask applied when matching bepi -- TODO confirm */
|
|
|
|
u32 brpn; /* presumably the block real page number -- TODO confirm */
|
|
|
|
u8 wimg; /* presumably the WIMG storage-attribute bits -- TODO confirm */
|
|
|
|
u8 pp; /* presumably the page-protection bits -- TODO confirm */
|
2010-03-25 04:48:36 +08:00
|
|
|
/* vs and vp are single-bit flags packed as bool bitfields. */
bool vs : 1;
|
|
|
|
bool vp : 1;
|
2009-10-30 13:47:05 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Element type of the sid_map[] table in struct kvmppc_vcpu_book3s, which
 * has SID_MAP_NUM entries.
 *
 * NOTE(review): going by the field names, an entry ties a guest VSID/ESID
 * pair to a host VSID, and "valid" marks whether the entry is in use --
 * confirm against the code that fills the table.
 */
struct kvmppc_sid_map {
|
|
|
|
u64 guest_vsid;
|
|
|
|
u64 guest_esid;
|
|
|
|
u64 host_vsid;
|
2010-03-25 04:48:36 +08:00
|
|
|
bool valid : 1; /* single-bit flag */
|
2009-10-30 13:47:05 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
#define SID_MAP_BITS 9
|
|
|
|
#define SID_MAP_NUM (1 << SID_MAP_BITS)
|
|
|
|
#define SID_MAP_MASK (SID_MAP_NUM - 1)
|
|
|
|
|
2010-08-15 14:04:24 +08:00
|
|
|
#ifdef CONFIG_PPC_BOOK3S_64
|
|
|
|
#define SID_CONTEXTS 1
|
|
|
|
#else
|
|
|
|
#define SID_CONTEXTS 128
|
|
|
|
#define VSID_POOL_SIZE (SID_CONTEXTS * 16)
|
|
|
|
#endif
|
|
|
|
|
2011-06-29 08:17:33 +08:00
|
|
|
/*
 * Record handled by the kvmppc_mmu_hpte_cache_*() functions and
 * kvmppc_mmu_invalidate_pte() declared later in this header.
 *
 * It embeds one hlist_node per lookup list.
 * NOTE(review): each node presumably hangs off the like-named hpte_hash_*
 * head in struct kvmppc_vcpu_book3s (list_pte <-> hpte_hash_pte, and so
 * on) -- confirm in the cache implementation.
 */
struct hpte_cache {
|
|
|
|
struct hlist_node list_pte;
|
|
|
|
struct hlist_node list_pte_long;
|
|
|
|
struct hlist_node list_vpte;
|
|
|
|
struct hlist_node list_vpte_long;
|
2013-09-20 12:52:44 +08:00
|
|
|
/* The 64k list node only exists in CONFIG_PPC_BOOK3S_64 builds. */
#ifdef CONFIG_PPC_BOOK3S_64
|
|
|
|
struct hlist_node list_vpte_64k;
|
|
|
|
#endif
|
2011-06-29 08:17:33 +08:00
|
|
|
struct rcu_head rcu_head; /* presumably for RCU-deferred freeing -- TODO confirm */
|
2012-09-10 10:52:50 +08:00
|
|
|
u64 host_vpn;
|
2011-06-29 08:17:33 +08:00
|
|
|
u64 pfn;
|
|
|
|
ulong slot;
|
|
|
|
struct kvmppc_pte pte; /* embedded by value, not a pointer */
|
2013-09-20 12:52:45 +08:00
|
|
|
int pagesize; /* NOTE(review): encoding/units not visible here -- confirm */
|
2011-06-29 08:17:33 +08:00
|
|
|
};
|
|
|
|
|
2016-08-02 12:03:19 +08:00
|
|
|
/*
|
|
|
|
* Struct for a virtual core.
|
|
|
|
* Note: entry_exit_map combines a bitmap of threads that have entered
|
|
|
|
* in the bottom 8 bits and a bitmap of threads that have exited in the
|
|
|
|
* next 8 bits. This is so that we can atomically set the entry bit
|
|
|
|
* iff the exit map is 0 without taking a lock.
|
|
|
|
*/
|
|
|
|
struct kvmppc_vcore {
|
|
|
|
int n_runnable;
|
|
|
|
int num_threads;
|
|
|
|
/*
 * Bottom 8 bits: bitmap of threads that have entered; next 8 bits: bitmap
 * of threads that have exited.  Kept in one word so the entry bit can be
 * set atomically only while the exit map is 0, without taking a lock.
 */
int entry_exit_map;
|
|
|
|
int napping_threads;
|
|
|
|
int first_vcpuid;
|
|
|
|
u16 pcpu;
|
|
|
|
u16 last_cpu;
|
|
|
|
u8 vcore_state;
|
|
|
|
u8 in_guest;
|
2016-08-02 12:03:20 +08:00
|
|
|
/* One vcpu pointer slot per SMT thread (MAX_SMT_THREADS entries). */
struct kvm_vcpu *runnable_threads[MAX_SMT_THREADS];
|
2016-08-02 12:03:19 +08:00
|
|
|
struct list_head preempt_list;
|
|
|
|
spinlock_t lock;
|
|
|
|
struct swait_queue_head wq;
|
|
|
|
spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
|
|
|
|
u64 stolen_tb;
|
|
|
|
u64 preempt_tb;
|
|
|
|
struct kvm_vcpu *runner;
|
|
|
|
struct kvm *kvm;
|
|
|
|
u64 tb_offset; /* guest timebase - host timebase */
|
KVM: PPC: Book3S HV: Snapshot timebase offset on guest entry
Currently, the HV KVM guest entry/exit code adds the timebase offset
from the vcore struct to the timebase on guest entry, and subtracts
it on guest exit. Which is fine, except that it is possible for
userspace to change the offset using the SET_ONE_REG interface while
the vcore is running, as there is only one timebase offset per vcore
but potentially multiple VCPUs in the vcore. If that were to happen,
KVM would subtract a different offset on guest exit from that which
it had added on guest entry, leading to the timebase being out of sync
between cores in the host, which then leads to bad things happening
such as hangs and spurious watchdog timeouts.
To fix this, we add a new field 'tb_offset_applied' to the vcore struct
which stores the offset that is currently applied to the timebase.
This value is set from the vcore tb_offset field on guest entry, and
is what is subtracted from the timebase on guest exit. Since it is
zero when the timebase offset is not applied, we can simplify the
logic in kvmhv_start_timing and kvmhv_accumulate_time.
In addition, we had secondary threads reading the timebase while
running concurrently with code on the primary thread which would
eventually add or subtract the timebase offset from the timebase.
This occurred while saving or restoring the DEC register value on
the secondary threads. Although no specific incorrect behaviour has
been observed, this is a race which should be fixed. To fix it, we
move the DEC saving code to just before we call kvmhv_commence_exit,
and the DEC restoring code to after the point where we have waited
for the primary thread to switch the MMU context and add the timebase
offset. That way we are sure that the timebase contains the guest
timebase value in both cases.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
2018-04-20 20:51:11 +08:00
|
|
|
/*
 * Snapshot of tb_offset taken on guest entry; this is the value subtracted
 * again on guest exit, so a concurrent change of tb_offset cannot unbalance
 * the timebase.  Zero while no offset is applied.
 */
u64 tb_offset_applied; /* timebase offset currently in force */
|
2016-08-02 12:03:19 +08:00
|
|
|
ulong lpcr;
|
|
|
|
u32 arch_compat;
|
|
|
|
ulong pcr;
|
|
|
|
ulong dpdes; /* doorbell state (POWER8) */
|
2016-09-15 11:42:52 +08:00
|
|
|
ulong vtb; /* virtual timebase */
|
2016-08-02 12:03:19 +08:00
|
|
|
ulong conferring_threads;
|
KVM: PPC: Book3S HV: Implement halt polling
This patch introduces new halt polling functionality into the kvm_hv kernel
module. When a vcore is idle it will poll for some period of time before
scheduling itself out.
When all of the runnable vcpus on a vcore have ceded (and thus the vcore is
idle) we schedule ourselves out to allow something else to run. In the
event that we need to wake up very quickly (for example an interrupt
arrives), we are required to wait until we get scheduled again.
Implement halt polling so that when a vcore is idle, and before scheduling
ourselves, we poll for vcpus in the runnable_threads list which have
pending exceptions or which leave the ceded state. If we poll successfully
then we can get back into the guest very quickly without ever scheduling
ourselves, otherwise we schedule ourselves out as before.
There exists generic halt_polling code in virt/kvm_main.c, however on
powerpc the polling conditions are different to the generic case. It would
be nice if we could just implement an arch specific kvm_check_block()
function, but there is still other arch specific things which need to be
done for kvm_hv (for example manipulating vcore states) which means that a
separate implementation is the best option.
Testing of this patch with a TCP round robin test between two guests with
virtio network interfaces has found a decrease in round trip time of ~15us
on average. A performance gain is only seen when going out of and
back into the guest often and quickly, otherwise there is no net benefit
from the polling. The polling interval is adjusted such that when we are
often scheduled out for long periods of time it is reduced, and when we
often poll successfully it is increased. The rate at which the polling
interval increases or decreases, and the maximum polling interval, can
be set through module parameters.
Based on the implementation in the generic kvm module by Wanpeng Li and
Paolo Bonzini, and on direction from Paul Mackerras.
Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
2016-08-02 12:03:21 +08:00
|
|
|
/*
 * NOTE(review): presumably the current halt-polling interval for this
 * vcore, in nanoseconds going by the name -- confirm in the polling code.
 */
unsigned int halt_poll_ns;
|
2018-04-20 17:53:22 +08:00
|
|
|
atomic_t online_count;
|
2016-08-02 12:03:19 +08:00
|
|
|
};
|
|
|
|
|
2009-10-30 13:47:05 +08:00
|
|
|
/*
 * Book3S-specific per-vcpu state: segment-ID map, SLB shadow, BAT arrays,
 * a handful of saved register values, the VSID allocation state (whose
 * shape depends on the 32-/64-bit config) and the hash heads used to find
 * hpte_cache records.
 */
struct kvmppc_vcpu_book3s {
|
|
|
|
/* SID_MAP_NUM is (1 << SID_MAP_BITS). */
struct kvmppc_sid_map sid_map[SID_MAP_NUM];
|
|
|
|
struct {
|
|
|
|
u64 esid;
|
|
|
|
u64 vsid;
|
|
|
|
} slb_shadow[64];
|
|
|
|
/* NOTE(review): presumably the number of slb_shadow[] slots in use -- confirm */
u8 slb_shadow_max;
|
|
|
|
struct kvmppc_bat ibat[8];
|
|
|
|
struct kvmppc_bat dbat[8];
|
|
|
|
u64 hid[6];
|
2010-02-19 18:00:33 +08:00
|
|
|
u64 gqr[8];
|
2009-10-30 13:47:05 +08:00
|
|
|
u64 sdr1;
|
|
|
|
u64 hior;
|
|
|
|
u64 msr_mask;
|
2016-09-15 11:42:52 +08:00
|
|
|
u64 vtb;
|
2010-08-15 14:04:24 +08:00
|
|
|
/*
 * VSID allocation state.  The 32-bit build keeps a pool sized by
 * VSID_POOL_SIZE (SID_CONTEXTS * 16) plus a 32-bit cursor; otherwise three
 * 64-bit proto-VSID values (first/max/next) are kept instead.
 */
#ifdef CONFIG_PPC_BOOK3S_32
|
|
|
|
u32 vsid_pool[VSID_POOL_SIZE];
|
2012-03-23 08:21:14 +08:00
|
|
|
u32 vsid_next;
|
2010-08-15 14:04:24 +08:00
|
|
|
#else
|
2012-03-23 08:21:14 +08:00
|
|
|
u64 proto_vsid_first;
|
|
|
|
u64 proto_vsid_max;
|
|
|
|
u64 proto_vsid_next;
|
2010-08-15 14:04:24 +08:00
|
|
|
#endif
|
|
|
|
/* SID_CONTEXTS is 1 with CONFIG_PPC_BOOK3S_64 and 128 otherwise. */
int context_id[SID_CONTEXTS];
|
2011-06-29 08:17:33 +08:00
|
|
|
|
|
|
2011-09-15 03:45:23 +08:00
|
|
|
bool hior_explicit; /* HIOR is set by ioctl, not PVR */
|
|
|
|
|
2011-06-29 08:17:33 +08:00
|
|
|
/*
 * Hash heads for hpte_cache records; each array is sized by its own
 * HPTEG_HASH_NUM_* constant (not defined in the visible part of this file).
 */
struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
|
|
|
|
struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
|
|
|
|
struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
|
|
|
|
struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG];
|
2013-09-20 12:52:44 +08:00
|
|
|
#ifdef CONFIG_PPC_BOOK3S_64
|
|
|
|
struct hlist_head hpte_hash_vpte_64k[HPTEG_HASH_NUM_VPTE_64K];
|
|
|
|
#endif
|
2011-06-29 08:17:33 +08:00
|
|
|
/*
 * NOTE(review): presumably the number of hpte_cache records currently
 * linked in, updated with mmu_lock held -- confirm there are no unlocked
 * writers.
 */
int hpte_cache_count;
|
|
|
|
spinlock_t mmu_lock;
|
2009-10-30 13:47:05 +08:00
|
|
|
};
|
|
|
|
|
2013-09-20 12:52:45 +08:00
|
|
|
#define VSID_REAL 0x07ffffffffc00000ULL
|
|
|
|
#define VSID_BAT 0x07ffffffffb00000ULL
|
|
|
|
#define VSID_64K 0x0800000000000000ULL
|
2013-06-22 15:16:32 +08:00
|
|
|
#define VSID_1T 0x1000000000000000ULL
|
2010-04-20 08:49:48 +08:00
|
|
|
#define VSID_REAL_DR 0x2000000000000000ULL
|
|
|
|
#define VSID_REAL_IR 0x4000000000000000ULL
|
2010-03-25 04:48:35 +08:00
|
|
|
#define VSID_PR 0x8000000000000000ULL
|
2009-10-30 13:47:05 +08:00
|
|
|
|
2010-04-20 08:49:46 +08:00
|
|
|
extern void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong ea, ulong ea_mask);
|
2009-10-30 13:47:05 +08:00
|
|
|
extern void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 vp, u64 vp_mask);
|
2010-04-20 08:49:46 +08:00
|
|
|
extern void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end);
|
2009-10-30 13:47:05 +08:00
|
|
|
extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr);
|
|
|
|
extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu);
|
|
|
|
extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
|
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 08:21:34 +08:00
|
|
|
extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu);
|
KVM: PPC: Book3S PR: Better handling of host-side read-only pages
Currently we request write access to all pages that get mapped into the
guest, even if the guest is only loading from the page. This reduces
the effectiveness of KSM because it means that we unshare every page we
access. Also, we always set the changed (C) bit in the guest HPTE if
it allows writing, even for a guest load.
This fixes both these problems. We pass an 'iswrite' flag to the
mmu.xlate() functions and to kvmppc_mmu_map_page() to indicate whether
the access is a load or a store. The mmu.xlate() functions now only
set C for stores. kvmppc_gfn_to_pfn() now calls gfn_to_pfn_prot()
instead of gfn_to_pfn() so that it can indicate whether we need write
access to the page, and get back a 'writable' flag to indicate whether
the page is writable or not. If that 'writable' flag is clear, we then
make the host HPTE read-only even if the guest HPTE allowed writing.
This means that we can get a protection fault when the guest writes to a
page that it has mapped read-write but which is read-only on the host
side (perhaps due to KSM having merged the page). Thus we now call
kvmppc_handle_pagefault() for protection faults as well as HPTE not found
faults. In kvmppc_handle_pagefault(), if the access was allowed by the
guest HPTE and we thus need to install a new host HPTE, we then need to
remove the old host HPTE if there is one. This is done with a new
function, kvmppc_mmu_unmap_page(), which uses kvmppc_mmu_pte_vflush() to
find and remove the old host HPTE.
Since the memslot-related functions require the KVM SRCU read lock to
be held, this adds srcu_read_lock/unlock pairs around the calls to
kvmppc_handle_pagefault().
Finally, this changes kvmppc_mmu_book3s_32_xlate_pte() to not ignore
guest HPTEs that don't permit access, and to return -EPERM for accesses
that are not permitted by the page protections.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2013-09-20 12:52:51 +08:00
|
|
|
extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte,
|
|
|
|
bool iswrite);
|
|
|
|
extern void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
|
2009-10-30 13:47:05 +08:00
|
|
|
extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
|
2013-06-22 15:16:32 +08:00
|
|
|
extern void kvmppc_mmu_flush_segment(struct kvm_vcpu *vcpu, ulong eaddr, ulong seg_size);
|
2009-10-30 13:47:05 +08:00
|
|
|
extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
|
KVM: PPC: Implement MMIO emulation support for Book3S HV guests
This provides the low-level support for MMIO emulation in Book3S HV
guests. When the guest tries to map a page which is not covered by
any memslot, that page is taken to be an MMIO emulation page. Instead
of inserting a valid HPTE, we insert an HPTE that has the valid bit
clear but another hypervisor software-use bit set, which we call
HPTE_V_ABSENT, to indicate that this is an absent page. An
absent page is treated much like a valid page as far as guest hcalls
(H_ENTER, H_REMOVE, H_READ etc.) are concerned, except of course that
an absent HPTE doesn't need to be invalidated with tlbie since it
was never valid as far as the hardware is concerned.
When the guest accesses a page for which there is an absent HPTE, it
will take a hypervisor data storage interrupt (HDSI) since we now set
the VPM1 bit in the LPCR. Our HDSI handler for HPTE-not-present faults
looks up the hash table and if it finds an absent HPTE mapping the
requested virtual address, will switch to kernel mode and handle the
fault in kvmppc_book3s_hv_page_fault(), which at present just calls
kvmppc_hv_emulate_mmio() to set up the MMIO emulation.
This is based on an earlier patch by Benjamin Herrenschmidt, but since
heavily reworked.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>
2011-12-12 20:36:37 +08:00
|
|
|
extern int kvmppc_book3s_hv_page_fault(struct kvm_run *run,
|
|
|
|
struct kvm_vcpu *vcpu, unsigned long addr,
|
|
|
|
unsigned long status);
|
|
|
|
extern long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr,
|
|
|
|
unsigned long slb_v, unsigned long valid);
|
2017-01-30 18:21:46 +08:00
|
|
|
extern int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
|
|
|
|
unsigned long gpa, gva_t ea, int is_store);
|
2010-06-30 21:18:46 +08:00
|
|
|
|
|
|
|
extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
|
|
|
|
extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu);
|
KVM: PPC: Book3S PR: Use mmu_notifier_retry() in kvmppc_mmu_map_page()
When the MM code is invalidating a range of pages, it calls the KVM
kvm_mmu_notifier_invalidate_range_start() notifier function, which calls
kvm_unmap_hva_range(), which arranges to flush all the existing host
HPTEs for guest pages. However, the Linux PTEs for the range being
flushed are still valid at that point. We are not supposed to establish
any new references to pages in the range until the ...range_end()
notifier gets called. The PPC-specific KVM code doesn't get any
explicit notification of that; instead, we are supposed to use
mmu_notifier_retry() to test whether we are or have been inside a
range flush notifier pair while we have been getting a page and
instantiating a host HPTE for the page.
This therefore adds a call to mmu_notifier_retry inside
kvmppc_mmu_map_page(). This call is inside a region locked with
kvm->mmu_lock, which is the same lock that is called by the KVM
MMU notifier functions, thus ensuring that no new notification can
proceed while we are in the locked region. Inside this region we
also create the host HPTE and link the corresponding hpte_cache
structure into the lists used to find it later. We cannot allocate
the hpte_cache structure inside this locked region because that can
lead to deadlock, so we allocate it outside the region and free it
if we end up not using it.
This also moves the updates of vcpu3s->hpte_cache_count inside the
regions locked with vcpu3s->mmu_lock, and does the increment in
kvmppc_mmu_hpte_cache_map() when the pte is added to the cache
rather than when it is allocated, in order that the hpte_cache_count
is accurate.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2013-09-20 12:52:52 +08:00
|
|
|
extern void kvmppc_mmu_hpte_cache_free(struct hpte_cache *pte);
|
2010-06-30 21:18:46 +08:00
|
|
|
extern void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu);
|
|
|
|
extern int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu);
|
|
|
|
extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
|
|
|
|
extern int kvmppc_mmu_hpte_sysinit(void);
|
|
|
|
extern void kvmppc_mmu_hpte_sysexit(void);
|
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 08:21:34 +08:00
|
|
|
extern int kvmppc_mmu_hv_init(void);
|
2014-06-02 09:03:00 +08:00
|
|
|
extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc);
|
2010-06-30 21:18:46 +08:00
|
|
|
|
2017-01-30 18:21:46 +08:00
|
|
|
extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run,
|
|
|
|
struct kvm_vcpu *vcpu,
|
|
|
|
unsigned long ea, unsigned long dsisr);
|
2018-12-14 13:29:09 +08:00
|
|
|
extern unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
|
|
|
|
gva_t eaddr, void *to, void *from,
|
|
|
|
unsigned long n);
|
2018-12-14 13:29:05 +08:00
|
|
|
extern long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
|
|
|
|
void *to, unsigned long n);
|
|
|
|
extern long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
|
|
|
|
void *from, unsigned long n);
|
2018-10-08 13:31:07 +08:00
|
|
|
extern int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
|
|
|
|
struct kvmppc_pte *gpte, u64 root,
|
|
|
|
u64 *pte_ret_p);
|
2018-10-08 13:31:00 +08:00
|
|
|
extern int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
|
|
|
|
struct kvmppc_pte *gpte, u64 table,
|
|
|
|
int table_index, u64 *pte_ret_p);
|
2017-01-30 18:21:44 +08:00
|
|
|
extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
|
|
|
|
struct kvmppc_pte *gpte, bool data, bool iswrite);
|
2018-12-21 11:28:42 +08:00
|
|
|
extern void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
|
|
|
|
unsigned int pshift, unsigned int lpid);
|
2018-10-08 13:31:08 +08:00
|
|
|
extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
|
2018-12-12 12:16:48 +08:00
|
|
|
unsigned int shift,
|
|
|
|
const struct kvm_memory_slot *memslot,
|
2018-10-08 13:31:08 +08:00
|
|
|
unsigned int lpid);
|
2018-10-08 13:31:07 +08:00
|
|
|
extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
|
|
|
|
bool writing, unsigned long gpa,
|
|
|
|
unsigned int lpid);
|
|
|
|
extern int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
|
|
|
|
unsigned long gpa,
|
|
|
|
struct kvm_memory_slot *memslot,
|
|
|
|
bool writing, bool kvm_ro,
|
|
|
|
pte_t *inserted_pte, unsigned int *levelp);
|
2017-01-30 18:21:53 +08:00
|
|
|
extern int kvmppc_init_vm_radix(struct kvm *kvm);
|
2017-01-30 18:21:46 +08:00
|
|
|
extern void kvmppc_free_radix(struct kvm *kvm);
|
2018-10-08 13:31:07 +08:00
|
|
|
extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd,
|
|
|
|
unsigned int lpid);
|
2017-01-30 18:21:46 +08:00
|
|
|
extern int kvmppc_radix_init(void);
|
|
|
|
extern void kvmppc_radix_exit(void);
|
2017-01-30 18:21:47 +08:00
|
|
|
extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
|
|
|
|
unsigned long gfn);
|
|
|
|
extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
|
|
|
|
unsigned long gfn);
|
|
|
|
extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
|
|
|
|
unsigned long gfn);
|
2017-01-30 18:21:48 +08:00
|
|
|
extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
|
|
|
|
struct kvm_memory_slot *memslot, unsigned long *map);
|
2018-12-12 12:17:17 +08:00
|
|
|
extern void kvmppc_radix_flush_memslot(struct kvm *kvm,
|
|
|
|
const struct kvm_memory_slot *memslot);
|
2017-01-30 18:21:53 +08:00
|
|
|
extern int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info);
|
2017-01-30 18:21:44 +08:00
|
|
|
|
2014-06-20 19:58:16 +08:00
|
|
|
/* XXX remove this export when load_last_inst() is generic */
|
2010-02-19 18:00:38 +08:00
|
|
|
extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
|
2009-10-30 13:47:05 +08:00
|
|
|
extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
|
2013-04-18 04:30:26 +08:00
|
|
|
extern void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
|
|
|
|
unsigned int vec);
|
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 08:21:34 +08:00
|
|
|
extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags);
|
2018-05-23 15:02:00 +08:00
|
|
|
extern void kvmppc_trigger_fac_interrupt(struct kvm_vcpu *vcpu, ulong fac);
|
2009-11-30 11:02:02 +08:00
|
|
|
extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
|
|
|
|
bool upper, u32 val);
|
2010-02-19 18:00:39 +08:00
|
|
|
extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
|
2010-02-19 18:00:44 +08:00
|
|
|
extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
|
kvm: rename pfn_t to kvm_pfn_t
To date, we have implemented two I/O usage models for persistent memory,
PMEM (a persistent "ram disk") and DAX (mmap persistent memory into
userspace). This series adds a third, DAX-GUP, that allows DAX mappings
to be the target of direct-i/o. It allows userspace to coordinate
DMA/RDMA from/to persistent memory.
The implementation leverages the ZONE_DEVICE mm-zone that went into
4.3-rc1 (also discussed at kernel summit) to flag pages that are owned
and dynamically mapped by a device driver. The pmem driver, after
mapping a persistent memory range into the system memmap via
devm_memremap_pages(), arranges for DAX to distinguish pfn-only versus
page-backed pmem-pfns via flags in the new pfn_t type.
The DAX code, upon seeing a PFN_DEV+PFN_MAP flagged pfn, flags the
resulting pte(s) inserted into the process page tables with a new
_PAGE_DEVMAP flag. Later, when get_user_pages() is walking ptes it keys
off _PAGE_DEVMAP to pin the device hosting the page range active.
Finally, get_page() and put_page() are modified to take references
against the device driver established page mapping.
Finally, this need for "struct page" for persistent memory requires
memory capacity to store the memmap array. Given the memmap array for a
large pool of persistent memory may exhaust available DRAM, introduce a
mechanism to allocate the memmap from persistent memory. The new
"struct vmem_altmap *" parameter to devm_memremap_pages() enables
arch_add_memory() to use reserved pmem capacity rather than the page
allocator.
This patch (of 18):
The core has developed a need for a "pfn_t" type [1]. Move the existing
pfn_t in KVM to kvm_pfn_t [2].
[1]: https://lists.01.org/pipermail/linux-nvdimm/2015-September/002199.html
[2]: https://lists.01.org/pipermail/linux-nvdimm/2015-September/002218.html
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Christoffer Dall <christoffer.dall@linaro.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-16 08:56:11 +08:00
|
|
|
extern kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa,
|
|
|
|
bool writing, bool *writable);
|
KVM: PPC: Implement MMU notifiers for Book3S HV guests
This adds the infrastructure to enable us to page out pages underneath
a Book3S HV guest, on processors that support virtualized partition
memory, that is, POWER7. Instead of pinning all the guest's pages,
we now look in the host userspace Linux page tables to find the
mapping for a given guest page. Then, if the userspace Linux PTE
gets invalidated, kvm_unmap_hva() gets called for that address, and
we replace all the guest HPTEs that refer to that page with absent
HPTEs, i.e. ones with the valid bit clear and the HPTE_V_ABSENT bit
set, which will cause an HDSI when the guest tries to access them.
Finally, the page fault handler is extended to reinstantiate the
guest HPTE when the guest tries to access a page which has been paged
out.
Since we can't intercept the guest DSI and ISI interrupts on PPC970,
we still have to pin all the guest pages on PPC970. We have a new flag,
kvm->arch.using_mmu_notifiers, that indicates whether we can page
guest pages out. If it is not set, the MMU notifier callbacks do
nothing and everything operates as before.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>
2011-12-12 20:38:05 +08:00
|
|
|
extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
|
|
|
|
unsigned long *rmap, long pte_index, int realmode);
|
2018-12-12 12:16:48 +08:00
|
|
|
extern void kvmppc_update_dirty_map(const struct kvm_memory_slot *memslot,
|
KVM: PPC: Book3S HV: Unify dirty page map between HPT and radix
Currently, the HPT code in HV KVM maintains a dirty bit per guest page
in the rmap array, whether or not dirty page tracking has been enabled
for the memory slot. In contrast, the radix code maintains a dirty
bit per guest page in memslot->dirty_bitmap, and only does so when
dirty page tracking has been enabled.
This changes the HPT code to maintain the dirty bits in the memslot
dirty_bitmap like radix does. This results in slightly less code
overall, and will mean that we do not lose the dirty bits when
transitioning between HPT and radix mode in future.
There is one minor change to behaviour as a result. With HPT, when
dirty tracking was enabled for a memslot, we would previously clear
all the dirty bits at that point (both in the HPT entries and in the
rmap arrays), meaning that a KVM_GET_DIRTY_LOG ioctl immediately
following would show no pages as dirty (assuming no vcpus have run
in the meantime). With this change, the dirty bits on HPT entries
are not cleared at the point where dirty tracking is enabled, so
KVM_GET_DIRTY_LOG would show as dirty any guest pages that are
resident in the HPT and dirty. This is consistent with what happens
on radix.
This also fixes a bug in the mark_pages_dirty() function for radix
(in the sense that the function no longer exists). In the case where
a large page of 64 normal pages or more is marked dirty, the
addressing of the dirty bitmap was incorrect and could write past
the end of the bitmap. Fortunately this case was never hit in
practice because a 2MB large page is only 32 x 64kB pages, and we
don't support backing the guest with 1GB huge pages at this point.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
2017-10-26 13:39:19 +08:00
|
|
|
unsigned long gfn, unsigned long psize);
|
2014-06-11 16:16:06 +08:00
|
|
|
extern void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
|
KVM: PPC: Implement MMU notifiers for Book3S HV guests
This adds the infrastructure to enable us to page out pages underneath
a Book3S HV guest, on processors that support virtualized partition
memory, that is, POWER7. Instead of pinning all the guest's pages,
we now look in the host userspace Linux page tables to find the
mapping for a given guest page. Then, if the userspace Linux PTE
gets invalidated, kvm_unmap_hva() gets called for that address, and
we replace all the guest HPTEs that refer to that page with absent
HPTEs, i.e. ones with the valid bit clear and the HPTE_V_ABSENT bit
set, which will cause an HDSI when the guest tries to access them.
Finally, the page fault handler is extended to reinstantiate the
guest HPTE when the guest tries to access a page which has been paged
out.
Since we can't intercept the guest DSI and ISI interrupts on PPC970,
we still have to pin all the guest pages on PPC970. We have a new flag,
kvm->arch.using_mmu_notifiers, that indicates whether we can page
guest pages out. If it is not set, the MMU notifier callbacks do
nothing and everything operates as before.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>
2011-12-12 20:38:05 +08:00
|
|
|
unsigned long pte_index);
|
2014-06-11 16:16:06 +08:00
|
|
|
void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep,
|
2011-12-15 10:02:47 +08:00
|
|
|
unsigned long pte_index);
|
2011-12-12 20:28:55 +08:00
|
|
|
extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr,
|
|
|
|
unsigned long *nb_ret);
|
KVM: PPC: Book3S HV: Report VPA and DTL modifications in dirty map
At present, the KVM_GET_DIRTY_LOG ioctl doesn't report modifications
done by the host to the virtual processor areas (VPAs) and dispatch
trace logs (DTLs) registered by the guest. This is because those
modifications are done either in real mode or in the host kernel
context, and in neither case does the access go through the guest's
HPT, and thus no change (C) bit gets set in the guest's HPT.
However, the changes done by the host do need to be tracked so that
the modified pages get transferred when doing live migration. In
order to track these modifications, this adds a dirty flag to the
struct representing the VPA/DTL areas, and arranges to set the flag
when the VPA/DTL gets modified by the host. Then, when we are
collecting the dirty log, we also check the dirty flags for the
VPA and DTL for each vcpu and set the relevant bit in the dirty log
if necessary. Doing this also means we now need to keep track of
the guest physical address of the VPA/DTL areas.
So as not to lose track of modifications to a VPA/DTL area when it gets
unregistered, or when a new area gets registered in its place, we need
to transfer the dirty state to the rmap chain. This adds code to
kvmppc_unpin_guest_page() to do that if the area was dirty. To simplify
that code, we now require that all VPA, DTL and SLB shadow buffer areas
fit within a single host page. Guests already comply with this
requirement because pHyp requires that these areas not cross a 4k
boundary.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2013-04-19 03:51:04 +08:00
|
|
|
extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr,
|
|
|
|
unsigned long gpa, bool dirty);
|
KVM: PPC: Book3S HV: Restructure HPT entry creation code
This restructures the code that creates HPT (hashed page table)
entries so that it can be called in situations where we don't have a
struct vcpu pointer, only a struct kvm pointer. It also fixes a bug
where kvmppc_map_vrma() would corrupt the guest R4 value.
Most of the work of kvmppc_virtmode_h_enter is now done by a new
function, kvmppc_virtmode_do_h_enter, which itself calls another new
function, kvmppc_do_h_enter, which contains most of the old
kvmppc_h_enter. The new kvmppc_do_h_enter takes explicit arguments
for the place to return the HPTE index, the Linux page tables to use,
and whether it is being called in real mode, thus removing the need
for it to have the vcpu as an argument.
Currently kvmppc_map_vrma creates the VRMA (virtual real mode area)
HPTEs by calling kvmppc_virtmode_h_enter, which is designed primarily
to handle H_ENTER hcalls from the guest that need to pin a page of
memory. Since H_ENTER returns the index of the created HPTE in R4,
kvmppc_virtmode_h_enter updates the guest R4, corrupting the guest R4
in the case when it gets called from kvmppc_map_vrma on the first
VCPU_RUN ioctl. With this, kvmppc_map_vrma instead calls
kvmppc_virtmode_do_h_enter with the address of a dummy word as the
place to store the HPTE index, thus avoiding corrupting the guest R4.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2012-11-14 02:31:32 +08:00
|
|
|
extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
|
|
|
|
long pte_index, unsigned long pteh, unsigned long ptel,
|
|
|
|
pgd_t *pgdir, bool realmode, unsigned long *idx_ret);
|
2012-11-20 06:55:44 +08:00
|
|
|
extern long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
|
|
|
|
unsigned long pte_index, unsigned long avpn,
|
|
|
|
unsigned long *hpret);
|
2017-01-30 18:21:48 +08:00
|
|
|
extern long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
|
2012-09-11 21:28:18 +08:00
|
|
|
struct kvm_memory_slot *memslot, unsigned long *map);
|
2017-01-30 18:21:48 +08:00
|
|
|
extern void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa,
|
|
|
|
struct kvm_memory_slot *memslot,
|
|
|
|
unsigned long *map);
|
2013-09-20 12:52:38 +08:00
|
|
|
extern void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr,
|
|
|
|
unsigned long mask);
|
2014-07-31 16:21:59 +08:00
|
|
|
extern void kvmppc_set_fscr(struct kvm_vcpu *vcpu, u64 fscr);
|
2009-10-30 13:47:05 +08:00
|
|
|
|
KVM: PPC: Book3S HV: Work around transactional memory bugs in POWER9
POWER9 has hardware bugs relating to transactional memory and thread
reconfiguration (changes to hardware SMT mode). Specifically, the core
does not have enough storage to store a complete checkpoint of all the
architected state for all four threads. The DD2.2 version of POWER9
includes hardware modifications designed to allow hypervisor software
to implement workarounds for these problems. This patch implements
those workarounds in KVM code so that KVM guests see a full, working
transactional memory implementation.
The problems center around the use of TM suspended state, where the
CPU has a checkpointed state but execution is not transactional. The
workaround is to implement a "fake suspend" state, which looks to the
guest like suspended state but the CPU does not store a checkpoint.
In this state, any instruction that would cause a transition to
transactional state (rfid, rfebb, mtmsrd, tresume) or would use the
checkpointed state (treclaim) causes a "soft patch" interrupt (vector
0x1500) to the hypervisor so that it can be emulated. The trechkpt
instruction also causes a soft patch interrupt.
On POWER9 DD2.2, we avoid returning to the guest in any state which
would require a checkpoint to be present. The trechkpt in the guest
entry path which would normally create that checkpoint is replaced by
either a transition to fake suspend state, if the guest is in suspend
state, or a rollback to the pre-transactional state if the guest is in
transactional state. Fake suspend state is indicated by a flag in the
PACA plus a new bit in the PSSCR. The new PSSCR bit is write-only and
reads back as 0.
On exit from the guest, if the guest is in fake suspend state, we still
do the treclaim instruction as we would in real suspend state, in order
to get into non-transactional state, but we do not save the resulting
register state since there was no checkpoint.
Emulation of the instructions that cause a softpatch interrupt is
handled in two paths. If the guest is in real suspend mode, we call
kvmhv_p9_tm_emulation_early() to handle the cases where the guest is
transitioning to transactional state. This is called before we do the
treclaim in the guest exit path; because we haven't done treclaim, we
can get back to the guest with the transaction still active. If the
instruction is a case that kvmhv_p9_tm_emulation_early() doesn't
handle, or if the guest is in fake suspend state, then we proceed to
do the complete guest exit path and subsequently call
kvmhv_p9_tm_emulation() in host context with the MMU on. This handles
all the cases including the cases that generate program interrupts
(illegal instruction or TM Bad Thing) and facility unavailable
interrupts.
The emulation is reasonably straightforward and is mostly concerned
with checking for exception conditions and updating the state of
registers such as MSR and CR0. The treclaim emulation takes care to
ensure that the TEXASR register gets updated as if it were the guest
treclaim instruction that had done failure recording, not the treclaim
done in hypervisor state in the guest exit path.
With this, the KVM_CAP_PPC_HTM capability returns true (1) even if
transactional memory is not available to host userspace.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-03-21 18:32:01 +08:00
|
|
|
extern int kvmhv_p9_tm_emulation_early(struct kvm_vcpu *vcpu);
|
|
|
|
extern int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu);
|
|
|
|
extern void kvmhv_emulate_tm_rollback(struct kvm_vcpu *vcpu);
|
|
|
|
|
2011-07-23 15:41:44 +08:00
|
|
|
extern void kvmppc_entry_trampoline(void);
|
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 08:21:34 +08:00
|
|
|
extern void kvmppc_hv_entry_trampoline(void);
|
2010-03-25 04:48:28 +08:00
|
|
|
extern u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst);
|
|
|
|
extern ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst);
|
2011-08-08 23:21:15 +08:00
|
|
|
extern int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd);
|
2014-06-02 09:02:59 +08:00
|
|
|
extern void kvmppc_pr_init_default_hcalls(struct kvm *kvm);
|
2014-06-02 09:03:00 +08:00
|
|
|
extern int kvmppc_hcall_impl_pr(unsigned long cmd);
|
|
|
|
extern int kvmppc_hcall_impl_hv_realmode(unsigned long cmd);
|
2018-02-01 05:24:58 +08:00
|
|
|
extern void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu);
|
|
|
|
extern void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu);
|
2018-05-23 15:01:58 +08:00
|
|
|
|
|
|
|
/*
 * Transactional-memory save/restore helpers.
 *
 * With CONFIG_PPC_TRANSACTIONAL_MEM set these are ordinary prototypes
 * implemented elsewhere; without it they collapse to empty inline stubs
 * so that callers need no #ifdef of their own.
 */
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu);
void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu);
void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu);
void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu);
#else /* !CONFIG_PPC_TRANSACTIONAL_MEM */
static inline void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu) { }
static inline void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu) { }
static inline void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu) { }
static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) { }
#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
|
|
|
|
|
2018-10-08 13:31:03 +08:00
|
|
|
long kvmhv_nested_init(void);
|
|
|
|
void kvmhv_nested_exit(void);
|
|
|
|
void kvmhv_vm_nested_init(struct kvm *kvm);
|
|
|
|
long kvmhv_set_partition_table(struct kvm_vcpu *vcpu);
|
2018-12-14 13:29:09 +08:00
|
|
|
long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu);
|
2018-10-08 13:31:03 +08:00
|
|
|
void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
|
|
|
|
void kvmhv_release_all_nested(struct kvm *kvm);
|
2018-10-08 13:31:04 +08:00
|
|
|
long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
|
KVM: PPC: Book3S HV: Implement H_TLB_INVALIDATE hcall
When running a nested (L2) guest the guest (L1) hypervisor will use
the H_TLB_INVALIDATE hcall when it needs to change the partition
scoped page tables or the partition table which it manages. It will
use this hcall in the situations where it would use a partition-scoped
tlbie instruction if it were running in hypervisor mode.
The H_TLB_INVALIDATE hcall can invalidate different scopes:
Invalidate TLB for a given target address:
- This invalidates a single L2 -> L1 pte
- We need to invalidate any L2 -> L0 shadow_pgtable ptes which map the L2
address space which is being invalidated. This is because a single
L2 -> L1 pte may have been mapped with more than one pte in the
L2 -> L0 page tables.
Invalidate the entire TLB for a given LPID or for all LPIDs:
- Invalidate the entire shadow_pgtable for a given nested guest, or
for all nested guests.
Invalidate the PWC (page walk cache) for a given LPID or for all LPIDs:
- We don't cache the PWC, so nothing to do.
Invalidate the entire TLB, PWC and partition table for a given/all LPIDs:
- Here we re-read the partition table entry and remove the nested state
for any nested guest for which the first doubleword of the partition
table entry is now zero.
The H_TLB_INVALIDATE hcall takes as parameters the tlbie instruction
word (of which only the RIC, PRS and R fields are used), the rS value
(giving the lpid, where required) and the rB value (giving the IS, AP
and EPN values).
[paulus@ozlabs.org - adapted to having the partition table in guest
memory, added the H_TLB_INVALIDATE implementation, removed tlbie
instruction emulation, reworded the commit message.]
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-10-08 13:31:09 +08:00
|
|
|
long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu);
|
2018-10-08 13:31:04 +08:00
|
|
|
int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu,
|
|
|
|
u64 time_limit, unsigned long lpcr);
|
|
|
|
void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr);
|
|
|
|
void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
|
|
|
|
struct hv_guest_state *hr);
|
2018-12-14 13:29:08 +08:00
|
|
|
long int kvmhv_nested_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu);
|
2018-10-08 13:31:03 +08:00
|
|
|
|
2018-05-23 15:02:07 +08:00
|
|
|
void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
|
|
|
|
|
2016-08-19 13:35:54 +08:00
|
|
|
extern int kvm_irq_bypass;
|
2009-10-30 13:47:05 +08:00
|
|
|
|
|
|
|
static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
|
|
|
|
{
|
2013-09-20 12:52:49 +08:00
|
|
|
return vcpu->arch.book3s;
|
2009-10-30 13:47:05 +08:00
|
|
|
}
|
|
|
|
|
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 08:21:34 +08:00
|
|
|
/* Also add subarch specific defines */
|
|
|
|
|
|
|
|
#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
|
|
|
|
#include <asm/kvm_book3s_32.h>
|
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
|
|
|
|
#include <asm/kvm_book3s_64.h>
|
|
|
|
#endif
|
|
|
|
|
2010-04-16 06:11:40 +08:00
|
|
|
/*
 * kvmppc_set_gpr - write one entry of the vcpu's regs.gpr[] array.
 * @vcpu: vcpu whose register state is updated
 * @num:  index into vcpu->arch.regs.gpr (not range-checked here)
 * @val:  value to store
 */
static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
{
	vcpu->arch.regs.gpr[num] = val;
}
|
|
|
|
|
|
|
|
static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
|
|
|
|
{
|
2018-05-07 14:20:07 +08:00
|
|
|
return vcpu->arch.regs.gpr[num];
|
2010-04-16 06:11:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * kvmppc_set_cr - store @val in vcpu->arch.regs.ccr.
 * @vcpu: vcpu whose register state is updated
 * @val:  new 32-bit value
 */
static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
{
	vcpu->arch.regs.ccr = val;
}
|
|
|
|
|
|
|
|
static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
|
|
|
|
{
|
2018-10-08 13:30:58 +08:00
|
|
|
return vcpu->arch.regs.ccr;
|
2010-04-16 06:11:40 +08:00
|
|
|
}
|
|
|
|
|
2015-05-27 07:56:57 +08:00
|
|
|
/*
 * kvmppc_set_xer - store @val in vcpu->arch.regs.xer.
 * @vcpu: vcpu whose register state is updated
 * @val:  new value
 */
static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
{
	vcpu->arch.regs.xer = val;
}
|
|
|
|
|
2015-05-27 07:56:57 +08:00
|
|
|
static inline ulong kvmppc_get_xer(struct kvm_vcpu *vcpu)
|
2010-04-16 06:11:40 +08:00
|
|
|
{
|
2018-05-07 14:20:08 +08:00
|
|
|
return vcpu->arch.regs.xer;
|
2010-04-16 06:11:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * kvmppc_set_ctr - store @val in vcpu->arch.regs.ctr.
 * @vcpu: vcpu whose register state is updated
 * @val:  new value
 */
static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val)
{
	vcpu->arch.regs.ctr = val;
}
|
|
|
|
|
|
|
|
static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu)
|
|
|
|
{
|
2018-05-07 14:20:08 +08:00
|
|
|
return vcpu->arch.regs.ctr;
|
2010-04-16 06:11:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * kvmppc_set_lr - store @val in vcpu->arch.regs.link.
 * @vcpu: vcpu whose register state is updated
 * @val:  new value
 */
static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val)
{
	vcpu->arch.regs.link = val;
}
|
|
|
|
|
|
|
|
static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu)
|
|
|
|
{
|
2018-05-07 14:20:08 +08:00
|
|
|
return vcpu->arch.regs.link;
|
2010-04-16 06:11:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * kvmppc_set_pc - store @val in vcpu->arch.regs.nip.
 * @vcpu: vcpu whose register state is updated
 * @val:  new value
 */
static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val)
{
	vcpu->arch.regs.nip = val;
}
|
|
|
|
|
|
|
|
static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
|
|
|
|
{
|
2018-05-07 14:20:08 +08:00
|
|
|
return vcpu->arch.regs.nip;
|
2010-04-16 06:11:40 +08:00
|
|
|
}
|
|
|
|
|
2014-04-24 19:46:24 +08:00
|
|
|
static inline u64 kvmppc_get_msr(struct kvm_vcpu *vcpu);
|
2014-01-09 18:51:16 +08:00
|
|
|
static inline bool kvmppc_need_byteswap(struct kvm_vcpu *vcpu)
|
2010-04-16 06:11:40 +08:00
|
|
|
{
|
2014-04-24 19:46:24 +08:00
|
|
|
return (kvmppc_get_msr(vcpu) & MSR_LE) != (MSR_KERNEL & MSR_LE);
|
2014-01-09 18:51:16 +08:00
|
|
|
}
|
2010-04-16 06:11:40 +08:00
|
|
|
|
|
|
|
/* Return the fault_dar value currently stored in vcpu->arch. */
static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
|
|
|
|
{
|
KVM: PPC: Book3S PR: Keep volatile reg values in vcpu rather than shadow_vcpu
Currently PR-style KVM keeps the volatile guest register values
(R0 - R13, CR, LR, CTR, XER, PC) in a shadow_vcpu struct rather than
the main kvm_vcpu struct. For 64-bit, the shadow_vcpu exists in two
places, a kmalloc'd struct and in the PACA, and it gets copied back
and forth in kvmppc_core_vcpu_load/put(), because the real-mode code
can't rely on being able to access the kmalloc'd struct.
This changes the code to copy the volatile values into the shadow_vcpu
as one of the last things done before entering the guest. Similarly
the values are copied back out of the shadow_vcpu to the kvm_vcpu
immediately after exiting the guest. We arrange for interrupts to be
still disabled at this point so that we can't get preempted on 64-bit
and end up copying values from the wrong PACA.
This means that the accessor functions in kvm_book3s.h for these
registers are greatly simplified, and are same between PR and HV KVM.
In places where accesses to shadow_vcpu fields are now replaced by
accesses to the kvm_vcpu, we can also remove the svcpu_get/put pairs.
Finally, on 64-bit, we don't need the kmalloc'd struct at all any more.
With this, the time to read the PVR one million times in a loop went
from 567.7ms to 575.5ms (averages of 6 values), an increase of about
1.4% for this worse-case test for guest entries and exits. The
standard deviation of the measurements is about 11ms, so the
difference is only marginally significant statistically.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2013-09-20 12:52:43 +08:00
|
|
|
return vcpu->arch.fault_dar;
|
|
|
|
}
|
|
|
|
|
2014-02-07 00:36:56 +08:00
|
|
|
static inline bool is_kvmppc_resume_guest(int r)
|
|
|
|
{
|
|
|
|
return (r == RESUME_GUEST || r == RESUME_GUEST_NV);
|
|
|
|
}
|
|
|
|
|
2014-06-20 20:43:36 +08:00
|
|
|
static inline bool is_kvmppc_hv_enabled(struct kvm *kvm);
|
|
|
|
static inline bool kvmppc_supports_magic_page(struct kvm_vcpu *vcpu)
|
|
|
|
{
|
|
|
|
/* Only PR KVM supports the magic page */
|
|
|
|
return !is_kvmppc_hv_enabled(vcpu->kvm);
|
|
|
|
}
|
|
|
|
|
kvmppc: Implement H_LOGICAL_CI_{LOAD,STORE} in KVM
On POWER, storage caching is usually configured via the MMU - attributes
such as cache-inhibited are stored in the TLB and the hashed page table.
This makes correctly performing cache inhibited IO accesses awkward when
the MMU is turned off (real mode). Some CPU models provide special
registers to control the cache attributes of real mode loads and stores but
this is not at all consistent. This is a problem in particular for SLOF,
the firmware used on KVM guests, which runs entirely in real mode, but
which needs to do IO to load the kernel.
To simplify this qemu implements two special hypercalls, H_LOGICAL_CI_LOAD
and H_LOGICAL_CI_STORE which simulate a cache-inhibited load or store to
a logical address (aka guest physical address). SLOF uses these for IO.
However, because these are implemented within qemu, not the host kernel,
these bypass any IO devices emulated within KVM itself. The simplest way
to see this problem is to attempt to boot a KVM guest from a virtio-blk
device with iothread / dataplane enabled. The iothread code relies on an
in kernel implementation of the virtio queue notification, which is not
triggered by the IO hcalls, and so the guest will stall in SLOF unable to
load the guest OS.
This patch addresses this by providing in-kernel implementations of the
2 hypercalls, which correctly scan the KVM IO bus. Any access to an
address not handled by the KVM IO bus will cause a VM exit, hitting the
qemu implementation as before.
Note that a userspace change is also required, in order to enable these
new hcall implementations with KVM_CAP_PPC_ENABLE_HCALL.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[agraf: fix compilation]
Signed-off-by: Alexander Graf <agraf@suse.de>
2015-02-05 08:53:25 +08:00
|
|
|
/*
 * In-kernel handler for the H_LOGICAL_CI_LOAD hypercall (simulated
 * cache-inhibited load from a guest logical address).
 * NOTE(review): the meaning of the int return value is not visible in this
 * header; check the definition before relying on it.
 */
extern int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu);
|
|
|
|
/*
 * In-kernel handler for the H_LOGICAL_CI_STORE hypercall (simulated
 * cache-inhibited store to a guest logical address).
 * NOTE(review): the meaning of the int return value is not visible in this
 * header; check the definition before relying on it.
 */
extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu);
|
|
|
|
|
2010-03-25 04:48:30 +08:00
|
|
|
/* Magic register values loaded into r3 and r4 before the 'sc' assembly
|
|
|
|
* instruction for the OSI hypercalls */
|
|
|
|
/* Magic value placed in r3 before 'sc' to mark an OSI hypercall. */
#define OSI_SC_MAGIC_R3 0x113724FA
|
|
|
|
/* Magic value placed in r4 before 'sc' to mark an OSI hypercall. */
#define OSI_SC_MAGIC_R4 0x77810F9B
|
|
|
|
|
2009-10-30 13:47:05 +08:00
|
|
|
/*
 * Instruction word for dcbz: primary opcode 31, extended opcode 1014,
 * with all register fields zero.
 */
#define INS_DCBZ 0x7c0007ec
|
2013-03-21 04:24:58 +08:00
|
|
|
/* TO = 31 for unconditional trap */
|
|
|
|
/*
 * Instruction word for tw: primary opcode 31, extended opcode 4, with the
 * TO field set to 31 and both register fields zero.
 */
#define INS_TW 0x7fe00008
|
2009-10-30 13:47:05 +08:00
|
|
|
|
2014-07-11 08:58:58 +08:00
|
|
|
/*
 * Selects the top 8 bits of a 32-bit value.
 * NOTE(review): presumably applied to guest addresses together with
 * SPLIT_HACK_OFFS; the users are not visible here, so confirm before
 * changing.
 */
#define SPLIT_HACK_MASK 0xff000000
|
|
|
|
/*
 * Value whose set bits all fall inside SPLIT_HACK_MASK (top byte 0xfb).
 * NOTE(review): presumably the offset applied to, or matched against,
 * addresses for the "split hack"; confirm against the users of this macro.
 */
#define SPLIT_HACK_OFFS 0xfb000000
|
|
|
|
|
KVM: PPC: Book3S HV: Pack VCORE IDs to access full VCPU ID space
It is not currently possible to create the full number of possible
VCPUs (KVM_MAX_VCPUS) on Power9 with KVM-HV when the guest uses fewer
threads per core than its core stride (or "VSMT mode"). This is
because the VCORE ID and XIVE offsets grow beyond KVM_MAX_VCPUS
even though the VCPU ID is less than KVM_MAX_VCPU_ID.
To address this, "pack" the VCORE ID and XIVE offsets by using
knowledge of the way the VCPU IDs will be used when there are fewer
guest threads per core than the core stride. The primary thread of
each core will always be used first. Then, if the guest uses more than
one thread per core, these secondary threads will sequentially follow
the primary in each core.
So, the only way an ID above KVM_MAX_VCPUS can be seen, is if the
VCPUs are being spaced apart, so at least half of each core is empty,
and IDs between KVM_MAX_VCPUS and (KVM_MAX_VCPUS * 2) can be mapped
into the second half of each core (4..7, in an 8-thread core).
Similarly, if IDs above KVM_MAX_VCPUS * 2 are seen, at least 3/4 of
each core is being left empty, and we can map down into the second and
third quarters of each core (2, 3 and 5, 6 in an 8-thread core).
Lastly, if IDs above KVM_MAX_VCPUS * 4 are seen, only the primary
threads are being used and 7/8 of the core is empty, allowing use of
the 1, 5, 3 and 7 thread slots.
(Strides less than 8 are handled similarly.)
This allows the VCORE ID or offset to be calculated quickly from the
VCPU ID or XIVE server numbers, without access to the VCPU structure.
[paulus@ozlabs.org - tidied up comment a little, changed some WARN_ONCE
to pr_devel, wrapped line, fixed id check.]
Signed-off-by: Sam Bobroff <sam.bobroff@au1.ibm.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
2018-07-25 14:12:02 +08:00
|
|
|
/*
|
|
|
|
* This packs a VCPU ID from the [0..KVM_MAX_VCPU_ID) space down to the
|
|
|
|
* [0..KVM_MAX_VCPUS) space, using knowledge of the guest's core stride
|
|
|
|
* (but not its actual threading mode, which is not available) to avoid
|
|
|
|
* collisions.
|
|
|
|
*
|
|
|
|
* The implementation leaves VCPU IDs from the range [0..KVM_MAX_VCPUS) (block
|
|
|
|
* 0) unchanged: if the guest is filling each VCORE completely then it will be
|
|
|
|
* using consecutive IDs and it will fill the space without any packing.
|
|
|
|
*
|
|
|
|
* For higher VCPU IDs, the packed ID is based on the VCPU ID modulo
|
|
|
|
* KVM_MAX_VCPUS (effectively masking off the top bits) and then an offset is
|
|
|
|
* added to avoid collisions.
|
|
|
|
*
|
|
|
|
* VCPU IDs in the range [KVM_MAX_VCPUS..(KVM_MAX_VCPUS*2)) (block 1) are only
|
|
|
|
* possible if the guest is leaving at least 1/2 of each VCORE empty, so IDs
|
|
|
|
* can be safely packed into the second half of each VCORE by adding an offset
|
|
|
|
* of (stride / 2).
|
|
|
|
*
|
|
|
|
* Similarly, if VCPU IDs in the range [(KVM_MAX_VCPUS*2)..(KVM_MAX_VCPUS*4))
|
|
|
|
* (blocks 2 and 3) are seen, the guest must be leaving at least 3/4 of each
|
|
|
|
* VCORE empty so packed IDs can be offset by (stride / 4) and (stride * 3 / 4).
|
|
|
|
*
|
|
|
|
* Finally, VCPU IDs from blocks 5..7 will only be seen if the guest is using a
|
|
|
|
* stride of 8 and 1 thread per core so the remaining offsets of 1, 5, 3 and 7
|
|
|
|
* must be free to use.
|
|
|
|
*
|
|
|
|
* (The offsets for each block are stored in block_offsets[], indexed by the
|
|
|
|
* block number if the stride is 8. For cases where the guest's stride is less
|
|
|
|
* than 8, we can re-use the block_offsets array by multiplying the block
|
|
|
|
* number by (MAX_SMT_THREADS / stride) to reach the correct entry.)
|
|
|
|
*/
|
|
|
|
static inline u32 kvmppc_pack_vcpu_id(struct kvm *kvm, u32 id)
|
|
|
|
{
|
|
|
|
const int block_offsets[MAX_SMT_THREADS] = {0, 4, 2, 6, 1, 5, 3, 7};
|
|
|
|
int stride = kvm->arch.emul_smt_mode;
|
|
|
|
int block = (id / KVM_MAX_VCPUS) * (MAX_SMT_THREADS / stride);
|
|
|
|
u32 packed_id;
|
|
|
|
|
|
|
|
if (WARN_ONCE(block >= MAX_SMT_THREADS, "VCPU ID too large to pack"))
|
|
|
|
return 0;
|
|
|
|
packed_id = (id % KVM_MAX_VCPUS) + block_offsets[block];
|
|
|
|
if (WARN_ONCE(packed_id >= KVM_MAX_VCPUS, "VCPU ID packing failed"))
|
|
|
|
return 0;
|
|
|
|
return packed_id;
|
|
|
|
}
|
|
|
|
|
2009-10-30 13:47:05 +08:00
|
|
|
#endif /* __ASM_KVM_BOOK3S_H__ */
|