KVM: PPC: Book3S HV: Provide a method for userspace to read and write the HPT

A new ioctl, KVM_PPC_GET_HTAB_FD, returns a file descriptor.  Reads on
this fd return the contents of the HPT (hashed page table), writes
create and/or remove entries in the HPT.  There is a new capability,
KVM_CAP_PPC_HTAB_FD, to indicate the presence of the ioctl.  The ioctl
takes an argument structure with the index of the first HPT entry to
read out and a set of flags.  The flags indicate whether the user is
intending to read or write the HPT, and whether to return all entries
or only the "bolted" entries (those with the bolted bit, 0x10, set in
the first doubleword).

This is intended for use in implementing qemu's savevm/loadvm and for
live migration.  Therefore, on reads, the first pass returns information
about all HPTEs (or all bolted HPTEs).  When the first pass reaches the
end of the HPT, it returns from the read.  Subsequent reads only return
information about HPTEs that have changed since they were last read.
A read that finds no changed HPTEs in the HPT following where the last
read finished will return 0 bytes.

The format of the data provides a simple run-length compression of the
invalid entries.  Each block of data starts with a header that indicates
the index (position in the HPT, which is just an array), the number of
valid entries starting at that index (may be zero), and the number of
invalid entries following those valid entries.  The valid entries, 16
bytes each, follow the header.  The invalid entries are not explicitly
represented.

Signed-off-by: Paul Mackerras <paulus@samba.org>
[agraf: fix documentation]
Signed-off-by: Alexander Graf <agraf@suse.de>
This commit is contained in:
Paul Mackerras 2012-11-19 22:57:20 +00:00 committed by Alexander Graf
parent 6b445ad4f8
commit a2932923cc
8 changed files with 467 additions and 12 deletions

View File

@ -2071,6 +2071,60 @@ KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm
Note that the vcpu ioctl is asynchronous to vcpu execution.
4.78 KVM_PPC_GET_HTAB_FD
Capability: KVM_CAP_PPC_HTAB_FD
Architectures: powerpc
Type: vm ioctl
Parameters: Pointer to struct kvm_get_htab_fd (in)
Returns: file descriptor number (>= 0) on success, -1 on error
This returns a file descriptor that can be used either to read out the
entries in the guest's hashed page table (HPT), or to write entries to
initialize the HPT. The returned fd can only be written to if the
KVM_GET_HTAB_WRITE bit is set in the flags field of the argument, and
can only be read if that bit is clear. The argument struct looks like
this:
/* For KVM_PPC_GET_HTAB_FD */
struct kvm_get_htab_fd {
__u64 flags;
__u64 start_index;
__u64 reserved[2];
};
/* Values for kvm_get_htab_fd.flags */
#define KVM_GET_HTAB_BOLTED_ONLY ((__u64)0x1)
#define KVM_GET_HTAB_WRITE ((__u64)0x2)
The `start_index' field gives the index in the HPT of the entry at
which to start reading. It is ignored when writing.
Reads on the fd will initially supply information about all
"interesting" HPT entries. Interesting entries are those with the
bolted bit set, if the KVM_GET_HTAB_BOLTED_ONLY bit is set, otherwise
all entries. When the end of the HPT is reached, the read() will
return. If read() is called again on the fd, it will start again from
the beginning of the HPT, but will only return HPT entries that have
changed since they were last read.
Data read or written is structured as a header (8 bytes) followed by a
series of valid HPT entries (16 bytes) each. The header indicates how
many valid HPT entries there are and how many invalid entries follow
the valid entries. The invalid entries are not represented explicitly
in the stream. The header format is:
struct kvm_get_htab_header {
__u32 index;
__u16 n_valid;
__u16 n_invalid;
};
Writes to the fd create HPT entries starting at the index given in the
header; first `n_valid' valid entries with contents from the data
written, then `n_invalid' invalid entries, invalidating any previously
valid entries found.
5. The kvm_run structure
------------------------

View File

@ -246,4 +246,26 @@ static inline bool slot_is_aligned(struct kvm_memory_slot *memslot,
return !(memslot->base_gfn & mask) && !(memslot->npages & mask);
}
/*
* This works for 4k, 64k and 16M pages on POWER7,
* and 4k and 16M pages on PPC970.
*/
static inline unsigned long slb_pgsize_encoding(unsigned long psize)
{
unsigned long senc = 0;
if (psize > 0x1000) {
senc = SLB_VSID_L;
if (psize == 0x10000)
senc |= SLB_VSID_LP_01;
}
return senc;
}
static inline int is_vrma_hpte(unsigned long hpte_v)
{
return (hpte_v & ~0xffffffUL) ==
(HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)));
}
#endif /* __ASM_KVM_BOOK3S_64_H__ */

View File

@ -164,6 +164,8 @@ extern void kvmppc_bookehv_exit(void);
extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu);
extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *);
/*
* Cuts out inst bits with ordering according to spec.
* That means the leftmost bit is zero. All given bits are included.

View File

@ -331,6 +331,31 @@ struct kvm_book3e_206_tlb_params {
__u32 reserved[8];
};
/* For KVM_PPC_GET_HTAB_FD */
struct kvm_get_htab_fd {
__u64 flags;
__u64 start_index;
__u64 reserved[2];
};
/* Values for kvm_get_htab_fd.flags */
#define KVM_GET_HTAB_BOLTED_ONLY ((__u64)0x1)
#define KVM_GET_HTAB_WRITE ((__u64)0x2)
/*
* Data read on the file descriptor is formatted as a series of
* records, each consisting of a header followed by a series of
* `n_valid' HPTEs (16 bytes each), which are all valid. Following
* those valid HPTEs there are `n_invalid' invalid HPTEs, which
* are not represented explicitly in the stream. The same format
* is used for writing.
*/
struct kvm_get_htab_header {
__u32 index;
__u16 n_valid;
__u16 n_invalid;
};
#define KVM_REG_PPC_HIOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1)
#define KVM_REG_PPC_IAC1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x2)
#define KVM_REG_PPC_IAC2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3)

View File

@ -25,6 +25,8 @@
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/srcu.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <asm/tlbflush.h>
#include <asm/kvm_ppc.h>
@ -1145,6 +1147,348 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
put_page(page);
}
/*
* Functions for reading and writing the hash table via reads and
* writes on a file descriptor.
*
* Reads return the guest view of the hash table, which has to be
* pieced together from the real hash table and the guest_rpte
* values in the revmap array.
*
* On writes, each HPTE written is considered in turn, and if it
* is valid, it is written to the HPT as if an H_ENTER with the
* exact flag set was done. When the invalid count is non-zero
* in the header written to the stream, the kernel will make
* sure that that many HPTEs are invalid, and invalidate them
* if not.
*/
struct kvm_htab_ctx {
unsigned long index;
unsigned long flags;
struct kvm *kvm;
int first_pass;
};
#define HPTE_SIZE (2 * sizeof(unsigned long))
static long record_hpte(unsigned long flags, unsigned long *hptp,
unsigned long *hpte, struct revmap_entry *revp,
int want_valid, int first_pass)
{
unsigned long v, r;
int ok = 1;
int valid, dirty;
/* Unmodified entries are uninteresting except on the first pass */
dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
if (!first_pass && !dirty)
return 0;
valid = 0;
if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) {
valid = 1;
if ((flags & KVM_GET_HTAB_BOLTED_ONLY) &&
!(hptp[0] & HPTE_V_BOLTED))
valid = 0;
}
if (valid != want_valid)
return 0;
v = r = 0;
if (valid || dirty) {
/* lock the HPTE so it's stable and read it */
preempt_disable();
while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
cpu_relax();
v = hptp[0];
if (v & HPTE_V_ABSENT) {
v &= ~HPTE_V_ABSENT;
v |= HPTE_V_VALID;
}
/* re-evaluate valid and dirty from synchronized HPTE value */
valid = !!(v & HPTE_V_VALID);
if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
valid = 0;
r = revp->guest_rpte | (hptp[1] & (HPTE_R_R | HPTE_R_C));
dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
/* only clear modified if this is the right sort of entry */
if (valid == want_valid && dirty) {
r &= ~HPTE_GR_MODIFIED;
revp->guest_rpte = r;
}
asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
hptp[0] &= ~HPTE_V_HVLOCK;
preempt_enable();
if (!(valid == want_valid && (first_pass || dirty)))
ok = 0;
}
hpte[0] = v;
hpte[1] = r;
return ok;
}
static ssize_t kvm_htab_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
struct kvm_htab_ctx *ctx = file->private_data;
struct kvm *kvm = ctx->kvm;
struct kvm_get_htab_header hdr;
unsigned long *hptp;
struct revmap_entry *revp;
unsigned long i, nb, nw;
unsigned long __user *lbuf;
struct kvm_get_htab_header __user *hptr;
unsigned long flags;
int first_pass;
unsigned long hpte[2];
if (!access_ok(VERIFY_WRITE, buf, count))
return -EFAULT;
first_pass = ctx->first_pass;
flags = ctx->flags;
i = ctx->index;
hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
revp = kvm->arch.revmap + i;
lbuf = (unsigned long __user *)buf;
nb = 0;
while (nb + sizeof(hdr) + HPTE_SIZE < count) {
/* Initialize header */
hptr = (struct kvm_get_htab_header __user *)buf;
hdr.index = i;
hdr.n_valid = 0;
hdr.n_invalid = 0;
nw = nb;
nb += sizeof(hdr);
lbuf = (unsigned long __user *)(buf + sizeof(hdr));
/* Skip uninteresting entries, i.e. clean on not-first pass */
if (!first_pass) {
while (i < kvm->arch.hpt_npte &&
!(revp->guest_rpte & HPTE_GR_MODIFIED)) {
++i;
hptp += 2;
++revp;
}
}
/* Grab a series of valid entries */
while (i < kvm->arch.hpt_npte &&
hdr.n_valid < 0xffff &&
nb + HPTE_SIZE < count &&
record_hpte(flags, hptp, hpte, revp, 1, first_pass)) {
/* valid entry, write it out */
++hdr.n_valid;
if (__put_user(hpte[0], lbuf) ||
__put_user(hpte[1], lbuf + 1))
return -EFAULT;
nb += HPTE_SIZE;
lbuf += 2;
++i;
hptp += 2;
++revp;
}
/* Now skip invalid entries while we can */
while (i < kvm->arch.hpt_npte &&
hdr.n_invalid < 0xffff &&
record_hpte(flags, hptp, hpte, revp, 0, first_pass)) {
/* found an invalid entry */
++hdr.n_invalid;
++i;
hptp += 2;
++revp;
}
if (hdr.n_valid || hdr.n_invalid) {
/* write back the header */
if (__copy_to_user(hptr, &hdr, sizeof(hdr)))
return -EFAULT;
nw = nb;
buf = (char __user *)lbuf;
} else {
nb = nw;
}
/* Check if we've wrapped around the hash table */
if (i >= kvm->arch.hpt_npte) {
i = 0;
ctx->first_pass = 0;
break;
}
}
ctx->index = i;
return nb;
}
static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct kvm_htab_ctx *ctx = file->private_data;
struct kvm *kvm = ctx->kvm;
struct kvm_get_htab_header hdr;
unsigned long i, j;
unsigned long v, r;
unsigned long __user *lbuf;
unsigned long *hptp;
unsigned long tmp[2];
ssize_t nb;
long int err, ret;
int rma_setup;
if (!access_ok(VERIFY_READ, buf, count))
return -EFAULT;
/* lock out vcpus from running while we're doing this */
mutex_lock(&kvm->lock);
rma_setup = kvm->arch.rma_setup_done;
if (rma_setup) {
kvm->arch.rma_setup_done = 0; /* temporarily */
/* order rma_setup_done vs. vcpus_running */
smp_mb();
if (atomic_read(&kvm->arch.vcpus_running)) {
kvm->arch.rma_setup_done = 1;
mutex_unlock(&kvm->lock);
return -EBUSY;
}
}
err = 0;
for (nb = 0; nb + sizeof(hdr) <= count; ) {
err = -EFAULT;
if (__copy_from_user(&hdr, buf, sizeof(hdr)))
break;
err = 0;
if (nb + hdr.n_valid * HPTE_SIZE > count)
break;
nb += sizeof(hdr);
buf += sizeof(hdr);
err = -EINVAL;
i = hdr.index;
if (i >= kvm->arch.hpt_npte ||
i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte)
break;
hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
lbuf = (unsigned long __user *)buf;
for (j = 0; j < hdr.n_valid; ++j) {
err = -EFAULT;
if (__get_user(v, lbuf) || __get_user(r, lbuf + 1))
goto out;
err = -EINVAL;
if (!(v & HPTE_V_VALID))
goto out;
lbuf += 2;
nb += HPTE_SIZE;
if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT))
kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
err = -EIO;
ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r,
tmp);
if (ret != H_SUCCESS) {
pr_err("kvm_htab_write ret %ld i=%ld v=%lx "
"r=%lx\n", ret, i, v, r);
goto out;
}
if (!rma_setup && is_vrma_hpte(v)) {
unsigned long psize = hpte_page_size(v, r);
unsigned long senc = slb_pgsize_encoding(psize);
unsigned long lpcr;
kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
(VRMA_VSID << SLB_VSID_SHIFT_1T);
lpcr = kvm->arch.lpcr & ~LPCR_VRMASD;
lpcr |= senc << (LPCR_VRMASD_SH - 4);
kvm->arch.lpcr = lpcr;
rma_setup = 1;
}
++i;
hptp += 2;
}
for (j = 0; j < hdr.n_invalid; ++j) {
if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT))
kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
++i;
hptp += 2;
}
err = 0;
}
out:
/* Order HPTE updates vs. rma_setup_done */
smp_wmb();
kvm->arch.rma_setup_done = rma_setup;
mutex_unlock(&kvm->lock);
if (err)
return err;
return nb;
}
static int kvm_htab_release(struct inode *inode, struct file *filp)
{
struct kvm_htab_ctx *ctx = filp->private_data;
filp->private_data = NULL;
if (!(ctx->flags & KVM_GET_HTAB_WRITE))
atomic_dec(&ctx->kvm->arch.hpte_mod_interest);
kvm_put_kvm(ctx->kvm);
kfree(ctx);
return 0;
}
static struct file_operations kvm_htab_fops = {
.read = kvm_htab_read,
.write = kvm_htab_write,
.llseek = default_llseek,
.release = kvm_htab_release,
};
int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
{
int ret;
struct kvm_htab_ctx *ctx;
int rwflag;
/* reject flags we don't recognize */
if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE))
return -EINVAL;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return -ENOMEM;
kvm_get_kvm(kvm);
ctx->kvm = kvm;
ctx->index = ghf->start_index;
ctx->flags = ghf->flags;
ctx->first_pass = 1;
rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag);
if (ret < 0) {
kvm_put_kvm(kvm);
return ret;
}
if (rwflag == O_RDONLY) {
mutex_lock(&kvm->slots_lock);
atomic_inc(&kvm->arch.hpte_mod_interest);
/* make sure kvmppc_do_h_enter etc. see the increment */
synchronize_srcu_expedited(&kvm->srcu);
mutex_unlock(&kvm->slots_lock);
}
return ret;
}
void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
{
struct kvmppc_mmu *mmu = &vcpu->arch.mmu;

View File

@ -1563,18 +1563,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
return r;
}
static unsigned long slb_pgsize_encoding(unsigned long psize)
{
unsigned long senc = 0;
if (psize > 0x1000) {
senc = SLB_VSID_L;
if (psize == 0x10000)
senc |= SLB_VSID_LP_01;
}
return senc;
}
static void unpin_slot(struct kvm_memory_slot *memslot)
{
unsigned long *physp;

View File

@ -354,6 +354,12 @@ int kvm_dev_ioctl_check_extension(long ext)
r = 1;
#else
r = 0;
break;
#endif
#ifdef CONFIG_KVM_BOOK3S_64_HV
case KVM_CAP_PPC_HTAB_FD:
r = 1;
break;
#endif
break;
case KVM_CAP_NR_VCPUS:
@ -954,6 +960,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = 0;
break;
}
case KVM_PPC_GET_HTAB_FD: {
struct kvm *kvm = filp->private_data;
struct kvm_get_htab_fd ghf;
r = -EFAULT;
if (copy_from_user(&ghf, argp, sizeof(ghf)))
break;
r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf);
break;
}
#endif /* CONFIG_KVM_BOOK3S_64_HV */
#ifdef CONFIG_PPC_BOOK3S_64

View File

@ -634,6 +634,7 @@ struct kvm_ppc_smmu_info {
#endif
#define KVM_CAP_IRQFD_RESAMPLE 82
#define KVM_CAP_PPC_BOOKE_WATCHDOG 83
#define KVM_CAP_PPC_HTAB_FD 84
#ifdef KVM_CAP_IRQ_ROUTING
@ -859,6 +860,8 @@ struct kvm_s390_ucas_mapping {
#define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce)
/* Available with KVM_CAP_RMA */
#define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma)
/* Available with KVM_CAP_PPC_HTAB_FD */
#define KVM_PPC_GET_HTAB_FD _IOW(KVMIO, 0xaa, struct kvm_get_htab_fd)
/*
* ioctls for vcpu fds