s390/mm: add shadow gmap support

For a nested KVM guest the outer KVM host needs to create shadow
page tables for the nested guest. This patch adds the basic support
to the guest address space (gmap) code.

For each guest address space the inner KVM host creates, the
outer KVM host needs to create shadow page tables. The address space
is identified by the ASCE loaded into control register 1 at the
time the inner SIE instruction for the second nested KVM guest is
executed. The outer KVM host creates the shadow tables, starting with
the table identified by the ASCE, on an on-demand basis. The outer KVM
host will get repeated faults for all the shadow tables needed to
run the second KVM guest.
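
A minimal sketch of this fault-driven flow, assuming a hypothetical
handle_shadow_fault() helper (its name, the error convention of
gmap_shadow() and the origin of saddr/paddr are illustrative; only the
gmap_shadow() and gmap_shadow_page() calls below are added by this
patch):

	/*
	 * Illustrative only: find or create the shadow gmap for the
	 * ASCE taken from control register 1, then map the faulting
	 * page into the shadow tables. The intermediate shadow region
	 * and segment tables are filled in by gmap_shadow_r2t/r3t/
	 * sgt/pgt as further faults come in.
	 */
	static int handle_shadow_fault(struct gmap *parent, unsigned long asce,
				       unsigned long saddr, unsigned long paddr,
				       int write)
	{
		struct gmap *sg;

		sg = gmap_shadow(parent, asce);
		if (IS_ERR_OR_NULL(sg))		/* assumed error convention */
			return sg ? PTR_ERR(sg) : -ENOMEM;
		return gmap_shadow_page(sg, saddr, paddr, write);
	}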

While a shadow page table for the second KVM guest is active, access
to the origin region, segment and page tables needs to be restricted
for the first KVM guest. For region, segment and page tables the first
KVM guest may read the memory, but a write attempt has to lead to an
unshadow.  This is done using the page invalid and read-only bits in the
page table of the first KVM guest. If the first guest re-accesses one of
the origin pages of a shadow, it gets a fault and the affected parts of
the shadow page table hierarchy need to be removed again.
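
In terms of the primitives added below, the restriction could be applied
roughly as follows (a sketch only; the lookup of vmaddr/ptep for the
origin table page and the surrounding locking and retry logic are
omitted):

	/*
	 * Illustrative only: make the origin page read-only and tag its
	 * pgste with PGSTE_VSIE_BIT. A later write by the first guest
	 * then triggers ptep_notify() with PGSTE_VSIE_BIT set, which is
	 * the cue to unshadow the affected shadow tables.
	 */
	rc = ptep_force_prot(mm, vmaddr, ptep, PROT_READ, PGSTE_VSIE_BIT);
	if (rc == -EAGAIN)
		return rc;	/* pte invalid or already protected, retry */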

PGSTE tables don't have to be shadowed, as the interpretation assists
can't deal with the invalid bits in the shadow pte being set differently
than the original ones provided by the first KVM guest.

Many bug fixes and improvements by David Hildenbrand.

Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Martin Schwidefsky, 2016-03-08 12:12:18 +01:00 (committed by Christian Borntraeger)
parent 6ea427bbbd
commit 4be130a084
8 changed files with 1271 additions and 43 deletions

arch/s390/include/asm/gmap.h

@@ -10,6 +10,7 @@
 /**
  * struct gmap_struct - guest address space
+ * @list: list head for the mm->context gmap list
  * @crst_list: list of all crst tables used in the guest address space
  * @mm: pointer to the parent mm_struct
  * @guest_to_host: radix tree with guest to host address translation
@@ -19,6 +20,13 @@
  * @table: pointer to the page directory
  * @asce: address space control element for gmap page table
  * @pfault_enabled: defines if pfaults are applicable for the guest
+ * @host_to_rmap: radix tree with gmap_rmap lists
+ * @children: list of shadow gmap structures
+ * @pt_list: list of all page tables used in the shadow guest address space
+ * @shadow_lock: spinlock to protect the shadow gmap list
+ * @parent: pointer to the parent gmap for shadow guest address spaces
+ * @orig_asce: ASCE for which the shadow page table has been created
+ * @removed: flag to indicate if a shadow guest address space has been removed
  */
 struct gmap {
 	struct list_head list;
@@ -33,8 +41,32 @@ struct gmap {
 	unsigned long asce_end;
 	void *private;
 	bool pfault_enabled;
+	/* Additional data for shadow guest address spaces */
+	struct radix_tree_root host_to_rmap;
+	struct list_head children;
+	struct list_head pt_list;
+	spinlock_t shadow_lock;
+	struct gmap *parent;
+	unsigned long orig_asce;
+	bool removed;
 };
 
+/**
+ * struct gmap_rmap - reverse mapping for shadow page table entries
+ * @next: pointer to next rmap in the list
+ * @raddr: virtual rmap address in the shadow guest address space
+ */
+struct gmap_rmap {
+	struct gmap_rmap *next;
+	unsigned long raddr;
+};
+
+#define gmap_for_each_rmap(pos, head) \
+	for (pos = (head); pos; pos = pos->next)
+
+#define gmap_for_each_rmap_safe(pos, n, head) \
+	for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n)
+
 /**
  * struct gmap_notifier - notify function block for page invalidation
  * @notifier_call: address of callback function
@@ -46,6 +78,11 @@ struct gmap_notifier {
 			    unsigned long end);
 };
 
+static inline int gmap_is_shadow(struct gmap *gmap)
+{
+	return !!gmap->parent;
+}
+
 struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit);
 void gmap_remove(struct gmap *gmap);
 struct gmap *gmap_get(struct gmap *gmap);
@@ -64,9 +101,22 @@ void gmap_discard(struct gmap *, unsigned long from, unsigned long to);
 void __gmap_zap(struct gmap *, unsigned long gaddr);
 void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr);
 
+int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val);
+
+struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce);
+int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t);
+int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t);
+int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt);
+int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt);
+int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
+			   unsigned long *pgt, int *dat_protection);
+int gmap_shadow_page(struct gmap *sg, unsigned long saddr,
+		     unsigned long paddr, int write);
+
 void gmap_register_pte_notifier(struct gmap_notifier *);
 void gmap_unregister_pte_notifier(struct gmap_notifier *);
-void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *);
+void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *,
+		     unsigned long bits);
 
 int gmap_mprotect_notify(struct gmap *, unsigned long start,
 			 unsigned long len, int prot);
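
The list heads for struct gmap_rmap chains live in the new host_to_rmap
radix tree of the shadow gmap. A plausible teardown loop with the _safe
iterator (a sketch; the radix-tree key convention and the sg/vmaddr
variables are assumed from context):

	struct gmap_rmap *rmap, *rnext, *head;

	/* remove and free the whole rmap chain for one host page */
	head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
	gmap_for_each_rmap_safe(rmap, rnext, head)
		kfree(rmap);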

arch/s390/include/asm/pgalloc.h

@@ -19,8 +19,10 @@ unsigned long *crst_table_alloc(struct mm_struct *);
 void crst_table_free(struct mm_struct *, unsigned long *);
 
 unsigned long *page_table_alloc(struct mm_struct *);
+struct page *page_table_alloc_pgste(struct mm_struct *mm);
 void page_table_free(struct mm_struct *, unsigned long *);
 void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long);
+void page_table_free_pgste(struct page *page);
 
 extern int page_table_allocate_pgste;
 
 static inline void clear_table(unsigned long *s, unsigned long val, size_t n)

arch/s390/include/asm/pgtable.h

@@ -256,6 +256,7 @@ static inline int is_module_addr(void *addr)
 /* Bits in the region table entry */
 #define _REGION_ENTRY_ORIGIN	~0xfffUL/* region/segment table origin	    */
 #define _REGION_ENTRY_PROTECT	0x200	/* region protection bit	    */
+#define _REGION_ENTRY_OFFSET	0xc0	/* region table offset		    */
 #define _REGION_ENTRY_INVALID	0x20	/* invalid region table entry	    */
 #define _REGION_ENTRY_TYPE_MASK	0x0c	/* region/segment table type mask   */
 #define _REGION_ENTRY_TYPE_R1	0x0c	/* region first table type	    */
@@ -327,6 +328,7 @@ static inline int is_module_addr(void *addr)
 #define PGSTE_GC_BIT	0x0002000000000000UL
 #define PGSTE_UC_BIT	0x0000800000000000UL	/* user dirty (migration) */
 #define PGSTE_IN_BIT	0x0000400000000000UL	/* IPTE notify bit */
+#define PGSTE_VSIE_BIT	0x0000200000000000UL	/* ref'd in a shadow table */
 
 /* Guest Page State used for virtualization */
 #define _PGSTE_GPS_ZERO		0x0000000080000000UL
@@ -885,12 +887,16 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
 void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, pte_t entry);
 void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void ptep_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void ptep_notify(struct mm_struct *mm, unsigned long addr,
+		 pte_t *ptep, unsigned long bits);
 int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr,
-		    pte_t *ptep, int prot);
+		    pte_t *ptep, int prot, unsigned long bit);
 void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep , int reset);
 void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+		    pte_t *sptep, pte_t *tptep, int write);
+void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep);
 bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address);
 
 int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,

arch/s390/include/asm/processor.h

@@ -109,6 +109,7 @@ struct thread_struct {
 	unsigned long ksp;		/* kernel stack pointer */
 	mm_segment_t mm_segment;
 	unsigned long gmap_addr;	/* address of last gmap fault. */
+	unsigned int gmap_write_flag;	/* gmap fault write indication */
 	unsigned int gmap_pfault;	/* signal of a pending guest pfault */
 	struct per_regs per_user;	/* User specified PER registers */
 	struct per_event per_event;	/* Cause of the last PER trap */

arch/s390/mm/fault.c

@@ -418,6 +418,7 @@ static inline int do_exception(struct pt_regs *regs, int access)
 		(struct gmap *) S390_lowcore.gmap : NULL;
 	if (gmap) {
 		current->thread.gmap_addr = address;
+		current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
 		address = __gmap_translate(gmap, address);
 		if (address == -EFAULT) {
 			fault = VM_FAULT_BADMAP;
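
The recorded write indication lets the nested-guest fault handling pick
the right protection when it maps the page into the shadow tables, e.g.
(a sketch; sg and paddr are assumed to come from the surrounding VSIE
fault handling):

	rc = gmap_shadow_page(sg, current->thread.gmap_addr, paddr,
			      current->thread.gmap_write_flag);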

arch/s390/mm/gmap.c (diff suppressed because it is too large)

arch/s390/mm/pgalloc.c

@@ -137,6 +137,29 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
 	return new;
 }
 
+#ifdef CONFIG_PGSTE
+
+struct page *page_table_alloc_pgste(struct mm_struct *mm)
+{
+	struct page *page;
+	unsigned long *table;
+
+	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
+	if (page) {
+		table = (unsigned long *) page_to_phys(page);
+		clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
+		clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
+	}
+	return page;
+}
+
+void page_table_free_pgste(struct page *page)
+{
+	__free_page(page);
+}
+
+#endif /* CONFIG_PGSTE */
+
 /*
  * page table entry allocation/free routines.
  */
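
A shadow page table thus occupies a full 4K page: 256 pte entries set to
_PAGE_INVALID in the first half, a zeroed pgste area in the second half.
Callers are expected to pair the two helpers (a sketch):

	struct page *page;

	page = page_table_alloc_pgste(mm);
	if (!page)
		return -ENOMEM;
	/* use (unsigned long *) page_to_phys(page) as the shadow pgt */
	page_table_free_pgste(page);	/* on teardown or error */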

arch/s390/mm/pgtable.c

@@ -184,9 +184,12 @@ static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
 				       pte_t *ptep, pgste_t pgste)
 {
 #ifdef CONFIG_PGSTE
-	if (pgste_val(pgste) & PGSTE_IN_BIT) {
-		pgste_val(pgste) &= ~PGSTE_IN_BIT;
-		ptep_notify(mm, addr, ptep);
+	unsigned long bits;
+
+	bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
+	if (bits) {
+		pgste_val(pgste) ^= bits;
+		ptep_notify(mm, addr, ptep, bits);
 	}
 #endif
 	return pgste;
@@ -420,12 +423,13 @@ void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
  * @addr: virtual address in the guest address space
  * @ptep: pointer to the page table entry
  * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bit: pgste bit to set (e.g. for notification)
  *
  * Returns 0 if the access rights were changed and -EAGAIN if the current
  * and requested access rights are incompatible.
  */
 int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
-		    pte_t *ptep, int prot)
+		    pte_t *ptep, int prot, unsigned long bit)
 {
 	pte_t entry;
 	pgste_t pgste;
@@ -441,7 +445,7 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
 		pgste_set_unlock(ptep, pgste);
 		return -EAGAIN;
 	}
-	/* Change access rights and set the pgste notification bit */
+	/* Change access rights and set pgste bit */
 	if (prot == PROT_NONE && !pte_i) {
 		ptep_flush_direct(mm, addr, ptep);
 		pgste = pgste_update_all(entry, pgste, mm);
@@ -452,12 +456,53 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
 		pte_val(entry) &= ~_PAGE_INVALID;
 		pte_val(entry) |= _PAGE_PROTECT;
 	}
-	pgste_val(pgste) |= PGSTE_IN_BIT;
+	pgste_val(pgste) |= bit;
 	pgste = pgste_set_pte(ptep, pgste, entry);
 	pgste_set_unlock(ptep, pgste);
 	return 0;
 }
 
+int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+		    pte_t *sptep, pte_t *tptep, int write)
+{
+	pgste_t spgste, tpgste;
+	pte_t spte, tpte;
+	int rc = -EAGAIN;
+
+	spgste = pgste_get_lock(sptep);
+	spte = *sptep;
+	if (!(pte_val(spte) & _PAGE_INVALID) &&
+	    !(pte_val(spte) & _PAGE_PROTECT)) {
+		rc = 0;
+		if (!(pte_val(*tptep) & _PAGE_INVALID))
+			/* Update existing mapping */
+			ptep_flush_direct(mm, saddr, tptep);
+		else
+			rc = 1;
+		pgste_val(spgste) |= PGSTE_VSIE_BIT;
+		tpgste = pgste_get_lock(tptep);
+		pte_val(tpte) = (pte_val(spte) & PAGE_MASK) |
+				(write ? 0 : _PAGE_PROTECT);
+		/* don't touch the storage key - it belongs to parent pgste */
+		tpgste = pgste_set_pte(tptep, tpgste, tpte);
+		pgste_set_unlock(tptep, tpgste);
+	}
+	pgste_set_unlock(sptep, spgste);
+	return rc;
+}
+
+void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
+{
+	pgste_t pgste;
+
+	pgste = pgste_get_lock(ptep);
+	/* notifier is called by the caller */
+	ptep_flush_direct(mm, saddr, ptep);
+	/* don't touch the storage key - it belongs to parent pgste */
+	pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
+	pgste_set_unlock(ptep, pgste);
+}
+
 static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
 {
 	if (!non_swap_entry(entry))
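
The intended caller of ptep_shadow_pte() is in the suppressed mm/gmap.c
diff: it resolves the source pte in the parent gmap and the target pte
in the shadow page table, then lets ptep_shadow_pte() copy the mapping.
A sketch of the calling convention (variable names assumed):

	rc = ptep_shadow_pte(mm, saddr, sptep, tptep, write);
	if (rc == 1) {
		/*
		 * A new shadow mapping was created - record a gmap_rmap
		 * entry so a later write to the source page can unshadow
		 * it again via ptep_unshadow_pte().
		 */
	}
	/* rc == 0: existing mapping updated; rc == -EAGAIN: retry */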