From a318c2432218d3cd189ec8228b8a795666899c2a Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Fri, 31 Aug 2018 14:34:02 +0200 Subject: [PATCH 01/53] mfd: da9063: Fix DT probing with constraints Commit 1c892e38ce59 ("regulator: da9063: Handle less LDOs on DA9063L") reordered the da9063_regulator_info[] array, but not the DA9063_ID_* regulator ids and not the da9063_matches[] array, because ids are used as indices in the array initializer. This mismatch between regulator id and da9063_regulator_info[] array index causes the driver probe to fail because constraints from DT are not applied to the correct regulator: da9063 0-0058: Device detected (chip-ID: 0x61, var-ID: 0x50) DA9063_BMEM: Bringing 900000uV into 3300000-3300000uV DA9063_LDO9: Bringing 3300000uV into 2500000-2500000uV DA9063_LDO1: Bringing 900000uV into 3300000-3300000uV DA9063_LDO1: failed to apply 3300000-3300000uV constraint(-22) This patch reorders the DA9063_ID_* as apparently intended, and with them the entries in the da90630_matches[] array. Fixes: 1c892e38ce59 ("regulator: da9063: Handle less LDOs on DA9063L") Signed-off-by: Philipp Zabel Reviewed-by: Marek Vasut Reviewed-by: Geert Uytterhoeven Signed-off-by: Lee Jones --- include/linux/mfd/da9063/pdata.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/include/linux/mfd/da9063/pdata.h b/include/linux/mfd/da9063/pdata.h index 8a125701ef7b..50bed4f89c1a 100644 --- a/include/linux/mfd/da9063/pdata.h +++ b/include/linux/mfd/da9063/pdata.h @@ -21,7 +21,7 @@ /* * Regulator configuration */ -/* DA9063 regulator IDs */ +/* DA9063 and DA9063L regulator IDs */ enum { /* BUCKs */ DA9063_ID_BCORE1, @@ -37,18 +37,20 @@ enum { DA9063_ID_BMEM_BIO_MERGED, /* When two BUCKs are merged, they cannot be reused separately */ - /* LDOs */ - DA9063_ID_LDO1, - DA9063_ID_LDO2, + /* LDOs on both DA9063 and DA9063L */ DA9063_ID_LDO3, - DA9063_ID_LDO4, - DA9063_ID_LDO5, - DA9063_ID_LDO6, DA9063_ID_LDO7, DA9063_ID_LDO8, DA9063_ID_LDO9, - DA9063_ID_LDO10, DA9063_ID_LDO11, + + /* DA9063-only LDOs */ + DA9063_ID_LDO1, + DA9063_ID_LDO2, + DA9063_ID_LDO4, + DA9063_ID_LDO5, + DA9063_ID_LDO6, + DA9063_ID_LDO10, }; /* Regulators platform data */ From 10492ee8ed9188d6d420e1f79b2b9bdbc0624e65 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Wed, 25 Apr 2018 07:29:22 -0700 Subject: [PATCH 02/53] mfd: omap-usb-host: Fix dts probe of children It currently only works if the parent bus uses "simple-bus". We currently try to probe children with non-existing compatible values. And we're missing .probe. I noticed this while testing devices configured to probe using ti-sysc interconnect target module driver. For that we also may want to rebind the driver, so let's remove __init and __exit. Signed-off-by: Tony Lindgren Acked-by: Roger Quadros Signed-off-by: Lee Jones --- drivers/mfd/omap-usb-host.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/mfd/omap-usb-host.c b/drivers/mfd/omap-usb-host.c index e11ab12fbdf2..800986a79704 100644 --- a/drivers/mfd/omap-usb-host.c +++ b/drivers/mfd/omap-usb-host.c @@ -528,8 +528,8 @@ static int usbhs_omap_get_dt_pdata(struct device *dev, } static const struct of_device_id usbhs_child_match_table[] = { - { .compatible = "ti,omap-ehci", }, - { .compatible = "ti,omap-ohci", }, + { .compatible = "ti,ehci-omap", }, + { .compatible = "ti,ohci-omap3", }, { } }; @@ -855,6 +855,7 @@ static struct platform_driver usbhs_omap_driver = { .pm = &usbhsomap_dev_pm_ops, .of_match_table = usbhs_omap_dt_ids, }, + .probe = usbhs_omap_probe, .remove = usbhs_omap_remove, }; @@ -864,9 +865,9 @@ MODULE_ALIAS("platform:" USBHS_DRIVER_NAME); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("usb host common core driver for omap EHCI and OHCI"); -static int __init omap_usbhs_drvinit(void) +static int omap_usbhs_drvinit(void) { - return platform_driver_probe(&usbhs_omap_driver, usbhs_omap_probe); + return platform_driver_register(&usbhs_omap_driver); } /* @@ -878,7 +879,7 @@ static int __init omap_usbhs_drvinit(void) */ fs_initcall_sync(omap_usbhs_drvinit); -static void __exit omap_usbhs_drvexit(void) +static void omap_usbhs_drvexit(void) { platform_driver_unregister(&usbhs_omap_driver); } From d310959365942b100f79b56bcce859968fe7ca9c Mon Sep 17 00:00:00 2001 From: Scott Branden Date: Tue, 11 Sep 2018 13:26:38 -0700 Subject: [PATCH 03/53] efi/libstub/arm: default EFI_ARMSTUB_DTB_LOADER to y Default EFI_ARMSTUB_DTB_LOADER to y to allow the dtb= command line parameter to function with efi loader. Required for development purposes and to boot on existing bootloaders that do not support devicetree provided by the firmware or by the bootloader. Fixes: 3d7ee348aa41 ("efi/libstub/arm: Add opt-in Kconfig option ...") Signed-off-by: Scott Branden Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/Kconfig | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index d8e159feb573..89110dfc7127 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -90,14 +90,17 @@ config EFI_ARMSTUB config EFI_ARMSTUB_DTB_LOADER bool "Enable the DTB loader" depends on EFI_ARMSTUB + default y help Select this config option to add support for the dtb= command line parameter, allowing a device tree blob to be loaded into memory from the EFI System Partition by the stub. - The device tree is typically provided by the platform or by - the bootloader, so this option is mostly for development - purposes only. + If the device tree is provided by the platform or by + the bootloader this option may not be needed. + But, for various development reasons and to maintain existing + functionality for bootloaders that do not have such support + this option is necessary. config EFI_BOOTLOADER_CONTROL tristate "EFI Bootloader Control" From b3f0907c71e006e12fde74ea9a745b6096b6f90f Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Fri, 14 Sep 2018 08:45:58 -0500 Subject: [PATCH 04/53] x86/mm: Add .bss..decrypted section to hold shared variables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvmclock defines few static variables which are shared with the hypervisor during the kvmclock initialization. When SEV is active, memory is encrypted with a guest-specific key, and if the guest OS wants to share the memory region with the hypervisor then it must clear the C-bit before sharing it. Currently, we use kernel_physical_mapping_init() to split large pages before clearing the C-bit on shared pages. But it fails when called from the kvmclock initialization (mainly because the memblock allocator is not ready that early during boot). Add a __bss_decrypted section attribute which can be used when defining such shared variable. The so-defined variables will be placed in the .bss..decrypted section. This section will be mapped with C=0 early during boot. The .bss..decrypted section has a big chunk of memory that may be unused when memory encryption is not active, free it when memory encryption is not active. Suggested-by: Thomas Gleixner Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Cc: Tom Lendacky Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Radim Krčmář Cc: kvm@vger.kernel.org Link: https://lkml.kernel.org/r/1536932759-12905-2-git-send-email-brijesh.singh@amd.com --- arch/x86/include/asm/mem_encrypt.h | 7 +++++++ arch/x86/kernel/head64.c | 16 ++++++++++++++++ arch/x86/kernel/vmlinux.lds.S | 19 +++++++++++++++++++ arch/x86/mm/init.c | 4 ++++ arch/x86/mm/mem_encrypt.c | 24 ++++++++++++++++++++++++ 5 files changed, 70 insertions(+) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index c0643831706e..616f8e637bc3 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -48,10 +48,13 @@ int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size); /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void); +void __init mem_encrypt_free_decrypted_mem(void); bool sme_active(void); bool sev_active(void); +#define __bss_decrypted __attribute__((__section__(".bss..decrypted"))) + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define sme_me_mask 0ULL @@ -77,6 +80,8 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; static inline int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; } +#define __bss_decrypted + #endif /* CONFIG_AMD_MEM_ENCRYPT */ /* @@ -88,6 +93,8 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; #define __sme_pa(x) (__pa(x) | sme_me_mask) #define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask) +extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[]; + #endif /* __ASSEMBLY__ */ #endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 8047379e575a..c16af27eb23f 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -112,6 +112,7 @@ static bool __head check_la57_support(unsigned long physaddr) unsigned long __head __startup_64(unsigned long physaddr, struct boot_params *bp) { + unsigned long vaddr, vaddr_end; unsigned long load_delta, *p; unsigned long pgtable_flags; pgdval_t *pgd; @@ -234,6 +235,21 @@ unsigned long __head __startup_64(unsigned long physaddr, /* Encrypt the kernel and related (if SME is active) */ sme_encrypt_kernel(bp); + /* + * Clear the memory encryption mask from the .bss..decrypted section. + * The bss section will be memset to zero later in the initialization so + * there is no need to zero it after changing the memory encryption + * attribute. + */ + if (mem_encrypt_active()) { + vaddr = (unsigned long)__start_bss_decrypted; + vaddr_end = (unsigned long)__end_bss_decrypted; + for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { + i = pmd_index(vaddr); + pmd[i] -= sme_get_me_mask(); + } + } + /* * Return the SME encryption mask (if SME is active) to be used as a * modifier for the initial pgdir entry programmed into CR3. diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 8bde0a419f86..5dd3317d761f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -65,6 +65,23 @@ jiffies_64 = jiffies; #define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); #define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE); +/* + * This section contains data which will be mapped as decrypted. Memory + * encryption operates on a page basis. Make this section PMD-aligned + * to avoid splitting the pages while mapping the section early. + * + * Note: We use a separate section so that only this section gets + * decrypted to avoid exposing more than we wish. + */ +#define BSS_DECRYPTED \ + . = ALIGN(PMD_SIZE); \ + __start_bss_decrypted = .; \ + *(.bss..decrypted); \ + . = ALIGN(PAGE_SIZE); \ + __start_bss_decrypted_unused = .; \ + . = ALIGN(PMD_SIZE); \ + __end_bss_decrypted = .; \ + #else #define X86_ALIGN_RODATA_BEGIN @@ -74,6 +91,7 @@ jiffies_64 = jiffies; #define ALIGN_ENTRY_TEXT_BEGIN #define ALIGN_ENTRY_TEXT_END +#define BSS_DECRYPTED #endif @@ -355,6 +373,7 @@ SECTIONS __bss_start = .; *(.bss..page_aligned) *(.bss) + BSS_DECRYPTED . = ALIGN(PAGE_SIZE); __bss_stop = .; } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 7a8fc26c1115..faca978ebf9d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -815,10 +815,14 @@ void free_kernel_image_pages(void *begin, void *end) set_memory_np_noalias(begin_ul, len_pages); } +void __weak mem_encrypt_free_decrypted_mem(void) { } + void __ref free_initmem(void) { e820__reallocate_tables(); + mem_encrypt_free_decrypted_mem(); + free_kernel_image_pages(&__init_begin, &__init_end); } diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index b2de398d1fd3..006f373f54ab 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -348,6 +348,30 @@ bool sev_active(void) EXPORT_SYMBOL(sev_active); /* Architecture __weak replacement functions */ +void __init mem_encrypt_free_decrypted_mem(void) +{ + unsigned long vaddr, vaddr_end, npages; + int r; + + vaddr = (unsigned long)__start_bss_decrypted_unused; + vaddr_end = (unsigned long)__end_bss_decrypted; + npages = (vaddr_end - vaddr) >> PAGE_SHIFT; + + /* + * The unused memory range was mapped decrypted, change the encryption + * attribute from decrypted to encrypted before freeing it. + */ + if (mem_encrypt_active()) { + r = set_memory_encrypted(vaddr, npages); + if (r) { + pr_warn("failed to free unused decrypted pages\n"); + return; + } + } + + free_init_pages("unused decrypted", vaddr, vaddr_end); +} + void __init mem_encrypt_init(void) { if (!sme_me_mask) From 6a1cac56f41f9ea94e440dfcc1cac44b41a1b194 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Fri, 14 Sep 2018 08:45:59 -0500 Subject: [PATCH 05/53] x86/kvm: Use __bss_decrypted attribute in shared variables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The recent removal of the memblock dependency from kvmclock caused a SEV guest regression because the wall_clock and hv_clock_boot variables are no longer mapped decrypted when SEV is active. Use the __bss_decrypted attribute to put the static wall_clock and hv_clock_boot in the .bss..decrypted section so that they are mapped decrypted during boot. In the preparatory stage of CPU hotplug, the per-cpu pvclock data pointer assigns either an element of the static array or dynamically allocated memory for the pvclock data pointer. The static array are now mapped decrypted but the dynamically allocated memory is not mapped decrypted. However, when SEV is active this memory range must be mapped decrypted. Add a function which is called after the page allocator is up, and allocate memory for the pvclock data pointers for the all possible cpus. Map this memory range as decrypted when SEV is active. Fixes: 368a540e0232 ("x86/kvmclock: Remove memblock dependency") Suggested-by: Thomas Gleixner Signed-off-by: Brijesh Singh Signed-off-by: Thomas Gleixner Cc: Tom Lendacky Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: Sean Christopherson Cc: "Radim Krčmář" Cc: kvm@vger.kernel.org Link: https://lkml.kernel.org/r/1536932759-12905-3-git-send-email-brijesh.singh@amd.com --- arch/x86/kernel/kvmclock.c | 52 +++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1e6764648af3..013fe3d21dbb 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -61,9 +62,10 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) static struct pvclock_vsyscall_time_info - hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE); -static struct pvclock_wall_clock wall_clock; + hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE); +static struct pvclock_wall_clock wall_clock __bss_decrypted; static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); +static struct pvclock_vsyscall_time_info *hvclock_mem; static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) { @@ -236,6 +238,45 @@ static void kvm_shutdown(void) native_machine_shutdown(); } +static void __init kvmclock_init_mem(void) +{ + unsigned long ncpus; + unsigned int order; + struct page *p; + int r; + + if (HVC_BOOT_ARRAY_SIZE >= num_possible_cpus()) + return; + + ncpus = num_possible_cpus() - HVC_BOOT_ARRAY_SIZE; + order = get_order(ncpus * sizeof(*hvclock_mem)); + + p = alloc_pages(GFP_KERNEL, order); + if (!p) { + pr_warn("%s: failed to alloc %d pages", __func__, (1U << order)); + return; + } + + hvclock_mem = page_address(p); + + /* + * hvclock is shared between the guest and the hypervisor, must + * be mapped decrypted. + */ + if (sev_active()) { + r = set_memory_decrypted((unsigned long) hvclock_mem, + 1UL << order); + if (r) { + __free_pages(p, order); + hvclock_mem = NULL; + pr_warn("kvmclock: set_memory_decrypted() failed. Disabling\n"); + return; + } + } + + memset(hvclock_mem, 0, PAGE_SIZE << order); +} + static int __init kvm_setup_vsyscall_timeinfo(void) { #ifdef CONFIG_X86_64 @@ -250,6 +291,9 @@ static int __init kvm_setup_vsyscall_timeinfo(void) kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; #endif + + kvmclock_init_mem(); + return 0; } early_initcall(kvm_setup_vsyscall_timeinfo); @@ -269,8 +313,10 @@ static int kvmclock_setup_percpu(unsigned int cpu) /* Use the static page for the first CPUs, allocate otherwise */ if (cpu < HVC_BOOT_ARRAY_SIZE) p = &hv_clock_boot[cpu]; + else if (hvclock_mem) + p = hvclock_mem + cpu - HVC_BOOT_ARRAY_SIZE; else - p = kzalloc(sizeof(*p), GFP_KERNEL); + return -ENOMEM; per_cpu(hv_clock_per_cpu, cpu) = p; return p ? 0 : -ENOMEM; From 6d41907c630d3196be89c9ed5a7f8258486b3eaf Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 14 Sep 2018 16:47:14 -0300 Subject: [PATCH 06/53] tools lib bpf: Provide wrapper for strerror_r to build in !_GNU_SOURCE systems Same problem that got fixed in a similar fashion in tools/perf/ in c8b5f2c96d1b ("tools: Introduce str_error_r()"), fix it in the same way, licensing needs to be sorted out to libbpf to use libapi, so, for this simple case, just get the same wrapper in tools/lib/bpf. This makes libbpf and its users (bpftool, selftests, perf) to build again in Alpine Linux 3.[45678] and edge. Acked-by: Alexei Starovoitov Cc: Adrian Hunter Cc: Daniel Borkmann Cc: David Ahern Cc: Hendrik Brueckner Cc: Jakub Kicinski Cc: Jiri Olsa Cc: Martin KaFai Lau Cc: Namhyung Kim Cc: Quentin Monnet Cc: Thomas Richter Cc: Wang Nan Cc: Yonghong Song Fixes: 1ce6a9fc1549 ("bpf: fix build error in libbpf with EXTRA_CFLAGS="-Wp, -D_FORTIFY_SOURCE=2 -O2"") Link: https://lkml.kernel.org/r/20180917151636.GA21790@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/bpf/Build | 2 +- tools/lib/bpf/libbpf.c | 20 ++++++++++---------- tools/lib/bpf/str_error.c | 18 ++++++++++++++++++ tools/lib/bpf/str_error.h | 6 ++++++ 4 files changed, 35 insertions(+), 11 deletions(-) create mode 100644 tools/lib/bpf/str_error.c create mode 100644 tools/lib/bpf/str_error.h diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index 13a861135127..6eb9bacd1948 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1 +1 @@ -libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o +libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 2abd0f112627..bdb94939fd60 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -50,6 +50,7 @@ #include "libbpf.h" #include "bpf.h" #include "btf.h" +#include "str_error.h" #ifndef EM_BPF #define EM_BPF 247 @@ -469,7 +470,7 @@ static int bpf_object__elf_init(struct bpf_object *obj) obj->efile.fd = open(obj->path, O_RDONLY); if (obj->efile.fd < 0) { char errmsg[STRERR_BUFSIZE]; - char *cp = strerror_r(errno, errmsg, sizeof(errmsg)); + char *cp = str_error(errno, errmsg, sizeof(errmsg)); pr_warning("failed to open %s: %s\n", obj->path, cp); return -errno; @@ -810,8 +811,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj) data->d_size, name, idx); if (err) { char errmsg[STRERR_BUFSIZE]; - char *cp = strerror_r(-err, errmsg, - sizeof(errmsg)); + char *cp = str_error(-err, errmsg, sizeof(errmsg)); pr_warning("failed to alloc program %s (%s): %s", name, obj->path, cp); @@ -1140,7 +1140,7 @@ bpf_object__create_maps(struct bpf_object *obj) *pfd = bpf_create_map_xattr(&create_attr); if (*pfd < 0 && create_attr.btf_key_type_id) { - cp = strerror_r(errno, errmsg, sizeof(errmsg)); + cp = str_error(errno, errmsg, sizeof(errmsg)); pr_warning("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n", map->name, cp, errno); create_attr.btf_fd = 0; @@ -1155,7 +1155,7 @@ bpf_object__create_maps(struct bpf_object *obj) size_t j; err = *pfd; - cp = strerror_r(errno, errmsg, sizeof(errmsg)); + cp = str_error(errno, errmsg, sizeof(errmsg)); pr_warning("failed to create map (name: '%s'): %s\n", map->name, cp); for (j = 0; j < i; j++) @@ -1339,7 +1339,7 @@ load_program(enum bpf_prog_type type, enum bpf_attach_type expected_attach_type, } ret = -LIBBPF_ERRNO__LOAD; - cp = strerror_r(errno, errmsg, sizeof(errmsg)); + cp = str_error(errno, errmsg, sizeof(errmsg)); pr_warning("load bpf program failed: %s\n", cp); if (log_buf && log_buf[0] != '\0') { @@ -1654,7 +1654,7 @@ static int check_path(const char *path) dir = dirname(dname); if (statfs(dir, &st_fs)) { - cp = strerror_r(errno, errmsg, sizeof(errmsg)); + cp = str_error(errno, errmsg, sizeof(errmsg)); pr_warning("failed to statfs %s: %s\n", dir, cp); err = -errno; } @@ -1690,7 +1690,7 @@ int bpf_program__pin_instance(struct bpf_program *prog, const char *path, } if (bpf_obj_pin(prog->instances.fds[instance], path)) { - cp = strerror_r(errno, errmsg, sizeof(errmsg)); + cp = str_error(errno, errmsg, sizeof(errmsg)); pr_warning("failed to pin program: %s\n", cp); return -errno; } @@ -1708,7 +1708,7 @@ static int make_dir(const char *path) err = -errno; if (err) { - cp = strerror_r(-err, errmsg, sizeof(errmsg)); + cp = str_error(-err, errmsg, sizeof(errmsg)); pr_warning("failed to mkdir %s: %s\n", path, cp); } return err; @@ -1770,7 +1770,7 @@ int bpf_map__pin(struct bpf_map *map, const char *path) } if (bpf_obj_pin(map->fd, path)) { - cp = strerror_r(errno, errmsg, sizeof(errmsg)); + cp = str_error(errno, errmsg, sizeof(errmsg)); pr_warning("failed to pin map: %s\n", cp); return -errno; } diff --git a/tools/lib/bpf/str_error.c b/tools/lib/bpf/str_error.c new file mode 100644 index 000000000000..b8798114a357 --- /dev/null +++ b/tools/lib/bpf/str_error.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: LGPL-2.1 +#undef _GNU_SOURCE +#include +#include +#include "str_error.h" + +/* + * Wrapper to allow for building in non-GNU systems such as Alpine Linux's musl + * libc, while checking strerror_r() return to avoid having to check this in + * all places calling it. + */ +char *str_error(int err, char *dst, int len) +{ + int ret = strerror_r(err, dst, len); + if (ret) + snprintf(dst, len, "ERROR: strerror_r(%d)=%d", err, ret); + return dst; +} diff --git a/tools/lib/bpf/str_error.h b/tools/lib/bpf/str_error.h new file mode 100644 index 000000000000..355b1db571d1 --- /dev/null +++ b/tools/lib/bpf/str_error.h @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: LGPL-2.1 +#ifndef BPF_STR_ERROR +#define BPF_STR_ERROR + +char *str_error(int err, char *dst, int len); +#endif // BPF_STR_ERROR From 169e366c08084aeb49a3793c892c9abfaa47eeda Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sun, 16 Sep 2018 16:17:05 +0100 Subject: [PATCH 07/53] perf Documentation: Fix out-of-tree asciidoctor man page generation The dependency for the man page rule using asciidoctor incorrectly specifies a source file in $(OUTPUT). When building out-of-tree, the source file is not found, resulting in a fall-back to the following rule which uses xmlto. Signed-off-by: Ben Hutchings Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20180916151704.GF4765@decadent.org.uk Fixes: ffef80ecf89f ("perf Documentation: Support for asciidoctor") Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile index 42261a9b280e..ac841bc5c35b 100644 --- a/tools/perf/Documentation/Makefile +++ b/tools/perf/Documentation/Makefile @@ -280,7 +280,7 @@ $(MAN_HTML): $(OUTPUT)%.html : %.txt mv $@+ $@ ifdef USE_ASCIIDOCTOR -$(OUTPUT)%.1 $(OUTPUT)%.5 $(OUTPUT)%.7 : $(OUTPUT)%.txt +$(OUTPUT)%.1 $(OUTPUT)%.5 $(OUTPUT)%.7 : %.txt $(QUIET_ASCIIDOC)$(RM) $@+ $@ && \ $(ASCIIDOC) -b manpage -d manpage \ $(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \ From 753694a8df318f204a0ac1303de136def16f2e9c Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Sat, 15 Sep 2018 14:58:19 -0700 Subject: [PATCH 08/53] x86/intel_rdt: Fix data type in parsing callbacks Each resource is associated with a parsing callback to parse the data provided from user space when writing schemata file. The 'data' parameter in the callbacks is defined as a void pointer which is error prone due to lack of type check. parse_bw() processes the 'data' parameter as a string while its caller actually passes the parameter as a pointer to struct rdt_cbm_parse_data. Thus, parse_bw() takes wrong data and causes failure of parsing MBA throttle value. To fix the issue, the 'data' parameter in all parsing callbacks is defined and handled as a pointer to struct rdt_parse_data (renamed from struct rdt_cbm_parse_data). Fixes: 7604df6e16ae ("x86/intel_rdt: Support flexible data to parsing callbacks") Fixes: 9ab9aa15c309 ("x86/intel_rdt: Ensure requested schemata respects mode") Signed-off-by: Xiaochen Shen Signed-off-by: Reinette Chatre Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Cc: "H Peter Anvin" Cc: "Tony Luck" Cc: "Chen Yu" Link: https://lkml.kernel.org/r/1537048707-76280-2-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 16 ++++++++++++---- arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 21 ++++++++------------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 4e588f36228f..78266c798280 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -382,6 +382,11 @@ static inline bool is_mbm_event(int e) e <= QOS_L3_MBM_LOCAL_EVENT_ID); } +struct rdt_parse_data { + struct rdtgroup *rdtgrp; + char *buf; +}; + /** * struct rdt_resource - attributes of an RDT resource * @rid: The index of the resource @@ -423,16 +428,19 @@ struct rdt_resource { struct rdt_cache cache; struct rdt_membw membw; const char *format_str; - int (*parse_ctrlval) (void *data, struct rdt_resource *r, - struct rdt_domain *d); + int (*parse_ctrlval)(struct rdt_parse_data *data, + struct rdt_resource *r, + struct rdt_domain *d); struct list_head evt_list; int num_rmid; unsigned int mon_scale; unsigned long fflags; }; -int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d); -int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d); +int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d); +int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d); extern struct mutex rdtgroup_mutex; diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index af358ca05160..edd5761f7336 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -64,19 +64,19 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) return true; } -int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d) +int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d) { - unsigned long data; - char *buf = _buf; + unsigned long bw_val; if (d->have_new_ctrl) { rdt_last_cmd_printf("duplicate domain %d\n", d->id); return -EINVAL; } - if (!bw_validate(buf, &data, r)) + if (!bw_validate(data->buf, &bw_val, r)) return -EINVAL; - d->new_ctrl = data; + d->new_ctrl = bw_val; d->have_new_ctrl = true; return 0; @@ -123,18 +123,13 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) return true; } -struct rdt_cbm_parse_data { - struct rdtgroup *rdtgrp; - char *buf; -}; - /* * Read one cache bit mask (hex). Check that it is valid for the current * resource type. */ -int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d) +int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d) { - struct rdt_cbm_parse_data *data = _data; struct rdtgroup *rdtgrp = data->rdtgrp; u32 cbm_val; @@ -195,7 +190,7 @@ int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d) static int parse_line(char *line, struct rdt_resource *r, struct rdtgroup *rdtgrp) { - struct rdt_cbm_parse_data data; + struct rdt_parse_data data; char *dom = NULL, *id; struct rdt_domain *d; unsigned long dom_id; From f968dc119a159a95628a20de2a2dcc913d0a82d7 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Sat, 15 Sep 2018 14:58:20 -0700 Subject: [PATCH 09/53] x86/intel_rdt: Fix size reporting of MBA resource Chen Yu reported a divide-by-zero error when accessing the 'size' resctrl file when a MBA resource is enabled. divide error: 0000 [#1] SMP PTI CPU: 93 PID: 1929 Comm: cat Not tainted 4.19.0-rc2-debug-rdt+ #25 RIP: 0010:rdtgroup_cbm_to_size+0x7e/0xa0 Call Trace: rdtgroup_size_show+0x11a/0x1d0 seq_read+0xd8/0x3b0 Quoting Chen Yu's report: This is because for MB resource, the r->cache.cbm_len is zero, thus calculating size in rdtgroup_cbm_to_size() will trigger the exception. Fix this issue in the 'size' file by getting correct memory bandwidth value which is in MBps when MBA software controller is enabled or in percentage when MBA software controller is disabled. Fixes: d9b48c86eb38 ("x86/intel_rdt: Display resource groups' allocations in bytes") Reported-by: Chen Yu Signed-off-by: Reinette Chatre Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Tested-by: Chen Yu Cc: "H Peter Anvin" Cc: "Tony Luck" Cc: "Xiaochen Shen" Link: https://lkml.kernel.org/r/20180904174614.26682-1-yu.c.chen@intel.com Link: https://lkml.kernel.org/r/1537048707-76280-3-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index b799c00bef09..32e8bbdf2400 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1155,8 +1155,8 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, struct rdt_resource *r; struct rdt_domain *d; unsigned int size; - bool sep = false; - u32 cbm; + bool sep; + u32 ctrl; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { @@ -1174,6 +1174,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, } for_each_alloc_enabled_rdt_resource(r) { + sep = false; seq_printf(s, "%*s:", max_name_width, r->name); list_for_each_entry(d, &r->domains, list) { if (sep) @@ -1181,8 +1182,13 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { size = 0; } else { - cbm = d->ctrl_val[rdtgrp->closid]; - size = rdtgroup_cbm_to_size(r, d, cbm); + ctrl = (!is_mba_sc(r) ? + d->ctrl_val[rdtgrp->closid] : + d->mbps_val[rdtgrp->closid]); + if (r->rid == RDT_RESOURCE_MBA) + size = ctrl; + else + size = rdtgroup_cbm_to_size(r, d, ctrl); } seq_printf(s, "%d=%u", d->id, size); sep = true; From c793da8e4c62d2c002a79c47f44efead450cbcae Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Sat, 15 Sep 2018 14:58:21 -0700 Subject: [PATCH 10/53] x86/intel_rdt: Global closid helper to support future fixes The number of CLOSIDs supported by a system is the minimum number of CLOSIDs supported by any of its resources. Care should be taken when iterating over the CLOSIDs of a resource since it may be that the number of CLOSIDs supported on the system is less than the number of CLOSIDs supported by the resource. Introduce a helper function that can be used to query the number of CLOSIDs that is supported by all resources, irrespective of how many CLOSIDs are supported by a particular resource. Signed-off-by: Reinette Chatre Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Cc: "H Peter Anvin" Cc: "Tony Luck" Cc: "Xiaochen Shen" Cc: "Chen Yu" Link: https://lkml.kernel.org/r/1537048707-76280-4-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 1 + arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 78266c798280..285eb3ec4200 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -544,6 +544,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); int update_domains(struct rdt_resource *r, int closid); +int closids_supported(void); void closid_free(int closid); int alloc_rmid(void); void free_rmid(u32 rmid); diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 32e8bbdf2400..b372923eb209 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -97,6 +97,12 @@ void rdt_last_cmd_printf(const char *fmt, ...) * limited as the number of resources grows. */ static int closid_free_map; +static int closid_free_map_len; + +int closids_supported(void) +{ + return closid_free_map_len; +} static void closid_init(void) { @@ -111,6 +117,7 @@ static void closid_init(void) /* CLOSID 0 is always reserved for the default group */ closid_free_map &= ~1; + closid_free_map_len = rdt_min_closid; } static int closid_alloc(void) From 47d53b184aee983ab9492503da11b0a81b19145b Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Sat, 15 Sep 2018 14:58:22 -0700 Subject: [PATCH 11/53] x86/intel_rdt: Fix invalid mode warning when multiple resources are managed When multiple resources are managed by RDT, the number of CLOSIDs used is the minimum of the CLOSIDs supported by each resource. In the function rdt_bit_usage_show(), the annotated bitmask is created to depict how the CAT supporting caches are being used. During this annotated bitmask creation, each resource group is queried for its mode that is used as a label in the annotated bitmask. The maximum number of resource groups is currently assumed to be the number of CLOSIDs supported by the resource for which the information is being displayed. This is incorrect since the number of active CLOSIDs is the minimum across all resources. If information for a cache instance with more CLOSIDs than another is being generated we thus encounter a warning like: invalid mode for closid 8 WARNING: CPU: 88 PID: 1791 at [SNIP]/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c :827 rdt_bit_usage_show+0x221/0x2b0 Fix this by ensuring that only the number of supported CLOSIDs are considered. Fixes: e651901187ab8 ("x86/intel_rdt: Introduce "bit_usage" to display cache allocations details") Signed-off-by: Reinette Chatre Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Cc: "H Peter Anvin" Cc: "Tony Luck" Cc: "Xiaochen Shen" Cc: "Chen Yu" Link: https://lkml.kernel.org/r/1537048707-76280-5-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index b372923eb209..ea91750ba27f 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -809,7 +809,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, sw_shareable = 0; exclusive = 0; seq_printf(seq, "%d=", dom->id); - for (i = 0; i < r->num_closid; i++, ctrl++) { + for (i = 0; i < closids_supported(); i++, ctrl++) { if (!closid_allocated(i)) continue; mode = rdtgroup_mode_by_closid(i); From 70479c012b67b89e219c40eddc5dc338b7c447a3 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Sat, 15 Sep 2018 14:58:23 -0700 Subject: [PATCH 12/53] x86/intel_rdt: Fix unchecked MSR access When a new resource group is created, it is initialized with sane defaults that currently assume the resource being initialized is a CAT resource. This code path is also followed by a MBA resource that is not allocated the same as a CAT resource and as a result we encounter the following unchecked MSR access error: unchecked MSR access error: WRMSR to 0xd51 (tried to write 0x0000 000000000064) at rIP: 0xffffffffae059994 (native_write_msr+0x4/0x20) Call Trace: mba_wrmsr+0x41/0x80 update_domains+0x125/0x130 rdtgroup_mkdir+0x270/0x500 Fix the above by ensuring the initial allocation is only attempted on a CAT resource. Fixes: 95f0b77ef ("x86/intel_rdt: Initialize new resource group with sane defaults") Signed-off-by: Reinette Chatre Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Cc: "H Peter Anvin" Cc: "Tony Luck" Cc: "Xiaochen Shen" Cc: "Chen Yu" Link: https://lkml.kernel.org/r/1537048707-76280-6-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index ea91750ba27f..74821bc457c0 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -2349,6 +2349,12 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) u32 *ctrl; for_each_alloc_enabled_rdt_resource(r) { + /* + * Only initialize default allocations for CBM cache + * resources + */ + if (r->rid == RDT_RESOURCE_MBA) + continue; list_for_each_entry(d, &r->domains, list) { d->have_new_ctrl = false; d->new_ctrl = r->cache.shareable_bits; @@ -2386,6 +2392,12 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) } for_each_alloc_enabled_rdt_resource(r) { + /* + * Only initialize default allocations for CBM cache + * resources + */ + if (r->rid == RDT_RESOURCE_MBA) + continue; ret = update_domains(r, rdtgrp->closid); if (ret < 0) { rdt_last_cmd_puts("failed to initialize allocations\n"); From 32d736abed4febff4b6bf85d5d240ee24d254322 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Sat, 15 Sep 2018 14:58:24 -0700 Subject: [PATCH 13/53] x86/intel_rdt: Do not allow pseudo-locking of MBA resource A system supporting pseudo-locking may have MBA as well as CAT resources of which only the CAT resources could support cache pseudo-locking. When the schemata to be pseudo-locked is provided it should be checked that that schemata does not attempt to pseudo-lock a MBA resource. Fixes: e0bdfe8e3 ("x86/intel_rdt: Support creation/removal of pseudo-locked region") Signed-off-by: Reinette Chatre Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Cc: "H Peter Anvin" Cc: "Tony Luck" Cc: "Xiaochen Shen" Cc: "Chen Yu" Link: https://lkml.kernel.org/r/1537048707-76280-7-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index edd5761f7336..0f53049719cd 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -195,6 +195,12 @@ static int parse_line(char *line, struct rdt_resource *r, struct rdt_domain *d; unsigned long dom_id; + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && + r->rid == RDT_RESOURCE_MBA) { + rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); + return -EINVAL; + } + next: if (!line || line[0] == '\0') return 0; From f0df4e1acf3d721958dcafb2c9c0bdf25189068d Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Sat, 15 Sep 2018 14:58:25 -0700 Subject: [PATCH 14/53] x86/intel_rdt: Fix incorrect loop end condition A loop is used to check if a CAT resource's CBM of one CLOSID overlaps with the CBM of another CLOSID of the same resource. The loop is run over all CLOSIDs supported by the resource. The problem with running the loop over all CLOSIDs supported by the resource is that its number of supported CLOSIDs may be more than the number of supported CLOSIDs on the system, which is the minimum number of CLOSIDs supported across all resources. Fix the loop to only consider the number of system supported CLOSIDs, not all that are supported by the resource. Fixes: 49f7b4efa ("x86/intel_rdt: Enable setting of exclusive mode") Signed-off-by: Reinette Chatre Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Cc: "H Peter Anvin" Cc: "Tony Luck" Cc: "Xiaochen Shen" Cc: "Chen Yu" Link: https://lkml.kernel.org/r/1537048707-76280-8-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 74821bc457c0..afd93d45e21b 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -996,7 +996,7 @@ bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, /* Check for overlap with other resource groups */ ctrl = d->ctrl_val; - for (i = 0; i < r->num_closid; i++, ctrl++) { + for (i = 0; i < closids_supported(); i++, ctrl++) { ctrl_b = (unsigned long *)ctrl; mode = rdtgroup_mode_by_closid(i); if (closid_allocated(i) && i != closid && From 939b90b20bc87e199b6b53942764b987289b87ce Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Sat, 15 Sep 2018 14:58:26 -0700 Subject: [PATCH 15/53] x86/intel_rdt: Fix exclusive mode handling of MBA resource It is possible for a resource group to consist out of MBA as well as CAT/CDP resources. The "exclusive" resource mode only applies to the CAT/CDP resources since MBA allocations cannot be specified to overlap or not. When a user requests a resource group to become "exclusive" then it can only be successful if there are CAT/CDP resources in the group and none of their CBMs associated with the group's CLOSID overlaps with any other resource group. Fix the "exclusive" mode setting by failing if there isn't any CAT/CDP resource in the group and ensuring that the CBM checking is only done on CAT/CDP resources. Fixes: 49f7b4efa ("x86/intel_rdt: Enable setting of exclusive mode") Signed-off-by: Reinette Chatre Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Cc: "H Peter Anvin" Cc: "Tony Luck" Cc: "Xiaochen Shen" Cc: "Chen Yu" Link: https://lkml.kernel.org/r/1537048707-76280-9-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index afd93d45e21b..f3231f78d69b 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1031,16 +1031,27 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) { int closid = rdtgrp->closid; struct rdt_resource *r; + bool has_cache = false; struct rdt_domain *d; for_each_alloc_enabled_rdt_resource(r) { + if (r->rid == RDT_RESOURCE_MBA) + continue; + has_cache = true; list_for_each_entry(d, &r->domains, list) { if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid], - rdtgrp->closid, false)) + rdtgrp->closid, false)) { + rdt_last_cmd_puts("schemata overlaps\n"); return false; + } } } + if (!has_cache) { + rdt_last_cmd_puts("cannot be exclusive without CAT/CDP\n"); + return false; + } + return true; } @@ -1092,7 +1103,6 @@ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, rdtgrp->mode = RDT_MODE_SHAREABLE; } else if (!strcmp(buf, "exclusive")) { if (!rdtgroup_mode_test_exclusive(rdtgrp)) { - rdt_last_cmd_printf("schemata overlaps\n"); ret = -EINVAL; goto out; } From ffb2315fd22c2568747402eecdc581a245a2f5ba Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Sat, 15 Sep 2018 14:58:27 -0700 Subject: [PATCH 16/53] x86/intel_rdt: Fix incorrect loop end condition In order to determine a sane default cache allocation for a new CAT/CDP resource group, all resource groups are checked to determine which cache portions are available to share. At this time all possible CLOSIDs that can be supported by the resource is checked. This is problematic if the resource supports more CLOSIDs than another CAT/CDP resource. In this case, the number of CLOSIDs that could be allocated are fewer than the number of CLOSIDs that can be supported by the resource. Limit the check of closids to that what is supported by the system based on the minimum across all resources. Fixes: 95f0b77ef ("x86/intel_rdt: Initialize new resource group with sane defaults") Signed-off-by: Reinette Chatre Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Cc: "H Peter Anvin" Cc: "Tony Luck" Cc: "Xiaochen Shen" Cc: "Chen Yu" Link: https://lkml.kernel.org/r/1537048707-76280-10-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index f3231f78d69b..1b8e86a5d5e1 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -2370,7 +2370,7 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) d->new_ctrl = r->cache.shareable_bits; used_b = r->cache.shareable_bits; ctrl = d->ctrl_val; - for (i = 0; i < r->num_closid; i++, ctrl++) { + for (i = 0; i < closids_supported(); i++, ctrl++) { if (closid_allocated(i) && i != closid) { mode = rdtgroup_mode_by_closid(i); if (mode == RDT_MODE_PSEUDO_LOCKSETUP) From 8e2aac333785f91ff74e219a1e78e6bdc1ef2c41 Mon Sep 17 00:00:00 2001 From: Simon Detheridge Date: Sat, 15 Sep 2018 22:15:18 +0100 Subject: [PATCH 17/53] pinctrl: cannonlake: Fix gpio base for GPP-E The gpio base for GPP-E was set incorrectly to 258 instead of 256, preventing the touchpad working on my Tong Fang GK5CN5Z laptop. Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=200787 Signed-off-by: Simon Detheridge Acked-by: Mika Westerberg Signed-off-by: Linus Walleij --- drivers/pinctrl/intel/pinctrl-cannonlake.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/intel/pinctrl-cannonlake.c b/drivers/pinctrl/intel/pinctrl-cannonlake.c index fb1afe55bf53..8d48371caaa2 100644 --- a/drivers/pinctrl/intel/pinctrl-cannonlake.c +++ b/drivers/pinctrl/intel/pinctrl-cannonlake.c @@ -379,7 +379,7 @@ static const struct intel_padgroup cnlh_community1_gpps[] = { static const struct intel_padgroup cnlh_community3_gpps[] = { CNL_GPP(0, 155, 178, 192), /* GPP_K */ CNL_GPP(1, 179, 202, 224), /* GPP_H */ - CNL_GPP(2, 203, 215, 258), /* GPP_E */ + CNL_GPP(2, 203, 215, 256), /* GPP_E */ CNL_GPP(3, 216, 239, 288), /* GPP_F */ CNL_GPP(4, 240, 248, CNL_NO_GPIO), /* SPI */ }; From 571d0563c8881595f4ab027aef9ed1c55e3e7b7c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 19 Sep 2018 13:35:53 +0300 Subject: [PATCH 18/53] x86/paravirt: Fix some warning messages The first argument to WARN_ONCE() is a condition. Fixes: 5800dc5c19f3 ("x86/paravirt: Fix spectre-v2 mitigations for paravirt guests") Signed-off-by: Dan Carpenter Signed-off-by: Thomas Gleixner Reviewed-by: Juergen Gross Cc: Peter Zijlstra Cc: Alok Kataria Cc: "H. Peter Anvin" Cc: virtualization@lists.linux-foundation.org Cc: kernel-janitors@vger.kernel.org Link: https://lkml.kernel.org/r/20180919103553.GD9238@mwanda --- arch/x86/kernel/paravirt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index afdb303285f8..8dc69d82567e 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -91,7 +91,7 @@ unsigned paravirt_patch_call(void *insnbuf, if (len < 5) { #ifdef CONFIG_RETPOLINE - WARN_ONCE("Failing to patch indirect CALL in %ps\n", (void *)addr); + WARN_ONCE(1, "Failing to patch indirect CALL in %ps\n", (void *)addr); #endif return len; /* call too long for patch site */ } @@ -111,7 +111,7 @@ unsigned paravirt_patch_jmp(void *insnbuf, const void *target, if (len < 5) { #ifdef CONFIG_RETPOLINE - WARN_ONCE("Failing to patch indirect JMP in %ps\n", (void *)addr); + WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr); #endif return len; /* call too long for patch site */ } From 336b08088d4da009108d4418e241ca95c9152237 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Sep 2018 13:48:41 +0200 Subject: [PATCH 19/53] MAINTAINERS: Add Borislav to the x86 maintainers Borislav is effectivly maintaining parts of X86 already, make it official. Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: Borislav Petkov --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 4ece30f15777..091e66b60cd2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15913,6 +15913,7 @@ F: net/x25/ X86 ARCHITECTURE (32-BIT AND 64-BIT) M: Thomas Gleixner M: Ingo Molnar +M: Borislav Petkov R: "H. Peter Anvin" M: x86@kernel.org L: linux-kernel@vger.kernel.org From 70513d58751d7c6c1a0133557b13089b9f2e3e66 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Thu, 12 Jul 2018 13:27:00 -0400 Subject: [PATCH 20/53] xen/x86/vpmu: Zero struct pt_regs before calling into sample handling code Otherwise we may leak kernel stack for events that sample user registers. Reported-by: Mark Rutland Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky Cc: stable@vger.kernel.org --- arch/x86/xen/pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index 7d00d4ad44d4..95997e6c0696 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -478,7 +478,7 @@ static void xen_convert_regs(const struct xen_pmu_regs *xen_regs, irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id) { int err, ret = IRQ_NONE; - struct pt_regs regs; + struct pt_regs regs = {0}; const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); uint8_t xenpmu_flags = get_xenpmu_flags(); From d59f532480f5231bf62615a9287e05b78225fb05 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 19 Sep 2018 15:42:33 +0200 Subject: [PATCH 21/53] xen: issue warning message when out of grant maptrack entries When a driver domain (e.g. dom0) is running out of maptrack entries it can't map any more foreign domain pages. Instead of silently stalling the affected domUs issue a rate limited warning in this case in order to make it easier to detect that situation. Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- drivers/xen/grant-table.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 7bafa703a992..84575baceebc 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -1040,18 +1040,33 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, return ret; for (i = 0; i < count; i++) { - /* Retry eagain maps */ - if (map_ops[i].status == GNTST_eagain) - gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, map_ops + i, - &map_ops[i].status, __func__); - - if (map_ops[i].status == GNTST_okay) { + switch (map_ops[i].status) { + case GNTST_okay: + { struct xen_page_foreign *foreign; SetPageForeign(pages[i]); foreign = xen_page_foreign(pages[i]); foreign->domid = map_ops[i].dom; foreign->gref = map_ops[i].ref; + break; + } + + case GNTST_no_device_space: + pr_warn_ratelimited("maptrack limit reached, can't map all guest pages\n"); + break; + + case GNTST_eagain: + /* Retry eagain maps */ + gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, + map_ops + i, + &map_ops[i].status, __func__); + /* Test status in next loop iteration. */ + i--; + break; + + default: + break; } } From 96147db1e1dff83679e71ac92193cbcab761a14c Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 18 Sep 2018 18:36:21 +0300 Subject: [PATCH 22/53] pinctrl: intel: Do pin translation in other GPIO operations as well For some reason I thought GPIOLIB handles translation from GPIO ranges to pinctrl pins but it turns out not to be the case. This means that when GPIOs operations are performed for a pin controller having a custom GPIO base such as Cannon Lake and Ice Lake incorrect pin number gets used internally. Fix this in the same way we did for lock/unlock IRQ operations and translate the GPIO number to pin before using it. Fixes: a60eac3239f0 ("pinctrl: intel: Allow custom GPIO base for pad groups") Reported-by: Rajat Jain Signed-off-by: Mika Westerberg Tested-by: Rajat Jain Signed-off-by: Linus Walleij --- drivers/pinctrl/intel/pinctrl-intel.c | 175 ++++++++++++++------------ 1 file changed, 95 insertions(+), 80 deletions(-) diff --git a/drivers/pinctrl/intel/pinctrl-intel.c b/drivers/pinctrl/intel/pinctrl-intel.c index 62b009b27eda..ec8dafc94694 100644 --- a/drivers/pinctrl/intel/pinctrl-intel.c +++ b/drivers/pinctrl/intel/pinctrl-intel.c @@ -747,86 +747,6 @@ static const struct pinctrl_desc intel_pinctrl_desc = { .owner = THIS_MODULE, }; -static int intel_gpio_get(struct gpio_chip *chip, unsigned offset) -{ - struct intel_pinctrl *pctrl = gpiochip_get_data(chip); - void __iomem *reg; - u32 padcfg0; - - reg = intel_get_padcfg(pctrl, offset, PADCFG0); - if (!reg) - return -EINVAL; - - padcfg0 = readl(reg); - if (!(padcfg0 & PADCFG0_GPIOTXDIS)) - return !!(padcfg0 & PADCFG0_GPIOTXSTATE); - - return !!(padcfg0 & PADCFG0_GPIORXSTATE); -} - -static void intel_gpio_set(struct gpio_chip *chip, unsigned offset, int value) -{ - struct intel_pinctrl *pctrl = gpiochip_get_data(chip); - unsigned long flags; - void __iomem *reg; - u32 padcfg0; - - reg = intel_get_padcfg(pctrl, offset, PADCFG0); - if (!reg) - return; - - raw_spin_lock_irqsave(&pctrl->lock, flags); - padcfg0 = readl(reg); - if (value) - padcfg0 |= PADCFG0_GPIOTXSTATE; - else - padcfg0 &= ~PADCFG0_GPIOTXSTATE; - writel(padcfg0, reg); - raw_spin_unlock_irqrestore(&pctrl->lock, flags); -} - -static int intel_gpio_get_direction(struct gpio_chip *chip, unsigned int offset) -{ - struct intel_pinctrl *pctrl = gpiochip_get_data(chip); - void __iomem *reg; - u32 padcfg0; - - reg = intel_get_padcfg(pctrl, offset, PADCFG0); - if (!reg) - return -EINVAL; - - padcfg0 = readl(reg); - - if (padcfg0 & PADCFG0_PMODE_MASK) - return -EINVAL; - - return !!(padcfg0 & PADCFG0_GPIOTXDIS); -} - -static int intel_gpio_direction_input(struct gpio_chip *chip, unsigned offset) -{ - return pinctrl_gpio_direction_input(chip->base + offset); -} - -static int intel_gpio_direction_output(struct gpio_chip *chip, unsigned offset, - int value) -{ - intel_gpio_set(chip, offset, value); - return pinctrl_gpio_direction_output(chip->base + offset); -} - -static const struct gpio_chip intel_gpio_chip = { - .owner = THIS_MODULE, - .request = gpiochip_generic_request, - .free = gpiochip_generic_free, - .get_direction = intel_gpio_get_direction, - .direction_input = intel_gpio_direction_input, - .direction_output = intel_gpio_direction_output, - .get = intel_gpio_get, - .set = intel_gpio_set, - .set_config = gpiochip_generic_config, -}; - /** * intel_gpio_to_pin() - Translate from GPIO offset to pin number * @pctrl: Pinctrl structure @@ -872,6 +792,101 @@ static int intel_gpio_to_pin(struct intel_pinctrl *pctrl, unsigned offset, return -EINVAL; } +static int intel_gpio_get(struct gpio_chip *chip, unsigned offset) +{ + struct intel_pinctrl *pctrl = gpiochip_get_data(chip); + void __iomem *reg; + u32 padcfg0; + int pin; + + pin = intel_gpio_to_pin(pctrl, offset, NULL, NULL); + if (pin < 0) + return -EINVAL; + + reg = intel_get_padcfg(pctrl, pin, PADCFG0); + if (!reg) + return -EINVAL; + + padcfg0 = readl(reg); + if (!(padcfg0 & PADCFG0_GPIOTXDIS)) + return !!(padcfg0 & PADCFG0_GPIOTXSTATE); + + return !!(padcfg0 & PADCFG0_GPIORXSTATE); +} + +static void intel_gpio_set(struct gpio_chip *chip, unsigned offset, int value) +{ + struct intel_pinctrl *pctrl = gpiochip_get_data(chip); + unsigned long flags; + void __iomem *reg; + u32 padcfg0; + int pin; + + pin = intel_gpio_to_pin(pctrl, offset, NULL, NULL); + if (pin < 0) + return; + + reg = intel_get_padcfg(pctrl, pin, PADCFG0); + if (!reg) + return; + + raw_spin_lock_irqsave(&pctrl->lock, flags); + padcfg0 = readl(reg); + if (value) + padcfg0 |= PADCFG0_GPIOTXSTATE; + else + padcfg0 &= ~PADCFG0_GPIOTXSTATE; + writel(padcfg0, reg); + raw_spin_unlock_irqrestore(&pctrl->lock, flags); +} + +static int intel_gpio_get_direction(struct gpio_chip *chip, unsigned int offset) +{ + struct intel_pinctrl *pctrl = gpiochip_get_data(chip); + void __iomem *reg; + u32 padcfg0; + int pin; + + pin = intel_gpio_to_pin(pctrl, offset, NULL, NULL); + if (pin < 0) + return -EINVAL; + + reg = intel_get_padcfg(pctrl, pin, PADCFG0); + if (!reg) + return -EINVAL; + + padcfg0 = readl(reg); + + if (padcfg0 & PADCFG0_PMODE_MASK) + return -EINVAL; + + return !!(padcfg0 & PADCFG0_GPIOTXDIS); +} + +static int intel_gpio_direction_input(struct gpio_chip *chip, unsigned offset) +{ + return pinctrl_gpio_direction_input(chip->base + offset); +} + +static int intel_gpio_direction_output(struct gpio_chip *chip, unsigned offset, + int value) +{ + intel_gpio_set(chip, offset, value); + return pinctrl_gpio_direction_output(chip->base + offset); +} + +static const struct gpio_chip intel_gpio_chip = { + .owner = THIS_MODULE, + .request = gpiochip_generic_request, + .free = gpiochip_generic_free, + .get_direction = intel_gpio_get_direction, + .direction_input = intel_gpio_direction_input, + .direction_output = intel_gpio_direction_output, + .get = intel_gpio_get, + .set = intel_gpio_set, + .set_config = gpiochip_generic_config, +}; + static int intel_gpio_irq_reqres(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); From a8b3bb338e4ee4cc84a2b9a6fdf27049b84baa59 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 20 Sep 2018 12:37:08 -0700 Subject: [PATCH 23/53] x86/intel_rdt: Add Reinette as co-maintainer for RDT Reinette Chatre is doing great job on enabling pseudo-locking and other features in RDT. Add her as co-maintainer for RDT. Suggested-by: Thomas Gleixner Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: Reinette Chatre Cc: "H Peter Anvin" Cc: "Tony Luck" Link: https://lkml.kernel.org/r/1537472228-221799-1-git-send-email-fenghua.yu@intel.com --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 091e66b60cd2..140ea6ee3ac8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12260,6 +12260,7 @@ F: Documentation/networking/rds.txt RDT - RESOURCE ALLOCATION M: Fenghua Yu +M: Reinette Chatre L: linux-kernel@vger.kernel.org S: Supported F: arch/x86/kernel/cpu/intel_rdt* From 9068a427ee0beb1365a0925e8c33f338f09f5e97 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Sep 2018 14:33:14 +0200 Subject: [PATCH 24/53] MAINTAINERS: Add X86 MM entry Dave, Andy and Peter are de facto overseing the mm parts of X86. Add an explicit maintainers entry. Signed-off-by: Thomas Gleixner Acked-by: Dave Hansen Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra Acked-by: Ingo Molnar --- MAINTAINERS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 140ea6ee3ac8..80a311252b04 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15943,6 +15943,15 @@ M: Borislav Petkov S: Maintained F: arch/x86/kernel/cpu/microcode/* +X86 MM +M: Dave Hansen +M: Andy Lutomirski +M: Peter Zijlstra +L: linux-kernel@vger.kernel.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/mm +S: Maintained +F: arch/x86/mm/ + X86 PLATFORM DRIVERS M: Darren Hart M: Andy Shevchenko From 05ab1d8a4b36ee912b7087c6da127439ed0a903e Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 20 Sep 2018 10:58:28 +0800 Subject: [PATCH 25/53] x86/mm: Expand static page table for fixmap space We met a kernel panic when enabling earlycon, which is due to the fixmap address of earlycon is not statically setup. Currently the static fixmap setup in head_64.S only covers 2M virtual address space, while it actually could be in 4M space with different kernel configurations, e.g. when VSYSCALL emulation is disabled. So increase the static space to 4M for now by defining FIXMAP_PMD_NUM to 2, and add a build time check to ensure that the fixmap is covered by the initial static page tables. Fixes: 1ad83c858c7d ("x86_64,vsyscall: Make vsyscall emulation configurable") Suggested-by: Thomas Gleixner Signed-off-by: Feng Tang Signed-off-by: Thomas Gleixner Tested-by: kernel test robot Reviewed-by: Juergen Gross (Xen parts) Cc: H Peter Anvin Cc: Peter Zijlstra Cc: Michal Hocko Cc: Yinghai Lu Cc: Dave Hansen Cc: Andi Kleen Cc: Andy Lutomirsky Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180920025828.23699-1-feng.tang@intel.com --- arch/x86/include/asm/fixmap.h | 10 ++++++++++ arch/x86/include/asm/pgtable_64.h | 3 ++- arch/x86/kernel/head64.c | 4 +++- arch/x86/kernel/head_64.S | 16 ++++++++++++---- arch/x86/mm/pgtable.c | 9 +++++++++ arch/x86/xen/mmu_pv.c | 8 ++++++-- 6 files changed, 42 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index e203169931c7..6390bd8c141b 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -14,6 +14,16 @@ #ifndef _ASM_X86_FIXMAP_H #define _ASM_X86_FIXMAP_H +/* + * Exposed to assembly code for setting up initial page tables. Cannot be + * calculated in assembly code (fixmap entries are an enum), but is sanity + * checked in the actual fixmap C code to make sure that the fixmap is + * covered fully. + */ +#define FIXMAP_PMD_NUM 2 +/* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */ +#define FIXMAP_PMD_TOP 507 + #ifndef __ASSEMBLY__ #include #include diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index ce2b59047cb8..9c85b54bf03c 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -14,6 +14,7 @@ #include #include #include +#include extern p4d_t level4_kernel_pgt[512]; extern p4d_t level4_ident_pgt[512]; @@ -22,7 +23,7 @@ extern pud_t level3_ident_pgt[512]; extern pmd_t level2_kernel_pgt[512]; extern pmd_t level2_fixmap_pgt[512]; extern pmd_t level2_ident_pgt[512]; -extern pte_t level1_fixmap_pgt[512]; +extern pte_t level1_fixmap_pgt[512 * FIXMAP_PMD_NUM]; extern pgd_t init_top_pgt[]; #define swapper_pg_dir init_top_pgt diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index c16af27eb23f..ddee1f0870c4 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -35,6 +35,7 @@ #include #include #include +#include /* * Manage page tables very early on. @@ -166,7 +167,8 @@ unsigned long __head __startup_64(unsigned long physaddr, pud[511] += load_delta; pmd = fixup_pointer(level2_fixmap_pgt, physaddr); - pmd[506] += load_delta; + for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--) + pmd[i] += load_delta; /* * Set up the identity mapping for the switchover. These diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 15ebc2fc166e..a3618cf04cf6 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -24,6 +24,7 @@ #include "../entry/calling.h" #include #include +#include #ifdef CONFIG_PARAVIRT #include @@ -445,13 +446,20 @@ NEXT_PAGE(level2_kernel_pgt) KERNEL_IMAGE_SIZE/PMD_SIZE) NEXT_PAGE(level2_fixmap_pgt) - .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC - /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ - .fill 5,8,0 + .fill (512 - 4 - FIXMAP_PMD_NUM),8,0 + pgtno = 0 + .rept (FIXMAP_PMD_NUM) + .quad level1_fixmap_pgt + (pgtno << PAGE_SHIFT) - __START_KERNEL_map \ + + _PAGE_TABLE_NOENC; + pgtno = pgtno + 1 + .endr + /* 6 MB reserved space + a 2MB hole */ + .fill 4,8,0 NEXT_PAGE(level1_fixmap_pgt) + .rept (FIXMAP_PMD_NUM) .fill 512,8,0 + .endr #undef PMDS diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index ae394552fb94..089e78c4effd 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -637,6 +637,15 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) { unsigned long address = __fix_to_virt(idx); +#ifdef CONFIG_X86_64 + /* + * Ensure that the static initial page tables are covering the + * fixmap completely. + */ + BUILD_BUG_ON(__end_of_permanent_fixed_addresses > + (FIXMAP_PMD_NUM * PTRS_PER_PTE)); +#endif + if (idx >= __end_of_fixed_addresses) { BUG(); return; diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 2fe5c9b1816b..dd461c0167ef 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1907,7 +1907,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* L3_k[511] -> level2_fixmap_pgt */ convert_pfn_mfn(level3_kernel_pgt); - /* L3_k[511][506] -> level1_fixmap_pgt */ + /* L3_k[511][508-FIXMAP_PMD_NUM ... 507] -> level1_fixmap_pgt */ convert_pfn_mfn(level2_fixmap_pgt); /* We get [511][511] and have Xen's version of level2_kernel_pgt */ @@ -1952,7 +1952,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); - set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO); + + for (i = 0; i < FIXMAP_PMD_NUM; i++) { + set_page_prot(level1_fixmap_pgt + i * PTRS_PER_PTE, + PAGE_KERNEL_RO); + } /* Pin down new L4 */ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, From b57e99b4b8b0ebdf9707424e7ddc0c392bdc5fe6 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 21 Sep 2018 16:44:34 -0700 Subject: [PATCH 26/53] block: use nanosecond resolution for iostat Klaus Kusche reported that the I/O busy time in /proc/diskstats was not updating properly on 4.18. This is because we started using ktime to track elapsed time, and we convert nanoseconds to jiffies when we update the partition counter. However, this gets rounded down, so any I/Os that take less than a jiffy are not accounted for. Previously in this case, the value of jiffies would sometimes increment while we were doing I/O, so at least some I/Os were accounted for. Let's convert the stats to use nanoseconds internally. We still report milliseconds as before, now more accurately than ever. The value is still truncated to 32 bits for backwards compatibility. Fixes: 522a777566f5 ("block: consolidate struct request timestamp fields") Cc: stable@vger.kernel.org Reported-by: Klaus Kusche Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/bio.c | 2 +- block/blk-core.c | 4 +--- block/genhd.c | 6 +++--- block/partition-generic.c | 6 +++--- include/linux/genhd.h | 5 ++++- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/block/bio.c b/block/bio.c index 8c680a776171..0093bed81c0e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1684,7 +1684,7 @@ void generic_end_io_acct(struct request_queue *q, int req_op, const int sgrp = op_stat_group(req_op); int cpu = part_stat_lock(); - part_stat_add(cpu, part, ticks[sgrp], duration); + part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration)); part_round_stats(q, cpu, part); part_dec_in_flight(q, part, op_is_write(req_op)); diff --git a/block/blk-core.c b/block/blk-core.c index 4dbc93f43b38..cff0a60ee200 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2733,17 +2733,15 @@ void blk_account_io_done(struct request *req, u64 now) * containing request is enough. */ if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { - unsigned long duration; const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; int cpu; - duration = nsecs_to_jiffies(now - req->start_time_ns); cpu = part_stat_lock(); part = req->part; part_stat_inc(cpu, part, ios[sgrp]); - part_stat_add(cpu, part, ticks[sgrp], duration); + part_stat_add(cpu, part, nsecs[sgrp], now - req->start_time_ns); part_round_stats(req->q, cpu, part); part_dec_in_flight(req->q, part, rq_data_dir(req)); diff --git a/block/genhd.c b/block/genhd.c index 8cc719a37b32..be5bab20b2ab 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1343,18 +1343,18 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_stat_read(hd, ios[STAT_READ]), part_stat_read(hd, merges[STAT_READ]), part_stat_read(hd, sectors[STAT_READ]), - jiffies_to_msecs(part_stat_read(hd, ticks[STAT_READ])), + (unsigned int)part_stat_read_msecs(hd, STAT_READ), part_stat_read(hd, ios[STAT_WRITE]), part_stat_read(hd, merges[STAT_WRITE]), part_stat_read(hd, sectors[STAT_WRITE]), - jiffies_to_msecs(part_stat_read(hd, ticks[STAT_WRITE])), + (unsigned int)part_stat_read_msecs(hd, STAT_WRITE), inflight[0], jiffies_to_msecs(part_stat_read(hd, io_ticks)), jiffies_to_msecs(part_stat_read(hd, time_in_queue)), part_stat_read(hd, ios[STAT_DISCARD]), part_stat_read(hd, merges[STAT_DISCARD]), part_stat_read(hd, sectors[STAT_DISCARD]), - jiffies_to_msecs(part_stat_read(hd, ticks[STAT_DISCARD])) + (unsigned int)part_stat_read_msecs(hd, STAT_DISCARD) ); } disk_part_iter_exit(&piter); diff --git a/block/partition-generic.c b/block/partition-generic.c index 5a8975a1201c..d3d14e81fb12 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -136,18 +136,18 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, ios[STAT_READ]), part_stat_read(p, merges[STAT_READ]), (unsigned long long)part_stat_read(p, sectors[STAT_READ]), - jiffies_to_msecs(part_stat_read(p, ticks[STAT_READ])), + (unsigned int)part_stat_read_msecs(p, STAT_READ), part_stat_read(p, ios[STAT_WRITE]), part_stat_read(p, merges[STAT_WRITE]), (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), - jiffies_to_msecs(part_stat_read(p, ticks[STAT_WRITE])), + (unsigned int)part_stat_read_msecs(p, STAT_WRITE), inflight[0], jiffies_to_msecs(part_stat_read(p, io_ticks)), jiffies_to_msecs(part_stat_read(p, time_in_queue)), part_stat_read(p, ios[STAT_DISCARD]), part_stat_read(p, merges[STAT_DISCARD]), (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]), - jiffies_to_msecs(part_stat_read(p, ticks[STAT_DISCARD]))); + (unsigned int)part_stat_read_msecs(p, STAT_DISCARD)); } ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 57864422a2c8..25c08c6c7f99 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -83,10 +83,10 @@ struct partition { } __attribute__((packed)); struct disk_stats { + u64 nsecs[NR_STAT_GROUPS]; unsigned long sectors[NR_STAT_GROUPS]; unsigned long ios[NR_STAT_GROUPS]; unsigned long merges[NR_STAT_GROUPS]; - unsigned long ticks[NR_STAT_GROUPS]; unsigned long io_ticks; unsigned long time_in_queue; }; @@ -354,6 +354,9 @@ static inline void free_part_stats(struct hd_struct *part) #endif /* CONFIG_SMP */ +#define part_stat_read_msecs(part, which) \ + div_u64(part_stat_read(part, nsecs[which]), NSEC_PER_MSEC) + #define part_stat_read_accum(part, field) \ (part_stat_read(part, field[STAT_READ]) + \ part_stat_read(part, field[STAT_WRITE]) + \ From 6bf4ca7fbc85d80446ac01c0d1d77db4d91a6d84 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 23 Sep 2018 19:15:18 +0200 Subject: [PATCH 27/53] Linux 4.19-rc5 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f03a1e062503..0c90c4354979 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 4 PATCHLEVEL = 19 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Merciless Moray # *DOCUMENTATION* From d2db7773ba864df6b4e19643dfc54838550d8049 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 26 Sep 2018 17:32:37 +0100 Subject: [PATCH 28/53] kvm: arm/arm64: Fix stage2_flush_memslot for 4 level page table So far we have only supported 3 level page table with fixed IPA of 40bits, where PUD is folded. With 4 level page tables, we need to check if the PUD entry is valid or not. Fix stage2_flush_memslot() to do this check, before walking down the table. Acked-by: Christoffer Dall Acked-by: Marc Zyngier Reviewed-by: Eric Auger Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- virt/kvm/arm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index ed162a6c57c5..ee7ce8fa4a12 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -412,7 +412,8 @@ static void stage2_flush_memslot(struct kvm *kvm, pgd = kvm->arch.pgd + stage2_pgd_index(addr); do { next = stage2_pgd_addr_end(addr, end); - stage2_flush_puds(kvm, pgd, addr, next); + if (!stage2_pgd_none(*pgd)) + stage2_flush_puds(kvm, pgd, addr, next); } while (pgd++, addr = next, addr != end); } From 7788a28062aca211979ecd2c334e06e2ef5281cc Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 26 Sep 2018 17:32:38 +0100 Subject: [PATCH 29/53] kvm: arm/arm64: Remove spurious WARN_ON On a 4-level page table pgd entry can be empty, unlike a 3-level page table. Remove the spurious WARN_ON() in stage_get_pud(). Acked-by: Christoffer Dall Acked-by: Marc Zyngier Reviewed-by: Eric Auger Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- virt/kvm/arm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index ee7ce8fa4a12..4a285d760ce0 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1005,7 +1005,7 @@ static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache pud_t *pud; pgd = kvm->arch.pgd + stage2_pgd_index(addr); - if (WARN_ON(stage2_pgd_none(*pgd))) { + if (stage2_pgd_none(*pgd)) { if (!cache) return NULL; pud = mmu_memory_cache_alloc(cache); From 9f98ddd6686cc9469fb73b11ddd403271d65cbdf Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 26 Sep 2018 17:32:39 +0100 Subject: [PATCH 30/53] kvm: arm64: Add helper for loading the stage2 setting for a VM We load the stage2 context of a guest for different operations, including running the guest and tlb maintenance on behalf of the guest. As of now only the vttbr is private to the guest, but this is about to change with IPA per VM. Add a helper to load the stage2 configuration for a VM, which could do the right thing with the future changes. Cc: Christoffer Dall Cc: Marc Zyngier Reviewed-by: Eric Auger Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_hyp.h | 9 +++++++++ arch/arm64/kvm/hyp/switch.c | 2 +- arch/arm64/kvm/hyp/tlb.c | 4 ++-- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 384c34397619..d1bd1e0f14d7 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -155,5 +155,14 @@ void deactivate_traps_vhe_put(void); u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt); void __noreturn __hyp_do_panic(unsigned long, ...); +/* + * Must be called from hyp code running at EL2 with an updated VTTBR + * and interrupts disabled. + */ +static __always_inline void __hyp_text __load_guest_stage2(struct kvm *kvm) +{ + write_sysreg(kvm->arch.vttbr, vttbr_el2); +} + #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index ca46153d7915..9d5ce1a3039a 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -198,7 +198,7 @@ void deactivate_traps_vhe_put(void) static void __hyp_text __activate_vm(struct kvm *kvm) { - write_sysreg(kvm->arch.vttbr, vttbr_el2); + __load_guest_stage2(kvm); } static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c index 131c7772703c..4dbd9c69a96d 100644 --- a/arch/arm64/kvm/hyp/tlb.c +++ b/arch/arm64/kvm/hyp/tlb.c @@ -30,7 +30,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm) * bits. Changing E2H is impossible (goodbye TTBR1_EL2), so * let's flip TGE before executing the TLB operation. */ - write_sysreg(kvm->arch.vttbr, vttbr_el2); + __load_guest_stage2(kvm); val = read_sysreg(hcr_el2); val &= ~HCR_TGE; write_sysreg(val, hcr_el2); @@ -39,7 +39,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm) static void __hyp_text __tlb_switch_to_guest_nvhe(struct kvm *kvm) { - write_sysreg(kvm->arch.vttbr, vttbr_el2); + __load_guest_stage2(kvm); isb(); } From ce00e3cb4fb496683708db6bfce470e5c7710ddc Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 26 Sep 2018 17:32:40 +0100 Subject: [PATCH 31/53] arm64: Add a helper for PARange to physical shift conversion On arm64, ID_AA64MMFR0_EL1.PARange encodes the maximum Physical Address range supported by the CPU. Add a helper to decode this to actual physical shift. If we hit an unallocated value, return the maximum range supported by the kernel. This will be used by KVM to set the VTCR_EL2.T0SZ, as it is about to move its place. Having this helper keeps the code movement cleaner. Cc: Marc Zyngier Cc: James Morse Cc: Christoffer Dall Acked-by: Catalin Marinas Reviewed-by: Eric Auger Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/cpufeature.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 1717ba1db35d..072cc1c970c2 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -530,6 +530,26 @@ void arm64_set_ssbd_mitigation(bool state); static inline void arm64_set_ssbd_mitigation(bool state) {} #endif +static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) +{ + switch (parange) { + case 0: return 32; + case 1: return 36; + case 2: return 40; + case 3: return 42; + case 4: return 44; + case 5: return 48; + case 6: return 52; + /* + * A future PE could use a value unknown to the kernel. + * However, by the "D10.1.4 Principles of the ID scheme + * for fields in ID registers", ARM DDI 0487C.a, any new + * value is guaranteed to be higher than what we know already. + * As a safe limit, we return the limit supported by the kernel. + */ + default: return CONFIG_ARM64_PA_BITS; + } +} #endif /* __ASSEMBLY__ */ #endif From b2df44ffba363bd90274340e0adfd8137f5cd878 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 26 Sep 2018 17:32:41 +0100 Subject: [PATCH 32/53] kvm: arm64: Clean up VTCR_EL2 initialisation Use the new helper for converting the parange to the physical shift. Also, add the missing definitions for the VTCR_EL2 register fields and use them instead of hard coding numbers. Cc: Marc Zyngier Cc: Christoffer Dall Reviewed-by: Eric Auger Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_arm.h | 3 +++ arch/arm64/kvm/hyp/s2-setup.c | 34 ++++++++------------------------ 2 files changed, 11 insertions(+), 26 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index aa45df752a16..5f807b680a5f 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -107,6 +107,7 @@ #define VTCR_EL2_RES1 (1 << 31) #define VTCR_EL2_HD (1 << 22) #define VTCR_EL2_HA (1 << 21) +#define VTCR_EL2_PS_SHIFT TCR_EL2_PS_SHIFT #define VTCR_EL2_PS_MASK TCR_EL2_PS_MASK #define VTCR_EL2_TG0_MASK TCR_TG0_MASK #define VTCR_EL2_TG0_4K TCR_TG0_4K @@ -127,6 +128,8 @@ #define VTCR_EL2_VS_8BIT (0 << VTCR_EL2_VS_SHIFT) #define VTCR_EL2_VS_16BIT (1 << VTCR_EL2_VS_SHIFT) +#define VTCR_EL2_T0SZ(x) TCR_T0SZ(x) + /* * We configure the Stage-2 page tables to always restrict the IPA space to be * 40 bits wide (T0SZ = 24). Systems with a PARange smaller than 40 bits are diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c index 603e1ee83e89..e1ca672e937a 100644 --- a/arch/arm64/kvm/hyp/s2-setup.c +++ b/arch/arm64/kvm/hyp/s2-setup.c @@ -19,45 +19,27 @@ #include #include #include +#include u32 __hyp_text __init_stage2_translation(void) { u64 val = VTCR_EL2_FLAGS; u64 parange; + u32 phys_shift; u64 tmp; /* * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS - * bits in VTCR_EL2. Amusingly, the PARange is 4 bits, while - * PS is only 3. Fortunately, bit 19 is RES0 in VTCR_EL2... + * bits in VTCR_EL2. Amusingly, the PARange is 4 bits, but the + * allocated values are limited to 3bits. */ parange = read_sysreg(id_aa64mmfr0_el1) & 7; if (parange > ID_AA64MMFR0_PARANGE_MAX) parange = ID_AA64MMFR0_PARANGE_MAX; - val |= parange << 16; + val |= parange << VTCR_EL2_PS_SHIFT; /* Compute the actual PARange... */ - switch (parange) { - case 0: - parange = 32; - break; - case 1: - parange = 36; - break; - case 2: - parange = 40; - break; - case 3: - parange = 42; - break; - case 4: - parange = 44; - break; - case 5: - default: - parange = 48; - break; - } + phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange); /* * ... and clamp it to 40 bits, unless we have some braindead @@ -65,7 +47,7 @@ u32 __hyp_text __init_stage2_translation(void) * return that value for the rest of the kernel to decide what * to do. */ - val |= 64 - (parange > 40 ? 40 : parange); + val |= VTCR_EL2_T0SZ(phys_shift > 40 ? 40 : phys_shift); /* * Check the availability of Hardware Access Flag / Dirty Bit @@ -86,5 +68,5 @@ u32 __hyp_text __init_stage2_translation(void) write_sysreg(val, vtcr_el2); - return parange; + return phys_shift; } From 5b6c6742b5350a6fb5c631fb99a6bc046a62739c Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 26 Sep 2018 17:32:42 +0100 Subject: [PATCH 33/53] kvm: arm/arm64: Allow arch specific configurations for VM Allow the arch backends to perform VM specific initialisation. This will be later used to handle IPA size configuration and per-VM VTCR configuration on arm64. Cc: Marc Zyngier Cc: Christoffer Dall Reviewed-by: Eric Auger Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_host.h | 7 +++++++ arch/arm64/include/asm/kvm_host.h | 2 ++ arch/arm64/kvm/reset.c | 7 +++++++ virt/kvm/arm/arm.c | 5 +++-- 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 3ad482d2f1eb..72d46418e1ef 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -354,4 +354,11 @@ static inline void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu) {} struct kvm *kvm_arch_alloc_vm(void); void kvm_arch_free_vm(struct kvm *kvm); +static inline int kvm_arm_config_vm(struct kvm *kvm, unsigned long type) +{ + if (type) + return -EINVAL; + return 0; +} + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3d6d7336f871..b04280ae1be0 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -513,4 +513,6 @@ void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu); struct kvm *kvm_arch_alloc_vm(void); void kvm_arch_free_vm(struct kvm *kvm); +int kvm_arm_config_vm(struct kvm *kvm, unsigned long type); + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index e37c78bbe1ca..b0c07dab5cb3 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -133,3 +133,10 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) /* Reset timer */ return kvm_timer_vcpu_reset(vcpu); } + +int kvm_arm_config_vm(struct kvm *kvm, unsigned long type) +{ + if (type) + return -EINVAL; + return 0; +} diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index c92053bc3f96..327d0fd28380 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -120,8 +120,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int ret, cpu; - if (type) - return -EINVAL; + ret = kvm_arm_config_vm(kvm, type); + if (ret) + return ret; kvm->arch.last_vcpu_ran = alloc_percpu(typeof(*kvm->arch.last_vcpu_ran)); if (!kvm->arch.last_vcpu_ran) From 7665f3a8491b0ed3c6f65c0bc3a5424ea8f87731 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 26 Sep 2018 17:32:43 +0100 Subject: [PATCH 34/53] kvm: arm64: Configure VTCR_EL2 per VM Add support for setting the VTCR_EL2 per VM, rather than hard coding a value at boot time per CPU. This would allow us to tune the stage2 page table parameters per VM in the later changes. We compute the VTCR fields based on the system wide sanitised feature registers, except for the hardware management of Access Flags (VTCR_EL2.HA). It is fine to run a system with a mix of CPUs that may or may not update the page table Access Flags. Since the bit is RES0 on CPUs that don't support it, the bit should be ignored on them. Suggested-by: Marc Zyngier Acked-by: Christoffer Dall Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_arm.h | 3 +- arch/arm64/include/asm/kvm_asm.h | 2 - arch/arm64/include/asm/kvm_host.h | 12 ++++-- arch/arm64/include/asm/kvm_hyp.h | 1 + arch/arm64/kvm/hyp/Makefile | 1 - arch/arm64/kvm/hyp/s2-setup.c | 72 ------------------------------- arch/arm64/kvm/reset.c | 35 +++++++++++++++ 7 files changed, 45 insertions(+), 81 deletions(-) delete mode 100644 arch/arm64/kvm/hyp/s2-setup.c diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 5f807b680a5f..14317b3a1820 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -135,8 +135,7 @@ * 40 bits wide (T0SZ = 24). Systems with a PARange smaller than 40 bits are * not known to exist and will break with this configuration. * - * VTCR_EL2.PS is extracted from ID_AA64MMFR0_EL1.PARange at boot time - * (see hyp-init.S). + * The VTCR_EL2 is configured per VM and is initialised in kvm_arm_config_vm(). * * Note that when using 4K pages, we concatenate two first level page tables * together. With 16K pages, we concatenate 16 first level page tables. diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 102b5a5c47b6..0b53c72e7591 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -72,8 +72,6 @@ extern void __vgic_v3_init_lrs(void); extern u32 __kvm_get_mdcr_el2(void); -extern u32 __init_stage2_translation(void); - /* Home-grown __this_cpu_{ptr,read} variants that always work at HYP */ #define __hyp_this_cpu_ptr(sym) \ ({ \ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index b04280ae1be0..5ecd457bce7d 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -61,11 +61,13 @@ struct kvm_arch { u64 vmid_gen; u32 vmid; - /* 1-level 2nd stage table, protected by kvm->mmu_lock */ + /* stage2 entry level table */ pgd_t *pgd; /* VTTBR value associated with above pgd and vmid */ u64 vttbr; + /* VTCR_EL2 value for this VM */ + u64 vtcr; /* The last vcpu id that ran on each physical CPU */ int __percpu *last_vcpu_ran; @@ -442,10 +444,12 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, static inline void __cpu_init_stage2(void) { - u32 parange = kvm_call_hyp(__init_stage2_translation); + u32 ps; - WARN_ONCE(parange < 40, - "PARange is %d bits, unsupported configuration!", parange); + /* Sanity check for minimum IPA size support */ + ps = id_aa64mmfr0_parange_to_phys_shift(read_sysreg(id_aa64mmfr0_el1) & 0x7); + WARN_ONCE(ps < 40, + "PARange is %d bits, unsupported configuration!", ps); } /* Guest/host FPSIMD coordination helpers */ diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index d1bd1e0f14d7..23aca66767f9 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -161,6 +161,7 @@ void __noreturn __hyp_do_panic(unsigned long, ...); */ static __always_inline void __hyp_text __load_guest_stage2(struct kvm *kvm) { + write_sysreg(kvm->arch.vtcr, vtcr_el2); write_sysreg(kvm->arch.vttbr, vttbr_el2); } diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index 2fabc2dc1966..82d1904328ad 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -19,7 +19,6 @@ obj-$(CONFIG_KVM_ARM_HOST) += switch.o obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o obj-$(CONFIG_KVM_ARM_HOST) += tlb.o obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o -obj-$(CONFIG_KVM_ARM_HOST) += s2-setup.o # KVM code is run at a different exception code with a different map, so # compiler instrumentation that inserts callbacks or checks into the code may diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c deleted file mode 100644 index e1ca672e937a..000000000000 --- a/arch/arm64/kvm/hyp/s2-setup.c +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (C) 2016 - ARM Ltd - * Author: Marc Zyngier - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#include -#include -#include -#include -#include - -u32 __hyp_text __init_stage2_translation(void) -{ - u64 val = VTCR_EL2_FLAGS; - u64 parange; - u32 phys_shift; - u64 tmp; - - /* - * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS - * bits in VTCR_EL2. Amusingly, the PARange is 4 bits, but the - * allocated values are limited to 3bits. - */ - parange = read_sysreg(id_aa64mmfr0_el1) & 7; - if (parange > ID_AA64MMFR0_PARANGE_MAX) - parange = ID_AA64MMFR0_PARANGE_MAX; - val |= parange << VTCR_EL2_PS_SHIFT; - - /* Compute the actual PARange... */ - phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange); - - /* - * ... and clamp it to 40 bits, unless we have some braindead - * HW that implements less than that. In all cases, we'll - * return that value for the rest of the kernel to decide what - * to do. - */ - val |= VTCR_EL2_T0SZ(phys_shift > 40 ? 40 : phys_shift); - - /* - * Check the availability of Hardware Access Flag / Dirty Bit - * Management in ID_AA64MMFR1_EL1 and enable the feature in VTCR_EL2. - */ - tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_HADBS_SHIFT) & 0xf; - if (tmp) - val |= VTCR_EL2_HA; - - /* - * Read the VMIDBits bits from ID_AA64MMFR1_EL1 and set the VS - * bit in VTCR_EL2. - */ - tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_VMIDBITS_SHIFT) & 0xf; - val |= (tmp == ID_AA64MMFR1_VMIDBITS_16) ? - VTCR_EL2_VS_16BIT : - VTCR_EL2_VS_8BIT; - - write_sysreg(val, vtcr_el2); - - return phys_shift; -} diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index b0c07dab5cb3..616120c4176b 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -26,6 +26,7 @@ #include +#include #include #include #include @@ -134,9 +135,43 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) return kvm_timer_vcpu_reset(vcpu); } +/* + * Configure the VTCR_EL2 for this VM. The VTCR value is common + * across all the physical CPUs on the system. We use system wide + * sanitised values to fill in different fields, except for Hardware + * Management of Access Flags. HA Flag is set unconditionally on + * all CPUs, as it is safe to run with or without the feature and + * the bit is RES0 on CPUs that don't support it. + */ int kvm_arm_config_vm(struct kvm *kvm, unsigned long type) { + u64 vtcr = VTCR_EL2_FLAGS; + u32 parange, phys_shift; + if (type) return -EINVAL; + + parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 7; + if (parange > ID_AA64MMFR0_PARANGE_MAX) + parange = ID_AA64MMFR0_PARANGE_MAX; + vtcr |= parange << VTCR_EL2_PS_SHIFT; + + phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange); + if (phys_shift > KVM_PHYS_SHIFT) + phys_shift = KVM_PHYS_SHIFT; + vtcr |= VTCR_EL2_T0SZ(phys_shift); + + /* + * Enable the Hardware Access Flag management, unconditionally + * on all CPUs. The features is RES0 on CPUs without the support + * and must be ignored by the CPUs. + */ + vtcr |= VTCR_EL2_HA; + + /* Set the vmid bits */ + vtcr |= (kvm_get_vmid_bits() == 16) ? + VTCR_EL2_VS_16BIT : + VTCR_EL2_VS_8BIT; + kvm->arch.vtcr = vtcr; return 0; } From e55cac5bf2a9cc86b57a9533d6b9e5005bc19b5c Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 26 Sep 2018 17:32:44 +0100 Subject: [PATCH 35/53] kvm: arm/arm64: Prepare for VM specific stage2 translations Right now the stage2 page table for a VM is hard coded, assuming an IPA of 40bits. As we are about to add support for per VM IPA, prepare the stage2 page table helpers to accept the kvm instance to make the right decision for the VM. No functional changes. Adds stage2_pgd_size(kvm) to replace S2_PGD_SIZE. Also, moves some of the definitions in arm32 to align with the arm64. Also drop the _AC() specifier constants wherever possible. Cc: Christoffer Dall Acked-by: Marc Zyngier Reviewed-by: Eric Auger Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_arm.h | 3 +- arch/arm/include/asm/kvm_mmu.h | 13 +- arch/arm/include/asm/stage2_pgtable.h | 50 +++++--- arch/arm64/include/asm/kvm_mmu.h | 7 +- arch/arm64/include/asm/stage2_pgtable-nopmd.h | 18 +-- arch/arm64/include/asm/stage2_pgtable-nopud.h | 16 +-- arch/arm64/include/asm/stage2_pgtable.h | 58 +++++---- virt/kvm/arm/arm.c | 2 +- virt/kvm/arm/mmu.c | 119 +++++++++--------- virt/kvm/arm/vgic/vgic-kvm-device.c | 2 +- 10 files changed, 156 insertions(+), 132 deletions(-) diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h index 3ab8b3781bfe..c3f1f9b304b7 100644 --- a/arch/arm/include/asm/kvm_arm.h +++ b/arch/arm/include/asm/kvm_arm.h @@ -133,8 +133,7 @@ * space. */ #define KVM_PHYS_SHIFT (40) -#define KVM_PHYS_SIZE (_AC(1, ULL) << KVM_PHYS_SHIFT) -#define KVM_PHYS_MASK (KVM_PHYS_SIZE - _AC(1, ULL)) + #define PTRS_PER_S2_PGD (_AC(1, ULL) << (KVM_PHYS_SHIFT - 30)) /* Virtualization Translation Control Register (VTCR) bits */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 265ea9cf7df7..12ae5fbbcf01 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -35,16 +35,12 @@ addr; \ }) -/* - * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels. - */ -#define KVM_MMU_CACHE_MIN_PAGES 2 - #ifndef __ASSEMBLY__ #include #include #include +#include #include #include #include @@ -52,6 +48,13 @@ /* Ensure compatibility with arm64 */ #define VA_BITS 32 +#define kvm_phys_shift(kvm) KVM_PHYS_SHIFT +#define kvm_phys_size(kvm) (1ULL << kvm_phys_shift(kvm)) +#define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - 1ULL) +#define kvm_vttbr_baddr_mask(kvm) VTTBR_BADDR_MASK + +#define stage2_pgd_size(kvm) (PTRS_PER_S2_PGD * sizeof(pgd_t)) + int create_hyp_mappings(void *from, void *to, pgprot_t prot); int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, void __iomem **kaddr, diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h index 460d616bb2d6..f6a7ea805232 100644 --- a/arch/arm/include/asm/stage2_pgtable.h +++ b/arch/arm/include/asm/stage2_pgtable.h @@ -19,43 +19,53 @@ #ifndef __ARM_S2_PGTABLE_H_ #define __ARM_S2_PGTABLE_H_ -#define stage2_pgd_none(pgd) pgd_none(pgd) -#define stage2_pgd_clear(pgd) pgd_clear(pgd) -#define stage2_pgd_present(pgd) pgd_present(pgd) -#define stage2_pgd_populate(pgd, pud) pgd_populate(NULL, pgd, pud) -#define stage2_pud_offset(pgd, address) pud_offset(pgd, address) -#define stage2_pud_free(pud) pud_free(NULL, pud) +/* + * kvm_mmu_cache_min_pages() is the number of pages required + * to install a stage-2 translation. We pre-allocate the entry + * level table at VM creation. Since we have a 3 level page-table, + * we need only two pages to add a new mapping. + */ +#define kvm_mmu_cache_min_pages(kvm) 2 -#define stage2_pud_none(pud) pud_none(pud) -#define stage2_pud_clear(pud) pud_clear(pud) -#define stage2_pud_present(pud) pud_present(pud) -#define stage2_pud_populate(pud, pmd) pud_populate(NULL, pud, pmd) -#define stage2_pmd_offset(pud, address) pmd_offset(pud, address) -#define stage2_pmd_free(pmd) pmd_free(NULL, pmd) +#define stage2_pgd_none(kvm, pgd) pgd_none(pgd) +#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd) +#define stage2_pgd_present(kvm, pgd) pgd_present(pgd) +#define stage2_pgd_populate(kvm, pgd, pud) pgd_populate(NULL, pgd, pud) +#define stage2_pud_offset(kvm, pgd, address) pud_offset(pgd, address) +#define stage2_pud_free(kvm, pud) pud_free(NULL, pud) -#define stage2_pud_huge(pud) pud_huge(pud) +#define stage2_pud_none(kvm, pud) pud_none(pud) +#define stage2_pud_clear(kvm, pud) pud_clear(pud) +#define stage2_pud_present(kvm, pud) pud_present(pud) +#define stage2_pud_populate(kvm, pud, pmd) pud_populate(NULL, pud, pmd) +#define stage2_pmd_offset(kvm, pud, address) pmd_offset(pud, address) +#define stage2_pmd_free(kvm, pmd) pmd_free(NULL, pmd) + +#define stage2_pud_huge(kvm, pud) pud_huge(pud) /* Open coded p*d_addr_end that can deal with 64bit addresses */ -static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end) +static inline phys_addr_t +stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { phys_addr_t boundary = (addr + PGDIR_SIZE) & PGDIR_MASK; return (boundary - 1 < end - 1) ? boundary : end; } -#define stage2_pud_addr_end(addr, end) (end) +#define stage2_pud_addr_end(kvm, addr, end) (end) -static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end) +static inline phys_addr_t +stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { phys_addr_t boundary = (addr + PMD_SIZE) & PMD_MASK; return (boundary - 1 < end - 1) ? boundary : end; } -#define stage2_pgd_index(addr) pgd_index(addr) +#define stage2_pgd_index(kvm, addr) pgd_index(addr) -#define stage2_pte_table_empty(ptep) kvm_page_empty(ptep) -#define stage2_pmd_table_empty(pmdp) kvm_page_empty(pmdp) -#define stage2_pud_table_empty(pudp) false +#define stage2_pte_table_empty(kvm, ptep) kvm_page_empty(ptep) +#define stage2_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) +#define stage2_pud_table_empty(kvm, pudp) false #endif /* __ARM_S2_PGTABLE_H_ */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index d6fff7de5539..3a032066e52c 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -141,8 +141,11 @@ static inline unsigned long __kern_hyp_va(unsigned long v) * We currently only support a 40bit IPA. */ #define KVM_PHYS_SHIFT (40) -#define KVM_PHYS_SIZE (1UL << KVM_PHYS_SHIFT) -#define KVM_PHYS_MASK (KVM_PHYS_SIZE - 1UL) + +#define kvm_phys_shift(kvm) KVM_PHYS_SHIFT +#define kvm_phys_size(kvm) (_AC(1, ULL) << kvm_phys_shift(kvm)) +#define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - _AC(1, ULL)) +#define kvm_vttbr_baddr_mask(kvm) VTTBR_BADDR_MASK #include diff --git a/arch/arm64/include/asm/stage2_pgtable-nopmd.h b/arch/arm64/include/asm/stage2_pgtable-nopmd.h index 2656a0fd05a6..0280dedbf75f 100644 --- a/arch/arm64/include/asm/stage2_pgtable-nopmd.h +++ b/arch/arm64/include/asm/stage2_pgtable-nopmd.h @@ -26,17 +26,17 @@ #define S2_PMD_SIZE (1UL << S2_PMD_SHIFT) #define S2_PMD_MASK (~(S2_PMD_SIZE-1)) -#define stage2_pud_none(pud) (0) -#define stage2_pud_present(pud) (1) -#define stage2_pud_clear(pud) do { } while (0) -#define stage2_pud_populate(pud, pmd) do { } while (0) -#define stage2_pmd_offset(pud, address) ((pmd_t *)(pud)) +#define stage2_pud_none(kvm, pud) (0) +#define stage2_pud_present(kvm, pud) (1) +#define stage2_pud_clear(kvm, pud) do { } while (0) +#define stage2_pud_populate(kvm, pud, pmd) do { } while (0) +#define stage2_pmd_offset(kvm, pud, address) ((pmd_t *)(pud)) -#define stage2_pmd_free(pmd) do { } while (0) +#define stage2_pmd_free(kvm, pmd) do { } while (0) -#define stage2_pmd_addr_end(addr, end) (end) +#define stage2_pmd_addr_end(kvm, addr, end) (end) -#define stage2_pud_huge(pud) (0) -#define stage2_pmd_table_empty(pmdp) (0) +#define stage2_pud_huge(kvm, pud) (0) +#define stage2_pmd_table_empty(kvm, pmdp) (0) #endif diff --git a/arch/arm64/include/asm/stage2_pgtable-nopud.h b/arch/arm64/include/asm/stage2_pgtable-nopud.h index 5ee87b54ebf3..cd6304e203be 100644 --- a/arch/arm64/include/asm/stage2_pgtable-nopud.h +++ b/arch/arm64/include/asm/stage2_pgtable-nopud.h @@ -24,16 +24,16 @@ #define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT) #define S2_PUD_MASK (~(S2_PUD_SIZE-1)) -#define stage2_pgd_none(pgd) (0) -#define stage2_pgd_present(pgd) (1) -#define stage2_pgd_clear(pgd) do { } while (0) -#define stage2_pgd_populate(pgd, pud) do { } while (0) +#define stage2_pgd_none(kvm, pgd) (0) +#define stage2_pgd_present(kvm, pgd) (1) +#define stage2_pgd_clear(kvm, pgd) do { } while (0) +#define stage2_pgd_populate(kvm, pgd, pud) do { } while (0) -#define stage2_pud_offset(pgd, address) ((pud_t *)(pgd)) +#define stage2_pud_offset(kvm, pgd, address) ((pud_t *)(pgd)) -#define stage2_pud_free(x) do { } while (0) +#define stage2_pud_free(kvm, x) do { } while (0) -#define stage2_pud_addr_end(addr, end) (end) -#define stage2_pud_table_empty(pmdp) (0) +#define stage2_pud_addr_end(kvm, addr, end) (end) +#define stage2_pud_table_empty(kvm, pmdp) (0) #endif diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index 8b68099348e5..11891612be14 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -55,7 +55,7 @@ /* S2_PGDIR_SHIFT is the size mapped by top-level stage2 entry */ #define S2_PGDIR_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - STAGE2_PGTABLE_LEVELS) -#define S2_PGDIR_SIZE (_AC(1, UL) << S2_PGDIR_SHIFT) +#define S2_PGDIR_SIZE (1UL << S2_PGDIR_SHIFT) #define S2_PGDIR_MASK (~(S2_PGDIR_SIZE - 1)) /* @@ -65,28 +65,30 @@ #define PTRS_PER_S2_PGD (1 << (KVM_PHYS_SHIFT - S2_PGDIR_SHIFT)) /* - * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation - * levels in addition to the PGD. + * kvm_mmmu_cache_min_pages() is the number of pages required to install + * a stage-2 translation. We pre-allocate the entry level page table at + * the VM creation. */ -#define KVM_MMU_CACHE_MIN_PAGES (STAGE2_PGTABLE_LEVELS - 1) +#define kvm_mmu_cache_min_pages(kvm) (STAGE2_PGTABLE_LEVELS - 1) #if STAGE2_PGTABLE_LEVELS > 3 #define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1) -#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT) +#define S2_PUD_SIZE (1UL << S2_PUD_SHIFT) #define S2_PUD_MASK (~(S2_PUD_SIZE - 1)) -#define stage2_pgd_none(pgd) pgd_none(pgd) -#define stage2_pgd_clear(pgd) pgd_clear(pgd) -#define stage2_pgd_present(pgd) pgd_present(pgd) -#define stage2_pgd_populate(pgd, pud) pgd_populate(NULL, pgd, pud) -#define stage2_pud_offset(pgd, address) pud_offset(pgd, address) -#define stage2_pud_free(pud) pud_free(NULL, pud) +#define stage2_pgd_none(kvm, pgd) pgd_none(pgd) +#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd) +#define stage2_pgd_present(kvm, pgd) pgd_present(pgd) +#define stage2_pgd_populate(kvm, pgd, pud) pgd_populate(NULL, pgd, pud) +#define stage2_pud_offset(kvm, pgd, address) pud_offset(pgd, address) +#define stage2_pud_free(kvm, pud) pud_free(NULL, pud) -#define stage2_pud_table_empty(pudp) kvm_page_empty(pudp) +#define stage2_pud_table_empty(kvm, pudp) kvm_page_empty(pudp) -static inline phys_addr_t stage2_pud_addr_end(phys_addr_t addr, phys_addr_t end) +static inline phys_addr_t +stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK; @@ -99,20 +101,21 @@ static inline phys_addr_t stage2_pud_addr_end(phys_addr_t addr, phys_addr_t end) #if STAGE2_PGTABLE_LEVELS > 2 #define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2) -#define S2_PMD_SIZE (_AC(1, UL) << S2_PMD_SHIFT) +#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT) #define S2_PMD_MASK (~(S2_PMD_SIZE - 1)) -#define stage2_pud_none(pud) pud_none(pud) -#define stage2_pud_clear(pud) pud_clear(pud) -#define stage2_pud_present(pud) pud_present(pud) -#define stage2_pud_populate(pud, pmd) pud_populate(NULL, pud, pmd) -#define stage2_pmd_offset(pud, address) pmd_offset(pud, address) -#define stage2_pmd_free(pmd) pmd_free(NULL, pmd) +#define stage2_pud_none(kvm, pud) pud_none(pud) +#define stage2_pud_clear(kvm, pud) pud_clear(pud) +#define stage2_pud_present(kvm, pud) pud_present(pud) +#define stage2_pud_populate(kvm, pud, pmd) pud_populate(NULL, pud, pmd) +#define stage2_pmd_offset(kvm, pud, address) pmd_offset(pud, address) +#define stage2_pmd_free(kvm, pmd) pmd_free(NULL, pmd) -#define stage2_pud_huge(pud) pud_huge(pud) -#define stage2_pmd_table_empty(pmdp) kvm_page_empty(pmdp) +#define stage2_pud_huge(kvm, pud) pud_huge(pud) +#define stage2_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) -static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end) +static inline phys_addr_t +stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK; @@ -121,7 +124,7 @@ static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end) #endif /* STAGE2_PGTABLE_LEVELS > 2 */ -#define stage2_pte_table_empty(ptep) kvm_page_empty(ptep) +#define stage2_pte_table_empty(kvm, ptep) kvm_page_empty(ptep) #if STAGE2_PGTABLE_LEVELS == 2 #include @@ -129,10 +132,13 @@ static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end) #include #endif +#define stage2_pgd_size(kvm) (PTRS_PER_S2_PGD * sizeof(pgd_t)) -#define stage2_pgd_index(addr) (((addr) >> S2_PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)) +#define stage2_pgd_index(kvm, addr) \ + (((addr) >> S2_PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)) -static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end) +static inline phys_addr_t +stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { phys_addr_t boundary = (addr + S2_PGDIR_SIZE) & S2_PGDIR_MASK; diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 327d0fd28380..43e716bc3f08 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -545,7 +545,7 @@ static void update_vttbr(struct kvm *kvm) /* update vttbr to be used with the new vmid */ pgd_phys = virt_to_phys(kvm->arch.pgd); - BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK); + BUG_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)); vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits); kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid; diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 4a285d760ce0..7e477b3cae5b 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -45,7 +45,6 @@ static phys_addr_t hyp_idmap_vector; static unsigned long io_map_base; -#define S2_PGD_SIZE (PTRS_PER_S2_PGD * sizeof(pgd_t)) #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) #define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0) @@ -150,20 +149,20 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) { - pud_t *pud_table __maybe_unused = stage2_pud_offset(pgd, 0UL); - stage2_pgd_clear(pgd); + pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL); + stage2_pgd_clear(kvm, pgd); kvm_tlb_flush_vmid_ipa(kvm, addr); - stage2_pud_free(pud_table); + stage2_pud_free(kvm, pud_table); put_page(virt_to_page(pgd)); } static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) { - pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(pud, 0); - VM_BUG_ON(stage2_pud_huge(*pud)); - stage2_pud_clear(pud); + pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0); + VM_BUG_ON(stage2_pud_huge(kvm, *pud)); + stage2_pud_clear(kvm, pud); kvm_tlb_flush_vmid_ipa(kvm, addr); - stage2_pmd_free(pmd_table); + stage2_pmd_free(kvm, pmd_table); put_page(virt_to_page(pud)); } @@ -252,7 +251,7 @@ static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd, } } while (pte++, addr += PAGE_SIZE, addr != end); - if (stage2_pte_table_empty(start_pte)) + if (stage2_pte_table_empty(kvm, start_pte)) clear_stage2_pmd_entry(kvm, pmd, start_addr); } @@ -262,9 +261,9 @@ static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud, phys_addr_t next, start_addr = addr; pmd_t *pmd, *start_pmd; - start_pmd = pmd = stage2_pmd_offset(pud, addr); + start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr); do { - next = stage2_pmd_addr_end(addr, end); + next = stage2_pmd_addr_end(kvm, addr, end); if (!pmd_none(*pmd)) { if (pmd_thp_or_huge(*pmd)) { pmd_t old_pmd = *pmd; @@ -281,7 +280,7 @@ static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud, } } while (pmd++, addr = next, addr != end); - if (stage2_pmd_table_empty(start_pmd)) + if (stage2_pmd_table_empty(kvm, start_pmd)) clear_stage2_pud_entry(kvm, pud, start_addr); } @@ -291,14 +290,14 @@ static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, phys_addr_t next, start_addr = addr; pud_t *pud, *start_pud; - start_pud = pud = stage2_pud_offset(pgd, addr); + start_pud = pud = stage2_pud_offset(kvm, pgd, addr); do { - next = stage2_pud_addr_end(addr, end); - if (!stage2_pud_none(*pud)) { - if (stage2_pud_huge(*pud)) { + next = stage2_pud_addr_end(kvm, addr, end); + if (!stage2_pud_none(kvm, *pud)) { + if (stage2_pud_huge(kvm, *pud)) { pud_t old_pud = *pud; - stage2_pud_clear(pud); + stage2_pud_clear(kvm, pud); kvm_tlb_flush_vmid_ipa(kvm, addr); kvm_flush_dcache_pud(old_pud); put_page(virt_to_page(pud)); @@ -308,7 +307,7 @@ static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, } } while (pud++, addr = next, addr != end); - if (stage2_pud_table_empty(start_pud)) + if (stage2_pud_table_empty(kvm, start_pud)) clear_stage2_pgd_entry(kvm, pgd, start_addr); } @@ -332,7 +331,7 @@ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) assert_spin_locked(&kvm->mmu_lock); WARN_ON(size & ~PAGE_MASK); - pgd = kvm->arch.pgd + stage2_pgd_index(addr); + pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); do { /* * Make sure the page table is still active, as another thread @@ -341,8 +340,8 @@ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) */ if (!READ_ONCE(kvm->arch.pgd)) break; - next = stage2_pgd_addr_end(addr, end); - if (!stage2_pgd_none(*pgd)) + next = stage2_pgd_addr_end(kvm, addr, end); + if (!stage2_pgd_none(kvm, *pgd)) unmap_stage2_puds(kvm, pgd, addr, next); /* * If the range is too large, release the kvm->mmu_lock @@ -371,9 +370,9 @@ static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud, pmd_t *pmd; phys_addr_t next; - pmd = stage2_pmd_offset(pud, addr); + pmd = stage2_pmd_offset(kvm, pud, addr); do { - next = stage2_pmd_addr_end(addr, end); + next = stage2_pmd_addr_end(kvm, addr, end); if (!pmd_none(*pmd)) { if (pmd_thp_or_huge(*pmd)) kvm_flush_dcache_pmd(*pmd); @@ -389,11 +388,11 @@ static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, pud_t *pud; phys_addr_t next; - pud = stage2_pud_offset(pgd, addr); + pud = stage2_pud_offset(kvm, pgd, addr); do { - next = stage2_pud_addr_end(addr, end); - if (!stage2_pud_none(*pud)) { - if (stage2_pud_huge(*pud)) + next = stage2_pud_addr_end(kvm, addr, end); + if (!stage2_pud_none(kvm, *pud)) { + if (stage2_pud_huge(kvm, *pud)) kvm_flush_dcache_pud(*pud); else stage2_flush_pmds(kvm, pud, addr, next); @@ -409,10 +408,10 @@ static void stage2_flush_memslot(struct kvm *kvm, phys_addr_t next; pgd_t *pgd; - pgd = kvm->arch.pgd + stage2_pgd_index(addr); + pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); do { - next = stage2_pgd_addr_end(addr, end); - if (!stage2_pgd_none(*pgd)) + next = stage2_pgd_addr_end(kvm, addr, end); + if (!stage2_pgd_none(kvm, *pgd)) stage2_flush_puds(kvm, pgd, addr, next); } while (pgd++, addr = next, addr != end); } @@ -898,7 +897,7 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm) } /* Allocate the HW PGD, making sure that each page gets its own refcount */ - pgd = alloc_pages_exact(S2_PGD_SIZE, GFP_KERNEL | __GFP_ZERO); + pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO); if (!pgd) return -ENOMEM; @@ -987,7 +986,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm) spin_lock(&kvm->mmu_lock); if (kvm->arch.pgd) { - unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); + unmap_stage2_range(kvm, 0, kvm_phys_size(kvm)); pgd = READ_ONCE(kvm->arch.pgd); kvm->arch.pgd = NULL; } @@ -995,7 +994,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm) /* Free the HW pgd, one page at a time */ if (pgd) - free_pages_exact(pgd, S2_PGD_SIZE); + free_pages_exact(pgd, stage2_pgd_size(kvm)); } static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, @@ -1004,16 +1003,16 @@ static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache pgd_t *pgd; pud_t *pud; - pgd = kvm->arch.pgd + stage2_pgd_index(addr); - if (stage2_pgd_none(*pgd)) { + pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); + if (stage2_pgd_none(kvm, *pgd)) { if (!cache) return NULL; pud = mmu_memory_cache_alloc(cache); - stage2_pgd_populate(pgd, pud); + stage2_pgd_populate(kvm, pgd, pud); get_page(virt_to_page(pgd)); } - return stage2_pud_offset(pgd, addr); + return stage2_pud_offset(kvm, pgd, addr); } static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, @@ -1026,15 +1025,15 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache if (!pud) return NULL; - if (stage2_pud_none(*pud)) { + if (stage2_pud_none(kvm, *pud)) { if (!cache) return NULL; pmd = mmu_memory_cache_alloc(cache); - stage2_pud_populate(pud, pmd); + stage2_pud_populate(kvm, pud, pmd); get_page(virt_to_page(pud)); } - return stage2_pmd_offset(pud, addr); + return stage2_pmd_offset(kvm, pud, addr); } static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache @@ -1208,8 +1207,9 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, if (writable) pte = kvm_s2pte_mkwrite(pte); - ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES, - KVM_NR_MEM_OBJS); + ret = mmu_topup_memory_cache(&cache, + kvm_mmu_cache_min_pages(kvm), + KVM_NR_MEM_OBJS); if (ret) goto out; spin_lock(&kvm->mmu_lock); @@ -1297,19 +1297,21 @@ static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) /** * stage2_wp_pmds - write protect PUD range + * kvm: kvm instance for the VM * @pud: pointer to pud entry * @addr: range start address * @end: range end address */ -static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) +static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud, + phys_addr_t addr, phys_addr_t end) { pmd_t *pmd; phys_addr_t next; - pmd = stage2_pmd_offset(pud, addr); + pmd = stage2_pmd_offset(kvm, pud, addr); do { - next = stage2_pmd_addr_end(addr, end); + next = stage2_pmd_addr_end(kvm, addr, end); if (!pmd_none(*pmd)) { if (pmd_thp_or_huge(*pmd)) { if (!kvm_s2pmd_readonly(pmd)) @@ -1329,18 +1331,19 @@ static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) * * Process PUD entries, for a huge PUD we cause a panic. */ -static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) +static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, + phys_addr_t addr, phys_addr_t end) { pud_t *pud; phys_addr_t next; - pud = stage2_pud_offset(pgd, addr); + pud = stage2_pud_offset(kvm, pgd, addr); do { - next = stage2_pud_addr_end(addr, end); - if (!stage2_pud_none(*pud)) { + next = stage2_pud_addr_end(kvm, addr, end); + if (!stage2_pud_none(kvm, *pud)) { /* TODO:PUD not supported, revisit later if supported */ - BUG_ON(stage2_pud_huge(*pud)); - stage2_wp_pmds(pud, addr, next); + BUG_ON(stage2_pud_huge(kvm, *pud)); + stage2_wp_pmds(kvm, pud, addr, next); } } while (pud++, addr = next, addr != end); } @@ -1356,7 +1359,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) pgd_t *pgd; phys_addr_t next; - pgd = kvm->arch.pgd + stage2_pgd_index(addr); + pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); do { /* * Release kvm_mmu_lock periodically if the memory region is @@ -1370,9 +1373,9 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) cond_resched_lock(&kvm->mmu_lock); if (!READ_ONCE(kvm->arch.pgd)) break; - next = stage2_pgd_addr_end(addr, end); - if (stage2_pgd_present(*pgd)) - stage2_wp_puds(pgd, addr, next); + next = stage2_pgd_addr_end(kvm, addr, end); + if (stage2_pgd_present(kvm, *pgd)) + stage2_wp_puds(kvm, pgd, addr, next); } while (pgd++, addr = next, addr != end); } @@ -1521,7 +1524,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, up_read(¤t->mm->mmap_sem); /* We need minimum second+third level pages */ - ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES, + ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm), KVM_NR_MEM_OBJS); if (ret) return ret; @@ -1764,7 +1767,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) } /* Userspace should not be able to register out-of-bounds IPAs */ - VM_BUG_ON(fault_ipa >= KVM_PHYS_SIZE); + VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm)); if (fault_status == FSC_ACCESS) { handle_access_fault(vcpu, fault_ipa); @@ -2063,7 +2066,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * space addressable by the KVM guest IPA space. */ if (memslot->base_gfn + memslot->npages >= - (KVM_PHYS_SIZE >> PAGE_SHIFT)) + (kvm_phys_size(kvm) >> PAGE_SHIFT)) return -EFAULT; down_read(¤t->mm->mmap_sem); diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c index 6ada2432e37c..114dce9f4bf5 100644 --- a/virt/kvm/arm/vgic/vgic-kvm-device.c +++ b/virt/kvm/arm/vgic/vgic-kvm-device.c @@ -25,7 +25,7 @@ int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, phys_addr_t addr, phys_addr_t alignment) { - if (addr & ~KVM_PHYS_MASK) + if (addr & ~kvm_phys_mask(kvm)) return -E2BIG; if (!IS_ALIGNED(addr, alignment)) From 865b30cdd9b286820aef44393bafc4c0ccee53d0 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 26 Sep 2018 17:32:45 +0100 Subject: [PATCH 36/53] kvm: arm64: Prepare for dynamic stage2 page table layout Our stage2 page table helpers are statically defined based on the fixed IPA of 40bits and the host page size. As we are about to add support for configurable IPA size for VMs, we need to make the page table checks for each VM. This patch prepares the stage2 helpers to make the transition to a VM dependent table layout easier. Instead of statically defining the table helpers based on the page table levels, we now check the page table levels in the helpers to do the right thing. In effect, it simply converts the macros to static inline functions. Cc: Eric Auger Cc: Marc Zyngier Cc: Christoffer Dall Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_mmu.h | 12 +- arch/arm64/include/asm/stage2_pgtable-nopmd.h | 42 ----- arch/arm64/include/asm/stage2_pgtable-nopud.h | 39 ---- arch/arm64/include/asm/stage2_pgtable.h | 169 ++++++++++++++---- 4 files changed, 137 insertions(+), 125 deletions(-) delete mode 100644 arch/arm64/include/asm/stage2_pgtable-nopmd.h delete mode 100644 arch/arm64/include/asm/stage2_pgtable-nopud.h diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 3a032066e52c..7342d2c51773 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -147,6 +147,12 @@ static inline unsigned long __kern_hyp_va(unsigned long v) #define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - _AC(1, ULL)) #define kvm_vttbr_baddr_mask(kvm) VTTBR_BADDR_MASK +static inline bool kvm_page_empty(void *ptr) +{ + struct page *ptr_page = virt_to_page(ptr); + return page_count(ptr_page) == 1; +} + #include int create_hyp_mappings(void *from, void *to, pgprot_t prot); @@ -241,12 +247,6 @@ static inline bool kvm_s2pmd_exec(pmd_t *pmdp) return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN); } -static inline bool kvm_page_empty(void *ptr) -{ - struct page *ptr_page = virt_to_page(ptr); - return page_count(ptr_page) == 1; -} - #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) #ifdef __PAGETABLE_PMD_FOLDED diff --git a/arch/arm64/include/asm/stage2_pgtable-nopmd.h b/arch/arm64/include/asm/stage2_pgtable-nopmd.h deleted file mode 100644 index 0280dedbf75f..000000000000 --- a/arch/arm64/include/asm/stage2_pgtable-nopmd.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (C) 2016 - ARM Ltd - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#ifndef __ARM64_S2_PGTABLE_NOPMD_H_ -#define __ARM64_S2_PGTABLE_NOPMD_H_ - -#include - -#define __S2_PGTABLE_PMD_FOLDED - -#define S2_PMD_SHIFT S2_PUD_SHIFT -#define S2_PTRS_PER_PMD 1 -#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT) -#define S2_PMD_MASK (~(S2_PMD_SIZE-1)) - -#define stage2_pud_none(kvm, pud) (0) -#define stage2_pud_present(kvm, pud) (1) -#define stage2_pud_clear(kvm, pud) do { } while (0) -#define stage2_pud_populate(kvm, pud, pmd) do { } while (0) -#define stage2_pmd_offset(kvm, pud, address) ((pmd_t *)(pud)) - -#define stage2_pmd_free(kvm, pmd) do { } while (0) - -#define stage2_pmd_addr_end(kvm, addr, end) (end) - -#define stage2_pud_huge(kvm, pud) (0) -#define stage2_pmd_table_empty(kvm, pmdp) (0) - -#endif diff --git a/arch/arm64/include/asm/stage2_pgtable-nopud.h b/arch/arm64/include/asm/stage2_pgtable-nopud.h deleted file mode 100644 index cd6304e203be..000000000000 --- a/arch/arm64/include/asm/stage2_pgtable-nopud.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (C) 2016 - ARM Ltd - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#ifndef __ARM64_S2_PGTABLE_NOPUD_H_ -#define __ARM64_S2_PGTABLE_NOPUD_H_ - -#define __S2_PGTABLE_PUD_FOLDED - -#define S2_PUD_SHIFT S2_PGDIR_SHIFT -#define S2_PTRS_PER_PUD 1 -#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT) -#define S2_PUD_MASK (~(S2_PUD_SIZE-1)) - -#define stage2_pgd_none(kvm, pgd) (0) -#define stage2_pgd_present(kvm, pgd) (1) -#define stage2_pgd_clear(kvm, pgd) do { } while (0) -#define stage2_pgd_populate(kvm, pgd, pud) do { } while (0) - -#define stage2_pud_offset(kvm, pgd, address) ((pud_t *)(pgd)) - -#define stage2_pud_free(kvm, x) do { } while (0) - -#define stage2_pud_addr_end(kvm, addr, end) (end) -#define stage2_pud_table_empty(kvm, pmdp) (0) - -#endif diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index 11891612be14..384f9e982cba 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -19,6 +19,7 @@ #ifndef __ARM64_S2_PGTABLE_H_ #define __ARM64_S2_PGTABLE_H_ +#include #include /* @@ -71,71 +72,163 @@ */ #define kvm_mmu_cache_min_pages(kvm) (STAGE2_PGTABLE_LEVELS - 1) - -#if STAGE2_PGTABLE_LEVELS > 3 - +/* Stage2 PUD definitions when the level is present */ +#define STAGE2_PGTABLE_HAS_PUD (STAGE2_PGTABLE_LEVELS > 3) #define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1) #define S2_PUD_SIZE (1UL << S2_PUD_SHIFT) #define S2_PUD_MASK (~(S2_PUD_SIZE - 1)) -#define stage2_pgd_none(kvm, pgd) pgd_none(pgd) -#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd) -#define stage2_pgd_present(kvm, pgd) pgd_present(pgd) -#define stage2_pgd_populate(kvm, pgd, pud) pgd_populate(NULL, pgd, pud) -#define stage2_pud_offset(kvm, pgd, address) pud_offset(pgd, address) -#define stage2_pud_free(kvm, pud) pud_free(NULL, pud) +static inline bool stage2_pgd_none(struct kvm *kvm, pgd_t pgd) +{ + if (STAGE2_PGTABLE_HAS_PUD) + return pgd_none(pgd); + else + return 0; +} -#define stage2_pud_table_empty(kvm, pudp) kvm_page_empty(pudp) +static inline void stage2_pgd_clear(struct kvm *kvm, pgd_t *pgdp) +{ + if (STAGE2_PGTABLE_HAS_PUD) + pgd_clear(pgdp); +} + +static inline bool stage2_pgd_present(struct kvm *kvm, pgd_t pgd) +{ + if (STAGE2_PGTABLE_HAS_PUD) + return pgd_present(pgd); + else + return 1; +} + +static inline void stage2_pgd_populate(struct kvm *kvm, pgd_t *pgd, pud_t *pud) +{ + if (STAGE2_PGTABLE_HAS_PUD) + pgd_populate(NULL, pgd, pud); +} + +static inline pud_t *stage2_pud_offset(struct kvm *kvm, + pgd_t *pgd, unsigned long address) +{ + if (STAGE2_PGTABLE_HAS_PUD) + return pud_offset(pgd, address); + else + return (pud_t *)pgd; +} + +static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud) +{ + if (STAGE2_PGTABLE_HAS_PUD) + pud_free(NULL, pud); +} + +static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp) +{ + if (STAGE2_PGTABLE_HAS_PUD) + return kvm_page_empty(pudp); + else + return false; +} static inline phys_addr_t stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { - phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK; + if (STAGE2_PGTABLE_HAS_PUD) { + phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK; - return (boundary - 1 < end - 1) ? boundary : end; + return (boundary - 1 < end - 1) ? boundary : end; + } else { + return end; + } } -#endif /* STAGE2_PGTABLE_LEVELS > 3 */ - - -#if STAGE2_PGTABLE_LEVELS > 2 - +/* Stage2 PMD definitions when the level is present */ +#define STAGE2_PGTABLE_HAS_PMD (STAGE2_PGTABLE_LEVELS > 2) #define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2) #define S2_PMD_SIZE (1UL << S2_PMD_SHIFT) #define S2_PMD_MASK (~(S2_PMD_SIZE - 1)) -#define stage2_pud_none(kvm, pud) pud_none(pud) -#define stage2_pud_clear(kvm, pud) pud_clear(pud) -#define stage2_pud_present(kvm, pud) pud_present(pud) -#define stage2_pud_populate(kvm, pud, pmd) pud_populate(NULL, pud, pmd) -#define stage2_pmd_offset(kvm, pud, address) pmd_offset(pud, address) -#define stage2_pmd_free(kvm, pmd) pmd_free(NULL, pmd) +static inline bool stage2_pud_none(struct kvm *kvm, pud_t pud) +{ + if (STAGE2_PGTABLE_HAS_PMD) + return pud_none(pud); + else + return 0; +} -#define stage2_pud_huge(kvm, pud) pud_huge(pud) -#define stage2_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) +static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pud) +{ + if (STAGE2_PGTABLE_HAS_PMD) + pud_clear(pud); +} + +static inline bool stage2_pud_present(struct kvm *kvm, pud_t pud) +{ + if (STAGE2_PGTABLE_HAS_PMD) + return pud_present(pud); + else + return 1; +} + +static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pud, pmd_t *pmd) +{ + if (STAGE2_PGTABLE_HAS_PMD) + pud_populate(NULL, pud, pmd); +} + +static inline pmd_t *stage2_pmd_offset(struct kvm *kvm, + pud_t *pud, unsigned long address) +{ + if (STAGE2_PGTABLE_HAS_PMD) + return pmd_offset(pud, address); + else + return (pmd_t *)pud; +} + +static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd) +{ + if (STAGE2_PGTABLE_HAS_PMD) + pmd_free(NULL, pmd); +} + +static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud) +{ + if (STAGE2_PGTABLE_HAS_PMD) + return pud_huge(pud); + else + return 0; +} + +static inline bool stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp) +{ + if (STAGE2_PGTABLE_HAS_PMD) + return kvm_page_empty(pmdp); + else + return 0; +} static inline phys_addr_t stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { - phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK; + if (STAGE2_PGTABLE_HAS_PMD) { + phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK; - return (boundary - 1 < end - 1) ? boundary : end; + return (boundary - 1 < end - 1) ? boundary : end; + } else { + return end; + } } -#endif /* STAGE2_PGTABLE_LEVELS > 2 */ - -#define stage2_pte_table_empty(kvm, ptep) kvm_page_empty(ptep) - -#if STAGE2_PGTABLE_LEVELS == 2 -#include -#elif STAGE2_PGTABLE_LEVELS == 3 -#include -#endif +static inline bool stage2_pte_table_empty(struct kvm *kvm, pte_t *ptep) +{ + return kvm_page_empty(ptep); +} #define stage2_pgd_size(kvm) (PTRS_PER_S2_PGD * sizeof(pgd_t)) -#define stage2_pgd_index(kvm, addr) \ - (((addr) >> S2_PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)) +static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr) +{ + return (((addr) >> S2_PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)); +} static inline phys_addr_t stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) From 61fa5a867b6521b4c55865b88bf70c93078d0cf8 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 26 Sep 2018 17:32:46 +0100 Subject: [PATCH 37/53] kvm: arm64: Make stage2 page table layout dynamic Switch to dynamic stage2 page table layout based on the given VM. So far we had a common stage2 table layout determined at compile time. Make decision based on the VM instance depending on the IPA limit for the VM. Adds helpers to compute the stage2 parameters based on the guest's IPA and uses them to make the decisions. The IPA limit is still fixed to 40bits and the build time check to ensure the stage2 doesn't exceed the host kernels page table levels is retained. Also make sure that we use the pud/pmd level helpers from the host only when they are not folded. Cc: Christoffer Dall Cc: Marc Zyngier Reviewed-by: Eric Auger Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/stage2_pgtable.h | 84 +++++++++++++++---------- 1 file changed, 52 insertions(+), 32 deletions(-) diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index 384f9e982cba..36a0a1165003 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -22,6 +22,13 @@ #include #include +/* + * PGDIR_SHIFT determines the size a top-level page table entry can map + * and depends on the number of levels in the page table. Compute the + * PGDIR_SHIFT for a given number of levels. + */ +#define pt_levels_pgdir_shift(lvls) ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls)) + /* * The hardware supports concatenation of up to 16 tables at stage2 entry level * and we use the feature whenever possible. @@ -30,11 +37,13 @@ * On arm64, the smallest PAGE_SIZE supported is 4k, which means * (PAGE_SHIFT - 3) > 4 holds for all page sizes. * This implies, the total number of page table levels at stage2 expected - * by the hardware is actually the number of levels required for (KVM_PHYS_SHIFT - 4) + * by the hardware is actually the number of levels required for (IPA_SHIFT - 4) * in normal translations(e.g, stage1), since we cannot have another level in - * the range (KVM_PHYS_SHIFT, KVM_PHYS_SHIFT - 4). + * the range (IPA_SHIFT, IPA_SHIFT - 4). */ -#define STAGE2_PGTABLE_LEVELS ARM64_HW_PGTABLE_LEVELS(KVM_PHYS_SHIFT - 4) +#define stage2_pgtable_levels(ipa) ARM64_HW_PGTABLE_LEVELS((ipa) - 4) +#define STAGE2_PGTABLE_LEVELS stage2_pgtable_levels(KVM_PHYS_SHIFT) +#define kvm_stage2_levels(kvm) stage2_pgtable_levels(kvm_phys_shift(kvm)) /* * With all the supported VA_BITs and 40bit guest IPA, the following condition @@ -54,33 +63,42 @@ #error "Unsupported combination of guest IPA and host VA_BITS." #endif -/* S2_PGDIR_SHIFT is the size mapped by top-level stage2 entry */ -#define S2_PGDIR_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - STAGE2_PGTABLE_LEVELS) -#define S2_PGDIR_SIZE (1UL << S2_PGDIR_SHIFT) -#define S2_PGDIR_MASK (~(S2_PGDIR_SIZE - 1)) + +/* stage2_pgdir_shift() is the size mapped by top-level stage2 entry for the VM */ +#define stage2_pgdir_shift(kvm) pt_levels_pgdir_shift(kvm_stage2_levels(kvm)) +#define stage2_pgdir_size(kvm) (1ULL << stage2_pgdir_shift(kvm)) +#define stage2_pgdir_mask(kvm) ~(stage2_pgdir_size(kvm) - 1) /* * The number of PTRS across all concatenated stage2 tables given by the * number of bits resolved at the initial level. */ -#define PTRS_PER_S2_PGD (1 << (KVM_PHYS_SHIFT - S2_PGDIR_SHIFT)) +#define __s2_pgd_ptrs(ipa, lvls) (1 << ((ipa) - pt_levels_pgdir_shift((lvls)))) +#define __s2_pgd_size(ipa, lvls) (__s2_pgd_ptrs((ipa), (lvls)) * sizeof(pgd_t)) + +#define stage2_pgd_ptrs(kvm) __s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)) +#define stage2_pgd_size(kvm) __s2_pgd_size(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)) /* * kvm_mmmu_cache_min_pages() is the number of pages required to install * a stage-2 translation. We pre-allocate the entry level page table at * the VM creation. */ -#define kvm_mmu_cache_min_pages(kvm) (STAGE2_PGTABLE_LEVELS - 1) +#define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1) /* Stage2 PUD definitions when the level is present */ -#define STAGE2_PGTABLE_HAS_PUD (STAGE2_PGTABLE_LEVELS > 3) +static inline bool kvm_stage2_has_pud(struct kvm *kvm) +{ + return (CONFIG_PGTABLE_LEVELS > 3) && (kvm_stage2_levels(kvm) > 3); +} + #define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1) #define S2_PUD_SIZE (1UL << S2_PUD_SHIFT) #define S2_PUD_MASK (~(S2_PUD_SIZE - 1)) static inline bool stage2_pgd_none(struct kvm *kvm, pgd_t pgd) { - if (STAGE2_PGTABLE_HAS_PUD) + if (kvm_stage2_has_pud(kvm)) return pgd_none(pgd); else return 0; @@ -88,13 +106,13 @@ static inline bool stage2_pgd_none(struct kvm *kvm, pgd_t pgd) static inline void stage2_pgd_clear(struct kvm *kvm, pgd_t *pgdp) { - if (STAGE2_PGTABLE_HAS_PUD) + if (kvm_stage2_has_pud(kvm)) pgd_clear(pgdp); } static inline bool stage2_pgd_present(struct kvm *kvm, pgd_t pgd) { - if (STAGE2_PGTABLE_HAS_PUD) + if (kvm_stage2_has_pud(kvm)) return pgd_present(pgd); else return 1; @@ -102,14 +120,14 @@ static inline bool stage2_pgd_present(struct kvm *kvm, pgd_t pgd) static inline void stage2_pgd_populate(struct kvm *kvm, pgd_t *pgd, pud_t *pud) { - if (STAGE2_PGTABLE_HAS_PUD) + if (kvm_stage2_has_pud(kvm)) pgd_populate(NULL, pgd, pud); } static inline pud_t *stage2_pud_offset(struct kvm *kvm, pgd_t *pgd, unsigned long address) { - if (STAGE2_PGTABLE_HAS_PUD) + if (kvm_stage2_has_pud(kvm)) return pud_offset(pgd, address); else return (pud_t *)pgd; @@ -117,13 +135,13 @@ static inline pud_t *stage2_pud_offset(struct kvm *kvm, static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud) { - if (STAGE2_PGTABLE_HAS_PUD) + if (kvm_stage2_has_pud(kvm)) pud_free(NULL, pud); } static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp) { - if (STAGE2_PGTABLE_HAS_PUD) + if (kvm_stage2_has_pud(kvm)) return kvm_page_empty(pudp); else return false; @@ -132,7 +150,7 @@ static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp) static inline phys_addr_t stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { - if (STAGE2_PGTABLE_HAS_PUD) { + if (kvm_stage2_has_pud(kvm)) { phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK; return (boundary - 1 < end - 1) ? boundary : end; @@ -142,14 +160,18 @@ stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) } /* Stage2 PMD definitions when the level is present */ -#define STAGE2_PGTABLE_HAS_PMD (STAGE2_PGTABLE_LEVELS > 2) +static inline bool kvm_stage2_has_pmd(struct kvm *kvm) +{ + return (CONFIG_PGTABLE_LEVELS > 2) && (kvm_stage2_levels(kvm) > 2); +} + #define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2) #define S2_PMD_SIZE (1UL << S2_PMD_SHIFT) #define S2_PMD_MASK (~(S2_PMD_SIZE - 1)) static inline bool stage2_pud_none(struct kvm *kvm, pud_t pud) { - if (STAGE2_PGTABLE_HAS_PMD) + if (kvm_stage2_has_pmd(kvm)) return pud_none(pud); else return 0; @@ -157,13 +179,13 @@ static inline bool stage2_pud_none(struct kvm *kvm, pud_t pud) static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pud) { - if (STAGE2_PGTABLE_HAS_PMD) + if (kvm_stage2_has_pmd(kvm)) pud_clear(pud); } static inline bool stage2_pud_present(struct kvm *kvm, pud_t pud) { - if (STAGE2_PGTABLE_HAS_PMD) + if (kvm_stage2_has_pmd(kvm)) return pud_present(pud); else return 1; @@ -171,14 +193,14 @@ static inline bool stage2_pud_present(struct kvm *kvm, pud_t pud) static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pud, pmd_t *pmd) { - if (STAGE2_PGTABLE_HAS_PMD) + if (kvm_stage2_has_pmd(kvm)) pud_populate(NULL, pud, pmd); } static inline pmd_t *stage2_pmd_offset(struct kvm *kvm, pud_t *pud, unsigned long address) { - if (STAGE2_PGTABLE_HAS_PMD) + if (kvm_stage2_has_pmd(kvm)) return pmd_offset(pud, address); else return (pmd_t *)pud; @@ -186,13 +208,13 @@ static inline pmd_t *stage2_pmd_offset(struct kvm *kvm, static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd) { - if (STAGE2_PGTABLE_HAS_PMD) + if (kvm_stage2_has_pmd(kvm)) pmd_free(NULL, pmd); } static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud) { - if (STAGE2_PGTABLE_HAS_PMD) + if (kvm_stage2_has_pmd(kvm)) return pud_huge(pud); else return 0; @@ -200,7 +222,7 @@ static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud) static inline bool stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp) { - if (STAGE2_PGTABLE_HAS_PMD) + if (kvm_stage2_has_pmd(kvm)) return kvm_page_empty(pmdp); else return 0; @@ -209,7 +231,7 @@ static inline bool stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp) static inline phys_addr_t stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { - if (STAGE2_PGTABLE_HAS_PMD) { + if (kvm_stage2_has_pmd(kvm)) { phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK; return (boundary - 1 < end - 1) ? boundary : end; @@ -223,17 +245,15 @@ static inline bool stage2_pte_table_empty(struct kvm *kvm, pte_t *ptep) return kvm_page_empty(ptep); } -#define stage2_pgd_size(kvm) (PTRS_PER_S2_PGD * sizeof(pgd_t)) - static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr) { - return (((addr) >> S2_PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)); + return (((addr) >> stage2_pgdir_shift(kvm)) & (stage2_pgd_ptrs(kvm) - 1)); } static inline phys_addr_t stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { - phys_addr_t boundary = (addr + S2_PGDIR_SIZE) & S2_PGDIR_MASK; + phys_addr_t boundary = (addr + stage2_pgdir_size(kvm)) & stage2_pgdir_mask(kvm); return (boundary - 1 < end - 1) ? boundary : end; } From 595583306434caafac4a71102dca5a8d32d1a769 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 26 Sep 2018 17:32:47 +0100 Subject: [PATCH 38/53] kvm: arm64: Dynamic configuration of VTTBR mask On arm64 VTTBR_EL2:BADDR holds the base address for the stage2 translation table. The Arm ARM mandates that the bits BADDR[x-1:0] should be 0, where 'x' is defined for a given IPA Size and the number of levels for a translation granule size. It is defined using some magical constants. This patch is a reverse engineered implementation to calculate the 'x' at runtime for a given ipa and number of page table levels. See patch for more details. Cc: Marc Zyngier Cc: Christoffer Dall Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_arm.h | 73 ++++++++++++++++++++++++++++---- arch/arm64/include/asm/kvm_mmu.h | 25 ++++++++++- 2 files changed, 88 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 14317b3a1820..b236d90ca056 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -123,7 +123,6 @@ #define VTCR_EL2_SL0_MASK (3 << VTCR_EL2_SL0_SHIFT) #define VTCR_EL2_SL0_LVL1 (1 << VTCR_EL2_SL0_SHIFT) #define VTCR_EL2_T0SZ_MASK 0x3f -#define VTCR_EL2_T0SZ_40B 24 #define VTCR_EL2_VS_SHIFT 19 #define VTCR_EL2_VS_8BIT (0 << VTCR_EL2_VS_SHIFT) #define VTCR_EL2_VS_16BIT (1 << VTCR_EL2_VS_SHIFT) @@ -140,11 +139,8 @@ * Note that when using 4K pages, we concatenate two first level page tables * together. With 16K pages, we concatenate 16 first level page tables. * - * The magic numbers used for VTTBR_X in this patch can be found in Tables - * D4-23 and D4-25 in ARM DDI 0487A.b. */ -#define VTCR_EL2_T0SZ_IPA VTCR_EL2_T0SZ_40B #define VTCR_EL2_COMMON_BITS (VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \ VTCR_EL2_IRGN0_WBWA | VTCR_EL2_RES1) @@ -155,7 +151,6 @@ * 2 level page tables (SL = 1) */ #define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SL0_LVL1) -#define VTTBR_X_TGRAN_MAGIC 38 #elif defined(CONFIG_ARM64_16K_PAGES) /* * Stage2 translation configuration: @@ -163,7 +158,6 @@ * 2 level page tables (SL = 1) */ #define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_16K | VTCR_EL2_SL0_LVL1) -#define VTTBR_X_TGRAN_MAGIC 42 #else /* 4K */ /* * Stage2 translation configuration: @@ -171,13 +165,74 @@ * 3 level page tables (SL = 1) */ #define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SL0_LVL1) -#define VTTBR_X_TGRAN_MAGIC 37 #endif #define VTCR_EL2_FLAGS (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN_FLAGS) -#define VTTBR_X (VTTBR_X_TGRAN_MAGIC - VTCR_EL2_T0SZ_IPA) +/* + * ARM VMSAv8-64 defines an algorithm for finding the translation table + * descriptors in section D4.2.8 in ARM DDI 0487C.a. + * + * The algorithm defines the expectations on the translation table + * addresses for each level, based on PAGE_SIZE, entry level + * and the translation table size (T0SZ). The variable "x" in the + * algorithm determines the alignment of a table base address at a given + * level and thus determines the alignment of VTTBR:BADDR for stage2 + * page table entry level. + * Since the number of bits resolved at the entry level could vary + * depending on the T0SZ, the value of "x" is defined based on a + * Magic constant for a given PAGE_SIZE and Entry Level. The + * intermediate levels must be always aligned to the PAGE_SIZE (i.e, + * x = PAGE_SHIFT). + * + * The value of "x" for entry level is calculated as : + * x = Magic_N - T0SZ + * + * where Magic_N is an integer depending on the page size and the entry + * level of the page table as below: + * + * -------------------------------------------- + * | Entry level | 4K 16K 64K | + * -------------------------------------------- + * | Level: 0 (4 levels) | 28 | - | - | + * -------------------------------------------- + * | Level: 1 (3 levels) | 37 | 31 | 25 | + * -------------------------------------------- + * | Level: 2 (2 levels) | 46 | 42 | 38 | + * -------------------------------------------- + * | Level: 3 (1 level) | - | 53 | 51 | + * -------------------------------------------- + * + * We have a magic formula for the Magic_N below: + * + * Magic_N(PAGE_SIZE, Level) = 64 - ((PAGE_SHIFT - 3) * Number_of_levels) + * + * where Number_of_levels = (4 - Level). We are only interested in the + * value for Entry_Level for the stage2 page table. + * + * So, given that T0SZ = (64 - IPA_SHIFT), we can compute 'x' as follows: + * + * x = (64 - ((PAGE_SHIFT - 3) * Number_of_levels)) - (64 - IPA_SHIFT) + * = IPA_SHIFT - ((PAGE_SHIFT - 3) * Number of levels) + * + * Here is one way to explain the Magic Formula: + * + * x = log2(Size_of_Entry_Level_Table) + * + * Since, we can resolve (PAGE_SHIFT - 3) bits at each level, and another + * PAGE_SHIFT bits in the PTE, we have : + * + * Bits_Entry_level = IPA_SHIFT - ((PAGE_SHIFT - 3) * (n - 1) + PAGE_SHIFT) + * = IPA_SHIFT - (PAGE_SHIFT - 3) * n - 3 + * where n = number of levels, and since each pointer is 8bytes, we have: + * + * x = Bits_Entry_Level + 3 + * = IPA_SHIFT - (PAGE_SHIFT - 3) * n + * + * The only constraint here is that, we have to find the number of page table + * levels for a given IPA size (which we do, see stage2_pt_levels()) + */ +#define ARM64_VTTBR_X(ipa, levels) ((ipa) - ((levels) * (PAGE_SHIFT - 3))) -#define VTTBR_BADDR_MASK (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_X) #define VTTBR_VMID_SHIFT (UL(48)) #define VTTBR_VMID_MASK(size) (_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 7342d2c51773..ac3ca9690bad 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -145,7 +145,6 @@ static inline unsigned long __kern_hyp_va(unsigned long v) #define kvm_phys_shift(kvm) KVM_PHYS_SHIFT #define kvm_phys_size(kvm) (_AC(1, ULL) << kvm_phys_shift(kvm)) #define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - _AC(1, ULL)) -#define kvm_vttbr_baddr_mask(kvm) VTTBR_BADDR_MASK static inline bool kvm_page_empty(void *ptr) { @@ -520,5 +519,29 @@ static inline int hyp_map_aux_data(void) #define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) +/* + * Get the magic number 'x' for VTTBR:BADDR of this KVM instance. + * With v8.2 LVA extensions, 'x' should be a minimum of 6 with + * 52bit IPS. + */ +static inline int arm64_vttbr_x(u32 ipa_shift, u32 levels) +{ + int x = ARM64_VTTBR_X(ipa_shift, levels); + + return (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && x < 6) ? 6 : x; +} + +static inline u64 vttbr_baddr_mask(u32 ipa_shift, u32 levels) +{ + unsigned int x = arm64_vttbr_x(ipa_shift, levels); + + return GENMASK_ULL(PHYS_MASK_SHIFT - 1, x); +} + +static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm) +{ + return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)); +} + #endif /* __ASSEMBLY__ */ #endif /* __ARM64_KVM_MMU_H__ */ From 7e8130456e067f49693fdcc25b5ba242a9c5568b Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 26 Sep 2018 17:32:48 +0100 Subject: [PATCH 39/53] kvm: arm64: Configure VTCR_EL2.SL0 per VM VTCR_EL2 holds the following key stage2 translation table parameters: SL0 - Entry level in the page table lookup. T0SZ - Denotes the size of the memory addressed by the table. We have been using fixed values for the SL0 depending on the page size as we have a fixed IPA size. But since we are about to make it dynamic, we need to calculate the SL0 at runtime per VM. This patch adds a helper to compute the value of SL0 for a VM based on the IPA size. Cc: Marc Zyngier Cc: Christoffer Dall Reviewed-by: Eric Auger Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_arm.h | 69 +++++++++++++++++++++++--------- arch/arm64/kvm/reset.c | 1 + 2 files changed, 50 insertions(+), 20 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index b236d90ca056..f913adb44f93 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -121,7 +121,6 @@ #define VTCR_EL2_IRGN0_WBWA TCR_IRGN0_WBWA #define VTCR_EL2_SL0_SHIFT 6 #define VTCR_EL2_SL0_MASK (3 << VTCR_EL2_SL0_SHIFT) -#define VTCR_EL2_SL0_LVL1 (1 << VTCR_EL2_SL0_SHIFT) #define VTCR_EL2_T0SZ_MASK 0x3f #define VTCR_EL2_VS_SHIFT 19 #define VTCR_EL2_VS_8BIT (0 << VTCR_EL2_VS_SHIFT) @@ -144,30 +143,60 @@ #define VTCR_EL2_COMMON_BITS (VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \ VTCR_EL2_IRGN0_WBWA | VTCR_EL2_RES1) +/* + * VTCR_EL2:SL0 indicates the entry level for Stage2 translation. + * Interestingly, it depends on the page size. + * See D.10.2.121, VTCR_EL2, in ARM DDI 0487C.a + * + * ----------------------------------------- + * | Entry level | 4K | 16K/64K | + * ------------------------------------------ + * | Level: 0 | 2 | - | + * ------------------------------------------ + * | Level: 1 | 1 | 2 | + * ------------------------------------------ + * | Level: 2 | 0 | 1 | + * ------------------------------------------ + * | Level: 3 | - | 0 | + * ------------------------------------------ + * + * The table roughly translates to : + * + * SL0(PAGE_SIZE, Entry_level) = TGRAN_SL0_BASE - Entry_Level + * + * Where TGRAN_SL0_BASE is a magic number depending on the page size: + * TGRAN_SL0_BASE(4K) = 2 + * TGRAN_SL0_BASE(16K) = 3 + * TGRAN_SL0_BASE(64K) = 3 + * provided we take care of ruling out the unsupported cases and + * Entry_Level = 4 - Number_of_levels. + * + */ #ifdef CONFIG_ARM64_64K_PAGES -/* - * Stage2 translation configuration: - * 64kB pages (TG0 = 1) - * 2 level page tables (SL = 1) - */ -#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SL0_LVL1) + +#define VTCR_EL2_TGRAN VTCR_EL2_TG0_64K +#define VTCR_EL2_TGRAN_SL0_BASE 3UL + #elif defined(CONFIG_ARM64_16K_PAGES) -/* - * Stage2 translation configuration: - * 16kB pages (TG0 = 2) - * 2 level page tables (SL = 1) - */ -#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_16K | VTCR_EL2_SL0_LVL1) + +#define VTCR_EL2_TGRAN VTCR_EL2_TG0_16K +#define VTCR_EL2_TGRAN_SL0_BASE 3UL + #else /* 4K */ -/* - * Stage2 translation configuration: - * 4kB pages (TG0 = 0) - * 3 level page tables (SL = 1) - */ -#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SL0_LVL1) + +#define VTCR_EL2_TGRAN VTCR_EL2_TG0_4K +#define VTCR_EL2_TGRAN_SL0_BASE 2UL + #endif -#define VTCR_EL2_FLAGS (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN_FLAGS) +#define VTCR_EL2_LVLS_TO_SL0(levels) \ + ((VTCR_EL2_TGRAN_SL0_BASE - (4 - (levels))) << VTCR_EL2_SL0_SHIFT) +#define VTCR_EL2_SL0_TO_LVLS(sl0) \ + ((sl0) + 4 - VTCR_EL2_TGRAN_SL0_BASE) +#define VTCR_EL2_LVLS(vtcr) \ + VTCR_EL2_SL0_TO_LVLS(((vtcr) & VTCR_EL2_SL0_MASK) >> VTCR_EL2_SL0_SHIFT) + +#define VTCR_EL2_FLAGS (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN) /* * ARM VMSAv8-64 defines an algorithm for finding the translation table * descriptors in section D4.2.8 in ARM DDI 0487C.a. diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 616120c4176b..1ced1e37374e 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -160,6 +160,7 @@ int kvm_arm_config_vm(struct kvm *kvm, unsigned long type) if (phys_shift > KVM_PHYS_SHIFT) phys_shift = KVM_PHYS_SHIFT; vtcr |= VTCR_EL2_T0SZ(phys_shift); + vtcr |= VTCR_EL2_LVLS_TO_SL0(kvm_stage2_levels(kvm)); /* * Enable the Hardware Access Flag management, unconditionally From 13ac4bbcc457d3925b4031cc70e3031fd8b9c3b7 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 26 Sep 2018 17:32:49 +0100 Subject: [PATCH 40/53] kvm: arm64: Switch to per VM IPA limit Now that we can manage the stage2 page table per VM, switch the configuration details to per VM instance. The VTCR is updated with the values specific to the VM based on the configuration. We store the IPA size and the number of stage2 page table levels for the guest already in VTCR. Decode it back from the vtcr field wherever we need it. Cc: Marc Zyngier Cc: Christoffer Dall Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_arm.h | 2 ++ arch/arm64/include/asm/kvm_mmu.h | 2 +- arch/arm64/include/asm/stage2_pgtable.h | 2 +- arch/arm64/kvm/reset.c | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index f913adb44f93..e4240568cc18 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -197,6 +197,8 @@ VTCR_EL2_SL0_TO_LVLS(((vtcr) & VTCR_EL2_SL0_MASK) >> VTCR_EL2_SL0_SHIFT) #define VTCR_EL2_FLAGS (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN) +#define VTCR_EL2_IPA(vtcr) (64 - ((vtcr) & VTCR_EL2_T0SZ_MASK)) + /* * ARM VMSAv8-64 defines an algorithm for finding the translation table * descriptors in section D4.2.8 in ARM DDI 0487C.a. diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index ac3ca9690bad..77b1af9e64db 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -142,7 +142,7 @@ static inline unsigned long __kern_hyp_va(unsigned long v) */ #define KVM_PHYS_SHIFT (40) -#define kvm_phys_shift(kvm) KVM_PHYS_SHIFT +#define kvm_phys_shift(kvm) VTCR_EL2_IPA(kvm->arch.vtcr) #define kvm_phys_size(kvm) (_AC(1, ULL) << kvm_phys_shift(kvm)) #define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - _AC(1, ULL)) diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index 36a0a1165003..c62fe118a898 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -43,7 +43,7 @@ */ #define stage2_pgtable_levels(ipa) ARM64_HW_PGTABLE_LEVELS((ipa) - 4) #define STAGE2_PGTABLE_LEVELS stage2_pgtable_levels(KVM_PHYS_SHIFT) -#define kvm_stage2_levels(kvm) stage2_pgtable_levels(kvm_phys_shift(kvm)) +#define kvm_stage2_levels(kvm) VTCR_EL2_LVLS(kvm->arch.vtcr) /* * With all the supported VA_BITs and 40bit guest IPA, the following condition diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 1ced1e37374e..2bf41e007390 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -160,7 +160,7 @@ int kvm_arm_config_vm(struct kvm *kvm, unsigned long type) if (phys_shift > KVM_PHYS_SHIFT) phys_shift = KVM_PHYS_SHIFT; vtcr |= VTCR_EL2_T0SZ(phys_shift); - vtcr |= VTCR_EL2_LVLS_TO_SL0(kvm_stage2_levels(kvm)); + vtcr |= VTCR_EL2_LVLS_TO_SL0(stage2_pgtable_levels(phys_shift)); /* * Enable the Hardware Access Flag management, unconditionally From 8ad50c8985d805923f52a80698010a0a5123c07d Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Wed, 26 Sep 2018 17:32:50 +0100 Subject: [PATCH 41/53] vgic: Add support for 52bit guest physical address Add support for handling 52bit guest physical address to the VGIC layer. So far we have limited the guest physical address to 48bits, by explicitly masking the upper bits. This patch removes the restriction. We do not have to check if the host supports 52bit as the gpa is always validated during an access. (e.g, kvm_{read/write}_guest, kvm_is_visible_gfn()). Also, the ITS table save-restore is also not affected with the enhancement. The DTE entries already store the bits[51:8] of the ITT_addr (with a 256byte alignment). Cc: Marc Zyngier Cc: Christoffer Dall Reviewed-by: Eric Auger Signed-off-by: Kristina Martsenko [ Macro clean ups, fix PROPBASER and PENDBASER accesses ] Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v3.h | 5 +++++ virt/kvm/arm/vgic/vgic-its.c | 36 +++++++++--------------------- virt/kvm/arm/vgic/vgic-mmio-v3.c | 2 -- 3 files changed, 15 insertions(+), 28 deletions(-) diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 8bdbb5f29494..74b0aa9c7499 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -357,6 +357,8 @@ #define GITS_CBASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWt) #define GITS_CBASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWb) +#define GITS_CBASER_ADDRESS(cbaser) ((cbaser) & GENMASK_ULL(51, 12)) + #define GITS_BASER_NR_REGS 8 #define GITS_BASER_VALID (1ULL << 63) @@ -388,6 +390,9 @@ #define GITS_BASER_ENTRY_SIZE_MASK GENMASK_ULL(52, 48) #define GITS_BASER_PHYS_52_to_48(phys) \ (((phys) & GENMASK_ULL(47, 16)) | (((phys) >> 48) & 0xf) << 12) +#define GITS_BASER_ADDR_48_to_52(baser) \ + (((baser) & GENMASK_ULL(47, 16)) | (((baser) >> 12) & 0xf) << 48) + #define GITS_BASER_SHAREABILITY_SHIFT (10) #define GITS_BASER_InnerShareable \ GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 12502251727e..eb2a390a6c86 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -241,13 +241,6 @@ static struct its_ite *find_ite(struct vgic_its *its, u32 device_id, list_for_each_entry(dev, &(its)->device_list, dev_list) \ list_for_each_entry(ite, &(dev)->itt_head, ite_list) -/* - * We only implement 48 bits of PA at the moment, although the ITS - * supports more. Let's be restrictive here. - */ -#define BASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 16)) -#define CBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 12)) - #define GIC_LPI_OFFSET 8192 #define VITS_TYPER_IDBITS 16 @@ -759,6 +752,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, { int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; u64 indirect_ptr, type = GITS_BASER_TYPE(baser); + phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser); int esz = GITS_BASER_ENTRY_SIZE(baser); int index; gfn_t gfn; @@ -783,7 +777,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, if (id >= (l1_tbl_size / esz)) return false; - addr = BASER_ADDRESS(baser) + id * esz; + addr = base + id * esz; gfn = addr >> PAGE_SHIFT; if (eaddr) @@ -798,7 +792,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, /* Each 1st level entry is represented by a 64-bit value. */ if (kvm_read_guest_lock(its->dev->kvm, - BASER_ADDRESS(baser) + index * sizeof(indirect_ptr), + base + index * sizeof(indirect_ptr), &indirect_ptr, sizeof(indirect_ptr))) return false; @@ -808,11 +802,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, if (!(indirect_ptr & BIT_ULL(63))) return false; - /* - * Mask the guest physical address and calculate the frame number. - * Any address beyond our supported 48 bits of PA will be caught - * by the actual check in the final step. - */ + /* Mask the guest physical address and calculate the frame number. */ indirect_ptr &= GENMASK_ULL(51, 16); /* Find the address of the actual entry */ @@ -1304,9 +1294,6 @@ static u64 vgic_sanitise_its_baser(u64 reg) GITS_BASER_OUTER_CACHEABILITY_SHIFT, vgic_sanitise_outer_cacheability); - /* Bits 15:12 contain bits 51:48 of the PA, which we don't support. */ - reg &= ~GENMASK_ULL(15, 12); - /* We support only one (ITS) page size: 64K */ reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K; @@ -1325,11 +1312,8 @@ static u64 vgic_sanitise_its_cbaser(u64 reg) GITS_CBASER_OUTER_CACHEABILITY_SHIFT, vgic_sanitise_outer_cacheability); - /* - * Sanitise the physical address to be 64k aligned. - * Also limit the physical addresses to 48 bits. - */ - reg &= ~(GENMASK_ULL(51, 48) | GENMASK_ULL(15, 12)); + /* Sanitise the physical address to be 64k aligned. */ + reg &= ~GENMASK_ULL(15, 12); return reg; } @@ -1375,7 +1359,7 @@ static void vgic_its_process_commands(struct kvm *kvm, struct vgic_its *its) if (!its->enabled) return; - cbaser = CBASER_ADDRESS(its->cbaser); + cbaser = GITS_CBASER_ADDRESS(its->cbaser); while (its->cwriter != its->creadr) { int ret = kvm_read_guest_lock(kvm, cbaser + its->creadr, @@ -2233,7 +2217,7 @@ static int vgic_its_restore_device_tables(struct vgic_its *its) if (!(baser & GITS_BASER_VALID)) return 0; - l1_gpa = BASER_ADDRESS(baser); + l1_gpa = GITS_BASER_ADDR_48_to_52(baser); if (baser & GITS_BASER_INDIRECT) { l1_esz = GITS_LVL1_ENTRY_SIZE; @@ -2305,7 +2289,7 @@ static int vgic_its_save_collection_table(struct vgic_its *its) { const struct vgic_its_abi *abi = vgic_its_get_abi(its); u64 baser = its->baser_coll_table; - gpa_t gpa = BASER_ADDRESS(baser); + gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser); struct its_collection *collection; u64 val; size_t max_size, filled = 0; @@ -2354,7 +2338,7 @@ static int vgic_its_restore_collection_table(struct vgic_its *its) if (!(baser & GITS_BASER_VALID)) return 0; - gpa = BASER_ADDRESS(baser); + gpa = GITS_BASER_ADDR_48_to_52(baser); max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index a2a175b08b17..b3d1f0985117 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -364,7 +364,6 @@ static u64 vgic_sanitise_pendbaser(u64 reg) vgic_sanitise_outer_cacheability); reg &= ~PENDBASER_RES0_MASK; - reg &= ~GENMASK_ULL(51, 48); return reg; } @@ -382,7 +381,6 @@ static u64 vgic_sanitise_propbaser(u64 reg) vgic_sanitise_outer_cacheability); reg &= ~PROPBASER_RES0_MASK; - reg &= ~GENMASK_ULL(51, 48); return reg; } From bc1d7de8c550db2f8fd59458a07fefa863358b8d Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 26 Sep 2018 17:32:51 +0100 Subject: [PATCH 42/53] kvm: arm64: Add 52bit support for PAR to HPFAR conversoin Add support for handling 52bit addresses in PAR to HPFAR conversion. Instead of hardcoding the address limits, we now use PHYS_MASK_SHIFT. Cc: Marc Zyngier Cc: Christoffer Dall Reviewed-by: Eric Auger Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_arm.h | 7 +++++++ arch/arm64/kvm/hyp/switch.c | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index e4240568cc18..f1330284720d 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -311,6 +311,13 @@ /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ #define HPFAR_MASK (~UL(0xf)) +/* + * We have + * PAR [PA_Shift - 1 : 12] = PA [PA_Shift - 1 : 12] + * HPFAR [PA_Shift - 9 : 4] = FIPA [PA_Shift - 1 : 12] + */ +#define PAR_TO_HPFAR(par) \ + (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8) #define kvm_arm_exception_type \ {0, "IRQ" }, \ diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 9d5ce1a3039a..7cc175c88a37 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -263,7 +263,7 @@ static bool __hyp_text __translate_far_to_hpfar(u64 far, u64 *hpfar) return false; /* Translation failed, back to guest */ /* Convert PAR to HPFAR format */ - *hpfar = ((tmp >> 12) & ((1UL << 36) - 1)) << 4; + *hpfar = PAR_TO_HPFAR(tmp); return true; } From 0f62f0e95be29200ab2ab98ca870e22c9b148dfa Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 26 Sep 2018 17:32:52 +0100 Subject: [PATCH 43/53] kvm: arm64: Set a limit on the IPA size So far we have restricted the IPA size of the VM to the default value (40bits). Now that we can manage the IPA size per VM and support dynamic stage2 page tables, we can allow VMs to have larger IPA. This patch introduces a the maximum IPA size supported on the host. This is decided by the following factors : 1) Maximum PARange supported by the CPUs - This can be inferred from the system wide safe value. 2) Maximum PA size supported by the host kernel (48 vs 52) 3) Number of levels in the host page table (as we base our stage2 tables on the host table helpers). Since the stage2 page table code is dependent on the stage1 page table, we always ensure that : Number of Levels at Stage1 >= Number of Levels at Stage2 So we limit the IPA to make sure that the above condition is satisfied. This will affect the following combinations of VA_BITS and IPA for different page sizes. Host configuration | Unsupported IPA ranges 39bit VA, 4K | [44, 48] 36bit VA, 16K | [41, 48] 42bit VA, 64K | [47, 52] Supporting the above combinations need independent stage2 page table manipulation code, which would need substantial changes. We could purse the solution independently and switch the page table code once we have it ready. Cc: Catalin Marinas Cc: Marc Zyngier Cc: Christoffer Dall Reviewed-by: Eric Auger Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_mmu.h | 2 ++ arch/arm64/include/asm/kvm_host.h | 12 +++------ arch/arm64/kvm/reset.c | 43 +++++++++++++++++++++++++++++++ virt/kvm/arm/arm.c | 2 ++ 4 files changed, 50 insertions(+), 9 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 12ae5fbbcf01..5ad1a54f98dc 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -358,6 +358,8 @@ static inline int hyp_map_aux_data(void) #define kvm_phys_to_vttbr(addr) (addr) +static inline void kvm_set_ipa_limit(void) {} + #endif /* !__ASSEMBLY__ */ #endif /* __ARM_KVM_MMU_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 5ecd457bce7d..f008f8866b2a 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -442,15 +442,7 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); -static inline void __cpu_init_stage2(void) -{ - u32 ps; - - /* Sanity check for minimum IPA size support */ - ps = id_aa64mmfr0_parange_to_phys_shift(read_sysreg(id_aa64mmfr0_el1) & 0x7); - WARN_ONCE(ps < 40, - "PARange is %d bits, unsupported configuration!", ps); -} +static inline void __cpu_init_stage2(void) {} /* Guest/host FPSIMD coordination helpers */ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu); @@ -513,6 +505,8 @@ static inline int kvm_arm_have_ssbd(void) void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu); void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu); +void kvm_set_ipa_limit(void); + #define __KVM_HAVE_ARCH_VM_ALLOC struct kvm *kvm_arch_alloc_vm(void); void kvm_arch_free_vm(struct kvm *kvm); diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 2bf41e007390..96b3f50101bc 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -34,6 +34,9 @@ #include #include +/* Maximum phys_shift supported for any VM on this host */ +static u32 kvm_ipa_limit; + /* * ARMv8 Reset Values */ @@ -135,6 +138,46 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) return kvm_timer_vcpu_reset(vcpu); } +void kvm_set_ipa_limit(void) +{ + unsigned int ipa_max, pa_max, va_max, parange; + + parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 0x7; + pa_max = id_aa64mmfr0_parange_to_phys_shift(parange); + + /* Clamp the IPA limit to the PA size supported by the kernel */ + ipa_max = (pa_max > PHYS_MASK_SHIFT) ? PHYS_MASK_SHIFT : pa_max; + /* + * Since our stage2 table is dependent on the stage1 page table code, + * we must always honor the following condition: + * + * Number of levels in Stage1 >= Number of levels in Stage2. + * + * So clamp the ipa limit further down to limit the number of levels. + * Since we can concatenate upto 16 tables at entry level, we could + * go upto 4bits above the maximum VA addressible with the current + * number of levels. + */ + va_max = PGDIR_SHIFT + PAGE_SHIFT - 3; + va_max += 4; + + if (va_max < ipa_max) + ipa_max = va_max; + + /* + * If the final limit is lower than the real physical address + * limit of the CPUs, report the reason. + */ + if (ipa_max < pa_max) + pr_info("kvm: Limiting the IPA size due to kernel %s Address limit\n", + (va_max < pa_max) ? "Virtual" : "Physical"); + + WARN(ipa_max < KVM_PHYS_SHIFT, + "KVM IPA limit (%d bit) is smaller than default size\n", ipa_max); + kvm_ipa_limit = ipa_max; + kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit); +} + /* * Configure the VTCR_EL2 for this VM. The VTCR value is common * across all the physical CPUs on the system. We use system wide diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 43e716bc3f08..631f9a3ad99a 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -1413,6 +1413,8 @@ static int init_common_resources(void) kvm_vmid_bits = kvm_get_vmid_bits(); kvm_info("%d-bit VMID\n", kvm_vmid_bits); + kvm_set_ipa_limit(); + return 0; } From 58b3efc820acd3219e89ff014e93346a734229b8 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 26 Sep 2018 17:32:53 +0100 Subject: [PATCH 44/53] kvm: arm64: Limit the minimum number of page table levels Since we are about to remove the lower limit on the IPA size, make sure that we do not go to 1 level page table (e.g, with 32bit IPA on 64K host with concatenation) to avoid splitting the host PMD huge pages at stage2. Cc: Marc Zyngier Cc: Christoffer Dall Reviewed-by: Eric Auger Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/stage2_pgtable.h | 7 ++++++- arch/arm64/kvm/reset.c | 10 +++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index c62fe118a898..2cce769ba4c6 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -72,8 +72,13 @@ /* * The number of PTRS across all concatenated stage2 tables given by the * number of bits resolved at the initial level. + * If we force more levels than necessary, we may have (stage2_pgdir_shift > IPA), + * in which case, stage2_pgd_ptrs will have one entry. */ -#define __s2_pgd_ptrs(ipa, lvls) (1 << ((ipa) - pt_levels_pgdir_shift((lvls)))) +#define pgd_ptrs_shift(ipa, pgdir_shift) \ + ((ipa) > (pgdir_shift) ? ((ipa) - (pgdir_shift)) : 0) +#define __s2_pgd_ptrs(ipa, lvls) \ + (1 << (pgd_ptrs_shift((ipa), pt_levels_pgdir_shift(lvls)))) #define __s2_pgd_size(ipa, lvls) (__s2_pgd_ptrs((ipa), (lvls)) * sizeof(pgd_t)) #define stage2_pgd_ptrs(kvm) __s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)) diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 96b3f50101bc..f156e45760bc 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -190,6 +190,7 @@ int kvm_arm_config_vm(struct kvm *kvm, unsigned long type) { u64 vtcr = VTCR_EL2_FLAGS; u32 parange, phys_shift; + u8 lvls; if (type) return -EINVAL; @@ -203,7 +204,14 @@ int kvm_arm_config_vm(struct kvm *kvm, unsigned long type) if (phys_shift > KVM_PHYS_SHIFT) phys_shift = KVM_PHYS_SHIFT; vtcr |= VTCR_EL2_T0SZ(phys_shift); - vtcr |= VTCR_EL2_LVLS_TO_SL0(stage2_pgtable_levels(phys_shift)); + /* + * Use a minimum 2 level page table to prevent splitting + * host PMD huge pages at stage2. + */ + lvls = stage2_pgtable_levels(phys_shift); + if (lvls < 2) + lvls = 2; + vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls); /* * Enable the Hardware Access Flag management, unconditionally From 233a7cb235318223df8133235383f4c595c654c1 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 26 Sep 2018 17:32:54 +0100 Subject: [PATCH 45/53] kvm: arm64: Allow tuning the physical address size for VM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow specifying the physical address size limit for a new VM via the kvm_type argument for the KVM_CREATE_VM ioctl. This allows us to finalise the stage2 page table as early as possible and hence perform the right checks on the memory slots without complication. The size is encoded as Log2(PA_Size) in bits[7:0] of the type field. For backward compatibility the value 0 is reserved and implies 40bits. Also, lift the limit of the IPA to host limit and allow lower IPA sizes (e.g, 32). The userspace could check the extension KVM_CAP_ARM_VM_IPA_SIZE for the availability of this feature. The cap check returns the maximum limit for the physical address shift supported by the host. Cc: Marc Zyngier Cc: Christoffer Dall Cc: Peter Maydell Cc: Paolo Bonzini Cc: Radim Krčmář Reviewed-by: Eric Auger Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- Documentation/virtual/kvm/api.txt | 31 +++++++++++++++++++++++++ arch/arm64/include/asm/stage2_pgtable.h | 20 ---------------- arch/arm64/kvm/reset.c | 17 ++++++++++---- include/uapi/linux/kvm.h | 10 ++++++++ 4 files changed, 54 insertions(+), 24 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 647f94128a85..f6b0af55d010 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -123,6 +123,37 @@ memory layout to fit in user mode), check KVM_CAP_MIPS_VZ and use the flag KVM_VM_MIPS_VZ. +On arm64, the physical address size for a VM (IPA Size limit) is limited +to 40bits by default. The limit can be configured if the host supports the +extension KVM_CAP_ARM_VM_IPA_SIZE. When supported, use +KVM_VM_TYPE_ARM_IPA_SIZE(IPA_Bits) to set the size in the machine type +identifier, where IPA_Bits is the maximum width of any physical +address used by the VM. The IPA_Bits is encoded in bits[7-0] of the +machine type identifier. + +e.g, to configure a guest to use 48bit physical address size : + + vm_fd = ioctl(dev_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_IPA_SIZE(48)); + +The requested size (IPA_Bits) must be : + 0 - Implies default size, 40bits (for backward compatibility) + + or + + N - Implies N bits, where N is a positive integer such that, + 32 <= N <= Host_IPA_Limit + +Host_IPA_Limit is the maximum possible value for IPA_Bits on the host and +is dependent on the CPU capability and the kernel configuration. The limit can +be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the KVM_CHECK_EXTENSION +ioctl() at run-time. + +Please note that configuring the IPA size does not affect the capability +exposed by the guest CPUs in ID_AA64MMFR0_EL1[PARange]. It only affects +size of the address translated by the stage2 level (guest physical to +host physical address translations). + + 4.3 KVM_GET_MSR_INDEX_LIST, KVM_GET_MSR_FEATURE_INDEX_LIST Capability: basic, KVM_CAP_GET_MSR_FEATURES for KVM_GET_MSR_FEATURE_INDEX_LIST diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index 2cce769ba4c6..d352f6df8d2c 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -42,28 +42,8 @@ * the range (IPA_SHIFT, IPA_SHIFT - 4). */ #define stage2_pgtable_levels(ipa) ARM64_HW_PGTABLE_LEVELS((ipa) - 4) -#define STAGE2_PGTABLE_LEVELS stage2_pgtable_levels(KVM_PHYS_SHIFT) #define kvm_stage2_levels(kvm) VTCR_EL2_LVLS(kvm->arch.vtcr) -/* - * With all the supported VA_BITs and 40bit guest IPA, the following condition - * is always true: - * - * STAGE2_PGTABLE_LEVELS <= CONFIG_PGTABLE_LEVELS - * - * We base our stage-2 page table walker helpers on this assumption and - * fall back to using the host version of the helper wherever possible. - * i.e, if a particular level is not folded (e.g, PUD) at stage2, we fall back - * to using the host version, since it is guaranteed it is not folded at host. - * - * If the condition breaks in the future, we can rearrange the host level - * definitions and reuse them for stage2. Till then... - */ -#if STAGE2_PGTABLE_LEVELS > CONFIG_PGTABLE_LEVELS -#error "Unsupported combination of guest IPA and host VA_BITS." -#endif - - /* stage2_pgdir_shift() is the size mapped by top-level stage2 entry for the VM */ #define stage2_pgdir_shift(kvm) pt_levels_pgdir_shift(kvm_stage2_levels(kvm)) #define stage2_pgdir_size(kvm) (1ULL << stage2_pgdir_shift(kvm)) diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index f156e45760bc..95f28d5950e0 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -89,6 +89,9 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VCPU_EVENTS: r = 1; break; + case KVM_CAP_ARM_VM_IPA_SIZE: + r = kvm_ipa_limit; + break; default: r = 0; } @@ -192,17 +195,23 @@ int kvm_arm_config_vm(struct kvm *kvm, unsigned long type) u32 parange, phys_shift; u8 lvls; - if (type) + if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) return -EINVAL; + phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); + if (phys_shift) { + if (phys_shift > kvm_ipa_limit || + phys_shift < 32) + return -EINVAL; + } else { + phys_shift = KVM_PHYS_SHIFT; + } + parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 7; if (parange > ID_AA64MMFR0_PARANGE_MAX) parange = ID_AA64MMFR0_PARANGE_MAX; vtcr |= parange << VTCR_EL2_PS_SHIFT; - phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange); - if (phys_shift > KVM_PHYS_SHIFT) - phys_shift = KVM_PHYS_SHIFT; vtcr |= VTCR_EL2_T0SZ(phys_shift); /* * Use a minimum 2 level page table to prevent splitting diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 251be353f950..95aa73ca65dc 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -750,6 +750,15 @@ struct kvm_ppc_resize_hpt { #define KVM_S390_SIE_PAGE_OFFSET 1 +/* + * On arm64, machine type can be used to request the physical + * address size for the VM. Bits[7-0] are reserved for the guest + * PA size shift (i.e, log2(PA_Size)). For backward compatibility, + * value 0 implies the default IPA size, 40bits. + */ +#define KVM_VM_TYPE_ARM_IPA_SIZE_MASK 0xffULL +#define KVM_VM_TYPE_ARM_IPA_SIZE(x) \ + ((x) & KVM_VM_TYPE_ARM_IPA_SIZE_MASK) /* * ioctls for /dev/kvm fds: */ @@ -953,6 +962,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_NESTED_STATE 157 #define KVM_CAP_ARM_INJECT_SERROR_ESR 158 #define KVM_CAP_MSR_PLATFORM_INFO 159 +#define KVM_CAP_ARM_VM_IPA_SIZE 160 /* returns maximum IPA bits for a VM */ #ifdef KVM_CAP_IRQ_ROUTING From bca607ebc76af9540e4aad5b2241a7323354be43 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 1 Oct 2018 13:40:36 +0100 Subject: [PATCH 46/53] KVM: arm/arm64: Rename kvm_arm_config_vm to kvm_arm_setup_stage2 VM tends to be a very overloaded term in KVM, so let's keep it to describe the virtual machine. For the virtual memory setup, let's use the "stage2" suffix. Reviewed-by: Eric Auger Reviewed-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_host.h | 6 +++++- arch/arm64/include/asm/kvm_arm.h | 2 +- arch/arm64/include/asm/kvm_host.h | 2 +- arch/arm64/kvm/reset.c | 2 +- virt/kvm/arm/arm.c | 2 +- 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 72d46418e1ef..b45af481ccf7 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -354,8 +354,12 @@ static inline void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu) {} struct kvm *kvm_arch_alloc_vm(void); void kvm_arch_free_vm(struct kvm *kvm); -static inline int kvm_arm_config_vm(struct kvm *kvm, unsigned long type) +static inline int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) { + /* + * On 32bit ARM, VMs get a static 40bit IPA stage2 setup, + * so any non-zero value used as type is illegal. + */ if (type) return -EINVAL; return 0; diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index f1330284720d..6e324d1f1231 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -133,7 +133,7 @@ * 40 bits wide (T0SZ = 24). Systems with a PARange smaller than 40 bits are * not known to exist and will break with this configuration. * - * The VTCR_EL2 is configured per VM and is initialised in kvm_arm_config_vm(). + * The VTCR_EL2 is configured per VM and is initialised in kvm_arm_setup_stage2(). * * Note that when using 4K pages, we concatenate two first level page tables * together. With 16K pages, we concatenate 16 first level page tables. diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index f008f8866b2a..376a5b695467 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -511,6 +511,6 @@ void kvm_set_ipa_limit(void); struct kvm *kvm_arch_alloc_vm(void); void kvm_arch_free_vm(struct kvm *kvm); -int kvm_arm_config_vm(struct kvm *kvm, unsigned long type); +int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type); #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 95f28d5950e0..aa806d582552 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -189,7 +189,7 @@ void kvm_set_ipa_limit(void) * all CPUs, as it is safe to run with or without the feature and * the bit is RES0 on CPUs that don't support it. */ -int kvm_arm_config_vm(struct kvm *kvm, unsigned long type) +int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) { u64 vtcr = VTCR_EL2_FLAGS; u32 parange, phys_shift; diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 631f9a3ad99a..91c464c9cd21 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -120,7 +120,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int ret, cpu; - ret = kvm_arm_config_vm(kvm, type); + ret = kvm_arm_setup_stage2(kvm, type); if (ret) return ret; From 9d47bb0d9ea8528373b4c6f9bca6c7f402900297 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 1 Oct 2018 13:41:32 +0100 Subject: [PATCH 47/53] KVM: arm64: Drop __cpu_init_stage2 on the VHE path __cpu_init_stage2 doesn't do anything anymore on arm64, and is totally non-sensical if running VHE (as VHE is 64bit only). Reviewed-by: Eric Auger Reviewed-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- virt/kvm/arm/arm.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 91c464c9cd21..4ce99bb223bc 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -1310,16 +1310,10 @@ static void cpu_hyp_reinit(void) { cpu_hyp_reset(); - if (is_kernel_in_hyp_mode()) { - /* - * __cpu_init_stage2() is safe to call even if the PM - * event was cancelled before the CPU was reset. - */ - __cpu_init_stage2(); + if (is_kernel_in_hyp_mode()) kvm_timer_init_vhe(); - } else { + else cpu_init_hyp_mode(NULL); - } if (vgic_present) kvm_vgic_init_cpu_hardware(); From f0725345e3e127032376e4fcb6b0fc893237fcef Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Thu, 9 Aug 2018 22:20:41 +0800 Subject: [PATCH 48/53] arm64: KVM: Remove some extra semicolon in kvm_target_cpu There are some extra semicolon in kvm_target_cpu, remove it. Signed-off-by: zhong jiang Signed-off-by: Marc Zyngier --- arch/arm64/kvm/guest.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 07256b08226c..a74f84d09412 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -338,15 +338,15 @@ int __attribute_const__ kvm_target_cpu(void) return KVM_ARM_TARGET_CORTEX_A53; case ARM_CPU_PART_CORTEX_A57: return KVM_ARM_TARGET_CORTEX_A57; - }; + } break; case ARM_CPU_IMP_APM: switch (part_number) { case APM_CPU_PART_POTENZA: return KVM_ARM_TARGET_XGENE_POTENZA; - }; + } break; - }; + } /* Return a default generic target */ return KVM_ARM_TARGET_GENERIC_V8; From fd2ef358282c849c193aa36dadbf4f07f7dcd29b Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Mon, 1 Oct 2018 16:54:35 +0100 Subject: [PATCH 49/53] KVM: arm/arm64: Ensure only THP is candidate for adjustment PageTransCompoundMap() returns true for hugetlbfs and THP hugepages. This behaviour incorrectly leads to stage 2 faults for unsupported hugepage sizes (e.g., 64K hugepage with 4K pages) to be treated as THP faults. Tighten the check to filter out hugetlbfs pages. This also leads to consistently mapping all unsupported hugepage sizes as PTE level entries at stage 2. Signed-off-by: Punit Agrawal Reviewed-by: Suzuki Poulose Cc: Christoffer Dall Cc: Marc Zyngier Cc: stable@vger.kernel.org # v4.13+ Signed-off-by: Marc Zyngier --- virt/kvm/arm/mmu.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 7e477b3cae5b..c23a1b323aad 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1231,8 +1231,14 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap) { kvm_pfn_t pfn = *pfnp; gfn_t gfn = *ipap >> PAGE_SHIFT; + struct page *page = pfn_to_page(pfn); - if (PageTransCompoundMap(pfn_to_page(pfn))) { + /* + * PageTransCompoungMap() returns true for THP and + * hugetlbfs. Make sure the adjustment is done only for THP + * pages. + */ + if (!PageHuge(page) && PageTransCompoundMap(page)) { unsigned long mask; /* * The address we faulted on is backed by a transparent huge From da5a3ce66b8bb51b0ea8a89f42aac153903f90fb Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 17 Oct 2018 17:42:10 +0100 Subject: [PATCH 50/53] KVM: arm64: Fix caching of host MDCR_EL2 value At boot time, KVM stashes the host MDCR_EL2 value, but only does this when the kernel is not running in hyp mode (i.e. is non-VHE). In these cases, the stashed value of MDCR_EL2.HPMN happens to be zero, which can lead to CONSTRAINED UNPREDICTABLE behaviour. Since we use this value to derive the MDCR_EL2 value when switching to/from a guest, after a guest have been run, the performance counters do not behave as expected. This has been observed to result in accesses via PMXEVTYPER_EL0 and PMXEVCNTR_EL0 not affecting the relevant counters, resulting in events not being counted. In these cases, only the fixed-purpose cycle counter appears to work as expected. Fix this by always stashing the host MDCR_EL2 value, regardless of VHE. Cc: Christopher Dall Cc: James Morse Cc: Will Deacon Cc: stable@vger.kernel.org Fixes: 1e947bad0b63b351 ("arm64: KVM: Skip HYP setup when already running in HYP") Tested-by: Robin Murphy Signed-off-by: Mark Rutland Signed-off-by: Marc Zyngier --- virt/kvm/arm/arm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 4ce99bb223bc..4c5ff06b18f9 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -1296,8 +1296,6 @@ static void cpu_init_hyp_mode(void *dummy) __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr); __cpu_init_stage2(); - - kvm_arm_init_debug(); } static void cpu_hyp_reset(void) @@ -1315,6 +1313,8 @@ static void cpu_hyp_reinit(void) else cpu_init_hyp_mode(NULL); + kvm_arm_init_debug(); + if (vgic_present) kvm_vgic_init_cpu_hardware(); } From 375bdd3b5d4f7cf146f0df1488b4671b141dd799 Mon Sep 17 00:00:00 2001 From: Dongjiu Geng Date: Sat, 13 Oct 2018 00:12:48 +0800 Subject: [PATCH 51/53] arm/arm64: KVM: Rename function kvm_arch_dev_ioctl_check_extension() Rename kvm_arch_dev_ioctl_check_extension() to kvm_arch_vm_ioctl_check_extension(), because it does not have any relationship with device. Renaming this function can make code readable. Cc: James Morse Reviewed-by: Suzuki K Poulose Signed-off-by: Dongjiu Geng Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_host.h | 2 +- arch/arm64/include/asm/kvm_host.h | 2 +- arch/arm64/kvm/reset.c | 4 ++-- virt/kvm/arm/arm.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index b45af481ccf7..5ca5d9af0c26 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -273,7 +273,7 @@ static inline void __cpu_init_stage2(void) kvm_call_hyp(__init_stage2_translation); } -static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) +static inline int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext) { return 0; } diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 376a5b695467..f84052f306af 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -53,7 +53,7 @@ DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); -int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext); +int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext); void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); struct kvm_arch { diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index aa806d582552..337d2fbc2f06 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -59,12 +59,12 @@ static bool cpu_has_32bit_el1(void) } /** - * kvm_arch_dev_ioctl_check_extension + * kvm_arch_vm_ioctl_check_extension * * We currently assume that the number of HW registers is uniform * across all CPUs (see cpuinfo_sanity_check). */ -int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) +int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r; diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 4c5ff06b18f9..7c9d7b51ce89 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -241,7 +241,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = 1; break; default: - r = kvm_arch_dev_ioctl_check_extension(kvm, ext); + r = kvm_arch_vm_ioctl_check_extension(kvm, ext); break; } return r; From 58bf437ff64eac8aca606e42d7e4623e40b61fa1 Mon Sep 17 00:00:00 2001 From: Dongjiu Geng Date: Sat, 13 Oct 2018 00:12:49 +0800 Subject: [PATCH 52/53] arm/arm64: KVM: Enable 32 bits kvm vcpu events support The commit 539aee0edb9f ("KVM: arm64: Share the parts of get/set events useful to 32bit") shares the get/set events helper for arm64 and arm32, but forgot to share the cap extension code. User space will check whether KVM supports vcpu events by checking the KVM_CAP_VCPU_EVENTS extension Acked-by: James Morse Reviewed-by : Suzuki K Poulose Signed-off-by: Dongjiu Geng Signed-off-by: Marc Zyngier --- arch/arm64/kvm/reset.c | 1 - virt/kvm/arm/arm.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 337d2fbc2f06..b72a3dd56204 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -86,7 +86,6 @@ int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_VCPU_ATTRIBUTES: - case KVM_CAP_VCPU_EVENTS: r = 1; break; case KVM_CAP_ARM_VM_IPA_SIZE: diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 7c9d7b51ce89..11b98b2b0486 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -213,6 +213,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_READONLY_MEM: case KVM_CAP_MP_STATE: case KVM_CAP_IMMEDIATE_EXIT: + case KVM_CAP_VCPU_EVENTS: r = 1; break; case KVM_CAP_ARM_SET_DEVICE_ADDR: From e4e11cc0f81ee7be17d6f6fb96128a6d51c0e838 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Wed, 17 Oct 2018 20:21:16 +0200 Subject: [PATCH 53/53] KVM: arm64: Safety check PSTATE when entering guest and handle IL This commit adds a paranoid check when entering the guest to make sure we don't attempt running guest code in an equally or more privilged mode than the hypervisor. We also catch other accidental programming of the SPSR_EL2 which results in an illegal exception return and report this safely back to the user. Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_asm.h | 1 + arch/arm64/include/asm/ptrace.h | 3 +++ arch/arm64/kvm/handle_exit.c | 7 +++++++ arch/arm64/kvm/hyp/hyp-entry.S | 16 +++++++++++++++- arch/arm64/kvm/hyp/sysreg-sr.c | 19 ++++++++++++++++++- 5 files changed, 44 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 0b53c72e7591..aea01a09eb94 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -30,6 +30,7 @@ #define ARM_EXCEPTION_IRQ 0 #define ARM_EXCEPTION_EL1_SERROR 1 #define ARM_EXCEPTION_TRAP 2 +#define ARM_EXCEPTION_IL 3 /* The hyp-stub will return this for any kvm_call_hyp() call */ #define ARM_EXCEPTION_HYP_GONE HVC_STUB_ERR diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 177b851ca6d9..ff35ac1258eb 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -25,6 +25,9 @@ #define CurrentEL_EL1 (1 << 2) #define CurrentEL_EL2 (2 << 2) +/* Additional SPSR bits not exposed in the UABI */ +#define PSR_IL_BIT (1 << 20) + /* AArch32-specific ptrace requests */ #define COMPAT_PTRACE_GETREGS 12 #define COMPAT_PTRACE_SETREGS 13 diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index e5e741bfffe1..35a81bebd02b 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -284,6 +284,13 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, */ run->exit_reason = KVM_EXIT_FAIL_ENTRY; return 0; + case ARM_EXCEPTION_IL: + /* + * We attempted an illegal exception return. Guest state must + * have been corrupted somehow. Give up. + */ + run->exit_reason = KVM_EXIT_FAIL_ENTRY; + return -EINVAL; default: kvm_pr_unimpl("Unsupported exception type: %d", exception_index); diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 24b4fbafe3e4..b1f14f736962 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -162,6 +162,20 @@ el1_error: mov x0, #ARM_EXCEPTION_EL1_SERROR b __guest_exit +el2_sync: + /* Check for illegal exception return, otherwise panic */ + mrs x0, spsr_el2 + + /* if this was something else, then panic! */ + tst x0, #PSR_IL_BIT + b.eq __hyp_panic + + /* Let's attempt a recovery from the illegal exception return */ + get_vcpu_ptr x1, x0 + mov x0, #ARM_EXCEPTION_IL + b __guest_exit + + el2_error: ldp x0, x1, [sp], #16 @@ -240,7 +254,7 @@ ENTRY(__kvm_hyp_vector) invalid_vect el2t_fiq_invalid // FIQ EL2t invalid_vect el2t_error_invalid // Error EL2t - invalid_vect el2h_sync_invalid // Synchronous EL2h + valid_vect el2_sync // Synchronous EL2h invalid_vect el2h_irq_invalid // IRQ EL2h invalid_vect el2h_fiq_invalid // FIQ EL2h valid_vect el2_error // Error EL2h diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c index 9ce223944983..8dc285318204 100644 --- a/arch/arm64/kvm/hyp/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/sysreg-sr.c @@ -152,8 +152,25 @@ static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) static void __hyp_text __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt) { + u64 pstate = ctxt->gp_regs.regs.pstate; + u64 mode = pstate & PSR_AA32_MODE_MASK; + + /* + * Safety check to ensure we're setting the CPU up to enter the guest + * in a less privileged mode. + * + * If we are attempting a return to EL2 or higher in AArch64 state, + * program SPSR_EL2 with M=EL2h and the IL bit set which ensures that + * we'll take an illegal exception state exception immediately after + * the ERET to the guest. Attempts to return to AArch32 Hyp will + * result in an illegal exception return because EL2's execution state + * is determined by SCR_EL3.RW. + */ + if (!(mode & PSR_MODE32_BIT) && mode >= PSR_MODE_EL2t) + pstate = PSR_MODE_EL2h | PSR_IL_BIT; + write_sysreg_el2(ctxt->gp_regs.regs.pc, elr); - write_sysreg_el2(ctxt->gp_regs.regs.pstate, spsr); + write_sysreg_el2(pstate, spsr); if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) write_sysreg_s(ctxt->sys_regs[DISR_EL1], SYS_VDISR_EL2);